From e85001b6ee9023d69b8a7b91e2feff6bca392ec7 Mon Sep 17 00:00:00 2001 From: uuuvn <83587632+uuuvn@users.noreply.github.com> Date: Tue, 11 Mar 2025 10:19:56 +0500 Subject: [PATCH] SQTT profiling (#9278) * sqtt * docs * multi-device * ProfileSQTTEvent * exec update * 256mb default * don't let people hang their gpus * bitfields from autogen * asic info from mesa * more bitfields from autogen * SQTT_ITRACE_SE_MASK --------- Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com> --- .github/workflows/test.yml | 3 + autogen_stubs.sh | 11 + extra/sqtt/README.md | 33 + extra/sqtt/rgptool.py | 330 ++++++ extra/sqtt/sqtt.h | 840 ++++++++++++++ tinygrad/device.py | 8 +- tinygrad/runtime/autogen/sqtt.py | 1789 ++++++++++++++++++++++++++++++ tinygrad/runtime/ops_amd.py | 154 ++- tinygrad/runtime/support/hcq.py | 5 +- 9 files changed, 3164 insertions(+), 9 deletions(-) create mode 100644 extra/sqtt/README.md create mode 100755 extra/sqtt/rgptool.py create mode 100644 extra/sqtt/sqtt.h create mode 100644 tinygrad/runtime/autogen/sqtt.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 839c9f67fe..337cbf6422 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -118,12 +118,15 @@ jobs: cp tinygrad/runtime/autogen/hsa.py /tmp/hsa.py.bak cp tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak cp tinygrad/runtime/autogen/amd_gpu.py /tmp/amd_gpu.py.bak + cp tinygrad/runtime/autogen/sqtt.py /tmp/sqtt.py.bak ./autogen_stubs.sh hsa ./autogen_stubs.sh comgr ./autogen_stubs.sh amd + ./autogen_stubs.sh sqtt diff /tmp/hsa.py.bak tinygrad/runtime/autogen/hsa.py diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py diff /tmp/amd_gpu.py.bak tinygrad/runtime/autogen/amd_gpu.py + diff /tmp/sqtt.py.bak tinygrad/runtime/autogen/sqtt.py - name: Verify Linux autogen run: | cp tinygrad/runtime/autogen/libc.py /tmp/libc.py.bak diff --git a/autogen_stubs.sh b/autogen_stubs.sh index 9074c858b7..92579e1706 100755 --- a/autogen_stubs.sh +++ 
b/autogen_stubs.sh @@ -362,6 +362,16 @@ generate_am() { fixup $BASE/am/hdp_6_0_0.py } +generate_sqtt() { + clang2py -k cdefstum \ + extra/sqtt/sqtt.h \ + -o $BASE/sqtt.py + + fixup $BASE/sqtt.py + sed -i "s\import ctypes\import ctypes, os\g" $BASE/sqtt.py + python3 -c "import tinygrad.runtime.autogen.sqtt" +} + generate_webgpu() { clang2py -l /usr/local/lib/libwebgpu_dawn.so extra/webgpu/webgpu.h -o $BASE/webgpu.py fixup $BASE/webgpu.py @@ -380,6 +390,7 @@ elif [ "$1" == "kfd" ]; then generate_kfd elif [ "$1" == "nv" ]; then generate_nv elif [ "$1" == "amd" ]; then generate_amd elif [ "$1" == "am" ]; then generate_am +elif [ "$1" == "sqtt" ]; then generate_sqtt elif [ "$1" == "qcom" ]; then generate_qcom elif [ "$1" == "io_uring" ]; then generate_io_uring elif [ "$1" == "libc" ]; then generate_libc diff --git a/extra/sqtt/README.md b/extra/sqtt/README.md new file mode 100644 index 0000000000..ccd36279f8 --- /dev/null +++ b/extra/sqtt/README.md @@ -0,0 +1,33 @@ +# SQTT Profiling + +## Getting SQ Thread Trace + +Only supported on 7900XTX, requires either AM (`rmmod amdgpu`) or disabling power gating on AMD (`ppfeaturemask=0xffff3fff`, don't forget to rebuild initramfs) + +SQTT is implemented on top of normal tinygrad PROFILE=1, `PROFILE=1 SQTT=1` to get profile pickle with sqtt data embedded in it. + +`SQTT_BUFFER_SIZE=X` to change size of SQTT buffer (per shader engine, 6 SEs on 7900xtx) in megabytes, default 256. + +`SQTT_ITRACE_SE_MASK=X` to select for which shader engines instruction tracing will be enabled, -1 is all, 0 is none (instruction tracing disabled), >0 is +bitfield/mask for SEs to enable instruction tracing on. Masking shader engines will give smaller file sizes at a cost of fewer hits and kernels that +don't have any wavefront on first simd of shader engine with instruction tracing enabled will not have instruction timings. 
+The default is 2 (second shader engine only), only one for file size reasons, second instead of first because dispatch starts from it so there is +greater chance that kernels with small global size will have instruction tracing data. + +Note that instruction tracing might not be available for kernels with small global dims, this is not a bug, but it can be improved with various hacks +to the point where it can reliably trace a kernel consisting of a single wavefront (am only, not quite reliable under amdgpu due to waves sometimes +being dispatched starting from different simds). More info in comments in ops_amd.py + +## Converting pickled profile with SQTT data into RGP file + +```bash +extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -o /tmp/gpu0.rgp +``` + +Then load gpu0.rgp into Radeon GPU Profiler. It works just fine both in wine (macos, native version available for linux) and via ssh X forwarding + +If multiple GPUs are used you can select which one to export with `-d` like this: + +```bash +extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -d 'AMD:5' -o /tmp/gpu5.rgp +``` diff --git a/extra/sqtt/rgptool.py b/extra/sqtt/rgptool.py new file mode 100755 index 0000000000..3244cc153b --- /dev/null +++ b/extra/sqtt/rgptool.py @@ -0,0 +1,330 @@ +#!/usr/bin/env python3 +from __future__ import annotations +import argparse, ctypes, struct, hashlib, pickle, code, typing, functools +import tinygrad.runtime.autogen.sqtt as sqtt +from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileProgramEvent +from tinygrad.runtime.ops_amd import ProfileSQTTEvent +from tinygrad.helpers import round_up, flatten, all_same +from dataclasses import dataclass + +CHUNK_CLASSES = { + sqtt.SQTT_FILE_CHUNK_TYPE_ASIC_INFO: sqtt.struct_sqtt_file_chunk_asic_info, + sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DESC: sqtt.struct_sqtt_file_chunk_sqtt_desc, + sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DATA: sqtt.struct_sqtt_file_chunk_sqtt_data, + sqtt.SQTT_FILE_CHUNK_TYPE_API_INFO: 
sqtt.struct_sqtt_file_chunk_api_info, + sqtt.SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS: sqtt.struct_sqtt_file_chunk_queue_event_timings, + sqtt.SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION: sqtt.struct_sqtt_file_chunk_clock_calibration, + sqtt.SQTT_FILE_CHUNK_TYPE_CPU_INFO: sqtt.struct_sqtt_file_chunk_cpu_info, + sqtt.SQTT_FILE_CHUNK_TYPE_SPM_DB: sqtt.struct_sqtt_file_chunk_spm_db, + sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE: sqtt.struct_sqtt_file_chunk_code_object_database, + sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS: sqtt.struct_sqtt_file_chunk_code_object_loader_events, + sqtt.SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION: sqtt.struct_sqtt_file_chunk_pso_correlation, +} + +def pretty(val, pad=0) -> str: + if isinstance(val, ctypes.Structure) or isinstance(val, ctypes.Union): + nl = '\n' # old python versions don't support \ in f-strings + return f"{val.__class__.__name__}({nl}{' '*(pad+2)}{(f', {nl}'+' '*(pad+2)).join([f'{field[0]}={pretty(getattr(val, field[0]), pad=pad+2)}' for field in val._fields_])}{nl}{' '*pad})" + if isinstance(val, ctypes.Array): + return f"[{', '.join(map(pretty, val))}]" + if isinstance(val, int) and val >= 1024: return hex(val) + return repr(val) + +@dataclass(frozen=True) +class RGPChunk: + header: sqtt.Structure + data: list[typing.Any]|list[tuple[typing.Any, bytes]]|bytes|None = None + def print(self): + print(pretty(self.header)) + # if isinstance(self.data, bytes): print(repr(self.data)) + if isinstance(self.data, list): + for dchunk in self.data: + if isinstance(dchunk, tuple): + print(pretty(dchunk[0])) + # print(repr(dchunk[1])) + else: + print(pretty(dchunk)) + # TODO: `def fixup` and true immutability + def to_bytes(self, offset:int) -> bytes: + cid = self.header.header.chunk_id.type + match cid: + case _ if cid in {sqtt.SQTT_FILE_CHUNK_TYPE_ASIC_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_CPU_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_API_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DESC}: + self.header.header.size_in_bytes = ctypes.sizeof(self.header) + 
return bytes(self.header) + case sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DATA: + assert isinstance(self.data, bytes) + self.header.header.size_in_bytes = ctypes.sizeof(self.header) + len(self.data) + self.header.offset = offset+ctypes.sizeof(self.header) + self.header.size = len(self.data) + return bytes(self.header) + self.data + case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE: + assert isinstance(self.data, list) + data_codb = typing.cast(list[tuple[sqtt.struct_sqtt_code_object_database_record, bytes]], self.data) + ret = bytearray() + sz = ctypes.sizeof(self.header)+sum([ctypes.sizeof(record_hdr)+round_up(len(record_blob), 4) for record_hdr,record_blob in data_codb]) + self.header.header.size_in_bytes = sz + self.header.offset = offset + self.header.record_count = len(data_codb) + self.header.size = sz + ret += self.header + for record_hdr,record_blob in data_codb: + record_hdr.size = round_up(len(record_blob), 4) + ret += record_hdr + ret += record_blob.ljust(record_hdr.size, b'\x00') # zero-pad blob to the 4-byte-aligned size declared in record_hdr.size + return ret + case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS: + assert isinstance(self.data, list) + data_lev = typing.cast(list[tuple[sqtt.struct_sqtt_code_object_loader_events_record]], self.data) + self.header.header.size_in_bytes = ctypes.sizeof(self.header)+ctypes.sizeof(sqtt.struct_sqtt_code_object_loader_events_record)*len(data_lev) + self.header.offset = offset + self.header.record_size = ctypes.sizeof(sqtt.struct_sqtt_code_object_loader_events_record) + self.header.record_count = len(data_lev) + return bytes(self.header) + b''.join(map(bytes, data_lev)) + case sqtt.SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION: + assert isinstance(self.data, list) + data_pso = typing.cast(list[tuple[sqtt.struct_sqtt_pso_correlation_record]], self.data) + self.header.header.size_in_bytes = ctypes.sizeof(self.header)+ctypes.sizeof(sqtt.struct_sqtt_pso_correlation_record)*len(data_pso) + self.header.offset = offset + self.header.record_size = ctypes.sizeof(sqtt.struct_sqtt_pso_correlation_record) + 
self.header.record_count = len(data_pso) + return bytes(self.header) + b''.join(map(bytes, data_pso)) + case _: raise NotImplementedError(pretty(self.header)) + +@dataclass(frozen=True) +class RGP: + header: sqtt.struct_sqtt_file_header + chunks: list[RGPChunk] + @staticmethod + def from_bytes(blob: bytes) -> RGP: + file_header = sqtt.struct_sqtt_file_header.from_buffer_copy(blob) + assert file_header.magic_number == sqtt.SQTT_FILE_MAGIC_NUMBER and file_header.version_major == sqtt.SQTT_FILE_VERSION_MAJOR + i = file_header.chunk_offset + chunks = [] + while i < len(blob): + assert i%4==0, hex(i) + hdr = sqtt.struct_sqtt_file_chunk_header.from_buffer_copy(blob, i) + cid = hdr.chunk_id.type + header: ctypes.Structure + match cid: + case _ if cid in {sqtt.SQTT_FILE_CHUNK_TYPE_RESERVED, sqtt.SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS, sqtt.SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION, sqtt.SQTT_FILE_CHUNK_TYPE_SPM_DB}: + chunk = None + case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE: + header = sqtt.struct_sqtt_file_chunk_code_object_database.from_buffer_copy(blob, i) + j = header.offset + ctypes.sizeof(header) + data: list = [] + while j < header.offset + header.size: + rec_hdr: ctypes.Structure = sqtt.struct_sqtt_code_object_database_record.from_buffer_copy(blob, j) + data.append((rec_hdr, elf:=blob[j+ctypes.sizeof(rec_hdr):j+ctypes.sizeof(rec_hdr)+rec_hdr.size])) + assert elf[:4] == b'\x7fELF', repr(elf[:16]) + j += ctypes.sizeof(rec_hdr)+rec_hdr.size + assert len(data) == header.record_count + chunk = RGPChunk(header, data) + case sqtt.SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS: + header = sqtt.struct_sqtt_file_chunk_code_object_loader_events.from_buffer_copy(blob, i) + data = [sqtt.struct_sqtt_code_object_loader_events_record.from_buffer_copy(blob, header.offset+ctypes.sizeof(header)+j*header.record_size) + for j in range(header.record_count)] + chunk = RGPChunk(header, data) + case sqtt.SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION: + header = 
sqtt.struct_sqtt_file_chunk_pso_correlation.from_buffer_copy(blob, i) + data = [sqtt.struct_sqtt_pso_correlation_record.from_buffer_copy(blob, header.offset+ctypes.sizeof(header)+j*header.record_size) + for j in range(header.record_count)] + chunk = RGPChunk(header, data) + case sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DATA: + header = sqtt.struct_sqtt_file_chunk_sqtt_data.from_buffer_copy(blob, i) + chunk = RGPChunk(header, blob[header.offset:header.offset+header.size]) + case _ if cid in {sqtt.SQTT_FILE_CHUNK_TYPE_ASIC_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_CPU_INFO, sqtt.SQTT_FILE_CHUNK_TYPE_API_INFO, + sqtt.SQTT_FILE_CHUNK_TYPE_SQTT_DESC}: + chunk = RGPChunk(CHUNK_CLASSES[cid].from_buffer_copy(blob, i)) + case _: + chunk = None + print(f"unknown chunk id {cid}") + if chunk is not None: chunks.append(chunk) + i += hdr.size_in_bytes + assert i == len(blob), f'{i} != {len(blob)}' + return RGP(file_header, chunks) + @staticmethod + def from_profile(profile_pickled, device:str|None=None): + profile: list[ProfileEvent] = pickle.loads(profile_pickled) + device_events = {x.device:x for x in profile if isinstance(x, ProfileDeviceEvent) and x.device.startswith('AMD')} + if device is None: + if len(device_events) == 0: raise RuntimeError('No supported devices found in profile') + if len(device_events) > 1: raise RuntimeError(f"More than one supported device found, select which one to export: {', '.join(device_events.keys())}") + _, device_event = device_events.popitem() + else: + if device not in device_events: raise RuntimeError(f"Device {device} not found in profile, devices in profile: {', '.join(device_events.keys())} ") + device_event = device_events[device] + sqtt_events = [x for x in profile if isinstance(x, ProfileSQTTEvent) and x.device == device_event.device] + if len(sqtt_events) == 0: raise RuntimeError(f"Device {device_event.device} doesn't contain SQTT data") + sqtt_itrace_enabled = any([event.itrace for event in sqtt_events]) + sqtt_itrace_masked = not 
all_same([event.itrace for event in sqtt_events]) + sqtt_itrace_se_mask = functools.reduce(lambda a,b: a|b, [int(event.itrace) << event.se for event in sqtt_events], 0) if sqtt_itrace_masked else 0 + load_events = [x for x in profile if isinstance(x, ProfileProgramEvent) and x.device == device_event.device] + loads = [(event.base, struct.unpack(' bytes: + ret = bytearray() + ret += self.header + for chunk in self.chunks: + ret += chunk.to_bytes(len(ret)) + return bytes(ret) + def print(self): + print(pretty(self.header)) + for chunk in self.chunks: chunk.print() + +if __name__ == '__main__': + parser = argparse.ArgumentParser(prog='rgptool', description='A tool to create (from pickled tinygrad profile), inspect and modify Radeon GPU Profiler files') + parser.add_argument('command') + parser.add_argument('input') + parser.add_argument('-d', '--device') + parser.add_argument('-o', '--output') + args = parser.parse_args() + + with open(args.input, 'rb') as fd: input_bytes = fd.read() + + match args.command: + case 'print': + rgp = RGP.from_bytes(input_bytes) + rgp.print() + case 'create': + rgp = RGP.from_profile(input_bytes, device=args.device) + # rgp.to_bytes() # fixup + # rgp.print() + case 'repl': + rgp = RGP.from_bytes(input_bytes) + code.interact(local=locals()) + case _: raise RuntimeError(args.command) + + if args.output is not None: + with open(args.output, 'wb+') as fd: fd.write(rgp.to_bytes()) diff --git a/extra/sqtt/sqtt.h b/extra/sqtt/sqtt.h new file mode 100644 index 0000000000..775655840c --- /dev/null +++ b/extra/sqtt/sqtt.h @@ -0,0 +1,840 @@ +#include + +// Original definition in pal is in c++ and clang2py can't autogen it correctly +// Most of this is copy pasted from mesa/src/amd/common/ac_rgp.{h, c} + +/* + * Copyright 2020 Advanced Micro Devices, Inc. 
+ * Copyright 2020 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#define SQTT_FILE_MAGIC_NUMBER 0x50303042 +#define SQTT_FILE_VERSION_MAJOR 1 +#define SQTT_FILE_VERSION_MINOR 5 + +#define SQTT_GPU_NAME_MAX_SIZE 256 +#define SQTT_MAX_NUM_SE 32 +#define SQTT_SA_PER_SE 2 +#define SQTT_ACTIVE_PIXEL_PACKER_MASK_DWORDS 4 + +struct sqtt_data_info { + uint32_t cur_offset; + uint32_t trace_status; + union { + uint32_t gfx9_write_counter; + uint32_t gfx10_dropped_cntr; + }; +}; + +struct sqtt_data_se { + struct sqtt_data_info info; + void *data_ptr; + uint32_t shader_engine; + uint32_t compute_unit; +}; + + +enum sqtt_version +{ + SQTT_VERSION_NONE = 0x0, + SQTT_VERSION_2_2 = 0x5, /* GFX8 */ + SQTT_VERSION_2_3 = 0x6, /* GFX9 */ + SQTT_VERSION_2_4 = 0x7, /* GFX10+ */ + SQTT_VERSION_3_2 = 0xb, /* GFX11+ */ +}; + +enum sqtt_file_chunk_type +{ + SQTT_FILE_CHUNK_TYPE_ASIC_INFO, + SQTT_FILE_CHUNK_TYPE_SQTT_DESC, + SQTT_FILE_CHUNK_TYPE_SQTT_DATA, + SQTT_FILE_CHUNK_TYPE_API_INFO, + SQTT_FILE_CHUNK_TYPE_RESERVED, + SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS, + SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION, + SQTT_FILE_CHUNK_TYPE_CPU_INFO, + SQTT_FILE_CHUNK_TYPE_SPM_DB, + SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE, + SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS, + SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION, + SQTT_FILE_CHUNK_TYPE_INSTRUMENTATION_TABLE, + SQTT_FILE_CHUNK_TYPE_COUNT +}; + + +struct sqtt_file_chunk_id { + int32_t type : 8; + int32_t index : 8; + int32_t reserved : 16; +}; + +struct sqtt_file_chunk_header { + struct sqtt_file_chunk_id chunk_id; + uint16_t minor_version; + uint16_t major_version; + int32_t size_in_bytes; + int32_t padding; +}; + +struct sqtt_file_header_flags { + union { + struct { + uint32_t is_semaphore_queue_timing_etw : 1; + uint32_t no_queue_semaphore_timestamps : 1; + uint32_t reserved : 30; + }; + + uint32_t value; + }; +}; + +struct sqtt_file_header { + uint32_t magic_number; + uint32_t version_major; + uint32_t version_minor; + struct 
sqtt_file_header_flags flags; + int32_t chunk_offset; + int32_t second; + int32_t minute; + int32_t hour; + int32_t day_in_month; + int32_t month; + int32_t year; + int32_t day_in_week; + int32_t day_in_year; + int32_t is_daylight_savings; +}; + +struct sqtt_file_chunk_cpu_info { + struct sqtt_file_chunk_header header; + uint32_t vendor_id[4]; + uint32_t processor_brand[12]; + uint32_t reserved[2]; + uint64_t cpu_timestamp_freq; + uint32_t clock_speed; + uint32_t num_logical_cores; + uint32_t num_physical_cores; + uint32_t system_ram_size; +}; + +enum sqtt_file_chunk_asic_info_flags +{ + SQTT_FILE_CHUNK_ASIC_INFO_FLAG_SC_PACKER_NUMBERING = (1 << 0), + SQTT_FILE_CHUNK_ASIC_INFO_FLAG_PS1_EVENT_TOKENS_ENABLED = (1 << 1) +}; + +enum sqtt_gpu_type +{ + SQTT_GPU_TYPE_UNKNOWN = 0x0, + SQTT_GPU_TYPE_INTEGRATED = 0x1, + SQTT_GPU_TYPE_DISCRETE = 0x2, + SQTT_GPU_TYPE_VIRTUAL = 0x3 +}; + +enum sqtt_gfxip_level +{ + SQTT_GFXIP_LEVEL_NONE = 0x0, + SQTT_GFXIP_LEVEL_GFXIP_6 = 0x1, + SQTT_GFXIP_LEVEL_GFXIP_7 = 0x2, + SQTT_GFXIP_LEVEL_GFXIP_8 = 0x3, + SQTT_GFXIP_LEVEL_GFXIP_8_1 = 0x4, + SQTT_GFXIP_LEVEL_GFXIP_9 = 0x5, + SQTT_GFXIP_LEVEL_GFXIP_10_1 = 0x7, + SQTT_GFXIP_LEVEL_GFXIP_10_3 = 0x9, + SQTT_GFXIP_LEVEL_GFXIP_11_0 = 0xc, +}; + +enum sqtt_memory_type +{ + SQTT_MEMORY_TYPE_UNKNOWN = 0x0, + SQTT_MEMORY_TYPE_DDR = 0x1, + SQTT_MEMORY_TYPE_DDR2 = 0x2, + SQTT_MEMORY_TYPE_DDR3 = 0x3, + SQTT_MEMORY_TYPE_DDR4 = 0x4, + SQTT_MEMORY_TYPE_DDR5 = 0x5, + SQTT_MEMORY_TYPE_GDDR3 = 0x10, + SQTT_MEMORY_TYPE_GDDR4 = 0x11, + SQTT_MEMORY_TYPE_GDDR5 = 0x12, + SQTT_MEMORY_TYPE_GDDR6 = 0x13, + SQTT_MEMORY_TYPE_HBM = 0x20, + SQTT_MEMORY_TYPE_HBM2 = 0x21, + SQTT_MEMORY_TYPE_HBM3 = 0x22, + SQTT_MEMORY_TYPE_LPDDR4 = 0x30, + SQTT_MEMORY_TYPE_LPDDR5 = 0x31, +}; + +struct sqtt_file_chunk_asic_info { + struct sqtt_file_chunk_header header; + uint64_t flags; + uint64_t trace_shader_core_clock; + uint64_t trace_memory_clock; + int32_t device_id; + int32_t device_revision_id; + int32_t vgprs_per_simd; + int32_t 
sgprs_per_simd; + int32_t shader_engines; + int32_t compute_unit_per_shader_engine; + int32_t simd_per_compute_unit; + int32_t wavefronts_per_simd; + int32_t minimum_vgpr_alloc; + int32_t vgpr_alloc_granularity; + int32_t minimum_sgpr_alloc; + int32_t sgpr_alloc_granularity; + int32_t hardware_contexts; + enum sqtt_gpu_type gpu_type; + enum sqtt_gfxip_level gfxip_level; + int32_t gpu_index; + int32_t gds_size; + int32_t gds_per_shader_engine; + int32_t ce_ram_size; + int32_t ce_ram_size_graphics; + int32_t ce_ram_size_compute; + int32_t max_number_of_dedicated_cus; + int64_t vram_size; + int32_t vram_bus_width; + int32_t l2_cache_size; + int32_t l1_cache_size; + int32_t lds_size; + char gpu_name[SQTT_GPU_NAME_MAX_SIZE]; + float alu_per_clock; + float texture_per_clock; + float prims_per_clock; + float pixels_per_clock; + uint64_t gpu_timestamp_frequency; + uint64_t max_shader_core_clock; + uint64_t max_memory_clock; + uint32_t memory_ops_per_clock; + enum sqtt_memory_type memory_chip_type; + uint32_t lds_granularity; + uint16_t cu_mask[SQTT_MAX_NUM_SE][SQTT_SA_PER_SE]; + char reserved1[128]; + uint32_t active_pixel_packer_mask[SQTT_ACTIVE_PIXEL_PACKER_MASK_DWORDS]; + char reserved2[16]; + uint32_t gl1_cache_size; + uint32_t instruction_cache_size; + uint32_t scalar_cache_size; + uint32_t mall_cache_size; + char padding[4]; +}; + +enum sqtt_api_type +{ + SQTT_API_TYPE_DIRECTX_12, + SQTT_API_TYPE_VULKAN, + SQTT_API_TYPE_GENERIC, + SQTT_API_TYPE_OPENCL +}; + +enum sqtt_instruction_trace_mode +{ + SQTT_INSTRUCTION_TRACE_DISABLED = 0x0, + SQTT_INSTRUCTION_TRACE_FULL_FRAME = 0x1, + SQTT_INSTRUCTION_TRACE_API_PSO = 0x2, +}; + +enum sqtt_profiling_mode +{ + SQTT_PROFILING_MODE_PRESENT = 0x0, + SQTT_PROFILING_MODE_USER_MARKERS = 0x1, + SQTT_PROFILING_MODE_INDEX = 0x2, + SQTT_PROFILING_MODE_TAG = 0x3, +}; + +union sqtt_profiling_mode_data { + struct { + char start[256]; + char end[256]; + } user_marker_profiling_data; + + struct { + uint32_t start; + uint32_t end; + } 
index_profiling_data; + + struct { + uint32_t begin_hi; + uint32_t begin_lo; + uint32_t end_hi; + uint32_t end_lo; + } tag_profiling_data; +}; + +union sqtt_instruction_trace_data { + struct { + uint64_t api_pso_filter; + } api_pso_data; + + struct { + uint32_t mask; + } shader_engine_filter; +}; + +struct sqtt_file_chunk_api_info { + struct sqtt_file_chunk_header header; + enum sqtt_api_type api_type; + uint16_t major_version; + uint16_t minor_version; + enum sqtt_profiling_mode profiling_mode; + uint32_t reserved; + union sqtt_profiling_mode_data profiling_mode_data; + enum sqtt_instruction_trace_mode instruction_trace_mode; + uint32_t reserved2; + union sqtt_instruction_trace_data instruction_trace_data; +}; + + +struct sqtt_code_object_database_record { + uint32_t size; +}; + +struct sqtt_file_chunk_code_object_database { + struct sqtt_file_chunk_header header; + uint32_t offset; + uint32_t flags; + uint32_t size; + uint32_t record_count; +}; + + +struct sqtt_code_object_loader_events_record { + uint32_t loader_event_type; + uint32_t reserved; + uint64_t base_address; + uint64_t code_object_hash[2]; + uint64_t time_stamp; +}; + +struct sqtt_file_chunk_code_object_loader_events { + struct sqtt_file_chunk_header header; + uint32_t offset; + uint32_t flags; + uint32_t record_size; + uint32_t record_count; +}; + +struct sqtt_pso_correlation_record { + uint64_t api_pso_hash; + uint64_t pipeline_hash[2]; + char api_level_obj_name[64]; +}; + +struct sqtt_file_chunk_pso_correlation { + struct sqtt_file_chunk_header header; + uint32_t offset; + uint32_t flags; + uint32_t record_size; + uint32_t record_count; +}; + +struct sqtt_file_chunk_sqtt_desc { + struct sqtt_file_chunk_header header; + int32_t shader_engine_index; + enum sqtt_version sqtt_version; + union { + struct { + int32_t instrumentation_version; + } v0; + struct { + int16_t instrumentation_spec_version; + int16_t instrumentation_api_version; + int32_t compute_unit_index; + } v1; + }; +}; + +struct 
sqtt_file_chunk_sqtt_data { + struct sqtt_file_chunk_header header; + int32_t offset; /* in bytes */ + int32_t size; /* in bytes */ +}; + +struct sqtt_file_chunk_queue_event_timings { + struct sqtt_file_chunk_header header; + uint32_t queue_info_table_record_count; + uint32_t queue_info_table_size; + uint32_t queue_event_table_record_count; + uint32_t queue_event_table_size; +}; + + +enum sqtt_queue_type { + SQTT_QUEUE_TYPE_UNKNOWN = 0x0, + SQTT_QUEUE_TYPE_UNIVERSAL = 0x1, + SQTT_QUEUE_TYPE_COMPUTE = 0x2, + SQTT_QUEUE_TYPE_DMA = 0x3, +}; + +enum sqtt_engine_type { + SQTT_ENGINE_TYPE_UNKNOWN = 0x0, + SQTT_ENGINE_TYPE_UNIVERSAL = 0x1, + SQTT_ENGINE_TYPE_COMPUTE = 0x2, + SQTT_ENGINE_TYPE_EXCLUSIVE_COMPUTE = 0x3, + SQTT_ENGINE_TYPE_DMA = 0x4, + SQTT_ENGINE_TYPE_HIGH_PRIORITY_UNIVERSAL = 0x7, + SQTT_ENGINE_TYPE_HIGH_PRIORITY_GRAPHICS = 0x8, +}; + +struct sqtt_queue_hardware_info { + union { + struct { + int32_t queue_type : 8; + int32_t engine_type : 8; + uint32_t reserved : 16; + }; + uint32_t value; + }; +}; + + +struct sqtt_queue_info_record { + uint64_t queue_id; + uint64_t queue_context; + struct sqtt_queue_hardware_info hardware_info; + uint32_t reserved; +}; + +enum sqtt_queue_event_type { + SQTT_QUEUE_TIMING_EVENT_CMDBUF_SUBMIT, + SQTT_QUEUE_TIMING_EVENT_SIGNAL_SEMAPHORE, + SQTT_QUEUE_TIMING_EVENT_WAIT_SEMAPHORE, + SQTT_QUEUE_TIMING_EVENT_PRESENT +}; + +struct sqtt_queue_event_record { + enum sqtt_queue_event_type event_type; + uint32_t sqtt_cb_id; + uint64_t frame_index; + uint32_t queue_info_index; + uint32_t submit_sub_index; + uint64_t api_id; + uint64_t cpu_timestamp; + uint64_t gpu_timestamps[2]; +}; + +struct sqtt_file_chunk_clock_calibration { + struct sqtt_file_chunk_header header; + uint64_t cpu_timestamp; + uint64_t gpu_timestamp; + uint64_t reserved; +}; + +enum elf_gfxip_level +{ + EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028, + EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c, + EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, + EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036, + 
EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, +}; + +struct sqtt_file_chunk_spm_db { + struct sqtt_file_chunk_header header; + uint32_t flags; + uint32_t preamble_size; + uint32_t num_timestamps; + uint32_t num_spm_counter_info; + uint32_t spm_counter_info_size; + uint32_t sample_interval; +}; + +/** + * Identifiers for RGP SQ thread-tracing markers (Table 1) + */ +enum rgp_sqtt_marker_identifier +{ + RGP_SQTT_MARKER_IDENTIFIER_EVENT = 0x0, + RGP_SQTT_MARKER_IDENTIFIER_CB_START = 0x1, + RGP_SQTT_MARKER_IDENTIFIER_CB_END = 0x2, + RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START = 0x3, + RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END = 0x4, + RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT = 0x5, + RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API = 0x6, + RGP_SQTT_MARKER_IDENTIFIER_SYNC = 0x7, + RGP_SQTT_MARKER_IDENTIFIER_PRESENT = 0x8, + RGP_SQTT_MARKER_IDENTIFIER_LAYOUT_TRANSITION = 0x9, + RGP_SQTT_MARKER_IDENTIFIER_RENDER_PASS = 0xA, + RGP_SQTT_MARKER_IDENTIFIER_RESERVED2 = 0xB, + RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE = 0xC, + RGP_SQTT_MARKER_IDENTIFIER_RESERVED4 = 0xD, + RGP_SQTT_MARKER_IDENTIFIER_RESERVED5 = 0xE, + RGP_SQTT_MARKER_IDENTIFIER_RESERVED6 = 0xF +}; + +/** + * Command buffer IDs used in RGP SQ thread-tracing markers (only 20 bits). + */ +union rgp_sqtt_marker_cb_id { + struct { + uint32_t per_frame : 1; /* Must be 1, frame-based command buffer ID. */ + uint32_t frame_index : 7; + uint32_t cb_index : 12; /* Command buffer index within the frame. */ + uint32_t reserved : 12; + } per_frame_cb_id; + + struct { + uint32_t per_frame : 1; /* Must be 0, global command buffer ID. */ + uint32_t cb_index : 19; /* Global command buffer index. */ + uint32_t reserved : 12; + } global_cb_id; + + uint32_t all; +}; + +/** + * RGP SQ thread-tracing marker for the start of a command buffer. 
(Table 2) + */ +struct rgp_sqtt_marker_cb_start { + union { + struct { + uint32_t identifier : 4; + uint32_t ext_dwords : 3; + uint32_t cb_id : 20; + uint32_t queue : 5; + }; + uint32_t dword01; + }; + union { + uint32_t device_id_low; + uint32_t dword02; + }; + union { + uint32_t device_id_high; + uint32_t dword03; + }; + union { + uint32_t queue_flags; + uint32_t dword04; + }; +}; + +/** + * + * RGP SQ thread-tracing marker for the end of a command buffer. (Table 3) + */ +struct rgp_sqtt_marker_cb_end { + union { + struct { + uint32_t identifier : 4; + uint32_t ext_dwords : 3; + uint32_t cb_id : 20; + uint32_t reserved : 5; + }; + uint32_t dword01; + }; + union { + uint32_t device_id_low; + uint32_t dword02; + }; + union { + uint32_t device_id_high; + uint32_t dword03; + }; +}; + +/** + * API types used in RGP SQ thread-tracing markers for the "General API" + * packet. + */ +enum rgp_sqtt_marker_general_api_type +{ + ApiCmdBindPipeline = 0, + ApiCmdBindDescriptorSets = 1, + ApiCmdBindIndexBuffer = 2, + ApiCmdBindVertexBuffers = 3, + ApiCmdDraw = 4, + ApiCmdDrawIndexed = 5, + ApiCmdDrawIndirect = 6, + ApiCmdDrawIndexedIndirect = 7, + ApiCmdDrawIndirectCountAMD = 8, + ApiCmdDrawIndexedIndirectCountAMD = 9, + ApiCmdDispatch = 10, + ApiCmdDispatchIndirect = 11, + ApiCmdCopyBuffer = 12, + ApiCmdCopyImage = 13, + ApiCmdBlitImage = 14, + ApiCmdCopyBufferToImage = 15, + ApiCmdCopyImageToBuffer = 16, + ApiCmdUpdateBuffer = 17, + ApiCmdFillBuffer = 18, + ApiCmdClearColorImage = 19, + ApiCmdClearDepthStencilImage = 20, + ApiCmdClearAttachments = 21, + ApiCmdResolveImage = 22, + ApiCmdWaitEvents = 23, + ApiCmdPipelineBarrier = 24, + ApiCmdBeginQuery = 25, + ApiCmdEndQuery = 26, + ApiCmdResetQueryPool = 27, + ApiCmdWriteTimestamp = 28, + ApiCmdCopyQueryPoolResults = 29, + ApiCmdPushConstants = 30, + ApiCmdBeginRenderPass = 31, + ApiCmdNextSubpass = 32, + ApiCmdEndRenderPass = 33, + ApiCmdExecuteCommands = 34, + ApiCmdSetViewport = 35, + ApiCmdSetScissor = 36, + 
ApiCmdSetLineWidth = 37, + ApiCmdSetDepthBias = 38, + ApiCmdSetBlendConstants = 39, + ApiCmdSetDepthBounds = 40, + ApiCmdSetStencilCompareMask = 41, + ApiCmdSetStencilWriteMask = 42, + ApiCmdSetStencilReference = 43, + ApiCmdDrawIndirectCount = 44, + ApiCmdDrawIndexedIndirectCount = 45, + /* gap */ + ApiCmdDrawMeshTasksEXT = 47, + ApiCmdDrawMeshTasksIndirectCountEXT = 48, + ApiCmdDrawMeshTasksIndirectEXT = 49, + + ApiRayTracingSeparateCompiled = 0x800000, + ApiInvalid = 0xffffffff +}; + +/** + * RGP SQ thread-tracing marker for a "General API" instrumentation packet. + */ +struct rgp_sqtt_marker_general_api { + union { + struct { + uint32_t identifier : 4; + uint32_t ext_dwords : 3; + uint32_t api_type : 20; + uint32_t is_end : 1; + uint32_t reserved : 4; + }; + uint32_t dword01; + }; +}; + +/** + * API types used in RGP SQ thread-tracing markers (Table 16). + */ +enum rgp_sqtt_marker_event_type +{ + EventCmdDraw = 0, + EventCmdDrawIndexed = 1, + EventCmdDrawIndirect = 2, + EventCmdDrawIndexedIndirect = 3, + EventCmdDrawIndirectCountAMD = 4, + EventCmdDrawIndexedIndirectCountAMD = 5, + EventCmdDispatch = 6, + EventCmdDispatchIndirect = 7, + EventCmdCopyBuffer = 8, + EventCmdCopyImage = 9, + EventCmdBlitImage = 10, + EventCmdCopyBufferToImage = 11, + EventCmdCopyImageToBuffer = 12, + EventCmdUpdateBuffer = 13, + EventCmdFillBuffer = 14, + EventCmdClearColorImage = 15, + EventCmdClearDepthStencilImage = 16, + EventCmdClearAttachments = 17, + EventCmdResolveImage = 18, + EventCmdWaitEvents = 19, + EventCmdPipelineBarrier = 20, + EventCmdResetQueryPool = 21, + EventCmdCopyQueryPoolResults = 22, + EventRenderPassColorClear = 23, + EventRenderPassDepthStencilClear = 24, + EventRenderPassResolve = 25, + EventInternalUnknown = 26, + EventCmdDrawIndirectCount = 27, + EventCmdDrawIndexedIndirectCount = 28, + /* gap */ + EventCmdTraceRaysKHR = 30, + EventCmdTraceRaysIndirectKHR = 31, + EventCmdBuildAccelerationStructuresKHR = 32, + 
EventCmdBuildAccelerationStructuresIndirectKHR = 33, + EventCmdCopyAccelerationStructureKHR = 34, + EventCmdCopyAccelerationStructureToMemoryKHR = 35, + EventCmdCopyMemoryToAccelerationStructureKHR = 36, + /* gap */ + EventCmdDrawMeshTasksEXT = 41, + EventCmdDrawMeshTasksIndirectCountEXT = 42, + EventCmdDrawMeshTasksIndirectEXT = 43, + EventUnknown = 0x7fff, + EventInvalid = 0xffffffff +}; + +/** + * "Event (Per-draw/dispatch)" RGP SQ thread-tracing marker. (Table 4) + */ +struct rgp_sqtt_marker_event { + union { + struct { + uint32_t identifier : 4; + uint32_t ext_dwords : 3; + uint32_t api_type : 24; + uint32_t has_thread_dims : 1; + }; + uint32_t dword01; + }; + union { + struct { + uint32_t cb_id : 20; + uint32_t vertex_offset_reg_idx : 4; + uint32_t instance_offset_reg_idx : 4; + uint32_t draw_index_reg_idx : 4; + }; + uint32_t dword02; + }; + union { + uint32_t cmd_id; + uint32_t dword03; + }; +}; + +/** + * Per-dispatch specific marker where workgroup dims are included. + */ +struct rgp_sqtt_marker_event_with_dims { + struct rgp_sqtt_marker_event event; + uint32_t thread_x; + uint32_t thread_y; + uint32_t thread_z; +}; + +/** + * "Barrier Start" RGP SQTT instrumentation marker (Table 5) + */ +struct rgp_sqtt_marker_barrier_start { + union { + struct { + uint32_t identifier : 4; + uint32_t ext_dwords : 3; + uint32_t cb_id : 20; + uint32_t reserved : 5; + }; + uint32_t dword01; + }; + union { + struct { + uint32_t driver_reason : 31; + uint32_t internal : 1; + }; + uint32_t dword02; + }; +}; + +/** + * "Barrier End" RGP SQTT instrumentation marker (Table 6) + */ +struct rgp_sqtt_marker_barrier_end { + union { + struct { + uint32_t identifier : 4; + uint32_t ext_dwords : 3; + uint32_t cb_id : 20; + uint32_t wait_on_eop_ts : 1; + uint32_t vs_partial_flush : 1; + uint32_t ps_partial_flush : 1; + uint32_t cs_partial_flush : 1; + uint32_t pfp_sync_me : 1; + }; + uint32_t dword01; + }; + union { + struct { + uint32_t sync_cp_dma : 1; + uint32_t inval_tcp : 1; + 
uint32_t inval_sqI : 1; + uint32_t inval_sqK : 1; + uint32_t flush_tcc : 1; + uint32_t inval_tcc : 1; + uint32_t flush_cb : 1; + uint32_t inval_cb : 1; + uint32_t flush_db : 1; + uint32_t inval_db : 1; + uint32_t num_layout_transitions : 16; + uint32_t inval_gl1 : 1; + uint32_t wait_on_ts : 1; + uint32_t eop_ts_bottom_of_pipe : 1; + uint32_t eos_ts_ps_done : 1; + uint32_t eos_ts_cs_done : 1; + uint32_t reserved : 1; + }; + uint32_t dword02; + }; +}; + +/** + * "Layout Transition" RGP SQTT instrumentation marker (Table 7) + */ +struct rgp_sqtt_marker_layout_transition { + union { + struct { + uint32_t identifier : 4; + uint32_t ext_dwords : 3; + uint32_t depth_stencil_expand : 1; + uint32_t htile_hiz_range_expand : 1; + uint32_t depth_stencil_resummarize : 1; + uint32_t dcc_decompress : 1; + uint32_t fmask_decompress : 1; + uint32_t fast_clear_eliminate : 1; + uint32_t fmask_color_expand : 1; + uint32_t init_mask_ram : 1; + uint32_t reserved1 : 17; + }; + uint32_t dword01; + }; + union { + struct { + uint32_t reserved2 : 32; + }; + uint32_t dword02; + }; +}; + +/** + * "User Event" RGP SQTT instrumentation marker (Table 8) + */ +struct rgp_sqtt_marker_user_event { + union { + struct { + uint32_t identifier : 4; + uint32_t reserved0 : 8; + uint32_t data_type : 8; + uint32_t reserved1 : 12; + }; + uint32_t dword01; + }; +}; +struct rgp_sqtt_marker_user_event_with_length { + struct rgp_sqtt_marker_user_event user_event; + uint32_t length; +}; + +enum rgp_sqtt_marker_user_event_type +{ + UserEventTrigger = 0, + UserEventPop, + UserEventPush, + UserEventObjectName, +}; + +/** + * "Pipeline bind" RGP SQTT instrumentation marker (Table 12) + */ +struct rgp_sqtt_marker_pipeline_bind { + union { + struct { + uint32_t identifier : 4; + uint32_t ext_dwords : 3; + uint32_t bind_point : 1; + uint32_t cb_id : 20; + uint32_t reserved : 4; + }; + uint32_t dword01; + }; + union { + uint32_t api_pso_hash[2]; + struct { + uint32_t dword02; + uint32_t dword03; + }; + }; +}; diff --git 
a/tinygrad/device.py b/tinygrad/device.py index ce08cbc081..d2260f582c 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -57,6 +57,9 @@ class ProfileDeviceEvent(ProfileEvent): @dataclass(frozen=True) class ProfileRangeEvent(ProfileEvent): device:str; name:str; st:decimal.Decimal; en:decimal.Decimal; is_copy:bool # noqa: E702 +@dataclass(frozen=True) +class ProfileProgramEvent(ProfileEvent): device:str; name:str; lib:bytes|None; base:int|None # noqa: E702 + @dataclass(frozen=True) class ProfileGraphEntry: device:str; name:str; st_id:int; en_id:int; is_copy:bool # noqa: E702 @@ -342,8 +345,9 @@ if PROFILE: with open(fn:=temp("profile.pkl", append_user=True), "wb") as f: pickle.dump(Compiled.profile_events, f) - from tinygrad.ops import launch_viz - launch_viz("PROFILE", fn) + if not getenv("SQTT", 0): + from tinygrad.ops import launch_viz + launch_viz("PROFILE", fn) if __name__ == "__main__": for device in ALL_DEVICES: diff --git a/tinygrad/runtime/autogen/sqtt.py b/tinygrad/runtime/autogen/sqtt.py new file mode 100644 index 0000000000..5d246bff15 --- /dev/null +++ b/tinygrad/runtime/autogen/sqtt.py @@ -0,0 +1,1789 @@ +# mypy: ignore-errors +# -*- coding: utf-8 -*- +# +# TARGET arch is: [] +# WORD_SIZE is: 8 +# POINTER_SIZE is: 8 +# LONGDOUBLE_SIZE is: 16 +# +import ctypes, os + + +class AsDictMixin: + @classmethod + def as_dict(cls, self): + result = {} + if not isinstance(self, AsDictMixin): + # not a structure, assume it's already a python object + return self + if not hasattr(cls, "_fields_"): + return result + # sys.version_info >= (3, 5) + # for (field, *_) in cls._fields_: # noqa + for field_tuple in cls._fields_: # noqa + field = field_tuple[0] + if field.startswith('PADDING_'): + continue + value = getattr(self, field) + type_ = type(value) + if hasattr(value, "_length_") and hasattr(value, "_type_"): + # array + if not hasattr(type_, "as_dict"): + value = [v for v in value] + else: + type_ = type_._type_ + value = [type_.as_dict(v) for v in value] 
+ elif hasattr(value, "contents") and hasattr(value, "_type_"): + # pointer + try: + if not hasattr(type_, "as_dict"): + value = value.contents + else: + type_ = type_._type_ + value = type_.as_dict(value.contents) + except ValueError: + # nullptr + value = None + elif isinstance(value, AsDictMixin): + # other structure + value = type_.as_dict(value) + result[field] = value + return result + + +class Structure(ctypes.Structure, AsDictMixin): + + def __init__(self, *args, **kwds): + # We don't want to use positional arguments fill PADDING_* fields + + args = dict(zip(self.__class__._field_names_(), args)) + args.update(kwds) + super(Structure, self).__init__(**args) + + @classmethod + def _field_names_(cls): + if hasattr(cls, '_fields_'): + return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING')) + else: + return () + + @classmethod + def get_type(cls, field): + for f in cls._fields_: + if f[0] == field: + return f[1] + return None + + @classmethod + def bind(cls, bound_fields): + fields = {} + for name, type_ in cls._fields_: + if hasattr(type_, "restype"): + if name in bound_fields: + if bound_fields[name] is None: + fields[name] = type_() + else: + # use a closure to capture the callback from the loop scope + fields[name] = ( + type_((lambda callback: lambda *args: callback(*args))( + bound_fields[name])) + ) + del bound_fields[name] + else: + # default callback implementation (does nothing) + try: + default_ = type_(0).restype().value + except TypeError: + default_ = None + fields[name] = type_(( + lambda default_: lambda *args: default_)(default_)) + else: + # not a callback function, use default initialization + if name in bound_fields: + fields[name] = bound_fields[name] + del bound_fields[name] + else: + fields[name] = type_() + if len(bound_fields) != 0: + raise ValueError( + "Cannot bind the following unknown callback(s) {}.{}".format( + cls.__name__, bound_fields.keys() + )) + return cls(**fields) + + +class Union(ctypes.Union, AsDictMixin): 
+ pass + + + +c_int128 = ctypes.c_ubyte*16 +c_uint128 = c_int128 +void = None +if ctypes.sizeof(ctypes.c_longdouble) == 16: + c_long_double_t = ctypes.c_longdouble +else: + c_long_double_t = ctypes.c_ubyte*16 + + + +SQTT_FILE_MAGIC_NUMBER = 0x50303042 # macro +SQTT_FILE_VERSION_MAJOR = 1 # macro +SQTT_FILE_VERSION_MINOR = 5 # macro +SQTT_GPU_NAME_MAX_SIZE = 256 # macro +SQTT_MAX_NUM_SE = 32 # macro +SQTT_SA_PER_SE = 2 # macro +SQTT_ACTIVE_PIXEL_PACKER_MASK_DWORDS = 4 # macro +class struct_sqtt_data_info(Structure): + pass + +class union_sqtt_data_info_0(Union): + pass + +union_sqtt_data_info_0._pack_ = 1 # source:False +union_sqtt_data_info_0._fields_ = [ + ('gfx9_write_counter', ctypes.c_uint32), + ('gfx10_dropped_cntr', ctypes.c_uint32), +] + +struct_sqtt_data_info._pack_ = 1 # source:False +struct_sqtt_data_info._anonymous_ = ('_0',) +struct_sqtt_data_info._fields_ = [ + ('cur_offset', ctypes.c_uint32), + ('trace_status', ctypes.c_uint32), + ('_0', union_sqtt_data_info_0), +] + +class struct_sqtt_data_se(Structure): + pass + +struct_sqtt_data_se._pack_ = 1 # source:False +struct_sqtt_data_se._fields_ = [ + ('info', struct_sqtt_data_info), + ('PADDING_0', ctypes.c_ubyte * 4), + ('data_ptr', ctypes.POINTER(None)), + ('shader_engine', ctypes.c_uint32), + ('compute_unit', ctypes.c_uint32), +] + + +# values for enumeration 'sqtt_version' +sqtt_version__enumvalues = { + 0: 'SQTT_VERSION_NONE', + 5: 'SQTT_VERSION_2_2', + 6: 'SQTT_VERSION_2_3', + 7: 'SQTT_VERSION_2_4', + 11: 'SQTT_VERSION_3_2', +} +SQTT_VERSION_NONE = 0 +SQTT_VERSION_2_2 = 5 +SQTT_VERSION_2_3 = 6 +SQTT_VERSION_2_4 = 7 +SQTT_VERSION_3_2 = 11 +sqtt_version = ctypes.c_uint32 # enum + +# values for enumeration 'sqtt_file_chunk_type' +sqtt_file_chunk_type__enumvalues = { + 0: 'SQTT_FILE_CHUNK_TYPE_ASIC_INFO', + 1: 'SQTT_FILE_CHUNK_TYPE_SQTT_DESC', + 2: 'SQTT_FILE_CHUNK_TYPE_SQTT_DATA', + 3: 'SQTT_FILE_CHUNK_TYPE_API_INFO', + 4: 'SQTT_FILE_CHUNK_TYPE_RESERVED', + 5: 'SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS', 
+ 6: 'SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION', + 7: 'SQTT_FILE_CHUNK_TYPE_CPU_INFO', + 8: 'SQTT_FILE_CHUNK_TYPE_SPM_DB', + 9: 'SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE', + 10: 'SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS', + 11: 'SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION', + 12: 'SQTT_FILE_CHUNK_TYPE_INSTRUMENTATION_TABLE', + 13: 'SQTT_FILE_CHUNK_TYPE_COUNT', +} +SQTT_FILE_CHUNK_TYPE_ASIC_INFO = 0 +SQTT_FILE_CHUNK_TYPE_SQTT_DESC = 1 +SQTT_FILE_CHUNK_TYPE_SQTT_DATA = 2 +SQTT_FILE_CHUNK_TYPE_API_INFO = 3 +SQTT_FILE_CHUNK_TYPE_RESERVED = 4 +SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS = 5 +SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION = 6 +SQTT_FILE_CHUNK_TYPE_CPU_INFO = 7 +SQTT_FILE_CHUNK_TYPE_SPM_DB = 8 +SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE = 9 +SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS = 10 +SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION = 11 +SQTT_FILE_CHUNK_TYPE_INSTRUMENTATION_TABLE = 12 +SQTT_FILE_CHUNK_TYPE_COUNT = 13 +sqtt_file_chunk_type = ctypes.c_uint32 # enum +class struct_sqtt_file_chunk_id(Structure): + pass + +struct_sqtt_file_chunk_id._pack_ = 1 # source:False +struct_sqtt_file_chunk_id._fields_ = [ + ('type', ctypes.c_int32, 8), + ('index', ctypes.c_int32, 8), + ('reserved', ctypes.c_int32, 16), +] + +class struct_sqtt_file_chunk_header(Structure): + pass + +struct_sqtt_file_chunk_header._pack_ = 1 # source:False +struct_sqtt_file_chunk_header._fields_ = [ + ('chunk_id', struct_sqtt_file_chunk_id), + ('minor_version', ctypes.c_uint16), + ('major_version', ctypes.c_uint16), + ('size_in_bytes', ctypes.c_int32), + ('padding', ctypes.c_int32), +] + +class struct_sqtt_file_header_flags(Structure): + pass + +class union_sqtt_file_header_flags_0(Union): + pass + +class struct_sqtt_file_header_flags_0_0(Structure): + pass + +struct_sqtt_file_header_flags_0_0._pack_ = 1 # source:False +struct_sqtt_file_header_flags_0_0._fields_ = [ + ('is_semaphore_queue_timing_etw', ctypes.c_uint32, 1), + ('no_queue_semaphore_timestamps', ctypes.c_uint32, 1), + ('reserved', 
ctypes.c_uint32, 30), +] + +union_sqtt_file_header_flags_0._pack_ = 1 # source:False +union_sqtt_file_header_flags_0._anonymous_ = ('_0',) +union_sqtt_file_header_flags_0._fields_ = [ + ('_0', struct_sqtt_file_header_flags_0_0), + ('value', ctypes.c_uint32), +] + +struct_sqtt_file_header_flags._pack_ = 1 # source:False +struct_sqtt_file_header_flags._anonymous_ = ('_0',) +struct_sqtt_file_header_flags._fields_ = [ + ('_0', union_sqtt_file_header_flags_0), +] + +class struct_sqtt_file_header(Structure): + pass + +struct_sqtt_file_header._pack_ = 1 # source:False +struct_sqtt_file_header._fields_ = [ + ('magic_number', ctypes.c_uint32), + ('version_major', ctypes.c_uint32), + ('version_minor', ctypes.c_uint32), + ('flags', struct_sqtt_file_header_flags), + ('chunk_offset', ctypes.c_int32), + ('second', ctypes.c_int32), + ('minute', ctypes.c_int32), + ('hour', ctypes.c_int32), + ('day_in_month', ctypes.c_int32), + ('month', ctypes.c_int32), + ('year', ctypes.c_int32), + ('day_in_week', ctypes.c_int32), + ('day_in_year', ctypes.c_int32), + ('is_daylight_savings', ctypes.c_int32), +] + +class struct_sqtt_file_chunk_cpu_info(Structure): + pass + +struct_sqtt_file_chunk_cpu_info._pack_ = 1 # source:False +struct_sqtt_file_chunk_cpu_info._fields_ = [ + ('header', struct_sqtt_file_chunk_header), + ('vendor_id', ctypes.c_uint32 * 4), + ('processor_brand', ctypes.c_uint32 * 12), + ('reserved', ctypes.c_uint32 * 2), + ('cpu_timestamp_freq', ctypes.c_uint64), + ('clock_speed', ctypes.c_uint32), + ('num_logical_cores', ctypes.c_uint32), + ('num_physical_cores', ctypes.c_uint32), + ('system_ram_size', ctypes.c_uint32), +] + + +# values for enumeration 'sqtt_file_chunk_asic_info_flags' +sqtt_file_chunk_asic_info_flags__enumvalues = { + 1: 'SQTT_FILE_CHUNK_ASIC_INFO_FLAG_SC_PACKER_NUMBERING', + 2: 'SQTT_FILE_CHUNK_ASIC_INFO_FLAG_PS1_EVENT_TOKENS_ENABLED', +} +SQTT_FILE_CHUNK_ASIC_INFO_FLAG_SC_PACKER_NUMBERING = 1 +SQTT_FILE_CHUNK_ASIC_INFO_FLAG_PS1_EVENT_TOKENS_ENABLED = 2 
+sqtt_file_chunk_asic_info_flags = ctypes.c_uint32 # enum + +# values for enumeration 'sqtt_gpu_type' +sqtt_gpu_type__enumvalues = { + 0: 'SQTT_GPU_TYPE_UNKNOWN', + 1: 'SQTT_GPU_TYPE_INTEGRATED', + 2: 'SQTT_GPU_TYPE_DISCRETE', + 3: 'SQTT_GPU_TYPE_VIRTUAL', +} +SQTT_GPU_TYPE_UNKNOWN = 0 +SQTT_GPU_TYPE_INTEGRATED = 1 +SQTT_GPU_TYPE_DISCRETE = 2 +SQTT_GPU_TYPE_VIRTUAL = 3 +sqtt_gpu_type = ctypes.c_uint32 # enum + +# values for enumeration 'sqtt_gfxip_level' +sqtt_gfxip_level__enumvalues = { + 0: 'SQTT_GFXIP_LEVEL_NONE', + 1: 'SQTT_GFXIP_LEVEL_GFXIP_6', + 2: 'SQTT_GFXIP_LEVEL_GFXIP_7', + 3: 'SQTT_GFXIP_LEVEL_GFXIP_8', + 4: 'SQTT_GFXIP_LEVEL_GFXIP_8_1', + 5: 'SQTT_GFXIP_LEVEL_GFXIP_9', + 7: 'SQTT_GFXIP_LEVEL_GFXIP_10_1', + 9: 'SQTT_GFXIP_LEVEL_GFXIP_10_3', + 12: 'SQTT_GFXIP_LEVEL_GFXIP_11_0', +} +SQTT_GFXIP_LEVEL_NONE = 0 +SQTT_GFXIP_LEVEL_GFXIP_6 = 1 +SQTT_GFXIP_LEVEL_GFXIP_7 = 2 +SQTT_GFXIP_LEVEL_GFXIP_8 = 3 +SQTT_GFXIP_LEVEL_GFXIP_8_1 = 4 +SQTT_GFXIP_LEVEL_GFXIP_9 = 5 +SQTT_GFXIP_LEVEL_GFXIP_10_1 = 7 +SQTT_GFXIP_LEVEL_GFXIP_10_3 = 9 +SQTT_GFXIP_LEVEL_GFXIP_11_0 = 12 +sqtt_gfxip_level = ctypes.c_uint32 # enum + +# values for enumeration 'sqtt_memory_type' +sqtt_memory_type__enumvalues = { + 0: 'SQTT_MEMORY_TYPE_UNKNOWN', + 1: 'SQTT_MEMORY_TYPE_DDR', + 2: 'SQTT_MEMORY_TYPE_DDR2', + 3: 'SQTT_MEMORY_TYPE_DDR3', + 4: 'SQTT_MEMORY_TYPE_DDR4', + 5: 'SQTT_MEMORY_TYPE_DDR5', + 16: 'SQTT_MEMORY_TYPE_GDDR3', + 17: 'SQTT_MEMORY_TYPE_GDDR4', + 18: 'SQTT_MEMORY_TYPE_GDDR5', + 19: 'SQTT_MEMORY_TYPE_GDDR6', + 32: 'SQTT_MEMORY_TYPE_HBM', + 33: 'SQTT_MEMORY_TYPE_HBM2', + 34: 'SQTT_MEMORY_TYPE_HBM3', + 48: 'SQTT_MEMORY_TYPE_LPDDR4', + 49: 'SQTT_MEMORY_TYPE_LPDDR5', +} +SQTT_MEMORY_TYPE_UNKNOWN = 0 +SQTT_MEMORY_TYPE_DDR = 1 +SQTT_MEMORY_TYPE_DDR2 = 2 +SQTT_MEMORY_TYPE_DDR3 = 3 +SQTT_MEMORY_TYPE_DDR4 = 4 +SQTT_MEMORY_TYPE_DDR5 = 5 +SQTT_MEMORY_TYPE_GDDR3 = 16 +SQTT_MEMORY_TYPE_GDDR4 = 17 +SQTT_MEMORY_TYPE_GDDR5 = 18 +SQTT_MEMORY_TYPE_GDDR6 = 19 +SQTT_MEMORY_TYPE_HBM = 32 
+SQTT_MEMORY_TYPE_HBM2 = 33 +SQTT_MEMORY_TYPE_HBM3 = 34 +SQTT_MEMORY_TYPE_LPDDR4 = 48 +SQTT_MEMORY_TYPE_LPDDR5 = 49 +sqtt_memory_type = ctypes.c_uint32 # enum +class struct_sqtt_file_chunk_asic_info(Structure): + pass + +struct_sqtt_file_chunk_asic_info._pack_ = 1 # source:False +struct_sqtt_file_chunk_asic_info._fields_ = [ + ('header', struct_sqtt_file_chunk_header), + ('flags', ctypes.c_uint64), + ('trace_shader_core_clock', ctypes.c_uint64), + ('trace_memory_clock', ctypes.c_uint64), + ('device_id', ctypes.c_int32), + ('device_revision_id', ctypes.c_int32), + ('vgprs_per_simd', ctypes.c_int32), + ('sgprs_per_simd', ctypes.c_int32), + ('shader_engines', ctypes.c_int32), + ('compute_unit_per_shader_engine', ctypes.c_int32), + ('simd_per_compute_unit', ctypes.c_int32), + ('wavefronts_per_simd', ctypes.c_int32), + ('minimum_vgpr_alloc', ctypes.c_int32), + ('vgpr_alloc_granularity', ctypes.c_int32), + ('minimum_sgpr_alloc', ctypes.c_int32), + ('sgpr_alloc_granularity', ctypes.c_int32), + ('hardware_contexts', ctypes.c_int32), + ('gpu_type', sqtt_gpu_type), + ('gfxip_level', sqtt_gfxip_level), + ('gpu_index', ctypes.c_int32), + ('gds_size', ctypes.c_int32), + ('gds_per_shader_engine', ctypes.c_int32), + ('ce_ram_size', ctypes.c_int32), + ('ce_ram_size_graphics', ctypes.c_int32), + ('ce_ram_size_compute', ctypes.c_int32), + ('max_number_of_dedicated_cus', ctypes.c_int32), + ('vram_size', ctypes.c_int64), + ('vram_bus_width', ctypes.c_int32), + ('l2_cache_size', ctypes.c_int32), + ('l1_cache_size', ctypes.c_int32), + ('lds_size', ctypes.c_int32), + ('gpu_name', ctypes.c_char * 256), + ('alu_per_clock', ctypes.c_float), + ('texture_per_clock', ctypes.c_float), + ('prims_per_clock', ctypes.c_float), + ('pixels_per_clock', ctypes.c_float), + ('gpu_timestamp_frequency', ctypes.c_uint64), + ('max_shader_core_clock', ctypes.c_uint64), + ('max_memory_clock', ctypes.c_uint64), + ('memory_ops_per_clock', ctypes.c_uint32), + ('memory_chip_type', sqtt_memory_type), + 
('lds_granularity', ctypes.c_uint32), + ('cu_mask', ctypes.c_uint16 * 2 * 32), + ('reserved1', ctypes.c_char * 128), + ('active_pixel_packer_mask', ctypes.c_uint32 * 4), + ('reserved2', ctypes.c_char * 16), + ('gl1_cache_size', ctypes.c_uint32), + ('instruction_cache_size', ctypes.c_uint32), + ('scalar_cache_size', ctypes.c_uint32), + ('mall_cache_size', ctypes.c_uint32), + ('padding', ctypes.c_char * 4), +] + + +# values for enumeration 'sqtt_api_type' +sqtt_api_type__enumvalues = { + 0: 'SQTT_API_TYPE_DIRECTX_12', + 1: 'SQTT_API_TYPE_VULKAN', + 2: 'SQTT_API_TYPE_GENERIC', + 3: 'SQTT_API_TYPE_OPENCL', +} +SQTT_API_TYPE_DIRECTX_12 = 0 +SQTT_API_TYPE_VULKAN = 1 +SQTT_API_TYPE_GENERIC = 2 +SQTT_API_TYPE_OPENCL = 3 +sqtt_api_type = ctypes.c_uint32 # enum + +# values for enumeration 'sqtt_instruction_trace_mode' +sqtt_instruction_trace_mode__enumvalues = { + 0: 'SQTT_INSTRUCTION_TRACE_DISABLED', + 1: 'SQTT_INSTRUCTION_TRACE_FULL_FRAME', + 2: 'SQTT_INSTRUCTION_TRACE_API_PSO', +} +SQTT_INSTRUCTION_TRACE_DISABLED = 0 +SQTT_INSTRUCTION_TRACE_FULL_FRAME = 1 +SQTT_INSTRUCTION_TRACE_API_PSO = 2 +sqtt_instruction_trace_mode = ctypes.c_uint32 # enum + +# values for enumeration 'sqtt_profiling_mode' +sqtt_profiling_mode__enumvalues = { + 0: 'SQTT_PROFILING_MODE_PRESENT', + 1: 'SQTT_PROFILING_MODE_USER_MARKERS', + 2: 'SQTT_PROFILING_MODE_INDEX', + 3: 'SQTT_PROFILING_MODE_TAG', +} +SQTT_PROFILING_MODE_PRESENT = 0 +SQTT_PROFILING_MODE_USER_MARKERS = 1 +SQTT_PROFILING_MODE_INDEX = 2 +SQTT_PROFILING_MODE_TAG = 3 +sqtt_profiling_mode = ctypes.c_uint32 # enum +class union_sqtt_profiling_mode_data(Union): + pass + +class struct_sqtt_profiling_mode_data_user_marker_profiling_data(Structure): + pass + +struct_sqtt_profiling_mode_data_user_marker_profiling_data._pack_ = 1 # source:False +struct_sqtt_profiling_mode_data_user_marker_profiling_data._fields_ = [ + ('start', ctypes.c_char * 256), + ('end', ctypes.c_char * 256), +] + +class 
struct_sqtt_profiling_mode_data_index_profiling_data(Structure): + pass + +struct_sqtt_profiling_mode_data_index_profiling_data._pack_ = 1 # source:False +struct_sqtt_profiling_mode_data_index_profiling_data._fields_ = [ + ('start', ctypes.c_uint32), + ('end', ctypes.c_uint32), +] + +class struct_sqtt_profiling_mode_data_tag_profiling_data(Structure): + pass + +struct_sqtt_profiling_mode_data_tag_profiling_data._pack_ = 1 # source:False +struct_sqtt_profiling_mode_data_tag_profiling_data._fields_ = [ + ('begin_hi', ctypes.c_uint32), + ('begin_lo', ctypes.c_uint32), + ('end_hi', ctypes.c_uint32), + ('end_lo', ctypes.c_uint32), +] + +union_sqtt_profiling_mode_data._pack_ = 1 # source:False +union_sqtt_profiling_mode_data._fields_ = [ + ('user_marker_profiling_data', struct_sqtt_profiling_mode_data_user_marker_profiling_data), + ('index_profiling_data', struct_sqtt_profiling_mode_data_index_profiling_data), + ('tag_profiling_data', struct_sqtt_profiling_mode_data_tag_profiling_data), + ('PADDING_0', ctypes.c_ubyte * 496), +] + +class union_sqtt_instruction_trace_data(Union): + pass + +class struct_sqtt_instruction_trace_data_api_pso_data(Structure): + pass + +struct_sqtt_instruction_trace_data_api_pso_data._pack_ = 1 # source:False +struct_sqtt_instruction_trace_data_api_pso_data._fields_ = [ + ('api_pso_filter', ctypes.c_uint64), +] + +class struct_sqtt_instruction_trace_data_shader_engine_filter(Structure): + pass + +struct_sqtt_instruction_trace_data_shader_engine_filter._pack_ = 1 # source:False +struct_sqtt_instruction_trace_data_shader_engine_filter._fields_ = [ + ('mask', ctypes.c_uint32), +] + +union_sqtt_instruction_trace_data._pack_ = 1 # source:False +union_sqtt_instruction_trace_data._fields_ = [ + ('api_pso_data', struct_sqtt_instruction_trace_data_api_pso_data), + ('shader_engine_filter', struct_sqtt_instruction_trace_data_shader_engine_filter), + ('PADDING_0', ctypes.c_ubyte * 4), +] + +class struct_sqtt_file_chunk_api_info(Structure): + pass + 
+struct_sqtt_file_chunk_api_info._pack_ = 1 # source:False +struct_sqtt_file_chunk_api_info._fields_ = [ + ('header', struct_sqtt_file_chunk_header), + ('api_type', sqtt_api_type), + ('major_version', ctypes.c_uint16), + ('minor_version', ctypes.c_uint16), + ('profiling_mode', sqtt_profiling_mode), + ('reserved', ctypes.c_uint32), + ('profiling_mode_data', union_sqtt_profiling_mode_data), + ('instruction_trace_mode', sqtt_instruction_trace_mode), + ('reserved2', ctypes.c_uint32), + ('instruction_trace_data', union_sqtt_instruction_trace_data), +] + +class struct_sqtt_code_object_database_record(Structure): + pass + +struct_sqtt_code_object_database_record._pack_ = 1 # source:False +struct_sqtt_code_object_database_record._fields_ = [ + ('size', ctypes.c_uint32), +] + +class struct_sqtt_file_chunk_code_object_database(Structure): + pass + +struct_sqtt_file_chunk_code_object_database._pack_ = 1 # source:False +struct_sqtt_file_chunk_code_object_database._fields_ = [ + ('header', struct_sqtt_file_chunk_header), + ('offset', ctypes.c_uint32), + ('flags', ctypes.c_uint32), + ('size', ctypes.c_uint32), + ('record_count', ctypes.c_uint32), +] + +class struct_sqtt_code_object_loader_events_record(Structure): + pass + +struct_sqtt_code_object_loader_events_record._pack_ = 1 # source:False +struct_sqtt_code_object_loader_events_record._fields_ = [ + ('loader_event_type', ctypes.c_uint32), + ('reserved', ctypes.c_uint32), + ('base_address', ctypes.c_uint64), + ('code_object_hash', ctypes.c_uint64 * 2), + ('time_stamp', ctypes.c_uint64), +] + +class struct_sqtt_file_chunk_code_object_loader_events(Structure): + pass + +struct_sqtt_file_chunk_code_object_loader_events._pack_ = 1 # source:False +struct_sqtt_file_chunk_code_object_loader_events._fields_ = [ + ('header', struct_sqtt_file_chunk_header), + ('offset', ctypes.c_uint32), + ('flags', ctypes.c_uint32), + ('record_size', ctypes.c_uint32), + ('record_count', ctypes.c_uint32), +] + +class 
struct_sqtt_pso_correlation_record(Structure): + pass + +struct_sqtt_pso_correlation_record._pack_ = 1 # source:False +struct_sqtt_pso_correlation_record._fields_ = [ + ('api_pso_hash', ctypes.c_uint64), + ('pipeline_hash', ctypes.c_uint64 * 2), + ('api_level_obj_name', ctypes.c_char * 64), +] + +class struct_sqtt_file_chunk_pso_correlation(Structure): + pass + +struct_sqtt_file_chunk_pso_correlation._pack_ = 1 # source:False +struct_sqtt_file_chunk_pso_correlation._fields_ = [ + ('header', struct_sqtt_file_chunk_header), + ('offset', ctypes.c_uint32), + ('flags', ctypes.c_uint32), + ('record_size', ctypes.c_uint32), + ('record_count', ctypes.c_uint32), +] + +class struct_sqtt_file_chunk_sqtt_desc(Structure): + pass + +class union_sqtt_file_chunk_sqtt_desc_0(Union): + pass + +class struct_sqtt_file_chunk_sqtt_desc_0_v0(Structure): + pass + +struct_sqtt_file_chunk_sqtt_desc_0_v0._pack_ = 1 # source:False +struct_sqtt_file_chunk_sqtt_desc_0_v0._fields_ = [ + ('instrumentation_version', ctypes.c_int32), +] + +class struct_sqtt_file_chunk_sqtt_desc_0_v1(Structure): + pass + +struct_sqtt_file_chunk_sqtt_desc_0_v1._pack_ = 1 # source:False +struct_sqtt_file_chunk_sqtt_desc_0_v1._fields_ = [ + ('instrumentation_spec_version', ctypes.c_int16), + ('instrumentation_api_version', ctypes.c_int16), + ('compute_unit_index', ctypes.c_int32), +] + +union_sqtt_file_chunk_sqtt_desc_0._pack_ = 1 # source:False +union_sqtt_file_chunk_sqtt_desc_0._fields_ = [ + ('v0', struct_sqtt_file_chunk_sqtt_desc_0_v0), + ('v1', struct_sqtt_file_chunk_sqtt_desc_0_v1), +] + +struct_sqtt_file_chunk_sqtt_desc._pack_ = 1 # source:False +struct_sqtt_file_chunk_sqtt_desc._anonymous_ = ('_0',) +struct_sqtt_file_chunk_sqtt_desc._fields_ = [ + ('header', struct_sqtt_file_chunk_header), + ('shader_engine_index', ctypes.c_int32), + ('sqtt_version', sqtt_version), + ('_0', union_sqtt_file_chunk_sqtt_desc_0), +] + +class struct_sqtt_file_chunk_sqtt_data(Structure): + pass + 
+struct_sqtt_file_chunk_sqtt_data._pack_ = 1 # source:False +struct_sqtt_file_chunk_sqtt_data._fields_ = [ + ('header', struct_sqtt_file_chunk_header), + ('offset', ctypes.c_int32), + ('size', ctypes.c_int32), +] + +class struct_sqtt_file_chunk_queue_event_timings(Structure): + pass + +struct_sqtt_file_chunk_queue_event_timings._pack_ = 1 # source:False +struct_sqtt_file_chunk_queue_event_timings._fields_ = [ + ('header', struct_sqtt_file_chunk_header), + ('queue_info_table_record_count', ctypes.c_uint32), + ('queue_info_table_size', ctypes.c_uint32), + ('queue_event_table_record_count', ctypes.c_uint32), + ('queue_event_table_size', ctypes.c_uint32), +] + + +# values for enumeration 'sqtt_queue_type' +sqtt_queue_type__enumvalues = { + 0: 'SQTT_QUEUE_TYPE_UNKNOWN', + 1: 'SQTT_QUEUE_TYPE_UNIVERSAL', + 2: 'SQTT_QUEUE_TYPE_COMPUTE', + 3: 'SQTT_QUEUE_TYPE_DMA', +} +SQTT_QUEUE_TYPE_UNKNOWN = 0 +SQTT_QUEUE_TYPE_UNIVERSAL = 1 +SQTT_QUEUE_TYPE_COMPUTE = 2 +SQTT_QUEUE_TYPE_DMA = 3 +sqtt_queue_type = ctypes.c_uint32 # enum + +# values for enumeration 'sqtt_engine_type' +sqtt_engine_type__enumvalues = { + 0: 'SQTT_ENGINE_TYPE_UNKNOWN', + 1: 'SQTT_ENGINE_TYPE_UNIVERSAL', + 2: 'SQTT_ENGINE_TYPE_COMPUTE', + 3: 'SQTT_ENGINE_TYPE_EXCLUSIVE_COMPUTE', + 4: 'SQTT_ENGINE_TYPE_DMA', + 7: 'SQTT_ENGINE_TYPE_HIGH_PRIORITY_UNIVERSAL', + 8: 'SQTT_ENGINE_TYPE_HIGH_PRIORITY_GRAPHICS', +} +SQTT_ENGINE_TYPE_UNKNOWN = 0 +SQTT_ENGINE_TYPE_UNIVERSAL = 1 +SQTT_ENGINE_TYPE_COMPUTE = 2 +SQTT_ENGINE_TYPE_EXCLUSIVE_COMPUTE = 3 +SQTT_ENGINE_TYPE_DMA = 4 +SQTT_ENGINE_TYPE_HIGH_PRIORITY_UNIVERSAL = 7 +SQTT_ENGINE_TYPE_HIGH_PRIORITY_GRAPHICS = 8 +sqtt_engine_type = ctypes.c_uint32 # enum +class struct_sqtt_queue_hardware_info(Structure): + pass + +class union_sqtt_queue_hardware_info_0(Union): + pass + +class struct_sqtt_queue_hardware_info_0_0(Structure): + pass + +struct_sqtt_queue_hardware_info_0_0._pack_ = 1 # source:False +struct_sqtt_queue_hardware_info_0_0._fields_ = [ + ('queue_type', 
ctypes.c_int32, 8), + ('engine_type', ctypes.c_int32, 8), + ('reserved', ctypes.c_int32, 16), +] + +union_sqtt_queue_hardware_info_0._pack_ = 1 # source:False +union_sqtt_queue_hardware_info_0._anonymous_ = ('_0',) +union_sqtt_queue_hardware_info_0._fields_ = [ + ('_0', struct_sqtt_queue_hardware_info_0_0), + ('value', ctypes.c_uint32), +] + +struct_sqtt_queue_hardware_info._pack_ = 1 # source:False +struct_sqtt_queue_hardware_info._anonymous_ = ('_0',) +struct_sqtt_queue_hardware_info._fields_ = [ + ('_0', union_sqtt_queue_hardware_info_0), +] + +class struct_sqtt_queue_info_record(Structure): + pass + +struct_sqtt_queue_info_record._pack_ = 1 # source:False +struct_sqtt_queue_info_record._fields_ = [ + ('queue_id', ctypes.c_uint64), + ('queue_context', ctypes.c_uint64), + ('hardware_info', struct_sqtt_queue_hardware_info), + ('reserved', ctypes.c_uint32), +] + + +# values for enumeration 'sqtt_queue_event_type' +sqtt_queue_event_type__enumvalues = { + 0: 'SQTT_QUEUE_TIMING_EVENT_CMDBUF_SUBMIT', + 1: 'SQTT_QUEUE_TIMING_EVENT_SIGNAL_SEMAPHORE', + 2: 'SQTT_QUEUE_TIMING_EVENT_WAIT_SEMAPHORE', + 3: 'SQTT_QUEUE_TIMING_EVENT_PRESENT', +} +SQTT_QUEUE_TIMING_EVENT_CMDBUF_SUBMIT = 0 +SQTT_QUEUE_TIMING_EVENT_SIGNAL_SEMAPHORE = 1 +SQTT_QUEUE_TIMING_EVENT_WAIT_SEMAPHORE = 2 +SQTT_QUEUE_TIMING_EVENT_PRESENT = 3 +sqtt_queue_event_type = ctypes.c_uint32 # enum +class struct_sqtt_queue_event_record(Structure): + pass + +struct_sqtt_queue_event_record._pack_ = 1 # source:False +struct_sqtt_queue_event_record._fields_ = [ + ('event_type', sqtt_queue_event_type), + ('sqtt_cb_id', ctypes.c_uint32), + ('frame_index', ctypes.c_uint64), + ('queue_info_index', ctypes.c_uint32), + ('submit_sub_index', ctypes.c_uint32), + ('api_id', ctypes.c_uint64), + ('cpu_timestamp', ctypes.c_uint64), + ('gpu_timestamps', ctypes.c_uint64 * 2), +] + +class struct_sqtt_file_chunk_clock_calibration(Structure): + pass + +struct_sqtt_file_chunk_clock_calibration._pack_ = 1 # source:False 
+struct_sqtt_file_chunk_clock_calibration._fields_ = [ + ('header', struct_sqtt_file_chunk_header), + ('cpu_timestamp', ctypes.c_uint64), + ('gpu_timestamp', ctypes.c_uint64), + ('reserved', ctypes.c_uint64), +] + + +# values for enumeration 'elf_gfxip_level' +elf_gfxip_level__enumvalues = { + 40: 'EF_AMDGPU_MACH_AMDGCN_GFX801', + 44: 'EF_AMDGPU_MACH_AMDGCN_GFX900', + 51: 'EF_AMDGPU_MACH_AMDGCN_GFX1010', + 54: 'EF_AMDGPU_MACH_AMDGCN_GFX1030', + 65: 'EF_AMDGPU_MACH_AMDGCN_GFX1100', +} +EF_AMDGPU_MACH_AMDGCN_GFX801 = 40 +EF_AMDGPU_MACH_AMDGCN_GFX900 = 44 +EF_AMDGPU_MACH_AMDGCN_GFX1010 = 51 +EF_AMDGPU_MACH_AMDGCN_GFX1030 = 54 +EF_AMDGPU_MACH_AMDGCN_GFX1100 = 65 +elf_gfxip_level = ctypes.c_uint32 # enum +class struct_sqtt_file_chunk_spm_db(Structure): + pass + +struct_sqtt_file_chunk_spm_db._pack_ = 1 # source:False +struct_sqtt_file_chunk_spm_db._fields_ = [ + ('header', struct_sqtt_file_chunk_header), + ('flags', ctypes.c_uint32), + ('preamble_size', ctypes.c_uint32), + ('num_timestamps', ctypes.c_uint32), + ('num_spm_counter_info', ctypes.c_uint32), + ('spm_counter_info_size', ctypes.c_uint32), + ('sample_interval', ctypes.c_uint32), +] + + +# values for enumeration 'rgp_sqtt_marker_identifier' +rgp_sqtt_marker_identifier__enumvalues = { + 0: 'RGP_SQTT_MARKER_IDENTIFIER_EVENT', + 1: 'RGP_SQTT_MARKER_IDENTIFIER_CB_START', + 2: 'RGP_SQTT_MARKER_IDENTIFIER_CB_END', + 3: 'RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START', + 4: 'RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END', + 5: 'RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT', + 6: 'RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API', + 7: 'RGP_SQTT_MARKER_IDENTIFIER_SYNC', + 8: 'RGP_SQTT_MARKER_IDENTIFIER_PRESENT', + 9: 'RGP_SQTT_MARKER_IDENTIFIER_LAYOUT_TRANSITION', + 10: 'RGP_SQTT_MARKER_IDENTIFIER_RENDER_PASS', + 11: 'RGP_SQTT_MARKER_IDENTIFIER_RESERVED2', + 12: 'RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE', + 13: 'RGP_SQTT_MARKER_IDENTIFIER_RESERVED4', + 14: 'RGP_SQTT_MARKER_IDENTIFIER_RESERVED5', + 15: 'RGP_SQTT_MARKER_IDENTIFIER_RESERVED6', +} 
+RGP_SQTT_MARKER_IDENTIFIER_EVENT = 0 +RGP_SQTT_MARKER_IDENTIFIER_CB_START = 1 +RGP_SQTT_MARKER_IDENTIFIER_CB_END = 2 +RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START = 3 +RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END = 4 +RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT = 5 +RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API = 6 +RGP_SQTT_MARKER_IDENTIFIER_SYNC = 7 +RGP_SQTT_MARKER_IDENTIFIER_PRESENT = 8 +RGP_SQTT_MARKER_IDENTIFIER_LAYOUT_TRANSITION = 9 +RGP_SQTT_MARKER_IDENTIFIER_RENDER_PASS = 10 +RGP_SQTT_MARKER_IDENTIFIER_RESERVED2 = 11 +RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE = 12 +RGP_SQTT_MARKER_IDENTIFIER_RESERVED4 = 13 +RGP_SQTT_MARKER_IDENTIFIER_RESERVED5 = 14 +RGP_SQTT_MARKER_IDENTIFIER_RESERVED6 = 15 +rgp_sqtt_marker_identifier = ctypes.c_uint32 # enum +class union_rgp_sqtt_marker_cb_id(Union): + pass + +class struct_rgp_sqtt_marker_cb_id_per_frame_cb_id(Structure): + pass + +struct_rgp_sqtt_marker_cb_id_per_frame_cb_id._pack_ = 1 # source:False +struct_rgp_sqtt_marker_cb_id_per_frame_cb_id._fields_ = [ + ('per_frame', ctypes.c_uint32, 1), + ('frame_index', ctypes.c_uint32, 7), + ('cb_index', ctypes.c_uint32, 12), + ('reserved', ctypes.c_uint32, 12), +] + +class struct_rgp_sqtt_marker_cb_id_global_cb_id(Structure): + pass + +struct_rgp_sqtt_marker_cb_id_global_cb_id._pack_ = 1 # source:False +struct_rgp_sqtt_marker_cb_id_global_cb_id._fields_ = [ + ('per_frame', ctypes.c_uint32, 1), + ('cb_index', ctypes.c_uint32, 19), + ('reserved', ctypes.c_uint32, 12), +] + +union_rgp_sqtt_marker_cb_id._pack_ = 1 # source:False +union_rgp_sqtt_marker_cb_id._fields_ = [ + ('per_frame_cb_id', struct_rgp_sqtt_marker_cb_id_per_frame_cb_id), + ('global_cb_id', struct_rgp_sqtt_marker_cb_id_global_cb_id), + ('all', ctypes.c_uint32), +] + +class struct_rgp_sqtt_marker_cb_start(Structure): + pass + +class union_rgp_sqtt_marker_cb_start_0(Union): + pass + +class struct_rgp_sqtt_marker_cb_start_0_0(Structure): + pass + +struct_rgp_sqtt_marker_cb_start_0_0._pack_ = 1 # source:False 
+struct_rgp_sqtt_marker_cb_start_0_0._fields_ = [ + ('identifier', ctypes.c_uint32, 4), + ('ext_dwords', ctypes.c_uint32, 3), + ('cb_id', ctypes.c_uint32, 20), + ('queue', ctypes.c_uint32, 5), +] + +union_rgp_sqtt_marker_cb_start_0._pack_ = 1 # source:False +union_rgp_sqtt_marker_cb_start_0._anonymous_ = ('_0',) +union_rgp_sqtt_marker_cb_start_0._fields_ = [ + ('_0', struct_rgp_sqtt_marker_cb_start_0_0), + ('dword01', ctypes.c_uint32), +] + +class union_rgp_sqtt_marker_cb_start_1(Union): + pass + +union_rgp_sqtt_marker_cb_start_1._pack_ = 1 # source:False +union_rgp_sqtt_marker_cb_start_1._fields_ = [ + ('device_id_low', ctypes.c_uint32), + ('dword02', ctypes.c_uint32), +] + +class union_rgp_sqtt_marker_cb_start_2(Union): + pass + +union_rgp_sqtt_marker_cb_start_2._pack_ = 1 # source:False +union_rgp_sqtt_marker_cb_start_2._fields_ = [ + ('device_id_high', ctypes.c_uint32), + ('dword03', ctypes.c_uint32), +] + +class union_rgp_sqtt_marker_cb_start_3(Union): + pass + +union_rgp_sqtt_marker_cb_start_3._pack_ = 1 # source:False +union_rgp_sqtt_marker_cb_start_3._fields_ = [ + ('queue_flags', ctypes.c_uint32), + ('dword04', ctypes.c_uint32), +] + +struct_rgp_sqtt_marker_cb_start._pack_ = 1 # source:False +struct_rgp_sqtt_marker_cb_start._anonymous_ = ('_0', '_1', '_2', '_3',) +struct_rgp_sqtt_marker_cb_start._fields_ = [ + ('_0', union_rgp_sqtt_marker_cb_start_0), + ('_1', union_rgp_sqtt_marker_cb_start_1), + ('_2', union_rgp_sqtt_marker_cb_start_2), + ('_3', union_rgp_sqtt_marker_cb_start_3), +] + +class struct_rgp_sqtt_marker_cb_end(Structure): + pass + +class union_rgp_sqtt_marker_cb_end_0(Union): + pass + +class struct_rgp_sqtt_marker_cb_end_0_0(Structure): + pass + +struct_rgp_sqtt_marker_cb_end_0_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_cb_end_0_0._fields_ = [ + ('identifier', ctypes.c_uint32, 4), + ('ext_dwords', ctypes.c_uint32, 3), + ('cb_id', ctypes.c_uint32, 20), + ('reserved', ctypes.c_uint32, 5), +] + +union_rgp_sqtt_marker_cb_end_0._pack_ = 1 # 
source:False +union_rgp_sqtt_marker_cb_end_0._anonymous_ = ('_0',) +union_rgp_sqtt_marker_cb_end_0._fields_ = [ + ('_0', struct_rgp_sqtt_marker_cb_end_0_0), + ('dword01', ctypes.c_uint32), +] + +class union_rgp_sqtt_marker_cb_end_1(Union): + pass + +union_rgp_sqtt_marker_cb_end_1._pack_ = 1 # source:False +union_rgp_sqtt_marker_cb_end_1._fields_ = [ + ('device_id_low', ctypes.c_uint32), + ('dword02', ctypes.c_uint32), +] + +class union_rgp_sqtt_marker_cb_end_2(Union): + pass + +union_rgp_sqtt_marker_cb_end_2._pack_ = 1 # source:False +union_rgp_sqtt_marker_cb_end_2._fields_ = [ + ('device_id_high', ctypes.c_uint32), + ('dword03', ctypes.c_uint32), +] + +struct_rgp_sqtt_marker_cb_end._pack_ = 1 # source:False +struct_rgp_sqtt_marker_cb_end._anonymous_ = ('_0', '_1', '_2',) +struct_rgp_sqtt_marker_cb_end._fields_ = [ + ('_0', union_rgp_sqtt_marker_cb_end_0), + ('_1', union_rgp_sqtt_marker_cb_end_1), + ('_2', union_rgp_sqtt_marker_cb_end_2), +] + + +# values for enumeration 'rgp_sqtt_marker_general_api_type' +rgp_sqtt_marker_general_api_type__enumvalues = { + 0: 'ApiCmdBindPipeline', + 1: 'ApiCmdBindDescriptorSets', + 2: 'ApiCmdBindIndexBuffer', + 3: 'ApiCmdBindVertexBuffers', + 4: 'ApiCmdDraw', + 5: 'ApiCmdDrawIndexed', + 6: 'ApiCmdDrawIndirect', + 7: 'ApiCmdDrawIndexedIndirect', + 8: 'ApiCmdDrawIndirectCountAMD', + 9: 'ApiCmdDrawIndexedIndirectCountAMD', + 10: 'ApiCmdDispatch', + 11: 'ApiCmdDispatchIndirect', + 12: 'ApiCmdCopyBuffer', + 13: 'ApiCmdCopyImage', + 14: 'ApiCmdBlitImage', + 15: 'ApiCmdCopyBufferToImage', + 16: 'ApiCmdCopyImageToBuffer', + 17: 'ApiCmdUpdateBuffer', + 18: 'ApiCmdFillBuffer', + 19: 'ApiCmdClearColorImage', + 20: 'ApiCmdClearDepthStencilImage', + 21: 'ApiCmdClearAttachments', + 22: 'ApiCmdResolveImage', + 23: 'ApiCmdWaitEvents', + 24: 'ApiCmdPipelineBarrier', + 25: 'ApiCmdBeginQuery', + 26: 'ApiCmdEndQuery', + 27: 'ApiCmdResetQueryPool', + 28: 'ApiCmdWriteTimestamp', + 29: 'ApiCmdCopyQueryPoolResults', + 30: 'ApiCmdPushConstants', + 31: 
'ApiCmdBeginRenderPass', + 32: 'ApiCmdNextSubpass', + 33: 'ApiCmdEndRenderPass', + 34: 'ApiCmdExecuteCommands', + 35: 'ApiCmdSetViewport', + 36: 'ApiCmdSetScissor', + 37: 'ApiCmdSetLineWidth', + 38: 'ApiCmdSetDepthBias', + 39: 'ApiCmdSetBlendConstants', + 40: 'ApiCmdSetDepthBounds', + 41: 'ApiCmdSetStencilCompareMask', + 42: 'ApiCmdSetStencilWriteMask', + 43: 'ApiCmdSetStencilReference', + 44: 'ApiCmdDrawIndirectCount', + 45: 'ApiCmdDrawIndexedIndirectCount', + 47: 'ApiCmdDrawMeshTasksEXT', + 48: 'ApiCmdDrawMeshTasksIndirectCountEXT', + 49: 'ApiCmdDrawMeshTasksIndirectEXT', + 8388608: 'ApiRayTracingSeparateCompiled', + 4294967295: 'ApiInvalid', +} +ApiCmdBindPipeline = 0 +ApiCmdBindDescriptorSets = 1 +ApiCmdBindIndexBuffer = 2 +ApiCmdBindVertexBuffers = 3 +ApiCmdDraw = 4 +ApiCmdDrawIndexed = 5 +ApiCmdDrawIndirect = 6 +ApiCmdDrawIndexedIndirect = 7 +ApiCmdDrawIndirectCountAMD = 8 +ApiCmdDrawIndexedIndirectCountAMD = 9 +ApiCmdDispatch = 10 +ApiCmdDispatchIndirect = 11 +ApiCmdCopyBuffer = 12 +ApiCmdCopyImage = 13 +ApiCmdBlitImage = 14 +ApiCmdCopyBufferToImage = 15 +ApiCmdCopyImageToBuffer = 16 +ApiCmdUpdateBuffer = 17 +ApiCmdFillBuffer = 18 +ApiCmdClearColorImage = 19 +ApiCmdClearDepthStencilImage = 20 +ApiCmdClearAttachments = 21 +ApiCmdResolveImage = 22 +ApiCmdWaitEvents = 23 +ApiCmdPipelineBarrier = 24 +ApiCmdBeginQuery = 25 +ApiCmdEndQuery = 26 +ApiCmdResetQueryPool = 27 +ApiCmdWriteTimestamp = 28 +ApiCmdCopyQueryPoolResults = 29 +ApiCmdPushConstants = 30 +ApiCmdBeginRenderPass = 31 +ApiCmdNextSubpass = 32 +ApiCmdEndRenderPass = 33 +ApiCmdExecuteCommands = 34 +ApiCmdSetViewport = 35 +ApiCmdSetScissor = 36 +ApiCmdSetLineWidth = 37 +ApiCmdSetDepthBias = 38 +ApiCmdSetBlendConstants = 39 +ApiCmdSetDepthBounds = 40 +ApiCmdSetStencilCompareMask = 41 +ApiCmdSetStencilWriteMask = 42 +ApiCmdSetStencilReference = 43 +ApiCmdDrawIndirectCount = 44 +ApiCmdDrawIndexedIndirectCount = 45 +ApiCmdDrawMeshTasksEXT = 47 +ApiCmdDrawMeshTasksIndirectCountEXT = 48 
+ApiCmdDrawMeshTasksIndirectEXT = 49 +ApiRayTracingSeparateCompiled = 8388608 +ApiInvalid = 4294967295 +rgp_sqtt_marker_general_api_type = ctypes.c_uint32 # enum +class struct_rgp_sqtt_marker_general_api(Structure): + pass + +class union_rgp_sqtt_marker_general_api_0(Union): + pass + +class struct_rgp_sqtt_marker_general_api_0_0(Structure): + pass + +struct_rgp_sqtt_marker_general_api_0_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_general_api_0_0._fields_ = [ + ('identifier', ctypes.c_uint32, 4), + ('ext_dwords', ctypes.c_uint32, 3), + ('api_type', ctypes.c_uint32, 20), + ('is_end', ctypes.c_uint32, 1), + ('reserved', ctypes.c_uint32, 4), +] + +union_rgp_sqtt_marker_general_api_0._pack_ = 1 # source:False +union_rgp_sqtt_marker_general_api_0._anonymous_ = ('_0',) +union_rgp_sqtt_marker_general_api_0._fields_ = [ + ('_0', struct_rgp_sqtt_marker_general_api_0_0), + ('dword01', ctypes.c_uint32), +] + +struct_rgp_sqtt_marker_general_api._pack_ = 1 # source:False +struct_rgp_sqtt_marker_general_api._anonymous_ = ('_0',) +struct_rgp_sqtt_marker_general_api._fields_ = [ + ('_0', union_rgp_sqtt_marker_general_api_0), +] + + +# values for enumeration 'rgp_sqtt_marker_event_type' +rgp_sqtt_marker_event_type__enumvalues = { + 0: 'EventCmdDraw', + 1: 'EventCmdDrawIndexed', + 2: 'EventCmdDrawIndirect', + 3: 'EventCmdDrawIndexedIndirect', + 4: 'EventCmdDrawIndirectCountAMD', + 5: 'EventCmdDrawIndexedIndirectCountAMD', + 6: 'EventCmdDispatch', + 7: 'EventCmdDispatchIndirect', + 8: 'EventCmdCopyBuffer', + 9: 'EventCmdCopyImage', + 10: 'EventCmdBlitImage', + 11: 'EventCmdCopyBufferToImage', + 12: 'EventCmdCopyImageToBuffer', + 13: 'EventCmdUpdateBuffer', + 14: 'EventCmdFillBuffer', + 15: 'EventCmdClearColorImage', + 16: 'EventCmdClearDepthStencilImage', + 17: 'EventCmdClearAttachments', + 18: 'EventCmdResolveImage', + 19: 'EventCmdWaitEvents', + 20: 'EventCmdPipelineBarrier', + 21: 'EventCmdResetQueryPool', + 22: 'EventCmdCopyQueryPoolResults', + 23: 
'EventRenderPassColorClear', + 24: 'EventRenderPassDepthStencilClear', + 25: 'EventRenderPassResolve', + 26: 'EventInternalUnknown', + 27: 'EventCmdDrawIndirectCount', + 28: 'EventCmdDrawIndexedIndirectCount', + 30: 'EventCmdTraceRaysKHR', + 31: 'EventCmdTraceRaysIndirectKHR', + 32: 'EventCmdBuildAccelerationStructuresKHR', + 33: 'EventCmdBuildAccelerationStructuresIndirectKHR', + 34: 'EventCmdCopyAccelerationStructureKHR', + 35: 'EventCmdCopyAccelerationStructureToMemoryKHR', + 36: 'EventCmdCopyMemoryToAccelerationStructureKHR', + 41: 'EventCmdDrawMeshTasksEXT', + 42: 'EventCmdDrawMeshTasksIndirectCountEXT', + 43: 'EventCmdDrawMeshTasksIndirectEXT', + 32767: 'EventUnknown', + 4294967295: 'EventInvalid', +} +EventCmdDraw = 0 +EventCmdDrawIndexed = 1 +EventCmdDrawIndirect = 2 +EventCmdDrawIndexedIndirect = 3 +EventCmdDrawIndirectCountAMD = 4 +EventCmdDrawIndexedIndirectCountAMD = 5 +EventCmdDispatch = 6 +EventCmdDispatchIndirect = 7 +EventCmdCopyBuffer = 8 +EventCmdCopyImage = 9 +EventCmdBlitImage = 10 +EventCmdCopyBufferToImage = 11 +EventCmdCopyImageToBuffer = 12 +EventCmdUpdateBuffer = 13 +EventCmdFillBuffer = 14 +EventCmdClearColorImage = 15 +EventCmdClearDepthStencilImage = 16 +EventCmdClearAttachments = 17 +EventCmdResolveImage = 18 +EventCmdWaitEvents = 19 +EventCmdPipelineBarrier = 20 +EventCmdResetQueryPool = 21 +EventCmdCopyQueryPoolResults = 22 +EventRenderPassColorClear = 23 +EventRenderPassDepthStencilClear = 24 +EventRenderPassResolve = 25 +EventInternalUnknown = 26 +EventCmdDrawIndirectCount = 27 +EventCmdDrawIndexedIndirectCount = 28 +EventCmdTraceRaysKHR = 30 +EventCmdTraceRaysIndirectKHR = 31 +EventCmdBuildAccelerationStructuresKHR = 32 +EventCmdBuildAccelerationStructuresIndirectKHR = 33 +EventCmdCopyAccelerationStructureKHR = 34 +EventCmdCopyAccelerationStructureToMemoryKHR = 35 +EventCmdCopyMemoryToAccelerationStructureKHR = 36 +EventCmdDrawMeshTasksEXT = 41 +EventCmdDrawMeshTasksIndirectCountEXT = 42 +EventCmdDrawMeshTasksIndirectEXT = 43 
+EventUnknown = 32767 +EventInvalid = 4294967295 +rgp_sqtt_marker_event_type = ctypes.c_uint32 # enum +class struct_rgp_sqtt_marker_event(Structure): + pass + +class union_rgp_sqtt_marker_event_0(Union): + pass + +class struct_rgp_sqtt_marker_event_0_0(Structure): + pass + +struct_rgp_sqtt_marker_event_0_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_event_0_0._fields_ = [ + ('identifier', ctypes.c_uint32, 4), + ('ext_dwords', ctypes.c_uint32, 3), + ('api_type', ctypes.c_uint32, 24), + ('has_thread_dims', ctypes.c_uint32, 1), +] + +union_rgp_sqtt_marker_event_0._pack_ = 1 # source:False +union_rgp_sqtt_marker_event_0._anonymous_ = ('_0',) +union_rgp_sqtt_marker_event_0._fields_ = [ + ('_0', struct_rgp_sqtt_marker_event_0_0), + ('dword01', ctypes.c_uint32), +] + +class union_rgp_sqtt_marker_event_1(Union): + pass + +class struct_rgp_sqtt_marker_event_1_0(Structure): + pass + +struct_rgp_sqtt_marker_event_1_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_event_1_0._fields_ = [ + ('cb_id', ctypes.c_uint32, 20), + ('vertex_offset_reg_idx', ctypes.c_uint32, 4), + ('instance_offset_reg_idx', ctypes.c_uint32, 4), + ('draw_index_reg_idx', ctypes.c_uint32, 4), +] + +union_rgp_sqtt_marker_event_1._pack_ = 1 # source:False +union_rgp_sqtt_marker_event_1._anonymous_ = ('_0',) +union_rgp_sqtt_marker_event_1._fields_ = [ + ('_0', struct_rgp_sqtt_marker_event_1_0), + ('dword02', ctypes.c_uint32), +] + +class union_rgp_sqtt_marker_event_2(Union): + pass + +union_rgp_sqtt_marker_event_2._pack_ = 1 # source:False +union_rgp_sqtt_marker_event_2._fields_ = [ + ('cmd_id', ctypes.c_uint32), + ('dword03', ctypes.c_uint32), +] + +struct_rgp_sqtt_marker_event._pack_ = 1 # source:False +struct_rgp_sqtt_marker_event._anonymous_ = ('_0', '_1', '_2',) +struct_rgp_sqtt_marker_event._fields_ = [ + ('_0', union_rgp_sqtt_marker_event_0), + ('_1', union_rgp_sqtt_marker_event_1), + ('_2', union_rgp_sqtt_marker_event_2), +] + +class struct_rgp_sqtt_marker_event_with_dims(Structure): + pass + 
+struct_rgp_sqtt_marker_event_with_dims._pack_ = 1 # source:False +struct_rgp_sqtt_marker_event_with_dims._fields_ = [ + ('event', struct_rgp_sqtt_marker_event), + ('thread_x', ctypes.c_uint32), + ('thread_y', ctypes.c_uint32), + ('thread_z', ctypes.c_uint32), +] + +class struct_rgp_sqtt_marker_barrier_start(Structure): + pass + +class union_rgp_sqtt_marker_barrier_start_0(Union): + pass + +class struct_rgp_sqtt_marker_barrier_start_0_0(Structure): + pass + +struct_rgp_sqtt_marker_barrier_start_0_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_barrier_start_0_0._fields_ = [ + ('identifier', ctypes.c_uint32, 4), + ('ext_dwords', ctypes.c_uint32, 3), + ('cb_id', ctypes.c_uint32, 20), + ('reserved', ctypes.c_uint32, 5), +] + +union_rgp_sqtt_marker_barrier_start_0._pack_ = 1 # source:False +union_rgp_sqtt_marker_barrier_start_0._anonymous_ = ('_0',) +union_rgp_sqtt_marker_barrier_start_0._fields_ = [ + ('_0', struct_rgp_sqtt_marker_barrier_start_0_0), + ('dword01', ctypes.c_uint32), +] + +class union_rgp_sqtt_marker_barrier_start_1(Union): + pass + +class struct_rgp_sqtt_marker_barrier_start_1_0(Structure): + pass + +struct_rgp_sqtt_marker_barrier_start_1_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_barrier_start_1_0._fields_ = [ + ('driver_reason', ctypes.c_uint32, 31), + ('internal', ctypes.c_uint32, 1), +] + +union_rgp_sqtt_marker_barrier_start_1._pack_ = 1 # source:False +union_rgp_sqtt_marker_barrier_start_1._anonymous_ = ('_0',) +union_rgp_sqtt_marker_barrier_start_1._fields_ = [ + ('_0', struct_rgp_sqtt_marker_barrier_start_1_0), + ('dword02', ctypes.c_uint32), +] + +struct_rgp_sqtt_marker_barrier_start._pack_ = 1 # source:False +struct_rgp_sqtt_marker_barrier_start._anonymous_ = ('_0', '_1',) +struct_rgp_sqtt_marker_barrier_start._fields_ = [ + ('_0', union_rgp_sqtt_marker_barrier_start_0), + ('_1', union_rgp_sqtt_marker_barrier_start_1), +] + +class struct_rgp_sqtt_marker_barrier_end(Structure): + pass + +class 
union_rgp_sqtt_marker_barrier_end_0(Union): + pass + +class struct_rgp_sqtt_marker_barrier_end_0_0(Structure): + pass + +struct_rgp_sqtt_marker_barrier_end_0_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_barrier_end_0_0._fields_ = [ + ('identifier', ctypes.c_uint32, 4), + ('ext_dwords', ctypes.c_uint32, 3), + ('cb_id', ctypes.c_uint32, 20), + ('wait_on_eop_ts', ctypes.c_uint32, 1), + ('vs_partial_flush', ctypes.c_uint32, 1), + ('ps_partial_flush', ctypes.c_uint32, 1), + ('cs_partial_flush', ctypes.c_uint32, 1), + ('pfp_sync_me', ctypes.c_uint32, 1), +] + +union_rgp_sqtt_marker_barrier_end_0._pack_ = 1 # source:False +union_rgp_sqtt_marker_barrier_end_0._anonymous_ = ('_0',) +union_rgp_sqtt_marker_barrier_end_0._fields_ = [ + ('_0', struct_rgp_sqtt_marker_barrier_end_0_0), + ('dword01', ctypes.c_uint32), +] + +class union_rgp_sqtt_marker_barrier_end_1(Union): + pass + +class struct_rgp_sqtt_marker_barrier_end_1_0(Structure): + pass + +struct_rgp_sqtt_marker_barrier_end_1_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_barrier_end_1_0._fields_ = [ + ('sync_cp_dma', ctypes.c_uint32, 1), + ('inval_tcp', ctypes.c_uint32, 1), + ('inval_sqI', ctypes.c_uint32, 1), + ('inval_sqK', ctypes.c_uint32, 1), + ('flush_tcc', ctypes.c_uint32, 1), + ('inval_tcc', ctypes.c_uint32, 1), + ('flush_cb', ctypes.c_uint32, 1), + ('inval_cb', ctypes.c_uint32, 1), + ('flush_db', ctypes.c_uint32, 1), + ('inval_db', ctypes.c_uint32, 1), + ('num_layout_transitions', ctypes.c_uint32, 16), + ('inval_gl1', ctypes.c_uint32, 1), + ('wait_on_ts', ctypes.c_uint32, 1), + ('eop_ts_bottom_of_pipe', ctypes.c_uint32, 1), + ('eos_ts_ps_done', ctypes.c_uint32, 1), + ('eos_ts_cs_done', ctypes.c_uint32, 1), + ('reserved', ctypes.c_uint32, 1), +] + +union_rgp_sqtt_marker_barrier_end_1._pack_ = 1 # source:False +union_rgp_sqtt_marker_barrier_end_1._anonymous_ = ('_0',) +union_rgp_sqtt_marker_barrier_end_1._fields_ = [ + ('_0', struct_rgp_sqtt_marker_barrier_end_1_0), + ('dword02', ctypes.c_uint32), +] + 
+struct_rgp_sqtt_marker_barrier_end._pack_ = 1 # source:False +struct_rgp_sqtt_marker_barrier_end._anonymous_ = ('_0', '_1',) +struct_rgp_sqtt_marker_barrier_end._fields_ = [ + ('_0', union_rgp_sqtt_marker_barrier_end_0), + ('_1', union_rgp_sqtt_marker_barrier_end_1), +] + +class struct_rgp_sqtt_marker_layout_transition(Structure): + pass + +class union_rgp_sqtt_marker_layout_transition_0(Union): + pass + +class struct_rgp_sqtt_marker_layout_transition_0_0(Structure): + pass + +struct_rgp_sqtt_marker_layout_transition_0_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_layout_transition_0_0._fields_ = [ + ('identifier', ctypes.c_uint32, 4), + ('ext_dwords', ctypes.c_uint32, 3), + ('depth_stencil_expand', ctypes.c_uint32, 1), + ('htile_hiz_range_expand', ctypes.c_uint32, 1), + ('depth_stencil_resummarize', ctypes.c_uint32, 1), + ('dcc_decompress', ctypes.c_uint32, 1), + ('fmask_decompress', ctypes.c_uint32, 1), + ('fast_clear_eliminate', ctypes.c_uint32, 1), + ('fmask_color_expand', ctypes.c_uint32, 1), + ('init_mask_ram', ctypes.c_uint32, 1), + ('reserved1', ctypes.c_uint32, 17), +] + +union_rgp_sqtt_marker_layout_transition_0._pack_ = 1 # source:False +union_rgp_sqtt_marker_layout_transition_0._anonymous_ = ('_0',) +union_rgp_sqtt_marker_layout_transition_0._fields_ = [ + ('_0', struct_rgp_sqtt_marker_layout_transition_0_0), + ('dword01', ctypes.c_uint32), +] + +class union_rgp_sqtt_marker_layout_transition_1(Union): + pass + +class struct_rgp_sqtt_marker_layout_transition_1_0(Structure): + pass + +struct_rgp_sqtt_marker_layout_transition_1_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_layout_transition_1_0._fields_ = [ + ('reserved2', ctypes.c_uint32, 32), +] + +union_rgp_sqtt_marker_layout_transition_1._pack_ = 1 # source:False +union_rgp_sqtt_marker_layout_transition_1._anonymous_ = ('_0',) +union_rgp_sqtt_marker_layout_transition_1._fields_ = [ + ('_0', struct_rgp_sqtt_marker_layout_transition_1_0), + ('dword02', ctypes.c_uint32), +] + 
+struct_rgp_sqtt_marker_layout_transition._pack_ = 1 # source:False +struct_rgp_sqtt_marker_layout_transition._anonymous_ = ('_0', '_1',) +struct_rgp_sqtt_marker_layout_transition._fields_ = [ + ('_0', union_rgp_sqtt_marker_layout_transition_0), + ('_1', union_rgp_sqtt_marker_layout_transition_1), +] + +class struct_rgp_sqtt_marker_user_event(Structure): + pass + +class union_rgp_sqtt_marker_user_event_0(Union): + pass + +class struct_rgp_sqtt_marker_user_event_0_0(Structure): + pass + +struct_rgp_sqtt_marker_user_event_0_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_user_event_0_0._fields_ = [ + ('identifier', ctypes.c_uint32, 4), + ('reserved0', ctypes.c_uint32, 8), + ('data_type', ctypes.c_uint32, 8), + ('reserved1', ctypes.c_uint32, 12), +] + +union_rgp_sqtt_marker_user_event_0._pack_ = 1 # source:False +union_rgp_sqtt_marker_user_event_0._anonymous_ = ('_0',) +union_rgp_sqtt_marker_user_event_0._fields_ = [ + ('_0', struct_rgp_sqtt_marker_user_event_0_0), + ('dword01', ctypes.c_uint32), +] + +struct_rgp_sqtt_marker_user_event._pack_ = 1 # source:False +struct_rgp_sqtt_marker_user_event._anonymous_ = ('_0',) +struct_rgp_sqtt_marker_user_event._fields_ = [ + ('_0', union_rgp_sqtt_marker_user_event_0), +] + +class struct_rgp_sqtt_marker_user_event_with_length(Structure): + pass + +struct_rgp_sqtt_marker_user_event_with_length._pack_ = 1 # source:False +struct_rgp_sqtt_marker_user_event_with_length._fields_ = [ + ('user_event', struct_rgp_sqtt_marker_user_event), + ('length', ctypes.c_uint32), +] + + +# values for enumeration 'rgp_sqtt_marker_user_event_type' +rgp_sqtt_marker_user_event_type__enumvalues = { + 0: 'UserEventTrigger', + 1: 'UserEventPop', + 2: 'UserEventPush', + 3: 'UserEventObjectName', +} +UserEventTrigger = 0 +UserEventPop = 1 +UserEventPush = 2 +UserEventObjectName = 3 +rgp_sqtt_marker_user_event_type = ctypes.c_uint32 # enum +class struct_rgp_sqtt_marker_pipeline_bind(Structure): + pass + +class 
union_rgp_sqtt_marker_pipeline_bind_0(Union): + pass + +class struct_rgp_sqtt_marker_pipeline_bind_0_0(Structure): + pass + +struct_rgp_sqtt_marker_pipeline_bind_0_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_pipeline_bind_0_0._fields_ = [ + ('identifier', ctypes.c_uint32, 4), + ('ext_dwords', ctypes.c_uint32, 3), + ('bind_point', ctypes.c_uint32, 1), + ('cb_id', ctypes.c_uint32, 20), + ('reserved', ctypes.c_uint32, 4), +] + +union_rgp_sqtt_marker_pipeline_bind_0._pack_ = 1 # source:False +union_rgp_sqtt_marker_pipeline_bind_0._anonymous_ = ('_0',) +union_rgp_sqtt_marker_pipeline_bind_0._fields_ = [ + ('_0', struct_rgp_sqtt_marker_pipeline_bind_0_0), + ('dword01', ctypes.c_uint32), +] + +class union_rgp_sqtt_marker_pipeline_bind_1(Union): + pass + +class struct_rgp_sqtt_marker_pipeline_bind_1_0(Structure): + pass + +struct_rgp_sqtt_marker_pipeline_bind_1_0._pack_ = 1 # source:False +struct_rgp_sqtt_marker_pipeline_bind_1_0._fields_ = [ + ('dword02', ctypes.c_uint32), + ('dword03', ctypes.c_uint32), +] + +union_rgp_sqtt_marker_pipeline_bind_1._pack_ = 1 # source:False +union_rgp_sqtt_marker_pipeline_bind_1._anonymous_ = ('_0',) +union_rgp_sqtt_marker_pipeline_bind_1._fields_ = [ + ('api_pso_hash', ctypes.c_uint32 * 2), + ('_0', struct_rgp_sqtt_marker_pipeline_bind_1_0), +] + +struct_rgp_sqtt_marker_pipeline_bind._pack_ = 1 # source:False +struct_rgp_sqtt_marker_pipeline_bind._anonymous_ = ('_0', '_1',) +struct_rgp_sqtt_marker_pipeline_bind._fields_ = [ + ('_0', union_rgp_sqtt_marker_pipeline_bind_0), + ('_1', union_rgp_sqtt_marker_pipeline_bind_1), +] + +__all__ = \ + ['ApiCmdBeginQuery', 'ApiCmdBeginRenderPass', + 'ApiCmdBindDescriptorSets', 'ApiCmdBindIndexBuffer', + 'ApiCmdBindPipeline', 'ApiCmdBindVertexBuffers', + 'ApiCmdBlitImage', 'ApiCmdClearAttachments', + 'ApiCmdClearColorImage', 'ApiCmdClearDepthStencilImage', + 'ApiCmdCopyBuffer', 'ApiCmdCopyBufferToImage', 'ApiCmdCopyImage', + 'ApiCmdCopyImageToBuffer', 'ApiCmdCopyQueryPoolResults', + 
'ApiCmdDispatch', 'ApiCmdDispatchIndirect', 'ApiCmdDraw', + 'ApiCmdDrawIndexed', 'ApiCmdDrawIndexedIndirect', + 'ApiCmdDrawIndexedIndirectCount', + 'ApiCmdDrawIndexedIndirectCountAMD', 'ApiCmdDrawIndirect', + 'ApiCmdDrawIndirectCount', 'ApiCmdDrawIndirectCountAMD', + 'ApiCmdDrawMeshTasksEXT', 'ApiCmdDrawMeshTasksIndirectCountEXT', + 'ApiCmdDrawMeshTasksIndirectEXT', 'ApiCmdEndQuery', + 'ApiCmdEndRenderPass', 'ApiCmdExecuteCommands', + 'ApiCmdFillBuffer', 'ApiCmdNextSubpass', 'ApiCmdPipelineBarrier', + 'ApiCmdPushConstants', 'ApiCmdResetQueryPool', + 'ApiCmdResolveImage', 'ApiCmdSetBlendConstants', + 'ApiCmdSetDepthBias', 'ApiCmdSetDepthBounds', + 'ApiCmdSetLineWidth', 'ApiCmdSetScissor', + 'ApiCmdSetStencilCompareMask', 'ApiCmdSetStencilReference', + 'ApiCmdSetStencilWriteMask', 'ApiCmdSetViewport', + 'ApiCmdUpdateBuffer', 'ApiCmdWaitEvents', 'ApiCmdWriteTimestamp', + 'ApiInvalid', 'ApiRayTracingSeparateCompiled', + 'EF_AMDGPU_MACH_AMDGCN_GFX1010', 'EF_AMDGPU_MACH_AMDGCN_GFX1030', + 'EF_AMDGPU_MACH_AMDGCN_GFX1100', 'EF_AMDGPU_MACH_AMDGCN_GFX801', + 'EF_AMDGPU_MACH_AMDGCN_GFX900', 'EventCmdBlitImage', + 'EventCmdBuildAccelerationStructuresIndirectKHR', + 'EventCmdBuildAccelerationStructuresKHR', + 'EventCmdClearAttachments', 'EventCmdClearColorImage', + 'EventCmdClearDepthStencilImage', + 'EventCmdCopyAccelerationStructureKHR', + 'EventCmdCopyAccelerationStructureToMemoryKHR', + 'EventCmdCopyBuffer', 'EventCmdCopyBufferToImage', + 'EventCmdCopyImage', 'EventCmdCopyImageToBuffer', + 'EventCmdCopyMemoryToAccelerationStructureKHR', + 'EventCmdCopyQueryPoolResults', 'EventCmdDispatch', + 'EventCmdDispatchIndirect', 'EventCmdDraw', 'EventCmdDrawIndexed', + 'EventCmdDrawIndexedIndirect', 'EventCmdDrawIndexedIndirectCount', + 'EventCmdDrawIndexedIndirectCountAMD', 'EventCmdDrawIndirect', + 'EventCmdDrawIndirectCount', 'EventCmdDrawIndirectCountAMD', + 'EventCmdDrawMeshTasksEXT', + 'EventCmdDrawMeshTasksIndirectCountEXT', + 'EventCmdDrawMeshTasksIndirectEXT', 
'EventCmdFillBuffer', + 'EventCmdPipelineBarrier', 'EventCmdResetQueryPool', + 'EventCmdResolveImage', 'EventCmdTraceRaysIndirectKHR', + 'EventCmdTraceRaysKHR', 'EventCmdUpdateBuffer', + 'EventCmdWaitEvents', 'EventInternalUnknown', 'EventInvalid', + 'EventRenderPassColorClear', 'EventRenderPassDepthStencilClear', + 'EventRenderPassResolve', 'EventUnknown', + 'RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END', + 'RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START', + 'RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE', + 'RGP_SQTT_MARKER_IDENTIFIER_CB_END', + 'RGP_SQTT_MARKER_IDENTIFIER_CB_START', + 'RGP_SQTT_MARKER_IDENTIFIER_EVENT', + 'RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API', + 'RGP_SQTT_MARKER_IDENTIFIER_LAYOUT_TRANSITION', + 'RGP_SQTT_MARKER_IDENTIFIER_PRESENT', + 'RGP_SQTT_MARKER_IDENTIFIER_RENDER_PASS', + 'RGP_SQTT_MARKER_IDENTIFIER_RESERVED2', + 'RGP_SQTT_MARKER_IDENTIFIER_RESERVED4', + 'RGP_SQTT_MARKER_IDENTIFIER_RESERVED5', + 'RGP_SQTT_MARKER_IDENTIFIER_RESERVED6', + 'RGP_SQTT_MARKER_IDENTIFIER_SYNC', + 'RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT', + 'SQTT_ACTIVE_PIXEL_PACKER_MASK_DWORDS', + 'SQTT_API_TYPE_DIRECTX_12', 'SQTT_API_TYPE_GENERIC', + 'SQTT_API_TYPE_OPENCL', 'SQTT_API_TYPE_VULKAN', + 'SQTT_ENGINE_TYPE_COMPUTE', 'SQTT_ENGINE_TYPE_DMA', + 'SQTT_ENGINE_TYPE_EXCLUSIVE_COMPUTE', + 'SQTT_ENGINE_TYPE_HIGH_PRIORITY_GRAPHICS', + 'SQTT_ENGINE_TYPE_HIGH_PRIORITY_UNIVERSAL', + 'SQTT_ENGINE_TYPE_UNIVERSAL', 'SQTT_ENGINE_TYPE_UNKNOWN', + 'SQTT_FILE_CHUNK_ASIC_INFO_FLAG_PS1_EVENT_TOKENS_ENABLED', + 'SQTT_FILE_CHUNK_ASIC_INFO_FLAG_SC_PACKER_NUMBERING', + 'SQTT_FILE_CHUNK_TYPE_API_INFO', 'SQTT_FILE_CHUNK_TYPE_ASIC_INFO', + 'SQTT_FILE_CHUNK_TYPE_CLOCK_CALIBRATION', + 'SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_DATABASE', + 'SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS', + 'SQTT_FILE_CHUNK_TYPE_COUNT', 'SQTT_FILE_CHUNK_TYPE_CPU_INFO', + 'SQTT_FILE_CHUNK_TYPE_INSTRUMENTATION_TABLE', + 'SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION', + 'SQTT_FILE_CHUNK_TYPE_QUEUE_EVENT_TIMINGS', + 'SQTT_FILE_CHUNK_TYPE_RESERVED', 
'SQTT_FILE_CHUNK_TYPE_SPM_DB', + 'SQTT_FILE_CHUNK_TYPE_SQTT_DATA', + 'SQTT_FILE_CHUNK_TYPE_SQTT_DESC', 'SQTT_FILE_MAGIC_NUMBER', + 'SQTT_FILE_VERSION_MAJOR', 'SQTT_FILE_VERSION_MINOR', + 'SQTT_GFXIP_LEVEL_GFXIP_10_1', 'SQTT_GFXIP_LEVEL_GFXIP_10_3', + 'SQTT_GFXIP_LEVEL_GFXIP_11_0', 'SQTT_GFXIP_LEVEL_GFXIP_6', + 'SQTT_GFXIP_LEVEL_GFXIP_7', 'SQTT_GFXIP_LEVEL_GFXIP_8', + 'SQTT_GFXIP_LEVEL_GFXIP_8_1', 'SQTT_GFXIP_LEVEL_GFXIP_9', + 'SQTT_GFXIP_LEVEL_NONE', 'SQTT_GPU_NAME_MAX_SIZE', + 'SQTT_GPU_TYPE_DISCRETE', 'SQTT_GPU_TYPE_INTEGRATED', + 'SQTT_GPU_TYPE_UNKNOWN', 'SQTT_GPU_TYPE_VIRTUAL', + 'SQTT_INSTRUCTION_TRACE_API_PSO', + 'SQTT_INSTRUCTION_TRACE_DISABLED', + 'SQTT_INSTRUCTION_TRACE_FULL_FRAME', 'SQTT_MAX_NUM_SE', + 'SQTT_MEMORY_TYPE_DDR', 'SQTT_MEMORY_TYPE_DDR2', + 'SQTT_MEMORY_TYPE_DDR3', 'SQTT_MEMORY_TYPE_DDR4', + 'SQTT_MEMORY_TYPE_DDR5', 'SQTT_MEMORY_TYPE_GDDR3', + 'SQTT_MEMORY_TYPE_GDDR4', 'SQTT_MEMORY_TYPE_GDDR5', + 'SQTT_MEMORY_TYPE_GDDR6', 'SQTT_MEMORY_TYPE_HBM', + 'SQTT_MEMORY_TYPE_HBM2', 'SQTT_MEMORY_TYPE_HBM3', + 'SQTT_MEMORY_TYPE_LPDDR4', 'SQTT_MEMORY_TYPE_LPDDR5', + 'SQTT_MEMORY_TYPE_UNKNOWN', 'SQTT_PROFILING_MODE_INDEX', + 'SQTT_PROFILING_MODE_PRESENT', 'SQTT_PROFILING_MODE_TAG', + 'SQTT_PROFILING_MODE_USER_MARKERS', + 'SQTT_QUEUE_TIMING_EVENT_CMDBUF_SUBMIT', + 'SQTT_QUEUE_TIMING_EVENT_PRESENT', + 'SQTT_QUEUE_TIMING_EVENT_SIGNAL_SEMAPHORE', + 'SQTT_QUEUE_TIMING_EVENT_WAIT_SEMAPHORE', + 'SQTT_QUEUE_TYPE_COMPUTE', 'SQTT_QUEUE_TYPE_DMA', + 'SQTT_QUEUE_TYPE_UNIVERSAL', 'SQTT_QUEUE_TYPE_UNKNOWN', + 'SQTT_SA_PER_SE', 'SQTT_VERSION_2_2', 'SQTT_VERSION_2_3', + 'SQTT_VERSION_2_4', 'SQTT_VERSION_3_2', 'SQTT_VERSION_NONE', + 'UserEventObjectName', 'UserEventPop', 'UserEventPush', + 'UserEventTrigger', 'elf_gfxip_level', + 'rgp_sqtt_marker_event_type', 'rgp_sqtt_marker_general_api_type', + 'rgp_sqtt_marker_identifier', 'rgp_sqtt_marker_user_event_type', + 'sqtt_api_type', 'sqtt_engine_type', + 'sqtt_file_chunk_asic_info_flags', 'sqtt_file_chunk_type', + 
'sqtt_gfxip_level', 'sqtt_gpu_type', + 'sqtt_instruction_trace_mode', 'sqtt_memory_type', + 'sqtt_profiling_mode', 'sqtt_queue_event_type', 'sqtt_queue_type', + 'sqtt_version', 'struct_rgp_sqtt_marker_barrier_end', + 'struct_rgp_sqtt_marker_barrier_end_0_0', + 'struct_rgp_sqtt_marker_barrier_end_1_0', + 'struct_rgp_sqtt_marker_barrier_start', + 'struct_rgp_sqtt_marker_barrier_start_0_0', + 'struct_rgp_sqtt_marker_barrier_start_1_0', + 'struct_rgp_sqtt_marker_cb_end', + 'struct_rgp_sqtt_marker_cb_end_0_0', + 'struct_rgp_sqtt_marker_cb_id_global_cb_id', + 'struct_rgp_sqtt_marker_cb_id_per_frame_cb_id', + 'struct_rgp_sqtt_marker_cb_start', + 'struct_rgp_sqtt_marker_cb_start_0_0', + 'struct_rgp_sqtt_marker_event', + 'struct_rgp_sqtt_marker_event_0_0', + 'struct_rgp_sqtt_marker_event_1_0', + 'struct_rgp_sqtt_marker_event_with_dims', + 'struct_rgp_sqtt_marker_general_api', + 'struct_rgp_sqtt_marker_general_api_0_0', + 'struct_rgp_sqtt_marker_layout_transition', + 'struct_rgp_sqtt_marker_layout_transition_0_0', + 'struct_rgp_sqtt_marker_layout_transition_1_0', + 'struct_rgp_sqtt_marker_pipeline_bind', + 'struct_rgp_sqtt_marker_pipeline_bind_0_0', + 'struct_rgp_sqtt_marker_pipeline_bind_1_0', + 'struct_rgp_sqtt_marker_user_event', + 'struct_rgp_sqtt_marker_user_event_0_0', + 'struct_rgp_sqtt_marker_user_event_with_length', + 'struct_sqtt_code_object_database_record', + 'struct_sqtt_code_object_loader_events_record', + 'struct_sqtt_data_info', 'struct_sqtt_data_se', + 'struct_sqtt_file_chunk_api_info', + 'struct_sqtt_file_chunk_asic_info', + 'struct_sqtt_file_chunk_clock_calibration', + 'struct_sqtt_file_chunk_code_object_database', + 'struct_sqtt_file_chunk_code_object_loader_events', + 'struct_sqtt_file_chunk_cpu_info', + 'struct_sqtt_file_chunk_header', 'struct_sqtt_file_chunk_id', + 'struct_sqtt_file_chunk_pso_correlation', + 'struct_sqtt_file_chunk_queue_event_timings', + 'struct_sqtt_file_chunk_spm_db', + 'struct_sqtt_file_chunk_sqtt_data', + 
'struct_sqtt_file_chunk_sqtt_desc', + 'struct_sqtt_file_chunk_sqtt_desc_0_v0', + 'struct_sqtt_file_chunk_sqtt_desc_0_v1', + 'struct_sqtt_file_header', 'struct_sqtt_file_header_flags', + 'struct_sqtt_file_header_flags_0_0', + 'struct_sqtt_instruction_trace_data_api_pso_data', + 'struct_sqtt_instruction_trace_data_shader_engine_filter', + 'struct_sqtt_profiling_mode_data_index_profiling_data', + 'struct_sqtt_profiling_mode_data_tag_profiling_data', + 'struct_sqtt_profiling_mode_data_user_marker_profiling_data', + 'struct_sqtt_pso_correlation_record', + 'struct_sqtt_queue_event_record', + 'struct_sqtt_queue_hardware_info', + 'struct_sqtt_queue_hardware_info_0_0', + 'struct_sqtt_queue_info_record', + 'union_rgp_sqtt_marker_barrier_end_0', + 'union_rgp_sqtt_marker_barrier_end_1', + 'union_rgp_sqtt_marker_barrier_start_0', + 'union_rgp_sqtt_marker_barrier_start_1', + 'union_rgp_sqtt_marker_cb_end_0', + 'union_rgp_sqtt_marker_cb_end_1', + 'union_rgp_sqtt_marker_cb_end_2', 'union_rgp_sqtt_marker_cb_id', + 'union_rgp_sqtt_marker_cb_start_0', + 'union_rgp_sqtt_marker_cb_start_1', + 'union_rgp_sqtt_marker_cb_start_2', + 'union_rgp_sqtt_marker_cb_start_3', + 'union_rgp_sqtt_marker_event_0', 'union_rgp_sqtt_marker_event_1', + 'union_rgp_sqtt_marker_event_2', + 'union_rgp_sqtt_marker_general_api_0', + 'union_rgp_sqtt_marker_layout_transition_0', + 'union_rgp_sqtt_marker_layout_transition_1', + 'union_rgp_sqtt_marker_pipeline_bind_0', + 'union_rgp_sqtt_marker_pipeline_bind_1', + 'union_rgp_sqtt_marker_user_event_0', 'union_sqtt_data_info_0', + 'union_sqtt_file_chunk_sqtt_desc_0', + 'union_sqtt_file_header_flags_0', + 'union_sqtt_instruction_trace_data', + 'union_sqtt_profiling_mode_data', + 'union_sqtt_queue_hardware_info_0'] diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 8bf0ba08ed..c01b5147d4 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -1,15 +1,15 @@ from __future__ import annotations from typing import Any, cast, 
ClassVar -import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select +import os, ctypes, ctypes.util, struct, hashlib, functools, mmap, errno, array, contextlib, sys, select assert sys.platform != 'win32' from dataclasses import dataclass from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface from tinygrad.ops import sint -from tinygrad.device import BufferSpec, CPUProgram +from tinygrad.device import Compiled, ProfileEvent, BufferSpec, CPUProgram, PROFILE from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX from tinygrad.renderer.cstyle import AMDRenderer -from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio -from tinygrad.runtime.autogen.am import am +from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio, sqtt +from tinygrad.runtime.autogen.am import am, gc_11_0_0 from tinygrad.runtime.support.compiler_hip import AMDCompiler from tinygrad.runtime.support.elf import elf_loader from tinygrad.runtime.support.am.amdev import AMDev, AMMapping @@ -18,13 +18,21 @@ if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107 EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h +WAIT_REG_MEM_FUNCTION_EQ = 3 # == +WAIT_REG_MEM_FUNCTION_NEQ = 4 # != WAIT_REG_MEM_FUNCTION_GEQ = 5 # >= COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15) def gfxreg(reg): return reg + amd_gpu.GC_BASE__INST0_SEG0 - amd_gpu.PACKET3_SET_SH_REG_START +def ucfgreg(reg, pkt3_set:bool=True): return reg + amd_gpu.GC_BASE__INST0_SEG1 - (amd_gpu.PACKET3_SET_UCONFIG_REG_START if pkt3_set else 0) def nbioreg(reg): return reg + amd_gpu.NBIO_BASE__INST0_SEG2 +# This can potentially be shared with AMRegister._parse_kwargs. 
NOTE: This is hardcoded to gfx11, bitfields might be different in other gfxvers. +# Currently not a problem because this is only used by sqtt and sqtt is only supported on 7900xtx +def encode_bitfields(regname: str, **kwargs) -> int: + return functools.reduce(lambda x,y: x|y, [v << getattr(gc_11_0_0, f'{regname}__{k.upper()}__SHIFT') for k,v in kwargs.items()], 0) + class AMDSignal(HCQSignal): def __init__(self, base_addr:int|None=None, **kwargs): super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice) @@ -40,6 +48,11 @@ class AMDComputeQueue(HWQueue): def pkt3(self, cmd, *vals): self.q(amd_gpu.PACKET3(cmd, len(vals) - 1), *vals) + def sqtt_userdata(self, data, *extra_dwords): + data_ints = [x[0] for x in struct.iter_unpack('>12) + self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_BUF0_SIZE), + encode_bitfields('SQ_THREAD_TRACE_BUF0_SIZE', base_hi=buf0_hi, size=buf0s[se].size>>12)) + self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_BUF0_BASE), buf0_lo) + # NOTE: SQTT can only trace instructions on one simd per se, this selects first simd in first wgp in first sa. + # For RGP to display instruction trace it has to see it on first SE. However ACE/MEC/whatever does the dispatching starting with second se, + # and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but + # sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and + # be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the + # CUs you want to by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE and trace even kernels that only have one wavefront.
+ self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_MASK), + encode_bitfields('SQ_THREAD_TRACE_MASK', wtype_include=amd_gpu.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0)) + REG_INCLUDE = amd_gpu.SQ_TT_TOKEN_MASK_SQDEC_BIT | amd_gpu.SQ_TT_TOKEN_MASK_SHDEC_BIT | amd_gpu.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \ + amd_gpu.SQ_TT_TOKEN_MASK_COMP_BIT | amd_gpu.SQ_TT_TOKEN_MASK_CONTEXT_BIT | amd_gpu.SQ_TT_TOKEN_MASK_CONTEXT_BIT + TOKEN_EXCLUDE = 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT + if not (se_mask >> se) & 0b1: + TOKEN_EXCLUDE |= 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \ + 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \ + 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT + self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_TOKEN_MASK), + encode_bitfields('SQ_THREAD_TRACE_TOKEN_MASK', reg_include=REG_INCLUDE, token_exclude=TOKEN_EXCLUDE, bop_events_token_include=1)) + # Enable SQTT + self.sqtt_config(tracing=True) + # Restore global broadcasting + self.grbm_gfx_index(se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1) + self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_THREAD_TRACE_ENABLE), 1) + self.memory_barrier() + return self + + # Magic values from src/amd/common/ac_sqtt.c:ac_sqtt_emit_stop and src/amd/common/ac_sqtt.c:ac_sqtt_emit_wait + def stop_trace(self, ses: int, wptrs: HCQBuffer): + self.memory_barrier() + # Start shutting everything down + self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_THREAD_TRACE_ENABLE), 0) + self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.THREAD_TRACE_FINISH) | amd_gpu.EVENT_INDEX(0)) + # For each SE wait for finish to complete and copy regSQ_THREAD_TRACE_WPTR to know where in the buffer trace data ends + for se in range(ses): + self.grbm_gfx_index(se_index=se, instance_broadcast_writes=1) # 
select se, broadcast to all instances in that se + # Wait for FINISH_PENDING==0 + self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ), + ucfgreg(amd_gpu.regSQ_THREAD_TRACE_STATUS, False), 0, 0, gc_11_0_0.SQ_THREAD_TRACE_STATUS__FINISH_PENDING_MASK, 4) + # Wait for FINISH_DONE!=0 + self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_NEQ), + ucfgreg(amd_gpu.regSQ_THREAD_TRACE_STATUS, False), 0, 0, gc_11_0_0.SQ_THREAD_TRACE_STATUS__FINISH_DONE_MASK, 4) + # Disable SQTT + self.sqtt_config(tracing=False) + # Wait for BUSY==0 + self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ), + ucfgreg(amd_gpu.regSQ_THREAD_TRACE_STATUS, False), 0, 0, gc_11_0_0.SQ_THREAD_TRACE_STATUS__BUSY_MASK, 4) + # Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True), ucfgreg with False adds GC_BASE__INST0_SEG1 but not pkt3 reg offset + self.pkt3(amd_gpu.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_WPTR, False), 0, *data64_le(wptrs.va_addr+(se*4))) + # Restore global broadcasting + self.grbm_gfx_index(se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1) + self.spi_config(tracing=False) + self.memory_barrier() + return self + def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]): self.bind_args_state(args_state) @@ -93,6 +183,20 @@ class AMDComputeQueue(HWQueue): user_regs += [*data64_le(args_state.ptr)] + if prg.dev.sqtt_enabled: + self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_pipeline_bind( + _0=sqtt.union_rgp_sqtt_marker_pipeline_bind_0(_0=sqtt.struct_rgp_sqtt_marker_pipeline_bind_0_0( + identifier=sqtt.RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE, + bind_point=1, # compute + )), + _1=sqtt.union_rgp_sqtt_marker_pipeline_bind_1(api_pso_hash=data64_le(prg.libhash[0])), + )) + self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_event( + 
_0=sqtt.union_rgp_sqtt_marker_event_0(_0=sqtt.struct_rgp_sqtt_marker_event_0_0(has_thread_dims=1)), + _2=sqtt.union_rgp_sqtt_marker_event_2(cmd_id=prg.dev.cmd_id), + ), *global_size) + prg.dev.cmd_id += 1 + self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8)) self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2) self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0) @@ -110,6 +214,7 @@ class AMDComputeQueue(HWQueue): self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0) self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN) + if prg.dev.sqtt_enabled: self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.THREAD_TRACE_MARKER) | amd_gpu.EVENT_INDEX(0)) self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH)) return self @@ -268,7 +373,10 @@ class AMDProgram(HCQProgram): self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0 - super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size+additional_alloc_sz) + if dev.sqtt_enabled: self.libhash: tuple[int, int] = struct.unpack('0 bitmask for where to enable instruction tracing + self.cmd_id = 0 + AMDComputeQueue().start_trace(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self) + def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0): ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True) gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True) @@ -629,6 +755,24 @@ class AMDDevice(HCQCompiled): def 
on_device_hang(self): self.dev_iface.on_device_hang() + def _at_profile_finalize(self): + if self.sqtt_enabled: + wptrs_buf = self.allocator.alloc(round_up(len(self.sqtt_buffers), 0x1000), BufferSpec(cpu_access=True, nolru=True)) + wptrs = to_mv(wptrs_buf.va_addr, wptrs_buf.size) + AMDComputeQueue().stop_trace(len(self.sqtt_buffers), wptrs_buf).signal(self.timeline_signal, self.timeline_value).submit(self) + self.timeline_value += 1 + self.synchronize() + if DEBUG>=2: print('Saving SQTT in profile...') + for i,buf0 in enumerate(self.sqtt_buffers): + wptr = ((struct.unpack('=2: print(f'Se {i} blob size {wptr:#x}') + assert wptr >= 0 and wptr <= buf0.size, f"{wptr} > {buf0.size}, should never happen" + # When sqtt buffer overflows, wptr stops at the last dword + if wptr >= buf0.size-32: print(f"WARNING: SQTT BUFFER IS FULL (SE {i})! INCREASE SQTT BUFFER SIZE WITH SQTT_BUFFER_SIZE=X (in MB)") + self.allocator._copyout(sqtt_buf:=memoryview(bytearray(wptr)), buf0) + Compiled.profile_events += [ProfileSQTTEvent(self.device, i, bytes(sqtt_buf), bool((self.sqtt_itrace_se_mask >> i) & 0b1))] + super()._at_profile_finalize() + def finalize(self): self.synchronize() if hasattr(self.dev_iface, 'device_fini'): self.dev_iface.device_fini() diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 9b2134e3d6..42678f55b1 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -3,7 +3,7 @@ from typing import cast, Type, TypeVar, Generic, Any, ClassVar import contextlib, decimal, statistics, time, ctypes, array, os, fcntl from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up from tinygrad.renderer import Renderer -from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent +from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent, ProfileProgramEvent from tinygrad.ops import sym_infer, sint, Variable, UOp 
from tinygrad.runtime.autogen import libc @@ -290,8 +290,9 @@ class CLikeArgsState(HCQArgsState[ProgramType]): self.bind_sints_to_ptr(*vals, ptr=self.ptr + len(prefix or []) * 4 + len(bufs) * 8, fmt='I') class HCQProgram(Generic[DeviceType]): - def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int): + def __init__(self, args_state_t:Type[HCQArgsState], dev:DeviceType, name:str, kernargs_alloc_size:int, lib:bytes|None=None, base:int|None=None): self.args_state_t, self.dev, self.name, self.kernargs_alloc_size = args_state_t, dev, name, kernargs_alloc_size + if PROFILE: Compiled.profile_events += [ProfileProgramEvent(dev.device, name, lib, base)] def fill_kernargs(self, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=(), kernargs_ptr:int|None=None) -> HCQArgsState: """