# tinygrad/extra/sqtt/active_sqtt_parse.py

import os
os.environ["PYTHONPATH"] = "."
os.environ["SQTT"] = "1"
if "DEV" not in os.environ: os.environ["DEV"] = "AMD"
os.environ["PROFILE"] = "1"
os.environ["AMD_LLVM"] = "0"
from dataclasses import replace
import atexit, contextlib
from tinygrad import Tensor
from tinygrad.helpers import system, OSX
from tinygrad.runtime.ops_amd import AMDProgram
from extra.sqtt.roc import decode, WaveExec, ProfileSQTTEvent
from tinygrad.device import Device, ProfileDeviceEvent
from extra.sqtt.attempt_sqtt_parse import parse_sqtt_print_packets
dev = Device["AMD"]
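
# context manager: clears any existing profile events, runs the body, then prints every
# collected ProfileSQTTEvent (blob stripped for readability) and parses the raw SQTT
# packets for shader engine 0 only.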
@contextlib.contextmanager
def save_sqtt():
  # clear the old traces
  dev.profile_events.clear()
  sqtt:dict[str, list[WaveExec]] = {}
  yield sqtt
  events = dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())]
  #rctx = decode(events)
  #assert len(rctx.inst_execs) > 0, "empty sqtt output"
  #sqtt.update(rctx.inst_execs)
  for e in events:
    if isinstance(e, ProfileSQTTEvent):
      print(replace(e, blob=b''))
      if e.se == 0:
        parse_sqtt_print_packets(e.blob)
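
# minimal hand-written wave32 kernel scaffold: INSTRUCTION is swapped for the test
# instruction sequence by run_asm below, and the .amdgpu_metadata block declares a single
# global buffer argument so the kernarg segment pointer lands in s[0:1].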
template = """.text
.globl matmul
.p2align 8
.type matmul,@function
matmul:
INSTRUCTION
.rodata
.p2align 6
.amdhsa_kernel matmul
  .amdhsa_user_sgpr_kernarg_segment_ptr 1
  .amdhsa_next_free_vgpr .amdgcn.next_free_vgpr
  .amdhsa_next_free_sgpr .amdgcn.next_free_sgpr
  .amdhsa_wavefront_size32 1
.end_amdhsa_kernel
.amdgpu_metadata
---
amdhsa.version:
  - 1
  - 0
amdhsa.kernels:
  - .name: matmul
    .symbol: matmul.kd
    .group_segment_fixed_size: 0
    .private_segment_fixed_size: 0
    .wavefront_size: 32
    .sgpr_count: 8
    .vgpr_count: 8
    .max_flat_workgroup_size: 1024
    .kernarg_segment_align: 8
    .kernarg_segment_size: 8
    .args:
      - .address_space: global
        .name: a
        .offset: 0
        .size: 8
        .type_name: 'float*'
        .value_kind: global_buffer
...
.end_amdgpu_metadata
"""
def run_asm(src, num_workgroups=1, num_waves=1):
  WAVE_SIZE = 32
  t = Tensor.empty(0x1000).realize()
  buf = t.uop.buffer.ensure_allocated()
  lib = dev.compiler.compile(template.replace("INSTRUCTION", '\n'.join(src)))
  dev.compiler.disassemble(lib)
  fxn = AMDProgram(dev, "matmul", lib)
  fxn(buf._buf, global_size=(num_workgroups,1,1), local_size=(WAVE_SIZE*num_waves,1,1), wait=True)
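
# each block below is a separate experiment; the exit(0) calls mean only the first one
# actually runs, so comment them out to reach the later ones.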
if __name__ == "__main__":
  with save_sqtt() as sqtt:
    run_asm([
      "s_nop 100",
      "s_nop 100",
      "s_load_b64 s[0:1], s[0:1], null",
      "s_waitcnt lgkmcnt(0)",
      "s_nop 100",
      "s_nop 100",
      "s_add_i32 s2, s2, 10",
      "s_add_i32 s2, s2, 10",
      "s_nop 100",
      "s_nop 100",
      "v_mov_b32_e32 v0, 0",
      "v_mov_b32_e32 v0, 0",
      "s_nop 100",
      "s_nop 100",
      "v_dual_fmac_f32 v2, v48, v24 :: v_dual_fmac_f32 v9, v37, v51",
      "v_dual_fmac_f32 v2, v48, v24 :: v_dual_fmac_f32 v9, v37, v51",
      "s_nop 100",
      "s_nop 100",
      "global_load_b128 v[2:5], v0, s[0:1]",
      "global_load_b128 v[2:5], v0, s[0:1]",
      "s_nop 100",
      "s_nop 100",
      "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)",
      "s_endpgm",
    ], num_workgroups=1, num_waves=1)
  exit(0)

  with save_sqtt() as sqtt:
    #(Tensor.empty(16,16) @ Tensor.empty(16,16)).elu().realize()
    #Tensor.empty(1, 64).sum(axis=1).realize()
    Tensor.empty(1).log2().realize()
  exit(0)

  with save_sqtt() as sqtt:
    # what's in v0?
    run_asm([
      "v_mov_b32_e32 v0, 0",
      "v_mov_b32_e32 v1, 0",
      "s_clause 0x1",
      "s_load_b64 s[0:1], s[0:1], null",
      "s_waitcnt lgkmcnt(0)",
    ]+[
      "global_load_b32 v1, v0, s[0:1]",
    ]*10+[
      "global_load_b32 v10, v1, s[0:1]",
      "s_waitcnt vmcnt(0)",
      #"v_rcp_f32 v1, v0"
      #"v_add_f32_e32 v1 v0 v0",
      #"v_add_f32_e32 v5 v4 v4",
      #"v_add_f32_e32 v7 v6 v6",
      #"v_add_f32_e32 v1 v0 v0",
      #"v_add_f32_e32 v2 v1 v1",
      #"s_nop 1"
    ]*5+[
      "v_add_f32_e32 v3 v2 v2",
    ]*5+[
      "v_mul_f32_e32 v3 v2 v2",
    ]*7)