viz: simplify amdgpu cfg (#14326)

* viz: replace llvm disasm with our disasm
* it starts with more code
* then it becomes less
* simpler, cdna disassembles with decimal simm16
* s_branch is upper case, add test
* simm16s and others
2026-04-29 03:00:14 -04:00 · 2026-01-25 01:21:45 -05:00
parent 647e527a7e
commit bf2d9d138f
3 changed files with 30 additions and 47 deletions
--- a/extra/assembly/amd/disasm.py
+++ b/extra/assembly/amd/disasm.py
@@ -258,7 +258,7 @@ def _disasm_sopp(inst: SOPP) -> str:
    dep = lambda v: deps[v-1] if 0 < v <= len(deps) else str(v)
    p = [f"instid0({dep(id0)})" if id0 else "", f"instskip({skips[skip]})" if skip else "", f"instid1({dep(id1)})" if id1 else ""]
    return f"s_delay_alu {' | '.join(x for x in p if x) or '0'}"
-  if name.startswith(('s_cbranch', 's_branch')): return f"{name} 0x{inst.simm16:x}"
+  if name.startswith(('s_cbranch', 's_branch')): return f"{name} {inst.simm16}"
  return f"{name} 0x{inst.simm16:x}"

 def _disasm_smem(inst: SMEM) -> str:
--- a/test/testextra/test_cfg_viz.py
+++ b/test/testextra/test_cfg_viz.py
@@ -111,6 +111,8 @@ class TestCfg(unittest.TestCase):
    _, lib = assemble("diamond", insts, Device[Device.DEFAULT].compiler)
    cfg = amdgpu_cfg(lib, Device[Device.DEFAULT].device_props()["gfx_target_version"])["data"]
    self.assertEqual(len(cfg["blocks"]), 5)
+    edge_count = sum(len(v) for v in cfg["paths"].values())
+    self.assertEqual(edge_count, 5)
    references:dict[str, list[str]] = {}
    for pc, tokens in cfg["pc_tokens"].items():
      for t in tokens:
--- a/tinygrad/viz/serve.py
+++ b/tinygrad/viz/serve.py
@@ -345,7 +345,8 @@ def unpack_sqtt(key:tuple[str, int], data:list, p:ProfileProgramEvent) -> tuple[
  # * init decoder
  from extra.sqtt.roc import decode
  base = unwrap(p.base)
-  disasm = {addr+base:inst_disasm for addr,inst_disasm in amd_disasm(device_props[p.device]["gfx_target_version"], unwrap(p.lib)).items()}
+  addr_table = amd_decode(device_props[p.device]["gfx_target_version"], unwrap(p.lib))
+  disasm:dict[int, tuple[str, int]] = {addr+base:(inst.disasm(), inst.size()) for addr, inst in addr_table.items()}
  rctx = decode(data, {p.name:disasm})
  cu_events:dict[str, list[ProfileEvent]] = {}
  # * INST waves
@@ -431,74 +432,49 @@ def amd_readelf(lib:bytes) -> list[dict]:
          ".group_segment_fixed_size":"LDS size", ".private_segment_fixed_size":"Scratch size"}
  return [{"label":label, "value":v} for k,label in keys.items() if (v:=notes["amdhsa.kernels"][0][k]) > 0]

-def amd_disasm(target:int, lib:bytes) -> dict[int, tuple[str, int]]:
+def amd_decode(target:int, lib:bytes) -> dict[int, Any]: # Any is the Inst class from extra.assembly.amd.dsl
  from tinygrad.runtime.support.elf import elf_loader
  from extra.assembly.amd.decode import detect_format
+  from extra.assembly.amd.dsl import Inst
  image, sections, _ = elf_loader(lib)
  text = next((sh for sh in sections if sh.name == ".text"), None)
  assert text is not None, "no .text section found in ELF"
  off, buf = text.header.sh_addr, text.content
  arch = {11:"rdna3", 12:"rdna4"}.get(target//10000, "cdna")
-  addr_table:dict[int, tuple[str, int]] = {}
+  addr_table:dict[int, Inst] = {}
  offset = 0
  while offset < len(buf):
    remaining = buf[offset:]
    fmt = detect_format(remaining, arch)
    decoded = fmt.from_bytes(remaining)
-    disasm = decoded.disasm()
-    # note: rocprof trace decoder assumes simm16 is a decimal integer, our disasm uses hex
-    # keep the decimal int for backwards compatibility, remove once there's no rocprof decoder
-    if "branch" in disasm: disasm = f"{decoded.op_name.lower()} {decoded.simm16}"
-    addr_table[off+offset] = (disasm, decoded.size())
+    addr_table[off+offset] = decoded
    offset += decoded.size()
  return addr_table

-SOPP_INSTS = {"s_branch", "s_cbranch_scc0", "s_cbranch_scc1", "s_cbranch_vccz", "s_cbranch_vccnz", "s_cbranch_execz", "s_cbranch_execnz"}
-def parse_branch(asm:str) -> int|None:
-  inst, *operands = asm.split(" ")
-  if inst in SOPP_INSTS:
-    x = int(operands[0]) & 0xffff
+def parse_branch(inst) -> int|None:
+  if "branch" in getattr(inst, "op_name", "").lower():
+    x = inst.simm16 & 0xffff
    return (x - 0x10000 if x & 0x8000 else x)*4
  return None

-def _op2dsl(op: str) -> str:
-  """Convert LLVM asm operand (s0, s[0:1], v0) to DSL format (s[0], s[0:1], v[0])."""
-  import re
-  op = op.strip()
-  lo = op.lower()
-  SPEC_DSL = {'vcc_lo': 'VCC_LO', 'vcc_hi': 'VCC_HI', 'vcc': 'VCC', 'exec_lo': 'EXEC_LO', 'exec_hi': 'EXEC_HI', 'exec': 'EXEC',
-              'scc': 'SCC', 'm0': 'M0', 'null': 'NULL', 'off': 'OFF'}
-  if lo in SPEC_DSL: return SPEC_DSL[lo]
-  rp = {'s': 's', 'v': 'v', 't': 'ttmp', 'ttmp': 'ttmp'}
-  if m := re.match(r'^([svt](?:tmp)?)\[(\d+):(\d+)\]$', lo): return f"{rp[m.group(1)]}[{m.group(2)}:{m.group(3)}]"
-  if m := re.match(r'^([svt](?:tmp)?)(\d+)$', lo): return f"{rp[m.group(1)]}[{m.group(2)}]"
-  return op
-
-def amdgpu_tokenize(st:str) -> list[str]:
-  try:
-    from extra.assembly.amd.dsl import s, v, Reg, VCC_LO, VCC_HI, VCC, EXEC_LO, EXEC_HI, EXEC, SCC, M0, NULL, OFF
-    dsl = eval(_op2dsl(st), {'s':s, 'v':v, 'VCC_LO':VCC_LO, 'VCC_HI':VCC_HI, 'VCC':VCC, 'EXEC_LO':EXEC_LO, 'EXEC_HI':EXEC_HI, 'EXEC':EXEC,
-                             'SCC':SCC, 'M0':M0, 'NULL':NULL, 'OFF':OFF})
-    return [f"{type(dsl).__name__[0].lower()}{dsl.offset + i}" for i in range(dsl.sz)] if isinstance(dsl, Reg) else [st]
-  except (ImportError, NameError, SyntaxError, TypeError): return []
-
 COND_TAKEN, COND_NOT_TAKEN, UNCOND = range(3)
 def amdgpu_cfg(lib:bytes, target:int) -> dict:
-  # disassemble
-  pc_table = amd_disasm(target, lib)
+  # decode
+  pc_table = amd_decode(target, lib)
  # get leaders
  leaders:set[int] = {next(iter(pc_table))}
-  for pc, (asm, sz) in pc_table.items():
-    if (offset:=parse_branch(asm)) is not None: leaders.update((pc+sz+offset, pc+sz))
+  for pc, inst in pc_table.items():
+    if (offset:=parse_branch(inst)) is not None: leaders.update((pc+inst.size()+offset, pc+inst.size()))
  # build the cfg
  curr:int|None = None
  blocks:dict[int, list[int]] = {}
  paths:dict[int, dict[int, int]] = {}
  lines:list[str] = []
-  asm_width = max(len(asm) for asm, _ in pc_table.values())
-  for pc, (asm, sz) in pc_table.items():
+  disasm = {pc:inst.disasm() for pc,inst in pc_table.items()}
+  asm_width = max(len(asm) for asm in disasm.values())
+  for pc, inst in pc_table.items():
    # skip instructions only used for padding
-    if asm == "s_code_end": continue
+    if (asm:=disasm[pc]) == "s_code_end": continue
    lines.append(f"  {asm:<{asm_width}}  // {pc:012X}")
    if pc in leaders:
      paths[curr:=pc] = {}
@@ -506,14 +482,19 @@ def amdgpu_cfg(lib:bytes, target:int) -> dict:
    else: assert curr is not None, f"no basic block found for {pc}"
    blocks[curr].append(pc)
    # otherwise a basic block can have exactly one or two paths
-    nx = pc+sz
-    if (offset:=parse_branch(asm)) is not None:
-      if asm.startswith("s_branch"): paths[curr][nx+offset] = UNCOND
+    nx = pc+inst.size()
+    if (offset:=parse_branch(inst)) is not None:
+      if inst.op_name == "S_BRANCH": paths[curr][nx+offset] = UNCOND
      else: paths[curr].update([(nx+offset, COND_TAKEN), (nx, COND_NOT_TAKEN)])
    elif nx in leaders: paths[curr][nx] = UNCOND
  pc_tokens:dict[int, list[dict]] = {}
-  for pc, (text, _) in pc_table.items():
-    pc_tokens[pc] = [{"st":s, "keys":amdgpu_tokenize(s) if i>0 else [s], "kind":int(i>0)} for i,s in enumerate(text.replace(",", " , ").split(" "))]
+  from extra.assembly.amd.dsl import Reg
+  for pc, inst in pc_table.items():
+    pc_tokens[pc] = tokens = []
+    for name, field in inst._fields:
+      if isinstance(val:=getattr(inst, name), Reg): tokens.append({"st":val.fmt(), "keys":[f"r{val.offset+i}" for i in range(val.sz)], "kind":1})
+      elif name in {"op","opx","opy"}: tokens.append({"st":(op_name:=val.name.lower()), "keys":[op_name], "kind":0})
+      elif name != "encoding" and val != field.default: tokens.append({"st":(s:=repr(val)), "keys":[s], "kind":1})
  return {"data":{"blocks":blocks, "paths":paths, "pc_tokens":pc_tokens}, "src":"\n".join(lines)}

 # ** Main render function to get the complete details about a trace event