diff --git a/test/amd/test_sqttmap.py b/test/amd/test_sqttmap.py
index fbfd9c3933..2fa046d96b 100644
--- a/test/amd/test_sqttmap.py
+++ b/test/amd/test_sqttmap.py
@@ -58,9 +58,8 @@ class TestSQTTMapBase(unittest.TestCase):
         data = pickle.load(f)
       sqtt_events = [e for e in data if type(e).__name__ == "ProfileSQTTEvent"]
       kern_events = {e.tag:e for e in data if type(e).__name__ == "ProfileProgramEvent"}
-      dev = next((e for e in data if type(e).__name__ == "ProfileDeviceEvent" and e.device.startswith("AMD")), None)
-      if sqtt_events and kern_events and dev:
-        cls.examples[pkl_path.stem] = (sqtt_events, kern_events, dev.props["gfx_target_version"])
+      if sqtt_events and kern_events:
+        cls.examples[pkl_path.stem] = (sqtt_events, kern_events, cls.target)
 
   def test_rocprof_inst_traces_match(self):
     for name, (events, kern_events, target) in self.examples.items():
diff --git a/test/testextra/test_cfg_viz.py b/test/testextra/test_cfg_viz.py
index a2f3511c02..bf56f17a28 100644
--- a/test/testextra/test_cfg_viz.py
+++ b/test/testextra/test_cfg_viz.py
@@ -28,8 +28,8 @@ def run_asm(name:str, k:Kernel):
 @unittest.skipUnless(Device.DEFAULT == "AMD", "only on AMD")
 class TestCfg(unittest.TestCase):
   def setUp(self):
-    arch = Device["AMD"].arch
-    if not any(arch.startswith(a) for a in {"gfx11", "gfx12"}):
-      self.skipTest(f"tests written for RDNA, got arch {arch}")
+    self.arch = Device["AMD"].arch
+    if not any(self.arch.startswith(a) for a in {"gfx11", "gfx12"}):
+      self.skipTest(f"tests written for RDNA, got arch {self.arch}")
 
   def test_simple(self):
@@ -58,7 +58,7 @@ class TestCfg(unittest.TestCase):
     k.emit(s_endpgm())
     k.emit(s_code_end())
     ei = run_asm("diamond", k)
-    cfg = amdgpu_cfg(ei.prg.p.lib, Device[Device.DEFAULT].device_props()["gfx_target_version"])["data"]
+    cfg = amdgpu_cfg(ei.prg.p.lib, self.arch)["data"]
     self.assertEqual(len(cfg["blocks"]), 5)
     edge_count = sum(len(v) for v in cfg["paths"].values())
     self.assertEqual(edge_count, 5)
diff --git a/tinygrad/renderer/amd/sqtt.py b/tinygrad/renderer/amd/sqtt.py
index f037e75e97..68dab2659b 100644
--- a/tinygrad/renderer/amd/sqtt.py
+++ b/tinygrad/renderer/amd/sqtt.py
@@ -575,7 +575,7 @@ class InstructionInfo:
   wave: int
   inst: Inst
 
-def map_insts(data:bytes, lib:bytes, target:int) -> Iterator[tuple[PacketType, InstructionInfo|None]]:
+def map_insts(data:bytes, lib:bytes, target:str) -> Iterator[tuple[PacketType, InstructionInfo|None]]:
   """maps SQTT packets to instructions, yields (packet, instruction_info or None)"""
   # map pcs to insts
   from tinygrad.viz.serve import amd_decode
diff --git a/tinygrad/viz/serve.py b/tinygrad/viz/serve.py
index 3c2a432dcd..725b6733ba 100755
--- a/tinygrad/viz/serve.py
+++ b/tinygrad/viz/serve.py
@@ -173,7 +173,7 @@ def rel_ts(ts:int|Decimal, start_ts:int) -> int:
 device_ts_diffs:dict[str, Decimal] = {}
 def cpu_ts_diff(device:str) -> Decimal: return device_ts_diffs.get(device, Decimal(0))
 
-amdgpu_targets:dict[str, int] = {}
+amdgpu_targets:dict[str, str] = {}
 
 DevEvent = ProfileRangeEvent|ProfileGraphEntry|ProfilePointEvent
 def flatten_events(profile:list[ProfileEvent]) -> Generator[tuple[Decimal, Decimal, DevEvent], None, None]:
@@ -314,7 +314,7 @@ def load_counters(profile:list[ProfileEvent]) -> None:
       steps.append(create_step("SQTT", ("/prg-sqtt", len(ctxs), len(steps)), ((k, tag), sqtt, prg_events[k])))
     ctxs.append({"name":f"Exec {name}"+(f" n{run_number[k]}" if run_number[k] > 1 else ""), "steps":steps})
 
-def sqtt_timeline(data:bytes, lib:bytes, target:int) -> list[ProfileEvent]:
+def sqtt_timeline(data:bytes, lib:bytes, target:str) -> list[ProfileEvent]:
   from tinygrad.renderer.amd.sqtt import map_insts, InstructionInfo, PacketType, INST, InstOp, VALUINST, IMMEDIATE, IMMEDIATE_MASK, VMEMEXEC, ALUEXEC
   ret:list[ProfileEvent] = []
   rows:dict[str, None] = {}
@@ -389,7 +389,7 @@ def get_profile(profile:list[ProfileEvent], sort_fn:Callable[[str], Any]=device_
       device_ts_diffs[ev.device] = ev.tdiff
       if (d:=ev.device.split(":")[0]) == "AMD":
         device_decoders[d] = load_counters
-        amdgpu_targets[d] = unwrap(ev.props)["gfx_target_version"]
+        amdgpu_targets[d] = f"gfx{unwrap(ev.props)['gfx_target_version']//1000}"
   # load device specific counters
   for fxn in device_decoders.values(): fxn(profile)
   # map events per device
@@ -436,7 +436,7 @@ def amd_readelf(lib:bytes) -> list[dict]:
   return [{"label":f"{resource} Alloc", "value":val} for resource,val in [("VGPR", (vgpr_gran+1)*8-7), ("LDS",kd.group_segment_fixed_size),
                                                                           ("Scratch", kd.private_segment_fixed_size)] if val > 0]
 
-def amd_decode(lib:bytes, target:int) -> dict[int, Any]: # Any is the Inst class from tinygrad.renderer.amd.dsl
+def amd_decode(lib:bytes, target:str) -> dict[int, Any]: # Any is the Inst class from tinygrad.renderer.amd.dsl
   from tinygrad.runtime.support.elf import elf_loader
   from tinygrad.renderer.amd import detect_format
   from tinygrad.renderer.amd.dsl import Inst
@@ -444,7 +444,7 @@ def amd_decode(lib:bytes, target:int) -> dict[int, Any]: # Any is the Inst class
   text = next((sh for sh in sections if sh.name == ".text"), None)
   assert text is not None, "no .text section found in ELF"
   off, buf = text.header.sh_addr, text.content
-  arch = {11:"rdna3", 12:"rdna4"}.get(target//10000, "cdna")
+  arch = "rdna3" if target.startswith("gfx11") else "rdna4" if target.startswith("gfx12") else "cdna"
   addr_table:dict[int, Inst] = {}
   offset = 0
   while offset < len(buf):
@@ -462,7 +462,7 @@ def parse_branch(inst) -> int|None:
   return None
 
 COND_TAKEN, COND_NOT_TAKEN, UNCOND = range(3)
-def amdgpu_cfg(lib:bytes, target:int) -> dict:
+def amdgpu_cfg(lib:bytes, target:str) -> dict:
   # decode
   pc_table = amd_decode(lib, target)
   # get leaders
@@ -477,9 +477,7 @@ def amdgpu_cfg(lib:bytes, target:int) -> dict:
   disasm = {pc:str(inst) for pc,inst in pc_table.items()}
   asm_width = max(len(asm) for asm in disasm.values())
   for pc, inst in pc_table.items():
-    # skip instructions only used for padding
-    if (asm:=disasm[pc]) == "s_code_end": continue
-    lines.append(f"  {asm:<{asm_width}}  // {pc:012X}")
+    lines.append(f"  {disasm[pc]:<{asm_width}}  // {pc:012X}")
     if pc in leaders:
       paths[curr:=pc] = {}
       blocks[pc] = []