diff --git a/extra/helpers.py b/extra/helpers.py
index 052ce1a132..5ed3f9cf35 100644
--- a/extra/helpers.py
+++ b/extra/helpers.py
@@ -6,3 +6,18 @@ class Timing(object):
   def __exit__(self, exc_type, exc_val, exc_tb):
     self.et = time.perf_counter_ns() - self.st
     if self.enabled: print(f"{self.prefix}{self.et*1e-6:.2f} ms"+(self.on_exit(self.et) if self.on_exit else ""))
+
+def enable_early_exec():
+  import subprocess, multiprocessing
+  qin, qout = multiprocessing.Queue(), multiprocessing.Queue()
+  def _early_exec_process(qin, qout):
+    while 1:
+      path, inp = qin.get()
+      qout.put(subprocess.check_output(path, input=inp))
+  p = multiprocessing.Process(target=_early_exec_process, args=(qin, qout))
+  p.daemon = True
+  p.start()
+  def early_exec(x):
+    qin.put(x)
+    return qout.get()
+  return early_exec
diff --git a/extra/rocm/sniffer/sniff.cc b/extra/rocm/sniffer/sniff.cc
index a594ecd456..392932956d 100644
--- a/extra/rocm/sniffer/sniff.cc
+++ b/extra/rocm/sniffer/sniff.cc
@@ -72,8 +72,14 @@ static void handler(int sig, siginfo_t *si, void *unused) {
       D("HSA_PACKET_TYPE_KERNEL_DISPATCH -- setup:%d workgroup[%d, %d, %d] grid[%d, %d, %d] kernel_object:0x%lx kernarg_address:%p\n", pkt->setup, pkt->workgroup_size_x, pkt->workgroup_size_y, pkt->workgroup_size_z, pkt->grid_size_x, pkt->grid_size_y, pkt->grid_size_z, pkt->kernel_object, pkt->kernarg_address);
       amd_kernel_code_t *code = (amd_kernel_code_t *)pkt->kernel_object;
       D("kernel_code_entry_byte_offset:%lx\n", code->kernel_code_entry_byte_offset);
-      hexdump((void*)(pkt->kernel_object + code->kernel_code_entry_byte_offset), 0x200);
-      //hexdump((void*)pkt->kernel_object, sizeof(amd_kernel_code_t));
+      uint32_t *kernel_code = (uint32_t*)(pkt->kernel_object + code->kernel_code_entry_byte_offset);
+      int code_len = 0;
+      while (kernel_code[code_len] != 0xbf9f0000 && kernel_code[code_len] != 0) code_len++;
+      hexdump(kernel_code, code_len*4);
+      /*FILE *f = fopen("/tmp/kernel_code", "wb");
+      fwrite(kernel_code, 4, code_len, f);
+      fclose(f);
+      system("python -c 'print(\" \".join([(\"0x%02X\"%x) for x in open(\"/tmp/kernel_code\", \"rb\").read()]))' | ../build/llvm-project/bin/llvm-mc --disassemble --arch=amdgcn --mcpu=gfx1100 --show-encoding");*/
     } else if ((pkt->header&0xFF) == HSA_PACKET_TYPE_BARRIER_AND) {
       D("HSA_PACKET_TYPE_BARRIER_AND\n");
     }
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index 4dfda730f2..9cb9aa76b1 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-import platform
+import platform, pathlib
 import numpy as np
 import pyopencl as cl  # type: ignore
 from typing import Optional, List
@@ -12,6 +12,11 @@ OSX = platform.system() == "Darwin"
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0   # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 FLOAT16 = getenv("FLOAT16", 0)
 
+# TODO: if you fork and exit the child process after creating anything with cl on AMD, it hangs on e.wait()
+if DEBUG >= 5:
+  from extra.helpers import enable_early_exec
+  early_exec = enable_early_exec()
+
 class _CL:
   def __init__(self):
     platforms: List[List[cl.Device]] = [y for y in ([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()] + [x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()]) if len(y)]
@@ -54,6 +59,9 @@ class CLProgram:
       if 'Adreno' in CL.cl_ctx.devices[0].name:
         from disassemblers.adreno import disasm
         disasm(self.binary())
+      elif 'gfx1100' in CL.cl_ctx.devices[0].name:
+        asm = early_exec(([pathlib.Path(__file__).parent.parent.parent / "extra/rocm/build/llvm-project/bin/llvm-objdump", '-d', '-'], self.binary()))
+        print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
       else:
         # print the PTX for NVIDIA. TODO: probably broken for everything else
         print(self.binary().decode('utf-8'))