diff --git a/extra/gemm/metal_conv.py b/extra/gemm/metal_conv.py
index 0750650325..694b235864 100644
--- a/extra/gemm/metal_conv.py
+++ b/extra/gemm/metal_conv.py
@@ -1,5 +1,5 @@
import os
-os.environ["METAL"] = "1"
+#os.environ["METAL"] = "1"
import numpy as np
BS = 64
@@ -7,6 +7,7 @@ CIN = 256
COUT = 256
HW = 32
K = 3
+PADDING = 0
# TODO: this is doing some trick, since with CIN=256 COUT=256 it's over 10.4 TFLOPS.
# are winograd convs less flops? it appears so if they are batched
# https://www.cse.ust.hk/~weiwa/papers/yan-ppopp20.pdf
@@ -15,31 +16,34 @@ FLOPS = BS*K*K*CIN*HW*HW*COUT*2
nb = np.random.default_rng().standard_normal(size=(BS,CIN,HW,HW), dtype=np.float32)
nc = np.random.default_rng().standard_normal(size=(COUT,CIN,K,K), dtype=np.float32)
-import time, torch, torch.mps
-b = torch.from_numpy(nb).to('mps')
-c = torch.from_numpy(nc).to('mps')
+try:
+ import time, torch, torch.mps
+ b = torch.from_numpy(nb).to('mps')
+ c = torch.from_numpy(nc).to('mps')
-def torch_prog(b, c):
- st = time.perf_counter()
- a = torch.nn.functional.conv2d(b, c, padding=1)
- torch.mps.synchronize()
- return time.perf_counter() - st
-tm = min([torch_prog(b, c) for _ in range(20)])
-print(f"{tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS conv in torch")
+ def torch_prog(b, c):
+ st = time.perf_counter()
+ a = torch.nn.functional.conv2d(b, c, padding=PADDING)
+ torch.mps.synchronize()
+ return time.perf_counter() - st
+ tm = min([torch_prog(b, c) for _ in range(20)])
+ print(f"{tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS conv in torch")
+except RuntimeError:
+ print("no torch metal conv")
from tinygrad.tensor import Tensor
from tinygrad.jit import TinyJit
-from tinygrad.runtime.ops_metal import METAL
+from tinygrad.lazy import Device
b = Tensor(nb)
c = Tensor(nc)
# TODO: slowness without the JIT I suspect comes from a lack of a caching allocator
@TinyJit
def tiny_jit(b, c):
- return b.conv2d(c, padding=1).realize()
+ return b.conv2d(c, padding=PADDING).realize()
def tiny_prog(b, c):
st = time.perf_counter()
a = tiny_jit(b, c)
- METAL.synchronize()
+ Device[a.device].synchronize()
return time.perf_counter() - st
tm = min([tiny_prog(b, c) for _ in range(5)])
print(f"{tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS conv in tinygrad")
diff --git a/extra/gemm/simple_matmul.py b/extra/gemm/simple_matmul.py
new file mode 100644
index 0000000000..998c37aa9d
--- /dev/null
+++ b/extra/gemm/simple_matmul.py
@@ -0,0 +1,5 @@
+from tinygrad.tensor import Tensor
+N = 1024
+a, b = Tensor.randn(N, N), Tensor.randn(N, N)
+c = (a.reshape(N, 1, N) * b.permute(1,0).reshape(1, N, N)).sum(axis=2)
+print((c.numpy() - (a.numpy() @ b.numpy())).mean())
diff --git a/extra/rocm/kernel_crashes/dump b/extra/rocm/kernel_crashes/dump
new file mode 100644
index 0000000000..cc09d4decc
--- /dev/null
+++ b/extra/rocm/kernel_crashes/dump
@@ -0,0 +1,41 @@
+# run two "rocm-bandwidth-test" in a loop
+# amdgpu-6.0.5-1581431.20.04
+# fixed in kernel 6.2.14
+
+[ 72.153646] RIP: 0010:pm_send_runlist+0x4a/0x630 [amdgpu]
+[ 72.153815] Code: 30 65 48 8b 04 25 28 00 00 00 48 89 45 d0 31 c0 80 fb 01 0f 87 aa 9d 49 00 83 e3 01 0f 85 1c 05 00 00 49 8b 3f b8 01 00 00 00 <48> 8b 97 30 01 00 00 44 8b b7 6c 01 00 00 8b 9f 70 01 00 00 8b 8a
+[ 72.153900] RSP: 0018:ffffb48445c03c30 EFLAGS: 00010246
+[ 72.153928] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000
+[ 72.153962] RDX: 000000000000007b RSI: ffff9395e1562558 RDI: 0000000000000000
+[ 72.153996] RBP: ffffb48445c03cb8 R08: 0000000000000000 R09: 0000000000000001
+[ 72.154030] R10: ffff9395c900d840 R11: 0000000000000000 R12: 0000000000000000
+[ 72.154065] R13: ffff9395c9e00400 R14: 0000000000000001 R15: ffff9395e15624e0
+[ 72.154099] FS: 00007f345c6463c0(0000) GS:ffff93a4aee80000(0000) knlGS:0000000000000000
+[ 72.154137] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 72.154165] CR2: 0000000000000130 CR3: 0000000112840000 CR4: 0000000000750ee0
+[ 72.154201] PKRU: 55555554
+[ 72.154215] Call Trace:
+[ 72.154230]
+[ 72.154244] map_queues_cpsch+0x75/0xc0 [amdgpu]
+[ 72.154365] debug_map_and_unlock+0x51/0x90 [amdgpu]
+[ 72.154480] debug_refresh_runlist+0x1f/0x30 [amdgpu]
+[ 72.154591] kfd_dbg_runtime_disable+0x13c/0x240 [amdgpu]
+[ 72.154705] kfd_ioctl_dbg_set_debug_trap+0x69d/0x8b0 [amdgpu]
+[ 72.154820] kfd_ioctl+0x24a/0x5b0 [amdgpu]
+[ 72.154925] ? kfd_ioctl_create_queue+0x770/0x770 [amdgpu]
+[ 72.155035] ? syscall_exit_to_user_mode+0x27/0x50
+[ 72.155061] ? exit_to_user_mode_prepare+0x3d/0x1c0
+[ 72.155088] __x64_sys_ioctl+0x95/0xd0
+[ 72.155109] do_syscall_64+0x5c/0xc0
+[ 72.155128] ? syscall_exit_to_user_mode+0x27/0x50
+[ 72.155151] ? do_syscall_64+0x69/0xc0
+[ 72.155172] entry_SYSCALL_64_after_hwframe+0x61/0xcb
+[ 72.155198] RIP: 0033:0x7f345c7f63ab
+[ 72.155218] Code: 0f 1e fa 48 8b 05 e5 7a 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b5 7a 0d 00 f7 d8 64 89 01 48
+[ 72.155301] RSP: 002b:00007ffc97cc89f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+[ 72.155339] RAX: ffffffffffffffda RBX: 00007ffc97cc8a30 RCX: 00007f345c7f63ab
+[ 72.155375] RDX: 00007ffc97cc8a30 RSI: 00000000c0284b82 RDI: 0000000000000003
+[ 72.155411] RBP: 00000000c0284b82 R08: 0000000000000000 R09: 0000000000000000
+[ 72.155447] R10: 00007f345cd4ddb0 R11: 0000000000000246 R12: 00007ffc97cc8a30
+[ 72.155481] R13: 0000000000000003 R14: 00007ffc97cc8d20 R15: 0000000000000000
+[ 72.155517]
diff --git a/extra/rocm/kernel_crashes/dump2 b/extra/rocm/kernel_crashes/dump2
new file mode 100644
index 0000000000..8cd24ccfce
--- /dev/null
+++ b/extra/rocm/kernel_crashes/dump2
@@ -0,0 +1,41 @@
+# run two tinygrad matrix example in a loop
+# amdgpu-6.0.5-1581431.20.04
+# NOT fixed in kernel 6.2.14
+
+[ 553.016624] gmc_v11_0_process_interrupt: 30 callbacks suppressed
+[ 553.016631] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:9 pasid:32770, for process python3 pid 10001 thread python3 pid 10001)
+[ 553.016790] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f0000000000 from client 10
+[ 553.016892] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00901A30
+[ 553.016974] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd)
+[ 553.017051] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x0
+[ 553.017111] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0
+[ 553.017173] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3
+[ 553.017238] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0
+[ 553.017300] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0
+[ 553.123921] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=2
+[ 553.124153] amdgpu: failed to add hardware queue to MES, doorbell=0x1a16
+[ 553.124195] amdgpu: MES might be in unrecoverable state, issue a GPU reset
+[ 553.124237] amdgpu: Failed to restore queue 2
+[ 553.124266] amdgpu: Failed to restore process queues
+[ 553.124270] amdgpu: Failed to evict queue 3
+[ 553.124297] amdgpu: amdgpu_amdkfd_restore_userptr_worker: Failed to resume KFD
+
+# alternative crash in kernel 6.2.14
+
+[ 151.097948] gmc_v11_0_process_interrupt: 30 callbacks suppressed
+[ 151.097953] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:8 pasid:32771, for process python3 pid 7525 thread python3 pid 7525)
+[ 151.097993] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f0000000000 from client 10
+[ 151.098008] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00801A30
+[ 151.098020] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd)
+[ 151.098032] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x0
+[ 151.098042] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0
+[ 151.098052] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3
+[ 151.098062] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0
+[ 151.098071] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0
+[ 151.209517] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=2
+[ 151.209724] amdgpu: failed to add hardware queue to MES, doorbell=0x1002
+[ 151.209734] amdgpu: MES might be in unrecoverable state, issue a GPU reset
+[ 151.209743] amdgpu: Failed to restore queue 1
+[ 151.209751] amdgpu: Failed to restore process queues
+[ 151.209759] amdgpu: amdgpu_amdkfd_restore_userptr_worker: Failed to resume KFD
+[ 151.209858] amdgpu 0000:0b:00.0: amdgpu: GPU reset begin!
diff --git a/extra/rocm/kernel_crashes/dump3 b/extra/rocm/kernel_crashes/dump3
new file mode 100644
index 0000000000..ea7ec1d76b
--- /dev/null
+++ b/extra/rocm/kernel_crashes/dump3
@@ -0,0 +1,20 @@
+# two tinygrad + two bandwidth test
+# RDNA2, driver 6.0.5
+# recovered from this!
+
+[ 136.971209] gmc_v10_0_process_interrupt: 39 callbacks suppressed
+[ 136.971218] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:11 pasid:32773, for process rocm-bandwidth- pid 20281 thread rocm-bandwidth- pid 20281)
+[ 136.971228] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f5c2b800000 from client 0x1b (UTCL2)
+[ 136.971232] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00B01A31
+[ 136.971233] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd)
+[ 136.971235] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x1
+[ 136.971236] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0
+[ 136.971236] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3
+[ 136.971237] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0
+[ 136.971238] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0
+...
+[ 136.993979] amdgpu 0000:0b:00.0: amdgpu: IH ring buffer overflow (0x000BE5A0, 0x0003C480, 0x0003E5C0)
+[ 138.209072] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00004000 flags=0x0000]
+[ 138.209078] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00004d80 flags=0x0000]
+[ 138.209081] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00005000 flags=0x0000]
+[ 138.209084] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00005d80 flags=0x0000]
diff --git a/extra/rocm/kernel_crashes/dump4 b/extra/rocm/kernel_crashes/dump4
new file mode 100644
index 0000000000..8c1fa909ac
--- /dev/null
+++ b/extra/rocm/kernel_crashes/dump4
@@ -0,0 +1,33 @@
+# ROCK-Kernel-Driver 0b579de9622f5c93021dcb7927d13926313740a2
+# non fatal "crash"
+
+[ 127.418045] ------------[ cut here ]------------
+[ 127.418046] User pages unexpectedly invalid
+[ 127.418056] WARNING: CPU: 16 PID: 260 at drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:3000 amdgpu_amdkfd_restore_userptr_worker+0x4d9/0x500 [amdgpu]
+[ 127.418235] Modules linked in: rfcomm cmac algif_hash algif_skcipher af_alg bnep nls_iso8859_1 iwlmvm mac80211 intel_rapl_msr intel_rapl_common edac_mce_amd snd_hda_codec_realtek snd_hda_codec_generic snd_hda_codec_hdmi kvm_amd binfmt_misc snd_hda_intel snd_intel_dspcfg kvm libarc4 snd_intel_sdw_acpi snd_hda_codec btusb iwlwifi btrtl snd_hda_core btbcm btintel irqbypass btmtk snd_hwdep crct10dif_pclmul snd_pcm polyval_clmulni bluetooth snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq polyval_generic cfg80211 ghash_clmulni_intel eeepc_wmi snd_seq_device snd_timer aesni_intel asus_wmi ecdh_generic snd platform_profile crypto_simd ledtrig_audio cryptd ecc ccp soundcore sparse_keymap rapl k10temp wmi_bmof mac_hid sch_fq_codel msr parport_pc ppdev lp parport ramoops pstore_blk efi_pstore reed_solomon pstore_zone ip_tables x_tables autofs4 amdgpu hid_generic usbhid hid i2c_algo_bit drm_ttm_helper ttm video iommu_v2 drm_buddy gpu_sched drm_display_helper drm_kms_helper syscopyarea
+[ 127.418276] sysfillrect sysimgblt fb_sys_fops drm nvme nvme_core cec r8169 ahci crc32_pclmul rc_core i2c_piix4 xhci_pci libahci nvme_common xhci_pci_renesas realtek wmi
+[ 127.418284] CPU: 16 PID: 260 Comm: kworker/16:1 Tainted: G W 6.0.0 #4
+[ 127.418286] Hardware name: System manufacturer System Product Name/TUF GAMING X570-PLUS (WI-FI), BIOS 3603 03/20/2021
+[ 127.418287] Workqueue: events amdgpu_amdkfd_restore_userptr_worker [amdgpu]
+[ 127.418455] RIP: 0010:amdgpu_amdkfd_restore_userptr_worker+0x4d9/0x500 [amdgpu]
+[ 127.418601] Code: ff e8 2b 8a 96 d1 e9 66 fe ff ff 48 c7 c7 40 4f f5 c0 e8 56 7b 8a d1 0f 0b e9 2e ff ff ff 48 c7 c7 d8 d0 ed c0 e8 43 7b 8a d1 <0f> 0b e9 0a fe ff ff 4c 89 ef e8 f8 89 96 d1 e9 cb fd ff ff e8 ce
+[ 127.418603] RSP: 0018:ffffb36740a83dc8 EFLAGS: 00010282
+[ 127.418604] RAX: 0000000000000000 RBX: ffff9d159ee9df30 RCX: 0000000000000027
+[ 127.418605] RDX: 0000000000000027 RSI: ffffb36740a83c88 RDI: ffff9d242a220568
+[ 127.418606] RBP: ffffb36740a83e58 R08: ffff9d242a220560 R09: 0000000000000001
+[ 127.418607] R10: 0000000000000001 R11: 0000000000000020 R12: ffff9d159ee9df98
+[ 127.418607] R13: ffff9d159ee9df70 R14: ffff9d159ee9dee0 R15: ffff9d159ee9dee0
+[ 127.418608] FS: 0000000000000000(0000) GS:ffff9d242a200000(0000) knlGS:0000000000000000
+[ 127.418609] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 127.418610] CR2: 00007fd5d4715000 CR3: 0000000120ffe000 CR4: 0000000000750ee0
+[ 127.418611] PKRU: 55555554
+[ 127.418611] Call Trace:
+[ 127.418612]
+[ 127.418613] process_one_work+0x21f/0x3f0
+[ 127.418615] worker_thread+0x4a/0x3c0
+[ 127.418617] ? process_one_work+0x3f0/0x3f0
+[ 127.418618] kthread+0xf0/0x120
+[ 127.418619] ? kthread_complete_and_exit+0x20/0x20
+[ 127.418620] ret_from_fork+0x22/0x30
+[ 127.418622]
+[ 127.418623] ---[ end trace 0000000000000000 ]---
\ No newline at end of file
diff --git a/extra/rocm/rdna3/asm.py b/extra/rocm/rdna3/asm.py
new file mode 100644
index 0000000000..aa58c87a48
--- /dev/null
+++ b/extra/rocm/rdna3/asm.py
@@ -0,0 +1,57 @@
+import numpy as np
+import pathlib
+from hexdump import hexdump
+from tinygrad.helpers import colored
+from extra.helpers import enable_early_exec
+early_exec = enable_early_exec()
+
+from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer, ROCM_LLVM_PATH
+
+ENABLE_NON_ASM = False
+
+if ENABLE_NON_ASM:
+ buf = CLBuffer.fromCPU(np.zeros(10, np.float32))
+ prg_empty = CLProgram("code", "__kernel void code(__global float *a) { a[0] = 1; }")
+ asm_real = prg_empty.binary()
+ with open("/tmp/cc.elf", "wb") as f:
+ f.write(asm_real)
+ prg_empty([1], [1], buf, wait=True)
+ print(buf.toCPU())
+
+print(colored("creating CLBuffer", "green"))
+buf = CLBuffer.fromCPU(np.zeros(10, np.float32))
+code = open(pathlib.Path(__file__).parent / "prog.s", "r").read()
+
+gen = []
+FLOPS = 0
+for j in range(4):
+ for i in range(0, 251, 6):
+ #gen.append(f"v_dual_fmac_f32 v{i+0}, v{i+1}, v{i+2} :: v_dual_fmac_f32 v{i+3}, v{i+4}, v{i+5}")
+ #FLOPS += 4
+ gen.append(f"v_dual_dot2acc_f32_f16 v{i+0}, v{i+1}, v{i+2} :: v_dual_dot2acc_f32_f16 v{i+3}, v{i+4}, v{i+5}")
+ FLOPS += 8
+code = code.replace("// FLOPS", '\n'.join(gen))
+print(code)
+
+
+# fix: COMGR failed to get code object ISA name. set triple to 'amdgcn-amd-amdhsa'
+
+object = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8")))
+asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], object))
+
+with open("/tmp/cc2.o", "wb") as f:
+ f.write(object)
+with open("/tmp/cc2.elf", "wb") as f:
+ f.write(asm)
+
+print(colored("creating CLProgram", "green"))
+prg = CLProgram("code", asm, binary=True)
+
+print(colored("running program", "green"))
+FLOPS *= 100000*1024*1024 # loop * global_size
+for i in range(3):
+ tm = prg([1024, 1024], [256, 1], buf, wait=True)
+ print(f"ran in {tm*1e3:.2f} ms, {FLOPS/(tm*1e9):.2f} GFLOPS")
+
+print(colored("transferring buffer", "green"))
+print(buf.toCPU())
diff --git a/extra/rocm/rdna3/prog.s b/extra/rocm/rdna3/prog.s
new file mode 100644
index 0000000000..38efca5407
--- /dev/null
+++ b/extra/rocm/rdna3/prog.s
@@ -0,0 +1,80 @@
+.global _start
+_start:
+.rodata
+.align 0x10
+.global code.kd
+.type code.kd,STT_OBJECT
+# amd_kernel_code_t (must be at 0x440 for kernel_code_entry_byte_offset to be right)
+code.kd:
+# amd_kernel_..., amd_machine_...
+.long 0,0,0,0
+# kernel_code_entry_byte_offset, kernel_code_prefetch_byte_offset
+.long 0x00000bc0,0x00000000,0x00000000,0x00000000
+# kernel_code_prefetch_byte_size, max_scratch_backing_memory_byte_size
+.long 0,0,0,0
+# compute_pgm_rsrc1, compute_pgm_rsrc2, kernel_code_properties, workitem_private_segment_byte_size
+.long 0x60af0000,0x0000009e,0x00000408,0x00000000
+# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32 | AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64
+# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP | AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE
+# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT = 0xF
+# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X
+# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 1
+# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 1
+.text
+.global code
+.type code,STT_FUNC
+code:
+# https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state
+# s[0:1] contains the kernarg_address
+# TODO: can we use s[2:3] if this was really a wave since we only alloced 2 SGPRs?
+s_load_b64 s[2:3], s[0:1], null
+
+s_mov_b32 s8, 0
+loop:
+s_addk_i32 s8, 1
+s_cmp_eq_u32 s8, 100000
+// FLOPS
+s_cbranch_scc0 loop
+
+# wait for the s_load_b64
+s_waitcnt lgkmcnt(0)
+
+v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 2.0
+global_store_b32 v0, v1, s[2:3]
+
+# Deallocate all VGPRs for this wave. Use only when next instruction is S_ENDPGM.
+s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+s_endpgm
+s_code_end
+
+.amdgpu_metadata
+amdhsa.kernels:
+ - .args:
+ - .address_space: global
+ .name: a
+ .offset: 0
+ .size: 8
+ .type_name: 'float*'
+ .value_kind: global_buffer
+ .group_segment_fixed_size: 0
+ .kernarg_segment_align: 8
+ .kernarg_segment_size: 8
+ .language: OpenCL C
+ .language_version:
+ - 1
+ - 2
+ .max_flat_workgroup_size: 256
+ .name: code
+ .private_segment_fixed_size: 0
+ .sgpr_count: 2
+ .sgpr_spill_count: 0
+ .symbol: code.kd
+ .uses_dynamic_stack: false
+ .vgpr_count: 256
+ .vgpr_spill_count: 0
+ .wavefront_size: 32
+amdhsa.target: amdgcn-amd-amdhsa--gfx1100
+amdhsa.version:
+ - 1
+ - 2
+.end_amdgpu_metadata
diff --git a/extra/rocm/rocm_setup.sh b/extra/rocm/rocm_setup.sh
index f4eb7a4461..880105cb8c 100755
--- a/extra/rocm/rocm_setup.sh
+++ b/extra/rocm/rocm_setup.sh
@@ -1,11 +1,14 @@
#!/bin/bash
-wget https://repo.radeon.com/amdgpu-install/5.5/ubuntu/jammy/amdgpu-install_5.5.50500-1_all.deb
+rm amdgpu-install_5.5.50500-1_all.deb
+wget https://repo.radeon.com/amdgpu-install/5.5/ubuntu/$(lsb_release -cs)/amdgpu-install_5.5.50500-1_all.deb
sudo dpkg -i amdgpu-install_5.5.50500-1_all.deb
sudo apt-get update
+# kernel driver
+sudo apt-get install amdgpu-dkms
+
# for opencl
sudo apt-get install rocm-opencl-runtime
# for HIP
-sudo apt-get install hip-runtime-amd rocm-device-libs
-
+sudo apt-get install hip-runtime-amd rocm-device-libs hip-dev
diff --git a/extra/rocm/sniffer/build.sh b/extra/rocm/sniffer/build.sh
index 859102858a..86fcc05fa2 100755
--- a/extra/rocm/sniffer/build.sh
+++ b/extra/rocm/sniffer/build.sh
@@ -1,7 +1,11 @@
#!/bin/bash -e
-clang sniff.cc -Werror -shared -fPIC -I../src/ROCT-Thunk-Interface/include -I../src/ROCm-Device-Libs/ockl/inc -o sniff.so -lstdc++
+clang sniff.cc -Werror -shared -fPIC -I../src/ -I../src/ROCT-Thunk-Interface/include -I../src/ROCm-Device-Libs/ockl/inc -o sniff.so -lstdc++
#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so /home/tiny/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld
-AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so /home/tiny/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld
+#AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so $HOME/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld
+#AMD_LOG_LEVEL=5 LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py
+DEBUG=5 LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py
+#AMD_LOG_LEVEL=5 HSAKMT_DEBUG_LEVEL=7 DEBUG=5 LD_PRELOAD=$PWD/sniff.so strace -F python3 ../rdna3/asm.py
+#LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py
#AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so FORWARD_ONLY=1 DEBUG=2 python3 ../../../test/test_ops.py TestOps.test_add
#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so rocm-bandwidth-test -s 0 -d 1 -m 1
#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so rocm-bandwidth-test -s 1 -d 2 -m 1
diff --git a/extra/rocm/sniffer/sniff.cc b/extra/rocm/sniffer/sniff.cc
index 392932956d..9527059190 100644
--- a/extra/rocm/sniffer/sniff.cc
+++ b/extra/rocm/sniffer/sniff.cc
@@ -13,6 +13,8 @@
#include
#include
#include
+#include
+using namespace rocr::AMD;
#include
#include