From 90fff82c8ab5902fccaf87876db1651620b35388 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Tue, 16 May 2023 05:33:57 -0700 Subject: [PATCH] Rdna (#776) * assembler maybe * custom asm * rdna3 on quiet * trigger crashes * fixed notes * non-fatal rdna2 crash * Crash4 * improve rdna sniffer * comments * improve sniffer * asm * 131 TFLOPS RDNA3 * opt simple matmul * todos --- extra/gemm/metal_conv.py | 32 +++++++------ extra/gemm/simple_matmul.py | 5 ++ extra/rocm/kernel_crashes/dump | 41 +++++++++++++++++ extra/rocm/kernel_crashes/dump2 | 41 +++++++++++++++++ extra/rocm/kernel_crashes/dump3 | 20 ++++++++ extra/rocm/kernel_crashes/dump4 | 33 +++++++++++++ extra/rocm/rdna3/asm.py | 57 +++++++++++++++++++++++ extra/rocm/rdna3/prog.s | 80 ++++++++++++++++++++++++++++++++ extra/rocm/rocm_setup.sh | 9 ++-- extra/rocm/sniffer/build.sh | 8 +++- extra/rocm/sniffer/sniff.cc | 82 ++++++++++++++++++++++++++++----- tinygrad/codegen/cstyle.py | 2 +- tinygrad/codegen/linearizer.py | 3 +- tinygrad/runtime/ops_gpu.py | 11 +++-- 14 files changed, 389 insertions(+), 35 deletions(-) create mode 100644 extra/gemm/simple_matmul.py create mode 100644 extra/rocm/kernel_crashes/dump create mode 100644 extra/rocm/kernel_crashes/dump2 create mode 100644 extra/rocm/kernel_crashes/dump3 create mode 100644 extra/rocm/kernel_crashes/dump4 create mode 100644 extra/rocm/rdna3/asm.py create mode 100644 extra/rocm/rdna3/prog.s diff --git a/extra/gemm/metal_conv.py b/extra/gemm/metal_conv.py index 0750650325..694b235864 100644 --- a/extra/gemm/metal_conv.py +++ b/extra/gemm/metal_conv.py @@ -1,5 +1,5 @@ import os -os.environ["METAL"] = "1" +#os.environ["METAL"] = "1" import numpy as np BS = 64 @@ -7,6 +7,7 @@ CIN = 256 COUT = 256 HW = 32 K = 3 +PADDING = 0 # TODO: this is doing some trick, since with CIN=256 COUT=256 it's over 10.4 TFLOPS. # are winograd convs less flops? it appears so if they are batched # https://www.cse.ust.hk/~weiwa/papers/yan-ppopp20.pdf @@ -15,31 +16,34 @@ FLOPS = BS*K*K*CIN*HW*HW*COUT*2 nb = np.random.default_rng().standard_normal(size=(BS,CIN,HW,HW), dtype=np.float32) nc = np.random.default_rng().standard_normal(size=(COUT,CIN,K,K), dtype=np.float32) -import time, torch, torch.mps -b = torch.from_numpy(nb).to('mps') -c = torch.from_numpy(nc).to('mps') +try: + import time, torch, torch.mps + b = torch.from_numpy(nb).to('mps') + c = torch.from_numpy(nc).to('mps') -def torch_prog(b, c): - st = time.perf_counter() - a = torch.nn.functional.conv2d(b, c, padding=1) - torch.mps.synchronize() - return time.perf_counter() - st -tm = min([torch_prog(b, c) for _ in range(20)]) -print(f"{tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS conv in torch") + def torch_prog(b, c): + st = time.perf_counter() + a = torch.nn.functional.conv2d(b, c, padding=PADDING) + torch.mps.synchronize() + return time.perf_counter() - st + tm = min([torch_prog(b, c) for _ in range(20)]) + print(f"{tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS conv in torch") +except RuntimeError: + print("no torch metal conv") from tinygrad.tensor import Tensor from tinygrad.jit import TinyJit -from tinygrad.runtime.ops_metal import METAL +from tinygrad.lazy import Device b = Tensor(nb) c = Tensor(nc) # TODO: slowness without the JIT I suspect comes from a lack of a caching allocator @TinyJit def tiny_jit(b, c): - return b.conv2d(c, padding=1).realize() + return b.conv2d(c, padding=PADDING).realize() def tiny_prog(b, c): st = time.perf_counter() a = tiny_jit(b, c) - METAL.synchronize() + Device[a.device].synchronize() return time.perf_counter() - st tm = min([tiny_prog(b, c) for _ in range(5)]) print(f"{tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS conv in tinygrad") diff --git a/extra/gemm/simple_matmul.py b/extra/gemm/simple_matmul.py new file mode 100644 index 0000000000..998c37aa9d --- /dev/null +++ b/extra/gemm/simple_matmul.py @@ -0,0 +1,5 @@ +from tinygrad.tensor import Tensor +N = 1024 +a, b = Tensor.randn(N, N), Tensor.randn(N, N) +c = (a.reshape(N, 1, N) * b.permute(1,0).reshape(1, N, N)).sum(axis=2) +print((c.numpy() - (a.numpy() @ b.numpy())).mean()) diff --git a/extra/rocm/kernel_crashes/dump b/extra/rocm/kernel_crashes/dump new file mode 100644 index 0000000000..cc09d4decc --- /dev/null +++ b/extra/rocm/kernel_crashes/dump @@ -0,0 +1,41 @@ +# run two "rocm-bandwidth-test" in a loop +# amdgpu-6.0.5-1581431.20.04 +# fixed in kernel 6.2.14 + +[ 72.153646] RIP: 0010:pm_send_runlist+0x4a/0x630 [amdgpu] +[ 72.153815] Code: 30 65 48 8b 04 25 28 00 00 00 48 89 45 d0 31 c0 80 fb 01 0f 87 aa 9d 49 00 83 e3 01 0f 85 1c 05 00 00 49 8b 3f b8 01 00 00 00 <48> 8b 97 30 01 00 00 44 8b b7 6c 01 00 00 8b 9f 70 01 00 00 8b 8a +[ 72.153900] RSP: 0018:ffffb48445c03c30 EFLAGS: 00010246 +[ 72.153928] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000 +[ 72.153962] RDX: 000000000000007b RSI: ffff9395e1562558 RDI: 0000000000000000 +[ 72.153996] RBP: ffffb48445c03cb8 R08: 0000000000000000 R09: 0000000000000001 +[ 72.154030] R10: ffff9395c900d840 R11: 0000000000000000 R12: 0000000000000000 +[ 72.154065] R13: ffff9395c9e00400 R14: 0000000000000001 R15: ffff9395e15624e0 +[ 72.154099] FS: 00007f345c6463c0(0000) GS:ffff93a4aee80000(0000) knlGS:0000000000000000 +[ 72.154137] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 72.154165] CR2: 0000000000000130 CR3: 0000000112840000 CR4: 0000000000750ee0 +[ 72.154201] PKRU: 55555554 +[ 72.154215] Call Trace: +[ 72.154230] +[ 72.154244] map_queues_cpsch+0x75/0xc0 [amdgpu] +[ 72.154365] debug_map_and_unlock+0x51/0x90 [amdgpu] +[ 72.154480] debug_refresh_runlist+0x1f/0x30 [amdgpu] +[ 72.154591] kfd_dbg_runtime_disable+0x13c/0x240 [amdgpu] +[ 72.154705] kfd_ioctl_dbg_set_debug_trap+0x69d/0x8b0 [amdgpu] +[ 72.154820] kfd_ioctl+0x24a/0x5b0 [amdgpu] +[ 72.154925] ? kfd_ioctl_create_queue+0x770/0x770 [amdgpu] +[ 72.155035] ? syscall_exit_to_user_mode+0x27/0x50 +[ 72.155061] ? exit_to_user_mode_prepare+0x3d/0x1c0 +[ 72.155088] __x64_sys_ioctl+0x95/0xd0 +[ 72.155109] do_syscall_64+0x5c/0xc0 +[ 72.155128] ? syscall_exit_to_user_mode+0x27/0x50 +[ 72.155151] ? do_syscall_64+0x69/0xc0 +[ 72.155172] entry_SYSCALL_64_after_hwframe+0x61/0xcb +[ 72.155198] RIP: 0033:0x7f345c7f63ab +[ 72.155218] Code: 0f 1e fa 48 8b 05 e5 7a 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b5 7a 0d 00 f7 d8 64 89 01 48 +[ 72.155301] RSP: 002b:00007ffc97cc89f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 +[ 72.155339] RAX: ffffffffffffffda RBX: 00007ffc97cc8a30 RCX: 00007f345c7f63ab +[ 72.155375] RDX: 00007ffc97cc8a30 RSI: 00000000c0284b82 RDI: 0000000000000003 +[ 72.155411] RBP: 00000000c0284b82 R08: 0000000000000000 R09: 0000000000000000 +[ 72.155447] R10: 00007f345cd4ddb0 R11: 0000000000000246 R12: 00007ffc97cc8a30 +[ 72.155481] R13: 0000000000000003 R14: 00007ffc97cc8d20 R15: 0000000000000000 +[ 72.155517] diff --git a/extra/rocm/kernel_crashes/dump2 b/extra/rocm/kernel_crashes/dump2 new file mode 100644 index 0000000000..8cd24ccfce --- /dev/null +++ b/extra/rocm/kernel_crashes/dump2 @@ -0,0 +1,41 @@ +# run two tinygrad matrix example in a loop +# amdgpu-6.0.5-1581431.20.04 +# NOT fixed in kernel 6.2.14 + +[ 553.016624] gmc_v11_0_process_interrupt: 30 callbacks suppressed +[ 553.016631] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:9 pasid:32770, for process python3 pid 10001 thread python3 pid 10001) +[ 553.016790] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f0000000000 from client 10 +[ 553.016892] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00901A30 +[ 553.016974] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd) +[ 553.017051] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x0 +[ 553.017111] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0 +[ 553.017173] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3 +[ 553.017238] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0 +[ 553.017300] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0 +[ 553.123921] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=2 +[ 553.124153] amdgpu: failed to add hardware queue to MES, doorbell=0x1a16 +[ 553.124195] amdgpu: MES might be in unrecoverable state, issue a GPU reset +[ 553.124237] amdgpu: Failed to restore queue 2 +[ 553.124266] amdgpu: Failed to restore process queues +[ 553.124270] amdgpu: Failed to evict queue 3 +[ 553.124297] amdgpu: amdgpu_amdkfd_restore_userptr_worker: Failed to resume KFD + +# alternative crash in kernel 6.2.14 + +[ 151.097948] gmc_v11_0_process_interrupt: 30 callbacks suppressed +[ 151.097953] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:8 pasid:32771, for process python3 pid 7525 thread python3 pid 7525) +[ 151.097993] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f0000000000 from client 10 +[ 151.098008] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00801A30 +[ 151.098020] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd) +[ 151.098032] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x0 +[ 151.098042] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0 +[ 151.098052] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3 +[ 151.098062] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0 +[ 151.098071] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0 +[ 151.209517] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=2 +[ 151.209724] amdgpu: failed to add hardware queue to MES, doorbell=0x1002 +[ 151.209734] amdgpu: MES might be in unrecoverable state, issue a GPU reset +[ 151.209743] amdgpu: Failed to restore queue 1 +[ 151.209751] amdgpu: Failed to restore process queues +[ 151.209759] amdgpu: amdgpu_amdkfd_restore_userptr_worker: Failed to resume KFD +[ 151.209858] amdgpu 0000:0b:00.0: amdgpu: GPU reset begin! diff --git a/extra/rocm/kernel_crashes/dump3 b/extra/rocm/kernel_crashes/dump3 new file mode 100644 index 0000000000..ea7ec1d76b --- /dev/null +++ b/extra/rocm/kernel_crashes/dump3 @@ -0,0 +1,20 @@ +# two tinygrad + two bandwidth test +# RDNA2, driver 6.0.5 +# recovered from this! + +[ 136.971209] gmc_v10_0_process_interrupt: 39 callbacks suppressed +[ 136.971218] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:11 pasid:32773, for process rocm-bandwidth- pid 20281 thread rocm-bandwidth- pid 20281) +[ 136.971228] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f5c2b800000 from client 0x1b (UTCL2) +[ 136.971232] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00B01A31 +[ 136.971233] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd) +[ 136.971235] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x1 +[ 136.971236] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0 +[ 136.971236] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3 +[ 136.971237] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0 +[ 136.971238] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0 +... +[ 136.993979] amdgpu 0000:0b:00.0: amdgpu: IH ring buffer overflow (0x000BE5A0, 0x0003C480, 0x0003E5C0) +[ 138.209072] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00004000 flags=0x0000] +[ 138.209078] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00004d80 flags=0x0000] +[ 138.209081] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00005000 flags=0x0000] +[ 138.209084] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00005d80 flags=0x0000] diff --git a/extra/rocm/kernel_crashes/dump4 b/extra/rocm/kernel_crashes/dump4 new file mode 100644 index 0000000000..8c1fa909ac --- /dev/null +++ b/extra/rocm/kernel_crashes/dump4 @@ -0,0 +1,33 @@ +# ROCK-Kernel-Driver 0b579de9622f5c93021dcb7927d13926313740a2 +# non fatal "crash" + +[ 127.418045] ------------[ cut here ]------------ +[ 127.418046] User pages unexpectedly invalid +[ 127.418056] WARNING: CPU: 16 PID: 260 at drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:3000 amdgpu_amdkfd_restore_userptr_worker+0x4d9/0x500 [amdgpu] +[ 127.418235] Modules linked in: rfcomm cmac algif_hash algif_skcipher af_alg bnep nls_iso8859_1 iwlmvm mac80211 intel_rapl_msr intel_rapl_common edac_mce_amd snd_hda_codec_realtek snd_hda_codec_generic snd_hda_codec_hdmi kvm_amd binfmt_misc snd_hda_intel snd_intel_dspcfg kvm libarc4 snd_intel_sdw_acpi snd_hda_codec btusb iwlwifi btrtl snd_hda_core btbcm btintel irqbypass btmtk snd_hwdep crct10dif_pclmul snd_pcm polyval_clmulni bluetooth snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq polyval_generic cfg80211 ghash_clmulni_intel eeepc_wmi snd_seq_device snd_timer aesni_intel asus_wmi ecdh_generic snd platform_profile crypto_simd ledtrig_audio cryptd ecc ccp soundcore sparse_keymap rapl k10temp wmi_bmof mac_hid sch_fq_codel msr parport_pc ppdev lp parport ramoops pstore_blk efi_pstore reed_solomon pstore_zone ip_tables x_tables autofs4 amdgpu hid_generic usbhid hid i2c_algo_bit drm_ttm_helper ttm video iommu_v2 drm_buddy gpu_sched drm_display_helper drm_kms_helper syscopyarea +[ 127.418276] sysfillrect sysimgblt fb_sys_fops drm nvme nvme_core cec r8169 ahci crc32_pclmul rc_core i2c_piix4 xhci_pci libahci nvme_common xhci_pci_renesas realtek wmi +[ 127.418284] CPU: 16 PID: 260 Comm: kworker/16:1 Tainted: G W 6.0.0 #4 +[ 127.418286] Hardware name: System manufacturer System Product Name/TUF GAMING X570-PLUS (WI-FI), BIOS 3603 03/20/2021 +[ 127.418287] Workqueue: events amdgpu_amdkfd_restore_userptr_worker [amdgpu] +[ 127.418455] RIP: 0010:amdgpu_amdkfd_restore_userptr_worker+0x4d9/0x500 [amdgpu] +[ 127.418601] Code: ff e8 2b 8a 96 d1 e9 66 fe ff ff 48 c7 c7 40 4f f5 c0 e8 56 7b 8a d1 0f 0b e9 2e ff ff ff 48 c7 c7 d8 d0 ed c0 e8 43 7b 8a d1 <0f> 0b e9 0a fe ff ff 4c 89 ef e8 f8 89 96 d1 e9 cb fd ff ff e8 ce +[ 127.418603] RSP: 0018:ffffb36740a83dc8 EFLAGS: 00010282 +[ 127.418604] RAX: 0000000000000000 RBX: ffff9d159ee9df30 RCX: 0000000000000027 +[ 127.418605] RDX: 0000000000000027 RSI: ffffb36740a83c88 RDI: ffff9d242a220568 +[ 127.418606] RBP: ffffb36740a83e58 R08: ffff9d242a220560 R09: 0000000000000001 +[ 127.418607] R10: 0000000000000001 R11: 0000000000000020 R12: ffff9d159ee9df98 +[ 127.418607] R13: ffff9d159ee9df70 R14: ffff9d159ee9dee0 R15: ffff9d159ee9dee0 +[ 127.418608] FS: 0000000000000000(0000) GS:ffff9d242a200000(0000) knlGS:0000000000000000 +[ 127.418609] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 127.418610] CR2: 00007fd5d4715000 CR3: 0000000120ffe000 CR4: 0000000000750ee0 +[ 127.418611] PKRU: 55555554 +[ 127.418611] Call Trace: +[ 127.418612] +[ 127.418613] process_one_work+0x21f/0x3f0 +[ 127.418615] worker_thread+0x4a/0x3c0 +[ 127.418617] ? process_one_work+0x3f0/0x3f0 +[ 127.418618] kthread+0xf0/0x120 +[ 127.418619] ? kthread_complete_and_exit+0x20/0x20 +[ 127.418620] ret_from_fork+0x22/0x30 +[ 127.418622] +[ 127.418623] ---[ end trace 0000000000000000 ]--- \ No newline at end of file diff --git a/extra/rocm/rdna3/asm.py b/extra/rocm/rdna3/asm.py new file mode 100644 index 0000000000..aa58c87a48 --- /dev/null +++ b/extra/rocm/rdna3/asm.py @@ -0,0 +1,57 @@ +import numpy as np +import pathlib +from hexdump import hexdump +from tinygrad.helpers import colored +from extra.helpers import enable_early_exec +early_exec = enable_early_exec() + +from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer, ROCM_LLVM_PATH + +ENABLE_NON_ASM = False + +if ENABLE_NON_ASM: + buf = CLBuffer.fromCPU(np.zeros(10, np.float32)) + prg_empty = CLProgram("code", "__kernel void code(__global float *a) { a[0] = 1; }") + asm_real = prg_empty.binary() + with open("/tmp/cc.elf", "wb") as f: + f.write(asm_real) + prg_empty([1], [1], buf, wait=True) + print(buf.toCPU()) + +print(colored("creating CLBuffer", "green")) +buf = CLBuffer.fromCPU(np.zeros(10, np.float32)) +code = open(pathlib.Path(__file__).parent / "prog.s", "r").read() + +gen = [] +FLOPS = 0 +for j in range(4): + for i in range(0, 251, 6): + #gen.append(f"v_dual_fmac_f32 v{i+0}, v{i+1}, v{i+2} :: v_dual_fmac_f32 v{i+3}, v{i+4}, v{i+5}") + #FLOPS += 4 + gen.append(f"v_dual_dot2acc_f32_f16 v{i+0}, v{i+1}, v{i+2} :: v_dual_dot2acc_f32_f16 v{i+3}, v{i+4}, v{i+5}") + FLOPS += 8 +code = code.replace("// FLOPS", '\n'.join(gen)) +print(code) + + +# fix: COMGR failed to get code object ISA name. set triple to 'amdgcn-amd-amdhsa' + +object = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8"))) +asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], object)) + +with open("/tmp/cc2.o", "wb") as f: + f.write(object) +with open("/tmp/cc2.elf", "wb") as f: + f.write(asm) + +print(colored("creating CLProgram", "green")) +prg = CLProgram("code", asm, binary=True) + +print(colored("running program", "green")) +FLOPS *= 100000*1024*1024 # loop * global_size +for i in range(3): + tm = prg([1024, 1024], [256, 1], buf, wait=True) + print(f"ran in {tm*1e3:.2f} ms, {FLOPS/(tm*1e9):.2f} GFLOPS") + +print(colored("transferring buffer", "green")) +print(buf.toCPU()) diff --git a/extra/rocm/rdna3/prog.s b/extra/rocm/rdna3/prog.s new file mode 100644 index 0000000000..38efca5407 --- /dev/null +++ b/extra/rocm/rdna3/prog.s @@ -0,0 +1,80 @@ +.global _start +_start: +.rodata +.align 0x10 +.global code.kd +.type code.kd,STT_OBJECT +# amd_kernel_code_t (must be at 0x440 for kernel_code_entry_byte_offset to be right) +code.kd: +# amd_kernel_..., amd_machine_... +.long 0,0,0,0 +# kernel_code_entry_byte_offset, kernel_code_prefetch_byte_offset +.long 0x00000bc0,0x00000000,0x00000000,0x00000000 +# kernel_code_prefetch_byte_size, max_scratch_backing_memory_byte_size +.long 0,0,0,0 +# compute_pgm_rsrc1, compute_pgm_rsrc2, kernel_code_properties, workitem_private_segment_byte_size +.long 0x60af0000,0x0000009e,0x00000408,0x00000000 +# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32 | AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64 +# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP | AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE +# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT = 0xF +# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X +# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 1 +# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 1 +.text +.global code +.type code,STT_FUNC +code: +# https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state +# s[0:1] contains the kernarg_address +# TODO: can we use s[2:3] if this was really a wave since we only alloced 2 SGPRs? +s_load_b64 s[2:3], s[0:1], null + +s_mov_b32 s8, 0 +loop: +s_addk_i32 s8, 1 +s_cmp_eq_u32 s8, 100000 +// FLOPS +s_cbranch_scc0 loop + +# wait for the s_load_b64 +s_waitcnt lgkmcnt(0) + +v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 2.0 +global_store_b32 v0, v1, s[2:3] + +# Deallocate all VGPRs for this wave. Use only when next instruction is S_ENDPGM. +s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +s_endpgm +s_code_end + +.amdgpu_metadata +amdhsa.kernels: + - .args: + - .address_space: global + .name: a + .offset: 0 + .size: 8 + .type_name: 'float*' + .value_kind: global_buffer + .group_segment_fixed_size: 0 + .kernarg_segment_align: 8 + .kernarg_segment_size: 8 + .language: OpenCL C + .language_version: + - 1 + - 2 + .max_flat_workgroup_size: 256 + .name: code + .private_segment_fixed_size: 0 + .sgpr_count: 2 + .sgpr_spill_count: 0 + .symbol: code.kd + .uses_dynamic_stack: false + .vgpr_count: 256 + .vgpr_spill_count: 0 + .wavefront_size: 32 +amdhsa.target: amdgcn-amd-amdhsa--gfx1100 +amdhsa.version: + - 1 + - 2 +.end_amdgpu_metadata diff --git a/extra/rocm/rocm_setup.sh b/extra/rocm/rocm_setup.sh index f4eb7a4461..880105cb8c 100755 --- a/extra/rocm/rocm_setup.sh +++ b/extra/rocm/rocm_setup.sh @@ -1,11 +1,14 @@ #!/bin/bash -wget https://repo.radeon.com/amdgpu-install/5.5/ubuntu/jammy/amdgpu-install_5.5.50500-1_all.deb +rm amdgpu-install_5.5.50500-1_all.deb +wget https://repo.radeon.com/amdgpu-install/5.5/ubuntu/$(lsb_release -cs)/amdgpu-install_5.5.50500-1_all.deb sudo dpkg -i amdgpu-install_5.5.50500-1_all.deb sudo apt-get update +# kernel driver +sudo apt-get install amdgpu-dkms + # for opencl sudo apt-get install rocm-opencl-runtime # for HIP -sudo apt-get install hip-runtime-amd rocm-device-libs - +sudo apt-get install hip-runtime-amd rocm-device-libs hip-dev diff --git a/extra/rocm/sniffer/build.sh b/extra/rocm/sniffer/build.sh index 859102858a..86fcc05fa2 100755 --- a/extra/rocm/sniffer/build.sh +++ b/extra/rocm/sniffer/build.sh @@ -1,7 +1,11 @@ #!/bin/bash -e -clang sniff.cc -Werror -shared -fPIC -I../src/ROCT-Thunk-Interface/include -I../src/ROCm-Device-Libs/ockl/inc -o sniff.so -lstdc++ +clang sniff.cc -Werror -shared -fPIC -I../src/ -I../src/ROCT-Thunk-Interface/include -I../src/ROCm-Device-Libs/ockl/inc -o sniff.so -lstdc++ #AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so /home/tiny/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld -AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so /home/tiny/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld +#AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so $HOME/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld +#AMD_LOG_LEVEL=5 LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py +DEBUG=5 LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py +#AMD_LOG_LEVEL=5 HSAKMT_DEBUG_LEVEL=7 DEBUG=5 LD_PRELOAD=$PWD/sniff.so strace -F python3 ../rdna3/asm.py +#LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py #AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so FORWARD_ONLY=1 DEBUG=2 python3 ../../../test/test_ops.py TestOps.test_add #AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so rocm-bandwidth-test -s 0 -d 1 -m 1 #AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so rocm-bandwidth-test -s 1 -d 2 -m 1 diff --git a/extra/rocm/sniffer/sniff.cc b/extra/rocm/sniffer/sniff.cc index 392932956d..9527059190 100644 --- a/extra/rocm/sniffer/sniff.cc +++ b/extra/rocm/sniffer/sniff.cc @@ -13,6 +13,8 @@ #include #include #include +#include +using namespace rocr::AMD; #include #include @@ -22,7 +24,7 @@ std::map ring_base_addresses; #define D(args...) fprintf(stderr, args) uint64_t doorbell_offset = -1; -int queue_type = 0; +std::map queue_types; void hexdump(void *d, int l) { for (int i = 0; i < l; i++) { @@ -60,13 +62,43 @@ static void handler(int sig, siginfo_t *si, void *unused) { } uint64_t ring_base_address = ring_base_addresses[((uint64_t)si->si_addr)&0xFFF]; - D("%16p: DING DONG store(%d): 0x%8lx -> %p ring_base_address:0x%lx\n", rip, store_size, value, si->si_addr, ring_base_address); + int queue_type = queue_types[((uint64_t)si->si_addr)&0xFFF]; + D("%16p: \u001b[31mDING DONG\u001b[0m (queue_type %d) store(%d): 0x%8lx -> %p ring_base_address:0x%lx\n", rip, queue_type, store_size, value, si->si_addr, ring_base_address); if (queue_type == KFD_IOC_QUEUE_TYPE_SDMA) { - hexdump((void*)(ring_base_address), 0x100); - } else if (queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) { - hexdump((void*)(ring_base_address+value*0x40), 0x40); + uint8_t *sdma_ptr = (uint8_t*)(ring_base_address); + while (sdma_ptr < ((uint8_t*)(ring_base_address)+value)) { + D("0x%3lx: ", sdma_ptr-(uint8_t*)(ring_base_address)); + if (sdma_ptr[0] == SDMA_OP_TIMESTAMP) { + D("SDMA_PKT_TIMESTAMP\n"); + sdma_ptr += sizeof(SDMA_PKT_TIMESTAMP); + } else if (sdma_ptr[0] == SDMA_OP_GCR) { + D("SDMA_PKT_GCR\n"); + sdma_ptr += sizeof(SDMA_PKT_GCR); + } else if (sdma_ptr[0] == SDMA_OP_ATOMIC) { + D("SDMA_PKT_ATOMIC\n"); + sdma_ptr += sizeof(SDMA_PKT_ATOMIC); + } else if (sdma_ptr[0] == SDMA_OP_FENCE) { + D("SDMA_PKT_FENCE\n"); + sdma_ptr += sizeof(SDMA_PKT_FENCE); + } else if (sdma_ptr[0] == SDMA_OP_TRAP) { + D("SDMA_PKT_TRAP\n"); + sdma_ptr += sizeof(SDMA_PKT_TRAP); + } else if (sdma_ptr[0] == SDMA_OP_COPY && sdma_ptr[1] == SDMA_SUBOP_COPY_LINEAR) { + SDMA_PKT_COPY_LINEAR *pkt = (SDMA_PKT_COPY_LINEAR *)sdma_ptr; + D("SDMA_PKT_COPY_LINEAR: count:0x%x src:0x%lx dst:0x%lx\n", pkt->COUNT_UNION.count+1, + (uint64_t)pkt->SRC_ADDR_LO_UNION.src_addr_31_0 | ((uint64_t)pkt->SRC_ADDR_HI_UNION.src_addr_63_32 << 32), + (uint64_t)pkt->DST_ADDR_LO_UNION.dst_addr_31_0 | ((uint64_t)pkt->DST_ADDR_HI_UNION.dst_addr_63_32 << 32) + ); + sdma_ptr += sizeof(SDMA_PKT_COPY_LINEAR); + } else { + D("unhandled packet type %d %d, exiting\n", sdma_ptr[0], sdma_ptr[1]); + break; + } + } + //hexdump((void*)(ring_base_address), 0x100); + } else if (queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) { hsa_kernel_dispatch_packet_t *pkt = (hsa_kernel_dispatch_packet_t *)(ring_base_address+value*0x40); if ((pkt->header&0xFF) == HSA_PACKET_TYPE_KERNEL_DISPATCH) { D("HSA_PACKET_TYPE_KERNEL_DISPATCH -- setup:%d workgroup[%d, %d, %d] grid[%d, %d, %d] kernel_object:0x%lx kernarg_address:%p\n", pkt->setup, pkt->workgroup_size_x, pkt->workgroup_size_y, pkt->workgroup_size_z, pkt->grid_size_x, pkt->grid_size_y, pkt->grid_size_z, pkt->kernel_object, pkt->kernarg_address); @@ -80,8 +112,22 @@ static void handler(int sig, siginfo_t *si, void *unused) { fwrite(kernel_code, 4, code_len, f); fclose(f); system("python -c 'print(\" \".join([(\"0x%02X\"%x) for x in open(\"/tmp/kernel_code\", \"rb\").read()]))' | ../build/llvm-project/bin/llvm-mc --disassemble --arch=amdgcn --mcpu=gfx1100 --show-encoding");*/ + D("kernargs (kernarg_segment_byte_size:0x%lx)\n", code->kernarg_segment_byte_size); + // get length + int i; + for (i = 0; i < 0x400; i+=0x10) { + if (memcmp((void*)((uint64_t)pkt->kernarg_address+i), "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 0x10) == 0) break; + } + hexdump((void*)pkt->kernarg_address, i+0x10); } else if ((pkt->header&0xFF) == HSA_PACKET_TYPE_BARRIER_AND) { - D("HSA_PACKET_TYPE_BARRIER_AND\n"); + hsa_barrier_and_packet_t *pkt_and = (hsa_barrier_and_packet_t *)(ring_base_address+value*0x40); + D("HSA_PACKET_TYPE_BARRIER_AND completion_signal:0x%lx\n", pkt_and->completion_signal.handle); + //hexdump((void*)(ring_base_address+value*0x40), 0x40); + } else if ((pkt->header&0xFF) == HSA_PACKET_TYPE_VENDOR_SPECIFIC) { + D("HSA_PACKET_TYPE_VENDOR_SPECIFIC\n"); + hexdump((void*)(ring_base_address+value*0x40), 0x40); + } else { + hexdump((void*)(ring_base_address+value*0x40), 0x40); } } @@ -100,11 +146,19 @@ static void handler(int sig, siginfo_t *si, void *unused) { } void register_sigsegv_handler() { - struct sigaction sa; + struct sigaction sa = {0}; sa.sa_flags = SA_SIGINFO; sigemptyset(&sa.sa_mask); sa.sa_sigaction = handler; - sigaction(SIGSEGV, &sa, NULL); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + D("ERROR: failed to register sigsegv handler"); + exit(-1); + } + // NOTE: python (or ocl runtime?) blocks the SIGSEGV signal + sigset_t x; + sigemptyset(&x); + sigaddset(&x, SIGSEGV); + sigprocmask(SIG_UNBLOCK, &x, NULL); } int (*my_open)(const char *pathname, int flags, mode_t mode); @@ -135,7 +189,7 @@ void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) void *ret = my_mmap(addr, length, prot, flags, fd, offset); if (doorbell_offset != -1 && offset == doorbell_offset) { - D("HIDDEN DOORBELL %p\n", addr); + D("HIDDEN DOORBELL %p, handled by %p\n", addr, handler); register_sigsegv_handler(); mprotect(addr, length, PROT_NONE); } @@ -174,9 +228,15 @@ int ioctl(int filedes, unsigned long request, void *argp) { D("AMDKFD_IOC_MAP_MEMORY_TO_GPU handle:%llX", args->handle); } else if (request == AMDKFD_IOC_CREATE_EVENT) { kfd_ioctl_create_event_args *args = (kfd_ioctl_create_event_args *)argp; - D("AMDKFD_IOC_CREATE_EVENT event_type:%d event_id:%d", args->event_type, args->event_id); + D("AMDKFD_IOC_CREATE_EVENT event_page_offset:0x%llx event_type:%d event_id:%d", args->event_page_offset, args->event_type, args->event_id); } else if (request == AMDKFD_IOC_WAIT_EVENTS) { D("AMDKFD_IOC_WAIT_EVENTS"); + } else if (request == AMDKFD_IOC_SET_XNACK_MODE) { + D("AMDKFD_IOC_SET_XNACK_MODE"); + } else if (request == AMDKFD_IOC_SVM || (type == 0x4b && nr == 0x20)) { + // NOTE: this one is variable length + kfd_ioctl_svm_args *args = (kfd_ioctl_svm_args *)argp; + D("AMDKFD_IOC_SVM start_addr:0x%llx size:0x%llx op:%d", args->start_addr, args->size, args->op); } else if (request == AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU) { kfd_ioctl_unmap_memory_from_gpu_args *args = (kfd_ioctl_unmap_memory_from_gpu_args *)argp; D("AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU handle:%llX", args->handle); @@ -208,8 +268,8 @@ int ioctl(int filedes, unsigned long request, void *argp) { D("RETURNS write_pointer_address:0x%llx read_pointer_address:0x%llx doorbell_offset:0x%llx queue_id:%d\n", args->write_pointer_address, args->read_pointer_address, args->doorbell_offset, args->queue_id); //D("RETURNS *write_pointer_address:0x%llx *read_pointer_address:0x%llx\n", *(uint64_t*)args->write_pointer_address, *(uint64_t*)args->read_pointer_address); ring_base_addresses[args->doorbell_offset&0xFFF] = args->ring_base_address; + queue_types[args->doorbell_offset&0xFFF] = args->queue_type; doorbell_offset = args->doorbell_offset&~0xFFF; - queue_type = args->queue_type; } else { D("type:0x%x nr:0x%x size:0x%x", type, nr, size); } diff --git a/tinygrad/codegen/cstyle.py b/tinygrad/codegen/cstyle.py index 008257c604..d59050a632 100644 --- a/tinygrad/codegen/cstyle.py +++ b/tinygrad/codegen/cstyle.py @@ -197,7 +197,7 @@ class CStyleCodegen(Linearizer): # sometimes, there's more dimensions than len(self.lang.gid). # compact all the dimensions into the first # NOTE: this might make multiview shapetrackers - # TODO: this exposes bugs in the optimizers assuming the strides are on a single vie + # TODO: this exposes bugs in the optimizers assuming the strides are on a single view """ if len(self.lang.gid) and self.first_reduce > len(self.lang.gid): num_to_merge = (self.first_reduce - len(self.lang.gid))+1 diff --git a/tinygrad/codegen/linearizer.py b/tinygrad/codegen/linearizer.py index 252eb5e1da..bb7a32cbd9 100644 --- a/tinygrad/codegen/linearizer.py +++ b/tinygrad/codegen/linearizer.py @@ -295,6 +295,7 @@ class Linearizer: if x.op == ReduceOps.SUM and isinstance(x.src[0], LazyOp) and x.src[0].op == UnaryOps.CAST and isinstance(x.src[0].src[0], LazyOp) and x.src[0].src[0].op == BinaryOps.MUL: x = LazyOp(FusedOps.MULACC, x.src[0].src[0].src, x.arg) values = [self.ast_parse(v, acc, loaded_buffers, ssa) for v in x.src] + # TODO: fold float4 into a single uop when possible. if isinstance(x.op, (ReduceOps, FusedOps)): return [self.uop(UOps.ALU, val[0], list(val), {ReduceOps.SUM:BinaryOps.ADD, ReduceOps.MAX:BinaryOps.MAX, FusedOps.MULACC:FusedOps.MULACC}[x.op]) for val in zip(acc, *values)] else: @@ -465,7 +466,7 @@ class Linearizer: if self.full_unupcasted_shape[-1] <= 16: self.upcast() else: - for splits in [16,8,4]: + for splits in [4]: if self.full_unupcasted_shape[-1]%splits == 0: self.shift_to(len(self.full_unupcasted_shape)-1, splits, insert_before=len(self.full_unupcasted_shape)) self.upcast() diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py index 9cb9aa76b1..c4ac38ece7 100644 --- a/tinygrad/runtime/ops_gpu.py +++ b/tinygrad/runtime/ops_gpu.py @@ -13,6 +13,8 @@ OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.p FLOAT16 = getenv("FLOAT16", 0) # TODO: if you fork and exit the child process after creating anything with cl on AMD, it hangs on e.wait() +ROCM_LLVM_PATH = pathlib.Path("/opt/rocm/llvm/bin") +#ROCM_LLVM_PATH = pathlib.Path(__file__).parent.parent.parent.parent / "extra/rocm/build/llvm-project/bin" if DEBUG >= 5: from extra.helpers import enable_early_exec early_exec = enable_early_exec() @@ -48,7 +50,7 @@ class CLBuffer(RawBufferCopyInOut): class CLProgram: def __init__(self, name:str, prg:str, binary=False, argdtypes=None, options=None): - self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg) # type: ignore + self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]*len(CL.cl_ctx.devices)) if binary else cl.Program(CL.cl_ctx, prg) # type: ignore try: self._clprg = self.clprogram.build(options=options) except cl.RuntimeError as e: @@ -60,7 +62,7 @@ class CLProgram: from disassemblers.adreno import disasm disasm(self.binary()) elif 'gfx1100' in CL.cl_ctx.devices[0].name: - asm = early_exec(([pathlib.Path(__file__).parent.parent.parent / "extra/rocm/build/llvm-project/bin/llvm-objdump", '-d', '-'], self.binary())) + asm = early_exec(([ROCM_LLVM_PATH / "llvm-objdump", '-d', '-'], self.binary())) print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x])) else: # print the PTX for NVIDIA. TODO: probably broken for everything else @@ -77,7 +79,10 @@ class CLProgram: e = self.clprg(CL.cl_queue[cl_bufs[0].device], global_size, local_size, *cl_bufs) if wait: e.wait() - return ((e.profile.end - e.profile.start) * OSX_TIMING_RATIO) * 1e-9 + try: + return ((e.profile.end - e.profile.start) * OSX_TIMING_RATIO) * 1e-9 + except cl.RuntimeError: # no profiling info available + return None return None class CLCodegen(CStyleCodegen):