diff --git a/test/external/fuzz_kfd.py b/test/external/fuzz_kfd.py
new file mode 100644
index 0000000000..3093da735f
--- /dev/null
+++ b/test/external/fuzz_kfd.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+import random
+from tqdm import trange
+from typing import List
+from tinygrad import Device
+from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWComputeQueue
+
+if __name__ == "__main__":
+  dev: List[KFDDevice] = [Device[f"KFD:{i}"] for i in range(6)]
+  print(f"got {len(dev)} devices")
+
+  buffers = [(rd:=random.choice(dev), rd.allocator.alloc(random.randint(1, 10000))) for i in range(100)]
+
+  for _ in trange(100000):
+    d1, b1 = random.choice(buffers)
+    d2, b2 = random.choice(buffers)
+    d1._gpu_map(b2)
+    q = HWComputeQueue()
+    q.signal(sig:=KFDDevice._get_signal(10))
+    qc = HWCopyQueue()
+    qc.wait(sig)
+    qc.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size))
+    d1.completion_signal.value = 1
+    qc.signal(d1.completion_signal)
+    qc.submit(d1)
+    q.wait(d1.completion_signal)
+    q.submit(d1)
+    KFDDevice._wait_on(d1.completion_signal.event_id)
diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index cfa0964608..d2d365b37e 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -115,9 +115,8 @@ class TestMultiTensor(unittest.TestCase):
       fn = f(n)
       np.testing.assert_allclose(fX.numpy(), fn, rtol=1e-6, atol=1e-6)
 
-  @unittest.skipIf(CI and Device.DEFAULT == "CLANG", "clang is slow")
+  @unittest.skip("slow")
   def test_fuzz_allreduce(self):
-    random.seed(41)
     for it in range(100):
       for n in range(2, 4+1):
@@ -132,7 +131,6 @@ class TestMultiTensor(unittest.TestCase):
         assert mean_err < 1e-6, f"big mean error, iteration {it}_{n}"
         assert max_err < 1e-6, f"big max error, iteration {it}_{n}"
 
-
  def _test_matmul_shard_axis(self, shard_x, shard_w, device):
    X = Tensor.kaiming_uniform(N, N).realize()
    W = Tensor.kaiming_uniform(N, N).realize()
diff --git a/tinygrad/runtime/ops_kfd.py b/tinygrad/runtime/ops_kfd.py
index 3cafb6f87e..c1b7311fef 100644
--- a/tinygrad/runtime/ops_kfd.py
+++ b/tinygrad/runtime/ops_kfd.py
@@ -219,6 +219,8 @@ class KFDProgram:
     if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)
 
   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+    if self.device.kernargs_ptr >= (self.device.kernargs.va_addr + self.device.kernargs.size + self.kernargs_segment_size):
+      self.device.kernargs_ptr = self.device.kernargs.va_addr
     assert self.device.kernargs_ptr < (self.device.kernargs.va_addr + self.device.kernargs.size + self.kernargs_segment_size), "kernargs overrun"
     if not hasattr(self, "args_struct_t"):
       self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
@@ -294,6 +296,7 @@ class KFDAllocator(LRUAllocator):
       self.device._wait_on(self.device.completion_signal.event_id)
 
   def copyout(self, dest:memoryview, src):
+    self.device.synchronize()
     for i in range(0, dest.nbytes, self.b[0].size):
       self.device.completion_signal.value = 1
       self.device._submit_sdma(self.b[0].va_addr, src.va_addr+i, lsize:=min(self.b[0].size, dest.nbytes-i),
@@ -315,11 +318,12 @@ class KFDDevice(Compiled):
     stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids))
     assert stm.n_success == len(mem.mapped_gpu_ids)
 
-  def _wait_on(self, event_id, timeout=1000):
+  @classmethod
+  def _wait_on(self, event_id, timeout=10000):
     evt_arr = (kfd.struct_kfd_event_data * 1)()
     evt_arr[0].event_id = event_id
     ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
-    if ret.wait_result != 0: raise RuntimeError(f"wait_result got {ret.wait_result}, hit timeout?")
+    if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")
 
   def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
     flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
@@ -348,9 +352,10 @@ class KFDDevice(Compiled):
 
   @classmethod
   def _get_signal(self, num=None):
-    if num is None: num = KFDDevice.signal_number
-    KFDDevice.signal_number += 1
-    if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 10
+    if num is None:
+      num = KFDDevice.signal_number
+      KFDDevice.signal_number += 1
+      if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 10
     ret = hsa.amd_signal_t.from_address(KFDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
     ret.value = 1
     return ret
@@ -379,8 +384,6 @@ class KFDDevice(Compiled):
 
     self.gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
     self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
-    self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
-
     self.eop_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
     self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
     self.kernargs_ptr = self.kernargs.va_addr
@@ -401,7 +404,7 @@ class KFDDevice(Compiled):
     self.amd_aql_queue.max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
 
     # scratch setup
-    self.max_private_segment_size = 512
+    self.max_private_segment_size = 4096
     wave_scratch_len = round_up(((self.amd_aql_queue.max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
     self.scratch_len = (self.amd_aql_queue.max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
     self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
@@ -439,6 +442,8 @@ class KFDDevice(Compiled):
     self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
     self.sdma_doorbell_value = 0
 
+    # PM4 stuff
+    self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
     pm4_indirect_cmd = (ctypes.c_uint32*13)(amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), self.pm4_indirect_buf.va_addr & 0xffffffff,
                                             (self.pm4_indirect_buf.va_addr>>32) & 0xffffffff, 8 | amd_gpu.INDIRECT_BUFFER_VALID, 0xa)
     ctypes.memmove(ctypes.addressof(pm4_cmds:=(ctypes.c_uint16*27)(1))+2, ctypes.addressof(pm4_indirect_cmd), ctypes.sizeof(pm4_indirect_cmd))
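
Note on the KFDProgram.__call__ hunk above: kernel arguments are carved out of the device's fixed kernargs buffer with a bump pointer (kernargs_ptr), and the added check wraps that pointer back to kernargs.va_addr once it runs off the end instead of tripping the "kernargs overrun" assert, which a long-running loop like fuzz_kfd.py would otherwise hit. The sketch below is a minimal, standalone illustration of that wrap-around bump allocation; the BumpRegion class, its names, and the simplified bound (end of region rather than the exact expression in the patch) are illustrative assumptions, not tinygrad code. Like the patch, it relies on earlier users of the region being finished before their slots are reused.

# Illustrative sketch (not tinygrad code): a bump allocator over a fixed region that
# wraps back to the base when the pointer runs off the end, mirroring the
# kernargs_ptr reset added to KFDProgram.__call__.
class BumpRegion:
  def __init__(self, base:int, size:int):
    self.base, self.size = base, size
    self.ptr = base  # next free address, analogous to kernargs_ptr

  def alloc(self, segment_size:int) -> int:
    # wrap instead of overrunning the region (the old code only asserted)
    if self.ptr >= self.base + self.size: self.ptr = self.base
    addr, self.ptr = self.ptr, self.ptr + segment_size
    assert addr < self.base + self.size, "kernargs overrun"
    return addr

if __name__ == "__main__":
  region = BumpRegion(base=0x1000, size=0x100)
  # enough allocations to force a wrap; without the reset the assert would fire
  print([hex(region.alloc(0x40)) for _ in range(10)])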