diff --git a/test/test_hcq.py b/test/test_hcq.py
index f107d80898..1997cebd0f 100644
--- a/test/test_hcq.py
+++ b/test/test_hcq.py
@@ -3,7 +3,8 @@ from tinygrad import Device, Tensor, dtypes, TinyJit
 from tinygrad.helpers import CI, getenv, Context
 from tinygrad.device import Buffer, BufferOptions, HCQCompiled
 from tinygrad.engine.schedule import create_schedule
-from tinygrad.engine.realize import get_runner
+from tinygrad.engine.realize import get_runner, CompiledRunner
+from tinygrad.codegen.kernel import Kernel, Opt, OptOps
 
 MOCKGPU = getenv("MOCKGPU")
 
@@ -134,6 +135,38 @@ class TestHCQ(unittest.TestCase):
     assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}"
     assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1]) == 0.0, f"got val {val}, should not be updated"
 
+  def test_exec_update_fuzz(self):
+    a = Tensor.rand((3, 3, 3), dtype=dtypes.int, device=Device.DEFAULT).realize()
+    b = a + 1
+    si = create_schedule([b.lazydata])[-1]
+    k = Kernel(si.ast, opts=TestHCQ.d0.renderer)
+    for i in range(3): k.apply_opt(Opt(op=OptOps.LOCAL, axis=0, amt=3))
+
+    runner = CompiledRunner(k.to_program())
+
+    zb = Buffer(Device.DEFAULT, 3 * 3 * 3, dtypes.int, options=BufferOptions(cpu_access=True, nolru=True)).ensure_allocated()
+    zt = Buffer(Device.DEFAULT, 3 * 3 * 3, dtypes.int, options=BufferOptions(cpu_access=True, nolru=True)).ensure_allocated()
+    ctypes.memset(zb._buf.va_addr, 0, zb.nbytes)
+    kernargs = runner.clprg.fill_kernargs([zt._buf, zb._buf])
+
+    q = TestHCQ.d0.hw_compute_queue_t()
+    q.memory_barrier() \
+     .exec(runner.clprg, kernargs, (1,1,1), (1,1,1)) \
+     .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
+
+    for x in range(1, 4):
+      for y in range(1, 4):
+        for z in range(1, 4):
+          ctypes.memset(zt._buf.va_addr, 0, zb.nbytes)
+
+          q.update_exec(1, local_size=(x,y,z)) \
+           .update_signal(2, value=TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
+          TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
+          TestHCQ.d0.timeline_value += 1
+
+          res_sum = sum(x for x in zt.as_buffer().cast("I"))
+          assert x * y * z == res_sum, f"want {x * y * z}, got {res_sum}"
+
   # Test copy
   def test_copy(self):
     if TestHCQ.d0.hw_copy_queue_t is None: self.skipTest("device does not support copy queue")
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index be0ba9a2a3..5d7973e01e 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -124,7 +124,7 @@ class AMDComputeQueue(HWComputeQueue):
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_regs)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_regs
 
-    self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 4 # +1 to skip PACKET3_SET_SH_REG + 3 zeros.
+    self.cmd_idx_to_local_offset[cmd_idx] = len(self.q) - self.cmds_offset[cmd_idx] + 5 # +1 to skip PACKET3_SET_SH_REG + reg + 3 zeros.
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]