From 57e89645cd278b3a2823650747a9603b2d784d03 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Mon, 1 Jul 2024 17:36:37 +0300 Subject: [PATCH] hcq spec test (#5226) * start hcq spec test * more test * fixes * run on amd as well * test amdgpu exec * fix amd * amd mockgpu support sdma timestamp --- .github/workflows/test.yml | 2 +- extra/mockgpu/amd/amdgpu.py | 24 ++- test/test_hcq.py | 288 ++++++++++++++++++++++++++++++++++++ tinygrad/device.py | 3 +- tinygrad/helpers.py | 2 +- tinygrad/runtime/ops_amd.py | 8 +- 6 files changed, 321 insertions(+), 6 deletions(-) create mode 100644 test/test_hcq.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 411ee264f1..2a811fd29f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -531,7 +531,7 @@ jobs: run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --ignore test/test_gc.py --durations=20 - name: Run pytest (amd) if: matrix.backend=='amd' - run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/external/external_test_hcq.py --durations=20 + run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/test_hcq.py --durations=20 - name: Compile EfficientNet to C and test it if: matrix.backend=='clang' run: | diff --git a/extra/mockgpu/amd/amdgpu.py b/extra/mockgpu/amd/amdgpu.py index 5ca98b20db..495de20321 100644 --- a/extra/mockgpu/amd/amdgpu.py +++ b/extra/mockgpu/amd/amdgpu.py @@ -1,6 +1,6 @@ import ctypes, time from extra.mockgpu.gpu import VirtGPU -from tinygrad.helpers import to_mv, init_c_struct_t +from tinygrad.helpers import to_mv, init_c_struct_t, mv_address import tinygrad.runtime.autogen.amd_gpu as amd_gpu SDMA_MAX_COPY_SIZE = 0x400000 @@ -77,6 +77,7 @@ class PM4Executor(AMDQueue): elif op == amd_gpu.PACKET3_RELEASE_MEM: self._exec_release_mem(n) elif op == amd_gpu.PACKET3_WAIT_REG_MEM: cont = self._exec_wait_reg_mem(n) elif op == amd_gpu.PACKET3_DISPATCH_DIRECT: self._exec_dispatch_direct(n) + elif op == amd_gpu.PACKET3_INDIRECT_BUFFER: self._exec_indirect_buffer(n) elif op == amd_gpu.PACKET3_EVENT_WRITE: self._exec_event_write(n) else: raise RuntimeError(f"PM4: Unknown opcode: {op}") if not cont: return @@ -155,6 +156,18 @@ class PM4Executor(AMDQueue): err = remu.run_asm(prg_addr, prg_sz, *gl, *lc, args_addr) if err != 0: raise RuntimeError("remu does not support the new instruction introduced in this kernel") + def _exec_indirect_buffer(self, n): + addr_lo = self._next_dword() + addr_hi = self._next_dword() + buf_sz = self._next_dword() & (0x7fffff) + + rptr = memoryview(bytearray(8)).cast('Q') + wptr = memoryview(bytearray(8)).cast('Q') + rptr[0] = 0 + wptr[0] = buf_sz + PM4Executor(self.gpu, (addr_hi << 32) | addr_lo, buf_sz * 4, mv_address(rptr), mv_address(wptr)).execute() + assert rptr[0] == wptr[0], "not everything executed in amdgpu" + def _exec_event_write(self, n): assert n == 0 _ = self._next_dword() # do not emulate events for now @@ -175,6 +188,7 @@ class SDMAExecutor(AMDQueue): elif op == amd_gpu.SDMA_OP_POLL_REGMEM: cont = self._execute_poll_regmem() elif op == amd_gpu.SDMA_OP_GCR: self._execute_gcr() elif op == amd_gpu.SDMA_OP_COPY: self._execute_copy() + elif op == amd_gpu.SDMA_OP_TIMESTAMP: self._execute_timestamp() else: raise RuntimeError(f"Unknown SDMA op {op}") if not cont: return @@ -204,6 +218,14 @@ class SDMAExecutor(AMDQueue): self.rptr[0] += ctypes.sizeof(struct) return True + def _execute_timestamp(self): + struct = sdma_pkts.timestamp.from_address(self.base + self.rptr[0] % self.size) + + mem = to_mv(struct.addr, 8).cast('Q') + mem[0] = int(time.perf_counter() * 1e8) + + self.rptr[0] += ctypes.sizeof(struct) + def _execute_gcr(self): struct = sdma_pkts.gcr.from_address(self.base + self.rptr[0] % self.size) self.rptr[0] += ctypes.sizeof(struct) diff --git a/test/test_hcq.py b/test/test_hcq.py new file mode 100644 index 0000000000..035b33ea65 --- /dev/null +++ b/test/test_hcq.py @@ -0,0 +1,288 @@ +import unittest, ctypes, struct, array +from tinygrad import Device, Tensor, dtypes +from tinygrad.helpers import to_mv, CI, getenv +from tinygrad.device import Buffer, BufferOptions, HCQCompatCompiled +from tinygrad.engine.schedule import create_schedule +from tinygrad.engine.realize import get_runner + +MOCKGPU = getenv("MOCKGPU") + +@unittest.skipUnless(issubclass(type(Device[Device.DEFAULT]), HCQCompatCompiled), "HCQCompat device required to run") +class TestHCQ(unittest.TestCase): + @classmethod + def setUpClass(self): + TestHCQ.d0 = Device[Device.DEFAULT] + TestHCQ.a = Tensor([0.,1.], device=Device.DEFAULT).realize() + TestHCQ.b = self.a + 1 + si = create_schedule([self.b.lazydata])[-1] + + TestHCQ.runner = get_runner(TestHCQ.d0.dname, si.ast) + TestHCQ.b.lazydata.buffer.allocate() + TestHCQ.addr = struct.pack("QQ", TestHCQ.b.lazydata.buffer._buf.va_addr, TestHCQ.a.lazydata.buffer._buf.va_addr) + TestHCQ.addr2 = struct.pack("QQ", TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr) + TestHCQ.kernargs_off = TestHCQ.runner.clprg.kernargs_offset + TestHCQ.kernargs_size = TestHCQ.runner.clprg.kernargs_alloc_size + ctypes.memmove(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_off, TestHCQ.addr, len(TestHCQ.addr)) + ctypes.memmove(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_size+TestHCQ.kernargs_off, TestHCQ.addr2, len(TestHCQ.addr2)) + + if Device.DEFAULT == "NV": + # nv need to copy constbuffer there as well + if MOCKGPU: TestHCQ.runner.clprg.constbuffer_0[0:2] = [2, 0] # hack for nv mockgpu only. it needs to get count of args and vals. + to_mv(TestHCQ.d0.kernargs_ptr, 0x160).cast('I')[:] = array.array('I', TestHCQ.runner.clprg.constbuffer_0) + to_mv(TestHCQ.d0.kernargs_ptr+TestHCQ.kernargs_size, 0x160).cast('I')[:] = array.array('I', TestHCQ.runner.clprg.constbuffer_0) + + def setUp(self): + TestHCQ.d0.synchronize() + TestHCQ.a.lazydata.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 1)))) + TestHCQ.b.lazydata.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 0)))) + TestHCQ.d0.synchronize() # wait for copyins to complete + + # Test signals + def test_signal(self): + for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]: + with self.subTest(name=str(queue_type)): + queue_type().signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + def test_signal_update(self): + for queue_type in [TestHCQ.d0.hw_compute_queue_t]: + with self.subTest(name=str(queue_type)): + q = queue_type().signal(fake_signal := TestHCQ.d0._get_signal(), 0x1000) + + q.update_signal(0, signal=TestHCQ.d0.timeline_signal, value=TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + q.update_signal(0, value=TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + TestHCQ.d0.signals_pool.append(fake_signal) + + # Test wait + def test_wait(self): + for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]: + with self.subTest(name=str(queue_type)): + fake_signal = TestHCQ.d0._get_signal() + TestHCQ.d0._set_signal(fake_signal, 1) + queue_type().wait(fake_signal, 1) \ + .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + TestHCQ.d0.signals_pool.append(fake_signal) + + @unittest.skipIf(MOCKGPU, "Can't handle async update on MOCKGPU for now") + def test_wait_late_set(self): + for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]: + with self.subTest(name=str(queue_type)): + fake_signal = TestHCQ.d0._get_signal() + queue_type().wait(fake_signal, 1) \ + .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + + with self.assertRaises(RuntimeError): + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value, timeout=500) + + TestHCQ.d0._set_signal(fake_signal, 1) + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + + TestHCQ.d0.timeline_value += 1 + + TestHCQ.d0.signals_pool.append(fake_signal) + + def test_wait_update(self): + for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]: + with self.subTest(name=str(queue_type)): + fake_signal = TestHCQ.d0._get_signal() + q = queue_type().wait(TestHCQ.d0.timeline_signal, 0xffffffff).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + + TestHCQ.d0._set_signal(fake_signal, 0x30) + + q.update_wait(0, signal=fake_signal, value=0x30).submit(TestHCQ.d0) + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + TestHCQ.d0.signals_pool.append(fake_signal) + + # Test exec + def test_exec_one_kernel(self): + TestHCQ.d0.hw_compute_queue_t().exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size) \ + .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}" + + def test_exec_2_kernels_100_times(self): + q = TestHCQ.d0.hw_compute_queue_t() + q.wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \ + .exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size) \ + .exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr + TestHCQ.kernargs_size, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size) \ + .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + + for _ in range(100): + q.update_wait(0, value=TestHCQ.d0.timeline_value - 1).update_signal(3, value=TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + TestHCQ.d0.timeline_value += 1 + + assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 200.0, f"got val {val}" + + def test_exec_update(self): + q = TestHCQ.d0.hw_compute_queue_t() + q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size) \ + .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + + q.update_exec(0, (1,1,1), (1,1,1)) + q.submit(TestHCQ.d0) + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}" + assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1]) == 0.0, f"got val {val}, should not be updated" + + # Test copy + def test_copy(self): + TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \ + .copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8) \ + .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + + TestHCQ.d0.timeline_value += 1 + + assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[1]) == 0.0, f"got val {val}" + + def test_copy_100_times(self): + q = TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \ + .copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8) \ + .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + + for _ in range(100): + q.update_wait(0, value=TestHCQ.d0.timeline_value - 1).update_signal(2, value=TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + q.submit(TestHCQ.d0) + TestHCQ.d0.timeline_value += 1 + + # Test bind api + def test_bind(self): + for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]: + with self.subTest(name=str(queue_type)): + if not hasattr(queue_type(), 'bind'): self.skipTest("queue does not support bind api") + + fake_signal = TestHCQ.d0._get_signal() + q = queue_type().wait(TestHCQ.d0.timeline_signal, 0xffffffff).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + q.bind(TestHCQ.d0) + + TestHCQ.d0._set_signal(fake_signal, 0x30) + + q.update_wait(0, signal=fake_signal, value=0x30).submit(TestHCQ.d0) + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + TestHCQ.d0.signals_pool.append(fake_signal) + + # Test multidevice + def test_multidevice_signal_wait(self): + d1 = Device[f"{Device.DEFAULT}:1"] + + TestHCQ.d0.hw_copy_queue_t().signal(sig:=TestHCQ.d0._get_signal(value=0), value=0xfff) \ + .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + + d1.hw_copy_queue_t().wait(sig, value=0xfff) \ + .signal(d1.timeline_signal, d1.timeline_value).submit(d1) + + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + d1._wait_signal(d1.timeline_signal, d1.timeline_value) + d1.timeline_value += 1 + + TestHCQ.d0.signals_pool.append(sig) + + # Test profile api + def test_speed_exec_time(self): + TestHCQ.d0._prof_setup() + + sig_st, sig_en = TestHCQ.d0._get_signal(), TestHCQ.d0._get_signal() + TestHCQ.d0.hw_compute_queue_t().timestamp(sig_st) \ + .exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size) \ + .timestamp(sig_en) \ + .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + et = TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_en), True) - TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_st), True) + + TestHCQ.d0.signals_pool += [sig_st, sig_en] + + print(f"exec kernel time: {et:.2f} us") + assert 1 <= et <= (2000 if CI else 20) + + def test_speed_copy_bandwidth(self): + TestHCQ.d0._prof_setup() + + # THEORY: the bandwidth is low here because it's only using one SDMA queue. I suspect it's more stable like this at least. + SZ = 2_000_000_000 + a = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate() + b = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate() + + sig_st, sig_en = TestHCQ.d0._get_signal(), TestHCQ.d0._get_signal() + TestHCQ.d0.hw_copy_queue_t().timestamp(sig_st) \ + .copy(a._buf.va_addr, b._buf.va_addr, SZ) \ + .timestamp(sig_en) \ + .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + et = TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_en), True) - TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_st), True) + et_ms = et / 1e3 + + TestHCQ.d0.signals_pool += [sig_st, sig_en] + + gb_s = ((SZ / 1e9) / et_ms) * 1e3 + print(f"same device copy: {et_ms:.2f} ms, {gb_s:.2f} GB/s") + assert (0.3 if CI else 10) <= gb_s <= 1000 + + def test_speed_cross_device_copy_bandwidth(self): + TestHCQ.d0._prof_setup() + + SZ = 2_000_000_000 + b = Buffer(f"{Device.DEFAULT}:1", SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate() + a = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate() + TestHCQ.d0._gpu_map(b._buf) + + sig_st, sig_en = TestHCQ.d0._get_signal(), TestHCQ.d0._get_signal() + TestHCQ.d0.hw_copy_queue_t().timestamp(sig_st) \ + .copy(a._buf.va_addr, b._buf.va_addr, SZ) \ + .timestamp(sig_en) \ + .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + + et = TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_en), True) - TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_st), True) + et_ms = et / 1e3 + + TestHCQ.d0.signals_pool += [sig_st, sig_en] + + gb_s = ((SZ / 1e9) / et_ms) * 1e3 + print(f"cross device copy: {et_ms:.2f} ms, {gb_s:.2f} GB/s") + assert (0.3 if CI else 2) <= gb_s <= 50 + + def test_timeline_signal_rollover(self): + # NV 64bit, AMD 32bit + TestHCQ.d0.timeline_value = (1 << 64) - 20 if Device.DEFAULT == "NV" else (1 << 32) - 20 # close value to reset + TestHCQ.d0.hw_compute_queue_t().signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1).submit(TestHCQ.d0) + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) + + for _ in range(40): + q = TestHCQ.d0.hw_compute_queue_t() + q.wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) + q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size) + q.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0) + TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value) + TestHCQ.d0.timeline_value += 1 + assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}" + +if __name__ == "__main__": + unittest.main() diff --git a/tinygrad/device.py b/tinygrad/device.py index b604fbbcce..0cbf6b33af 100644 --- a/tinygrad/device.py +++ b/tinygrad/device.py @@ -225,6 +225,7 @@ class HCQCompatCompiled(Compiled): def _gpu2cpu_time(self, gpu_time, is_copy): raise NotImplementedError("need _gpu2cpu_time") def _prof_setup(self): + if not hasattr(self, 'profile_logger'): atexit.register(self._prof_finalize) self.profile_logger = ProfileLogger() def _sync_queue(q_t): @@ -236,8 +237,6 @@ class HCQCompatCompiled(Compiled): self.cpu_start_time, self.gpu_start_time = _sync_queue(self.hw_compute_queue_t) self.copy_cpu_start_time, self.copy_gpu_start_time = _sync_queue(self.hw_copy_queue_t) - atexit.register(self._prof_finalize) - def _prof_process_events(self): self.raw_prof_records += [(self._read_timestamp(st), self._read_timestamp(en), name, is_cp) for st, en, name, is_cp in self.sig_prof_records] for st, en, _, _ in self.sig_prof_records: self.signals_pool += [st, en] # type: ignore diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py index dfd7f4e23b..339fd58073 100644 --- a/tinygrad/helpers.py +++ b/tinygrad/helpers.py @@ -168,7 +168,7 @@ class ProfileLogger: self.mjson.append({"name": name, "ph": "X", "pid": self.actors[actor_name], "tid": self.subactors.get(subactor_key, -1), "ts":st, "dur":et-st}) ProfileLogger.writers -= 1 - if ProfileLogger.writers == 0: + if ProfileLogger.writers == 0 and len(self.mjson) > 0: with open(self.path, "w") as f: f.write(json.dumps({"traceEvents": self.mjson})) print(f"Saved profile to {self.path}. Use https://ui.perfetto.dev/ to open it.") diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index 73fda52848..e9923372d7 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -189,7 +189,7 @@ class HWPM4Queue(HWQueue): assert self.q[self.cmd_offsets[cmd_idx]] == amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6), f"Command at index {cmd_idx} is not signal" if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 3, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)]) - if signal.event_mailbox_ptr != 0: + if self.cmd_offsets[cmd_idx + 1] - self.cmd_offsets[cmd_idx] > 8: # has trap info self._patch(self.cmd_offsets[cmd_idx] + 8 + 3, [*data64_le(signal.event_mailbox_ptr), *data64_le(signal.event_id), signal.event_id]) if value is not None: self._patch(self.cmd_offsets[cmd_idx] + 5, [*data64_le(value)]) return self @@ -260,6 +260,12 @@ class HWCopyQueue(HWQueue): return self._mark_command_end() + def update_signal(self, cmd_idx, signal=None, value=None): + assert self.q[self.cmd_offsets[cmd_idx]] & 0xf == amd_gpu.SDMA_OP_FENCE, f"Command at index {cmd_idx} is not signal" + if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 1, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)]) + if value is not None: self.q[self.cmd_offsets[cmd_idx] + 3] = value + return self + def update_wait(self, cmd_idx, signal=None, value=None): assert self.q[self.cmd_offsets[cmd_idx]] & 0xf == amd_gpu.SDMA_OP_POLL_REGMEM, f"Command at index {cmd_idx} is not wait" if signal is not None: self._patch(self.cmd_offsets[cmd_idx] + 1, [*data64_le(ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET)])