diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c0ae82ec0d..566239c826 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -709,6 +709,7 @@ jobs: PYTHONPATH=. python3 extra/remote/serve.py 6482 & sleep 1 DEBUG=2 PYTHONPATH=. REMOTE=127.0.0.1:6482 AM_RESET=1 AMD=1 AMD_IFACE=PCI python3 test/test_tiny.py + DEBUG=2 PYTHONPATH=. REMOTE=127.0.0.1:6482 AM_RESET=1 AMD=1 AMD_AQL=1 AMD_IFACE=PCI python3 test/test_tiny.py pkill -f 'extra/remote/serve.py' || true - name: Run process replay tests run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py index dcbeda2d85..26eeb7c319 100644 --- a/tinygrad/runtime/ops_amd.py +++ b/tinygrad/runtime/ops_amd.py @@ -1015,11 +1015,11 @@ class AMDDevice(HCQCompiled): gart = self.iface.alloc(0x100, uncached=True, cpu_access=True) if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL: - aql_desc = hsa.amd_queue_t(queue_properties=hsa.AMD_QUEUE_PROPERTIES_IS_PTR64 | hsa.AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, + self.aql_gart = gart + self.aql_desc = hsa.amd_queue_t(queue_properties=hsa.AMD_QUEUE_PROPERTIES_IS_PTR64 | hsa.AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, read_dispatch_id_field_base_byte_offset=getattr(hsa.amd_queue_t, 'read_dispatch_id').offset, max_cu_id=(self.cu_cnt * self.xccs) - 1, max_wave_id=self.waves_per_cu - 1) - gart.cpu_view().view(fmt='B')[:ctypes.sizeof(aql_desc)] = bytes(aql_desc) - self.aql_desc = hsa.amd_queue_t.from_address(gart.cpu_view().addr) + self.aql_gart.cpu_view().view(fmt='B')[:ctypes.sizeof(self.aql_desc)] = bytes(self.aql_desc) cwsr_buffer_size = round_up((ctx_save_restore_size + debug_memory_size) * self.xccs, mmap.PAGESIZE) cwsr_buffer = self.iface.alloc(cwsr_buffer_size) if ctx_save_restore_size else None @@ -1067,6 +1067,7 @@ class AMDDevice(HCQCompiled): int.from_bytes(rsrc1_t(BASE_ADDRESS_HI=hi32(self.scratch.va_addr), SWIZZLE_ENABLE=1), 'little'), lo32(size_per_xcc), int.from_bytes(bytes(rsrc3_t(**rsrc)), 'little')] self.aql_desc.compute_tmpring_size = self.tmpring_size + self.aql_gart.cpu_view()[:ctypes.sizeof(self.aql_desc)] = bytes(self.aql_desc) def invalidate_caches(self): self.hw_compute_queue_t().memory_barrier().signal(self.timeline_signal, self.next_timeline()).submit(self)