diff --git a/test/test_hcq.py b/test/test_hcq.py
index 47bb88619e..f8021d34bd 100644
--- a/test/test_hcq.py
+++ b/test/test_hcq.py
@@ -297,5 +297,80 @@ class TestHCQ(unittest.TestCase):
     TestHCQ.d0.timeline_value += 1
     TestHCQ.d0.synchronize()
 
+  def test_small_copies_from_host_buf(self):
+    buf1 = Buffer(Device.DEFAULT, 1, dtypes.int8, options=BufferOptions(nolru=True)).ensure_allocated()
+    buf2 = Buffer(Device.DEFAULT, 1, dtypes.int8, options=BufferOptions(host=True, nolru=True)).ensure_allocated()
+
+    for i in range(256):
+      ctypes.memset(buf2._buf.va_addr, i, 1)
+
+      TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
+                                  .copy(buf1._buf.va_addr, buf2._buf.va_addr, 1) \
+                                  .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
+      TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
+      TestHCQ.d0.timeline_value += 1
+
+      assert buf1.as_buffer()[0] == i
+
+  def test_small_copies_from_host_buf_intercopy(self):
+    buf1 = Buffer(Device.DEFAULT, 1, dtypes.int8, options=BufferOptions(nolru=True)).ensure_allocated()
+    buf2 = Buffer(Device.DEFAULT, 1, dtypes.int8, options=BufferOptions(nolru=True)).ensure_allocated()
+    buf3 = Buffer(Device.DEFAULT, 1, dtypes.int8, options=BufferOptions(host=True, nolru=True)).ensure_allocated()
+
+    for i in range(256):
+      ctypes.memset(buf3._buf.va_addr, i, 1)
+
+      TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
+                                  .copy(buf1._buf.va_addr, buf3._buf.va_addr, 1) \
+                                  .copy(buf2._buf.va_addr, buf1._buf.va_addr, 1) \
+                                  .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
+      TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
+      TestHCQ.d0.timeline_value += 1
+
+      assert buf2.as_buffer()[0] == i
+
+  def test_small_copies_from_host_buf_transfer(self):
+    _ = Device[f"{Device.DEFAULT}:1"]
+
+    buf1 = Buffer(Device.DEFAULT, 1, dtypes.int8, options=BufferOptions(nolru=True)).ensure_allocated()
+    buf2 = Buffer(f"{Device.DEFAULT}:1", 1, dtypes.int8, options=BufferOptions(nolru=True)).ensure_allocated()
+    buf3 = Buffer(Device.DEFAULT, 1, dtypes.int8, options=BufferOptions(host=True, nolru=True)).ensure_allocated()
+    TestHCQ.d0.allocator.map(buf2._buf)
+
+    for i in range(256):
+      ctypes.memset(buf3._buf.va_addr, i, 1)
+
+      TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
+                                  .copy(buf1._buf.va_addr, buf3._buf.va_addr, 1) \
+                                  .copy(buf2._buf.va_addr, buf1._buf.va_addr, 1) \
+                                  .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
+      TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
+      TestHCQ.d0.timeline_value += 1
+
+      assert buf2.as_buffer()[0] == i
+
+  def test_memory_barrier(self):
+    buf1 = Buffer(Device.DEFAULT, 1, dtypes.int8, options=BufferOptions(nolru=True)).ensure_allocated()
+    buf2 = Buffer(Device.DEFAULT, 1, dtypes.int8, options=BufferOptions(nolru=True)).ensure_allocated()
+    buf3 = Buffer(Device.DEFAULT, 1, dtypes.int8, options=BufferOptions(cpu_access=True, nolru=True)).ensure_allocated()
+
+    for i in range(256):
+      ctypes.memset(buf3._buf.va_addr, i, 1)
+
+      # Need memory_barrier after direct write to vram
+      TestHCQ.d0.hw_compute_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
+                                     .memory_barrier() \
+                                     .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
+      TestHCQ.d0.timeline_value += 1
+
+      TestHCQ.d0.hw_copy_queue_t().wait(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value - 1) \
+                                  .copy(buf1._buf.va_addr, buf3._buf.va_addr, 1) \
+                                  .copy(buf2._buf.va_addr, buf1._buf.va_addr, 1) \
+                                  .signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
+      TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
+      TestHCQ.d0.timeline_value += 1
+
+      assert buf2.as_buffer()[0] == i
+
 if __name__ == "__main__":
   unittest.main()
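Note: each test added above drives the device through the same timeline idiom: the queue waits on the previous timeline value, does its work, and signals the next value; the host then blocks on that value and bumps the counter. A minimal sketch of the pattern, assuming only the names already used in the tests (the run_copy helper itself is hypothetical):

# Sketch of the wait/signal/bump idiom shared by the new tests above.
# run_copy is a hypothetical helper; timeline_signal, timeline_value, and the
# chainable wait/copy/signal/submit builder calls are taken from the diff as-is.
def run_copy(dev, dst_addr, src_addr, nbytes):
  # GPU side: order this copy after all previously signaled work.
  dev.hw_copy_queue_t().wait(dev.timeline_signal, dev.timeline_value - 1) \
                       .copy(dst_addr, src_addr, nbytes) \
                       .signal(dev.timeline_signal, dev.timeline_value).submit(dev)
  # Host side: block until the copy signals, then advance the timeline.
  dev.timeline_signal.wait(dev.timeline_value)
  dev.timeline_value += 1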
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index cf6f905e0a..dbe5848118 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -101,7 +101,7 @@ class AMDComputeQueue(HWComputeQueue):
     self.binded_device.synchronize()
     self.binded_device._gpu_free(self.hw_page)
 
-  def _invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
+  def _acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
@@ -109,14 +109,29 @@ class AMDComputeQueue(HWComputeQueue):
                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
                amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
 
+  def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
+    cache_flush_flags = 0
+
+    if cache_flush:
+      cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
+        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
+        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
+
+    # event_index__mec_release_mem__end_of_pipe = 5
+    # event_index__mec_release_mem__shader_done = 6
+    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
+               amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
+               amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
+               *data64_le(address), *data64_le(value), cst]
+
   def _memory_barrier(self):
     self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
       amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
       nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
-    self._invalidate_cache()
+    self._acquire_mem()
 
   def _exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
-    self._invalidate_cache()
+    self._acquire_mem(gli=0, gl2=0)
 
     user_data = [*data64_le(kernargs)]
     if hasattr(prg, 'dispatch_packet_offset'):
@@ -154,21 +169,6 @@ class AMDComputeQueue(HWComputeQueue):
                amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
                amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal._value_addr), value, 0xffffffff, 4]
 
-  def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
-    cache_flush_flags = 0
-
-    if cache_flush:
-      cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
-        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
-        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
-
-    # event_index__mec_release_mem__end_of_pipe = 5
-    # event_index__mec_release_mem__shader_done = 6
-    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
-               amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
-               amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
-               *data64_le(address), *data64_le(value), cst]
-
   def _timestamp(self, signal): self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal._timestamp_addr)
 
   def _signal(self, signal, value=0):
@@ -176,7 +176,7 @@ class AMDComputeQueue(HWComputeQueue):
     self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._value_addr, value=value, cache_flush=True)
     if signal._event_mailbox_ptr != 0:
       self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal._event_mailbox_ptr,
-                        value=signal._event_id, cst=signal._event_id, cache_flush=True)
+                        value=signal._event_id, cst=signal._event_id, cache_flush=False)
 
   def _update_wait(self, cmd_idx, signal=None, value=None):
     if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal._value_addr))
@@ -220,10 +220,6 @@ class AMDCopyQueue(HWCopyQueue):
     self.internal_cmd_sizes.append(len(arr))
 
   def _copy(self, dest, src, copy_size):
-    # Invalidate cache inv
-    self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLM_INV | amd_gpu.SDMA_GCR_GLK_INV | amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GLV_INV | \
-             amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])
-
     copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
     self.copy_cmds_per_copy[len(self) - 1] = copy_commands
     for _ in range(copy_commands):
@@ -234,13 +230,10 @@ class AMDCopyQueue(HWCopyQueue):
 
       copied += step_copy_size
 
-    # Invalidate cache wb
-    self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GL2_WB, 0, 0])
-
   def _update_copy(self, cmd_idx, dest=None, src=None):
     for i in range(self.copy_cmds_per_copy[cmd_idx]):
-      if src is not None: self._patch(cmd_idx, offset=8+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
-      if dest is not None: self._patch(cmd_idx, offset=10+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
+      if src is not None: self._patch(cmd_idx, offset=3+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
+      if dest is not None: self._patch(cmd_idx, offset=5+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
 
   def _signal(self, signal, value=0):
     self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal._value_addr), value])
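Note on the _update_copy offsets: the removed per-copy GCR request ([SDMA_OP_GCR_REQ, 0, flags, 0, 0]) occupied five dwords in front of every seven-dword SDMA copy packet, so the patched src/dest positions shift up by exactly five. A sketch of the arithmetic, with hypothetical constant names (only the dword counts come from the diff):

# The dropped cache-invalidation packet was 5 dwords, so the src/dest patch
# offsets within each SDMA copy command move from 8/10 down to 3/5.
GCR_REQ_DWORDS = 5                             # len([SDMA_OP_GCR_REQ, 0, flags, 0, 0])
OLD_SRC_OFF, OLD_DEST_OFF = 8, 10              # offsets before this change
NEW_SRC_OFF = OLD_SRC_OFF - GCR_REQ_DWORDS     # -> 3, matching the patch above
NEW_DEST_OFF = OLD_DEST_OFF - GCR_REQ_DWORDS   # -> 5, matching the patch above
assert (NEW_SRC_OFF, NEW_DEST_OFF) == (3, 5)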