Mirror of https://github.com/tinygrad/tinygrad.git (synced 2026-01-09 23:18:04 -05:00)
amd: increase sdma copy size (#8989)
* amd: increase sdma max copy size
* rm this
* fix
* fx
* ops
This commit is contained in:
@@ -237,7 +237,8 @@ class SDMAExecutor(AMDQueue):
|
||||
|
||||
def _execute_copy(self):
|
||||
struct = sdma_pkts.copy_linear.from_address(self.base + self.rptr[0] % self.size)
|
||||
ctypes.memmove(struct.dst_addr, struct.src_addr, struct.count + 1)
|
||||
count_cnt = to_mv(self.base + self.rptr[0] + 4, 4).cast('I')[0] & 0x3FFFFFFF
|
||||
ctypes.memmove(struct.dst_addr, struct.src_addr, count_cnt + 1)
|
||||
self.rptr[0] += ctypes.sizeof(struct)
|
||||
|
||||
class AMDGPU(VirtGPU):
|
||||
|
||||
@@ -154,10 +154,9 @@ class AMDComputeQueue(HWQueue):
|
||||
dev.compute_queue.write_ptr[0] = dev.compute_queue.put_value
|
||||
dev.compute_queue.doorbell[0] = dev.compute_queue.put_value
|
||||
|
||||
SDMA_MAX_COPY_SIZE = 0x400000
|
||||
class AMDCopyQueue(HWQueue):
|
||||
def __init__(self):
|
||||
self.internal_cmd_sizes = []
|
||||
def __init__(self, max_copy_size=0x40000000):
|
||||
self.internal_cmd_sizes, self.max_copy_size = [], max_copy_size
|
||||
super().__init__()
|
||||
|
||||
def q(self, *arr):
|
||||
@@ -165,10 +164,10 @@ class AMDCopyQueue(HWQueue):
|
||||
self.internal_cmd_sizes.append(len(arr))
|
||||
|
||||
def copy(self, dest:sint, src:sint, copy_size:int):
|
||||
copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
|
||||
copied, copy_commands = 0, (copy_size + self.max_copy_size - 1) // self.max_copy_size
|
||||
|
||||
for _ in range(copy_commands):
|
||||
step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
|
||||
step_copy_size = min(copy_size - copied, self.max_copy_size)
|
||||
|
||||
self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
|
||||
amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
|
||||
@@ -280,8 +279,6 @@ class AMDProgram(HCQProgram):
|
||||
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
|
||||
|
||||
class AMDAllocator(HCQAllocator['AMDDevice']):
|
||||
def __init__(self, dev:AMDDevice): super().__init__(dev, batch_size=SDMA_MAX_COPY_SIZE)
|
||||
|
||||
def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
|
||||
return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user