amd: increase sdma copy size (#8989)

* amd: increase sdma max copy size

* rm this

* fix

* fix

* ops
This commit is contained in:
nimlgen
2025-02-09 20:53:35 +03:00
committed by GitHub
parent 7eba5fb413
commit 88add71c25
2 changed files with 6 additions and 8 deletions

View File

@@ -237,7 +237,8 @@ class SDMAExecutor(AMDQueue):
def _execute_copy(self):
struct = sdma_pkts.copy_linear.from_address(self.base + self.rptr[0] % self.size)
ctypes.memmove(struct.dst_addr, struct.src_addr, struct.count + 1)
count_cnt = to_mv(self.base + self.rptr[0] + 4, 4).cast('I')[0] & 0x3FFFFFFF
ctypes.memmove(struct.dst_addr, struct.src_addr, count_cnt + 1)
self.rptr[0] += ctypes.sizeof(struct)
class AMDGPU(VirtGPU):

View File

@@ -154,10 +154,9 @@ class AMDComputeQueue(HWQueue):
dev.compute_queue.write_ptr[0] = dev.compute_queue.put_value
dev.compute_queue.doorbell[0] = dev.compute_queue.put_value
SDMA_MAX_COPY_SIZE = 0x400000
class AMDCopyQueue(HWQueue):
def __init__(self):
self.internal_cmd_sizes = []
def __init__(self, max_copy_size=0x40000000):
self.internal_cmd_sizes, self.max_copy_size = [], max_copy_size
super().__init__()
def q(self, *arr):
@@ -165,10 +164,10 @@ class AMDCopyQueue(HWQueue):
self.internal_cmd_sizes.append(len(arr))
def copy(self, dest:sint, src:sint, copy_size:int):
copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
copied, copy_commands = 0, (copy_size + self.max_copy_size - 1) // self.max_copy_size
for _ in range(copy_commands):
step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
step_copy_size = min(copy_size - copied, self.max_copy_size)
self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
@@ -280,8 +279,6 @@ class AMDProgram(HCQProgram):
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
class AMDAllocator(HCQAllocator['AMDDevice']):
def __init__(self, dev:AMDDevice): super().__init__(dev, batch_size=SDMA_MAX_COPY_SIZE)
def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)