diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index 7e2577a8aa..01436d2dbb 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -299,7 +299,7 @@ class QCOMAllocator(HCQAllocator): pitch = round_up((real_stride:=imgw * 4 * options.image.itemsize), 1 << pitchalign) + pitch_add if options.external_ptr: texture = QCOMBuffer(options.external_ptr, size) - else: texture = self.device._gpu_alloc(pitch * imgh, kgsl.KGSL_MEMTYPE_TEXTURE, map_to_cpu=True) + else: texture = self.device._gpu_alloc(pitch * imgh, kgsl.KGSL_MEMTYPE_TEXTURE) texture.pitch, texture.real_stride = pitch, real_stride @@ -312,7 +312,7 @@ class QCOMAllocator(HCQAllocator): return texture - return QCOMBuffer(options.external_ptr, size) if options.external_ptr else self.device._gpu_alloc(size, map_to_cpu=True) + return QCOMBuffer(options.external_ptr, size) if options.external_ptr else self.device._gpu_alloc(size) def _do_copy(self, src_addr, dest_addr, src_size, real_size, src_stride, dest_stride, dest_off=0, src_off=0): while src_off < src_size: @@ -336,7 +336,6 @@ class QCOMAllocator(HCQAllocator): self.device.synchronize() self.device._gpu_free(opaque) -MAP_FIXED = 0x10 class QCOMDevice(HCQCompiled): signals_page: Any = None signals_pool: List[Any] = [] @@ -345,10 +344,10 @@ class QCOMDevice(HCQCompiled): def __init__(self, device:str=""): self.fd = os.open('/dev/kgsl-3d0', os.O_RDWR) - QCOMDevice.dummy_addr = self._gpu_alloc(0x1000, map_to_cpu=False).va_addr - QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, map_to_cpu=True, uncached=True) + QCOMDevice.dummy_addr = self._gpu_alloc(0x1000).va_addr + QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True) QCOMDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, self.signals_page.size, 16)] - info, self.ctx, self.cmd_buf, self.cmd_buf_ptr, self.last_cmd = self._info(), self._ctx_create(), self._gpu_alloc(16 << 20, map_to_cpu=True), 0,0 + info, self.ctx, self.cmd_buf, self.cmd_buf_ptr, self.last_cmd = self._info(), self._ctx_create(), self._gpu_alloc(16 << 20), 0,0 QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF) if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}") @@ -370,29 +369,26 @@ class QCOMDevice(HCQCompiled): kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info)) return info - def _gpu_alloc(self, size:int, flags:int=0, map_to_cpu=False, uncached=False, fill_zeroes=False): + def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False): + flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED) - alloc = kgsl.IOCTL_KGSL_GPUOBJ_ALLOC(self.fd, size=round_up(size, 1<<(alignment_hint:=12)), flags=flags | kgsl.KGSL_MEMALIGN(alignment_hint)) - info = kgsl.IOCTL_KGSL_GPUOBJ_INFO(self.fd, id=alloc.id) - va_addr, va_len = info.gpuaddr, info.va_len + alloc = kgsl.IOCTL_KGSL_GPUOBJ_ALLOC(self.fd, size=(bosz:=round_up(size, 1<