From e8d1a37ecd44edd161cae7d7b0c468ca57f91abb Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:51:14 +0300 Subject: [PATCH] qcom cleanups (#6351) * qcom as_buffer * revert asbuf --- tinygrad/runtime/ops_qcom.py | 37 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index 23018a646b..b697cfb512 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -290,17 +290,11 @@ class QCOMAllocator(HCQAllocator): texture.samplers, texture.descriptor, texture.ibo = [0] * 4, [0] * 16, [0] * 16 texture.samplers[0:2] = [0x1b60, 0x30] # compiled sampler. always the same in tinygrad. - texture.descriptor[0] = adreno.A6XX_TEX_CONST_0_SWIZ_X(0) | adreno.A6XX_TEX_CONST_0_SWIZ_Y(1) | adreno.A6XX_TEX_CONST_0_SWIZ_Z(2) \ - | adreno.A6XX_TEX_CONST_0_SWIZ_W(3) \ - | adreno.A6XX_TEX_CONST_0_FMT(adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT) - texture.descriptor[1] = adreno.A6XX_TEX_CONST_1_WIDTH(options.image.shape[1]) | adreno.A6XX_TEX_CONST_1_HEIGHT(options.image.shape[0]) - texture.descriptor[2] = adreno.A6XX_TEX_CONST_2_TYPE(adreno.A6XX_TEX_2D) | adreno.A6XX_TEX_CONST_2_PITCH(pitch) \ - | adreno.A6XX_TEX_CONST_2_PITCHALIGN(pitchalign - 6) - - texture.descriptor[4:6] = data64_le(texture.va_addr) - texture.descriptor[6] = 0x40000000 - texture.descriptor[7] = 0xe - + tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT + texture.descriptor[0] = qreg.a6xx_tex_const_0(swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt) + texture.descriptor[1] = qreg.a6xx_tex_const_1(width=options.image.shape[1], height=options.image.shape[0]) + texture.descriptor[2] = qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=pitch, pitchalign=pitchalign-6) + texture.descriptor[4:7] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000)] texture.ibo = [texture.descriptor[0] & (~0xffff), *texture.descriptor[1:len(texture.descriptor)]] return texture @@ -349,26 +343,21 @@ class QCOMDevice(HCQCompiled): return info def _gpu_alloc(self, size:int, flags:int=0, map_to_cpu=False, uncached=False, fill_zeroes=False): - size = round_up(size, 1 << (alignment_hint:=12)) - flags |= (kgsl.KGSL_MEMALIGN(alignment_hint)) - if uncached: flags |= (kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED)) + if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED) - alloc = kgsl.IOCTL_KGSL_GPUOBJ_ALLOC(self.fd, size=size, flags=flags) - va_addr, va_len = None, 0 - if not (flags & kgsl.KGSL_MEMFLAGS_USE_CPU_MAP): - info = kgsl.IOCTL_KGSL_GPUOBJ_INFO(self.fd, id=alloc.id) - va_addr, va_len = info.gpuaddr, info.va_len + alloc = kgsl.IOCTL_KGSL_GPUOBJ_ALLOC(self.fd, size=round_up(size, 1<<(alignment_hint:=12)), flags=flags | kgsl.KGSL_MEMALIGN(alignment_hint)) + info = kgsl.IOCTL_KGSL_GPUOBJ_INFO(self.fd, id=alloc.id) + va_addr, va_len = info.gpuaddr, info.va_len - if map_to_cpu or (flags & kgsl.KGSL_MEMFLAGS_USE_CPU_MAP): - va_addr = libc.mmap(va_addr, va_len := (va_len or alloc.mmapsize), mmap.PROT_READ|mmap.PROT_WRITE, - mmap.MAP_SHARED | (MAP_FIXED if va_addr is not None else 0), self.fd, alloc.id * 0x1000) + if map_to_cpu: + va_addr = libc.mmap(va_addr, va_len, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.fd, alloc.id * 0x1000) if fill_zeroes: ctypes.memset(va_addr, 0, va_len) - return SimpleNamespace(va_addr=va_addr, size=va_len, mapped=map_to_cpu or (flags & kgsl.KGSL_MEMFLAGS_USE_CPU_MAP), info=alloc) + return SimpleNamespace(va_addr=va_addr, size=size, mapped=map_to_cpu, info=alloc) def _gpu_free(self, mem): kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.info.id) - if mem.mapped: libc.munmap(mem.va_addr, mem.size) + if mem.mapped: libc.munmap(mem.va_addr, mem.info.va_len) def _alloc_cmd_buf(self, sz: int): self.cmd_buf_ptr = (cur_ptr:=self.cmd_buf_ptr if self.cmd_buf_ptr + sz < self.cmd_buf.size else 0) + sz