add qcom runtime (#5213)
* qcom: driver init
* autogen stubs for msm_kgsl; also fix up ioctls to show numbers instead of _IOW macros
* autogen: add adreno commands and registers
* ops_qcom: QcomAllocator + signals
* fix EDEADLK in hwqueue, init timestamps, use opencl compiler for qcom
* qcom: we do not really need all these constants; input/output is enough
* qcom: perfctr for CS (do not really need all the rest)
* qcom: HALFREGFOOTPRINT and FULLREGFOOTPRINT are set to be around max
* qcom: explicitly set instruction len based on the shader size
* ops_qcom: Program init
- extracts shader from the OpenCL binary
- sets input/output buffers
- allocates stack
- sets cs mode
- runs shader
* use data64_le from helpers
* ops_qcom: use fill_kernargs for filling i/o buffers
* ops_qcom: add QcomCopyQueue just for api & set kernargs_args_offset
* new signals & fix exec
* add QCOM to the list of supported devices (a usage sketch follows this list)
* correct QcomComputeQueue._wait using CP_WAIT_REG_MEM
* fix exec, synchronize before copyout
* correct setting num_units for ST_SHADER
* fix GPU hangs on signals with CP_MEM_WRITE; it is uncached mem anyway
* extract offsets to kernel arguments from the OpenCL binary
* extract constant values and offsets from the OpenCL binary
* handle KGSL_MEMFLAGS_USE_CPU_MAP correctly
* align kernel name to 4 bytes when skipping the kernel's OpenCL struct
* skip to consts directly using an offset from the OpenCL binary header
* fix alloc
* get halfreg and fullreg from the OpenCL binary
* set unmultiplied global sizes as kernel group in HLSQ_CS_NDRANGE
* parse prg offset from the OpenCL binary
* save loc with HLSQ_CS_CNTL. set this with HLSQ_CONTROL_2_REG
* support for vals in _fill_kernargs
* support 16-bit constants
* use KGSL_CONTEXT_NO_FAULT_TOLERANCE for contexts
this helps avoid faulting when executing big kernels; from the kernel driver:
/* Don't time out if the context has disabled it */
if (drawobj->context->flags & KGSL_CONTEXT_NO_FAULT_TOLERANCE)
    return;
* minor changes to _exec
* QCOMRenderer
* disable HCQGraph for demo. TODO: support HCQ update API
* support HCQ
- remove copy queue
- add updates
- add strides for buffs and vars for QCOM
* bufs_stride
* clean ups
* linter
* call super().__init__(value) in QcomSignal
* disable=unused-import
* mypy
* type ignore when queue is on the device
* fix
* query gpu_id; will be useful for selecting commands, e.g. CP_EVENT_WRITE vs CP_EVENT_WRITE7
* working timestamps
* free context after device is done
* move gpu stack to the device
* reserve some space with lib_gpu for the GPU to write to
this fixes test_interpolate_bilinear
* exclude tests that fail with GPU=1 on Qualcomm
* lint
* unmap mem in _gpu_free
* ctxt priority and preemption policy
* remove old qcom
* pass size to self.device.allocator.free
* skip tests only on qcom
* use kgsl and adreno defines instead of numeric vals
* use allocator for allocating lib_gpu
* update to QcomArgsState from master
* intermediate commit while conquering images
* enable image tests on qcom
* fix shader disasm size, dump textures stuff
* working images
* allow signals to be 0
* set branchstack from OpenCL binary
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* set shared memory size from OpenCL binary
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* update images in QcomArgsState & less loc for images
* set stack sizes from OpenCL binary
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* stack allocation based on OpenCL binary
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* better autogen for kgsl and adreno. no more bitshifts
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* cleanup commit for parse cl lib
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* don't forget actual generated files
* refactor + less loc
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* device.py back
* lint
* ruff
* timestamp divisor
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* fix tex fmt & round global size
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* dtypes
* 19.2MHz
* -1 loc in _update_exec
* remove noqa
---------
Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
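
For context, a minimal smoke test of the new backend (editor's sketch, not part of the commit; assumes a device exposing /dev/kgsl-3d0 and a working OpenCL compiler, since QCOM compiles kernels through it):

  import os
  os.environ["QCOM"] = "1"   # force the new backend (see the device.py hunk below)
  from tinygrad import Tensor
  print((Tensor([1.0, 2.0, 3.0]) + 1).numpy())  # expected: [2. 3. 4.]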
Committed via GitHub. Parent: 8e2a3fc165. Commit: 4c33192a8b.
@@ -214,6 +214,23 @@ generate_libc() {
   fixup $BASE/libc.py
 }
 
+generate_kgsl() {
+  clang2py extra/qcom_gpu_driver/msm_kgsl.h -o $BASE/kgsl.py -k cdefstum
+  fixup $BASE/kgsl.py
+  sed -i "s\import ctypes\import ctypes, os\g" $BASE/kgsl.py
+  sed -i -E 's/^#? ?([^= ]+) = _[^,]+, 0x([0-9A-Fa-f]+) .+$/\1 = 0x\2/' $BASE/kgsl.py
+  sed -nE 's/#define ([A-Za-z0-9_]+)_SHIFT\s*[^\S\r\n]*[0-9]*$/def \1(val): return (val << \1_SHIFT) \& \1_MASK/p' extra/qcom_gpu_driver/msm_kgsl.h >> $BASE/kgsl.py
+  python3 -c "import tinygrad.runtime.autogen.kgsl"
+}
+
+generate_adreno() {
+  clang2py extra/qcom_gpu_driver/a6xx.xml.h -o $BASE/adreno.py -k cestum
+  sed -nE 's/#define ([A-Za-z0-9_]+)__SHIFT\s*[^\S\r\n]*[0-9]*$/def \1(val): return (val << \1__SHIFT) \& \1__MASK/p' extra/qcom_gpu_driver/a6xx.xml.h >> $BASE/adreno.py
+  fixup $BASE/adreno.py
+  sed -i "s\import ctypes\import ctypes, os\g" $BASE/adreno.py
+  python3 -c "import tinygrad.runtime.autogen.adreno"
+}
+
 if [ "$1" == "opencl" ]; then generate_opencl
 elif [ "$1" == "hip" ]; then generate_hip
 elif [ "$1" == "comgr" ]; then generate_comgr
@@ -225,6 +242,8 @@ elif [ "$1" == "nv" ]; then generate_nv
 elif [ "$1" == "amd" ]; then generate_amd
 elif [ "$1" == "io_uring" ]; then generate_io_uring
 elif [ "$1" == "libc" ]; then generate_libc
+elif [ "$1" == "kgsl" ]; then generate_kgsl
+elif [ "$1" == "adreno" ]; then generate_adreno
 elif [ "$1" == "all" ]; then generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_nvrtc; generate_hsa; generate_kfd; generate_nv; generate_amd; generate_io_uring; generate_libc
 else echo "usage: $0 <type>"
 fi
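
For reference, the sed rules above rewrite the headers' "#define FOO__SHIFT" lines into Python packing helpers (the matching "__MASK" constants already come from clang2py). An editor's sketch of the generated shape, using a hypothetical field with a made-up mask value:

  A6XX_EXAMPLE_FIELD__MASK = 0x00007fff   # hypothetical, produced by clang2py
  A6XX_EXAMPLE_FIELD__SHIFT = 0
  def A6XX_EXAMPLE_FIELD(val): return (val << A6XX_EXAMPLE_FIELD__SHIFT) & A6XX_EXAMPLE_FIELD__MASK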
16783 extra/qcom_gpu_driver/a6xx.xml.h (new file; diff suppressed because it is too large)
@@ -56,6 +56,11 @@ def hprint(vals):
 
 ST6_SHADER = 0
 ST6_CONSTANTS = 1
+ST6_UBO = 2
+ST6_IBO = 3
+
+SB6_CS_TEX = 5
+SB6_CS_SHADER = 13
 
 def parse_cmd_buf(dat):
   ptr = 0
@@ -74,10 +79,20 @@ def parse_cmd_buf(dat):
       num_unit = vals[0]>>22
       print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}")
-      from extra.disassemblers.adreno import disasm_raw
-      if state_type == ST6_SHADER: disasm_raw(get_mem(((vals[2] << 32) | vals[1]), 0x180))
-      if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4)))
-      pass
+
+      if state_block == SB6_CS_SHADER:
+        from extra.disassemblers.adreno import disasm_raw
+        if state_type == ST6_SHADER: disasm_raw(get_mem(((vals[2] << 32) | vals[1]), num_unit * 128))
+        if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4)))
+      elif state_block == SB6_CS_TEX:
+        if state_type == ST6_SHADER:
+          samplers_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 4 * 4)
+          print('texture samplers')
+          hexdump(samplers_bytes)
+        if state_type == ST6_CONSTANTS:
+          descriptors_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4)
+          print('texture descriptors')
+          hexdump(descriptors_bytes)
 
     elif ops[opcode] == "CP_REG_TO_MEM":
       reg, cnt, b64, accum = vals[0] & 0x3FFFF, (vals[0] >> 18) & 0xFFF, (vals[0] >> 30) & 0x1, (vals[0] >> 31) & 0x1
       dest = vals[1] | (vals[2] << 32)
@@ -88,6 +103,13 @@ def parse_cmd_buf(dat):
       offset, size = ((cmd>>8)&0x7FFFF), cmd&0x7F
       vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size])
       print(f"{ptr:3X} -- typ 4: {size=:3d}, {offset=:#x}", hprint(vals))
+      if offset == 0xa9b0:
+        print(f'THREADSIZE-{(vals[0] >> 20)&0x1}\nEARLYPREAMBLE-{(vals[0] >> 23) & 0x1}\nMERGEDREGS-{(vals[0] >> 3) & 0x1}\nTHREADMODE-{vals[0] & 0x1}\nHALFREGFOOTPRINT-{(vals[0] >> 1) & 0x3f}\nFULLREGFOOTPRINT-{(vals[0] >> 7) & 0x3f}\nBRANCHSTACK-{(vals[0] >> 14) & 0x3f}\n')
+        print(f'SP_CS_UNKNOWN_A9B1-{vals[1]}\nSP_CS_BRANCH_COND-{vals[2]}\nSP_CS_OBJ_FIRST_EXEC_OFFSET-{vals[3]}\nSP_CS_OBJ_START-{vals[4] | (vals[5] << 32)}\nSP_CS_PVT_MEM_PARAM-{vals[6]}\nSP_CS_PVT_MEM_ADDR-{vals[7] | (vals[8] << 32)}\nSP_CS_PVT_MEM_SIZE-{vals[9]}')
+      if offset == 0xb180:
+        print('border color offset', hex(vals[1] << 32 | vals[0]))
+        hexdump(get_mem(vals[1] << 32 | vals[0], 0x1000))
+
       ptr += 4*size
     else:
       print("unk", hex(cmd))
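
As a sanity check on the 0xa9b0 (SP_CS_CTRL_REG0) decode above, here is the same field extraction applied to a made-up register value (editor's sketch, not part of the diff):

  val = (2 << 1) | (3 << 7) | (4 << 14) | (1 << 20)  # HALFREG=2, FULLREG=3, BRANCHSTACK=4, THREADSIZE=1
  assert (val >> 1) & 0x3f == 2 and (val >> 7) & 0x3f == 3
  assert (val >> 14) & 0x3f == 4 and (val >> 20) & 0x1 == 1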
@@ -4,7 +4,7 @@ from tinygrad import Device, dtypes, Tensor, Context
 from tinygrad.dtype import ImageDType
 from tinygrad.engine.realize import lower_schedule
 
-@unittest.skipIf(Device.DEFAULT != "GPU", "only images on GPU")
+@unittest.skipIf(Device.DEFAULT not in ("QCOM", "GPU"), "only images on GPU")
 class TestImageDType(unittest.TestCase):
   def test_image_and_back(self):
     data = Tensor.randn(9*27*4).realize()
@@ -344,6 +344,7 @@ class TestOps(unittest.TestCase):
     helper_test_op([(45,35), (45,35), (45,35)], lambda x,y,z: x.lerp(y,z))
     helper_test_op(None, lambda x,y,z: x.lerp(y,z), vals=[[1.,2.,3.], [4.,5.,6.], 0.5])
 
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU/QCOM backends)")
   def test_tril(self):
     helper_test_op([(3,3)], lambda x: x.tril())
     helper_test_op([(3,3)], lambda x: x.tril(1))
@@ -360,6 +361,8 @@ class TestOps(unittest.TestCase):
     helper_test_op([(5,0,3)], lambda x: x.tril())
     helper_test_op([(5,3,3)], lambda x: x.tril(1))
     helper_test_op(None, lambda x: x.tril(), vals=[[[True] * 3] * 3], forward_only=True)
 
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU/QCOM backends)")
   def test_triu(self):
     helper_test_op([(3,3)], lambda x: x.triu())
     helper_test_op([(3,3)], lambda x: x.triu(1))
@@ -668,6 +671,7 @@ class TestOps(unittest.TestCase):
 
   def test_small_cumsum(self):
     helper_test_op([(10)], lambda x: torch.cumsum(x, dim=0), lambda x: Tensor.cumsum(x, axis=0))
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "Qualcomm fails on this (both on GPU/QCOM backends)")
   def test_simple_cumsum(self):
     helper_test_op([(512)], lambda x: torch.cumsum(x, dim=0), lambda x: Tensor.cumsum(x, axis=0))
     helper_test_op([(1022)], lambda x: torch.cumsum(x, dim=0), lambda x: Tensor.cumsum(x, axis=0))
@@ -858,6 +862,7 @@ class TestOps(unittest.TestCase):
   def test_small_gemm_eye(self):
     helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
   @unittest.skipIf(CI and Device.DEFAULT in ["NV", "LLVM", "GPU", "CUDA"], "not supported on these in CI")
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "Qualcomm fails on this (both on GPU/QCOM backends)")
   def test_gemm_fp16(self):
     helper_test_op([(64,64), (64,64)], lambda x,y: x.half().matmul(y.half()), atol=5e-3, rtol=5e-3)
   def test_gemm(self):
@@ -934,25 +939,31 @@ class TestOps(unittest.TestCase):
     helper_test_op([(3,4,5,6)], lambda x: x.max(axis=1)[0], lambda x: x.max(axis=1))
     helper_test_op([()], lambda x: x.max())
 
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU/QCOM backends)")
   def test_any(self):
     helper_test_op([(3,4,5,6)], lambda x: x.any(), forward_only=True)
     helper_test_op(None, lambda x: x.any(), vals=[[True, True]], forward_only=True)
     helper_test_op(None, lambda x: x.any(), vals=[[True, False]], forward_only=True)
     helper_test_op(None, lambda x: x.any(), vals=[[False, False]], forward_only=True)
     helper_test_op([()], lambda x: x.any(), forward_only=True)
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU/QCOM backends)")
   def test_any_axis(self):
     helper_test_op([(3,4,5,6)], lambda x: x.any(axis=(1,2)), forward_only=True)
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU/QCOM backends)")
   def test_any_zero_axis(self):
     helper_test_op([(1,0,3,0,5)], lambda x: x.any(axis=(1,3)), forward_only=True)
 
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU/QCOM backends)")
   def test_all(self):
     helper_test_op([(3,4,5,6)], lambda x: x.all(), forward_only=True)
     helper_test_op(None, lambda x: x.all(), vals=[[True, True]], forward_only=True)
     helper_test_op(None, lambda x: x.all(), vals=[[True, False]], forward_only=True)
     helper_test_op(None, lambda x: x.all(), vals=[[False, False]], forward_only=True)
     helper_test_op([()], lambda x: x.all(), forward_only=True)
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU/QCOM backends)")
   def test_all_axis(self):
     helper_test_op([(3,4,5,6)], lambda x: x.all(axis=(1,2)), forward_only=True)
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU/QCOM backends)")
   def test_all_zero_axis(self):
     helper_test_op([(1,0,3,0,5)], lambda x: x.all(axis=(1,3)), forward_only=True)
 
@@ -1427,6 +1438,7 @@ class TestOps(unittest.TestCase):
       lambda x,w: Tensor.conv2d(x,w).relu(), grad_rtol=1e-5)
 
   @unittest.skipIf(IMAGE>0, "no conv3d on images")
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "Qualcomm fails on this (both on GPU/QCOM backends)")
   def test_padded_conv3d(self):
     helper_test_op([(1,4,5,5,5), (4,4,3,3,3)],
       lambda x,w: torch.nn.functional.conv3d(x,w,padding=1).relu(),
@@ -1447,6 +1459,7 @@ class TestOps(unittest.TestCase):
       lambda x,w: torch.nn.functional.conv2d(x,w).relu(),
       lambda x,w: Tensor.conv2d(x,w).relu(), grad_rtol=1e-5)
 
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "Qualcomm fails on this (both on GPU/QCOM backends)")
   def test_nested_conv2d(self):
     helper_test_op([(1,32,9,9), (32,32,3,3), (32,32,3,3)],
       lambda x,w1,w2: torch.nn.functional.conv2d(torch.nn.functional.conv2d(x,w1).relu(), w2).relu(),
@@ -1669,12 +1682,14 @@ class TestOps(unittest.TestCase):
       lambda x,w: torch.nn.functional.conv2d(torch.nn.functional.pad(x, p),w).relu(),
       lambda x,w: Tensor.conv2d(x,w,padding=p).relu())
 
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "Qualcomm fails on this (both on GPU/QCOM backends)")
   def test_padded_conv2d_p21(self):
     bs,cin,H,W,padding = 4, 3, 3, 3, (2,1)
     helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
       lambda x,w: torch.nn.functional.conv2d(x,w,padding=padding).relu(),
       lambda x,w: Tensor.conv2d(x,w,padding=padding).relu())
 
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "Qualcomm fails on this (both on GPU/QCOM backends)")
   def test_padded_conv2d_p22(self):
     bs,cin,H,W,padding = 4, 3, 3, 3, (2,2)
     helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
@@ -1854,6 +1869,7 @@ class TestOps(unittest.TestCase):
       lambda x: torch.nn.functional.interpolate(x, size=out_sz, mode="trilinear", align_corners=True),
       lambda x: Tensor.interpolate(x, size=out_sz, mode="linear", align_corners=True), atol=1e-4)
 
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU/QCOM backends)")
   def test_cat(self):
     for dim in range(-2, 3):
       helper_test_op([(45,65,9), (45,65,9), (45,65,9)], lambda x,y,z: torch.cat((x,y,z), dim), lambda x,y,z: x.cat(y, z, dim=dim))
@@ -2104,6 +2120,7 @@ class TestOps(unittest.TestCase):
     helper_test_op([(32,10)], lambda x: x.masked_fill((x>0.1).detach(), -math.inf))
     helper_test_op([(32,10)], lambda x: x.masked_fill((x<0.1).detach(), -math.inf))
 
+  @unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU/QCOM backends)")
   def test_cast(self):
     helper_test_op([(3, 3)], lambda x: x.float())
     helper_test_op(None, lambda x: x.float(), vals=[[0, 1, 2, 3]], forward_only=True)
@@ -30,7 +30,7 @@ class _Device:
   @functools.cached_property
   def DEFAULT(self) -> str:
     if (from_env:=next((d for d in self._devices if d not in ["DISK", "NPY"] and getenv(d) == 1), None)): return from_env
-    for device in ["METAL", "AMD", "NV", "CUDA", "GPU", "CLANG", "LLVM"]:
+    for device in ["METAL", "AMD", "NV", "CUDA", "QCOM", "GPU", "CLANG", "LLVM"]:
       try:
         if self[device]:
           os.environ[device] = "1"  # we set this in environment for spawned children
@@ -424,3 +424,4 @@ static inline __attribute__((device)) bool operator==(hip_bfloat16 a, hip_bfloat
 
 class NVRenderer(CUDARenderer): device = "NV"
 class HIPRenderer(AMDRenderer): device = "HIP"
+class QCOMRenderer(OpenCLRenderer): device = "QCOM"
17904 tinygrad/runtime/autogen/adreno.py (new file; diff suppressed because it is too large)
1372 tinygrad/runtime/autogen/kgsl.py (new file; diff suppressed because it is too large)
406 tinygrad/runtime/ops_qcom.py (new file, shown below)
@@ -0,0 +1,406 @@
from __future__ import annotations
import os, time, ctypes, fcntl, functools, mmap, struct, array, decimal, math
from types import SimpleNamespace
from typing import Tuple, List, Dict, Any, cast
from tinygrad.device import BufferOptions, HCQBuffer, HWComputeQueue, HCQProgram, HCQCompiled, HCQSignal, HCQAllocator, HCQArgsState, hcq_command
import tinygrad.runtime.autogen.kgsl as kgsl
import tinygrad.runtime.autogen.adreno as adreno
import tinygrad.runtime.autogen.libc as libc
from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
from tinygrad.renderer.cstyle import QCOMRenderer
from tinygrad.helpers import getenv, from_mv, mv_address, to_mv, round_up, data64_le, prod
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl  # noqa: F401  # pylint: disable=unused-import

def next_power2(x): return 1 if x == 0 else 1 << (x - 1).bit_length()

def prt(val: int):
  for i in range(4,1,-1): val ^= val >> (1 << i)
  return (~0x6996 >> (val & 0xf)) & 1

def pkt7_hdr(opcode: int, cnt: int): return adreno.CP_TYPE7_PKT | cnt & 0x3FFF | prt(cnt) << 15 | (opcode & 0x7F) << 16 | prt(opcode) << 23

def pkt4_hdr(reg: int, cnt: int): return adreno.CP_TYPE4_PKT | cnt & 0x7F | prt(cnt) << 7 | (reg & 0x3FFFF) << 8 | prt(reg) << 27
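
# Editor's note, not part of the original file: prt() computes the odd-parity
# bit that the PM4 packet headers carry (the same fold-and-lookup trick as
# mesa's pm4_odd_parity_bit; 0x6996 is the 4-bit parity table). For example,
# emitting a CP_RUN_OPENCL packet with one payload dword produces the two
# dwords [pkt7_hdr(adreno.CP_RUN_OPENCL, 1), 0], as done at the end of _exec below.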

class QCOMCompiler(CLCompiler):
  def __init__(self, device:str=""): super().__init__(CLDevice(device), 'compile_qcom')

class QCOMSignal(HCQSignal):
  def __init__(self, value=0, **kwargs):
    self._signal = QCOMDevice.signals_pool.pop()
    super().__init__(value)
  def __del__(self): QCOMDevice.signals_pool.append(self._signal)
  def _get_value(self) -> int: return self._signal[0]
  def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(19.2)  # based on the 19.2MHz always-on timer
  def _set_value(self, new_value:int): self._signal[0] = new_value
  def wait(self, value:int, timeout:int=60000):
    start_time = time.time() * 1000
    while time.time() * 1000 - start_time < timeout:
      if self._signal[0] >= value: return
    raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
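
# Editor's note, not part of the original file: the division above converts
# always-on-counter ticks to microseconds; at 19.2MHz there are 19.2 ticks per
# microsecond, so 19_200_000 ticks / 19.2 == 1_000_000 us == 1 second.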

class QCOMComputeQueue(HWComputeQueue):
  def __init__(self):
    self.cmd_idx_to_dims = {}
    super().__init__()

  def cmd(self, opcode: int, *vals: int): self.q += [pkt7_hdr(opcode, len(vals)), *vals]

  def reg(self, reg: int, *vals: int): self.q += [pkt4_hdr(reg, len(vals)), *vals]

  def _signal(self, signal, value=0, ts=False):
    if QCOMDevice.gpu_id < 700:
      self.cmd(adreno.CP_EVENT_WRITE, adreno.CACHE_FLUSH_TS | (0 if not ts else adreno.CP_EVENT_WRITE_0_TIMESTAMP),
               *data64_le(mv_address(signal._signal) + (0 if not ts else 8)), value & 0xFFFFFFFF)
      self.cmd(adreno.CP_EVENT_WRITE, adreno.CACHE_INVALIDATE)
    else:
      # TODO: support devices starting with 8 Gen 1. Also, the 700 series has the convenient CP_GLOBAL_TIMESTAMP and CP_LOCAL_TIMESTAMP
      raise RuntimeError('CP_EVENT_WRITE7 is not supported')

  def _timestamp(self, signal): return self._signal(signal, 0, ts=True)

  def _wait(self, signal, value=0):
    self.cmd(adreno.CP_WAIT_REG_MEM, adreno.WRITE_GE | adreno.CP_WAIT_REG_MEM_0_POLL(adreno.POLL_MEMORY),
             *data64_le(mv_address(signal._signal)), value & 0xFFFFFFFF, 0xFFFFFFFF, 32)  # busy wait for 32 cycles

  def _update_signal(self, cmd_idx, signal, value):
    if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(mv_address(signal._signal)))
    if value is not None: self._patch(cmd_idx, offset=4, data=[value & 0xFFFFFFFF])

  def _update_wait(self, cmd_idx, signal, value):
    if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(mv_address(signal._signal)))
    if value is not None: self._patch(cmd_idx, offset=4, data=[value & 0xFFFFFFFF])

  def _build_gpu_command(self, device):
    to_mv((hw_page_addr:=device._alloc_cmd_buf(len(self.q) * 4)), len(self.q) * 4).cast('I')[:] = array.array('I', self.q)
    obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=len(self.q) * 4, flags=kgsl.KGSL_CMDLIST_IB)
    submit_req = kgsl.struct_kgsl_gpu_command(cmdlist=ctypes.addressof(obj), numcmds=1, context_id=device.ctx,
                                              cmdsize=ctypes.sizeof(kgsl.struct_kgsl_command_object))
    return submit_req, obj

  def bind(self, device):
    self.binded_device = device
    self.submit_req, self.obj = self._build_gpu_command(self.binded_device)
    # From now on, the queue is on the device for faster submission.
    self.q = to_mv(self.obj.gpuaddr, len(self.q) * 4).cast("I")  # type: ignore

  def _submit(self, device):
    if self.binded_device == device: submit_req = self.submit_req
    else: submit_req, _ = self._build_gpu_command(device)
    device._ioctl(kgsl.IOCTL_KGSL_GPU_COMMAND, submit_req)

  @hcq_command
  def setup(self):
    self.cmd(adreno.CP_WAIT_FOR_IDLE)
    self.cmd(adreno.CP_SET_MARKER, adreno.RM6_COMPUTE)
    self.reg(adreno.REG_A6XX_HLSQ_INVALIDATE_CMD, adreno.A6XX_HLSQ_INVALIDATE_CMD_CS_STATE | adreno.A6XX_HLSQ_INVALIDATE_CMD_CS_IBO)
    self.reg(adreno.REG_A6XX_HLSQ_INVALIDATE_CMD, 0x0)
    self.reg(adreno.REG_A6XX_SP_CS_TEX_COUNT, 0xff)  # set to max
    self.reg(adreno.REG_A6XX_SP_CS_IBO_COUNT, 0xff)  # set to max
    self.reg(adreno.REG_A6XX_SP_MODE_CONTROL, adreno.A6XX_SP_MODE_CONTROL_ISAMMODE(adreno.ISAMMODE_CL))
    self.reg(adreno.REG_A6XX_SP_PERFCTR_ENABLE, adreno.A6XX_SP_PERFCTR_ENABLE_CS)
    self.reg(adreno.REG_A6XX_SP_TP_MODE_CNTL, adreno.ISAMMODE_CL | (1 << 3))  # ISAMMODE|UNK3
    self.reg(adreno.REG_A6XX_TPL1_DBG_ECO_CNTL, 0)
  def _exec(self, prg, args_state, global_size, local_size):
    global_size_mp = cast(Tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size))) if local_size else global_size
    self.cmd_idx_to_dims[len(self) - 1] = [global_size, local_size]

    self.cmd(adreno.CP_WAIT_FOR_IDLE)
    self.reg(adreno.REG_A6XX_HLSQ_CS_NDRANGE_0,
             adreno.A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM(3) | adreno.A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1)
             | adreno.A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) | adreno.A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1),
             global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0, 0xccc0cf,
             0xfc | adreno.A6XX_HLSQ_CS_CNTL_1_THREADSIZE(adreno.THREAD64),
             int(math.ceil(global_size[0])), int(math.ceil(global_size[1])), int(math.ceil(global_size[2])))
    self.reg(adreno.REG_A6XX_SP_CS_CTRL_REG0,
             adreno.A6XX_SP_CS_CTRL_REG0_THREADSIZE(adreno.THREAD64) | adreno.A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(prg.hregs_count)
             | adreno.A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(prg.fregs_count) | adreno.A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(prg.branch_stack),
             adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK5 | adreno.A6XX_SP_CS_UNKNOWN_A9B1_UNK6 | adreno.A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(prg.shared_size),
             0, prg.prg_offset, *data64_le(prg.lib_gpu.va_addr), adreno.A6XX_SP_CS_PVT_MEM_PARAM_MEMSIZEPERITEM(prg.pvtmem_size_per_item),
             *data64_le(prg.device._stack.va_addr), adreno.A6XX_SP_CS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(prg.pvtmem_size_total))
    self.cmd(adreno.CP_LOAD_STATE6_FRAG,
             adreno.CP_LOAD_STATE6_0_STATE_TYPE(adreno.ST_CONSTANTS) | adreno.CP_LOAD_STATE6_0_STATE_SRC(adreno.SS6_INDIRECT)
             | adreno.CP_LOAD_STATE6_0_STATE_BLOCK(adreno.SB6_CS_SHADER) | adreno.CP_LOAD_STATE6_0_NUM_UNIT(prg.kernargs_alloc_size // 4),
             *data64_le(args_state.ptr))
    self.cmd(adreno.CP_LOAD_STATE6_FRAG, adreno.CP_LOAD_STATE6_0_STATE_TYPE(adreno.ST_SHADER) | adreno.CP_LOAD_STATE6_0_STATE_SRC(adreno.SS6_INDIRECT)
             | adreno.CP_LOAD_STATE6_0_STATE_BLOCK(adreno.SB6_CS_SHADER) | adreno.CP_LOAD_STATE6_0_NUM_UNIT(round_up(prg.image_size, 128) // 128),
             *data64_le(prg.lib_gpu.va_addr))
    self.reg(adreno.REG_A6XX_HLSQ_CONTROL_2_REG, 0xfcfcfcfc, 0xfcfcfcfc, 0xfcfcfcfc, 0xfc,
             adreno.A6XX_HLSQ_CS_CNTL_CONSTLEN(prg.kernargs_alloc_size // 4) | adreno.A6XX_HLSQ_CS_CNTL_ENABLED)

    self.reg(adreno.REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET, prg.hw_stack_offset)
    self.reg(adreno.REG_A6XX_SP_CS_INSTRLEN, prg.image_size // 4)

    if hasattr(args_state, 'samplers_ptr'):
      self.cmd(adreno.CP_LOAD_STATE6_FRAG,
               adreno.CP_LOAD_STATE6_0_STATE_TYPE(adreno.ST_SHADER) | adreno.CP_LOAD_STATE6_0_STATE_SRC(adreno.SS6_INDIRECT)
               | adreno.CP_LOAD_STATE6_0_STATE_BLOCK(adreno.SB6_CS_TEX) | adreno.CP_LOAD_STATE6_0_NUM_UNIT(args_state.samplers_cnt),
               *data64_le(args_state.samplers_ptr.va_addr))

      self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.samplers_ptr.va_addr))
      self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.device._border_color_base()))

    if hasattr(args_state, 'descriptors_ptr'):
      self.cmd(adreno.CP_LOAD_STATE6_FRAG,
               adreno.CP_LOAD_STATE6_0_STATE_TYPE(adreno.ST_CONSTANTS) | adreno.CP_LOAD_STATE6_0_STATE_SRC(adreno.SS6_INDIRECT)
               | adreno.CP_LOAD_STATE6_0_STATE_BLOCK(adreno.SB6_CS_TEX) | adreno.CP_LOAD_STATE6_0_NUM_UNIT(args_state.descriptors_cnt),
               *data64_le(args_state.descriptors_ptr.va_addr))
      self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.descriptors_ptr.va_addr))

    if hasattr(args_state, 'ibos_ptr'):
      self.cmd(adreno.CP_LOAD_STATE6_FRAG,
               adreno.CP_LOAD_STATE6_0_STATE_TYPE(adreno.ST6_IBO) | adreno.CP_LOAD_STATE6_0_STATE_SRC(adreno.SS6_INDIRECT)
               | adreno.CP_LOAD_STATE6_0_STATE_BLOCK(adreno.SB6_CS_SHADER) | adreno.CP_LOAD_STATE6_0_NUM_UNIT(args_state.ibos_cnt),
               *data64_le(args_state.ibos_ptr.va_addr))
      self.reg(adreno.REG_A6XX_SP_CS_IBO, *data64_le(args_state.ibos_ptr.va_addr))

    self.reg(adreno.REG_A6XX_SP_CS_CONFIG,
             adreno.A6XX_SP_CS_CONFIG_ENABLED | adreno.A6XX_SP_CS_CONFIG_NSAMP(args_state.samplers_cnt)
             | adreno.A6XX_SP_CS_CONFIG_NTEX(args_state.descriptors_cnt) | adreno.A6XX_SP_CS_CONFIG_NIBO(args_state.ibos_cnt))

    self.cmd(adreno.CP_RUN_OPENCL, 0)

  def _update_exec(self, cmd_idx, global_size, local_size):
    if global_size is not None:
      self._patch(cmd_idx, offset=11, data=[int(math.ceil(global_size[0])), int(math.ceil(global_size[1])), int(math.ceil(global_size[2]))])
      self.cmd_idx_to_dims[cmd_idx][0] = global_size

    if local_size is not None:
      payload = (adreno.A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM(3) | adreno.A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1)
                 | adreno.A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) | adreno.A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1))
      self._patch(cmd_idx, offset=2, data=[payload])
      self.cmd_idx_to_dims[cmd_idx][1] = local_size

    global_size_mp = self.cmd_idx_to_dims[cmd_idx][0]
    if self.cmd_idx_to_dims[cmd_idx][1]:
      global_size_mp = cast(Tuple[int,int,int], tuple(int(g*l) for g,l in zip(self.cmd_idx_to_dims[cmd_idx][0], self.cmd_idx_to_dims[cmd_idx][1])))
    self._patch(cmd_idx, offset=3, data=[global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0])

class QCOMArgsState(HCQArgsState):
  def __init__(self, ptr:int, prg:QCOMProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
    super().__init__(ptr, prg, bufs, vals=vals)
    self.ibos_cnt, self.descriptors_cnt, self.samplers_cnt = 0, 0, 0
    ctypes.memset(ptr, 0, 1024)

    if len(bufs) + len(vals) != len(prg.buffs_info): raise RuntimeError(f'incorrect args size given={len(bufs)} != want={len(prg.buffs_info)}')
    self.boffs, self.aoffs = prg.buffs_info[:len(bufs)], prg.buffs_info[len(bufs):]
    for i, v in enumerate(vals): self.update_var(i, v)
    for cnst_val, cnst_off, cnst_sz in prg.consts_info:
      ctypes.memmove(self.ptr + cnst_off, (ctypes.c_int8 * cnst_sz).from_buffer_copy(cnst_val.to_bytes(cnst_sz, byteorder='little')), cnst_sz)

    samplers: List[Any] = []
    descriptors: List[Any] = []
    ibos: List[Any] = []
    self.i2descr: Dict[int, int] = {}
    self.i2ibo: Dict[int, int] = {}
    for i, b in enumerate(bufs):
      if not hasattr(b, 'samplers') and not hasattr(b, 'descriptor') and not hasattr(b, 'ibo'): self.update_buffer(i, b)
      elif self.boffs[i][1]: ibos, self.i2ibo = [*ibos, *getattr(b, 'ibo')], {**self.i2ibo, i: len(ibos)}
      else:
        samplers, descriptors = [*samplers, *getattr(b, 'samplers')], [*descriptors, *getattr(b, 'descriptor')]
        self.i2descr[i] = len(descriptors) - 1

    def alloc_tex_gpu(data, chunk_size) -> Tuple[HCQBuffer, int]:
      tex_gpu = self.prg.device.allocator.alloc(len(data) * 4, BufferOptions(nolru=True, cpu_access=True))
      to_mv(tex_gpu.va_addr, len(data) * 4).cast('I')[:] = array.array('I', data)
      return tex_gpu, len(data) // chunk_size

    if len(samplers): self.samplers_ptr, self.samplers_cnt = alloc_tex_gpu(samplers, 4)
    if len(descriptors): self.descriptors_ptr, self.descriptors_cnt = alloc_tex_gpu(descriptors, 16)
    if len(ibos): self.ibos_ptr, self.ibos_cnt = alloc_tex_gpu(ibos, 16)

  def __del__(self):
    for ptr in ('samplers_ptr', 'descriptors_ptr', 'ibos_ptr'):
      if hasattr(self, ptr): self.prg.device.allocator.free((x:=getattr(self, ptr)), x.size, BufferOptions(nolru=True, cpu_access=True))

  def update_buffer(self, index:int, buf:HCQBuffer):
    if (descr:=self.i2descr.get(index, None)) is not None: to_mv(self.descriptors_ptr.va_addr + 16 * descr + 4 * 4, 8).cast('Q')[0] = buf.va_addr
    elif (ibo:=self.i2ibo.get(index, None)) is not None: to_mv(self.ibos_ptr.va_addr + 16 * ibo + 4 * 4, 8).cast('Q')[0] = buf.va_addr
    else: to_mv(self.ptr + self.boffs[index][0], 8).cast('Q')[0] = buf.va_addr

  def update_var(self, index:int, val:int): to_mv(self.ptr + self.aoffs[index][0], 8).cast('Q')[0] = val
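
# Editor's note, not part of the original file: the constants loop above copies
# each cnst_val into the kernel-arguments region as cnst_sz little-endian
# bytes, so a 16-bit constant (e.g. 0x3c00, fp16 1.0) occupies exactly two
# bytes at cnst_off; this is where the "support 16-bit constants" commit in the
# log above takes effect.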

class QCOMProgram(HCQProgram):
  def __init__(self, device: QCOMDevice, name: str, lib: bytes):
    self.device, self.name, self.lib = device, name, lib
    self._parse_lib()

    # reserve some space after for gpu to use
    self.lib_gpu = self.device.allocator.alloc(len(self.image) + 0x20000, options=BufferOptions(cpu_access=True, nolru=True))
    to_mv(self.lib_gpu.va_addr, self.image_size)[:] = self.image

    self.pvtmem_size_per_item = round_up(self.pvtmem, 512) >> 9
    self.pvtmem_size_total = self.pvtmem_size_per_item * 128 * 2
    self.hw_stack_offset = round_up(next_power2(round_up(self.pvtmem, 512)) * 128 * 16, 0x1000)
    self.shared_size = max(1, (self.shmem - 1) // 1024)
    self.max_threads = min(1024, ((384 * 32) // (max(1, (self.fregs_count + round_up(self.hregs_count, 2) // 2)) * 128)) * 128)
    device._ensure_stack_size(self.hw_stack_offset * 4)

    super().__init__(QCOMArgsState, self.device, self.name, kernargs_alloc_size=1024)

  def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
    if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
    if any(g*l>mx for g,l,mx in zip(global_size, local_size, [65536, 65536, 65536])) and any(l>mx for l,mx in zip(local_size, [1024, 1024, 1024])):
      raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
    return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

  def _parse_lib(self):
    def _read_lib(off): return struct.unpack("I", self.lib[off:off+4])[0]

    # Extract image binary
    self.image_size = _read_lib(0x100)
    self.image = bytearray(self.lib[(image_offset:=_read_lib(0xc0)):image_offset+self.image_size])

    # Parse image descriptors
    image_desc_off = _read_lib(0x110)
    self.prg_offset, self.branch_stack = _read_lib(image_desc_off+0xc4), _read_lib(image_desc_off+0x108)
    self.pvtmem, self.shmem = _read_lib(image_desc_off+0xc8), _read_lib(image_desc_off+0xd8)

    # Fill up constants and buffers info
    self.buffs_info, self.consts_info = [], []

    samplers_count = _read_lib(image_desc_off + 0xdc)
    bdoff = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * samplers_count
    while (bdoff + 16 <= len(self.lib)):
      length, _, _, offset_words = struct.unpack("I" * 4, self.lib[bdoff:bdoff+16])
      if length == 0: break
      self.buffs_info.append((offset_words * 4, struct.unpack("I", self.lib[bdoff+0x3c:bdoff+0x40])[0] == 0x0))
      bdoff += length

    if _read_lib(0xb0) != 0:  # check if we have constants
      cdoff = _read_lib(0xac)
      while (cdoff + 40 <= image_offset):
        cnst, offset_words, _, is32 = struct.unpack("I", self.lib[cdoff:cdoff+4])[0], *struct.unpack("III", self.lib[cdoff+16:cdoff+28])
        self.consts_info.append((cnst, offset_words * (sz_bytes:=(2 << is32)), sz_bytes))
        cdoff += 40

    # Registers info
    reg_desc_off = _read_lib(0x34)
    self.fregs_count, self.hregs_count = _read_lib(reg_desc_off + 0x14), _read_lib(reg_desc_off + 0x18)

  def __del__(self):
    if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferOptions(cpu_access=True, nolru=True))
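
# Editor's note, not part of the original file: a worked example of the
# private-memory sizing above for a kernel whose OpenCL binary reports
# pvtmem=1000: pvtmem_size_per_item = round_up(1000, 512) >> 9 = 2 (512B
# units), pvtmem_size_total = 2 * 128 * 2 = 512, and hw_stack_offset =
# round_up(next_power2(1024) * 128 * 16, 0x1000) = 0x200000.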

class QCOMAllocator(HCQAllocator):
  def __init__(self, device:QCOMDevice): super().__init__(device)

  def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
    if options.image is not None:
      pitchalign = 6
      pitch = round_up(round_up(options.image.shape[1], 16) * (4 * options.image.base.itemsize), 1 << pitchalign)
      texture = self.device._gpu_alloc(pitch * round_up(options.image.shape[0], 16), kgsl.KGSL_MEMTYPE_TEXTURE, map_to_cpu=True)

      # save it here to load in one command (the same approach as OpenCL and mesa)
      texture.samplers, texture.descriptor, texture.ibo = [0] * 4, [0] * 16, [0] * 16
      texture.samplers[0:2] = [0x1b60, 0x30]  # compiled sampler. always the same in tinygrad.

      texture.descriptor[0] = adreno.A6XX_TEX_CONST_0_SWIZ_X(0) | adreno.A6XX_TEX_CONST_0_SWIZ_Y(1) | adreno.A6XX_TEX_CONST_0_SWIZ_Z(2) \
        | adreno.A6XX_TEX_CONST_0_SWIZ_W(3) \
        | adreno.A6XX_TEX_CONST_0_FMT(adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT)
      texture.descriptor[1] = adreno.A6XX_TEX_CONST_1_WIDTH(options.image.shape[1]) | adreno.A6XX_TEX_CONST_1_HEIGHT(options.image.shape[0])
      texture.descriptor[2] = adreno.A6XX_TEX_CONST_2_TYPE(adreno.A6XX_TEX_2D) | adreno.A6XX_TEX_CONST_2_PITCH(pitch) \
        | adreno.A6XX_TEX_CONST_2_PITCHALIGN(pitchalign - 6)
      texture.descriptor[4:6] = data64_le(texture.va_addr)
      texture.descriptor[6] = 0x40000000
      texture.descriptor[7] = 0xe

      texture.ibo = [texture.descriptor[0] & (~0xffff), *texture.descriptor[1:len(texture.descriptor)]]

      return texture

    return self.device._gpu_alloc(size, map_to_cpu=True)

  def copyin(self, dest:HCQBuffer, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), src.nbytes)

  def copyout(self, dest:memoryview, src:HCQBuffer):
    self.device.synchronize()
    ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)

  def _free(self, opaque, options:BufferOptions):
    self.device.synchronize()
    self.device._gpu_free(opaque)
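
# Editor's note, not part of the original file: a worked example of the image
# pitch math above for a 9x27 float32 image (4 channels per texel, itemsize 4):
# round_up(27, 16) * (4 * 4) = 32 * 16 = 512 bytes, already aligned to
# 1 << pitchalign = 64, so pitch = 512 and the texture allocation covers
# 512 * round_up(9, 16) = 8192 bytes.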

MAP_FIXED = 0x10
class QCOMDevice(HCQCompiled):
  signals_page: Any = None
  signals_pool: List[Any] = []
  gpu_id: int = 0

  def __init__(self, device:str=""):
    self.fd = os.open('/dev/kgsl-3d0', os.O_RDWR)
    QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, map_to_cpu=True, uncached=True)
    QCOMDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, self.signals_page.size, 16)]
    info, self.ctx, self.cmd_buf, self.cmd_buf_ptr = self._info(), self._ctx_create(), self._gpu_alloc(0x1000000, map_to_cpu=True), 0
    QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF)
    assert QCOMDevice.gpu_id < 700

    super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self),
                     QCOMSignal, QCOMComputeQueue, None, timeline_signals=(QCOMSignal(), QCOMSignal()))

    QCOMComputeQueue().setup().signal(self.timeline_signal, self.timeline_value).submit(self)
    self.timeline_value += 1
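
  # Editor's note, not part of the original file: chip_id packs the Adreno
  # model number one decimal digit per byte, e.g. a hypothetical chip_id of
  # 0x06050000 decodes to 6*100 + 5*10 + 0 = gpu_id 650; parts that decode to
  # 700+ are rejected by the assert above, since they need CP_EVENT_WRITE7.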
  def _ctx_create(self):
    cr = kgsl.struct_kgsl_drawctxt_create(flags=(kgsl.KGSL_CONTEXT_TYPE(kgsl.KGSL_CONTEXT_TYPE_CL) | kgsl.KGSL_CONTEXT_PREAMBLE
                                                 | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE
                                                 | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)))
    self._ioctl(kgsl.IOCTL_KGSL_DRAWCTXT_CREATE, cr)
    self.context_id = cr.drawctxt_id
    return self.context_id

  def _ctx_destroy(self, ctx_id):
    dstr = kgsl.struct_kgsl_drawctxt_destroy(drawctxt_id=ctx_id)
    self._ioctl(kgsl.IOCTL_KGSL_DRAWCTXT_DESTROY, dstr)

  def _info(self):
    info = kgsl.struct_kgsl_devinfo()
    get_property = kgsl.struct_kgsl_device_getproperty(type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info))
    self._ioctl(kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY, get_property)
    return info

  def _ioctl(self, nr, arg):
    ret = fcntl.ioctl(self.fd, (3 << 30) | (ctypes.sizeof(arg) & 0x1FFF) << 16 | 0x9 << 8 | (nr & 0xFF), arg)
    if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
    return ret

  def _gpu_alloc(self, size:int, flags:int=0, map_to_cpu=False, uncached=False, fill_zeroes=False):
    size = round_up(size, 1 << (alignment_hint:=12))
    flags |= (kgsl.KGSL_MEMALIGN(alignment_hint))
    if uncached: flags |= (kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED))

    alloc = kgsl.struct_kgsl_gpuobj_alloc(size=size, flags=flags)
    self._ioctl(kgsl.IOCTL_KGSL_GPUOBJ_ALLOC, alloc)
    va_addr, va_len = None, 0
    if not (flags & kgsl.KGSL_MEMFLAGS_USE_CPU_MAP):
      info = kgsl.struct_kgsl_gpuobj_info(id=alloc.id)
      self._ioctl(kgsl.IOCTL_KGSL_GPUOBJ_INFO, info)
      va_addr, va_len = info.gpuaddr, info.va_len

    if map_to_cpu or (flags & kgsl.KGSL_MEMFLAGS_USE_CPU_MAP):
      va_addr = libc.mmap(va_addr, va_len := (va_len or alloc.mmapsize), mmap.PROT_READ|mmap.PROT_WRITE,
                          mmap.MAP_SHARED | (MAP_FIXED if va_addr is not None else 0), self.fd, alloc.id * 0x1000)
      if fill_zeroes: ctypes.memset(va_addr, 0, va_len)

    return SimpleNamespace(va_addr=va_addr, size=va_len, mapped=map_to_cpu or (flags & kgsl.KGSL_MEMFLAGS_USE_CPU_MAP), info=alloc)

  def _gpu_free(self, mem):
    free = kgsl.struct_kgsl_gpuobj_free(id=mem.info.id)
    self._ioctl(kgsl.IOCTL_KGSL_GPUOBJ_FREE, free)
    if mem.mapped: libc.munmap(mem.va_addr, mem.size)

  def _alloc_cmd_buf(self, sz: int):
    self.cmd_buf_ptr = (cur_ptr:=self.cmd_buf_ptr if self.cmd_buf_ptr + sz < self.cmd_buf.size else 0) + sz
    return self.cmd_buf.va_addr + cur_ptr

  def _border_color_base(self):
    if not hasattr(self, '_border_color_gpu'): self._border_color_gpu = self._gpu_alloc(0x1000, map_to_cpu=True, fill_zeroes=True)
    return self._border_color_gpu.va_addr

  def _ensure_stack_size(self, sz):
    if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)
    elif self._stack.size < sz:
      self.synchronize()
      self._gpu_free(self._stack)
      self._stack = self._gpu_alloc(sz)
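
# Editor's note, not part of the original file: _ioctl above builds the ioctl
# request number by hand: (3 << 30) sets the read/write direction bits (the
# _IOWR form), the argument struct's size fills the size field, 0x9 is the KGSL
# ioctl magic, and nr is the command number. This pairs with the autogen step
# above that rewrites the _IOW*/_IOWR* macros into plain numbers.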