mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 15:08:02 -05:00
add qcom runtime (#5213)
* qcom: driver init
* autogen stubs for msm_kgsl also fixup ioctls to show numbers instead of _IOW macros
* autogen: add adreno commands and registers
* ops_qcom: QcomAllocator + signals
* fix EDEADLK in hwqueue, init timestamps, use opencl compiler for qcom
* qcom: we do not really need all these constants input/output is enough
* qcom: perfctr for CS (do not really need all the rest)
* qcom: HALFREGFOOTPRINT and FULLREGFOOTPRINT are set to be around max
* qcom: explicitly set instruction len based on the shader size
* ops_qcom: Program init
extracts shader from open cl binary
sets input/output buffers
allocates stack
sets cs mode
runs shader
* use data64_le from helpers
* ops_qcom: use fill_kernargs for filling i/o buffers
* ops_qcom: add QcomCopyQueue just for api & set kernargs_args_offset
* new signals & fix exec
* add QCOM to the list of supported devices
* correct QcomComputeQueue._wait using CP_WAIT_REG_MEM
* fix exec, synchronize before copyout
* correct setting num_units for ST_SHADER
* fix gpu hangs on sigs with CP_MEM_WRITE, it is uncached mem anyway
* extract offsets to kernel arguments from opencl binary
* extract constants values and offsets from opencl binary
* handle KGSL_MEMFLAGS_USE_CPU_MAP correctly
* align kernel name to 4 bytes when skipping kernel opencl struct
* skip to consts directly using an offset from opencl binary header
* fix alloc
* get halfreg and fullreg from opencl bin
* set unmultiplied global sizes as kernel group in HLSQ_CS_NDRANGE
* parse prg offset from open cl binary
* save loc with HLSQ_CS_CNTL. set this with HLSQ_CONTROL_2_REG
* support for vals in _fill_kernargs
* support 16-bit constants
* use KGSL_CONTEXT_NO_FAULT_TOLERANCE for contexts
this helps to not fall down when executing big kernels
/* Don't time out if the context has disabled it */
if (drawobj->context->flags & KGSL_CONTEXT_NO_FAULT_TOLERANCE)
return;
* minor changes of _exec
* QCOMRenderer
* disable HCQGraph for demo. TODO: support HCQ update api
* support HCQ
- remove copy queue
- add updates
- add strides for buffs and vars for QCOM
* bufs_stride
* clean ups
* linter
* call super().__init__(value) in QcomSignal
* disable=unused-import
* mypy
* type ignore when queue is on the device
* fix
* query gpu_id.
Will be useful for selecting commands e.g. CP_EVENT_WRITE vs
CP_EVENT_WRITE7
* working timestamps
* free context after device is done
* move gpu stack to the device
* reserve some space with lib_gpu for gpu to write to
this fixes test_interpolate_bilinear
* exclude tests that fail with GPU=1 on qualcomm
* lint
* unmap mem in _gpu_free
* ctxt priority and preemption policy
* remove old qcom
* pass size to self.device.allocator.free
* skip tests only on qcom
* use kgsl and adreno defines instead of numeric vals
* use allocator for allocating lib_gpu
* update to QcomArgsState from master
* intermediate commit while conquering images
* enable image tests on qcom
* fix shader disasm size, dump textures stuff
* working images
* allow signals to be 0
* set branchstack from OpenCL binary
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* set shared memory size from OpenCL binary
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* update images in QcomArgsState & less loc for images
* set stack sizes from OpenCL binary
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* stack allocation based on OpenCL binary
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* better autogen for kgsl and adreno. no more bitshifts
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* cleanup commit for parse cl lib
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* dont forget actual generated files
* refactor + less loc
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* device.py back
* lint
* ruff
* timestamp divisor
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* fix tex fmt & round global size
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
* dtypes
* 19.2MHz
* -1 loc in _update_exec
* remove noqa
---------
Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
8e2a3fc165
commit
4c33192a8b
@@ -56,6 +56,11 @@ def hprint(vals):
|
||||
|
||||
ST6_SHADER = 0
|
||||
ST6_CONSTANTS = 1
|
||||
ST6_UBO = 2
|
||||
ST6_IBO = 3
|
||||
|
||||
SB6_CS_TEX = 5
|
||||
SB6_CS_SHADER = 13
|
||||
|
||||
def parse_cmd_buf(dat):
|
||||
ptr = 0
|
||||
@@ -74,10 +79,20 @@ def parse_cmd_buf(dat):
|
||||
num_unit = vals[0]>>22
|
||||
print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}")
|
||||
|
||||
from extra.disassemblers.adreno import disasm_raw
|
||||
if state_type == ST6_SHADER: disasm_raw(get_mem(((vals[2] << 32) | vals[1]), 0x180))
|
||||
if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4)))
|
||||
pass
|
||||
if state_block == SB6_CS_SHADER:
|
||||
from extra.disassemblers.adreno import disasm_raw
|
||||
if state_type == ST6_SHADER: disasm_raw(get_mem(((vals[2] << 32) | vals[1]), num_unit * 128))
|
||||
if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4)))
|
||||
elif state_block == SB6_CS_TEX:
|
||||
if state_type == ST6_SHADER:
|
||||
samplers_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 4 * 4)
|
||||
print('texture samplers')
|
||||
hexdump(samplers_bytes)
|
||||
if state_type == ST6_CONSTANTS:
|
||||
descriptors_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4)
|
||||
print('texture descriptors')
|
||||
hexdump(descriptors_bytes)
|
||||
|
||||
elif ops[opcode] == "CP_REG_TO_MEM":
|
||||
reg, cnt, b64, accum = vals[0] & 0x3FFFF, (vals[0] >> 18) & 0xFFF, (vals[0] >> 30) & 0x1, (vals[0] >> 31) & 0x1
|
||||
dest = vals[1] | (vals[2] << 32)
|
||||
@@ -88,6 +103,13 @@ def parse_cmd_buf(dat):
|
||||
offset, size = ((cmd>>8)&0x7FFFF), cmd&0x7F
|
||||
vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size])
|
||||
print(f"{ptr:3X} -- typ 4: {size=:3d}, {offset=:#x}", hprint(vals))
|
||||
if offset == 0xa9b0:
|
||||
print(f'THREADSIZE-{(vals[0] >> 20)&0x1}\nEARLYPREAMBLE-{(vals[0] >> 23) & 0x1}\nMERGEDREGS-{(vals[0] >> 3) & 0x1}\nTHREADMODE-{vals[0] & 0x1}\nHALFREGFOOTPRINT-{(vals[0] >> 1) & 0x3f}\nFULLREGFOOTPRINT-{(vals[0] >> 7) & 0x3f}\nBRANCHSTACK-{(vals[0] >> 14) & 0x3f}\n')
|
||||
print(f'SP_CS_UNKNOWN_A9B1-{vals[1]}\nSP_CS_BRANCH_COND-{vals[2]}\nSP_CS_OBJ_FIRST_EXEC_OFFSET-{vals[3]}\nSP_CS_OBJ_START-{vals[4] | (vals[5] << 32)}\nSP_CS_PVT_MEM_PARAM-{vals[6]}\nSP_CS_PVT_MEM_ADDR-{vals[7] | (vals[8] << 32)}\nSP_CS_PVT_MEM_SIZE-{vals[9]}')
|
||||
if offset == 0xb180:
|
||||
print('border color offset', hex(vals[1] << 32 | vals[0]))
|
||||
hexdump(get_mem(vals[1] << 32 | vals[0], 0x1000))
|
||||
|
||||
ptr += 4*size
|
||||
else:
|
||||
print("unk", hex(cmd))
|
||||
|
||||
Reference in New Issue
Block a user