add qcom runtime (#5213)

* qcom: driver init * autogen stubs for msm_kgsl also fixup ioctls to show numbers instead of _IOW macros * autogen: add adreno commands and registers * ops_qcom: QcomAllocator + signals * fix EDEADLK in hwqueue, init timestamps, use opencl compiler for qcom * qcom: we do not really need all these constants input/output is enough * qcom: perfctr for CS (do not really need all the rest) * qcom: HALFREGFOOTPRINT and FULLREGFOOTPRINT are set to be around max * qcom: explicitly set instruction len based on the shader size * ops_qcom: Program init extracts shader from open cl binary sets input/output buffers allocates stack sets cs mode runs shader * use data64_le from helpers * ops_qcom: use fill_kernargs for filling i/o buffers * ops_qcom: add QcomCopyQueue just for api & set kernargs_args_offset * new signals & fix exec * add QCOM to the list of supported devices * correct QcomComputeQueue._wait using CP_WAIT_REG_MEM * fix exec, synchronize before copyout * correct setting num_units for ST_SHADER * fix gpu hangs on sigs with CP_MEM_WRITE, it is uncached mem anyway * extract offsets to kernel arguments from opencl binary * extract constants values and offsets from opencl binary * handle KGSL_MEMFLAGS_USE_CPU_MAP correctly * align kernel name to 4 bytes when skipping kernel opencl struct * skip to consts directly using an offset from opencl binary header * fix alloc * get halfreg and fullreg from opencl bin * set unmultipled global sizes as kernel group in HLSQ_CS_NDRANGE * parse prg offset from open cl binary * save loc with HLSQ_CS_CNTL. set this with HLSQ_CONTROL_2_REG * support for vals in _fill_kernargs * support 16-bit constants * use KGSL_CONTEXT_NO_FAULT_TOLERANCE for contexts this helps to not fall down when executing big kernels /* Don't time out if the context has disabled it */ if (drawobj->context->flags & KGSL_CONTEXT_NO_FAULT_TOLERANCE) return; * minor changes of _exec * QCOMRenderer * disable HCQGraph for demo. 
TODO: support HCQ update api * support HCQ - remove copy queue - add updates - add strides for buffs and vars for QCOM * bufs_stride * clean ups * linter * call super().__init__(value) in QcomSignal * disable=unused-import * mypy * type ignore when queue is on the device * fix * query gpu_id. Will be useful for selecting commands e.g. CP_EVENT_WRITE vs CP_EVENT_WRITE7 * working timestamps * free context after device is done * move gpu stack to the device * reserve some space with lib_gpu for gpu to write to this fixes test_interpolate_bilinear * exclude tests that fail with GPU=1 on qualcomm * lint * unmap mem in _gpu_free * ctxt priority and preemption policy * remove old qcom * pass size to self.device.allocator.free * skip tests only on qcom * use kgsl and adreno defines instead of numeric vals * use allocator for allocating lib_gpu * update to QcomArgsState from master * intermediate commit while conquering images * enable image tests on qcom * fix shader disasm size, dump textures stuff * working images * allow signals to be 0 * set branchstack from OpenCL binary Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com> * set shared memory size from OpenCL binary Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com> * update images in QcomArgsState & less loc for images * set stack sizes from OpenCL binary Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com> * stack allocation based on OpenCL binary Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com> * better autogen for kgsl and adreno. 
no more bitshifts Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com> * cleanup commit for parse cl lib Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com> * dont forget actual generated files * refactor + less loc Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com> * device.py back * lint * ruff * timestamp divisor Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com> * fix tex fmt & round global size Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com> * dtypes * 19.2MHz * -1 loc in _update_exec * remove noqa --------- Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com> Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
2026-01-09 15:08:02 -05:00 · 2024-09-02 19:35:47 +03:00
parent 8e2a3fc165
commit 4c33192a8b
10 changed files with 36530 additions and 6 deletions
--- a/extra/qcom_gpu_driver/opencl_ioctl.py
+++ b/extra/qcom_gpu_driver/opencl_ioctl.py
@@ -56,6 +56,11 @@ def hprint(vals):

 ST6_SHADER = 0
 ST6_CONSTANTS = 1
+ST6_UBO = 2
+ST6_IBO = 3
+
+SB6_CS_TEX = 5
+SB6_CS_SHADER = 13

 def parse_cmd_buf(dat):
  ptr = 0
@@ -74,10 +79,20 @@ def parse_cmd_buf(dat):
        num_unit = vals[0]>>22
        print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}")

-        from extra.disassemblers.adreno import disasm_raw
-        if state_type == ST6_SHADER: disasm_raw(get_mem(((vals[2] << 32) | vals[1]), 0x180))
-        if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4)))
-        pass
+        if state_block == SB6_CS_SHADER:
+          from extra.disassemblers.adreno import disasm_raw
+          if state_type == ST6_SHADER: disasm_raw(get_mem(((vals[2] << 32) | vals[1]), num_unit * 128))
+          if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4)))
+        elif state_block == SB6_CS_TEX:
+          if state_type == ST6_SHADER:
+            samplers_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 4 * 4)
+            print('texture samplers')
+            hexdump(samplers_bytes)
+          if state_type == ST6_CONSTANTS:
+            descriptors_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4)
+            print('texture descriptors')
+            hexdump(descriptors_bytes)
+
      elif ops[opcode] == "CP_REG_TO_MEM":
        reg, cnt, b64, accum = vals[0] & 0x3FFFF, (vals[0] >> 18) & 0xFFF, (vals[0] >> 30) & 0x1, (vals[0] >> 31) & 0x1
        dest = vals[1] | (vals[2] << 32)
@@ -88,6 +103,13 @@ def parse_cmd_buf(dat):
      offset, size = ((cmd>>8)&0x7FFFF), cmd&0x7F
      vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size])
      print(f"{ptr:3X} -- typ 4: {size=:3d}, {offset=:#x}", hprint(vals))
+      if offset == 0xa9b0:
+        print(f'THREADSIZE-{(vals[0] >> 20)&0x1}\nEARLYPREAMBLE-{(vals[0] >> 23) & 0x1}\nMERGEDREGS-{(vals[0] >> 3) & 0x1}\nTHREADMODE-{vals[0] & 0x1}\nHALFREGFOOTPRINT-{(vals[0] >> 1) & 0x3f}\nFULLREGFOOTPRINT-{(vals[0] >> 7) & 0x3f}\nBRANCHSTACK-{(vals[0] >> 14) & 0x3f}\n')
+        print(f'SP_CS_UNKNOWN_A9B1-{vals[1]}\nSP_CS_BRANCH_COND-{vals[2]}\nSP_CS_OBJ_FIRST_EXEC_OFFSET-{vals[3]}\nSP_CS_OBJ_START-{vals[4] | (vals[5] << 32)}\nSP_CS_PVT_MEM_PARAM-{vals[6]}\nSP_CS_PVT_MEM_ADDR-{vals[7] | (vals[8] << 32)}\nSP_CS_PVT_MEM_SIZE-{vals[9]}')
+      if offset == 0xb180:
+        print('border color offset', hex(vals[1] << 32 | vals[0]))
+        hexdump(get_mem(vals[1] << 32 | vals[0], 0x1000))
+
      ptr += 4*size
    else:
      print("unk", hex(cmd))