diff --git a/extra/qcom_gpu_driver/adreno_pm4.xml b/extra/qcom_gpu_driver/adreno_pm4.xml index 1b687eed5a..c617856ba3 100644 --- a/extra/qcom_gpu_driver/adreno_pm4.xml +++ b/extra/qcom_gpu_driver/adreno_pm4.xml @@ -1,7 +1,8 @@ +xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd"> + @@ -20,9 +21,9 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> - - - + + + @@ -30,8 +31,8 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> - If A6XX_RB_SAMPLE_COUNT_CONTROL.copy is true, writes OQ Z passed - sample counts to RB_SAMPLE_COUNT_ADDR. This writes to main + If A6XX_RB_SAMPLE_COUNTER_CNTL.copy is true, writes OQ Z passed + sample counts to RB_SAMPLE_COUNTER_BASE. This writes to main memory, skipping UCHE. @@ -96,6 +97,13 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> + + Flip between the primary and secondary LRZ buffers. This is used + for concurrent binning, so that BV can write to one buffer while + BR reads from the other. + + + Clears based on GRAS_LRZ_CNTL configuration, could clear fast-clear buffer or LRZ direction. 
@@ -112,11 +120,12 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> - + + - - + + @@ -129,21 +138,22 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> Doesn't seem to do anything - - - - - - - - - - - - + + + + + + + + + + + + - - + + + @@ -324,7 +334,7 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> fetch state sub-blocks and initiate shader code DMAs load constant into chip and to memory - + load sequencer instruction memory (pointer-based) load sequencer instruction memory (code embedded in packet) @@ -371,7 +381,7 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> Conditionally load a IB based on a flag, prefetch enabled - + Conditionally load a IB based on a flag, prefetch disabled Load a buffer with pre-fetch enabled @@ -514,7 +524,7 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> @@ -537,7 +547,7 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> - + + + Write CP_CONTEXT_SWITCH_*_INFO from CP to the following dwords, and forcibly switch to the indicated context. - - + - + Write to a scratch memory that is read by CP_REG_TEST with SOURCE_SCRATCH_MEM set. It's not the same scratch as scratch registers. @@ -648,6 +658,11 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd"> Reset various on-chip state used for synchronization + + Invalidates the "CCHE" introduced on a740 + + + @@ -790,14 +805,14 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - + + - + @@ -903,12 +918,6 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - - - - - @@ -1084,8 +1093,10 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - + + + @@ -1119,39 +1130,63 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) + + + + + + + + A mask of bins, starting at VSC_N, whose + visibility is OR'd together. A value of 0 is + interpreted as 1 (i.e. just use VSC_N for + visibility) for backwards compatibility. Only + exists on a7xx. 
+ + + + + If this field is 1, VSC_MASK and VSC_N are + ignored and instead a new ordinal immediately + after specifies the full 32-bit mask of bins + to use. The mask is "absolute" instead of + relative to VSC_N. + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + @@ -1162,23 +1197,42 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) stream is recorded. + + - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1196,6 +1250,9 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) + + + @@ -1209,7 +1266,7 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - + @@ -1217,12 +1274,12 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - - - - - + + + + + + @@ -1238,12 +1295,7 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - - - - - + @@ -1263,18 +1315,8 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - - - - - - - - - - - + + @@ -1287,12 +1329,12 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - - - - - + + + + + + @@ -1312,6 +1354,10 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) + + + + + + @@ -1368,12 +1416,12 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - - - - - + + + + + + @@ -1425,24 +1473,14 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - - - - - + - - - - - - + @@ -1457,12 +1495,7 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - - - - - + @@ -1480,12 +1513,7 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - - - - - + @@ -1619,12 +1647,7 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) TODO what is gpuaddr for, seems to be all 0's.. maybe needed for context switch? 
--> - - - - - - + @@ -1653,8 +1676,8 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - + + @@ -1668,15 +1691,11 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) + - - - - - - + @@ -1743,9 +1762,7 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - - + @@ -1754,12 +1771,7 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - - - - - - + @@ -1771,40 +1783,88 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) Tell CP the current operation mode, indicates save and restore procedure + + + + + + + + + - - - - - - + + + + + + + - + - - - - - + + + + + + + - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1830,9 +1890,9 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) If concurrent binning is disabled then BR also does binning so it will also write the "real" registers in BR. --> - - - + + + @@ -1933,11 +1993,11 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) a bitmask of which modes pass the test. --> - + - + @@ -2010,54 +2070,45 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - + - Used by the userspace driver to set various IB's which are - executed during context save/restore for handling - state that isn't restored by the - context switch routine itself. + Used by the userspace and kernel drivers to set various IB's + which are executed during context save/restore for handling + state that isn't restored by the context switch routine itself. - - + + Executed unconditionally when switching back to the context. - + Executed when switching back after switching away during execution of - a CP_SET_MARKER packet with RM6_YIELD as the - payload *and* the normal save routine was - bypassed for a shorter one. I think this is - connected to the "skipsaverestore" bit set by - the kernel when preempting. + a CP_SET_MARKER packet with RM6_BIN_RENDER_END as the + payload *and* skipsaverestore is set. This is + expected to restore static register values not + saved when skipsaverestore is set. - + Executed when switching away from the context, except for context switches initiated via CP_YIELD. 
- + This can only be set by the RB (i.e. the kernel) and executes with protected mode off, but - is otherwise similar to SAVE_IB. - - Note, kgsl calls this CP_KMD_AMBLE_TYPE + is otherwise similar to POSTAMBLE_AMBLE_TYPE. - - - - - - + - + @@ -2089,12 +2140,12 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) Tracks GRAS_LRZ_CNTL::GREATER, GRAS_LRZ_CNTL::DIR, and - GRAS_LRZ_DEPTH_VIEW with previous values, and if one of + GRAS_LRZ_VIEW_INFO with previous values, and if one of the following is true: - GRAS_LRZ_CNTL::GREATER has changed - GRAS_LRZ_CNTL::DIR has changed, the old value is not CUR_DIR_GE, and the new value is not CUR_DIR_DISABLED - - GRAS_LRZ_DEPTH_VIEW has changed + - GRAS_LRZ_VIEW_INFO has changed then it does a LRZ_FLUSH with GRAS_LRZ_CNTL::ENABLE forced to 1. Only exists in a650_sqe.fw. @@ -2209,7 +2260,7 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - Best guess is that it is a faster way to fetch all the VSC_STATE registers + Best guess is that it is a faster way to fetch all the VSC_CHANNEL_VISIBILITY registers and keep them in a local scratch memory instead of fetching every time when skipping IBs. 
@@ -2257,7 +2308,25 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - + + + + + + + + + + + + + + + + + + + diff --git a/extra/qcom_gpu_driver/opencl_ioctl.py b/extra/qcom_gpu_driver/opencl_ioctl.py index ea1896bd01..bdcb8f3d32 100644 --- a/extra/qcom_gpu_driver/opencl_ioctl.py +++ b/extra/qcom_gpu_driver/opencl_ioctl.py @@ -97,7 +97,7 @@ def parse_cmd_buf(dat): if state_block == SB6_CS_SHADER: from extra.disassemblers.adreno import disasm_raw - if state_type == ST6_SHADER and IOCTL > 2: + if state_type == ST6_SHADER and IOCTL > 3: disasm_raw(get_mem(((vals[2] << 32) | vals[1]), num_unit * 128)) if state_type == ST6_CONSTANTS: x = get_mem(((vals[2] << 32) | vals[1]), num_unit*4) @@ -106,25 +106,30 @@ def parse_cmd_buf(dat): print('constants') hexdump(x) if state_type == ST6_IBO: - ibos_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4) + if state_src == 0x1: + ibos_bytes = get_mem(CAPTURED_STATE['bindless_base'] + ((vals[2] << 32) | vals[1]) * 4, num_unit * 64) + else: ibos_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 16 * 4) CAPTURED_STATE['ibos'] = ibos_bytes[:] if IOCTL > 1: print('texture ibos') hexdump(ibos_bytes) elif state_block == SB6_CS_TEX: if state_type == ST6_SHADER: - samplers_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 4 * 4) + if state_src == 0x1: + samplers_bytes = get_mem(CAPTURED_STATE['bindless_base'] + ((vals[2] << 32) | vals[1]) * 4, num_unit * 64) + else: samplers_bytes = get_mem((vals[2] << 32) | vals[1], num_unit * 4 * 4) CAPTURED_STATE['samplers'] = samplers_bytes[:] if IOCTL > 1: print('texture samplers') hexdump(samplers_bytes) if state_type == ST6_CONSTANTS: - descriptors_bytes = get_mem((vals[2] << 32) | vals[1], 1600) + if state_src == 0x1: + descriptors_bytes = get_mem(CAPTURED_STATE['bindless_base'] + ((vals[2] << 32) | vals[1]) * 4, num_unit * 64) + else: descriptors_bytes = get_mem((vals[2] << 32) | vals[1], 1600) CAPTURED_STATE['descriptors'] = descriptors_bytes[:] if IOCTL > 1: print('texture descriptors') 
hexdump(descriptors_bytes) - elif ops[opcode] == "CP_REG_TO_MEM": reg, cnt, b64, accum = vals[0] & 0x3FFFF, (vals[0] >> 18) & 0xFFF, (vals[0] >> 30) & 0x1, (vals[0] >> 31) & 0x1 dest = vals[1] | (vals[2] << 32) @@ -152,6 +157,10 @@ def parse_cmd_buf(dat): if IOCTL > 0: print(f'THREADSIZE-{(vals[0] >> 20)&0x1}\nEARLYPREAMBLE-{(vals[0] >> 23) & 0x1}\nMERGEDREGS-{(vals[0] >> 3) & 0x1}\nTHREADMODE-{vals[0] & 0x1}\nHALFREGFOOTPRINT-{(vals[0] >> 1) & 0x3f}\nFULLREGFOOTPRINT-{(vals[0] >> 7) & 0x3f}\nBRANCHSTACK-{(vals[0] >> 14) & 0x3f}\n') print(f'SP_CS_UNKNOWN_A9B1-{vals[1]}\nSP_CS_BRANCH_COND-{vals[2]}\nSP_CS_OBJ_FIRST_EXEC_OFFSET-{vals[3]}\nSP_CS_OBJ_START-{vals[4] | (vals[5] << 32)}\nSP_CS_PVT_MEM_PARAM-{vals[6]}\nSP_CS_PVT_MEM_ADDR-{vals[7] | (vals[8] << 32)}\nSP_CS_PVT_MEM_SIZE-{vals[9]}') + if offset == 0xa9e8: + CAPTURED_STATE['bindless_base'] = (vals[0] | (vals[1] << 32)) & ~0b11 + # print(hex(CAPTURED_STATE['bindless_base'])) + # hexdump(get_mem(CAPTURED_STATE['bindless_base'], 0x200)) if offset == 0xb180: if IOCTL > 0: print('border color offset', hex(vals[1] << 32 | vals[0])) @@ -171,8 +180,8 @@ def ioctl(fd, request, argp): name, stype = nrs[nr] s = get_struct(argp, stype) if IOCTL > 0: print(f"{ret:2d} = {name:40s}", ' '.join(format_struct(s))) - if name == "IOCTL_KGSL_GPUOBJ_INFO": pass - # mmaped[s.gpuaddr] = mmap.mmap(fd, s.size, offset=s.id*0x1000) + if name == "IOCTL_KGSL_GPUOBJ_INFO": + mmaped[s.gpuaddr] = mmap.mmap(fd, s.size, offset=s.id*0x1000) if name == "IOCTL_KGSL_GPU_COMMAND": for i in range(s.numcmds): cmd = get_struct(s.cmdlist+ctypes.sizeof(msm_kgsl.struct_kgsl_command_object)*i, msm_kgsl.struct_kgsl_command_object)