Mirror of https://github.com/tinygrad/tinygrad.git, synced 2026-01-07 22:23:55 -05:00
nv driver (#4044)
* start
* fix err 93
* gpu
* ioctl mappings
* alloc like cuda
* semaphores
* wait for semaphores value
* start ops_nv
* very simple kernels work
* init several gpus
* qmd dumper
* dirty, but most of kernels work
* always all test_ops
* progress, more tests, stable
* test_ops passes, gpt2 works but with big fifo; wrap of fifo doesn't work, i think it's something coherency related
* need better sync
* fix sync
* alloc2
* all tests pass!
* cleanup 1
* cleanup
* multigpu, simple transfer
* fix sync
* correct init
* nv_gpu autogen + sync bug fix
* clean extra/nv_gpu_driver
* p2p
* clean up
* remove old gen
* small fixes
* cleanup
* cleanup 2
* small fixes
* bigger queue size
* cleanups
* wait
* fixed signals for devs
* fix hang + parallel beam
* small fixes
* detect when local memory is big in kernel
* correct assert
* small fixes
* correct tls size est
* one va space
* less lines
* shorter
* save 2 lines
* save some lines
* remove type ignores

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
3 .github/workflows/test.yml (vendored)
@@ -458,8 +458,11 @@ jobs:
      if: matrix.backend == 'cuda'
      run: |
        cp tinygrad/runtime/autogen/cuda.py /tmp/cuda.py.bak
        cp tinygrad/runtime/autogen/nv_gpu.py /tmp/nv_gpu.py.bak
        ./autogen_stubs.sh cuda
        ./autogen_stubs.sh nv
        diff /tmp/cuda.py.bak tinygrad/runtime/autogen/cuda.py
        diff /tmp/nv_gpu.py.bak tinygrad/runtime/autogen/nv_gpu.py
    - name: Verify HIP autogen
      if: matrix.backend == 'hip'
      run: |
autogen_stubs.sh

@@ -72,6 +72,56 @@ generate_cuda() {
  python3 -c "import tinygrad.runtime.autogen.cuda"
}

generate_nv() {
  NVKERN_COMMIT_HASH=d6b75a34094b0f56c2ccadf14e5d0bd515ed1ab6
  NVKERN_SRC=/tmp/open-gpu-kernel-modules-$NVKERN_COMMIT_HASH
  if [ ! -d "$NVKERN_SRC" ]; then
    git clone https://github.com/tinygrad/open-gpu-kernel-modules $NVKERN_SRC
    pushd .
    cd $NVKERN_SRC
    git reset --hard $NVKERN_COMMIT_HASH
    popd
  fi

  clang2py \
    extra/nv_gpu_driver/clc6c0qmd.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl0080.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl2080_notification.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h \
    $NVKERN_SRC/src/nvidia/generated/g_allclasses.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc6c0.h \
    $NVKERN_SRC/kernel-open/nvidia-uvm/clc6b5.h \
    $NVKERN_SRC/kernel-open/nvidia-uvm/uvm_ioctl.h \
    $NVKERN_SRC/kernel-open/nvidia-uvm/uvm_linux_ioctl.h \
    $NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv_escape.h \
    $NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-ioctl.h \
    $NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-ioctl-numbers.h \
    $NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-ioctl-numa.h \
    $NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-unix-nvos-params-wrappers.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/alloc/alloc_channel.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/nvos.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrl0000/*.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrl0080/*.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrl2080/*.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrl83de/*.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrlc36f.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrlcb33.h \
    $NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrla06c.h \
    --clang-args="-include $NVKERN_SRC/src/common/sdk/nvidia/inc/nvtypes.h -I$NVKERN_SRC/src/common/inc -I$NVKERN_SRC/kernel-open/nvidia-uvm -I$NVKERN_SRC/kernel-open/common/inc -I$NVKERN_SRC/src/common/sdk/nvidia/inc -I$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include -I$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl" \
    -o $BASE/nv_gpu.py -k cdefstum
  fixup $BASE/nv_gpu.py
  sed -i "s\(0000000001)\1\g" $BASE/nv_gpu.py
  sed -i "s\import ctypes\import ctypes, os\g" $BASE/nv_gpu.py
  sed -i 's/#\?\s\([A-Za-z0-9_]\+\) = MW ( \([0-9]\+\) : \([0-9]\+\) )/\1 = (\2 , \3)/' $BASE/nv_gpu.py # NVC6C0_QMDV03_00 processing
  sed -i 's/#\sdef NVC6C0_QMD\([A-Za-z0-9_()]\+\):/def NVC6C0_QMD\1:/' $BASE/nv_gpu.py
  sed -i 's/#\s*return MW(\([0-9i()*+]\+\):\([0-9i()*+]\+\))/ return (\1 , \2)/' $BASE/nv_gpu.py
  sed -i 's/#\?\s*\(.*\)\s*=\s*\(NV\)\?BIT\(32\)\?\s*(\s*\([0-9]\+\)\s*)/\1 = (1 << \4)/' $BASE/nv_gpu.py # name = BIT(x) -> name = (1 << x)
  sed -i "s/UVM_\([A-Za-z0-9_]\+\) = \['i', '(', '\([0-9]\+\)', ')'\]/UVM_\1 = \2/" $BASE/nv_gpu.py # UVM_name = ['i', '(', '<num>', ')'] -> UVM_name = <num>
  python3 -c "import tinygrad.runtime.autogen.nv_gpu"
}

generate_hsa() {
  clang2py \
    /opt/rocm/include/hsa/hsa.h \

@@ -105,6 +155,7 @@ elif [ "$1" == "comgr" ]; then generate_comgr
elif [ "$1" == "cuda" ]; then generate_cuda
elif [ "$1" == "hsa" ]; then generate_hsa
elif [ "$1" == "kfd" ]; then generate_kfd
elif [ "$1" == "all" ]; then generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_hsa; generate_kfd
elif [ "$1" == "nv" ]; then generate_nv
elif [ "$1" == "all" ]; then generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_hsa; generate_kfd; generate_nv
else echo "usage: $0 <type>"
fi
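The sed fixups in generate_nv above rewrite the MW(hi:lo) QMD field macros, which clang2py can only carry over as commented-out lines, into plain Python: the generated nv_gpu.py then contains (hi, lo) bit-position tuples such as NVC6C0_QMDV02_03_CTA_RASTER_WIDTH = (415 , 384), plus NVC6C0_QMD...() helpers that return the same kind of tuple for indexed fields. A minimal sketch of how such a tuple could be applied to the QMD's 32-bit words follows; the qmd_write helper is illustrative, not part of the generated module.

import ctypes

# Illustrative helper (not from nv_gpu.py): write `value` into the multi-word
# bitfield described by a (hi, lo) bit-position tuple.
def qmd_write(words, field, value):
  hi, lo = field
  mask = (1 << (hi - lo + 1)) - 1
  assert 0 <= value <= mask, "value does not fit in the field"
  bits = (value & mask) << lo
  for w in range(lo // 32, hi // 32 + 1):  # every 32-bit word the field overlaps
    word_mask = ((mask << lo) >> (w * 32)) & 0xFFFFFFFF
    words[w] = (words[w] & ~word_mask & 0xFFFFFFFF) | ((bits >> (w * 32)) & 0xFFFFFFFF)

# usage, assuming nv_gpu.NVC6C0_QMDV02_03_CTA_RASTER_WIDTH == (415, 384):
qmd = (ctypes.c_uint32 * 64)()  # the V02_03 QMD spans 2048 bits = 64 words
qmd_write(qmd, (415, 384), 64)  # set CTA_RASTER_WIDTH to 64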
@@ -1,258 +0,0 @@
# -*- coding: utf-8 -*-
#
# TARGET arch is: []
# WORD_SIZE is: 8
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes


NV01_ROOT = (0x00000000) # macro
|
||||
NV1_ROOT = (0x00000000) # macro
|
||||
NV01_NULL_OBJECT = (0x00000000) # macro
|
||||
NV1_NULL_OBJECT = (0x00000000) # macro
|
||||
NV01_ROOT_NON_PRIV = (0x00000001) # macro
|
||||
NV1_ROOT_NON_PRIV = (0x00000001) # macro
|
||||
NV01_ROOT_CLIENT = (0x00000041) # macro
|
||||
FABRIC_MANAGER_SESSION = (0x0000000f) # macro
|
||||
NV0020_GPU_MANAGEMENT = (0x00000020) # macro
|
||||
NV01_DEVICE_0 = (0x00000080) # macro
|
||||
NV20_SUBDEVICE_0 = (0x00002080) # macro
|
||||
NV2081_BINAPI = (0x00002081) # macro
|
||||
NV2082_BINAPI_PRIVILEGED = (0x00002082) # macro
|
||||
NV20_SUBDEVICE_DIAG = (0x0000208f) # macro
|
||||
NV01_CONTEXT_DMA = (0x00000002) # macro
|
||||
NV01_MEMORY_SYSTEM = (0x0000003e) # macro
|
||||
NV1_MEMORY_SYSTEM = (0x0000003e) # macro
|
||||
NV01_MEMORY_LOCAL_PRIVILEGED = (0x0000003f) # macro
|
||||
NV1_MEMORY_LOCAL_PRIVILEGED = (0x0000003f) # macro
|
||||
NV01_MEMORY_PRIVILEGED = (0x0000003f) # macro
|
||||
NV1_MEMORY_PRIVILEGED = (0x0000003f) # macro
|
||||
NV01_MEMORY_LOCAL_USER = (0x00000040) # macro
|
||||
NV1_MEMORY_LOCAL_USER = (0x00000040) # macro
|
||||
NV01_MEMORY_USER = (0x00000040) # macro
|
||||
NV1_MEMORY_USER = (0x00000040) # macro
|
||||
NV_MEMORY_EXTENDED_USER = (0x00000042) # macro
|
||||
NV01_MEMORY_VIRTUAL = (0x00000070) # macro
|
||||
NV01_MEMORY_SYSTEM_DYNAMIC = (0x00000070) # macro
|
||||
NV1_MEMORY_SYSTEM_DYNAMIC = (0x00000070) # macro
|
||||
NV_MEMORY_MAPPER = (0x000000fe) # macro
|
||||
NV01_MEMORY_LOCAL_PHYSICAL = (0x000000c2) # macro
|
||||
NV01_MEMORY_SYSTEM_OS_DESCRIPTOR = (0x00000071) # macro
|
||||
NV01_MEMORY_DEVICELESS = (0x000090ce) # macro
|
||||
NV01_MEMORY_FRAMEBUFFER_CONSOLE = (0x00000076) # macro
|
||||
NV01_MEMORY_HW_RESOURCES = (0x000000b1) # macro
|
||||
NV01_MEMORY_LIST_SYSTEM = (0x00000081) # macro
|
||||
NV01_MEMORY_LIST_FBMEM = (0x00000082) # macro
|
||||
NV01_MEMORY_LIST_OBJECT = (0x00000083) # macro
|
||||
NV01_MEMORY_FLA = (0x000000f3) # macro
|
||||
NV_CE_UTILS = (0x00000050) # macro
|
||||
NV_MEMORY_FABRIC = (0x000000f8) # macro
|
||||
FABRIC_VASPACE_A = (0x000000fc) # macro
|
||||
NV_MEMORY_MULTICAST_FABRIC = (0x000000fd) # macro
|
||||
IO_VASPACE_A = (0x000000f2) # macro
|
||||
NV01_NULL = (0x00000030) # macro
|
||||
NV1_NULL = (0x00000030) # macro
|
||||
NV01_EVENT = (0x00000005) # macro
|
||||
NV1_EVENT = (0x00000005) # macro
|
||||
NV01_EVENT_KERNEL_CALLBACK = (0x00000078) # macro
|
||||
NV1_EVENT_KERNEL_CALLBACK = (0x00000078) # macro
|
||||
NV01_EVENT_OS_EVENT = (0x00000079) # macro
|
||||
NV1_EVENT_OS_EVENT = (0x00000079) # macro
|
||||
NV01_EVENT_WIN32_EVENT = (0x00000079) # macro
|
||||
NV1_EVENT_WIN32_EVENT = (0x00000079) # macro
|
||||
NV01_EVENT_KERNEL_CALLBACK_EX = (0x0000007e) # macro
|
||||
NV1_EVENT_KERNEL_CALLBACK_EX = (0x0000007e) # macro
|
||||
NV01_TIMER = (0x00000004) # macro
|
||||
NV1_TIMER = (0x00000004) # macro
|
||||
KERNEL_GRAPHICS_CONTEXT = (0x00000090) # macro
|
||||
NV50_CHANNEL_GPFIFO = (0x0000506f) # macro
|
||||
GF100_CHANNEL_GPFIFO = (0x0000906f) # macro
|
||||
KEPLER_CHANNEL_GPFIFO_A = (0x0000a06f) # macro
|
||||
UVM_CHANNEL_RETAINER = (0x0000c574) # macro
|
||||
KEPLER_CHANNEL_GPFIFO_B = (0x0000a16f) # macro
|
||||
MAXWELL_CHANNEL_GPFIFO_A = (0x0000b06f) # macro
|
||||
PASCAL_CHANNEL_GPFIFO_A = (0x0000c06f) # macro
|
||||
VOLTA_CHANNEL_GPFIFO_A = (0x0000c36f) # macro
|
||||
TURING_CHANNEL_GPFIFO_A = (0x0000c46f) # macro
|
||||
AMPERE_CHANNEL_GPFIFO_A = (0x0000c56f) # macro
|
||||
HOPPER_CHANNEL_GPFIFO_A = (0x0000c86f) # macro
|
||||
NV04_SOFTWARE_TEST = (0x0000007d) # macro
|
||||
NV4_SOFTWARE_TEST = (0x0000007d) # macro
|
||||
NV30_GSYNC = (0x000030f1) # macro
|
||||
VOLTA_USERMODE_A = (0x0000c361) # macro
|
||||
TURING_USERMODE_A = (0x0000c461) # macro
|
||||
AMPERE_USERMODE_A = (0x0000c561) # macro
|
||||
HOPPER_USERMODE_A = (0x0000c661) # macro
|
||||
NVC371_DISP_SF_USER = (0x0000c371) # macro
|
||||
NVC372_DISPLAY_SW = (0x0000c372) # macro
|
||||
NVC573_DISP_CAPABILITIES = (0x0000c573) # macro
|
||||
NVC673_DISP_CAPABILITIES = (0x0000c673) # macro
|
||||
NVC773_DISP_CAPABILITIES = (0x0000c773) # macro
|
||||
NV04_DISPLAY_COMMON = (0x00000073) # macro
|
||||
NV50_DEFERRED_API_CLASS = (0x00005080) # macro
|
||||
MPS_COMPUTE = (0x0000900e) # macro
|
||||
NVC570_DISPLAY = (0x0000c570) # macro
|
||||
NVC57A_CURSOR_IMM_CHANNEL_PIO = (0x0000c57a) # macro
|
||||
NVC57B_WINDOW_IMM_CHANNEL_DMA = (0x0000c57b) # macro
|
||||
NVC57D_CORE_CHANNEL_DMA = (0x0000c57d) # macro
|
||||
NVC57E_WINDOW_CHANNEL_DMA = (0x0000c57e) # macro
|
||||
NVC670_DISPLAY = (0x0000c670) # macro
|
||||
NVC671_DISP_SF_USER = (0x0000c671) # macro
|
||||
NVC67A_CURSOR_IMM_CHANNEL_PIO = (0x0000c67a) # macro
|
||||
NVC67B_WINDOW_IMM_CHANNEL_DMA = (0x0000c67b) # macro
|
||||
NVC67D_CORE_CHANNEL_DMA = (0x0000c67d) # macro
|
||||
NVC67E_WINDOW_CHANNEL_DMA = (0x0000c67e) # macro
|
||||
NVC77F_ANY_CHANNEL_DMA = (0x0000c77f) # macro
|
||||
NVC770_DISPLAY = (0x0000c770) # macro
|
||||
NVC771_DISP_SF_USER = (0x0000c771) # macro
|
||||
NVC77D_CORE_CHANNEL_DMA = (0x0000c77d) # macro
|
||||
NV9010_VBLANK_CALLBACK = (0x00009010) # macro
|
||||
GF100_PROFILER = (0x000090cc) # macro
|
||||
MAXWELL_PROFILER = (0x0000b0cc) # macro
|
||||
MAXWELL_PROFILER_DEVICE = (0x0000b2cc) # macro
|
||||
GF100_SUBDEVICE_MASTER = (0x000090e6) # macro
|
||||
GF100_SUBDEVICE_INFOROM = (0x000090e7) # macro
|
||||
GF100_ZBC_CLEAR = (0x00009096) # macro
|
||||
GF100_DISP_SW = (0x00009072) # macro
|
||||
GF100_TIMED_SEMAPHORE_SW = (0x00009074) # macro
|
||||
G84_PERFBUFFER = (0x0000844c) # macro
|
||||
NV50_MEMORY_VIRTUAL = (0x000050a0) # macro
|
||||
NV50_P2P = (0x0000503b) # macro
|
||||
NV50_THIRD_PARTY_P2P = (0x0000503c) # macro
|
||||
FERMI_TWOD_A = (0x0000902d) # macro
|
||||
FERMI_VASPACE_A = (0x000090f1) # macro
|
||||
HOPPER_SEC2_WORK_LAUNCH_A = (0x0000cba2) # macro
|
||||
GF100_HDACODEC = (0x000090ec) # macro
|
||||
NVB8B0_VIDEO_DECODER = (0x0000b8b0) # macro
|
||||
NVC4B0_VIDEO_DECODER = (0x0000c4b0) # macro
|
||||
NVC6B0_VIDEO_DECODER = (0x0000c6b0) # macro
|
||||
NVC7B0_VIDEO_DECODER = (0x0000c7b0) # macro
|
||||
NVC9B0_VIDEO_DECODER = (0x0000c9b0) # macro
|
||||
NVC4B7_VIDEO_ENCODER = (0x0000c4b7) # macro
|
||||
NVB4B7_VIDEO_ENCODER = (0x0000b4b7) # macro
|
||||
NVC7B7_VIDEO_ENCODER = (0x0000c7b7) # macro
|
||||
NVC9B7_VIDEO_ENCODER = (0x0000c9b7) # macro
|
||||
NVB8D1_VIDEO_NVJPG = (0x0000b8d1) # macro
|
||||
NVC4D1_VIDEO_NVJPG = (0x0000c4d1) # macro
|
||||
NVC9D1_VIDEO_NVJPG = (0x0000c9d1) # macro
|
||||
NVB8FA_VIDEO_OFA = (0x0000b8fa) # macro
|
||||
NVC6FA_VIDEO_OFA = (0x0000c6fa) # macro
|
||||
NVC7FA_VIDEO_OFA = (0x0000c7fa) # macro
|
||||
NVC9FA_VIDEO_OFA = (0x0000c9fa) # macro
|
||||
KEPLER_INLINE_TO_MEMORY_B = (0x0000a140) # macro
|
||||
FERMI_CONTEXT_SHARE_A = (0x00009067) # macro
|
||||
KEPLER_CHANNEL_GROUP_A = (0x0000a06c) # macro
|
||||
PASCAL_DMA_COPY_A = (0x0000c0b5) # macro
|
||||
TURING_DMA_COPY_A = (0x0000c5b5) # macro
|
||||
AMPERE_DMA_COPY_A = (0x0000c6b5) # macro
|
||||
AMPERE_DMA_COPY_B = (0x0000c7b5) # macro
|
||||
HOPPER_DMA_COPY_A = (0x0000c8b5) # macro
|
||||
MAXWELL_DMA_COPY_A = (0x0000b0b5) # macro
|
||||
ACCESS_COUNTER_NOTIFY_BUFFER = (0x0000c365) # macro
|
||||
MMU_FAULT_BUFFER = (0x0000c369) # macro
|
||||
MMU_VIDMEM_ACCESS_BIT_BUFFER = (0x0000c763) # macro
|
||||
TURING_A = (0x0000c597) # macro
|
||||
TURING_COMPUTE_A = (0x0000c5c0) # macro
|
||||
AMPERE_A = (0x0000c697) # macro
|
||||
AMPERE_COMPUTE_A = (0x0000c6c0) # macro
|
||||
AMPERE_B = (0x0000c797) # macro
|
||||
AMPERE_COMPUTE_B = (0x0000c7c0) # macro
|
||||
ADA_A = (0x0000c997) # macro
|
||||
ADA_COMPUTE_A = (0x0000c9c0) # macro
|
||||
AMPERE_SMC_PARTITION_REF = (0x0000c637) # macro
|
||||
AMPERE_SMC_EXEC_PARTITION_REF = (0x0000c638) # macro
|
||||
AMPERE_SMC_CONFIG_SESSION = (0x0000c639) # macro
|
||||
NV0092_RG_LINE_CALLBACK = (0x00000092) # macro
|
||||
AMPERE_SMC_MONITOR_SESSION = (0x0000c640) # macro
|
||||
HOPPER_A = (0x0000cb97) # macro
|
||||
HOPPER_COMPUTE_A = (0x0000cbc0) # macro
|
||||
NV40_DEBUG_BUFFER = (0x000000db) # macro
|
||||
RM_USER_SHARED_DATA = (0x000000de) # macro
|
||||
GT200_DEBUGGER = (0x000083de) # macro
|
||||
NV40_I2C = (0x0000402c) # macro
|
||||
NVA081_VGPU_CONFIG = (0x0000a081) # macro
|
||||
NVA084_KERNEL_HOST_VGPU_DEVICE = (0x0000a084) # macro
|
||||
NV0060_SYNC_GPU_BOOST = (0x00000060) # macro
|
||||
GP100_UVM_SW = (0x0000c076) # macro
|
||||
NV_EVENT_BUFFER = (0x000090cd) # macro
|
||||
NV_CONFIDENTIAL_COMPUTE = (0x0000cb33) # macro
|
||||
NV_COUNTER_COLLECTION_UNIT = (0x0000cbca) # macro
|
||||
NV_SEMAPHORE_SURFACE = (0x000000da) # macro
|
||||
__all__ = \
|
||||
['ACCESS_COUNTER_NOTIFY_BUFFER', 'ADA_A', 'ADA_COMPUTE_A',
|
||||
'AMPERE_A', 'AMPERE_B', 'AMPERE_CHANNEL_GPFIFO_A',
|
||||
'AMPERE_COMPUTE_A', 'AMPERE_COMPUTE_B', 'AMPERE_DMA_COPY_A',
|
||||
'AMPERE_DMA_COPY_B', 'AMPERE_SMC_CONFIG_SESSION',
|
||||
'AMPERE_SMC_EXEC_PARTITION_REF', 'AMPERE_SMC_MONITOR_SESSION',
|
||||
'AMPERE_SMC_PARTITION_REF', 'AMPERE_USERMODE_A',
|
||||
'FABRIC_MANAGER_SESSION', 'FABRIC_VASPACE_A',
|
||||
'FERMI_CONTEXT_SHARE_A', 'FERMI_TWOD_A', 'FERMI_VASPACE_A',
|
||||
'G84_PERFBUFFER', 'GF100_CHANNEL_GPFIFO', 'GF100_DISP_SW',
|
||||
'GF100_HDACODEC', 'GF100_PROFILER', 'GF100_SUBDEVICE_INFOROM',
|
||||
'GF100_SUBDEVICE_MASTER', 'GF100_TIMED_SEMAPHORE_SW',
|
||||
'GF100_ZBC_CLEAR', 'GP100_UVM_SW', 'GT200_DEBUGGER', 'HOPPER_A',
|
||||
'HOPPER_CHANNEL_GPFIFO_A', 'HOPPER_COMPUTE_A',
|
||||
'HOPPER_DMA_COPY_A', 'HOPPER_SEC2_WORK_LAUNCH_A',
|
||||
'HOPPER_USERMODE_A', 'IO_VASPACE_A', 'KEPLER_CHANNEL_GPFIFO_A',
|
||||
'KEPLER_CHANNEL_GPFIFO_B', 'KEPLER_CHANNEL_GROUP_A',
|
||||
'KEPLER_INLINE_TO_MEMORY_B', 'KERNEL_GRAPHICS_CONTEXT',
|
||||
'MAXWELL_CHANNEL_GPFIFO_A', 'MAXWELL_DMA_COPY_A',
|
||||
'MAXWELL_PROFILER', 'MAXWELL_PROFILER_DEVICE', 'MMU_FAULT_BUFFER',
|
||||
'MMU_VIDMEM_ACCESS_BIT_BUFFER', 'MPS_COMPUTE',
|
||||
'NV0020_GPU_MANAGEMENT', 'NV0060_SYNC_GPU_BOOST',
|
||||
'NV0092_RG_LINE_CALLBACK', 'NV01_CONTEXT_DMA', 'NV01_DEVICE_0',
|
||||
'NV01_EVENT', 'NV01_EVENT_KERNEL_CALLBACK',
|
||||
'NV01_EVENT_KERNEL_CALLBACK_EX', 'NV01_EVENT_OS_EVENT',
|
||||
'NV01_EVENT_WIN32_EVENT', 'NV01_MEMORY_DEVICELESS',
|
||||
'NV01_MEMORY_FLA', 'NV01_MEMORY_FRAMEBUFFER_CONSOLE',
|
||||
'NV01_MEMORY_HW_RESOURCES', 'NV01_MEMORY_LIST_FBMEM',
|
||||
'NV01_MEMORY_LIST_OBJECT', 'NV01_MEMORY_LIST_SYSTEM',
|
||||
'NV01_MEMORY_LOCAL_PHYSICAL', 'NV01_MEMORY_LOCAL_PRIVILEGED',
|
||||
'NV01_MEMORY_LOCAL_USER', 'NV01_MEMORY_PRIVILEGED',
|
||||
'NV01_MEMORY_SYSTEM', 'NV01_MEMORY_SYSTEM_DYNAMIC',
|
||||
'NV01_MEMORY_SYSTEM_OS_DESCRIPTOR', 'NV01_MEMORY_USER',
|
||||
'NV01_MEMORY_VIRTUAL', 'NV01_NULL', 'NV01_NULL_OBJECT',
|
||||
'NV01_ROOT', 'NV01_ROOT_CLIENT', 'NV01_ROOT_NON_PRIV',
|
||||
'NV01_TIMER', 'NV04_DISPLAY_COMMON', 'NV04_SOFTWARE_TEST',
|
||||
'NV1_EVENT', 'NV1_EVENT_KERNEL_CALLBACK',
|
||||
'NV1_EVENT_KERNEL_CALLBACK_EX', 'NV1_EVENT_OS_EVENT',
|
||||
'NV1_EVENT_WIN32_EVENT', 'NV1_MEMORY_LOCAL_PRIVILEGED',
|
||||
'NV1_MEMORY_LOCAL_USER', 'NV1_MEMORY_PRIVILEGED',
|
||||
'NV1_MEMORY_SYSTEM', 'NV1_MEMORY_SYSTEM_DYNAMIC',
|
||||
'NV1_MEMORY_USER', 'NV1_NULL', 'NV1_NULL_OBJECT', 'NV1_ROOT',
|
||||
'NV1_ROOT_NON_PRIV', 'NV1_TIMER', 'NV2081_BINAPI',
|
||||
'NV2082_BINAPI_PRIVILEGED', 'NV20_SUBDEVICE_0',
|
||||
'NV20_SUBDEVICE_DIAG', 'NV30_GSYNC', 'NV40_DEBUG_BUFFER',
|
||||
'NV40_I2C', 'NV4_SOFTWARE_TEST', 'NV50_CHANNEL_GPFIFO',
|
||||
'NV50_DEFERRED_API_CLASS', 'NV50_MEMORY_VIRTUAL', 'NV50_P2P',
|
||||
'NV50_THIRD_PARTY_P2P', 'NV9010_VBLANK_CALLBACK',
|
||||
'NVA081_VGPU_CONFIG', 'NVA084_KERNEL_HOST_VGPU_DEVICE',
|
||||
'NVB4B7_VIDEO_ENCODER', 'NVB8B0_VIDEO_DECODER',
|
||||
'NVB8D1_VIDEO_NVJPG', 'NVB8FA_VIDEO_OFA', 'NVC371_DISP_SF_USER',
|
||||
'NVC372_DISPLAY_SW', 'NVC4B0_VIDEO_DECODER',
|
||||
'NVC4B7_VIDEO_ENCODER', 'NVC4D1_VIDEO_NVJPG', 'NVC570_DISPLAY',
|
||||
'NVC573_DISP_CAPABILITIES', 'NVC57A_CURSOR_IMM_CHANNEL_PIO',
|
||||
'NVC57B_WINDOW_IMM_CHANNEL_DMA', 'NVC57D_CORE_CHANNEL_DMA',
|
||||
'NVC57E_WINDOW_CHANNEL_DMA', 'NVC670_DISPLAY',
|
||||
'NVC671_DISP_SF_USER', 'NVC673_DISP_CAPABILITIES',
|
||||
'NVC67A_CURSOR_IMM_CHANNEL_PIO', 'NVC67B_WINDOW_IMM_CHANNEL_DMA',
|
||||
'NVC67D_CORE_CHANNEL_DMA', 'NVC67E_WINDOW_CHANNEL_DMA',
|
||||
'NVC6B0_VIDEO_DECODER', 'NVC6FA_VIDEO_OFA', 'NVC770_DISPLAY',
|
||||
'NVC771_DISP_SF_USER', 'NVC773_DISP_CAPABILITIES',
|
||||
'NVC77D_CORE_CHANNEL_DMA', 'NVC77F_ANY_CHANNEL_DMA',
|
||||
'NVC7B0_VIDEO_DECODER', 'NVC7B7_VIDEO_ENCODER',
|
||||
'NVC7FA_VIDEO_OFA', 'NVC9B0_VIDEO_DECODER',
|
||||
'NVC9B7_VIDEO_ENCODER', 'NVC9D1_VIDEO_NVJPG', 'NVC9FA_VIDEO_OFA',
|
||||
'NV_CE_UTILS', 'NV_CONFIDENTIAL_COMPUTE',
|
||||
'NV_COUNTER_COLLECTION_UNIT', 'NV_EVENT_BUFFER',
|
||||
'NV_MEMORY_EXTENDED_USER', 'NV_MEMORY_FABRIC', 'NV_MEMORY_MAPPER',
|
||||
'NV_MEMORY_MULTICAST_FABRIC', 'NV_SEMAPHORE_SURFACE',
|
||||
'PASCAL_CHANNEL_GPFIFO_A', 'PASCAL_DMA_COPY_A',
|
||||
'RM_USER_SHARED_DATA', 'TURING_A', 'TURING_CHANNEL_GPFIFO_A',
|
||||
'TURING_COMPUTE_A', 'TURING_DMA_COPY_A', 'TURING_USERMODE_A',
|
||||
'UVM_CHANNEL_RETAINER', 'VOLTA_CHANNEL_GPFIFO_A',
|
||||
'VOLTA_USERMODE_A']
|
||||
763 extra/nv_gpu_driver/clc6c0qmd.h (new file)
@@ -0,0 +1,763 @@
/*******************************************************************************
    Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.

    Permission is hereby granted, free of charge, to any person obtaining a
    copy of this software and associated documentation files (the "Software"),
    to deal in the Software without restriction, including without limitation
    the rights to use, copy, modify, merge, publish, distribute, sublicense,
    and/or sell copies of the Software, and to permit persons to whom the
    Software is furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

/* AUTO GENERATED FILE -- DO NOT EDIT */

#ifndef __CLC6C0QMD_H__
#define __CLC6C0QMD_H__

/*
** Queue Meta Data, Version 02_03
*/

// The below C preprocessor definitions describe "multi-word" structures, where
// fields may have bit numbers beyond 32. For example, MW(127:96) means
// the field is in bits 0-31 of word number 3 of the structure. The "MW(X:Y)"
// syntax is to distinguish from similar "X:Y" single-word definitions: the
// macros historically used for single-word definitions would fail with
// multi-word definitions.
//
// See nvmisc.h:DRF_VAL_MW() in the source code of the kernel
// interface layer of nvidia.ko for an example of how to manipulate
// these MW(X:Y) definitions.

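// Editor's note -- an illustrative sketch, not part of the NVIDIA header: the
// real accessors are DRF_VAL_MW() and friends in nvmisc.h, but reading one
// MW(hi:lo) field out of the QMD's array of 32-bit words can be done along
// these lines.
static inline unsigned int qmd_field_val(const unsigned int *words,
                                         unsigned int hi, unsigned int lo)
{
    unsigned long long v = 0;
    // gather the words the field touches (at most two for fields <= 32 bits
    // wide), highest word first; `w + 1 > lo / 32` is `w >= lo / 32` written
    // to survive unsigned wrap-around when lo / 32 == 0
    for (unsigned int w = hi / 32; w + 1 > lo / 32; w--)
        v = (v << 32) | words[w];
    return (unsigned int)((v >> (lo % 32)) & ((1ULL << (hi - lo + 1)) - 1));
}
// e.g. qmd_field_val(qmd, 127, 96) yields bits 0-31 of word 3, matching the
// MW(127:96) example in the comment above.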
#define NVC6C0_QMDV02_03_OUTER_PUT MW(30:0)
|
||||
#define NVC6C0_QMDV02_03_OUTER_OVERFLOW MW(31:31)
|
||||
#define NVC6C0_QMDV02_03_OUTER_GET MW(62:32)
|
||||
#define NVC6C0_QMDV02_03_OUTER_STICKY_OVERFLOW MW(63:63)
|
||||
#define NVC6C0_QMDV02_03_INNER_GET MW(94:64)
|
||||
#define NVC6C0_QMDV02_03_INNER_OVERFLOW MW(95:95)
|
||||
#define NVC6C0_QMDV02_03_INNER_PUT MW(126:96)
|
||||
#define NVC6C0_QMDV02_03_INNER_STICKY_OVERFLOW MW(127:127)
|
||||
#define NVC6C0_QMDV02_03_QMD_GROUP_ID MW(133:128)
|
||||
#define NVC6C0_QMDV02_03_SM_GLOBAL_CACHING_ENABLE MW(134:134)
|
||||
#define NVC6C0_QMDV02_03_RUN_CTA_IN_ONE_SM_PARTITION MW(135:135)
|
||||
#define NVC6C0_QMDV02_03_RUN_CTA_IN_ONE_SM_PARTITION_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_RUN_CTA_IN_ONE_SM_PARTITION_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_IS_QUEUE MW(136:136)
|
||||
#define NVC6C0_QMDV02_03_IS_QUEUE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_IS_QUEUE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(137:137)
|
||||
#define NVC6C0_QMDV02_03_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE0 MW(138:138)
|
||||
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE0_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE0_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE1 MW(139:139)
|
||||
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE1_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE1_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_REQUIRE_SCHEDULING_PCAS MW(140:140)
|
||||
#define NVC6C0_QMDV02_03_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_SCHEDULE_ENABLE MW(141:141)
|
||||
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_TYPE MW(142:142)
|
||||
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_TYPE_QUEUE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_TYPE_GRID 0x00000001
|
||||
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_FIELD_COPY MW(143:143)
|
||||
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_FIELD_COPY_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_FIELD_COPY_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_B MW(159:144)
|
||||
#define NVC6C0_QMDV02_03_CIRCULAR_QUEUE_SIZE MW(184:160)
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_C MW(185:185)
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_HEADER_CACHE MW(186:186)
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(187:187)
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_DATA_CACHE MW(188:188)
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_DATA_CACHE MW(189:189)
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_INSTRUCTION_CACHE MW(190:190)
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_CONSTANT_CACHE MW(191:191)
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_CTA_RASTER_WIDTH_RESUME MW(223:192)
|
||||
#define NVC6C0_QMDV02_03_CTA_RASTER_HEIGHT_RESUME MW(239:224)
|
||||
#define NVC6C0_QMDV02_03_CTA_RASTER_DEPTH_RESUME MW(255:240)
|
||||
#define NVC6C0_QMDV02_03_PROGRAM_PREFETCH_ADDR_LOWER_SHIFTED MW(287:256)
|
||||
#define NVC6C0_QMDV02_03_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288)
|
||||
#define NVC6C0_QMDV02_03_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320)
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_D MW(335:328)
|
||||
#define NVC6C0_QMDV02_03_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336)
|
||||
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_ID MW(357:352)
|
||||
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358)
|
||||
#define NVC6C0_QMDV02_03_RELEASE_MEMBAR_TYPE MW(366:366)
|
||||
#define NVC6C0_QMDV02_03_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
|
||||
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367)
|
||||
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_CWD_MEMBAR_TYPE MW(369:368)
|
||||
#define NVC6C0_QMDV02_03_CWD_MEMBAR_TYPE_L1_NONE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001
|
||||
#define NVC6C0_QMDV02_03_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003
|
||||
#define NVC6C0_QMDV02_03_SEQUENTIALLY_RUN_CTAS MW(370:370)
|
||||
#define NVC6C0_QMDV02_03_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371)
|
||||
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_API_VISIBLE_CALL_LIMIT MW(378:378)
|
||||
#define NVC6C0_QMDV02_03_API_VISIBLE_CALL_LIMIT__32 0x00000000
|
||||
#define NVC6C0_QMDV02_03_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001
|
||||
#define NVC6C0_QMDV02_03_SAMPLER_INDEX MW(382:382)
|
||||
#define NVC6C0_QMDV02_03_SAMPLER_INDEX_INDEPENDENTLY 0x00000000
|
||||
#define NVC6C0_QMDV02_03_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001
|
||||
#define NVC6C0_QMDV02_03_CTA_RASTER_WIDTH MW(415:384)
|
||||
#define NVC6C0_QMDV02_03_CTA_RASTER_HEIGHT MW(431:416)
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED13A MW(447:432)
|
||||
#define NVC6C0_QMDV02_03_CTA_RASTER_DEPTH MW(463:448)
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED14A MW(479:464)
|
||||
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_POINTER MW(511:480)
|
||||
#define NVC6C0_QMDV02_03_COALESCE_WAITING_PERIOD MW(529:522)
|
||||
#define NVC6C0_QMDV02_03_QUEUE_ENTRIES_PER_CTA_LOG2 MW(534:530)
|
||||
#define NVC6C0_QMDV02_03_SHARED_MEMORY_SIZE MW(561:544)
|
||||
#define NVC6C0_QMDV02_03_MIN_SM_CONFIG_SHARED_MEM_SIZE MW(568:562)
|
||||
#define NVC6C0_QMDV02_03_MAX_SM_CONFIG_SHARED_MEM_SIZE MW(575:569)
|
||||
#define NVC6C0_QMDV02_03_QMD_VERSION MW(579:576)
|
||||
#define NVC6C0_QMDV02_03_QMD_MAJOR_VERSION MW(583:580)
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_H MW(591:584)
|
||||
#define NVC6C0_QMDV02_03_CTA_THREAD_DIMENSION0 MW(607:592)
|
||||
#define NVC6C0_QMDV02_03_CTA_THREAD_DIMENSION1 MW(623:608)
|
||||
#define NVC6C0_QMDV02_03_CTA_THREAD_DIMENSION2 MW(639:624)
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1))
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_VALID_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_VALID_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_REGISTER_COUNT_V MW(656:648)
|
||||
#define NVC6C0_QMDV02_03_TARGET_SM_CONFIG_SHARED_MEM_SIZE MW(663:657)
|
||||
#define NVC6C0_QMDV02_03_FREE_CTA_SLOTS_EMPTY_SM MW(671:664)
|
||||
#define NVC6C0_QMDV02_03_SM_DISABLE_MASK_LOWER MW(703:672)
|
||||
#define NVC6C0_QMDV02_03_SM_DISABLE_MASK_UPPER MW(735:704)
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_ADDRESS_LOWER MW(767:736)
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_ADDRESS_UPPER MW(775:768)
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_J MW(783:776)
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP MW(790:788)
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_INC 0x00000003
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_AND 0x00000005
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_OR 0x00000006
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_K MW(791:791)
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_FORMAT MW(793:792)
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_ENABLE MW(794:794)
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_STRUCTURE_SIZE MW(799:799)
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS 0x00000000
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_STRUCTURE_SIZE_ONE_WORD 0x00000001
|
||||
#define NVC6C0_QMDV02_03_RELEASE0_PAYLOAD MW(831:800)
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_ADDRESS_LOWER MW(863:832)
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_ADDRESS_UPPER MW(871:864)
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_L MW(879:872)
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP MW(886:884)
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_INC 0x00000003
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_AND 0x00000005
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_OR 0x00000006
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_M MW(887:887)
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_FORMAT MW(889:888)
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_ENABLE MW(890:890)
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_STRUCTURE_SIZE MW(895:895)
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS 0x00000000
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_STRUCTURE_SIZE_ONE_WORD 0x00000001
|
||||
#define NVC6C0_QMDV02_03_RELEASE1_PAYLOAD MW(927:896)
|
||||
#define NVC6C0_QMDV02_03_SHADER_LOCAL_MEMORY_LOW_SIZE MW(951:928)
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_N MW(954:952)
|
||||
#define NVC6C0_QMDV02_03_BARRIER_COUNT MW(959:955)
|
||||
#define NVC6C0_QMDV02_03_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(983:960)
|
||||
#define NVC6C0_QMDV02_03_REGISTER_COUNT MW(991:984)
|
||||
#define NVC6C0_QMDV02_03_PROGRAM_PREFETCH_ADDR_UPPER_SHIFTED MW(1000:992)
|
||||
#define NVC6C0_QMDV02_03_PROGRAM_PREFETCH_SIZE MW(1009:1001)
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_A MW(1015:1010)
|
||||
#define NVC6C0_QMDV02_03_SASS_VERSION MW(1023:1016)
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_ADDR_LOWER(i) MW((1055+(i)*64):(1024+(i)*64))
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_ADDR_UPPER(i) MW((1072+(i)*64):(1056+(i)*64))
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_PREFETCH_POST(i) MW((1073+(i)*64):(1073+(i)*64))
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_PREFETCH_POST_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_PREFETCH_POST_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_INVALIDATE(i) MW((1074+(i)*64):(1074+(i)*64))
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1087+(i)*64):(1075+(i)*64))
|
||||
#define NVC6C0_QMDV02_03_PROGRAM_ADDRESS_LOWER MW(1567:1536)
|
||||
#define NVC6C0_QMDV02_03_PROGRAM_ADDRESS_UPPER MW(1584:1568)
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_S MW(1599:1585)
|
||||
#define NVC6C0_QMDV02_03_HW_ONLY_INNER_GET MW(1630:1600)
|
||||
#define NVC6C0_QMDV02_03_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1631:1631)
|
||||
#define NVC6C0_QMDV02_03_HW_ONLY_INNER_PUT MW(1662:1632)
|
||||
#define NVC6C0_QMDV02_03_HW_ONLY_SCG_TYPE MW(1663:1663)
|
||||
#define NVC6C0_QMDV02_03_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(1693:1664)
|
||||
#define NVC6C0_QMDV02_03_QMD_RESERVED_Q MW(1694:1694)
|
||||
#define NVC6C0_QMDV02_03_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(1695:1695)
|
||||
#define NVC6C0_QMDV02_03_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_03_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_03_HW_ONLY_SKED_NEXT_QMD_POINTER MW(1727:1696)
|
||||
#define NVC6C0_QMDV02_03_QMD_SPARE_G MW(1759:1728)
|
||||
#define NVC6C0_QMDV02_03_QMD_SPARE_H MW(1791:1760)
|
||||
#define NVC6C0_QMDV02_03_QMD_SPARE_I MW(1823:1792)
|
||||
#define NVC6C0_QMDV02_03_QMD_SPARE_J MW(1855:1824)
|
||||
#define NVC6C0_QMDV02_03_QMD_SPARE_K MW(1887:1856)
|
||||
#define NVC6C0_QMDV02_03_QMD_SPARE_L MW(1919:1888)
|
||||
#define NVC6C0_QMDV02_03_QMD_SPARE_M MW(1951:1920)
|
||||
#define NVC6C0_QMDV02_03_QMD_SPARE_N MW(1983:1952)
|
||||
#define NVC6C0_QMDV02_03_DEBUG_ID_UPPER MW(2015:1984)
|
||||
#define NVC6C0_QMDV02_03_DEBUG_ID_LOWER MW(2047:2016)
|
||||
|
||||
|
||||
/*
|
||||
** Queue Meta Data, Version 02_04
|
||||
*/
|
||||
|
||||
#define NVC6C0_QMDV02_04_OUTER_PUT MW(30:0)
|
||||
#define NVC6C0_QMDV02_04_OUTER_OVERFLOW MW(31:31)
|
||||
#define NVC6C0_QMDV02_04_OUTER_GET MW(62:32)
|
||||
#define NVC6C0_QMDV02_04_OUTER_STICKY_OVERFLOW MW(63:63)
|
||||
#define NVC6C0_QMDV02_04_INNER_GET MW(94:64)
|
||||
#define NVC6C0_QMDV02_04_INNER_OVERFLOW MW(95:95)
|
||||
#define NVC6C0_QMDV02_04_INNER_PUT MW(126:96)
|
||||
#define NVC6C0_QMDV02_04_INNER_STICKY_OVERFLOW MW(127:127)
|
||||
#define NVC6C0_QMDV02_04_QMD_GROUP_ID MW(133:128)
|
||||
#define NVC6C0_QMDV02_04_SM_GLOBAL_CACHING_ENABLE MW(134:134)
|
||||
#define NVC6C0_QMDV02_04_RUN_CTA_IN_ONE_SM_PARTITION MW(135:135)
|
||||
#define NVC6C0_QMDV02_04_RUN_CTA_IN_ONE_SM_PARTITION_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_RUN_CTA_IN_ONE_SM_PARTITION_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_IS_QUEUE MW(136:136)
|
||||
#define NVC6C0_QMDV02_04_IS_QUEUE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_IS_QUEUE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(137:137)
|
||||
#define NVC6C0_QMDV02_04_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE0 MW(138:138)
|
||||
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE0_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE0_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE1 MW(139:139)
|
||||
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE1_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE1_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_REQUIRE_SCHEDULING_PCAS MW(140:140)
|
||||
#define NVC6C0_QMDV02_04_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ENABLE MW(141:141)
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ACTION MW(144:142)
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ACTION_QMD_INCREMENT_PUT 0x00000000
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ACTION_QMD_SCHEDULE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_PREFETCH MW(145:145)
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_PREFETCH_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_PREFETCH_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ENABLE MW(146:146)
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ACTION MW(149:147)
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ACTION_QMD_INCREMENT_PUT 0x00000000
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ACTION_QMD_SCHEDULE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_PREFETCH MW(150:150)
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_PREFETCH_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_PREFETCH_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_DEPENDENCE_COUNTER MW(157:151)
|
||||
#define NVC6C0_QMDV02_04_SELF_COPY_ON_COMPLETION MW(158:158)
|
||||
#define NVC6C0_QMDV02_04_SELF_COPY_ON_COMPLETION_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_SELF_COPY_ON_COMPLETION_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_B MW(159:159)
|
||||
#define NVC6C0_QMDV02_04_CIRCULAR_QUEUE_SIZE MW(184:160)
|
||||
#define NVC6C0_QMDV02_04_DEMOTE_L2_EVICT_LAST MW(185:185)
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_HEADER_CACHE MW(186:186)
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(187:187)
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_DATA_CACHE MW(188:188)
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_DATA_CACHE MW(189:189)
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_INSTRUCTION_CACHE MW(190:190)
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_CONSTANT_CACHE MW(191:191)
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_CTA_RASTER_WIDTH_RESUME MW(223:192)
|
||||
#define NVC6C0_QMDV02_04_CTA_RASTER_HEIGHT_RESUME MW(239:224)
|
||||
#define NVC6C0_QMDV02_04_CTA_RASTER_DEPTH_RESUME MW(255:240)
|
||||
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_ADDR_LOWER_SHIFTED MW(287:256)
|
||||
#define NVC6C0_QMDV02_04_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288)
|
||||
#define NVC6C0_QMDV02_04_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320)
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_D MW(335:328)
|
||||
#define NVC6C0_QMDV02_04_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336)
|
||||
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_ID MW(357:352)
|
||||
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358)
|
||||
#define NVC6C0_QMDV02_04_RELEASE_MEMBAR_TYPE MW(366:366)
|
||||
#define NVC6C0_QMDV02_04_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
|
||||
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367)
|
||||
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_CWD_MEMBAR_TYPE MW(369:368)
|
||||
#define NVC6C0_QMDV02_04_CWD_MEMBAR_TYPE_L1_NONE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001
|
||||
#define NVC6C0_QMDV02_04_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003
|
||||
#define NVC6C0_QMDV02_04_SEQUENTIALLY_RUN_CTAS MW(370:370)
|
||||
#define NVC6C0_QMDV02_04_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371)
|
||||
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_API_VISIBLE_CALL_LIMIT MW(378:378)
|
||||
#define NVC6C0_QMDV02_04_API_VISIBLE_CALL_LIMIT__32 0x00000000
|
||||
#define NVC6C0_QMDV02_04_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001
|
||||
#define NVC6C0_QMDV02_04_SAMPLER_INDEX MW(382:382)
|
||||
#define NVC6C0_QMDV02_04_SAMPLER_INDEX_INDEPENDENTLY 0x00000000
|
||||
#define NVC6C0_QMDV02_04_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001
|
||||
#define NVC6C0_QMDV02_04_DISABLE_AUTO_INVALIDATE MW(383:383)
|
||||
#define NVC6C0_QMDV02_04_DISABLE_AUTO_INVALIDATE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_DISABLE_AUTO_INVALIDATE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_CTA_RASTER_WIDTH MW(415:384)
|
||||
#define NVC6C0_QMDV02_04_CTA_RASTER_HEIGHT MW(431:416)
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED13A MW(447:432)
|
||||
#define NVC6C0_QMDV02_04_CTA_RASTER_DEPTH MW(463:448)
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED14A MW(479:464)
|
||||
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_POINTER MW(511:480)
|
||||
#define NVC6C0_QMDV02_04_COALESCE_WAITING_PERIOD MW(529:522)
|
||||
#define NVC6C0_QMDV02_04_QUEUE_ENTRIES_PER_CTA_LOG2 MW(534:530)
|
||||
#define NVC6C0_QMDV02_04_SHARED_MEMORY_SIZE MW(561:544)
|
||||
#define NVC6C0_QMDV02_04_MIN_SM_CONFIG_SHARED_MEM_SIZE MW(568:562)
|
||||
#define NVC6C0_QMDV02_04_MAX_SM_CONFIG_SHARED_MEM_SIZE MW(575:569)
|
||||
#define NVC6C0_QMDV02_04_QMD_VERSION MW(579:576)
|
||||
#define NVC6C0_QMDV02_04_QMD_MAJOR_VERSION MW(583:580)
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_H MW(591:584)
|
||||
#define NVC6C0_QMDV02_04_CTA_THREAD_DIMENSION0 MW(607:592)
|
||||
#define NVC6C0_QMDV02_04_CTA_THREAD_DIMENSION1 MW(623:608)
|
||||
#define NVC6C0_QMDV02_04_CTA_THREAD_DIMENSION2 MW(639:624)
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1))
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_VALID_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_VALID_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_REGISTER_COUNT_V MW(656:648)
|
||||
#define NVC6C0_QMDV02_04_TARGET_SM_CONFIG_SHARED_MEM_SIZE MW(663:657)
|
||||
#define NVC6C0_QMDV02_04_FREE_CTA_SLOTS_EMPTY_SM MW(671:664)
|
||||
#define NVC6C0_QMDV02_04_SM_DISABLE_MASK_LOWER MW(703:672)
|
||||
#define NVC6C0_QMDV02_04_SM_DISABLE_MASK_UPPER MW(735:704)
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_ADDRESS_LOWER MW(767:736)
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_ADDRESS_UPPER MW(775:768)
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_J MW(783:776)
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP MW(790:788)
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_INC 0x00000003
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_AND 0x00000005
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_OR 0x00000006
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_K MW(791:791)
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_FORMAT MW(793:792)
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_ENABLE MW(794:794)
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_STRUCTURE_SIZE MW(799:799)
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS 0x00000000
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_STRUCTURE_SIZE_ONE_WORD 0x00000001
|
||||
#define NVC6C0_QMDV02_04_RELEASE0_PAYLOAD MW(831:800)
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_ADDRESS_LOWER MW(863:832)
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_ADDRESS_UPPER MW(871:864)
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_L MW(879:872)
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP MW(886:884)
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_INC 0x00000003
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_AND 0x00000005
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_OR 0x00000006
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_M MW(887:887)
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_FORMAT MW(889:888)
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_ENABLE MW(890:890)
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_STRUCTURE_SIZE MW(895:895)
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS 0x00000000
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_STRUCTURE_SIZE_ONE_WORD 0x00000001
|
||||
#define NVC6C0_QMDV02_04_RELEASE1_PAYLOAD MW(927:896)
|
||||
#define NVC6C0_QMDV02_04_SHADER_LOCAL_MEMORY_LOW_SIZE MW(951:928)
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_N MW(954:952)
|
||||
#define NVC6C0_QMDV02_04_BARRIER_COUNT MW(959:955)
|
||||
#define NVC6C0_QMDV02_04_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(983:960)
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_G MW(991:984)
|
||||
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_ADDR_UPPER_SHIFTED MW(1000:992)
|
||||
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_SIZE MW(1009:1001)
|
||||
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_TYPE MW(1011:1010)
|
||||
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_TYPE_PREFETCH_LAUNCH 0x00000000
|
||||
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_TYPE_PREFTECH_POST 0x00000001
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_A MW(1015:1012)
|
||||
#define NVC6C0_QMDV02_04_SASS_VERSION MW(1023:1016)
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_ADDR_LOWER(i) MW((1055+(i)*64):(1024+(i)*64))
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_ADDR_UPPER(i) MW((1072+(i)*64):(1056+(i)*64))
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_PREFETCH_POST(i) MW((1073+(i)*64):(1073+(i)*64))
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_PREFETCH_POST_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_PREFETCH_POST_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_INVALIDATE(i) MW((1074+(i)*64):(1074+(i)*64))
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1087+(i)*64):(1075+(i)*64))
|
||||
#define NVC6C0_QMDV02_04_PROGRAM_ADDRESS_LOWER MW(1567:1536)
|
||||
#define NVC6C0_QMDV02_04_PROGRAM_ADDRESS_UPPER MW(1584:1568)
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_S MW(1599:1585)
|
||||
#define NVC6C0_QMDV02_04_HW_ONLY_INNER_GET MW(1630:1600)
|
||||
#define NVC6C0_QMDV02_04_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1631:1631)
|
||||
#define NVC6C0_QMDV02_04_HW_ONLY_INNER_PUT MW(1662:1632)
|
||||
#define NVC6C0_QMDV02_04_HW_ONLY_SCG_TYPE MW(1663:1663)
|
||||
#define NVC6C0_QMDV02_04_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(1693:1664)
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_Q MW(1694:1694)
|
||||
#define NVC6C0_QMDV02_04_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(1695:1695)
|
||||
#define NVC6C0_QMDV02_04_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV02_04_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV02_04_HW_ONLY_SKED_NEXT_QMD_POINTER MW(1727:1696)
|
||||
#define NVC6C0_QMDV02_04_HW_ONLY_DEPENDENCE_COUNTER MW(1734:1728)
|
||||
#define NVC6C0_QMDV02_04_QMD_RESERVED_I MW(1759:1735)
|
||||
#define NVC6C0_QMDV02_04_QMD_SPARE_H MW(1791:1760)
|
||||
#define NVC6C0_QMDV02_04_QMD_SPARE_I MW(1823:1792)
|
||||
#define NVC6C0_QMDV02_04_QMD_SPARE_J MW(1855:1824)
|
||||
#define NVC6C0_QMDV02_04_QMD_SPARE_K MW(1887:1856)
|
||||
#define NVC6C0_QMDV02_04_QMD_SPARE_L MW(1919:1888)
|
||||
#define NVC6C0_QMDV02_04_QMD_SPARE_M MW(1951:1920)
|
||||
#define NVC6C0_QMDV02_04_QMD_SPARE_N MW(1983:1952)
|
||||
#define NVC6C0_QMDV02_04_DEBUG_ID_UPPER MW(2015:1984)
|
||||
#define NVC6C0_QMDV02_04_DEBUG_ID_LOWER MW(2047:2016)
|
||||
|
||||
|
||||
/*
|
||||
** Queue Meta Data, Version 03_00
|
||||
*/
|
||||
|
||||
#define NVC6C0_QMDV03_00_OUTER_PUT MW(30:0)
|
||||
#define NVC6C0_QMDV03_00_OUTER_OVERFLOW MW(31:31)
|
||||
#define NVC6C0_QMDV03_00_OUTER_GET MW(62:32)
|
||||
#define NVC6C0_QMDV03_00_OUTER_STICKY_OVERFLOW MW(63:63)
|
||||
#define NVC6C0_QMDV03_00_INNER_GET MW(94:64)
|
||||
#define NVC6C0_QMDV03_00_INNER_OVERFLOW MW(95:95)
|
||||
#define NVC6C0_QMDV03_00_INNER_PUT MW(126:96)
|
||||
#define NVC6C0_QMDV03_00_INNER_STICKY_OVERFLOW MW(127:127)
|
||||
#define NVC6C0_QMDV03_00_QMD_GROUP_ID MW(133:128)
|
||||
#define NVC6C0_QMDV03_00_SM_GLOBAL_CACHING_ENABLE MW(134:134)
|
||||
#define NVC6C0_QMDV03_00_RUN_CTA_IN_ONE_SM_PARTITION MW(135:135)
|
||||
#define NVC6C0_QMDV03_00_RUN_CTA_IN_ONE_SM_PARTITION_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RUN_CTA_IN_ONE_SM_PARTITION_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_IS_QUEUE MW(136:136)
|
||||
#define NVC6C0_QMDV03_00_IS_QUEUE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_IS_QUEUE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(137:137)
|
||||
#define NVC6C0_QMDV03_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_QMD_RESERVED04A MW(139:138)
|
||||
#define NVC6C0_QMDV03_00_REQUIRE_SCHEDULING_PCAS MW(140:140)
|
||||
#define NVC6C0_QMDV03_00_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_QMD_RESERVED04B MW(141:141)
|
||||
#define NVC6C0_QMDV03_00_DEPENDENCE_COUNTER MW(157:142)
|
||||
#define NVC6C0_QMDV03_00_SELF_COPY_ON_COMPLETION MW(158:158)
|
||||
#define NVC6C0_QMDV03_00_SELF_COPY_ON_COMPLETION_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_SELF_COPY_ON_COMPLETION_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_QMD_RESERVED04C MW(159:159)
|
||||
#define NVC6C0_QMDV03_00_CIRCULAR_QUEUE_SIZE MW(184:160)
|
||||
#define NVC6C0_QMDV03_00_DEMOTE_L2_EVICT_LAST MW(185:185)
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_HEADER_CACHE MW(186:186)
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(187:187)
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_DATA_CACHE MW(188:188)
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_DATA_CACHE MW(189:189)
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_INSTRUCTION_CACHE MW(190:190)
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_CONSTANT_CACHE MW(191:191)
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_CTA_RASTER_WIDTH_RESUME MW(223:192)
|
||||
#define NVC6C0_QMDV03_00_CTA_RASTER_HEIGHT_RESUME MW(239:224)
|
||||
#define NVC6C0_QMDV03_00_CTA_RASTER_DEPTH_RESUME MW(255:240)
|
||||
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_ADDR_LOWER_SHIFTED MW(287:256)
|
||||
#define NVC6C0_QMDV03_00_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288)
|
||||
#define NVC6C0_QMDV03_00_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320)
|
||||
#define NVC6C0_QMDV03_00_QMD_RESERVED_D MW(335:328)
|
||||
#define NVC6C0_QMDV03_00_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336)
|
||||
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_ID MW(357:352)
|
||||
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358)
|
||||
#define NVC6C0_QMDV03_00_QMD_RESERVED11A MW(366:366)
|
||||
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367)
|
||||
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE MW(369:368)
|
||||
#define NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_NONE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001
|
||||
#define NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003
|
||||
#define NVC6C0_QMDV03_00_SEQUENTIALLY_RUN_CTAS MW(370:370)
|
||||
#define NVC6C0_QMDV03_00_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371)
|
||||
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_QMD_RESERVED11B MW(377:372)
|
||||
#define NVC6C0_QMDV03_00_API_VISIBLE_CALL_LIMIT MW(378:378)
|
||||
#define NVC6C0_QMDV03_00_API_VISIBLE_CALL_LIMIT__32 0x00000000
|
||||
#define NVC6C0_QMDV03_00_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001
|
||||
#define NVC6C0_QMDV03_00_QMD_RESERVED11C MW(381:379)
|
||||
#define NVC6C0_QMDV03_00_SAMPLER_INDEX MW(382:382)
|
||||
#define NVC6C0_QMDV03_00_SAMPLER_INDEX_INDEPENDENTLY 0x00000000
|
||||
#define NVC6C0_QMDV03_00_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001
|
||||
#define NVC6C0_QMDV03_00_DISABLE_AUTO_INVALIDATE MW(383:383)
|
||||
#define NVC6C0_QMDV03_00_DISABLE_AUTO_INVALIDATE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_DISABLE_AUTO_INVALIDATE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_CTA_RASTER_WIDTH MW(415:384)
|
||||
#define NVC6C0_QMDV03_00_CTA_RASTER_HEIGHT MW(431:416)
|
||||
#define NVC6C0_QMDV03_00_CTA_RASTER_DEPTH MW(463:448)
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_POINTER MW(511:480)
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ENABLE MW(512:512)
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ACTION MW(515:513)
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ACTION_QMD_INCREMENT_PUT 0x00000000
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ACTION_QMD_SCHEDULE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_PREFETCH MW(516:516)
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_PREFETCH_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_PREFETCH_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ENABLE MW(517:517)
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ACTION MW(520:518)
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ACTION_QMD_INCREMENT_PUT 0x00000000
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ACTION_QMD_SCHEDULE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_PREFETCH MW(521:521)
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_PREFETCH_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_PREFETCH_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_COALESCE_WAITING_PERIOD MW(529:522)
|
||||
#define NVC6C0_QMDV03_00_QUEUE_ENTRIES_PER_CTA_LOG2 MW(534:530)
|
||||
#define NVC6C0_QMDV03_00_SHARED_MEMORY_SIZE MW(561:544)
|
||||
#define NVC6C0_QMDV03_00_MIN_SM_CONFIG_SHARED_MEM_SIZE MW(567:562)
|
||||
#define NVC6C0_QMDV03_00_QMD_RESERVED17A MW(568:568)
|
||||
#define NVC6C0_QMDV03_00_MAX_SM_CONFIG_SHARED_MEM_SIZE MW(574:569)
|
||||
#define NVC6C0_QMDV03_00_QMD_RESERVED17B MW(575:575)
|
||||
#define NVC6C0_QMDV03_00_QMD_VERSION MW(579:576)
|
||||
#define NVC6C0_QMDV03_00_QMD_MAJOR_VERSION MW(583:580)
|
||||
#define NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0 MW(607:592)
|
||||
#define NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION1 MW(623:608)
|
||||
#define NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION2 MW(639:624)
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1))
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_VALID_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_VALID_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_REGISTER_COUNT_V MW(656:648)
|
||||
#define NVC6C0_QMDV03_00_TARGET_SM_CONFIG_SHARED_MEM_SIZE MW(662:657)
|
||||
#define NVC6C0_QMDV03_00_SHARED_ALLOCATION_ENABLE MW(663:663)
|
||||
#define NVC6C0_QMDV03_00_SHARED_ALLOCATION_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_SHARED_ALLOCATION_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_FREE_CTA_SLOTS_EMPTY_SM MW(671:664)
|
||||
#define NVC6C0_QMDV03_00_SM_DISABLE_MASK_LOWER MW(703:672)
|
||||
#define NVC6C0_QMDV03_00_SM_DISABLE_MASK_UPPER MW(735:704)
|
||||
#define NVC6C0_QMDV03_00_SHADER_LOCAL_MEMORY_LOW_SIZE MW(759:736)
|
||||
#define NVC6C0_QMDV03_00_BARRIER_COUNT MW(767:763)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_ADDRESS_LOWER MW(799:768)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_ADDRESS_UPPER MW(807:800)
|
||||
#define NVC6C0_QMDV03_00_SEMAPHORE_RESERVED25A MW(818:808)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_MEMBAR_TYPE MW(819:819)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_MEMBAR_TYPE_FE_NONE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP MW(822:820)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_INC 0x00000003
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_AND 0x00000005
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_OR 0x00000006
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_ENABLE MW(823:823)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_FORMAT MW(825:824)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_ENABLE MW(826:826)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_NON_BLOCKING_INTR_TYPE MW(828:827)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_NON_BLOCKING_INTR_TYPE_NONE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_NON_BLOCKING_INTR_TYPE_TRAP 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_NON_BLOCKING_INTR_TYPE_CONDITIONAL_TRAP 0x00000002
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_PAYLOAD64B MW(829:829)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_PAYLOAD64B_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_PAYLOAD64B_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_STRUCTURE_SIZE MW(831:830)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_STRUCTURE_SIZE_SEMAPHORE_FOUR_WORDS 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_STRUCTURE_SIZE_SEMAPHORE_ONE_WORD 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_STRUCTURE_SIZE_SEMAPHORE_TWO_WORDS 0x00000002
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_PAYLOAD_LOWER MW(863:832)
|
||||
#define NVC6C0_QMDV03_00_RELEASE0_PAYLOAD_UPPER MW(895:864)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_ADDRESS_LOWER MW(927:896)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_ADDRESS_UPPER MW(935:928)
|
||||
#define NVC6C0_QMDV03_00_SEMAPHORE_RESERVED29A MW(946:936)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_MEMBAR_TYPE MW(947:947)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_MEMBAR_TYPE_FE_NONE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP MW(950:948)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_INC 0x00000003
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_AND 0x00000005
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_OR 0x00000006
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_ENABLE MW(951:951)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_FORMAT MW(953:952)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_ENABLE MW(954:954)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_NON_BLOCKING_INTR_TYPE MW(956:955)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_NON_BLOCKING_INTR_TYPE_NONE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_NON_BLOCKING_INTR_TYPE_TRAP 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_NON_BLOCKING_INTR_TYPE_CONDITIONAL_TRAP 0x00000002
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_PAYLOAD64B MW(957:957)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_PAYLOAD64B_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_PAYLOAD64B_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_STRUCTURE_SIZE MW(959:958)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_STRUCTURE_SIZE_SEMAPHORE_FOUR_WORDS 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_STRUCTURE_SIZE_SEMAPHORE_ONE_WORD 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_STRUCTURE_SIZE_SEMAPHORE_TWO_WORDS 0x00000002
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_PAYLOAD_LOWER MW(991:960)
|
||||
#define NVC6C0_QMDV03_00_RELEASE1_PAYLOAD_UPPER MW(1023:992)
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_ADDR_LOWER(i) MW((1055+(i)*64):(1024+(i)*64))
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_ADDR_UPPER(i) MW((1072+(i)*64):(1056+(i)*64))
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_PREFETCH_POST(i) MW((1073+(i)*64):(1073+(i)*64))
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_PREFETCH_POST_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_PREFETCH_POST_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_INVALIDATE(i) MW((1074+(i)*64):(1074+(i)*64))
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1087+(i)*64):(1075+(i)*64))
|
||||
#define NVC6C0_QMDV03_00_PROGRAM_ADDRESS_LOWER MW(1567:1536)
|
||||
#define NVC6C0_QMDV03_00_PROGRAM_ADDRESS_UPPER MW(1584:1568)
|
||||
#define NVC6C0_QMDV03_00_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(1623:1600)
|
||||
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_ADDR_UPPER_SHIFTED MW(1640:1632)
|
||||
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_SIZE MW(1649:1641)
|
||||
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_TYPE MW(1651:1650)
|
||||
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_TYPE_PREFETCH_LAUNCH 0x00000000
|
||||
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_TYPE_PREFTECH_POST 0x00000001
|
||||
#define NVC6C0_QMDV03_00_SASS_VERSION MW(1663:1656)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_ADDRESS_LOWER MW(1695:1664)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_ADDRESS_UPPER MW(1703:1696)
|
||||
#define NVC6C0_QMDV03_00_SEMAPHORE_RESERVED53A MW(1714:1704)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_MEMBAR_TYPE MW(1715:1715)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_MEMBAR_TYPE_FE_NONE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP MW(1718:1716)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_ADD 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_MIN 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_MAX 0x00000002
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_INC 0x00000003
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_DEC 0x00000004
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_AND 0x00000005
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_OR 0x00000006
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_XOR 0x00000007
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_ENABLE MW(1719:1719)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_FORMAT MW(1721:1720)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_FORMAT_SIGNED_32 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_ENABLE MW(1722:1722)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_ENABLE_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_ENABLE_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_NON_BLOCKING_INTR_TYPE MW(1724:1723)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_NON_BLOCKING_INTR_TYPE_NONE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_NON_BLOCKING_INTR_TYPE_TRAP 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_NON_BLOCKING_INTR_TYPE_CONDITIONAL_TRAP 0x00000002
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_PAYLOAD64B MW(1725:1725)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_PAYLOAD64B_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_PAYLOAD64B_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_STRUCTURE_SIZE MW(1727:1726)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_STRUCTURE_SIZE_SEMAPHORE_FOUR_WORDS 0x00000000
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_STRUCTURE_SIZE_SEMAPHORE_ONE_WORD 0x00000001
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_STRUCTURE_SIZE_SEMAPHORE_TWO_WORDS 0x00000002
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_PAYLOAD_LOWER MW(1759:1728)
|
||||
#define NVC6C0_QMDV03_00_RELEASE2_PAYLOAD_UPPER MW(1791:1760)
|
||||
#define NVC6C0_QMDV03_00_QMD_SPARE_I MW(1823:1792)
|
||||
#define NVC6C0_QMDV03_00_HW_ONLY_INNER_GET MW(1854:1824)
|
||||
#define NVC6C0_QMDV03_00_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1855:1855)
|
||||
#define NVC6C0_QMDV03_00_HW_ONLY_INNER_PUT MW(1886:1856)
|
||||
#define NVC6C0_QMDV03_00_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(1917:1888)
|
||||
#define NVC6C0_QMDV03_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(1919:1919)
|
||||
#define NVC6C0_QMDV03_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000
|
||||
#define NVC6C0_QMDV03_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001
|
||||
#define NVC6C0_QMDV03_00_HW_ONLY_SKED_NEXT_QMD_POINTER MW(1951:1920)
|
||||
#define NVC6C0_QMDV03_00_HW_ONLY_DEPENDENCE_COUNTER MW(1958:1952)
|
||||
#define NVC6C0_QMDV03_00_DEBUG_ID_UPPER MW(2015:1984)
|
||||
#define NVC6C0_QMDV03_00_DEBUG_ID_LOWER MW(2047:2016)
#endif // #ifndef __CLC6C0QMD_H__
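The MW(hi:lo) macros above name absolute bit ranges inside the 2048-bit (0x100-byte) QMD version 3.00 descriptor. As a minimal sketch, assuming the autogen bindings expose each MW(hi:lo) as a (hi, lo) tuple the way tinygrad's nv_gpu.py does (the helper name here is ours), a field can be read out of the raw descriptor words like this:

def qmd_field(words, hi, lo):
    # words: the QMD as 64 little-endian uint32s; returns bits lo..hi as an int
    val = 0
    for bit in range(lo, hi + 1):
        val |= ((words[bit // 32] >> (bit % 32)) & 1) << (bit - lo)
    return val

# e.g. NVC6C0_QMDV03_00_CTA_RASTER_WIDTH is MW(415:384), i.e. all of dword 12:
# width = qmd_field(qmd_words, 415, 384)

This is the same bit loop the QMD dumper later in this commit uses to print non-zero fields.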
@@ -1,26 +0,0 @@
|
||||
#!/bin/bash
|
||||
SRC=/home/kafka/build/open-gpu-kernel-modules
|
||||
|
||||
clang2py \
|
||||
$SRC/src/nvidia/generated/g_allclasses.h \
|
||||
-o class_ioctl.py -k cdefstum
|
||||
|
||||
exit
|
||||
|
||||
#clang2py $SRC/src/nvidia/arch/nvalloc/unix/include/nv_escape.h \
|
||||
# $SRC/src/nvidia/arch/nvalloc/unix/include/nv-ioctl-numbers.h \
|
||||
# $SRC/src/common/sdk/nvidia/inc/nvos.h \
|
||||
# --clang-args="-I $SRC/src/common/sdk/nvidia/inc -I $SRC/src/common/sdk/nvidia/inc/ctrl" \
|
||||
# -o esc_ioctl.py -k cdefstum
|
||||
|
||||
clang2py \
|
||||
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrl0000/*.h \
|
||||
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrl0080/*.h \
|
||||
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrl2080/*.h \
|
||||
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrl83de/*.h \
|
||||
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrlc36f.h \
|
||||
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrlcb33.h \
|
||||
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrla06c.h \
|
||||
--clang-args="-I $SRC/src/common/sdk/nvidia/inc -I $SRC/src/common/sdk/nvidia/inc/ctrl" \
|
||||
-o ctrl_ioctl.py -k cdefstum
|
||||
sed -i "s\(0000000001)\1\g" ctrl_ioctl.py
|
||||
File diff suppressed because it is too large
@@ -1,24 +1,34 @@
|
||||
# type: ignore
|
||||
import ctypes, ctypes.util, struct, platform, pathlib, re, time, os
|
||||
import ctypes, ctypes.util, struct, platform, pathlib, re, time, os, signal
|
||||
from tinygrad.helpers import from_mv, to_mv, getenv
|
||||
from hexdump import hexdump
|
||||
start = time.perf_counter()
|
||||
|
||||
# *** ioctl lib ***
|
||||
libc = ctypes.CDLL(ctypes.util.find_library("c"))
|
||||
processor = platform.processor()
|
||||
IOCTL_SYSCALL = {"aarch64": 0x1d, "x86_64":16}[processor]
|
||||
MMAP_SYSCALL = {"aarch64": 0xde, "x86_64":0x09}[processor]
|
||||
|
||||
def get_struct(argp, stype):
|
||||
return ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype)).contents
|
||||
|
||||
def dump_struct(st):
|
||||
print("\t", st.__class__.__name__, end=" { ")
|
||||
for v in type(st)._fields_: print(f"{v[0]}={getattr(st, v[0])}", end=" ")
|
||||
print("}")
|
||||
|
||||
def format_struct(s):
|
||||
sdats = []
|
||||
for field_name, field_type in s._fields_:
|
||||
dat = getattr(s, field_name)
|
||||
if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
|
||||
else: sdats.append(f"{field_name}:{dat}")
|
||||
for field in s._fields_:
|
||||
dat = getattr(s, field[0])
|
||||
if isinstance(dat, int): sdats.append(f"{field[0]}:0x{dat:X}")
|
||||
else: sdats.append(f"{field[0]}:{dat}")
|
||||
return sdats
|
||||
|
||||
real_func_pool = {}
|
||||
def install_hook(c_function, python_function):
|
||||
orig_func = (ctypes.c_char*4096)()
|
||||
python_function_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value
|
||||
# AARCH64 trampoline to ioctl
|
||||
if processor == "aarch64":
|
||||
@@ -28,9 +38,9 @@ def install_hook(c_function, python_function):
|
||||
tramp = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6"
|
||||
tramp += struct.pack("Q", python_function_addr)
|
||||
elif processor == "x86_64":
|
||||
# 0x0000000000000000: 49 B8 aa aa aa aa aa aa aa aa movabs r8, <address>
|
||||
# 0x000000000000000a: 41 FF E0 jmp r8
|
||||
tramp = b"\x49\xB8" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE0"
|
||||
# 0x0000000000000000: 49 BB aa aa aa aa aa aa aa aa movabs r11, <address>
|
||||
# 0x000000000000000a: 41 FF E3 jmp r11
|
||||
tramp = b"\x49\xBB" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE3"
|
||||
else:
|
||||
raise Exception(f"processor {processor} not supported")
|
||||
|
||||
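For reference, the 13-byte x86_64 trampoline written over the hooked libc entry point in the hunk above is just an absolute jump through r11, a caller-saved scratch register that carries no arguments in the SysV ABI; presumably that is why this commit switches from r8 (which holds the fifth argument, mmap's fd) to r11. A standalone sketch of the byte sequence (the helper name is ours):

import struct

def make_x86_trampoline(target: int) -> bytes:
    # 49 BB <imm64>   movabs r11, target
    # 41 FF E3        jmp r11
    return b"\x49\xBB" + struct.pack("<Q", target) + b"\x41\xFF\xE3"

assert len(make_x86_trampoline(0)) == 13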
@@ -40,54 +50,168 @@ def install_hook(c_function, python_function):
|
||||
# hook ioctl
|
||||
ret = libc.mprotect(ctypes.c_ulong((ioctl_address.contents.value//0x1000)*0x1000), 0x2000, 7)
|
||||
assert ret == 0
|
||||
ret = libc.mprotect(ctypes.c_ulong((ctypes.addressof(orig_func)//0x1000)*0x1000), 0x3000, 7)
|
||||
assert ret == 0
|
||||
libc.memcpy(orig_func, ioctl_address.contents, 0x1000)
|
||||
libc.memcpy(ioctl_address.contents, ctypes.create_string_buffer(tramp), len(tramp))
|
||||
return orig_func
|
||||
|
||||
# *** ioctl lib end ***
|
||||
import extra.nv_gpu_driver.esc_ioctl as ESC
|
||||
import extra.nv_gpu_driver.ctrl_ioctl as CTRL
|
||||
import extra.nv_gpu_driver.class_ioctl as CLASS
|
||||
nvescs = {getattr(ESC, x):x for x in dir(ESC) if x.startswith("NV_ESC")}
|
||||
nvcmds = {getattr(CTRL, x):(x, getattr(CTRL, "struct_"+x+"_PARAMS", getattr(CTRL, "struct_"+x.replace("_CMD_", "_")+"_PARAMS", None))) for x in dir(CTRL) if \
|
||||
x.startswith("NV") and x[6:].startswith("_CTRL_") and isinstance(getattr(CTRL, x), int)}
|
||||
nvclasses = {getattr(CLASS, x):x for x in dir(CLASS) if isinstance(getattr(CLASS, x), int)}
|
||||
import tinygrad.runtime.autogen.nv_gpu as nv_gpu
|
||||
nvescs = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("NV_ESC")}
|
||||
nvcmds = {getattr(nv_gpu, x):(x, getattr(nv_gpu, "struct_"+x+"_PARAMS", getattr(nv_gpu, "struct_"+x.replace("_CMD_", "_")+"_PARAMS", None))) for x in dir(nv_gpu) if \
|
||||
x.startswith("NV") and x[6:].startswith("_CTRL_") and isinstance(getattr(nv_gpu, x), int)}
|
||||
|
||||
def get_classes():
|
||||
hdrpy = (pathlib.Path(__file__).parent.parent.parent / "tinygrad/runtime/autogen/nv_gpu.py").read_text()
|
||||
clss = re.search(r'NV01_ROOT.*?NV_SEMAPHORE_SURFACE = \(0x000000da\) # macro', hdrpy, re.DOTALL).group()
|
||||
pattern = r'([0-9a-zA-Z_]*) = +\((0x[0-9a-fA-F]+)\)'
|
||||
matches = re.findall(pattern, clss, re.MULTILINE)
|
||||
return {int(num, base=16):name for name, num in matches}
|
||||
nvclasses = get_classes()
|
||||
nvuvms = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("UVM_") and nv_gpu.__dict__.get(x+"_PARAMS")}
|
||||
nvqcmds = {int(getattr(nv_gpu, x)):x for x in dir(nv_gpu) if x[:7] in {"NVC6C0_", "NVC56F_", "NVC6B5_"} and isinstance(getattr(nv_gpu, x), int)}
|
||||
|
||||
global_ioctl_id = 0
|
||||
gpus_user_modes = []
|
||||
gpus_mmio = []
|
||||
gpus_fifo = []
|
||||
|
||||
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
|
||||
def ioctl(fd, request, argp):
|
||||
global global_ioctl_id, gpus_user_modes, gpus_mmio
|
||||
global_ioctl_id += 1
|
||||
st = time.perf_counter()
|
||||
ret = libc.syscall(IOCTL_SYSCALL, ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp))
|
||||
et = time.perf_counter()-st
|
||||
fn = os.readlink(f"/proc/self/fd/{fd}")
|
||||
#print(f"ioctl {request:8x} {fn:20s}")
|
||||
idir, size, itype, nr = (request>>30), (request>>16)&0x3FFF, (request>>8)&0xFF, request&0xFF
|
||||
if itype == ord(ESC.NV_IOCTL_MAGIC):
|
||||
if nr == ESC.NV_ESC_RM_CONTROL:
|
||||
s = get_struct(argp, ESC.NVOS54_PARAMETERS)
|
||||
print(f"#{global_ioctl_id}: ", end="")
|
||||
if itype == ord(nv_gpu.NV_IOCTL_MAGIC):
|
||||
if nr == nv_gpu.NV_ESC_RM_CONTROL:
|
||||
s = get_struct(argp, nv_gpu.NVOS54_PARAMETERS)
|
||||
if s.cmd in nvcmds:
|
||||
name, struc = nvcmds[s.cmd]
|
||||
if struc is not None:
|
||||
ss = get_struct(s.params, struc)
|
||||
print("NV_ESC_RM_CONTROL ", name, format_struct(ss))
|
||||
else:
|
||||
print("NV_ESC_RM_CONTROL ", name)
|
||||
print(f"NV_ESC_RM_CONTROL cmd={name:30s} hClient={s.hClient}, hObject={s.hObject}, flags={s.flags}, params={s.params}, paramsSize={s.paramsSize}, status={s.status}")
|
||||
if struc is not None: dump_struct(get_struct(s.params, struc))
|
||||
elif hasattr(nv_gpu, name+"_PARAMS"): dump_struct(get_struct(argp, getattr(nv_gpu, name+"_PARAMS")))
|
||||
elif name == "NVA06C_CTRL_CMD_GPFIFO_SCHEDULE": dump_struct(get_struct(argp, nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS))
|
||||
else:
|
||||
print("unhandled cmd", hex(s.cmd))
|
||||
#format_struct(s)
|
||||
#print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s)))
|
||||
elif nr == ESC.NV_ESC_RM_ALLOC:
|
||||
s = get_struct(argp, ESC.NVOS21_PARAMETERS)
|
||||
print(f"NV_ESC_RM_ALLOC class: {nvclasses[s.hClass]:30s}")
|
||||
elif nr == ESC.NV_ESC_RM_MAP_MEMORY:
|
||||
# format_struct(s)
|
||||
# print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s)))
|
||||
elif nr == nv_gpu.NV_ESC_RM_ALLOC:
|
||||
s = get_struct(argp, nv_gpu.NVOS21_PARAMETERS)
|
||||
print(f"NV_ESC_RM_ALLOC hClass={nvclasses.get(s.hClass, 'unk'):30s}, hRoot={s.hRoot}, hObjectParent={s.hObjectParent}, pAllocParms={s.pAllocParms}, hObjectNew={s.hObjectNew}")
|
||||
if s.pAllocParms is not None:
|
||||
if s.hClass == nv_gpu.NV01_DEVICE_0: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV0080_ALLOC_PARAMETERS))
|
||||
if s.hClass == nv_gpu.FERMI_VASPACE_A: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS))
|
||||
if s.hClass == nv_gpu.NV50_MEMORY_VIRTUAL: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
|
||||
if s.hClass == nv_gpu.NV1_MEMORY_USER: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
|
||||
if s.hClass == nv_gpu.NV1_MEMORY_SYSTEM: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
|
||||
if s.hClass == nv_gpu.AMPERE_CHANNEL_GPFIFO_A:
|
||||
sx = get_struct(s.pAllocParms, nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS)
|
||||
dump_struct(sx)
|
||||
gpus_fifo.append((sx.gpFifoOffset, sx.gpFifoEntries))
|
||||
if s.hClass == nv_gpu.KEPLER_CHANNEL_GROUP_A: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS))
|
||||
if s.hClass == nv_gpu.TURING_USERMODE_A: gpus_user_modes.append(s.hObjectNew)
|
||||
elif nr == nv_gpu.NV_ESC_RM_MAP_MEMORY:
|
||||
# nv_ioctl_nvos33_parameters_with_fd
|
||||
s = get_struct(argp, ESC.NVOS33_PARAMETERS)
|
||||
print(f"NV_ESC_RM_MAP_MEMORY {s.pLinearAddress:x}")
|
||||
s = get_struct(argp, nv_gpu.NVOS33_PARAMETERS)
|
||||
print(f"NV_ESC_RM_MAP_MEMORY hClient={s.hClient}, hDevice={s.hDevice}, hMemory={s.hMemory}, length={s.length} flags={s.flags} pLinearAddress={s.pLinearAddress}")
|
||||
elif nr == nv_gpu.NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO:
|
||||
s = get_struct(argp, nv_gpu.NVOS56_PARAMETERS)
|
||||
print(f"NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO hClient={s.hClient}, hDevice={s.hDevice}, hMemory={s.hMemory}, pOldCpuAddress={s.pOldCpuAddress} pNewCpuAddress={s.pNewCpuAddress} status={s.status}")
|
||||
elif nr == nv_gpu.NV_ESC_RM_ALLOC_MEMORY:
|
||||
s = get_struct(argp, nv_gpu.nv_ioctl_nvos02_parameters_with_fd)
|
||||
print(f"NV_ESC_RM_ALLOC_MEMORY fd={s.fd}, hRoot={s.params.hRoot}, hObjectParent={s.params.hObjectParent}, hObjectNew={s.params.hObjectNew}, hClass={s.params.hClass}, flags={s.params.flags}, pMemory={s.params.pMemory}, limit={s.params.limit}, status={s.params.status}")
|
||||
elif nr == nv_gpu.NV_ESC_ALLOC_OS_EVENT:
|
||||
s = get_struct(argp, nv_gpu.nv_ioctl_nvos02_parameters_with_fd)
|
||||
elif nr == nv_gpu.NV_ESC_REGISTER_FD:
|
||||
s = get_struct(argp, nv_gpu.nv_ioctl_register_fd_t)
|
||||
print(f"NV_ESC_REGISTER_FD fd={s.ctl_fd}")
|
||||
elif nr in nvescs:
|
||||
print(nvescs[nr])
|
||||
else:
|
||||
print("unhandled NR", nr)
|
||||
#print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else "")
|
||||
elif fn.endswith("nvidia-uvm"):
|
||||
print(f"{nvuvms.get(request, f'UVM UNKNOWN {request=}')}")
|
||||
if nvuvms.get(request) is not None: dump_struct(get_struct(argp, getattr(nv_gpu, nvuvms.get(request)+"_PARAMS")))
|
||||
if nvuvms.get(request) == "UVM_MAP_EXTERNAL_ALLOCATION":
|
||||
st = get_struct(argp, getattr(nv_gpu, nvuvms.get(request)+"_PARAMS"))
|
||||
for i in range(st.gpuAttributesCount):
|
||||
print("perGpuAttributes[{i}] = ", end="")
|
||||
dump_struct(st.perGpuAttributes[i])
|
||||
print("ok")
|
||||
|
||||
if getenv("IOCTL") >= 2: print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", fn)
|
||||
return ret
|
||||
|
||||
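The hook above classifies requests by unpacking the standard Linux ioctl encoding: direction in the top two bits, argument-struct size in the next 14, then the magic byte and the command number. A hedged standalone restatement of that decode (function name is ours):

def decode_ioctl_request(request: int):
    idir  = (request >> 30) & 0x3      # _IOC direction bits (read/write)
    size  = (request >> 16) & 0x3FFF   # sizeof the argument struct
    itype = (request >> 8) & 0xFF      # magic byte, ord('F') for the nvidia nodes
    nr    = request & 0xFF             # command number, e.g. NV_ESC_RM_CONTROL
    return idir, size, itype, nr

nv_iowr() later in ops_nv.py packs the same fields in the opposite direction: direction 3 (read/write), the struct size, ord('F') and the escape number.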
@ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
|
||||
def _mmap(addr, length, prot, flags, fd, offset):
|
||||
mmap_type = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
|
||||
orig_mmap = mmap_type(ctypes.addressof(orig_mmap_mv))
|
||||
ret = orig_mmap(addr, length, prot, flags, fd, offset)
|
||||
# ll = os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else ""
|
||||
print(f"mmap {addr=}, {length=}, {prot=}, {flags=}, {fd=}, {offset=} {ret=}")
|
||||
return ret
|
||||
|
||||
install_hook(libc.ioctl, ioctl)
|
||||
if getenv("IOCTL") >= 3: orig_mmap_mv = install_hook(libc.mmap, _mmap)
|
||||
|
||||
import collections
|
||||
old_gpputs = collections.defaultdict(int)
|
||||
def _dump_gpfifo(mark):
|
||||
print("_dump_gpfifo:", mark)
|
||||
for start,size in gpus_fifo:
|
||||
gpfifo_controls = nv_gpu.AmpereAControlGPFifo.from_address(start+size*8)
|
||||
gpfifo = to_mv(start, gpfifo_controls.GPPut * 8).cast("Q")
|
||||
if old_gpputs[start] == gpfifo_controls.GPPut: continue
|
||||
|
||||
print(f"gpfifo {start}: {gpfifo_controls.GPPut=}")
|
||||
for i in range(old_gpputs[start], gpfifo_controls.GPPut):
|
||||
addr = ((gpfifo[i % size] & ((1 << 40)-1)) >> 2) << 2
|
||||
pckt_cnt = (gpfifo[i % size]>>42)&((1 << 20)-1)
|
||||
|
||||
print(f"\t{i}: 0x{gpfifo[i % size]:x}: addr:0x{addr:x} packets:{pckt_cnt} sync:{(gpfifo[i % size] >> 63) & 0x1} fetch:{gpfifo[i % size] & 0x1}")
|
||||
old_gpputs[start] = gpfifo_controls.GPPut
|
||||
_dump_qmd(addr, pckt_cnt)
|
||||
|
||||
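Each 64-bit GPFIFO entry decoded above points at a pushbuffer segment: the low 40 bits hold the 4-byte-aligned GPU address, the 20 bits starting at bit 42 hold the segment length in dwords, bit 63 is printed as "sync" and bit 0 as "fetch". A small sketch mirroring that decode (names are ours):

def decode_gpfifo_entry(entry: int):
    addr    = ((entry & ((1 << 40) - 1)) >> 2) << 2   # pushbuffer GPU VA
    n_words = (entry >> 42) & ((1 << 20) - 1)         # pushbuffer length in dwords
    sync    = (entry >> 63) & 0x1
    return addr, n_words, sync

HWComputeQueue.submit() in ops_nv.py below encodes its ring entries with exactly this address/length layout.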
import types
|
||||
def _dump_qmd(address, packets):
|
||||
gpfifo = to_mv(address, packets * 4).cast("I")
|
||||
|
||||
i = 0
|
||||
while i < packets:
|
||||
dat = gpfifo[i]
|
||||
typ = (dat>>28) & 0xF
|
||||
if typ == 0: break
|
||||
size = (dat>>16) & 0xFFF
|
||||
subc = (dat>>13) & 7
|
||||
mthd = (dat<<2) & 0x7FFF
|
||||
method_name = nvqcmds.get(mthd, f"unknown method #{mthd}")
|
||||
print(f"\t\t{method_name}, {typ=} {size=} {subc=} {mthd=}")
|
||||
for j in range(size): print(f"\t\t\t{j}: {gpfifo[i+j+1]} | 0x{gpfifo[i+j+1]:x}")
|
||||
if mthd == 792:
|
||||
for x in dir(nv_gpu):
|
||||
if x.startswith("NVC6C0_QMDV03_00_"):
|
||||
vv = getattr(nv_gpu, x)
|
||||
bits = None
|
||||
if isinstance(vv, tuple) and len(vv) == 2:
|
||||
bits = vv
|
||||
if isinstance(vv, types.FunctionType):
|
||||
bits = vv(0)
|
||||
|
||||
if bits is not None:
|
||||
res = 0
|
||||
for bt in range(bits[1], bits[0]+1): res |= ((gpfifo[i + 3 + bt // 32] >> (bt % 32)) & 0x1) << (bt - bits[1])
|
||||
if res != 0: print(f"{x}, {hex(res)} | {bin(res)}")
|
||||
|
||||
const_addr = gpfifo[i+35] + ((gpfifo[i+36] & 0xffff) << 32)
|
||||
const_len = ((gpfifo[i+36] >> 19))
|
||||
# hexdump(to_mv(const_addr, const_len))
|
||||
|
||||
i += size + 1
|
||||
|
||||
# IOCTL=1 PTX=1 CUDA=1 python3 test/test_ops.py TestOps.test_tiny_add
|
||||
@@ -111,7 +111,7 @@ def beam_search(lin:Linearizer, rawbufs:List[Buffer], amt:int, allow_test_size=T
|
||||
beam: List[Tuple[Linearizer, float]] = []
|
||||
seen_libs = set()
|
||||
|
||||
default_parallel, min_progress_micros = 1 if lin.opts.device in {"CUDA", "HSA", "KFD"} else 0, getenv("BEAM_MIN_PROGRESS",0.01)
|
||||
default_parallel, min_progress_micros = 1 if lin.opts.device in {"CUDA", "HSA", "KFD", "NV"} else 0, getenv("BEAM_MIN_PROGRESS",0.01)
|
||||
if beam_pool is None and getenv("PARALLEL", default_parallel):
|
||||
beam_pool = multiprocessing.get_context("spawn").Pool(multiprocessing.cpu_count(), _init_worker, (), getenv("BEAM_MAX_TASKS_PER_CHILD", 16))
|
||||
|
||||
|
||||
File diff suppressed because it is too large
541 tinygrad/runtime/ops_nv.py Normal file
@@ -0,0 +1,541 @@
|
||||
from __future__ import annotations
|
||||
import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time
|
||||
from typing import Tuple, List, Any
|
||||
from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
|
||||
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG
|
||||
from tinygrad.renderer.cstyle import CUDARenderer
|
||||
from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes
|
||||
import tinygrad.runtime.autogen.cuda as cuda
|
||||
import tinygrad.runtime.autogen.nv_gpu as nv_gpu
|
||||
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
|
||||
|
||||
libc = ctypes.CDLL("libc.so.6")
|
||||
libc.memset.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int]
|
||||
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
|
||||
libc.mmap.restype = ctypes.c_void_p
|
||||
libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
|
||||
libc.munmap.restype = ctypes.c_int
|
||||
|
||||
QMD_SIZE = (8 << 8)
|
||||
|
||||
def nv_iowr(fd, nr, args):
|
||||
ret = fcntl.ioctl(fd, (3 << 30) | (ctypes.sizeof(args) & 0x1FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF), args)
|
||||
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
|
||||
|
||||
def rm_alloc(fd, clss, root, parant, params):
|
||||
made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parant, hClass=clss,
|
||||
pAllocParms=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
|
||||
nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
|
||||
if made.status != 0: raise RuntimeError(f"rm_alloc returned {made.status}")
|
||||
return made
|
||||
|
||||
def rm_control(fd, cmd, client, obj, params):
|
||||
made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params),
|
||||
params=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if params is not None else None) # type: ignore
|
||||
nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
|
||||
if made.status != 0: raise RuntimeError(f"rm_control returned {made.status}")
|
||||
return made
|
||||
|
||||
def uvm_ioctl(cmd, sttyp, fd, **kwargs):
|
||||
ret = fcntl.ioctl(fd, cmd, made:=sttyp(**kwargs))
|
||||
if ret != 0: raise RuntimeError(f"uvm_ioctl returned {ret}")
|
||||
if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl struct returned {made.rmStatus}")
|
||||
return made
|
||||
|
||||
def make_uvm_type():
|
||||
fxns = {name.replace("UVM_", "").lower():
|
||||
functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
|
||||
for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")}
|
||||
return type("NVUVM", (object, ), fxns)
|
||||
uvm = make_uvm_type()
|
||||
|
||||
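make_uvm_type() above turns every UVM_* ioctl in the autogen bindings into a bound helper, with the method name derived from the constant name. Illustratively, these two calls are equivalent (the constant and params struct exist in nv_gpu.py precisely because the method does):

uvm.create_external_range(fd_uvm, base=va_base, length=size)
# ...is shorthand for:
uvm_ioctl(nv_gpu.UVM_CREATE_EXTERNAL_RANGE, nv_gpu.UVM_CREATE_EXTERNAL_RANGE_PARAMS,
          fd_uvm, base=va_base, length=size)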
def make_qmd_struct_type():
|
||||
fields = []
|
||||
bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
|
||||
bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
|
||||
bits = sorted(bits, key=lambda x: x[1][1])
|
||||
for i,(name, data) in enumerate(bits):
|
||||
if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
|
||||
fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
|
||||
return init_c_struct_t(tuple(fields))
|
||||
qmd_struct_t = make_qmd_struct_type()
|
||||
assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
|
||||
|
||||
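make_qmd_struct_type() sorts every (hi, lo) field tuple by bit position and emits ctypes bitfields (with reserved padding for the gaps), so the whole 0x100-byte QMD can be filled by keyword and then streamed into the pushbuffer as raw dwords, which is what exec() does below. A tiny illustration (field names follow the lowercased macro names):

qmd = qmd_struct_t(qmd_major_version=3, cta_raster_width=4)
raw = to_mv(ctypes.addressof(qmd), ctypes.sizeof(qmd)).cast("I")
assert len(raw) == 0x40   # 64 dwords == 2048 bits, matching the MW() ranges above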
def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)
|
||||
def nvdata64(data): return (data >> 32, data & 0xFFFFFFFF)
|
||||
def nvdata64_le(data): return (data & 0xFFFFFFFF, data >> 32)
|
||||
|
||||
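nvmethod() packs a pushbuffer method header: header kind in bits 28 and up, data-word count at bit 16, subchannel at bit 13, and the method offset divided by 4 in the low bits (this file uses typ=2 for ordinary headers and typ=6 for the inline-data burst). For illustration, the first command HWComputeQueue.copy_from_cpu() emits expands to something like this (the GPU address is a made-up example):

hdr = nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2)   # subchannel 1, two data words
cmd = [hdr, *nvdata64(0x7f_0000_1000)]                 # upper 32 bits, then lower 32 bits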
class NVCompiler(Compiler):
|
||||
compiler_opts = CompilerOptions("NV", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)
|
||||
def __init__(self, arch:str):
|
||||
self.arch = arch
|
||||
NVCompiler.compiler_opts = NVCompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
|
||||
cuda_check(cuda.nvrtcVersion((nvrtcMajor := ctypes.c_int()), (nvrtcMinor := ctypes.c_int())))
|
||||
self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
|
||||
if (nvrtcMajor.value, nvrtcMinor.value) >= (12, 4): self.compile_options.append("--minimal")
|
||||
super().__init__(f"compile_nv_{self.arch}")
|
||||
def render(self, name:str, uops) -> str: return CUDARenderer(name, uops)
|
||||
def compile(self, src:str) -> bytes:
|
||||
cuda_check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
|
||||
status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
|
||||
|
||||
if status != 0:
|
||||
raise RuntimeError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, cuda_check).decode()}")
|
||||
return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)
|
||||
|
||||
class HWComputeQueue:
|
||||
def __init__(self): self.q = []
|
||||
def copy_from_cpu(self, gpuaddr, data):
|
||||
self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
|
||||
self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]
|
||||
self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
|
||||
self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6)] + [x for x in data]
|
||||
return self
|
||||
|
||||
def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), completion_signal=None):
|
||||
prg.qmd.cta_raster_width, prg.qmd.cta_raster_height, prg.qmd.cta_raster_depth = global_size
|
||||
prg.qmd.cta_thread_dimension0, prg.qmd.cta_thread_dimension1, prg.qmd.cta_thread_dimension2 = local_size
|
||||
prg.qmd.constant_buffer_addr_lower_0 = (kernargs + QMD_SIZE) & 0xffffffff
|
||||
prg.qmd.constant_buffer_addr_upper_0 = (kernargs + QMD_SIZE) >> 32
|
||||
self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
|
||||
self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_INLINE_QMD_ADDRESS_A, 0x42), *nvdata64(kernargs >> 8)]
|
||||
self.q += [x for x in to_mv(ctypes.addressof(prg.qmd), ctypes.sizeof(prg.qmd)).cast("I")]
|
||||
|
||||
if completion_signal is not None: self.signal(completion_signal)
|
||||
return self
|
||||
|
||||
def wait(self, signal, value=0):
|
||||
self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal).contents)), *nvdata64_le(value),
|
||||
(3 << 0) | (1 << 12) | (1 << 24)] # ACQUIRE | ACQUIRE_SWITCH_TSG | PAYLOAD_SIZE_64BIT
|
||||
return self
|
||||
|
||||
def signal(self, signal, value=0, timestamp=False):
|
||||
self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal).contents)), *nvdata64_le(value),
|
||||
(1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
|
||||
self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
|
||||
return self
|
||||
|
||||
def submit(self, dev:NVDevice):
|
||||
assert len(self.q) < (1 << 21)
|
||||
self.signal(dev.compute_progress_signal, dev.compute_put_value + 1)
|
||||
for i,packet in enumerate(self.q): dev.cmdq[dev.cmdq_wptr//4 + i] = packet
|
||||
fifo_entry = dev.compute_put_value % dev.compute_gpfifo_entries
|
||||
dev.compute_gpu_ring[fifo_entry] = ((dev.cmdq_page.base+dev.cmdq_wptr)//4 << 2) | (len(self.q) << 42) | (1 << 41)
|
||||
dev.compute_gpu_ring_controls.GPPut = (dev.compute_put_value + 1) % dev.compute_gpfifo_entries
|
||||
dev.compute_put_value += 1
|
||||
dev.gpu_mmio[0x90 // 4] = dev.compute_gpfifo_token
|
||||
dev.cmdq_wptr += len(self.q) * 4
|
||||
|
||||
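submit() above bumps compute_put_value and queues a semaphore release of that value to compute_progress_signal, so the CPU can tell when the GPU has drained the queue by polling that 64-bit slot. The _wait_signal helper referenced later is not part of this hunk; a hypothetical sketch of such a wait, assuming the signal is a 64-bit ("Q") memoryview as the queue methods imply:

import time

def wait_signal_cpu(signal, value, timeout=10.0):
    # signal[0] is the value the GPU releases; signal[1] is used as a timestamp elsewhere
    deadline = time.monotonic() + timeout
    while signal[0] < value:
        if time.monotonic() > deadline: raise RuntimeError("GPU semaphore wait timed out")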
class HWCopyQueue:
|
||||
def __init__(self): self.q = []
|
||||
|
||||
def copy(self, dest, src, copy_size):
|
||||
self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *nvdata64(src), *nvdata64(dest)]
|
||||
self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
|
||||
self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
|
||||
return self
|
||||
|
||||
def wait(self, signal, value=0):
|
||||
self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal).contents)), value, 0x0,
|
||||
(3 << 0) | (1 << 12) | (1 << 24)] # ACQUIRE | ACQUIRE_SWITCH_TSG | PAYLOAD_SIZE_64BIT
|
||||
return self
|
||||
|
||||
def signal(self, signal, value=0, timestamp=False):
|
||||
self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal).contents)), *nvdata64_le(value),
|
||||
(1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
|
||||
self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
|
||||
return self
|
||||
|
||||
def submit(self, dev:NVDevice):
|
||||
self.signal(dev.dma_progress_signal, dev.dma_put_value + 1)
|
||||
for i,packet in enumerate(self.q): dev.cmdq[dev.cmdq_wptr//4 + i] = packet
|
||||
fifo_entry = dev.dma_put_value % dev.dma_gpfifo_entries
|
||||
dev.dma_gpu_ring[fifo_entry] = ((dev.cmdq_page.base+dev.cmdq_wptr)//4 << 2) | (len(self.q) << 42)
|
||||
dev.dma_gpu_ring_controls.GPPut = (dev.dma_put_value + 1) % dev.dma_gpfifo_entries
|
||||
dev.dma_put_value += 1
|
||||
dev.gpu_mmio[0x90 // 4] = dev.dma_gpfifo_token
|
||||
dev.cmdq_wptr += len(self.q) * 4
|
||||
|
||||
SHT_PROGBITS, SHT_NOBITS, SHF_ALLOC, SHF_EXECINSTR = 0x1, 0x8, 0x2, 0x4
|
||||
class NVProgram:
|
||||
def __init__(self, device:NVDevice, name:str, lib:bytes):
|
||||
self.device, self.name, self.lib = device, name, lib
|
||||
if DEBUG >= 6:
|
||||
try:
|
||||
fn = (pathlib.Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
|
||||
with open(fn + ".cubin", "wb") as f: f.write(lib)
|
||||
print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8'))
|
||||
except Exception as e: print("failed to disasm cubin", str(e))
|
||||
|
||||
_phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
|
||||
sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
|
||||
shstrtab = memoryview(bytearray(self.lib[sections[_shstrndx][4]:sections[_shstrndx][4]+sections[_shstrndx][5]]))
|
||||
|
||||
self.shmem_usage = 0
|
||||
constant_buffers_data = {}
|
||||
for sh_name, sh_type, sh_flags, _, sh_offset, sh_size, _, sh_info, _ in sections:
|
||||
section_name = shstrtab[sh_name:].tobytes().split(b'\0', 1)[0].decode('utf-8')
|
||||
if sh_type == SHT_NOBITS and sh_flags & SHF_ALLOC: self.shmem_usage = sh_size
|
||||
elif sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC and sh_flags & SHF_EXECINSTR:
|
||||
self.program = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
|
||||
self.registers_usage = sh_info >> 24
|
||||
if match := re.match(r'\.nv\.constant(\d+)', section_name):
|
||||
constant_buffers_data[int(match.group(1))] = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
|
||||
if section_name == ".nv.info":
|
||||
section_data = memoryview(bytearray(self.lib[sh_offset:sh_offset+sh_size])).cast("I")
|
||||
for i in range(sh_size // 12):
|
||||
if section_data[i * 3 + 0] & 0xffff == 0x1204 and section_data[i * 3 + 2] + 0x240 > self.device.slm_per_thread:
|
||||
raise RuntimeError("too high local memory")
|
||||
|
||||
# Load program and constant buffers (if any)
|
||||
self.lib_sz = round_up(round_up(self.program.nbytes, 128) + sum([round_up(x.nbytes, 128) for i,x in constant_buffers_data.items()]), 0x1000)
|
||||
self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
|
||||
for st in range(0, len(self.program), 4096):
|
||||
HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4096]).submit(self.device)
|
||||
|
||||
self.constbuffer_0 = [0] * 88
|
||||
self.constbuffer_0[6:12] = [*nvdata64_le(self.device.shared_mem_window), *nvdata64_le(self.device.local_mem_window), *nvdata64_le(0xfffdc0)]
|
||||
|
||||
smem_config = min(shmem_conf * 1024 for shmem_conf in [8, 16, 32, 64, 96] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
|
||||
self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
|
||||
invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
|
||||
cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
|
||||
shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
|
||||
max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
|
||||
barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=0x10, sass_version=0x89,
|
||||
program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32,
|
||||
program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
|
||||
constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)
|
||||
|
||||
# constant buffer 0 is filled for each program, no need to copy it from elf (it's just zeroes)
|
||||
if 0 in constant_buffers_data: constant_buffers_data.pop(0)
|
||||
|
||||
off = round_up(self.program.nbytes, 128)
|
||||
for i,data in constant_buffers_data.items():
|
||||
self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (self.lib_gpu.base + off) >> 32)
|
||||
self.qmd.__setattr__(f'constant_buffer_addr_lower_{i}', (self.lib_gpu.base + off) & 0xffffffff)
|
||||
self.qmd.__setattr__(f'constant_buffer_size_shifted4_{i}', data.nbytes)
|
||||
self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1)
|
||||
|
||||
HWComputeQueue().copy_from_cpu(self.lib_gpu.base + off, data).submit(self.device)
|
||||
off += round_up(data.nbytes, 128)
|
||||
self.device.synchronize()
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_sz)
|
||||
|
||||
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
|
||||
kernargs_size = round_up(QMD_SIZE + 0x160 + len(args) * 8 + len(vals) * 4, 1 << 8)
|
||||
if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - kernargs_size):
|
||||
self.device.kernargs_ptr = self.device.kernargs_page.base
|
||||
|
||||
kernargs = [arg_half for arg in args for arg_half in nvdata64_le(arg.base)] + [val for val in vals]
|
||||
kernargs_ptr = self.device.kernargs_ptr
|
||||
self.device.kernargs_ptr += kernargs_size
|
||||
|
||||
if wait: st, en = self.device._get_signal(), self.device._get_signal()
|
||||
queue = HWComputeQueue()
|
||||
queue.wait(self.device.dma_progress_signal, self.device.dma_put_value)
|
||||
queue.wait(self.device.compute_progress_signal, self.device.compute_put_value)
|
||||
if wait: queue.signal(st, timestamp=True)
|
||||
queue.copy_from_cpu(kernargs_ptr + QMD_SIZE, self.constbuffer_0 + kernargs)
|
||||
queue.exec(self, kernargs_ptr, global_size, local_size)
|
||||
if wait: queue.signal(en, timestamp=True)
|
||||
queue.submit(self.device)
|
||||
if wait:
|
||||
self.device._wait_signal(self.device.compute_progress_signal, self.device.compute_put_value)
|
||||
return (en[1]-st[1]) / 1e9
|
||||
|
||||
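__call__ above carves each launch's argument region out of kernargs_page: the inline QMD sits at the start, constant buffer 0 (the 88-word window table set up in __init__) follows at +QMD_SIZE, and the raw kernel arguments come right after it, which is why constant_buffer_addr_*_0 points at kernargs + QMD_SIZE. The size computation, restated on its own (mirrors the code; the helper name is ours):

def launch_kernargs_size(n_bufs: int, n_vals: int) -> int:
    # QMD (2KB) + constbuffer_0 (88*4 = 0x160 bytes) + 8 bytes per buffer arg + 4 per int,
    # rounded up to 256 bytes, presumably since SET_INLINE_QMD_ADDRESS takes the address >> 8
    return round_up(QMD_SIZE + 0x160 + n_bufs * 8 + n_vals * 4, 1 << 8)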
class NVAllocator(LRUAllocator):
|
||||
def __init__(self, device:NVDevice):
|
||||
self.device = device
|
||||
super().__init__()
|
||||
|
||||
def _alloc(self, size:int, options:BufferOptions):
|
||||
if options.host: return self.device._gpu_host_alloc(size)
|
||||
else: return self.device._gpu_alloc(size, map_to_all_gpus=True)
|
||||
|
||||
def _free(self, gpumem, options:BufferOptions):
|
||||
NVDevice.synchronize_system()
|
||||
if options.host: self.device._gpu_host_free(gpumem)
|
||||
else: self.device._gpu_free(gpumem)
|
||||
|
||||
def copyin(self, dest, src: memoryview):
|
||||
host_mem = self.alloc(src.nbytes, BufferOptions(host=True))
|
||||
self.device.pending_copyin.append((host_mem, src.nbytes, BufferOptions(host=True)))
|
||||
ctypes.memmove(host_mem.base, from_mv(src), src.nbytes)
|
||||
HWCopyQueue().copy(dest.base, host_mem.base, src.nbytes).submit(self.device)
|
||||
self.device.synchronize()
|
||||
|
||||
def copyout(self, dest:memoryview, src):
|
||||
NVDevice.synchronize_system()
|
||||
host_mem = self.alloc(dest.nbytes, BufferOptions(host=True))
|
||||
self.device.pending_copyin.append((host_mem, dest.nbytes, BufferOptions(host=True)))
|
||||
HWCopyQueue().copy(host_mem.base, src.base, dest.nbytes).submit(self.device)
|
||||
self.device.synchronize()
|
||||
ctypes.memmove(from_mv(dest), host_mem.base, dest.nbytes)
|
||||
|
||||
def transfer(self, dest, src, sz:int, src_dev=None, dest_dev=None):
|
||||
queue = HWCopyQueue()
|
||||
queue.wait(src_dev.dma_progress_signal, src_dev.dma_put_value)
|
||||
queue.wait(src_dev.compute_progress_signal, src_dev.compute_put_value)
|
||||
queue.wait(dest_dev.dma_progress_signal, dest_dev.dma_put_value)
|
||||
queue.wait(dest_dev.compute_progress_signal, dest_dev.compute_put_value)
|
||||
queue.copy(dest.base, src.base, sz).submit(src_dev)
|
||||
HWCopyQueue().wait(src_dev.dma_progress_signal, src_dev.dma_put_value).submit(dest_dev)
|
||||
dest_dev.synchronize()
|
||||
|
||||
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
|
||||
class NVDevice(Compiled):
|
||||
root = None
|
||||
fd_ctl: int = -1
|
||||
fd_uvm: int = -1
|
||||
gpus_info = None
|
||||
signals_page:Any = None
|
||||
signal_number: int = 32
|
||||
uvm_vaddr: int = 0x1000000000
|
||||
host_object_enumerator: int = 0x1000
|
||||
devices: List[NVDevice] = []
|
||||
|
||||
def _new_gpu_fd(self):
|
||||
fd_dev = os.open(f"/dev/nvidia{self.device_id}", os.O_RDWR | os.O_CLOEXEC)
|
||||
nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
|
||||
return fd_dev
|
||||
|
||||
def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
|
||||
fd_dev = self._new_gpu_fd() if not system else os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
|
||||
made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
|
||||
params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags))
|
||||
nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
|
||||
if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {made.params.status}")
|
||||
return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
|
||||
|
||||
def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_to_all_gpus=False, map_flags=0):
|
||||
size = round_up(size, align:=((4 << 10) if huge_page else (2 << 20))) # TODO: need hugepage option, any speedup?
|
||||
alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
|
||||
attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
|
||||
((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27)),
|
||||
attr2=((nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_YES << 2) |
|
||||
((nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB << 20) if huge_page else 0)),
|
||||
flags=(nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE | nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM | nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED |
|
||||
nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED))
|
||||
mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_USER, self.root, self.device, alloc_params).hObjectNew
|
||||
|
||||
if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align)
|
||||
if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
|
||||
|
||||
handle = self._gpu_uvm_map(va_addr, size, mem_handle)
|
||||
if map_to_all_gpus:
|
||||
for dev in NVDevice.devices:
|
||||
if dev != self: dev._gpu_uvm_map(handle.base, handle.length, handle.hMemory, create_range=False)
|
||||
return handle
|
||||
|
||||
def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0):
|
||||
alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13,
|
||||
attr=(nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS << 27) | (nv_gpu.NVOS32_ATTR_LOCATION_PCI << 25),
|
||||
attr2=(nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2),
|
||||
flags=(nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED |
|
||||
nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED), format=6, size=size, alignment=(4<<10), offset=0, limit=size-1)
|
||||
mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_SYSTEM, self.root, self.device, alloc_params).hObjectNew
|
||||
|
||||
if va_addr is None: va_addr = self._alloc_gpu_vaddr(size)
|
||||
if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags, system=True)
|
||||
|
||||
return self._gpu_uvm_map(va_addr, size, mem_handle)
|
||||
|
||||
def _gpu_host_alloc(self, size):
|
||||
va_base = self._alloc_gpu_vaddr(sz:=round_up(size, 4 << 10))
|
||||
libc.mmap(va_base, sz, mmap.PROT_READ|mmap.PROT_WRITE, MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
|
||||
return self._map_to_gpu(va_base, sz)
|
||||
|
||||
def _gpu_free(self, mem):
|
||||
made = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
|
||||
nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, made)
|
||||
if made.status != 0: raise RuntimeError(f"_gpu_free returned {made.status}")
|
||||
uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
|
||||
|
||||
def _gpu_host_free(self, mem):
|
||||
uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
|
||||
libc.munmap(mem.base, mem.length)
|
||||
|
||||
def _map_to_gpu(self, va_base, size):
|
||||
NVDevice.host_object_enumerator += 1
|
||||
flags = ((nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) |
|
||||
(nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30))
|
||||
made = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=flags,
|
||||
hObjectNew=NVDevice.host_object_enumerator, hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=size-1), fd=-1)
|
||||
nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, made)
|
||||
if made.params.status != 0: raise RuntimeError(f"_map_to_gpu returned {made.params.status}")
|
||||
return self._gpu_uvm_map(va_base, size, made.params.hObjectNew)
|
||||
|
||||
def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True) -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
|
||||
if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
|
||||
gpu_attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(
|
||||
nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuMappingType = 1))
|
||||
return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root, hMemory=mem_handle,
|
||||
gpuAttributesCount=1, perGpuAttributes=gpu_attrs)
|
||||
|
||||
def _alloc_gpu_vaddr(self, size, alignment=(4 << 10)):
|
||||
NVDevice.uvm_vaddr = (res_va:=round_up(NVDevice.uvm_vaddr, alignment)) + size
|
||||
return res_va
|
||||
|
||||
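_alloc_gpu_vaddr above is a process-wide bump allocator over the single UVM address space shared by all NVDevice instances, starting at uvm_vaddr = 0x1000000000: addresses are aligned, handed out monotonically and never reused. A standalone sketch of the invariant (hypothetical, not the class method; round_up is the tinygrad.helpers import used above):

def bump_alloc(state: dict, size: int, alignment: int = 4 << 10) -> int:
    base = round_up(state["next"], alignment)
    state["next"] = base + size
    return base

state = {"next": 0x1000000000}
a, b = bump_alloc(state, 0x2000), bump_alloc(state, 0x2000)
assert b >= a + 0x2000   # later allocations always sit above earlier ones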
def __init__(self, device:str=""):
|
||||
if NVDevice.root is None:
|
||||
NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
|
||||
NVDevice.fd_uvm = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
|
||||
fd_uvm_2 = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
|
||||
NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
|
||||
uvm.initialize(self.fd_uvm)
|
||||
uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm)
|
||||
|
||||
NVDevice.gpus_info = (nv_gpu.nv_ioctl_card_info_t*16)()
|
||||
nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, NVDevice.gpus_info)
|
||||
|
||||
# TODO: Get classes from NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2
|
||||
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
||||
self.fd_dev = self._new_gpu_fd()
|
||||
|
||||
assert NVDevice.gpus_info[self.device_id].valid
|
||||
gpu_info = nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS(gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
|
||||
rm_control(self.fd_ctl, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2, self.root, self.root, gpu_info)
|
||||
|
||||
device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=gpu_info.deviceInstance, hClientShare=self.root,
|
||||
vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
|
||||
self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
|
||||
self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
|
||||
self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
|
||||
gpu_mmio_ptr = self._gpu_map_to_cpu(self.usermode, 0x10000, flags=2)
|
||||
self.gpu_mmio = to_mv(gpu_mmio_ptr, 0x10000).cast("I")
|
||||
|
||||
    vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
      flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
    vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew

    gpu_uuid_params = nv_gpu.NV2080_CTRL_GPU_GET_GID_INFO_PARAMS(flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
    rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO, self.root, self.subdevice, gpu_uuid_params)
    self.gpu_uuid = (ctypes.c_ubyte*16)(*[gpu_uuid_params.data[i] for i in range(16)])

    uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid))
    uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl,
                             hClient=self.root, hVaSpace=vaspace)

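    # Enable UVM peer access between this GPU and every previously initialized device.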
    for dev in self.devices:
      uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuUuidB=nv_gpu.struct_nv_uuid(uuid=dev.gpu_uuid))

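    # The 64KB signals page is shared by all devices: the first one allocates it, later ones just map it into their own VA space.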
    if NVDevice.signals_page is None: NVDevice.signals_page = self._gpu_system_alloc(0x10000, map_to_cpu=True)
    else: self._gpu_uvm_map(NVDevice.signals_page.base, NVDevice.signals_page.length, NVDevice.signals_page.hMemory, create_range=False)

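    # Create a channel group (TSG), a 2MB GPFIFO buffer shared by the compute and DMA queues below, and a subcontext (ctxshare) tied to the VA space.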
    channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
    channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew

    gpfifo = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)

    ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
    ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew

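    # Compute queue: ring entries at offset 0 of the GPFIFO buffer, with its AmpereAControlGPFifo control block right after the entries.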
    self.compute_gpfifo_entries: int = 0x10000
    self.compute_gpfifo_token: int = self._gpu_fifo_setup(gpfifo, ctxshare, channel_group, offset=0, entries=self.compute_gpfifo_entries)
    self.compute_gpu_ring: memoryview = to_mv(gpfifo.base, self.compute_gpfifo_entries * 8).cast("Q")
    self.compute_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + self.compute_gpfifo_entries * 8)
    self.compute_put_value: int = 0
    self.compute_progress_signal = NVDevice._get_signal(self.device_id * 2)

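    # DMA/copy queue: same layout at offset 0x100000 of the same GPFIFO buffer.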
    self.dma_gpfifo_entries: int = 0x10000
    self.dma_gpfifo_token: int = self._gpu_fifo_setup(gpfifo, ctxshare, channel_group, offset=0x100000, entries=self.dma_gpfifo_entries)
    self.dma_gpu_ring: memoryview = to_mv(gpfifo.base + 0x100000, self.dma_gpfifo_entries * 8).cast("Q")
    self.dma_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + 0x100000 + self.dma_gpfifo_entries * 8)
    self.dma_put_value: int = 0
    self.dma_progress_signal = NVDevice._get_signal(self.device_id * 2 + 1)

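    # Enable scheduling for the channel group so submitted GPFIFO entries actually execute.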
    en_fifo_params = nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1)
    rm_control(self.fd_ctl, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, self.root, channel_group, en_fifo_params)

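    # Host-visible staging buffers: a 2MB pushbuffer (cmdq) and a 64MB kernel-arguments arena.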
    self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
    self.cmdq: memoryview = to_mv(self.cmdq_page.base, 0x200000).cast("I")
    self.cmdq_wptr: int = 0 # in bytes

    self.kernargs_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x4000000, map_to_cpu=True)
    self.kernargs_ptr: int = self.kernargs_page.base

    self.arch: str = 'sm_89' # TODO: fix
    self.pending_copyin: List[Any] = []

    super().__init__(device, NVAllocator(self), NVCompiler(self.arch), functools.partial(NVProgram, self))

    self._cmdq_setup_compute_gpfifo()
    self._cmdq_setup_dma_gpfifo()

    NVDevice.devices.append(self)

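  # Wait until both queues have caught up to the last submitted put values, then recycle the pushbuffer and free any staged copy-in buffers.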
  def synchronize(self):
    self._wait_signal(self.compute_progress_signal, self.compute_put_value)
    self._wait_signal(self.dma_progress_signal, self.dma_put_value)
    self.cmdq_wptr = 0

    for opaque,sz,options in self.pending_copyin: self.allocator.free(opaque, sz, options)
    self.pending_copyin.clear()

  @staticmethod
  def synchronize_system():
    for d in NVDevice.devices: d.synchronize()

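  # Signals are 16-byte slots carved out of the shared signals page; the first 64-bit word is the payload that _wait_signal polls on.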
  @classmethod
  def _get_signal(self, num=None) -> memoryview:
    if num is None:
      self.signal_number += 1
      if self.signals_page and self.signal_number * 16 >= self.signals_page.length: self.signal_number = 32
      num = self.signal_number
    sig = to_mv(self.signals_page.base + num * 16, 16).cast("Q")
    sig[0] = 0
    return sig

  @classmethod
  def _wait_signal(self, signal, value=0, timeout=10000):
    start_time = time.time()
    sem_value = signal[0]
    while sem_value != value:
      sem_value = signal[0]
      if time.time() - start_time > timeout // 1000: raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")

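  # Allocate a GPFIFO channel inside the channel group, bind the compute and copy classes to it, register it with UVM, and return its work-submission token.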
  def _gpu_fifo_setup(self, gpfifo, ctxshare, channel_group, offset, entries=0x400):
    notifier = self._gpu_system_alloc(48 << 20)
    params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo.hMemory,
      gpFifoOffset=gpfifo.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
      hUserdMemory=(ctypes.c_uint32*8)(gpfifo.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
    gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
    rm_alloc(self.fd_ctl, nv_gpu.ADA_COMPUTE_A, self.root, gpfifo, None)
    rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)

    ws_token_params = nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS(workSubmitToken=-1)
    rm_control(self.fd_ctl, nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN, self.root, gpfifo, ws_token_params)
    assert ws_token_params.workSubmitToken != -1

    channel_base = self._alloc_gpu_vaddr(0x4000000)
    uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
                         hChannel=gpfifo, base=channel_base, length=0x4000000)

    return ws_token_params.workSubmitToken

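  # One-time compute setup: allocate and bind shader local memory (sized per warp/TPC) and set the shared/local memory window addresses before any kernel runs.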
  def _cmdq_setup_compute_gpfifo(self):
    self.slm_per_thread = 0x900
    bytes_per_warp = round_up(self.slm_per_thread * 32, 0x200)
    bytes_per_tpc = round_up(bytes_per_warp * 48 * 2, 0x8000)
    self.shader_local_mem = self._gpu_alloc(round_up(bytes_per_tpc * 64, 0x20000), huge_page=True, contig=True).base

    # Set window addresses so they do not collide with other allocated buffers.
    self.shared_mem_window, self.local_mem_window = 0xfe000000, 0xff000000

    queue = HWComputeQueue()
    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_OBJECT, 1), nv_gpu.ADA_COMPUTE_A]
    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2), *nvdata64(self.shader_local_mem)]
    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3), *nvdata64(bytes_per_tpc), 0x40]
    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2), *nvdata64(self.local_mem_window)]
    queue.q += [nvmethod(1, nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2), *nvdata64(self.shared_mem_window)]
    queue.submit(self)
    self.synchronize()

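  # One-time copy-queue setup: just bind the DMA copy class on subchannel 4.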
  def _cmdq_setup_dma_gpfifo(self):
    queue = HWCopyQueue()
    queue.q += [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), nv_gpu.AMPERE_DMA_COPY_B]
    queue.submit(self)
    self.synchronize()