* start

* fix err 93

* gpu

* ioctl mappings

* alloc like cuda

* semaphores

* wait for semaphores value

* start ops_nv

* very simple kernels work

* init several gpus

* qmd dumper

* dirty, but most of kernels work

* always all test_ops

* progress, more tests, stable

* test_ops passes, gpt2 works

but with a big fifo, wrapping of the fifo doesn't work; I think it's something coherency-related

* need better sync

* fix sync

* alloc2

* all tests pass!

* cleanup 1

* cleanup

* multigpu, simple transfer

* fix sync

* correct init

* nv_gpu autogen + sync bug fix

* clean extra/nv_gpu_driver

* p2p

* clean up

* remove old gen

* small fixes

* cleanup

* cleanup 2

* small fixes

* bigger queue size

* cleanups

* wait

* fixed signals for devs

* fix hang + parallel beam

* small fixes

* detect when local memory is big in kernel

* correct assert

* small fixes

* correct tls size est

* one va space

* less lines

* shorter

* save 2 lines

* save some lines

* remove type ignores

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
nimlgen
2024-04-22 18:50:20 +03:00
committed by GitHub
parent 77a3780005
commit e6227bdb15
10 changed files with 11067 additions and 3430 deletions

View File

@@ -458,8 +458,11 @@ jobs:
if: matrix.backend == 'cuda'
run: |
cp tinygrad/runtime/autogen/cuda.py /tmp/cuda.py.bak
cp tinygrad/runtime/autogen/nv_gpu.py /tmp/nv_gpu.py.bak
./autogen_stubs.sh cuda
./autogen_stubs.sh nv
diff /tmp/cuda.py.bak tinygrad/runtime/autogen/cuda.py
diff /tmp/nv_gpu.py.bak tinygrad/runtime/autogen/nv_gpu.py
- name: Verify HIP autogen
if: matrix.backend == 'hip'
run: |

View File

@@ -72,6 +72,56 @@ generate_cuda() {
python3 -c "import tinygrad.runtime.autogen.cuda"
}
# Regenerate $BASE/nv_gpu.py: run clang2py over extra/nv_gpu_driver/clc6c0qmd.h plus
# headers from the tinygrad/open-gpu-kernel-modules fork pinned at NVKERN_COMMIT_HASH,
# patch the generated text with sed, then import the module as a smoke test.
generate_nv() {
  NVKERN_COMMIT_HASH=d6b75a34094b0f56c2ccadf14e5d0bd515ed1ab6
  NVKERN_SRC=/tmp/open-gpu-kernel-modules-$NVKERN_COMMIT_HASH

  # The checkout is cached in /tmp, keyed by commit hash; only fetch when it is missing.
  if [ ! -d "$NVKERN_SRC" ]; then
    # `git -C` replaces the pushd/cd/popd dance: a failed `cd` can no longer cause the
    # hard reset to run inside the caller's repository.
    if ! { git clone https://github.com/tinygrad/open-gpu-kernel-modules "$NVKERN_SRC" &&
           git -C "$NVKERN_SRC" reset --hard "$NVKERN_COMMIT_HASH"; }; then
      # Drop a partial checkout so the directory test above does not treat it as a
      # valid cache entry on the next run.
      rm -rf "$NVKERN_SRC"
      return 1
    fi
  fi

  local out="$BASE/nv_gpu.py"

  # NOTE(review): clc56f.h is passed three times below; presumably two of them are
  # redundant (or meant to be other class headers) -- confirm against the generated
  # output before removing, since CI diffs the regenerated file.
  clang2py \
    extra/nv_gpu_driver/clc6c0qmd.h \
    "$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl0080.h" \
    "$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl2080_notification.h" \
    "$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h" \
    "$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h" \
    "$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h" \
    "$NVKERN_SRC/src/nvidia/generated/g_allclasses.h" \
    "$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc6c0.h" \
    "$NVKERN_SRC/kernel-open/nvidia-uvm/clc6b5.h" \
    "$NVKERN_SRC/kernel-open/nvidia-uvm/uvm_ioctl.h" \
    "$NVKERN_SRC/kernel-open/nvidia-uvm/uvm_linux_ioctl.h" \
    "$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv_escape.h" \
    "$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-ioctl.h" \
    "$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-ioctl-numbers.h" \
    "$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-ioctl-numa.h" \
    "$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-unix-nvos-params-wrappers.h" \
    "$NVKERN_SRC/src/common/sdk/nvidia/inc/alloc/alloc_channel.h" \
    "$NVKERN_SRC/src/common/sdk/nvidia/inc/nvos.h" \
    "$NVKERN_SRC"/src/common/sdk/nvidia/inc/ctrl/ctrl0000/*.h \
    "$NVKERN_SRC"/src/common/sdk/nvidia/inc/ctrl/ctrl0080/*.h \
    "$NVKERN_SRC"/src/common/sdk/nvidia/inc/ctrl/ctrl2080/*.h \
    "$NVKERN_SRC"/src/common/sdk/nvidia/inc/ctrl/ctrl83de/*.h \
    "$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrlc36f.h" \
    "$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrlcb33.h" \
    "$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrla06c.h" \
    --clang-args="-include $NVKERN_SRC/src/common/sdk/nvidia/inc/nvtypes.h -I$NVKERN_SRC/src/common/inc -I$NVKERN_SRC/kernel-open/nvidia-uvm -I$NVKERN_SRC/kernel-open/common/inc -I$NVKERN_SRC/src/common/sdk/nvidia/inc -I$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include -I$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl" \
    -o "$out" -k cdefstum
  fixup "$out"

  # Post-process the generated text. The first two programs previously used a backslash
  # as the `s` delimiter (which POSIX sed does not allow and which reads like a capture
  # group); with `/` they match the same literal text.
  sed -i "s/(0000000001)/1/g" "$out" # literal "(0000000001)" -> "1"
  sed -i "s/import ctypes/import ctypes, os/g" "$out" # extend the import line with os
  sed -i 's/#\?\s\([A-Za-z0-9_]\+\) = MW ( \([0-9]\+\) : \([0-9]\+\) )/\1 = (\2 , \3)/' "$out" # NVC6C0_QMDV03_00 processing
  # Uncomment the "# def NVC6C0_QMD...:" lines and turn their "# return MW(a:b)" bodies into tuples.
  sed -i 's/#\sdef NVC6C0_QMD\([A-Za-z0-9_()]\+\):/def NVC6C0_QMD\1:/' "$out"
  sed -i 's/#\s*return MW(\([0-9i()*+]\+\):\([0-9i()*+]\+\))/ return (\1 , \2)/' "$out"
  sed -i 's/#\?\s*\(.*\)\s*=\s*\(NV\)\?BIT\(32\)\?\s*(\s*\([0-9]\+\)\s*)/\1 = (1 << \4)/' "$out" # name = BIT(x) -> name = (1 << x)
  sed -i "s/UVM_\([A-Za-z0-9_]\+\) = \['i', '(', '\([0-9]\+\)', ')'\]/UVM_\1 = \2/" "$out" # UVM_name = ['i', '(', '<num>', ')'] -> UVM_name = <num>

  # Smoke test: the patched module must at least import.
  python3 -c "import tinygrad.runtime.autogen.nv_gpu"
}
generate_hsa() {
clang2py \
/opt/rocm/include/hsa/hsa.h \
@@ -105,6 +155,7 @@ elif [ "$1" == "comgr" ]; then generate_comgr
elif [ "$1" == "cuda" ]; then generate_cuda
elif [ "$1" == "hsa" ]; then generate_hsa
elif [ "$1" == "kfd" ]; then generate_kfd
elif [ "$1" == "all" ]; then generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_hsa; generate_kfd
elif [ "$1" == "nv" ]; then generate_nv
elif [ "$1" == "all" ]; then generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_hsa; generate_kfd; generate_nv
else echo "usage: $0 <type>"
fi

View File

@@ -1,258 +0,0 @@
# -*- coding: utf-8 -*-
#
# TARGET arch is: []
# WORD_SIZE is: 8
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes
# Integer constants, one per translated C preprocessor macro (the generator tagged
# every line with a trailing "# macro"). Several names are aliases that share a value.
# NOTE(review): these look like NVIDIA RM class identifiers -- confirm against the
# header they were generated from.
NV01_ROOT = 0x00000000
NV1_ROOT = 0x00000000
NV01_NULL_OBJECT = 0x00000000
NV1_NULL_OBJECT = 0x00000000
NV01_ROOT_NON_PRIV = 0x00000001
NV1_ROOT_NON_PRIV = 0x00000001
NV01_ROOT_CLIENT = 0x00000041
FABRIC_MANAGER_SESSION = 0x0000000f
NV0020_GPU_MANAGEMENT = 0x00000020
NV01_DEVICE_0 = 0x00000080
NV20_SUBDEVICE_0 = 0x00002080
NV2081_BINAPI = 0x00002081
NV2082_BINAPI_PRIVILEGED = 0x00002082
NV20_SUBDEVICE_DIAG = 0x0000208f
NV01_CONTEXT_DMA = 0x00000002
NV01_MEMORY_SYSTEM = 0x0000003e
NV1_MEMORY_SYSTEM = 0x0000003e
NV01_MEMORY_LOCAL_PRIVILEGED = 0x0000003f
NV1_MEMORY_LOCAL_PRIVILEGED = 0x0000003f
NV01_MEMORY_PRIVILEGED = 0x0000003f
NV1_MEMORY_PRIVILEGED = 0x0000003f
NV01_MEMORY_LOCAL_USER = 0x00000040
NV1_MEMORY_LOCAL_USER = 0x00000040
NV01_MEMORY_USER = 0x00000040
NV1_MEMORY_USER = 0x00000040
NV_MEMORY_EXTENDED_USER = 0x00000042
NV01_MEMORY_VIRTUAL = 0x00000070
NV01_MEMORY_SYSTEM_DYNAMIC = 0x00000070
NV1_MEMORY_SYSTEM_DYNAMIC = 0x00000070
NV_MEMORY_MAPPER = 0x000000fe
NV01_MEMORY_LOCAL_PHYSICAL = 0x000000c2
NV01_MEMORY_SYSTEM_OS_DESCRIPTOR = 0x00000071
NV01_MEMORY_DEVICELESS = 0x000090ce
NV01_MEMORY_FRAMEBUFFER_CONSOLE = 0x00000076
NV01_MEMORY_HW_RESOURCES = 0x000000b1
NV01_MEMORY_LIST_SYSTEM = 0x00000081
NV01_MEMORY_LIST_FBMEM = 0x00000082
NV01_MEMORY_LIST_OBJECT = 0x00000083
NV01_MEMORY_FLA = 0x000000f3
NV_CE_UTILS = 0x00000050
NV_MEMORY_FABRIC = 0x000000f8
FABRIC_VASPACE_A = 0x000000fc
NV_MEMORY_MULTICAST_FABRIC = 0x000000fd
IO_VASPACE_A = 0x000000f2
NV01_NULL = 0x00000030
NV1_NULL = 0x00000030
NV01_EVENT = 0x00000005
NV1_EVENT = 0x00000005
NV01_EVENT_KERNEL_CALLBACK = 0x00000078
NV1_EVENT_KERNEL_CALLBACK = 0x00000078
NV01_EVENT_OS_EVENT = 0x00000079
NV1_EVENT_OS_EVENT = 0x00000079
NV01_EVENT_WIN32_EVENT = 0x00000079
NV1_EVENT_WIN32_EVENT = 0x00000079
NV01_EVENT_KERNEL_CALLBACK_EX = 0x0000007e
NV1_EVENT_KERNEL_CALLBACK_EX = 0x0000007e
NV01_TIMER = 0x00000004
NV1_TIMER = 0x00000004
KERNEL_GRAPHICS_CONTEXT = 0x00000090
NV50_CHANNEL_GPFIFO = 0x0000506f
GF100_CHANNEL_GPFIFO = 0x0000906f
KEPLER_CHANNEL_GPFIFO_A = 0x0000a06f
UVM_CHANNEL_RETAINER = 0x0000c574
KEPLER_CHANNEL_GPFIFO_B = 0x0000a16f
MAXWELL_CHANNEL_GPFIFO_A = 0x0000b06f
PASCAL_CHANNEL_GPFIFO_A = 0x0000c06f
VOLTA_CHANNEL_GPFIFO_A = 0x0000c36f
TURING_CHANNEL_GPFIFO_A = 0x0000c46f
AMPERE_CHANNEL_GPFIFO_A = 0x0000c56f
HOPPER_CHANNEL_GPFIFO_A = 0x0000c86f
NV04_SOFTWARE_TEST = 0x0000007d
NV4_SOFTWARE_TEST = 0x0000007d
NV30_GSYNC = 0x000030f1
VOLTA_USERMODE_A = 0x0000c361
TURING_USERMODE_A = 0x0000c461
AMPERE_USERMODE_A = 0x0000c561
HOPPER_USERMODE_A = 0x0000c661
NVC371_DISP_SF_USER = 0x0000c371
NVC372_DISPLAY_SW = 0x0000c372
NVC573_DISP_CAPABILITIES = 0x0000c573
NVC673_DISP_CAPABILITIES = 0x0000c673
NVC773_DISP_CAPABILITIES = 0x0000c773
NV04_DISPLAY_COMMON = 0x00000073
NV50_DEFERRED_API_CLASS = 0x00005080
MPS_COMPUTE = 0x0000900e
NVC570_DISPLAY = 0x0000c570
NVC57A_CURSOR_IMM_CHANNEL_PIO = 0x0000c57a
NVC57B_WINDOW_IMM_CHANNEL_DMA = 0x0000c57b
NVC57D_CORE_CHANNEL_DMA = 0x0000c57d
NVC57E_WINDOW_CHANNEL_DMA = 0x0000c57e
NVC670_DISPLAY = 0x0000c670
NVC671_DISP_SF_USER = 0x0000c671
NVC67A_CURSOR_IMM_CHANNEL_PIO = 0x0000c67a
NVC67B_WINDOW_IMM_CHANNEL_DMA = 0x0000c67b
NVC67D_CORE_CHANNEL_DMA = 0x0000c67d
NVC67E_WINDOW_CHANNEL_DMA = 0x0000c67e
NVC77F_ANY_CHANNEL_DMA = 0x0000c77f
NVC770_DISPLAY = 0x0000c770
NVC771_DISP_SF_USER = 0x0000c771
NVC77D_CORE_CHANNEL_DMA = 0x0000c77d
NV9010_VBLANK_CALLBACK = 0x00009010
GF100_PROFILER = 0x000090cc
MAXWELL_PROFILER = 0x0000b0cc
MAXWELL_PROFILER_DEVICE = 0x0000b2cc
GF100_SUBDEVICE_MASTER = 0x000090e6
GF100_SUBDEVICE_INFOROM = 0x000090e7
GF100_ZBC_CLEAR = 0x00009096
GF100_DISP_SW = 0x00009072
GF100_TIMED_SEMAPHORE_SW = 0x00009074
G84_PERFBUFFER = 0x0000844c
NV50_MEMORY_VIRTUAL = 0x000050a0
NV50_P2P = 0x0000503b
NV50_THIRD_PARTY_P2P = 0x0000503c
FERMI_TWOD_A = 0x0000902d
FERMI_VASPACE_A = 0x000090f1
HOPPER_SEC2_WORK_LAUNCH_A = 0x0000cba2
GF100_HDACODEC = 0x000090ec
NVB8B0_VIDEO_DECODER = 0x0000b8b0
NVC4B0_VIDEO_DECODER = 0x0000c4b0
NVC6B0_VIDEO_DECODER = 0x0000c6b0
NVC7B0_VIDEO_DECODER = 0x0000c7b0
NVC9B0_VIDEO_DECODER = 0x0000c9b0
NVC4B7_VIDEO_ENCODER = 0x0000c4b7
NVB4B7_VIDEO_ENCODER = 0x0000b4b7
NVC7B7_VIDEO_ENCODER = 0x0000c7b7
NVC9B7_VIDEO_ENCODER = 0x0000c9b7
NVB8D1_VIDEO_NVJPG = 0x0000b8d1
NVC4D1_VIDEO_NVJPG = 0x0000c4d1
NVC9D1_VIDEO_NVJPG = 0x0000c9d1
NVB8FA_VIDEO_OFA = 0x0000b8fa
NVC6FA_VIDEO_OFA = 0x0000c6fa
NVC7FA_VIDEO_OFA = 0x0000c7fa
NVC9FA_VIDEO_OFA = 0x0000c9fa
KEPLER_INLINE_TO_MEMORY_B = 0x0000a140
FERMI_CONTEXT_SHARE_A = 0x00009067
KEPLER_CHANNEL_GROUP_A = 0x0000a06c
PASCAL_DMA_COPY_A = 0x0000c0b5
TURING_DMA_COPY_A = 0x0000c5b5
AMPERE_DMA_COPY_A = 0x0000c6b5
AMPERE_DMA_COPY_B = 0x0000c7b5
HOPPER_DMA_COPY_A = 0x0000c8b5
MAXWELL_DMA_COPY_A = 0x0000b0b5
ACCESS_COUNTER_NOTIFY_BUFFER = 0x0000c365
MMU_FAULT_BUFFER = 0x0000c369
MMU_VIDMEM_ACCESS_BIT_BUFFER = 0x0000c763
TURING_A = 0x0000c597
TURING_COMPUTE_A = 0x0000c5c0
AMPERE_A = 0x0000c697
AMPERE_COMPUTE_A = 0x0000c6c0
AMPERE_B = 0x0000c797
AMPERE_COMPUTE_B = 0x0000c7c0
ADA_A = 0x0000c997
ADA_COMPUTE_A = 0x0000c9c0
AMPERE_SMC_PARTITION_REF = 0x0000c637
AMPERE_SMC_EXEC_PARTITION_REF = 0x0000c638
AMPERE_SMC_CONFIG_SESSION = 0x0000c639
NV0092_RG_LINE_CALLBACK = 0x00000092
AMPERE_SMC_MONITOR_SESSION = 0x0000c640
HOPPER_A = 0x0000cb97
HOPPER_COMPUTE_A = 0x0000cbc0
NV40_DEBUG_BUFFER = 0x000000db
RM_USER_SHARED_DATA = 0x000000de
GT200_DEBUGGER = 0x000083de
NV40_I2C = 0x0000402c
NVA081_VGPU_CONFIG = 0x0000a081
NVA084_KERNEL_HOST_VGPU_DEVICE = 0x0000a084
NV0060_SYNC_GPU_BOOST = 0x00000060
GP100_UVM_SW = 0x0000c076
NV_EVENT_BUFFER = 0x000090cd
NV_CONFIDENTIAL_COMPUTE = 0x0000cb33
NV_COUNTER_COLLECTION_UNIT = 0x0000cbca
NV_SEMAPHORE_SURFACE = 0x000000da

# Names exported by `from <module> import *`: all 172 constants above, in sorted order.
__all__ = [
    "ACCESS_COUNTER_NOTIFY_BUFFER",
    "ADA_A",
    "ADA_COMPUTE_A",
    "AMPERE_A",
    "AMPERE_B",
    "AMPERE_CHANNEL_GPFIFO_A",
    "AMPERE_COMPUTE_A",
    "AMPERE_COMPUTE_B",
    "AMPERE_DMA_COPY_A",
    "AMPERE_DMA_COPY_B",
    "AMPERE_SMC_CONFIG_SESSION",
    "AMPERE_SMC_EXEC_PARTITION_REF",
    "AMPERE_SMC_MONITOR_SESSION",
    "AMPERE_SMC_PARTITION_REF",
    "AMPERE_USERMODE_A",
    "FABRIC_MANAGER_SESSION",
    "FABRIC_VASPACE_A",
    "FERMI_CONTEXT_SHARE_A",
    "FERMI_TWOD_A",
    "FERMI_VASPACE_A",
    "G84_PERFBUFFER",
    "GF100_CHANNEL_GPFIFO",
    "GF100_DISP_SW",
    "GF100_HDACODEC",
    "GF100_PROFILER",
    "GF100_SUBDEVICE_INFOROM",
    "GF100_SUBDEVICE_MASTER",
    "GF100_TIMED_SEMAPHORE_SW",
    "GF100_ZBC_CLEAR",
    "GP100_UVM_SW",
    "GT200_DEBUGGER",
    "HOPPER_A",
    "HOPPER_CHANNEL_GPFIFO_A",
    "HOPPER_COMPUTE_A",
    "HOPPER_DMA_COPY_A",
    "HOPPER_SEC2_WORK_LAUNCH_A",
    "HOPPER_USERMODE_A",
    "IO_VASPACE_A",
    "KEPLER_CHANNEL_GPFIFO_A",
    "KEPLER_CHANNEL_GPFIFO_B",
    "KEPLER_CHANNEL_GROUP_A",
    "KEPLER_INLINE_TO_MEMORY_B",
    "KERNEL_GRAPHICS_CONTEXT",
    "MAXWELL_CHANNEL_GPFIFO_A",
    "MAXWELL_DMA_COPY_A",
    "MAXWELL_PROFILER",
    "MAXWELL_PROFILER_DEVICE",
    "MMU_FAULT_BUFFER",
    "MMU_VIDMEM_ACCESS_BIT_BUFFER",
    "MPS_COMPUTE",
    "NV0020_GPU_MANAGEMENT",
    "NV0060_SYNC_GPU_BOOST",
    "NV0092_RG_LINE_CALLBACK",
    "NV01_CONTEXT_DMA",
    "NV01_DEVICE_0",
    "NV01_EVENT",
    "NV01_EVENT_KERNEL_CALLBACK",
    "NV01_EVENT_KERNEL_CALLBACK_EX",
    "NV01_EVENT_OS_EVENT",
    "NV01_EVENT_WIN32_EVENT",
    "NV01_MEMORY_DEVICELESS",
    "NV01_MEMORY_FLA",
    "NV01_MEMORY_FRAMEBUFFER_CONSOLE",
    "NV01_MEMORY_HW_RESOURCES",
    "NV01_MEMORY_LIST_FBMEM",
    "NV01_MEMORY_LIST_OBJECT",
    "NV01_MEMORY_LIST_SYSTEM",
    "NV01_MEMORY_LOCAL_PHYSICAL",
    "NV01_MEMORY_LOCAL_PRIVILEGED",
    "NV01_MEMORY_LOCAL_USER",
    "NV01_MEMORY_PRIVILEGED",
    "NV01_MEMORY_SYSTEM",
    "NV01_MEMORY_SYSTEM_DYNAMIC",
    "NV01_MEMORY_SYSTEM_OS_DESCRIPTOR",
    "NV01_MEMORY_USER",
    "NV01_MEMORY_VIRTUAL",
    "NV01_NULL",
    "NV01_NULL_OBJECT",
    "NV01_ROOT",
    "NV01_ROOT_CLIENT",
    "NV01_ROOT_NON_PRIV",
    "NV01_TIMER",
    "NV04_DISPLAY_COMMON",
    "NV04_SOFTWARE_TEST",
    "NV1_EVENT",
    "NV1_EVENT_KERNEL_CALLBACK",
    "NV1_EVENT_KERNEL_CALLBACK_EX",
    "NV1_EVENT_OS_EVENT",
    "NV1_EVENT_WIN32_EVENT",
    "NV1_MEMORY_LOCAL_PRIVILEGED",
    "NV1_MEMORY_LOCAL_USER",
    "NV1_MEMORY_PRIVILEGED",
    "NV1_MEMORY_SYSTEM",
    "NV1_MEMORY_SYSTEM_DYNAMIC",
    "NV1_MEMORY_USER",
    "NV1_NULL",
    "NV1_NULL_OBJECT",
    "NV1_ROOT",
    "NV1_ROOT_NON_PRIV",
    "NV1_TIMER",
    "NV2081_BINAPI",
    "NV2082_BINAPI_PRIVILEGED",
    "NV20_SUBDEVICE_0",
    "NV20_SUBDEVICE_DIAG",
    "NV30_GSYNC",
    "NV40_DEBUG_BUFFER",
    "NV40_I2C",
    "NV4_SOFTWARE_TEST",
    "NV50_CHANNEL_GPFIFO",
    "NV50_DEFERRED_API_CLASS",
    "NV50_MEMORY_VIRTUAL",
    "NV50_P2P",
    "NV50_THIRD_PARTY_P2P",
    "NV9010_VBLANK_CALLBACK",
    "NVA081_VGPU_CONFIG",
    "NVA084_KERNEL_HOST_VGPU_DEVICE",
    "NVB4B7_VIDEO_ENCODER",
    "NVB8B0_VIDEO_DECODER",
    "NVB8D1_VIDEO_NVJPG",
    "NVB8FA_VIDEO_OFA",
    "NVC371_DISP_SF_USER",
    "NVC372_DISPLAY_SW",
    "NVC4B0_VIDEO_DECODER",
    "NVC4B7_VIDEO_ENCODER",
    "NVC4D1_VIDEO_NVJPG",
    "NVC570_DISPLAY",
    "NVC573_DISP_CAPABILITIES",
    "NVC57A_CURSOR_IMM_CHANNEL_PIO",
    "NVC57B_WINDOW_IMM_CHANNEL_DMA",
    "NVC57D_CORE_CHANNEL_DMA",
    "NVC57E_WINDOW_CHANNEL_DMA",
    "NVC670_DISPLAY",
    "NVC671_DISP_SF_USER",
    "NVC673_DISP_CAPABILITIES",
    "NVC67A_CURSOR_IMM_CHANNEL_PIO",
    "NVC67B_WINDOW_IMM_CHANNEL_DMA",
    "NVC67D_CORE_CHANNEL_DMA",
    "NVC67E_WINDOW_CHANNEL_DMA",
    "NVC6B0_VIDEO_DECODER",
    "NVC6FA_VIDEO_OFA",
    "NVC770_DISPLAY",
    "NVC771_DISP_SF_USER",
    "NVC773_DISP_CAPABILITIES",
    "NVC77D_CORE_CHANNEL_DMA",
    "NVC77F_ANY_CHANNEL_DMA",
    "NVC7B0_VIDEO_DECODER",
    "NVC7B7_VIDEO_ENCODER",
    "NVC7FA_VIDEO_OFA",
    "NVC9B0_VIDEO_DECODER",
    "NVC9B7_VIDEO_ENCODER",
    "NVC9D1_VIDEO_NVJPG",
    "NVC9FA_VIDEO_OFA",
    "NV_CE_UTILS",
    "NV_CONFIDENTIAL_COMPUTE",
    "NV_COUNTER_COLLECTION_UNIT",
    "NV_EVENT_BUFFER",
    "NV_MEMORY_EXTENDED_USER",
    "NV_MEMORY_FABRIC",
    "NV_MEMORY_MAPPER",
    "NV_MEMORY_MULTICAST_FABRIC",
    "NV_SEMAPHORE_SURFACE",
    "PASCAL_CHANNEL_GPFIFO_A",
    "PASCAL_DMA_COPY_A",
    "RM_USER_SHARED_DATA",
    "TURING_A",
    "TURING_CHANNEL_GPFIFO_A",
    "TURING_COMPUTE_A",
    "TURING_DMA_COPY_A",
    "TURING_USERMODE_A",
    "UVM_CHANNEL_RETAINER",
    "VOLTA_CHANNEL_GPFIFO_A",
    "VOLTA_USERMODE_A",
]

View File

@@ -0,0 +1,763 @@
/*******************************************************************************
Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
/* AUTO GENERATED FILE -- DO NOT EDIT */
#ifndef __CLC6C0QMD_H__
#define __CLC6C0QMD_H__
/*
** Queue Meta Data, Version 02_03
*/
// The below C preprocessor definitions describe "multi-word" structures, where
// fields may have bit numbers beyond 32. For example, MW(127:96) means
// the field is in bits 0-31 of word number 3 of the structure. The "MW(X:Y)"
// syntax is to distinguish from similar "X:Y" single-word definitions: the
// macros historically used for single-word definitions would fail with
// multi-word definitions.
//
// See nvmisc.h:DRF_VAL_MW() in the source code of the kernel
// interface layer of nvidia.ko for an example of how to manipulate
// these MW(X:Y) definitions.
#define NVC6C0_QMDV02_03_OUTER_PUT MW(30:0)
#define NVC6C0_QMDV02_03_OUTER_OVERFLOW MW(31:31)
#define NVC6C0_QMDV02_03_OUTER_GET MW(62:32)
#define NVC6C0_QMDV02_03_OUTER_STICKY_OVERFLOW MW(63:63)
#define NVC6C0_QMDV02_03_INNER_GET MW(94:64)
#define NVC6C0_QMDV02_03_INNER_OVERFLOW MW(95:95)
#define NVC6C0_QMDV02_03_INNER_PUT MW(126:96)
#define NVC6C0_QMDV02_03_INNER_STICKY_OVERFLOW MW(127:127)
#define NVC6C0_QMDV02_03_QMD_GROUP_ID MW(133:128)
#define NVC6C0_QMDV02_03_SM_GLOBAL_CACHING_ENABLE MW(134:134)
#define NVC6C0_QMDV02_03_RUN_CTA_IN_ONE_SM_PARTITION MW(135:135)
#define NVC6C0_QMDV02_03_RUN_CTA_IN_ONE_SM_PARTITION_FALSE 0x00000000
#define NVC6C0_QMDV02_03_RUN_CTA_IN_ONE_SM_PARTITION_TRUE 0x00000001
#define NVC6C0_QMDV02_03_IS_QUEUE MW(136:136)
#define NVC6C0_QMDV02_03_IS_QUEUE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_IS_QUEUE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(137:137)
#define NVC6C0_QMDV02_03_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
#define NVC6C0_QMDV02_03_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE0 MW(138:138)
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE0_FALSE 0x00000000
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE0_TRUE 0x00000001
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE1 MW(139:139)
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE1_FALSE 0x00000000
#define NVC6C0_QMDV02_03_SEMAPHORE_RELEASE_ENABLE1_TRUE 0x00000001
#define NVC6C0_QMDV02_03_REQUIRE_SCHEDULING_PCAS MW(140:140)
#define NVC6C0_QMDV02_03_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000
#define NVC6C0_QMDV02_03_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_SCHEDULE_ENABLE MW(141:141)
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_TYPE MW(142:142)
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_TYPE_QUEUE 0x00000000
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_TYPE_GRID 0x00000001
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_FIELD_COPY MW(143:143)
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_FIELD_COPY_FALSE 0x00000000
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_FIELD_COPY_TRUE 0x00000001
#define NVC6C0_QMDV02_03_QMD_RESERVED_B MW(159:144)
#define NVC6C0_QMDV02_03_CIRCULAR_QUEUE_SIZE MW(184:160)
#define NVC6C0_QMDV02_03_QMD_RESERVED_C MW(185:185)
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_HEADER_CACHE MW(186:186)
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(187:187)
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_DATA_CACHE MW(188:188)
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_DATA_CACHE MW(189:189)
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_INVALIDATE_INSTRUCTION_CACHE MW(190:190)
#define NVC6C0_QMDV02_03_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_CONSTANT_CACHE MW(191:191)
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_CTA_RASTER_WIDTH_RESUME MW(223:192)
#define NVC6C0_QMDV02_03_CTA_RASTER_HEIGHT_RESUME MW(239:224)
#define NVC6C0_QMDV02_03_CTA_RASTER_DEPTH_RESUME MW(255:240)
#define NVC6C0_QMDV02_03_PROGRAM_PREFETCH_ADDR_LOWER_SHIFTED MW(287:256)
#define NVC6C0_QMDV02_03_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288)
#define NVC6C0_QMDV02_03_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320)
#define NVC6C0_QMDV02_03_QMD_RESERVED_D MW(335:328)
#define NVC6C0_QMDV02_03_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336)
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_ID MW(357:352)
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358)
#define NVC6C0_QMDV02_03_RELEASE_MEMBAR_TYPE MW(366:366)
#define NVC6C0_QMDV02_03_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000
#define NVC6C0_QMDV02_03_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367)
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_CWD_MEMBAR_TYPE MW(369:368)
#define NVC6C0_QMDV02_03_CWD_MEMBAR_TYPE_L1_NONE 0x00000000
#define NVC6C0_QMDV02_03_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001
#define NVC6C0_QMDV02_03_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003
#define NVC6C0_QMDV02_03_SEQUENTIALLY_RUN_CTAS MW(370:370)
#define NVC6C0_QMDV02_03_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000
#define NVC6C0_QMDV02_03_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371)
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_API_VISIBLE_CALL_LIMIT MW(378:378)
#define NVC6C0_QMDV02_03_API_VISIBLE_CALL_LIMIT__32 0x00000000
#define NVC6C0_QMDV02_03_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001
#define NVC6C0_QMDV02_03_SAMPLER_INDEX MW(382:382)
#define NVC6C0_QMDV02_03_SAMPLER_INDEX_INDEPENDENTLY 0x00000000
#define NVC6C0_QMDV02_03_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001
#define NVC6C0_QMDV02_03_CTA_RASTER_WIDTH MW(415:384)
#define NVC6C0_QMDV02_03_CTA_RASTER_HEIGHT MW(431:416)
#define NVC6C0_QMDV02_03_QMD_RESERVED13A MW(447:432)
#define NVC6C0_QMDV02_03_CTA_RASTER_DEPTH MW(463:448)
#define NVC6C0_QMDV02_03_QMD_RESERVED14A MW(479:464)
#define NVC6C0_QMDV02_03_DEPENDENT_QMD_POINTER MW(511:480)
#define NVC6C0_QMDV02_03_COALESCE_WAITING_PERIOD MW(529:522)
#define NVC6C0_QMDV02_03_QUEUE_ENTRIES_PER_CTA_LOG2 MW(534:530)
#define NVC6C0_QMDV02_03_SHARED_MEMORY_SIZE MW(561:544)
#define NVC6C0_QMDV02_03_MIN_SM_CONFIG_SHARED_MEM_SIZE MW(568:562)
#define NVC6C0_QMDV02_03_MAX_SM_CONFIG_SHARED_MEM_SIZE MW(575:569)
#define NVC6C0_QMDV02_03_QMD_VERSION MW(579:576)
#define NVC6C0_QMDV02_03_QMD_MAJOR_VERSION MW(583:580)
#define NVC6C0_QMDV02_03_QMD_RESERVED_H MW(591:584)
#define NVC6C0_QMDV02_03_CTA_THREAD_DIMENSION0 MW(607:592)
#define NVC6C0_QMDV02_03_CTA_THREAD_DIMENSION1 MW(623:608)
#define NVC6C0_QMDV02_03_CTA_THREAD_DIMENSION2 MW(639:624)
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1))
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_VALID_FALSE 0x00000000
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_VALID_TRUE 0x00000001
#define NVC6C0_QMDV02_03_REGISTER_COUNT_V MW(656:648)
#define NVC6C0_QMDV02_03_TARGET_SM_CONFIG_SHARED_MEM_SIZE MW(663:657)
#define NVC6C0_QMDV02_03_FREE_CTA_SLOTS_EMPTY_SM MW(671:664)
#define NVC6C0_QMDV02_03_SM_DISABLE_MASK_LOWER MW(703:672)
#define NVC6C0_QMDV02_03_SM_DISABLE_MASK_UPPER MW(735:704)
#define NVC6C0_QMDV02_03_RELEASE0_ADDRESS_LOWER MW(767:736)
#define NVC6C0_QMDV02_03_RELEASE0_ADDRESS_UPPER MW(775:768)
#define NVC6C0_QMDV02_03_QMD_RESERVED_J MW(783:776)
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP MW(790:788)
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_INC 0x00000003
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_AND 0x00000005
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_OR 0x00000006
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007
#define NVC6C0_QMDV02_03_QMD_RESERVED_K MW(791:791)
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_FORMAT MW(793:792)
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_ENABLE MW(794:794)
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_RELEASE0_STRUCTURE_SIZE MW(799:799)
#define NVC6C0_QMDV02_03_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS 0x00000000
#define NVC6C0_QMDV02_03_RELEASE0_STRUCTURE_SIZE_ONE_WORD 0x00000001
#define NVC6C0_QMDV02_03_RELEASE0_PAYLOAD MW(831:800)
#define NVC6C0_QMDV02_03_RELEASE1_ADDRESS_LOWER MW(863:832)
#define NVC6C0_QMDV02_03_RELEASE1_ADDRESS_UPPER MW(871:864)
#define NVC6C0_QMDV02_03_QMD_RESERVED_L MW(879:872)
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP MW(886:884)
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_INC 0x00000003
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_AND 0x00000005
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_OR 0x00000006
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007
#define NVC6C0_QMDV02_03_QMD_RESERVED_M MW(887:887)
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_FORMAT MW(889:888)
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_ENABLE MW(890:890)
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_RELEASE1_STRUCTURE_SIZE MW(895:895)
#define NVC6C0_QMDV02_03_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS 0x00000000
#define NVC6C0_QMDV02_03_RELEASE1_STRUCTURE_SIZE_ONE_WORD 0x00000001
#define NVC6C0_QMDV02_03_RELEASE1_PAYLOAD MW(927:896)
#define NVC6C0_QMDV02_03_SHADER_LOCAL_MEMORY_LOW_SIZE MW(951:928)
#define NVC6C0_QMDV02_03_QMD_RESERVED_N MW(954:952)
#define NVC6C0_QMDV02_03_BARRIER_COUNT MW(959:955)
#define NVC6C0_QMDV02_03_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(983:960)
#define NVC6C0_QMDV02_03_REGISTER_COUNT MW(991:984)
#define NVC6C0_QMDV02_03_PROGRAM_PREFETCH_ADDR_UPPER_SHIFTED MW(1000:992)
#define NVC6C0_QMDV02_03_PROGRAM_PREFETCH_SIZE MW(1009:1001)
#define NVC6C0_QMDV02_03_QMD_RESERVED_A MW(1015:1010)
#define NVC6C0_QMDV02_03_SASS_VERSION MW(1023:1016)
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_ADDR_LOWER(i) MW((1055+(i)*64):(1024+(i)*64))
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_ADDR_UPPER(i) MW((1072+(i)*64):(1056+(i)*64))
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_PREFETCH_POST(i) MW((1073+(i)*64):(1073+(i)*64))
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_PREFETCH_POST_FALSE 0x00000000
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_PREFETCH_POST_TRUE 0x00000001
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_INVALIDATE(i) MW((1074+(i)*64):(1074+(i)*64))
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001
#define NVC6C0_QMDV02_03_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1087+(i)*64):(1075+(i)*64))
#define NVC6C0_QMDV02_03_PROGRAM_ADDRESS_LOWER MW(1567:1536)
#define NVC6C0_QMDV02_03_PROGRAM_ADDRESS_UPPER MW(1584:1568)
#define NVC6C0_QMDV02_03_QMD_RESERVED_S MW(1599:1585)
#define NVC6C0_QMDV02_03_HW_ONLY_INNER_GET MW(1630:1600)
#define NVC6C0_QMDV02_03_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1631:1631)
#define NVC6C0_QMDV02_03_HW_ONLY_INNER_PUT MW(1662:1632)
#define NVC6C0_QMDV02_03_HW_ONLY_SCG_TYPE MW(1663:1663)
#define NVC6C0_QMDV02_03_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(1693:1664)
#define NVC6C0_QMDV02_03_QMD_RESERVED_Q MW(1694:1694)
#define NVC6C0_QMDV02_03_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(1695:1695)
#define NVC6C0_QMDV02_03_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000
#define NVC6C0_QMDV02_03_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001
#define NVC6C0_QMDV02_03_HW_ONLY_SKED_NEXT_QMD_POINTER MW(1727:1696)
#define NVC6C0_QMDV02_03_QMD_SPARE_G MW(1759:1728)
#define NVC6C0_QMDV02_03_QMD_SPARE_H MW(1791:1760)
#define NVC6C0_QMDV02_03_QMD_SPARE_I MW(1823:1792)
#define NVC6C0_QMDV02_03_QMD_SPARE_J MW(1855:1824)
#define NVC6C0_QMDV02_03_QMD_SPARE_K MW(1887:1856)
#define NVC6C0_QMDV02_03_QMD_SPARE_L MW(1919:1888)
#define NVC6C0_QMDV02_03_QMD_SPARE_M MW(1951:1920)
#define NVC6C0_QMDV02_03_QMD_SPARE_N MW(1983:1952)
#define NVC6C0_QMDV02_03_DEBUG_ID_UPPER MW(2015:1984)
#define NVC6C0_QMDV02_03_DEBUG_ID_LOWER MW(2047:2016)
/*
** Queue Meta Data, Version 02_04
*/
#define NVC6C0_QMDV02_04_OUTER_PUT MW(30:0)
#define NVC6C0_QMDV02_04_OUTER_OVERFLOW MW(31:31)
#define NVC6C0_QMDV02_04_OUTER_GET MW(62:32)
#define NVC6C0_QMDV02_04_OUTER_STICKY_OVERFLOW MW(63:63)
#define NVC6C0_QMDV02_04_INNER_GET MW(94:64)
#define NVC6C0_QMDV02_04_INNER_OVERFLOW MW(95:95)
#define NVC6C0_QMDV02_04_INNER_PUT MW(126:96)
#define NVC6C0_QMDV02_04_INNER_STICKY_OVERFLOW MW(127:127)
/*
** QMD V02_04, bits 128-159: 6-bit group id, single-bit control flags (each
** with _FALSE/_TRUE value names where provided), the two dependent-QMD
** control groups and the 7-bit dependence counter.
*/
#define NVC6C0_QMDV02_04_QMD_GROUP_ID MW(133:128)
#define NVC6C0_QMDV02_04_SM_GLOBAL_CACHING_ENABLE MW(134:134)
#define NVC6C0_QMDV02_04_RUN_CTA_IN_ONE_SM_PARTITION MW(135:135)
#define NVC6C0_QMDV02_04_RUN_CTA_IN_ONE_SM_PARTITION_FALSE 0x00000000
#define NVC6C0_QMDV02_04_RUN_CTA_IN_ONE_SM_PARTITION_TRUE 0x00000001
#define NVC6C0_QMDV02_04_IS_QUEUE MW(136:136)
#define NVC6C0_QMDV02_04_IS_QUEUE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_IS_QUEUE_TRUE 0x00000001
#define NVC6C0_QMDV02_04_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(137:137)
#define NVC6C0_QMDV02_04_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
#define NVC6C0_QMDV02_04_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE0 MW(138:138)
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE0_FALSE 0x00000000
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE0_TRUE 0x00000001
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE1 MW(139:139)
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE1_FALSE 0x00000000
#define NVC6C0_QMDV02_04_SEMAPHORE_RELEASE_ENABLE1_TRUE 0x00000001
#define NVC6C0_QMDV02_04_REQUIRE_SCHEDULING_PCAS MW(140:140)
#define NVC6C0_QMDV02_04_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000
#define NVC6C0_QMDV02_04_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001
/* Dependent QMD 0: enable bit, 3-bit action, prefetch bit (bits 141-145). */
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ENABLE MW(141:141)
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ENABLE_TRUE 0x00000001
/* Action codes 0, 1, 3 and 4 are named; no name is given for 2. */
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ACTION MW(144:142)
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ACTION_QMD_INCREMENT_PUT 0x00000000
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ACTION_QMD_SCHEDULE 0x00000001
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_PREFETCH MW(145:145)
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_PREFETCH_FALSE 0x00000000
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_PREFETCH_TRUE 0x00000001
/* Dependent QMD 1: same three fields, bits 146-150, same value names. */
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ENABLE MW(146:146)
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ACTION MW(149:147)
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ACTION_QMD_INCREMENT_PUT 0x00000000
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ACTION_QMD_SCHEDULE 0x00000001
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_PREFETCH MW(150:150)
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_PREFETCH_FALSE 0x00000000
#define NVC6C0_QMDV02_04_DEPENDENT_QMD1_PREFETCH_TRUE 0x00000001
#define NVC6C0_QMDV02_04_DEPENDENCE_COUNTER MW(157:151)
#define NVC6C0_QMDV02_04_SELF_COPY_ON_COMPLETION MW(158:158)
#define NVC6C0_QMDV02_04_SELF_COPY_ON_COMPLETION_FALSE 0x00000000
#define NVC6C0_QMDV02_04_SELF_COPY_ON_COMPLETION_TRUE 0x00000001
#define NVC6C0_QMDV02_04_QMD_RESERVED_B MW(159:159)
/*
** QMD V02_04, bits 160-191: 25-bit circular queue size, the
** DEMOTE_L2_EVICT_LAST bit (no value names are given for it), and six
** single-bit cache-invalidate flags.
*/
#define NVC6C0_QMDV02_04_CIRCULAR_QUEUE_SIZE MW(184:160)
#define NVC6C0_QMDV02_04_DEMOTE_L2_EVICT_LAST MW(185:185)
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_HEADER_CACHE MW(186:186)
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(187:187)
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_DATA_CACHE MW(188:188)
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_DATA_CACHE MW(189:189)
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV02_04_INVALIDATE_INSTRUCTION_CACHE MW(190:190)
#define NVC6C0_QMDV02_04_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_CONSTANT_CACHE MW(191:191)
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001
/*
** QMD V02_04, bits 192-365: CTA raster "resume" values (32-bit width,
** 16-bit height and depth), the lower 32 bits of the shifted program
** prefetch address, the circular queue address (32 lower + 8 upper bits)
** and entry size, and the CWD reference-count id (6 bits) and delta (8 bits).
*/
#define NVC6C0_QMDV02_04_CTA_RASTER_WIDTH_RESUME MW(223:192)
#define NVC6C0_QMDV02_04_CTA_RASTER_HEIGHT_RESUME MW(239:224)
#define NVC6C0_QMDV02_04_CTA_RASTER_DEPTH_RESUME MW(255:240)
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_ADDR_LOWER_SHIFTED MW(287:256)
#define NVC6C0_QMDV02_04_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288)
#define NVC6C0_QMDV02_04_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320)
#define NVC6C0_QMDV02_04_QMD_RESERVED_D MW(335:328)
#define NVC6C0_QMDV02_04_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336)
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_ID MW(357:352)
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358)
/*
** QMD V02_04, bits 366-383: membar type selectors, CWD reference-count
** enables and miscellaneous flags. Bits 372-377 and 379-381 are not given
** a name in this version.
*/
#define NVC6C0_QMDV02_04_RELEASE_MEMBAR_TYPE MW(366:366)
#define NVC6C0_QMDV02_04_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000
#define NVC6C0_QMDV02_04_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367)
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001
/* 2-bit selector; values 0, 1 and 3 are named, 2 is not. */
#define NVC6C0_QMDV02_04_CWD_MEMBAR_TYPE MW(369:368)
#define NVC6C0_QMDV02_04_CWD_MEMBAR_TYPE_L1_NONE 0x00000000
#define NVC6C0_QMDV02_04_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001
#define NVC6C0_QMDV02_04_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003
#define NVC6C0_QMDV02_04_SEQUENTIALLY_RUN_CTAS MW(370:370)
#define NVC6C0_QMDV02_04_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000
#define NVC6C0_QMDV02_04_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371)
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV02_04_API_VISIBLE_CALL_LIMIT MW(378:378)
#define NVC6C0_QMDV02_04_API_VISIBLE_CALL_LIMIT__32 0x00000000
#define NVC6C0_QMDV02_04_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001
#define NVC6C0_QMDV02_04_SAMPLER_INDEX MW(382:382)
#define NVC6C0_QMDV02_04_SAMPLER_INDEX_INDEPENDENTLY 0x00000000
#define NVC6C0_QMDV02_04_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001
#define NVC6C0_QMDV02_04_DISABLE_AUTO_INVALIDATE MW(383:383)
#define NVC6C0_QMDV02_04_DISABLE_AUTO_INVALIDATE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_DISABLE_AUTO_INVALIDATE_TRUE 0x00000001
/*
** QMD V02_04, bits 384-639: CTA raster dimensions (32-bit width; 16-bit
** height and depth, each followed by a 16-bit reserved field), the
** dependent QMD0 pointer, queue/shared-memory sizing, the QMD version
** nibbles and the three 16-bit CTA thread dimensions.
** Bits 512-521 and 535-543 are not given a name in this version.
*/
#define NVC6C0_QMDV02_04_CTA_RASTER_WIDTH MW(415:384)
#define NVC6C0_QMDV02_04_CTA_RASTER_HEIGHT MW(431:416)
#define NVC6C0_QMDV02_04_QMD_RESERVED13A MW(447:432)
#define NVC6C0_QMDV02_04_CTA_RASTER_DEPTH MW(463:448)
#define NVC6C0_QMDV02_04_QMD_RESERVED14A MW(479:464)
#define NVC6C0_QMDV02_04_DEPENDENT_QMD0_POINTER MW(511:480)
#define NVC6C0_QMDV02_04_COALESCE_WAITING_PERIOD MW(529:522)
#define NVC6C0_QMDV02_04_QUEUE_ENTRIES_PER_CTA_LOG2 MW(534:530)
/* 18-bit shared memory size, then 7-bit MIN and MAX SM shared-memory config values. */
#define NVC6C0_QMDV02_04_SHARED_MEMORY_SIZE MW(561:544)
#define NVC6C0_QMDV02_04_MIN_SM_CONFIG_SHARED_MEM_SIZE MW(568:562)
#define NVC6C0_QMDV02_04_MAX_SM_CONFIG_SHARED_MEM_SIZE MW(575:569)
#define NVC6C0_QMDV02_04_QMD_VERSION MW(579:576)
#define NVC6C0_QMDV02_04_QMD_MAJOR_VERSION MW(583:580)
#define NVC6C0_QMDV02_04_QMD_RESERVED_H MW(591:584)
#define NVC6C0_QMDV02_04_CTA_THREAD_DIMENSION0 MW(607:592)
#define NVC6C0_QMDV02_04_CTA_THREAD_DIMENSION1 MW(623:608)
#define NVC6C0_QMDV02_04_CTA_THREAD_DIMENSION2 MW(639:624)
/*
** QMD V02_04, bits 640-735: per-index constant-buffer valid bits, register
** count, target SM shared-memory config, free CTA slots and the 64-bit SM
** disable mask (two 32-bit halves).
*/
/* One bit per index i at bit 640+i; the next named field starts at bit 648,
** so presumably i ranges over 0..7 -- confirm against users of this macro. */
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1))
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_VALID_FALSE 0x00000000
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_VALID_TRUE 0x00000001
#define NVC6C0_QMDV02_04_REGISTER_COUNT_V MW(656:648)
#define NVC6C0_QMDV02_04_TARGET_SM_CONFIG_SHARED_MEM_SIZE MW(663:657)
#define NVC6C0_QMDV02_04_FREE_CTA_SLOTS_EMPTY_SM MW(671:664)
#define NVC6C0_QMDV02_04_SM_DISABLE_MASK_LOWER MW(703:672)
#define NVC6C0_QMDV02_04_SM_DISABLE_MASK_UPPER MW(735:704)
/*
** QMD V02_04, RELEASE0 group (bits 736-831): address (32 lower + 8 upper
** bits), reduction op/format/enable, structure size and a 32-bit payload.
** Bits 784-787 and 795-798 are not given a name in this version.
*/
#define NVC6C0_QMDV02_04_RELEASE0_ADDRESS_LOWER MW(767:736)
#define NVC6C0_QMDV02_04_RELEASE0_ADDRESS_UPPER MW(775:768)
#define NVC6C0_QMDV02_04_QMD_RESERVED_J MW(783:776)
/* 3-bit reduction opcode; all eight encodings are named. */
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP MW(790:788)
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_INC 0x00000003
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_AND 0x00000005
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_OR 0x00000006
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007
#define NVC6C0_QMDV02_04_QMD_RESERVED_K MW(791:791)
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_FORMAT MW(793:792)
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_ENABLE MW(794:794)
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV02_04_RELEASE0_STRUCTURE_SIZE MW(799:799)
#define NVC6C0_QMDV02_04_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS 0x00000000
#define NVC6C0_QMDV02_04_RELEASE0_STRUCTURE_SIZE_ONE_WORD 0x00000001
#define NVC6C0_QMDV02_04_RELEASE0_PAYLOAD MW(831:800)
/*
** QMD V02_04, RELEASE1 group (bits 832-927): same field layout and value
** names as the RELEASE0 group, placed 96 bits higher.
** Bits 880-883 and 891-894 are not given a name in this version.
*/
#define NVC6C0_QMDV02_04_RELEASE1_ADDRESS_LOWER MW(863:832)
#define NVC6C0_QMDV02_04_RELEASE1_ADDRESS_UPPER MW(871:864)
#define NVC6C0_QMDV02_04_QMD_RESERVED_L MW(879:872)
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP MW(886:884)
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_INC 0x00000003
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_AND 0x00000005
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_OR 0x00000006
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007
#define NVC6C0_QMDV02_04_QMD_RESERVED_M MW(887:887)
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_FORMAT MW(889:888)
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_ENABLE MW(890:890)
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV02_04_RELEASE1_STRUCTURE_SIZE MW(895:895)
#define NVC6C0_QMDV02_04_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS 0x00000000
#define NVC6C0_QMDV02_04_RELEASE1_STRUCTURE_SIZE_ONE_WORD 0x00000001
#define NVC6C0_QMDV02_04_RELEASE1_PAYLOAD MW(927:896)
/*
** QMD V02_04, bits 928-1023: 24-bit shader local memory LOW and HIGH sizes,
** 5-bit barrier count, program prefetch upper address/size/type and the
** 8-bit SASS version. This range is fully covered by named fields.
*/
#define NVC6C0_QMDV02_04_SHADER_LOCAL_MEMORY_LOW_SIZE MW(951:928)
#define NVC6C0_QMDV02_04_QMD_RESERVED_N MW(954:952)
#define NVC6C0_QMDV02_04_BARRIER_COUNT MW(959:955)
#define NVC6C0_QMDV02_04_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(983:960)
#define NVC6C0_QMDV02_04_QMD_RESERVED_G MW(991:984)
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_ADDR_UPPER_SHIFTED MW(1000:992)
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_SIZE MW(1009:1001)
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_TYPE MW(1011:1010)
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_TYPE_PREFETCH_LAUNCH 0x00000000
/* NOTE(review): "PREFTECH" looks like a misspelling of "PREFETCH", but it is
** part of the identifier; renaming it would break any code that uses it. */
#define NVC6C0_QMDV02_04_PROGRAM_PREFETCH_TYPE_PREFTECH_POST 0x00000001
#define NVC6C0_QMDV02_04_QMD_RESERVED_A MW(1015:1012)
#define NVC6C0_QMDV02_04_SASS_VERSION MW(1023:1016)
/*
** QMD V02_04, constant buffer descriptors: one 64-bit entry per index i,
** starting at bit 1024 + 64*i. Each entry packs a 32-bit lower address, a
** 17-bit upper address, a PREFETCH_POST bit, an INVALIDATE bit and a 13-bit
** SIZE_SHIFTED4 field. The next named field (PROGRAM_ADDRESS_LOWER) starts
** at bit 1536, which leaves room for 8 entries -- presumably i = 0..7.
*/
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_ADDR_LOWER(i) MW((1055+(i)*64):(1024+(i)*64))
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_ADDR_UPPER(i) MW((1072+(i)*64):(1056+(i)*64))
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_PREFETCH_POST(i) MW((1073+(i)*64):(1073+(i)*64))
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_PREFETCH_POST_FALSE 0x00000000
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_PREFETCH_POST_TRUE 0x00000001
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_INVALIDATE(i) MW((1074+(i)*64):(1074+(i)*64))
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001
#define NVC6C0_QMDV02_04_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1087+(i)*64):(1075+(i)*64))
/*
** QMD V02_04, bits 1536-2047: program address (32 lower + 17 upper bits),
** the HW_ONLY_* fields, spare words H..N and the debug id.
*/
#define NVC6C0_QMDV02_04_PROGRAM_ADDRESS_LOWER MW(1567:1536)
#define NVC6C0_QMDV02_04_PROGRAM_ADDRESS_UPPER MW(1584:1568)
#define NVC6C0_QMDV02_04_QMD_RESERVED_S MW(1599:1585)
/* Fields prefixed HW_ONLY_; presumably maintained by hardware rather than
** filled in by software -- confirm against vendor documentation. */
#define NVC6C0_QMDV02_04_HW_ONLY_INNER_GET MW(1630:1600)
#define NVC6C0_QMDV02_04_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1631:1631)
#define NVC6C0_QMDV02_04_HW_ONLY_INNER_PUT MW(1662:1632)
#define NVC6C0_QMDV02_04_HW_ONLY_SCG_TYPE MW(1663:1663)
#define NVC6C0_QMDV02_04_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(1693:1664)
#define NVC6C0_QMDV02_04_QMD_RESERVED_Q MW(1694:1694)
#define NVC6C0_QMDV02_04_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(1695:1695)
#define NVC6C0_QMDV02_04_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000
#define NVC6C0_QMDV02_04_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001
#define NVC6C0_QMDV02_04_HW_ONLY_SKED_NEXT_QMD_POINTER MW(1727:1696)
#define NVC6C0_QMDV02_04_HW_ONLY_DEPENDENCE_COUNTER MW(1734:1728)
#define NVC6C0_QMDV02_04_QMD_RESERVED_I MW(1759:1735)
#define NVC6C0_QMDV02_04_QMD_SPARE_H MW(1791:1760)
#define NVC6C0_QMDV02_04_QMD_SPARE_I MW(1823:1792)
#define NVC6C0_QMDV02_04_QMD_SPARE_J MW(1855:1824)
#define NVC6C0_QMDV02_04_QMD_SPARE_K MW(1887:1856)
#define NVC6C0_QMDV02_04_QMD_SPARE_L MW(1919:1888)
#define NVC6C0_QMDV02_04_QMD_SPARE_M MW(1951:1920)
#define NVC6C0_QMDV02_04_QMD_SPARE_N MW(1983:1952)
/* Note the order: DEBUG_ID_UPPER occupies the lower bit positions (1984-2015)
** and DEBUG_ID_LOWER the higher ones (2016-2047). */
#define NVC6C0_QMDV02_04_DEBUG_ID_UPPER MW(2015:1984)
#define NVC6C0_QMDV02_04_DEBUG_ID_LOWER MW(2047:2016)
/*
** Queue Meta Data, Version 03_00
**
** Every field is written as MW(high_bit:low_bit). MW() is not defined in
** this part of the header; presumably it denotes a bit range inside the
** multi-word QMD structure -- confirm against its definition.
*/
/* Bits 0-127: four 31-bit PUT/GET fields, each followed by a 1-bit overflow flag. */
#define NVC6C0_QMDV03_00_OUTER_PUT MW(30:0)
#define NVC6C0_QMDV03_00_OUTER_OVERFLOW MW(31:31)
#define NVC6C0_QMDV03_00_OUTER_GET MW(62:32)
#define NVC6C0_QMDV03_00_OUTER_STICKY_OVERFLOW MW(63:63)
#define NVC6C0_QMDV03_00_INNER_GET MW(94:64)
#define NVC6C0_QMDV03_00_INNER_OVERFLOW MW(95:95)
#define NVC6C0_QMDV03_00_INNER_PUT MW(126:96)
#define NVC6C0_QMDV03_00_INNER_STICKY_OVERFLOW MW(127:127)
/*
** QMD V03_00, bits 128-159: 6-bit group id, single-bit control flags,
** reserved bits 138-139, 141 and 159, and a 16-bit dependence counter
** (bits 142-157).
*/
#define NVC6C0_QMDV03_00_QMD_GROUP_ID MW(133:128)
#define NVC6C0_QMDV03_00_SM_GLOBAL_CACHING_ENABLE MW(134:134)
#define NVC6C0_QMDV03_00_RUN_CTA_IN_ONE_SM_PARTITION MW(135:135)
#define NVC6C0_QMDV03_00_RUN_CTA_IN_ONE_SM_PARTITION_FALSE 0x00000000
#define NVC6C0_QMDV03_00_RUN_CTA_IN_ONE_SM_PARTITION_TRUE 0x00000001
#define NVC6C0_QMDV03_00_IS_QUEUE MW(136:136)
#define NVC6C0_QMDV03_00_IS_QUEUE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_IS_QUEUE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST MW(137:137)
#define NVC6C0_QMDV03_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
#define NVC6C0_QMDV03_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
#define NVC6C0_QMDV03_00_QMD_RESERVED04A MW(139:138)
#define NVC6C0_QMDV03_00_REQUIRE_SCHEDULING_PCAS MW(140:140)
#define NVC6C0_QMDV03_00_REQUIRE_SCHEDULING_PCAS_FALSE 0x00000000
#define NVC6C0_QMDV03_00_REQUIRE_SCHEDULING_PCAS_TRUE 0x00000001
#define NVC6C0_QMDV03_00_QMD_RESERVED04B MW(141:141)
#define NVC6C0_QMDV03_00_DEPENDENCE_COUNTER MW(157:142)
#define NVC6C0_QMDV03_00_SELF_COPY_ON_COMPLETION MW(158:158)
#define NVC6C0_QMDV03_00_SELF_COPY_ON_COMPLETION_FALSE 0x00000000
#define NVC6C0_QMDV03_00_SELF_COPY_ON_COMPLETION_TRUE 0x00000001
#define NVC6C0_QMDV03_00_QMD_RESERVED04C MW(159:159)
/*
** QMD V03_00, bits 160-191: 25-bit circular queue size, the
** DEMOTE_L2_EVICT_LAST bit (no value names are given for it), and six
** single-bit cache-invalidate flags.
*/
#define NVC6C0_QMDV03_00_CIRCULAR_QUEUE_SIZE MW(184:160)
#define NVC6C0_QMDV03_00_DEMOTE_L2_EVICT_LAST MW(185:185)
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_HEADER_CACHE MW(186:186)
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(187:187)
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_DATA_CACHE MW(188:188)
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_DATA_CACHE MW(189:189)
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_INVALIDATE_INSTRUCTION_CACHE MW(190:190)
#define NVC6C0_QMDV03_00_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_CONSTANT_CACHE MW(191:191)
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001
/*
** QMD V03_00, bits 192-365: CTA raster "resume" values (32-bit width,
** 16-bit height and depth), the lower 32 bits of the shifted program
** prefetch address, the circular queue address (32 lower + 8 upper bits)
** and entry size, and the CWD reference-count id (6 bits) and delta (8 bits).
*/
#define NVC6C0_QMDV03_00_CTA_RASTER_WIDTH_RESUME MW(223:192)
#define NVC6C0_QMDV03_00_CTA_RASTER_HEIGHT_RESUME MW(239:224)
#define NVC6C0_QMDV03_00_CTA_RASTER_DEPTH_RESUME MW(255:240)
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_ADDR_LOWER_SHIFTED MW(287:256)
#define NVC6C0_QMDV03_00_CIRCULAR_QUEUE_ADDR_LOWER MW(319:288)
#define NVC6C0_QMDV03_00_CIRCULAR_QUEUE_ADDR_UPPER MW(327:320)
#define NVC6C0_QMDV03_00_QMD_RESERVED_D MW(335:328)
#define NVC6C0_QMDV03_00_CIRCULAR_QUEUE_ENTRY_SIZE MW(351:336)
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_ID MW(357:352)
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE MW(365:358)
/*
** QMD V03_00, bits 366-383: CWD reference-count enables, CWD membar type
** and miscellaneous flags. Every bit in this range is named; bits 366,
** 372-377 and 379-381 are reserved fields.
*/
#define NVC6C0_QMDV03_00_QMD_RESERVED11A MW(366:366)
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_INCR_ENABLE MW(367:367)
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE 0x00000001
/* 2-bit selector; values 0, 1 and 3 are named, 2 is not. */
#define NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE MW(369:368)
#define NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_NONE 0x00000000
#define NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR 0x00000001
#define NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_MEMBAR 0x00000003
#define NVC6C0_QMDV03_00_SEQUENTIALLY_RUN_CTAS MW(370:370)
#define NVC6C0_QMDV03_00_SEQUENTIALLY_RUN_CTAS_FALSE 0x00000000
#define NVC6C0_QMDV03_00_SEQUENTIALLY_RUN_CTAS_TRUE 0x00000001
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_DECR_ENABLE MW(371:371)
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_QMD_RESERVED11B MW(377:372)
#define NVC6C0_QMDV03_00_API_VISIBLE_CALL_LIMIT MW(378:378)
#define NVC6C0_QMDV03_00_API_VISIBLE_CALL_LIMIT__32 0x00000000
#define NVC6C0_QMDV03_00_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001
#define NVC6C0_QMDV03_00_QMD_RESERVED11C MW(381:379)
#define NVC6C0_QMDV03_00_SAMPLER_INDEX MW(382:382)
#define NVC6C0_QMDV03_00_SAMPLER_INDEX_INDEPENDENTLY 0x00000000
#define NVC6C0_QMDV03_00_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001
#define NVC6C0_QMDV03_00_DISABLE_AUTO_INVALIDATE MW(383:383)
#define NVC6C0_QMDV03_00_DISABLE_AUTO_INVALIDATE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_DISABLE_AUTO_INVALIDATE_TRUE 0x00000001
/*
** QMD V03_00, bits 384-511: CTA raster dimensions (32-bit width, 16-bit
** height and depth) and the 32-bit dependent QMD0 pointer.
** Bits 432-447 and 464-479 are not given a name in this version.
*/
#define NVC6C0_QMDV03_00_CTA_RASTER_WIDTH MW(415:384)
#define NVC6C0_QMDV03_00_CTA_RASTER_HEIGHT MW(431:416)
#define NVC6C0_QMDV03_00_CTA_RASTER_DEPTH MW(463:448)
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_POINTER MW(511:480)
/*
** QMD V03_00, bits 512-521: dependent-QMD controls. Each of QMD0 and QMD1
** has an enable bit, a 3-bit action code and a prefetch bit.
*/
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ENABLE MW(512:512)
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ENABLE_TRUE 0x00000001
/* Action codes 0, 1, 3 and 4 are named; no name is given for 2. */
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ACTION MW(515:513)
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ACTION_QMD_INCREMENT_PUT 0x00000000
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ACTION_QMD_SCHEDULE 0x00000001
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_PREFETCH MW(516:516)
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_PREFETCH_FALSE 0x00000000
#define NVC6C0_QMDV03_00_DEPENDENT_QMD0_PREFETCH_TRUE 0x00000001
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ENABLE MW(517:517)
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ACTION MW(520:518)
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ACTION_QMD_INCREMENT_PUT 0x00000000
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ACTION_QMD_SCHEDULE 0x00000001
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_PREFETCH MW(521:521)
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_PREFETCH_FALSE 0x00000000
#define NVC6C0_QMDV03_00_DEPENDENT_QMD1_PREFETCH_TRUE 0x00000001
/*
** QMD V03_00, bits 522-639: queue coalescing parameters, 18-bit shared
** memory size, 6-bit MIN/MAX SM shared-memory config values (each followed
** by a 1-bit reserved field), the QMD version nibbles and the three 16-bit
** CTA thread dimensions.
** Bits 535-543 and 584-591 are not given a name in this version.
*/
#define NVC6C0_QMDV03_00_COALESCE_WAITING_PERIOD MW(529:522)
#define NVC6C0_QMDV03_00_QUEUE_ENTRIES_PER_CTA_LOG2 MW(534:530)
#define NVC6C0_QMDV03_00_SHARED_MEMORY_SIZE MW(561:544)
#define NVC6C0_QMDV03_00_MIN_SM_CONFIG_SHARED_MEM_SIZE MW(567:562)
#define NVC6C0_QMDV03_00_QMD_RESERVED17A MW(568:568)
#define NVC6C0_QMDV03_00_MAX_SM_CONFIG_SHARED_MEM_SIZE MW(574:569)
#define NVC6C0_QMDV03_00_QMD_RESERVED17B MW(575:575)
#define NVC6C0_QMDV03_00_QMD_VERSION MW(579:576)
#define NVC6C0_QMDV03_00_QMD_MAJOR_VERSION MW(583:580)
#define NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0 MW(607:592)
#define NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION1 MW(623:608)
#define NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION2 MW(639:624)
/*
** QMD V03_00, bits 640-767: per-index constant-buffer valid bits, register
** count, 6-bit target SM shared-memory config, the SHARED_ALLOCATION_ENABLE
** bit, free CTA slots, the 64-bit SM disable mask, the 24-bit shader local
** memory LOW size and the 5-bit barrier count.
** Bits 760-762 are not given a name in this version.
*/
/* One bit per index i at bit 640+i; the next named field starts at bit 648,
** so presumably i ranges over 0..7 -- confirm against users of this macro. */
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_VALID(i) MW((640+(i)*1):(640+(i)*1))
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_VALID_FALSE 0x00000000
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_VALID_TRUE 0x00000001
#define NVC6C0_QMDV03_00_REGISTER_COUNT_V MW(656:648)
#define NVC6C0_QMDV03_00_TARGET_SM_CONFIG_SHARED_MEM_SIZE MW(662:657)
#define NVC6C0_QMDV03_00_SHARED_ALLOCATION_ENABLE MW(663:663)
#define NVC6C0_QMDV03_00_SHARED_ALLOCATION_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_SHARED_ALLOCATION_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_FREE_CTA_SLOTS_EMPTY_SM MW(671:664)
#define NVC6C0_QMDV03_00_SM_DISABLE_MASK_LOWER MW(703:672)
#define NVC6C0_QMDV03_00_SM_DISABLE_MASK_UPPER MW(735:704)
#define NVC6C0_QMDV03_00_SHADER_LOCAL_MEMORY_LOW_SIZE MW(759:736)
#define NVC6C0_QMDV03_00_BARRIER_COUNT MW(767:763)
/*
** QMD V03_00, RELEASE0 group (bits 768-895): address (32 lower + 8 upper
** bits), an 11-bit reserved field, then membar type, reduction op, enable,
** reduction format/enable, non-blocking interrupt type, 64-bit-payload
** flag, structure size and a 64-bit payload split into two 32-bit halves.
** This range is fully covered by named fields.
*/
#define NVC6C0_QMDV03_00_RELEASE0_ADDRESS_LOWER MW(799:768)
#define NVC6C0_QMDV03_00_RELEASE0_ADDRESS_UPPER MW(807:800)
#define NVC6C0_QMDV03_00_SEMAPHORE_RESERVED25A MW(818:808)
#define NVC6C0_QMDV03_00_RELEASE0_MEMBAR_TYPE MW(819:819)
#define NVC6C0_QMDV03_00_RELEASE0_MEMBAR_TYPE_FE_NONE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE0_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
/* 3-bit reduction opcode; all eight encodings are named. */
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP MW(822:820)
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_ADD 0x00000000
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_MIN 0x00000001
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_MAX 0x00000002
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_INC 0x00000003
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_DEC 0x00000004
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_AND 0x00000005
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_OR 0x00000006
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_OP_RED_XOR 0x00000007
#define NVC6C0_QMDV03_00_RELEASE0_ENABLE MW(823:823)
#define NVC6C0_QMDV03_00_RELEASE0_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE0_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_FORMAT MW(825:824)
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_FORMAT_SIGNED_32 0x00000001
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_ENABLE MW(826:826)
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE0_REDUCTION_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_RELEASE0_NON_BLOCKING_INTR_TYPE MW(828:827)
#define NVC6C0_QMDV03_00_RELEASE0_NON_BLOCKING_INTR_TYPE_NONE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE0_NON_BLOCKING_INTR_TYPE_TRAP 0x00000001
#define NVC6C0_QMDV03_00_RELEASE0_NON_BLOCKING_INTR_TYPE_CONDITIONAL_TRAP 0x00000002
#define NVC6C0_QMDV03_00_RELEASE0_PAYLOAD64B MW(829:829)
#define NVC6C0_QMDV03_00_RELEASE0_PAYLOAD64B_FALSE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE0_PAYLOAD64B_TRUE 0x00000001
/* 2-bit selector; values 0, 1 and 2 are named. */
#define NVC6C0_QMDV03_00_RELEASE0_STRUCTURE_SIZE MW(831:830)
#define NVC6C0_QMDV03_00_RELEASE0_STRUCTURE_SIZE_SEMAPHORE_FOUR_WORDS 0x00000000
#define NVC6C0_QMDV03_00_RELEASE0_STRUCTURE_SIZE_SEMAPHORE_ONE_WORD 0x00000001
#define NVC6C0_QMDV03_00_RELEASE0_STRUCTURE_SIZE_SEMAPHORE_TWO_WORDS 0x00000002
#define NVC6C0_QMDV03_00_RELEASE0_PAYLOAD_LOWER MW(863:832)
#define NVC6C0_QMDV03_00_RELEASE0_PAYLOAD_UPPER MW(895:864)
/*
** QMD V03_00, RELEASE1 group (bits 896-1023): same field layout and value
** names as the RELEASE0 group, placed 128 bits higher.
*/
#define NVC6C0_QMDV03_00_RELEASE1_ADDRESS_LOWER MW(927:896)
#define NVC6C0_QMDV03_00_RELEASE1_ADDRESS_UPPER MW(935:928)
#define NVC6C0_QMDV03_00_SEMAPHORE_RESERVED29A MW(946:936)
#define NVC6C0_QMDV03_00_RELEASE1_MEMBAR_TYPE MW(947:947)
#define NVC6C0_QMDV03_00_RELEASE1_MEMBAR_TYPE_FE_NONE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE1_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP MW(950:948)
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_ADD 0x00000000
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_MIN 0x00000001
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_MAX 0x00000002
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_INC 0x00000003
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_DEC 0x00000004
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_AND 0x00000005
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_OR 0x00000006
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_OP_RED_XOR 0x00000007
#define NVC6C0_QMDV03_00_RELEASE1_ENABLE MW(951:951)
#define NVC6C0_QMDV03_00_RELEASE1_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE1_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_FORMAT MW(953:952)
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_FORMAT_SIGNED_32 0x00000001
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_ENABLE MW(954:954)
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE1_REDUCTION_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_RELEASE1_NON_BLOCKING_INTR_TYPE MW(956:955)
#define NVC6C0_QMDV03_00_RELEASE1_NON_BLOCKING_INTR_TYPE_NONE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE1_NON_BLOCKING_INTR_TYPE_TRAP 0x00000001
#define NVC6C0_QMDV03_00_RELEASE1_NON_BLOCKING_INTR_TYPE_CONDITIONAL_TRAP 0x00000002
#define NVC6C0_QMDV03_00_RELEASE1_PAYLOAD64B MW(957:957)
#define NVC6C0_QMDV03_00_RELEASE1_PAYLOAD64B_FALSE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE1_PAYLOAD64B_TRUE 0x00000001
#define NVC6C0_QMDV03_00_RELEASE1_STRUCTURE_SIZE MW(959:958)
#define NVC6C0_QMDV03_00_RELEASE1_STRUCTURE_SIZE_SEMAPHORE_FOUR_WORDS 0x00000000
#define NVC6C0_QMDV03_00_RELEASE1_STRUCTURE_SIZE_SEMAPHORE_ONE_WORD 0x00000001
#define NVC6C0_QMDV03_00_RELEASE1_STRUCTURE_SIZE_SEMAPHORE_TWO_WORDS 0x00000002
#define NVC6C0_QMDV03_00_RELEASE1_PAYLOAD_LOWER MW(991:960)
#define NVC6C0_QMDV03_00_RELEASE1_PAYLOAD_UPPER MW(1023:992)
/*
** QMD V03_00, constant buffer descriptors: one 64-bit entry per index i,
** starting at bit 1024 + 64*i. Each entry packs a 32-bit lower address, a
** 17-bit upper address, a PREFETCH_POST bit, an INVALIDATE bit and a 13-bit
** SIZE_SHIFTED4 field. The next named field (PROGRAM_ADDRESS_LOWER) starts
** at bit 1536, which leaves room for 8 entries -- presumably i = 0..7.
*/
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_ADDR_LOWER(i) MW((1055+(i)*64):(1024+(i)*64))
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_ADDR_UPPER(i) MW((1072+(i)*64):(1056+(i)*64))
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_PREFETCH_POST(i) MW((1073+(i)*64):(1073+(i)*64))
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_PREFETCH_POST_FALSE 0x00000000
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_PREFETCH_POST_TRUE 0x00000001
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_INVALIDATE(i) MW((1074+(i)*64):(1074+(i)*64))
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1087+(i)*64):(1075+(i)*64))
/*
** QMD V03_00, bits 1536-1663: program address (32 lower + 17 upper bits),
** 24-bit shader local memory HIGH size, program prefetch upper
** address/size/type and the 8-bit SASS version.
** Bits 1585-1599, 1624-1631 and 1652-1655 are not given a name in this
** version.
*/
#define NVC6C0_QMDV03_00_PROGRAM_ADDRESS_LOWER MW(1567:1536)
#define NVC6C0_QMDV03_00_PROGRAM_ADDRESS_UPPER MW(1584:1568)
#define NVC6C0_QMDV03_00_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(1623:1600)
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_ADDR_UPPER_SHIFTED MW(1640:1632)
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_SIZE MW(1649:1641)
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_TYPE MW(1651:1650)
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_TYPE_PREFETCH_LAUNCH 0x00000000
/* NOTE(review): "PREFTECH" looks like a misspelling of "PREFETCH", but it is
** part of the identifier; renaming it would break any code that uses it. */
#define NVC6C0_QMDV03_00_PROGRAM_PREFETCH_TYPE_PREFTECH_POST 0x00000001
#define NVC6C0_QMDV03_00_SASS_VERSION MW(1663:1656)
/*
** QMD V03_00, RELEASE2 group (bits 1664-1791): address (32 lower + 8 upper
** bits), an 11-bit reserved field, then membar type, reduction op, enable,
** reduction format/enable, non-blocking interrupt type, 64-bit-payload
** flag, structure size and a 64-bit payload split into two 32-bit halves.
*/
#define NVC6C0_QMDV03_00_RELEASE2_ADDRESS_LOWER MW(1695:1664)
#define NVC6C0_QMDV03_00_RELEASE2_ADDRESS_UPPER MW(1703:1696)
#define NVC6C0_QMDV03_00_SEMAPHORE_RESERVED53A MW(1714:1704)
#define NVC6C0_QMDV03_00_RELEASE2_MEMBAR_TYPE MW(1715:1715)
#define NVC6C0_QMDV03_00_RELEASE2_MEMBAR_TYPE_FE_NONE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE2_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP MW(1718:1716)
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_ADD 0x00000000
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_MIN 0x00000001
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_MAX 0x00000002
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_INC 0x00000003
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_DEC 0x00000004
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_AND 0x00000005
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_OR 0x00000006
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_OP_RED_XOR 0x00000007
#define NVC6C0_QMDV03_00_RELEASE2_ENABLE MW(1719:1719)
#define NVC6C0_QMDV03_00_RELEASE2_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE2_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_FORMAT MW(1721:1720)
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_FORMAT_UNSIGNED_32 0x00000000
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_FORMAT_SIGNED_32 0x00000001
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_ENABLE MW(1722:1722)
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_ENABLE_FALSE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE2_REDUCTION_ENABLE_TRUE 0x00000001
#define NVC6C0_QMDV03_00_RELEASE2_NON_BLOCKING_INTR_TYPE MW(1724:1723)
#define NVC6C0_QMDV03_00_RELEASE2_NON_BLOCKING_INTR_TYPE_NONE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE2_NON_BLOCKING_INTR_TYPE_TRAP 0x00000001
#define NVC6C0_QMDV03_00_RELEASE2_NON_BLOCKING_INTR_TYPE_CONDITIONAL_TRAP 0x00000002
#define NVC6C0_QMDV03_00_RELEASE2_PAYLOAD64B MW(1725:1725)
#define NVC6C0_QMDV03_00_RELEASE2_PAYLOAD64B_FALSE 0x00000000
#define NVC6C0_QMDV03_00_RELEASE2_PAYLOAD64B_TRUE 0x00000001
#define NVC6C0_QMDV03_00_RELEASE2_STRUCTURE_SIZE MW(1727:1726)
#define NVC6C0_QMDV03_00_RELEASE2_STRUCTURE_SIZE_SEMAPHORE_FOUR_WORDS 0x00000000
#define NVC6C0_QMDV03_00_RELEASE2_STRUCTURE_SIZE_SEMAPHORE_ONE_WORD 0x00000001
#define NVC6C0_QMDV03_00_RELEASE2_STRUCTURE_SIZE_SEMAPHORE_TWO_WORDS 0x00000002
#define NVC6C0_QMDV03_00_RELEASE2_PAYLOAD_LOWER MW(1759:1728)
#define NVC6C0_QMDV03_00_RELEASE2_PAYLOAD_UPPER MW(1791:1760)
/*
** QMD V03_00, bits 1792-2047: one spare word, the HW_ONLY_* fields
** (presumably maintained by hardware rather than filled in by software --
** confirm against vendor documentation) and the debug id.
** Bits 1887, 1918 and 1959-1983 are not given a name in this version.
*/
#define NVC6C0_QMDV03_00_QMD_SPARE_I MW(1823:1792)
#define NVC6C0_QMDV03_00_HW_ONLY_INNER_GET MW(1854:1824)
#define NVC6C0_QMDV03_00_HW_ONLY_REQUIRE_SCHEDULING_PCAS MW(1855:1855)
#define NVC6C0_QMDV03_00_HW_ONLY_INNER_PUT MW(1886:1856)
#define NVC6C0_QMDV03_00_HW_ONLY_SPAN_LIST_HEAD_INDEX MW(1917:1888)
#define NVC6C0_QMDV03_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID MW(1919:1919)
#define NVC6C0_QMDV03_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE 0x00000000
#define NVC6C0_QMDV03_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE 0x00000001
#define NVC6C0_QMDV03_00_HW_ONLY_SKED_NEXT_QMD_POINTER MW(1951:1920)
#define NVC6C0_QMDV03_00_HW_ONLY_DEPENDENCE_COUNTER MW(1958:1952)
/* Note the order: DEBUG_ID_UPPER occupies the lower bit positions (1984-2015)
** and DEBUG_ID_LOWER the higher ones (2016-2047). */
#define NVC6C0_QMDV03_00_DEBUG_ID_UPPER MW(2015:1984)
#define NVC6C0_QMDV03_00_DEBUG_ID_LOWER MW(2047:2016)
/* Closes the include guard opened at the top of this header. */
#endif // #ifndef __CLC6C0QMD_H__

View File

@@ -1,26 +0,0 @@
#!/bin/bash
# Generate Python ctypes bindings from NVIDIA open-gpu-kernel-modules headers using clang2py.
# SRC is a machine-specific path to a local copy of the open-gpu-kernel-modules sources.
SRC=/home/kafka/build/open-gpu-kernel-modules
# Step 1: produce class_ioctl.py from the generated g_allclasses.h header.
clang2py \
$SRC/src/nvidia/generated/g_allclasses.h \
-o class_ioctl.py -k cdefstum
# NOTE: this unconditional exit ends the script here, so none of the commands below are executed.
exit
# Disabled step (kept commented out): would produce esc_ioctl.py from the escape / ioctl-number / nvos headers.
#clang2py $SRC/src/nvidia/arch/nvalloc/unix/include/nv_escape.h \
# $SRC/src/nvidia/arch/nvalloc/unix/include/nv-ioctl-numbers.h \
# $SRC/src/common/sdk/nvidia/inc/nvos.h \
# --clang-args="-I $SRC/src/common/sdk/nvidia/inc -I $SRC/src/common/sdk/nvidia/inc/ctrl" \
# -o esc_ioctl.py -k cdefstum
# Step 2 (unreachable while the exit above is present): produce ctrl_ioctl.py from the ctrl* control-call headers.
clang2py \
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrl0000/*.h \
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrl0080/*.h \
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrl2080/*.h \
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrl83de/*.h \
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrlc36f.h \
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrlcb33.h \
$SRC/src/common/sdk/nvidia/inc/ctrl/ctrla06c.h \
--clang-args="-I $SRC/src/common/sdk/nvidia/inc -I $SRC/src/common/sdk/nvidia/inc/ctrl" \
-o ctrl_ioctl.py -k cdefstum
# Post-process ctrl_ioctl.py in place.
# NOTE(review): the expression has backslashes where sed expects its delimiter; presumably it is meant to
# rewrite "(0000000001)" to "1" -- confirm before relying on it.
sed -i "s\(0000000001)\1\g" ctrl_ioctl.py

File diff suppressed because it is too large Load Diff

View File

@@ -1,24 +1,34 @@
# type: ignore
import ctypes, ctypes.util, struct, platform, pathlib, re, time, os
import ctypes, ctypes.util, struct, platform, pathlib, re, time, os, signal
from tinygrad.helpers import from_mv, to_mv, getenv
from hexdump import hexdump
start = time.perf_counter()
# *** ioctl lib ***
libc = ctypes.CDLL(ctypes.util.find_library("c"))
processor = platform.processor()
IOCTL_SYSCALL = {"aarch64": 0x1d, "x86_64":16}[processor]
MMAP_SYSCALL = {"aarch64": 0xde, "x86_64":0x09}[processor]
def get_struct(argp, stype):
  """Return the ctypes object of type `stype` located at the raw address `argp` (no copy is made)."""
  typed_ptr = ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype))
  return typed_ptr.contents
def dump_struct(st):
  """Print the ctypes structure `st` on one line as: <tab> TypeName { field=value ... }."""
  struct_type = type(st)
  print("\t", st.__class__.__name__, end=" { ")
  for field in struct_type._fields_:
    # only the name is needed; _fields_ entries may carry a type and an optional bit width after it
    field_name = field[0]
    print(f"{field_name}={getattr(st, field_name)}", end=" ")
  print("}")
def format_struct(s):
  """Return one "name:value" string per field of the ctypes structure `s`.

  Integer fields are rendered as upper-case hex ("name:0x1F"); every other value uses its default formatting.
  Each field appears exactly once, in `_fields_` order.
  """
  sdats = []
  for field in s._fields_:
    # index rather than unpack: bitfield entries in _fields_ are 3-tuples (name, type, bits)
    name = field[0]
    dat = getattr(s, name)
    sdats.append(f"{name}:0x{dat:X}" if isinstance(dat, int) else f"{name}:{dat}")
  return sdats
real_func_pool = {}
def install_hook(c_function, python_function):
orig_func = (ctypes.c_char*4096)()
python_function_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value
# AARCH64 trampoline to ioctl
if processor == "aarch64":
@@ -28,9 +38,9 @@ def install_hook(c_function, python_function):
tramp = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6"
tramp += struct.pack("Q", python_function_addr)
elif processor == "x86_64":
# 0x0000000000000000: 49 B8 aa aa aa aa aa aa aa aa movabs r8, <address>
# 0x000000000000000a: 41 FF E0 jmp r8
tramp = b"\x49\xB8" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE0"
# 0x0000000000000000: 49 BB aa aa aa aa aa aa aa aa movabs r11, <address>
# 0x000000000000000a: 41 FF E3 jmp r11
tramp = b"\x49\xBB" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE3"
else:
raise Exception(f"processor {processor} not supported")
@@ -40,54 +50,168 @@ def install_hook(c_function, python_function):
# hook ioctl
ret = libc.mprotect(ctypes.c_ulong((ioctl_address.contents.value//0x1000)*0x1000), 0x2000, 7)
assert ret == 0
ret = libc.mprotect(ctypes.c_ulong((ctypes.addressof(orig_func)//0x1000)*0x1000), 0x3000, 7)
assert ret == 0
libc.memcpy(orig_func, ioctl_address.contents, 0x1000)
libc.memcpy(ioctl_address.contents, ctypes.create_string_buffer(tramp), len(tramp))
return orig_func
# *** ioctl lib end ***
import extra.nv_gpu_driver.esc_ioctl as ESC
import extra.nv_gpu_driver.ctrl_ioctl as CTRL
import extra.nv_gpu_driver.class_ioctl as CLASS
nvescs = {getattr(ESC, x):x for x in dir(ESC) if x.startswith("NV_ESC")}
nvcmds = {getattr(CTRL, x):(x, getattr(CTRL, "struct_"+x+"_PARAMS", getattr(CTRL, "struct_"+x.replace("_CMD_", "_")+"_PARAMS", None))) for x in dir(CTRL) if \
x.startswith("NV") and x[6:].startswith("_CTRL_") and isinstance(getattr(CTRL, x), int)}
nvclasses = {getattr(CLASS, x):x for x in dir(CLASS) if isinstance(getattr(CLASS, x), int)}
import tinygrad.runtime.autogen.nv_gpu as nv_gpu
nvescs = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("NV_ESC")}
nvcmds = {getattr(nv_gpu, x):(x, getattr(nv_gpu, "struct_"+x+"_PARAMS", getattr(nv_gpu, "struct_"+x.replace("_CMD_", "_")+"_PARAMS", None))) for x in dir(nv_gpu) if \
x.startswith("NV") and x[6:].startswith("_CTRL_") and isinstance(getattr(nv_gpu, x), int)}
def get_classes():
  """Build a {class id: class name} table by scraping the autogenerated nv_gpu.py source text.

  Only the text between NV01_ROOT and the NV_SEMAPHORE_SURFACE macro line is scanned; every
  `NAME = (0x...)` assignment found there contributes one entry (a later duplicate id overrides an earlier one).
  """
  repo_root = pathlib.Path(__file__).parent.parent.parent
  autogen_src = (repo_root / "tinygrad/runtime/autogen/nv_gpu.py").read_text()
  class_region = re.search(r'NV01_ROOT.*?NV_SEMAPHORE_SURFACE = \(0x000000da\) # macro', autogen_src, re.DOTALL).group()
  classes = {}
  for name, num in re.findall(r'([0-9a-zA-Z_]*) = +\((0x[0-9a-fA-F]+)\)', class_region, re.MULTILINE):
    classes[int(num, base=16)] = name
  return classes
nvclasses = get_classes()
# value -> name for every UVM_* attribute of nv_gpu that has a matching <name>_PARAMS entry
nvuvms = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("UVM_") and nv_gpu.__dict__.get(x+"_PARAMS")}
# int value -> name for integer attributes whose name starts with NVC6C0_, NVC56F_ or NVC6B5_
# (names sharing a value overwrite each other; the last one in dir() order is kept)
nvqcmds = {int(getattr(nv_gpu, x)):x for x in dir(nv_gpu) if x[:7] in {"NVC6C0_", "NVC56F_", "NVC6B5_"} and isinstance(getattr(nv_gpu, x), int)}
# running count of hooked ioctl calls, incremented by the ioctl hook and printed as "#<n>:"
global_ioctl_id = 0
# hObjectNew handles recorded by the ioctl hook when a TURING_USERMODE_A object is allocated
gpus_user_modes = []
# declared global by the ioctl hook; nothing in this file appears to fill it -- TODO confirm it is still needed
gpus_mmio = []
# (gpFifoOffset, gpFifoEntries) pairs recorded by the ioctl hook for AMPERE_CHANNEL_GPFIFO_A allocations; read by _dump_gpfifo
gpus_fifo = []
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
def ioctl(fd, request, argp):
global global_ioctl_id, gpus_user_modes, gpus_mmio
global_ioctl_id += 1
st = time.perf_counter()
ret = libc.syscall(IOCTL_SYSCALL, ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp))
et = time.perf_counter()-st
fn = os.readlink(f"/proc/self/fd/{fd}")
#print(f"ioctl {request:8x} {fn:20s}")
idir, size, itype, nr = (request>>30), (request>>16)&0x3FFF, (request>>8)&0xFF, request&0xFF
if itype == ord(ESC.NV_IOCTL_MAGIC):
if nr == ESC.NV_ESC_RM_CONTROL:
s = get_struct(argp, ESC.NVOS54_PARAMETERS)
print(f"#{global_ioctl_id}: ", end="")
if itype == ord(nv_gpu.NV_IOCTL_MAGIC):
if nr == nv_gpu.NV_ESC_RM_CONTROL:
s = get_struct(argp, nv_gpu.NVOS54_PARAMETERS)
if s.cmd in nvcmds:
name, struc = nvcmds[s.cmd]
if struc is not None:
ss = get_struct(s.params, struc)
print("NV_ESC_RM_CONTROL ", name, format_struct(ss))
else:
print("NV_ESC_RM_CONTROL ", name)
print(f"NV_ESC_RM_CONTROL cmd={name:30s} hClient={s.hClient}, hObject={s.hObject}, flags={s.flags}, params={s.params}, paramsSize={s.paramsSize}, status={s.status}")
if struc is not None: dump_struct(get_struct(s.params, struc))
elif hasattr(nv_gpu, name+"_PARAMS"): dump_struct(get_struct(argp, getattr(nv_gpu, name+"_PARAMS")))
elif name == "NVA06C_CTRL_CMD_GPFIFO_SCHEDULE": dump_struct(get_struct(argp, nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS))
else:
print("unhandled cmd", hex(s.cmd))
#format_struct(s)
#print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s)))
elif nr == ESC.NV_ESC_RM_ALLOC:
s = get_struct(argp, ESC.NVOS21_PARAMETERS)
print(f"NV_ESC_RM_ALLOC class: {nvclasses[s.hClass]:30s}")
elif nr == ESC.NV_ESC_RM_MAP_MEMORY:
# format_struct(s)
# print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s)))
elif nr == nv_gpu.NV_ESC_RM_ALLOC:
s = get_struct(argp, nv_gpu.NVOS21_PARAMETERS)
print(f"NV_ESC_RM_ALLOC hClass={nvclasses.get(s.hClass, 'unk'):30s}, hRoot={s.hRoot}, hObjectParent={s.hObjectParent}, pAllocParms={s.pAllocParms}, hObjectNew={s.hObjectNew}")
if s.pAllocParms is not None:
if s.hClass == nv_gpu.NV01_DEVICE_0: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV0080_ALLOC_PARAMETERS))
if s.hClass == nv_gpu.FERMI_VASPACE_A: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS))
if s.hClass == nv_gpu.NV50_MEMORY_VIRTUAL: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
if s.hClass == nv_gpu.NV1_MEMORY_USER: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
if s.hClass == nv_gpu.NV1_MEMORY_SYSTEM: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
if s.hClass == nv_gpu.AMPERE_CHANNEL_GPFIFO_A:
sx = get_struct(s.pAllocParms, nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS)
dump_struct(sx)
gpus_fifo.append((sx.gpFifoOffset, sx.gpFifoEntries))
if s.hClass == nv_gpu.KEPLER_CHANNEL_GROUP_A: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS))
if s.hClass == nv_gpu.TURING_USERMODE_A: gpus_user_modes.append(s.hObjectNew)
elif nr == nv_gpu.NV_ESC_RM_MAP_MEMORY:
# nv_ioctl_nvos33_parameters_with_fd
s = get_struct(argp, ESC.NVOS33_PARAMETERS)
print(f"NV_ESC_RM_MAP_MEMORY {s.pLinearAddress:x}")
s = get_struct(argp, nv_gpu.NVOS33_PARAMETERS)
print(f"NV_ESC_RM_MAP_MEMORY hClient={s.hClient}, hDevice={s.hDevice}, hMemory={s.hMemory}, length={s.length} flags={s.flags} pLinearAddress={s.pLinearAddress}")
elif nr == nv_gpu.NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO:
s = get_struct(argp, nv_gpu.NVOS56_PARAMETERS)
print(f"NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO hClient={s.hClient}, hDevice={s.hDevice}, hMemory={s.hMemory}, pOldCpuAddress={s.pOldCpuAddress} pNewCpuAddress={s.pNewCpuAddress} status={s.status}")
elif nr == nv_gpu.NV_ESC_RM_ALLOC_MEMORY:
s = get_struct(argp, nv_gpu.nv_ioctl_nvos02_parameters_with_fd)
print(f"NV_ESC_RM_ALLOC_MEMORY fd={s.fd}, hRoot={s.params.hRoot}, hObjectParent={s.params.hObjectParent}, hObjectNew={s.params.hObjectNew}, hClass={s.params.hClass}, flags={s.params.flags}, pMemory={s.params.pMemory}, limit={s.params.limit}, status={s.params.status}")
elif nr == nv_gpu.NV_ESC_ALLOC_OS_EVENT:
s = get_struct(argp, nv_gpu.nv_ioctl_nvos02_parameters_with_fd)
elif nr == nv_gpu.NV_ESC_REGISTER_FD:
s = get_struct(argp, nv_gpu.nv_ioctl_register_fd_t)
print(f"NV_ESC_REGISTER_FD fd={s.ctl_fd}")
elif nr in nvescs:
print(nvescs[nr])
else:
print("unhandled NR", nr)
#print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else "")
elif fn.endswith("nvidia-uvm"):
print(f"{nvuvms.get(request, f'UVM UNKNOWN {request=}')}")
if nvuvms.get(request) is not None: dump_struct(get_struct(argp, getattr(nv_gpu, nvuvms.get(request)+"_PARAMS")))
if nvuvms.get(request) == "UVM_MAP_EXTERNAL_ALLOCATION":
st = get_struct(argp, getattr(nv_gpu, nvuvms.get(request)+"_PARAMS"))
for i in range(st.gpuAttributesCount):
print("perGpuAttributes[{i}] = ", end="")
dump_struct(st.perGpuAttributes[i])
print("ok")
if getenv("IOCTL") >= 2: print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", fn)
return ret
@ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
def _mmap(addr, length, prot, flags, fd, offset):
  """mmap replacement installed by install_hook: forwards to the saved original code and logs the call."""
  # orig_mmap_mv is the buffer returned by install_hook(libc.mmap, ...); it is called through an mmap-shaped prototype
  proto = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
  call_original = proto(ctypes.addressof(orig_mmap_mv))
  ret = call_original(addr, length, prot, flags, fd, offset)
  print(f"mmap {addr=}, {length=}, {prot=}, {flags=}, {fd=}, {offset=} {ret=}")
  return ret
install_hook(libc.ioctl, ioctl)
if getenv("IOCTL") >= 3: orig_mmap_mv = install_hook(libc.mmap, _mmap)
import collections
old_gpputs = collections.defaultdict(int)
# Print the GPFIFO ring entries that appeared since the previous call, for every ring recorded in gpus_fifo.
# `mark` is only echoed in the header line so that successive dumps can be told apart.
# NOTE(review): leading indentation is not preserved in this view of the file; confirm the loop nesting of the
# statements below against the real source before changing them.
def _dump_gpfifo(mark):
print("_dump_gpfifo:", mark)
for start,size in gpus_fifo:
# the control structure is read from the address just past `size` 8-byte ring entries
gpfifo_controls = nv_gpu.AmpereAControlGPFifo.from_address(start+size*8)
# view of the first GPPut ring entries as unsigned 64-bit words
gpfifo = to_mv(start, gpfifo_controls.GPPut * 8).cast("Q")
# skip rings whose GPPut equals the value remembered from the last dump
if old_gpputs[start] == gpfifo_controls.GPPut: continue
print(f"gpfifo {start}: {gpfifo_controls.GPPut=}")
# this range is empty when GPPut is lower than the remembered value (e.g. after GPPut wrapped around)
for i in range(old_gpputs[start], gpfifo_controls.GPPut):
# address: low 40 bits of the entry with the two lowest bits cleared
addr = ((gpfifo[i % size] & ((1 << 40)-1)) >> 2) << 2
# word count: the 20 bits starting at bit 42
pckt_cnt = (gpfifo[i % size]>>42)&((1 << 20)-1)
# "sync" is bit 63 and "fetch" is bit 0 of the entry
print(f"\t{i}: 0x{gpfifo[i % size]:x}: addr:0x{addr:x} packets:{pckt_cnt} sync:{(gpfifo[i % size] >> 63) & 0x1} fetch:{gpfifo[i % size] & 0x1}")
old_gpputs[start] = gpfifo_controls.GPPut
# decode the command words the entry points at
_dump_qmd(addr, pckt_cnt)
import types
# Decode and print the command stream of `packets` 32-bit words located at `address`.
# For method 792 the payload is additionally decoded field by field using the NVC6C0_QMDV03_00_* definitions.
# NOTE(review): leading indentation is not preserved in this view of the file; confirm the nesting of the
# statements below against the real source before changing them.
def _dump_qmd(address, packets):
gpfifo = to_mv(address, packets * 4).cast("I")
i = 0
while i < packets:
# header word layout as decoded here: typ = bits 31:28, size = bits 27:16, subc = bits 15:13,
# mthd = low 13 bits shifted left by 2
dat = gpfifo[i]
typ = (dat>>28) & 0xF
# a header with typ 0 ends the dump
if typ == 0: break
size = (dat>>16) & 0xFFF
subc = (dat>>13) & 7
mthd = (dat<<2) & 0x7FFF
method_name = nvqcmds.get(mthd, f"unknown method #{mthd}")
print(f"\t\t{method_name}, {typ=} {size=} {subc=} {mthd=}")
# print each of the `size` data words that follow the header, in decimal and hex
for j in range(size): print(f"\t\t\t{j}: {gpfifo[i+j+1]} | 0x{gpfifo[i+j+1]:x}")
# 792 == 0x318; presumably the inline-QMD method (two address words followed by the QMD) -- TODO confirm
if mthd == 792:
for x in dir(nv_gpu):
if x.startswith("NVC6C0_QMDV03_00_"):
vv = getattr(nv_gpu, x)
bits = None
# a 2-tuple is used directly as (high bit, low bit)
if isinstance(vv, tuple) and len(vv) == 2:
bits = vv
# a function is called with index 0 to obtain its bit range
if isinstance(vv, types.FunctionType):
bits = vv(0)
if bits is not None:
res = 0
# gather bits low..high one at a time; bit 0 of the structure is bit 0 of gpfifo[i + 3]
for bt in range(bits[1], bits[0]+1): res |= ((gpfifo[i + 3 + bt // 32] >> (bt % 32)) & 0x1) << (bt - bits[1])
# only non-zero fields are printed
if res != 0: print(f"{x}, {hex(res)} | {bin(res)}")
# computed only for the commented-out hexdump below
const_addr = gpfifo[i+35] + ((gpfifo[i+36] & 0xffff) << 32)
const_len = ((gpfifo[i+36] >> 19))
# hexdump(to_mv(const_addr, const_len))
# advance past the header word and its `size` data words
i += size + 1
# IOCTL=1 PTX=1 CUDA=1 python3 test/test_ops.py TestOps.test_tiny_add

View File

@@ -111,7 +111,7 @@ def beam_search(lin:Linearizer, rawbufs:List[Buffer], amt:int, allow_test_size=T
beam: List[Tuple[Linearizer, float]] = []
seen_libs = set()
default_parallel, min_progress_micros = 1 if lin.opts.device in {"CUDA", "HSA", "KFD"} else 0, getenv("BEAM_MIN_PROGRESS",0.01)
default_parallel, min_progress_micros = 1 if lin.opts.device in {"CUDA", "HSA", "KFD", "NV"} else 0, getenv("BEAM_MIN_PROGRESS",0.01)
if beam_pool is None and getenv("PARALLEL", default_parallel):
beam_pool = multiprocessing.get_context("spawn").Pool(multiprocessing.cpu_count(), _init_worker, (), getenv("BEAM_MAX_TASKS_PER_CHILD", 16))

541
tinygrad/runtime/ops_nv.py Normal file
View File

@@ -0,0 +1,541 @@
from __future__ import annotations
import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time
from typing import Tuple, List, Any
from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG
from tinygrad.renderer.cstyle import CUDARenderer
from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes
import tinygrad.runtime.autogen.cuda as cuda
import tinygrad.runtime.autogen.nv_gpu as nv_gpu
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
# C library handle plus explicit prototypes, so pointers and sizes are passed at full 64-bit width
libc = ctypes.CDLL("libc.so.6")
# void *memset(void *s, int c, size_t n): the length is a size_t; declaring it as c_int truncated lengths of 2 GiB or more
libc.memset.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_size_t]
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
# without an explicit restype ctypes would return a C int and cut the mapped address to 32 bits
libc.mmap.restype = ctypes.c_void_p
libc.munmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
libc.munmap.restype = ctypes.c_int
# 0x800 bytes; kernel launches place their arguments QMD_SIZE bytes into each kernargs slot
QMD_SIZE = (8 << 8)
def nv_iowr(fd, nr, args):
  """Issue a read/write ioctl with magic 'F' and command number `nr` on `fd`, passing the ctypes object `args`.

  Raises RuntimeError when the ioctl returns a non-zero value; `args` is updated in place by the driver.
  """
  # request layout: direction (read|write = 3) in bits 31:30, argument size in bits 29:16 (14 bits, the same
  # width the ioctl tracer in extra/nv_gpu_driver decodes with 0x3FFF), magic in bits 15:8, command in bits 7:0
  request = (3 << 30) | (ctypes.sizeof(args) & 0x3FFF) << 16 | (ord('F') & 0xFF) << 8 | (nr & 0xFF)
  ret = fcntl.ioctl(fd, request, args)
  if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
def rm_alloc(fd, clss, root, parant, params):
  """Allocate an object of class `clss` under parent `parant` via NV_ESC_RM_ALLOC and return the filled parameter struct.

  `params` is the class-specific allocation struct (or None). Raises RuntimeError when the returned status is non-zero.
  """
  alloc_parms = None
  if params is not None: alloc_parms = ctypes.cast(ctypes.byref(params), ctypes.POINTER(None))
  made = nv_gpu.NVOS21_PARAMETERS(hRoot=root, hObjectParent=parant, hClass=clss, pAllocParms=alloc_parms) # type: ignore
  nv_iowr(fd, nv_gpu.NV_ESC_RM_ALLOC, made)
  if made.status != 0: raise RuntimeError(f"rm_alloc returned {made.status}")
  return made
def rm_control(fd, cmd, client, obj, params):
  """Send control command `cmd` to object `obj` of `client` via NV_ESC_RM_CONTROL and return the filled parameter struct.

  `params` is the command-specific struct, or None for commands without parameters (a null pointer and
  size 0 are passed in that case). Raises RuntimeError when the returned status is non-zero.
  """
  has_params = params is not None
  # sizeof() must not be evaluated for None: it would raise TypeError before the ioctl is even attempted
  made = nv_gpu.NVOS54_PARAMETERS(hClient=client, hObject=obj, cmd=cmd, paramsSize=ctypes.sizeof(params) if has_params else 0,
    params=ctypes.cast(ctypes.byref(params), ctypes.POINTER(None)) if has_params else None) # type: ignore
  nv_iowr(fd, nv_gpu.NV_ESC_RM_CONTROL, made)
  if made.status != 0: raise RuntimeError(f"rm_control returned {made.status}")
  return made
def uvm_ioctl(cmd, sttyp, fd, **kwargs):
  """Build a `sttyp` struct from `kwargs`, pass it to ioctl `cmd` on `fd` and return the struct.

  Raises RuntimeError when either the ioctl return value or the struct's rmStatus field is non-zero.
  """
  made = sttyp(**kwargs)
  ret = fcntl.ioctl(fd, cmd, made)
  if ret != 0: raise RuntimeError(f"uvm_ioctl returned {ret}")
  if made.rmStatus != 0: raise RuntimeError(f"uvm_ioctl struct returned {made.rmStatus}")
  return made
def make_uvm_type():
  """Create the NVUVM helper class: one callable attribute per UVM_* command that has a <name>_PARAMS struct.

  The attribute name is the command name with "UVM_" removed and lower-cased; calling it as
  attr(fd, **fields) runs uvm_ioctl with that command value and parameter struct.
  """
  fxns = {}
  for name, dt in nv_gpu.__dict__.items():
    if not name.startswith("UVM_"): continue
    if not nv_gpu.__dict__.get(name+"_PARAMS"): continue
    fxns[name.replace("UVM_", "").lower()] = functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
  return type("NVUVM", (object, ), fxns)
uvm = make_uvm_type()
def make_qmd_struct_type():
  """Build a ctypes bitfield struct from the NVC6C0_QMDV03_00* bit-range definitions in nv_gpu.

  Tuple definitions are used as (high bit, low bit); callable definitions are expanded for indices 0..7 with
  an "_<index>" suffix. Fields are laid out by ascending low bit and unnamed gaps become "_reserved<i>" fields.
  """
  qmd_defs = [(name, dt) for name, dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00")]
  ranges = [(name, dt) for name, dt in qmd_defs if isinstance(dt, tuple)]
  for name, dt in qmd_defs:
    if callable(dt): ranges.extend((name+f"_{i}", dt(i)) for i in range(8))
  # stable sort on the low bit of each range
  ranges.sort(key=lambda entry: entry[1][1])
  fields = []
  for i, (name, data) in enumerate(ranges):
    if i > 0:
      # bits between the previous field's high bit and this field's low bit
      gap = data[1] - ranges[i-1][1][0] - 1
      if gap != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
    fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
  return init_c_struct_t(tuple(fields))
qmd_struct_t = make_qmd_struct_type()
assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
def nvmethod(subc, mthd, size, typ=2):
  """Pack a command header word: typ in bits 31:28, size from bit 16, subc from bit 13, mthd >> 2 in the low bits."""
  header = typ << 28
  header |= size << 16
  header |= subc << 13
  header |= mthd >> 2
  return header
def nvdata64(data):
  """Split `data` into (upper, lower): everything above bit 31 first, then the low 32 bits."""
  upper = data >> 32
  lower = data & 0xFFFFFFFF
  return (upper, lower)
def nvdata64_le(data):
  """Split `data` into (lower, upper): the low 32 bits first, then everything above bit 31."""
  lower = data & 0xFFFFFFFF
  upper = data >> 32
  return (lower, upper)
class NVCompiler(Compiler):
  """Compiler for the NV backend: renders kernels with CUDARenderer and compiles them to a cubin with NVRTC."""
  compiler_opts = CompilerOptions("NV", global_max=[65535, 65535, 2147483647], local_max=[64, 1024, 1024], shared_max=49152)

  def __init__(self, arch:str):
    """`arch` is passed to NVRTC as --gpu-architecture; the text after its first three characters must parse as an int."""
    self.arch = arch
    # note: this replaces the class-level options, so it affects every NVCompiler instance
    NVCompiler.compiler_opts = NVCompiler.compiler_opts._replace(has_tensor_cores=int(arch[3:]) >= 80)
    nvrtc_major, nvrtc_minor = ctypes.c_int(), ctypes.c_int()
    cuda_check(cuda.nvrtcVersion(nvrtc_major, nvrtc_minor))
    self.compile_options = [f'--gpu-architecture={arch}', "-I/usr/local/cuda/include", "-I/usr/include", "-I/opt/cuda/include/"]
    # --minimal is only added for NVRTC 12.4 and newer
    if (nvrtc_major.value, nvrtc_minor.value) >= (12, 4): self.compile_options.append("--minimal")
    super().__init__(f"compile_nv_{self.arch}")

  def render(self, name:str, uops) -> str: return CUDARenderer(name, uops)

  def compile(self, src:str) -> bytes:
    """Compile CUDA source `src` and return the cubin bytes; raises RuntimeError carrying the NVRTC log on failure."""
    prog = cuda.nvrtcProgram()
    cuda_check(cuda.nvrtcCreateProgram(ctypes.byref(prog), src.encode(), "<null>".encode(), 0, None, None))
    encoded_opts = [o.encode() for o in self.compile_options]
    status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p(encoded_opts))
    if status != 0:
      raise RuntimeError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, cuda_check).decode()}")
    return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)
class HWComputeQueue:
  """Builder for a command stream that is submitted on a device's compute GPFIFO.

  Every method except `submit` appends 32-bit words to `self.q` and returns `self` so calls can be chained.
  """
  def __init__(self): self.q = []

  def copy_from_cpu(self, gpuaddr, data):
    """Queue an inline upload of `data` (a sequence of 32-bit words) to GPU address `gpuaddr`."""
    self.q += [nvmethod(1, nv_gpu.NVC6C0_OFFSET_OUT_UPPER, 2), *nvdata64(gpuaddr)]
    self.q += [nvmethod(1, nv_gpu.NVC6C0_LINE_LENGTH_IN, 2), len(data)*4, 0x1]  # length in bytes, then 0x1
    self.q += [nvmethod(1, nv_gpu.NVC6C0_LAUNCH_DMA, 1), 0x41]
    self.q += [nvmethod(1, nv_gpu.NVC6C0_LOAD_INLINE_DATA, len(data), typ=6), *data]
    return self

  def exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), completion_signal=None):
    """Queue a launch of `prg`; its QMD is updated in place and sent inline to address `kernargs`.

    Constant buffer 0 is pointed at `kernargs + QMD_SIZE`. When `completion_signal` is given, a release of
    that signal is queued right after the launch.
    """
    prg.qmd.cta_raster_width, prg.qmd.cta_raster_height, prg.qmd.cta_raster_depth = global_size
    prg.qmd.cta_thread_dimension0, prg.qmd.cta_thread_dimension1, prg.qmd.cta_thread_dimension2 = local_size
    prg.qmd.constant_buffer_addr_lower_0 = (kernargs + QMD_SIZE) & 0xffffffff
    prg.qmd.constant_buffer_addr_upper_0 = (kernargs + QMD_SIZE) >> 32
    self.q += [nvmethod(1, nv_gpu.NVC6C0_INVALIDATE_SHADER_CACHES_NO_WFI, 1), (1 << 12) | (1 << 4) | (1 << 0)]
    # 0x42 data words follow: two address words plus the QMD itself
    self.q += [nvmethod(1, nv_gpu.NVC6C0_SET_INLINE_QMD_ADDRESS_A, 0x42), *nvdata64(kernargs >> 8)]
    self.q += list(to_mv(ctypes.addressof(prg.qmd), ctypes.sizeof(prg.qmd)).cast("I"))
    if completion_signal is not None: self.signal(completion_signal)
    return self

  def wait(self, signal, value=0):
    """Queue a semaphore acquire on `signal` with the 64-bit payload `value`."""
    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal).contents)), *nvdata64_le(value),
               (3 << 0) | (1 << 12) | (1 << 24)] # ACQUIRE | ACQUIRE_SWITCH_TSG | PAYLOAD_SIZE_64BIT
    return self

  def signal(self, signal, value=0, timestamp=False):
    """Queue a semaphore release writing `value` to `signal` (optionally with a timestamp), then a non-stall interrupt."""
    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal).contents)), *nvdata64_le(value),
               (1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
    self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
    return self

  def submit(self, dev:NVDevice):
    """Append the compute progress signal, copy the queue into `dev.cmdq` and publish one GPFIFO entry for it."""
    self.signal(dev.compute_progress_signal, dev.compute_put_value + 1)
    # checked after the progress signal is appended: this is the word count that is written into the ring entry
    assert len(self.q) < (1 << 21)
    for i,packet in enumerate(self.q): dev.cmdq[dev.cmdq_wptr//4 + i] = packet
    fifo_entry = dev.compute_put_value % dev.compute_gpfifo_entries
    # ring entry: command address with the two lowest bits cleared, word count from bit 42, plus bit 41
    dev.compute_gpu_ring[fifo_entry] = ((dev.cmdq_page.base+dev.cmdq_wptr)//4 << 2) | (len(self.q) << 42) | (1 << 41)
    dev.compute_gpu_ring_controls.GPPut = (dev.compute_put_value + 1) % dev.compute_gpfifo_entries
    dev.compute_put_value += 1
    # write the channel token to the MMIO word at offset 0x90 (presumably the doorbell -- confirm)
    dev.gpu_mmio[0x90 // 4] = dev.compute_gpfifo_token
    dev.cmdq_wptr += len(self.q) * 4
class HWCopyQueue:
  """Builder for a command stream that is submitted on a device's DMA (copy) GPFIFO.

  Every method except `submit` appends 32-bit words to `self.q` and returns `self` so calls can be chained.
  """
  def __init__(self): self.q = []

  def copy(self, dest, src, copy_size):
    """Queue a copy of `copy_size` bytes from GPU address `src` to GPU address `dest`."""
    self.q += [nvmethod(4, nv_gpu.NVC6B5_OFFSET_IN_UPPER, 4), *nvdata64(src), *nvdata64(dest)]
    self.q += [nvmethod(4, nv_gpu.NVC6B5_LINE_LENGTH_IN, 1), copy_size]
    self.q += [nvmethod(4, nv_gpu.NVC6B5_LAUNCH_DMA, 1), 0x182] # TRANSFER_TYPE_NON_PIPELINED | DST_MEMORY_LAYOUT_PITCH | SRC_MEMORY_LAYOUT_PITCH
    return self

  def wait(self, signal, value=0):
    """Queue a semaphore acquire on `signal` with the 64-bit payload `value`."""
    # the payload is split into two 32-bit words like in every other semaphore method, so values >= 2**32 stay valid
    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal).contents)), *nvdata64_le(value),
               (3 << 0) | (1 << 12) | (1 << 24)] # ACQUIRE | ACQUIRE_SWITCH_TSG | PAYLOAD_SIZE_64BIT
    return self

  def signal(self, signal, value=0, timestamp=False):
    """Queue a semaphore release writing `value` to `signal` (optionally with a timestamp), then a non-stall interrupt."""
    self.q += [nvmethod(0, nv_gpu.NVC56F_SEM_ADDR_LO, 5), *nvdata64_le(ctypes.addressof(from_mv(signal).contents)), *nvdata64_le(value),
               (1 << 0) | (1 << 20) | (1 << 24) | ((1 << 25) if timestamp else 0)] # RELEASE | RELEASE_WFI | PAYLOAD_SIZE_64BIT | RELEASE_TIMESTAMP
    self.q += [nvmethod(0, nv_gpu.NVC56F_NON_STALL_INTERRUPT, 1), 0x0]
    return self

  def submit(self, dev:NVDevice):
    """Append the DMA progress signal, copy the queue into `dev.cmdq` and publish one GPFIFO entry for it."""
    self.signal(dev.dma_progress_signal, dev.dma_put_value + 1)
    # same bound the compute queue enforces: the word count must fit the length field of the ring entry
    assert len(self.q) < (1 << 21)
    for i,packet in enumerate(self.q): dev.cmdq[dev.cmdq_wptr//4 + i] = packet
    fifo_entry = dev.dma_put_value % dev.dma_gpfifo_entries
    # ring entry: command address with the two lowest bits cleared, word count from bit 42
    dev.dma_gpu_ring[fifo_entry] = ((dev.cmdq_page.base+dev.cmdq_wptr)//4 << 2) | (len(self.q) << 42)
    dev.dma_gpu_ring_controls.GPPut = (dev.dma_put_value + 1) % dev.dma_gpfifo_entries
    dev.dma_put_value += 1
    # write the channel token to the MMIO word at offset 0x90 (presumably the doorbell -- confirm)
    dev.gpu_mmio[0x90 // 4] = dev.dma_gpfifo_token
    dev.cmdq_wptr += len(self.q) * 4
# ELF section-header constants used when parsing the cubin: SHT_* are compared with sh_type, SHF_* are bits of sh_flags
SHT_PROGBITS, SHT_NOBITS, SHF_ALLOC, SHF_EXECINSTR = 0x1, 0x8, 0x2, 0x4
class NVProgram:
  """A compiled kernel: parses the cubin ELF, uploads code and constant buffers to the GPU and prepares the launch QMD."""

  @staticmethod
  def _section_words(lib, offset, size):
    # private copy of one ELF section, viewed as 32-bit words
    return memoryview(bytearray(lib[offset:offset+size])).cast("I")

  def __init__(self, device:NVDevice, name:str, lib:bytes):
    self.device, self.name, self.lib = device, name, lib
    if DEBUG >= 6:
      # best effort: dump the cubin to a temp file and print its disassembly
      try:
        fn = (pathlib.Path(tempfile.gettempdir()) / f"tinycuda_{hashlib.md5(lib).hexdigest()}").as_posix()
        with open(fn + ".cubin", "wb") as f: f.write(lib)
        print(subprocess.check_output(["nvdisasm", fn+".cubin"]).decode('utf-8'))
      except Exception as e: print("failed to disasm cubin", str(e))

    # ELF header fields starting at offset 0x20, then one tuple per section header
    _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
    sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
    strtab_hdr = sections[_shstrndx]
    shstrtab = memoryview(bytearray(self.lib[strtab_hdr[4]:strtab_hdr[4]+strtab_hdr[5]]))

    self.shmem_usage = 0
    constant_buffers_data = {}
    for sh_name, sh_type, sh_flags, _, sh_offset, sh_size, _, sh_info, _ in sections:
      section_name = shstrtab[sh_name:].tobytes().split(b'\0', 1)[0].decode('utf-8')
      if sh_type == SHT_NOBITS and sh_flags & SHF_ALLOC: self.shmem_usage = sh_size
      elif sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC and sh_flags & SHF_EXECINSTR:
        self.program = self._section_words(self.lib, sh_offset, sh_size)
        self.registers_usage = sh_info >> 24
      if match := re.match(r'\.nv\.constant(\d+)', section_name):
        constant_buffers_data[int(match.group(1))] = self._section_words(self.lib, sh_offset, sh_size)
      if section_name == ".nv.info":
        section_data = self._section_words(self.lib, sh_offset, sh_size)
        # records of three words each; a 0x1204 record whose third word + 0x240 exceeds the per-thread budget is rejected
        for rec in range(sh_size // 12):
          if section_data[rec * 3 + 0] & 0xffff == 0x1204 and section_data[rec * 3 + 2] + 0x240 > self.device.slm_per_thread:
            raise RuntimeError("too high local memory")

    # Load program and constant buffers (if any)
    const_bytes = sum(round_up(x.nbytes, 128) for x in constant_buffers_data.values())
    self.lib_sz = round_up(round_up(self.program.nbytes, 128) + const_bytes, 0x1000)
    self.lib_gpu = self.device.allocator.alloc(self.lib_sz)
    # upload the code in chunks of 4096 words
    for st in range(0, len(self.program), 4096):
      HWComputeQueue().copy_from_cpu(self.lib_gpu.base+st*4, self.program[st:st+4096]).submit(self.device)

    self.constbuffer_0 = [0] * 88
    self.constbuffer_0[6:12] = [*nvdata64_le(self.device.shared_mem_window), *nvdata64_le(self.device.local_mem_window), *nvdata64_le(0xfffdc0)]

    smem_config = min(shmem_conf * 1024 for shmem_conf in [8, 16, 32, 64, 96] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
    self.qmd = qmd_struct_t(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
                            invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
                            cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3,
                            shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config,
                            max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config,
                            barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=0x10, sass_version=0x89,
                            program_address_lower=self.lib_gpu.base&0xffffffff, program_address_upper=self.lib_gpu.base>>32,
                            program_prefetch_addr_lower_shifted=self.lib_gpu.base>>8, program_prefetch_addr_upper_shifted=self.lib_gpu.base>>40,
                            constant_buffer_size_shifted4_0=0x190, constant_buffer_valid_0=1, constant_buffer_invalidate_0=1)

    # constant buffer 0 is filled for each program, no need to copy it from elf (it's just zeroes)
    constant_buffers_data.pop(0, None)

    # remaining constant buffers are placed after the code, each at a 128-byte aligned offset
    off = round_up(self.program.nbytes, 128)
    for i,data in constant_buffers_data.items():
      buf_addr = self.lib_gpu.base + off
      setattr(self.qmd, f'constant_buffer_addr_upper_{i}', buf_addr >> 32)
      setattr(self.qmd, f'constant_buffer_addr_lower_{i}', buf_addr & 0xffffffff)
      setattr(self.qmd, f'constant_buffer_size_shifted4_{i}', data.nbytes)
      setattr(self.qmd, f'constant_buffer_valid_{i}', 1)
      HWComputeQueue().copy_from_cpu(buf_addr, data).submit(self.device)
      off += round_up(data.nbytes, 128)

    self.device.synchronize()

  def __del__(self):
    # lib_gpu is missing when __init__ failed before the allocation
    if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_sz)

  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
    """Launch the kernel with buffer arguments `args` and integer `vals`.

    With wait=True the call blocks until the compute queue has progressed past this launch and returns the
    difference of the two timestamp signals divided by 1e9; otherwise it returns None.
    """
    kernargs_size = round_up(QMD_SIZE + 0x160 + len(args) * 8 + len(vals) * 4, 1 << 8)
    # restart from the beginning of the kernargs page when this launch would not fit
    if self.device.kernargs_ptr >= (self.device.kernargs_page.base + self.device.kernargs_page.length - kernargs_size):
      self.device.kernargs_ptr = self.device.kernargs_page.base

    # buffer addresses as (low, high) word pairs, followed by the integer values
    kernargs = []
    for arg in args: kernargs.extend(nvdata64_le(arg.base))
    kernargs.extend(vals)
    kernargs_ptr = self.device.kernargs_ptr
    self.device.kernargs_ptr += kernargs_size

    if wait: st, en = self.device._get_signal(), self.device._get_signal()
    queue = HWComputeQueue()
    queue.wait(self.device.dma_progress_signal, self.device.dma_put_value)
    queue.wait(self.device.compute_progress_signal, self.device.compute_put_value)
    if wait: queue.signal(st, timestamp=True)
    queue.copy_from_cpu(kernargs_ptr + QMD_SIZE, self.constbuffer_0 + kernargs)
    queue.exec(self, kernargs_ptr, global_size, local_size)
    if wait: queue.signal(en, timestamp=True)
    queue.submit(self.device)
    if wait:
      self.device._wait_signal(self.device.compute_progress_signal, self.device.compute_put_value)
      return (en[1]-st[1]) / 1e9
class NVAllocator(LRUAllocator):
  """Buffer allocator for NVDevice: device or host allocations plus copies done through HWCopyQueue."""
  def __init__(self, device:NVDevice):
    self.device = device
    super().__init__()

  def _alloc(self, size:int, options:BufferOptions):
    # host buffers come from the host allocator; everything else is GPU memory mapped to all GPUs
    if options.host: return self.device._gpu_host_alloc(size)
    return self.device._gpu_alloc(size, map_to_all_gpus=True)

  def _free(self, gpumem, options:BufferOptions):
    # synchronize before releasing the memory
    NVDevice.synchronize_system()
    if options.host: self.device._gpu_host_free(gpumem)
    else: self.device._gpu_free(gpumem)

  def copyin(self, dest, src: memoryview):
    """Copy `src` into the device buffer `dest` through a temporary host buffer; returns after the device synchronized."""
    nbytes = src.nbytes
    host_mem = self.alloc(nbytes, BufferOptions(host=True))
    # the staging buffer is recorded on the device's pending_copyin list
    self.device.pending_copyin.append((host_mem, nbytes, BufferOptions(host=True)))
    ctypes.memmove(host_mem.base, from_mv(src), nbytes)
    HWCopyQueue().copy(dest.base, host_mem.base, nbytes).submit(self.device)
    self.device.synchronize()

  def copyout(self, dest:memoryview, src):
    """Copy the device buffer `src` into `dest` through a temporary host buffer."""
    NVDevice.synchronize_system()
    nbytes = dest.nbytes
    host_mem = self.alloc(nbytes, BufferOptions(host=True))
    self.device.pending_copyin.append((host_mem, nbytes, BufferOptions(host=True)))
    HWCopyQueue().copy(host_mem.base, src.base, nbytes).submit(self.device)
    self.device.synchronize()
    ctypes.memmove(from_mv(dest), host_mem.base, nbytes)

  def transfer(self, dest, src, sz:int, src_dev=None, dest_dev=None):
    """Copy `sz` bytes from `src` to `dest` using the copy queue of `src_dev`, then synchronize `dest_dev`."""
    queue = HWCopyQueue()
    # wait for the current DMA and compute progress of the source device first, then of the destination device
    for dev in (src_dev, dest_dev):
      queue.wait(dev.dma_progress_signal, dev.dma_put_value)
      queue.wait(dev.compute_progress_signal, dev.compute_put_value)
    queue.copy(dest.base, src.base, sz).submit(src_dev)
    # make the destination's copy queue wait for the copy that was just submitted on the source
    HWCopyQueue().wait(src_dev.dma_progress_signal, src_dev.dma_put_value).submit(dest_dev)
    dest_dev.synchronize()
# mmap(2) flag bits for Linux x86_64/aarch64 (asm-generic values); MAP_NORESERVE is 0x4000 there, 0x400 is not a defined flag
MAP_FIXED, MAP_NORESERVE = 0x10, 0x4000
class NVDevice(Compiled):
root = None
fd_ctl: int = -1
fd_uvm: int = -1
gpus_info = None
signals_page:Any = None
signal_number: int = 32
uvm_vaddr: int = 0x1000000000
host_object_enumerator: int = 0x1000
devices: List[NVDevice] = []
def _new_gpu_fd(self):
fd_dev = os.open(f"/dev/nvidia{self.device_id}", os.O_RDWR | os.O_CLOEXEC)
nv_iowr(fd_dev, nv_gpu.NV_ESC_REGISTER_FD, nv_gpu.nv_ioctl_register_fd_t(ctl_fd=self.fd_ctl))
return fd_dev
def _gpu_map_to_cpu(self, memory_handle, size, target=None, flags=0, system=False):
fd_dev = self._new_gpu_fd() if not system else os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
made = nv_gpu.nv_ioctl_nvos33_parameters_with_fd(fd=fd_dev,
params=nv_gpu.NVOS33_PARAMETERS(hClient=self.root, hDevice=self.device, hMemory=memory_handle, length=size, flags=flags))
nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY, made)
if made.params.status != 0: raise RuntimeError(f"_gpu_map_to_cpu returned {made.params.status}")
return libc.mmap(target, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if target is not None else 0), fd_dev, 0)
def _gpu_alloc(self, size:int, contig=False, huge_page=False, va_addr=None, map_to_cpu=False, map_to_all_gpus=False, map_flags=0):
size = round_up(size, align:=((4 << 10) if huge_page else (2 << 20))) # TODO: need hugepage option, any speedup?
alloc_params = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, alignment=align, offset=0, limit=size-1, format=6, size=size,
attr=(((nv_gpu.NVOS32_ATTR_PAGE_SIZE_HUGE << 23) if huge_page else 0) |
((nv_gpu.NVOS32_ATTR_PHYSICALITY_CONTIGUOUS if contig else nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS) << 27)),
attr2=((nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_YES << 2) |
((nv_gpu.NVOS32_ATTR2_PAGE_SIZE_HUGE_2MB << 20) if huge_page else 0)),
flags=(nv_gpu.NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE | nv_gpu.NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM | nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED |
nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED))
mem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_USER, self.root, self.device, alloc_params).hObjectNew
if va_addr is None: va_addr = self._alloc_gpu_vaddr(size, alignment=align)
if map_to_cpu: va_addr = self._gpu_map_to_cpu(mem_handle, size, target=va_addr, flags=map_flags)
handle = self._gpu_uvm_map(va_addr, size, mem_handle)
if map_to_all_gpus:
for dev in NVDevice.devices:
if dev != self: dev._gpu_uvm_map(handle.base, handle.length, handle.hMemory, create_range=False)
return handle
def _gpu_system_alloc(self, size:int, va_addr=None, map_to_cpu=False, map_flags=0):
  """Allocate an NV1_MEMORY_SYSTEM object through RM and map it into this device's UVM address space.

  Args:
    size: allocation size in bytes (passed to the driver unchanged; alignment is 4KB).
    va_addr: virtual address to map at; when None a fresh range is taken from `_alloc_gpu_vaddr`.
    map_to_cpu: also map the allocation into the process through `_gpu_map_to_cpu` (with `system=True`).
    map_flags: flags forwarded to `_gpu_map_to_cpu`.

  Returns:
    The value returned by `_gpu_uvm_map`.
  """
  # Attribute words: non-contiguous, PCI location, no ZBC, not GPU-cacheable.
  placement = (nv_gpu.NVOS32_ATTR_PHYSICALITY_ALLOW_NONCONTIGUOUS << 27) | (nv_gpu.NVOS32_ATTR_LOCATION_PCI << 25)
  caching = (nv_gpu.NVOS32_ATTR2_ZBC_PREFER_NO_ZBC << 0) | (nv_gpu.NVOS32_ATTR2_GPU_CACHEABLE_NO << 2)
  alloc_flags = (nv_gpu.NVOS32_ALLOC_FLAGS_IGNORE_BANK_PLACEMENT | nv_gpu.NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED |
                 nv_gpu.NVOS32_ALLOC_FLAGS_MAP_NOT_REQUIRED)
  request = nv_gpu.NV_MEMORY_ALLOCATION_PARAMS(owner=self.root, type=13, attr=placement, attr2=caching, flags=alloc_flags,
                                               format=6, size=size, alignment=(4<<10), offset=0, limit=size-1)
  sysmem_handle = rm_alloc(self.fd_ctl, nv_gpu.NV1_MEMORY_SYSTEM, self.root, self.device, request).hObjectNew

  if va_addr is None:
    va_addr = self._alloc_gpu_vaddr(size)
  if map_to_cpu:
    va_addr = self._gpu_map_to_cpu(sysmem_handle, size, target=va_addr, flags=map_flags, system=True)
  return self._gpu_uvm_map(va_addr, size, sysmem_handle)
def _gpu_host_alloc(self, size):
  """Create anonymous host memory at a freshly reserved virtual address and map it to the GPU.

  `size` is rounded up to a 4KB multiple. The host mapping is placed with MAP_FIXED at the address taken from
  `_alloc_gpu_vaddr`, and that same address is then handed to `_map_to_gpu`, whose result is returned.
  """
  padded = round_up(size, 4 << 10)
  host_va = self._alloc_gpu_vaddr(padded)
  protection = mmap.PROT_READ|mmap.PROT_WRITE
  placement = MAP_FIXED|mmap.MAP_SHARED|mmap.MAP_ANONYMOUS
  libc.mmap(host_va, padded, protection, placement, -1, 0)
  return self._map_to_gpu(host_va, padded)
def _gpu_free(self, mem):
  """Release an allocation: free its RM memory object, then drop its UVM range.

  `mem` must expose `hMemory`, `base` and `length` (as the values returned by `_gpu_uvm_map` do).

  Raises:
    RuntimeError: if the NV_ESC_RM_FREE ioctl reports a non-zero status; the UVM range is not freed in that case.
  """
  release = nv_gpu.NVOS00_PARAMETERS(hRoot=self.root, hObjectParent=self.device, hObjectOld=mem.hMemory)
  nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_FREE, release)
  status = release.status
  if status != 0:
    raise RuntimeError(f"_gpu_free returned {status}")
  uvm.free(self.fd_uvm, base=mem.base, length=mem.length)
def _gpu_host_free(self, mem):
  """Undo `_gpu_host_alloc`: drop the UVM range first, then unmap the host memory at the same address."""
  start, nbytes = mem.base, mem.length
  uvm.free(self.fd_uvm, base=start, length=nbytes)
  libc.munmap(start, nbytes)
def _map_to_gpu(self, va_base, size):
  """Register existing host memory with RM as an OS-descriptor memory object and map it through UVM.

  Args:
    va_base: host virtual address of the memory; the GPU mapping is created at the same address.
    size: length of the region in bytes.

  Returns:
    The value returned by `_gpu_uvm_map` for the new memory object.

  Raises:
    RuntimeError: if the NV_ESC_RM_ALLOC_MEMORY ioctl reports a non-zero status.
  """
  # Handles for host objects are chosen by us: take the next value of the class-wide counter.
  NVDevice.host_object_enumerator += 1
  new_handle = NVDevice.host_object_enumerator

  descriptor_flags = ((nv_gpu.NVOS02_FLAGS_PHYSICALITY_NONCONTIGUOUS << 4) | (nv_gpu.NVOS02_FLAGS_COHERENCY_CACHED << 12) |
                      (nv_gpu.NVOS02_FLAGS_MAPPING_NO_MAP << 30))
  inner = nv_gpu.NVOS02_PARAMETERS(hRoot=self.root, hObjectParent=self.device, flags=descriptor_flags, hObjectNew=new_handle,
                                   hClass=nv_gpu.NV01_MEMORY_SYSTEM_OS_DESCRIPTOR, pMemory=va_base, limit=size-1)
  request = nv_gpu.nv_ioctl_nvos02_parameters_with_fd(params=inner, fd=-1)
  nv_iowr(self.fd_dev, nv_gpu.NV_ESC_RM_ALLOC_MEMORY, request)

  if request.params.status != 0:
    raise RuntimeError(f"_map_to_gpu returned {request.params.status}")
  return self._gpu_uvm_map(va_base, size, request.params.hObjectNew)
def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True) -> nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS:
  """Map the RM memory object `mem_handle` at `[va_base, va_base + size)` for this GPU through UVM.

  Args:
    va_base: virtual address of the mapping.
    size: length of the mapping in bytes.
    mem_handle: RM handle of the memory object to map.
    create_range: create the UVM external range first; pass False when the range already exists
      (e.g. another device has already mapped the same allocation).

  Returns:
    The result of `uvm.map_external_allocation`.
  """
  if create_range:
    uvm.create_external_range(self.fd_uvm, base=va_base, length=size)

  # The attribute table has 256 slots; only the first one (this GPU) is filled and counted.
  this_gpu = nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuMappingType = 1)
  attr_table = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(this_gpu)
  return uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl, hClient=self.root,
                                     hMemory=mem_handle, gpuAttributesCount=1, perGpuAttributes=attr_table)
def _alloc_gpu_vaddr(self, size, alignment=(4 << 10)):
  """Reserve `size` bytes of virtual address space from the class-wide bump pointer.

  The pointer `NVDevice.uvm_vaddr` is first rounded up to `alignment`; the aligned address is returned and the
  pointer is advanced past the reservation. Addresses are never handed back to this allocator.
  """
  aligned_start = round_up(NVDevice.uvm_vaddr, alignment)
  NVDevice.uvm_vaddr = aligned_start + size
  return aligned_start
def __init__(self, device:str=""):
  """Open the NVIDIA driver, bring up one GPU and create its compute and DMA GPFIFO channels.

  The first instance created also performs the process-wide setup that is stored on the class: the control and UVM
  file descriptors, the RM root client and the card info table.

  Args:
    device: device string; when it contains a colon, the integer after the first colon selects the GPU index,
      otherwise index 0 is used.
  """
  # One-time setup shared by every NVDevice through class attributes.
  if NVDevice.root is None:
    NVDevice.fd_ctl = os.open("/dev/nvidiactl", os.O_RDWR | os.O_CLOEXEC)
    NVDevice.fd_uvm = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
    # NOTE(review): fd_uvm_2 is only passed to uvm.mm_initialize and never closed here — presumably it has to stay open; confirm.
    fd_uvm_2 = os.open("/dev/nvidia-uvm", os.O_RDWR | os.O_CLOEXEC)
    NVDevice.root = rm_alloc(self.fd_ctl, nv_gpu.NV01_ROOT_CLIENT, 0, 0, None).hObjectNew
    uvm.initialize(self.fd_uvm)
    uvm.mm_initialize(fd_uvm_2, uvmFd=self.fd_uvm)

    # Ask the driver for the card table (room for 16 entries).
    NVDevice.gpus_info = (nv_gpu.nv_ioctl_card_info_t*16)()
    nv_iowr(NVDevice.fd_ctl, nv_gpu.NV_ESC_CARD_INFO, NVDevice.gpus_info)

  # TODO: Get classes from NV0080_CTRL_CMD_GPU_GET_CLASSLIST_V2
  self.device_id = int(device.split(":")[1]) if ":" in device else 0
  self.fd_dev = self._new_gpu_fd()

  # Fail early if the driver did not mark the selected card entry as valid.
  assert NVDevice.gpus_info[self.device_id].valid
  gpu_info = nv_gpu.NV0000_CTRL_GPU_GET_ID_INFO_V2_PARAMS(gpuId=NVDevice.gpus_info[self.device_id].gpu_id)
  rm_control(self.fd_ctl, nv_gpu.NV0000_CTRL_CMD_GPU_GET_ID_INFO_V2, self.root, self.root, gpu_info)
  # RM object hierarchy: root client -> device -> subdevice -> usermode object.
  device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=gpu_info.deviceInstance, hClientShare=self.root,
                                                 vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
  self.device = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
  self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.device, None).hObjectNew
  self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
  # Map 0x10000 bytes of the usermode object into the process and view them as 32-bit words.
  gpu_mmio_ptr = self._gpu_map_to_cpu(self.usermode, 0x10000, flags=2)
  self.gpu_mmio = to_mv(gpu_mmio_ptr, 0x10000).cast("I")

  # GPU virtual address space for this device.
  vaspace_params = nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS(vaBase=0x1000, vaSize=0x1fffffb000000,
    flags=nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_ENABLE_PAGE_FAULTING | nv_gpu.NV_VASPACE_ALLOCATION_FLAGS_IS_EXTERNALLY_OWNED)
  vaspace = rm_alloc(self.fd_ctl, nv_gpu.FERMI_VASPACE_A, self.root, self.device, vaspace_params).hObjectNew

  # Fetch the 16-byte binary GPU id; it identifies this GPU in every UVM call below.
  gpu_uuid_params = nv_gpu.NV2080_CTRL_GPU_GET_GID_INFO_PARAMS(flags=nv_gpu.NV2080_GPU_CMD_GPU_GET_GID_FLAGS_FORMAT_BINARY, length=16)
  rm_control(self.fd_ctl, nv_gpu.NV2080_CTRL_CMD_GPU_GET_GID_INFO, self.root, self.subdevice, gpu_uuid_params)
  self.gpu_uuid = (ctypes.c_ubyte*16)(*[gpu_uuid_params.data[i] for i in range(16)])

  # Register the GPU and its address space with UVM.
  uvm.register_gpu(self.fd_uvm, rmCtrlFd=-1, gpu_uuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid))
  uvm.register_gpu_vaspace(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl,
                           hClient=self.root, hVaSpace=vaspace)

  # Enable peer access with every device created before this one (this device joins the list only at the end).
  for dev in self.devices:
    uvm.enable_peer_access(self.fd_uvm, gpuUuidA=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), gpuUuidB=nv_gpu.struct_nv_uuid(uuid=dev.gpu_uuid))

  # The signals page is shared by all devices: the first device allocates it, later devices map the existing allocation.
  if NVDevice.signals_page is None: NVDevice.signals_page = self._gpu_system_alloc(0x10000, map_to_cpu=True)
  else: self._gpu_uvm_map(NVDevice.signals_page.base, NVDevice.signals_page.length, NVDevice.signals_page.hMemory, create_range=False)

  channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
  channel_group = rm_alloc(self.fd_ctl, nv_gpu.KEPLER_CHANNEL_GROUP_A, self.root, self.device, channel_params).hObjectNew

  # One 0x200000-byte buffer backs both rings: compute at offset 0, DMA at offset 0x100000.
  gpfifo = self._gpu_alloc(0x200000, contig=True, huge_page=True, map_to_cpu=True, map_flags=0x10d0000)

  ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
  ctxshare = rm_alloc(self.fd_ctl, nv_gpu.FERMI_CONTEXT_SHARE_A, self.root, channel_group, ctxshare_params).hObjectNew

  # Compute ring: `entries` 64-bit slots, with the control structure placed directly after them.
  self.compute_gpfifo_entries: int = 0x10000
  self.compute_gpfifo_token: int = self._gpu_fifo_setup(gpfifo, ctxshare, channel_group, offset=0, entries=self.compute_gpfifo_entries)
  self.compute_gpu_ring: memoryview = to_mv(gpfifo.base, self.compute_gpfifo_entries * 8).cast("Q")
  self.compute_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + self.compute_gpfifo_entries * 8)
  self.compute_put_value: int = 0
  # Fixed signal slot for this device's compute queue: 2 * device_id.
  self.compute_progress_signal = NVDevice._get_signal(self.device_id * 2)

  # DMA ring: same layout as the compute ring, shifted by 0x100000 inside the shared buffer.
  self.dma_gpfifo_entries: int = 0x10000
  self.dma_gpfifo_token: int = self._gpu_fifo_setup(gpfifo, ctxshare, channel_group, offset=0x100000, entries=self.dma_gpfifo_entries)
  self.dma_gpu_ring: memoryview = to_mv(gpfifo.base + 0x100000, self.dma_gpfifo_entries * 8).cast("Q")
  self.dma_gpu_ring_controls = nv_gpu.AmpereAControlGPFifo.from_address(gpfifo.base + 0x100000 + self.dma_gpfifo_entries * 8)
  self.dma_put_value: int = 0
  # Fixed signal slot for this device's DMA queue: 2 * device_id + 1.
  self.dma_progress_signal = NVDevice._get_signal(self.device_id * 2 + 1)

  # Enable scheduling of the channel group now that both channels exist.
  en_fifo_params = nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1)
  rm_control(self.fd_ctl, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, self.root, channel_group, en_fifo_params)

  # Command buffer: 0x200000 bytes, CPU-visible, viewed as 32-bit words.
  self.cmdq_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x200000, map_to_cpu=True, huge_page=True)
  self.cmdq: memoryview = to_mv(self.cmdq_page.base, 0x200000).cast("I")
  self.cmdq_wptr: int = 0 # in bytes

  # Kernel-argument buffer: 0x4000000 bytes, CPU-visible; kernargs_ptr starts at its base.
  self.kernargs_page: nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS = self._gpu_alloc(0x4000000, map_to_cpu=True)
  self.kernargs_ptr: int = self.kernargs_page.base

  self.arch: str = 'sm_89' # TODO: fix
  # (opaque, size, options) entries that synchronize() hands back to the allocator.
  self.pending_copyin: List[Any] = []

  super().__init__(device, NVAllocator(self), NVCompiler(self.arch), functools.partial(NVProgram, self))

  # Send the initial setup commands on both queues; each helper waits for completion.
  self._cmdq_setup_compute_gpfifo()
  self._cmdq_setup_dma_gpfifo()

  # Publish the device only after it is fully set up.
  NVDevice.devices.append(self)
def synchronize(self):
  """Block until the compute and DMA queues have reached their last submitted values, then recycle per-batch state.

  After both waits the command-buffer write pointer is reset to 0 and every `(opaque, size, options)` entry in
  `pending_copyin` is returned to the allocator before the list is emptied.
  """
  self._wait_signal(self.compute_progress_signal, self.compute_put_value)
  self._wait_signal(self.dma_progress_signal, self.dma_put_value)

  # Both queues are idle, so the command buffer can be reused from the start.
  self.cmdq_wptr = 0

  release = self.allocator.free
  for staged in self.pending_copyin:
    buf, nbytes, opts = staged
    release(buf, nbytes, opts)
  self.pending_copyin.clear()
@staticmethod
def synchronize_system():
  """Call `synchronize()` on every device registered in `NVDevice.devices`, in registration order."""
  for dev in NVDevice.devices:
    dev.synchronize()
@classmethod
def _get_signal(self, num=None) -> memoryview:
  """Return a zeroed signal slot from the shared signals page.

  Each slot is 16 bytes and is returned as a memoryview of 64-bit words; word 0 is reset to 0 before returning.

  Args:
    num: slot index to use. When None, the class-wide counter is advanced and its new value is used; once the
      counter would run past the end of the page it restarts at slot 32.
      NOTE(review): slots below 32 look reserved for the fixed per-device progress signals — confirm.
  """
  if num is None:
    candidate = self.signal_number + 1
    self.signal_number = candidate
    if self.signals_page and candidate * 16 >= self.signals_page.length:
      self.signal_number = 32
    num = self.signal_number

  slot = to_mv(self.signals_page.base + num * 16, 16).cast("Q")
  slot[0] = 0
  return slot
@classmethod
def _wait_signal(self, signal, value=0, timeout=10000):
start_time = time.time()
sem_value = signal[0]
while sem_value != value:
sem_value = signal[0]
if time.time() - start_time > timeout // 1000: raise RuntimeError(f"wait_result: {timeout} ms TIMEOUT!")
def _gpu_fifo_setup(self, gpfifo, ctxshare, channel_group, offset, entries=0x400):
  """Create one GPFIFO channel inside `channel_group` and return its work-submit token.

  Args:
    gpfifo: mapped buffer (exposes `hMemory` and `base`) that holds the ring and its control area.
    ctxshare: handle of the context share the channel uses.
    channel_group: handle of the parent channel group.
    offset: byte offset of this channel's ring inside `gpfifo`.
    entries: number of ring entries; the control area is placed at `offset + entries * 8`.

  Returns:
    The work-submit token reported by the driver for the new channel.
  """
  notifier = self._gpu_system_alloc(48 << 20)

  userd_handles = (ctypes.c_uint32*8)(gpfifo.hMemory)
  userd_offsets = (ctypes.c_uint64*8)(entries*8+offset)
  fifo_params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.hMemory, hObjectBuffer=gpfifo.hMemory,
    gpFifoOffset=gpfifo.base+offset, gpFifoEntries=entries, hContextShare=ctxshare,
    hUserdMemory=userd_handles, userdOffset=userd_offsets)
  channel = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, fifo_params).hObjectNew

  # Every channel gets both a compute object and a copy-engine object.
  for engine_class in (nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_DMA_COPY_B):
    rm_alloc(self.fd_ctl, engine_class, self.root, channel, None)

  token_query = nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS(workSubmitToken=-1)
  rm_control(self.fd_ctl, nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN, self.root, channel, token_query)
  # NOTE(review): if workSubmitToken is an unsigned ctypes field, -1 reads back as 0xffffffff and this check can never fail — confirm.
  assert token_query.workSubmitToken != -1

  # Reserve a 0x4000000-byte virtual range for the channel and register it with UVM.
  channel_va = self._alloc_gpu_vaddr(0x4000000)
  uvm.register_channel(self.fd_uvm, gpuUuid=nv_gpu.struct_nv_uuid(uuid=self.gpu_uuid), rmCtrlFd=self.fd_ctl, hClient=self.root,
                       hChannel=channel, base=channel_va, length=0x4000000)
  return token_query.workSubmitToken
def _cmdq_setup_compute_gpfifo(self):
  """Allocate shader local memory and send the one-time setup methods on the compute queue.

  Sets `slm_per_thread`, `shader_local_mem`, `shared_mem_window` and `local_mem_window`, submits the setup
  commands and waits for them to complete.
  """
  self.slm_per_thread = 0x900
  # Size the local-memory pool: per-thread bytes -> per-warp (x32) -> per-TPC (x48x2) -> 64 of those in total.
  warp_bytes = round_up(self.slm_per_thread * 32, 0x200)
  bytes_per_tpc = round_up(warp_bytes * 48 * 2, 0x8000)
  pool_bytes = round_up(bytes_per_tpc * 64, 0x20000)
  self.shader_local_mem = self._gpu_alloc(pool_bytes, huge_page=True, contig=True).base

  # Set windows addresses to not collide with other allocated buffers.
  self.shared_mem_window = 0xfe000000
  self.local_mem_window = 0xff000000

  queue = HWComputeQueue()
  # (method, number of data words, data words) in submission order.
  setup_methods = (
    (nv_gpu.NVC6C0_SET_OBJECT, 1, [nv_gpu.ADA_COMPUTE_A]),
    (nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_A, 2, [*nvdata64(self.shader_local_mem)]),
    (nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A, 3, [*nvdata64(bytes_per_tpc), 0x40]),
    (nv_gpu.NVC6C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A, 2, [*nvdata64(self.local_mem_window)]),
    (nv_gpu.NVC6C0_SET_SHADER_SHARED_MEMORY_WINDOW_A, 2, [*nvdata64(self.shared_mem_window)]),
  )
  for method, word_count, words in setup_methods:
    queue.q += [nvmethod(1, method, word_count), *words]
  queue.submit(self)
  self.synchronize()
def _cmdq_setup_dma_gpfifo(self):
  """Bind the AMPERE_DMA_COPY_B object on the copy queue and wait for the command to complete."""
  copy_queue = HWCopyQueue()
  # NOTE(review): the first nvmethod argument (4 here, 1 on the compute queue) is presumably the subchannel — confirm.
  bind_object = [nvmethod(4, nv_gpu.NVC6C0_SET_OBJECT, 1), nv_gpu.AMPERE_DMA_COPY_B]
  copy_queue.q += bind_object
  copy_queue.submit(self)
  self.synchronize()