mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-10 23:48:01 -05:00
nv: blackwell support (#10487)
* nv: blackwell support * fixes * hm * h * fixes * mypy * xx * yy * arr * revert * oops * unrelated
This commit is contained in:
@@ -104,10 +104,10 @@ generate_nvrtc() {
|
||||
}
|
||||
|
||||
generate_nv() {
|
||||
NVKERN_COMMIT_HASH=d6b75a34094b0f56c2ccadf14e5d0bd515ed1ab6
|
||||
NVKERN_COMMIT_HASH=81fe4fb417c8ac3b9bdcc1d56827d116743892a5
|
||||
NVKERN_SRC=/tmp/open-gpu-kernel-modules-$NVKERN_COMMIT_HASH
|
||||
if [ ! -d "$NVKERN_SRC" ]; then
|
||||
git clone https://github.com/tinygrad/open-gpu-kernel-modules $NVKERN_SRC
|
||||
git clone https://github.com/NVIDIA/open-gpu-kernel-modules $NVKERN_SRC
|
||||
pushd .
|
||||
cd $NVKERN_SRC
|
||||
git reset --hard $NVKERN_COMMIT_HASH
|
||||
@@ -116,15 +116,19 @@ generate_nv() {
|
||||
|
||||
clang2py -k cdefstum \
|
||||
extra/nv_gpu_driver/clc6c0qmd.h \
|
||||
extra/nv_gpu_driver/clcec0qmd.h \
|
||||
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl0080.h \
|
||||
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl2080_notification.h \
|
||||
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h \
|
||||
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h \
|
||||
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h \
|
||||
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc86f.h \
|
||||
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc96f.h \
|
||||
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc761.h \
|
||||
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl83de.h \
|
||||
$NVKERN_SRC/src/nvidia/generated/g_allclasses.h \
|
||||
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc6c0.h \
|
||||
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clcdc0.h \
|
||||
$NVKERN_SRC/kernel-open/nvidia-uvm/clc6b5.h \
|
||||
$NVKERN_SRC/kernel-open/nvidia-uvm/clc9b5.h \
|
||||
$NVKERN_SRC/kernel-open/nvidia-uvm/uvm_ioctl.h \
|
||||
$NVKERN_SRC/kernel-open/nvidia-uvm/uvm_linux_ioctl.h \
|
||||
$NVKERN_SRC/kernel-open/nvidia-uvm/hwref/ampere/ga100/dev_fault.h \
|
||||
@@ -149,6 +153,7 @@ generate_nv() {
|
||||
sed -i "s\import ctypes\import ctypes, os\g" $BASE/nv_gpu.py
|
||||
sed -i 's/#\?\s\([A-Za-z0-9_]\+\) = MW ( \([0-9]\+\) : \([0-9]\+\) )/\1 = (\2 , \3)/' $BASE/nv_gpu.py # NVC6C0_QMDV03_00 processing
|
||||
sed -i 's/#\sdef NVC6C0_QMD\([A-Za-z0-9_()]\+\):/def NVC6C0_QMD\1:/' $BASE/nv_gpu.py
|
||||
sed -i 's/#\sdef NVCEC0_QMD\([A-Za-z0-9_()]\+\):/def NVCEC0_QMD\1:/' $BASE/nv_gpu.py
|
||||
sed -i 's/#\s*return MW(\([0-9i()*+]\+\):\([0-9i()*+]\+\))/ return (\1 , \2)/' $BASE/nv_gpu.py
|
||||
sed -i 's/#\?\s*\(.*\)\s*=\s*\(NV\)\?BIT\(32\)\?\s*(\s*\([0-9]\+\)\s*)/\1 = (1 << \4)/' $BASE/nv_gpu.py # name = BIT(x) -> name = (1 << x)
|
||||
sed -i "s/UVM_\([A-Za-z0-9_]\+\) = \['i', '(', '\([0-9]\+\)', ')'\]/UVM_\1 = \2/" $BASE/nv_gpu.py # UVM_name = ['i', '(', '<num>', ')'] -> UVM_name = <num>
|
||||
|
||||
172
extra/nv_gpu_driver/clcec0qmd.h
Normal file
172
extra/nv_gpu_driver/clcec0qmd.h
Normal file
@@ -0,0 +1,172 @@
|
||||
#ifndef __CLCEC0QMD_H__
|
||||
#define __CLCEC0QMD_H__
|
||||
|
||||
#define NVCEC0_QMDV05_00_CTA_RASTER_WIDTH MW(1279:1248) // aka GRID_WIDTH
|
||||
#define NVCEC0_QMDV05_00_CTA_RASTER_HEIGHT MW(1311:1280) // aka GRID_HEIGHT
|
||||
#define NVCEC0_QMDV05_00_CTA_RASTER_DEPTH MW(1343:1312) // aka GRID_DEPTH
|
||||
|
||||
#define NVCEC0_QMDV05_00_REGISTER_COUNT_V MW(1136:1128)
|
||||
#define NVCEC0_QMDV05_00_BARRIER_COUNT MW(1137:1137) // ??
|
||||
|
||||
#define NVCEC0_QMDV05_00_QMD_MINOR_VERSION MW(467:464)
|
||||
#define NVCEC0_QMDV05_00_QMD_MAJOR_VERSION MW(471:468)
|
||||
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_ADDR_LOWER_SHIFTED6(i) MW((1375+(i)*64):(1344+(i)*64))
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_ADDR_UPPER_SHIFTED6(i) MW((1394+(i)*64):(1376+(i)*64))
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1407+(i)*64):(1395+(i)*64))
|
||||
|
||||
#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION0 MW(1103:1088)
|
||||
#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION1 MW(1119:1104)
|
||||
#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION2 MW(1128:1120)
|
||||
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID(i) MW((1856+(i)*4):(1856+(i)*4))
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID_TRUE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH(i) MW((1858+(i)*4):(1857+(i)*4))
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_NONE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_PRE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_POST 0x00000002
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE(i) MW((1859+(i)*4):(1859+(i)*4))
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001
|
||||
|
||||
#define NVCEC0_QMDV05_00_DEPENDENCE_COUNTER MW(143:128) // ??
|
||||
#define NVCEC0_QMDV05_00_QMD_GROUP_ID MW(149:144)
|
||||
|
||||
#define NVCEC0_QMDV05_00_PROGRAM_ADDRESS_LOWER MW(1055:1024)
|
||||
#define NVCEC0_QMDV05_00_PROGRAM_ADDRESS_UPPER MW(1080:1056)
|
||||
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_POINTER MW(415:384)
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_POINTER MW(447:416)
|
||||
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE(i) MW((336+(i)*5):(336+(i)*5))
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE_TRUE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION(i) MW((339+(i)*5):(337+(i)*5))
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_INCREMENT_PUT 0x00000000
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_SCHEDULE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH(i) MW((340+(i)*5):(340+(i)*5))
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH_TRUE 0x00000001
|
||||
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ENABLE NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE(0)
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ENABLE NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE(1)
|
||||
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ACTION NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION(0)
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ACTION NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION(1)
|
||||
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_PREFETCH NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH(0)
|
||||
#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_PREFETCH NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH(1)
|
||||
|
||||
#define NVCEC0_QMDV05_00_RELEASE_ENABLE(i) MW((288+(i)*16):(288+(i)*16))
|
||||
#define NVCEC0_QMDV05_00_RELEASE_ENABLE_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_RELEASE_ENABLE_TRUE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE(i) MW((290+(i)*16):(289+(i)*16))
|
||||
#define NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE_SEMAPHORE_FOUR_WORDS 0x00000000
|
||||
#define NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE_SEMAPHORE_ONE_WORD 0x00000001
|
||||
#define NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE_SEMAPHORE_TWO_WORDS 0x00000002
|
||||
#define NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE(i) MW((291+(i)*16):(291+(i)*16))
|
||||
#define NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_ENABLE(i) MW((292+(i)*16):(292+(i)*16))
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_ENABLE_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_ENABLE_TRUE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP(i) MW((295+(i)*16):(293+(i)*16))
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_ADD 0x00000000
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_MIN 0x00000001
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_MAX 0x00000002
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_INC 0x00000003
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_DEC 0x00000004
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_AND 0x00000005
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_OR 0x00000006
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_XOR 0x00000007
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT(i) MW((297+(i)*16):(296+(i)*16))
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT_UNSIGNED 0x00000000
|
||||
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT_SIGNED 0x00000001
|
||||
#define NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE(i) MW((299+(i)*16):(298+(i)*16))
|
||||
#define NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE_TRAP_NONE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE_TRAP_UNCONDITIONAL 0x00000001
|
||||
#define NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE_TRAP_CONDITIONAL 0x00000002
|
||||
#define NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE_TRAP_CONDITIONAL_EXT 0x00000003
|
||||
#define NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B(i) MW((300+(i)*16):(300+(i)*16))
|
||||
#define NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B_TRUE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_RELEASE_RESERVED_INFO(i) MW((303+(i)*16):(301+(i)*16))
|
||||
|
||||
#define NVCEC0_QMDV05_00_RELEASE0_ENABLE NVCEC0_QMDV05_00_RELEASE_ENABLE(0)
|
||||
#define NVCEC0_QMDV05_00_RELEASE1_ENABLE NVCEC0_QMDV05_00_RELEASE_ENABLE(1)
|
||||
|
||||
#define NVCEC0_QMDV05_00_RELEASE0_STRUCTURE_SIZE NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE(0)
|
||||
#define NVCEC0_QMDV05_00_RELEASE1_STRUCTURE_SIZE NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE(1)
|
||||
|
||||
#define NVCEC0_QMDV05_00_RELEASE0_MEMBAR_TYPE NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE(0)
|
||||
#define NVCEC0_QMDV05_00_RELEASE1_MEMBAR_TYPE NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE(1)
|
||||
|
||||
#define NVCEC0_QMDV05_00_RELEASE0_REDUCTION_OP NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP(0)
|
||||
#define NVCEC0_QMDV05_00_RELEASE1_REDUCTION_OP NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP(1)
|
||||
|
||||
#define NVCEC0_QMDV05_00_RELEASE0_REDUCTION_FORMAT NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT(0)
|
||||
#define NVCEC0_QMDV05_00_RELEASE1_REDUCTION_FORMAT NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT(1)
|
||||
|
||||
#define NVCEC0_QMDV05_00_RELEASE0_TRAP_TYPE NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE(0)
|
||||
#define NVCEC0_QMDV05_00_RELEASE1_TRAP_TYPE NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE(1)
|
||||
|
||||
#define NVCEC0_QMDV05_00_RELEASE0_PAYLOAD64B NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B(0)
|
||||
#define NVCEC0_QMDV05_00_RELEASE1_PAYLOAD64B NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B(1)
|
||||
|
||||
#define NVCEC0_QMDV05_00_RELEASE0_ADDRESS_LOWER MW(511:480)
|
||||
#define NVCEC0_QMDV05_00_RELEASE0_ADDRESS_UPPER MW(543:512)
|
||||
#define NVCEC0_QMDV05_00_RELEASE0_PAYLOAD_LOWER MW(575:544)
|
||||
#define NVCEC0_QMDV05_00_RELEASE0_PAYLOAD_UPPER MW(607:576)
|
||||
|
||||
#define NVCEC0_QMDV05_00_RELEASE1_ADDRESS_LOWER MW(799:768)
|
||||
#define NVCEC0_QMDV05_00_RELEASE1_ADDRESS_UPPER MW(831:800)
|
||||
#define NVCEC0_QMDV05_00_RELEASE1_PAYLOAD_LOWER MW(863:832)
|
||||
#define NVCEC0_QMDV05_00_RELEASE1_PAYLOAD_UPPER MW(895:864)
|
||||
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_HEADER_CACHE MW(472:472)
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(473:473)
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_DATA_CACHE MW(474:474)
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_DATA_CACHE MW(475:475)
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_INSTRUCTION_CACHE MW(476:476)
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_CONSTANT_CACHE MW(477:477)
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000
|
||||
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001
|
||||
|
||||
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_ADDR_LOWER_SHIFTED MW(1919:1888)
|
||||
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_ADDR_UPPER_SHIFTED MW(1936:1920)
|
||||
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_SIZE MW(1945:1937)
|
||||
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_TYPE MW(1947:1946)
|
||||
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_TYPE_PREFETCH_LAUNCH 0x00000000
|
||||
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_TYPE_PREFTECH_POST 0x00000001
|
||||
|
||||
#define NVCEC0_QMDV05_00_SHARED_MEMORY_SIZE MW(1162:1145)
|
||||
#define NVCEC0_QMDV05_00_MIN_SM_CONFIG_SHARED_MEM_SIZE MW(1168:1163)
|
||||
#define NVCEC0_QMDV05_00_MAX_SM_CONFIG_SHARED_MEM_SIZE MW(1174:1169)
|
||||
#define NVCEC0_QMDV05_00_TARGET_SM_CONFIG_SHARED_MEM_SIZE MW(1180:1175)
|
||||
|
||||
// ??
|
||||
#define NVCEC0_QMDV05_00_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(1213:1196)
|
||||
#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT MW(456:456)
|
||||
#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT__32 0x00000000
|
||||
#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001
|
||||
#define NVCEC0_QMDV05_00_SAMPLER_INDEX MW(457:457)
|
||||
#define NVCEC0_QMDV05_00_SAMPLER_INDEX_INDEPENDENTLY 0x00000000
|
||||
#define NVCEC0_QMDV05_00_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001
|
||||
|
||||
#define NVCEC0_QMDV05_00_UNKNOWN_13 MW(159:152) // A4
|
||||
#define NVCEC0_QMDV05_00_SASS_VERSION MW(455:448)
|
||||
|
||||
#endif // #ifndef __CLCEC0QMD_H__
|
||||
File diff suppressed because it is too large
Load Diff
@@ -60,11 +60,15 @@ uvm = make_uvm_type()
|
||||
class QMD:
|
||||
fields: dict[str, dict[str, tuple[int, int]]] = {}
|
||||
|
||||
def __init__(self, addr=None, pref="NVC6C0_QMDV03_00", **kwargs):
|
||||
if pref not in QMD.fields:
|
||||
def __init__(self, dev:NVDevice, addr:int|None=None, **kwargs):
|
||||
self.ver, self.sz = (5, 0x60) if dev.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A else (3, 0x40)
|
||||
|
||||
# Init fields from module
|
||||
if (pref:="NVCEC0_QMDV05_00" if self.ver == 5 else "NVC6C0_QMDV03_00") not in QMD.fields:
|
||||
QMD.fields[pref] = {**{name[len(pref)+1:]: dt for name,dt in nv_gpu.__dict__.items() if name.startswith(pref) and isinstance(dt, tuple)},
|
||||
**{name[len(pref)+1:]+f"_{i}": dt(i) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith(pref) and callable(dt)}}
|
||||
self.mv, self.pref = (memoryview(bytearray(0x40 * 4)) if addr is None else to_mv(addr, 0x40 * 4)), pref
|
||||
|
||||
self.mv, self.pref = (memoryview(bytearray(self.sz * 4)) if addr is None else to_mv(addr, self.sz * 4)), pref
|
||||
if kwargs: self.write(**kwargs)
|
||||
|
||||
def _rw_bits(self, hi:int, lo:int, value:int|None=None):
|
||||
@@ -83,6 +87,10 @@ class QMD:
|
||||
|
||||
def field_offset(self, k): return QMD.fields[self.pref][k.upper()][1] // 8
|
||||
|
||||
def set_constant_buf_addr(self, i, addr):
|
||||
if self.ver < 4: self.write(**{f'constant_buffer_addr_upper_{i}':hi32(addr), f'constant_buffer_addr_lower_{i}':lo32(addr)})
|
||||
else: self.write(**{f'constant_buffer_addr_upper_shifted6_{i}':hi32(addr >> 6), f'constant_buffer_addr_lower_shifted6_{i}':lo32(addr >> 6)})
|
||||
|
||||
class NVSignal(HCQSignal):
|
||||
def __init__(self, base_buf:HCQBuffer|None=None, **kwargs):
|
||||
super().__init__(base_buf, **kwargs, timestamp_divider=1000, dev_t=NVDevice)
|
||||
@@ -146,15 +154,15 @@ class NVComputeQueue(NVCommandQueue):
|
||||
self.bind_args_state(args_state)
|
||||
|
||||
qmd_buf = args_state.buf.offset(round_up(prg.constbufs[0][1], 1 << 8))
|
||||
qmd_buf.cpu_view().view(size=0x40 * 4, fmt='B')[:] = prg.qmd.mv
|
||||
qmd_buf.cpu_view().view(size=prg.qmd.mv.nbytes, fmt='B')[:] = prg.qmd.mv
|
||||
assert qmd_buf.va_addr < (1 << 40), f"large qmd addr {qmd_buf.va_addr:x}"
|
||||
|
||||
qmd = QMD(addr=qmd_buf.va_addr) # Save qmd for later update
|
||||
qmd = QMD(dev=prg.dev, addr=cast(int, qmd_buf.va_addr)) # Save qmd for later update
|
||||
|
||||
self.bind_sints_to_mem(*global_size, mem=qmd_buf.cpu_view(), fmt='I', offset=nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8)
|
||||
self.bind_sints_to_mem(*local_size, mem=qmd_buf.cpu_view(), fmt='H', offset=nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8)
|
||||
self.bind_sints_to_mem(*local_size, *global_size, mem=args_state.buf.cpu_view(), fmt='I')
|
||||
qmd.write(constant_buffer_addr_upper_0=hi32(args_state.buf.va_addr), constant_buffer_addr_lower_0=lo32(args_state.buf.va_addr))
|
||||
self.bind_sints_to_mem(*global_size, mem=qmd_buf.cpu_view(), fmt='I', offset=qmd.field_offset('cta_raster_width'))
|
||||
self.bind_sints_to_mem(*(local_size[:2]), mem=qmd_buf.cpu_view(), fmt='H', offset=qmd.field_offset('cta_thread_dimension0'))
|
||||
self.bind_sints_to_mem(local_size[2], mem=qmd_buf.cpu_view(), fmt='B', offset=qmd.field_offset('cta_thread_dimension2'))
|
||||
qmd.set_constant_buf_addr(0, args_state.buf.va_addr)
|
||||
|
||||
if self.active_qmd is None:
|
||||
self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_buf.va_addr >> 8)
|
||||
@@ -242,21 +250,28 @@ class NVProgram(HCQProgram):
|
||||
ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
|
||||
|
||||
self.constbuffer_0 = [0] * (cbuf0_size // 4)
|
||||
self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]
|
||||
|
||||
if dev.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A:
|
||||
self.constbuffer_0[188:192], self.constbuffer_0[223] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window)], 0xfffdc0
|
||||
qmd = {'qmd_major_version':5, 'unknown_13':0x1, 'program_address_upper':hi32(self.prog_addr>>4),'program_address_lower':lo32(self.prog_addr>>4),
|
||||
'sass_version':0xA4}
|
||||
else:
|
||||
self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]
|
||||
qmd = {'qmd_major_version':3, 'sm_global_caching_enable':1, 'cwd_membar_type':nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR,
|
||||
'program_address_upper':hi32(self.prog_addr), 'program_address_lower':lo32(self.prog_addr), 'sass_version':0x89}
|
||||
|
||||
smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
|
||||
self.qmd:QMD = QMD(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
|
||||
invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
|
||||
cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1,
|
||||
shared_memory_size=self.shmem_usage, min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg,
|
||||
max_sm_config_shared_mem_size=0x1a, register_count_v=self.regs_usage, sass_version=0x89,
|
||||
program_address_upper=hi32(self.prog_addr), program_address_lower=lo32(self.prog_addr),
|
||||
barrier_count=1, shader_local_memory_high_size=self.dev.slm_per_thread, program_prefetch_size=min(self.prog_sz>>8, 0x1ff),
|
||||
program_prefetch_addr_lower_shifted=self.prog_addr>>8, program_prefetch_addr_upper_shifted=self.prog_addr>>40)
|
||||
|
||||
self.qmd:QMD = QMD(dev, **qmd, qmd_group_id=0x3f, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
|
||||
invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1, barrier_count=1,
|
||||
constant_buffer_invalidate_0=1, register_count_v=self.regs_usage, shader_local_memory_high_size=self.dev.slm_per_thread,
|
||||
min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg, max_sm_config_shared_mem_size=0x1a,
|
||||
shared_memory_size=self.shmem_usage, program_prefetch_size=min(self.prog_sz>>8, 0x1ff),
|
||||
program_prefetch_addr_upper_shifted=self.prog_addr>>40, program_prefetch_addr_lower_shifted=self.prog_addr>>8)
|
||||
|
||||
for i,(addr,sz) in self.constbufs.items():
|
||||
self.qmd.write(**{f'constant_buffer_addr_upper_{i}': hi32(addr), f'constant_buffer_addr_lower_{i}': lo32(addr),
|
||||
f'constant_buffer_size_shifted4_{i}': sz, f'constant_buffer_valid_{i}': 1})
|
||||
self.qmd.set_constant_buf_addr(i, addr)
|
||||
self.qmd.write(**{f'constant_buffer_size_shifted4_{i}': sz, f'constant_buffer_valid_{i}': 1})
|
||||
|
||||
# Register allocation granularity per warp is 256; warp allocation granularity is 4. The register file size is 65536.
|
||||
self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32
|
||||
@@ -399,7 +414,10 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
classlist = memoryview(bytearray(100 * 4)).cast('I')
|
||||
clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.nvdevice, numClasses=100, classList=mv_address(classlist))
|
||||
self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
|
||||
self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)
|
||||
self.usermode_class:int = next(c for c in [nv_gpu.HOPPER_USERMODE_A, nv_gpu.TURING_USERMODE_A] if c in self.nvclasses)
|
||||
self.gpfifo_class:int = next(c for c in [nv_gpu.BLACKWELL_CHANNEL_GPFIFO_A, nv_gpu.AMPERE_CHANNEL_GPFIFO_A] if c in self.nvclasses)
|
||||
self.compute_class:int = next(c for c in [nv_gpu.BLACKWELL_COMPUTE_B, nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if c in self.nvclasses)
|
||||
self.dma_class:int = next(c for c in [nv_gpu.BLACKWELL_DMA_COPY_B, nv_gpu.AMPERE_DMA_COPY_B] if c in self.nvclasses)
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
if NVDevice.root is None:
|
||||
@@ -427,12 +445,13 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
|
||||
self.nvdevice = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
|
||||
self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.nvdevice, None).hObjectNew
|
||||
self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
|
||||
self.gpu_mmio = MMIOInterface(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz, fmt='I')
|
||||
|
||||
self._setup_nvclasses()
|
||||
self._debug_mappings: dict[tuple[int, int], str] = dict()
|
||||
|
||||
self.usermode = rm_alloc(self.fd_ctl, self.usermode_class, self.root, self.subdevice, None).hObjectNew
|
||||
self.gpu_mmio = MMIOInterface(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz, fmt='I')
|
||||
|
||||
rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
|
||||
(nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
|
||||
|
||||
@@ -469,7 +488,9 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
|
||||
self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
|
||||
'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')
|
||||
self.arch: str = f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
|
||||
|
||||
# FIXME: unclear how to derive the arch string from sm_version on Blackwell; sm_version 0xa04 is special-cased to "sm_120" for now.
|
||||
self.arch: str = "sm_120" if self.sm_version==0xa04 else f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
|
||||
|
||||
compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
|
||||
super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
|
||||
@@ -482,9 +503,9 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
|
||||
gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
|
||||
hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
|
||||
gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
|
||||
gpfifo = rm_alloc(self.fd_ctl, self.gpfifo_class, self.root, channel_group, params).hObjectNew
|
||||
comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
|
||||
rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
|
||||
rm_alloc(self.fd_ctl, self.dma_class, self.root, gpfifo, None)
|
||||
|
||||
if enable_debug:
|
||||
self.debug_compute_obj, self.debug_channel = comp, gpfifo
|
||||
@@ -508,14 +529,17 @@ class NVDevice(HCQCompiled[NVSignal]):
|
||||
return [x.data for x in infos]
|
||||
|
||||
def _setup_gpfifos(self):
|
||||
self.slm_per_thread, self.shader_local_mem = 0, None
|
||||
|
||||
# Set the window addresses so that they do not collide with other allocated buffers.
|
||||
self.shared_mem_window, self.local_mem_window, self.slm_per_thread, self.shader_local_mem = 0xfe000000, 0xff000000, 0, None
|
||||
self.shared_mem_window = 0x729400000000 if self.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A else 0xfe000000
|
||||
self.local_mem_window = 0x729300000000 if self.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A else 0xff000000
|
||||
|
||||
NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
|
||||
.signal(self.timeline_signal, self.timeline_value).submit(self)
|
||||
|
||||
cast(NVCopyQueue, NVCopyQueue().wait(self.timeline_signal, self.timeline_value)) \
|
||||
.setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
|
||||
.setup(copy_class=self.dma_class) \
|
||||
.signal(self.timeline_signal, self.timeline_value + 1).submit(self)
|
||||
|
||||
self.timeline_value += 2
|
||||
|
||||
Reference in New Issue
Block a user