nv: blackwell support (#10487)

* nv: blackwell support

* fixes

* hm

* h

* fixes

* mypy

* xx

* yy

* arr

* revert

* oops

* unrelated
This commit is contained in:
nimlgen
2025-05-24 18:23:53 +03:00
committed by GitHub
parent dc6309242d
commit d90ddcc365
4 changed files with 6053 additions and 1948 deletions

View File

@@ -104,10 +104,10 @@ generate_nvrtc() {
}
generate_nv() {
NVKERN_COMMIT_HASH=d6b75a34094b0f56c2ccadf14e5d0bd515ed1ab6
NVKERN_COMMIT_HASH=81fe4fb417c8ac3b9bdcc1d56827d116743892a5
NVKERN_SRC=/tmp/open-gpu-kernel-modules-$NVKERN_COMMIT_HASH
if [ ! -d "$NVKERN_SRC" ]; then
git clone https://github.com/tinygrad/open-gpu-kernel-modules $NVKERN_SRC
git clone https://github.com/NVIDIA/open-gpu-kernel-modules $NVKERN_SRC
pushd .
cd $NVKERN_SRC
git reset --hard $NVKERN_COMMIT_HASH
@@ -116,15 +116,19 @@ generate_nv() {
clang2py -k cdefstum \
extra/nv_gpu_driver/clc6c0qmd.h \
extra/nv_gpu_driver/clcec0qmd.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl0080.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl2080_notification.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc86f.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc96f.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc761.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl83de.h \
$NVKERN_SRC/src/nvidia/generated/g_allclasses.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc6c0.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clcdc0.h \
$NVKERN_SRC/kernel-open/nvidia-uvm/clc6b5.h \
$NVKERN_SRC/kernel-open/nvidia-uvm/clc9b5.h \
$NVKERN_SRC/kernel-open/nvidia-uvm/uvm_ioctl.h \
$NVKERN_SRC/kernel-open/nvidia-uvm/uvm_linux_ioctl.h \
$NVKERN_SRC/kernel-open/nvidia-uvm/hwref/ampere/ga100/dev_fault.h \
@@ -149,6 +153,7 @@ generate_nv() {
sed -i "s\import ctypes\import ctypes, os\g" $BASE/nv_gpu.py
sed -i 's/#\?\s\([A-Za-z0-9_]\+\) = MW ( \([0-9]\+\) : \([0-9]\+\) )/\1 = (\2 , \3)/' $BASE/nv_gpu.py # NVC6C0_QMDV03_00 processing
sed -i 's/#\sdef NVC6C0_QMD\([A-Za-z0-9_()]\+\):/def NVC6C0_QMD\1:/' $BASE/nv_gpu.py
sed -i 's/#\sdef NVCEC0_QMD\([A-Za-z0-9_()]\+\):/def NVCEC0_QMD\1:/' $BASE/nv_gpu.py
sed -i 's/#\s*return MW(\([0-9i()*+]\+\):\([0-9i()*+]\+\))/ return (\1 , \2)/' $BASE/nv_gpu.py
sed -i 's/#\?\s*\(.*\)\s*=\s*\(NV\)\?BIT\(32\)\?\s*(\s*\([0-9]\+\)\s*)/\1 = (1 << \4)/' $BASE/nv_gpu.py # name = BIT(x) -> name = (1 << x)
sed -i "s/UVM_\([A-Za-z0-9_]\+\) = \['i', '(', '\([0-9]\+\)', ')'\]/UVM_\1 = \2/" $BASE/nv_gpu.py # UVM_name = ['i', '(', '<num>', ')'] -> UVM_name = <num>

View File

@@ -0,0 +1,172 @@
/*
 * Bit-field layout of the NVCEC0 QMD, version 05_00.
 *
 * Every field is described with MW(a:b), a pair of bit positions inside the
 * QMD (presumably most-significant:least-significant -- TODO confirm against
 * the MW definition used by the consumer of this header).
 * Macros that take (i) describe one instance of a repeated field; the plain
 * 0x... defines that follow a field list the values that field can hold.
 * Lines tagged with "??" carry the original author's own uncertainty marker.
 */
#ifndef __CLCEC0QMD_H__
#define __CLCEC0QMD_H__
#define NVCEC0_QMDV05_00_CTA_RASTER_WIDTH MW(1279:1248) // aka GRID_WIDTH
#define NVCEC0_QMDV05_00_CTA_RASTER_HEIGHT MW(1311:1280) // aka GRID_HEIGHT
#define NVCEC0_QMDV05_00_CTA_RASTER_DEPTH MW(1343:1312) // aka GRID_DEPTH
#define NVCEC0_QMDV05_00_REGISTER_COUNT_V MW(1136:1128)
#define NVCEC0_QMDV05_00_BARRIER_COUNT MW(1137:1137) // ??
#define NVCEC0_QMDV05_00_QMD_MINOR_VERSION MW(467:464)
#define NVCEC0_QMDV05_00_QMD_MAJOR_VERSION MW(471:468)
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_ADDR_LOWER_SHIFTED6(i) MW((1375+(i)*64):(1344+(i)*64))
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_ADDR_UPPER_SHIFTED6(i) MW((1394+(i)*64):(1376+(i)*64))
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_SIZE_SHIFTED4(i) MW((1407+(i)*64):(1395+(i)*64))
#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION0 MW(1103:1088)
#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION1 MW(1119:1104)
// NOTE(review): bit 1128 is also the lowest bit of REGISTER_COUNT_V (1136:1128) above,
// so these two ranges overlap by one bit -- confirm whether this should end at 1127.
#define NVCEC0_QMDV05_00_CTA_THREAD_DIMENSION2 MW(1128:1120)
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID(i) MW((1856+(i)*4):(1856+(i)*4))
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID_FALSE 0x00000000
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_VALID_TRUE 0x00000001
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH(i) MW((1858+(i)*4):(1857+(i)*4))
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_NONE 0x00000000
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_PRE 0x00000001
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_PREFETCH_PREFETCH_POST 0x00000002
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE(i) MW((1859+(i)*4):(1859+(i)*4))
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE_FALSE 0x00000000
#define NVCEC0_QMDV05_00_CONSTANT_BUFFER_INVALIDATE_TRUE 0x00000001
#define NVCEC0_QMDV05_00_DEPENDENCE_COUNTER MW(143:128) // ??
#define NVCEC0_QMDV05_00_QMD_GROUP_ID MW(149:144)
#define NVCEC0_QMDV05_00_PROGRAM_ADDRESS_LOWER MW(1055:1024)
#define NVCEC0_QMDV05_00_PROGRAM_ADDRESS_UPPER MW(1080:1056)
#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_POINTER MW(415:384)
#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_POINTER MW(447:416)
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE(i) MW((336+(i)*5):(336+(i)*5))
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE_FALSE 0x00000000
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE_TRUE 0x00000001
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION(i) MW((339+(i)*5):(337+(i)*5))
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_INCREMENT_PUT 0x00000000
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_SCHEDULE 0x00000001
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_INVALIDATE_COPY_SCHEDULE 0x00000003
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION_QMD_DECREMENT_DEPENDENCE 0x00000004
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH(i) MW((340+(i)*5):(340+(i)*5))
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH_FALSE 0x00000000
#define NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH_TRUE 0x00000001
// Convenience aliases: instances 0 and 1 of the indexed DEPENDENT_QMD_* fields.
#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ENABLE NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE(0)
#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ENABLE NVCEC0_QMDV05_00_DEPENDENT_QMD_ENABLE(1)
#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_ACTION NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION(0)
#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_ACTION NVCEC0_QMDV05_00_DEPENDENT_QMD_ACTION(1)
#define NVCEC0_QMDV05_00_DEPENDENT_QMD0_PREFETCH NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH(0)
#define NVCEC0_QMDV05_00_DEPENDENT_QMD1_PREFETCH NVCEC0_QMDV05_00_DEPENDENT_QMD_PREFETCH(1)
#define NVCEC0_QMDV05_00_RELEASE_ENABLE(i) MW((288+(i)*16):(288+(i)*16))
#define NVCEC0_QMDV05_00_RELEASE_ENABLE_FALSE 0x00000000
#define NVCEC0_QMDV05_00_RELEASE_ENABLE_TRUE 0x00000001
#define NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE(i) MW((290+(i)*16):(289+(i)*16))
#define NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE_SEMAPHORE_FOUR_WORDS 0x00000000
#define NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE_SEMAPHORE_ONE_WORD 0x00000001
#define NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE_SEMAPHORE_TWO_WORDS 0x00000002
#define NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE(i) MW((291+(i)*16):(291+(i)*16))
#define NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE_FE_NONE 0x00000000
#define NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR 0x00000001
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_ENABLE(i) MW((292+(i)*16):(292+(i)*16))
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_ENABLE_FALSE 0x00000000
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_ENABLE_TRUE 0x00000001
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP(i) MW((295+(i)*16):(293+(i)*16))
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_ADD 0x00000000
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_MIN 0x00000001
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_MAX 0x00000002
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_INC 0x00000003
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_DEC 0x00000004
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_AND 0x00000005
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_OR 0x00000006
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP_RED_XOR 0x00000007
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT(i) MW((297+(i)*16):(296+(i)*16))
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT_UNSIGNED 0x00000000
#define NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT_SIGNED 0x00000001
#define NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE(i) MW((299+(i)*16):(298+(i)*16))
#define NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE_TRAP_NONE 0x00000000
#define NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE_TRAP_UNCONDITIONAL 0x00000001
#define NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE_TRAP_CONDITIONAL 0x00000002
#define NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE_TRAP_CONDITIONAL_EXT 0x00000003
#define NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B(i) MW((300+(i)*16):(300+(i)*16))
#define NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B_FALSE 0x00000000
#define NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B_TRUE 0x00000001
#define NVCEC0_QMDV05_00_RELEASE_RESERVED_INFO(i) MW((303+(i)*16):(301+(i)*16))
// Convenience aliases: instances 0 and 1 of the indexed RELEASE_* fields.
#define NVCEC0_QMDV05_00_RELEASE0_ENABLE NVCEC0_QMDV05_00_RELEASE_ENABLE(0)
#define NVCEC0_QMDV05_00_RELEASE1_ENABLE NVCEC0_QMDV05_00_RELEASE_ENABLE(1)
#define NVCEC0_QMDV05_00_RELEASE0_STRUCTURE_SIZE NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE(0)
#define NVCEC0_QMDV05_00_RELEASE1_STRUCTURE_SIZE NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE(1)
#define NVCEC0_QMDV05_00_RELEASE0_MEMBAR_TYPE NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE(0)
#define NVCEC0_QMDV05_00_RELEASE1_MEMBAR_TYPE NVCEC0_QMDV05_00_RELEASE_MEMBAR_TYPE(1)
#define NVCEC0_QMDV05_00_RELEASE0_REDUCTION_OP NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP(0)
#define NVCEC0_QMDV05_00_RELEASE1_REDUCTION_OP NVCEC0_QMDV05_00_RELEASE_REDUCTION_OP(1)
#define NVCEC0_QMDV05_00_RELEASE0_REDUCTION_FORMAT NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT(0)
#define NVCEC0_QMDV05_00_RELEASE1_REDUCTION_FORMAT NVCEC0_QMDV05_00_RELEASE_REDUCTION_FORMAT(1)
#define NVCEC0_QMDV05_00_RELEASE0_TRAP_TYPE NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE(0)
#define NVCEC0_QMDV05_00_RELEASE1_TRAP_TYPE NVCEC0_QMDV05_00_RELEASE_TRAP_TYPE(1)
#define NVCEC0_QMDV05_00_RELEASE0_PAYLOAD64B NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B(0)
#define NVCEC0_QMDV05_00_RELEASE1_PAYLOAD64B NVCEC0_QMDV05_00_RELEASE_PAYLOAD64B(1)
#define NVCEC0_QMDV05_00_RELEASE0_ADDRESS_LOWER MW(511:480)
#define NVCEC0_QMDV05_00_RELEASE0_ADDRESS_UPPER MW(543:512)
#define NVCEC0_QMDV05_00_RELEASE0_PAYLOAD_LOWER MW(575:544)
#define NVCEC0_QMDV05_00_RELEASE0_PAYLOAD_UPPER MW(607:576)
#define NVCEC0_QMDV05_00_RELEASE1_ADDRESS_LOWER MW(799:768)
#define NVCEC0_QMDV05_00_RELEASE1_ADDRESS_UPPER MW(831:800)
#define NVCEC0_QMDV05_00_RELEASE1_PAYLOAD_LOWER MW(863:832)
#define NVCEC0_QMDV05_00_RELEASE1_PAYLOAD_UPPER MW(895:864)
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_HEADER_CACHE MW(472:472)
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE 0x00000000
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE 0x00000001
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_SAMPLER_CACHE MW(473:473)
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE 0x00000000
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE 0x00000001
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_DATA_CACHE MW(474:474)
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_DATA_CACHE_FALSE 0x00000000
#define NVCEC0_QMDV05_00_INVALIDATE_TEXTURE_DATA_CACHE_TRUE 0x00000001
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_DATA_CACHE MW(475:475)
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_DATA_CACHE_FALSE 0x00000000
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_DATA_CACHE_TRUE 0x00000001
#define NVCEC0_QMDV05_00_INVALIDATE_INSTRUCTION_CACHE MW(476:476)
#define NVCEC0_QMDV05_00_INVALIDATE_INSTRUCTION_CACHE_FALSE 0x00000000
#define NVCEC0_QMDV05_00_INVALIDATE_INSTRUCTION_CACHE_TRUE 0x00000001
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_CONSTANT_CACHE MW(477:477)
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE 0x00000000
#define NVCEC0_QMDV05_00_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE 0x00000001
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_ADDR_LOWER_SHIFTED MW(1919:1888)
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_ADDR_UPPER_SHIFTED MW(1936:1920)
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_SIZE MW(1945:1937)
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_TYPE MW(1947:1946)
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_TYPE_PREFETCH_LAUNCH 0x00000000
// NOTE(review): "PREFTECH" below looks like a misspelling of "PREFETCH"; renaming it would
// change the generated identifier, so confirm against the upstream name before touching it.
#define NVCEC0_QMDV05_00_PROGRAM_PREFETCH_TYPE_PREFTECH_POST 0x00000001
#define NVCEC0_QMDV05_00_SHARED_MEMORY_SIZE MW(1162:1145)
#define NVCEC0_QMDV05_00_MIN_SM_CONFIG_SHARED_MEM_SIZE MW(1168:1163)
#define NVCEC0_QMDV05_00_MAX_SM_CONFIG_SHARED_MEM_SIZE MW(1174:1169)
#define NVCEC0_QMDV05_00_TARGET_SM_CONFIG_SHARED_MEM_SIZE MW(1180:1175)
// ??
#define NVCEC0_QMDV05_00_SHADER_LOCAL_MEMORY_HIGH_SIZE MW(1213:1196)
#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT MW(456:456)
#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT__32 0x00000000
#define NVCEC0_QMDV05_00_API_VISIBLE_CALL_LIMIT_NO_CHECK 0x00000001
#define NVCEC0_QMDV05_00_SAMPLER_INDEX MW(457:457)
#define NVCEC0_QMDV05_00_SAMPLER_INDEX_INDEPENDENTLY 0x00000000
#define NVCEC0_QMDV05_00_SAMPLER_INDEX_VIA_HEADER_INDEX 0x00000001
#define NVCEC0_QMDV05_00_UNKNOWN_13 MW(159:152) // A4
#define NVCEC0_QMDV05_00_SASS_VERSION MW(455:448)
#endif // #ifndef __CLCEC0QMD_H__

File diff suppressed because it is too large Load Diff

View File

@@ -60,11 +60,15 @@ uvm = make_uvm_type()
class QMD:
fields: dict[str, dict[str, tuple[int, int]]] = {}
def __init__(self, addr=None, pref="NVC6C0_QMDV03_00", **kwargs):
if pref not in QMD.fields:
def __init__(self, dev:NVDevice, addr:int|None=None, **kwargs):
self.ver, self.sz = (5, 0x60) if dev.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A else (3, 0x40)
# Init fields from module
if (pref:="NVCEC0_QMDV05_00" if self.ver == 5 else "NVC6C0_QMDV03_00") not in QMD.fields:
QMD.fields[pref] = {**{name[len(pref)+1:]: dt for name,dt in nv_gpu.__dict__.items() if name.startswith(pref) and isinstance(dt, tuple)},
**{name[len(pref)+1:]+f"_{i}": dt(i) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith(pref) and callable(dt)}}
self.mv, self.pref = (memoryview(bytearray(0x40 * 4)) if addr is None else to_mv(addr, 0x40 * 4)), pref
self.mv, self.pref = (memoryview(bytearray(self.sz * 4)) if addr is None else to_mv(addr, self.sz * 4)), pref
if kwargs: self.write(**kwargs)
def _rw_bits(self, hi:int, lo:int, value:int|None=None):
@@ -83,6 +87,10 @@ class QMD:
def field_offset(self, k):
  # Look up field `k` (case-insensitive) in the table for this QMD's prefix and
  # turn the second element of its bit-range tuple into a byte offset.
  bit_range = QMD.fields[self.pref][k.upper()]
  return bit_range[1] // 8
def set_constant_buf_addr(self, i, addr):
  # QMD versions from 4 on store the address shifted right by 6 in the *_shifted6 fields;
  # earlier versions store the address unshifted in the plain upper/lower fields.
  if self.ver >= 4:
    shifted = addr >> 6
    fields = {f'constant_buffer_addr_upper_shifted6_{i}':hi32(shifted), f'constant_buffer_addr_lower_shifted6_{i}':lo32(shifted)}
  else:
    fields = {f'constant_buffer_addr_upper_{i}':hi32(addr), f'constant_buffer_addr_lower_{i}':lo32(addr)}
  self.write(**fields)
class NVSignal(HCQSignal):
def __init__(self, base_buf:HCQBuffer|None=None, **kwargs):
super().__init__(base_buf, **kwargs, timestamp_divider=1000, dev_t=NVDevice)
@@ -146,15 +154,15 @@ class NVComputeQueue(NVCommandQueue):
self.bind_args_state(args_state)
qmd_buf = args_state.buf.offset(round_up(prg.constbufs[0][1], 1 << 8))
qmd_buf.cpu_view().view(size=0x40 * 4, fmt='B')[:] = prg.qmd.mv
qmd_buf.cpu_view().view(size=prg.qmd.mv.nbytes, fmt='B')[:] = prg.qmd.mv
assert qmd_buf.va_addr < (1 << 40), f"large qmd addr {qmd_buf.va_addr:x}"
qmd = QMD(addr=qmd_buf.va_addr) # Save qmd for later update
qmd = QMD(dev=prg.dev, addr=cast(int, qmd_buf.va_addr)) # Save qmd for later update
self.bind_sints_to_mem(*global_size, mem=qmd_buf.cpu_view(), fmt='I', offset=nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8)
self.bind_sints_to_mem(*local_size, mem=qmd_buf.cpu_view(), fmt='H', offset=nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8)
self.bind_sints_to_mem(*local_size, *global_size, mem=args_state.buf.cpu_view(), fmt='I')
qmd.write(constant_buffer_addr_upper_0=hi32(args_state.buf.va_addr), constant_buffer_addr_lower_0=lo32(args_state.buf.va_addr))
self.bind_sints_to_mem(*global_size, mem=qmd_buf.cpu_view(), fmt='I', offset=qmd.field_offset('cta_raster_width'))
self.bind_sints_to_mem(*(local_size[:2]), mem=qmd_buf.cpu_view(), fmt='H', offset=qmd.field_offset('cta_thread_dimension0'))
self.bind_sints_to_mem(local_size[2], mem=qmd_buf.cpu_view(), fmt='B', offset=qmd.field_offset('cta_thread_dimension2'))
qmd.set_constant_buf_addr(0, args_state.buf.va_addr)
if self.active_qmd is None:
self.nvm(1, nv_gpu.NVC6C0_SEND_PCAS_A, qmd_buf.va_addr >> 8)
@@ -242,21 +250,28 @@ class NVProgram(HCQProgram):
ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
self.constbuffer_0 = [0] * (cbuf0_size // 4)
self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]
if dev.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A:
self.constbuffer_0[188:192], self.constbuffer_0[223] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window)], 0xfffdc0
qmd = {'qmd_major_version':5, 'unknown_13':0x1, 'program_address_upper':hi32(self.prog_addr>>4),'program_address_lower':lo32(self.prog_addr>>4),
'sass_version':0xA4}
else:
self.constbuffer_0[6:12] = [*data64_le(self.dev.shared_mem_window), *data64_le(self.dev.local_mem_window), *data64_le(0xfffdc0)]
qmd = {'qmd_major_version':3, 'sm_global_caching_enable':1, 'cwd_membar_type':nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR,
'program_address_upper':hi32(self.prog_addr), 'program_address_lower':lo32(self.prog_addr), 'sass_version':0x89}
smem_cfg = min(shmem_conf * 1024 for shmem_conf in [32, 64, 100] if shmem_conf * 1024 >= self.shmem_usage) // 4096 + 1
self.qmd:QMD = QMD(qmd_group_id=0x3f, sm_global_caching_enable=1, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1,
cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1,
shared_memory_size=self.shmem_usage, min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg,
max_sm_config_shared_mem_size=0x1a, register_count_v=self.regs_usage, sass_version=0x89,
program_address_upper=hi32(self.prog_addr), program_address_lower=lo32(self.prog_addr),
barrier_count=1, shader_local_memory_high_size=self.dev.slm_per_thread, program_prefetch_size=min(self.prog_sz>>8, 0x1ff),
program_prefetch_addr_lower_shifted=self.prog_addr>>8, program_prefetch_addr_upper_shifted=self.prog_addr>>40)
self.qmd:QMD = QMD(dev, **qmd, qmd_group_id=0x3f, invalidate_texture_header_cache=1, invalidate_texture_sampler_cache=1,
invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1, barrier_count=1,
constant_buffer_invalidate_0=1, register_count_v=self.regs_usage, shader_local_memory_high_size=self.dev.slm_per_thread,
min_sm_config_shared_mem_size=smem_cfg, target_sm_config_shared_mem_size=smem_cfg, max_sm_config_shared_mem_size=0x1a,
shared_memory_size=self.shmem_usage, program_prefetch_size=min(self.prog_sz>>8, 0x1ff),
program_prefetch_addr_upper_shifted=self.prog_addr>>40, program_prefetch_addr_lower_shifted=self.prog_addr>>8)
for i,(addr,sz) in self.constbufs.items():
self.qmd.write(**{f'constant_buffer_addr_upper_{i}': hi32(addr), f'constant_buffer_addr_lower_{i}': lo32(addr),
f'constant_buffer_size_shifted4_{i}': sz, f'constant_buffer_valid_{i}': 1})
self.qmd.set_constant_buf_addr(i, addr)
self.qmd.write(**{f'constant_buffer_size_shifted4_{i}': sz, f'constant_buffer_valid_{i}': 1})
# Register allocation granularity per warp is 256, warp allocation granularity is 4. Register file size is 65536.
self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32
@@ -399,7 +414,10 @@ class NVDevice(HCQCompiled[NVSignal]):
classlist = memoryview(bytearray(100 * 4)).cast('I')
clsinfo = rmctrl.gpu_get_classlist(self.fd_ctl, self.root, self.nvdevice, numClasses=100, classList=mv_address(classlist))
self.nvclasses = {classlist[i] for i in range(clsinfo.numClasses)}
self.compute_class = next(clss for clss in [nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if clss in self.nvclasses)
self.usermode_class:int = next(c for c in [nv_gpu.HOPPER_USERMODE_A, nv_gpu.TURING_USERMODE_A] if c in self.nvclasses)
self.gpfifo_class:int = next(c for c in [nv_gpu.BLACKWELL_CHANNEL_GPFIFO_A, nv_gpu.AMPERE_CHANNEL_GPFIFO_A] if c in self.nvclasses)
self.compute_class:int = next(c for c in [nv_gpu.BLACKWELL_COMPUTE_B, nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if c in self.nvclasses)
self.dma_class:int = next(c for c in [nv_gpu.BLACKWELL_DMA_COPY_B, nv_gpu.AMPERE_DMA_COPY_B] if c in self.nvclasses)
def __init__(self, device:str=""):
if NVDevice.root is None:
@@ -427,12 +445,13 @@ class NVDevice(HCQCompiled[NVSignal]):
vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES)
self.nvdevice = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew
self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.nvdevice, None).hObjectNew
self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew
self.gpu_mmio = MMIOInterface(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz, fmt='I')
self._setup_nvclasses()
self._debug_mappings: dict[tuple[int, int], str] = dict()
self.usermode = rm_alloc(self.fd_ctl, self.usermode_class, self.root, self.subdevice, None).hObjectNew
self.gpu_mmio = MMIOInterface(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz, fmt='I')
rmctrl.perf_boost(self.fd_ctl, self.root, self.subdevice, duration=0xffffffff, flags=((nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_YES << 4) | \
(nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CUDA_PRIORITY_HIGH << 6) | (nv_gpu.NV2080_CTRL_PERF_BOOST_FLAGS_CMD_BOOST_TO_MAX << 0)))
@@ -469,7 +488,9 @@ class NVDevice(HCQCompiled[NVSignal]):
self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs',
'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version')
self.arch: str = f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
# FIXME: it is not known how to derive the arch string from sm_version on Blackwell, so 0xa04 is hardcoded to sm_120 for now
self.arch: str = "sm_120" if self.sm_version==0xa04 else f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
@@ -482,9 +503,9 @@ class NVDevice(HCQCompiled[NVSignal]):
params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
gpfifo = rm_alloc(self.fd_ctl, nv_gpu.AMPERE_CHANNEL_GPFIFO_A, self.root, channel_group, params).hObjectNew
gpfifo = rm_alloc(self.fd_ctl, self.gpfifo_class, self.root, channel_group, params).hObjectNew
comp = rm_alloc(self.fd_ctl, self.compute_class, self.root, gpfifo, None).hObjectNew
rm_alloc(self.fd_ctl, nv_gpu.AMPERE_DMA_COPY_B, self.root, gpfifo, None)
rm_alloc(self.fd_ctl, self.dma_class, self.root, gpfifo, None)
if enable_debug:
self.debug_compute_obj, self.debug_channel = comp, gpfifo
@@ -508,14 +529,17 @@ class NVDevice(HCQCompiled[NVSignal]):
return [x.data for x in infos]
def _setup_gpfifos(self):
self.slm_per_thread, self.shader_local_mem = 0, None
# Set the window addresses so that they do not collide with other allocated buffers.
self.shared_mem_window, self.local_mem_window, self.slm_per_thread, self.shader_local_mem = 0xfe000000, 0xff000000, 0, None
self.shared_mem_window = 0x729400000000 if self.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A else 0xfe000000
self.local_mem_window = 0x729300000000 if self.compute_class >= nv_gpu.BLACKWELL_COMPUTE_A else 0xff000000
NVComputeQueue().setup(compute_class=self.compute_class, local_mem_window=self.local_mem_window, shared_mem_window=self.shared_mem_window) \
.signal(self.timeline_signal, self.timeline_value).submit(self)
cast(NVCopyQueue, NVCopyQueue().wait(self.timeline_signal, self.timeline_value)) \
.setup(copy_class=nv_gpu.AMPERE_DMA_COPY_B) \
.setup(copy_class=self.dma_class) \
.signal(self.timeline_signal, self.timeline_value + 1).submit(self)
self.timeline_value += 2