From daea1161ccf3ac88a27ff85b6703bde9b1a49cc4 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Wed, 3 Dec 2025 16:30:22 +0300 Subject: [PATCH] nv: nvdec for blackwell (#13546) --- extra/hevc/hevc.py | 1 + tinygrad/runtime/autogen/__init__.py | 2 +- tinygrad/runtime/ops_nv.py | 14 +++++++++----- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/extra/hevc/hevc.py b/extra/hevc/hevc.py index 36abd30c8e..248bd766f3 100644 --- a/extra/hevc/hevc.py +++ b/extra/hevc/hevc.py @@ -322,6 +322,7 @@ def parse_hevc_file_headers(dat:bytes, device="NV"): if hdr.slice_type == avcodec.HEVC_SLICE_B: device_ctx.initreflistidxl1 = (ctypes.c_uint8 * 16)(*[idx for _,idx in after_list + before_list]) locl_ctx_bytes = bytes(device_ctx) + locl_ctx_bytes += b'\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00' # blackwell extension locl_ctx_bytes += bytes(0x200 - len(locl_ctx_bytes)) # pad to 512 bytes pic_width_in_ctbs = ceildiv(sps.pic_width_in_luma_samples, (1 << sps.log2_max_luma_coding_block_size)) diff --git a/tinygrad/runtime/autogen/__init__.py b/tinygrad/runtime/autogen/__init__.py index 7eab683e8c..2ec86178b5 100644 --- a/tinygrad/runtime/autogen/__init__.py +++ b/tinygrad/runtime/autogen/__init__.py @@ -40,7 +40,7 @@ def __getattr__(nm): *[root/"extra/nv_gpu_driver"/s for s in ["clc9b0.h", "clc6c0qmd.h","clcec0qmd.h", "nvdec_drv.h"]], "{}/kernel-open/common/inc/nvmisc.h", *[f"{{}}/src/common/sdk/nvidia/inc/class/cl{s}.h" for s in ["0000", "0070", "0080", "2080", "2080_notification", "c56f", "c86f", "c96f", "c761", "83de", "c6c0", "cdc0"]], - *[f"{{}}/kernel-open/nvidia-uvm/{s}.h" for s in ["clc6b5", "clc9b5", "uvm_ioctl", "uvm_linux_ioctl", "hwref/ampere/ga100/dev_fault"]], + *[f"{{}}/kernel-open/nvidia-uvm/{s}.h" for s in ["clc6b5", "clc9b5", "clcfb0", "uvm_ioctl", "uvm_linux_ioctl", "hwref/ampere/ga100/dev_fault"]], *[f"{{}}/src/nvidia/arch/nvalloc/unix/include/nv{s}.h" for s in ["_escape", "-ioctl", "-ioctl-numbers", "-ioctl-numa", "-unix-nvos-params-wrappers"]], *[f"{{}}/src/common/sdk/nvidia/inc/{s}.h" for s in ["alloc/alloc_channel", "nvos", "ctrl/ctrlc36f", "ctrl/ctrlcb33", diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 00259dcaf7..7ec7594665 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -181,8 +181,8 @@ class NVCopyQueue(NVCommandQueue): def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo) class NVVideoQueue(NVCommandQueue): - def decode_hevc_chunk(self, pic_desc:HCQBuffer, in_buf:HCQBuffer, out_buf:HCQBuffer, out_buf_pos:int, hist_bufs:list[HCQBuffer], - hist_pos:list[int], chroma_off:int, coloc_buf:HCQBuffer, filter_buf:HCQBuffer, intra_top_off:int, status_buf:HCQBuffer): + def decode_hevc_chunk(self, pic_desc:HCQBuffer, in_buf:HCQBuffer, out_buf:HCQBuffer, out_buf_pos:int, hist_bufs:list[HCQBuffer], hist_pos:list[int], + chroma_off:int, coloc_buf:HCQBuffer, filter_buf:HCQBuffer, intra_top_off:int, intra_unk_off:int|None, status_buf:HCQBuffer): self.nvm(4, nv_gpu.NVC9B0_SET_APPLICATION_ID, nv_gpu.NVC9B0_SET_APPLICATION_ID_ID_HEVC) self.nvm(4, nv_gpu.NVC9B0_SET_CONTROL_PARAMS, 0x52057) self.nvm(4, nv_gpu.NVC9B0_SET_DRV_PIC_SETUP_OFFSET, pic_desc.va_addr >> 8) @@ -195,6 +195,7 @@ class NVVideoQueue(NVCommandQueue): self.nvm(4, nv_gpu.NVC9B0_HEVC_SET_TILE_SIZES_OFFSET, pic_desc.offset(0x200).va_addr >> 8) self.nvm(4, nv_gpu.NVC9B0_HEVC_SET_FILTER_BUFFER_OFFSET, filter_buf.va_addr >> 8) self.nvm(4, nv_gpu.NVC9B0_SET_INTRA_TOP_BUF_OFFSET, (filter_buf.va_addr + intra_top_off) >> 8) + if intra_unk_off is not None: self.nvm(4, 0x4dc, (filter_buf.va_addr + intra_unk_off) >> 8) self.nvm(4, nv_gpu.NVC9B0_EXECUTE, 0) return self @@ -313,7 +314,8 @@ class NVAllocator(HCQAllocator['NVDevice']): self.dev._ensure_has_vid_hw(w, h) NVVideoQueue().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \ .decode_hevc_chunk(desc_buf, bufin, bufout, frame_pos, hist, [(frame_pos-x) % (len(hist) + 1) for x in range(len(hist), 0, -1)], - round_up(w, 64)*round_up(h, 64), self.dev.vid_coloc_buf, self.dev.vid_filter_buf, self.dev.intra_top_off, self.dev.vid_stat_buf) \ + round_up(w, 64)*round_up(h, 64), self.dev.vid_coloc_buf, self.dev.vid_filter_buf, self.dev.intra_top_off, + self.dev.intra_unk_off, self.dev.vid_stat_buf) \ .signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev) @dataclass @@ -393,7 +395,7 @@ class NVKIface: self.gpfifo_class:int = next(c for c in [nv_gpu.BLACKWELL_CHANNEL_GPFIFO_A, nv_gpu.AMPERE_CHANNEL_GPFIFO_A] if c in self.nvclasses) self.compute_class:int = next(c for c in [nv_gpu.BLACKWELL_COMPUTE_B, nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if c in self.nvclasses) self.dma_class:int = next(c for c in [nv_gpu.BLACKWELL_DMA_COPY_B, nv_gpu.AMPERE_DMA_COPY_B] if c in self.nvclasses) - self.viddec_class:int|None = next((c for c in [nv_gpu.NVC9B0_VIDEO_DECODER] if c in self.nvclasses), None) + self.viddec_class:int|None = next((c for c in [nv_gpu.NVCFB0_VIDEO_DECODER, nv_gpu.NVC9B0_VIDEO_DECODER] if c in self.nvclasses), None) usermode = self.rm_alloc(self.dev.subdevice, self.usermode_class) return usermode, MMIOInterface(self._gpu_map_to_cpu(usermode, mmio_sz:=0x10000), mmio_sz, fmt='I') @@ -661,7 +663,9 @@ class NVDevice(HCQCompiled[HCQSignal]): coloc_size = round_up((round_up(h, 64) * round_up(h, 64)) + (round_up(w, 64) * round_up(h, 64) // 16), 2 << 20) self.intra_top_off = round_up(h, 64) * (608 + 4864 + 152 + 2000) - filter_size = round_up(round_up(self.intra_top_off, 0x10000) + 64 << 10, 2 << 20) + intra_unk_size = ((2 << 20) if self.iface.viddec_class >= nv_gpu.NVCFB0_VIDEO_DECODER else 0) + self.intra_unk_off = (round_up(self.intra_top_off, 0x10000) + (64 << 10)) if intra_unk_size > 0 else None + filter_size = round_up(round_up(self.intra_top_off, 0x10000) + (64 << 10) + intra_unk_size, 2 << 20) if not hasattr(self, 'vid_gpfifo'): self.vid_gpfifo = self._new_gpu_fifo(self.gpfifo_area, 0, self.nvdevice, offset=0x200000, entries=2048, compute=False, video=True)