nv: profile nvdec (#14053)

This commit is contained in:
nimlgen
2026-01-07 15:56:54 +03:00
committed by GitHub
parent 35900290b2
commit 25c82dd242

View File

@@ -4,10 +4,10 @@ assert sys.platform != 'win32'
from typing import cast, ClassVar
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
from tinygrad.uop.ops import sint
from tinygrad.device import BufferSpec, CompilerPair, CompilerSet
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, PROFILE
from tinygrad.renderer.ptx import PTXRenderer
from tinygrad.renderer.cstyle import NVRenderer
from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, NVPTXCompiler, NVCompiler
@@ -204,7 +204,7 @@ class NVVideoQueue(NVCommandQueue):
# Emit the SEMAPHORE_A / SEMAPHORE_D methods for `signal` through self.nvm and return self so calls can be chained.
def signal(self, signal:HCQSignal, value:sint=0):
# SEMAPHORE_A is given the unpacked data64(signal.value_addr) words followed by `value`.
self.nvm(4, nv_gpu.NVC9B0_SEMAPHORE_A, *data64(signal.value_addr), value)
# NOTE(review): this is a diff view without +/- markers; the next two statements are the before/after of ONE line.
# Before: SEMAPHORE_D is written as 0.
self.nvm(4, nv_gpu.NVC9B0_SEMAPHORE_D, 0)
# After: SEMAPHORE_D is written with bits 24 and 0 set. The field meanings are not visible here; since the
# commit is about profiling, presumably this changes what the release writes (e.g. a timestamp) - TODO confirm
# against the NVC9B0 class header.
self.nvm(4, nv_gpu.NVC9B0_SEMAPHORE_D, (1 << 24) | (1 << 0))
return self
# Submit this queue by delegating to _submit_to_gpfifo with the device's video GPFIFO (dev.vid_gpfifo).
def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.vid_gpfifo)
@@ -314,11 +314,13 @@ class NVAllocator(HCQAllocator['NVDevice']):
h, w = ((2 * shape[0]) // 3 if shape[0] % 3 == 0 else (2 * shape[0] - 1) // 3), shape[1]
self.dev._ensure_has_vid_hw(w, h)
NVVideoQueue().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
.decode_hevc_chunk(desc_buf, bufin, bufout, frame_pos, hist, [(frame_pos-x) % (len(hist) + 1) for x in range(len(hist), 0, -1)],
round_up(w, 64)*round_up(h, 64), self.dev.vid_coloc_buf, self.dev.vid_filter_buf, self.dev.intra_top_off,
self.dev.intra_unk_off, self.dev.vid_stat_buf) \
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
q = NVVideoQueue().wait(self.dev.timeline_signal, self.dev.timeline_value - 1)
with hcq_profile(self.dev, queue=q, desc="NVDEC", enabled=PROFILE):
q.decode_hevc_chunk(desc_buf, bufin, bufout, frame_pos, hist, [(frame_pos-x) % (len(hist) + 1) for x in range(len(hist), 0, -1)],
round_up(w, 64)*round_up(h, 64), self.dev.vid_coloc_buf, self.dev.vid_filter_buf, self.dev.intra_top_off,
self.dev.intra_unk_off, self.dev.vid_stat_buf)
q.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
@dataclass
class GPFifo: