mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-09 23:18:04 -05:00
nv: profile nvdec (#14053)
This commit is contained in:
@@ -4,10 +4,10 @@ assert sys.platform != 'win32'
|
||||
from typing import cast, ClassVar
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
|
||||
from tinygrad.uop.ops import sint
|
||||
from tinygrad.device import BufferSpec, CompilerPair, CompilerSet
|
||||
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK
|
||||
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, PROFILE
|
||||
from tinygrad.renderer.ptx import PTXRenderer
|
||||
from tinygrad.renderer.cstyle import NVRenderer
|
||||
from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, NVPTXCompiler, NVCompiler
|
||||
@@ -204,7 +204,7 @@ class NVVideoQueue(NVCommandQueue):
|
||||
|
||||
def signal(self, signal:HCQSignal, value:sint=0):
|
||||
self.nvm(4, nv_gpu.NVC9B0_SEMAPHORE_A, *data64(signal.value_addr), value)
|
||||
self.nvm(4, nv_gpu.NVC9B0_SEMAPHORE_D, 0)
|
||||
self.nvm(4, nv_gpu.NVC9B0_SEMAPHORE_D, (1 << 24) | (1 << 0))
|
||||
return self
|
||||
|
||||
def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.vid_gpfifo)
|
||||
@@ -314,11 +314,13 @@ class NVAllocator(HCQAllocator['NVDevice']):
|
||||
|
||||
h, w = ((2 * shape[0]) // 3 if shape[0] % 3 == 0 else (2 * shape[0] - 1) // 3), shape[1]
|
||||
self.dev._ensure_has_vid_hw(w, h)
|
||||
NVVideoQueue().wait(self.dev.timeline_signal, self.dev.timeline_value - 1) \
|
||||
.decode_hevc_chunk(desc_buf, bufin, bufout, frame_pos, hist, [(frame_pos-x) % (len(hist) + 1) for x in range(len(hist), 0, -1)],
|
||||
round_up(w, 64)*round_up(h, 64), self.dev.vid_coloc_buf, self.dev.vid_filter_buf, self.dev.intra_top_off,
|
||||
self.dev.intra_unk_off, self.dev.vid_stat_buf) \
|
||||
.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
|
||||
|
||||
q = NVVideoQueue().wait(self.dev.timeline_signal, self.dev.timeline_value - 1)
|
||||
with hcq_profile(self.dev, queue=q, desc="NVDEC", enabled=PROFILE):
|
||||
q.decode_hevc_chunk(desc_buf, bufin, bufout, frame_pos, hist, [(frame_pos-x) % (len(hist) + 1) for x in range(len(hist), 0, -1)],
|
||||
round_up(w, 64)*round_up(h, 64), self.dev.vid_coloc_buf, self.dev.vid_filter_buf, self.dev.intra_top_off,
|
||||
self.dev.intra_unk_off, self.dev.vid_stat_buf)
|
||||
q.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
|
||||
|
||||
@dataclass
|
||||
class GPFifo:
|
||||
|
||||
Reference in New Issue
Block a user