nv: minimal hevc (#13502)

* nv: minimal hevc

* validate

* not needed

* tralin

* var

* cpu

* fix

* desc

* move

* cleanup
This commit is contained in:
nimlgen
2025-11-30 16:46:55 +03:00
committed by GitHub
parent fd373fea7a
commit 455dd88236
23 changed files with 7417 additions and 37 deletions

View File

@@ -318,6 +318,8 @@ jobs:
# TODO: too slow
# - name: Fuzz Padded Tensor Core GEMM (PTX)
# run: NV=1 NV_PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
- name: HEVC Decode Benchmark
run: VALIDATE=1 MAX_FRAMES=100 NV=1 PYTHONPATH=. python3 extra/hevc/decode.py
- name: Train MNIST
run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
# TODO: too slow

1
extra/hevc/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
out/

71
extra/hevc/decode.py Normal file
View File

@@ -0,0 +1,71 @@
import argparse, os, hashlib
from tinygrad.helpers import getenv, DEBUG, round_up, Timing, tqdm, fetch
from extra.hevc.hevc import parse_hevc_file_headers, untile_nv12, to_bgr, nv_gpu
from tinygrad import Tensor, dtypes, Device, Variable
if __name__ == "__main__":
  # Decode an HEVC elementary stream on the NV device, then either validate every frame against a
  # pickled reference (VALIDATE=1) or write each frame as a PNG into --output_dir.
  parser = argparse.ArgumentParser()
  parser.add_argument("--input_file", type=str, default="")
  parser.add_argument("--output_dir", type=str, default="extra/hevc/out")
  args = parser.parse_args()

  os.makedirs(args.output_dir, exist_ok=True)

  if args.input_file == "":
    url = "https://github.com/haraschax/filedump/raw/09a497959f7fa6fd8dba501a25f2cdb3a41ecb12/comma_video.hevc"
    hevc_tensor = Tensor.from_url(url, device="CPU")
  else:
    hevc_tensor = Tensor.empty(os.stat(args.input_file).st_size, dtype=dtypes.uint8, device=f"disk:{args.input_file}").to("CPU")

  dat = bytes(hevc_tensor.data())
  # the md5 of the input selects which reference pickle is used when validating
  dat_hash = hashlib.md5(dat).hexdigest()

  with Timing("prep infos: "):
    # NOTE(review): dat_nv is never read below; kept so this edit does not change what the timed section touches
    dat_nv = hevc_tensor.to("NV")
    opaque, frame_info, w, h, luma_w, luma_h, chroma_off = parse_hevc_file_headers(dat)

    frame_info = frame_info[:getenv("MAX_FRAMES", len(frame_info))]

  # move all needed data to gpu
  with Timing("prep slices to gpu: "):
    opaque_nv = opaque.to("NV").contiguous().realize()
    all_slices = [hevc_tensor[offset:offset+sz].to("NV").contiguous().realize() for offset, sz, *_ in frame_info]
    Device.default.synchronize()

  out_image_size = luma_h + (luma_h + 1) // 2, round_up(luma_w, 64)
  max_hist = max(history_sz for _, _, _, history_sz, _ in frame_info)
  pos = Variable("pos", 0, max_hist + 1)

  history = []
  out_images = []
  with Timing("decoding whole file: ", on_exit=(lambda et: f", {len(frame_info)} frames, {len(frame_info)/(et/1e9):.2f} fps")):
    for i, (offset, sz, frame_pos, history_sz, is_hist) in enumerate(frame_info):
      # keep only as many previously decoded frames as the parser recorded for this picture
      history = history[-history_sz:] if history_sz > 0 else []

      outimg = all_slices[i].decode_hevc_frame(pos.bind(frame_pos), out_image_size, opaque_nv[i], history).realize()
      out_images.append(outimg)
      if is_hist: history.append(outimg)

    Device.default.synchronize()

  # read the switch once instead of once per frame
  validate = getenv("VALIDATE", 0)
  if validate:
    import pickle
    # NOTE(security): pickle.load can execute arbitrary code; only point this at trusted reference files/URLs.
    if dat_hash == "b813bfdbec194fd17fdf0e3ceb8cea1c":
      url = "https://github.com/nimlgen/hevc_validate_set/raw/refs/heads/main/decoded_frames_b813bfdbec194fd17fdf0e3ceb8cea1c.pkl"
      with fetch(url).open("rb") as f: decoded_frames = pickle.load(f)
    else:
      with open(f"extra/hevc/decoded_frames_{dat_hash}.pkl", "rb") as f: decoded_frames = pickle.load(f)
  else: import cv2

  for i, img in tqdm(enumerate(out_images)):
    if validate:
      # frames missing from the reference (or stored empty) are skipped rather than failed
      if i < len(decoded_frames) and len(decoded_frames[i]) > 0:
        img = untile_nv12(img, h, w, luma_w, chroma_off).realize()
        assert img.data() == decoded_frames[i], f"Frame {i} does not match reference decoder!"
        print(f"Frame {i} matches reference decoder!")
    else:
      img = to_bgr(img, h, w, luma_w, chroma_off).realize()
      cv2.imwrite(f"{args.output_dir}/out_frame_{i:04d}.png", img.numpy())

449
extra/hevc/hevc.py Normal file
View File

@@ -0,0 +1,449 @@
import dataclasses, enum, argparse, os, itertools, time, ctypes
from typing import Any
from tinygrad import Tensor, dtypes, Device, TinyJit
from tinygrad.helpers import DEBUG, round_up, ceildiv, Timing, prod
from tinygrad.runtime.autogen import avcodec, nv_570 as nv_gpu
class BitReader:
  """MSB-first bit reader over a bytes object, with the Exp-Golomb decoders used by the HEVC header parsers.

  Attributes:
    read_bits:    number of bits pulled from the underlying byte iterator so far (always a multiple of 8).
    current_bits: number of bits currently buffered in `bits` and not yet consumed.
  The bit position of the next unread bit is therefore `read_bits - current_bits`.
  Reading past the end of the data raises StopIteration from the byte iterator.
  """
  def __init__(self, data:bytes): self.reader, self.current_bits, self.bits, self.read_bits, self.total = iter(data), 0, 0, 0, len(data) * 8

  def empty(self):
    """Return True once every input bit has been both fetched and consumed."""
    return self.read_bits == self.total and self.current_bits == 0

  def peak_bits(self, n):
    """Return the next n bits as an int without consuming them, fetching whole bytes as needed."""
    while self.current_bits < n:
      self.bits = (self.bits << 8) | next(self.reader)
      self.current_bits += 8
      self.read_bits += 8
    return (self.bits >> (self.current_bits - n)) & ((1 << n) - 1)

  def _next_bits(self, n):
    """Return the next n bits and drop them from the buffer."""
    val = self.peak_bits(n)
    # keep only the bits below the ones just returned
    self.bits &= (1 << (self.current_bits - n)) - 1
    self.current_bits -= n
    return val

  def u(self, n):
    """Read an unsigned n-bit integer (n == 0 reads nothing and returns 0)."""
    return self._next_bits(n)

  # 9.2 Parsing process for 0-th order Exp-Golomb codes
  def ue_v(self):
    """Read an unsigned 0-th order Exp-Golomb code: N leading zeros, a 1, then N suffix bits."""
    leading_zero_bits = 0
    while self.u(1) == 0: leading_zero_bits += 1
    return (1 << leading_zero_bits) - 1 + self.u(leading_zero_bits)

  # 9.2.2 Mapping process for signed Exp-Golomb codes
  def se_v(self):
    """Read a signed Exp-Golomb code.

    codeNum k maps to (-1)^(k+1) * Ceil(k / 2): 0, 1, -1, 2, -2, ...
    (the previous expression `-1 ** (k + 1)` parsed as `-(1 ** (k + 1))` and used floor division,
    so odd code numbers came out with the wrong sign and magnitude).
    """
    k = self.ue_v()
    magnitude = (k + 1) // 2
    return magnitude if k % 2 else -magnitude
# 7.3.1.1 General NAL unit syntax
def _hevc_get_rbsp(dat:bytes, off=0) -> bytes:
rbsp = bytes()
while off < len(dat):
if off + 2 < len(dat) and dat[off:off+3] == b'\x00\x00\x03':
rbsp += bytes([0, 0])
off += 3
else:
rbsp += bytes([dat[off]])
off += 1
return rbsp
class HevcSlice:
  """Shared sub-structure parsers used by the parameter-set and slice-header classes."""

  # 7.3.3 Profile, tier and level syntax
  def profile_tier_level(self, r:BitReader, enable:bool, max_sub_layers:int):
    """Read the general profile/tier block as one opaque 88-bit value followed by general_level_idc.

    Asserts that `enable` is truthy and that there are no sub-layers.
    """
    assert enable and max_sub_layers == 0, "no sublayers supported"
    self._notimpl_profile_tier_level = r.u(88)
    self.general_level_idc = r.u(8)

  # 7.3.7 Short-term reference picture set syntax
  def st_ref_pic_set(self, r:BitReader, stRpsIdx:int, num_short_term_ref_pic_sets:int=0, sps=None):
    """Consume one short-term reference picture set from `r`.

    Only num_negative_pics / num_positive_pics are stored on self (explicitly coded sets only);
    every other syntax element is read and discarded so the reader ends up at the right position.
    NOTE(review): the inter-predicted path reads the picture counts from `sps`, so it raises
    AttributeError when `sps` is left as None.
    """
    # the prediction flag is only present for sets after the first
    if stRpsIdx == 0: predicted = 0
    else: predicted = r.u(1)

    if not predicted:
      self.num_negative_pics = r.ue_v()
      self.num_positive_pics = r.ue_v()
      for entries in (self.num_negative_pics, self.num_positive_pics):
        for _ in range(entries):
          r.ue_v()  # delta_poc_s0_minus1 / delta_poc_s1_minus1
          r.u(1)    # used_by_curr_pic_s0_flag / used_by_curr_pic_s1_flag
      return

    if stRpsIdx == num_short_term_ref_pic_sets:
      r.ue_v()  # delta_idx_minus1
    r.u(1)      # delta_rps_sign
    r.ue_v()    # abs_delta_rps_minus1

    num_delta_pocs = sps.num_negative_pics + sps.num_positive_pics
    for _ in range(num_delta_pocs + 1):
      # use_delta_flag follows only when used_by_curr_pic_flag is 0
      if not r.u(1): r.u(1)
# 7.3.2.2 Sequence parameter set RBSP syntax
class SPS(HevcSlice):
  """Sequence parameter set fields, read in bitstream order from a BitReader.

  Each syntax element is stored on self under its own name (a few are stored with their
  "+N" offset already applied, e.g. bit depths and log2 sizes). Parsing asserts on
  sub-layers, scaling list data, PCM and long-term reference pictures, and stops after
  strong_intra_smoothing_enabled_flag.
  """
  def __init__(self, r:BitReader):
    self.sps_video_parameter_set_id = r.u(4)
    self.sps_max_sub_layers_minus1 = r.u(3)
    self.sps_temporal_id_nesting_flag = r.u(1)

    # asserts unless sps_max_sub_layers_minus1 == 0
    self.profile_tier_level(r, True, self.sps_max_sub_layers_minus1)

    self.sps_seq_parameter_set_id = r.ue_v()
    self.chroma_format_idc = r.ue_v()
    # only present in the stream when chroma_format_idc == 3
    self.separate_colour_plane_flag = r.u(1) if self.chroma_format_idc == 3 else 0
    self.pic_width_in_luma_samples = r.ue_v()
    self.pic_height_in_luma_samples = r.ue_v()
    self.conformance_window_flag = r.u(1)

    if self.conformance_window_flag:
      self.conf_win_left_offset = r.ue_v()
      self.conf_win_right_offset = r.ue_v()
      self.conf_win_top_offset = r.ue_v()
      self.conf_win_bottom_offset = r.ue_v()
    # no conformance window: all four offsets default to 0
    else: self.conf_win_left_offset = self.conf_win_right_offset = self.conf_win_top_offset = self.conf_win_bottom_offset = 0

    # stored as actual bit depths (coded value + 8)
    self.bit_depth_luma = r.ue_v() + 8
    self.bit_depth_chroma = r.ue_v() + 8
    self.log2_max_pic_order_cnt_lsb_minus4 = r.ue_v()
    self.sps_sub_layer_ordering_info_present_flag = r.u(1)
    self.sps_max_dec_pic_buffering, self.sps_max_num_reorder_pics, self.sps_max_latency_increase_plus1 = [], [], []
    # one entry per signalled sub-layer; when the flag is 0 only the highest sub-layer is read
    for i in range((0 if self.sps_sub_layer_ordering_info_present_flag else self.sps_max_sub_layers_minus1), self.sps_max_sub_layers_minus1 + 1):
      self.sps_max_dec_pic_buffering.append(r.ue_v() + 1)
      self.sps_max_num_reorder_pics.append(r.ue_v())
      self.sps_max_latency_increase_plus1.append(r.ue_v())

    # block sizes are stored as absolute log2 values: min = coded + offset, max = min + coded difference
    self.log2_min_luma_coding_block_size = r.ue_v() + 3
    self.log2_max_luma_coding_block_size = self.log2_min_luma_coding_block_size + r.ue_v()
    self.log2_min_transform_block_size = r.ue_v() + 2
    self.log2_max_transform_block_size = self.log2_min_transform_block_size + r.ue_v()
    self.max_transform_hierarchy_depth_inter = r.ue_v()
    self.max_transform_hierarchy_depth_intra = r.ue_v()

    # scaling lists may be enabled, but explicit scaling list data in the SPS is not parsed
    if scaling_list_enabled_flag := r.u(1):
      if sps_scaling_list_data_present_flag := r.u(1): assert False, "scaling_list_data parsing not implemented"

    self.amp_enabled_flag = r.u(1)
    self.sample_adaptive_offset_enabled_flag = r.u(1)
    self.pcm_enabled_flag = r.u(1)
    assert self.pcm_enabled_flag == 0, "pcm not implemented"

    self.num_short_term_ref_pic_sets = r.ue_v()
    # each call overwrites self.num_negative_pics / self.num_positive_pics when the set is explicitly coded
    for i in range(self.num_short_term_ref_pic_sets):
      self.st_ref_pic_set(r, i, self.num_short_term_ref_pic_sets)

    self.long_term_ref_pics_present_flag = r.u(1)
    if self.long_term_ref_pics_present_flag: assert False, "long_term_ref_pics parsing not implemented"

    self.sps_temporal_mvp_enabled_flag = r.u(1)
    self.strong_intra_smoothing_enabled_flag = r.u(1)
# 7.3.2.3 Picture parameter set RBSP syntax
class PPS(HevcSlice):
  """Picture parameter set fields, read in bitstream order from a BitReader.

  Attributes that are only present conditionally in the stream (diff_cu_qp_delta_depth and all
  tile fields, including loop_filter_across_tiles_enabled_flag) are only set on self when their
  guarding flag is set. Parsing asserts on deblocking filter control and scaling list data, and
  stops after log2_parallel_merge_level.
  """
  def __init__(self, r:BitReader):
    self.pps_pic_parameter_set_id = r.ue_v()
    self.pps_seq_parameter_set_id = r.ue_v()
    self.dependent_slice_segments_enabled_flag = r.u(1)
    self.output_flag_present_flag = r.u(1)
    self.num_extra_slice_header_bits = r.u(3)
    self.sign_data_hiding_enabled_flag = r.u(1)
    self.cabac_init_present_flag = r.u(1)
    # stored as counts (coded "minus1" value + 1)
    self.num_ref_idx_l0_default_active = r.ue_v() + 1
    self.num_ref_idx_l1_default_active = r.ue_v() + 1
    # stored as the absolute initial QP (coded "minus26" value + 26)
    self.init_qp = r.se_v() + 26
    self.constrained_intra_pred_flag = r.u(1)
    self.transform_skip_enabled_flag = r.u(1)
    self.cu_qp_delta_enabled_flag = r.u(1)
    if self.cu_qp_delta_enabled_flag: self.diff_cu_qp_delta_depth = r.ue_v()

    self.pps_cb_qp_offset = r.se_v()
    self.pps_cr_qp_offset = r.se_v()
    self.pps_slice_chroma_qp_offsets_present_flag = r.u(1)
    self.weighted_pred_flag = r.u(1)
    self.weighted_bipred_flag = r.u(1)
    self.transquant_bypass_enabled_flag = r.u(1)
    self.tiles_enabled_flag = r.u(1)
    self.entropy_coding_sync_enabled_flag = r.u(1)

    if self.tiles_enabled_flag:
      self.num_tile_columns_minus1 = r.ue_v()
      self.num_tile_rows_minus1 = r.ue_v()
      self.uniform_spacing_flag = r.u(1)
      self.column_width_minus1, self.row_height_minus1 = [], []
      # explicit sizes are coded for every column/row except the last one
      if not self.uniform_spacing_flag:
        for i in range(self.num_tile_columns_minus1): self.column_width_minus1.append(r.ue_v())
        for i in range(self.num_tile_rows_minus1): self.row_height_minus1.append(r.ue_v())
      self.loop_filter_across_tiles_enabled_flag = r.u(1)

    self.loop_filter_across_slices_enabled_flag = r.u(1)
    self.deblocking_filter_control_present_flag = r.u(1)
    if self.deblocking_filter_control_present_flag: assert False, "deblocking_filter parsing not implemented"

    self.scaling_list_data_present_flag = r.u(1)
    if self.scaling_list_data_present_flag: assert False, "scaling_list_data parsing not implemented"

    self.lists_modification_present_flag = r.u(1)
    # stored as the absolute value (coded "minus2" value + 2)
    self.log2_parallel_merge_level = r.ue_v() + 2
# 7.3.6 Slice segment header syntax
class SliceSegment(HevcSlice):
  """Slice segment header, parsed far enough to locate the bit ranges the decoder setup needs.

  Besides the decoded syntax elements, records bit positions relative to the start of the reader:
    sw_skip_start / sw_skip_end: span from just after slice_type to just after the reference
      picture set fields (equal for IDR pictures, which carry none of those fields).
    short_term_ref_pics_in_slice_start / _end: span of an in-header st_ref_pic_set, when present.
  For a dependent slice segment nothing after slice_segment_address is parsed, so slice_type and
  the position attributes are not set.

  Args:
    r: reader positioned at the first bit after the NAL unit header.
    nal_unit_type: NAL unit type of the enclosing NAL unit.
    sps, pps: the active parameter sets.
  """
  def __init__(self, r:BitReader, nal_unit_type:int, sps:SPS, pps:PPS):
    self.first_slice_segment_in_pic_flag = r.u(1)
    if nal_unit_type >= avcodec.HEVC_NAL_BLA_W_LP and nal_unit_type <= avcodec.HEVC_NAL_RSV_IRAP_VCL23:
      self.no_output_of_prior_pics_flag = r.u(1)
    self.slice_pic_parameter_set_id = r.ue_v()

    # default first, so a flag read from the stream is no longer overwritten with 0 afterwards
    self.dependent_slice_segment_flag = 0
    if not self.first_slice_segment_in_pic_flag:
      if pps.dependent_slice_segments_enabled_flag:
        self.dependent_slice_segment_flag = r.u(1)
      # NOTE(review): H.265 codes slice_segment_address as u(v) with Ceil(Log2(PicSizeInCtbsY)) bits,
      # not ue(v); fields of non-first segments after this point may be misread. Left unchanged.
      self.slice_segment_address = r.ue_v()

    if not self.dependent_slice_segment_flag:
      r.u(pps.num_extra_slice_header_bits) # extra bits ignored
      self.slice_type = r.ue_v()

      self.sw_skip_start = r.read_bits - r.current_bits
      self.pic_output_flag = r.u(1) if pps.output_flag_present_flag else 0
      self.colour_plane_id = r.u(2) if sps.separate_colour_plane_flag else 0

      if nal_unit_type != avcodec.HEVC_NAL_IDR_W_RADL and nal_unit_type != avcodec.HEVC_NAL_IDR_N_LP:
        self.slice_pic_order_cnt_lsb = r.u(sps.log2_max_pic_order_cnt_lsb_minus4 + 4)

        self.short_term_ref_pic_set_sps_flag = r.u(1)
        if not self.short_term_ref_pic_set_sps_flag:
          self.short_term_ref_pics_in_slice_start = r.read_bits - r.current_bits
          # In a slice header the set index equals the SPS set count, which is what makes
          # delta_idx_minus1 present; the count must be passed for that comparison to work.
          self.st_ref_pic_set(r, sps.num_short_term_ref_pic_sets, sps.num_short_term_ref_pic_sets, sps=sps)
          self.short_term_ref_pics_in_slice_end = r.read_bits - r.current_bits
        elif sps.num_short_term_ref_pic_sets > 1: assert False, "short_term_ref_pic_set parsing not implemented"

        if sps.long_term_ref_pics_present_flag: assert False, "long_term_ref_pics parsing not implemented"

        self.sw_skip_end = r.read_bits - r.current_bits
        self.slice_temporal_mvp_enabled_flag = r.u(1) if sps.sps_temporal_mvp_enabled_flag else 0
      else: self.slice_pic_order_cnt_lsb, self.sw_skip_end = 0, self.sw_skip_start

      # The remaining elements are read only to advance the reader; their values are not kept.
      if sps.sample_adaptive_offset_enabled_flag:
        r.u(1) # slice_sao_luma_flag
        ChromaArrayType = sps.chroma_format_idc if sps.separate_colour_plane_flag == 0 else 0
        if ChromaArrayType != 0: r.u(1) # slice_sao_chroma_flag

      # the override flag is present for both P and B slices (the set previously listed B twice)
      if self.slice_type in {avcodec.HEVC_SLICE_P, avcodec.HEVC_SLICE_B}:
        if r.u(1): # num_ref_idx_active_override_flag
          r.ue_v() # num_ref_idx_l0_active_minus1
          if self.slice_type == avcodec.HEVC_SLICE_B: r.ue_v() # num_ref_idx_l1_active_minus1
def fill_sps_into_dev_context(device_ctx, sps:SPS):
  """Copy the SPS fields listed below onto device_ctx, each under the same attribute name."""
  mirrored_fields = (
    "chroma_format_idc",
    "pic_width_in_luma_samples",
    "pic_height_in_luma_samples",
    "bit_depth_luma",
    "bit_depth_chroma",
    "log2_max_pic_order_cnt_lsb_minus4",
    "log2_min_luma_coding_block_size",
    "log2_max_luma_coding_block_size",
    "log2_min_transform_block_size",
    "log2_max_transform_block_size",
    "amp_enabled_flag",
    "pcm_enabled_flag",
    "sample_adaptive_offset_enabled_flag",
    "sps_temporal_mvp_enabled_flag",
    "strong_intra_smoothing_enabled_flag",
  )
  for field in mirrored_fields: setattr(device_ctx, field, getattr(sps, field))
def fill_pps_into_dev_context(device_ctx, pps:PPS):
  """Copy the PPS fields listed below onto device_ctx, each under the same attribute name.

  diff_cu_qp_delta_depth and loop_filter_across_tiles_enabled_flag fall back to 0 when the PPS
  object does not have them; every other field must exist on `pps`.
  """
  defaults_to_zero = {"diff_cu_qp_delta_depth", "loop_filter_across_tiles_enabled_flag"}
  mirrored_fields = (
    "sign_data_hiding_enabled_flag",
    "cabac_init_present_flag",
    "num_ref_idx_l0_default_active",
    "num_ref_idx_l1_default_active",
    "init_qp",
    "cu_qp_delta_enabled_flag",
    "diff_cu_qp_delta_depth",
    "pps_cb_qp_offset",
    "pps_cr_qp_offset",
    "pps_slice_chroma_qp_offsets_present_flag",
    "weighted_pred_flag",
    "weighted_bipred_flag",
    "transquant_bypass_enabled_flag",
    "tiles_enabled_flag",
    "entropy_coding_sync_enabled_flag",
    "loop_filter_across_slices_enabled_flag",
    "deblocking_filter_control_present_flag",
    "scaling_list_data_present_flag",
    "lists_modification_present_flag",
    "log2_parallel_merge_level",
    "loop_filter_across_tiles_enabled_flag",
  )
  for field in mirrored_fields:
    if field in defaults_to_zero: value = getattr(pps, field, 0)
    else: value = getattr(pps, field)
    setattr(device_ctx, field, value)
def parse_hevc_file_headers(dat:bytes, device="NV"):
  """Walk the NAL units of an HEVC byte stream and build one decoder context record per picture.

  Args:
    dat: the raw stream. Parsing starts at byte 1 and asserts a 00 00 01 start code there, so the
      stream is expected to begin with a 4-byte 00 00 00 01 start code.
    device: device the returned context tensor is created on.

  Returns:
    (opaque, res, w, h, pic_width_in_luma_samples, pic_height_in_luma_samples, chroma_off) where
      opaque is a Tensor of shape (len(res), 0x300) holding one serialized context per picture,
      res is a list of (offset of the picture's first NAL unit in dat, total byte length of its
        NAL units, curr_pic_idx, number of history entries at decode time, is_hist),
      w / h are the picture size minus twice the conformance window offsets,
      chroma_off is the product of the width and height each rounded up to a multiple of 64.
    The SPS values come from the last SPS seen in the stream.
  """
  res = []
  nal_unit_start = 1
  # entries are (picture index, slice_pic_order_cnt_lsb, None)
  history:list[tuple[int, int, int]] = []
  # NOTE(review): the constructor values are fixed settings whose meaning is not visible in this file
  device_ctx = nv_gpu.nvdec_hevc_pic_s(gptimer_timeout_value=92720000, tileformat=1, sw_start_code_e=1, pattern_id=2)
  nal_infos = []
  ctx_bytes = bytes()
  # every per-picture record is padded to this many bytes
  align_ctx_bytes_size = 0x300

  def _flush_picture():
    """Turn the slice NAL units collected in nal_infos into one entry of res / ctx_bytes."""
    nonlocal res, history, device_ctx, nal_infos, ctx_bytes, align_ctx_bytes_size

    if not len(nal_infos): return

    # the header of the picture's first slice segment drives everything below
    hdr, nal_unit_type = nal_infos[0][0]
    assert all(nal_unit_type == x[0][1] for x in nal_infos), "all NAL units in a picture must be of the same type"

    # lowest index in 0..15 not used by any history entry (chosen before an IDR clears the history)
    device_ctx.curr_pic_idx = next(i for i in range(16) if all(d[0] != i for d in history))

    if nal_unit_type in {avcodec.HEVC_NAL_IDR_W_RADL, avcodec.HEVC_NAL_IDR_N_LP}:
      history = []

    device_ctx.num_ref_frames = len(history)
    device_ctx.IDR_picture_flag = int(nal_unit_type in {avcodec.HEVC_NAL_IDR_W_RADL, avcodec.HEVC_NAL_IDR_N_LP})
    device_ctx.RAP_picture_flag = int(nal_unit_type >= avcodec.HEVC_NAL_BLA_W_LP and nal_unit_type <= avcodec.HEVC_NAL_RSV_IRAP_VCL23)
    device_ctx.RefDiffPicOrderCnts=(ctypes.c_int16 * 16)()
    device_ctx.colMvBuffersize = (round_up(sps.pic_width_in_luma_samples, 64) * round_up(sps.pic_height_in_luma_samples, 64) // 16) // 256
    device_ctx.framestride=(ctypes.c_uint32 * 2)(round_up(sps.pic_width_in_luma_samples, 64), round_up(sps.pic_width_in_luma_samples, 64))
    device_ctx.sw_hdr_skip_length = hdr.sw_skip_end - hdr.sw_skip_start
    # NOTE(review): the constant 9 is not derived from any parsed field here — confirm its origin
    device_ctx.num_bits_short_term_ref_pics_in_slice = max(0, device_ctx.sw_hdr_skip_length - 9)
    device_ctx.stream_len = sum(x[2] for x in nal_infos)

    if pps.tiles_enabled_flag:
      device_ctx.num_tile_columns = pps.num_tile_columns_minus1 + 1
      device_ctx.num_tile_rows = pps.num_tile_rows_minus1 + 1

    device_ctx.num_short_term_ref_pic_sets = sps.num_short_term_ref_pic_sets

    # chained buffer offsets, each stored shifted right by 8 bits; the per-row byte counts
    # (608, 4864, 152, 2000) are fixed values whose meaning is not visible in this file
    luma_h_rounded = round_up(sps.pic_height_in_luma_samples, 64)
    device_ctx.HevcSaoBufferOffset = (608 * luma_h_rounded) >> 8
    device_ctx.HevcBsdCtrlOffset = ((device_ctx.HevcSaoBufferOffset<<8) + 4864 * luma_h_rounded) >> 8
    device_ctx.v1.hevc_main10_444_ext.HevcFltAboveOffset = ((device_ctx.HevcBsdCtrlOffset<<8) + 152 * luma_h_rounded) >> 8
    device_ctx.v1.hevc_main10_444_ext.HevcSaoAboveOffset = ((device_ctx.v1.hevc_main10_444_ext.HevcFltAboveOffset<<8) + 2000 * luma_h_rounded) >> 8
    device_ctx.v3.HevcSliceEdgeOffset = device_ctx.v1.hevc_main10_444_ext.HevcSaoAboveOffset

    # split history into pictures with a POC lsb at or before the current one and pictures after it,
    # each sorted by distance; list 0 is before+after, list 1 (B slices only) is after+before
    before_list, after_list = [], []
    for pic_idx, poc, _ in history:
      device_ctx.RefDiffPicOrderCnts[pic_idx] = hdr.slice_pic_order_cnt_lsb - poc
      if hdr.slice_pic_order_cnt_lsb < poc: after_list.append((poc - hdr.slice_pic_order_cnt_lsb, pic_idx))
      else: before_list.append((hdr.slice_pic_order_cnt_lsb - poc, pic_idx))
    before_list.sort()
    after_list.sort()

    device_ctx.initreflistidxl0 = (ctypes.c_uint8 * 16)(*[idx for _,idx in before_list + after_list])
    if hdr.slice_type == avcodec.HEVC_SLICE_B: device_ctx.initreflistidxl1 = (ctypes.c_uint8 * 16)(*[idx for _,idx in after_list + before_list])

    locl_ctx_bytes = bytes(device_ctx)
    locl_ctx_bytes += bytes(0x200 - len(locl_ctx_bytes)) # pad to 512 bytes

    pic_width_in_ctbs = ceildiv(sps.pic_width_in_luma_samples, (1 << sps.log2_max_luma_coding_block_size))
    pic_height_in_ctbs = ceildiv(sps.pic_height_in_luma_samples, (1 << sps.log2_max_luma_coding_block_size))
    # append tile sizes 0x200
    if pps.tiles_enabled_flag and pps.uniform_spacing_flag:
      assert device_ctx.num_tile_columns == 1 and device_ctx.num_tile_rows == 1, "not implemented: uniform spacing with multiple tiles"
      locl_ctx_bytes += pic_width_in_ctbs.to_bytes(2, "little") + pic_height_in_ctbs.to_bytes(2, "little")
    else:
      if pps.tiles_enabled_flag and not getattr(pps, 'uniform_spacing_flag', 0):
        column_width = [cw_minus1 + 1 for cw_minus1 in pps.column_width_minus1[0:pps.num_tile_columns_minus1]]
        row_height = [rh_minus1 + 1 for rh_minus1 in pps.row_height_minus1[0:pps.num_tile_rows_minus1]]
      else:
        column_width = []
        row_height = []

      # the last column/row takes whatever is left of the picture
      column_width.append(pic_width_in_ctbs - sum(column_width))
      row_height.append(pic_height_in_ctbs - sum(row_height))

      # one (width, height) pair of little-endian 16-bit values per tile, columns in the outer loop
      for c in column_width:
        for r in row_height: locl_ctx_bytes += c.to_bytes(2, "little") + r.to_bytes(2, "little")

    # NOTE(review): luma_size and chroma_size are computed but not used in this function
    luma_size = round_up(sps.pic_width_in_luma_samples, 64) * round_up(sps.pic_height_in_luma_samples, 64)
    chroma_size = round_up(sps.pic_width_in_luma_samples, 64) * round_up((sps.pic_height_in_luma_samples + 1) // 2, 64)
    # pictures of these NAL types are added to the reference history below
    is_hist = nal_unit_type in {avcodec.HEVC_NAL_TRAIL_R, avcodec.HEVC_NAL_IDR_N_LP, avcodec.HEVC_NAL_IDR_W_RADL}
    res.append((nal_infos[0][1], device_ctx.stream_len, device_ctx.curr_pic_idx, len(history), is_hist))

    locl_ctx_bytes += (align_ctx_bytes_size - len(locl_ctx_bytes)) * b'\x00'
    ctx_bytes += locl_ctx_bytes

    if nal_unit_type in {avcodec.HEVC_NAL_TRAIL_R, avcodec.HEVC_NAL_IDR_N_LP, avcodec.HEVC_NAL_IDR_W_RADL}:
      history.append((device_ctx.curr_pic_idx, hdr.slice_pic_order_cnt_lsb, None))

    if len(history) >= sps.sps_max_dec_pic_buffering[0]:
      # remove the oldest poc
      history.pop(0)

    nal_infos = []

  # NOTE(review): cnt is assigned but never used
  cnt = 0
  while nal_unit_start < len(dat):
    assert dat[nal_unit_start:nal_unit_start+3] == b"\x00\x00\x01", "NAL unit start code not found"

    # the unit runs up to the next 00 00 01 (or the end of the data)
    pos = dat.find(b"\x00\x00\x01", nal_unit_start + 3)
    nal_unit_len = (pos if pos != -1 else len(dat)) - nal_unit_start

    # 7.3.1.1 General NAL unit syntax
    nal_unit_type = (dat[nal_unit_start+3] >> 1) & 0x3F
    # skip the 3-byte start code and the 2-byte NAL unit header
    slice_dat = dat[nal_unit_start+5:nal_unit_start+nal_unit_len]

    if nal_unit_type == avcodec.HEVC_NAL_SPS:
      sps = SPS(BitReader(_hevc_get_rbsp(slice_dat)))
      fill_sps_into_dev_context(device_ctx, sps)
    elif nal_unit_type == avcodec.HEVC_NAL_PPS:
      pps = PPS(BitReader(_hevc_get_rbsp(slice_dat)))
      fill_pps_into_dev_context(device_ctx, pps)
    elif nal_unit_type in {avcodec.HEVC_NAL_IDR_N_LP, avcodec.HEVC_NAL_IDR_W_RADL, avcodec.HEVC_NAL_TRAIL_R, avcodec.HEVC_NAL_TRAIL_N}:
      # slice headers are parsed from the raw payload (no emulation-prevention removal here)
      hdr = SliceSegment(BitReader(slice_dat), nal_unit_type, sps, pps)

      # a new picture starts: emit the one collected so far
      if hdr.first_slice_segment_in_pic_flag == 1: _flush_picture()
      nal_infos.append(((hdr, nal_unit_type), nal_unit_start, nal_unit_len))

    nal_unit_start += nal_unit_len
  # emit the final picture
  _flush_picture()

  # NOTE(review): offsets are doubled, presumably because they are coded in chroma units for 4:2:0 — confirm
  w = sps.pic_width_in_luma_samples - 2 * (sps.conf_win_left_offset + sps.conf_win_right_offset)
  h = sps.pic_height_in_luma_samples - 2 * (sps.conf_win_top_offset + sps.conf_win_bottom_offset)
  chroma_off = round_up(sps.pic_width_in_luma_samples, 64) * round_up(sps.pic_height_in_luma_samples, 64)
  opaque = Tensor(ctx_bytes, device=device).reshape(len(res), align_ctx_bytes_size)
  return opaque, res, w, h, sps.pic_width_in_luma_samples, sps.pic_height_in_luma_samples, chroma_off
def _addr_table(h, w, w_aligned):
  """Build a flat uint32 index table of h*w entries mapping each (row, col) to its tiled byte offset.

  The layout computed here uses 64x8-byte GOBs (512 bytes each) stacked two high per block,
  with `w_aligned // 64` GOBs per row of blocks and a bit-swizzled offset inside each GOB.
  Entries are in row-major (row, col) order.
  """
  GOB_W, GOB_H, BLOCK_H_GOBS = 64, 8, 2
  GOB_SIZE = GOB_W * GOB_H
  gobs_per_row = w_aligned // GOB_W

  cols = Tensor.arange(w, dtype=dtypes.uint32).reshape(1, w)
  rows = Tensor.arange(h, dtype=dtypes.uint32).reshape(h, 1)

  # which GOB a sample falls in, and where that GOB sits inside its two-high block
  gob_col, gob_row = cols // GOB_W, rows // GOB_H
  block_row, row_in_block = gob_row // BLOCK_H_GOBS, gob_row % BLOCK_H_GOBS
  gob_index = (block_row * gobs_per_row + gob_col) * BLOCK_H_GOBS + row_in_block
  gob_base = gob_index * GOB_SIZE

  # offset inside the GOB: x and y bits are interleaved
  in_x, in_y = cols % GOB_W, rows % GOB_H
  within_gob = (in_x & 0x0F) | ((in_y & 0x03) << 4) | ((in_x & 0x10) << 2) | ((in_y & 0x04) << 5) | ((in_x & 0x20) << 3)

  return (gob_base + within_gob).reshape(-1)
def nv12_to_bgr_from_planes(luma: Tensor, chroma: Tensor, h: int, w: int) -> Tensor:
  """Convert linear NV12 planes to an (h, w, 3) uint8 image with channels ordered B, G, R.

  `luma` is reshaped to (h, w); `chroma` is reshaped to (h//2, w//2, 2) with U in channel 0 and
  V in channel 1, and each chroma sample is repeated over a 2x2 block. The conversion subtracts
  16 from Y and 128 from U/V, applies the fixed coefficients below, and clamps to [0, 255].
  """
  half_h, half_w = h // 2, w // 2

  def _upsample(plane): return plane.reshape(half_h, 1, half_w, 1).expand(half_h, 2, half_w, 2).reshape(h, w)
  def _clamp(channel): return channel.maximum(0.0).minimum(255.0)

  uv = chroma.reshape(half_h, half_w, 2).cast(dtypes.float32)
  C = luma.reshape(h, w).cast(dtypes.float32) - 16.0
  D = _upsample(uv[..., 0]) - 128.0
  E = _upsample(uv[..., 1]) - 128.0

  red = _clamp(1.1643835616438356 * C + 1.5960267857142858 * E)
  green = _clamp(1.1643835616438356 * C - 0.39176229009491365 * D - 0.8129676472377708 * E)
  blue = _clamp(1.1643835616438356 * C + 2.017232142857143 * D)

  return Tensor.stack([blue, green, red], dim=2).cast(dtypes.uint8)
def untile_nv12(src:Tensor, h:int, w:int, luma_w:int, chroma_off:int) -> Tensor:
  """Gather the luma plane (h rows) and the chroma plane ((h+1)//2 rows) out of a tiled frame.

  Both planes are read through _addr_table with a pitch of luma_w rounded up to 64; the chroma
  plane starts `chroma_off` elements into the flattened frame. Returns the realized
  concatenation luma followed by chroma.
  """
  flat, pitch = src.reshape(-1), round_up(luma_w, 64)
  luma_plane = flat[_addr_table(h, w, pitch)]
  chroma_plane = flat[chroma_off:][_addr_table((h + 1) // 2, w, pitch)]
  return luma_plane.cat(chroma_plane).realize()
def to_bgr(tensor:Tensor, h:int, w:int, luma_w:int, chroma_off:int) -> Tensor:
  """Untile a decoded frame and convert it to a realized BGR image via nv12_to_bgr_from_planes.

  The luma plane (h rows) and chroma plane ((h+1)//2 rows, starting `chroma_off` elements into
  the flattened frame) are gathered through _addr_table with a pitch of luma_w rounded up to 64.
  """
  flat, pitch = tensor.reshape(-1), round_up(luma_w, 64)
  luma_plane = flat[_addr_table(h, w, pitch)]
  chroma_plane = flat[chroma_off:][_addr_table((h + 1) // 2, w, pitch)]
  return nv12_to_bgr_from_planes(luma_plane, chroma_plane, h, w).realize()

View File

@@ -0,0 +1,603 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef clc9b0_h_
#define clc9b0_h_
#include "nvtypes.h"
#ifdef __cplusplus
extern "C" {
#endif
#define NVC9B0_VIDEO_DECODER (0x0000C9B0)
#define NVC9B0_NOP (0x00000100)
#define NVC9B0_NOP_PARAMETER 31:0
#define NVC9B0_PM_TRIGGER (0x00000140)
#define NVC9B0_PM_TRIGGER_V 31:0
#define NVC9B0_SET_APPLICATION_ID (0x00000200)
#define NVC9B0_SET_APPLICATION_ID_ID 31:0
#define NVC9B0_SET_APPLICATION_ID_ID_MPEG12 (0x00000001)
#define NVC9B0_SET_APPLICATION_ID_ID_VC1 (0x00000002)
#define NVC9B0_SET_APPLICATION_ID_ID_H264 (0x00000003)
#define NVC9B0_SET_APPLICATION_ID_ID_MPEG4 (0x00000004)
#define NVC9B0_SET_APPLICATION_ID_ID_VP8 (0x00000005)
#define NVC9B0_SET_APPLICATION_ID_ID_CTR64 (0x00000006)
#define NVC9B0_SET_APPLICATION_ID_ID_HEVC (0x00000007)
#define NVC9B0_SET_APPLICATION_ID_ID_NEW_H264 (0x00000008)
#define NVC9B0_SET_APPLICATION_ID_ID_VP9 (0x00000009)
#define NVC9B0_SET_APPLICATION_ID_ID_PASS1 (0x0000000A)
#define NVC9B0_SET_APPLICATION_ID_ID_HEVC_PARSER (0x0000000C)
#define NVC9B0_SET_APPLICATION_ID_ID_UCODE_TEST (0x0000000D)
#define NVC9B0_SET_APPLICATION_ID_ID_HWDRM_PR_DECRYPTAUDIO (0x0000000E)
#define NVC9B0_SET_APPLICATION_ID_ID_HWDRM_PR_DECRYPTAUDIOMULTIPLE (0x0000000F)
#define NVC9B0_SET_APPLICATION_ID_ID_HWDRM_PR_PREPROCESSENCRYPTEDDATA (0x00000010)
#define NVC9B0_SET_APPLICATION_ID_ID_VP9_WITH_PARSER (0x00000011)
#define NVC9B0_SET_APPLICATION_ID_ID_AVD (0x00000012)
#define NVC9B0_SET_APPLICATION_ID_ID_HW_DRM_PR4_DECRYPTCONTENTMULTIPLE (0x00000013)
#define NVC9B0_SET_APPLICATION_ID_ID_DHKE (0x00000020)
#define NVC9B0_SET_WATCHDOG_TIMER (0x00000204)
#define NVC9B0_SET_WATCHDOG_TIMER_TIMER 31:0
#define NVC9B0_SEMAPHORE_A (0x00000240)
#define NVC9B0_SEMAPHORE_A_UPPER 7:0
#define NVC9B0_SEMAPHORE_B (0x00000244)
#define NVC9B0_SEMAPHORE_B_LOWER 31:0
#define NVC9B0_SEMAPHORE_C (0x00000248)
#define NVC9B0_SEMAPHORE_C_PAYLOAD 31:0
#define NVC9B0_CTX_SAVE_AREA (0x0000024C)
#define NVC9B0_CTX_SAVE_AREA_OFFSET 31:0
#define NVC9B0_CTX_SWITCH (0x00000250)
#define NVC9B0_CTX_SWITCH_OP 1:0
#define NVC9B0_CTX_SWITCH_OP_CTX_UPDATE (0x00000000)
#define NVC9B0_CTX_SWITCH_OP_CTX_SAVE (0x00000001)
#define NVC9B0_CTX_SWITCH_OP_CTX_RESTORE (0x00000002)
#define NVC9B0_CTX_SWITCH_OP_CTX_FORCERESTORE (0x00000003)
#define NVC9B0_CTX_SWITCH_CTXID_VALID 2:2
#define NVC9B0_CTX_SWITCH_CTXID_VALID_FALSE (0x00000000)
#define NVC9B0_CTX_SWITCH_CTXID_VALID_TRUE (0x00000001)
#define NVC9B0_CTX_SWITCH_RESERVED0 7:3
#define NVC9B0_CTX_SWITCH_CTX_ID 23:8
#define NVC9B0_CTX_SWITCH_RESERVED1 31:24
#define NVC9B0_SET_SEMAPHORE_PAYLOAD_LOWER (0x00000254)
#define NVC9B0_SET_SEMAPHORE_PAYLOAD_LOWER_PAYLOAD_LOWER 31:0
#define NVC9B0_SET_SEMAPHORE_PAYLOAD_UPPER (0x00000258)
#define NVC9B0_SET_SEMAPHORE_PAYLOAD_UPPER_PAYLOAD_UPPER 31:0
#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_A (0x0000025C)
#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_A_LOWER 31:0
#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_B (0x00000260)
#define NVC9B0_SET_MONITORED_FENCE_SIGNAL_ADDRESS_BASE_B_UPPER 31:0
#define NVC9B0_EXECUTE (0x00000300)
#define NVC9B0_EXECUTE_NOTIFY 0:0
#define NVC9B0_EXECUTE_NOTIFY_DISABLE (0x00000000)
#define NVC9B0_EXECUTE_NOTIFY_ENABLE (0x00000001)
#define NVC9B0_EXECUTE_NOTIFY_ON 1:1
#define NVC9B0_EXECUTE_NOTIFY_ON_END (0x00000000)
#define NVC9B0_EXECUTE_NOTIFY_ON_BEGIN (0x00000001)
#define NVC9B0_EXECUTE_PREDICATION 2:2
#define NVC9B0_EXECUTE_PREDICATION_DISABLE (0x00000000)
#define NVC9B0_EXECUTE_PREDICATION_ENABLE (0x00000001)
#define NVC9B0_EXECUTE_PREDICATION_OP 3:3
#define NVC9B0_EXECUTE_PREDICATION_OP_EQUAL_ZERO (0x00000000)
#define NVC9B0_EXECUTE_PREDICATION_OP_NOT_EQUAL_ZERO (0x00000001)
#define NVC9B0_EXECUTE_AWAKEN 8:8
#define NVC9B0_EXECUTE_AWAKEN_DISABLE (0x00000000)
#define NVC9B0_EXECUTE_AWAKEN_ENABLE (0x00000001)
#define NVC9B0_SEMAPHORE_D (0x00000304)
#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE 1:0
#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE_ONE (0x00000000)
#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE_FOUR (0x00000001)
#define NVC9B0_SEMAPHORE_D_STRUCTURE_SIZE_TWO (0x00000002)
#define NVC9B0_SEMAPHORE_D_AWAKEN_ENABLE 8:8
#define NVC9B0_SEMAPHORE_D_AWAKEN_ENABLE_FALSE (0x00000000)
#define NVC9B0_SEMAPHORE_D_AWAKEN_ENABLE_TRUE (0x00000001)
#define NVC9B0_SEMAPHORE_D_OPERATION 17:16
#define NVC9B0_SEMAPHORE_D_OPERATION_RELEASE (0x00000000)
#define NVC9B0_SEMAPHORE_D_OPERATION_RESERVED_0 (0x00000001)
#define NVC9B0_SEMAPHORE_D_OPERATION_RESERVED_1 (0x00000002)
#define NVC9B0_SEMAPHORE_D_OPERATION_TRAP (0x00000003)
#define NVC9B0_SEMAPHORE_D_FLUSH_DISABLE 21:21
#define NVC9B0_SEMAPHORE_D_FLUSH_DISABLE_FALSE (0x00000000)
#define NVC9B0_SEMAPHORE_D_FLUSH_DISABLE_TRUE (0x00000001)
#define NVC9B0_SEMAPHORE_D_TRAP_TYPE 23:22
#define NVC9B0_SEMAPHORE_D_TRAP_TYPE_UNCONDITIONAL (0x00000000)
#define NVC9B0_SEMAPHORE_D_TRAP_TYPE_CONDITIONAL (0x00000001)
#define NVC9B0_SEMAPHORE_D_TRAP_TYPE_CONDITIONAL_EXT (0x00000002)
#define NVC9B0_SEMAPHORE_D_PAYLOAD_SIZE 24:24
#define NVC9B0_SEMAPHORE_D_PAYLOAD_SIZE_32BIT (0x00000000)
#define NVC9B0_SEMAPHORE_D_PAYLOAD_SIZE_64BIT (0x00000001)
#define NVC9B0_SET_PREDICATION_OFFSET_UPPER (0x00000308)
#define NVC9B0_SET_PREDICATION_OFFSET_UPPER_OFFSET 7:0
#define NVC9B0_SET_PREDICATION_OFFSET_LOWER (0x0000030C)
#define NVC9B0_SET_PREDICATION_OFFSET_LOWER_OFFSET 31:0
#define NVC9B0_SET_AUXILIARY_DATA_BUFFER (0x00000310)
#define NVC9B0_SET_AUXILIARY_DATA_BUFFER_OFFSET 31:0
/* SET_CONTROL_PARAMS method (0x400) followed by the fields of its 32-bit argument.
 * The hi:lo pairs are presumably inclusive bit ranges; taken together they cover bits 31..0 with no gaps or overlaps. */
#define NVC9B0_SET_CONTROL_PARAMS (0x00000400)
/* Codec selector values for the 4-bit CODEC_TYPE field.
 * NOTE(review): MPEG4 and DIVX3 both map to 0x4, and 0x6 / 0x8 are not assigned in this list --
 * confirm against the vendor header before treating either as a typo. */
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE 3:0
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_MPEG1 (0x00000000)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_MPEG2 (0x00000001)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_VC1 (0x00000002)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_H264 (0x00000003)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_MPEG4 (0x00000004)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_DIVX3 (0x00000004)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_VP8 (0x00000005)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_HEVC (0x00000007)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_VP9 (0x00000009)
#define NVC9B0_SET_CONTROL_PARAMS_CODEC_TYPE_AV1 (0x0000000A)
/* Single-bit and small multi-bit control fields; no enumerated values are defined for these here. */
#define NVC9B0_SET_CONTROL_PARAMS_GPTIMER_ON 4:4
#define NVC9B0_SET_CONTROL_PARAMS_RET_ERROR 5:5
#define NVC9B0_SET_CONTROL_PARAMS_ERR_CONCEAL_ON 6:6
#define NVC9B0_SET_CONTROL_PARAMS_ERROR_FRM_IDX 12:7
#define NVC9B0_SET_CONTROL_PARAMS_MBTIMER_ON 13:13
#define NVC9B0_SET_CONTROL_PARAMS_EC_INTRA_FRAME_USING_PSLC 14:14
#define NVC9B0_SET_CONTROL_PARAMS_IGNORE_SOME_FIELDS_CRC_CHECK 15:15
#define NVC9B0_SET_CONTROL_PARAMS_EVENT_TRACE_LOGGING_ON 16:16
#define NVC9B0_SET_CONTROL_PARAMS_ALL_INTRA_FRAME 17:17
/* TESTRUN_ENV is two bits wide but only values 0 and 1 are named. */
#define NVC9B0_SET_CONTROL_PARAMS_TESTRUN_ENV 19:18
#define NVC9B0_SET_CONTROL_PARAMS_TESTRUN_ENV_TRACE3D_RUN (0x00000000)
#define NVC9B0_SET_CONTROL_PARAMS_TESTRUN_ENV_PROD_RUN (0x00000001)
#define NVC9B0_SET_CONTROL_PARAMS_HINT_DUMP_EN 20:20
#define NVC9B0_SET_CONTROL_PARAMS_RESERVED 25:21
#define NVC9B0_SET_CONTROL_PARAMS_NVDECSIM_SKIP_SCP 26:26
#define NVC9B0_SET_CONTROL_PARAMS_ENABLE_ENCRYPT 27:27
#define NVC9B0_SET_CONTROL_PARAMS_ENCRYPTMODE 31:28
#define NVC9B0_SET_DRV_PIC_SETUP_OFFSET (0x00000404)
#define NVC9B0_SET_DRV_PIC_SETUP_OFFSET_OFFSET 31:0
#define NVC9B0_SET_IN_BUF_BASE_OFFSET (0x00000408)
#define NVC9B0_SET_IN_BUF_BASE_OFFSET_OFFSET 31:0
#define NVC9B0_SET_PICTURE_INDEX (0x0000040C)
#define NVC9B0_SET_PICTURE_INDEX_INDEX 31:0
#define NVC9B0_SET_SLICE_OFFSETS_BUF_OFFSET (0x00000410)
#define NVC9B0_SET_SLICE_OFFSETS_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_SET_COLOC_DATA_OFFSET (0x00000414)
#define NVC9B0_SET_COLOC_DATA_OFFSET_OFFSET 31:0
#define NVC9B0_SET_HISTORY_OFFSET (0x00000418)
#define NVC9B0_SET_HISTORY_OFFSET_OFFSET 31:0
#define NVC9B0_SET_DISPLAY_BUF_SIZE (0x0000041C)
#define NVC9B0_SET_DISPLAY_BUF_SIZE_SIZE 31:0
#define NVC9B0_SET_HISTOGRAM_OFFSET (0x00000420)
#define NVC9B0_SET_HISTOGRAM_OFFSET_OFFSET 31:0
#define NVC9B0_SET_NVDEC_STATUS_OFFSET (0x00000424)
#define NVC9B0_SET_NVDEC_STATUS_OFFSET_OFFSET 31:0
#define NVC9B0_SET_DISPLAY_BUF_LUMA_OFFSET (0x00000428)
#define NVC9B0_SET_DISPLAY_BUF_LUMA_OFFSET_OFFSET 31:0
#define NVC9B0_SET_DISPLAY_BUF_CHROMA_OFFSET (0x0000042C)
#define NVC9B0_SET_DISPLAY_BUF_CHROMA_OFFSET_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET0 (0x00000430)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET0_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET1 (0x00000434)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET1_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET2 (0x00000438)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET2_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET3 (0x0000043C)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET3_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET4 (0x00000440)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET4_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET5 (0x00000444)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET5_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET6 (0x00000448)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET6_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET7 (0x0000044C)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET7_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET8 (0x00000450)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET8_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET9 (0x00000454)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET9_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET10 (0x00000458)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET10_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET11 (0x0000045C)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET11_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET12 (0x00000460)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET12_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET13 (0x00000464)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET13_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET14 (0x00000468)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET14_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET15 (0x0000046C)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET15_OFFSET 31:0
#define NVC9B0_SET_PICTURE_LUMA_OFFSET16 (0x00000470)
#define NVC9B0_SET_PICTURE_LUMA_OFFSET16_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET0 (0x00000474)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET0_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET1 (0x00000478)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET1_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET2 (0x0000047C)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET2_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET3 (0x00000480)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET3_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET4 (0x00000484)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET4_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET5 (0x00000488)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET5_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET6 (0x0000048C)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET6_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET7 (0x00000490)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET7_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET8 (0x00000494)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET8_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET9 (0x00000498)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET9_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET10 (0x0000049C)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET10_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET11 (0x000004A0)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET11_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET12 (0x000004A4)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET12_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET13 (0x000004A8)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET13_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET14 (0x000004AC)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET14_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET15 (0x000004B0)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET15_OFFSET 31:0
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET16 (0x000004B4)
#define NVC9B0_SET_PICTURE_CHROMA_OFFSET16_OFFSET 31:0
#define NVC9B0_SET_PIC_SCRATCH_BUF_OFFSET (0x000004B8)
#define NVC9B0_SET_PIC_SCRATCH_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_SET_EXTERNAL_MVBUFFER_OFFSET (0x000004BC)
#define NVC9B0_SET_EXTERNAL_MVBUFFER_OFFSET_OFFSET 31:0
#define NVC9B0_SET_SUB_SAMPLE_MAP_OFFSET (0x000004C0)
#define NVC9B0_SET_SUB_SAMPLE_MAP_OFFSET_OFFSET 31:0
#define NVC9B0_SET_SUB_SAMPLE_MAP_IV_OFFSET (0x000004C4)
#define NVC9B0_SET_SUB_SAMPLE_MAP_IV_OFFSET_OFFSET 31:0
#define NVC9B0_SET_INTRA_TOP_BUF_OFFSET (0x000004C8)
#define NVC9B0_SET_INTRA_TOP_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_SET_TILE_SIZE_BUF_OFFSET (0x000004CC)
#define NVC9B0_SET_TILE_SIZE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_SET_FILTER_BUFFER_OFFSET (0x000004D0)
#define NVC9B0_SET_FILTER_BUFFER_OFFSET_OFFSET 31:0
#define NVC9B0_SET_CRC_STRUCT_OFFSET (0x000004D4)
#define NVC9B0_SET_CRC_STRUCT_OFFSET_OFFSET 31:0
#define NVC9B0_SET_PR_SSM_CONTENT_INFO_BUF_OFFSET (0x000004D8)
#define NVC9B0_SET_PR_SSM_CONTENT_INFO_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_H264_SET_MBHIST_BUF_OFFSET (0x00000500)
#define NVC9B0_H264_SET_MBHIST_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP8_SET_PROB_DATA_OFFSET (0x00000540)
#define NVC9B0_VP8_SET_PROB_DATA_OFFSET_OFFSET 31:0
#define NVC9B0_VP8_SET_HEADER_PARTITION_BUF_BASE_OFFSET (0x00000544)
#define NVC9B0_VP8_SET_HEADER_PARTITION_BUF_BASE_OFFSET_OFFSET 31:0
/* HEVC-specific methods, laid out contiguously at 0x580..0x594 in 4-byte steps.
 * Each method is paired with a single field spanning the full 32-bit argument (31:0). */
#define NVC9B0_HEVC_SET_SCALING_LIST_OFFSET (0x00000580)
#define NVC9B0_HEVC_SET_SCALING_LIST_OFFSET_OFFSET 31:0
#define NVC9B0_HEVC_SET_TILE_SIZES_OFFSET (0x00000584)
#define NVC9B0_HEVC_SET_TILE_SIZES_OFFSET_OFFSET 31:0
#define NVC9B0_HEVC_SET_FILTER_BUFFER_OFFSET (0x00000588)
#define NVC9B0_HEVC_SET_FILTER_BUFFER_OFFSET_OFFSET 31:0
#define NVC9B0_HEVC_SET_SAO_BUFFER_OFFSET (0x0000058C)
#define NVC9B0_HEVC_SET_SAO_BUFFER_OFFSET_OFFSET 31:0
#define NVC9B0_HEVC_SET_SLICE_INFO_BUFFER_OFFSET (0x00000590)
#define NVC9B0_HEVC_SET_SLICE_INFO_BUFFER_OFFSET_OFFSET 31:0
/* NOTE(review): the field of this index method is named ..._OFFSET, whereas SET_PICTURE_INDEX uses ..._INDEX.
 * Left unchanged because the macro name is part of the header's public surface. */
#define NVC9B0_HEVC_SET_SLICE_GROUP_INDEX (0x00000594)
#define NVC9B0_HEVC_SET_SLICE_GROUP_INDEX_OFFSET 31:0
#define NVC9B0_VP9_SET_PROB_TAB_BUF_OFFSET (0x000005C0)
#define NVC9B0_VP9_SET_PROB_TAB_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_CTX_COUNTER_BUF_OFFSET (0x000005C4)
#define NVC9B0_VP9_SET_CTX_COUNTER_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_SEGMENT_READ_BUF_OFFSET (0x000005C8)
#define NVC9B0_VP9_SET_SEGMENT_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_SEGMENT_WRITE_BUF_OFFSET (0x000005CC)
#define NVC9B0_VP9_SET_SEGMENT_WRITE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_TILE_SIZE_BUF_OFFSET (0x000005D0)
#define NVC9B0_VP9_SET_TILE_SIZE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_COL_MVWRITE_BUF_OFFSET (0x000005D4)
#define NVC9B0_VP9_SET_COL_MVWRITE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_COL_MVREAD_BUF_OFFSET (0x000005D8)
#define NVC9B0_VP9_SET_COL_MVREAD_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_FILTER_BUFFER_OFFSET (0x000005DC)
#define NVC9B0_VP9_SET_FILTER_BUFFER_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_PARSER_SET_PIC_SETUP_OFFSET (0x000005E0)
#define NVC9B0_VP9_PARSER_SET_PIC_SETUP_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_PARSER_SET_PREV_PIC_SETUP_OFFSET (0x000005E4)
#define NVC9B0_VP9_PARSER_SET_PREV_PIC_SETUP_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_PARSER_SET_PROB_TAB_BUF_OFFSET (0x000005E8)
#define NVC9B0_VP9_PARSER_SET_PROB_TAB_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_VP9_SET_HINT_DUMP_BUF_OFFSET (0x000005EC)
#define NVC9B0_VP9_SET_HINT_DUMP_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PASS1_SET_CLEAR_HEADER_OFFSET (0x00000600)
#define NVC9B0_PASS1_SET_CLEAR_HEADER_OFFSET_OFFSET 31:0
#define NVC9B0_PASS1_SET_RE_ENCRYPT_OFFSET (0x00000604)
#define NVC9B0_PASS1_SET_RE_ENCRYPT_OFFSET_OFFSET 31:0
#define NVC9B0_PASS1_SET_VP8_TOKEN_OFFSET (0x00000608)
#define NVC9B0_PASS1_SET_VP8_TOKEN_OFFSET_OFFSET 31:0
#define NVC9B0_PASS1_SET_INPUT_DATA_OFFSET (0x0000060C)
#define NVC9B0_PASS1_SET_INPUT_DATA_OFFSET_OFFSET 31:0
#define NVC9B0_PASS1_SET_OUTPUT_DATA_SIZE_OFFSET (0x00000610)
#define NVC9B0_PASS1_SET_OUTPUT_DATA_SIZE_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_PROB_TAB_READ_BUF_OFFSET (0x00000640)
#define NVC9B0_AV1_SET_PROB_TAB_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_PROB_TAB_WRITE_BUF_OFFSET (0x00000644)
#define NVC9B0_AV1_SET_PROB_TAB_WRITE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_SEGMENT_READ_BUF_OFFSET (0x00000648)
#define NVC9B0_AV1_SET_SEGMENT_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_SEGMENT_WRITE_BUF_OFFSET (0x0000064C)
#define NVC9B0_AV1_SET_SEGMENT_WRITE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_COL_MV0_READ_BUF_OFFSET (0x00000650)
#define NVC9B0_AV1_SET_COL_MV0_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_COL_MV1_READ_BUF_OFFSET (0x00000654)
#define NVC9B0_AV1_SET_COL_MV1_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_COL_MV2_READ_BUF_OFFSET (0x00000658)
#define NVC9B0_AV1_SET_COL_MV2_READ_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_COL_MVWRITE_BUF_OFFSET (0x0000065C)
#define NVC9B0_AV1_SET_COL_MVWRITE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_GLOBAL_MODEL_BUF_OFFSET (0x00000660)
#define NVC9B0_AV1_SET_GLOBAL_MODEL_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_FILM_GRAIN_BUF_OFFSET (0x00000664)
#define NVC9B0_AV1_SET_FILM_GRAIN_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_TILE_STREAM_INFO_BUF_OFFSET (0x00000668)
#define NVC9B0_AV1_SET_TILE_STREAM_INFO_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_SUB_STREAM_ENTRY_BUF_OFFSET (0x0000066C)
#define NVC9B0_AV1_SET_SUB_STREAM_ENTRY_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_AV1_SET_HINT_DUMP_BUF_OFFSET (0x00000670)
#define NVC9B0_AV1_SET_HINT_DUMP_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_H264_SET_SCALING_LIST_OFFSET (0x00000680)
#define NVC9B0_H264_SET_SCALING_LIST_OFFSET_OFFSET 31:0
#define NVC9B0_H264_SET_VLDHIST_BUF_OFFSET (0x00000684)
#define NVC9B0_H264_SET_VLDHIST_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_H264_SET_EDOBOFFSET0 (0x00000688)
#define NVC9B0_H264_SET_EDOBOFFSET0_OFFSET 31:0
#define NVC9B0_H264_SET_EDOBOFFSET1 (0x0000068C)
#define NVC9B0_H264_SET_EDOBOFFSET1_OFFSET 31:0
#define NVC9B0_H264_SET_EDOBOFFSET2 (0x00000690)
#define NVC9B0_H264_SET_EDOBOFFSET2_OFFSET 31:0
#define NVC9B0_H264_SET_EDOBOFFSET3 (0x00000694)
#define NVC9B0_H264_SET_EDOBOFFSET3_OFFSET 31:0
#define NVC9B0_SET_CONTENT_INITIAL_VECTOR(b) (0x00000C00 + (b)*0x00000004)
#define NVC9B0_SET_CONTENT_INITIAL_VECTOR_VALUE 31:0
#define NVC9B0_SET_CTL_COUNT (0x00000C10)
#define NVC9B0_SET_CTL_COUNT_VALUE 31:0
#define NVC9B0_SET_UPPER_SRC (0x00000C14)
#define NVC9B0_SET_UPPER_SRC_OFFSET 7:0
#define NVC9B0_SET_LOWER_SRC (0x00000C18)
#define NVC9B0_SET_LOWER_SRC_OFFSET 31:0
#define NVC9B0_SET_UPPER_DST (0x00000C1C)
#define NVC9B0_SET_UPPER_DST_OFFSET 7:0
#define NVC9B0_SET_LOWER_DST (0x00000C20)
#define NVC9B0_SET_LOWER_DST_OFFSET 31:0
#define NVC9B0_SET_BLOCK_COUNT (0x00000C24)
#define NVC9B0_SET_BLOCK_COUNT_VALUE 31:0
#define NVC9B0_PR_SET_REQUEST_BUF_OFFSET (0x00000D00)
#define NVC9B0_PR_SET_REQUEST_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_REQUEST_BUF_SIZE (0x00000D04)
#define NVC9B0_PR_SET_REQUEST_BUF_SIZE_SIZE 31:0
#define NVC9B0_PR_SET_RESPONSE_BUF_OFFSET (0x00000D08)
#define NVC9B0_PR_SET_RESPONSE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_RESPONSE_BUF_SIZE (0x00000D0C)
#define NVC9B0_PR_SET_RESPONSE_BUF_SIZE_SIZE 31:0
#define NVC9B0_PR_SET_REQUEST_MESSAGE_BUF_OFFSET (0x00000D10)
#define NVC9B0_PR_SET_REQUEST_MESSAGE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_RESPONSE_MESSAGE_BUF_OFFSET (0x00000D14)
#define NVC9B0_PR_SET_RESPONSE_MESSAGE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_OFFSET (0x00000D18)
#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_SIZE (0x00000D1C)
#define NVC9B0_PR_SET_LOCAL_DECRYPT_BUF_SIZE_SIZE 31:0
#define NVC9B0_PR_SET_CONTENT_DECRYPT_INFO_BUF_OFFSET (0x00000D20)
#define NVC9B0_PR_SET_CONTENT_DECRYPT_INFO_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_PR_SET_REENCRYPTED_BITSTREAM_BUF_OFFSET (0x00000D24)
#define NVC9B0_PR_SET_REENCRYPTED_BITSTREAM_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_DH_KE_SET_CHALLENGE_BUF_OFFSET (0x00000E00)
#define NVC9B0_DH_KE_SET_CHALLENGE_BUF_OFFSET_OFFSET 31:0
#define NVC9B0_DH_KE_SET_RESPONSE_BUF_OFFSET (0x00000E04)
#define NVC9B0_DH_KE_SET_RESPONSE_BUF_OFFSET_OFFSET 31:0
/* Indexed key methods: slot b lives at base + b*4, and each slot carries a full 32-bit VALUE field.
 * SESSION_KEY starts at 0xF00 and CONTENT_KEY at 0xF10, so SESSION_KEY(4) evaluates to the same
 * offset as CONTENT_KEY(0); the macros themselves do not bound b. */
#define NVC9B0_SET_SESSION_KEY(b) (0x00000F00 + (b)*0x00000004)
#define NVC9B0_SET_SESSION_KEY_VALUE 31:0
#define NVC9B0_SET_CONTENT_KEY(b) (0x00000F10 + (b)*0x00000004)
#define NVC9B0_SET_CONTENT_KEY_VALUE 31:0
#define NVC9B0_PM_TRIGGER_END (0x00001114)
#define NVC9B0_PM_TRIGGER_END_V 31:0
/* Generic status codes. ERROR_NONE is 0; the OS_ERROR_* codes occupy 0x1..0x9. */
#define NVC9B0_ERROR_NONE (0x00000000)
#define NVC9B0_OS_ERROR_EXECUTE_INSUFFICIENT_DATA (0x00000001)
#define NVC9B0_OS_ERROR_SEMAPHORE_INSUFFICIENT_DATA (0x00000002)
#define NVC9B0_OS_ERROR_INVALID_METHOD (0x00000003)
#define NVC9B0_OS_ERROR_INVALID_DMA_PAGE (0x00000004)
#define NVC9B0_OS_ERROR_UNHANDLED_INTERRUPT (0x00000005)
#define NVC9B0_OS_ERROR_EXCEPTION (0x00000006)
#define NVC9B0_OS_ERROR_INVALID_CTXSW_REQUEST (0x00000007)
#define NVC9B0_OS_ERROR_APPLICATION (0x00000008)
#define NVC9B0_OS_ERROR_SW_BREAKPT (0x00000009)
/* OS_INTERRUPT_* codes are spaced 0x100 apart (0x100..0x700), so they do not collide with the OS_ERROR_* range. */
#define NVC9B0_OS_INTERRUPT_EXECUTE_AWAKEN (0x00000100)
#define NVC9B0_OS_INTERRUPT_BACKEND_SEMAPHORE_AWAKEN (0x00000200)
#define NVC9B0_OS_INTERRUPT_CTX_ERROR_FBIF (0x00000300)
#define NVC9B0_OS_INTERRUPT_LIMIT_VIOLATION (0x00000400)
#define NVC9B0_OS_INTERRUPT_LIMIT_AND_FBIF_CTX_ERROR (0x00000500)
#define NVC9B0_OS_INTERRUPT_HALT_ENGINE (0x00000600)
#define NVC9B0_OS_INTERRUPT_TRAP_NONSTALL (0x00000700)
#define NVC9B0_H264_VLD_ERR_SEQ_DATA_INCONSISTENT (0x00004001)
#define NVC9B0_H264_VLD_ERR_PIC_DATA_INCONSISTENT (0x00004002)
#define NVC9B0_H264_VLD_ERR_SLC_DATA_BUF_ADDR_OUT_OF_BOUNDS (0x00004100)
#define NVC9B0_H264_VLD_ERR_BITSTREAM_ERROR (0x00004101)
#define NVC9B0_H264_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID (0x000041F8)
#define NVC9B0_H264_VLD_ERR_SLC_HDR_OUT_SIZE_NOT_MULT256 (0x00004200)
#define NVC9B0_H264_VLD_ERR_SLC_DATA_OUT_SIZE_NOT_MULT256 (0x00004201)
#define NVC9B0_H264_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID (0x00004203)
#define NVC9B0_H264_VLD_ERR_CTX_DMA_ID_SLC_HDR_OUT_INVALID (0x00004204)
#define NVC9B0_H264_VLD_ERR_SLC_HDR_OUT_BUF_TOO_SMALL (0x00004205)
#define NVC9B0_H264_VLD_ERR_SLC_HDR_OUT_BUF_ALREADY_VALID (0x00004206)
#define NVC9B0_H264_VLD_ERR_SLC_DATA_OUT_BUF_TOO_SMALL (0x00004207)
#define NVC9B0_H264_VLD_ERR_DATA_BUF_CNT_TOO_SMALL (0x00004208)
#define NVC9B0_H264_VLD_ERR_BITSTREAM_EMPTY (0x00004209)
#define NVC9B0_H264_VLD_ERR_FRAME_WIDTH_TOO_LARGE (0x0000420A)
#define NVC9B0_H264_VLD_ERR_FRAME_HEIGHT_TOO_LARGE (0x0000420B)
#define NVC9B0_H264_VLD_ERR_HIST_BUF_TOO_SMALL (0x00004300)
#define NVC9B0_VC1_VLD_ERR_PIC_DATA_BUF_ADDR_OUT_OF_BOUND (0x00005100)
#define NVC9B0_VC1_VLD_ERR_BITSTREAM_ERROR (0x00005101)
#define NVC9B0_VC1_VLD_ERR_PIC_HDR_OUT_SIZE_NOT_MULT256 (0x00005200)
#define NVC9B0_VC1_VLD_ERR_PIC_DATA_OUT_SIZE_NOT_MULT256 (0x00005201)
#define NVC9B0_VC1_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID (0x00005202)
#define NVC9B0_VC1_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID (0x00005203)
#define NVC9B0_VC1_VLD_ERR_CTX_DMA_ID_PIC_HDR_OUT_INVALID (0x00005204)
#define NVC9B0_VC1_VLD_ERR_SLC_HDR_OUT_BUF_TOO_SMALL (0x00005205)
#define NVC9B0_VC1_VLD_ERR_PIC_HDR_OUT_BUF_ALREADY_VALID (0x00005206)
#define NVC9B0_VC1_VLD_ERR_PIC_DATA_OUT_BUF_TOO_SMALL (0x00005207)
#define NVC9B0_VC1_VLD_ERR_DATA_INFO_IN_BUF_TOO_SMALL (0x00005208)
#define NVC9B0_VC1_VLD_ERR_BITSTREAM_EMPTY (0x00005209)
#define NVC9B0_VC1_VLD_ERR_FRAME_WIDTH_TOO_LARGE (0x0000520A)
#define NVC9B0_VC1_VLD_ERR_FRAME_HEIGHT_TOO_LARGE (0x0000520B)
#define NVC9B0_VC1_VLD_ERR_PIC_DATA_OUT_BUF_FULL_TIME_OUT (0x00005300)
#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_BUF_ADDR_OUT_OF_BOUNDS (0x00006100)
#define NVC9B0_MPEG12_VLD_ERR_BITSTREAM_ERROR (0x00006101)
#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_OUT_SIZE_NOT_MULT256 (0x00006200)
#define NVC9B0_MPEG12_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID (0x00006201)
#define NVC9B0_MPEG12_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID (0x00006202)
#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_OUT_BUF_TOO_SMALL (0x00006203)
#define NVC9B0_MPEG12_VLD_ERR_DATA_INFO_IN_BUF_TOO_SMALL (0x00006204)
#define NVC9B0_MPEG12_VLD_ERR_BITSTREAM_EMPTY (0x00006205)
#define NVC9B0_MPEG12_VLD_ERR_INVALID_PIC_STRUCTURE (0x00006206)
#define NVC9B0_MPEG12_VLD_ERR_INVALID_PIC_CODING_TYPE (0x00006207)
#define NVC9B0_MPEG12_VLD_ERR_FRAME_WIDTH_TOO_LARGE (0x00006208)
#define NVC9B0_MPEG12_VLD_ERR_FRAME_HEIGHT_TOO_LARGE (0x00006209)
#define NVC9B0_MPEG12_VLD_ERR_SLC_DATA_OUT_BUF_FULL_TIME_OUT (0x00006300)
#define NVC9B0_CMN_VLD_ERR_PDEC_RETURNED_ERROR (0x00007101)
#define NVC9B0_CMN_VLD_ERR_EDOB_FLUSH_TIME_OUT (0x00007102)
#define NVC9B0_CMN_VLD_ERR_EDOB_REWIND_TIME_OUT (0x00007103)
#define NVC9B0_CMN_VLD_ERR_VLD_WD_TIME_OUT (0x00007104)
#define NVC9B0_CMN_VLD_ERR_NUM_SLICES_ZERO (0x00007105)
/* MPEG4 VLD error codes: 0x81xx and 0x82xx, parallel to the VC1 (0x51xx/0x52xx) list above. */
#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_BUF_ADDR_OUT_OF_BOUND (0x00008100)
#define NVC9B0_MPEG4_VLD_ERR_BITSTREAM_ERROR (0x00008101)
#define NVC9B0_MPEG4_VLD_ERR_PIC_HDR_OUT_SIZE_NOT_MULT256 (0x00008200)
#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_OUT_SIZE_NOT_MULT256 (0x00008201)
#define NVC9B0_MPEG4_VLD_ERR_CTX_DMA_ID_CTRL_IN_INVALID (0x00008202)
#define NVC9B0_MPEG4_VLD_ERR_CTX_DMA_ID_FLOW_CTRL_INVALID (0x00008203)
#define NVC9B0_MPEG4_VLD_ERR_CTX_DMA_ID_PIC_HDR_OUT_INVALID (0x00008204)
#define NVC9B0_MPEG4_VLD_ERR_SLC_HDR_OUT_BUF_TOO_SMALL (0x00008205)
#define NVC9B0_MPEG4_VLD_ERR_PIC_HDR_OUT_BUF_ALREADY_VALID (0x00008206)
#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_OUT_BUF_TOO_SMALL (0x00008207)
#define NVC9B0_MPEG4_VLD_ERR_DATA_INFO_IN_BUF_TOO_SMALL (0x00008208)
#define NVC9B0_MPEG4_VLD_ERR_BITSTREAM_EMPTY (0x00008209)
#define NVC9B0_MPEG4_VLD_ERR_FRAME_WIDTH_TOO_LARGE (0x0000820A)
#define NVC9B0_MPEG4_VLD_ERR_FRAME_HEIGHT_TOO_LARGE (0x0000820B)
/* NOTE(review): 0x00051E01 breaks the 0x8xxx pattern; the VC1 and MPEG12 counterparts of this
 * timeout are 0x5300 and 0x6300. Confirm against the vendor header before changing the value. */
#define NVC9B0_MPEG4_VLD_ERR_PIC_DATA_OUT_BUF_FULL_TIME_OUT (0x00051E01)
#define NVC9B0_DEC_ERROR_MPEG12_APPTIMER_EXPIRED (0xDEC10001)
#define NVC9B0_DEC_ERROR_MPEG12_MVTIMER_EXPIRED (0xDEC10002)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_TOKEN (0xDEC10003)
#define NVC9B0_DEC_ERROR_MPEG12_SLICEDATA_MISSING (0xDEC10004)
#define NVC9B0_DEC_ERROR_MPEG12_HWERR_INTERRUPT (0xDEC10005)
#define NVC9B0_DEC_ERROR_MPEG12_DETECTED_VLD_FAILURE (0xDEC10006)
#define NVC9B0_DEC_ERROR_MPEG12_PICTURE_INIT (0xDEC10100)
#define NVC9B0_DEC_ERROR_MPEG12_STATEMACHINE_FAILURE (0xDEC10101)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_PIC (0xDEC10901)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_UCODE (0xDEC10902)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_FC (0xDEC10903)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_CTXID_SLH (0xDEC10904)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_UCODE_SIZE (0xDEC10905)
#define NVC9B0_DEC_ERROR_MPEG12_INVALID_SLICE_COUNT (0xDEC10906)
#define NVC9B0_DEC_ERROR_VC1_APPTIMER_EXPIRED (0xDEC20001)
#define NVC9B0_DEC_ERROR_VC1_MVTIMER_EXPIRED (0xDEC20002)
#define NVC9B0_DEC_ERROR_VC1_INVALID_TOKEN (0xDEC20003)
#define NVC9B0_DEC_ERROR_VC1_SLICEDATA_MISSING (0xDEC20004)
#define NVC9B0_DEC_ERROR_VC1_HWERR_INTERRUPT (0xDEC20005)
#define NVC9B0_DEC_ERROR_VC1_DETECTED_VLD_FAILURE (0xDEC20006)
#define NVC9B0_DEC_ERROR_VC1_TIMEOUT_POLLING_FOR_DATA (0xDEC20007)
#define NVC9B0_DEC_ERROR_VC1_PDEC_PIC_END_UNALIGNED (0xDEC20008)
#define NVC9B0_DEC_ERROR_VC1_WDTIMER_EXPIRED (0xDEC20009)
#define NVC9B0_DEC_ERROR_VC1_ERRINTSTART (0xDEC20010)
#define NVC9B0_DEC_ERROR_VC1_IQT_ERRINT (0xDEC20011)
#define NVC9B0_DEC_ERROR_VC1_MC_ERRINT (0xDEC20012)
#define NVC9B0_DEC_ERROR_VC1_MC_IQT_ERRINT (0xDEC20013)
#define NVC9B0_DEC_ERROR_VC1_REC_ERRINT (0xDEC20014)
#define NVC9B0_DEC_ERROR_VC1_REC_IQT_ERRINT (0xDEC20015)
#define NVC9B0_DEC_ERROR_VC1_REC_MC_ERRINT (0xDEC20016)
#define NVC9B0_DEC_ERROR_VC1_REC_MC_IQT_ERRINT (0xDEC20017)
#define NVC9B0_DEC_ERROR_VC1_DBF_ERRINT (0xDEC20018)
#define NVC9B0_DEC_ERROR_VC1_DBF_IQT_ERRINT (0xDEC20019)
#define NVC9B0_DEC_ERROR_VC1_DBF_MC_ERRINT (0xDEC2001A)
#define NVC9B0_DEC_ERROR_VC1_DBF_MC_IQT_ERRINT (0xDEC2001B)
#define NVC9B0_DEC_ERROR_VC1_DBF_REC_ERRINT (0xDEC2001C)
#define NVC9B0_DEC_ERROR_VC1_DBF_REC_IQT_ERRINT (0xDEC2001D)
#define NVC9B0_DEC_ERROR_VC1_DBF_REC_MC_ERRINT (0xDEC2001E)
#define NVC9B0_DEC_ERROR_VC1_DBF_REC_MC_IQT_ERRINT (0xDEC2001F)
#define NVC9B0_DEC_ERROR_VC1_PICTURE_INIT (0xDEC20100)
#define NVC9B0_DEC_ERROR_VC1_STATEMACHINE_FAILURE (0xDEC20101)
/* VC1 decoder setup-validation errors (0xDEC209xx); same low-order layout as the MPEG12/H264/MPEG4 groups. */
#define NVC9B0_DEC_ERROR_VC1_INVALID_CTXID_PIC (0xDEC20901)
#define NVC9B0_DEC_ERROR_VC1_INVALID_CTXID_UCODE (0xDEC20902)
#define NVC9B0_DEC_ERROR_VC1_INVALID_CTXID_FC (0xDEC20903)
/* NOTE(review): "INVAILD" is misspelled, unlike the sibling ..._INVALID_CTXID_SLH macros of the other codecs.
 * The name is kept because renaming a public macro would break anything that references it. */
#define NVC9B0_DEC_ERROR_VC1_INVAILD_CTXID_SLH (0xDEC20904)
#define NVC9B0_DEC_ERROR_VC1_INVALID_UCODE_SIZE (0xDEC20905)
#define NVC9B0_DEC_ERROR_VC1_INVALID_SLICE_COUNT (0xDEC20906)
#define NVC9B0_DEC_ERROR_H264_APPTIMER_EXPIRED (0xDEC30001)
#define NVC9B0_DEC_ERROR_H264_MVTIMER_EXPIRED (0xDEC30002)
#define NVC9B0_DEC_ERROR_H264_INVALID_TOKEN (0xDEC30003)
#define NVC9B0_DEC_ERROR_H264_SLICEDATA_MISSING (0xDEC30004)
#define NVC9B0_DEC_ERROR_H264_HWERR_INTERRUPT (0xDEC30005)
#define NVC9B0_DEC_ERROR_H264_DETECTED_VLD_FAILURE (0xDEC30006)
#define NVC9B0_DEC_ERROR_H264_ERRINTSTART (0xDEC30010)
#define NVC9B0_DEC_ERROR_H264_IQT_ERRINT (0xDEC30011)
#define NVC9B0_DEC_ERROR_H264_MC_ERRINT (0xDEC30012)
#define NVC9B0_DEC_ERROR_H264_MC_IQT_ERRINT (0xDEC30013)
#define NVC9B0_DEC_ERROR_H264_REC_ERRINT (0xDEC30014)
#define NVC9B0_DEC_ERROR_H264_REC_IQT_ERRINT (0xDEC30015)
#define NVC9B0_DEC_ERROR_H264_REC_MC_ERRINT (0xDEC30016)
#define NVC9B0_DEC_ERROR_H264_REC_MC_IQT_ERRINT (0xDEC30017)
#define NVC9B0_DEC_ERROR_H264_DBF_ERRINT (0xDEC30018)
#define NVC9B0_DEC_ERROR_H264_DBF_IQT_ERRINT (0xDEC30019)
#define NVC9B0_DEC_ERROR_H264_DBF_MC_ERRINT (0xDEC3001A)
#define NVC9B0_DEC_ERROR_H264_DBF_MC_IQT_ERRINT (0xDEC3001B)
#define NVC9B0_DEC_ERROR_H264_DBF_REC_ERRINT (0xDEC3001C)
#define NVC9B0_DEC_ERROR_H264_DBF_REC_IQT_ERRINT (0xDEC3001D)
#define NVC9B0_DEC_ERROR_H264_DBF_REC_MC_ERRINT (0xDEC3001E)
#define NVC9B0_DEC_ERROR_H264_DBF_REC_MC_IQT_ERRINT (0xDEC3001F)
#define NVC9B0_DEC_ERROR_H264_PICTURE_INIT (0xDEC30100)
#define NVC9B0_DEC_ERROR_H264_STATEMACHINE_FAILURE (0xDEC30101)
#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_PIC (0xDEC30901)
#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_UCODE (0xDEC30902)
#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_FC (0xDEC30903)
#define NVC9B0_DEC_ERROR_H264_INVALID_CTXID_SLH (0xDEC30904)
#define NVC9B0_DEC_ERROR_H264_INVALID_UCODE_SIZE (0xDEC30905)
#define NVC9B0_DEC_ERROR_H264_INVALID_SLICE_COUNT (0xDEC30906)
#define NVC9B0_DEC_ERROR_MPEG4_APPTIMER_EXPIRED (0xDEC40001)
#define NVC9B0_DEC_ERROR_MPEG4_MVTIMER_EXPIRED (0xDEC40002)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_TOKEN (0xDEC40003)
#define NVC9B0_DEC_ERROR_MPEG4_SLICEDATA_MISSING (0xDEC40004)
#define NVC9B0_DEC_ERROR_MPEG4_HWERR_INTERRUPT (0xDEC40005)
#define NVC9B0_DEC_ERROR_MPEG4_DETECTED_VLD_FAILURE (0xDEC40006)
#define NVC9B0_DEC_ERROR_MPEG4_TIMEOUT_POLLING_FOR_DATA (0xDEC40007)
#define NVC9B0_DEC_ERROR_MPEG4_PDEC_PIC_END_UNALIGNED (0xDEC40008)
#define NVC9B0_DEC_ERROR_MPEG4_WDTIMER_EXPIRED (0xDEC40009)
#define NVC9B0_DEC_ERROR_MPEG4_ERRINTSTART (0xDEC40010)
#define NVC9B0_DEC_ERROR_MPEG4_IQT_ERRINT (0xDEC40011)
#define NVC9B0_DEC_ERROR_MPEG4_MC_ERRINT (0xDEC40012)
#define NVC9B0_DEC_ERROR_MPEG4_MC_IQT_ERRINT (0xDEC40013)
#define NVC9B0_DEC_ERROR_MPEG4_REC_ERRINT (0xDEC40014)
#define NVC9B0_DEC_ERROR_MPEG4_REC_IQT_ERRINT (0xDEC40015)
#define NVC9B0_DEC_ERROR_MPEG4_REC_MC_ERRINT (0xDEC40016)
#define NVC9B0_DEC_ERROR_MPEG4_REC_MC_IQT_ERRINT (0xDEC40017)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_ERRINT (0xDEC40018)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_IQT_ERRINT (0xDEC40019)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_MC_ERRINT (0xDEC4001A)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_MC_IQT_ERRINT (0xDEC4001B)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_ERRINT (0xDEC4001C)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_IQT_ERRINT (0xDEC4001D)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_MC_ERRINT (0xDEC4001E)
#define NVC9B0_DEC_ERROR_MPEG4_DBF_REC_MC_IQT_ERRINT (0xDEC4001F)
#define NVC9B0_DEC_ERROR_MPEG4_PICTURE_INIT (0xDEC40100)
#define NVC9B0_DEC_ERROR_MPEG4_STATEMACHINE_FAILURE (0xDEC40101)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_PIC (0xDEC40901)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_UCODE (0xDEC40902)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_FC (0xDEC40903)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_CTXID_SLH (0xDEC40904)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_UCODE_SIZE (0xDEC40905)
#define NVC9B0_DEC_ERROR_MPEG4_INVALID_SLICE_COUNT (0xDEC40906)
#ifdef __cplusplus
}; /* extern "C" */
#endif
#endif // clc9b0_h

File diff suppressed because it is too large Load Diff

View File

@@ -100,6 +100,9 @@ class NVDriver(VirtDriver):
assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVGPU)
struct.hObjectNew = self._alloc_handle()
self.object_by_handle[struct.hObjectNew] = NVSubDevice(self.object_by_handle[struct.hObjectParent])
elif struct.hClass == nv_gpu.NV01_MEMORY_VIRTUAL:
assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVGPU)
struct.hObjectNew = self._alloc_handle()
elif struct.hClass == nv_gpu.TURING_USERMODE_A:
assert struct.hObjectParent in self.object_by_handle and isinstance(self.object_by_handle[struct.hObjectParent], NVSubDevice)
struct.hObjectNew = self._alloc_handle()
@@ -215,6 +218,8 @@ class NVDriver(VirtDriver):
elif nr == nv_gpu.NV_ESC_RM_FREE:
st = nv_gpu.NVOS00_PARAMETERS.from_address(argp)
self.object_by_handle.pop(st.hObjectOld)
elif nr == nv_gpu.NV_ESC_RM_MAP_MEMORY_DMA:
pass # mappings are same as uvm
elif nr == nv_gpu.NV_ESC_CARD_INFO:
for i,gpu in enumerate(self.gpus.values()):
st = nv_gpu.nv_ioctl_card_info_t.from_address(argp + i * ctypes.sizeof(nv_gpu.nv_ioctl_card_info_t))

View File

@@ -0,0 +1,65 @@
import unittest
from tinygrad import Tensor, Device
from extra.hevc.hevc import parse_hevc_file_headers, nv_gpu
class TestHevc(unittest.TestCase):
  """Checks the NVDEC HEVC picture descriptors that `parse_hevc_file_headers` builds for a reference clip."""

  def test_hevc_parser(self):
    """Parse the reference clip and compare descriptor fields of frames 0, 1 and 3 with known values."""
    url = "https://github.com/haraschax/filedump/raw/09a497959f7fa6fd8dba501a25f2cdb3a41ecb12/comma_video.hevc"
    hevc_tensor = Tensor.from_url(url, device="CPU")
    dat = bytes(hevc_tensor.data())

    # only `opaque` is inspected below; the full unpack also pins the arity of the parser's return value
    opaque, frame_info, w, h, luma_w, luma_h, chroma_off = parse_hevc_file_headers(dat, device=Device.DEFAULT)

    def _test_common(frame, bts):
      # Fields expected to be identical for every frame of this clip.
      # These must be read from the `frame` argument: reading `frame0` from the enclosing scope
      # would silently re-check frame 0 for every call and leave the other frames unvalidated.
      self.assertEqual(frame.pic_width_in_luma_samples, 1952)
      self.assertEqual(frame.pic_height_in_luma_samples, 1216)
      self.assertEqual(frame.chroma_format_idc, 1)
      self.assertEqual(frame.bit_depth_luma, 8)
      self.assertEqual(frame.bit_depth_chroma, 8)
      self.assertEqual(frame.log2_min_luma_coding_block_size, 3)
      self.assertEqual(frame.log2_max_luma_coding_block_size, 5)
      self.assertEqual(frame.log2_min_transform_block_size, 2)
      self.assertEqual(frame.log2_max_transform_block_size, 5)
      self.assertEqual(frame.num_tile_columns, 3)
      self.assertEqual(frame.num_tile_rows, 1)
      self.assertEqual(frame.colMvBuffersize, 589)
      self.assertEqual(frame.HevcSaoBufferOffset, 2888)
      self.assertEqual(frame.HevcBsdCtrlOffset, 25992)
      self.assertEqual(frame.v1.hevc_main10_444_ext.HevcFltAboveOffset, 26714)
      self.assertEqual(frame.v1.hevc_main10_444_ext.HevcSaoAboveOffset, 36214)
      # tiles
      self.assertEqual(bytes(bts[0x200:0x210]), b'\x18\x00&\x00\x18\x00&\x00\r\x00&\x00\x00\x00\x00\x00')

    # frame 0: the first picture, flagged IDR/RAP with no reference frames
    frame0 = nv_gpu.nvdec_hevc_pic_s.from_buffer(opaque[0].data())
    _test_common(frame0, opaque[0].data())
    self.assertEqual(frame0.stream_len, 148063)
    self.assertEqual(frame0.IDR_picture_flag, 1)
    self.assertEqual(frame0.RAP_picture_flag, 1)
    self.assertEqual(frame0.sw_hdr_skip_length, 0)
    self.assertEqual(frame0.num_ref_frames, 0)

    # frames 1 and 3 differ only in stream length; everything else asserted here is the same
    for idx, stream_len in ((1, 57110), (3, 47036)):
      with self.subTest(frame=idx):
        frame = nv_gpu.nvdec_hevc_pic_s.from_buffer(opaque[idx].data())
        _test_common(frame, opaque[idx].data())
        self.assertEqual(frame.stream_len, stream_len)
        self.assertEqual(frame.IDR_picture_flag, 0)
        self.assertEqual(frame.RAP_picture_flag, 0)
        self.assertEqual(frame.sw_hdr_skip_length, 9)
        self.assertEqual(frame.num_ref_frames, 1)
        self.assertEqual(list(frame.initreflistidxl0), [0] * 16)
        self.assertEqual(list(frame.initreflistidxl1), [0] * 16)
        self.assertEqual(list(frame.RefDiffPicOrderCnts), [1] + [0] * 15)

if __name__ == "__main__":
  unittest.main()

View File

@@ -238,6 +238,7 @@ class Allocator(Generic[DeviceType]):
# def _as_buffer(self, src) -> memoryview:
# def _offset(self, buf, size:int, offset:int):
# def _transfer(self, dest, src, sz:int, src_dev, dest_dev):
def _encode_decode(self, bufout, bufin, desc, hist:list, shape:tuple[int,...], frame_pos:int): raise NotImplementedError("need encdec") # optional
class LRUAllocator(Allocator, Generic[DeviceType]):
"""

View File

@@ -141,6 +141,19 @@ class BufferCopy(Runner):
class BufferXfer(BufferCopy):
  """BufferCopy variant whose `copy` goes through the destination allocator's `_transfer`."""
  def copy(self, dest, src):
    # pass the raw buffers, the byte count and both allocators' devices to the destination allocator
    transfer = dest.allocator._transfer
    transfer(dest._buf, src._buf, dest.nbytes, src_dev=src.allocator.dev, dest_dev=dest.allocator.dev)
class EncDec(Runner):
  """Runner that forwards an ENCDEC op to the output buffer allocator's `_encode_decode` hook."""
  def __init__(self, encdec:UOp, total_sz:int, device:str):
    # arg[0] is stored as the shape; the first variable's expr is the key later looked up in var_vals
    shape, pos_var = encdec.arg[0], encdec.variables()[0].expr
    self.shape = shape
    self.pos_var = pos_var
    # display name: megabytes once the size reaches 1e6, raw byte count otherwise
    if total_sz >= 1e6: label = f"enc/dec {total_sz/1e6:7.2f}M, HEVC"
    else: label = f"enc/dec {total_sz:8d}, HEVC"
    super().__init__(colored(label, "yellow"), device, Estimates(lds=total_sz, mem=total_sz))
  def __call__(self, rawbufs:list[Buffer], var_vals:dict[str, int], wait=False):
    start = time.perf_counter()
    out = rawbufs[0]
    # buffers 0..2 are passed individually, every buffer from index 3 on goes in as one list
    run = out.allocator._encode_decode
    run(out._buf, rawbufs[1]._buf, rawbufs[2]._buf, [b._buf for b in rawbufs[3:]], self.shape, var_vals[self.pos_var])
    # NOTE(review): elapsed time is returned only when waiting, assuming the original `return` sat inside `if wait` -- confirm
    if not wait: return None
    Device[out.device].synchronize()
    return time.perf_counter() - start
# **************** method cache ****************
method_cache: dict[tuple[str, type, bytes, tuple[int, ...], bool], CompiledRunner] = {}
@@ -201,6 +214,7 @@ si_lowerer = PatternMatcher([
(UPat(Ops.COPY, name="copy"), lambda ctx,copy: ((BufferXfer(ctx[0].nbytes, ctx[0].device, ctx[1].device) \
if hasattr(Device[ctx[0].device].allocator, '_transfer') and all_same([x.device.split(":")[0] for x in ctx]) \
else BufferCopy(ctx[0].nbytes, ctx[0].device, ctx[1].device)), list(ctx))),
(UPat(Ops.ENCDEC, name="encdec"), lambda ctx,encdec: ((EncDec(encdec, ctx[0].nbytes, ctx[1].device)), list(ctx))),
])
def lower_schedule_item(si:ScheduleItem) -> ExecItem:
  """Turn a ScheduleItem into an ExecItem by rewriting its AST with `si_lowerer`, using the item's buffers as context."""
  # the matcher yields the runner together with the buffer list it should be called with
  lowered = cast(tuple[Runner,list], si_lowerer.rewrite(si.ast, si.bufs))
  return ExecItem(*lowered, si.metadata, si.fixedvars)

View File

@@ -4,6 +4,7 @@ from tinygrad.helpers import fetch, flatten, system, getenv
# `here` is this file's directory; `root` is three levels above it and is the base for repo-relative
# header paths such as root/"extra/nv_gpu_driver".
root = (here:=pathlib.Path(__file__).parent).parents[2]
# NVIDIA open-gpu-kernel-modules source archives, pinned to a commit hash, one per supported driver series.
nv_src = {"nv_570": "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/81fe4fb417c8ac3b9bdcc1d56827d116743892a5.tar.gz",
"nv_580": "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/2af9f1f0f7de4988432d4ae875b5858ffdb09cc2.tar.gz"}
# FFmpeg release archive passed as `tarball=` when generating the "avcodec" bindings.
ffmpeg_src = "https://ffmpeg.org/releases/ffmpeg-8.0.1.tar.gz"
# macOS SDK location used for the Metal framework headers and as -isysroot.
macossdk = "/var/db/xcode_select_link/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk"
def load(name, dll, files, **kwargs):
@@ -27,6 +28,7 @@ def __getattr__(nm):
case "libc": return load("libc", ["find_library('c')"], lambda: (
[i for i in system("dpkg -L libc6-dev").split() if 'sys/mman.h' in i or 'sys/syscall.h' in i] +
["/usr/include/string.h", "/usr/include/elf.h", "/usr/include/unistd.h", "/usr/include/asm-generic/mman-common.h"]), use_errno=True)
case "avcodec": return load("avcodec", [], ["{}/libavcodec/hevc/hevc.h", "{}/libavcodec/cbs_h265.h"], tarball=ffmpeg_src)
case "opencl": return load("opencl", ["find_library('OpenCL')"], ["/usr/include/CL/cl.h"])
case "cuda": return load("cuda", ["find_library('cuda')"], ["/usr/include/cuda.h"], args=["-D__CUDA_API_VERSION_INTERNAL"], parse_macros=False)
case "nvrtc": return load("nvrtc", ["find_library('nvrtc')"], ["/usr/include/nvrtc.h"])
@@ -34,14 +36,14 @@ def __getattr__(nm):
case "kfd": return load("kfd", [], ["/usr/include/linux/kfd_ioctl.h"])
case "nv_570" | "nv_580":
return load(nm, [], [
*[root/"extra/nv_gpu_driver"/s for s in ["clc6c0qmd.h","clcec0qmd.h"]], "{}/kernel-open/common/inc/nvmisc.h",
*[f"{{}}/src/common/sdk/nvidia/inc/class/cl{s}.h" for s in ["0000", "0080", "2080", "2080_notification", "c56f", "c86f", "c96f", "c761",
*[root/"extra/nv_gpu_driver"/s for s in ["clc9b0.h", "clc6c0qmd.h","clcec0qmd.h", "nvdec_drv.h"]], "{}/kernel-open/common/inc/nvmisc.h",
*[f"{{}}/src/common/sdk/nvidia/inc/class/cl{s}.h" for s in ["0000", "0070", "0080", "2080", "2080_notification", "c56f", "c86f", "c96f", "c761",
"83de", "c6c0", "cdc0"]],
*[f"{{}}/kernel-open/nvidia-uvm/{s}.h" for s in ["clc6b5", "clc9b5", "uvm_ioctl", "uvm_linux_ioctl", "hwref/ampere/ga100/dev_fault"]],
*[f"{{}}/src/nvidia/arch/nvalloc/unix/include/nv{s}.h" for s in ["_escape", "-ioctl", "-ioctl-numbers",
"-ioctl-numa", "-unix-nvos-params-wrappers"]],
*[f"{{}}/src/common/sdk/nvidia/inc/{s}.h" for s in ["alloc/alloc_channel", "nvos", "ctrl/ctrlc36f", "ctrl/ctrlcb33",
"ctrl/ctrla06c", "ctrl/ctrl90f1"]],
"ctrl/ctrla06c", "ctrl/ctrl90f1", "ctrl/ctrla06f/ctrla06fgpfifo"]],
*[f"{{}}/src/common/sdk/nvidia/inc/ctrl/ctrl{s}/*.h" for s in ["0000", "0080", "2080", "83de"]],
"{}/kernel-open/common/inc/nvstatus.h", "{}/src/nvidia/generated/g_allclasses.h"
], args=[
@@ -129,4 +131,4 @@ python3 src/compiler/builtin_types_h.py gen/builtin_types.h""", cwd=path, shell=
return load("metal", ["find_library('Metal')"],[f"{macossdk}/System/Library/Frameworks/Metal.framework/Headers/MTL{s}.h" for s in
["ComputeCommandEncoder", "ComputePipeline", "CommandQueue", "Device", "IndirectCommandBuffer", "Resource", "CommandEncoder"]],
args=["-xobjective-c","-isysroot",macossdk], types={"dispatch_data_t":"objc.id_"})
case _: raise AttributeError(f"no such autogen: {nm}")
case _: raise AttributeError(f"no such autogen: {nm}")

View File

@@ -0,0 +1,543 @@
# mypy: ignore-errors
import ctypes
from tinygrad.helpers import unwrap
from tinygrad.runtime.support.c import Struct, CEnum, _IO, _IOW, _IOR, _IOWR
# enum HEVCNALUnitType: the 64 NAL unit type codes (0-63), backed by a 32-bit unsigned integer.
# Every constant is created through CEnum.define(name, value) and bound to a module-level name.
enum_HEVCNALUnitType = CEnum(ctypes.c_uint32)
HEVC_NAL_TRAIL_N = enum_HEVCNALUnitType.define('HEVC_NAL_TRAIL_N', 0)
HEVC_NAL_TRAIL_R = enum_HEVCNALUnitType.define('HEVC_NAL_TRAIL_R', 1)
HEVC_NAL_TSA_N = enum_HEVCNALUnitType.define('HEVC_NAL_TSA_N', 2)
HEVC_NAL_TSA_R = enum_HEVCNALUnitType.define('HEVC_NAL_TSA_R', 3)
HEVC_NAL_STSA_N = enum_HEVCNALUnitType.define('HEVC_NAL_STSA_N', 4)
HEVC_NAL_STSA_R = enum_HEVCNALUnitType.define('HEVC_NAL_STSA_R', 5)
HEVC_NAL_RADL_N = enum_HEVCNALUnitType.define('HEVC_NAL_RADL_N', 6)
HEVC_NAL_RADL_R = enum_HEVCNALUnitType.define('HEVC_NAL_RADL_R', 7)
HEVC_NAL_RASL_N = enum_HEVCNALUnitType.define('HEVC_NAL_RASL_N', 8)
HEVC_NAL_RASL_R = enum_HEVCNALUnitType.define('HEVC_NAL_RASL_R', 9)
HEVC_NAL_VCL_N10 = enum_HEVCNALUnitType.define('HEVC_NAL_VCL_N10', 10)
HEVC_NAL_VCL_R11 = enum_HEVCNALUnitType.define('HEVC_NAL_VCL_R11', 11)
HEVC_NAL_VCL_N12 = enum_HEVCNALUnitType.define('HEVC_NAL_VCL_N12', 12)
HEVC_NAL_VCL_R13 = enum_HEVCNALUnitType.define('HEVC_NAL_VCL_R13', 13)
HEVC_NAL_VCL_N14 = enum_HEVCNALUnitType.define('HEVC_NAL_VCL_N14', 14)
HEVC_NAL_VCL_R15 = enum_HEVCNALUnitType.define('HEVC_NAL_VCL_R15', 15)
HEVC_NAL_BLA_W_LP = enum_HEVCNALUnitType.define('HEVC_NAL_BLA_W_LP', 16)
HEVC_NAL_BLA_W_RADL = enum_HEVCNALUnitType.define('HEVC_NAL_BLA_W_RADL', 17)
HEVC_NAL_BLA_N_LP = enum_HEVCNALUnitType.define('HEVC_NAL_BLA_N_LP', 18)
HEVC_NAL_IDR_W_RADL = enum_HEVCNALUnitType.define('HEVC_NAL_IDR_W_RADL', 19)
HEVC_NAL_IDR_N_LP = enum_HEVCNALUnitType.define('HEVC_NAL_IDR_N_LP', 20)
HEVC_NAL_CRA_NUT = enum_HEVCNALUnitType.define('HEVC_NAL_CRA_NUT', 21)
HEVC_NAL_RSV_IRAP_VCL22 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_IRAP_VCL22', 22)
HEVC_NAL_RSV_IRAP_VCL23 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_IRAP_VCL23', 23)
HEVC_NAL_RSV_VCL24 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_VCL24', 24)
HEVC_NAL_RSV_VCL25 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_VCL25', 25)
HEVC_NAL_RSV_VCL26 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_VCL26', 26)
HEVC_NAL_RSV_VCL27 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_VCL27', 27)
HEVC_NAL_RSV_VCL28 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_VCL28', 28)
HEVC_NAL_RSV_VCL29 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_VCL29', 29)
HEVC_NAL_RSV_VCL30 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_VCL30', 30)
HEVC_NAL_RSV_VCL31 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_VCL31', 31)
HEVC_NAL_VPS = enum_HEVCNALUnitType.define('HEVC_NAL_VPS', 32)
HEVC_NAL_SPS = enum_HEVCNALUnitType.define('HEVC_NAL_SPS', 33)
HEVC_NAL_PPS = enum_HEVCNALUnitType.define('HEVC_NAL_PPS', 34)
HEVC_NAL_AUD = enum_HEVCNALUnitType.define('HEVC_NAL_AUD', 35)
HEVC_NAL_EOS_NUT = enum_HEVCNALUnitType.define('HEVC_NAL_EOS_NUT', 36)
HEVC_NAL_EOB_NUT = enum_HEVCNALUnitType.define('HEVC_NAL_EOB_NUT', 37)
HEVC_NAL_FD_NUT = enum_HEVCNALUnitType.define('HEVC_NAL_FD_NUT', 38)
HEVC_NAL_SEI_PREFIX = enum_HEVCNALUnitType.define('HEVC_NAL_SEI_PREFIX', 39)
HEVC_NAL_SEI_SUFFIX = enum_HEVCNALUnitType.define('HEVC_NAL_SEI_SUFFIX', 40)
HEVC_NAL_RSV_NVCL41 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_NVCL41', 41)
HEVC_NAL_RSV_NVCL42 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_NVCL42', 42)
HEVC_NAL_RSV_NVCL43 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_NVCL43', 43)
HEVC_NAL_RSV_NVCL44 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_NVCL44', 44)
HEVC_NAL_RSV_NVCL45 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_NVCL45', 45)
HEVC_NAL_RSV_NVCL46 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_NVCL46', 46)
HEVC_NAL_RSV_NVCL47 = enum_HEVCNALUnitType.define('HEVC_NAL_RSV_NVCL47', 47)
HEVC_NAL_UNSPEC48 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC48', 48)
HEVC_NAL_UNSPEC49 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC49', 49)
HEVC_NAL_UNSPEC50 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC50', 50)
HEVC_NAL_UNSPEC51 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC51', 51)
HEVC_NAL_UNSPEC52 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC52', 52)
HEVC_NAL_UNSPEC53 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC53', 53)
HEVC_NAL_UNSPEC54 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC54', 54)
HEVC_NAL_UNSPEC55 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC55', 55)
HEVC_NAL_UNSPEC56 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC56', 56)
HEVC_NAL_UNSPEC57 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC57', 57)
HEVC_NAL_UNSPEC58 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC58', 58)
HEVC_NAL_UNSPEC59 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC59', 59)
HEVC_NAL_UNSPEC60 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC60', 60)
HEVC_NAL_UNSPEC61 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC61', 61)
HEVC_NAL_UNSPEC62 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC62', 62)
HEVC_NAL_UNSPEC63 = enum_HEVCNALUnitType.define('HEVC_NAL_UNSPEC63', 63)
# enum HEVCSliceType: slice type codes (B=0, P=1, I=2).
enum_HEVCSliceType = CEnum(ctypes.c_uint32)
HEVC_SLICE_B = enum_HEVCSliceType.define('HEVC_SLICE_B', 0)
HEVC_SLICE_P = enum_HEVCSliceType.define('HEVC_SLICE_P', 1)
HEVC_SLICE_I = enum_HEVCSliceType.define('HEVC_SLICE_I', 2)
# Anonymous enum holding the HEVC_MAX_* / HEVC_MIN_* limits. Several of these values reappear below
# as literal array lengths in the struct definitions (e.g. 16, 7, 32, 600, 2700).
_anonenum0 = CEnum(ctypes.c_uint32)
HEVC_MAX_LAYERS = _anonenum0.define('HEVC_MAX_LAYERS', 63)
HEVC_MAX_SUB_LAYERS = _anonenum0.define('HEVC_MAX_SUB_LAYERS', 7)
HEVC_MAX_LAYER_SETS = _anonenum0.define('HEVC_MAX_LAYER_SETS', 1024)
HEVC_MAX_LAYER_ID = _anonenum0.define('HEVC_MAX_LAYER_ID', 63)
HEVC_MAX_NUH_LAYER_ID = _anonenum0.define('HEVC_MAX_NUH_LAYER_ID', 62)
HEVC_MAX_VPS_COUNT = _anonenum0.define('HEVC_MAX_VPS_COUNT', 16)
HEVC_MAX_SPS_COUNT = _anonenum0.define('HEVC_MAX_SPS_COUNT', 16)
HEVC_MAX_PPS_COUNT = _anonenum0.define('HEVC_MAX_PPS_COUNT', 64)
HEVC_MAX_DPB_SIZE = _anonenum0.define('HEVC_MAX_DPB_SIZE', 16)
HEVC_MAX_REFS = _anonenum0.define('HEVC_MAX_REFS', 16)
HEVC_MAX_SHORT_TERM_REF_PIC_SETS = _anonenum0.define('HEVC_MAX_SHORT_TERM_REF_PIC_SETS', 64)
HEVC_MAX_LONG_TERM_REF_PICS = _anonenum0.define('HEVC_MAX_LONG_TERM_REF_PICS', 32)
HEVC_MIN_LOG2_CTB_SIZE = _anonenum0.define('HEVC_MIN_LOG2_CTB_SIZE', 4)
HEVC_MAX_LOG2_CTB_SIZE = _anonenum0.define('HEVC_MAX_LOG2_CTB_SIZE', 6)
HEVC_MAX_CPB_CNT = _anonenum0.define('HEVC_MAX_CPB_CNT', 32)
HEVC_MAX_LUMA_PS = _anonenum0.define('HEVC_MAX_LUMA_PS', 35651584)
HEVC_MAX_WIDTH = _anonenum0.define('HEVC_MAX_WIDTH', 16888)
HEVC_MAX_HEIGHT = _anonenum0.define('HEVC_MAX_HEIGHT', 16888)
HEVC_MAX_TILE_ROWS = _anonenum0.define('HEVC_MAX_TILE_ROWS', 22)
HEVC_MAX_TILE_COLUMNS = _anonenum0.define('HEVC_MAX_TILE_COLUMNS', 20)
HEVC_MAX_SLICE_SEGMENTS = _anonenum0.define('HEVC_MAX_SLICE_SEGMENTS', 600)
HEVC_MAX_ENTRY_POINT_OFFSETS = _anonenum0.define('HEVC_MAX_ENTRY_POINT_OFFSETS', 2700)
HEVC_MAX_PALETTE_PREDICTOR_SIZE = _anonenum0.define('HEVC_MAX_PALETTE_PREDICTOR_SIZE', 128)
# enum HEVCScalabilityMask: single-bit flags (0x8000, 0x4000, 0x2000, 0x1000) plus the 16-bit all-ones maximum.
enum_HEVCScalabilityMask = CEnum(ctypes.c_uint32)
HEVC_SCALABILITY_DEPTH = enum_HEVCScalabilityMask.define('HEVC_SCALABILITY_DEPTH', 32768)
HEVC_SCALABILITY_MULTIVIEW = enum_HEVCScalabilityMask.define('HEVC_SCALABILITY_MULTIVIEW', 16384)
HEVC_SCALABILITY_SPATIAL = enum_HEVCScalabilityMask.define('HEVC_SCALABILITY_SPATIAL', 8192)
HEVC_SCALABILITY_AUXILIARY = enum_HEVCScalabilityMask.define('HEVC_SCALABILITY_AUXILIARY', 4096)
HEVC_SCALABILITY_MASK_MAX = enum_HEVCScalabilityMask.define('HEVC_SCALABILITY_MASK_MAX', 65535)
# enum HEVCAuxId: auxiliary picture identifiers (alpha=1, depth=2).
enum_HEVCAuxId = CEnum(ctypes.c_uint32)
HEVC_AUX_ALPHA = enum_HEVCAuxId.define('HEVC_AUX_ALPHA', 1)
HEVC_AUX_DEPTH = enum_HEVCAuxId.define('HEVC_AUX_DEPTH', 2)
# H265RawNALUnitHeader: three one-byte fields. The class is declared first and its _fields_ assigned afterwards,
# the pattern used for every struct in this file. Also introduces the uint8_t alias used by later structs.
class struct_H265RawNALUnitHeader(Struct): pass
uint8_t = ctypes.c_ubyte
struct_H265RawNALUnitHeader._fields_ = [
('nal_unit_type', uint8_t),
('nuh_layer_id', uint8_t),
('nuh_temporal_id_plus1', uint8_t),
]
H265RawNALUnitHeader = struct_H265RawNALUnitHeader
# H265RawProfileTierLevel: general_* scalars followed by sub_layer_* arrays of length 7
# (the same value as HEVC_MAX_SUB_LAYERS). Compatibility flags are 32 entries wide.
class struct_H265RawProfileTierLevel(Struct): pass
struct_H265RawProfileTierLevel._fields_ = [
('general_profile_space', uint8_t),
('general_tier_flag', uint8_t),
('general_profile_idc', uint8_t),
('general_profile_compatibility_flag', (uint8_t * 32)),
('general_progressive_source_flag', uint8_t),
('general_interlaced_source_flag', uint8_t),
('general_non_packed_constraint_flag', uint8_t),
('general_frame_only_constraint_flag', uint8_t),
('general_max_12bit_constraint_flag', uint8_t),
('general_max_10bit_constraint_flag', uint8_t),
('general_max_8bit_constraint_flag', uint8_t),
('general_max_422chroma_constraint_flag', uint8_t),
('general_max_420chroma_constraint_flag', uint8_t),
('general_max_monochrome_constraint_flag', uint8_t),
('general_intra_constraint_flag', uint8_t),
('general_one_picture_only_constraint_flag', uint8_t),
('general_lower_bit_rate_constraint_flag', uint8_t),
('general_max_14bit_constraint_flag', uint8_t),
('general_inbld_flag', uint8_t),
('general_level_idc', uint8_t),
('sub_layer_profile_present_flag', (uint8_t * 7)),
('sub_layer_level_present_flag', (uint8_t * 7)),
('sub_layer_profile_space', (uint8_t * 7)),
('sub_layer_tier_flag', (uint8_t * 7)),
('sub_layer_profile_idc', (uint8_t * 7)),
('sub_layer_profile_compatibility_flag', ((uint8_t * 32) * 7)),
('sub_layer_progressive_source_flag', (uint8_t * 7)),
('sub_layer_interlaced_source_flag', (uint8_t * 7)),
('sub_layer_non_packed_constraint_flag', (uint8_t * 7)),
('sub_layer_frame_only_constraint_flag', (uint8_t * 7)),
('sub_layer_max_12bit_constraint_flag', (uint8_t * 7)),
('sub_layer_max_10bit_constraint_flag', (uint8_t * 7)),
('sub_layer_max_8bit_constraint_flag', (uint8_t * 7)),
('sub_layer_max_422chroma_constraint_flag', (uint8_t * 7)),
('sub_layer_max_420chroma_constraint_flag', (uint8_t * 7)),
('sub_layer_max_monochrome_constraint_flag', (uint8_t * 7)),
('sub_layer_intra_constraint_flag', (uint8_t * 7)),
('sub_layer_one_picture_only_constraint_flag', (uint8_t * 7)),
('sub_layer_lower_bit_rate_constraint_flag', (uint8_t * 7)),
('sub_layer_max_14bit_constraint_flag', (uint8_t * 7)),
('sub_layer_inbld_flag', (uint8_t * 7)),
('sub_layer_level_idc', (uint8_t * 7)),
]
H265RawProfileTierLevel = struct_H265RawProfileTierLevel
# H265RawSubLayerHRDParameters: five parallel arrays of 32 entries (the same value as HEVC_MAX_CPB_CNT).
# Also introduces the uint32_t alias used by later structs.
class struct_H265RawSubLayerHRDParameters(Struct): pass
uint32_t = ctypes.c_uint32
struct_H265RawSubLayerHRDParameters._fields_ = [
('bit_rate_value_minus1', (uint32_t * 32)),
('cpb_size_value_minus1', (uint32_t * 32)),
('cpb_size_du_value_minus1', (uint32_t * 32)),
('bit_rate_du_value_minus1', (uint32_t * 32)),
('cbr_flag', (uint8_t * 32)),
]
H265RawSubLayerHRDParameters = struct_H265RawSubLayerHRDParameters
# H265RawHRDParameters: common HRD scalars, per-sub-layer arrays of length 7, and two embedded arrays of
# seven H265RawSubLayerHRDParameters (NAL and VCL). Also introduces the uint16_t alias.
class struct_H265RawHRDParameters(Struct): pass
uint16_t = ctypes.c_uint16
struct_H265RawHRDParameters._fields_ = [
('nal_hrd_parameters_present_flag', uint8_t),
('vcl_hrd_parameters_present_flag', uint8_t),
('sub_pic_hrd_params_present_flag', uint8_t),
('tick_divisor_minus2', uint8_t),
('du_cpb_removal_delay_increment_length_minus1', uint8_t),
('sub_pic_cpb_params_in_pic_timing_sei_flag', uint8_t),
('dpb_output_delay_du_length_minus1', uint8_t),
('bit_rate_scale', uint8_t),
('cpb_size_scale', uint8_t),
('cpb_size_du_scale', uint8_t),
('initial_cpb_removal_delay_length_minus1', uint8_t),
('au_cpb_removal_delay_length_minus1', uint8_t),
('dpb_output_delay_length_minus1', uint8_t),
('fixed_pic_rate_general_flag', (uint8_t * 7)),
('fixed_pic_rate_within_cvs_flag', (uint8_t * 7)),
('elemental_duration_in_tc_minus1', (uint16_t * 7)),
('low_delay_hrd_flag', (uint8_t * 7)),
('cpb_cnt_minus1', (uint8_t * 7)),
('nal_sub_layer_hrd_parameters', (H265RawSubLayerHRDParameters * 7)),
('vcl_sub_layer_hrd_parameters', (H265RawSubLayerHRDParameters * 7)),
]
H265RawHRDParameters = struct_H265RawHRDParameters
# H265RawVUI: video usability information fields; embeds a full H265RawHRDParameters by value.
class struct_H265RawVUI(Struct): pass
struct_H265RawVUI._fields_ = [
('aspect_ratio_info_present_flag', uint8_t),
('aspect_ratio_idc', uint8_t),
('sar_width', uint16_t),
('sar_height', uint16_t),
('overscan_info_present_flag', uint8_t),
('overscan_appropriate_flag', uint8_t),
('video_signal_type_present_flag', uint8_t),
('video_format', uint8_t),
('video_full_range_flag', uint8_t),
('colour_description_present_flag', uint8_t),
('colour_primaries', uint8_t),
('transfer_characteristics', uint8_t),
('matrix_coefficients', uint8_t),
('chroma_loc_info_present_flag', uint8_t),
('chroma_sample_loc_type_top_field', uint8_t),
('chroma_sample_loc_type_bottom_field', uint8_t),
('neutral_chroma_indication_flag', uint8_t),
('field_seq_flag', uint8_t),
('frame_field_info_present_flag', uint8_t),
('default_display_window_flag', uint8_t),
('def_disp_win_left_offset', uint16_t),
('def_disp_win_right_offset', uint16_t),
('def_disp_win_top_offset', uint16_t),
('def_disp_win_bottom_offset', uint16_t),
('vui_timing_info_present_flag', uint8_t),
('vui_num_units_in_tick', uint32_t),
('vui_time_scale', uint32_t),
('vui_poc_proportional_to_timing_flag', uint8_t),
('vui_num_ticks_poc_diff_one_minus1', uint32_t),
('vui_hrd_parameters_present_flag', uint8_t),
('hrd_parameters', H265RawHRDParameters),
('bitstream_restriction_flag', uint8_t),
('tiles_fixed_structure_flag', uint8_t),
('motion_vectors_over_pic_boundaries_flag', uint8_t),
('restricted_ref_pic_lists_flag', uint8_t),
('min_spatial_segmentation_idc', uint16_t),
('max_bytes_per_pic_denom', uint8_t),
('max_bits_per_min_cu_denom', uint8_t),
('log2_max_mv_length_horizontal', uint8_t),
('log2_max_mv_length_vertical', uint8_t),
]
H265RawVUI = struct_H265RawVUI
# H265RawExtensionData and H265RawVPS are declared by name only; no _fields_ are assigned to them in this file.
class struct_H265RawExtensionData(Struct): pass
H265RawExtensionData = struct_H265RawExtensionData
class struct_H265RawVPS(Struct): pass
H265RawVPS = struct_H265RawVPS
# H265RawSTRefPicSet: short-term reference picture set; per-picture arrays have 16 entries
# (the same value as HEVC_MAX_REFS).
class struct_H265RawSTRefPicSet(Struct): pass
struct_H265RawSTRefPicSet._fields_ = [
('inter_ref_pic_set_prediction_flag', uint8_t),
('delta_idx_minus1', uint8_t),
('delta_rps_sign', uint8_t),
('abs_delta_rps_minus1', uint16_t),
('used_by_curr_pic_flag', (uint8_t * 16)),
('use_delta_flag', (uint8_t * 16)),
('num_negative_pics', uint8_t),
('num_positive_pics', uint8_t),
('delta_poc_s0_minus1', (uint16_t * 16)),
('used_by_curr_pic_s0_flag', (uint8_t * 16)),
('delta_poc_s1_minus1', (uint16_t * 16)),
('used_by_curr_pic_s1_flag', (uint8_t * 16)),
]
H265RawSTRefPicSet = struct_H265RawSTRefPicSet
# H265RawScalingList: 4x6 tables (outer dimension 4, inner 6), with 64 signed delta coefficients per entry.
# Also introduces the signed int16_t / int8_t aliases.
class struct_H265RawScalingList(Struct): pass
int16_t = ctypes.c_int16
int8_t = ctypes.c_byte
struct_H265RawScalingList._fields_ = [
('scaling_list_pred_mode_flag', ((uint8_t * 6) * 4)),
('scaling_list_pred_matrix_id_delta', ((uint8_t * 6) * 4)),
('scaling_list_dc_coef_minus8', ((int16_t * 6) * 4)),
('scaling_list_delta_coeff', (((int8_t * 64) * 6) * 4)),
]
H265RawScalingList = struct_H265RawScalingList
# H265RawSPS and H265RawPPS are declared by name only; no _fields_ are assigned to them in this file.
class struct_H265RawSPS(Struct): pass
H265RawSPS = struct_H265RawSPS
class struct_H265RawPPS(Struct): pass
H265RawPPS = struct_H265RawPPS
# H265RawAUD: NAL unit header followed by a one-byte pic_type.
class struct_H265RawAUD(Struct): pass
struct_H265RawAUD._fields_ = [
('nal_unit_header', H265RawNALUnitHeader),
('pic_type', uint8_t),
]
H265RawAUD = struct_H265RawAUD
# H265RawSliceHeader: the parsed slice segment header. Embeds the NAL unit header and an inline
# H265RawSTRefPicSet; per-reference arrays have 16 entries (the same value as HEVC_MAX_REFS) and
# entry_point_offset_minus1 has 2700 entries (the same value as HEVC_MAX_ENTRY_POINT_OFFSETS).
class struct_H265RawSliceHeader(Struct): pass
struct_H265RawSliceHeader._fields_ = [
('nal_unit_header', H265RawNALUnitHeader),
('first_slice_segment_in_pic_flag', uint8_t),
('no_output_of_prior_pics_flag', uint8_t),
('slice_pic_parameter_set_id', uint8_t),
('dependent_slice_segment_flag', uint8_t),
('slice_segment_address', uint16_t),
('slice_reserved_flag', (uint8_t * 8)),
('slice_type', uint8_t),
('pic_output_flag', uint8_t),
('colour_plane_id', uint8_t),
('slice_pic_order_cnt_lsb', uint16_t),
('short_term_ref_pic_set_sps_flag', uint8_t),
('short_term_ref_pic_set', H265RawSTRefPicSet),
('short_term_ref_pic_set_idx', uint8_t),
('num_long_term_sps', uint8_t),
('num_long_term_pics', uint8_t),
('lt_idx_sps', (uint8_t * 16)),
('poc_lsb_lt', (uint8_t * 16)),
('used_by_curr_pic_lt_flag', (uint8_t * 16)),
('delta_poc_msb_present_flag', (uint8_t * 16)),
('delta_poc_msb_cycle_lt', (uint32_t * 16)),
('slice_temporal_mvp_enabled_flag', uint8_t),
('slice_sao_luma_flag', uint8_t),
('slice_sao_chroma_flag', uint8_t),
('num_ref_idx_active_override_flag', uint8_t),
('num_ref_idx_l0_active_minus1', uint8_t),
('num_ref_idx_l1_active_minus1', uint8_t),
('ref_pic_list_modification_flag_l0', uint8_t),
('list_entry_l0', (uint8_t * 16)),
('ref_pic_list_modification_flag_l1', uint8_t),
('list_entry_l1', (uint8_t * 16)),
('mvd_l1_zero_flag', uint8_t),
('cabac_init_flag', uint8_t),
('collocated_from_l0_flag', uint8_t),
('collocated_ref_idx', uint8_t),
('luma_log2_weight_denom', uint8_t),
('delta_chroma_log2_weight_denom', int8_t),
('luma_weight_l0_flag', (uint8_t * 16)),
('chroma_weight_l0_flag', (uint8_t * 16)),
('delta_luma_weight_l0', (int8_t * 16)),
('luma_offset_l0', (int16_t * 16)),
('delta_chroma_weight_l0', ((int8_t * 2) * 16)),
('chroma_offset_l0', ((int16_t * 2) * 16)),
('luma_weight_l1_flag', (uint8_t * 16)),
('chroma_weight_l1_flag', (uint8_t * 16)),
('delta_luma_weight_l1', (int8_t * 16)),
('luma_offset_l1', (int16_t * 16)),
('delta_chroma_weight_l1', ((int8_t * 2) * 16)),
('chroma_offset_l1', ((int16_t * 2) * 16)),
('five_minus_max_num_merge_cand', uint8_t),
('use_integer_mv_flag', uint8_t),
('slice_qp_delta', int8_t),
('slice_cb_qp_offset', int8_t),
('slice_cr_qp_offset', int8_t),
('slice_act_y_qp_offset', int8_t),
('slice_act_cb_qp_offset', int8_t),
('slice_act_cr_qp_offset', int8_t),
('cu_chroma_qp_offset_enabled_flag', uint8_t),
('deblocking_filter_override_flag', uint8_t),
('slice_deblocking_filter_disabled_flag', uint8_t),
('slice_beta_offset_div2', int8_t),
('slice_tc_offset_div2', int8_t),
('slice_loop_filter_across_slices_enabled_flag', uint8_t),
('num_entry_point_offsets', uint16_t),
('offset_len_minus1', uint8_t),
('entry_point_offset_minus1', (uint32_t * 2700)),
('slice_segment_header_extension_length', uint16_t),
('slice_segment_header_extension_data_byte', (uint8_t * 256)),
]
H265RawSliceHeader = struct_H265RawSliceHeader
# H265RawSlice is declared by name only; no _fields_ are assigned to it in this file.
class struct_H265RawSlice(Struct): pass
H265RawSlice = struct_H265RawSlice
# H265RawSEIBufferingPeriod: buffering-period SEI payload; the NAL and VCL delay/offset tables each hold 32 entries.
class struct_H265RawSEIBufferingPeriod(Struct): pass
struct_H265RawSEIBufferingPeriod._fields_ = [
('bp_seq_parameter_set_id', uint8_t),
('irap_cpb_params_present_flag', uint8_t),
('cpb_delay_offset', uint32_t),
('dpb_delay_offset', uint32_t),
('concatenation_flag', uint8_t),
('au_cpb_removal_delay_delta_minus1', uint32_t),
('nal_initial_cpb_removal_delay', (uint32_t * 32)),
('nal_initial_cpb_removal_offset', (uint32_t * 32)),
('nal_initial_alt_cpb_removal_delay', (uint32_t * 32)),
('nal_initial_alt_cpb_removal_offset', (uint32_t * 32)),
('vcl_initial_cpb_removal_delay', (uint32_t * 32)),
('vcl_initial_cpb_removal_offset', (uint32_t * 32)),
('vcl_initial_alt_cpb_removal_delay', (uint32_t * 32)),
('vcl_initial_alt_cpb_removal_offset', (uint32_t * 32)),
('use_alt_cpb_params_flag', uint8_t),
]
H265RawSEIBufferingPeriod = struct_H265RawSEIBufferingPeriod
# H265RawSEIPicTiming: picture-timing SEI payload; the per-decoding-unit arrays hold 600 entries
# (the same value as HEVC_MAX_SLICE_SEGMENTS).
class struct_H265RawSEIPicTiming(Struct): pass
struct_H265RawSEIPicTiming._fields_ = [
('pic_struct', uint8_t),
('source_scan_type', uint8_t),
('duplicate_flag', uint8_t),
('au_cpb_removal_delay_minus1', uint32_t),
('pic_dpb_output_delay', uint32_t),
('pic_dpb_output_du_delay', uint32_t),
('num_decoding_units_minus1', uint16_t),
('du_common_cpb_removal_delay_flag', uint8_t),
('du_common_cpb_removal_delay_increment_minus1', uint32_t),
('num_nalus_in_du_minus1', (uint16_t * 600)),
('du_cpb_removal_delay_increment_minus1', (uint32_t * 600)),
]
H265RawSEIPicTiming = struct_H265RawSEIPicTiming
# H265RawSEIPanScanRect: pan-scan rectangle SEI payload with up to three signed 32-bit rectangles.
# Also introduces the int32_t alias.
class struct_H265RawSEIPanScanRect(Struct): pass
int32_t = ctypes.c_int32
struct_H265RawSEIPanScanRect._fields_ = [
('pan_scan_rect_id', uint32_t),
('pan_scan_rect_cancel_flag', uint8_t),
('pan_scan_cnt_minus1', uint8_t),
('pan_scan_rect_left_offset', (int32_t * 3)),
('pan_scan_rect_right_offset', (int32_t * 3)),
('pan_scan_rect_top_offset', (int32_t * 3)),
('pan_scan_rect_bottom_offset', (int32_t * 3)),
('pan_scan_rect_persistence_flag', uint16_t),
]
H265RawSEIPanScanRect = struct_H265RawSEIPanScanRect
# H265RawSEIRecoveryPoint: signed 16-bit recovery POC count plus two one-byte flags.
class struct_H265RawSEIRecoveryPoint(Struct): pass
struct_H265RawSEIRecoveryPoint._fields_ = [
('recovery_poc_cnt', int16_t),
('exact_match_flag', uint8_t),
('broken_link_flag', uint8_t),
]
H265RawSEIRecoveryPoint = struct_H265RawSEIRecoveryPoint
# H265RawFilmGrainCharacteristics: film-grain SEI payload; the per-component tables have an outer
# dimension of 3 and 256 intensity intervals, with 6 signed model values per interval.
class struct_H265RawFilmGrainCharacteristics(Struct): pass
struct_H265RawFilmGrainCharacteristics._fields_ = [
('film_grain_characteristics_cancel_flag', uint8_t),
('film_grain_model_id', uint8_t),
('separate_colour_description_present_flag', uint8_t),
('film_grain_bit_depth_luma_minus8', uint8_t),
('film_grain_bit_depth_chroma_minus8', uint8_t),
('film_grain_full_range_flag', uint8_t),
('film_grain_colour_primaries', uint8_t),
('film_grain_transfer_characteristics', uint8_t),
('film_grain_matrix_coeffs', uint8_t),
('blending_mode_id', uint8_t),
('log2_scale_factor', uint8_t),
('comp_model_present_flag', (uint8_t * 3)),
('num_intensity_intervals_minus1', (uint8_t * 3)),
('num_model_values_minus1', (uint8_t * 3)),
('intensity_interval_lower_bound', ((uint8_t * 256) * 3)),
('intensity_interval_upper_bound', ((uint8_t * 256) * 3)),
('comp_model_value', (((int16_t * 6) * 256) * 3)),
('film_grain_characteristics_persistence_flag', uint8_t),
]
H265RawFilmGrainCharacteristics = struct_H265RawFilmGrainCharacteristics
# H265RawSEIDisplayOrientation: display-orientation SEI payload (flip flags, rotation, persistence).
class struct_H265RawSEIDisplayOrientation(Struct): pass
struct_H265RawSEIDisplayOrientation._fields_ = [
('display_orientation_cancel_flag', uint8_t),
('hor_flip', uint8_t),
('ver_flip', uint8_t),
('anticlockwise_rotation', uint16_t),
('display_orientation_repetition_period', uint16_t),
('display_orientation_persistence_flag', uint8_t),
]
H265RawSEIDisplayOrientation = struct_H265RawSEIDisplayOrientation
# H265RawSEIActiveParameterSets: active-parameter-sets SEI payload; 16 SPS ids and 63 per-layer SPS indices
# (the same values as HEVC_MAX_SPS_COUNT and HEVC_MAX_LAYERS).
class struct_H265RawSEIActiveParameterSets(Struct): pass
struct_H265RawSEIActiveParameterSets._fields_ = [
('active_video_parameter_set_id', uint8_t),
('self_contained_cvs_flag', uint8_t),
('no_parameter_set_update_flag', uint8_t),
('num_sps_ids_minus1', uint8_t),
('active_seq_parameter_set_id', (uint8_t * 16)),
('layer_sps_idx', (uint8_t * 63)),
]
H265RawSEIActiveParameterSets = struct_H265RawSEIActiveParameterSets
# H265RawSEIDecodedPictureHash: decoded-picture-hash SEI payload; each hash kind (16-byte MD5, 16-bit CRC,
# 32-bit checksum) has three slots.
class struct_H265RawSEIDecodedPictureHash(Struct): pass
struct_H265RawSEIDecodedPictureHash._fields_ = [
('hash_type', uint8_t),
('picture_md5', ((uint8_t * 16) * 3)),
('picture_crc', (uint16_t * 3)),
('picture_checksum', (uint32_t * 3)),
]
H265RawSEIDecodedPictureHash = struct_H265RawSEIDecodedPictureHash
# H265RawSEITimeCode: time-code SEI payload; every per-clock field is an array of three entries.
class struct_H265RawSEITimeCode(Struct): pass
struct_H265RawSEITimeCode._fields_ = [
('num_clock_ts', uint8_t),
('clock_timestamp_flag', (uint8_t * 3)),
('units_field_based_flag', (uint8_t * 3)),
('counting_type', (uint8_t * 3)),
('full_timestamp_flag', (uint8_t * 3)),
('discontinuity_flag', (uint8_t * 3)),
('cnt_dropped_flag', (uint8_t * 3)),
('n_frames', (uint16_t * 3)),
('seconds_value', (uint8_t * 3)),
('minutes_value', (uint8_t * 3)),
('hours_value', (uint8_t * 3)),
('seconds_flag', (uint8_t * 3)),
('minutes_flag', (uint8_t * 3)),
('hours_flag', (uint8_t * 3)),
('time_offset_length', (uint8_t * 3)),
('time_offset_value', (int32_t * 3)),
]
H265RawSEITimeCode = struct_H265RawSEITimeCode
# H265RawSEIAlphaChannelInfo: alpha-channel-info SEI payload.
class struct_H265RawSEIAlphaChannelInfo(Struct): pass
struct_H265RawSEIAlphaChannelInfo._fields_ = [
('alpha_channel_cancel_flag', uint8_t),
('alpha_channel_use_idc', uint8_t),
('alpha_channel_bit_depth_minus8', uint8_t),
('alpha_transparent_value', uint16_t),
('alpha_opaque_value', uint16_t),
('alpha_channel_incr_flag', uint8_t),
('alpha_channel_clip_flag', uint8_t),
('alpha_channel_clip_type_flag', uint8_t),
]
H265RawSEIAlphaChannelInfo = struct_H265RawSEIAlphaChannelInfo
# H265RawSEI3DReferenceDisplaysInfo: 3D-reference-displays SEI payload; per-display arrays hold 32 entries.
class struct_H265RawSEI3DReferenceDisplaysInfo(Struct): pass
struct_H265RawSEI3DReferenceDisplaysInfo._fields_ = [
('prec_ref_display_width', uint8_t),
('ref_viewing_distance_flag', uint8_t),
('prec_ref_viewing_dist', uint8_t),
('num_ref_displays_minus1', uint8_t),
('left_view_id', (uint16_t * 32)),
('right_view_id', (uint16_t * 32)),
('exponent_ref_display_width', (uint8_t * 32)),
('mantissa_ref_display_width', (uint8_t * 32)),
('exponent_ref_viewing_distance', (uint8_t * 32)),
('mantissa_ref_viewing_distance', (uint8_t * 32)),
('additional_shift_present_flag', (uint8_t * 32)),
('num_sample_shift_plus512', (uint16_t * 32)),
('three_dimensional_reference_displays_extension_flag', uint8_t),
]
H265RawSEI3DReferenceDisplaysInfo = struct_H265RawSEI3DReferenceDisplaysInfo
# H265RawSEI and the generic SEI message containers it embeds. All three classes are declared before any
# _fields_ is assigned because SEIRawMessageList points at SEIRawMessage and H265RawSEI embeds the list by value.
class struct_H265RawSEI(Struct): pass
class struct_SEIRawMessageList(Struct): pass
SEIRawMessageList = struct_SEIRawMessageList
class struct_SEIRawMessage(Struct): pass
SEIRawMessage = struct_SEIRawMessage
# size_t is fixed to 64 bits here
size_t = ctypes.c_uint64
struct_SEIRawMessage._fields_ = [
('payload_type', uint32_t),
('payload_size', uint32_t),
('payload', ctypes.c_void_p),
('payload_ref', ctypes.c_void_p),
('extension_data', ctypes.POINTER(uint8_t)),
('extension_bit_length', size_t),
]
struct_SEIRawMessageList._fields_ = [
('messages', ctypes.POINTER(SEIRawMessage)),
('nb_messages', ctypes.c_int32),
('nb_messages_allocated', ctypes.c_int32),
]
struct_H265RawSEI._fields_ = [
('nal_unit_header', H265RawNALUnitHeader),
('message_list', SEIRawMessageList),
]
H265RawSEI = struct_H265RawSEI
# H265RawFiller: NAL unit header followed by a 32-bit filler size.
class struct_H265RawFiller(Struct): pass
struct_H265RawFiller._fields_ = [
('nal_unit_header', H265RawNALUnitHeader),
('filler_size', uint32_t),
]
H265RawFiller = struct_H265RawFiller
# CodedBitstreamH265Context is declared by name only; no _fields_ are assigned to it in this file.
class struct_CodedBitstreamH265Context(Struct): pass
CodedBitstreamH265Context = struct_CodedBitstreamH265Context

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -180,6 +180,31 @@ class NVCopyQueue(NVCommandQueue):
def _submit(self, dev:NVDevice): self._submit_to_gpfifo(dev, dev.dma_gpfifo)
class NVVideoQueue(NVCommandQueue):
  """Command queue that emits NVC9B0 (video decoder class) methods and submits them on the device's video GPFIFO."""
  def decode_hevc_chunk(self, pic_desc:HCQBuffer, in_buf:HCQBuffer, out_buf:HCQBuffer, out_buf_pos:int, hist_bufs:list[HCQBuffer],
                        hist_pos:list[int], chroma_off:int, coloc_buf:HCQBuffer, filter_buf:HCQBuffer, intra_top_off:int, status_buf:HCQBuffer):
    """
    Queue the methods for one HEVC decode and return self so calls can be chained.

    Every buffer address is programmed as `va_addr >> 8`. `hist_bufs[i]` is bound to picture slot `hist_pos[i]` and
    `out_buf` to slot `out_buf_pos`; the chroma plane of each picture is taken `chroma_off` bytes into its buffer.
    """
    # NOTE(review): 0x52057 is an opaque control-params value here; its bit layout is not visible in this file
    self.nvm(4, nv_gpu.NVC9B0_SET_APPLICATION_ID, nv_gpu.NVC9B0_SET_APPLICATION_ID_ID_HEVC)
    self.nvm(4, nv_gpu.NVC9B0_SET_CONTROL_PARAMS, 0x52057)
    self.nvm(4, nv_gpu.NVC9B0_SET_DRV_PIC_SETUP_OFFSET, pic_desc.va_addr >> 8)
    self.nvm(4, nv_gpu.NVC9B0_SET_IN_BUF_BASE_OFFSET, in_buf.va_addr >> 8)

    # one luma/chroma offset pair per picture slot: history pictures first, the output picture last
    slots, pictures = hist_pos + [out_buf_pos], hist_bufs + [out_buf]
    for slot, picture in zip(slots, pictures):
      method_off = slot*4
      self.nvm(4, nv_gpu.NVC9B0_SET_PICTURE_LUMA_OFFSET0 + method_off, picture.va_addr >> 8)
      self.nvm(4, nv_gpu.NVC9B0_SET_PICTURE_CHROMA_OFFSET0 + method_off, picture.offset(chroma_off).va_addr >> 8)

    self.nvm(4, nv_gpu.NVC9B0_SET_COLOC_DATA_OFFSET, coloc_buf.va_addr >> 8)
    self.nvm(4, nv_gpu.NVC9B0_SET_NVDEC_STATUS_OFFSET, status_buf.va_addr >> 8)
    # the tile-sizes table is read from 0x200 bytes into the picture descriptor buffer
    self.nvm(4, nv_gpu.NVC9B0_HEVC_SET_TILE_SIZES_OFFSET, pic_desc.offset(0x200).va_addr >> 8)
    self.nvm(4, nv_gpu.NVC9B0_HEVC_SET_FILTER_BUFFER_OFFSET, filter_buf.va_addr >> 8)
    # the intra-top area shares the filter buffer, starting intra_top_off bytes in
    intra_top_addr = filter_buf.va_addr + intra_top_off
    self.nvm(4, nv_gpu.NVC9B0_SET_INTRA_TOP_BUF_OFFSET, intra_top_addr >> 8)
    self.nvm(4, nv_gpu.NVC9B0_EXECUTE, 0)
    return self

  def signal(self, signal:HCQSignal, value:sint=0):
    """Queue a semaphore write of `value` to the signal's address; returns self for chaining."""
    addr_words = data64(signal.value_addr)
    self.nvm(4, nv_gpu.NVC9B0_SEMAPHORE_A, *addr_words, value)
    self.nvm(4, nv_gpu.NVC9B0_SEMAPHORE_D, 0)
    return self

  def _submit(self, dev:NVDevice):
    self._submit_to_gpfifo(dev, dev.vid_gpfifo)
class NVArgsState(CLikeArgsState):
def __init__(self, buf:HCQBuffer, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
if MOCKGPU: prg.cbuf_0[80:82] = [len(bufs), len(vals)]
@@ -281,6 +306,16 @@ class NVAllocator(HCQAllocator['NVDevice']):
def _map(self, buf:HCQBuffer): return self.dev.iface.map(buf._base if buf._base is not None else buf)
def _encode_decode(self, bufout:HCQBuffer, bufin:HCQBuffer, desc_buf:HCQBuffer, hist:list[HCQBuffer], shape:tuple[int,...], frame_pos:int):
  """
  Submit one HEVC decode of `bufin` into `bufout` on the video queue, ordered on the device timeline.

  `hist` holds the previously decoded pictures, `desc_buf` the picture descriptor and `frame_pos` the picture
  slot of the output. The picture height is derived from shape[0] and the width is shape[1].
  """
  assert all(buf.va_addr % 0x100 == 0 for buf in hist + [bufin, bufout]), "all buffers must be 0x100 aligned"

  # shape[0] counts more rows than the picture height: h is two thirds of it, rounded down
  # (presumably luma rows plus half as many chroma rows, NV12-style -- TODO confirm)
  rows, w = shape[0], shape[1]
  if rows % 3 == 0: h = (2 * rows) // 3
  else: h = (2 * rows - 1) // 3
  self.dev._ensure_has_vid_hw(w, h)

  # history slots are the len(hist) positions preceding frame_pos in a ring of len(hist)+1 slots, oldest first
  ring = len(hist) + 1
  hist_pos = [(frame_pos - back) % ring for back in range(len(hist), 0, -1)]
  chroma_off = round_up(w, 64) * round_up(h, 64)

  queue = NVVideoQueue().wait(self.dev.timeline_signal, self.dev.timeline_value - 1)
  queue = queue.decode_hevc_chunk(desc_buf, bufin, bufout, frame_pos, hist, hist_pos, chroma_off,
                                  self.dev.vid_coloc_buf, self.dev.vid_filter_buf, self.dev.intra_top_off, self.dev.vid_stat_buf)
  queue.signal(self.dev.timeline_signal, self.dev.next_timeline()).submit(self.dev)
@dataclass
class GPFifo:
ring: MMIOInterface
@@ -358,6 +393,7 @@ class NVKIface:
self.gpfifo_class:int = next(c for c in [nv_gpu.BLACKWELL_CHANNEL_GPFIFO_A, nv_gpu.AMPERE_CHANNEL_GPFIFO_A] if c in self.nvclasses)
self.compute_class:int = next(c for c in [nv_gpu.BLACKWELL_COMPUTE_B, nv_gpu.ADA_COMPUTE_A, nv_gpu.AMPERE_COMPUTE_B] if c in self.nvclasses)
self.dma_class:int = next(c for c in [nv_gpu.BLACKWELL_DMA_COPY_B, nv_gpu.AMPERE_DMA_COPY_B] if c in self.nvclasses)
self.viddec_class:int|None = next((c for c in [nv_gpu.NVC9B0_VIDEO_DECODER] if c in self.nvclasses), None)
usermode = self.rm_alloc(self.dev.subdevice, self.usermode_class)
return usermode, MMIOInterface(self._gpu_map_to_cpu(usermode, mmio_sz:=0x10000), mmio_sz, fmt='I')
@@ -440,7 +476,15 @@ class NVKIface:
if mem.meta.has_cpu_mapping: FileIOInterface.munmap(cast(int, mem.va_addr), mem.size)
def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False) -> HCQBuffer:
if create_range: self.uvm(nv_gpu.UVM_CREATE_EXTERNAL_RANGE, nv_gpu.UVM_CREATE_EXTERNAL_RANGE_PARAMS(base=va_base, length=size))
if create_range:
self.uvm(nv_gpu.UVM_CREATE_EXTERNAL_RANGE, nv_gpu.UVM_CREATE_EXTERNAL_RANGE_PARAMS(base=va_base, length=size))
made = nv_gpu.NVOS46_PARAMETERS(hClient=self.root, hDevice=self.dev.nvdevice, hDma=self.dev.virtmem, hMemory=mem_handle, length=size,
flags=(nv_gpu.NVOS46_FLAGS_PAGE_SIZE_4KB<<8)|(nv_gpu.NVOS46_FLAGS_CACHE_SNOOP_ENABLE<<4)|(nv_gpu.NVOS46_FLAGS_DMA_OFFSET_FIXED_TRUE<<15),
dmaOffset=va_base)
nv_iowr(self.fd_ctl, nv_gpu.NV_ESC_RM_MAP_MEMORY_DMA, made)
if made.status != 0: raise RuntimeError(f"nv_sys_alloc 1 returned {get_error_str(made.status)}")
assert made.dmaOffset == va_base, f"made.dmaOffset != va_base {made.dmaOffset=} {va_base=}"
attrs = (nv_gpu.UvmGpuMappingAttributes*256)(nv_gpu.UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))
self.uvm(nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION, uvm_map:=nv_gpu.UVM_MAP_EXTERNAL_ALLOCATION_PARAMS(base=va_base, length=size,
@@ -472,6 +516,7 @@ class PCIIface(PCIIfaceBase):
# Setup classes for the GPU
self.gpfifo_class, self.compute_class, self.dma_class = (gsp:=self.dev_impl.gsp).gpfifo_class, gsp.compute_class, gsp.dma_class
self.viddec_class = None
def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, **kwargs) -> HCQBuffer:
# Force use of huge pages for large allocations. NVDev will attempt to use huge pages in any case,
@@ -499,6 +544,7 @@ class NVDevice(HCQCompiled[HCQSignal]):
vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_OPTIONAL_MULTIPLE_VASPACES)
self.nvdevice = self.iface.rm_alloc(self.iface.root, nv_gpu.NV01_DEVICE_0, device_params)
self.subdevice = self.iface.rm_alloc(self.nvdevice, nv_gpu.NV20_SUBDEVICE_0, nv_gpu.NV2080_ALLOC_PARAMETERS())
self.virtmem = self.iface.rm_alloc(self.nvdevice, nv_gpu.NV01_MEMORY_VIRTUAL, nv_gpu.NV_MEMORY_VIRTUAL_ALLOCATION_PARAMS(limit=0x1ffffffffffff))
self.usermode, self.gpu_mmio = self.iface.setup_usermode()
self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_PERF_BOOST, nv_gpu.NV2080_CTRL_PERF_BOOST_PARAMS(duration=0xffffffff,
@@ -514,14 +560,14 @@ class NVDevice(HCQCompiled[HCQSignal]):
channel_params = nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS(engineType=nv_gpu.NV2080_ENGINE_TYPE_GRAPHICS)
channel_group = self.iface.rm_alloc(self.nvdevice, nv_gpu.KEPLER_CHANNEL_GROUP_A, channel_params)
gpfifo_area = self.iface.alloc(0x200000, contiguous=True, cpu_access=True, force_devmem=True,
self.gpfifo_area = self.iface.alloc(0x300000, contiguous=True, cpu_access=True, force_devmem=True,
map_flags=(nv_gpu.NVOS33_FLAGS_CACHING_TYPE_WRITECOMBINED<<23))
ctxshare_params = nv_gpu.NV_CTXSHARE_ALLOCATION_PARAMETERS(hVASpace=vaspace, flags=nv_gpu.NV_CTXSHARE_ALLOCATION_FLAGS_SUBCONTEXT_ASYNC)
ctxshare = self.iface.rm_alloc(channel_group, nv_gpu.FERMI_CONTEXT_SHARE_A, ctxshare_params)
self.compute_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000, compute=True)
self.dma_gpfifo = self._new_gpu_fifo(gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000, compute=False)
self.compute_gpfifo = self._new_gpu_fifo(self.gpfifo_area, ctxshare, channel_group, offset=0, entries=0x10000, compute=True)
self.dma_gpfifo = self._new_gpu_fifo(self.gpfifo_area, ctxshare, channel_group, offset=0x100000, entries=0x10000, compute=False)
self.iface.rm_control(channel_group, nv_gpu.NVA06C_CTRL_CMD_GPFIFO_SCHEDULE, nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1))
self.cmdq_page:HCQBuffer = self.iface.alloc(0x200000, cpu_access=True)
@@ -542,22 +588,27 @@ class NVDevice(HCQCompiled[HCQSignal]):
self._setup_gpfifos()
def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, compute=False) -> GPFifo:
def _new_gpu_fifo(self, gpfifo_area, ctxshare, channel_group, offset=0, entries=0x400, compute=False, video=False) -> GPFifo:
notifier = self.iface.alloc(48 << 20, uncached=True)
params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(hObjectError=notifier.meta.hMemory, hObjectBuffer=gpfifo_area.meta.hMemory,
gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset))
params = nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS(gpFifoOffset=gpfifo_area.va_addr+offset, gpFifoEntries=entries, hContextShare=ctxshare,
hObjectError=notifier.meta.hMemory, hObjectBuffer=self.virtmem if video else gpfifo_area.meta.hMemory,
hUserdMemory=(ctypes.c_uint32*8)(gpfifo_area.meta.hMemory), userdOffset=(ctypes.c_uint64*8)(entries*8+offset), engineType=19 if video else 0)
gpfifo = self.iface.rm_alloc(channel_group, self.iface.gpfifo_class, params)
if compute:
self.debug_compute_obj, self.debug_channel = self.iface.rm_alloc(gpfifo, self.iface.compute_class), gpfifo
debugger_params = nv_gpu.NV83DE_ALLOC_PARAMETERS(hAppClient=self.iface.root, hClass3dObject=self.debug_compute_obj)
self.debugger = self.iface.rm_alloc(self.nvdevice, nv_gpu.GT200_DEBUGGER, debugger_params)
else: self.iface.rm_alloc(gpfifo, self.iface.dma_class)
elif not video: self.iface.rm_alloc(gpfifo, self.iface.dma_class)
else: self.iface.rm_alloc(gpfifo, self.iface.viddec_class)
if channel_group == self.nvdevice:
self.iface.rm_control(gpfifo, nv_gpu.NVA06F_CTRL_CMD_BIND, nv_gpu.NVA06F_CTRL_BIND_PARAMS(engineType=params.engineType))
self.iface.rm_control(gpfifo, nv_gpu.NVA06F_CTRL_CMD_GPFIFO_SCHEDULE, nv_gpu.NVA06F_CTRL_GPFIFO_SCHEDULE_PARAMS(bEnable=1))
ws_token_params = self.iface.rm_control(gpfifo, nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN,
nv_gpu.NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS(workSubmitToken=-1))
self.iface.setup_gpfifo_vm(gpfifo)
if ctxshare != 0: self.iface.setup_gpfifo_vm(gpfifo)
return GPFifo(ring=gpfifo_area.cpu_view().view(offset, entries*8, fmt='Q'), entries_count=entries, token=ws_token_params.workSubmitToken,
controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.cpu_view().addr + offset + entries * 8))
@@ -604,6 +655,24 @@ class NVDevice(HCQCompiled[HCQSignal]):
.setup(local_mem=self.shader_local_mem.va_addr, local_mem_tpc_bytes=bytes_per_tpc) \
.signal(self.timeline_signal, self.next_timeline()).submit(self)
def _ensure_has_vid_hw(self, w, h):
  """
  Make sure the video decode channel and its scratch buffers exist and are large enough for a `w` x `h` frame.

  On first use this creates the video GPFIFO, allocates the coloc, filter and status buffers and submits a setup
  on an `NVVideoQueue`. On later calls the coloc/filter buffers are reallocated only when the requested frame needs
  more space than they currently have. `self.intra_top_off` is updated on every call.

  Raises:
    RuntimeError: if the interface has no video decoder class.
    MemoryError: if a required buffer growth cannot be satisfied (`_realloc` is called with `force=True`).
  """
  if self.iface.viddec_class is None: raise RuntimeError(f"{self.device} Video decoder class not available.")

  aligned_w, aligned_h = round_up(w, 64), round_up(h, 64)
  # NOTE(review): the first term squares the aligned height; kept as in the original, but confirm whether
  # aligned_w * aligned_h was intended here.
  coloc_size = round_up((aligned_h * aligned_h) + (aligned_w * aligned_h // 16), 2 << 20)
  self.intra_top_off = aligned_h * (608 + 4864 + 152 + 2000)
  # `+` binds tighter than `<<`, so the unparenthesized `x + 64 << 10` shifted the whole sum left by 10 bits
  # (about 1024x oversizing). Parenthesize to add 64 KiB after the 64 KiB-aligned intra-top offset.
  filter_size = round_up(round_up(self.intra_top_off, 0x10000) + (64 << 10), 2 << 20)

  if not hasattr(self, 'vid_gpfifo'):
    # first use: the video channel is allocated directly under the device (ctxshare=0), in the third 1MB of gpfifo_area
    self.vid_gpfifo = self._new_gpu_fifo(self.gpfifo_area, 0, self.nvdevice, offset=0x200000, entries=2048, compute=False, video=True)
    self.vid_coloc_buf, self.vid_filter_buf = self.allocator.alloc(coloc_size), self.allocator.alloc(filter_size)
    self.vid_stat_buf = self.allocator.alloc(0x1000)
    NVVideoQueue().wait(self.timeline_signal, self.timeline_value - 1) \
                  .setup(copy_class=self.iface.viddec_class) \
                  .signal(self.timeline_signal, self.next_timeline()).submit(self)
  else:
    # grow-only: never shrink buffers that are already big enough
    if coloc_size > self.vid_coloc_buf.size: self.vid_coloc_buf, _ = self._realloc(self.vid_coloc_buf, coloc_size, force=True)
    if filter_size > self.vid_filter_buf.size: self.vid_filter_buf, _ = self._realloc(self.vid_filter_buf, filter_size, force=True)
def invalidate_caches(self):
if self.is_nvd(): self.iface.rm_control(self.subdevice, nv_gpu.NV2080_CTRL_CMD_INTERNAL_BUS_FLUSH_WITH_SYSMEMBAR, None)
else:

View File

@@ -432,10 +432,12 @@ class HCQCompiled(Compiled, Generic[SignalType]):
self.timeline_signal.value = 0
cast(HCQAllocatorBase, self.allocator).b_timeline = [0] * len(cast(HCQAllocatorBase, self.allocator).b)
def _realloc(self, oldbuf:HCQBuffer|None, new_size:int, options:BufferSpec|None=None) -> tuple[HCQBuffer, bool]:
def _realloc(self, oldbuf:HCQBuffer|None, new_size:int, options:BufferSpec|None=None, force=False) -> tuple[HCQBuffer, bool]:
if oldbuf is not None: self.allocator.free(oldbuf, oldbuf.size, options=options)
try: buf, realloced = self.allocator.alloc(new_size, options=options), True
except MemoryError: buf, realloced = self.allocator.alloc(oldbuf.size if oldbuf is not None else new_size, options=options), False
except MemoryError:
if force: raise
buf, realloced = self.allocator.alloc(oldbuf.size if oldbuf is not None else new_size, options=options), False
return buf, realloced
def _select_iface(self, *ifaces:Type):

View File

@@ -8,7 +8,7 @@ from tinygrad.helpers import argsort, all_same, cpu_profile, PCONTIG, colored
ALWAYS_CONTIGUOUS: set[Ops] = {Ops.CONTIGUOUS, Ops.ASSIGN, Ops.COPY, Ops.BUFFER, Ops.BUFFER_VIEW,
Ops.CONST, Ops.BIND, Ops.DEVICE, Ops.MSELECT, Ops.MSTACK, Ops.DEFINE_GLOBAL,
Ops.DEFINE_LOCAL, Ops.DEFINE_REG, Ops.LOAD, Ops.KERNEL}
Ops.DEFINE_LOCAL, Ops.DEFINE_REG, Ops.LOAD, Ops.KERNEL, Ops.ENCDEC}
def realize(ctx:dict[UOp, None], tr:UOp) -> None: ctx[tr] = None
@@ -24,12 +24,12 @@ def realize_assign(ctx:dict[UOp, None], a:UOp) -> None:
pm_generate_realize_map = PatternMatcher([
# always realize SINK src
(UPat(Ops.SINK, name="s"), lambda ctx,s: ctx.update((x.base, None) for x in s.src if x.base.op not in ALWAYS_CONTIGUOUS)),
# always realize COPY/BUFFER_VIEW/CONTIGUOUS/STORE
(UPat({Ops.COPY, Ops.BUFFER_VIEW, Ops.CONTIGUOUS, Ops.STORE}, name="tr"), realize),
# always realize COPY/BUFFER_VIEW/CONTIGUOUS/STORE/ENCDEC
(UPat({Ops.COPY, Ops.BUFFER_VIEW, Ops.CONTIGUOUS, Ops.STORE, Ops.ENCDEC}, name="tr"), realize),
# always realize REDUCE on outer ranges
(UPat(Ops.REDUCE, name="r"), lambda ctx,r: realize(ctx, r) if any(tr.arg[-1] == AxisType.OUTER for tr in r.src[1:]) else None),
# realize srcs of COPY, MSELECT, MSTACK
(UPat((Ops.COPY, Ops.MSELECT, Ops.MSTACK), name="rb"), realize_srcs),
# realize srcs of COPY, MSELECT, MSTACK, ENCDEC
(UPat((Ops.COPY, Ops.MSELECT, Ops.MSTACK, Ops.ENCDEC), name="rb"), realize_srcs),
# realize ASSIGN and input to assign (might be optimized out)
(UPat(Ops.ASSIGN, name="a"), realize_assign),
])

View File

@@ -117,7 +117,7 @@ earliest_rewrites = mop_cleanup+PatternMatcher([
# 3.5 cleanups
# Ops.NOOP happens when we have a COPY to the device the Tensor is already on. We treat it like COPY here for MSTACK.
ALWAYS_RUN_OPS = {Ops.CONTIGUOUS, Ops.COPY, Ops.ASSIGN, Ops.NOOP}
ALWAYS_RUN_OPS = {Ops.CONTIGUOUS, Ops.COPY, Ops.ASSIGN, Ops.ENCDEC, Ops.NOOP}
# you don't know in the first pass if axes are going to die, this happens if there's an EXPAND to the left
def cleanup_dead_axes(b:UOp):
@@ -494,7 +494,7 @@ def split_store(ctx:list[UOp], x:UOp) -> UOp|None:
# NOTE: the hack for COPY is here
for u in ret.toposort():
# TODO: this can be wrong if there's multiple of these
if u.op in {Ops.COPY, Ops.BUFFER_VIEW}:
if u.op in {Ops.COPY, Ops.BUFFER_VIEW, Ops.ENCDEC}:
ret = u
break
else:

View File

@@ -11,7 +11,7 @@ from tinygrad.helpers import suppress_finalizing, disable_gc
from tinygrad.gradient import compute_gradient
from tinygrad.mixin import OpMixin
from tinygrad.mixin.movement import _align_left
from tinygrad.uop.ops import smax, smin, resolve, UOp, Ops, sint, identity_element, all_metadata, _index_to_concrete_int, sint_to_uop
from tinygrad.uop.ops import smax, smin, resolve, UOp, Ops, sint, identity_element, all_metadata, _index_to_concrete_int, sint_to_uop, Variable
from tinygrad.engine.schedule import ScheduleItem, complete_create_schedule_with_vars
from tinygrad.device import Device, Buffer
from tinygrad.engine.realize import run_schedule
@@ -3564,6 +3564,19 @@ class Tensor(OpMixin):
def __eq__(self, x) -> Tensor: return self.eq(x) # type: ignore[override]
# ***** encoding/decoding ops *****
def decode_hevc_frame(self, frame_pos:Variable, shape:tuple[int,...], state:Tensor, ref_frames:list[Tensor]|None=None) -> Tensor:
  """
  Creates a Tensor by decoding an HEVC frame chunk held in this Tensor.

  You must provide the position of the frame (`frame_pos`, which must be a `Variable`), the output shape of the
  decoded data (`shape`), the HEVC context (`state`), and, if required by the chunk, the reference frames (`ref_frames`).
  """
  refs = [] if not ref_frames else [ref.contiguous() for ref in ref_frames]
  assert isinstance(frame_pos, Variable), "frame_pos must be a Variable"
  chunk = self.contiguous()
  ctx = state.contiguous()
  return chunk._apply_uop(UOp.encdec, ctx, *refs, extra_args=(frame_pos,), arg=(shape,))
# ***** functional nn ops *****
def linear(self, weight:Tensor, bias:Tensor|None=None, dtype:DTypeLike|None=None) -> Tensor:

View File

@@ -80,7 +80,7 @@ class Ops(FastEnum):
CONTIGUOUS = auto(); CONTIGUOUS_BACKWARD = auto(); DETACH = auto()
# buffer ops
BUFFERIZE = auto(); COPY = auto(); BUFFER = auto(); BUFFER_VIEW = auto(); MSELECT = auto(); MSTACK = auto()
BUFFERIZE = auto(); COPY = auto(); BUFFER = auto(); BUFFER_VIEW = auto(); MSELECT = auto(); MSTACK = auto(); ENCDEC = auto()
# the core 6 movement ops! these only exist in the tensor graph
RESHAPE = auto(); PERMUTE = auto(); EXPAND = auto(); PAD = auto(); SHRINK = auto(); FLIP = auto()

View File

@@ -232,6 +232,7 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
case Ops.CONST | Ops.DEFINE_VAR | Ops.BIND: return () if self._device is not None else None
case Ops.BUFFER: return (self.arg,)
case Ops.BUFFER_VIEW: return (self.arg[0],)
case Ops.ENCDEC: return self.arg[0]
case Ops.BUFFERIZE: return tuple([int(r.vmax+1) for r in self.src[1:]])
case Ops.DEFINE_GLOBAL | Ops.DEFINE_LOCAL | Ops.DEFINE_REG: return (self.ptrdtype.size,)
@@ -538,6 +539,7 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
def mselect(self, arg:int) -> UOp: return UOp(Ops.MSELECT, self.dtype, (self,), arg)
@property
def metadata(self) -> tuple[Metadata, ...]|None: return all_metadata.get(self, None)
def encdec(self, *src, arg=None):
  """Build an `Ops.ENCDEC` UOp with this UOp's dtype, with `self` followed by `src` as its sources and `arg` as its arg."""
  sources = (self,) + src
  return UOp(Ops.ENCDEC, self.dtype, src=sources, arg=arg)
# *** uop movement ops ***
@@ -1371,6 +1373,7 @@ pm_pyrender_extra = PatternMatcher([
(UPat(Ops.BUFFER, src=(UPat(Ops.UNIQUE, name="u"), UPat(Ops.DEVICE, name="d")), name="x"), lambda x,u,d:
f"UOp.new_buffer({repr(d.arg)}, {x.size}, {x.dtype}, {u.arg})"),
(UPat(Ops.COPY, src=(UPat(name="x"), UPat(Ops.DEVICE, name="d"))), lambda ctx,x,d: f"{ctx[x]}.copy_to_device({repr(d.arg)})"),
(UPat(Ops.ENCDEC, name="x"), lambda ctx,x: f"{ctx[x.src[0]]}.encdec({''.join([str(ctx[s])+', ' for s in x.src[1:]])}arg={x.arg!r})"),
(UPat(Ops.REDUCE_AXIS, name="r"), lambda ctx,r: f"{ctx[r.src[0]]}.r({r.arg[0]}, {r.arg[1]})"),
# NOTE: range has srcs sometimes after control flow
(UPat(Ops.RANGE, src=(UPat(Ops.CONST, name="c"),), allow_any_len=True, name="x"), lambda ctx,x,c:

View File

@@ -96,10 +96,11 @@ _tensor_spec = PatternMatcher([
(UPat(Ops.CONTIGUOUS, name="root", src=(UPat.var("x"),), allow_any_len=True, arg=None),
lambda root,x: root.dtype == x.dtype and all(u.op is Ops.RANGE for u in root.src[1:])),
# COPY/ALLREDUCE/MULTI
# COPY/ALLREDUCE/MULTI/ENCDEC
(UPat(Ops.COPY, name="copy", src=(UPat.var("x"), UPat(Ops.DEVICE)), arg=None), lambda copy,x: copy.dtype == x.dtype),
(UPat(Ops.ALLREDUCE, name="red", src=(UPat.var("x"), UPat(Ops.DEVICE))), lambda red,x: red.dtype == x.dtype and isinstance(red.arg, Ops)),
(UPat(Ops.MULTI, name="multi"), lambda multi: all(x.dtype == multi.dtype for x in multi.src) and isinstance(multi.arg, int)),
(UPat(Ops.ENCDEC, name="x"), lambda x: len(x.src) >= 2), # state + inbuffer
# REDUCE_AXIS is the reduce in the tensor graph
(UPat(Ops.REDUCE_AXIS, name="x"), lambda x: isinstance(x.arg, tuple) and len(x.arg) >= 2 and x.arg[0] in {Ops.ADD, Ops.MUL, Ops.MAX}),

View File

@@ -19,7 +19,7 @@ uops_colors = {Ops.LOAD: "#ffc0c0", Ops.STORE: "#87CEEB", Ops.CONST: "#e0e0e0",
Ops.RANGE: "#c8a0e0", Ops.ASSIGN: "#909090", Ops.BARRIER: "#ff8080", Ops.IF: "#c8b0c0", Ops.SPECIAL: "#c0c0ff",
Ops.INDEX: "#cef263", Ops.WMMA: "#efefc0", Ops.MULTI: "#f6ccff", Ops.KERNEL: "#3e7f55",
**{x:"#D8F9E4" for x in GroupOp.Movement}, **{x:"#ffffc0" for x in GroupOp.ALU}, Ops.THREEFRY:"#ffff80",
Ops.BUFFER_VIEW: "#E5EAFF", Ops.BUFFER: "#B0BDFF", Ops.COPY: "#a040a0",
Ops.BUFFER_VIEW: "#E5EAFF", Ops.BUFFER: "#B0BDFF", Ops.COPY: "#a040a0", Ops.ENCDEC: "#bf71b6",
Ops.ALLREDUCE: "#ff40a0", Ops.MSELECT: "#d040a0", Ops.MSTACK: "#d040a0", Ops.CONTIGUOUS: "#FFC14D",
Ops.BUFFERIZE: "#FF991C", Ops.REWRITE_ERROR: "#ff2e2e", Ops.AFTER: "#8A7866", Ops.END: "#524C46"}