diff --git a/test/mockgpu/nv/nvgpu.py b/test/mockgpu/nv/nvgpu.py index 8a7f8e3074..f450a16ab1 100644 --- a/test/mockgpu/nv/nvgpu.py +++ b/test/mockgpu/nv/nvgpu.py @@ -92,7 +92,7 @@ class GPFIFO: qmd = qmd_struct_t.from_address(qmd_addr) prg_addr = qmd.program_address_lower + (qmd.program_address_upper << 32) const0 = to_mv(qmd.constant_buffer_addr_lower_0 + (qmd.constant_buffer_addr_upper_0 << 32), 0x160).cast('I') - args_cnt, vals_cnt = const0[0], const0[1] + args_cnt, vals_cnt = const0[80], const0[81] args_addr = qmd.constant_buffer_addr_lower_0 + (qmd.constant_buffer_addr_upper_0 << 32) + 0x160 args = to_mv(args_addr, args_cnt*8).cast('Q') vals = to_mv(args_addr + args_cnt*8, vals_cnt*4).cast('I') diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 368b3c03b9..679823e69e 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -141,6 +141,7 @@ class NVComputeQueue(NVCommandQueue): self.bind_sints_to_ptr(*global_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_RASTER_WIDTH[1] // 8, fmt='I') self.bind_sints_to_ptr(*local_size, ptr=qmd_addr + nv_gpu.NVC6C0_QMDV03_00_CTA_THREAD_DIMENSION0[1] // 8, fmt='H') + self.bind_sints_to_ptr(*local_size, *global_size, ptr=args_state.ptr, fmt='I') qmd.constant_buffer_addr_upper_0, qmd.constant_buffer_addr_lower_0 = data64(args_state.ptr) if self.active_qmd is None: @@ -188,7 +189,7 @@ class NVCopyQueue(NVCommandQueue): class NVArgsState(CLikeArgsState): def __init__(self, ptr:int, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()): - if MOCKGPU: prg.constbuffer_0[0:2] = [len(bufs), len(vals)] + if MOCKGPU: prg.constbuffer_0[80:82] = [len(bufs), len(vals)] super().__init__(ptr, prg, bufs, vals=vals, prefix=prg.constbuffer_0) class NVProgram(HCQProgram):