diff --git a/test/mockgpu/mockgpu.py b/test/mockgpu/mockgpu.py index 92655f049d..b0041f38a1 100644 --- a/test/mockgpu/mockgpu.py +++ b/test/mockgpu/mockgpu.py @@ -27,7 +27,7 @@ class TrackedMemoryView: self.wcb(self.mv, index) def cast(self, new_type, **kwargs): - self.mv = self.mv.cast(new_type, **kwargs) + self.mv = self.mv.cast('B').cast(new_type, **kwargs) return self @property diff --git a/test/test_hcq_iface.py b/test/test_hcq_iface.py new file mode 100644 index 0000000000..05c5732767 --- /dev/null +++ b/test/test_hcq_iface.py @@ -0,0 +1,37 @@ +import unittest, array, time +from tinygrad.helpers import mv_address +from tinygrad.runtime.support.hcq import MMIOInterface + +class TestHCQIface(unittest.TestCase): + def setUp(self): + self.size = 4 << 10 + self.buffer = bytearray(self.size) + self.mv = memoryview(self.buffer).cast('I') + self.mmio = MMIOInterface(mv_address(self.mv), self.size, fmt='I') + + def test_getitem_setitem(self): + self.mmio[1] = 0xdeadbeef + self.assertEqual(self.mmio[1], 0xdeadbeef) + values = array.array('I', [10, 20, 30, 40]) + self.mmio[2:6] = values + read_slice = self.mmio[2:6] + # self.assertIsInstance(read_slice, array.array) + self.assertEqual(read_slice, values.tolist()) + self.assertEqual(self.mv[2:6].tolist(), values.tolist()) + + def test_speed(self): + start = time.perf_counter() + for i in range(10000): + self.mmio[3:100] = array.array('I', [i] * 97) + _ = self.mmio[3:100] + end = time.perf_counter() + + mvstart = time.perf_counter() + for i in range(10000): + self.mv[3:100] = array.array('I', [i] * 97) + _ = self.mv[3:100].tolist() + mvend = time.perf_counter() + print(f"speed: hcq {end - start:.6f}s vs plain mv {mvend - mvstart:.6f}s") + +if __name__ == "__main__": + unittest.main() diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index 6e6e19b46a..7d8952a256 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -4,10 +4,10 @@ assert sys.platform != 'win32' from typing import Any, cast, Union, Type, ClassVar from dataclasses import dataclass from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator -from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU +from tinygrad.runtime.support.hcq import MMIOInterface, HWInterface, MOCKGPU from tinygrad.ops import sint from tinygrad.device import BufferSpec, CPUProgram -from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod, OSX +from tinygrad.helpers import getenv, mv_address, init_c_struct_t, round_up, data64, data64_le, DEBUG, prod, OSX from tinygrad.renderer.ptx import PTXRenderer from tinygrad.renderer.cstyle import NVRenderer from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler @@ -104,7 +104,7 @@ class NVCommandQueue(HWQueue[NVSignal, 'NVDevice', 'NVProgram', 'NVArgsState']): def bind(self, dev:NVDevice): self.binded_device = dev self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True)) - hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I") + hw_view = MMIOInterface(self.hw_page.va_addr, self.hw_page.size, fmt='I') for i, value in enumerate(self._q): hw_view[i] = value # From now on, the queue is on the device for faster submission. @@ -280,7 +280,7 @@ class NVAllocator(HCQAllocator['NVDevice']): @dataclass class GPFifo: - ring: memoryview + ring: MMIOInterface controls: nv_gpu.AmpereAControlGPFifo entries_count: int token: int @@ -416,7 +416,7 @@ class NVDevice(HCQCompiled[NVSignal]): self.nvdevice = rm_alloc(self.fd_ctl, nv_gpu.NV01_DEVICE_0, self.root, self.root, device_params).hObjectNew self.subdevice = rm_alloc(self.fd_ctl, nv_gpu.NV20_SUBDEVICE_0, self.root, self.nvdevice, None).hObjectNew self.usermode = rm_alloc(self.fd_ctl, nv_gpu.TURING_USERMODE_A, self.root, self.subdevice, None).hObjectNew - self.gpu_mmio = to_mv(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz).cast("I") + self.gpu_mmio = MMIOInterface(self._gpu_map_to_cpu(self.usermode, mmio_sz:=0x10000, flags=2), mmio_sz, fmt='I') self._setup_nvclasses() self._debug_mappings: dict[tuple[int, int], str] = dict() @@ -453,7 +453,7 @@ class NVDevice(HCQCompiled[NVSignal]): self.cmdq_page:HCQBuffer = self._gpu_alloc(0x200000, cpu_access=True, tag="cmdq") self.cmdq_allocator = BumpAllocator(size=self.cmdq_page.size, base=cast(int, self.cmdq_page.va_addr), wrap=True) - self.cmdq: memoryview = to_mv(cast(int, self.cmdq_page.va_addr), 0x200000).cast("I") + self.cmdq = MMIOInterface(cast(int, self.cmdq_page.va_addr), 0x200000, fmt='I') self.num_gpcs, self.num_tpc_per_gpc, self.num_sm_per_tpc, self.max_warps_per_sm, self.sm_version = self._query_gpu_info('num_gpcs', 'num_tpc_per_gpc', 'num_sm_per_tpc', 'max_warps_per_sm', 'sm_version') @@ -486,7 +486,7 @@ class NVDevice(HCQCompiled[NVSignal]): uvm.register_channel(self.fd_uvm, gpuUuid=self.gpu_uuid, rmCtrlFd=self.fd_ctl.fd, hClient=self.root, hChannel=gpfifo, base=channel_base, length=0x4000000) - return GPFifo(ring=to_mv(gpfifo_area.va_addr + offset, entries * 8).cast("Q"), entries_count=entries, token=ws_token_params.workSubmitToken, + return GPFifo(ring=MMIOInterface(gpfifo_area.va_addr + offset, entries*8, fmt='Q'), entries_count=entries, token=ws_token_params.workSubmitToken, controls=nv_gpu.AmpereAControlGPFifo.from_address(gpfifo_area.va_addr + offset + entries * 8)) def _query_gpu_info(self, *reqs): diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index 4055439cb9..2e9beaf369 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -1,12 +1,18 @@ from __future__ import annotations from typing import cast, Callable, Type, TypeVar, Generic, Any, ClassVar -import contextlib, decimal, statistics, time, ctypes, array, os, fcntl +import contextlib, decimal, statistics, time, ctypes, array, os, fcntl, struct from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up from tinygrad.renderer import Renderer from tinygrad.device import BufferSpec, Compiler, Compiled, LRUAllocator, ProfileRangeEvent, ProfileDeviceEvent, ProfileProgramEvent from tinygrad.ops import sym_infer, sint, Variable, UOp from tinygrad.runtime.autogen import libc +class MMIOInterface: + def __init__(self, va:int, sz:int, fmt='B'): self.mv, self.va, self.size, self.fmt = to_mv(va, sz).cast(fmt), va, sz, fmt + def __len__(self): return self.size // struct.calcsize(self.fmt) + def __getitem__(self, k) -> int|list[int]: return self.mv[k].tolist() if isinstance(k, slice) else self.mv[k] + def __setitem__(self, k, v:int|array.array): self.mv[k] = v + class HWInterface: """ Hardware Abstraction Layer for HCQ devices. The class provides a unified interface for interacting with hardware devices.