From c0240855b9c1666132f9e9dd010a025382383d74 Mon Sep 17 00:00:00 2001 From: nimlgen <138685161+nimlgen@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:45:01 +0300 Subject: [PATCH] qcom has not transfer (#8075) * qcom alloc is not hcq alloc * maybe base? * test --- test/test_jit.py | 8 ++++++++ tinygrad/runtime/ops_qcom.py | 4 ++-- tinygrad/runtime/support/hcq.py | 19 +++++++++---------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index a88f9a52c4..a90ceab303 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -398,6 +398,14 @@ class TestJit(unittest.TestCase): for i in range(5): np.testing.assert_equal(g(Tensor([i]*3), Tensor.ones(3), Tensor.zeros(3)).numpy(), np.array([i+1]*3)) + def test_jitted_clone(self): + def f(a): return a.clone().realize() + jf = TinyJit(f) + for _ in range(5): + a = Tensor.randn(10, 10, device=Device.DEFAULT).realize() + ja = jf(a) + np.testing.assert_allclose(a.numpy(), ja.numpy(), atol=1e-4, rtol=1e-5) + @unittest.skipIf(CI and Device.DEFAULT in {"GPU", "CUDA", "METAL", "NV", "AMD"}, "no GPU CI") def test_jitted_transfers(self): d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1" diff --git a/tinygrad/runtime/ops_qcom.py b/tinygrad/runtime/ops_qcom.py index a44cf01aa1..ef9b2357c1 100644 --- a/tinygrad/runtime/ops_qcom.py +++ b/tinygrad/runtime/ops_qcom.py @@ -4,7 +4,7 @@ assert sys.platform != 'win32' from types import SimpleNamespace from typing import Tuple, List, Any, cast, Optional from tinygrad.device import BufferSpec -from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQSignal, HCQAllocator, HCQArgsState +from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState from tinygrad.runtime.autogen import kgsl, adreno, libc from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice from tinygrad.renderer.cstyle import QCOMRenderer @@ -276,7 +276,7 @@ class QCOMBuffer(HCQBuffer): # Texture specific definitions self.desc, self.ibo, self.pitch, self.real_stride = [0] * 16, [0] * 16, pitch, real_stride -class QCOMAllocator(HCQAllocator): +class QCOMAllocator(HCQAllocatorBase): def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer: if options.image is not None: imgw, imgh, itemsize_log = options.image.shape[1], options.image.shape[0], int(math.log2(options.image.itemsize)) diff --git a/tinygrad/runtime/support/hcq.py b/tinygrad/runtime/support/hcq.py index e713453ad0..b1bb59118d 100644 --- a/tinygrad/runtime/support/hcq.py +++ b/tinygrad/runtime/support/hcq.py @@ -333,7 +333,7 @@ class HCQCompiled(Compiled, Generic[SignalType]): gpu2cpu_copy_time_diff: decimal.Decimal = decimal.Decimal('nan') gpu2cpu_compute_time_diff: decimal.Decimal = decimal.Decimal('nan') - def __init__(self, device:str, allocator:HCQAllocator, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType], + def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType], comp_queue_t:Type[HWQueue], copy_queue_t:Optional[Type[HWQueue]]): self.device_id:int = int(device.split(":")[1]) if ":" in device else 0 self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t @@ -445,12 +445,12 @@ class HCQCompiled(Compiled, Generic[SignalType]): def _wrap_timeline_signal(self): self.timeline_signal, self._shadow_timeline_signal, self.timeline_value = self._shadow_timeline_signal, self.timeline_signal, 1 self.timeline_signal.value = 0 - cast(HCQAllocator, self.allocator).b_timeline = [0] * len(cast(HCQAllocator, self.allocator).b) + cast(HCQAllocatorBase, self.allocator).b_timeline = [0] * len(cast(HCQAllocatorBase, self.allocator).b) # Protocol for hcq compatible allocators for allocated buffers to contain VA address and it's size. class HCQBuffer(Protocol): va_addr:int; size:int # noqa: E702 -class HCQAllocator(LRUAllocator, Generic[DeviceType]): +class HCQAllocatorBase(LRUAllocator, Generic[DeviceType]): """ A base allocator class compatible with the HCQ (Hardware Command Queue) API. @@ -463,8 +463,13 @@ class HCQAllocator(LRUAllocator, Generic[DeviceType]): self.b_timeline, self.b_next = [0] * len(self.b), 0 super().__init__() - def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer: raise NotImplementedError("need hcq compat alloc") + def map(self, buf:HCQBuffer): pass + def _offset(self, buf, size:int, offset:int) -> HCQBuffer: + return type(buf)(va_addr=buf.va_addr + offset, size=size, **{k:v for k,v in buf.__dict__.items() if k not in ['va_addr', 'size']}, + **{x[0]:getattr(buf, x[0]) for x in getattr(buf, '_fields_', []) if x[0] not in ['va_addr', 'size']}, _base=buf) + +class HCQAllocator(HCQAllocatorBase, Generic[DeviceType]): def _copyin(self, dest:HCQBuffer, src:memoryview): assert self.dev.hw_copy_queue_t is not None with hcq_profile(self.dev, queue_type=self.dev.hw_copy_queue_t, desc=f"CPU -> {self.dev.device}", enabled=PROFILE): @@ -525,9 +530,3 @@ class HCQAllocator(LRUAllocator, Generic[DeviceType]): .wait(dest_dev.timeline_signal, dest_dev.timeline_value - 1) \ .signal(dest_dev.timeline_signal, dest_dev.timeline_value).submit(dest_dev) dest_dev.timeline_value += 1 - - def map(self, buf:HCQBuffer): pass - - def _offset(self, buf, size:int, offset:int) -> HCQBuffer: - return type(buf)(va_addr=buf.va_addr + offset, size=size, **{k:v for k,v in buf.__dict__.items() if k not in ['va_addr', 'size']}, - **{x[0]:getattr(buf, x[0]) for x in getattr(buf, '_fields_', []) if x[0] not in ['va_addr', 'size']}, _base=buf)