diff --git a/test/test_speed_v_torch.py b/test/test_speed_v_torch.py
index f3e0b312c2..9753ee330d 100644
--- a/test/test_speed_v_torch.py
+++ b/test/test_speed_v_torch.py
@@ -10,6 +10,7 @@ import time
 import numpy as np
 np.set_printoptions(linewidth=160)
 from functools import partial
+from tinygrad.lazy import Device
 from tinygrad.ops import GlobalCounters
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Conv2d
@@ -19,6 +20,14 @@ from tinygrad.jit import TinyJit
 IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]
 
 torch_device = torch.device('mps' if getenv("MPS", 0) else ('cuda' if getenv("TORCHCUDA", 0) else 'cpu'))
+if str(torch_device) == "mps":
+  import torch.mps
+  sync = lambda: torch.mps.synchronize()
+elif str(torch_device) == "cuda":
+  import torch.cuda
+  sync = lambda: torch.cuda.synchronize()
+else:
+  sync = lambda: None
 
 def colorize_float(x):
   ret = f"{x:7.2f}x"
@@ -47,9 +56,11 @@ def helper_test_speed(f1, *args):
     if DEBUG >= 4: print("benchmark start")
     st = time.monotonic()
     ret = f1(*args)
-    # not ideal, it's copying (sometimes). why is this so slow in tinygrad?
-    if isinstance(ret, Tensor) or str(torch_device) == "cpu": ret.numpy()
-    else: ret.cpu().numpy()
+    if isinstance(ret, Tensor):
+      ret.realize()
+      Device[ret.device].synchronize()
+    else:
+      sync()
     et = (time.monotonic() - st) * 1000
     ets.append(et)
     if DEBUG >= 4: print("benchmark stop")
@@ -160,7 +171,7 @@ class TestSpeed(unittest.TestCase):
 
   def test_gemm(self):
     def f(a, b): return a @ b
-    helper_test_generic_square('gemm', 512, f, f)
+    helper_test_generic_square('gemm', 1024, f, f)
 
   def test_gemm_unrolled(self):
     N = 512
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 09e1cd7017..ce53b738bd 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -39,6 +39,7 @@ class Interpreted:
     self.fxn_for_op = fxn_for_op
     self.from_lazybuffer = from_lazybuffer
     self.to_underlying = to_underlying
+    self.synchronize = lambda: None
     self.codegen = None
 
   def exec_ast(self, ast:LazyOp, output=None, context=None):
@@ -122,8 +123,8 @@ class ASTRunner:
     return min([(self.timeit(rawbufs, local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))])[1]
 
 class Compiled:
-  def __init__(self, buffer: Type[RawBuffer], codegen, runtime):
-    self.buffer, self.codegen, self.runtime = buffer, codegen, runtime
+  def __init__(self, buffer: Type[RawBuffer], codegen, runtime, synchronize=lambda: None):
+    self.buffer, self.codegen, self.runtime, self.synchronize = buffer, codegen, runtime, synchronize
     self.method_cache: Dict[str, ASTRunner] = {}
 
   def exec_ast(self, ast:LazyOp, output):
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index 1bb5f0ea60..b14774ce52 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -45,4 +45,4 @@ class CUDACodegen(CStyleCodegen):
     gid = [f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' for i in range(3)],
     lid = [f'threadIdx.{chr(120+i)}' for i in range(3)])
 
-CUDABuffer = Compiled(RawCUDABuffer, CUDACodegen, CUDAProgram)
+CUDABuffer = Compiled(RawCUDABuffer, CUDACodegen, CUDAProgram, cuda.Context.synchronize)
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index f3e3c2cc6d..b4510dba93 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -75,4 +75,4 @@ class CLCodegen(CStyleCodegen):
     barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
     gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)], uses_vload=True)
 
-GPUBuffer = Compiled(CLBuffer, CLCodegen, CLProgram)
+GPUBuffer = Compiled(CLBuffer, CLCodegen, CLProgram, CL.cl_queue.finish)
diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py
index 350bed338c..e3a78e5d52 100644
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -14,6 +14,10 @@ class _METAL:
     self.mtl_buffers_in_flight: List[Any] = []
     self.device = Metal.MTLCreateSystemDefaultDevice()
     self.mtl_queue = self.device.newCommandQueue()
+  # TODO: is there a better way to do this?
+  def synchronize(self):
+    for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
+    self.mtl_buffers_in_flight.clear()
 METAL = _METAL()
 
 class RawMetalBuffer(RawBufferMapped):
@@ -22,8 +26,7 @@ class RawMetalBuffer(RawBufferMapped):
     self._buf.release()
     super().__del__()
   def _buffer(self):
-    for cbuf in METAL.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
-    METAL.mtl_buffers_in_flight.clear()
+    METAL.synchronize()
     return self._buf.contents().as_buffer(self._buf.length())
 
 def unwrap(x):
@@ -80,4 +83,4 @@ class MetalCodegen(CStyleCodegen):
     gid = [f"gid.{chr(120+i)}" for i in range(3)], lid = [f"lid.{chr(120+i)}" for i in range(3)],
     extra_args = ['uint3 gid [[thread_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]'])
 
-MetalBuffer = Compiled(RawMetalBuffer, MetalCodegen, MetalProgram)
+MetalBuffer = Compiled(RawMetalBuffer, MetalCodegen, MetalProgram, METAL.synchronize)
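
For reviewers, a minimal sketch of the new timing path this patch enables. The 1024x1024 size and randn inputs are illustrative only, not part of the patch, and this assumes a checkout with the patch applied:

import time
from tinygrad.tensor import Tensor
from tinygrad.lazy import Device

# time a matmul without paying for a device-to-host copy,
# the same pattern helper_test_speed now uses
a, b = Tensor.randn(1024, 1024), Tensor.randn(1024, 1024)
st = time.monotonic()
c = (a @ b).realize()           # queue the kernel; realize() does not block on the device
Device[c.device].synchronize()  # per-backend wait: CL.cl_queue.finish / cuda.Context.synchronize / METAL.synchronize
print(f"gemm: {(time.monotonic() - st) * 1000:.2f} ms")

This measures kernel execution itself, rather than the old ret.cpu().numpy() path, which also timed the copy back to the host.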