diff --git a/test/test_speed_v_torch.py b/test/test_speed_v_torch.py
index f3e0b312c2..9753ee330d 100644
--- a/test/test_speed_v_torch.py
+++ b/test/test_speed_v_torch.py
@@ -10,6 +10,7 @@ import time
 import numpy as np
 np.set_printoptions(linewidth=160)
 from functools import partial
+from tinygrad.lazy import Device
 from tinygrad.ops import GlobalCounters
 from tinygrad.tensor import Tensor
 from tinygrad.nn import Conv2d
@@ -19,6 +20,14 @@ from tinygrad.jit import TinyJit
 IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]
 
 torch_device = torch.device('mps' if getenv("MPS", 0) else ('cuda' if getenv("TORCHCUDA", 0) else 'cpu'))
+if str(torch_device) == "mps":
+  import torch.mps
+  sync = lambda: torch.mps.synchronize()
+elif str(torch_device) == "cuda":
+  import torch.cuda
+  sync = lambda: torch.cuda.synchronize()
+else:
+  sync = lambda: None
 
 def colorize_float(x):
   ret = f"{x:7.2f}x"
@@ -47,9 +56,11 @@ def helper_test_speed(f1, *args):
     if DEBUG >= 4: print("benchmark start")
     st = time.monotonic()
     ret = f1(*args)
-    # not ideal, it's copying (sometimes). why is this so slow in tinygrad?
-    if isinstance(ret, Tensor) or str(torch_device) == "cpu": ret.numpy()
-    else: ret.cpu().numpy()
+    if isinstance(ret, Tensor):
+      ret.realize()
+      Device[ret.device].synchronize()
+    else:
+      sync()
     et = (time.monotonic() - st) * 1000
     ets.append(et)
     if DEBUG >= 4: print("benchmark stop")
@@ -160,7 +171,7 @@ class TestSpeed(unittest.TestCase):
 
   def test_gemm(self):
     def f(a, b): return a @ b
-    helper_test_generic_square('gemm', 512, f, f)
+    helper_test_generic_square('gemm', 1024, f, f)
 
   def test_gemm_unrolled(self):
     N = 512
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index 09e1cd7017..ce53b738bd 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -39,6 +39,7 @@ class Interpreted:
     self.fxn_for_op = fxn_for_op
     self.from_lazybuffer = from_lazybuffer
     self.to_underlying = to_underlying
+    self.synchronize = lambda: None
     self.codegen = None
 
   def exec_ast(self, ast:LazyOp, output=None, context=None):
@@ -122,8 +123,8 @@ class ASTRunner:
     return min([(self.timeit(rawbufs, local_size), local_size) for local_size in random.sample(local_sizes, len(local_sizes))])[1]
 
 class Compiled:
-  def __init__(self, buffer: Type[RawBuffer], codegen, runtime):
-    self.buffer, self.codegen, self.runtime = buffer, codegen, runtime
+  def __init__(self, buffer: Type[RawBuffer], codegen, runtime, synchronize=lambda: None):
+    self.buffer, self.codegen, self.runtime, self.synchronize = buffer, codegen, runtime, synchronize
     self.method_cache: Dict[str, ASTRunner] = {}
 
   def exec_ast(self, ast:LazyOp, output):
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index 1bb5f0ea60..b14774ce52 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -45,4 +45,4 @@ class CUDACodegen(CStyleCodegen):
     gid = [f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' for i in range(3)],
     lid = [f'threadIdx.{chr(120+i)}' for i in range(3)])
 
-CUDABuffer = Compiled(RawCUDABuffer, CUDACodegen, CUDAProgram)
+CUDABuffer = Compiled(RawCUDABuffer, CUDACodegen, CUDAProgram, cuda.Context.synchronize)
diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py
index f3e3c2cc6d..b4510dba93 100644
--- a/tinygrad/runtime/ops_gpu.py
+++ b/tinygrad/runtime/ops_gpu.py
@@ -75,4 +75,4 @@ class CLCodegen(CStyleCodegen):
     barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
     gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)], uses_vload=True)
 
-GPUBuffer = Compiled(CLBuffer, CLCodegen, CLProgram)
+GPUBuffer = Compiled(CLBuffer, CLCodegen, CLProgram, CL.cl_queue.finish)
diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py
index 350bed338c..e3a78e5d52 100644
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -14,6 +14,10 @@ class _METAL:
     self.mtl_buffers_in_flight: List[Any] = []
     self.device = Metal.MTLCreateSystemDefaultDevice()
     self.mtl_queue = self.device.newCommandQueue()
+  # TODO: is there a better way to do this?
+  def synchronize(self):
+    for cbuf in self.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
+    self.mtl_buffers_in_flight.clear()
 METAL = _METAL()
 
 class RawMetalBuffer(RawBufferMapped):
@@ -22,8 +26,7 @@ class RawMetalBuffer(RawBufferMapped):
     self._buf.release()
     super().__del__()
   def _buffer(self):
-    for cbuf in METAL.mtl_buffers_in_flight: cbuf.waitUntilCompleted()
-    METAL.mtl_buffers_in_flight.clear()
+    METAL.synchronize()
     return self._buf.contents().as_buffer(self._buf.length())
 
 def unwrap(x):
@@ -80,4 +83,4 @@ class MetalCodegen(CStyleCodegen):
     gid = [f"gid.{chr(120+i)}" for i in range(3)], lid = [f"lid.{chr(120+i)}" for i in range(3)],
     extra_args = ['uint3 gid [[thread_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]'])
 
-MetalBuffer = Compiled(RawMetalBuffer, MetalCodegen, MetalProgram)
+MetalBuffer = Compiled(RawMetalBuffer, MetalCodegen, MetalProgram, METAL.synchronize)
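
For reviewers, a minimal sketch of the new timing path this patch enables. The 1024x1024 size and randn inputs are illustrative only, not part of the patch, and this assumes a checkout with the patch applied:

import time
from tinygrad.tensor import Tensor
from tinygrad.lazy import Device

# time a matmul without paying for a device-to-host copy,
# the same pattern helper_test_speed now uses
a, b = Tensor.randn(1024, 1024), Tensor.randn(1024, 1024)
st = time.monotonic()
c = (a @ b).realize()           # queue the kernel; realize() does not block on the device
Device[c.device].synchronize()  # per-backend wait: CL.cl_queue.finish / cuda.Context.synchronize / METAL.synchronize
print(f"gemm: {(time.monotonic() - st) * 1000:.2f} ms")

This measures kernel execution itself, rather than the old ret.cpu().numpy() path, which also timed the copy back to the host.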