new style device (#2530)

* cpu tests pass * torch works * works * metal works * fix ops_disk * metal jit works * fix openpilot * llvm and clang work * fix webgpu * docs are rly broken * LRU works on metal * delete comment * revert name to ._buf. LRU only on Compiled * changes * allocator * allocator, getting closer * lru alloc * LRUAllocator * all pass * metal * cuda * test examples * linearizer * test fixes * fix custom + clean realize * fix hip * skip tests * fix tests * fix size=0 * fix MOCKHIP * fix thneed * copy better * simple * old style metal copy * fix thneed * np reshape * give cuda a device
2026-04-07 03:00:26 -04:00 · 2023-11-30 17:07:16 -08:00
parent e56511b59a
commit 2c363b5f0b
38 changed files with 572 additions and 1039 deletions
--- a/test/external/external_test_allocator_on_models.py
+++ b/test/external/external_test_allocator_on_models.py
@@ -1,125 +0,0 @@
-#!/usr/bin/env python
-import unittest, gc
-import numpy as np
-from tinygrad.tensor import Tensor
-from tinygrad.nn.state import get_state_dict
-from tinygrad.helpers import GlobalCounters
-from tinygrad.runtime.lib import RawBuffer, LRUAllocator
-from tinygrad.helpers import dtypes, prod
-from tinygrad import Device
-from test.helpers import derandomize_model
-
-from examples.llama import Transformer
-
-ALLOCATED_DEV_BUFS = 0
-class FakeDeviceBuffer:
-  def __init__(self, sz, dt, device):
-    self.id = 1
-    self.size = sz
-    self.dtype = dt
-    self.device = device
-
-    global ALLOCATED_DEV_BUFS
-    ALLOCATED_DEV_BUFS += 1
-class FakeAllocator(LRUAllocator):
-  def _do_alloc(self, size, dtype, device, **kwargs): return FakeDeviceBuffer(size, dtype, device)
-  def _do_free(self, buf):
-    buf.id -= 1
-    assert buf.id == 0, f"Free should be called once, but {buf.id}"
-  def __del__(self): # Fake allocator should clear all buffers after each test.
-    for v in self.cached_buffers.values():
-      for buf, _ in v: self._free_buffer(buf)
-
-FAKE_GLOBAL_ALLOCATOR = None
-class FakeBuffer(RawBuffer):
-  def __init__(self, size, dtype, device='0'):
-    global FAKE_GLOBAL_ALLOCATOR
-    super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
-    assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires 100% match of dtype and size."
-  @classmethod
-  def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
-  def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
-class FakeProgram:
-  def __init__(self, name:str, prg:str): pass
-  def __call__(self, *bufs, global_size, local_size, wait=False): pass
-
-def helper_test_correctness(gen, train):
-  from tinygrad.runtime.ops_gpu import CL, CLAllocator
-  old_alloc = CL.cl_allocator
-  CL.cl_allocator = CLAllocator(0)
-  no_alloc_result = train(*gen()).numpy()
-  Device[Device.DEFAULT].synchronize()
-  CL.cl_allocator = CLAllocator(512<<30) # Test cache correctness, so cache as much as possible, 512gb
-  for _ in range(4):
-    GlobalCounters.reset()
-    np.testing.assert_allclose(train(*gen()).numpy(), no_alloc_result, rtol=1e-3, atol=1e-5)
-    Device[Device.DEFAULT].synchronize()
-  assert len(CL.cl_allocator.cached_buffers) != 0, "Cache must be used"
-  CL.cl_allocator = old_alloc
-
-def __helper_test_alloc_count(gen, train):
-  was_alloc = ALLOCATED_DEV_BUFS
-  for _ in range(2):
-    train(*gen())
-  return ALLOCATED_DEV_BUFS - was_alloc
-
-def helper_test_alloc_count(mm, gen, train):
-  global FAKE_GLOBAL_ALLOCATOR
-  backup_program = Device[Device.DEFAULT].runtime
-  backup_buffer = Device[Device.DEFAULT].buffer
-  Device[Device.DEFAULT].runtime = FakeProgram
-  Device[Device.DEFAULT].buffer = FakeBuffer
-  Device[Device.DEFAULT].get_runner.cache_clear()
-  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(16<<30)
-  new_allocs = __helper_test_alloc_count(gen, train)
-  Device[Device.DEFAULT].get_runner.cache_clear()
-  FAKE_GLOBAL_ALLOCATOR = FakeAllocator(0)
-  old_allocs = __helper_test_alloc_count(gen, train)
-  print(f"{mm}: llama: old allocs count {old_allocs}, new allocs count {new_allocs}")
-  assert new_allocs < old_allocs, "Hmm, doesn't cache work any more?"
-  Device[Device.DEFAULT].runtime = backup_program
-  Device[Device.DEFAULT].buffer = backup_buffer
-  FAKE_GLOBAL_ALLOCATOR = None
-
-def check_gc():
-  if Device.DEFAULT == "GPU":
-    gc.collect() # Need to collect Tensors.
-    from extra.introspection import print_objects
-    assert print_objects() == 0
-
-class TestAllocators(unittest.TestCase):
-  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
-  def test_lru_allocator_tiny_llama(self):
-    old_type = Tensor.default_type
-    Tensor.default_type = dtypes.float16
-
-    args_tiny = {"dim": 1024, "hidden_dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
-    def __test():
-      model = Transformer(**args_tiny)
-      derandomize_model(model)
-      def test(t): return model(t, 0).realize()
-      helper_test_correctness(lambda: (Tensor([[1,]]),), test)
-    __test()
-    Tensor.default_type = old_type
-    check_gc()
-
-  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
-  def test_lru_allocator_tiny_llama_alloc_counts(self):
-    args_tiny = {"dim": 1024, "hidden_dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
-    def test_alloc_count(t):
-      model = Transformer(**args_tiny)
-      for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
-      return model(t, 0).realize()
-    helper_test_alloc_count("llama", lambda: (Tensor([[2,]]),), test_alloc_count)
-    check_gc()
-
-  @unittest.skip("huge for CI")
-  def test_stable_diffusion(self):
-    from examples.stable_diffusion import UNetModel
-    model = UNetModel()
-    derandomize_model(model)
-    def test(t, t2): return model(t, 801, t2).realize()
-    helper_test_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test)
-
-if __name__ == "__main__":
-  unittest.main()
--- a/test/external/external_test_speed_llama.py
+++ b/test/external/external_test_speed_llama.py
@@ -1,29 +1,27 @@
 # NOTE: this only tests the speed of the LLaMA codegen, it doesn't actually run the net
 import unittest, time
-import numpy as np
 from examples.llama import Transformer, MODEL_PARAMS
 from tinygrad.tensor import Tensor
 from tinygrad import Device
 from tinygrad.nn.state import get_state_dict
-from tinygrad.device import Compiled
+from tinygrad.device import Compiled, Allocator
 from tinygrad.helpers import Profiling
-from tinygrad.runtime.lib import RawBuffer

 class FakeProgram:
-  def __init__(self, name:str, prg:str): pass
+  def __init__(self, name:str, prg:bytes, bufs:int, vars:int=0): pass
  def __call__(self, *bufs, global_size, local_size, wait=False): pass

-class RawFakeBuffer(RawBuffer):
-  def _copyin(self, x:np.ndarray): pass
-  def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
+class FakeAllocator(Allocator):
+  def _alloc(self, sz, dtype): return None
+  def copyin(self, dest, src:memoryview): pass

 class TestLLaMASpeed(unittest.TestCase):
  @unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
  def test_llama_compile(self):
    backup_program = Device[Device.DEFAULT].runtime
-    backup_buffer = Device[Device.DEFAULT].buffer
+    backup_allocator = Device[Device.DEFAULT].allocator
    Device[Device.DEFAULT].runtime = FakeProgram
-    Device[Device.DEFAULT].buffer = RawFakeBuffer
+    Device[Device.DEFAULT].allocator = FakeAllocator()

    print("testing llama python run time")
    model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
@@ -48,7 +46,7 @@ class TestLLaMASpeed(unittest.TestCase):
      run_llama("profile")

    Device[Device.DEFAULT].runtime = backup_program
-    Device[Device.DEFAULT].buffer = backup_buffer
+    Device[Device.DEFAULT].allocator = backup_allocator

 if __name__ == '__main__':
  unittest.main()
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -6,7 +6,7 @@ from tinygrad.nn.state import get_parameters
 def derandomize(x):
  if isinstance(x, LazyOp):
    new_op = LoadOps.EMPTY if x.op == LoadOps.RAND else x.op
-    return LazyOp(new_op, tuple([derandomize(s) for s in x.src]), x.arg)
+    return LazyOp(new_op, tuple([derandomize(s) for s in x.src]), None if x.op == LoadOps.RAND else x.arg)
  x.op = derandomize(x.op)
  return x

--- a/test/test_allocators.py
+++ b/test/test_allocators.py
@@ -1,188 +0,0 @@
-#!/usr/bin/env python
-import unittest
-import pytest
-import numpy as np
-from weakref import ref
-
-from tinygrad.helpers import GlobalCounters
-from tinygrad.runtime.lib import RawBuffer, LRUAllocator
-from tinygrad.helpers import dtypes, prod
-from tinygrad import Device
-from tinygrad.tensor import Tensor
-
-def check_gc():
-  if Device.DEFAULT == "GPU":
-    from extra.introspection import print_objects
-    assert print_objects() == 0
-
-class FakeDeviceBuffer:
-  def __init__(self, sz, dt, device):
-    self.id = 1
-    self.size = sz
-    self.dtype = dt
-    self.device = device
-  def __del__(self):
-    assert self.id == 0, "Should called _do_free() before"
-
-class FakeAllocator(LRUAllocator):
-  def _do_alloc(self, size, dtype, device, **kwargs):
-    if size*dtype.itemsize > self._get_cur_free_space(device): raise Exception("OOM")
-    return FakeDeviceBuffer(size, dtype, device)
-  def _do_free(self, buf):
-    buf.id -= 1
-    assert buf.id == 0, f"Free should be called once, but {buf.id}"
-  def __del__(self): # Fake allocator should clear all buffers after each test.
-    for v in self.cached_buffers.values():
-      for buf, _ in v: self._free_buffer(buf)
-
-FAKE_GLOBAL_ALLOCATOR = None
-class FakeBuffer(RawBuffer):
-  def __init__(self, size, dtype, device='0'):
-    global FAKE_GLOBAL_ALLOCATOR
-    super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, **{'device': device})
-    assert self._buf.size == size and self._buf.dtype == dtype and self._buf.device == device, "This allocator requires 100% match of dtype and size."
-  @classmethod
-  def fromCPU(cls, x:np.ndarray, **kwargs): return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)
-  def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
-
-def alloc(allocator, size, dtype, **kwargs):
-  global FAKE_GLOBAL_ALLOCATOR
-  FAKE_GLOBAL_ALLOCATOR = allocator
-  buf = FakeBuffer(size, dtype, **kwargs)
-  assert buf.dtype == dtype and buf.size == size
-  FAKE_GLOBAL_ALLOCATOR = None
-  return buf
-
-def alloc_free_trace(allocator, size, dtype, **kwargs):
-  buf = alloc(allocator, size, dtype, **kwargs)
-  return ref(buf._buf)
-
-def cmp_trace_and_buf(buf, trace_ref): return trace_ref and trace_ref() == buf._buf
-
-class TestAllocators(unittest.TestCase):
-  def test_lru_allocator_reusage(self):
-    mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
-    def test():
-      lru_allocator = FakeAllocator(2048)
-      traced_buf = alloc_free_trace(lru_allocator, 16, dtypes.float32)
-      assert GlobalCounters.mem_cached - mc == 16*dtypes.float32.itemsize, "Buffer should be cached"
-      for _ in range(32):
-        def __test():
-          buf = alloc(lru_allocator, 16, dtypes.float32)
-          assert cmp_trace_and_buf(buf, traced_buf), "Buffer should be reused"
-        __test()
-
-      usedbuf = alloc(lru_allocator, 16, dtypes.float32)
-      for _ in range(32):
-        def __test():
-          buf = alloc(lru_allocator, 16, dtypes.float32)
-          assert usedbuf != buf, "Nobody should get used buffer"
-        __test()
-      assert GlobalCounters.mem_used - mu == 16*dtypes.float32.itemsize, "Only usedbuf is still allocated."
-    test()
-    check_gc()
-
-  def test_lru_allocator_cache_free(self):
-    mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
-    def test():
-      lru_allocator = FakeAllocator(128)
-      refs = []
-      for _ in range(32):
-        refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32))
-      for sz in range(1, 32):
-        alloc_free_trace(lru_allocator, sz, dtypes.float32)
-        assert GlobalCounters.mem_used + GlobalCounters.mem_cached - mc - mu <= 128, "Should not allocate on device more than allowed (128)"
-      for r in refs: assert r() is None, "All refs should be dead, since buffers were cleared from cache"
-    test()
-    check_gc()
-
-  def test_lru_allocator_multidevice(self):
-    def test():
-      lru_allocator = FakeAllocator(256)
-      refs=[]
-      for i in range(8):
-        refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32, device=str(i)))
-      for i in range(64):
-        def __test():
-          dev = str(i % 8)
-          buf = alloc(lru_allocator, 16, dtypes.float32, device=dev)
-          assert cmp_trace_and_buf(buf, refs[i%8]), "Buffer should be reused"
-        __test()
-      for r in refs: assert r() is not None, "All refs should be cached"
-    test()
-    check_gc()
-
-  def test_lru_allocator_failing_alloc_cleans_cache(self):
-    def test():
-      lru_allocator = FakeAllocator(128)
-      for size in range(1, 4):
-        alloc_free_trace(lru_allocator, size, dtypes.float32, device='0')
-      assert len(lru_allocator.aging_order['0']) == 3, "All buffers should be cached"
-      assert lru_allocator.free_space['0'] == 128 - 24, "24 bytes to be used by current cached buffers"
-
-      def always_raise_exception(*args, **kwargs):
-        raise MemoryError("OOM")
-      lru_allocator._do_alloc = always_raise_exception
-
-      with pytest.raises(Exception):
-        alloc(lru_allocator, 5, dtypes.float32, device='0')
-      assert len(lru_allocator.aging_order['0']) == 0, "All buffers should be freed from cache due to failing alloc"
-    test()
-    check_gc()
-
-  def test_lru_allocator_fail_first_alloc_pass_after_clear_cahce(self):
-    def test():
-      lru_allocator = FakeAllocator(128)
-      for size in range(1, 4):
-        alloc_free_trace(lru_allocator, size, dtypes.float32, device='0')
-      cache_length = 3
-      assert len(lru_allocator.aging_order['0']) == cache_length, "All buffers should be cached"
-      assert lru_allocator.free_space['0'] == 128 - 24, "24 bytes to be used by current cached buffers"
-
-      original_do_alloc = lru_allocator._do_alloc  # save the original method
-      def single_fail_then_pass(*args, **kwargs):
-        lru_allocator._do_alloc = original_do_alloc  # restore the original method
-        raise MemoryError("OOM")
-      lru_allocator._do_alloc = single_fail_then_pass
-
-      alloc(lru_allocator, 5, dtypes.float32, device='0')
-      assert len(lru_allocator.aging_order['0']) < cache_length, "Some buffers should be cleaned as first alloc failed"
-    test()
-    check_gc()
-
-  @unittest.skip("failing in CI")
-  def test_gpu_copyout(self):
-    def test():
-      from tinygrad.runtime.ops_gpu import CL
-
-      # Allocation to init the allocator.
-      tx = Tensor.rand(1)
-      tx.realize()
-      free_space = CL.cl_allocator.free_space[tx.lazydata.realized._device]
-
-      # Spawning 128mb objects to fill half of free_space
-      will_allocate = free_space // 3
-      trash_allocation_size = free_space // 2
-
-      def sp():
-        trash_buffer = Tensor.rand(trash_allocation_size // 4)
-        trash_buffer.realize()
-      sp()
-
-      xx = Tensor.rand(will_allocate // 4)
-      _ = xx.numpy()
-    test()
-    check_gc()
-
-  def test_lru_allocator_massive_buffer(self):
-    with self.assertRaises(AssertionError) as context: alloc(allocator := FakeAllocator(), size := 1e13, dtypes.int8)
-    self.assertEqual(str(context.exception), f"out of memory - requested: {size/1e9:5.2f} GB, available: {allocator._get_cur_free_space('0')/1e9:5.2f} GB")
-
-  @unittest.skipIf(Device.DEFAULT != "METAL", "only applies to Metal")
-  def test_lru_allocator_metal_max_buffer_length(self):
-    from tinygrad.runtime.ops_metal import METAL
-    with self.assertRaises(AssertionError) as context: METAL.allocator._do_alloc(buf_len := (max_buf_len := METAL.device.maxBufferLength()+1), dtypes.int8, '0')
-    self.assertEqual(str(context.exception), f"Buffer length of {buf_len/1e9:5.2f} GB exceeds Metal's max buffer length of {max_buf_len/1e9:5.2f} GB.")
-
-if __name__ == "__main__":
-  unittest.main()
--- a/test/test_custom_function.py
+++ b/test/test_custom_function.py
@@ -8,7 +8,7 @@ from tinygrad.helpers import prod, dtypes

 # *** first, we implement the atan2 op at the lowest level ***
 # `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
-from tinygrad.lazy import LazyBuffer, create_lazybuffer
+from tinygrad.lazy import Buffer, create_lazybuffer
 from tinygrad.device import CompiledASTRunner, Device
 from tinygrad.shape.shapetracker import ShapeTracker
 import pytest
@@ -16,17 +16,15 @@ import pytest
 pytestmark = pytest.mark.webgpu

 # we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer
-def atan2_gpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
-  assert a.device == "GPU" and b.device == "GPU", "gpu function requires GPUBuffers"
+def atan2_gpu(ret:Buffer, a:Buffer, b:Buffer):
  assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"
-  ret.realized = Device[ret.device].buffer(prod(ret.shape), ret.dtype)
  CompiledASTRunner(None, "atan2_gpu", """
    __kernel void atan2_gpu(global float *c, global float *a, global float *b) {
      int idx = get_global_id(0);
      c[idx] = atan2(a[idx], b[idx]);
-    }""", global_size=[prod(ret.shape)]).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret.realized, a.realized, b.realized])
+    }""", global_size=[ret.size], bufcount=3).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret, a, b])

-def atan2_cpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer): ret.realized._copyin(np.arctan2(a.realized._buf, b.realized._buf))
+def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(a._buf, b._buf), requirements='C').data)

 # *** second, we write the ATan2 mlop ***
 # NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
--- a/test/test_lazybuffer.py
+++ b/test/test_lazybuffer.py
@@ -7,6 +7,7 @@ from tinygrad.tensor import Tensor
 from tinygrad.jit import CacheCollector

 class TestLazyBuffer(unittest.TestCase):
+  @unittest.skip("it doesn't work like this anymore")
  def test_fromcpu_buffer_sharing(self):
    a = np.arange(8)
    assert LazyBuffer.fromCPU(a).realized._buf is a
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -3,7 +3,7 @@ import unittest, os

 from tinygrad.codegen.kernel import Opt, OptOps, tensor_cores
 from tinygrad.codegen.linearizer import Linearizer, UOp, UOps
-from tinygrad.device import Compiled, Device
+from tinygrad.device import Compiled, Device, Buffer
 from tinygrad.ops import BufferOps, MemBuffer, ConstBuffer, LazyOp, LoadOps, TernaryOps
 from tinygrad.shape.shapetracker import ShapeTracker
 from tinygrad.shape.view import View
@@ -140,7 +140,7 @@ def helper_realized_ast(r:Tensor):
  s = r.lazydata.schedule()
  run_schedule(s[:-1])  # run all kernels except the last one
  # now all input LazyBuffers buffers in s[-1] should be realized
-  output_buffer = Device[s[-1].out.device].buffer(prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args())  # allocate an output buffer
+  output_buffer = Buffer(s[-1].out.device, prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args())  # allocate an output buffer
  return s[-1].ast, [output_buffer] + [l.realized for l in s[-1].inputs]

 class TestFloat4(unittest.TestCase):
@@ -367,7 +367,7 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
      for opt in opts:
        k.apply_opt(opt)
    prg = to_prg(k)
-    real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
+    real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
    prg.exec(real_bufs)
    np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)

@@ -381,7 +381,7 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
  k = Linearizer(realized_ast)
  k.hand_coded_optimizations()
  prg = Device[Device.DEFAULT].to_program(k)
-  real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
+  real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
  prg.exec(real_bufs)
  np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
  for x in opts: # Check custom transformations if any.
--- a/test/test_search.py
+++ b/test/test_search.py
@@ -2,7 +2,7 @@ import unittest

 from tinygrad.codegen.linearizer import Linearizer
 from tinygrad.features.search import time_linearizer
-from tinygrad.device import Compiled, Device
+from tinygrad.device import Compiled, Device, Buffer
 from tinygrad.ops import LoadOps
 from tinygrad.tensor import Tensor

@@ -12,7 +12,7 @@ class TestTimeLinearizer(unittest.TestCase):

  def test_reasonable_time(self):
    si = [si for si in Tensor([1,2,3,4]).add(1).lazydata.schedule() if si.ast.op not in LoadOps][0]
-    rawbufs = [Device[Device.DEFAULT].buffer(si.out.st.size(), si.out.dtype)] + [Device[Device.DEFAULT].buffer(x.st.size(), x.dtype) for x in si.inputs]
+    rawbufs = [Buffer(Device.DEFAULT, si.out.st.size(), si.out.dtype)] + [Buffer(Device.DEFAULT, x.st.size(), x.dtype) for x in si.inputs]
    tm = time_linearizer(Linearizer(si.ast), rawbufs, allow_test_size=False, cnt=10)
    assert tm > 0 and tm != float('inf')

--- a/test/test_uops.py
+++ b/test/test_uops.py
@@ -2,16 +2,16 @@ from typing import Optional, Tuple, Any, List
 import unittest, math
 import numpy as np
 from tinygrad.helpers import dtypes, getenv, DType, PtrDType
-from tinygrad.tensor import Device
+from tinygrad.device import Buffer, Device
 from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
 from tinygrad.device import CompiledASTRunner, Compiled
 from tinygrad.codegen.linearizer import UOps, UOp

-def _uops_to_prg(uops):
+def _uops_to_prg(uops, bufcount):
  src, runtime_args = Device[Device.DEFAULT].renderer("test", uops)
  return CompiledASTRunner(None, "test", src,
                           [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None, [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None,
-                           runtime_args=runtime_args).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)
+                           runtime_args=runtime_args, bufcount=bufcount).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)

 def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp:
  uops.append(UOp(uop, dtype, tuple(vin), arg))
@@ -24,9 +24,9 @@ def _test_single_value(vals, op, dtype):
  loads = (uop(uops, UOps.LOAD, dtype, [buf_loads[i], uop(uops, UOps.CONST, dtypes.int32, (), 0)]) for i in range(len(vals)))
  alu = uop(uops, UOps.ALU, dtype, loads, op)
  uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
-  buf = Device[Device.DEFAULT].buffer(1, dtype)
-  buf2 = [Device[Device.DEFAULT].buffer.fromCPU(np.array([a], dtype=dtype.np)) for a in vals]
-  prg = _uops_to_prg(uops)
+  buf = Buffer(Device.DEFAULT, 1, dtype)
+  buf2 = [Buffer.fromCPU(Device.DEFAULT, np.array([a], dtype=dtype.np)) for a in vals]
+  prg = _uops_to_prg(uops, 1+len(buf2))
  prg.exec([buf]+buf2)
  return buf.toCPU()[0]

@@ -36,8 +36,8 @@ def _test_single_value_const(vals, op, dtype):
  loads = (uop(uops, UOps.CONST, dtype, [], a) for a in vals)
  alu = uop(uops, UOps.ALU, dtype, loads, op)
  uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
-  buf = Device[Device.DEFAULT].buffer(1, dtype)
-  prg = _uops_to_prg(uops)
+  buf = Buffer(Device.DEFAULT, 1, dtype)
+  prg = _uops_to_prg(uops, 1)
  prg.exec([buf])
  return buf.toCPU()[0]

--- a/test/unit/test_disk_tensor.py
+++ b/test/unit/test_disk_tensor.py
@@ -3,8 +3,7 @@ import unittest
 import numpy as np
 from tinygrad.tensor import Tensor, Device
 from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
-from tinygrad.helpers import dtypes, fetch, temp
-from tinygrad.runtime.ops_disk import RawDiskBuffer
+from tinygrad.helpers import fetch, temp
 from tinygrad.helpers import Timing

 def compare_weights_both(url):
@@ -40,11 +39,6 @@ class TestRawDiskBuffer(unittest.TestCase):
      with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
        f.readinto(tst)

-  def test_mmap_read_speed(self):
-    db = RawDiskBuffer(test_size, dtype=dtypes.uint8, device=test_fn)
-    tst = np.empty(test_size, np.uint8)
-    with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
-      np.copyto(tst, db.toCPU())
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu doesn't support uint8 datatype")
 class TestSafetensors(unittest.TestCase):
  def test_real_safetensors(self):