new style device (#2530)

* cpu tests pass

* torch works

* works

* metal works

* fix ops_disk

* metal jit works

* fix openpilot

* llvm and clang work

* fix webgpu

* docs are rly broken

* LRU works on metal

* delete comment

* revert name to ._buf. LRU only on Compiled

* changes

* allocator

* allocator, getting closer

* lru alloc

* LRUAllocator

* all pass

* metal

* cuda

* test examples

* linearizer

* test fixes

* fix custom + clean realize

* fix hip

* skip tests

* fix tests

* fix size=0

* fix MOCKHIP

* fix thneed

* copy better

* simple

* old style metal copy

* fix thneed

* np reshape

* give cuda a device
This commit is contained in:
George Hotz
2023-11-30 17:07:16 -08:00
committed by GitHub
parent e56511b59a
commit 2c363b5f0b
38 changed files with 572 additions and 1039 deletions

View File

@@ -1,125 +0,0 @@
#!/usr/bin/env python
import unittest, gc
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.nn.state import get_state_dict
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad import Device
from test.helpers import derandomize_model
from examples.llama import Transformer
ALLOCATED_DEV_BUFS = 0  # number of FakeDeviceBuffer objects constructed so far


class FakeDeviceBuffer:
    """Stand-in for a device allocation that only records what was requested.

    Each construction increments the module-level ``ALLOCATED_DEV_BUFS`` counter.
    """

    def __init__(self, sz, dt, device):
        global ALLOCATED_DEV_BUFS
        ALLOCATED_DEV_BUFS += 1
        # id starts at 1; FakeAllocator._do_free decrements it and expects 0.
        self.id = 1
        self.size, self.dtype, self.device = sz, dt, device
class FakeAllocator(LRUAllocator):
    """LRUAllocator whose backing allocations are FakeDeviceBuffer objects."""

    def _do_alloc(self, size, dtype, device, **kwargs):
        """Create a fake buffer for the request; extra keyword arguments are ignored."""
        return FakeDeviceBuffer(size, dtype, device)

    def _do_free(self, buf):
        """Mark ``buf`` as freed and check that this is its first and only free."""
        buf.id = buf.id - 1
        assert buf.id == 0, f"Free should be called once, but {buf.id}"

    def __del__(self):
        # Fake allocator should clear all buffers after each test.
        for bucket in self.cached_buffers.values():
            for cached, _ in bucket:
                self._free_buffer(cached)
FAKE_GLOBAL_ALLOCATOR = None  # allocator passed to every FakeBuffer built while it is set


class FakeBuffer(RawBuffer):
    """RawBuffer that takes its storage from the module-level FAKE_GLOBAL_ALLOCATOR."""

    def __init__(self, size, dtype, device='0'):
        super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, device=device)
        backing = self._buf
        assert backing.size == size and backing.dtype == dtype and backing.device == device, "This allocator requires 100% match of dtype and size."

    @classmethod
    def fromCPU(cls, x:np.ndarray, **kwargs):
        """Build a buffer with the element count and dtype of ``x``; ``x`` itself is not passed on."""
        n_elements = prod(x.shape)
        return cls(n_elements, dtypes.from_np(x.dtype), **kwargs)

    def toCPU(self):
        """Return an uninitialised numpy array of this buffer's size and dtype."""
        return np.empty(self.size, dtype=self.dtype.np)
class FakeProgram:
    """No-op program: accepts a name and source, and ignores every launch."""

    def __init__(self, name:str, prg:str):
        return None

    def __call__(self, *bufs, global_size, local_size, wait=False):
        return None
def helper_test_correctness(gen, train):
    """Check that ``train`` produces the same output with and without buffer caching.

    Args:
        gen: zero-argument callable returning the positional arguments for ``train``.
        train: callable whose result exposes ``.numpy()``.

    The first run, under ``CLAllocator(0)``, is the reference. Four further runs
    under a 512 GB cache must match it within rtol=1e-3 / atol=1e-5, and the cache
    must end up non-empty.

    The allocator that was installed on entry is restored in a ``finally`` block,
    so a failing run or assertion cannot leak the test allocator into later tests.
    """
    from tinygrad.runtime.ops_gpu import CL, CLAllocator
    old_alloc = CL.cl_allocator
    try:
        CL.cl_allocator = CLAllocator(0)
        no_alloc_result = train(*gen()).numpy()
        Device[Device.DEFAULT].synchronize()
        CL.cl_allocator = CLAllocator(512<<30) # Test cache correctness, so cache as much as possible, 512gb
        for _ in range(4):
            GlobalCounters.reset()
            np.testing.assert_allclose(train(*gen()).numpy(), no_alloc_result, rtol=1e-3, atol=1e-5)
            Device[Device.DEFAULT].synchronize()
        assert len(CL.cl_allocator.cached_buffers) != 0, "Cache must be used"
    finally:
        CL.cl_allocator = old_alloc
def __helper_test_alloc_count(gen, train):
    """Return how much ALLOCATED_DEV_BUFS grows over two ``train(*gen())`` calls."""
    before = ALLOCATED_DEV_BUFS
    remaining = 2
    while remaining:
        train(*gen())
        remaining -= 1
    return ALLOCATED_DEV_BUFS - before
def helper_test_alloc_count(mm, gen, train):
    """Assert that a caching allocator performs fewer device allocations than a non-caching one.

    Args:
        mm: label printed in front of the allocation counts.
        gen: zero-argument callable returning the positional arguments for ``train``.
        train: callable run against the fake runtime and buffer.

    The default device's ``runtime`` and ``buffer`` are swapped for FakeProgram and
    FakeBuffer. Allocations are counted once with ``FakeAllocator(16<<30)`` and once
    with ``FakeAllocator(0)``; the first count must be strictly smaller.

    The device attributes and FAKE_GLOBAL_ALLOCATOR are restored in a ``finally``
    block, so a failing run or assertion does not leave the fakes installed.
    """
    global FAKE_GLOBAL_ALLOCATOR
    backup_program = Device[Device.DEFAULT].runtime
    backup_buffer = Device[Device.DEFAULT].buffer
    try:
        Device[Device.DEFAULT].runtime = FakeProgram
        Device[Device.DEFAULT].buffer = FakeBuffer
        # Drop cached runners before each measurement.
        Device[Device.DEFAULT].get_runner.cache_clear()
        FAKE_GLOBAL_ALLOCATOR = FakeAllocator(16<<30)
        new_allocs = __helper_test_alloc_count(gen, train)
        Device[Device.DEFAULT].get_runner.cache_clear()
        FAKE_GLOBAL_ALLOCATOR = FakeAllocator(0)
        old_allocs = __helper_test_alloc_count(gen, train)
        print(f"{mm}: llama: old allocs count {old_allocs}, new allocs count {new_allocs}")
        assert new_allocs < old_allocs, "Hmm, doesn't cache work any more?"
    finally:
        Device[Device.DEFAULT].runtime = backup_program
        Device[Device.DEFAULT].buffer = backup_buffer
        FAKE_GLOBAL_ALLOCATOR = None
def check_gc():
    """On the GPU backend, collect garbage and assert that print_objects() reports 0."""
    if Device.DEFAULT != "GPU":
        return
    gc.collect() # Need to collect Tensors.
    from extra.introspection import print_objects
    leftover = print_objects()
    assert leftover == 0
class TestAllocators(unittest.TestCase):
    # Model-level checks of the LRU allocator: cached and uncached runs must agree,
    # and caching must reduce the number of device allocations.

    @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
    def test_lru_allocator_tiny_llama(self):
        # Run a small float16 Transformer with and without buffer caching and compare outputs.
        old_type = Tensor.default_type
        Tensor.default_type = dtypes.float16
        try:
            args_tiny = {"dim": 1024, "hidden_dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
            # The model lives only inside __test, so it is out of scope before check_gc() runs.
            def __test():
                model = Transformer(**args_tiny)
                derandomize_model(model)
                def test(t): return model(t, 0).realize()
                helper_test_correctness(lambda: (Tensor([[1,]]),), test)
            __test()
        finally:
            # Restore the global default dtype even when the comparison fails,
            # so a failure here cannot change the dtype seen by later tests.
            Tensor.default_type = old_type
        check_gc()

    @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
    def test_lru_allocator_tiny_llama_alloc_counts(self):
        # Count fake device allocations for the same small Transformer, with and without caching.
        args_tiny = {"dim": 1024, "hidden_dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
        def test_alloc_count(t):
            model = Transformer(**args_tiny)
            # Replace every weight with an empty tensor of the same shape and dtype.
            for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
            return model(t, 0).realize()
        helper_test_alloc_count("llama", lambda: (Tensor([[2,]]),), test_alloc_count)
        check_gc()

    @unittest.skip("huge for CI")
    def test_stable_diffusion(self):
        # Same cached-vs-uncached comparison on the stable diffusion UNet.
        from examples.stable_diffusion import UNetModel
        model = UNetModel()
        derandomize_model(model)
        def test(t, t2): return model(t, 801, t2).realize()
        helper_test_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test)
if __name__ == "__main__":
unittest.main()

View File

@@ -1,29 +1,27 @@
# NOTE: this only tests the speed of the LLaMA codegen, it doesn't actually run the net
import unittest, time
import numpy as np
from examples.llama import Transformer, MODEL_PARAMS
from tinygrad.tensor import Tensor
from tinygrad import Device
from tinygrad.nn.state import get_state_dict
from tinygrad.device import Compiled
from tinygrad.device import Compiled, Allocator
from tinygrad.helpers import Profiling
from tinygrad.runtime.lib import RawBuffer
class FakeProgram:
def __init__(self, name:str, prg:str): pass
def __init__(self, name:str, prg:bytes, bufs:int, vars:int=0): pass
def __call__(self, *bufs, global_size, local_size, wait=False): pass
class RawFakeBuffer(RawBuffer):
def _copyin(self, x:np.ndarray): pass
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
class FakeAllocator(Allocator):
def _alloc(self, sz, dtype): return None
def copyin(self, dest, src:memoryview): pass
class TestLLaMASpeed(unittest.TestCase):
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
def test_llama_compile(self):
backup_program = Device[Device.DEFAULT].runtime
backup_buffer = Device[Device.DEFAULT].buffer
backup_allocator = Device[Device.DEFAULT].allocator
Device[Device.DEFAULT].runtime = FakeProgram
Device[Device.DEFAULT].buffer = RawFakeBuffer
Device[Device.DEFAULT].allocator = FakeAllocator()
print("testing llama python run time")
model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
@@ -48,7 +46,7 @@ class TestLLaMASpeed(unittest.TestCase):
run_llama("profile")
Device[Device.DEFAULT].runtime = backup_program
Device[Device.DEFAULT].buffer = backup_buffer
Device[Device.DEFAULT].allocator = backup_allocator
if __name__ == '__main__':
unittest.main()

View File

@@ -6,7 +6,7 @@ from tinygrad.nn.state import get_parameters
def derandomize(x):
if isinstance(x, LazyOp):
new_op = LoadOps.EMPTY if x.op == LoadOps.RAND else x.op
return LazyOp(new_op, tuple([derandomize(s) for s in x.src]), x.arg)
return LazyOp(new_op, tuple([derandomize(s) for s in x.src]), None if x.op == LoadOps.RAND else x.arg)
x.op = derandomize(x.op)
return x

View File

@@ -1,188 +0,0 @@
#!/usr/bin/env python
import unittest
import pytest
import numpy as np
from weakref import ref
from tinygrad.helpers import GlobalCounters
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
from tinygrad.helpers import dtypes, prod
from tinygrad import Device
from tinygrad.tensor import Tensor
def check_gc():
    """On the GPU backend, assert that print_objects() reports 0; do nothing elsewhere."""
    if Device.DEFAULT != "GPU":
        return
    from extra.introspection import print_objects
    assert print_objects() == 0
class FakeDeviceBuffer:
    """Stand-in for a device allocation that checks it was freed before being destroyed."""

    def __init__(self, sz, dt, device):
        self.size, self.dtype, self.device = sz, dt, device
        # id starts at 1; FakeAllocator._do_free brings it down to 0.
        self.id = 1

    def __del__(self):
        # A non-zero id here means the buffer is being destroyed without a free.
        assert self.id == 0, "Should called _do_free() before"
class FakeAllocator(LRUAllocator):
    """LRUAllocator backed by FakeDeviceBuffer objects, with a free-space check on allocation."""

    def _do_alloc(self, size, dtype, device, **kwargs):
        """Return a fake buffer, or raise ``Exception("OOM")`` when the request exceeds the free space."""
        needed_bytes = size*dtype.itemsize
        if needed_bytes > self._get_cur_free_space(device):
            raise Exception("OOM")
        return FakeDeviceBuffer(size, dtype, device)

    def _do_free(self, buf):
        """Mark ``buf`` as freed and check that this is its first and only free."""
        buf.id = buf.id - 1
        assert buf.id == 0, f"Free should be called once, but {buf.id}"

    def __del__(self):
        # Fake allocator should clear all buffers after each test.
        for bucket in self.cached_buffers.values():
            for cached, _ in bucket:
                self._free_buffer(cached)
FAKE_GLOBAL_ALLOCATOR = None  # allocator passed to every FakeBuffer built while it is set


class FakeBuffer(RawBuffer):
    """RawBuffer that takes its storage from the module-level FAKE_GLOBAL_ALLOCATOR."""

    def __init__(self, size, dtype, device='0'):
        super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, device=device)
        backing = self._buf
        assert backing.size == size and backing.dtype == dtype and backing.device == device, "This allocator requires 100% match of dtype and size."

    @classmethod
    def fromCPU(cls, x:np.ndarray, **kwargs):
        """Build a buffer with the element count and dtype of ``x``; ``x`` itself is not passed on."""
        n_elements = prod(x.shape)
        return cls(n_elements, dtypes.from_np(x.dtype), **kwargs)

    def toCPU(self):
        """Return an uninitialised numpy array of this buffer's size and dtype."""
        return np.empty(self.size, dtype=self.dtype.np)
def alloc(allocator, size, dtype, **kwargs):
    """Create a FakeBuffer of ``size`` elements of ``dtype`` backed by ``allocator``.

    FakeBuffer reads its allocator from the module-level FAKE_GLOBAL_ALLOCATOR, so
    the global is set for the duration of the construction only. It is cleared in a
    ``finally`` block: several tests expect the construction to raise, and leaving
    the global set would keep a reference to ``allocator`` after the test is done.

    Extra keyword arguments are forwarded to FakeBuffer. Whatever FakeBuffer raises
    is propagated unchanged.
    """
    global FAKE_GLOBAL_ALLOCATOR
    FAKE_GLOBAL_ALLOCATOR = allocator
    try:
        buf = FakeBuffer(size, dtype, **kwargs)
        assert buf.dtype == dtype and buf.size == size
    finally:
        FAKE_GLOBAL_ALLOCATOR = None
    return buf
def alloc_free_trace(allocator, size, dtype, **kwargs):
    """Allocate a buffer, let it go out of scope, and return a weak reference to its ``_buf``."""
    released = alloc(allocator, size, dtype, **kwargs)
    # The weak reference is taken while the wrapper is still alive; the wrapper
    # itself is dropped when this function returns.
    tracker = ref(released._buf)
    return tracker
def cmp_trace_and_buf(buf, trace_ref):
    """Tell whether ``trace_ref`` resolves to the object held in ``buf._buf``.

    A falsy ``trace_ref`` is returned as-is without being called.
    """
    if not trace_ref:
        return trace_ref
    return trace_ref() == buf._buf
# Unit tests for LRUAllocator behaviour, driven through FakeAllocator / FakeBuffer.
# Most tests build their allocator inside a nested test() function and call check_gc() after it returns.
class TestAllocators(unittest.TestCase):
# A buffer that was released is handed out again for the next same-sized request,
# while a buffer that is still held is never handed out to anyone else.
def test_lru_allocator_reusage(self):
# Counter baselines, so the assertions only look at what this test adds.
mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
def test():
lru_allocator = FakeAllocator(2048)
traced_buf = alloc_free_trace(lru_allocator, 16, dtypes.float32)
assert GlobalCounters.mem_cached - mc == 16*dtypes.float32.itemsize, "Buffer should be cached"
for _ in range(32):
# NOTE(review): the allocation sits in a nested function, presumably so buf is released before the next iteration.
def __test():
buf = alloc(lru_allocator, 16, dtypes.float32)
assert cmp_trace_and_buf(buf, traced_buf), "Buffer should be reused"
__test()
usedbuf = alloc(lru_allocator, 16, dtypes.float32)
for _ in range(32):
def __test():
buf = alloc(lru_allocator, 16, dtypes.float32)
assert usedbuf != buf, "Nobody should get used buffer"
__test()
assert GlobalCounters.mem_used - mu == 16*dtypes.float32.itemsize, "Only usedbuf is still allocated."
test()
check_gc()
# With a 128-byte allocator, used plus cached memory must stay within 128 bytes,
# and the buffers allocated first must no longer be alive at the end.
def test_lru_allocator_cache_free(self):
mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
def test():
lru_allocator = FakeAllocator(128)
refs = []
for _ in range(32):
refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32))
for sz in range(1, 32):
alloc_free_trace(lru_allocator, sz, dtypes.float32)
assert GlobalCounters.mem_used + GlobalCounters.mem_cached - mc - mu <= 128, "Should not allocate on device more than allowed (128)"
for r in refs: assert r() is None, "All refs should be dead, since buffers were cleared from cache"
test()
check_gc()
# One buffer is released on each of the devices '0'..'7'; later requests on a
# device must get that device's buffer back, and all eight must stay alive.
def test_lru_allocator_multidevice(self):
def test():
lru_allocator = FakeAllocator(256)
refs=[]
for i in range(8):
refs.append(alloc_free_trace(lru_allocator, 16, dtypes.float32, device=str(i)))
for i in range(64):
def __test():
dev = str(i % 8)
buf = alloc(lru_allocator, 16, dtypes.float32, device=dev)
assert cmp_trace_and_buf(buf, refs[i%8]), "Buffer should be reused"
__test()
for r in refs: assert r() is not None, "All refs should be cached"
test()
check_gc()
# When the underlying allocation always fails, the allocator must have emptied
# its cache for that device by the time the error reaches the caller.
def test_lru_allocator_failing_alloc_cleans_cache(self):
def test():
lru_allocator = FakeAllocator(128)
# Three float32 buffers of 1, 2 and 3 elements: 24 bytes in total.
for size in range(1, 4):
alloc_free_trace(lru_allocator, size, dtypes.float32, device='0')
assert len(lru_allocator.aging_order['0']) == 3, "All buffers should be cached"
assert lru_allocator.free_space['0'] == 128 - 24, "24 bytes to be used by current cached buffers"
def always_raise_exception(*args, **kwargs):
raise MemoryError("OOM")
# Replace the allocation hook on this instance only.
lru_allocator._do_alloc = always_raise_exception
with pytest.raises(Exception):
alloc(lru_allocator, 5, dtypes.float32, device='0')
assert len(lru_allocator.aging_order['0']) == 0, "All buffers should be freed from cache due to failing alloc"
test()
check_gc()
# When only the first allocation attempt fails, the request must still succeed
# and the cache must have shrunk.
def test_lru_allocator_fail_first_alloc_pass_after_clear_cahce(self):
def test():
lru_allocator = FakeAllocator(128)
for size in range(1, 4):
alloc_free_trace(lru_allocator, size, dtypes.float32, device='0')
cache_length = 3
assert len(lru_allocator.aging_order['0']) == cache_length, "All buffers should be cached"
assert lru_allocator.free_space['0'] == 128 - 24, "24 bytes to be used by current cached buffers"
original_do_alloc = lru_allocator._do_alloc # save the original method
# Fails exactly once: it reinstalls the real hook before raising.
def single_fail_then_pass(*args, **kwargs):
lru_allocator._do_alloc = original_do_alloc # restore the original method
raise MemoryError("OOM")
lru_allocator._do_alloc = single_fail_then_pass
alloc(lru_allocator, 5, dtypes.float32, device='0')
assert len(lru_allocator.aging_order['0']) < cache_length, "Some buffers should be cleaned as first alloc failed"
test()
check_gc()
# Skipped in CI. Realizes a tensor sized at half of the reported free space and lets it
# go out of scope, then copies out a tensor sized at a third of it.
@unittest.skip("failing in CI")
def test_gpu_copyout(self):
def test():
from tinygrad.runtime.ops_gpu import CL
# Allocation to init the allocator.
tx = Tensor.rand(1)
tx.realize()
free_space = CL.cl_allocator.free_space[tx.lazydata.realized._device]
# Spawning 128mb objects to fill half of free_space
will_allocate = free_space // 3
trash_allocation_size = free_space // 2
# NOTE(review): the // 4 below presumably converts a byte count to float32 elements - confirm.
def sp():
trash_buffer = Tensor.rand(trash_allocation_size // 4)
trash_buffer.realize()
sp()
xx = Tensor.rand(will_allocate // 4)
_ = xx.numpy()
test()
check_gc()
# A request of 1e13 int8 elements must fail with an AssertionError carrying the
# exact "out of memory" message checked below.
def test_lru_allocator_massive_buffer(self):
with self.assertRaises(AssertionError) as context: alloc(allocator := FakeAllocator(), size := 1e13, dtypes.int8)
self.assertEqual(str(context.exception), f"out of memory - requested: {size/1e9:5.2f} GB, available: {allocator._get_cur_free_space('0')/1e9:5.2f} GB")
# Metal only: requesting one byte more than maxBufferLength() must fail with the
# exact message checked below.
@unittest.skipIf(Device.DEFAULT != "METAL", "only applies to Metal")
def test_lru_allocator_metal_max_buffer_length(self):
from tinygrad.runtime.ops_metal import METAL
with self.assertRaises(AssertionError) as context: METAL.allocator._do_alloc(buf_len := (max_buf_len := METAL.device.maxBufferLength()+1), dtypes.int8, '0')
self.assertEqual(str(context.exception), f"Buffer length of {buf_len/1e9:5.2f} GB exceeds Metal's max buffer length of {max_buf_len/1e9:5.2f} GB.")
if __name__ == "__main__":
unittest.main()

View File

@@ -8,7 +8,7 @@ from tinygrad.helpers import prod, dtypes
# *** first, we implement the atan2 op at the lowest level ***
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
from tinygrad.lazy import LazyBuffer, create_lazybuffer
from tinygrad.lazy import Buffer, create_lazybuffer
from tinygrad.device import CompiledASTRunner, Device
from tinygrad.shape.shapetracker import ShapeTracker
import pytest
@@ -16,17 +16,15 @@ import pytest
pytestmark = pytest.mark.webgpu
# we don't always have GPU support, so the type signature is the generic Buffer instead of GPUBuffer
def atan2_gpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
assert a.device == "GPU" and b.device == "GPU", "gpu function requires GPUBuffers"
def atan2_gpu(ret:Buffer, a:Buffer, b:Buffer):
assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"
ret.realized = Device[ret.device].buffer(prod(ret.shape), ret.dtype)
CompiledASTRunner(None, "atan2_gpu", """
__kernel void atan2_gpu(global float *c, global float *a, global float *b) {
int idx = get_global_id(0);
c[idx] = atan2(a[idx], b[idx]);
}""", global_size=[prod(ret.shape)]).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret.realized, a.realized, b.realized])
}""", global_size=[ret.size], bufcount=3).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret, a, b])
def atan2_cpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer): ret.realized._copyin(np.arctan2(a.realized._buf, b.realized._buf))
def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(a._buf, b._buf), requirements='C').data)
# *** second, we write the ATan2 mlop ***
# NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative

View File

@@ -7,6 +7,7 @@ from tinygrad.tensor import Tensor
from tinygrad.jit import CacheCollector
class TestLazyBuffer(unittest.TestCase):
@unittest.skip("it doesn't work like this anymore")
def test_fromcpu_buffer_sharing(self):
a = np.arange(8)
assert LazyBuffer.fromCPU(a).realized._buf is a

View File

@@ -3,7 +3,7 @@ import unittest, os
from tinygrad.codegen.kernel import Opt, OptOps, tensor_cores
from tinygrad.codegen.linearizer import Linearizer, UOp, UOps
from tinygrad.device import Compiled, Device
from tinygrad.device import Compiled, Device, Buffer
from tinygrad.ops import BufferOps, MemBuffer, ConstBuffer, LazyOp, LoadOps, TernaryOps
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
@@ -140,7 +140,7 @@ def helper_realized_ast(r:Tensor):
s = r.lazydata.schedule()
run_schedule(s[:-1]) # run all kernels except the last one
# now all input LazyBuffers buffers in s[-1] should be realized
output_buffer = Device[s[-1].out.device].buffer(prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args()) # allocate an output buffer
output_buffer = Buffer(s[-1].out.device, prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args()) # allocate an output buffer
return s[-1].ast, [output_buffer] + [l.realized for l in s[-1].inputs]
class TestFloat4(unittest.TestCase):
@@ -367,7 +367,7 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
for opt in opts:
k.apply_opt(opt)
prg = to_prg(k)
real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
prg.exec(real_bufs)
np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
@@ -381,7 +381,7 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
k = Linearizer(realized_ast)
k.hand_coded_optimizations()
prg = Device[Device.DEFAULT].to_program(k)
real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
prg.exec(real_bufs)
np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
for x in opts: # Check custom transformations if any.

View File

@@ -2,7 +2,7 @@ import unittest
from tinygrad.codegen.linearizer import Linearizer
from tinygrad.features.search import time_linearizer
from tinygrad.device import Compiled, Device
from tinygrad.device import Compiled, Device, Buffer
from tinygrad.ops import LoadOps
from tinygrad.tensor import Tensor
@@ -12,7 +12,7 @@ class TestTimeLinearizer(unittest.TestCase):
def test_reasonable_time(self):
si = [si for si in Tensor([1,2,3,4]).add(1).lazydata.schedule() if si.ast.op not in LoadOps][0]
rawbufs = [Device[Device.DEFAULT].buffer(si.out.st.size(), si.out.dtype)] + [Device[Device.DEFAULT].buffer(x.st.size(), x.dtype) for x in si.inputs]
rawbufs = [Buffer(Device.DEFAULT, si.out.st.size(), si.out.dtype)] + [Buffer(Device.DEFAULT, x.st.size(), x.dtype) for x in si.inputs]
tm = time_linearizer(Linearizer(si.ast), rawbufs, allow_test_size=False, cnt=10)
assert tm > 0 and tm != float('inf')

View File

@@ -2,16 +2,16 @@ from typing import Optional, Tuple, Any, List
import unittest, math
import numpy as np
from tinygrad.helpers import dtypes, getenv, DType, PtrDType
from tinygrad.tensor import Device
from tinygrad.device import Buffer, Device
from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
from tinygrad.device import CompiledASTRunner, Compiled
from tinygrad.codegen.linearizer import UOps, UOp
def _uops_to_prg(uops):
def _uops_to_prg(uops, bufcount):
src, runtime_args = Device[Device.DEFAULT].renderer("test", uops)
return CompiledASTRunner(None, "test", src,
[1] if Device[Device.DEFAULT].linearizer_opts.has_local else None, [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None,
runtime_args=runtime_args).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)
runtime_args=runtime_args, bufcount=bufcount).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)
def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp:
uops.append(UOp(uop, dtype, tuple(vin), arg))
@@ -24,9 +24,9 @@ def _test_single_value(vals, op, dtype):
loads = (uop(uops, UOps.LOAD, dtype, [buf_loads[i], uop(uops, UOps.CONST, dtypes.int32, (), 0)]) for i in range(len(vals)))
alu = uop(uops, UOps.ALU, dtype, loads, op)
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
buf = Device[Device.DEFAULT].buffer(1, dtype)
buf2 = [Device[Device.DEFAULT].buffer.fromCPU(np.array([a], dtype=dtype.np)) for a in vals]
prg = _uops_to_prg(uops)
buf = Buffer(Device.DEFAULT, 1, dtype)
buf2 = [Buffer.fromCPU(Device.DEFAULT, np.array([a], dtype=dtype.np)) for a in vals]
prg = _uops_to_prg(uops, 1+len(buf2))
prg.exec([buf]+buf2)
return buf.toCPU()[0]
@@ -36,8 +36,8 @@ def _test_single_value_const(vals, op, dtype):
loads = (uop(uops, UOps.CONST, dtype, [], a) for a in vals)
alu = uop(uops, UOps.ALU, dtype, loads, op)
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
buf = Device[Device.DEFAULT].buffer(1, dtype)
prg = _uops_to_prg(uops)
buf = Buffer(Device.DEFAULT, 1, dtype)
prg = _uops_to_prg(uops, 1)
prg.exec([buf])
return buf.toCPU()[0]

View File

@@ -3,8 +3,7 @@ import unittest
import numpy as np
from tinygrad.tensor import Tensor, Device
from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
from tinygrad.helpers import dtypes, fetch, temp
from tinygrad.runtime.ops_disk import RawDiskBuffer
from tinygrad.helpers import fetch, temp
from tinygrad.helpers import Timing
def compare_weights_both(url):
@@ -40,11 +39,6 @@ class TestRawDiskBuffer(unittest.TestCase):
with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
f.readinto(tst)
def test_mmap_read_speed(self):
db = RawDiskBuffer(test_size, dtype=dtypes.uint8, device=test_fn)
tst = np.empty(test_size, np.uint8)
with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
np.copyto(tst, db.toCPU())
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu doesn't support uint8 datatype")
class TestSafetensors(unittest.TestCase):
def test_real_safetensors(self):