mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-07 03:00:26 -04:00
new style device (#2530)
* cpu tests pass
* torch works
* works
* metal works
* fix ops_disk
* metal jit works
* fix openpilot
* llvm and clang work
* fix webgpu
* docs are rly broken
* LRU works on metal
* delete comment
* revert name to ._buf. LRU only on Compiled
* changes
* allocator
* allocator, getting closer
* lru alloc
* LRUAllocator
* all pass
* metal
* cuda
* test examples
* linearizer
* test fixes
* fix custom + clean realize
* fix hip
* skip tests
* fix tests
* fix size=0
* fix MOCKHIP
* fix thneed
* copy better
* simple
* old style metal copy
* fix thneed
* np reshape
* give cuda a device
This commit is contained in:
125
test/external/external_test_allocator_on_models.py
vendored
125
test/external/external_test_allocator_on_models.py
vendored
@@ -1,125 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
import unittest, gc
|
||||
import numpy as np
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.nn.state import get_state_dict
|
||||
from tinygrad.helpers import GlobalCounters
|
||||
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
|
||||
from tinygrad.helpers import dtypes, prod
|
||||
from tinygrad import Device
|
||||
from test.helpers import derandomize_model
|
||||
|
||||
from examples.llama import Transformer
|
||||
|
||||
# Running total of FakeDeviceBuffer objects constructed in this module.
ALLOCATED_DEV_BUFS = 0

class FakeDeviceBuffer:
  """Stand-in for a device allocation that only records its own metadata.

  `id` starts at 1; `FakeAllocator._do_free` in this file decrements it and
  asserts that it lands on 0. Every construction bumps ALLOCATED_DEV_BUFS.
  """
  def __init__(self, sz, dt, device):
    global ALLOCATED_DEV_BUFS
    ALLOCATED_DEV_BUFS += 1
    self.id, self.size, self.dtype, self.device = 1, sz, dt, device
|
||||
class FakeAllocator(LRUAllocator):
  """LRUAllocator whose backing allocations are FakeDeviceBuffer objects."""

  def _do_alloc(self, size, dtype, device, **kwargs):
    # Extra keyword arguments are accepted but not used.
    return FakeDeviceBuffer(size, dtype, device)

  def _do_free(self, buf):
    # A buffer starts with id == 1, so anything but 0 here means it was freed more than once.
    buf.id -= 1
    assert buf.id == 0, f"Free should be called once, but {buf.id}"

  def __del__(self):
    # The fake allocator releases everything it still caches, so each test starts clean.
    for cached in self.cached_buffers.values():
      for dev_buf, _ in cached:
        self._free_buffer(dev_buf)
|
||||
|
||||
# Allocator picked up by every FakeBuffer constructed while it is set.
FAKE_GLOBAL_ALLOCATOR = None

class FakeBuffer(RawBuffer):
  """RawBuffer that allocates through FAKE_GLOBAL_ALLOCATOR and never moves real data."""

  def __init__(self, size, dtype, device='0'):
    super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, device=device)
    raw = self._buf
    # The allocator must hand back a buffer with exactly the requested size, dtype and device.
    assert raw.size == size and raw.dtype == dtype and raw.device == device, "This allocator requires 100% match of dtype and size."

  @classmethod
  def fromCPU(cls, x:np.ndarray, **kwargs):
    # Only the shape and dtype of `x` are used; its contents are not copied.
    return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)

  def toCPU(self):
    # Returns an uninitialised array of the right size and dtype.
    return np.empty(self.size, dtype=self.dtype.np)
|
||||
class FakeProgram:
  """No-op replacement for a device runtime program: building and launching do nothing."""

  def __init__(self, name:str, prg:str):
    """Accept a kernel name and source and ignore both."""

  def __call__(self, *bufs, global_size, local_size, wait=False):
    """Accept a launch request and return None without doing any work."""
|
||||
|
||||
def helper_test_correctness(gen, train):
  """Check that `train` gives the same result with and without a large buffer cache.

  `gen` is called with no arguments and returns the positional arguments for
  `train`; `train` returns an object with a `.numpy()` method. A reference
  result is computed under `CLAllocator(0)` (presumably a zero-byte cache
  budget -- TODO confirm), then four further runs under `CLAllocator(512<<30)`
  are compared against it.

  The original `CL.cl_allocator` is restored even when a run or an assertion
  fails, so a failing test does not leave the temporary allocator installed.
  """
  from tinygrad.runtime.ops_gpu import CL, CLAllocator
  old_alloc = CL.cl_allocator
  try:
    CL.cl_allocator = CLAllocator(0)
    no_alloc_result = train(*gen()).numpy()
    Device[Device.DEFAULT].synchronize()
    CL.cl_allocator = CLAllocator(512<<30) # Test cache correctness, so cache as much as possible, 512gb
    for _ in range(4):
      GlobalCounters.reset()
      np.testing.assert_allclose(train(*gen()).numpy(), no_alloc_result, rtol=1e-3, atol=1e-5)
      Device[Device.DEFAULT].synchronize()
    assert len(CL.cl_allocator.cached_buffers) != 0, "Cache must be used"
  finally:
    CL.cl_allocator = old_alloc
|
||||
|
||||
def __helper_test_alloc_count(gen, train):
  """Run `train(*gen())` twice and return how far ALLOCATED_DEV_BUFS rose meanwhile."""
  baseline = ALLOCATED_DEV_BUFS
  runs_left = 2
  while runs_left:
    train(*gen())
    runs_left -= 1
  return ALLOCATED_DEV_BUFS - baseline
|
||||
|
||||
def helper_test_alloc_count(mm, gen, train):
  """Assert that a caching allocator creates fewer device buffers than a non-caching one.

  The default device's `runtime` and `buffer` are swapped for FakeProgram and
  FakeBuffer, then `train(*gen())` is run twice under `FakeAllocator(16<<30)`
  and twice under `FakeAllocator(0)`. The number of FakeDeviceBuffer objects
  created in each phase is compared. `mm` is only used as a label in the
  printed summary.

  The patched device attributes and the global allocator are restored in a
  `finally` block, so a failing assertion no longer leaves the fakes installed
  for later tests. The runner cache is cleared on the way out as well, because
  runners built while FakeProgram was installed would otherwise stay cached.
  """
  global FAKE_GLOBAL_ALLOCATOR
  dev = Device[Device.DEFAULT]
  backup_program = dev.runtime
  backup_buffer = dev.buffer
  try:
    dev.runtime = FakeProgram
    dev.buffer = FakeBuffer
    dev.get_runner.cache_clear()
    FAKE_GLOBAL_ALLOCATOR = FakeAllocator(16<<30)
    new_allocs = __helper_test_alloc_count(gen, train)
    dev.get_runner.cache_clear()
    FAKE_GLOBAL_ALLOCATOR = FakeAllocator(0)
    old_allocs = __helper_test_alloc_count(gen, train)
    print(f"{mm}: llama: old allocs count {old_allocs}, new allocs count {new_allocs}")
    assert new_allocs < old_allocs, "Hmm, doesn't cache work any more?"
  finally:
    dev.runtime = backup_program
    dev.buffer = backup_buffer
    dev.get_runner.cache_clear()
    FAKE_GLOBAL_ALLOCATOR = None
|
||||
|
||||
def check_gc():
  """On the GPU backend, force a collection and assert that print_objects() reports 0."""
  if Device.DEFAULT != "GPU":
    return
  gc.collect() # Need to collect Tensors.
  from extra.introspection import print_objects
  leaked = print_objects()
  assert leaked == 0
|
||||
|
||||
class TestAllocators(unittest.TestCase):
  """Runs whole models through the allocator helpers defined in this file."""

  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama(self):
    """A small LLaMA in float16 must produce the same output with and without caching."""
    old_type = Tensor.default_type
    Tensor.default_type = dtypes.float16

    args_tiny = {"dim": 1024, "hidden_dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def __test():
      # The model lives only inside this frame, so it is released before check_gc() runs.
      model = Transformer(**args_tiny)
      derandomize_model(model)
      def test(t): return model(t, 0).realize()
      helper_test_correctness(lambda: (Tensor([[1,]]),), test)
    try:
      __test()
    finally:
      # Restore the global default dtype even if the run fails, so later tests are unaffected.
      Tensor.default_type = old_type
    check_gc()

  @unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
  def test_lru_allocator_tiny_llama_alloc_counts(self):
    """The caching allocator must create fewer device buffers than the non-caching one."""
    args_tiny = {"dim": 1024, "hidden_dim": 1024, "n_heads": 8, "n_layers": 8, "norm_eps": 1e-05, "vocab_size": 1000}
    def test_alloc_count(t):
      model = Transformer(**args_tiny)
      # Replace every weight with an empty tensor of the same shape and dtype.
      for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
      return model(t, 0).realize()
    helper_test_alloc_count("llama", lambda: (Tensor([[2,]]),), test_alloc_count)
    check_gc()

  @unittest.skip("huge for CI")
  def test_stable_diffusion(self):
    """Same cache-correctness check on the Stable Diffusion UNet (always skipped)."""
    from examples.stable_diffusion import UNetModel
    model = UNetModel()
    derandomize_model(model)
    def test(t, t2): return model(t, 801, t2).realize()
    helper_test_correctness(lambda: (Tensor.randn(1, 4, 16, 16),Tensor.randn(1, 77, 768)), test)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
18
test/external/external_test_speed_llama.py
vendored
18
test/external/external_test_speed_llama.py
vendored
@@ -1,29 +1,27 @@
|
||||
# NOTE: this only tests the speed of the LLaMA codegen, it doesn't actually run the net
|
||||
import unittest, time
|
||||
import numpy as np
|
||||
from examples.llama import Transformer, MODEL_PARAMS
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad import Device
|
||||
from tinygrad.nn.state import get_state_dict
|
||||
from tinygrad.device import Compiled
|
||||
from tinygrad.device import Compiled, Allocator
|
||||
from tinygrad.helpers import Profiling
|
||||
from tinygrad.runtime.lib import RawBuffer
|
||||
|
||||
class FakeProgram:
|
||||
def __init__(self, name:str, prg:str): pass
|
||||
def __init__(self, name:str, prg:bytes, bufs:int, vars:int=0): pass
|
||||
def __call__(self, *bufs, global_size, local_size, wait=False): pass
|
||||
|
||||
class RawFakeBuffer(RawBuffer):
|
||||
def _copyin(self, x:np.ndarray): pass
|
||||
def toCPU(self): return np.empty(self.size, dtype=self.dtype.np)
|
||||
class FakeAllocator(Allocator):
|
||||
def _alloc(self, sz, dtype): return None
|
||||
def copyin(self, dest, src:memoryview): pass
|
||||
|
||||
class TestLLaMASpeed(unittest.TestCase):
|
||||
@unittest.skipIf(not isinstance(Device[Device.DEFAULT], Compiled), "only test for compiled backends")
|
||||
def test_llama_compile(self):
|
||||
backup_program = Device[Device.DEFAULT].runtime
|
||||
backup_buffer = Device[Device.DEFAULT].buffer
|
||||
backup_allocator = Device[Device.DEFAULT].allocator
|
||||
Device[Device.DEFAULT].runtime = FakeProgram
|
||||
Device[Device.DEFAULT].buffer = RawFakeBuffer
|
||||
Device[Device.DEFAULT].allocator = FakeAllocator()
|
||||
|
||||
print("testing llama python run time")
|
||||
model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
|
||||
@@ -48,7 +46,7 @@ class TestLLaMASpeed(unittest.TestCase):
|
||||
run_llama("profile")
|
||||
|
||||
Device[Device.DEFAULT].runtime = backup_program
|
||||
Device[Device.DEFAULT].buffer = backup_buffer
|
||||
Device[Device.DEFAULT].allocator = backup_allocator
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
@@ -6,7 +6,7 @@ from tinygrad.nn.state import get_parameters
|
||||
def derandomize(x):
|
||||
if isinstance(x, LazyOp):
|
||||
new_op = LoadOps.EMPTY if x.op == LoadOps.RAND else x.op
|
||||
return LazyOp(new_op, tuple([derandomize(s) for s in x.src]), x.arg)
|
||||
return LazyOp(new_op, tuple([derandomize(s) for s in x.src]), None if x.op == LoadOps.RAND else x.arg)
|
||||
x.op = derandomize(x.op)
|
||||
return x
|
||||
|
||||
|
||||
@@ -1,188 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
import unittest
|
||||
import pytest
|
||||
import numpy as np
|
||||
from weakref import ref
|
||||
|
||||
from tinygrad.helpers import GlobalCounters
|
||||
from tinygrad.runtime.lib import RawBuffer, LRUAllocator
|
||||
from tinygrad.helpers import dtypes, prod
|
||||
from tinygrad import Device
|
||||
from tinygrad.tensor import Tensor
|
||||
|
||||
def check_gc():
  """On the GPU backend, assert that print_objects() reports 0; do nothing elsewhere."""
  if Device.DEFAULT != "GPU":
    return
  from extra.introspection import print_objects
  leaked = print_objects()
  assert leaked == 0
|
||||
|
||||
class FakeDeviceBuffer:
  """Stand-in for a device allocation that checks it was freed before being destroyed.

  `id` starts at 1; `FakeAllocator._do_free` in this file decrements it. The
  finalizer asserts that `id` has reached 0 by the time the object is destroyed.
  """
  def __init__(self, sz, dt, device):
    self.id, self.size, self.dtype, self.device = 1, sz, dt, device

  def __del__(self):
    freed = self.id == 0
    assert freed, "Should called _do_free() before"
|
||||
|
||||
class FakeAllocator(LRUAllocator):
  """LRUAllocator backed by FakeDeviceBuffer objects that reports "OOM" when a request exceeds free space."""

  def _do_alloc(self, size, dtype, device, **kwargs):
    requested_bytes = size*dtype.itemsize
    if requested_bytes > self._get_cur_free_space(device):
      raise Exception("OOM")
    return FakeDeviceBuffer(size, dtype, device)

  def _do_free(self, buf):
    # A buffer starts with id == 1, so anything but 0 here means it was freed more than once.
    buf.id -= 1
    assert buf.id == 0, f"Free should be called once, but {buf.id}"

  def __del__(self):
    # The fake allocator releases everything it still caches, so each test starts clean.
    for cached in self.cached_buffers.values():
      for dev_buf, _ in cached:
        self._free_buffer(dev_buf)
|
||||
|
||||
# Allocator picked up by every FakeBuffer constructed while it is set; alloc() sets and clears it.
FAKE_GLOBAL_ALLOCATOR = None

class FakeBuffer(RawBuffer):
  """RawBuffer that allocates through FAKE_GLOBAL_ALLOCATOR and never moves real data."""

  def __init__(self, size, dtype, device='0'):
    super().__init__(size, dtype, allocator=FAKE_GLOBAL_ALLOCATOR, device=device)
    raw = self._buf
    # The allocator must hand back a buffer with exactly the requested size, dtype and device.
    assert raw.size == size and raw.dtype == dtype and raw.device == device, "This allocator requires 100% match of dtype and size."

  @classmethod
  def fromCPU(cls, x:np.ndarray, **kwargs):
    # Only the shape and dtype of `x` are used; its contents are not copied.
    return cls(prod(x.shape), dtypes.from_np(x.dtype), **kwargs)

  def toCPU(self):
    # Returns an uninitialised array of the right size and dtype.
    return np.empty(self.size, dtype=self.dtype.np)
|
||||
|
||||
def alloc(allocator, size, dtype, **kwargs):
  """Create a FakeBuffer of `size` elements of `dtype` through `allocator`.

  FakeBuffer reads its allocator from the module-level FAKE_GLOBAL_ALLOCATOR,
  so that global is pointed at `allocator` for the duration of the
  constructor call only. Extra keyword arguments (e.g. `device`) are forwarded
  to FakeBuffer.

  The global is cleared in a `finally` block. Several tests in this file make
  the constructor raise on purpose; without the reset the module would keep a
  reference to their allocator after the test finished.
  """
  global FAKE_GLOBAL_ALLOCATOR
  FAKE_GLOBAL_ALLOCATOR = allocator
  try:
    buf = FakeBuffer(size, dtype, **kwargs)
  finally:
    FAKE_GLOBAL_ALLOCATOR = None
  assert buf.dtype == dtype and buf.size == size
  return buf
|
||||
|
||||
def alloc_free_trace(allocator, size, dtype, **kwargs):
  """Allocate a FakeBuffer, let it go out of scope, and return a weak reference to its `_buf`.

  The FakeBuffer is a local of this function, so it is released on return;
  only the weak reference to the underlying device buffer is handed back.
  """
  temporary = alloc(allocator, size, dtype, **kwargs)
  trace = ref(temporary._buf)
  return trace
|
||||
|
||||
def cmp_trace_and_buf(buf, trace_ref):
  """Return whether the object behind `trace_ref` equals `buf._buf`.

  A falsy `trace_ref` (e.g. None) is returned unchanged instead of being called.
  """
  if not trace_ref:
    return trace_ref
  return trace_ref() == buf._buf
|
||||
|
||||
class TestAllocators(unittest.TestCase):
  """Exercises LRUAllocator caching, eviction and failure handling through FakeAllocator/FakeBuffer.

  Most tests run their body in a nested `test()` function so that every local
  (allocator, buffers) is released before `check_gc()` is called.
  """

  def test_lru_allocator_reusage(self):
    """A freed buffer is reused for the next same-size request; a live buffer is never handed out."""
    mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
    def test():
      lru_allocator = FakeAllocator(2048)
      traced_buf = alloc_free_trace(lru_allocator, 16, dtypes.float32)
      assert GlobalCounters.mem_cached - mc == 16*dtypes.float32.itemsize, "Buffer should be cached"
      for _ in range(32):
        def __test():
          # `buf` is local to this frame, so it is released again on return.
          buf = alloc(lru_allocator, 16, dtypes.float32)
          assert cmp_trace_and_buf(buf, traced_buf), "Buffer should be reused"
        __test()

      usedbuf = alloc(lru_allocator, 16, dtypes.float32)
      for _ in range(32):
        def __test():
          buf = alloc(lru_allocator, 16, dtypes.float32)
          assert usedbuf != buf, "Nobody should get used buffer"
        __test()
      assert GlobalCounters.mem_used - mu == 16*dtypes.float32.itemsize, "Only usedbuf is still allocated."
    test()
    check_gc()

  def test_lru_allocator_cache_free(self):
    """Total used + cached memory never exceeds the allocator's 128-byte budget."""
    mc, mu = GlobalCounters.mem_cached, GlobalCounters.mem_used
    def test():
      lru_allocator = FakeAllocator(128)
      refs = [alloc_free_trace(lru_allocator, 16, dtypes.float32) for _ in range(32)]
      for sz in range(1, 32):
        alloc_free_trace(lru_allocator, sz, dtypes.float32)
        assert GlobalCounters.mem_used + GlobalCounters.mem_cached - mc - mu <= 128, "Should not allocate on device more than allowed (128)"
      for r in refs: assert r() is None, "All refs should be dead, since buffers were cleared from cache"
    test()
    check_gc()

  def test_lru_allocator_multidevice(self):
    """Each of 8 devices keeps and reuses its own cached buffer."""
    def test():
      lru_allocator = FakeAllocator(256)
      refs = [alloc_free_trace(lru_allocator, 16, dtypes.float32, device=str(i)) for i in range(8)]
      for i in range(64):
        def __test():
          dev = str(i % 8)
          buf = alloc(lru_allocator, 16, dtypes.float32, device=dev)
          assert cmp_trace_and_buf(buf, refs[i%8]), "Buffer should be reused"
        __test()
      for r in refs: assert r() is not None, "All refs should be cached"
    test()
    check_gc()

  def test_lru_allocator_failing_alloc_cleans_cache(self):
    """When the backing allocation always fails, the cache for that device ends up empty."""
    def test():
      lru_allocator = FakeAllocator(128)
      for size in range(1, 4):
        alloc_free_trace(lru_allocator, size, dtypes.float32, device='0')
      assert len(lru_allocator.aging_order['0']) == 3, "All buffers should be cached"
      assert lru_allocator.free_space['0'] == 128 - 24, "24 bytes to be used by current cached buffers"

      def always_raise_exception(*args, **kwargs):
        raise MemoryError("OOM")
      lru_allocator._do_alloc = always_raise_exception

      # unittest's own context manager, consistent with the other raising tests in this class.
      with self.assertRaises(Exception):
        alloc(lru_allocator, 5, dtypes.float32, device='0')
      assert len(lru_allocator.aging_order['0']) == 0, "All buffers should be freed from cache due to failing alloc"
    test()
    check_gc()

  def test_lru_allocator_fail_first_alloc_pass_after_clear_cahce(self):
    """A single failing backing allocation shrinks the cache, and the request then succeeds."""
    def test():
      lru_allocator = FakeAllocator(128)
      for size in range(1, 4):
        alloc_free_trace(lru_allocator, size, dtypes.float32, device='0')
      cache_length = 3
      assert len(lru_allocator.aging_order['0']) == cache_length, "All buffers should be cached"
      assert lru_allocator.free_space['0'] == 128 - 24, "24 bytes to be used by current cached buffers"

      original_do_alloc = lru_allocator._do_alloc # save the original method
      def single_fail_then_pass(*args, **kwargs):
        lru_allocator._do_alloc = original_do_alloc # restore the original method
        raise MemoryError("OOM")
      lru_allocator._do_alloc = single_fail_then_pass

      alloc(lru_allocator, 5, dtypes.float32, device='0')
      assert len(lru_allocator.aging_order['0']) < cache_length, "Some buffers should be cleaned as first alloc failed"
    test()
    check_gc()

  @unittest.skip("failing in CI")
  def test_gpu_copyout(self):
    """Copy a tensor out after a large, already-released allocation (always skipped)."""
    def test():
      from tinygrad.runtime.ops_gpu import CL

      # Allocation to init the allocator.
      tx = Tensor.rand(1)
      tx.realize()
      free_space = CL.cl_allocator.free_space[tx.lazydata.realized._device]

      # One throwaway tensor sized at half of free_space, then a live one at a third of it.
      will_allocate = free_space // 3
      trash_allocation_size = free_space // 2

      def sp():
        # The tensor is local to this frame, so it is released on return.
        trash_buffer = Tensor.rand(trash_allocation_size // 4)
        trash_buffer.realize()
      sp()

      xx = Tensor.rand(will_allocate // 4)
      _ = xx.numpy()
    test()
    check_gc()

  def test_lru_allocator_massive_buffer(self):
    """Requesting 1e13 int8 elements fails with the allocator's out-of-memory message."""
    allocator = FakeAllocator()
    size = 1e13
    with self.assertRaises(AssertionError) as context: alloc(allocator, size, dtypes.int8)
    self.assertEqual(str(context.exception), f"out of memory - requested: {size/1e9:5.2f} GB, available: {allocator._get_cur_free_space('0')/1e9:5.2f} GB")

  @unittest.skipIf(Device.DEFAULT != "METAL", "only applies to Metal")
  def test_lru_allocator_metal_max_buffer_length(self):
    """Allocating one byte more than Metal's maxBufferLength fails with a descriptive message."""
    from tinygrad.runtime.ops_metal import METAL
    # Both names deliberately hold the same value (maxBufferLength + 1), as in the expected message below.
    buf_len = max_buf_len = METAL.device.maxBufferLength()+1
    with self.assertRaises(AssertionError) as context: METAL.allocator._do_alloc(buf_len, dtypes.int8, '0')
    self.assertEqual(str(context.exception), f"Buffer length of {buf_len/1e9:5.2f} GB exceeds Metal's max buffer length of {max_buf_len/1e9:5.2f} GB.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -8,7 +8,7 @@ from tinygrad.helpers import prod, dtypes
|
||||
|
||||
# *** first, we implement the atan2 op at the lowest level ***
|
||||
# `atan2_gpu` for GPUBuffers and `atan2_cpu` for CPUBuffers
|
||||
from tinygrad.lazy import LazyBuffer, create_lazybuffer
|
||||
from tinygrad.lazy import Buffer, create_lazybuffer
|
||||
from tinygrad.device import CompiledASTRunner, Device
|
||||
from tinygrad.shape.shapetracker import ShapeTracker
|
||||
import pytest
|
||||
@@ -16,17 +16,15 @@ import pytest
|
||||
pytestmark = pytest.mark.webgpu
|
||||
|
||||
# we don't always have GPU support, so the type signature is the abstract CompiledBuffer instead of GPUBuffer
|
||||
def atan2_gpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer):
|
||||
assert a.device == "GPU" and b.device == "GPU", "gpu function requires GPUBuffers"
|
||||
def atan2_gpu(ret:Buffer, a:Buffer, b:Buffer):
|
||||
assert a.dtype == b.dtype and a.dtype == dtypes.float32, "gpu function only supports float32"
|
||||
ret.realized = Device[ret.device].buffer(prod(ret.shape), ret.dtype)
|
||||
CompiledASTRunner(None, "atan2_gpu", """
|
||||
__kernel void atan2_gpu(global float *c, global float *a, global float *b) {
|
||||
int idx = get_global_id(0);
|
||||
c[idx] = atan2(a[idx], b[idx]);
|
||||
}""", global_size=[prod(ret.shape)]).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret.realized, a.realized, b.realized])
|
||||
}""", global_size=[ret.size], bufcount=3).build(Device[ret.device].compiler, Device[ret.device].runtime).exec([ret, a, b])
|
||||
|
||||
def atan2_cpu(ret:LazyBuffer, a:LazyBuffer, b:LazyBuffer): ret.realized._copyin(np.arctan2(a.realized._buf, b.realized._buf))
|
||||
def atan2_cpu(ret:Buffer, a:Buffer, b:Buffer): ret.copyin(np.require(np.arctan2(a._buf, b._buf), requirements='C').data)
|
||||
|
||||
# *** second, we write the ATan2 mlop ***
|
||||
# NOTE: The derivative of atan2 doesn't need a custom op! https://www.liquisearch.com/atan2/derivative
|
||||
|
||||
@@ -7,6 +7,7 @@ from tinygrad.tensor import Tensor
|
||||
from tinygrad.jit import CacheCollector
|
||||
|
||||
class TestLazyBuffer(unittest.TestCase):
|
||||
@unittest.skip("it doesn't work like this anymore")
|
||||
def test_fromcpu_buffer_sharing(self):
|
||||
a = np.arange(8)
|
||||
assert LazyBuffer.fromCPU(a).realized._buf is a
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest, os
|
||||
|
||||
from tinygrad.codegen.kernel import Opt, OptOps, tensor_cores
|
||||
from tinygrad.codegen.linearizer import Linearizer, UOp, UOps
|
||||
from tinygrad.device import Compiled, Device
|
||||
from tinygrad.device import Compiled, Device, Buffer
|
||||
from tinygrad.ops import BufferOps, MemBuffer, ConstBuffer, LazyOp, LoadOps, TernaryOps
|
||||
from tinygrad.shape.shapetracker import ShapeTracker
|
||||
from tinygrad.shape.view import View
|
||||
@@ -140,7 +140,7 @@ def helper_realized_ast(r:Tensor):
|
||||
s = r.lazydata.schedule()
|
||||
run_schedule(s[:-1]) # run all kernels except the last one
|
||||
# now all input LazyBuffers buffers in s[-1] should be realized
|
||||
output_buffer = Device[s[-1].out.device].buffer(prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args()) # allocate an output buffer
|
||||
output_buffer = Buffer(s[-1].out.device, prod((s if isinstance(s, int) else s.max for s in s[-1].out.shape)), s[-1].out.dtype, **s[-1].out._device_extra_args()) # allocate an output buffer
|
||||
return s[-1].ast, [output_buffer] + [l.realized for l in s[-1].inputs]
|
||||
|
||||
class TestFloat4(unittest.TestCase):
|
||||
@@ -367,7 +367,7 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
|
||||
for opt in opts:
|
||||
k.apply_opt(opt)
|
||||
prg = to_prg(k)
|
||||
real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
|
||||
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
|
||||
prg.exec(real_bufs)
|
||||
np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
|
||||
|
||||
@@ -381,7 +381,7 @@ def helper_linearizer_opt(r:Tensor, opts=[], apply_tc=False):
|
||||
k = Linearizer(realized_ast)
|
||||
k.hand_coded_optimizations()
|
||||
prg = Device[Device.DEFAULT].to_program(k)
|
||||
real_bufs[0] = real_bufs[0].fromCPU(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np)) # Zero to check that all values are filled
|
||||
real_bufs[0].copyin(np.zeros((real_bufs[0].size, ), dtype=real_bufs[0].dtype.np).data) # Zero to check that all values are filled
|
||||
prg.exec(real_bufs)
|
||||
np.testing.assert_allclose(wanna_output, real_bufs[0].toCPU(), atol=1e-4, rtol=1e-4)
|
||||
for x in opts: # Check custom transformations if any.
|
||||
|
||||
@@ -2,7 +2,7 @@ import unittest
|
||||
|
||||
from tinygrad.codegen.linearizer import Linearizer
|
||||
from tinygrad.features.search import time_linearizer
|
||||
from tinygrad.device import Compiled, Device
|
||||
from tinygrad.device import Compiled, Device, Buffer
|
||||
from tinygrad.ops import LoadOps
|
||||
from tinygrad.tensor import Tensor
|
||||
|
||||
@@ -12,7 +12,7 @@ class TestTimeLinearizer(unittest.TestCase):
|
||||
|
||||
def test_reasonable_time(self):
|
||||
si = [si for si in Tensor([1,2,3,4]).add(1).lazydata.schedule() if si.ast.op not in LoadOps][0]
|
||||
rawbufs = [Device[Device.DEFAULT].buffer(si.out.st.size(), si.out.dtype)] + [Device[Device.DEFAULT].buffer(x.st.size(), x.dtype) for x in si.inputs]
|
||||
rawbufs = [Buffer(Device.DEFAULT, si.out.st.size(), si.out.dtype)] + [Buffer(Device.DEFAULT, x.st.size(), x.dtype) for x in si.inputs]
|
||||
tm = time_linearizer(Linearizer(si.ast), rawbufs, allow_test_size=False, cnt=10)
|
||||
assert tm > 0 and tm != float('inf')
|
||||
|
||||
|
||||
@@ -2,16 +2,16 @@ from typing import Optional, Tuple, Any, List
|
||||
import unittest, math
|
||||
import numpy as np
|
||||
from tinygrad.helpers import dtypes, getenv, DType, PtrDType
|
||||
from tinygrad.tensor import Device
|
||||
from tinygrad.device import Buffer, Device
|
||||
from tinygrad.ops import UnaryOps, BinaryOps, TernaryOps
|
||||
from tinygrad.device import CompiledASTRunner, Compiled
|
||||
from tinygrad.codegen.linearizer import UOps, UOp
|
||||
|
||||
def _uops_to_prg(uops):
|
||||
def _uops_to_prg(uops, bufcount):
|
||||
src, runtime_args = Device[Device.DEFAULT].renderer("test", uops)
|
||||
return CompiledASTRunner(None, "test", src,
|
||||
[1] if Device[Device.DEFAULT].linearizer_opts.has_local else None, [1] if Device[Device.DEFAULT].linearizer_opts.has_local else None,
|
||||
runtime_args=runtime_args).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)
|
||||
runtime_args=runtime_args, bufcount=bufcount).build(Device[Device.DEFAULT].compiler, Device[Device.DEFAULT].runtime)
|
||||
|
||||
def uop(uops:List[UOp], uop:UOps, dtype:Optional[DType], vin:Tuple[UOp, ...], arg:Any=None) -> UOp:
|
||||
uops.append(UOp(uop, dtype, tuple(vin), arg))
|
||||
@@ -24,9 +24,9 @@ def _test_single_value(vals, op, dtype):
|
||||
loads = (uop(uops, UOps.LOAD, dtype, [buf_loads[i], uop(uops, UOps.CONST, dtypes.int32, (), 0)]) for i in range(len(vals)))
|
||||
alu = uop(uops, UOps.ALU, dtype, loads, op)
|
||||
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
|
||||
buf = Device[Device.DEFAULT].buffer(1, dtype)
|
||||
buf2 = [Device[Device.DEFAULT].buffer.fromCPU(np.array([a], dtype=dtype.np)) for a in vals]
|
||||
prg = _uops_to_prg(uops)
|
||||
buf = Buffer(Device.DEFAULT, 1, dtype)
|
||||
buf2 = [Buffer.fromCPU(Device.DEFAULT, np.array([a], dtype=dtype.np)) for a in vals]
|
||||
prg = _uops_to_prg(uops, 1+len(buf2))
|
||||
prg.exec([buf]+buf2)
|
||||
return buf.toCPU()[0]
|
||||
|
||||
@@ -36,8 +36,8 @@ def _test_single_value_const(vals, op, dtype):
|
||||
loads = (uop(uops, UOps.CONST, dtype, [], a) for a in vals)
|
||||
alu = uop(uops, UOps.ALU, dtype, loads, op)
|
||||
uop(uops, UOps.STORE, None, (buf_store, uop(uops, UOps.CONST, dtypes.int32, (), 0), alu))
|
||||
buf = Device[Device.DEFAULT].buffer(1, dtype)
|
||||
prg = _uops_to_prg(uops)
|
||||
buf = Buffer(Device.DEFAULT, 1, dtype)
|
||||
prg = _uops_to_prg(uops, 1)
|
||||
prg.exec([buf])
|
||||
return buf.toCPU()[0]
|
||||
|
||||
|
||||
@@ -3,8 +3,7 @@ import unittest
|
||||
import numpy as np
|
||||
from tinygrad.tensor import Tensor, Device
|
||||
from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
|
||||
from tinygrad.helpers import dtypes, fetch, temp
|
||||
from tinygrad.runtime.ops_disk import RawDiskBuffer
|
||||
from tinygrad.helpers import fetch, temp
|
||||
from tinygrad.helpers import Timing
|
||||
|
||||
def compare_weights_both(url):
|
||||
@@ -40,11 +39,6 @@ class TestRawDiskBuffer(unittest.TestCase):
|
||||
with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
|
||||
f.readinto(tst)
|
||||
|
||||
def test_mmap_read_speed(self):
|
||||
db = RawDiskBuffer(test_size, dtype=dtypes.uint8, device=test_fn)
|
||||
tst = np.empty(test_size, np.uint8)
|
||||
with Timing("copy in ", lambda et_ns: f" {test_size/et_ns:.2f} GB/s"):
|
||||
np.copyto(tst, db.toCPU())
|
||||
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu doesn't support uint8 datatype")
|
||||
class TestSafetensors(unittest.TestCase):
|
||||
def test_real_safetensors(self):
|
||||
|
||||
Reference in New Issue
Block a user