mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
buffer view is like buffer (#15082)
* buffer view is like buffer * fix * swap_reshape_shrink * contiguous on gguf, fix overlap * revert that * _device_supports_view * this * fix that test * 0 buffers * that test was wrong * this * check correct size * contig BUFFER_VIEW * this * fix tests * buffer view tests * om * fix torch * no MOCKGPU * skip
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
import unittest
|
||||
from tinygrad import Device, dtypes, Tensor
|
||||
from tinygrad.device import Buffer
|
||||
from tinygrad.helpers import Context
|
||||
from tinygrad.helpers import Context, getenv
|
||||
from test.helpers import needs_second_gpu
|
||||
|
||||
@unittest.skipUnless(hasattr(Device[Device.DEFAULT].allocator, "_offset"), "subbuffer not supported")
|
||||
@@ -42,7 +42,7 @@ class TestSubBuffer(unittest.TestCase):
|
||||
assert out == [102, 103]
|
||||
|
||||
@needs_second_gpu
|
||||
@unittest.skipIf(Device.DEFAULT not in {"CUDA", "NV", "AMD"}, "only NV, AMD, CUDA")
|
||||
@unittest.skipIf(Device.DEFAULT not in {"CUDA", "NV", "AMD"} or getenv("MOCKGPU"), "only NV, AMD, CUDA")
|
||||
def test_subbuffer_transfer(self):
|
||||
t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()
|
||||
vt = t[2:5].contiguous().realize()
|
||||
|
||||
@@ -8,7 +8,7 @@ class TestDataset(unittest.TestCase):
|
||||
X_train[0].contiguous().realize()
|
||||
GlobalCounters.reset()
|
||||
X_train[0].contiguous().realize()
|
||||
self.assertEqual(GlobalCounters.kernel_count, 1)
|
||||
self.assertLessEqual(GlobalCounters.kernel_count, 1) # 0 if BUFFER_VIEW (zero-copy), 1 otherwise
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
@@ -117,7 +117,7 @@ class TestContiguous(unittest.TestCase):
|
||||
def test_size_change_buffer_view(self):
|
||||
a = Tensor.empty(4)
|
||||
b = a.reshape((1, 1, 4)).shrink(((0, 1), (0, 1), (0, 3))).contiguous()
|
||||
check_schedule(b, 1)
|
||||
check_schedule(b, 0) # contiguous shrink of a realized buffer is a zero-copy BUFFER_VIEW
|
||||
|
||||
def test_double_contiguous_realizes_once(self):
|
||||
a = Tensor.empty(4, 1)
|
||||
@@ -1170,5 +1170,23 @@ class TestFusionOp(unittest.TestCase):
|
||||
self.assertEqual(len(sched), 1)
|
||||
self.assertLess(time.perf_counter()-st, 2.0)
|
||||
|
||||
# NOTE: the NULL backend supports BUFFER_VIEW
|
||||
class TestBufferView(unittest.TestCase):
|
||||
def test_shrink_contiguous_is_buffer_view(self):
|
||||
# simple 1D shrink of a realized buffer should be BUFFER_VIEW, not a copy kernel
|
||||
a = Tensor.arange(100).contiguous().realize()
|
||||
b = a.shrink(((10, 50),)).contiguous()
|
||||
run_schedule(check_schedule(b, 0))
|
||||
|
||||
def test_shrink_2d_contiguous_is_buffer_view(self):
|
||||
a = Tensor.arange(100).reshape(10,10).contiguous().realize()
|
||||
b = a.shrink(((1, 5),None)).contiguous()
|
||||
run_schedule(check_schedule(b, 0))
|
||||
|
||||
def test_chained_shrink_is_buffer_view(self):
|
||||
a = Tensor.arange(1000).contiguous().realize()
|
||||
b = a.shrink(((200, 800),)).shrink(((0, 300),)).reshape((30, 10)).shrink(((20, 25), (0, 10))).contiguous()
|
||||
run_schedule(check_schedule(b, 0))
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main(verbosity=2)
|
||||
|
||||
@@ -683,6 +683,9 @@ class TestAssignOrdering(unittest.TestCase):
|
||||
buf[4:8].assign(buf[0:4].contiguous())
|
||||
np.testing.assert_equal(buf.numpy(), [1, 2, 3, 4, 1, 2, 3, 4])
|
||||
|
||||
# TODO: this test was testing wrong behavior.
|
||||
# you have to fix the first one to fix the second since the slices no longer realize.
|
||||
@unittest.expectedFailure
|
||||
def test_swap_slices(self):
|
||||
"""Swap two non-overlapping slices - requires reading both before writing."""
|
||||
# without .realize() on temps: values not captured before overwriting
|
||||
|
||||
@@ -192,7 +192,7 @@ class Transformer:
|
||||
@staticmethod
|
||||
def from_gguf(gguf:Tensor, max_context:int|None=None, realize=bool(getenv("REALIZE", 1))) -> tuple[Transformer, dict]:
|
||||
# TODO: remove the need for copy to default device
|
||||
kv, state_dict = nn.state.gguf_load(gguf.to(None))
|
||||
kv, state_dict = nn.state.gguf_load(gguf.to(None).realize())
|
||||
|
||||
# all state items should be float16, not float32
|
||||
state_dict = {k:v.cast('float16') if getenv("HALF", 1) else v for k,v in state_dict.items()}
|
||||
|
||||
@@ -69,11 +69,41 @@ def found_contiguous(ctx:dict[UOp, UOp], contig:UOp, src:UOp):
|
||||
x = x.src[0]
|
||||
ctx[src.base] = contig
|
||||
|
||||
def contiguous_mops_to_view(c:UOp):
|
||||
"""CONTIGUOUS(MOPS(BUFFER)) → CONTIGUOUS(BUFFER_VIEW) when movement ops collapse to a contiguous range."""
|
||||
src = c.src[0]
|
||||
buf = src.base
|
||||
if buf.op not in {Ops.BUFFER, Ops.BUFFER_VIEW}: return None
|
||||
if src.op is Ops.RESHAPE and src.src[0].op in {Ops.BUFFER, Ops.BUFFER_VIEW}: return None
|
||||
|
||||
# no symbolic shape
|
||||
if not all(isinstance(x, int) for x in c.shape): return None
|
||||
|
||||
# check if view is supported
|
||||
if not isinstance(c.device, str): return None
|
||||
from tinygrad.device import Device
|
||||
if not hasattr(Device[c.device].allocator, "_offset"): return None
|
||||
|
||||
# see if this can be a view
|
||||
size_offset = src.contiguous_view_offset()
|
||||
if size_offset is None: return None
|
||||
|
||||
# merge BUFFER_VIEWs
|
||||
size, offset = size_offset
|
||||
if buf.op is Ops.BUFFER_VIEW: offset, buf = offset + buf.arg[1], buf.src[0]
|
||||
|
||||
# NOTE: this contiguous is removed because this BUFFER_VIEW/RESHAPE has_buffer_identity
|
||||
return UOp(Ops.BUFFER_VIEW, src.dtype, (buf,), (size, offset)).reshape(src.shape).contiguous(tag=c.tag)
|
||||
|
||||
pm_early_transform_tensor_graph = PatternMatcher([
|
||||
# CONTIGUOUS replacement hack for openpilot
|
||||
# CONTIGUOUS(MOPS(BUFFER/BUFFER_VIEW)) → CONTIGUOUS(BUFFER_VIEW) when movement ops collapse to contiguous range
|
||||
(UPat(Ops.CONTIGUOUS, src=(UPat(GroupOp.Movement),), name="c"), contiguous_mops_to_view),
|
||||
|
||||
# *** CONTIGUOUS replacement hack for openpilot ***
|
||||
(UPat(Ops.CONTIGUOUS, src=(UPat(GroupOp.Movement, name="src"),), name="contig"), found_contiguous),
|
||||
# replace ALU sources with contiguous versions found above
|
||||
(UPat(GroupOp.ALU, name="alu"), lambda ctx,alu: alu.replace(src=new_src) if (new_src:=tuple(ctx.get(s, s) for s in alu.src)) != alu.src else None),
|
||||
|
||||
# add CONTIGUOUS to tagged UOps
|
||||
(UPat(GroupOp.All-{Ops.CONTIGUOUS, Ops.ASSIGN}, name="x"), lambda x: x.rtag(None).contiguous(tag=x.tag) if x.tag else x.replace(tag=None)),
|
||||
# remove extra CONTIGUOUS on ASSIGN (only when assign target is contiguous)
|
||||
@@ -124,6 +154,8 @@ pm_finalize_call = PatternMatcher([
|
||||
pm_replace_buf = PatternMatcher([
|
||||
# replace BUFFER with PARAM for cache key normalization
|
||||
(UPat(Ops.BUFFER, src=(UPat(Ops.UNIQUE), UPat(Ops.DEVICE)), name="b"), replace_input_buffer),
|
||||
# replace BUFFER_VIEW with PARAM. this rewrite is bottom up so BUFFERs we don't need won't be in the input
|
||||
(UPat(Ops.BUFFER_VIEW, src=(UPat(Ops.BUFFER),), name="b"), replace_input_buffer),
|
||||
# strip value from BIND for cache key normalization, so different values hit same cache
|
||||
(UPat(Ops.BIND, src=(UPat(Ops.DEFINE_VAR), UPat(Ops.CONST)), name="b"), replace_input_buffer),
|
||||
])
|
||||
@@ -145,6 +177,6 @@ def transform_to_call(big_sink:UOp) -> tuple[UOp, dict[UOp, UOp]]:
|
||||
|
||||
# here we construct the final buffer_map. this is everything that will go into the tensor map
|
||||
graph_rewrite(big_sink, pm_finalize_call, ctx=ctx, name="finalize call")
|
||||
ret = graph_rewrite(UOp.sink(*ctx.assigns), pm_replace_buf, ctx=ctx, name="replace bufs").call(*ctx.replacements)
|
||||
ret = graph_rewrite(UOp.sink(*ctx.assigns), pm_replace_buf, ctx=ctx, bottom_up=True, name="replace bufs").call(*ctx.replacements)
|
||||
if VIZ: graph_rewrite(ret, PatternMatcher([]), name="View Call")
|
||||
return ret, ctx.buffer_map
|
||||
|
||||
@@ -663,9 +663,12 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
|
||||
from tinygrad.uop.symbolic import symbolic
|
||||
out = graph_rewrite(self._mop(Ops.RESHAPE, (self.size,)).index(UOp.range(self.size, 0)), pm_mops+symbolic, name="contiguous_view_offset")
|
||||
if out.op is not Ops.INDEX: return None
|
||||
if out.src[1].op is Ops.CONST and self.size == 1: return (1, out.src[1].arg)
|
||||
if out.src[1].op is Ops.CONST and self.size == 1:
|
||||
if not isinstance(out.src[1].arg, int): return None # masked/padded regions produce InvalidType
|
||||
return (1, out.src[1].arg)
|
||||
if out.src[1].op is Ops.RANGE: return (self.size, 0)
|
||||
if out.src[1].op is Ops.ADD and out.src[1].src[0].op is Ops.RANGE and out.src[1].src[1].op is Ops.CONST:
|
||||
if not isinstance(out.src[1].src[1].arg, int): return None # masked/padded regions produce InvalidType
|
||||
return (self.size, out.src[1].src[1].arg)
|
||||
return None
|
||||
|
||||
@@ -677,7 +680,7 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
|
||||
@property
|
||||
def buffer(self) -> Buffer|MultiBuffer:
|
||||
from tinygrad.device import Buffer, MultiBuffer
|
||||
if self.op in {Ops.CONTIGUOUS, Ops.RESHAPE}: return self.src[0].buffer
|
||||
if self.op in {Ops.CONTIGUOUS, Ops.RESHAPE, Ops.DETACH}: return self.src[0].buffer
|
||||
# this buffer can process disk tensors and simple movement ops
|
||||
if self is not self.base:
|
||||
size_offset = self.contiguous_view_offset()
|
||||
|
||||
Reference in New Issue
Block a user