mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
Merge branch 'master' into test_rewrite_map
This commit is contained in:
@@ -240,6 +240,8 @@ class TestMultiConstFolding(unittest.TestCase):
|
||||
_check_ast_count(0, t ** 1)
|
||||
_check_ast_count(0, 1 ** t)
|
||||
|
||||
# failing because multi calls .contiguous() on every single sharded uop
|
||||
@unittest.expectedFailure
|
||||
def test_multi_const_folding_tensor(self):
|
||||
ds = tuple(f"{Device.DEFAULT}:{i}" for i in range(4))
|
||||
t = Tensor.arange(16).float().realize().to(ds)
|
||||
|
||||
@@ -5,7 +5,7 @@ from tinygrad.ops import Ops
|
||||
from tinygrad.helpers import CI, getenv, prod, Context
|
||||
from tinygrad.nn.state import get_parameters, get_state_dict
|
||||
from tinygrad.engine.schedule import create_schedule
|
||||
from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner
|
||||
from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
|
||||
from tinygrad.multi import all_reduce, MultiLazyBuffer
|
||||
import numpy as np
|
||||
from hypothesis import given, strategies as strat, settings
|
||||
@@ -704,6 +704,19 @@ class TestMultiTensor(unittest.TestCase):
|
||||
t = Tensor.rand(16, 16).shard(devices_2, axis=0)
|
||||
np.testing.assert_allclose(t.numpy(), t.clone().numpy())
|
||||
|
||||
def test_multi_const_folding(self):
|
||||
with Context(TRACK_MATCH_STATS=0):
|
||||
a = Tensor.arange(3).realize()
|
||||
zeros = Tensor.zeros(3).realize()
|
||||
b = a.to(devices_2)*zeros.to(devices_2)
|
||||
sched = b.schedule()
|
||||
self.assertEqual(len(sched), 6)
|
||||
# notably, only two copies (for the arange) - vs 4 copies if we didn't fold the const copy
|
||||
self.assertEqual(len([x for x in sched if any(u.op is Ops.COPY for u in x.ast.toposort)]), 2)
|
||||
# all these kernels are just because multi calls contiguous on every single shard
|
||||
run_schedule(sched)
|
||||
self.assertListEqual(b.tolist(), [0, 0, 0])
|
||||
|
||||
@unittest.skipIf(CI and Device.DEFAULT in ("GPU", "CUDA", "METAL"), "no GPU CI")
|
||||
class TestHandleData(unittest.TestCase):
|
||||
def test_copied_to_device(self):
|
||||
|
||||
@@ -2163,5 +2163,19 @@ class TestConst(unittest.TestCase):
|
||||
run_schedule(sched, var_vals)
|
||||
self.assertEqual(a.tolist(), 3)
|
||||
|
||||
@unittest.skipIf(Device.DEFAULT == "CLANG", "tests copy from another device to clang")
|
||||
class TestCopyFolding(unittest.TestCase):
|
||||
def test_const_copy_is_free(self):
|
||||
b = Tensor(1).to("CLANG")
|
||||
check_schedule(b, 0, filter_sink=False)
|
||||
assert b.item() == 1
|
||||
|
||||
def test_late_const_copy_folding(self):
|
||||
a = Tensor.arange(3).realize()
|
||||
zeros = Tensor.zeros(3).realize()
|
||||
b = (a*zeros).to("CLANG")
|
||||
run_schedule(check_schedule(b, 0, filter_sink=False))
|
||||
self.assertListEqual(b.tolist(), [0, 0, 0])
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main(verbosity=2)
|
||||
|
||||
@@ -66,7 +66,8 @@ class MultiLazyBuffer(MathTrait):
|
||||
assert (axis is None) == (bounds is None), "must specify bounds iff axis is specified"
|
||||
lbs = [lb] * len(devices)
|
||||
sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(to_sharded(lbs, axis, bounds) if axis is not None and bounds is not None else lbs, devices)]
|
||||
return MultiLazyBuffer([lb if lb.is_unrealized_unmasked_const() else lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
|
||||
# NOTE: this contiguous is making it impossible for the scheduler to do late const folding
|
||||
return MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
|
||||
|
||||
def copy_to_device(self, device:str) -> UOp:
|
||||
if self.axis is None:
|
||||
|
||||
@@ -453,8 +453,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
|
||||
def copy_to_device(self, device:str, force=False, clone:bool=False) -> UOp:
|
||||
# no COPY
|
||||
if self.device == device and not clone: return self
|
||||
# TODO: hack const metaop early here, fix this in multi
|
||||
if self.base.op is Ops.CONST: return UOp.metaop(Ops.CONST, (), self.dtype, device, self.const_arg).view(unwrap(self.st))
|
||||
# if it's a shrink, do the shrink before the copy with CONTIGUOUS
|
||||
if prod(self.shape) < prod(self.base.shape): return self.contiguous().copy_to_device(device)
|
||||
# copy the base and apply the shapetracker on the new device
|
||||
|
||||
Reference in New Issue
Block a user