Merge branch 'master' into test_rewrite_map

This commit is contained in:
George Hotz
2024-12-29 10:34:07 -05:00
committed by GitHub
5 changed files with 32 additions and 4 deletions

View File

@@ -240,6 +240,8 @@ class TestMultiConstFolding(unittest.TestCase):
_check_ast_count(0, t ** 1)
_check_ast_count(0, 1 ** t)
# failing because multi calls .contiguous() on every single sharded uop
@unittest.expectedFailure
def test_multi_const_folding_tensor(self):
ds = tuple(f"{Device.DEFAULT}:{i}" for i in range(4))
t = Tensor.arange(16).float().realize().to(ds)

View File

@@ -5,7 +5,7 @@ from tinygrad.ops import Ops
from tinygrad.helpers import CI, getenv, prod, Context
from tinygrad.nn.state import get_parameters, get_state_dict
from tinygrad.engine.schedule import create_schedule
from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner
from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
from tinygrad.multi import all_reduce, MultiLazyBuffer
import numpy as np
from hypothesis import given, strategies as strat, settings
@@ -704,6 +704,19 @@ class TestMultiTensor(unittest.TestCase):
t = Tensor.rand(16, 16).shard(devices_2, axis=0)
np.testing.assert_allclose(t.numpy(), t.clone().numpy())
def test_multi_const_folding(self):
with Context(TRACK_MATCH_STATS=0):
a = Tensor.arange(3).realize()
zeros = Tensor.zeros(3).realize()
b = a.to(devices_2)*zeros.to(devices_2)
sched = b.schedule()
self.assertEqual(len(sched), 6)
# notably, only two copies (for the arange) - vs 4 copies if we didn't fold the const copy
self.assertEqual(len([x for x in sched if any(u.op is Ops.COPY for u in x.ast.toposort)]), 2)
# all these kernels are just because multi calls contiguous on every single shard
run_schedule(sched)
self.assertListEqual(b.tolist(), [0, 0, 0])
@unittest.skipIf(CI and Device.DEFAULT in ("GPU", "CUDA", "METAL"), "no GPU CI")
class TestHandleData(unittest.TestCase):
def test_copied_to_device(self):

View File

@@ -2163,5 +2163,19 @@ class TestConst(unittest.TestCase):
run_schedule(sched, var_vals)
self.assertEqual(a.tolist(), 3)
@unittest.skipIf(Device.DEFAULT == "CLANG", "tests copy from another device to clang")
class TestCopyFolding(unittest.TestCase):
def test_const_copy_is_free(self):
b = Tensor(1).to("CLANG")
check_schedule(b, 0, filter_sink=False)
assert b.item() == 1
def test_late_const_copy_folding(self):
a = Tensor.arange(3).realize()
zeros = Tensor.zeros(3).realize()
b = (a*zeros).to("CLANG")
run_schedule(check_schedule(b, 0, filter_sink=False))
self.assertListEqual(b.tolist(), [0, 0, 0])
if __name__ == '__main__':
unittest.main(verbosity=2)

View File

@@ -66,7 +66,8 @@ class MultiLazyBuffer(MathTrait):
assert (axis is None) == (bounds is None), "must specify bounds iff axis is specified"
lbs = [lb] * len(devices)
sharded_lbs = [lb.copy_to_device(d) for lb,d in zip(to_sharded(lbs, axis, bounds) if axis is not None and bounds is not None else lbs, devices)]
return MultiLazyBuffer([lb if lb.is_unrealized_unmasked_const() else lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
# NOTE: this contiguous is making it impossible for the scheduler to do late const folding
return MultiLazyBuffer([lb.contiguous(allow_buffer_view=False) for lb in sharded_lbs], axis)
def copy_to_device(self, device:str) -> UOp:
if self.axis is None:

View File

@@ -453,8 +453,6 @@ class UOp(MathTrait, metaclass=UOpMetaClass):
def copy_to_device(self, device:str, force=False, clone:bool=False) -> UOp:
# no COPY
if self.device == device and not clone: return self
# TODO: hack const metaop early here, fix this in multi
if self.base.op is Ops.CONST: return UOp.metaop(Ops.CONST, (), self.dtype, device, self.const_arg).view(unwrap(self.st))
# if it's a shrink, do the shrink before the copy with CONTIGUOUS
if prod(self.shape) < prod(self.base.shape): return self.contiguous().copy_to_device(device)
# copy the base and apply the shapetracker on the new device