From c38b7684dc7da212dbcddf235dfac35788df9ca8 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Sat, 29 Nov 2025 10:15:22 -0800
Subject: [PATCH] improve microbenchmarks (#13492)

* improve microbenchmarks

* bugfix + ubench

* lil

* no src in const method
---
 test/unit/test_microbenchmarks.py | 88 ++++++++++++++++++++++++-------
 tinygrad/dtype.py                 |  3 +-
 tinygrad/uop/ops.py               | 10 ++--
 tinygrad/uop/symbolic.py          |  3 +-
 4 files changed, 78 insertions(+), 26 deletions(-)

diff --git a/test/unit/test_microbenchmarks.py b/test/unit/test_microbenchmarks.py
index e90a729e95..2efb64d211 100644
--- a/test/unit/test_microbenchmarks.py
+++ b/test/unit/test_microbenchmarks.py
@@ -1,51 +1,99 @@
 import unittest, time
+from tinygrad import dtypes, Tensor, UOp, getenv
 from tinygrad.helpers import Profiling
-from tinygrad.uop.ops import UOp
-from tinygrad.dtype import dtypes
-# it's about 1 ms per 1k UOps on M3
-N = 10000
+PYPROFILE = getenv("PYPROFILE")
 
+class TestBench(unittest.TestCase):
+  @staticmethod
+  def setUpClass():
+    # no fixed cost
+    Tensor.empty(10,10)
+    Tensor.randn(10,10)
 
-class TestMicrobenchmarks(unittest.TestCase):
+  def start_time(self): self.st = time.perf_counter()
   def setUp(self):
-    self.st = time.perf_counter()
+    # it's about 1 ms per 1k UOps on M3
+    if PYPROFILE:
+      self.prof = Profiling()
+      self.prof.__enter__()
+    else:
+      self.prof = None
+    self.N = 10000
+    self.start_time()
+
   def tearDown(self):
     et = (time.perf_counter() - self.st)
-    print(f"{self._testMethodName} {et*1e3:.2f} ms")
+    if self.prof is not None: self.prof.__exit__()
+    print(f"{self._testMethodName:30s} {et*1e6/self.N:.2f} us")
 
   def test_uop_instant_creation(self):
-    for i in range(N): UOp.const(dtypes.int, 100+i)
+    for i in range(self.N): UOp.const(dtypes.int, 100+i)
 
   def test_uop_list_creation(self):
-    [UOp.const(dtypes.int, 100+i) for i in range(N)]
+    [UOp.const(dtypes.int, 100+i) for i in range(self.N)]
 
   def test_uop_add_2n(self):
     a = UOp.const(dtypes.int, 2)
-    for _ in range(N): a = a + a
+    for _ in range(self.N): a = a + a
 
   def test_uop_toposort(self):
     a = UOp.const(dtypes.int, 0)
-    for i in range(N): a = a + UOp.const(dtypes.int, 100+i)
-    self.setUp()
-    self.assertEqual(len(a.toposort()), 2*N+1)
+    for i in range(self.N): a = a + UOp.const(dtypes.int, 100+i)
+    self.start_time()
+    self.assertEqual(len(a.toposort()), 2*self.N+1)
 
   def test_uop_toposort_2n(self):
     a = UOp.const(dtypes.int, 0)
-    for i in range(N): a = a + a
-    self.setUp()
-    self.assertEqual(len(a.toposort()), N+1)
+    for _ in range(self.N): a = a + a
+    self.start_time()
+    self.assertEqual(len(a.toposort()), self.N+1)
 
   def test_uop_simplify(self):
     a = UOp.const(dtypes.int, 2)
-    for _ in range(N): (a+a).simplify()
+    for _ in range(self.N): (a+a).simplify()
 
-class TestMicroprofile(unittest.TestCase):
   def test_uop_simplify_complex(self):
+    self.N //= 10 # this test is slow
     x = UOp.variable("x", 0, 10)
     y = UOp.variable("y", 0, 10)
     expr = (x*2)+5+(x*4)+(y*2)+y
-    with Profiling():
-      for _ in range(1000): expr.simplify()
+    for _ in range(self.N): expr.simplify()
+
+  def test_uop_simplify_div(self):
+    self.N //= 10 # this test is slow
+    x = UOp.variable("x", 0, 10)
+    y = UOp.variable("y", 0, 10)
+    z = UOp.variable("z", 0, 10)
+    expr = (x*4+y*8)//(z*2)
+    for _ in range(self.N): expr.simplify()
+
+  def test_uop_chain_free(self):
+    a = UOp.const(dtypes.int, 2)
+    for _ in range(self.N): a = a + a
+    self.start_time()
+    del a
+
+  def test_tensor_zeros(self):
+    self.N //= 10 # this test is slow
+    for _ in range(self.N): Tensor.zeros(10, 10)
+
+  def test_tensor_add(self):
+    self.N //= 10 # this test is slow
+    a = Tensor.zeros(10, 10)
+    b = Tensor.zeros(10, 10)
+    for _ in range(self.N): a+b
+
+  def test_tensor_empty(self):
+    self.N //= 10 # this test is slow
+    for _ in range(self.N): Tensor.empty(10, 10)
+
+  def test_tensor_rand(self):
+    self.N //= 100 # this test is very slow
+    for _ in range(self.N): Tensor.rand(10, 10)
+
+  def test_tensor_randn(self):
+    self.N //= 100 # this test is very slow
+    for _ in range(self.N): Tensor.randn(10, 10)
 
 if __name__ == '__main__':
   unittest.main()
diff --git a/tinygrad/dtype.py b/tinygrad/dtype.py
index 09d00bab0c..6cf7aea272 100644
--- a/tinygrad/dtype.py
+++ b/tinygrad/dtype.py
@@ -108,7 +108,7 @@ class dtypes:
   def is_float(x: DType) -> bool: return x.scalar() in dtypes.floats or isinstance(x, ImageDType)
   @staticmethod # static methods on top, or bool in the type info will refer to dtypes.bool
   @functools.cache
-  def is_int(x: DType) -> bool: return x.scalar() in dtypes.ints + (dtypes.index,)
+  def is_int(x: DType) -> bool: return x.scalar() in dtypes.index_like
   @staticmethod
   @functools.cache
   def is_unsigned(x: DType) -> bool: return x.scalar() in dtypes.uints
@@ -185,6 +185,7 @@ class dtypes:
   uints = (uint8, uint16, uint32, uint64)
   sints = (int8, int16, int32, int64)
   ints = uints + sints
+  index_like = ints + (index,)
   all = floats + ints + (bool, index) # noqa: A003
 
 if (env_default_float := getenv("DEFAULT_FLOAT", "")):
diff --git a/tinygrad/uop/ops.py b/tinygrad/uop/ops.py
index 7041bb7e3b..cb167ff487 100644
--- a/tinygrad/uop/ops.py
+++ b/tinygrad/uop/ops.py
@@ -429,12 +429,14 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
     if op in {Ops.CMPLT, Ops.CMPNE, Ops.CMPEQ}: out_dtype = dtypes.bool.vec(out_dtype.count) if out_dtype.count > 1 else dtypes.bool
     return UOp(op, out_dtype, (self,)+src, **kwargs)
   @staticmethod
-  def const(dtype:DType, b:ConstLike, device:str|tuple[str, ...]|None=None, shape:tuple[sint, ...]|None=None, src=None, unique:bool|int=False):
+  def const(dtype:DType, b:ConstLike, device:str|tuple[str, ...]|None=None, shape:tuple[sint, ...]|None=None, unique:bool|int=False):
     if isinstance(b, UOp): return b.unbind()[0] if b.op is Ops.BIND else b
-    if isinstance(b, tuple) and all_same(b): b = b[0] # doesn't have to be a VCONST if they are all the same
+    if isinstance(b, tuple) and all_same(b):
+      assert len(b) > 0, "can't create const from empty tuple"
+      b = b[0] # doesn't have to be a VCONST if they are all the same
     # NOTE: float('nan') != float('nan'), so we canonicalize here
     if isinstance(b, float) and math.isnan(b): b = math.nan
-    ret = UOp(Ops.VCONST if isinstance(b, tuple) else Ops.CONST, dtype, arg=dtypes.as_const(b, dtype), src=() if src is None else (src,))
+    ret = UOp(Ops.VCONST if isinstance(b, tuple) else Ops.CONST, dtype, arg=dtypes.as_const(b, dtype))
     if device is not None:
       if unique or not isinstance(unique, bool): ret = ret.replace(src=(UOp(Ops.DEVICE, arg=device), UOp.unique(None if unique is True else unique)))
       else: ret = ret.replace(src=(UOp(Ops.DEVICE, arg=device),))
@@ -572,7 +574,7 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
       else: usrcs.append(UOp(Ops.VECTORIZE, dtypes.index.vec(len(arg)), tuple(UOp.const(dtypes.index, x) if isinstance(x, int) else x for x in arg)))
     if len(usrcs) == 0: ret = UOp(op, self.dtype, (self,), arg)
     else: ret = UOp(op, self.dtype, (self,)+UOp.sink(*usrcs).simplify().src)
-    # for all movement ops, we check shape property
+    # for all movement ops, we check shape property to validity-check the movement op
     if ret.shape == self.shape and same_shape_noop: return self
     return ret
diff --git a/tinygrad/uop/symbolic.py b/tinygrad/uop/symbolic.py
index ea230b13b2..68dfbb0b7c 100644
--- a/tinygrad/uop/symbolic.py
+++ b/tinygrad/uop/symbolic.py
@@ -255,7 +255,8 @@ symbolic = symbolic_simple+commutative+PatternMatcher([
   # after with 1 src is just src[0]
   (UPat(Ops.AFTER, src=(UPat.var("s"),)), lambda s: s),
   # VECTORIZE/CONST
-  (UPat(Ops.VECTORIZE, src=UPat(Ops.CONST), name="vec"), lambda vec: UOp.const(vec.dtype, tuple(x.arg for x in vec.src))),
+  (UPat(Ops.VECTORIZE, src=UPat(Ops.CONST), name="vec"),
+   lambda vec: UOp.const(vec.dtype, tuple(x.arg for x in vec.src)) if len(vec.src) > 0 else None),
 ])+div_and_mod_symbolic+gep_pushing
 
 # ******** we take a small aside to "simplify_valid" to rewrite valids ********
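
For reference, the per-op timing pattern the new harness converges on reduces to the
standalone sketch below (a sketch, assuming a tinygrad checkout with this patch applied;
per the setUp comment, expect on the order of 1 us per UOp on an M3):

    import time
    from tinygrad import dtypes, UOp

    N = 10000
    st = time.perf_counter()
    for i in range(N): UOp.const(dtypes.int, 100+i)
    et = time.perf_counter() - st
    # per-iteration cost in microseconds, the same figure tearDown now reports
    print(f"uop_instant_creation {et*1e6/N:.2f} us")

To get a Python-level profile of a single benchmark instead of a timing, the new
PYPROFILE switch wraps the test body in Profiling, e.g. something like:
PYPROFILE=1 python3 test/unit/test_microbenchmarks.py TestBench.test_uop_simplify_complex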
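
The non-benchmark changes are a small refactor plus two empty-tuple guards, which can be
sanity-checked directly (again a sketch, not part of the diff; assumes the patch is applied):

    from tinygrad import dtypes, UOp

    # is_int now routes through the new dtypes.index_like tuple, i.e. ints + (index,)
    assert dtypes.index in dtypes.index_like
    assert dtypes.is_int(dtypes.index) and not dtypes.is_int(dtypes.float32)

    # UOp.const now rejects an empty tuple, and the VECTORIZE/CONST rewrite in
    # symbolic.py likewise declines to fold a zero-src VECTORIZE
    try: UOp.const(dtypes.int, ())
    except AssertionError as e: print(e)  # can't create const from empty tuple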