improve microbenchmarks (#13492)

* improve microbenchmarks * bugfix + ubench * lil * no src in const method
2026-01-07 22:23:55 -05:00 · 2025-11-29 10:15:22 -08:00
parent 941597db71
commit c38b7684dc
4 changed files with 78 additions and 26 deletions
--- a/test/unit/test_microbenchmarks.py
+++ b/test/unit/test_microbenchmarks.py
@@ -1,51 +1,99 @@
 import unittest, time
+from tinygrad import dtypes, Tensor, UOp, getenv
 from tinygrad.helpers import Profiling
-from tinygrad.uop.ops import UOp
-from tinygrad.dtype import dtypes

-# it's about 1 ms per 1k UOps on M3
-N = 10000
+PYPROFILE = getenv("PYPROFILE")
+class TestBench(unittest.TestCase):
+  @staticmethod
+  def setUpClass():
+    # warm up Tensor creation once here so one-time (fixed) setup cost is not counted in the timed tests
+    Tensor.empty(10,10)
+    Tensor.randn(10,10)

-class TestMicrobenchmarks(unittest.TestCase):
+  def start_time(self): self.st = time.perf_counter()
  def setUp(self):
-    self.st = time.perf_counter()
+    # it's about 1 ms per 1k UOps on M3
+    if PYPROFILE:
+      self.prof = Profiling()
+      self.prof.__enter__()
+    else:
+      self.prof = None
+    self.N = 10000
+    self.start_time()
+
  def tearDown(self):
    et = (time.perf_counter() - self.st)
-    print(f"{self._testMethodName} {et*1e3:.2f} ms")
+    if self.prof is not None: self.prof.__exit__()
+    print(f"{self._testMethodName:30s} {et*1e6/self.N:.2f} us")

  def test_uop_instant_creation(self):
-    for i in range(N): UOp.const(dtypes.int, 100+i)
+    for i in range(self.N): UOp.const(dtypes.int, 100+i)

  def test_uop_list_creation(self):
-    [UOp.const(dtypes.int, 100+i) for i in range(N)]
+    [UOp.const(dtypes.int, 100+i) for i in range(self.N)]

  def test_uop_add_2n(self):
    a = UOp.const(dtypes.int, 2)
-    for _ in range(N): a = a + a
+    for _ in range(self.N): a = a + a

  def test_uop_toposort(self):
    a = UOp.const(dtypes.int, 0)
-    for i in range(N): a = a + UOp.const(dtypes.int, 100+i)
-    self.setUp()
-    self.assertEqual(len(a.toposort()), 2*N+1)
+    for i in range(self.N): a = a + UOp.const(dtypes.int, 100+i)
+    self.start_time()
+    self.assertEqual(len(a.toposort()), 2*self.N+1)

  def test_uop_toposort_2n(self):
    a = UOp.const(dtypes.int, 0)
-    for i in range(N): a = a + a
-    self.setUp()
-    self.assertEqual(len(a.toposort()), N+1)
+    for _ in range(self.N): a = a + a
+    self.start_time()
+    self.assertEqual(len(a.toposort()), self.N+1)

  def test_uop_simplify(self):
    a = UOp.const(dtypes.int, 2)
-    for _ in range(N): (a+a).simplify()
+    for _ in range(self.N): (a+a).simplify()

-class TestMicroprofile(unittest.TestCase):
  def test_uop_simplify_complex(self):
+    self.N //= 10 # this test is slow
    x = UOp.variable("x", 0, 10)
    y = UOp.variable("y", 0, 10)
    expr = (x*2)+5+(x*4)+(y*2)+y
-    with Profiling():
-      for _ in range(1000): expr.simplify()
+    for _ in range(self.N): expr.simplify()
+
+  def test_uop_simplify_div(self):
+    self.N //= 10 # this test is slow
+    x = UOp.variable("x", 0, 10)
+    y = UOp.variable("y", 0, 10)
+    z = UOp.variable("z", 0, 10)
+    expr = (x*4+y*8)//(z*2)
+    for _ in range(self.N): expr.simplify()
+
+  def test_uop_chain_free(self):
+    a = UOp.const(dtypes.int, 2)
+    for _ in range(self.N): a = a + a
+    self.start_time()
+    del a
+
+  def test_tensor_zeros(self):
+    self.N //= 10 # this test is slow
+    for _ in range(self.N): Tensor.zeros(10, 10)
+
+  def test_tensor_add(self):
+    self.N //= 10 # this test is slow
+    a = Tensor.zeros(10, 10)
+    b = Tensor.zeros(10, 10)
+    for _ in range(self.N): a+b
+
+  def test_tensor_empty(self):
+    self.N //= 10 # this test is slow
+    for _ in range(self.N): Tensor.empty(10, 10)
+
+  def test_tensor_rand(self):
+    self.N //= 100 # this test is very slow
+    for _ in range(self.N): Tensor.rand(10, 10)
+
+  def test_tensor_randn(self):
+    self.N //= 100 # this test is very slow
+    for _ in range(self.N): Tensor.randn(10, 10)

 if __name__ == '__main__':
  unittest.main()
--- a/tinygrad/dtype.py
+++ b/tinygrad/dtype.py
@@ -108,7 +108,7 @@ class dtypes:
  def is_float(x: DType) -> bool: return x.scalar() in dtypes.floats or isinstance(x, ImageDType)
  @staticmethod # static methods on top, or bool in the type info will refer to dtypes.bool
  @functools.cache
-  def is_int(x: DType) -> bool: return x.scalar() in dtypes.ints + (dtypes.index,)
+  def is_int(x: DType) -> bool: return x.scalar() in dtypes.index_like
  @staticmethod
  @functools.cache
  def is_unsigned(x: DType) -> bool: return x.scalar() in dtypes.uints
@@ -185,6 +185,7 @@ class dtypes:
  uints = (uint8, uint16, uint32, uint64)
  sints = (int8, int16, int32, int64)
  ints = uints + sints
+  index_like = ints + (index,)
  all = floats + ints + (bool, index) # noqa: A003

 if (env_default_float := getenv("DEFAULT_FLOAT", "")):
--- a/tinygrad/uop/ops.py
+++ b/tinygrad/uop/ops.py
@@ -429,12 +429,14 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
    if op in {Ops.CMPLT, Ops.CMPNE, Ops.CMPEQ}: out_dtype = dtypes.bool.vec(out_dtype.count) if out_dtype.count > 1 else dtypes.bool
    return UOp(op, out_dtype, (self,)+src, **kwargs)
  @staticmethod
-  def const(dtype:DType, b:ConstLike, device:str|tuple[str, ...]|None=None, shape:tuple[sint, ...]|None=None, src=None, unique:bool|int=False):
+  def const(dtype:DType, b:ConstLike, device:str|tuple[str, ...]|None=None, shape:tuple[sint, ...]|None=None, unique:bool|int=False):
    if isinstance(b, UOp): return b.unbind()[0] if b.op is Ops.BIND else b
-    if isinstance(b, tuple) and all_same(b): b = b[0]  # doesn't have to be a VCONST if they are all the same
+    if isinstance(b, tuple) and all_same(b):
+      assert len(b) > 0, "can't create const from empty tuple"
+      b = b[0]  # doesn't have to be a VCONST if they are all the same
    # NOTE: float('nan') != float('nan'), so we canonicalize here
    if isinstance(b, float) and math.isnan(b): b = math.nan
-    ret = UOp(Ops.VCONST if isinstance(b, tuple) else Ops.CONST, dtype, arg=dtypes.as_const(b, dtype), src=() if src is None else (src,))
+    ret = UOp(Ops.VCONST if isinstance(b, tuple) else Ops.CONST, dtype, arg=dtypes.as_const(b, dtype))
    if device is not None:
      if unique or not isinstance(unique, bool): ret = ret.replace(src=(UOp(Ops.DEVICE, arg=device), UOp.unique(None if unique is True else unique)))
      else: ret = ret.replace(src=(UOp(Ops.DEVICE, arg=device),))
@@ -572,7 +574,7 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
      else: usrcs.append(UOp(Ops.VECTORIZE, dtypes.index.vec(len(arg)), tuple(UOp.const(dtypes.index, x) if isinstance(x, int) else x for x in arg)))
    if len(usrcs) == 0: ret = UOp(op, self.dtype, (self,), arg)
    else: ret = UOp(op, self.dtype, (self,)+UOp.sink(*usrcs).simplify().src)
-    # for all movement ops, we check shape property
+    # for all movement ops, we check the shape property to validate the movement op
    if ret.shape == self.shape and same_shape_noop: return self
    return ret

--- a/tinygrad/uop/symbolic.py
+++ b/tinygrad/uop/symbolic.py
@@ -255,7 +255,8 @@ symbolic = symbolic_simple+commutative+PatternMatcher([
  # after with 1 src is just src[0]
  (UPat(Ops.AFTER, src=(UPat.var("s"),)), lambda s: s),
  # VECTORIZE/CONST
-  (UPat(Ops.VECTORIZE, src=UPat(Ops.CONST), name="vec"), lambda vec: UOp.const(vec.dtype, tuple(x.arg for x in vec.src))),
+  (UPat(Ops.VECTORIZE, src=UPat(Ops.CONST), name="vec"),
+    lambda vec: UOp.const(vec.dtype, tuple(x.arg for x in vec.src)) if len(vec.src) > 0 else None),
 ])+div_and_mod_symbolic+gep_pushing

 # ******** we take a small aside to "simplify_valid" to rewrite valids ********