From c38b7684dc7da212dbcddf235dfac35788df9ca8 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Sat, 29 Nov 2025 10:15:22 -0800
Subject: [PATCH] improve microbenchmarks (#13492)

* improve microbenchmarks

* bugfix + ubench

* lil

* no src in const method
---
 test/unit/test_microbenchmarks.py | 88 ++++++++++++++++++++++++-------
 tinygrad/dtype.py                 |  3 +-
 tinygrad/uop/ops.py               | 10 ++--
 tinygrad/uop/symbolic.py          |  3 +-
 4 files changed, 78 insertions(+), 26 deletions(-)

diff --git a/test/unit/test_microbenchmarks.py b/test/unit/test_microbenchmarks.py
index e90a729e95..2efb64d211 100644
--- a/test/unit/test_microbenchmarks.py
+++ b/test/unit/test_microbenchmarks.py
@@ -1,51 +1,99 @@
 import unittest, time
+from tinygrad import dtypes, Tensor, UOp, getenv
 from tinygrad.helpers import Profiling
-from tinygrad.uop.ops import UOp
-from tinygrad.dtype import dtypes
-# it's about 1 ms per 1k UOps on M3
-N = 10000
+PYPROFILE = getenv("PYPROFILE")
 
+class TestBench(unittest.TestCase):
+  @staticmethod
+  def setUpClass():
+    # no fixed cost
+    Tensor.empty(10,10)
+    Tensor.randn(10,10)
 
-class TestMicrobenchmarks(unittest.TestCase):
+  def start_time(self): self.st = time.perf_counter()
   def setUp(self):
-    self.st = time.perf_counter()
+    # it's about 1 ms per 1k UOps on M3
+    if PYPROFILE:
+      self.prof = Profiling()
+      self.prof.__enter__()
+    else:
+      self.prof = None
+    self.N = 10000
+    self.start_time()
+
   def tearDown(self):
     et = (time.perf_counter() - self.st)
-    print(f"{self._testMethodName} {et*1e3:.2f} ms")
+    if self.prof is not None: self.prof.__exit__()
+    print(f"{self._testMethodName:30s} {et*1e6/self.N:.2f} us")
 
   def test_uop_instant_creation(self):
-    for i in range(N): UOp.const(dtypes.int, 100+i)
+    for i in range(self.N): UOp.const(dtypes.int, 100+i)
 
   def test_uop_list_creation(self):
-    [UOp.const(dtypes.int, 100+i) for i in range(N)]
+    [UOp.const(dtypes.int, 100+i) for i in range(self.N)]
 
   def test_uop_add_2n(self):
     a = UOp.const(dtypes.int, 2)
-    for _ in range(N): a = a + a
+    for _ in range(self.N): a = a + a
 
   def test_uop_toposort(self):
     a = UOp.const(dtypes.int, 0)
-    for i in range(N): a = a + UOp.const(dtypes.int, 100+i)
-    self.setUp()
-    self.assertEqual(len(a.toposort()), 2*N+1)
+    for i in range(self.N): a = a + UOp.const(dtypes.int, 100+i)
+    self.start_time()
+    self.assertEqual(len(a.toposort()), 2*self.N+1)
 
   def test_uop_toposort_2n(self):
     a = UOp.const(dtypes.int, 0)
-    for i in range(N): a = a + a
-    self.setUp()
-    self.assertEqual(len(a.toposort()), N+1)
+    for _ in range(self.N): a = a + a
+    self.start_time()
+    self.assertEqual(len(a.toposort()), self.N+1)
 
   def test_uop_simplify(self):
     a = UOp.const(dtypes.int, 2)
-    for _ in range(N): (a+a).simplify()
+    for _ in range(self.N): (a+a).simplify()
 
-class TestMicroprofile(unittest.TestCase):
   def test_uop_simplify_complex(self):
+    self.N //= 10 # this test is slow
     x = UOp.variable("x", 0, 10)
     y = UOp.variable("y", 0, 10)
     expr = (x*2)+5+(x*4)+(y*2)+y
-    with Profiling():
-      for _ in range(1000): expr.simplify()
+    for _ in range(self.N): expr.simplify()
+
+  def test_uop_simplify_div(self):
+    self.N //= 10 # this test is slow
+    x = UOp.variable("x", 0, 10)
+    y = UOp.variable("y", 0, 10)
+    z = UOp.variable("z", 0, 10)
+    expr = (x*4+y*8)//(z*2)
+    for _ in range(self.N): expr.simplify()
+
+  def test_uop_chain_free(self):
+    a = UOp.const(dtypes.int, 2)
+    for _ in range(self.N): a = a + a
+    self.start_time()
+    del a
+
+  def test_tensor_zeros(self):
+    self.N //= 10 # this test is slow
+    for _ in range(self.N): Tensor.zeros(10, 10)
+
+  def test_tensor_add(self):
+    self.N //= 10 # this test is slow
+    a = Tensor.zeros(10, 10)
+    b = Tensor.zeros(10, 10)
+    for _ in range(self.N): a+b
+
+  def test_tensor_empty(self):
+    self.N //= 10 # this test is slow
+    for _ in range(self.N): Tensor.empty(10, 10)
+
+  def test_tensor_rand(self):
+    self.N //= 100 # this test is very slow
+    for _ in range(self.N): Tensor.rand(10, 10)
+
+  def test_tensor_randn(self):
+    self.N //= 100 # this test is very slow
+    for _ in range(self.N): Tensor.randn(10, 10)
 
 if __name__ == '__main__':
   unittest.main()
diff --git a/tinygrad/dtype.py b/tinygrad/dtype.py
index 09d00bab0c..6cf7aea272 100644
--- a/tinygrad/dtype.py
+++ b/tinygrad/dtype.py
@@ -108,7 +108,7 @@ class dtypes:
   def is_float(x: DType) -> bool: return x.scalar() in dtypes.floats or isinstance(x, ImageDType)
   @staticmethod # static methods on top, or bool in the type info will refer to dtypes.bool
   @functools.cache
-  def is_int(x: DType) -> bool: return x.scalar() in dtypes.ints + (dtypes.index,)
+  def is_int(x: DType) -> bool: return x.scalar() in dtypes.index_like
   @staticmethod
   @functools.cache
   def is_unsigned(x: DType) -> bool: return x.scalar() in dtypes.uints
@@ -185,6 +185,7 @@ class dtypes:
   uints = (uint8, uint16, uint32, uint64)
   sints = (int8, int16, int32, int64)
   ints = uints + sints
+  index_like = ints + (index,)
   all = floats + ints + (bool, index) # noqa: A003
 
 if (env_default_float := getenv("DEFAULT_FLOAT", "")):
diff --git a/tinygrad/uop/ops.py b/tinygrad/uop/ops.py
index 7041bb7e3b..cb167ff487 100644
--- a/tinygrad/uop/ops.py
+++ b/tinygrad/uop/ops.py
@@ -429,12 +429,14 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
     if op in {Ops.CMPLT, Ops.CMPNE, Ops.CMPEQ}: out_dtype = dtypes.bool.vec(out_dtype.count) if out_dtype.count > 1 else dtypes.bool
     return UOp(op, out_dtype, (self,)+src, **kwargs)
   @staticmethod
-  def const(dtype:DType, b:ConstLike, device:str|tuple[str, ...]|None=None, shape:tuple[sint, ...]|None=None, src=None, unique:bool|int=False):
+  def const(dtype:DType, b:ConstLike, device:str|tuple[str, ...]|None=None, shape:tuple[sint, ...]|None=None, unique:bool|int=False):
     if isinstance(b, UOp): return b.unbind()[0] if b.op is Ops.BIND else b
-    if isinstance(b, tuple) and all_same(b): b = b[0] # doesn't have to be a VCONST if they are all the same
+    if isinstance(b, tuple) and all_same(b):
+      assert len(b) > 0, "can't create const from empty tuple"
+      b = b[0] # doesn't have to be a VCONST if they are all the same
     # NOTE: float('nan') != float('nan'), so we canonicalize here
     if isinstance(b, float) and math.isnan(b): b = math.nan
-    ret = UOp(Ops.VCONST if isinstance(b, tuple) else Ops.CONST, dtype, arg=dtypes.as_const(b, dtype), src=() if src is None else (src,))
+    ret = UOp(Ops.VCONST if isinstance(b, tuple) else Ops.CONST, dtype, arg=dtypes.as_const(b, dtype))
     if device is not None:
       if unique or not isinstance(unique, bool): ret = ret.replace(src=(UOp(Ops.DEVICE, arg=device), UOp.unique(None if unique is True else unique)))
       else: ret = ret.replace(src=(UOp(Ops.DEVICE, arg=device),))
@@ -572,7 +574,7 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
       else: usrcs.append(UOp(Ops.VECTORIZE, dtypes.index.vec(len(arg)), tuple(UOp.const(dtypes.index, x) if isinstance(x, int) else x for x in arg)))
     if len(usrcs) == 0: ret = UOp(op, self.dtype, (self,), arg)
     else: ret = UOp(op, self.dtype, (self,)+UOp.sink(*usrcs).simplify().src)
-    # for all movement ops, we check shape property
+    # for all movement ops, we check shape property to validity-check the movement op
     if ret.shape == self.shape and same_shape_noop: return self
     return ret
diff --git a/tinygrad/uop/symbolic.py b/tinygrad/uop/symbolic.py
index ea230b13b2..68dfbb0b7c 100644
--- a/tinygrad/uop/symbolic.py
+++ b/tinygrad/uop/symbolic.py
@@ -255,7 +255,8 @@ symbolic = symbolic_simple+commutative+PatternMatcher([
   # after with 1 src is just src[0]
   (UPat(Ops.AFTER, src=(UPat.var("s"),)), lambda s: s),
   # VECTORIZE/CONST
-  (UPat(Ops.VECTORIZE, src=UPat(Ops.CONST), name="vec"), lambda vec: UOp.const(vec.dtype, tuple(x.arg for x in vec.src))),
+  (UPat(Ops.VECTORIZE, src=UPat(Ops.CONST), name="vec"),
+   lambda vec: UOp.const(vec.dtype, tuple(x.arg for x in vec.src)) if len(vec.src) > 0 else None),
 ])+div_and_mod_symbolic+gep_pushing
 
 # ******** we take a small aside to "simplify_valid" to rewrite valids ********
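
For reference, the per-op timing pattern the new harness converges on reduces to the
standalone sketch below (a sketch, assuming a tinygrad checkout with this patch applied;
per the setUp comment, expect on the order of 1 us per UOp on an M3):

    import time
    from tinygrad import dtypes, UOp

    N = 10000
    st = time.perf_counter()
    for i in range(N): UOp.const(dtypes.int, 100+i)
    et = time.perf_counter() - st
    # per-iteration cost in microseconds, the same figure tearDown now reports
    print(f"uop_instant_creation {et*1e6/N:.2f} us")

To get a Python-level profile of a single benchmark instead of a timing, the new
PYPROFILE switch wraps the test body in Profiling, e.g. something like:
PYPROFILE=1 python3 test/unit/test_microbenchmarks.py TestBench.test_uop_simplify_complex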
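
The non-benchmark changes are a small refactor plus two empty-tuple guards, which can be
sanity-checked directly (again a sketch, not part of the diff; assumes the patch is applied):

    from tinygrad import dtypes, UOp

    # is_int now routes through the new dtypes.index_like tuple, i.e. ints + (index,)
    assert dtypes.index in dtypes.index_like
    assert dtypes.is_int(dtypes.index) and not dtypes.is_int(dtypes.float32)

    # UOp.const now rejects an empty tuple, and the VECTORIZE/CONST rewrite in
    # symbolic.py likewise declines to fold a zero-src VECTORIZE
    try: UOp.const(dtypes.int, ())
    except AssertionError as e: print(e)  # can't create const from empty tuple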