improve microbenchmarks (#13492)

* improve microbenchmarks

* bugfix + ubench

* lil

* no src in const method
This commit is contained in:
George Hotz
2025-11-29 10:15:22 -08:00
committed by GitHub
parent 941597db71
commit c38b7684dc
4 changed files with 78 additions and 26 deletions

View File

@@ -1,51 +1,99 @@
import unittest, time
from tinygrad import dtypes, Tensor, UOp, getenv
from tinygrad.helpers import Profiling
from tinygrad.uop.ops import UOp
from tinygrad.dtype import dtypes
# it's about 1 ms per 1k UOps on M3
N = 10000
PYPROFILE = getenv("PYPROFILE")
# NOTE(review): two class headers appear in this span (TestBench here, TestMicrobenchmarks at the end).
# This text looks like an unmarked diff, so one of them is probably the pre-change name; confirm which is current.
class TestBench(unittest.TestCase):
@staticmethod
def setUpClass():
# Creates one empty and one randn tensor once per class, before any test method runs.
# Presumably this keeps one-time initialization cost out of the per-test timings (see "no fixed cost"); verify.
# no fixed cost
Tensor.empty(10,10)
Tensor.randn(10,10)
class TestMicrobenchmarks(unittest.TestCase):
def start_time(self):
  """Store the current `time.perf_counter()` reading in `self.st` as the start of the timed region."""
  self.st = time.perf_counter()
def setUp(self):
# Per-test setup: optionally enter a Profiling context, reset the iteration count, and start the timer.
# NOTE(review): the direct self.st assignment just below and the self.start_time() call at the end
# look like the old/new versions of the same step in an unmarked diff; confirm which is current.
self.st = time.perf_counter()
# it's about 1 ms per 1k UOps on M3
if PYPROFILE:
# entered manually (not via `with`) because the matching __exit__ is called from tearDown
self.prof = Profiling()
self.prof.__enter__()
else:
self.prof = None
# default iteration count; individual tests scale it down with `self.N //= ...`
self.N = 10000
self.start_time()
def tearDown(self):
# Reports the time elapsed since self.st was last set, and exits the profiler if setUp entered one.
et = (time.perf_counter() - self.st)
# NOTE(review): two report lines follow (total milliseconds, then per-iteration microseconds);
# they look like the old/new versions from an unmarked diff; confirm which is current.
print(f"{self._testMethodName} {et*1e3:.2f} ms")
if self.prof is not None: self.prof.__exit__()
# per-iteration cost: elapsed seconds * 1e6 / self.N, printed in microseconds
print(f"{self._testMethodName:30s} {et*1e6/self.N:.2f} us")
def test_uop_instant_creation(self):
# Benchmark: build int CONST UOps with distinct values (100+i), discarding each one immediately.
# NOTE(review): the two loops differ only in N vs self.N; likely old/new lines of an unmarked diff; confirm.
for i in range(N): UOp.const(dtypes.int, 100+i)
for i in range(self.N): UOp.const(dtypes.int, 100+i)
def test_uop_list_creation(self):
# Benchmark: same constants as test_uop_instant_creation, but collected in a list so all of them
# stay referenced until the comprehension result is dropped.
# NOTE(review): the two comprehensions differ only in N vs self.N; likely old/new lines of an unmarked diff; confirm.
[UOp.const(dtypes.int, 100+i) for i in range(N)]
[UOp.const(dtypes.int, 100+i) for i in range(self.N)]
def test_uop_add_2n(self):
# Benchmark: repeatedly replace `a` with `a + a`, starting from the int constant 2.
# NOTE(review): the two loops differ only in N vs self.N; likely old/new lines of an unmarked diff; confirm.
a = UOp.const(dtypes.int, 2)
for _ in range(N): a = a + a
for _ in range(self.N): a = a + a
def test_uop_toposort(self):
# Benchmark: toposort() of a chain where each step adds a fresh constant to the running sum.
# The timer is restarted after the chain is built, so construction is excluded from the reported time.
# The assertion checks that the toposort holds 2*N+1 entries.
# NOTE(review): the N/self.setUp() lines and the self.N/self.start_time() lines look like the
# old/new versions of the same three statements in an unmarked diff; confirm which is current.
a = UOp.const(dtypes.int, 0)
for i in range(N): a = a + UOp.const(dtypes.int, 100+i)
self.setUp()
self.assertEqual(len(a.toposort()), 2*N+1)
for i in range(self.N): a = a + UOp.const(dtypes.int, 100+i)
self.start_time()
self.assertEqual(len(a.toposort()), 2*self.N+1)
def test_uop_toposort_2n(self):
# Benchmark: toposort() of a chain built by repeatedly replacing `a` with `a + a`.
# The timer is restarted after the chain is built, so construction is excluded from the reported time.
# The assertion checks that the toposort holds N+1 entries.
# NOTE(review): the N/self.setUp() lines and the self.N/self.start_time() lines look like the
# old/new versions of the same three statements in an unmarked diff; confirm which is current.
a = UOp.const(dtypes.int, 0)
for i in range(N): a = a + a
self.setUp()
self.assertEqual(len(a.toposort()), N+1)
for _ in range(self.N): a = a + a
self.start_time()
self.assertEqual(len(a.toposort()), self.N+1)
def test_uop_simplify(self):
# Benchmark: build `a + a` from the int constant 2 and call simplify() on it, once per iteration.
# NOTE(review): the two loops differ only in N vs self.N; likely old/new lines of an unmarked diff; confirm.
a = UOp.const(dtypes.int, 2)
for _ in range(N): (a+a).simplify()
for _ in range(self.N): (a+a).simplify()
# NOTE(review): this class header, the `with Profiling():` line and the fixed range(1000) loop look like
# pre-change lines of an unmarked diff, with the self.N lines as their replacement; confirm which is current.
class TestMicroprofile(unittest.TestCase):
def test_uop_simplify_complex(self):
# Benchmark: repeatedly simplify one prebuilt expression over two variables x and y (each created with bounds 0, 10).
self.N //= 10 # this test is slow
x = UOp.variable("x", 0, 10)
y = UOp.variable("y", 0, 10)
# the expression is built once; only simplify() is repeated
expr = (x*2)+5+(x*4)+(y*2)+y
with Profiling():
for _ in range(1000): expr.simplify()
for _ in range(self.N): expr.simplify()
def test_uop_simplify_div(self):
  """Benchmark simplify() on a floor-division expression over three variables."""
  self.N = self.N // 10  # this test is slow
  # create the variables in the same order as their names: x, y, z
  x, y, z = [UOp.variable(name, 0, 10) for name in ("x", "y", "z")]
  numerator = x*4+y*8
  denominator = z*2
  # the expression is built once; only simplify() is repeated
  expr = numerator//denominator
  for _ in range(self.N):
    expr.simplify()
def test_uop_chain_free(self):
  """Benchmark dropping the local reference to a chain built by self.N self-additions."""
  node = UOp.const(dtypes.int, 2)
  for _ in range(self.N):
    node = node + node
  # restart the clock so that only the `del` below falls inside the timed region
  self.start_time()
  del node
def test_tensor_zeros(self):
  """Benchmark repeated construction of a 10x10 Tensor.zeros."""
  self.N = self.N // 10  # this test is slow
  for _ in range(self.N):
    Tensor.zeros(10, 10)
def test_tensor_add(self):
  """Benchmark repeated addition of two prebuilt 10x10 zeros tensors."""
  self.N = self.N // 10  # this test is slow
  lhs, rhs = Tensor.zeros(10, 10), Tensor.zeros(10, 10)
  for _ in range(self.N):
    # the result is intentionally discarded; only building the add is being timed
    lhs+rhs
def test_tensor_empty(self):
  """Benchmark repeated construction of a 10x10 Tensor.empty."""
  self.N = self.N // 10  # this test is slow
  for _ in range(self.N):
    Tensor.empty(10, 10)
def test_tensor_rand(self):
  """Benchmark repeated construction of a 10x10 Tensor.rand."""
  self.N = self.N // 100  # this test is very slow
  for _ in range(self.N):
    Tensor.rand(10, 10)
def test_tensor_randn(self):
  """Benchmark repeated construction of a 10x10 Tensor.randn."""
  self.N = self.N // 100  # this test is very slow
  for _ in range(self.N):
    Tensor.randn(10, 10)
# Run the unittest runner when this file is executed directly as a script.
if __name__ == '__main__':
unittest.main()

View File

@@ -108,7 +108,7 @@ class dtypes:
def is_float(x: DType) -> bool: return x.scalar() in dtypes.floats or isinstance(x, ImageDType)
@staticmethod # static methods on top, or bool in the type info will refer to dtypes.bool
@functools.cache
def is_int(x: DType) -> bool: return x.scalar() in dtypes.ints + (dtypes.index,)
def is_int(x: DType) -> bool: return x.scalar() in dtypes.index_like
@staticmethod
@functools.cache
def is_unsigned(x: DType) -> bool: return x.scalar() in dtypes.uints
@@ -185,6 +185,7 @@ class dtypes:
uints = (uint8, uint16, uint32, uint64)
sints = (int8, int16, int32, int64)
ints = uints + sints
index_like = ints + (index,)
all = floats + ints + (bool, index) # noqa: A003
if (env_default_float := getenv("DEFAULT_FLOAT", "")):

View File

@@ -429,12 +429,14 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
if op in {Ops.CMPLT, Ops.CMPNE, Ops.CMPEQ}: out_dtype = dtypes.bool.vec(out_dtype.count) if out_dtype.count > 1 else dtypes.bool
return UOp(op, out_dtype, (self,)+src, **kwargs)
@staticmethod
def const(dtype:DType, b:ConstLike, device:str|tuple[str, ...]|None=None, shape:tuple[sint, ...]|None=None, src=None, unique:bool|int=False):
def const(dtype:DType, b:ConstLike, device:str|tuple[str, ...]|None=None, shape:tuple[sint, ...]|None=None, unique:bool|int=False):
if isinstance(b, UOp): return b.unbind()[0] if b.op is Ops.BIND else b
if isinstance(b, tuple) and all_same(b): b = b[0] # doesn't have to be a VCONST if they are all the same
if isinstance(b, tuple) and all_same(b):
assert len(b) > 0, "can't create const from empty tuple"
b = b[0] # doesn't have to be a VCONST if they are all the same
# NOTE: float('nan') != float('nan'), so we canonicalize here
if isinstance(b, float) and math.isnan(b): b = math.nan
ret = UOp(Ops.VCONST if isinstance(b, tuple) else Ops.CONST, dtype, arg=dtypes.as_const(b, dtype), src=() if src is None else (src,))
ret = UOp(Ops.VCONST if isinstance(b, tuple) else Ops.CONST, dtype, arg=dtypes.as_const(b, dtype))
if device is not None:
if unique or not isinstance(unique, bool): ret = ret.replace(src=(UOp(Ops.DEVICE, arg=device), UOp.unique(None if unique is True else unique)))
else: ret = ret.replace(src=(UOp(Ops.DEVICE, arg=device),))
@@ -572,7 +574,7 @@ class UOp(OpMixin, metaclass=UOpMetaClass):
else: usrcs.append(UOp(Ops.VECTORIZE, dtypes.index.vec(len(arg)), tuple(UOp.const(dtypes.index, x) if isinstance(x, int) else x for x in arg)))
if len(usrcs) == 0: ret = UOp(op, self.dtype, (self,), arg)
else: ret = UOp(op, self.dtype, (self,)+UOp.sink(*usrcs).simplify().src)
# for all movement ops, we check shape property
# for all movement ops, we check shape property to validity check the movement op
if ret.shape == self.shape and same_shape_noop: return self
return ret

View File

@@ -255,7 +255,8 @@ symbolic = symbolic_simple+commutative+PatternMatcher([
# after with 1 src is just src[0]
(UPat(Ops.AFTER, src=(UPat.var("s"),)), lambda s: s),
# VECTORIZE/CONST
(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST), name="vec"), lambda vec: UOp.const(vec.dtype, tuple(x.arg for x in vec.src))),
(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST), name="vec"),
lambda vec: UOp.const(vec.dtype, tuple(x.arg for x in vec.src)) if len(vec.src) > 0 else None),
])+div_and_mod_symbolic+gep_pushing
# ******** we take a small aside to "simplify_valid" to rewrite valids ********