benchmark single kernel launch (#8921)

* benchmark kernel launch

* don't realize unneeded

* faster

* faster metal

* fix mypy

* without sync

* no div 0

* lru cache that

* no sync in the profile
This commit is contained in:
George Hotz
2025-02-06 13:35:34 +08:00
committed by GitHub
parent 3e082d4a9d
commit a8e54df363
6 changed files with 54 additions and 9 deletions

View File

@@ -0,0 +1,38 @@
import time
from tinygrad import Tensor, TinyJit, Device, Context
from tinygrad.helpers import Profiling, Timing, GlobalCounters
# python3 test/test_speed_v_torch.py TestSpeed.test_add_a
@TinyJit
def plus(a:Tensor, b:Tensor):
  """Return the element-wise sum of `a` and `b`.

  Wrapped in TinyJit so the repeated calls made by the benchmark below go
  through the JIT-decorated callable rather than a plain Python function.
  """
  total = a+b
  return total
# Script entry point: micro-benchmark of a single jitted add (`plus`) measured several ways.
# NOTE(review): leading indentation is not visible in this view, so the exact extent of each
# loop / with-block below cannot be read off here -- confirm against the real file.
if __name__ == "__main__":
# Two one-element inputs, realized before any timing starts.
a = Tensor([1]).realize()
b = Tensor([1]).realize()
# Phase 1: five calls wrapped in the Timing helper (prefixed with the iteration index),
# each followed by a device synchronize. Presumably these also act as JIT warm-up -- confirm.
for i in range(5):
with Timing(prefix=f"{i}:"):
c = plus(a,b)
Device[c.device].synchronize()
# Sanity check on the result: 1 + 1 must read back as 2.
assert c.item() == 2
# Phase 2 ("nosync"): the stop timestamp is taken right after plus() returns, before any
# synchronize, so this reports only the host-side time of the call, in microseconds.
for i in range(5):
st = time.perf_counter()
c = plus(a,b)
et = time.perf_counter() - st
print(f"nosync {i}: {et*1e6:.2f} us")
# Synchronize on the device that holds the result before the next measurement begins.
Device[c.device].synchronize()
# Phase 3 ("precise"): the synchronize sits between the start and stop timestamps, so the
# reported microseconds include waiting for the device to finish.
for i in range(5):
st = time.perf_counter()
c = plus(a,b)
Device[c.device].synchronize()
et = time.perf_counter() - st
print(f"precise {i}: {et*1e6:.2f} us")
# No kernel time is expected to have been accumulated up to this point.
# NOTE(review): presumably time_sum_s is only populated at higher DEBUG levels -- confirm.
assert GlobalCounters.time_sum_s == 0
# Phase 4: one synchronized call under DEBUG=2, then compare the accumulated
# GlobalCounters.time_sum_s against the wall-clock time of the whole call.
with Context(DEBUG=2):
st = time.perf_counter()
c = plus(a,b)
Device[c.device].synchronize()
et = time.perf_counter() - st
# The 1e-12 added to the denominator guards the ratio against division by zero.
print(f"kernel {GlobalCounters.time_sum_s*1e3:.2f} ms / full {et*1e3:.2f} ms -- {et/(GlobalCounters.time_sum_s+1e-12):.2f} x")
# Phase 5: run one more call inside the Profiling helper; no synchronize is issued here.
with Profiling():
c = plus(a,b)

View File

@@ -202,8 +202,12 @@ class TestSpeed(unittest.TestCase):
def f(a, b): return (a*b).sum()
helper_test_generic_square('mul_sum', 4096, f, f)
def test_add(self):
for N in [1, 1024, 4096]:
# Benchmarks `a + b` through helper_test_generic_square with the label 'add' and a size
# argument of 1. The same function f is passed for both function arguments
# (presumably one per framework being compared -- confirm against the helper).
def test_add_a(self):
def f(a, b): return a + b
helper_test_generic_square('add', 1, f, f)
def test_add_big(self):
for N in [1024, 4096]:
def f(a, b): return a + b
helper_test_generic_square('add', N, f, f)