pmatmul example + GB/s bugfix [run_process_replay] (#5974)

* pmatmul example + bugfix
* improve pmatmul
* Update real_pmatmul.py
2026-01-08 22:48:25 -05:00 · 2024-08-07 22:32:11 -07:00
parent c5baa3d66b
commit bc55c8a30e
2 changed files with 22 additions and 2 deletions
--- a/extra/gemm/real_pmatmul.py
+++ b/extra/gemm/real_pmatmul.py
@@ -0,0 +1,20 @@
+import time
+from tinygrad import Tensor, Device, TinyJit
+from tinygrad.helpers import getenv
+
+if __name__ == "__main__":  # times a JIT-wrapped NxN matmul whose operands are sharded across several NV devices
+  DEVS = [f"NV:{i}" for i in range(getenv("GPUS", 2))]  # GPUS env var sets the device count (default 2)
+  N = getenv("N", 8192)  # square matrix side length, overridable via the N env var
+  A = Tensor.rand(N, N).shard(DEVS, 0).realize()  # A is sharded along axis 0
+  B = Tensor.rand(N, N).shard(DEVS, 1).realize()  # B is sharded along axis 1
+  print("***** MUL *****")
+  jmatmul = TinyJit(Tensor.dot)
+  for _ in range(10):
+    # drain every device in DEVS (not just NV:0/NV:1) so the timer starts idle for any GPUS value
+    for d in DEVS: Device[d].synchronize()
+    st = time.perf_counter()
+    jmatmul(A, B)
+    # wait on all devices before stopping the clock; stopping early would overstate TFLOPS
+    for d in DEVS: Device[d].synchronize()
+    et = time.perf_counter()
+    print(f"{(N*N*N*2*1e-12)/(et-st):.2f} TFLOPS")  # counts 2*N^3 floating-point ops per matmul
--- a/tinygrad/engine/realize.py
+++ b/tinygrad/engine/realize.py
@@ -66,9 +66,9 @@ def get_kernel(renderer:Renderer, ast:LazyOp) -> Kernel:
 # **************** Runners ****************

 class Runner:
-  def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0, lds_estimate:sint=0):
+  def __init__(self, display_name:str, dname:str, op_estimate:sint=0, mem_estimate:sint=0, lds_estimate:Optional[sint]=None):
    self.first_run, self.display_name, self.dname, self.op_estimate, self.mem_estimate, self.lds_estimate = \
-      True, display_name, dname, op_estimate, mem_estimate, lds_estimate
+      True, display_name, dname, op_estimate, mem_estimate, mem_estimate if lds_estimate is None else lds_estimate
  @property
  def device(self): return Device[self.dname]
  def exec(self, rawbufs:List[Buffer], var_vals:Optional[Dict[Variable, int]]=None) -> Optional[float]: