From ac02e7347d98263960f738b34c274c2b7d03d945 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Fri, 8 Mar 2024 10:17:49 -0800
Subject: [PATCH] ptx timing vs cuda timing (#3659)

---
 extra/optimization/helpers.py           |  2 +-
 test/external/speed_compare_cuda_ptx.py | 63 +++++++++++++++++++++++++
 tinygrad/runtime/ops_cuda.py            |  1 +
 3 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 test/external/speed_compare_cuda_ptx.py

diff --git a/extra/optimization/helpers.py b/extra/optimization/helpers.py
index e794bfde46..241bae84c9 100644
--- a/extra/optimization/helpers.py
+++ b/extra/optimization/helpers.py
@@ -9,7 +9,7 @@ inf, nan = float('inf'), float('nan')
 # kernel unpacker
 from tinygrad.codegen.linearizer import Linearizer
 def ast_str_to_ast(ast_str:str) -> LazyOp: return eval(ast_str)
-def ast_str_to_lin(ast_str:str): return Linearizer(ast_str_to_ast(ast_str))
+def ast_str_to_lin(ast_str:str, opts=None): return Linearizer(ast_str_to_ast(ast_str), opts)
 
 # load worlds, a dataset of about 12k kernels
 import gzip
diff --git a/test/external/speed_compare_cuda_ptx.py b/test/external/speed_compare_cuda_ptx.py
new file mode 100644
index 0000000000..f6c833d1fa
--- /dev/null
+++ b/test/external/speed_compare_cuda_ptx.py
@@ -0,0 +1,63 @@
+import itertools
+from tinygrad import Device
+from tinygrad.device import CompiledASTRunner
+from tinygrad.helpers import to_function_name, getenv, colored
+from extra.optimization.helpers import load_worlds, ast_str_to_lin
+from tinygrad.features.search import bufs_from_lin
+from tinygrad.runtime.ops_cuda import PTXCompiler
+
+# move to helpers?
+def colorize_float(x):
+  ret = f"{x:7.2f}x"
+  if x < 0.75: return colored(ret, 'green')
+  elif x > 1.15: return colored(ret, 'red')
+  else: return colored(ret, 'yellow')
+
+if __name__ == "__main__":
+  ast_strs = load_worlds(filter_reduce=False, filter_novariable=True)
+  dev = Device["CUDA"]
+  ptx = PTXCompiler(dev.arch)
+
+  # NUM=112 python3 test/external/speed_compare_cuda_ptx.py
+
+  single = getenv("NUM", -1)
+  if single != -1: ast_strs = ast_strs[single:single+1]
+
+  average_tm_cuda, average_tm_ptx = 0, 0
+  for num,ast in enumerate(ast_strs):
+    # cuda compile
+    lin = ast_str_to_lin(ast, opts=dev.compiler.linearizer_opts)
+    lin.hand_coded_optimizations()
+    cuda_prg = dev.to_program(lin)
+
+    bufs = bufs_from_lin(lin)
+
+    # ptx compile
+    lin = ast_str_to_lin(ast, opts=ptx.linearizer_opts)
+    lin.hand_coded_optimizations()
+    lin.linearize()
+    ptx_src = ptx.render(to_function_name(lin.name), lin.uops)
+    try:
+      ptx_prg = CompiledASTRunner(lin.name, ptx_src, dev, lin.global_size, lin.local_size, lin.uops.vars(), precompiled=ptx.compile(ptx_src))
+    except RuntimeError:
+      print("PTX FAIL")
+      continue
+    # warmup
+    cuda_prg(bufs, {}, wait=True)
+    ptx_prg(bufs, {}, wait=True)
+
+    tm_cuda, tm_ptx = [], []
+    for i in range(5):
+      tm_cuda.append(cuda_prg(bufs, {}, wait=True))
+      tm_ptx.append(ptx_prg(bufs, {}, wait=True))
+    average_tm_cuda += min(tm_cuda)
+    average_tm_ptx += min(tm_ptx)
+    ratio = min(tm_ptx)/min(tm_cuda)
+    print(f"{average_tm_ptx/average_tm_cuda:5.2f}x -- {num:4d} {colorize_float(ratio)}  {min(tm_ptx)*1e6:7.2f} us", lin.name)
+    if ratio > 1.5:
+      def fix(x): return x.replace('\t', ' ').strip()
+      ll1, ll2 = cuda_prg.lib.decode().split('\n'), ptx_src.split('\n')
+      if single != -1:
+        for ln, (l1, l2) in enumerate(itertools.zip_longest(ll1, ll2, fillvalue='')):
+          print(f"{ln:5d} | {fix(l1):80s} | {fix(l2):80s}")
+      print(len(ll1), len(ll2), "RATIO", ratio, "us", min(tm_ptx)*1e6)
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index e966d54551..11f25791a0 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -77,6 +77,7 @@ class CUDAProgram:
       self.module = cuda.CUmodule()
       status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
       if status != 0:
+        del self.module
         cuda_disassemble(lib, device.arch)
         raise RuntimeError("module load failed")
       check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))