From ac02e7347d98263960f738b34c274c2b7d03d945 Mon Sep 17 00:00:00 2001
From: George Hotz <72895+geohot@users.noreply.github.com>
Date: Fri, 8 Mar 2024 10:17:49 -0800
Subject: [PATCH] ptx timing vs cuda timing (#3659)

---
Review notes (this section is ignored when the patch is applied):
- Line structure was restored from a whitespace-collapsed copy. The 6/8-space
  indentation of the ops_cuda.py context lines is assumed -- TODO confirm
  against the tree before applying.
- speed_compare_cuda_ptx.py: with NUM set, the printed kernel index is now the
  NUM value instead of always 0. The blob ids on the index lines predate this
  review edit.

 extra/optimization/helpers.py           |  2 +-
 test/external/speed_compare_cuda_ptx.py | 71 +++++++++++++++++++++++++
 tinygrad/runtime/ops_cuda.py            |  1 +
 3 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 test/external/speed_compare_cuda_ptx.py

diff --git a/extra/optimization/helpers.py b/extra/optimization/helpers.py
index e794bfde46..241bae84c9 100644
--- a/extra/optimization/helpers.py
+++ b/extra/optimization/helpers.py
@@ -9,7 +9,7 @@ inf, nan = float('inf'), float('nan')
 # kernel unpacker
 from tinygrad.codegen.linearizer import Linearizer
 def ast_str_to_ast(ast_str:str) -> LazyOp: return eval(ast_str)
-def ast_str_to_lin(ast_str:str): return Linearizer(ast_str_to_ast(ast_str))
+def ast_str_to_lin(ast_str:str, opts=None): return Linearizer(ast_str_to_ast(ast_str), opts)
 
 # load worlds, a dataset of about 12k kernels
 import gzip
diff --git a/test/external/speed_compare_cuda_ptx.py b/test/external/speed_compare_cuda_ptx.py
new file mode 100644
index 0000000000..f6c833d1fa
--- /dev/null
+++ b/test/external/speed_compare_cuda_ptx.py
@@ -0,0 +1,71 @@
+import itertools
+from tinygrad import Device
+from tinygrad.device import CompiledASTRunner
+from tinygrad.helpers import to_function_name, getenv, colored
+from extra.optimization.helpers import load_worlds, ast_str_to_lin
+from tinygrad.features.search import bufs_from_lin
+from tinygrad.runtime.ops_cuda import PTXCompiler
+
+# move to helpers?
+def colorize_float(x):
+  # format a PTX/CUDA time ratio: green below 0.75x, red above 1.15x, yellow in between
+  ret = f"{x:7.2f}x"
+  if x < 0.75: return colored(ret, 'green')
+  elif x > 1.15: return colored(ret, 'red')
+  else: return colored(ret, 'yellow')
+
+# normalize one line of a kernel listing (tabs to spaces, trimmed) for the side-by-side dump
+def clean_line(x): return x.replace('\t', ' ').strip()
+
+if __name__ == "__main__":
+  ast_strs = load_worlds(filter_reduce=False, filter_novariable=True)
+  dev = Device["CUDA"]
+  ptx = PTXCompiler(dev.arch)
+
+  # NUM=112 python3 test/external/speed_compare_cuda_ptx.py
+  # runs a single kernel and, when PTX is more than 1.5x slower, dumps both listings side by side
+  single = getenv("NUM", -1)
+  if single != -1: ast_strs = ast_strs[single:single+1]
+
+  # running totals of the best-of-5 times, the first printed column is total_ptx/total_cuda
+  total_tm_cuda, total_tm_ptx = 0, 0
+  # with NUM set, start counting at NUM so the printed index can be passed straight back in as NUM
+  for num,ast in enumerate(ast_strs, start=0 if single == -1 else single):
+    # cuda compile
+    lin = ast_str_to_lin(ast, opts=dev.compiler.linearizer_opts)
+    lin.hand_coded_optimizations()
+    cuda_prg = dev.to_program(lin)
+
+    # both programs are run on the same buffers
+    bufs = bufs_from_lin(lin)
+
+    # ptx compile
+    lin = ast_str_to_lin(ast, opts=ptx.linearizer_opts)
+    lin.hand_coded_optimizations()
+    lin.linearize()
+    ptx_src = ptx.render(to_function_name(lin.name), lin.uops)
+    try:
+      ptx_prg = CompiledASTRunner(lin.name, ptx_src, dev, lin.global_size, lin.local_size, lin.uops.vars(), precompiled=ptx.compile(ptx_src))
+    except RuntimeError:
+      print("PTX FAIL")
+      continue
+    # warmup
+    cuda_prg(bufs, {}, wait=True)
+    ptx_prg(bufs, {}, wait=True)
+
+    # interleave the runs and keep the fastest of 5 for each program
+    tm_cuda, tm_ptx = [], []
+    for _ in range(5):
+      tm_cuda.append(cuda_prg(bufs, {}, wait=True))
+      tm_ptx.append(ptx_prg(bufs, {}, wait=True))
+    best_cuda, best_ptx = min(tm_cuda), min(tm_ptx)
+    total_tm_cuda += best_cuda
+    total_tm_ptx += best_ptx
+    ratio = best_ptx/best_cuda
+    print(f"{total_tm_ptx/total_tm_cuda:5.2f}x -- {num:4d} {colorize_float(ratio)} {best_ptx*1e6:7.2f} us", lin.name)
+    if ratio > 1.5:
+      ll1, ll2 = cuda_prg.lib.decode().split('\n'), ptx_src.split('\n')
+      if single != -1:
+        for ln, (l1, l2) in enumerate(itertools.zip_longest(ll1, ll2, fillvalue='')):
+          print(f"{ln:5d} | {clean_line(l1):80s} | {clean_line(l2):80s}")
+      print(len(ll1), len(ll2), "RATIO", ratio, "us", best_ptx*1e6)
diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py
index e966d54551..11f25791a0 100644
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -77,6 +77,7 @@ class CUDAProgram:
       self.module = cuda.CUmodule()
       status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
       if status != 0:
+        del self.module
         cuda_disassemble(lib, device.arch)
         raise RuntimeError("module load failed")
       check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))