diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 107019254b..4bd00d2ead 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -72,7 +72,7 @@ jobs: - name: Install Dependencies run: pip install -e . - name: Compile EfficientNet to C - run: CLANG=1 GPU=1 python3 examples/compile_efficientnet.py > recognize.c + run: CLANG=1 python3 examples/compile_efficientnet.py > recognize.c - name: Compile C to native run: clang -O2 recognize.c -lm -o recognize - name: Test EfficientNet diff --git a/accel/triton/ops_triton.py b/accel/triton/ops_triton.py index e7fd4f036a..d778efccc2 100644 --- a/accel/triton/ops_triton.py +++ b/accel/triton/ops_triton.py @@ -14,7 +14,7 @@ from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, LazyOp, Op, ExplicitExe from tinygrad.shape import ShapeTracker from tinygrad.helpers import prod, DEBUG from tinygrad.runtime.cuda import CLBuffer -from tinygrad.ast import ASTKernel +from tinygrad.compiler.ast import ASTKernel stream = cuda.Stream() diff --git a/examples/compile_efficientnet.py b/examples/compile_efficientnet.py index 7817298315..a606053a78 100644 --- a/examples/compile_efficientnet.py +++ b/examples/compile_efficientnet.py @@ -4,31 +4,29 @@ from extra.utils import fetch import ast def compile_net(run, special_names): - # c header - cprog = ["#include ", "#include ", "#define max(x,y) ((x>y)?x:y)"] - # functions that run the net + functions = {} bufs = {} bufnum = 0 statements = [] bufs_to_save = {} for fxn,args in run.jit_cache: - cprog.append(fxn.clprg.prg) + functions[fxn.name] = fxn.prg # NOTE: this assumes all with the same name are the same cargs = [] for i,arg in enumerate(args): if i in fxn.bufs_to_delete: continue - key = id(arg.cl) + key = id(arg.raw()) if key not in bufs: if key in special_names: - bufs[key] = (special_names[key], len(arg.cl)//4) + bufs[key] = (special_names[key], len(arg.raw()._buf)) else: - bufs[key] = (f"buf_{bufnum}", len(arg.cl)//4) + bufs[key] = (f"buf_{bufnum}", len(arg.raw()._buf)) bufnum += 1 - if i > 0: bufs_to_save[bufs[key][0]] = arg.cl # if first usage of a buffer is not an output, and it's not a special name + if i > 0: bufs_to_save[bufs[key][0]] = arg.raw() # if first usage of a buffer is not an output, and it's not a special name cargs.append(bufs[key][0]) - statements.append(f"{fxn.clprg.name}({', '.join(cargs)});") + statements.append(f"{fxn.name}({', '.join(cargs)});") - return cprog, statements, bufs, bufs_to_save + return functions, statements, bufs, bufs_to_save if __name__ == "__main__": model = EfficientNet(0) @@ -44,21 +42,17 @@ if __name__ == "__main__": the_output = run(the_input) # TODO: fetch this from the jit in self.input_replace and self.ret (hint: use get_parameters on self.ret) - special_names = {id(the_input.lazydata.realized.cl): "input", id(the_output.lazydata.realized.cl): "outputs"} + special_names = {id(the_input.lazydata.realized.raw()): "input", id(the_output.lazydata.realized.raw()): "outputs"} - cprog, statements, bufs, bufs_to_save = compile_net(run, special_names) + functions, statements, bufs, bufs_to_save = compile_net(run, special_names) - # buffers (empty) - cprog += [f"float {x[0]}[{x[1]}];" for x in bufs.values() if x[0] not in bufs_to_save] + # c header + cprog = ["#include ", "#include ", "#define max(x,y) ((x>y)?x:y)"] - # buffers (weights) + # save the weights for name,cl in bufs_to_save.items(): - weight = ''.join(["\\x%02X"%x for x in bytes(memoryview(cl)[0:len(cl)//4])]) + weight = ''.join(["\\x%02X"%x for x in bytes(cl._buf)]) cprog.append(f"unsigned char {name}_data[] = \"{weight}\";") - cprog.append(f"float *{name} = (float *){name}_data;") - - # the net - cprog += ["void net() {"] + statements + ["}"] # image library! cprog += ["#define STB_IMAGE_IMPLEMENTATION", fetch("https://raw.githubusercontent.com/nothings/stb/master/stb_image.h").decode('utf-8')] @@ -69,6 +63,15 @@ if __name__ == "__main__": lbls = ['"'+lbls[i]+'"' for i in range(1000)] cprog.append(f"char *lbls[] = {{{','.join(lbls)}}};") + # buffers (empty + weights) + cprog += [f"float {name}[{len}];" if name not in bufs_to_save else f"float *{name} = (float *){name}_data;" for name,len in bufs.values()] + + # the functions + cprog += list(functions.values()) + + # the net + cprog += ["void net() {"] + statements + ["}"] + cprog += [""" int main(int argc, char* argv[]) { int DEBUG = getenv("DEBUG") != NULL ? atoi(getenv("DEBUG")) : 0; @@ -101,5 +104,5 @@ int main(int argc, char* argv[]) { else printf("%s\\n", lbls[best_idx]); }"""] - # CLANG=1 GPU=1 python3 examples/compile_efficientnet.py | clang -O2 -lm -x c - -o recognize && time ./recognize docs/stable_diffusion_by_tinygrad.jpg + # CLANG=1 python3 examples/compile_efficientnet.py | clang -O2 -lm -x c - -o recognize && DEBUG=1 time ./recognize docs/stable_diffusion_by_tinygrad.jpg print('\n'.join(cprog)) diff --git a/extra/gemm/amx.py b/extra/gemm/amx.py index 6a2cde1f34..5901a8a633 100755 --- a/extra/gemm/amx.py +++ b/extra/gemm/amx.py @@ -4,7 +4,7 @@ import time import sys np.set_printoptions(linewidth=160) np.set_printoptions(linewidth=1000, threshold=10000000000, suppress=False) -from tinygrad.llops.ops_llvm import LLVM, LLVMBuffer, int_const +from tinygrad.runtime.ops_llvm import LLVM, LLVMBuffer, int_const from llvmlite import ir # type: ignore diff --git a/extra/gemm/metal_matmul.py b/extra/gemm/metal_matmul.py index 7d2839ad89..64eca2c17c 100644 --- a/extra/gemm/metal_matmul.py +++ b/extra/gemm/metal_matmul.py @@ -1,5 +1,5 @@ import numpy as np -from tinygrad.runtime.metal import CLBuffer, CLProgram +from tinygrad.runtime.ops_metal import CLBuffer, CLProgram def benchmark(prog): e = prog() diff --git a/extra/introspection.py b/extra/introspection.py index 9b3ec6645c..47cbe6893b 100644 --- a/extra/introspection.py +++ b/extra/introspection.py @@ -3,7 +3,7 @@ import gc from tinygrad.helpers import prod from tinygrad.tensor import Tensor from tinygrad.lazy import LazyBuffer -from tinygrad.llops.ops_gpu import GPUBuffer +from tinygrad.runtime.ops_gpu import GPUBuffer from tinygrad.ops import GlobalCounters def print_objects(): diff --git a/extra/kernel_search.py b/extra/kernel_search.py index a9e4d1ebfb..ceb27f2dfd 100644 --- a/extra/kernel_search.py +++ b/extra/kernel_search.py @@ -6,7 +6,7 @@ from enum import Enum import numpy as np from tinygrad.ops import LazyOp, ReduceOps, BinaryOps, UnaryOps, MovementOps from tinygrad.shape import ShapeTracker, View, ZeroView -from tinygrad.llops.ops_gpu import GPUBuffer, CLASTKernel +from tinygrad.runtime.ops_gpu import GPUBuffer, CLASTKernel from tinygrad.runtime.opencl import OSX_TIMING_RATIO from tinygrad.helpers import getenv, DEBUG from extra.lib_test_ast import test_ast diff --git a/extra/lib_test_ast.py b/extra/lib_test_ast.py index e41244d773..30eacedd7c 100644 --- a/extra/lib_test_ast.py +++ b/extra/lib_test_ast.py @@ -1,8 +1,8 @@ import sys import numpy as np from typing import Dict, Type -from tinygrad.ast import ASTKernel -from tinygrad.llops.ops_cpu import CPUBuffer +from tinygrad.compiler.ast import ASTKernel +from tinygrad.runtime.ops_cpu import CPUBuffer from tinygrad.ops import DeviceBuffer, map_buffers in_test = False diff --git a/extra/thneed.py b/extra/thneed.py index f514087eb4..c9c428c8d3 100644 --- a/extra/thneed.py +++ b/extra/thneed.py @@ -4,12 +4,11 @@ import struct import json import traceback import numpy as np -from tinygrad.runtime.opencl import CL -from tinygrad.llops.ops_gpu import CLProgram, CLImage, CLBuffer +from tinygrad.runtime.ops_gpu import CLProgram, CLImage, CLBuffer from tinygrad.helpers import prod, getenv from collections import defaultdict import pyopencl as cl -from tinygrad.runtime.opencl import OSX_TIMING_RATIO +from tinygrad.runtime.ops_gpu import CL, OSX_TIMING_RATIO DEBUGCL = getenv("DEBUGCL", 0) FLOAT16 = getenv("FLOAT16", 0) @@ -75,29 +74,29 @@ class Thneed: if o['arg_type'] == "image2d_t": if 'buffer_id' in o and o['height'] == 1 and not bufs_loaded[o['buffer_id']]: # hack: use a image1d since we can back that with a buffer - buf = cl.Image(CL().cl_ctx, mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']]) + buf = cl.Image(CL.cl_ctx, mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']]) else: # buffer isn't supported in image2d, copy buffer into image if 'buffer_id' in o and bufs_loaded[o['buffer_id']]: arr = np.zeros(bufs[o['buffer_id']].size // 2, dtype=np.float16) cl.enqueue_copy(q, arr, bufs[o['buffer_id']]) - buf = cl.Image(CL().cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt, + buf = cl.Image(CL.cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt, shape=(o['width'], o['height']), pitches=(o['row_pitch'],), hostbuf=arr) elif o['needs_load']: - buf = cl.Image(CL().cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt, + buf = cl.Image(CL.cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, tfmt, shape=(o['width'], o['height']), pitches=(o['row_pitch'],), hostbuf=o['data']) else: - buf = cl.Image(CL().cl_ctx, mf.READ_WRITE, tfmt, shape=(o['width'], o['height'])) + buf = cl.Image(CL.cl_ctx, mf.READ_WRITE, tfmt, shape=(o['width'], o['height'])) if o['arg_type'] == "image1d_t": assert not o['needs_load'] assert not bufs_loaded[o['buffer_id']] - buf = cl.Image(CL().cl_ctx, mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']]) + buf = cl.Image(CL.cl_ctx, mf.READ_WRITE, tfmt, shape=(o['width'],), buffer=bufs[o['buffer_id']]) else: if 'data' in o: - buf = cl.Buffer(CL().cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=o['data']) + buf = cl.Buffer(CL.cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=o['data']) else: # zero out buffers - buf = cl.Buffer(CL().cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b'\x00'*o['size']) + buf = cl.Buffer(CL.cl_ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b'\x00'*o['size']) bufs[o['id']] = buf bufs_loaded[o['id']] = 'data' in o @@ -119,7 +118,7 @@ class Thneed: # load binaries for o in jdat['binaries']: nptr = ptr + o['length'] - prgs[o['name']] = CLProgram(o['name'], weights[ptr:nptr], rename=False, binary=True) + prgs[o['name']] = CLProgram(o['name'], weights[ptr:nptr], binary=True) ptr = nptr # populate the cl_cache @@ -194,17 +193,17 @@ class Thneed: }) if needs_load: data = np.empty(a.size//4, dtype=np.float32) - cl.enqueue_copy(CL().cl_queue, data, a, is_blocking=True) + cl.enqueue_copy(CL.cl_queue, data, a, is_blocking=True) weights.append(data.tobytes()) elif isinstance(a, cl.Image): needs_load = a in self.buffers_to_save row_pitch = (a.shape[0]*4*(2 if FLOAT16 else 4) + 63)//64 * 64 size = row_pitch * a.shape[1] # this is *2 if float16 and *4 if float32 - buf = cl.Buffer(CL().cl_ctx, cl.mem_flags.READ_WRITE, size=size * (2 if FLOAT16 else 1)) + buf = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size=size * (2 if FLOAT16 else 1)) # zero out the buffer - cl.enqueue_copy(CL().cl_queue, buf, b'\x00'*buf.size, is_blocking=True) + cl.enqueue_copy(CL.cl_queue, buf, b'\x00'*buf.size, is_blocking=True) CLProgram("from_image_strided", """ __kernel void from_image_strided(read_only image2d_t in, __global float4 *out, int row_pitch) { @@ -224,7 +223,7 @@ class Thneed: if needs_load: data = np.empty(size//(2 if FLOAT16 else 4), dtype=np.float32) - cl.enqueue_copy(CL().cl_queue, data, buf, is_blocking=True) + cl.enqueue_copy(CL.cl_queue, data, buf, is_blocking=True) if FLOAT16: data = data.astype(np.float16) weights.append(data.tobytes()) else: @@ -271,9 +270,9 @@ class Thneed: events = [] st = time.monotonic() for prg, args in self.cl_cache: - events.append(prg.clprg(CL().cl_queue, *args)) + events.append(prg.clprg(CL.cl_queue, *args)) mt = time.monotonic() - CL().cl_queue.finish() + CL.cl_queue.finish() et = time.monotonic() - st print(f"submit in {(mt-st)*1000.0:.2f} ms, total runtime is {et*1000.0:.2f} ms") @@ -284,7 +283,7 @@ class Thneed: total_runtime = 0 for i, ((prg, args), e) in enumerate(zip(self.cl_cache, events)): runtime = (e.profile.end - e.profile.start) * OSX_TIMING_RATIO - print(f"{i:3d} time {total_runtime/1e6:5.2f} ms running {prg.name:20s} with {str(args[0]):15s} {str(args[1]):15s} count {len(args)-2:2d} runtime {runtime/1e3:7.2f} us {(prg.op_estimate)/runtime:9.2f} GFLOPS {prg.options} -> {args[2].shape if hasattr(args[2], 'shape') else args[2].size}") + print(f"{i:3d} time {total_runtime/1e6:5.2f} ms running {prg.name:20s} with {str(args[0]):15s} {str(args[1]):15s} count {len(args)-2:2d} runtime {runtime/1e3:7.2f} us {(getattr(prg, 'op_estimate', float('nan')))/runtime:9.2f} GFLOPS -> {args[2].shape if hasattr(args[2], 'shape') else args[2].size}") if (DEBUGCL >= 2 and getenv("PRINT_KERNEL", -1) == i) or DEBUGCL >= 3: print(prg.prg) total_runtime += runtime @@ -321,11 +320,11 @@ class Thneed: # 3 runs just in case for i in range(3): try: - e = prg.clprg(CL().cl_queue, *args) + e = prg.clprg(CL.cl_queue, *args) except (cl.LogicError, cl.RuntimeError): # INVALID_WORK_GROUP_SIZE continue - CL().cl_queue.finish() + CL.cl_queue.finish() runtime = e.profile.end - e.profile.start #print(runtime, args[0], args[1]) runtimes.append((runtime, local_args)) diff --git a/openpilot/compile.py b/openpilot/compile.py index 5161dc9b1c..af9b074bce 100644 --- a/openpilot/compile.py +++ b/openpilot/compile.py @@ -19,7 +19,8 @@ import numpy as np import tinygrad.graph as graph from tinygrad.ops import GlobalCounters -from tinygrad.runtime.opencl import CL +import pyopencl as cl +from tinygrad.runtime.ops_gpu import CL from extra.utils import fetch from extra.onnx import get_run_onnx from tinygrad.tensor import Tensor @@ -64,13 +65,16 @@ def compile(dat, output_fn): cl_cache = [] for prg,args in model_exec.jit_cache: real_clprg = prg.clprg + setattr(real_clprg, "op_estimate", prg.op_estimate) used_ops += real_clprg.op_estimate # replace clprg with a fake program to log to cl_cache - prg.clprg = lambda *args: cl_cache.append((real_clprg, args)) + prg.clprg = lambda *args, wait=False: cl_cache.append((real_clprg, list(args[0:2])+[x._cl for x in args[2:]])) prg(*args) + # put it back + prg.clprg = real_clprg from extra.thneed import Thneed - t = Thneed(cl_cache, {k:inputs[k].lazydata.realized.cl for k in inputs.keys()}) + t = Thneed(cl_cache, {k:inputs[k].lazydata.realized.raw()._cl for k in inputs.keys()}) if getenv("OPTWG", 0): t.optimize_local_workgroup() @@ -84,7 +88,7 @@ def compile(dat, output_fn): # confirm thneed found the right output thneed_out = np.empty((t.outputs[0].size//4,), dtype=np.float32).reshape(tinygrad_out.shape) - CL.enqueue_copy(thneed_out, t.outputs[0], is_blocking=True) + cl.enqueue_copy(CL.cl_queue, thneed_out, t.outputs[0], is_blocking=True) np.testing.assert_allclose(thneed_out, tinygrad_out.numpy()) # testing is float32 only (fix this) @@ -102,11 +106,11 @@ def compile(dat, output_fn): # try old thneed with a different input for k,v in t.inputs.items(): - CL.enqueue_copy(v, new_np_inputs[k], is_blocking=True) + cl.enqueue_copy(CL.cl_queue, v, new_np_inputs[k], is_blocking=True) t.run() old_thneed_out = np.empty((t.outputs[0].size//4,), dtype=np.float32).reshape(tinygrad_out.shape) - CL.enqueue_copy(old_thneed_out, t.outputs[0], is_blocking=True) + cl.enqueue_copy(CL.cl_queue, old_thneed_out, t.outputs[0], is_blocking=True) # compare thneed (rerun) with torch np.testing.assert_allclose(new_torch_out, old_thneed_out, atol=1e-4, rtol=1e-2) @@ -119,11 +123,11 @@ def compile(dat, output_fn): # inputs for k,v in nt.inputs.items(): - CL.enqueue_copy(v, new_np_inputs[k], is_blocking=True) + cl.enqueue_copy(CL.cl_queue, v, new_np_inputs[k], is_blocking=True) nt.run() new_thneed_out = np.empty((nt.outputs[0].size//4,), dtype=np.float32).reshape(tinygrad_out.shape) - CL.enqueue_copy(new_thneed_out, nt.outputs[0], is_blocking=True) + cl.enqueue_copy(CL.cl_queue, new_thneed_out, nt.outputs[0], is_blocking=True) # compare torch to thneed np.testing.assert_allclose(new_torch_out, new_thneed_out, atol=1e-4, rtol=1e-2) diff --git a/setup.py b/setup.py index ce72d88ba0..12f01950b6 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ setup(name='tinygrad', license='MIT', long_description=long_description, long_description_content_type='text/markdown', - packages = ['tinygrad', 'tinygrad.llops', 'tinygrad.nn', 'tinygrad.runtime', 'tinygrad.shape'], + packages = ['tinygrad', 'tinygrad.codegen', 'tinygrad.nn', 'tinygrad.runtime', 'tinygrad.shape'], classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License" diff --git a/sz.sh b/sz.sh index 0911cb8486..c1e4ba4831 100755 --- a/sz.sh +++ b/sz.sh @@ -1,2 +1,3 @@ #!/bin/bash -scc tinygrad --by-file +# switched to cloc due to https://github.com/boyter/scc/issues/379 +cloc --by-file tinygrad/* diff --git a/test/external_test_gpu_ast.py b/test/external_test_gpu_ast.py index a14fabc975..54dab89213 100644 --- a/test/external_test_gpu_ast.py +++ b/test/external_test_gpu_ast.py @@ -3,7 +3,7 @@ import unittest import numpy as np from tinygrad.ops import LazyOp, ReduceOps, BinaryOps, UnaryOps, MovementOps from tinygrad.shape import ShapeTracker, View, ZeroView -from tinygrad.llops.ops_gpu import GPUBuffer, CLASTKernel +from tinygrad.runtime.ops_gpu import GPUBuffer, CLASTKernel from tinygrad.helpers import getenv from extra.lib_test_ast import test_ast diff --git a/test/external_test_image.py b/test/external_test_image.py index 1f4156713d..0a9e2b0c73 100644 --- a/test/external_test_image.py +++ b/test/external_test_image.py @@ -7,7 +7,7 @@ if 'IMAGE' not in os.environ: os.environ['GPU'] = '1' os.environ['OPT'] = '2' from tinygrad.tensor import Tensor -from tinygrad.llops.ops_gpu import CLImage +from tinygrad.runtime.ops_gpu import CLImage from tinygrad.nn import Conv2d Tensor.no_grad = True diff --git a/test/test_conv_shapetracker.py b/test/test_conv_shapetracker.py index cedb6fd631..3083f3d374 100644 --- a/test/test_conv_shapetracker.py +++ b/test/test_conv_shapetracker.py @@ -14,8 +14,8 @@ class TestConvShapetracker(unittest.TestCase): conv(inp).realize() test = GlobalCounters.cache GlobalCounters.cache = None - assert len(test) == 1, f"conv should only have one kernel {[x[0].clprg.name for x in test]}" - print(test[0][0].clprg.prg) + assert len(test) == 1, f"conv should only have one kernel {[x[0].name for x in test]}" + print(test[0][0].prg) for arg in test[0][1]: print(arg.st) assert len(arg.st.views) == 1 diff --git a/test/test_graph.py b/test/test_graph.py index ca1b1a2cfb..650f219166 100644 --- a/test/test_graph.py +++ b/test/test_graph.py @@ -3,7 +3,7 @@ import unittest import networkx as nx # type: ignore import numpy as np from tinygrad.graph import G, log_op, prune_graph -from tinygrad.llops.ops_cpu import CPUBuffer +from tinygrad.runtime.ops_cpu import CPUBuffer from tinygrad.ops import BinaryOps, LazyOp, MovementOps, ReduceOps class TestGraph(unittest.TestCase): diff --git a/test/test_speed_v_torch.py b/test/test_speed_v_torch.py index 62d6f9fc6d..cc65bcc4da 100644 --- a/test/test_speed_v_torch.py +++ b/test/test_speed_v_torch.py @@ -12,15 +12,6 @@ from tinygrad.tensor import Tensor from tinygrad.nn import Conv2d from tinygrad.helpers import colored, getenv, DEBUG from tinygrad.jit import TinyJit -METAL = getenv("METAL") -try: - from tinygrad.runtime.opencl import CL - if METAL: - from tinygrad.runtime.metal import sync - else: - def sync(): CL().cl_queue.finish() -except ImportError: - def sync(): pass IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")] @@ -43,22 +34,17 @@ def helper_test_speed(f1, *args): ret = None for _ in range(CNT): del ret - GlobalCounters.global_ops = 0 - GlobalCounters.global_mem = 0 args = [(x+1).realize() if isinstance(x,Tensor) else (None if x is None else (x+1)) for x in args] # cache defeats - # sync all before! - sync() - torch.zeros(1, device=torch_device).cpu() + # force syncing + [x.cpu().numpy() for x in args if x is not None] + GlobalCounters.global_ops = 0 + GlobalCounters.global_mem = 0 if DEBUG >= 4: print("benchmark start") st = time.monotonic() ret = f1(*args) - if isinstance(ret, Tensor) and ret.device in ["GPU"]: - sync() - if not isinstance(ret, Tensor) and torch_device != "cpu": - # TODO: better way to sync? - torch.zeros(1, device=torch_device).cpu() + ret.cpu().numpy() # not ideal, it's copying. why is this so slow in tinygrad? et = (time.monotonic() - st) * 1000 ets.append(et) if DEBUG >= 4: print("benchmark stop") diff --git a/tinygrad/ast.py b/tinygrad/codegen/ast.py similarity index 84% rename from tinygrad/ast.py rename to tinygrad/codegen/ast.py index 286885d21c..73255c5f62 100644 --- a/tinygrad/ast.py +++ b/tinygrad/codegen/ast.py @@ -1,10 +1,26 @@ from enum import Enum, auto import itertools -from typing import List, Tuple, Optional -from tinygrad.helpers import prod, dedup, all_same -from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops +from typing import List, Tuple, Optional, Set +from tinygrad.helpers import prod, dedup, all_same, DEBUG +from tinygrad.ops import LazyOp, MovementOps, get_lazyop_info, get_buffers, ReduceOps, get_lazyops, GlobalCounters from tinygrad.shape import ShapeTracker, View, strides_for_shape +class ASTRunner: + def __init__(self, name, prg, bufs_to_delete:Set[int]=set(), global_work_size:Optional[List[int]]=None, local_work_size:Optional[List[int]]=None, op_estimate=0, mem_estimate=0): + if DEBUG >= 4: print(prg) + self.name, self.prg, self.global_work_size, self.local_work_size, self.bufs_to_delete, self.op_estimate, self.mem_estimate = name, prg, global_work_size, local_work_size, bufs_to_delete, op_estimate, mem_estimate + def build(self, runtime): + self.clprg = runtime(self.name, self.prg) + return self + def __call__(self, *bufs): + et = self.clprg(self.global_work_size, self.local_work_size, *[x.raw() for i,x in enumerate(bufs) if i not in self.bufs_to_delete], wait=DEBUG>=2) + if et is not None: GlobalCounters.time_sum_s += et + if DEBUG >= 1: + print(f"**** {GlobalCounters.kernel_count:4d} {self.name:20s} args {len(bufs)-len(self.bufs_to_delete):5d} kernels {str(self.global_work_size):18s} {str(self.local_work_size):12s} OPs {self.op_estimate/1e6:7.1f}M/{GlobalCounters.global_ops/1e9:7.2f}G mem {GlobalCounters.mem_used/1e9:5.2f} GB " + + (str() if DEBUG <= 1 else f"tm {et*1e6:9.2f}us/{GlobalCounters.time_sum_s*1e3:9.2f}ms ({self.op_estimate/(et*1e9):8.2f} GFLOPS)")) + GlobalCounters.log_kernel(self.op_estimate, self.mem_estimate) + return et + def get_first_reduce(shapes): for i in range(len(shapes[0])): if not all_same([x[i] for x in shapes]): diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/codegen/gpu.py similarity index 64% rename from tinygrad/llops/ops_gpu.py rename to tinygrad/codegen/gpu.py index c62257a1b8..e2119827de 100644 --- a/tinygrad/llops/ops_gpu.py +++ b/tinygrad/codegen/gpu.py @@ -1,42 +1,35 @@ -from __future__ import annotations -import numpy as np import math -from typing import List, Tuple, Optional, Dict, Union, Set, Final, Callable -from tinygrad.helpers import prod, DEBUG, IMAGE -from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, MovementOps, LazyOp, Op, ExplicitExecAST, GlobalCounters -from tinygrad.ast import ASTKernel, Token, Types -from tinygrad.shape import ShapeTracker +from collections import defaultdict +from typing import Optional, List, Tuple, Dict, Set, Final, NamedTuple +from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, LazyOp, Op +from tinygrad.codegen.ast import ASTKernel, ASTRunner, Token, Types from tinygrad.shape.symbolic import Node, ModNode, DivNode, render_python +from tinygrad.shape import ShapeTracker +from tinygrad.helpers import getenv, DEBUG, prod + # div is different in cl than python render_cl = render_python.copy() render_cl[DivNode] = lambda self,ops,ctx: f"({self.a.render(ops)}/{self.b})" -from tinygrad.helpers import getenv - -# TODO: select runtimes in a smarter way -CUDA,METAL,CLANG = getenv("CUDA", 0), getenv("METAL", 0), getenv("CLANG", 0) -if not CUDA and not METAL and not CLANG: - from tinygrad.runtime.opencl import CLBuffer, CLImage, CLProgram # NOTE: using CL will not work for the CUDA runtime # noqa: F401 -else: - class CLImage: # type: ignore - def __init__(self, shape): raise NotImplementedError("current runtime doesn't support images") - if CUDA: from tinygrad.runtime.cuda import CLBuffer, CLProgram # type: ignore - elif METAL: from tinygrad.runtime.metal import CLBuffer, CLProgram # type: ignore - elif CLANG: from tinygrad.runtime.clang import CLBuffer, CLProgram # type: ignore VALIDHACKS = getenv("VALIDHACKS", 0) # TODO: remove the need for this NATIVE_EXPLOG = getenv("NATIVE_EXPLOG", 0) # this is needed as a switch for the tests to pass -KOPT = getenv("KOPT", -1) -PRINT_AST = getenv("PRINT_AST", "0") -TEST_AST = getenv("TEST_AST", 0) +class GPULanguage(NamedTuple): + kernel_prefix : str = "" + buffer_prefix : str = "" + smem_prefix : str = "" + barrier : str = "" + gid : List[str] = [] + lid : List[str] = [] + extra_args : List[str] = [] + float4 : Optional[str] = None -class GPURunner: - def __init__(self, clprg:CLProgram, bufs_to_delete:Set[int], global_work_size:List[int], local_work_size:Optional[List[int]]): - self.clprg, self.global_work_size, self.local_work_size, self.bufs_to_delete = clprg, global_work_size, local_work_size, bufs_to_delete - def __call__(self, *bufs): - return self.clprg(self.global_work_size, self.local_work_size, *[x.cl for i,x in enumerate(bufs) if i not in self.bufs_to_delete]) +class GPUCodegen(ASTKernel): + lang : GPULanguage = GPULanguage() + + # for renaming + kernel_cnt : Final[Dict[str, int]] = defaultdict(lambda: -1) -class CLASTKernel(ASTKernel): code_for_op : Final[Dict[Op, str]] = { UnaryOps.NOOP: "(A)", UnaryOps.NEG: "(-(A))", UnaryOps.NOT: "((float)1.0-A)", UnaryOps.EXP: "native_exp(A)" if NATIVE_EXPLOG else "exp(A)", @@ -58,7 +51,7 @@ class CLASTKernel(ASTKernel): assert len(self.bufs[buf_index].st.views) == 1, "store has more than one view" # all stores can merge, since they have one view and are valid - should_upcast = not CLANG and self.buftokens[buf_index].can_float4() + should_upcast = self.lang.float4 and self.buftokens[buf_index].can_float4() to_store = {o:v for o,v in zip(self.buftokens[buf_index].offsets(), value)} did_store = set() @@ -68,14 +61,14 @@ class CLASTKernel(ASTKernel): assert valid.min == 1, "store must always be valid" if should_upcast: for j in range(4): did_store.add(o+j) - v = Token(f"{CLProgram.float4}({','.join([to_store[o+j].tok for j in range(4)])})", Types.FLOAT4) + v = Token(f"{self.lang.float4}({','.join([to_store[o+j].tok for j in range(4)])})", Types.FLOAT4) idxy, valid = self.sts[buf_index].expr_idxs(o) assert valid.min == 1, "store must always be valid" - if isinstance(self.bufs[buf_index]._buf, CLImage): + if hasattr(self.bufs[buf_index]._buf, "IMAGE"): assert v.typ == Types.FLOAT4, "Image requires upcasting to FLOAT4" self.kernel.append(f"write_imagef(data{buf_index}, {self.image_idx(buf_index, idxy)}, {v.tok}); /* {self.bufs[buf_index]._base_shape} */\n") elif v.typ == Types.FLOAT4: - self.kernel.append(f"(({CLProgram.buffer_prefix}float4*)data{buf_index})[{(idxy//4).render(render_cl)}] = {v.tok};\n") + self.kernel.append(f"(({self.lang.buffer_prefix}float4*)data{buf_index})[{(idxy//4).render(render_cl)}] = {v.tok};\n") else: self.kernel.append(f"data{buf_index}[{(idxy//(4 if v.typ == Types.FLOAT4 else 1)).render(render_cl)}] = {v.tok};\n") @@ -87,7 +80,7 @@ class CLASTKernel(ASTKernel): val = self.bufs[buf_index]._backing[0] assert not math.isnan(val) const = Token(f"({val}f)", Types.FLOAT) - should_upcast = not CLANG and const is None and self.buftokens[buf_index].can_float4() + should_upcast = self.lang.float4 and const is None and self.buftokens[buf_index].can_float4() tokens = [] for o in self.buftokens[buf_index].offsets(): key = f"val{buf_index}_{o}" if o >= 0 else f"val{buf_index}_m{-o}" @@ -102,14 +95,14 @@ class CLASTKernel(ASTKernel): #print((idxy+j).render(), idxy_test.render(), valid.render(), valid_test.render(), can_merge) if const is not None: ldr = const - elif isinstance(self.bufs[buf_index]._buf, CLImage): + elif hasattr(self.bufs[buf_index]._buf, "IMAGE"): assert should_upcast and can_merge, f"Image requires upcasting to FLOAT4 {self.buftokens[buf_index]}" ldr = Token(f"read_imagef({self.buftokens[buf_index].tok}, smp, {self.image_idx(buf_index, idxy, VALIDHACKS)}) /* {self.bufs[buf_index]._base_shape} */", Types.FLOAT4) elif should_upcast and can_merge: - ldr = Token(f"(({CLProgram.buffer_prefix}float4*){self.buftokens[buf_index].tok})[{(idxy//4).render(render_cl)}]", Types.FLOAT4) + ldr = Token(f"(({self.lang.buffer_prefix}float4*){self.buftokens[buf_index].tok})[{(idxy//4).render(render_cl)}]", Types.FLOAT4) else: ldr = Token(f"{self.buftokens[buf_index].tok}[{idxy.render(render_cl)}]", Types.FLOAT) - ldr = ldr if valid.min == 1 or (VALIDHACKS and isinstance(self.bufs[buf_index]._buf, CLImage)) else (Token(f"({valid.render(render_cl)} ? {ldr.tok} : 0.0f)", ldr.typ) if valid.max == 1 else Token("0.0f", ldr.typ)) + ldr = ldr if valid.min == 1 or (VALIDHACKS and hasattr(self.bufs[buf_index]._buf, "IMAGE")) else (Token(f"({valid.render(render_cl)} ? {ldr.tok} : 0.0f)", ldr.typ) if valid.max == 1 else Token("0.0f", ldr.typ)) if const is not None: self.loaded_keys[(buf_index,o)] = ldr else: @@ -122,11 +115,11 @@ class CLASTKernel(ASTKernel): tokens.append(self.loaded_keys[(buf_index,o)]) return tokens - def ast_parse(self, x:Union[GPUBuffer, LazyOp], acc:List[Token], do_reduce=False) -> List[Token]: + def ast_parse(self, x, acc:List[Token], do_reduce=False) -> List[Token]: if not isinstance(x, LazyOp): return self.load(self.bufs.index(x)) if isinstance(x.op, ReduceOps) and not do_reduce: return acc values : List[List[Token]] = ([acc] if isinstance(x.op, ReduceOps) else []) + [self.ast_parse(v, acc, do_reduce) for v in x.src] - code = CLASTKernel.code_for_op[x.op] # TODO: replace this with a function + code = GPUCodegen.code_for_op[x.op] # TODO: replace this with a function if len(values) == 2: assert len(values[0]) == len(values[1]) and values[0][0].typ == values[1][0].typ, f"values mismatch {values}" return [Token(code.replace("A", a.tok).replace("B", b.tok), a.typ) for a,b in zip(values[0], values[1])] @@ -136,10 +129,10 @@ class CLASTKernel(ASTKernel): def hand_coded_optimizations(self): # if there's images in the earlybufs, we have to make an axis the 4 loading one # shove the axis to the end and remove - if any(isinstance(buf._buf, CLImage) for buf in self.earlybufs): + if any(hasattr(buf._buf, "IMAGE") for buf in self.earlybufs): eb_valids = [True] * self.shape_len for i in range(len(self.bufs)): - if isinstance(self.bufs[i]._buf, CLImage) and self.bufs[i] in self.earlybufs: + if hasattr(self.bufs[i]._buf, "IMAGE") and self.bufs[i] in self.earlybufs: valids = [self.sts[i].shape[j]%4 == 0 and self.sts[i].views[-1].strides[j] == 1 for j in range(self.shape_len)] eb_valids = [x and y for x,y in zip(eb_valids, valids)] assert any(eb_valids), f"invalid op with images {eb_valids}" @@ -158,18 +151,18 @@ class CLASTKernel(ASTKernel): self.simplify_ones() # are we grouping? - if not CLANG and not self.buftokens[0].can_float4() and self.first_reduce <= 2 and self.first_reduce + 1 <= self.shape_len and prod(self.sts[0].shape[:self.first_reduce]) <= 2048: + if self.lang.float4 and not self.buftokens[0].can_float4() and self.first_reduce <= 2 and self.first_reduce + 1 <= self.shape_len and prod(self.sts[0].shape[:self.first_reduce]) <= 2048: # TODO: use 1024 if it's allowed in a smarter way - for sz in ((([1024] if METAL else []) + [256, 16]) if prod(self.sts[0].shape[:self.first_reduce]) <= 32 else [16]): + for sz in (([256, 16]) if prod(self.sts[0].shape[:self.first_reduce]) <= 32 else [16]): if all([st.shape[self.first_reduce] % sz == 0 or st.shape[self.first_reduce] == 1 for st in self.sts]): self.group_for_reduce.append(sz) break # if there's images in the latebufs, we have to make an axis the 4 storing one. this affects the kernel shape - if any(isinstance(buf._buf, CLImage) for buf in self.bufs if buf not in self.earlybufs) and not self.buftokens[0].can_float4(): + if any(hasattr(buf._buf, "IMAGE") for buf in self.bufs if buf not in self.earlybufs) and not self.buftokens[0].can_float4(): lb_valids = [True] * self.shape_len for i in range(len(self.bufs)): - valids = [self.sts[i].shape[j]%4 == 0 and (self.sts[i].views[-1].strides[j] == 1 or not isinstance(self.bufs[i]._buf, CLImage) or self.bufs[i] in self.earlybufs) for j in range(self.shape_len)] + valids = [self.sts[i].shape[j]%4 == 0 and (self.sts[i].views[-1].strides[j] == 1 or not hasattr(self.bufs[i]._buf, "IMAGE") or self.bufs[i] in self.earlybufs) for j in range(self.shape_len)] lb_valids = [x and y for x,y in zip(lb_valids, valids)] assert any(lb_valids), f"invalid op with images {lb_valids}" lb_valid = lb_valids.index(True) @@ -192,7 +185,7 @@ class CLASTKernel(ASTKernel): self.simplify_ones() # split to 4 float4s - if self.buftokens[0].can_float4() and any(isinstance(buf._buf, CLImage) for buf in self.earlybufs) and prod(self.sts[0].shape[:self.first_reduce]) >= 2048 and not self.group_for_reduce: + if self.buftokens[0].can_float4() and any(hasattr(buf._buf, "IMAGE") for buf in self.earlybufs) and prod(self.sts[0].shape[:self.first_reduce]) >= 2048 and not self.group_for_reduce: xb_choices = [] for i in range(self.first_reduce): if all(st.shape[i]%4 == 0 for st in self.sts): @@ -214,7 +207,7 @@ class CLASTKernel(ASTKernel): self.simplify_ones() # use more opencl indexing - if self.first_reduce == 2 and isinstance(self.bufs[0]._buf, CLImage): + if self.first_reduce == 2 and hasattr(self.bufs[0]._buf, "IMAGE"): base_shape = self.bufs[0]._base_shape if all([(base_shape[0]*base_shape[1])%st.shape[0] == 0 and st.shape[0]//base_shape[0] != 0 for st in self.sts]): if DEBUG >= 3: print("split opencl", base_shape, self.sts[0].shape) @@ -237,11 +230,11 @@ class CLASTKernel(ASTKernel): # STOP WASTING TIME WITH DOING THE RESHAPES AND PERMUTES BY HAND. KERNEL SEARCH IS THE ONLY WAY IT WILL EVER BE GOOD # group_for_reduce will have to be better first - def codegen(self) -> Callable: + def codegen(self) -> ASTRunner: self.process() self.upcast_in_mid_reduce = False if DEBUG >= 3: self.printbufs("old:", DEBUG>=4) - if KOPT == -1 or IMAGE == 2: self.hand_coded_optimizations() + self.hand_coded_optimizations() # add a local buffer for multistage reduce if len(self.group_for_reduce): @@ -257,18 +250,17 @@ class CLASTKernel(ASTKernel): self.loaded_keys : Dict[Tuple[int,int], Token] = {} self.prekernel : Set[str] = set() - self.kernel : List[str] = ["const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"] if any(isinstance(buf._buf, CLImage) for buf in self.bufs) else [] + self.kernel : List[str] = ["const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"] if any(hasattr(buf._buf, "IMAGE") for buf in self.bufs) else [] # output_shape[-1] is get_global_id(0) - if CLANG: + if len(self.lang.gid) == 0: self.kernel += [f"for (int idx{i} = 0; idx{i} < {self.output_shape[i]}; idx{i}++) {{\n" for i in range(0, len(self.output_shape))] else: - MAX_OUTPUT_SHAPE = 3 - self.kernel += [f"int idx{len(self.output_shape)-1-i} = {CLProgram.gid[i]}; /* {self.output_shape[-1-i]} */\n" for i in range(min(MAX_OUTPUT_SHAPE, len(self.output_shape))) if self.output_shape[-1-i] != 1] - if len(self.output_shape) > MAX_OUTPUT_SHAPE: + self.kernel += [f"int idx{len(self.output_shape)-1-i} = {self.lang.gid[i]}; /* {self.output_shape[-1-i]} */\n" for i in range(min(len(self.lang.gid), len(self.output_shape))) if self.output_shape[-1-i] != 1] + if len(self.output_shape) > len(self.lang.gid): # sometimes, there's more dimensions. compact all the dimensions into the first one # TODO: these compactions should be searchable - final_dimension = len(self.output_shape)-MAX_OUTPUT_SHAPE + final_dimension = len(self.output_shape)-len(self.lang.gid) for i in range(final_dimension-1, -1, -1): self.kernel += [f"int idx{i} = idx{final_dimension} % {self.output_shape[i]};", f"idx{final_dimension} = idx{final_dimension} / {self.output_shape[i]};\n"] self.output_shape = [prod(self.output_shape[0:final_dimension+1])] + list(self.output_shape[final_dimension+1:]) @@ -282,7 +274,7 @@ class CLASTKernel(ASTKernel): acc_offsets = self.buftokens[self.bufs.index(self.earlybufs[0])].acc_offsets() assert self.reduceopop is not None - self.kernel += [f"{accumulator.decltype()} {accumulator.tok} = {CLASTKernel.start_for_op[self.reduceopop]};\n" for accumulator in accumulators] + self.kernel += [f"{accumulator.decltype()} {accumulator.tok} = {GPUCodegen.start_for_op[self.reduceopop]};\n" for accumulator in accumulators] self.kernel += [f"for (int idx{i} = 0; idx{i} < {full_shape[i]}; idx{i}++) {{\n" for i in range(self.first_reduce+len(self.group_for_reduce), self.shape_len)] self.kernel += [f"{x.tok};\n" for x in self.ast_parse(self.reduceop, [accumulators[off] for off in acc_offsets], do_reduce=True)] + ["}\n"] * (self.shape_len - (self.first_reduce + len(self.group_for_reduce))) @@ -292,9 +284,9 @@ class CLASTKernel(ASTKernel): assert lvalid.min == 1, "local buffer must always be valid" self.kernel.append(f"int mid_idx = {lidx.render(render_cl)};\n") for i,acc in enumerate(accumulators): - self.kernel.append(CLProgram.smem_prefix + f"{acc.decltype()} {self.buftokens[-1].tok}{i}[{prod(self.group_for_reduce)}];") + self.kernel.append(self.lang.smem_prefix + f"{acc.decltype()} {self.buftokens[-1].tok}{i}[{prod(self.group_for_reduce)}];") self.kernel.append(f"{self.buftokens[-1].tok}{i}[mid_idx] = {acc.tok};\n") - self.kernel.append(CLProgram.barrier+"\n") + self.kernel.append(self.lang.barrier+"\n") if self.upcast_in_mid_reduce: assert len(self.group_for_reduce) == 2 @@ -311,86 +303,31 @@ class CLASTKernel(ASTKernel): if self.upcast_in_mid_reduce: self.kernel.append(f'float4 ld = vload4(0, &temp{i}[mid*4]);\n') for j in range(4): - self.kernel.append(CLASTKernel.code_for_op[self.reduceopop].replace('A', new_accumulators[i*4+j].tok).replace('B', f'ld.{"xyzw"[j]}')+";\n") + self.kernel.append(GPUCodegen.code_for_op[self.reduceopop].replace('A', new_accumulators[i*4+j].tok).replace('B', f'ld.{"xyzw"[j]}')+";\n") else: - self.kernel.append(CLASTKernel.code_for_op[self.reduceopop].replace('A', new_accumulators[i].tok).replace('B', f'temp{i}[mid]')+";\n") + self.kernel.append(GPUCodegen.code_for_op[self.reduceopop].replace('A', new_accumulators[i].tok).replace('B', f'temp{i}[mid]')+";\n") self.kernel.append("}\n") accumulators = new_accumulators # late ast self.store(0, self.ast_parse(self.ast, accumulators)) if self.group_for_reduce: self.kernel.append("}") - if CLANG: self.kernel += ["}"] * len(self.output_shape) + if len(self.lang.gid) == 0: self.kernel += ["}"] * len(self.output_shape) self.kernel.append("\n}") # kernel function definition function_name = ("re_S" if self.reduceop else "ew_S") + '_'.join([str(x) for x in self.bufs[0].shape if x != 1]) - buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if isinstance(x._buf, CLImage) else (CLProgram.buffer_prefix+self.buftokens[i].decltype() + ("restrict" if CLANG else "")) for i,x in enumerate(self.bufs)] - self.kernel = list(self.prekernel) + [f"{CLProgram.kernel_prefix} void {function_name}(",] + \ - [', '.join([f'{t} data{i}' for i,t in enumerate(buftypes) if i not in self.bufs_to_delete] + CLProgram.extra_args)] + \ + GPUCodegen.kernel_cnt[function_name] += 1 + if GPUCodegen.kernel_cnt[function_name]: + function_name = f"{function_name}{'_N'+str(GPUCodegen.kernel_cnt[function_name])}" + + buftypes = [f"{'read_only' if i > 0 else 'write_only'} image2d_t" if hasattr(x._buf, "IMAGE") else self.lang.buffer_prefix+self.buftokens[i].decltype() for i,x in enumerate(self.bufs)] + self.kernel = list(self.prekernel) + [f"{self.lang.kernel_prefix} void {function_name}(",] + \ + [', '.join([f'{t} data{i}' for i,t in enumerate(buftypes) if i not in self.bufs_to_delete] + self.lang.extra_args)] + \ [") {\n"] + self.kernel - # compile kernel - self.fxn = CLProgram(function_name, ' '.join(self.kernel), op_estimate=self.info.flops, mem_estimate=sum(prod(x._base_shape) for x in self.bufs)) if DEBUG >= 3 and len(self.bufs_to_delete): print(f"deleting buffers {self.bufs_to_delete}") - return GPURunner(self.fxn, self.bufs_to_delete, self.output_shape[::-1] if len(self.output_shape) > 0 else [1], (self.group_for_reduce[::-1] + [1]*(len(self.output_shape)-len(self.group_for_reduce))) if self.group_for_reduce else None) - - def print(self): - super().print() - for i in range(len(self.bufs)): - print(self.buftokens[i], self.bufs[i] in self.earlybufs, self.sts[i]) - print(self.fxn.prg) - -class GPUBuffer(ExplicitExecAST): - def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[GPUBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False): - super().__init__(shape, hostbuf) - self._buf : Optional[Union[CLImage, CLBuffer]] = hostbuf._buf if hostbuf is not None else None - self._backing : Optional[np.ndarray] = hostbuf._backing if hostbuf is not None else backing - # early copy in for large buffers - if (self._backing is not None and self._backing.shape != (1,)) or force_create: - self.cl - - # TODO: refactor this to return self._buf and not import pyopencl - @property - def cl(self) -> Union[CLBuffer, CLImage]: - if self._buf is None: - self._buf = CLImage(self._base_shape) if (len(self._base_shape) == 3 and self._base_shape[2] == 4 and IMAGE >= 2) else CLBuffer(4*prod(self._base_shape)) - assert self._buf is not None - if self._backing is not None: - assert GlobalCounters.cache is None, f"can't copy in {self._backing.shape} while caching" - self._buf.copyin(self._backing) - self._backing = None - return self._buf._cl - - # TODO: we don't always need a hostbuf - def __repr__(self): return f"GPUBuffer(shape={self.st}, hostbuf=GPUBuffer(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.float32)))" if self._backing else ", force_create=True))") - - @staticmethod - def fromCPU(x): return GPUBuffer(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel()) - - def toCPU(self) -> np.ndarray: - cl_buf = self.contiguous() - cl_buf.cl # force buffer creation, happens if it's a backed buffer that hasn't been created yet - cl_buf = cl_buf if isinstance(cl_buf._buf, CLBuffer) else type(self).exec_ast(LazyOp(op=UnaryOps.NOOP, src=(self.movement_op(MovementOps.RESHAPE, tuple(list(self.shape)+[1])), ))) - assert prod(cl_buf._base_shape) == prod(self.shape), f"shape product mismatch {cl_buf._base_shape} vs {self.shape}" - data = np.empty(self.shape, dtype=np.float32) - assert GlobalCounters.cache is None, f"can't copy out {self} while caching" - cl_buf._buf.copyout(data) - return data - - @classmethod - def exec_ast(cls, ast:LazyOp, output_buffer:Optional[GPUBuffer]=None): - k = CLASTKernel(ast, output_buffer) - if KOPT > 0: - from extra.kernel_search import apply_optimization - apply_optimization(k, ast, max_interventions=KOPT) - prg = k.codegen() - if GlobalCounters.cache is not None: GlobalCounters.cache.append((prg, k.bufs)) - prg(*k.bufs) - if PRINT_AST == "1" or (hasattr(k, "fxn") and PRINT_AST == k.fxn.name): - print(k.fxn.name) - k.print() - if TEST_AST: - from extra.lib_test_ast import test_ast # type: ignore - test_ast(k) - return k.ret + return ASTRunner(function_name, ' '.join(self.kernel), self.bufs_to_delete, + self.output_shape[::-1] if len(self.output_shape) > 0 else [1], + (self.group_for_reduce[::-1] + [1]*(len(self.output_shape)-len(self.group_for_reduce))) if self.group_for_reduce else None, + op_estimate=self.info.flops, mem_estimate=sum(prod(x._base_shape) for x in self.bufs)) diff --git a/tinygrad/llops/ops_llvm.py b/tinygrad/codegen/llvm.py similarity index 77% rename from tinygrad/llops/ops_llvm.py rename to tinygrad/codegen/llvm.py index 13f69b8cf7..fc51c9a228 100644 --- a/tinygrad/llops/ops_llvm.py +++ b/tinygrad/codegen/llvm.py @@ -1,19 +1,12 @@ -from __future__ import annotations -import math -import functools -from typing import Tuple, Union, Dict, Any, List, ClassVar, Optional -from tinygrad.helpers import prod, DEBUG -from tinygrad.shape import ShapeTracker -from tinygrad.ops import LazyOp -from tinygrad.ast import ASTKernel -import ctypes -import numpy as np -from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, ExplicitExecAST -from tinygrad.runtime.llvm import LLVM, ir +import functools, math +from typing import ClassVar, List +from llvmlite import ir # type: ignore +from tinygrad.codegen.ast import ASTKernel, ASTRunner +from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, LazyOp +from tinygrad.helpers import DEBUG, prod + from tinygrad.shape.symbolic import Variable, NumNode, MulNode, DivNode, ModNode, GeNode, LtNode, SumNode, AndNode - def int_const(x): return ir.Constant(ir.IntType(64), x) - render_llvm = { Variable: lambda self,ops,ctx: self.expr, NumNode: lambda self,ops,ctx: int_const(self.b), @@ -26,7 +19,7 @@ render_llvm = { AndNode: lambda self,ops,ctx: functools.reduce(lambda a,b: ctx.and_(a,b.render(ops,ctx)), self.nodes[1:], self.nodes[0].render(ops,ctx)) } -class LLVMBuffer(ExplicitExecAST): +class LLVMCodegen(ASTKernel): op_lookup : ClassVar = { UnaryOps.NOOP: lambda builder,x: x, UnaryOps.NEG: lambda builder,x: builder.fneg(x, flags=('fast',)), @@ -46,41 +39,10 @@ class LLVMBuffer(ExplicitExecAST): ReduceOps.MAX: ir.Constant(ir.FloatType(), -math.inf) } - def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf=None, force_create=False): - super().__init__(shape, hostbuf) - # TODO: force alignment? - self._buf = (ctypes.c_float * (prod(self.shape)))() if hostbuf is None else hostbuf._buf - #assert ctypes.addressof(self._buf) & 0x1F == 0 + def codegen(self): + self.process() + if DEBUG >= 3: self.printbufs("old:", DEBUG>=4) - def __repr__(self): return f"LLVMBuffer {str(self.st)}" - - @staticmethod - def fromCPU(x): - x = x.astype(np.float32) - ret = LLVMBuffer(x.shape) - ctypes.memmove(ret._buf, x.ctypes.data, prod(ret.shape)*4) - return ret - - def toCPU(x): return np.ctypeslib.as_array(x.contiguous()._buf)[:prod(x.shape)].reshape(x.shape).copy() - - func_cache : Dict[str, Any] = {} - @classmethod - def exec_ast(cls, ast:LazyOp, output_buffer:Optional[LLVMBuffer]=None) -> LLVMBuffer: - k = ASTKernel(ast, output_buffer) - - # cached kernel - if k.key in LLVMBuffer.func_cache: - LLVMBuffer.func_cache[k.key](*[x._buf for x in k.bufs]) - return k.ret - - # process if uncached - k.process() - - if DEBUG >= 2: - print(k.ast) - print("old:", [x.shape for x in k.sts]) - print("old:", [x.views[-1].strides for x in k.sts]) - # this stuff can't be hand coded kernel_output_axis : List[int] = [] """ @@ -119,12 +81,12 @@ class LLVMBuffer(ExplicitExecAST): """ # the 4x4 need to go all the way at the end, even after reduce - output_shape = k.sts[0].shape - full_shape_options = [x.shape for x in k.sts if x.shape != output_shape] + output_shape = self.sts[0].shape + full_shape_options = [x.shape for x in self.sts if x.shape != output_shape] full_shape = output_shape if len(full_shape_options) == 0 else full_shape_options[0] full_shape = full_shape if not kernel_output_axis else full_shape[:-len(kernel_output_axis)] - kernel_output_dim = prod([k.sts[0].shape[a] for a in kernel_output_axis]) + kernel_output_dim = prod([self.sts[0].shape[a] for a in kernel_output_axis]) kernel_output_type = ir.FloatType() if kernel_output_dim == 1 else ir.VectorType(ir.FloatType(), kernel_output_dim) def get_idxs(builder, idx, buf_index): @@ -143,7 +105,7 @@ class LLVMBuffer(ExplicitExecAST): # create llvm function module = ir.Module(name=__file__) - func = ir.Function(module, ir.FunctionType(ir.VoidType(), [ir.FloatType().as_pointer()]*(len(k.bufs))), name='exec') + func = ir.Function(module, ir.FunctionType(ir.VoidType(), [ir.FloatType().as_pointer()]*(len(self.bufs))), name='exec') # force llvmlite to allow us to add function attribute then add the attribute func.attributes._known = func.attributes._known.union(frozenset(['"no-nans-fp-math"="true"'])) @@ -158,13 +120,13 @@ class LLVMBuffer(ExplicitExecAST): loop_exit = loop_exit[::-1] # add the buffer indexing - idx_level = [[int_const(st.offset)] for st in k.sts] + idx_level = [[int_const(st.offset)] for st in self.sts] for i in range(len(full_shape)): - for j in range(len(k.bufs)): + for j in range(len(self.bufs)): # stride si = loop_entry[i+1].phi(ir.IntType(64), name=f"idx_{j}_{i}") si.add_incoming(idx_level[j][-1], loop_entry[i]._block) - si_ps = loop_exit[i+1].add(si, int_const(k.sts[j].views[-1].strides[i])) + si_ps = loop_exit[i+1].add(si, int_const(self.sts[j].views[-1].strides[i])) si.add_incoming(si_ps, loop_exit[i+1]._block) idx_level[j].append(si) @@ -172,7 +134,7 @@ class LLVMBuffer(ExplicitExecAST): def ast_parse(builder, x, level, reduce_result=None): if not isinstance(x, LazyOp): m = kernel_output_type(ir.Undefined) - buf_index = k.bufs.index(x) + buf_index = self.bufs.index(x) for i, idx in enumerate(get_idxs(builder, idx_level[buf_index][level], buf_index)): # first view is already implictly handled idx, valid = x.st._expr_idx(Variable(idx, 0, prod(x.st.shape))) @@ -195,12 +157,12 @@ class LLVMBuffer(ExplicitExecAST): m = kernel_output_type(ir.Undefined) if kernel_output_dim == 1: - return LLVMBuffer.op_lookup[x.op](builder, *values) + return LLVMCodegen.op_lookup[x.op](builder, *values) else: # TODO: this only has to be done for certain ops for i in range(kernel_output_dim): value = [builder.extract_element(v, int_const(i)) for v in values] - element = LLVMBuffer.op_lookup[x.op](builder, *value) + element = LLVMCodegen.op_lookup[x.op](builder, *value) m = builder.insert_element(m, element, int_const(i)) return m @@ -209,9 +171,9 @@ class LLVMBuffer(ExplicitExecAST): # do the early ast reduce_result = None - if k.reduceop: - reduce_input = ast_parse(loop_exit[-1], k.reduceop.src[0], -1) - phis = [LLVMBuffer.start_for_op[k.reduceop.op]] # type: ignore + if self.reduceop: + reduce_input = ast_parse(loop_exit[-1], self.reduceop.src[0], -1) + phis = [LLVMCodegen.start_for_op[self.reduceop.op]] # type: ignore if kernel_output_dim > 1: phis = [kernel_output_type(phis * kernel_output_dim)] for i in range(store_loop+1, len(loop_entry)): @@ -219,16 +181,16 @@ class LLVMBuffer(ExplicitExecAST): val.add_incoming(phis[-1], loop_entry[i-1]._block) phis.append(val) - if k.reduceop.op == ReduceOps.SUM: + if self.reduceop.op == ReduceOps.SUM: reduce_result = loop_exit[-1].fadd(reduce_input, val, flags=('fast',)) - elif k.reduceop.op == ReduceOps.MAX: + elif self.reduceop.op == ReduceOps.MAX: reduce_result = loop_exit[-1].select(loop_exit[-1].fcmp_unordered(">", val, reduce_input, flags=('fast',)), val, reduce_input, flags=('fast',)) for i,phi in enumerate(phis[1:]): phi.add_incoming(reduce_result, loop_exit[store_loop+1+i]._block) # do the late ast - result = ast_parse(loop_exit[store_loop], k.ast, store_loop, reduce_result=reduce_result) + result = ast_parse(loop_exit[store_loop], self.ast, store_loop, reduce_result=reduce_result) # store result builder = loop_exit[store_loop] @@ -247,5 +209,5 @@ class LLVMBuffer(ExplicitExecAST): loop_entry[-1].branch(loop_exit[-1]._block) loop_exit[0].ret_void() - LLVMBuffer.func_cache[k.key] = LLVM().exec(module, k.bufs, k.info.flops, sum(len(x._buf) for x in k.bufs)) - return k.ret + + return ASTRunner('exec', str(module), op_estimate=self.info.flops, mem_estimate=sum(prod(x._base_shape) for x in self.bufs)) diff --git a/tinygrad/jit.py b/tinygrad/jit.py index 37eedf9d4f..f8581aa755 100644 --- a/tinygrad/jit.py +++ b/tinygrad/jit.py @@ -13,7 +13,7 @@ class TinyJit: self.input_replace : Dict[DeviceBuffer, Any]= {} def __call__(self, *args, **kwargs) -> Any: - if Device.DEFAULT != "GPU": return self.fxn(*args, **kwargs) # only jit on the GPU + if Device.DEFAULT not in ["GPU", "CLANG"]: return self.fxn(*args, **kwargs) # only jit on the GPU # NOTE: this cast is needed since although we know realize will create a ".realized" DeviceBuffer, the type checker doesn't input_tensors = {k:cast(DeviceBuffer, v.realize().lazydata.realized)._buf for k,v in itertools.chain(enumerate(args), kwargs.items()) if isinstance(v, Tensor)} assert len(input_tensors) != 0, "no inputs to JIT" diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py index c4659610e7..13a6219485 100644 --- a/tinygrad/lazy.py +++ b/tinygrad/lazy.py @@ -1,6 +1,6 @@ from __future__ import annotations from typing import Optional, Tuple, Union, List, Dict, Any, ClassVar, Type -import sys, weakref, importlib, inspect +import os, sys, weakref, importlib, inspect from weakref import WeakValueDictionary from tinygrad.helpers import prod, DEBUG from tinygrad.shape import ShapeTracker @@ -14,21 +14,20 @@ sys.setrecursionlimit(10000) OPT = getenv("OPT", 2) LAZY = getenv("LAZY", 1) -def get_buffer(name, base='tinygrad.llops'): +def get_buffer(name, base='tinygrad.runtime'): try: - return (name.upper(), [cls for cname, cls in inspect.getmembers(importlib.import_module(f'{base}.ops_{name}'), inspect.isclass) if (cname.lower() == name + "buffer")][0]) + return [cls for cname, cls in inspect.getmembers(importlib.import_module(f'{base}.ops_{name}'), inspect.isclass) if (cname.lower() == name + "buffer")][0] except Exception as e: # NOTE: this can't be put on one line due to mypy issue print(name, "backend not available", e, file=sys.stderr) class _Device: def __init__(self) -> None: - self._buffers : Dict[str, Type[DeviceBuffer]] = {x[0]:x[1] for x in [ - get_buffer('cpu'), get_buffer('gpu'), get_buffer('llvm'), get_buffer('torch'), - get_buffer('triton', 'accel.triton')] if x is not None} + self._buffers : Dict[str, Type[DeviceBuffer]] = {x.upper():get_buffer(x) for x in + [os.path.splitext(x)[0][len("ops_"):] for x in sorted(os.listdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), "runtime"))) if x.startswith("ops_")] if x is not None} self.DEFAULT : str = "CPU" for name in self._buffers: if getenv(name) == 1: self.DEFAULT = name # note: DEFAULT can be a Device that can't be imported. better than silent use of a different device - self.__setattr__(name, name) + if self._buffers[name] is not None: self.__setattr__(name, name) Device = _Device() # TODO: movement ops that only change shape are really nops. treat them as such diff --git a/tinygrad/ops.py b/tinygrad/ops.py index 98ac10a0ad..50c8befb5b 100644 --- a/tinygrad/ops.py +++ b/tinygrad/ops.py @@ -1,7 +1,7 @@ from __future__ import annotations import numpy as np from enum import Enum, auto -from typing import Union, Type, NamedTuple, Tuple, Any, List, ClassVar, Optional, Callable, Dict +from typing import Union, Type, NamedTuple, Tuple, Any, List, ClassVar, Optional, Callable, Dict, TypeVar import functools, operator from tinygrad.helpers import prod, DEBUG from tinygrad.shape import ShapeTracker @@ -32,13 +32,35 @@ def map_buffers(real_srcs, x:LazyOp) -> LazyOp: if x in real_srcs: return map_buffers(real_srcs, real_srcs[x]) if isinstance(real_srcs[x], LazyOp) else real_srcs[x] return LazyOp(x.op, tuple((map_buffers(real_srcs, y) if isinstance(y, LazyOp) else real_srcs[y]) for y in x.src), x.arg) +_T = TypeVar("_T") +class RawBuffer: + def __init__(self, size): raise NotImplementedError("must be implemented") + @classmethod + def fromCPU(cls:Type[_T], x:np.ndarray) -> _T: raise NotImplementedError("must be implemented") + def toCPU(self:RawBuffer) -> np.ndarray: raise NotImplementedError("must be implemented") + +class RawBufferCopyIn(RawBuffer): + def copyin(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented") + + @classmethod + def fromCPU(cls, x:np.ndarray): + ret = cls(4*prod(x.shape)) + ret.copyin(x) + return ret + +class RawBufferCopyInOut(RawBufferCopyIn): + size : int + def copyout(self, x:np.ndarray) -> None: raise NotImplementedError("must be implemented") + + def toCPU(self) -> np.ndarray: + x = np.empty((self.size//4), dtype=np.float32) + self.copyout(x) + return x + # a placeholder class to extend by the exec classes -class DeviceBuffer: +class DeviceBuffer(RawBuffer): _buf: Any # underlying buffer shape: Tuple[int, ...] - @staticmethod - def fromCPU(x:np.ndarray) -> DeviceBuffer: raise NotImplementedError("must be implemented") - def toCPU(self:DeviceBuffer) -> np.ndarray: raise NotImplementedError("must be implemented") @classmethod def exec_ast(cls, ast:LazyOp, output_buffer=None): raise NotImplementedError("must be implemented") @@ -53,14 +75,14 @@ shape_fxn_for_op : Dict[Op, Callable] = { **{op:functools.partial(lambda mop,self,arg: GenericShape(ShapeTracker(self.shape).movement_op(mop, arg).shape, self.flops), op) for op in MovementOps}} # used in CPUBuffer and TorchBuffer -class GenericExecAST(DeviceBuffer): # pylint: disable=abstract-method +class InterpretedBuffer(DeviceBuffer): # pylint: disable=abstract-method fxn_for_op : ClassVar = shape_fxn_for_op # TODO: use generic types here to remove __init__ in specialized classes - def __init__(self, lbuf:Any): self.buf, self.shape = lbuf, tuple(lbuf.shape) + def __init__(self, lbuf:Any): self._buf, self.shape = lbuf, tuple(lbuf.shape) def contiguous(self): return type(self).exec_ast(LazyOp(op=UnaryOps.NOOP, src=(self,))) - def movement_op(self, op:MovementOps, arg=None): return type(self)(self.fxn_for_op[op](self.buf, arg)) if op in self.fxn_for_op else type(self)(getattr(self.buf, op.name.lower())(arg)) + def movement_op(self, op:MovementOps, arg=None): return type(self)(self.fxn_for_op[op](self._buf, arg)) if op in self.fxn_for_op else type(self)(getattr(self._buf, op.name.lower())(arg)) @classmethod - def exec_ast(cls, ast:LazyOp, output_buffer:Optional[GenericExecAST]=None): + def exec_ast(cls, ast:LazyOp, output_buffer:Optional[InterpretedBuffer]=None): if FusedOps.MULACC in cls.fxn_for_op and ast.op == ReduceOps.SUM and isinstance(ast.src[0], LazyOp) and ast.src[0].op == BinaryOps.MUL: ast = LazyOp(FusedOps.MULACC, ast.src[0].src, ast.arg) srcs = [cls.exec_ast(x) if isinstance(x, LazyOp) else x for x in ast.src] @@ -68,22 +90,57 @@ class GenericExecAST(DeviceBuffer): # pylint: disable=abstract-method if ast.op in BinaryOps: assert srcs[0].shape == srcs[1].shape, f"BinaryOps shape mismatch {srcs[0].shape} != {srcs[1].shape}" if ast.op in ReduceOps: assert all(r == n or n == 1 for r,n in zip(srcs[0].shape, ast.arg)), f"ReduceOps can't reduce {srcs[0].shape} -> {ast.arg}" if ast.op in MovementOps: ret = srcs[0].movement_op(ast.op, ast.arg) - else: ret = cls(cls.fxn_for_op[ast.op](*([x.buf for x in srcs] + ([ast.arg] if ast.arg else [])))) + else: ret = cls(cls.fxn_for_op[ast.op](*([x._buf for x in srcs] + ([ast.arg] if ast.arg else [])))) if output_buffer is not None: assert output_buffer.shape == ret.shape - output_buffer.buf = ret.buf + output_buffer._buf = ret._buf return output_buffer else: return ret -def get_lazyop_info(ast:LazyOp): return GenericExecAST.exec_ast(map_buffers({x:GenericExecAST(GenericShape(x.shape)) for x in get_buffers(ast)}, ast)).buf +def get_lazyop_info(ast:LazyOp): return InterpretedBuffer.exec_ast(map_buffers({x:InterpretedBuffer(GenericShape(x.shape)) for x in get_buffers(ast)}, ast))._buf # assumes you are using ShapeTracker # used in GPUBuffer and LLVMBuffer -class ExplicitExecAST(DeviceBuffer): # pylint: disable=abstract-method - def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf=None): +class CompiledBuffer(DeviceBuffer): # pylint: disable=abstract-method + def __init__(self, shape:Union[ShapeTracker, Tuple[int, ...]], hostbuf:Optional[CompiledBuffer]=None, backing:Optional[np.ndarray]=None, force_create=False): self.st = shape if isinstance(shape, ShapeTracker) else ShapeTracker(tuple(shape)) self.shape = self.st.shape self._base_shape : Tuple[int, ...] = hostbuf._base_shape if hostbuf is not None else self.shape + self._buf = hostbuf._buf if hostbuf is not None else None + self._backing : Optional[np.ndarray] = hostbuf._backing if hostbuf is not None else backing + if (self._backing is not None and self._backing.shape != (1,)) or force_create: self.raw() + + # TODO: not GPUBuffer, get name of class + def __repr__(self): return f"GPUBuffer(shape={self.st}, hostbuf=GPUBuffer(shape={self._base_shape}" + (f", backing=np.array({self._backing}, dtype=np.float32)))" if self._backing else ", force_create=True))") + + raw_buffer_type : Type[RawBuffer] + @classmethod + def create_raw_buffer(cls, shape, backing) -> RawBuffer: + assert backing is None or prod(shape) == prod(backing.shape), "backing has the wrong shape" + assert backing is None or GlobalCounters.cache is None, f"can't copy in {backing.shape} while caching" + return cls.raw_buffer_type(4*prod(shape)) if backing is None else cls.raw_buffer_type.fromCPU(backing) + def raw(self) -> RawBuffer: + if self._buf is None: self._buf = self.create_raw_buffer(self._base_shape, self._backing) + self._backing = None + return self._buf + + @classmethod + def fromCPU(cls, x:np.ndarray) -> CompiledBuffer: return cls(x.shape, backing=x.view(np.ndarray).astype(np.float32).ravel()) + def toCPU(self) -> np.ndarray: + assert GlobalCounters.cache is None, f"can't copy out {self} while caching" + return self.contiguous().raw().toCPU().reshape(self.shape) + + codegen_type : Any + runtime_type : Type + method_cache : Dict[str, Callable] = {} + @classmethod + def exec_ast(cls, ast:LazyOp, output_buffer:Optional[CompiledBuffer]=None): + k = cls.codegen_type(ast, output_buffer) + if k.key not in cls.method_cache: cls.method_cache[k.key] = k.codegen().build(cls.runtime_type) + prg = cls.method_cache[k.key] + if GlobalCounters.cache is not None: GlobalCounters.cache.append((prg, k.bufs)) + prg(*k.bufs) + return k.ret # universal for shape tracked def contiguous(self): return self if self.st.contiguous and prod(self._base_shape) == prod(self.shape) else type(self).exec_ast(LazyOp(op=UnaryOps.NOOP, src=(self,))) @@ -92,12 +149,12 @@ class ExplicitExecAST(DeviceBuffer): # pylint: disable=abstract-method class GlobalCounters: global_ops : ClassVar[int] = 0 global_mem : ClassVar[int] = 0 - time_sum : ClassVar[int] = 0 + time_sum_s : ClassVar[float] = 0.0 kernel_count : ClassVar[int] = 0 mem_used : ClassVar[int] = 0 # NOTE: this is not reset cache : ClassVar[Optional[list]] = None @staticmethod - def reset(): GlobalCounters.global_ops, GlobalCounters.global_mem, GlobalCounters.time_sum, GlobalCounters.kernel_count, GlobalCounters.cache = 0,0,0,0,None + def reset(): GlobalCounters.global_ops, GlobalCounters.global_mem, GlobalCounters.time_sum_s, GlobalCounters.kernel_count, GlobalCounters.cache = 0,0,0.0,0,None @staticmethod def log_kernel(op_estimate:int, mem_estimate:int): GlobalCounters.kernel_count += 1 diff --git a/tinygrad/runtime/clang.py b/tinygrad/runtime/clang.py deleted file mode 100644 index 7068f08a5f..0000000000 --- a/tinygrad/runtime/clang.py +++ /dev/null @@ -1,38 +0,0 @@ -import ctypes -import os -import numpy as np -import hashlib -import subprocess -from collections import defaultdict -from typing import List, Final, Dict -from tinygrad.helpers import DEBUG -import platform -OSX = platform.system() == "Darwin" - -class CLBuffer: - def __init__(self, size): self._cl = (ctypes.c_float * (size))() - def copyin(self, b:np.ndarray): ctypes.memmove(self._cl, b.ctypes.data, b.size*4) - def copyout(self, a:np.ndarray): - np.copyto(a, np.ctypeslib.as_array(self._cl)[:a.size].reshape(a.shape)) - -class CLProgram: - kernel_prefix, buffer_prefix, smem_prefix, barrier = "", "", "", "" - gid = [f"gid[{i}]" for i in range(3)] - lid = [f"lid[{i}]" for i in range(3)] - extra_args : List[str] = [] - kernel_cnt : Final[Dict[str, int]] = defaultdict(int) - # TODO: remove name, factor out op_estimate and mem_estimate - def __init__(self, name:str, prg:str, rename=True, op_estimate=0, mem_estimate=0): - self.name = f"{name}{('_N'+str(CLProgram.kernel_cnt[name])) if CLProgram.kernel_cnt[name] else str()}" if rename else name - CLProgram.kernel_cnt[name] += 1 - self.prg = prg.replace(f"{name}(", f"{self.name}(") - prg = "#include \n#define max(x,y) ((x>y)?x:y)\n" + prg - if DEBUG >= 4: print(prg) # TODO: outside runtime! - # TODO: is there a way to not write this to disk? - fn = f"/tmp/clang_{hashlib.md5(prg.encode('utf-8')).hexdigest()}.{'dylib' if OSX else 'so'}" - if not os.path.exists(fn): - subprocess.check_output(['clang', '-shared', '-O2', '-Wall','-Werror', '-lm', '-fPIC', '-x', 'c', '-', '-o', fn+".tmp"], input=prg.encode('utf-8')) - os.rename(fn+".tmp", fn) - self.lib = ctypes.CDLL(fn) - self.fxn = self.lib[name] - def __call__(self, *args): self.fxn(*args[2:]) diff --git a/tinygrad/runtime/cuda.py b/tinygrad/runtime/cuda.py deleted file mode 100644 index 95f0367046..0000000000 --- a/tinygrad/runtime/cuda.py +++ /dev/null @@ -1,37 +0,0 @@ -from typing import Optional, List -import pycuda.autoprimaryctx # type: ignore # pylint: disable=unused-import # noqa: F401 -import pycuda.driver as cuda # type: ignore -from pycuda.compiler import compile # type: ignore -import numpy as np -from tinygrad.helpers import DEBUG -from tinygrad.ops import GlobalCounters - -class CLBuffer: - def __init__(self, size): self._cl = cuda.mem_alloc(size) - def copyin(self, b:np.ndarray, stream:Optional[cuda.Stream]=None): cuda.memcpy_htod_async(self._cl, b, stream) - def copyout(self, a:np.ndarray): cuda.memcpy_dtoh(a, self._cl) - -class CLProgram: - kernel_prefix = "__global__" - buffer_prefix = "" - smem_prefix = "__shared__ " - barrier = "__syncthreads();" - float4 = "make_float4" - gid = [f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' for i in range(3)] - lid = [f'threadIdx.{chr(120+i)}' for i in range(3)] - extra_args : List[str] = [] - def __init__(self, name:str, prg:str, binary=False, shared=0, op_estimate:int=0, mem_estimate:int=0): - self.name, self.op_estimate, self.mem_estimate, self.shared = name, op_estimate, mem_estimate, shared - if DEBUG >= 4 and not binary: print("CUDA compile", prg) - if not binary: prg = compile(prg, target="ptx", no_extern_c=True).decode('utf-8') - if DEBUG >= 5: print(prg) - self.prg = cuda.module_from_buffer(prg.encode('utf-8')).get_function(prg.split(".visible .entry ")[1].split("(")[0]) - - def __call__(self, global_size, local_size, *args): - local_size = (local_size + [1] * (3 - len(local_size))) if local_size is not None else (1,1,1) - global_size = global_size + [1] * (3 - len(global_size)) - assert all(x%y == 0 for x,y in zip(global_size, local_size)), f"local:{local_size} must divide global:{global_size}" - global_size = [x//y for x,y in zip(global_size, local_size)] - if DEBUG >= 2: print("CUDA launch", global_size, local_size) - self.prg(*args, block=tuple(local_size), grid=tuple(global_size), shared=self.shared) - GlobalCounters.log_kernel(self.op_estimate, self.mem_estimate) \ No newline at end of file diff --git a/tinygrad/runtime/metal.py b/tinygrad/runtime/metal.py deleted file mode 100644 index 2ffdeacfbc..0000000000 --- a/tinygrad/runtime/metal.py +++ /dev/null @@ -1,98 +0,0 @@ -# pip3 install pyobjc-framework-Metal pyobjc-framework-libdispatch -import Metal, Cocoa, libdispatch # type: ignore -import numpy as np -from typing import List, Any -from tinygrad.ops import GlobalCounters -from tinygrad.helpers import prod, getenv, DEBUG -import subprocess, pathlib - -METAL_XCODE = getenv("METAL_XCODE") - -device = Metal.MTLCreateSystemDefaultDevice() -mtl_queue = device.newCommandQueue() -mtl_buffers_in_flight : List[Any] = [] - -def sync(): - global mtl_buffers_in_flight - for cbuf in mtl_buffers_in_flight: cbuf.waitUntilCompleted() - mtl_buffers_in_flight = [] - -class CLBuffer: - def __init__(self, size): self._cl = device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared) - def __del__(self): self._cl.release() - - def copyin(self, b:np.ndarray): - np.copyto(np.frombuffer(self._cl.contents().as_buffer(self._cl.length()), dtype=np.float32), b.reshape(-1).data) - - def toCPU(self): - sync() - return np.frombuffer(self._cl.contents().as_buffer(self._cl.length()), dtype=np.float32) - - # TODO: remove copyout everywhere - def copyout(self, a:np.ndarray): np.copyto(a, self.toCPU().reshape(a.shape)) - -class CLProgram: - kernel_prefix = "#include \nusing namespace metal;\nkernel" - buffer_prefix = "device " - smem_prefix = "threadgroup " - barrier = "threadgroup_barrier(mem_flags::mem_threadgroup);" - float4 = "float4" - gid = [f"gid.{chr(120+i)}" for i in range(3)] - lid = [f"lid.{chr(120+i)}" for i in range(3)] - extra_args = ['uint3 gid [[thread_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]'] - def __init__(self, name:str, prg:str, op_estimate:int=0, mem_estimate:int=0): - self.name, self.op_estimate, self.mem_estimate = name, op_estimate, mem_estimate - if DEBUG >= 4: print("Metal compile", prg) - if DEBUG >= 6: # dump llvm - air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8')) - dis = subprocess.check_output(['/Users/kafka/Downloads/clang+llvm-15.0.7-arm64-apple-darwin22.0/bin/llvm-dis'], input=air) - print(dis.decode('utf-8')) - if METAL_XCODE: - air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8')) - lib = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air) - data = libdispatch.dispatch_data_create(lib, len(lib), None, None) - self.library, err = device.newLibraryWithData_error_(data, None) - else: - options = Metal.MTLCompileOptions.alloc().init() - self.library, err = device.newLibraryWithSource_options_error_(prg, options, None) - assert err is None, str(err) - self.fxn = self.library.newFunctionWithName_(name) #self.library.functionNames()[0] - # hacks to disassemble shader - if DEBUG >= 5: - arc, err = device.newBinaryArchiveWithDescriptor_error_(Metal.MTLBinaryArchiveDescriptor.alloc().init(), None) - assert err is None, str(err) - desc = Metal.MTLComputePipelineDescriptor.alloc().init() - desc.setComputeFunction_(self.fxn) - _, err = arc.addComputePipelineFunctionsWithDescriptor_error_(desc, None) - assert err is None, str(err) - _, err = arc.serializeToURL_error_(Cocoa.NSURL.URLWithString_("file:///tmp/shader.bin"), None) - assert err is None, str(err) - # clone https://github.com/dougallj/applegpu.git in the root of tinygrad - import os - os.system(f"cd {pathlib.Path(__file__).parent.parent.parent}/applegpu && python3 compiler_explorer.py /tmp/shader.bin") - self.pipeline_state, err = device.newComputePipelineStateWithFunction_error_(self.fxn, None) - assert err is None, str(err) - - def __call__(self, global_size, local_size, *args): - global_size += [1] * (3-len(global_size)) - if local_size is None: local_size = [32] - local_size += [1] * (3-len(local_size)) - - assert prod(local_size) <= self.pipeline_state.maxTotalThreadsPerThreadgroup(), f"local size {local_size} bigger than {self.pipeline_state.maxTotalThreadsPerThreadgroup()} with exec width {self.pipeline_state.threadExecutionWidth()} memory length {self.pipeline_state.staticThreadgroupMemoryLength()}" - command_buffer = mtl_queue.commandBuffer() - encoder = command_buffer.computeCommandEncoder() - encoder.setComputePipelineState_(self.pipeline_state) - for i,a in enumerate(args): - encoder.setBuffer_offset_atIndex_(a, 0, i) - encoder.dispatchThreads_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size)) - encoder.endEncoding() - command_buffer.commit() - if DEBUG >= 2: - command_buffer.waitUntilCompleted() - et = command_buffer.GPUEndTime() - command_buffer.GPUStartTime() - print(f"METAL et {et*1e6:8.2f} us {self.name:28s} launch {str(global_size):18s} {local_size}") - GlobalCounters.time_sum += et - else: - mtl_buffers_in_flight.append(command_buffer) - GlobalCounters.log_kernel(self.op_estimate, self.mem_estimate) - return command_buffer diff --git a/tinygrad/runtime/opencl.py b/tinygrad/runtime/opencl.py deleted file mode 100644 index 515f745e0b..0000000000 --- a/tinygrad/runtime/opencl.py +++ /dev/null @@ -1,101 +0,0 @@ -import functools, platform -import numpy as np -import pyopencl as cl # type: ignore -from typing import Dict, Optional, Tuple, List, ClassVar, Final -from collections import defaultdict -from tinygrad.ops import GlobalCounters -from tinygrad.helpers import getenv, DEBUG - -OSX = platform.system() == "Darwin" -OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something - -CLCACHE = getenv("CLCACHE", 1) -FLOAT16 = getenv("FLOAT16", 0) - -class CL: - BUFFER_CACHE : ClassVar[Dict[int, List[cl.Buffer]]] = defaultdict(list) - cl_ctx : ClassVar[Optional[cl.Context]] = None - cl_queue : ClassVar[Optional[cl.CommandQueue]] = None - def __init__(self) -> None: - if CL.cl_queue is not None: return # already initted - devices : List[cl.Device] = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], []) - if len(devices) == 0: # settle for CPU - devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) - CL.cl_ctx = cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]]) - if len(devices) > 1 or DEBUG >= 1: print(f"using {CL.cl_ctx.devices}") - CL.cl_queue = cl.CommandQueue(self.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue - - @staticmethod - def enqueue_copy(a, b, is_blocking=False): - if DEBUG >= 1: print(f"**CL** copy in {b.shape}" if isinstance(b, np.ndarray) else f"**CL** copy OUT {a.shape}") - cl.enqueue_copy(CL().cl_queue, a, b, is_blocking=is_blocking) - -class CLBuffer: - def __init__(self, size): - if DEBUG >= 4: print(f"allocate GPU Buffer {size}") - if len(CL.BUFFER_CACHE[size]) > 0: - self._cl = CL.BUFFER_CACHE[size].pop() - else: - # TODO: on GPU OOM, clear the cache - self._cl = cl.Buffer(CL().cl_ctx, cl.mem_flags.READ_WRITE, size) - GlobalCounters.mem_used += self._cl.size - - def __del__(self): - if CLCACHE: CL.BUFFER_CACHE[self._cl.size].append(self._cl) - else: GlobalCounters.mem_used -= self._cl.size - - def copyin(self, b:np.ndarray): CL.enqueue_copy(self._cl, b, False) - def copyout(self, a:np.ndarray): CL.enqueue_copy(a, self._cl, True) - -class CLImage: - fmt : Final = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT) - - def __init__(self, shape): - self._cl = cl.Image(CL().cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0])) - GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height - - def __del__(self): - GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height - - def copyin(self, b:np.ndarray): raise NotImplementedError("no copyin for CLImage") - def copyout(self, a:np.ndarray): raise NotImplementedError("no copyout for CLImage") - -@functools.lru_cache(maxsize=None) -class CLProgram: - kernel_prefix = "__kernel" - buffer_prefix = "__global " - smem_prefix = "__local " - kernel_cnt : Final[Dict[str, int]] = defaultdict(int) - barrier = "barrier(CLK_LOCAL_MEM_FENCE);" - float4 = "(float4)" - gid = [f'get_global_id({i})' for i in range(3)] - lid = [f'get_local_id({i})' for i in range(3)] - extra_args : List[str] = [] - def __init__(self, name:str, prg:str, options:Tuple[str, ...]=tuple(), argdtypes=None, rename=True, binary=False, op_estimate=0, mem_estimate=0): - self.name = f"{name}{('_N'+str(CLProgram.kernel_cnt[name])) if CLProgram.kernel_cnt[name] else str()}" if rename else name - self.prg, self.options, self.argdtypes, self.op_estimate, self.mem_estimate = prg.replace(f"{name}(", f"{self.name}(") if rename else prg, options, argdtypes, op_estimate, mem_estimate - self.clprogram = cl.Program(CL().cl_ctx, CL().cl_ctx.devices, [self.prg]) if binary else cl.Program(CL().cl_ctx, self.prg) # type: ignore - try: - self.clprg = self.clprogram.build(options=list(self.options)).__getattr__(self.name) - except cl.RuntimeError as e: - if DEBUG >= 3: print("FAILED TO BUILD", self.prg) - raise e - if self.argdtypes is not None: - self.clprg.set_scalar_arg_dtypes(self.argdtypes) - CLProgram.kernel_cnt[name] += 1 - def __call__(self, *args) -> cl.Event: - if DEBUG >= 4: print(args[0], args[1], self.prg) - # print the PTX for NVIDIA. TODO: probably broken for everything else - if DEBUG >= 5 and not OSX: print(self.clprogram.get_info(cl.program_info.BINARIES)[0].decode('utf-8')) - e = self.clprg(CL().cl_queue, *args) - if DEBUG >= 2: - assert CL.cl_queue is not None - CL.cl_queue.finish() - # NOTE: Profiling is not in ns in OS X, we multiply by a computed ratio - et = (e.profile.end - e.profile.start) * OSX_TIMING_RATIO - GlobalCounters.time_sum += et - if DEBUG >= 1: - print(f"**CL** {GlobalCounters.kernel_count:6d} {self.name:28s} args {len(args[2:]):5d} kernels {str(args[0]):18s} {str(args[1]):12s} OPs {self.op_estimate/1e6:7.1f}M/{GlobalCounters.global_ops/1e9:7.2f}G mem {GlobalCounters.mem_used/1e9:5.2f} GB " + - (str() if DEBUG <= 1 else f"tm {et/1e3:9.2f}us/{GlobalCounters.time_sum/1e6:9.2f}ms ({self.op_estimate/et:8.2f} GFLOPS)")) - GlobalCounters.log_kernel(self.op_estimate, self.mem_estimate) - return e diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py new file mode 100644 index 0000000000..7d32b3baf5 --- /dev/null +++ b/tinygrad/runtime/ops_clang.py @@ -0,0 +1,37 @@ +import ctypes +import os, time +import numpy as np +import hashlib +import subprocess +from collections import defaultdict +from typing import Final, Dict +from tinygrad.ops import CompiledBuffer, RawBufferCopyIn +from tinygrad.codegen.gpu import GPUCodegen +import platform +OSX = platform.system() == "Darwin" + +class RawMallocBuffer(RawBufferCopyIn): + def __init__(self, size): self._buf = (ctypes.c_float * (size//4))() + def copyin(self, x:np.ndarray): ctypes.memmove(self._buf, x.ctypes.data, x.size*4) + def toCPU(self): return np.ctypeslib.as_array(self._buf) + +class ClangProgram: + kernel_cnt : Final[Dict[str, int]] = defaultdict(int) + def __init__(self, name:str, prg:str): + prg = "#include \n#define max(x,y) ((x>y)?x:y)\n" + prg + # TODO: is there a way to not write this to disk? + fn = f"/tmp/clang_{hashlib.md5(prg.encode('utf-8')).hexdigest()}.{'dylib' if OSX else 'so'}" + if not os.path.exists(fn): + subprocess.check_output(['clang', '-shared', '-O2', '-Wall','-Werror', '-lm', '-fPIC', '-x', 'c', '-', '-o', fn+".tmp"], input=prg.encode('utf-8')) + os.rename(fn+".tmp", fn) + self.lib = ctypes.CDLL(fn) + self.fxn = self.lib[name] + def __call__(self, *args, wait=False): + if wait: st = time.monotonic() + self.fxn(*[x._buf for x in args[2:]]) + if wait: return time.monotonic()-st + +class ClangBuffer(CompiledBuffer): + raw_buffer_type = RawMallocBuffer + codegen_type = GPUCodegen # clang is the default + runtime_type = ClangProgram diff --git a/tinygrad/llops/ops_cpu.py b/tinygrad/runtime/ops_cpu.py similarity index 95% rename from tinygrad/llops/ops_cpu.py rename to tinygrad/runtime/ops_cpu.py index e5ca55617b..5d495d9442 100644 --- a/tinygrad/llops/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -1,7 +1,7 @@ import numpy as np import operator from typing import ClassVar, Callable, Dict -from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, ReduceOps, FusedOps, GenericExecAST, Op +from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, ReduceOps, FusedOps, InterpretedBuffer, Op from tinygrad.helpers import shape_to_axis base_fxn_for_op : Dict[Op, Callable] = { @@ -30,9 +30,9 @@ numpy_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{ FusedOps.MULACC: einsum_mulacc(lambda s,a,b: np.einsum(s, a.copy(), b.copy()), lambda x: x.strides, np.broadcast_to) }} -class CPUBuffer(GenericExecAST): +class CPUBuffer(InterpretedBuffer): fxn_for_op : ClassVar = numpy_fxn_for_op @staticmethod def fromCPU(x): return CPUBuffer(x) - def toCPU(x): return x.buf + def toCPU(x): return x._buf diff --git a/tinygrad/runtime/ops_cuda.py b/tinygrad/runtime/ops_cuda.py new file mode 100644 index 0000000000..7f8fc265ba --- /dev/null +++ b/tinygrad/runtime/ops_cuda.py @@ -0,0 +1,38 @@ +from typing import Optional +import pycuda.autoprimaryctx # type: ignore # pylint: disable=unused-import # noqa: F401 +import pycuda.driver as cuda # type: ignore +from pycuda.compiler import compile # type: ignore +import numpy as np +from tinygrad.helpers import DEBUG +from tinygrad.ops import CompiledBuffer, RawBufferCopyInOut +from tinygrad.codegen.gpu import GPUCodegen, GPULanguage + +class RawCUDABuffer(RawBufferCopyInOut): + def __init__(self, size): self.size, self._cl = size, cuda.mem_alloc(size) + def copyin(self, x:np.ndarray, stream:Optional[cuda.Stream]=None): cuda.memcpy_htod_async(self._cl, x, stream) + def copyout(self, x:np.ndarray): cuda.memcpy_dtoh(x, self._cl) + +class CUDAProgram: + def __init__(self, name:str, prg:str, binary=False): + if not binary: prg = compile(prg, target="ptx", no_extern_c=True).decode('utf-8') + if DEBUG >= 5: print(prg) + # TODO: name is wrong + self.prg = cuda.module_from_buffer(prg.encode('utf-8')).get_function(prg.split(".visible .entry ")[1].split("(")[0]) + + def __call__(self, global_size, local_size, *args, wait=False): + local_size = (local_size + [1] * (3 - len(local_size))) if local_size is not None else (1,1,1) + global_size = global_size + [1] * (3 - len(global_size)) + assert all(x%y == 0 for x,y in zip(global_size, local_size)), f"local:{local_size} must divide global:{global_size}" + global_size = [x//y for x,y in zip(global_size, local_size)] + self.prg(*args, block=tuple(local_size), grid=tuple(global_size)) + +class CUDACodegen(GPUCodegen): + lang = GPULanguage( + kernel_prefix = "__global__", smem_prefix = "__shared__ ", barrier = "__syncthreads();", float4 = "make_float4", + gid = [f'blockDim.{chr(120+i)}*blockIdx.{chr(120+i)}+threadIdx.{chr(120+i)}' for i in range(3)], + lid = [f'threadIdx.{chr(120+i)}' for i in range(3)]) + +class CUDABuffer(CompiledBuffer): + raw_buffer_type = RawCUDABuffer + codegen_type = CUDACodegen + runtime_type = CUDAProgram diff --git a/tinygrad/runtime/ops_gpu.py b/tinygrad/runtime/ops_gpu.py new file mode 100644 index 0000000000..cd7f6335a7 --- /dev/null +++ b/tinygrad/runtime/ops_gpu.py @@ -0,0 +1,92 @@ +from __future__ import annotations +import platform, functools +import numpy as np +import pyopencl as cl # type: ignore +from typing import Dict, Optional, List, ClassVar, Final +from collections import defaultdict +from tinygrad.helpers import IMAGE, DEBUG, getenv +from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer +from tinygrad.codegen.gpu import GPUCodegen, GPULanguage + +OSX = platform.system() == "Darwin" +OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something +CLCACHE = getenv("CLCACHE", 1) +FLOAT16 = getenv("FLOAT16", 0) + +class _CL: + @functools.cached_property + def cl_ctx(self) -> cl.Context: + devices : List[cl.Device] = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], []) + if len(devices) == 0: devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) # settle for CPU + if len(devices) > 1 or DEBUG >= 1: print(f"using {devices[getenv('CL_DEVICE', 0)]}") + return cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]]) + + @functools.cached_property + def cl_queue(self) -> cl.CommandQueue: + return cl.CommandQueue(CL.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue +CL = _CL() + +class CLBuffer(RawBufferCopyInOut): + # TODO: this can be in RawBuffer generically + BUFFER_CACHE : ClassVar[Dict[int, List[cl.Buffer]]] = defaultdict(list) + + def __init__(self, size): + self.size = size + if len(CLBuffer.BUFFER_CACHE[size]) > 0: + self._cl = CLBuffer.BUFFER_CACHE[size].pop() + else: + # TODO: on GPU OOM, clear the cache + self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size) + GlobalCounters.mem_used += self._cl.size + + def __del__(self): + if CLCACHE: CLBuffer.BUFFER_CACHE[self._cl.size].append(self._cl) + else: GlobalCounters.mem_used -= self._cl.size + + def copyin(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, self._cl, x, is_blocking=False) + def copyout(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, x, self._cl, is_blocking=True) + +class CLImage(RawBuffer): + fmt : Final = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT) + IMAGE : Final = True + + def __init__(self, shape): + self._cl = cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0])) + GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height + + def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height + +class CLProgram: + def __init__(self, name:str, prg:str, binary=False, argdtypes=None): + self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg) # type: ignore + try: + self._clprg = self.clprogram.build() + except cl.RuntimeError as e: + if DEBUG >= 3: print("FAILED TO BUILD", prg) + raise e + self.clprg = self._clprg.__getattr__(name) + if DEBUG >= 5 and not OSX: print(self.clprogram.get_info(cl.program_info.BINARIES)[0].decode('utf-8')) # print the PTX for NVIDIA. TODO: probably broken for everything else + if self.argdtypes is not None: self.clprg.set_scalar_arg_dtypes(self.argdtypes) + + def __call__(self, global_size, local_size, *bufs, wait=False) -> Optional[float]: + e = self.clprg(CL.cl_queue, global_size, local_size, *[x._cl if isinstance(x, (CLBuffer, CLImage)) else x for x in bufs]) + if wait: + CL.cl_queue.finish() + return ((e.profile.end - e.profile.start) * OSX_TIMING_RATIO) * 1e-9 + return None + +class CLCodegen(GPUCodegen): + lang = GPULanguage( + kernel_prefix = "__kernel", buffer_prefix = "__global ", smem_prefix = "__local ", + barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)", + gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)]) + +class GPUBuffer(CompiledBuffer): + raw_buffer_type = CLBuffer + # override this method for image + @classmethod + def create_raw_buffer(cls, shape, backing) -> RawBuffer: + if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and not backing: return CLImage(shape) + else: return super().create_raw_buffer(shape, backing) + codegen_type = CLCodegen + runtime_type = CLProgram diff --git a/tinygrad/runtime/llvm.py b/tinygrad/runtime/ops_llvm.py similarity index 53% rename from tinygrad/runtime/llvm.py rename to tinygrad/runtime/ops_llvm.py index 2ff70d301f..7d17fa2b3f 100644 --- a/tinygrad/runtime/llvm.py +++ b/tinygrad/runtime/ops_llvm.py @@ -1,13 +1,12 @@ +import time, hashlib, ctypes from typing import ClassVar +from tinygrad.ops import CompiledBuffer +from tinygrad.runtime.ops_clang import RawMallocBuffer from tinygrad.helpers import getenv, DEBUG -from tinygrad.ops import GlobalCounters -import hashlib -import time -import ctypes from ctypes import CFUNCTYPE +from tinygrad.codegen.llvm import LLVMCodegen import llvmlite.binding as llvm # type: ignore -from llvmlite import ir # type: ignore class LLVM: target_machine : ClassVar[llvm.targets.TargetMachine] = None @@ -15,8 +14,7 @@ class LLVM: optimizer : ClassVar[llvm.passmanagers.ModulePassManager] = None def __init__(self): - if LLVM.engine is not None: - return + if LLVM.engine is not None: return llvm.initialize() llvm.initialize_native_target() llvm.initialize_native_asmprinter() @@ -46,42 +44,26 @@ class LLVM: backing_mod.triple = llvm.get_process_triple() LLVM.engine = llvm.create_mcjit_compiler(backing_mod, LLVM.target_machine) - # TODO: LLVMProgram - def exec(self, module:ir.Module, bufs, op_estimate=0, mem_estimate=0): - module.triple = llvm.get_process_triple() - module.data_layout = self.engine.target_data - llvm_ir = str(module) - - if DEBUG >= 2: - print(llvm_ir) - - mod = llvm.parse_assembly(llvm_ir) - mod.verify() - LLVM.optimizer.run(mod) - if DEBUG >= 4: - print("Optimized IR:") - print(str(mod)) - mod.name = hashlib.sha1(llvm_ir.encode('utf-8')).hexdigest() - if DEBUG >= 3: - print(LLVM.target_machine.emit_assembly(mod)) - LLVM.engine.add_module(mod) +class LLVMProgram: + def __init__(self, name:str, prg:str, binary=False): + self.mod = llvm.parse_assembly(prg) + self.mod.verify() + LLVM().optimizer.run(self.mod) + self.mod.name = hashlib.sha1(prg.encode('utf-8')).hexdigest() + if DEBUG >= 5: print(LLVM.target_machine.emit_assembly(self.mod)) + LLVM.engine.add_module(self.mod) LLVM.engine.finalize_object() + self.fxn = LLVM.engine.get_function_address(name) - # call function (NOTE: if the types don't match, there's likely something wrong with the cache) - #cfunc = CFUNCTYPE(ctypes.c_int, *[type(x._buf) for x in bufs])(LLVM.engine.get_function_address('exec')) + def __del__(self): LLVM.engine.remove_module(self.mod) - # why is this needed without the types. fixed tests below - # LLVM=1 OPT=2 python3 test/test_ops.py TestOps.test_cat TestOps.test_multicat - cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.POINTER(ctypes.c_float) for x in bufs])(LLVM.engine.get_function_address('exec')) - - st = time.monotonic() + def __call__(self, unused_global_size, unused_local_size, *bufs, wait=False): + cfunc = CFUNCTYPE(ctypes.c_int, *[ctypes.POINTER(ctypes.c_float) for _ in bufs])(self.fxn) + if wait: st = time.monotonic() cfunc(*[x._buf for x in bufs]) - et = time.monotonic() - st - if DEBUG >= 1: - print(f"**LLVM** time {et*1000:7.2f} ms OPs {op_estimate/1e6:7.2f}M -- {(op_estimate/1e9)/et:5.2f} GFLOPS -- {mem_estimate:10d} reads -- {(mem_estimate*4/1e9)/et:5.2f} GB/s") - GlobalCounters.global_ops += op_estimate - GlobalCounters.global_mem += mem_estimate + if wait: return time.monotonic()-st - # we are done - LLVM.engine.remove_module(mod) - return cfunc +class LLVMBuffer(CompiledBuffer): + raw_buffer_type = RawMallocBuffer + codegen_type = LLVMCodegen + runtime_type = LLVMProgram diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py new file mode 100644 index 0000000000..fc363dce88 --- /dev/null +++ b/tinygrad/runtime/ops_metal.py @@ -0,0 +1,92 @@ +# pip3 install pyobjc-framework-Metal pyobjc-framework-libdispatch +import os, subprocess, pathlib, functools +import Metal, Cocoa, libdispatch # type: ignore +import numpy as np +from typing import List, Any +from tinygrad.codegen.gpu import GPUCodegen, GPULanguage +from tinygrad.helpers import prod, getenv, DEBUG +from tinygrad.ops import CompiledBuffer, RawBufferCopyIn + +METAL_XCODE = getenv("METAL_XCODE") + +class _METAL: + mtl_buffers_in_flight : List[Any] = [] + @functools.cached_property + def device(self): + return Metal.MTLCreateSystemDefaultDevice() + @functools.cached_property + def mtl_queue(self): + return METAL.device.newCommandQueue() +METAL = _METAL() + +class RawMetalBuffer(RawBufferCopyIn): + def __init__(self, size): self.size, self._cl = size, METAL.device.newBufferWithLength_options_(size, Metal.MTLResourceStorageModeShared) + def __del__(self): self._cl.release() + def _as_np(self): return np.frombuffer(self._cl.contents().as_buffer(self._cl.length()), dtype=np.float32) + def copyin(self, x:np.ndarray): np.copyto(self._as_np(), x.reshape(-1).data) + def toCPU(self) -> np.ndarray: + for cbuf in METAL.mtl_buffers_in_flight: cbuf.waitUntilCompleted() + METAL.mtl_buffers_in_flight = [] + return self._as_np() # no copy! + +class MetalProgram: + def __init__(self, name:str, prg:str): + if DEBUG >= 6: # dump llvm + air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8')) + dis = subprocess.check_output(['/Users/kafka/Downloads/clang+llvm-15.0.7-arm64-apple-darwin22.0/bin/llvm-dis'], input=air) + print(dis.decode('utf-8')) + if METAL_XCODE: + air = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metal', '-x', 'metal', '-c', '-', '-o', '-'], input=prg.encode('utf-8')) + lib = subprocess.check_output(['xcrun', '-sdk', 'macosx', 'metallib', '-', '-o', '-'], input=air) + data = libdispatch.dispatch_data_create(lib, len(lib), None, None) + self.library, err = METAL.device.newLibraryWithData_error_(data, None) + else: + options = Metal.MTLCompileOptions.alloc().init() + self.library, err = METAL.device.newLibraryWithSource_options_error_(prg, options, None) + assert err is None, str(err) + self.fxn = self.library.newFunctionWithName_(name) + # hacks to disassemble shader + if DEBUG >= 5: + arc, err = METAL.device.newBinaryArchiveWithDescriptor_error_(Metal.MTLBinaryArchiveDescriptor.alloc().init(), None) + assert err is None, str(err) + desc = Metal.MTLComputePipelineDescriptor.alloc().init() + desc.setComputeFunction_(self.fxn) + _, err = arc.addComputePipelineFunctionsWithDescriptor_error_(desc, None) + assert err is None, str(err) + _, err = arc.serializeToURL_error_(Cocoa.NSURL.URLWithString_("file:///tmp/shader.bin"), None) + assert err is None, str(err) + # clone https://github.com/dougallj/applegpu.git in the root of tinygrad + os.system(f"cd {pathlib.Path(__file__).parent.parent.parent}/applegpu && python3 compiler_explorer.py /tmp/shader.bin") + self.pipeline_state, err = METAL.device.newComputePipelineStateWithFunction_error_(self.fxn, None) + assert err is None, str(err) + + def __call__(self, global_size, local_size, *bufs, wait=False): + global_size += [1] * (3-len(global_size)) + if local_size is None: local_size = [32] + local_size += [1] * (3-len(local_size)) + + assert prod(local_size) <= self.pipeline_state.maxTotalThreadsPerThreadgroup(), f"local size {local_size} bigger than {self.pipeline_state.maxTotalThreadsPerThreadgroup()} with exec width {self.pipeline_state.threadExecutionWidth()} memory length {self.pipeline_state.staticThreadgroupMemoryLength()}" + command_buffer = METAL.mtl_queue.commandBuffer() + encoder = command_buffer.computeCommandEncoder() + encoder.setComputePipelineState_(self.pipeline_state) + for i,a in enumerate(bufs): encoder.setBuffer_offset_atIndex_(a._cl, 0, i) + encoder.dispatchThreads_threadsPerThreadgroup_(Metal.MTLSize(*global_size), Metal.MTLSize(*local_size)) + encoder.endEncoding() + command_buffer.commit() + if wait: + command_buffer.waitUntilCompleted() + return command_buffer.GPUEndTime() - command_buffer.GPUStartTime() + else: + METAL.mtl_buffers_in_flight.append(command_buffer) + +class MetalCodegen(GPUCodegen): + lang = GPULanguage( + kernel_prefix = "#include \nusing namespace metal;\nkernel", buffer_prefix = "device ", smem_prefix = "threadgroup ", + barrier = "threadgroup_barrier(mem_flags::mem_threadgroup);", float4 = "float4", + gid = [f"gid.{chr(120+i)}" for i in range(3)], lid = [f"lid.{chr(120+i)}" for i in range(3)], + extra_args = ['uint3 gid [[thread_position_in_grid]]', 'uint3 lid [[thread_position_in_threadgroup]]']) + +class MetalBuffer(CompiledBuffer): + raw_buffer_type = RawMetalBuffer + codegen_type = MetalCodegen + runtime_type = MetalProgram \ No newline at end of file diff --git a/tinygrad/llops/ops_torch.py b/tinygrad/runtime/ops_torch.py similarity index 75% rename from tinygrad/llops/ops_torch.py rename to tinygrad/runtime/ops_torch.py index f881fc37cd..120c3900c6 100644 --- a/tinygrad/llops/ops_torch.py +++ b/tinygrad/runtime/ops_torch.py @@ -1,8 +1,8 @@ import torch -from typing import ClassVar, Final, Dict, Callable -from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, FusedOps, GenericExecAST, Op +from typing import ClassVar, Dict, Callable +from tinygrad.ops import UnaryOps, BinaryOps, MovementOps, FusedOps, InterpretedBuffer, Op from tinygrad.helpers import getenv -from tinygrad.llops.ops_cpu import base_fxn_for_op, einsum_mulacc +from tinygrad.runtime.ops_cpu import base_fxn_for_op, einsum_mulacc torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{ UnaryOps.EXP: lambda x: x.exp(), UnaryOps.LOG: lambda x: x.log(), @@ -12,10 +12,9 @@ torch_fxn_for_op : Dict[Op, Callable] = {**base_fxn_for_op, **{ }} device = torch.device("cuda:0" if torch.cuda.is_available() else ("mps" if getenv("MPS", 0) else "cpu")) -class TorchBuffer(GenericExecAST): +class TorchBuffer(InterpretedBuffer): fxn_for_op : ClassVar = torch_fxn_for_op - SUPPORTS_SIMPLE_PADDING : Final = True @staticmethod def fromCPU(data): return TorchBuffer(torch.from_numpy(data).requires_grad_(False).to(device)) - def toCPU(x): return x.buf.cpu().numpy() + def toCPU(x): return x._buf.cpu().numpy()