diff --git a/accel/lazy/ops_lazy.py b/accel/lazy/ops_lazy.py
index 145023c2b4..78bc3835b9 100644
--- a/accel/lazy/ops_lazy.py
+++ b/accel/lazy/ops_lazy.py
@@ -3,7 +3,7 @@ import os
 from typing import Union, NamedTuple, List, Any, Tuple, Dict
 from tinygrad.shapetracker import ShapeTracker
 import functools, operator
-from tinygrad.helpers import prod
+from tinygrad.helpers import prod, ConvArgs
 import sys
 sys.setrecursionlimit(10000)
 
@@ -13,7 +13,8 @@ LoadOps = Enum("LoadOps", ["FROMCPU", "CONTIGUOUS"])
 Op = Union[BinaryOps, ReduceOps, MovementOps, ProcessingOps, LoadOps]
 
 MERGE_MOVEMENT_OPS = True
-SHUFFLE_MOVEMENT_OPS = True # this breaks maxpool
+SHUFFLE_MOVEMENT_OPS = True
+SHUFFLE_SLICE_OPS = False # NOTE: 0/0 is NaN if you slice, so this can change the output
 REMOVE_MOVEMENT_NOPS = True
 MERGE_ELEMENTWISE_OPS = True
 MERGE_ELEMENTWISE_INTO_CONV_OUTPUT = True
@@ -86,11 +87,11 @@ class LazyBuffer:
   def binary_op(x, op, y:LazyBuffer): return elementwise_op(op, (x,y))
 
   @functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
-  def contiguous_op(x) -> LazyBuffer: return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
+  def contiguous_op(x:LazyBuffer) -> LazyBuffer: return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
 
   @functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
-  def movement_op(x, op:MovementOps, arg) -> LazyBuffer:
-    if SHUFFLE_MOVEMENT_OPS and x.optype == BinaryOps:
+  def movement_op(x:LazyBuffer, op:MovementOps, arg) -> LazyBuffer:
+    if SHUFFLE_MOVEMENT_OPS and x.optype == BinaryOps and (SHUFFLE_SLICE_OPS or op != MovementOps.SLICE):
       # if this MovementOp is being applied to a BinaryOp, apply the MovementOp to all the BinaryOp inputs instead
       def replace_with_movement_op(y:Union[LazyOp, LazyBuffer]) -> LazyBuffer:
         if isinstance(y, LazyBuffer): return y.movement_op(op, arg)
@@ -108,10 +109,10 @@ class LazyBuffer:
 
     return ret
 
-  def reduce_op(x, op, new_shape:Tuple[int]):
+  def reduce_op(x:LazyBuffer, op:ReduceOps, new_shape:Tuple[int]):
     return LazyBuffer(new_shape, ReduceOps, LazyOp(op, (x,), new_shape))
 
-  def processing_op(x, op, w:LazyBuffer, C):
+  def processing_op(x:LazyBuffer, op:ProcessingOps, w:LazyBuffer, C:ConvArgs):
     return LazyBuffer(C.out_shape, ProcessingOps, LazyOp(op, (x.contiguous_op(), w.contiguous_op()), C))
 
 def ast_op(op: Op, srcs_code: List[str]) -> str:
diff --git a/test/test_ops.py b/test/test_ops.py
index 8c03370312..a0cb6def15 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -296,6 +296,13 @@ class TestOps(unittest.TestCase):
       lambda x,w: torch.nn.functional.conv2d(x,w,dilation=dilation).relu(),
       lambda x,w: Tensor.conv2d(x,w,dilation=dilation).relu(), atol=1e-4)
 
+  def test_maxpool2d_simple(self):
+    ksz = (2,2)
+    helper_test_op([(1,1,2,3)],
+      lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz),
+      # TODO: why is this tolerance so high?
+      lambda x: Tensor.max_pool2d(x, kernel_size=ksz), grad_atol=1e-4)
+
   def test_maxpool2d(self):
     for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1)]:
       with self.subTest(kernel_size=ksz):
diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py
index ea463478b7..7c4f1b0a29 100644
--- a/tinygrad/llops/ops_gpu.py
+++ b/tinygrad/llops/ops_gpu.py
@@ -7,6 +7,7 @@ from typing import List, Tuple, Optional
 from tinygrad.helpers import prod, ConvArgs
 from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, MovementOps, ProcessingOps
 from tinygrad.shapetracker import ShapeTracker, View, strides_for_shape
+from tinygrad.ops import DEBUG
 
 class CL:
   CACHE = None
@@ -29,11 +30,12 @@ class CL:
 
 @functools.lru_cache(maxsize=None)
 class CLProgram:
   def __init__(self, name, prg, options=tuple(), argdtypes=None):
-    self.name = name
-    self.built = cl.Program(CL().cl_ctx, prg).build(options=options)
-    self.clprg = self.built.__getattr__(name)
+    self.name, self.prg = name, prg
+    self.built = cl.Program(CL().cl_ctx, self.prg).build(options=options)
+    self.clprg = self.built.__getattr__(self.name)
     if argdtypes is not None: self.clprg.set_scalar_arg_dtypes(argdtypes)
   def __call__(self, *args):
+    if DEBUG >= 2: print(f"**** {self.name} {args[0]} {args[1]} ****\n{self.prg}")
     if CL.CACHE is not None: CL.CACHE.append((self, args))
     else: self.clprg(CL().cl_queue, *args)
@@ -130,27 +132,7 @@ class GPUBuffer:
       assert bufs[0][0] == "input" and bufs[1][0] == "weight"
       ewbufs = bufs[2:] # input and weight are consumed by the convs
       kernel_name = "conv"
-    else:
-      ints, params = '', []
-      options.append("-DNOCONV")
-      global_size = [prod(ret.shape), 1, 1]
-      ewbufs = bufs
-      kernel_name = "elementwise"
-
-    elementwise_prefix = '\n'.join([buf.contiguous_view(name) for name, buf in ewbufs])+ \
-      "inline float _ewop("+','.join(["int gid", "float acc"]+[f"__global const float *{name}_g" for name, _ in ewbufs])+") {"+ \
-      '\n'.join([f"float {name} = get_{name}({name}_g, gid);" for name, _ in ewbufs])+ \
-      f"return {code}; }}"
-
-    conv_params = ["__global float* restrict output"] + \
-      [f"__global const float *{name}_g" for name, _ in bufs] + \
-      [x[0] for x in params]
-    conv_prg = CLProgram(kernel_name, elementwise_prefix+f"__kernel void {kernel_name}("+','.join(conv_params)+""") {
-      float acc = 0.0;
-      int gid = get_global_id(0);
-      """+ints+"""
-
-      #ifndef NOCONV
+      conv_src = """
       int B = gid/(groups*rcout); // range 0-bs
       int g = (gid/rcout)%groups;
       int c = gid % rcout;
@@ -181,9 +163,26 @@ class GPUBuffer:
       #endif
       } } }
-      #endif
+      """
+    else:
+      ints, params = '', []
+      global_size = [prod(ret.shape), 1, 1]
+      ewbufs = bufs
+      kernel_name = "elementwise"
+      conv_src = ""
 
-      output[gid] = _ewop("""+','.join(["gid", "acc"]+[f"{name}_g" for name, _ in ewbufs])+""");
+    elementwise_prefix = '\n'.join([buf.contiguous_view(name) for name, buf in ewbufs])+ \
+      "\n\ninline float _ewop("+','.join(["int gid", "float acc"]+[f"__global const float *{name}_g" for name, _ in ewbufs])+") {\n"+ \
+      '\n'.join([f"float {name} = get_{name}({name}_g, gid);" for name, _ in ewbufs])+ \
+      f"\nreturn {code}; }}"
+
+    conv_params = ["__global float* restrict output"] + \
+      [f"__global const float *{name}_g" for name, _ in bufs] + \
+      [x[0] for x in params]
+    conv_prg = CLProgram(kernel_name, elementwise_prefix+f"\n\n__kernel void {kernel_name}("+','.join(conv_params)+""") {
+      float acc = 0.0;
+      int gid = get_global_id(0);
+      """+ints+conv_src+"""output[gid] = _ewop("""+','.join(["gid", "acc"]+[f"{name}_g" for name, _ in ewbufs])+""");
     }""", options=tuple(options),
       argdtypes=tuple([None]*(1+len(bufs)) + [np.int32]*len(params)))
     conv_prg(global_size, None, ret.cl, *[buf.cl for _, buf in bufs], *[x[1] for x in params])
     return ret
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index d68df775a6..da71e4cd2e 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -38,8 +38,8 @@ def log_op(optype, op, ret, inp):
 
     for x in inp:
       if not isinstance(op, list): op = [op]
-      #sop = '.'.join([str(y).split(".")[1] for y in op][::-1])
-      sop = str(len(op))
+      sop = '.'.join([str(y).split(".")[1] for y in op][::-1])
+      #sop = str(len(op))
       G.add_edge(nm(x), nm(ret), label=sop)
       if 'label' not in G.nodes[nm(x)]: G.nodes[nm(x)]['label'] = str(x.shape)
     if nm(ret) not in G.nodes: G.add_node(nm(ret))
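
Aside, not part of the patch: the NOTE on SHUFFLE_SLICE_OPS in ops_lazy.py can be illustrated with plain NumPy. A zero-padding SLICE only commutes with a binary op if the op maps the pad value back to the pad value; division does not, because padding the inputs first turns the padded region into 0/0 = NaN, while padding the result leaves it at 0. This is why the patch pushes movement ops through BinaryOps only when the op is not SLICE (or SHUFFLE_SLICE_OPS is set). The sketch below uses a hypothetical pad_right helper as a stand-in for a zero-padding SLICE; it is not a tinygrad API.

import numpy as np

def pad_right(x, n):
  # hypothetical stand-in for a zero-padding SLICE (illustration only)
  return np.concatenate([x, np.zeros(n, dtype=x.dtype)])

a = np.array([1., 2.], dtype=np.float32)
b = np.array([4., 8.], dtype=np.float32)

# slice applied after the binary op: padded region stays 0
slice_after = pad_right(a / b, 1)                    # [0.25, 0.25, 0.0]

# slice shuffled to the inputs: padded region becomes 0/0 = NaN
with np.errstate(invalid="ignore"):
  slice_before = pad_right(a, 1) / pad_right(b, 1)   # [0.25, 0.25, nan]

print(slice_after)
print(slice_before)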