diff --git a/accel/lazy/ops_lazy.py b/accel/lazy/ops_lazy.py
index 145023c2b4..78bc3835b9 100644
--- a/accel/lazy/ops_lazy.py
+++ b/accel/lazy/ops_lazy.py
@@ -3,7 +3,7 @@ import os
 from typing import Union, NamedTuple, List, Any, Tuple, Dict
 from tinygrad.shapetracker import ShapeTracker
 import functools, operator
-from tinygrad.helpers import prod
+from tinygrad.helpers import prod, ConvArgs
 import sys
 sys.setrecursionlimit(10000)
 
@@ -13,7 +13,8 @@ LoadOps = Enum("LoadOps", ["FROMCPU", "CONTIGUOUS"])
 Op = Union[BinaryOps, ReduceOps, MovementOps, ProcessingOps, LoadOps]
 
 MERGE_MOVEMENT_OPS = True
-SHUFFLE_MOVEMENT_OPS = True # this breaks maxpool
+SHUFFLE_MOVEMENT_OPS = True
+SHUFFLE_SLICE_OPS = False # NOTE: 0/0 is NaN if you slice, so this can change the output
 REMOVE_MOVEMENT_NOPS = True
 MERGE_ELEMENTWISE_OPS = True
 MERGE_ELEMENTWISE_INTO_CONV_OUTPUT = True
@@ -86,11 +87,11 @@ class LazyBuffer:
   def binary_op(x, op, y:LazyBuffer): return elementwise_op(op, (x,y))
 
   @functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
-  def contiguous_op(x) -> LazyBuffer: return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
+  def contiguous_op(x:LazyBuffer) -> LazyBuffer: return x if x.st.contiguous else LazyBuffer(x.shape, LoadOps, LazyOp(LoadOps.CONTIGUOUS, (x,)))
 
   @functools.lru_cache(maxsize=None if CACHE_LAZYBUFFERS else 0)
-  def movement_op(x, op:MovementOps, arg) -> LazyBuffer:
-    if SHUFFLE_MOVEMENT_OPS and x.optype == BinaryOps:
+  def movement_op(x:LazyBuffer, op:MovementOps, arg) -> LazyBuffer:
+    if SHUFFLE_MOVEMENT_OPS and x.optype == BinaryOps and (SHUFFLE_SLICE_OPS or op != MovementOps.SLICE):
       # if this MovementOp is being applied to a BinaryOp, apply the MovementOp to all the BinaryOp inputs instead
       def replace_with_movement_op(y:Union[LazyOp, LazyBuffer]) -> LazyBuffer:
         if isinstance(y, LazyBuffer): return y.movement_op(op, arg)
@@ -108,10 +109,10 @@ class LazyBuffer:
 
     return ret
 
-  def reduce_op(x, op, new_shape:Tuple[int]):
+  def reduce_op(x:LazyBuffer, op:ReduceOps, new_shape:Tuple[int]):
     return LazyBuffer(new_shape, ReduceOps, LazyOp(op, (x,), new_shape))
 
-  def processing_op(x, op, w:LazyBuffer, C):
+  def processing_op(x:LazyBuffer, op:ProcessingOps, w:LazyBuffer, C:ConvArgs):
     return LazyBuffer(C.out_shape, ProcessingOps, LazyOp(op, (x.contiguous_op(), w.contiguous_op()), C))
 
 def ast_op(op: Op, srcs_code: List[str]) -> str:
diff --git a/test/test_ops.py b/test/test_ops.py
index 8c03370312..a0cb6def15 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -296,6 +296,13 @@ class TestOps(unittest.TestCase):
       lambda x,w: torch.nn.functional.conv2d(x,w,dilation=dilation).relu(),
       lambda x,w: Tensor.conv2d(x,w,dilation=dilation).relu(), atol=1e-4)
 
+  def test_maxpool2d_simple(self):
+    ksz = (2,2)
+    helper_test_op([(1,1,2,3)],
+      lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz),
+      # TODO: why is this tolerance so high?
+      lambda x: Tensor.max_pool2d(x, kernel_size=ksz), grad_atol=1e-4)
+
   def test_maxpool2d(self):
     for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1)]:
       with self.subTest(kernel_size=ksz):
diff --git a/tinygrad/llops/ops_gpu.py b/tinygrad/llops/ops_gpu.py
index ea463478b7..7c4f1b0a29 100644
--- a/tinygrad/llops/ops_gpu.py
+++ b/tinygrad/llops/ops_gpu.py
@@ -7,6 +7,7 @@ from typing import List, Tuple, Optional
 from tinygrad.helpers import prod, ConvArgs
 from tinygrad.ops import UnaryOps, BinaryOps, ReduceOps, MovementOps, ProcessingOps
 from tinygrad.shapetracker import ShapeTracker, View, strides_for_shape
+from tinygrad.ops import DEBUG
 
 class CL:
   CACHE = None
@@ -29,11 +30,12 @@ class CL:
 
 @functools.lru_cache(maxsize=None)
 class CLProgram:
   def __init__(self, name, prg, options=tuple(), argdtypes=None):
-    self.name = name
-    self.built = cl.Program(CL().cl_ctx, prg).build(options=options)
-    self.clprg = self.built.__getattr__(name)
+    self.name, self.prg = name, prg
+    self.built = cl.Program(CL().cl_ctx, self.prg).build(options=options)
+    self.clprg = self.built.__getattr__(self.name)
     if argdtypes is not None: self.clprg.set_scalar_arg_dtypes(argdtypes)
   def __call__(self, *args):
+    if DEBUG >= 2: print(f"**** {self.name} {args[0]} {args[1]} ****\n{self.prg}")
     if CL.CACHE is not None: CL.CACHE.append((self, args))
     else: self.clprg(CL().cl_queue, *args)
@@ -130,27 +132,7 @@ class GPUBuffer:
       assert bufs[0][0] == "input" and bufs[1][0] == "weight"
       ewbufs = bufs[2:] # input and weight are consumed by the convs
       kernel_name = "conv"
-    else:
-      ints, params = '', []
-      options.append("-DNOCONV")
-      global_size = [prod(ret.shape), 1, 1]
-      ewbufs = bufs
-      kernel_name = "elementwise"
-
-    elementwise_prefix = '\n'.join([buf.contiguous_view(name) for name, buf in ewbufs])+ \
-      "inline float _ewop("+','.join(["int gid", "float acc"]+[f"__global const float *{name}_g" for name, _ in ewbufs])+") {"+ \
-      '\n'.join([f"float {name} = get_{name}({name}_g, gid);" for name, _ in ewbufs])+ \
-      f"return {code}; }}"
-
-    conv_params = ["__global float* restrict output"] + \
-      [f"__global const float *{name}_g" for name, _ in bufs] + \
-      [x[0] for x in params]
-    conv_prg = CLProgram(kernel_name, elementwise_prefix+f"__kernel void {kernel_name}("+','.join(conv_params)+""") {
-      float acc = 0.0;
-      int gid = get_global_id(0);
-      """+ints+"""
-
-      #ifndef NOCONV
+      conv_src = """
       int B = gid/(groups*rcout); // range 0-bs
       int g = (gid/rcout)%groups;
       int c = gid % rcout;
@@ -181,9 +163,26 @@ class GPUBuffer:
       #endif
       } } }
-      #endif
+      """
+    else:
+      ints, params = '', []
+      global_size = [prod(ret.shape), 1, 1]
+      ewbufs = bufs
+      kernel_name = "elementwise"
+      conv_src = ""
 
-      output[gid] = _ewop("""+','.join(["gid", "acc"]+[f"{name}_g" for name, _ in ewbufs])+""");
+    elementwise_prefix = '\n'.join([buf.contiguous_view(name) for name, buf in ewbufs])+ \
+      "\n\ninline float _ewop("+','.join(["int gid", "float acc"]+[f"__global const float *{name}_g" for name, _ in ewbufs])+") {\n"+ \
+      '\n'.join([f"float {name} = get_{name}({name}_g, gid);" for name, _ in ewbufs])+ \
+      f"\nreturn {code}; }}"
+
+    conv_params = ["__global float* restrict output"] + \
+      [f"__global const float *{name}_g" for name, _ in bufs] + \
+      [x[0] for x in params]
+    conv_prg = CLProgram(kernel_name, elementwise_prefix+f"\n\n__kernel void {kernel_name}("+','.join(conv_params)+""") {
+      float acc = 0.0;
+      int gid = get_global_id(0);
+      """+ints+conv_src+"""output[gid] = _ewop("""+','.join(["gid", "acc"]+[f"{name}_g" for name, _ in ewbufs])+""");
     }""", options=tuple(options),
       argdtypes=tuple([None]*(1+len(bufs)) + [np.int32]*len(params)))
     conv_prg(global_size, None, ret.cl, *[buf.cl for _, buf in bufs], *[x[1] for x in params])
     return ret
diff --git a/tinygrad/ops.py b/tinygrad/ops.py
index d68df775a6..da71e4cd2e 100644
--- a/tinygrad/ops.py
+++ b/tinygrad/ops.py
@@ -38,8 +38,8 @@ def log_op(optype, op, ret, inp):
 
     for x in inp:
       if not isinstance(op, list): op = [op]
-      #sop = '.'.join([str(y).split(".")[1] for y in op][::-1])
-      sop = str(len(op))
+      sop = '.'.join([str(y).split(".")[1] for y in op][::-1])
+      #sop = str(len(op))
       G.add_edge(nm(x), nm(ret), label=sop)
       if 'label' not in G.nodes[nm(x)]: G.nodes[nm(x)]['label'] = str(x.shape)
     if nm(ret) not in G.nodes: G.add_node(nm(ret))
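
Aside, not part of the patch: the NOTE on SHUFFLE_SLICE_OPS in ops_lazy.py can be illustrated with plain NumPy. A zero-padding SLICE only commutes with a binary op if the op maps the pad value back to the pad value; division does not, because padding the inputs first turns the padded region into 0/0 = NaN, while padding the result leaves it at 0. This is why the patch pushes movement ops through BinaryOps only when the op is not SLICE (or SHUFFLE_SLICE_OPS is set). The sketch below uses a hypothetical pad_right helper as a stand-in for a zero-padding SLICE; it is not a tinygrad API.

import numpy as np

def pad_right(x, n):
  # hypothetical stand-in for a zero-padding SLICE (illustration only)
  return np.concatenate([x, np.zeros(n, dtype=x.dtype)])

a = np.array([1., 2.], dtype=np.float32)
b = np.array([4., 8.], dtype=np.float32)

# slice applied after the binary op: padded region stays 0
slice_after = pad_right(a / b, 1)                    # [0.25, 0.25, 0.0]

# slice shuffled to the inputs: padded region becomes 0/0 = NaN
with np.errstate(invalid="ignore"):
  slice_before = pad_right(a, 1) / pad_right(b, 1)   # [0.25, 0.25, nan]

print(slice_after)
print(slice_before)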