diff --git a/test/test_ops.py b/test/test_ops.py
index 8b2df0ef94..cf17732a2a 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -37,30 +37,22 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=1e-7, grad_atol=1e-7, gpu
   print("testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms" % (shps, torch_fp, tinygrad_fp, torch_fbp-torch_fp, tinygrad_fbp-tinygrad_fp))
 
 class TestOps(unittest.TestCase):
+  gpu = False
   def test_add(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add)
-  @unittest.skipUnless(GPU, "Requires GPU")
-  def test_add_gpu(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add, gpu=True)
+    helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add, gpu=self.gpu)
   def test_sub(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x-y, Tensor.sub)
+    helper_test_op([(45,65), (45,65)], lambda x,y: x-y, Tensor.sub, gpu=self.gpu)
   def test_mul(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x*y, Tensor.mul)
-  @unittest.skipUnless(GPU, "Requires GPU")
-  def test_mul_gpu(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x*y, Tensor.mul, gpu=True)
+    helper_test_op([(45,65), (45,65)], lambda x,y: x*y, Tensor.mul, gpu=self.gpu)
   def test_div(self):
     # TODO: why does this need more tolerance?
-    helper_test_op([(45,65), (45,65)], lambda x,y: x/y, Tensor.div, atol=1e-3, grad_atol=1e-3)
+    helper_test_op([(45,65), (45,65)], lambda x,y: x/y, Tensor.div, atol=1e-3, grad_atol=1e-3, gpu=self.gpu)
   def test_pow(self):
-    helper_test_op([(45,65), (45,65)], lambda x,y: x**y, Tensor.pow)
+    helper_test_op([(45,65), (45,65)], lambda x,y: x**y, Tensor.pow, gpu=self.gpu)
   def test_sqrt(self):
-    helper_test_op([(45,65)], lambda x: x.sqrt(), Tensor.sqrt)
+    helper_test_op([(45,65)], lambda x: x.sqrt(), Tensor.sqrt, gpu=self.gpu)
   def test_dot(self):
-    helper_test_op([(45,65), (65,100)], lambda x,y: x.matmul(y), Tensor.dot, atol=1e-5)
-  @unittest.skipUnless(GPU, "Requires GPU")
-  def test_dot_gpu(self):
-    helper_test_op([(3,4), (4,5)], lambda x,y: x.matmul(y), Tensor.dot, atol=1e-5, gpu=True)
+    helper_test_op([(45,65), (65,100)], lambda x,y: x.matmul(y), Tensor.dot, atol=1e-5, gpu=self.gpu)
 
   def test_conv2d(self):
     for bs in [1,8]:
@@ -70,7 +62,7 @@ class TestOps(unittest.TestCase):
             for W in [2,3,5]:
               helper_test_op([(bs,cin,11,28), (6,cin//groups,H,W)],
                 lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(),
-                lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), atol=2e-5, grad_atol=2e-6)
+                lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu)
 
   def test_strided_conv2d(self):
     bs = 4
@@ -78,22 +70,26 @@ class TestOps(unittest.TestCase):
     H,W = 3,3
     helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
       lambda x,w: torch.nn.functional.conv2d(x,w,stride=2).relu(),
-      lambda x,w: Tensor.conv2d(x,w,stride=2).relu(), atol=2e-5, grad_atol=2e-6)
+      lambda x,w: Tensor.conv2d(x,w,stride=2).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu)
     helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
       lambda x,w: torch.nn.functional.conv2d(x,w,stride=(2,1)).relu(),
-      lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), atol=2e-5, grad_atol=2e-6)
+      lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), atol=2e-5, grad_atol=2e-6, gpu=self.gpu)
 
   def test_maxpool2x2(self):
-    helper_test_op([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, (2,2)), Tensor.max_pool2d)
+    helper_test_op([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, (2,2)), Tensor.max_pool2d, gpu=self.gpu)
 
   def test_maxpool_sizes(self):
     for sz in [(2,2), (3,3), (3,2), (5,5), (5,1)]:
       helper_test_op([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, kernel_size=sz),
-        lambda x: Tensor.max_pool2d(x, kernel_size=sz))
+        lambda x: Tensor.max_pool2d(x, kernel_size=sz), gpu=self.gpu)
 
   def test_avgpool2x2(self):
-    helper_test_op([(32,2,111,28)], lambda x: torch.nn.functional.avg_pool2d(x, (2,2)), Tensor.avg_pool2d)
+    helper_test_op([(32,2,111,28)], lambda x: torch.nn.functional.avg_pool2d(x, (2,2)), Tensor.avg_pool2d, gpu=self.gpu)
+
+if GPU:
+  class TestOpsGPU(TestOps):
+    gpu = True
 
 
 if __name__ == '__main__':
   unittest.main(verbosity=2)
diff --git a/tinygrad/opsgpu.py b/tinygrad/opsgpu.py
index 16b9ba3c21..7d26ad5a50 100644
--- a/tinygrad/opsgpu.py
+++ b/tinygrad/opsgpu.py
@@ -16,7 +16,7 @@ def buffer_like(ctx, x):
 def clbuild(cl_ctx, prg):
   return cl.Program(cl_ctx, prg).build()
 
-def in_place_op(ctx, code, x, y):
+def binary_op(ctx, code, x, y):
   ret = buffer_like(ctx, x)
   prg = clbuild(ctx.cl_ctx, """
   __kernel void add(
@@ -29,10 +29,23 @@ def in_place_op(ctx, code, x, y):
   prg.add(ctx.cl_queue, [np.prod(ret.shape)], None, x, y, ret)
   return ret
 
+def unary_op(ctx, code, x):
+  ret = buffer_like(ctx, x)
+  prg = clbuild(ctx.cl_ctx, """
+  __kernel void relu(
+    __global const float *a_g, __global float *res_g)
+  {
+    int gid = get_global_id(0);
+    """+code+"""
+  }
+  """)
+  prg.relu(ctx.cl_queue, [np.prod(ret.shape)], None, x, ret)
+  return ret
+
 class Add(Function):
   @staticmethod
   def forward(ctx, x, y):
-    return in_place_op(ctx, 'res_g[gid] = a_g[gid] + b_g[gid];', x, y)
+    return binary_op(ctx, 'res_g[gid] = a_g[gid] + b_g[gid];', x, y)
 
   @staticmethod
   def backward(ctx, grad_output):
@@ -42,7 +55,7 @@ register('add', Add, gpu=True)
 class Sub(Function):
   @staticmethod
   def forward(ctx, x, y):
-    return in_place_op(ctx, 'res_g[gid] = a_g[gid] - b_g[gid];', x, y)
+    return binary_op(ctx, 'res_g[gid] = a_g[gid] - b_g[gid];', x, y)
 
   @staticmethod
   def backward(ctx, grad_output):
@@ -57,9 +70,9 @@ class Mul(Function):
   @staticmethod
   def forward(ctx, x, y):
     # HACK
     if y.shape == (1,):
-      return in_place_op(ctx, 'res_g[gid] = a_g[gid] * b_g[0];', x, y)
+      return binary_op(ctx, 'res_g[gid] = a_g[gid] * b_g[0];', x, y)
     elif x.shape == y.shape:
-      return in_place_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', x, y)
+      return binary_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', x, y)
     else:
       raise Exception("mismatched shapes %r %r" % (x.shape, y.shape))
@@ -68,8 +81,8 @@ class Mul(Function):
   @staticmethod
   def backward(ctx, grad_output):
     x,y = ctx.saved_tensors
-    return in_place_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', y, grad_output),\
-           in_place_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', x, grad_output)
+    return binary_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', y, grad_output),\
+           binary_op(ctx, 'res_g[gid] = a_g[gid] * b_g[gid];', x, grad_output)
 register('mul', Mul, gpu=True)
 
 class Sum(Function):
@@ -168,17 +181,7 @@ register('matmul', Dot, gpu=True)
 class ReLU(Function):
   @staticmethod
   def forward(ctx, x):
-    ret = buffer_like(ctx, x)
-    prg = clbuild(ctx.cl_ctx, """
-    __kernel void relu(
-      __global const float *a_g, __global float *res_g)
-    {
-      int gid = get_global_id(0);
-      res_g[gid] = max(a_g[gid], (float)0.);
-    }
-    """)
-    prg.relu(ctx.cl_queue, [np.prod(ret.shape)], None, x, ret)
-    return ret
+    return unary_op(ctx, 'res_g[gid] = max(a_g[gid], (float)0.);', x)
 
   @staticmethod
   def backward(ctx, grad_output):
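
The test_ops.py half of this diff replaces the hand-written *_gpu test duplicates with a single gpu class attribute plus a TestOpsGPU subclass that only exists when a GPU is present, so unittest discovery re-runs every inherited test with gpu=True. A minimal standalone sketch of that pattern, using only the standard library; HAS_GPU and run_op are illustrative stand-ins for the diff's GPU flag and helper_test_op:

# Sketch of the subclass-parametrization pattern from the diff above.
# HAS_GPU and run_op are hypothetical stand-ins, not tinygrad names.
import unittest

HAS_GPU = False  # set True where an accelerator is actually available

def run_op(a, b, gpu=False):
  # stand-in for helper_test_op: both backends should produce the same result
  return a + b

class TestOps(unittest.TestCase):
  gpu = False  # class attribute consulted by every test via self.gpu
  def test_add(self):
    self.assertEqual(run_op(2, 3, gpu=self.gpu), 5)

# Defining the subclass only when a GPU exists means CPU-only machines never
# even discover the GPU variants -- no per-test skip decorators needed.
if HAS_GPU:
  class TestOpsGPU(TestOps):
    gpu = True

if __name__ == '__main__':
  unittest.main(verbosity=2)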
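The opsgpu.py half generalizes one OpenCL kernel skeleton into binary_op/unary_op by splicing a one-line expression string into a fixed program. A hedged, self-contained pyopencl sketch of that templating idea (not tinygrad code; buffer handling is simplified and all names here are illustrative):

# Kernel templating: a fixed skeleton plus a spliced-in expression string.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

def unary_op(code, x):
  # x: float32 numpy array; `code` writes res_g[gid] in terms of a_g[gid]
  a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
  res_g = cl.Buffer(ctx, mf.WRITE_ONLY, x.nbytes)
  prg = cl.Program(ctx, """
  __kernel void unop(__global const float *a_g, __global float *res_g) {
    int gid = get_global_id(0);
    """+code+"""
  }
  """).build()
  prg.unop(queue, (x.size,), None, a_g, res_g)
  ret = np.empty_like(x)
  cl.enqueue_copy(queue, ret, res_g)
  return ret

x = np.array([-1., 2., -3.], dtype=np.float32)
print(unary_op('res_g[gid] = max(a_g[gid], (float)0.);', x))  # ReLU: [0. 2. 0.]

Since ReLU clamps from below, the expression passed for it must be max(a_g[gid], (float)0.); min would instead zero out the positive half.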