diff --git a/test/test_conv_speed.py b/test/test_conv_speed.py
index d4c8feccc9..754150a4a7 100644
--- a/test/test_conv_speed.py
+++ b/test/test_conv_speed.py
@@ -75,7 +75,7 @@ class TestConvSpeed(unittest.TestCase):
     for i in range(1+cnt):
       et0 = time.time()
       x = Tensor.randn(128, 1, 28, 28)
-      x = x.conv2d(c1).relu().max_pool2d()
+      x = x.conv2d(c1).relu().avg_pool2d()
       x = x.conv2d(c2).relu().max_pool2d()
       x = x.reshape(Tensor(np.array((x.shape[0], -1))))
       out = x.dot(l1).logsoftmax()
diff --git a/test/test_ops.py b/test/test_ops.py
index c9ff8675f6..4a20557277 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1,14 +1,17 @@
 import torch
 import numpy as np
 import unittest
+import timeit
+import functools
 from tinygrad.tensor import Tensor
 
-def test_op(shps, f1, f2, atol=1e-7, grad_atol=1e-7):
+def test_op(shps, torch_fxn, tinygrad_fxn, atol=1e-7, grad_atol=1e-7):
   ts = [torch.rand(x, requires_grad=True) for x in shps]
   tst = [Tensor(x.detach().numpy()) for x in ts]
 
-  out = f1(*ts)
-  ret = f2(*tst)
+  out = torch_fxn(*ts)
+  ret = tinygrad_fxn(*tst)
+
   # TODO: why so inaccurate?
   np.testing.assert_allclose(ret.data, out.detach().numpy(), atol=atol)
 
@@ -18,18 +21,31 @@ def test_op(shps, f1, f2, atol=1e-7, grad_atol=1e-7):
   for t, tt in zip(ts, tst):
     np.testing.assert_allclose(t.grad, tt.grad, atol=grad_atol)
 
+  # speed: mean time per call in ms (timeit returns total seconds for 5 runs, hence * 1000/5)
+  torch_fp = timeit.Timer(functools.partial(torch_fxn, *ts)).timeit(5) * 1000/5  # forward pass only
+  tinygrad_fp = timeit.Timer(functools.partial(tinygrad_fxn, *tst)).timeit(5) * 1000/5  # forward pass only
+
+  torch_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), torch_fxn, ts)).timeit(5) * 1000/5  # forward + mean + backward
+  tinygrad_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), tinygrad_fxn, tst)).timeit(5) * 1000/5  # forward + mean + backward
+
+  print("testing %30r   torch/tinygrad fp: %.2f / %.2f ms  bp: %.2f / %.2f ms" % (shps, torch_fp, tinygrad_fp, torch_fbp-torch_fp, tinygrad_fbp-tinygrad_fp))  # bp is an estimate: fbp minus the separately timed fp
+
 class TestOps(unittest.TestCase):
   def test_conv2d(self):
-    for cin in [1,2,3]:
-      for H in [2,3,5]:
-        for W in [2,3,5]:
-          test_op([(5,cin,10,7), (4,cin,H,W)], torch.nn.functional.conv2d, Tensor.conv2d, atol=1e-5)
+    for bs in [1,128]:  # batch size (first dim of x)
+      for cin in [1,3]:  # input channels, shared by x and w
+        for H in [2,5]:  # kernel height
+          for W in [2,3,5]:  # kernel width
+            test_op([(bs,cin,11,28), (4,cin,H,W)],  # shapes of x (input) and w (weight, 4 output channels)
+              lambda x,w: torch.nn.functional.conv2d(x,w).relu(),
+              lambda x,w: Tensor.conv2d(x,w).relu(), atol=2e-5, grad_atol=2e-6)
 
   def test_maxpool2x2(self):
-    test_op([(5,2,11,8)], lambda x: torch.nn.functional.max_pool2d(x, (2,2)), Tensor.max_pool2d)
+    test_op([(32,2,110,28)], lambda x: torch.nn.functional.max_pool2d(x, (2,2)), Tensor.max_pool2d)  # torch gets an explicit (2,2) kernel; Tensor.max_pool2d is called with its defaults
 
   def test_avgpool2x2(self):
-    test_op([(5,2,11,8)], lambda x: torch.nn.functional.avg_pool2d(x, (2,2)), Tensor.avg_pool2d)
+    test_op([(32,2,111,28)], lambda x: torch.nn.functional.avg_pool2d(x, (2,2)), Tensor.avg_pool2d)  # input height 111 is odd; torch gets an explicit (2,2) kernel, Tensor.avg_pool2d its defaults
 
 if __name__ == '__main__':
-  unittest.main()
+  unittest.main(verbosity=2)  # verbosity=2 selects unittest's verbose per-test output
+