diff --git a/README.md b/README.md
index 531602c030..78962e4348 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ from tinygrad.tensor import Tensor
 
 ### ANE Support?!?!
 
-If all you want to do is ReLU, you are in luck! You can do very fast ReLU (fastness not confirmed)
+If all you want to do is ReLU, you are in luck! You can do very fast ReLU (at least 30 MEGAReLUs/sec confirmed).
 
 Requires your Python to be signed with `ane/lib/sign_python.sh` to add the `com.apple.ane.iokit-user-access` entitlement.
 
@@ -101,6 +101,8 @@ b = a.relu()
 print(b.cpu())
 ```
 
+Warning: do not rely on the ANE port. It segfaults sometimes, so if you were doing something important with tinygrad and wanted to use the ANE, you might have a bad time.
+
 ### ImageNet inference
 
 Despite being tiny, tinygrad supports the full EfficientNet. Pass in a picture to discover what it is.
diff --git a/ane/2_compile/hwx_parse.py b/ane/2_compile/hwx_parse.py
index 60ea3ea55f..4961ade896 100755
--- a/ane/2_compile/hwx_parse.py
+++ b/ane/2_compile/hwx_parse.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
-
+import sys
 from hexdump import hexdump
-
 from macholib import MachO
 
 def get_macho(fn):
   # mod to make the header okay
@@ -85,7 +84,7 @@ def compare(x, y):
       ln2.append(a[1])
   return ''.join(ss)
 
-g = get_macho("model.hwx.golden")
+g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1])
 f1 = g.headers[0].commands[1][2][0].section_data
 f2 = a.headers[0].commands[1][2][0].section_data
 for i in range(0, len(f2), 0x300):
diff --git a/ane/2_compile/simple/neuron.plist b/ane/2_compile/simple/neuron.plist
index a24a8a1dcf..dd57aaffa9 100644
--- a/ane/2_compile/simple/neuron.plist
+++ b/ane/2_compile/simple/neuron.plist
@@ -35,12 +35,8 @@
 			<key>InputHeight</key>
 			<integer>1</integer>
 			<key>InputWidth</key>
-			<integer>16</integer>
+			<integer>678</integer>
-			<key>InputPlaneStride</key>
-			<integer>64</integer>
-			<key>InputRowStride</key>
-			<integer>64</integer>
 			<key>InputType</key>
 			<string>Float16</string>
@@ -66,10 +62,6 @@
 			<key>Bottom</key>
 			<string>my_layer</string>
-			<key>OutputPlaneStride</key>
-			<integer>64</integer>
-			<key>OutputRowStride</key>
-			<integer>64</integer>
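Note on the plist change: the removed `InputRowStride`/`InputPlaneStride` entries were hard-coded for the old 16-wide input (16 float16s = 32 bytes, padded to 64), so dropping them presumably lets the compiler derive strides for the new 678-wide input. Here is a minimal Python sketch of that 64-byte alignment rule, mirroring `stride_for_width` from `ane/lib/ane.mm` below; the rule is inferred from this code, not from any Apple documentation:

```python
# Sketch of the presumed ANE row-stride rule: float16 rows padded to 64 bytes.
def stride_for_width(width):
  stride = width * 2                  # two bytes per float16 element
  stride += (64 - stride % 64) % 64   # round up to the next multiple of 64
  return stride

assert stride_for_width(16) == 64     # the old hard-coded plist stride
assert stride_for_width(678) == 1408  # 678*2 = 1356 bytes, padded to 1408
```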
%d\n", ret); return 0; } int stride_for_width(int width) { int ret = width*2; - ret += (64-ret) % 64; + ret += (64-(ret % 64))%64; return ret; } @@ -73,7 +76,7 @@ void *ANE_TensorCreate(int width, int height) { void* ANE_TensorData(void *out_surf) { void *ret = (void *)IOSurfaceGetBaseAddress((IOSurfaceRef)out_surf); //IOSurfaceUnlock((IOSurfaceRef)out_surf, 0, nil); - printf("TensorData %p -> %p\n", out_surf, ret); + DEBUG("TensorData %p -> %p\n", out_surf, ret); return ret; } @@ -81,7 +84,7 @@ uint64_t ANE_Compile(char *iprog, int sz) { int ret; int cksum = 0; for (int i = 0; i < sz; i++) cksum += iprog[i]; - printf("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz); + DEBUG("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz); char *prog = (char*)aligned_alloc(0x1000, sz); memcpy(prog, iprog, sz); @@ -95,7 +98,7 @@ uint64_t ANE_Compile(char *iprog, int sz) { ret = dev->ANE_ProgramCreate(&mprog, out); uint64_t program_handle = out->program_handle; delete out; - printf("program create: %lx %lx\n", ret, program_handle); + DEBUG("program create: %lx %lx\n", ret, program_handle); // early failure if (ret != 0) return 0; @@ -103,14 +106,14 @@ uint64_t ANE_Compile(char *iprog, int sz) { pas.program_handle = program_handle; pas.flags = 0x0000000100010001; ret = dev->ANE_ProgramPrepare(&pas); - printf("program prepare: %lx\n", ret); + DEBUG("program prepare: %lx\n", ret); return program_handle; } int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) { int ret; - printf("ANE_Run %p %p\n", in_surf, out_surf); + DEBUG("ANE_Run %p %p\n", in_surf, out_surf); H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct; memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct)); @@ -132,11 +135,11 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) { mach_port_t recvPort = 0; IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort); - printf("recv port: 0x%x\n", recvPort); + DEBUG("recv port: 0x%x\n", recvPort); // run program ret = dev->ANE_ProgramSendRequest(pras, recvPort); - printf("send 0x%x\n", ret); + DEBUG("send 0x%x\n", ret); struct { mach_msg_header_t header; @@ -149,7 +152,7 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) { recvPort, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL); - printf("got message: %d sz %d\n", ret, message.header.msgh_size); + DEBUG("got message: %d sz %d\n", ret, message.header.msgh_size); delete pras; return 0; diff --git a/ane/lib/ane.py b/ane/lib/ane.py index 4676af2c53..def815fabb 100755 --- a/ane/lib/ane.py +++ b/ane/lib/ane.py @@ -25,6 +25,7 @@ class ANETensor: self.shape = shape self.dtype = np.float16 self.sz = int(np.prod(shape)) + assert(self.sz <= 0x4000) self.tt = libane.ANE_TensorCreate(self.sz, 1) assert(self.tt is not None) diff --git a/ane/ops/relu.hwx b/ane/ops/relu.hwx index d66a665b39..dc54cc540e 100644 Binary files a/ane/ops/relu.hwx and b/ane/ops/relu.hwx differ diff --git a/examples/benchmark.py b/examples/benchmark.py new file mode 100644 index 0000000000..2788915055 --- /dev/null +++ b/examples/benchmark.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import numpy as np +from tinygrad.tensor import Tensor +import time + +# Tensor has max size of 0x4000 for now +ba = Tensor(np.random.normal(size=(0x4000,))) +for dev in ["CPU", "GPU", "ANE"]: + if dev == "GPU": + baa = ba.cuda() + elif dev == "ANE": + baa = ba.ane() + else: + baa = ba + for i in range(3): + st = time.time() + boaa = baa.relu() + et = time.time() + if i == 2: + print("%s can do at least %.2f 
MEGAReLUs/sec" % (dev, (np.prod(boaa.shape)/1e6)/(et-st))) + # decently reliable + assert(np.all(boaa.cpu().data >= 0)) + + diff --git a/examples/use_ane.py b/examples/use_ane.py index c80d20fd1b..9c4bb5bd2c 100755 --- a/examples/use_ane.py +++ b/examples/use_ane.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 import numpy as np from tinygrad.tensor import Tensor +import time a = Tensor([-2,-1,0,1,2]).ane() print(a.cpu()) b = a.relu() print(b.cpu()) +assert(np.all(b.cpu().data >= 0)) diff --git a/tinygrad/ops_ane.py b/tinygrad/ops_ane.py index b5d25f2bae..ba1c9a2d66 100644 --- a/tinygrad/ops_ane.py +++ b/tinygrad/ops_ane.py @@ -1,16 +1,37 @@ from .tensor import Tensor, Function, register from functools import lru_cache +import struct @lru_cache def compile_wrapper(ane, dat): return ane.compile(dat) +def roundup(x, v): + return x + (v-x)%v + +def fill(dat, addrs, type, val, base=0x4000): + x = struct.pack(type, val) + for a in addrs: + dat[base+a:base+a+len(x)] = x + return dat + +@lru_cache +def compile_relu(ane, sz): + dat = list(open("ane/ops/relu.hwx", "rb").read()) + # TODO: make this all nice and once + # number of relus + dat = fill(dat, [0x128, 0x13C], "H", sz) + # number of engines? (max 0x100) + dat = fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8], "I", max(0x100, roundup(sz*2, 0x10))) + # strides? + dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz*2, 0x40)) + return compile_wrapper(ane, bytes(dat)) + class ReLU(Function): @staticmethod def forward(ctx, input): ret = ctx.ane.tensor(input.shape) - comp = compile_wrapper(ctx.ane, open("ane/ops/relu.hwx", "rb").read()) - ctx.ane.run(comp, input, ret) + ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret) return ret register('relu', ReLU, device=Tensor.ANE)