diff --git a/README.md b/README.md
index 531602c030..78962e4348 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ from tinygrad.tensor import Tensor
### ANE Support?!?!
-If all you want to do is ReLU, you are in luck! You can do very fast ReLU (fastness not confirmed)
+If all you want to do is ReLU, you are in luck! You can do very fast ReLU (at least 30 MEGAReLUs/sec confirmed)
Requires your Python to be signed with `ane/lib/sign_python.sh` to add the `com.apple.ane.iokit-user-access` entitlement.
@@ -101,6 +101,8 @@ b = a.relu()
print(b.cpu())
```
+Warning: do not rely on the ANE port. It segfaults sometimes. So if you were doing something important with tinygrad and wanted to use the ANE, you might have a bad time.
+
### ImageNet inference
Despite being tiny, tinygrad supports the full EfficientNet. Pass in a picture to discover what it is.
diff --git a/ane/2_compile/hwx_parse.py b/ane/2_compile/hwx_parse.py
index 60ea3ea55f..4961ade896 100755
--- a/ane/2_compile/hwx_parse.py
+++ b/ane/2_compile/hwx_parse.py
@@ -1,7 +1,6 @@
#!/usr/bin/env python3
-
+import sys
from hexdump import hexdump
-
from macholib import MachO
def get_macho(fn):
# mod to make the header okay
@@ -85,7 +84,8 @@ def compare(x, y):
ln2.append(a[1])
return ''.join(ss)
-g = get_macho("model.hwx.golden")
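+# default to model.hwx.golden, or take the golden file to diff against as argv[1]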
+g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1])
f1 = g.headers[0].commands[1][2][0].section_data
f2 = a.headers[0].commands[1][2][0].section_data
for i in range(0, len(f2), 0x300):
diff --git a/ane/2_compile/simple/neuron.plist b/ane/2_compile/simple/neuron.plist
index a24a8a1dcf..dd57aaffa9 100644
--- a/ane/2_compile/simple/neuron.plist
+++ b/ane/2_compile/simple/neuron.plist
@@ -35,12 +35,8 @@
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
-<integer>16</integer>
+<integer>678</integer>
-<key>InputPlaneStride</key>
-<integer>64</integer>
-<key>InputRowStride</key>
-<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
@@ -66,10 +62,6 @@
<key>Bottom</key>
<string>my_layer</string>
-<key>OutputPlaneStride</key>
-<integer>64</integer>
-<key>OutputRowStride</key>
-<integer>64</integer>
diff --git a/ane/lib/ane.mm b/ane/lib/ane.mm
index 955e5a3866..2df6814e24 100644
--- a/ane/lib/ane.mm
+++ b/ane/lib/ane.mm
@@ -10,19 +10,23 @@
#include "h11ane.h"
using namespace H11ANE;
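+// debug prints are compiled out by default; uncomment the printf define below (and comment the empty one) to get them back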
+//#define DEBUG printf
+#define DEBUG(x, ...)
+
extern "C" {
// global vars
H11ANEDevice *dev = NULL;
int MyH11ANEDeviceControllerNotification(H11ANEDeviceController *param_1, void *param_2, H11ANEDevice *param_3) {
- printf("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3);
+ DEBUG("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3);
dev = param_3;
return 0;
}
int MyH11ANEDeviceMessageNotification(H11ANE::H11ANEDevice* dev, unsigned int param_1, void* param_2, void* param_3) {
- printf("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3);
+ DEBUG("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3);
return 0;
}
@@ -36,20 +39,21 @@ int ANE_Open() {
char empty[0x90] = {0};
H11ANEDeviceInfoStruct dis = {0};
ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty, UsageCompile, &dis);
- printf("open 0x%x %p\n", ret, dev);
+ DEBUG("open 0x%x %p\n", ret, dev);
ret = dev->ANE_PowerOn();
- printf("power on: %d\n", ret);
+ DEBUG("power on: %d\n", ret);
ret = dev->ANE_IsPowered();
- printf("powered? %d\n", ret);
+ DEBUG("powered? %d\n", ret);
return 0;
}
int stride_for_width(int width) {
int ret = width*2;
- ret += (64-ret) % 64;
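+  // round the fp16 row size (width*2 bytes) up to the next multiple of 64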
+ ret += (64-(ret % 64))%64;
return ret;
}
@@ -73,7 +76,7 @@ void *ANE_TensorCreate(int width, int height) {
void* ANE_TensorData(void *out_surf) {
void *ret = (void *)IOSurfaceGetBaseAddress((IOSurfaceRef)out_surf);
//IOSurfaceUnlock((IOSurfaceRef)out_surf, 0, nil);
- printf("TensorData %p -> %p\n", out_surf, ret);
+ DEBUG("TensorData %p -> %p\n", out_surf, ret);
return ret;
}
@@ -81,7 +84,7 @@ uint64_t ANE_Compile(char *iprog, int sz) {
int ret;
int cksum = 0;
for (int i = 0; i < sz; i++) cksum += iprog[i];
- printf("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz);
+ DEBUG("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz);
char *prog = (char*)aligned_alloc(0x1000, sz);
memcpy(prog, iprog, sz);
@@ -95,7 +98,7 @@ uint64_t ANE_Compile(char *iprog, int sz) {
ret = dev->ANE_ProgramCreate(&mprog, out);
uint64_t program_handle = out->program_handle;
delete out;
- printf("program create: %lx %lx\n", ret, program_handle);
+  DEBUG("program create: %x %lx\n", ret, program_handle);
// early failure
if (ret != 0) return 0;
@@ -103,14 +106,14 @@ uint64_t ANE_Compile(char *iprog, int sz) {
pas.program_handle = program_handle;
pas.flags = 0x0000000100010001;
ret = dev->ANE_ProgramPrepare(&pas);
- printf("program prepare: %lx\n", ret);
+  DEBUG("program prepare: %x\n", ret);
return program_handle;
}
int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
int ret;
- printf("ANE_Run %p %p\n", in_surf, out_surf);
+ DEBUG("ANE_Run %p %p\n", in_surf, out_surf);
H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct;
memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct));
@@ -132,11 +135,12 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
mach_port_t recvPort = 0;
IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort);
- printf("recv port: 0x%x\n", recvPort);
+ DEBUG("recv port: 0x%x\n", recvPort);
// run program
ret = dev->ANE_ProgramSendRequest(pras, recvPort);
- printf("send 0x%x\n", ret);
+ DEBUG("send 0x%x\n", ret);
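+  // the request completes asynchronously; mach_msg below blocks until the completion message arrives on recvPort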
struct {
mach_msg_header_t header;
@@ -149,7 +152,7 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
recvPort,
MACH_MSG_TIMEOUT_NONE,
MACH_PORT_NULL);
- printf("got message: %d sz %d\n", ret, message.header.msgh_size);
+ DEBUG("got message: %d sz %d\n", ret, message.header.msgh_size);
delete pras;
return 0;
diff --git a/ane/lib/ane.py b/ane/lib/ane.py
index 4676af2c53..def815fabb 100755
--- a/ane/lib/ane.py
+++ b/ane/lib/ane.py
@@ -25,6 +25,8 @@ class ANETensor:
self.shape = shape
self.dtype = np.float16
self.sz = int(np.prod(shape))
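+    # ANE tensors are capped at 0x4000 elements for now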
+ assert(self.sz <= 0x4000)
self.tt = libane.ANE_TensorCreate(self.sz, 1)
assert(self.tt is not None)
diff --git a/ane/ops/relu.hwx b/ane/ops/relu.hwx
index d66a665b39..dc54cc540e 100644
Binary files a/ane/ops/relu.hwx and b/ane/ops/relu.hwx differ
diff --git a/examples/benchmark.py b/examples/benchmark.py
new file mode 100644
index 0000000000..2788915055
--- /dev/null
+++ b/examples/benchmark.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+import numpy as np
+from tinygrad.tensor import Tensor
+import time
+
+# Tensor has max size of 0x4000 for now
+ba = Tensor(np.random.normal(size=(0x4000,)))
+for dev in ["CPU", "GPU", "ANE"]:
+ if dev == "GPU":
+    baa = ba.gpu()
+ elif dev == "ANE":
+ baa = ba.ane()
+ else:
+ baa = ba
+ for i in range(3):
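+    # time three runs but report only the third, so compile and warmup costs are excluded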
+ st = time.time()
+ boaa = baa.relu()
+ et = time.time()
+ if i == 2:
+ print("%s can do at least %.2f MEGAReLUs/sec" % (dev, (np.prod(boaa.shape)/1e6)/(et-st)))
+ # decently reliable
+ assert(np.all(boaa.cpu().data >= 0))
+
+
diff --git a/examples/use_ane.py b/examples/use_ane.py
index c80d20fd1b..9c4bb5bd2c 100755
--- a/examples/use_ane.py
+++ b/examples/use_ane.py
@@ -1,9 +1,10 @@
#!/usr/bin/env python3
import numpy as np
from tinygrad.tensor import Tensor
a = Tensor([-2,-1,0,1,2]).ane()
print(a.cpu())
b = a.relu()
print(b.cpu())
+assert(np.all(b.cpu().data >= 0))
diff --git a/tinygrad/ops_ane.py b/tinygrad/ops_ane.py
index b5d25f2bae..ba1c9a2d66 100644
--- a/tinygrad/ops_ane.py
+++ b/tinygrad/ops_ane.py
@@ -1,16 +1,39 @@
from .tensor import Tensor, Function, register
from functools import lru_cache
+import struct
@lru_cache
def compile_wrapper(ane, dat):
return ane.compile(dat)
+def roundup(x, v):
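+  # round x up to the next multiple of v ((v-x)%v is non-negative in Python)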
+ return x + (v-x)%v
+
+def fill(dat, addrs, type, val, base=0x4000):
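+  # struct-pack val and splice it into the blob at base+a for each offset in addrs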
+ x = struct.pack(type, val)
+ for a in addrs:
+ dat[base+a:base+a+len(x)] = x
+ return dat
+
+@lru_cache
+def compile_relu(ane, sz):
+ dat = list(open("ane/ops/relu.hwx", "rb").read())
+  # TODO: clean this up and do all the patching in one pass
+ # number of relus
+ dat = fill(dat, [0x128, 0x13C], "H", sz)
+ # number of engines? (max 0x100)
+  dat = fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8], "I", min(0x100, roundup(sz*2, 0x10)))
+ # strides?
+ dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz*2, 0x40))
+ return compile_wrapper(ane, bytes(dat))
+
class ReLU(Function):
@staticmethod
def forward(ctx, input):
ret = ctx.ane.tensor(input.shape)
- comp = compile_wrapper(ctx.ane, open("ane/ops/relu.hwx", "rb").read())
- ctx.ane.run(comp, input, ret)
+ ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret)
return ret
register('relu', ReLU, device=Tensor.ANE)