diff --git a/README.md b/README.md
index 531602c030..78962e4348 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ from tinygrad.tensor import Tensor
 
 ### ANE Support?!?!
 
-If all you want to do is ReLU, you are in luck! You can do very fast ReLU (fastness not confirmed)
+If all you want to do is ReLU, you are in luck! You can do very fast ReLU (at least 30 MEGAReLUs/sec confirmed).
 
 Requires your Python to be signed with `ane/lib/sign_python.sh` to add the `com.apple.ane.iokit-user-access` entitlement.
 
@@ -101,6 +101,8 @@ b = a.relu()
 print(b.cpu())
 ```
 
+Warning: do not rely on the ANE port. It segfaults sometimes, so if you were doing something important with tinygrad and wanted to use the ANE, you might have a bad time.
+
 ### ImageNet inference
 
 Despite being tiny, tinygrad supports the full EfficientNet. Pass in a picture to discover what it is.
diff --git a/ane/2_compile/hwx_parse.py b/ane/2_compile/hwx_parse.py
index 60ea3ea55f..4961ade896 100755
--- a/ane/2_compile/hwx_parse.py
+++ b/ane/2_compile/hwx_parse.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
-
+import sys
 from hexdump import hexdump
-
 from macholib import MachO
 
 def get_macho(fn):
   # mod to make the header okay
@@ -85,7 +84,7 @@ def compare(x, y):
       ln2.append(a[1])
   return ''.join(ss)
 
-g = get_macho("model.hwx.golden")
+g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1])
 f1 = g.headers[0].commands[1][2][0].section_data
 f2 = a.headers[0].commands[1][2][0].section_data
 for i in range(0, len(f2), 0x300):
diff --git a/ane/2_compile/simple/neuron.plist b/ane/2_compile/simple/neuron.plist
index a24a8a1dcf..dd57aaffa9 100644
--- a/ane/2_compile/simple/neuron.plist
+++ b/ane/2_compile/simple/neuron.plist
@@ -35,12 +35,8 @@
 			<key>InputHeight</key>
 			<integer>1</integer>
 			<key>InputWidth</key>
-			<integer>16</integer>
+			<integer>678</integer>
-			<key>InputPlaneStride</key>
-			<integer>64</integer>
-			<key>InputRowStride</key>
-			<integer>64</integer>
 			<key>InputType</key>
 			<string>Float16</string>
@@ -66,10 +62,6 @@
 			<key>Bottom</key>
 			<string>my_layer</string>
-			<key>OutputPlaneStride</key>
-			<integer>64</integer>
-			<key>OutputRowStride</key>
-			<integer>64</integer>
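Note on the plist change: the removed `InputRowStride`/`InputPlaneStride` entries were hard-coded for the old 16-wide input (16 float16s = 32 bytes, padded to 64), so dropping them presumably lets the compiler derive strides for the new 678-wide input. Here is a minimal Python sketch of that 64-byte alignment rule, mirroring `stride_for_width` from `ane/lib/ane.mm` below; the rule is inferred from this code, not from any Apple documentation:

```python
# Sketch of the presumed ANE row-stride rule: float16 rows padded to 64 bytes.
def stride_for_width(width):
  stride = width * 2                  # two bytes per float16 element
  stride += (64 - stride % 64) % 64   # round up to the next multiple of 64
  return stride

assert stride_for_width(16) == 64     # the old hard-coded plist stride
assert stride_for_width(678) == 1408  # 678*2 = 1356 bytes, padded to 1408
```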
%d\n", ret); return 0; } int stride_for_width(int width) { int ret = width*2; - ret += (64-ret) % 64; + ret += (64-(ret % 64))%64; return ret; } @@ -73,7 +76,7 @@ void *ANE_TensorCreate(int width, int height) { void* ANE_TensorData(void *out_surf) { void *ret = (void *)IOSurfaceGetBaseAddress((IOSurfaceRef)out_surf); //IOSurfaceUnlock((IOSurfaceRef)out_surf, 0, nil); - printf("TensorData %p -> %p\n", out_surf, ret); + DEBUG("TensorData %p -> %p\n", out_surf, ret); return ret; } @@ -81,7 +84,7 @@ uint64_t ANE_Compile(char *iprog, int sz) { int ret; int cksum = 0; for (int i = 0; i < sz; i++) cksum += iprog[i]; - printf("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz); + DEBUG("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz); char *prog = (char*)aligned_alloc(0x1000, sz); memcpy(prog, iprog, sz); @@ -95,7 +98,7 @@ uint64_t ANE_Compile(char *iprog, int sz) { ret = dev->ANE_ProgramCreate(&mprog, out); uint64_t program_handle = out->program_handle; delete out; - printf("program create: %lx %lx\n", ret, program_handle); + DEBUG("program create: %lx %lx\n", ret, program_handle); // early failure if (ret != 0) return 0; @@ -103,14 +106,14 @@ uint64_t ANE_Compile(char *iprog, int sz) { pas.program_handle = program_handle; pas.flags = 0x0000000100010001; ret = dev->ANE_ProgramPrepare(&pas); - printf("program prepare: %lx\n", ret); + DEBUG("program prepare: %lx\n", ret); return program_handle; } int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) { int ret; - printf("ANE_Run %p %p\n", in_surf, out_surf); + DEBUG("ANE_Run %p %p\n", in_surf, out_surf); H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct; memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct)); @@ -132,11 +135,11 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) { mach_port_t recvPort = 0; IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort); - printf("recv port: 0x%x\n", recvPort); + DEBUG("recv port: 0x%x\n", recvPort); // run program ret = dev->ANE_ProgramSendRequest(pras, recvPort); - printf("send 0x%x\n", ret); + DEBUG("send 0x%x\n", ret); struct { mach_msg_header_t header; @@ -149,7 +152,7 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) { recvPort, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL); - printf("got message: %d sz %d\n", ret, message.header.msgh_size); + DEBUG("got message: %d sz %d\n", ret, message.header.msgh_size); delete pras; return 0; diff --git a/ane/lib/ane.py b/ane/lib/ane.py index 4676af2c53..def815fabb 100755 --- a/ane/lib/ane.py +++ b/ane/lib/ane.py @@ -25,6 +25,7 @@ class ANETensor: self.shape = shape self.dtype = np.float16 self.sz = int(np.prod(shape)) + assert(self.sz <= 0x4000) self.tt = libane.ANE_TensorCreate(self.sz, 1) assert(self.tt is not None) diff --git a/ane/ops/relu.hwx b/ane/ops/relu.hwx index d66a665b39..dc54cc540e 100644 Binary files a/ane/ops/relu.hwx and b/ane/ops/relu.hwx differ diff --git a/examples/benchmark.py b/examples/benchmark.py new file mode 100644 index 0000000000..2788915055 --- /dev/null +++ b/examples/benchmark.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import numpy as np +from tinygrad.tensor import Tensor +import time + +# Tensor has max size of 0x4000 for now +ba = Tensor(np.random.normal(size=(0x4000,))) +for dev in ["CPU", "GPU", "ANE"]: + if dev == "GPU": + baa = ba.cuda() + elif dev == "ANE": + baa = ba.ane() + else: + baa = ba + for i in range(3): + st = time.time() + boaa = baa.relu() + et = time.time() + if i == 2: + print("%s can do at least %.2f 
MEGAReLUs/sec" % (dev, (np.prod(boaa.shape)/1e6)/(et-st))) + # decently reliable + assert(np.all(boaa.cpu().data >= 0)) + + diff --git a/examples/use_ane.py b/examples/use_ane.py index c80d20fd1b..9c4bb5bd2c 100755 --- a/examples/use_ane.py +++ b/examples/use_ane.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 import numpy as np from tinygrad.tensor import Tensor +import time a = Tensor([-2,-1,0,1,2]).ane() print(a.cpu()) b = a.relu() print(b.cpu()) +assert(np.all(b.cpu().data >= 0)) diff --git a/tinygrad/ops_ane.py b/tinygrad/ops_ane.py index b5d25f2bae..ba1c9a2d66 100644 --- a/tinygrad/ops_ane.py +++ b/tinygrad/ops_ane.py @@ -1,16 +1,37 @@ from .tensor import Tensor, Function, register from functools import lru_cache +import struct @lru_cache def compile_wrapper(ane, dat): return ane.compile(dat) +def roundup(x, v): + return x + (v-x)%v + +def fill(dat, addrs, type, val, base=0x4000): + x = struct.pack(type, val) + for a in addrs: + dat[base+a:base+a+len(x)] = x + return dat + +@lru_cache +def compile_relu(ane, sz): + dat = list(open("ane/ops/relu.hwx", "rb").read()) + # TODO: make this all nice and once + # number of relus + dat = fill(dat, [0x128, 0x13C], "H", sz) + # number of engines? (max 0x100) + dat = fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8], "I", max(0x100, roundup(sz*2, 0x10))) + # strides? + dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz*2, 0x40)) + return compile_wrapper(ane, bytes(dat)) + class ReLU(Function): @staticmethod def forward(ctx, input): ret = ctx.ane.tensor(input.shape) - comp = compile_wrapper(ctx.ane, open("ane/ops/relu.hwx", "rb").read()) - ctx.ane.run(comp, input, ret) + ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret) return ret register('relu', ReLU, device=Tensor.ANE)