30 MEGAReLUs. we need to lose 12 lines

This commit is contained in:
George Hotz
2020-12-12 17:07:34 -08:00
parent 49da969d25
commit a5aced8d47
9 changed files with 73 additions and 29 deletions

View File

@@ -89,7 +89,7 @@ from tinygrad.tensor import Tensor
### ANE Support?!?!
If all you want to do is ReLU, you are in luck! You can do very fast ReLU (fastness not confirmed)
If all you want to do is ReLU, you are in luck! You can do very fast ReLU (at least 30 MEGAReLUs/sec confirmed)
Requires your Python to be signed with `ane/lib/sign_python.sh` to add the `com.apple.ane.iokit-user-access` entitlement.
@@ -101,6 +101,8 @@ b = a.relu()
print(b.cpu())
```
Warning: do not rely on the ANE port. It segfaults sometimes. So if you were doing something important with tinygrad and wanted to use the ANE, you might have a bad time.
### ImageNet inference
Despite being tiny, tinygrad supports the full EfficientNet. Pass in a picture to discover what it is.

View File

@@ -1,7 +1,6 @@
#!/usr/bin/env python3
import sys
from hexdump import hexdump
from macholib import MachO
def get_macho(fn):
# mod to make the header okay
@@ -85,7 +84,7 @@ def compare(x, y):
ln2.append(a[1])
return ''.join(ss)
g = get_macho("model.hwx.golden")
g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1])
f1 = g.headers[0].commands[1][2][0].section_data
f2 = a.headers[0].commands[1][2][0].section_data
for i in range(0, len(f2), 0x300):

View File

@@ -35,12 +35,8 @@
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>16</integer>
<integer>678</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
@@ -66,10 +62,6 @@
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>

View File

@@ -10,19 +10,22 @@
#include "h11ane.h"
using namespace H11ANE;
//#define DEBUG printf
#define DEBUG(x, ...)
extern "C" {
// global vars
H11ANEDevice *dev = NULL;
// Controller callback fired when an ANE device appears; stashes the device
// pointer in the global `dev` so ANE_Open can use it later.
int MyH11ANEDeviceControllerNotification(H11ANEDeviceController *param_1, void *param_2, H11ANEDevice *param_3) {
  DEBUG("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3);
  dev = param_3;
  return 0;
}
// Per-device message callback registered in ANE_Open; only logs the message
// (no state is kept), so it always reports success.
int MyH11ANEDeviceMessageNotification(H11ANE::H11ANEDevice* dev, unsigned int param_1, void* param_2, void* param_3) {
  DEBUG("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3);
  return 0;
}
@@ -36,20 +39,20 @@ int ANE_Open() {
char empty[0x90] = {0};
H11ANEDeviceInfoStruct dis = {0};
ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty, UsageCompile, &dis);
printf("open 0x%x %p\n", ret, dev);
DEBUG("open 0x%x %p\n", ret, dev);
ret = dev->ANE_PowerOn();
printf("power on: %d\n", ret);
DEBUG("power on: %d\n", ret);
ret = dev->ANE_IsPowered();
printf("powered? %d\n", ret);
DEBUG("powered? %d\n", ret);
return 0;
}
// Row stride in bytes for `width` Float16 elements: width*2 rounded up to the
// next multiple of 64. The extra inner modulo keeps already-aligned sizes
// unchanged (the earlier `(64-ret) % 64` form under-padded widths > 32).
int stride_for_width(int width) {
  int ret = width*2;
  ret += (64-(ret % 64))%64;
  return ret;
}
@@ -73,7 +76,7 @@ void *ANE_TensorCreate(int width, int height) {
// Returns a CPU-visible pointer to the backing store of an ANE tensor
// (`out_surf` is the IOSurface created by ANE_TensorCreate).
void* ANE_TensorData(void *out_surf) {
  void *ret = (void *)IOSurfaceGetBaseAddress((IOSurfaceRef)out_surf);
  //IOSurfaceUnlock((IOSurfaceRef)out_surf, 0, nil);
  DEBUG("TensorData %p -> %p\n", out_surf, ret);
  return ret;
}
@@ -81,7 +84,7 @@ uint64_t ANE_Compile(char *iprog, int sz) {
int ret;
int cksum = 0;
for (int i = 0; i < sz; i++) cksum += iprog[i];
printf("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz);
DEBUG("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz);
char *prog = (char*)aligned_alloc(0x1000, sz);
memcpy(prog, iprog, sz);
@@ -95,7 +98,7 @@ uint64_t ANE_Compile(char *iprog, int sz) {
ret = dev->ANE_ProgramCreate(&mprog, out);
uint64_t program_handle = out->program_handle;
delete out;
printf("program create: %lx %lx\n", ret, program_handle);
DEBUG("program create: %lx %lx\n", ret, program_handle);
// early failure
if (ret != 0) return 0;
@@ -103,14 +106,14 @@ uint64_t ANE_Compile(char *iprog, int sz) {
pas.program_handle = program_handle;
pas.flags = 0x0000000100010001;
ret = dev->ANE_ProgramPrepare(&pas);
printf("program prepare: %lx\n", ret);
DEBUG("program prepare: %lx\n", ret);
return program_handle;
}
int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
int ret;
printf("ANE_Run %p %p\n", in_surf, out_surf);
DEBUG("ANE_Run %p %p\n", in_surf, out_surf);
H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct;
memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct));
@@ -132,11 +135,11 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
mach_port_t recvPort = 0;
IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort);
printf("recv port: 0x%x\n", recvPort);
DEBUG("recv port: 0x%x\n", recvPort);
// run program
ret = dev->ANE_ProgramSendRequest(pras, recvPort);
printf("send 0x%x\n", ret);
DEBUG("send 0x%x\n", ret);
struct {
mach_msg_header_t header;
@@ -149,7 +152,7 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
recvPort,
MACH_MSG_TIMEOUT_NONE,
MACH_PORT_NULL);
printf("got message: %d sz %d\n", ret, message.header.msgh_size);
DEBUG("got message: %d sz %d\n", ret, message.header.msgh_size);
delete pras;
return 0;

View File

@@ -25,6 +25,7 @@ class ANETensor:
self.shape = shape
self.dtype = np.float16
self.sz = int(np.prod(shape))
assert(self.sz <= 0x4000)
self.tt = libane.ANE_TensorCreate(self.sz, 1)
assert(self.tt is not None)

Binary file not shown.

24
examples/benchmark.py Normal file
View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# Benchmark elementwise ReLU throughput on each tinygrad backend.
import numpy as np
from tinygrad.tensor import Tensor
import time

# Tensor has max size of 0x4000 for now
source = Tensor(np.random.normal(size=(0x4000,)))
for device in ["CPU", "GPU", "ANE"]:
  if device == "GPU":
    t = source.cuda()
  elif device == "ANE":
    t = source.ane()
  else:
    t = source
  # warm up twice, report the third run
  for trial in range(3):
    start = time.time()
    result = t.relu()
    elapsed = time.time() - start
    if trial == 2:
      print("%s can do at least %.2f MEGAReLUs/sec" % (device, (np.prod(result.shape)/1e6)/elapsed))
  # decently reliable
  assert(np.all(result.cpu().data >= 0))

View File

@@ -1,9 +1,11 @@
#!/usr/bin/env python3
# Minimal smoke test for the ANE ReLU op.
import numpy as np
from tinygrad.tensor import Tensor
import time

x = Tensor([-2,-1,0,1,2]).ane()
print(x.cpu())
y = x.relu()
print(y.cpu())
# relu output must be elementwise non-negative
assert(np.all(y.cpu().data >= 0))

View File

@@ -1,16 +1,37 @@
from .tensor import Tensor, Function, register
from functools import lru_cache
import struct
@lru_cache
def compile_wrapper(ane, dat):
  """Compile `dat` on `ane`, memoized so identical programs compile only once."""
  compiled = ane.compile(dat)
  return compiled
def roundup(x, v):
  """Round x up to the next multiple of v (x is returned unchanged if already aligned)."""
  return ((x + v - 1) // v) * v
def fill(dat, addrs, type, val, base=0x4000):
  """Pack `val` with struct format `type` and splice the resulting bytes into
  `dat` at every offset in `addrs`, each relative to `base`.
  Mutates `dat` in place and returns it."""
  packed = struct.pack(type, val)
  width = len(packed)
  for off in addrs:
    start = base + off
    dat[start:start+width] = packed
  return dat
@lru_cache
def compile_relu(ane, sz):
  """Build and compile an ANE relu program specialized for `sz` elements.

  Reads the template relu.hwx, patches in the element count and the derived
  engine/stride parameters at fixed byte offsets, then compiles the result
  (cached per size via lru_cache).
  """
  # use a context manager so the template file handle is closed promptly
  with open("ane/ops/relu.hwx", "rb") as f:
    dat = list(f.read())
  # TODO: replace these hand-found byte offsets with a proper program encoder
  # number of relus
  dat = fill(dat, [0x128, 0x13C], "H", sz)
  # number of engines? (max 0x100)
  dat = fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8], "I", max(0x100, roundup(sz*2, 0x10)))
  # strides?
  dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz*2, 0x40))
  return compile_wrapper(ane, bytes(dat))
class ReLU(Function):
  """Elementwise ReLU executed on the Apple Neural Engine."""
  @staticmethod
  def forward(ctx, input):
    # allocate an output tensor of the same shape, then run the
    # size-specialized (and cached) relu program on the ANE
    ret = ctx.ane.tensor(input.shape)
    ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret)
    return ret

register('relu', ReLU, device=Tensor.ANE)