30 MEGAReLUs. we need to lose 12 lines
@@ -89,7 +89,7 @@ from tinygrad.tensor import Tensor
 
 ### ANE Support?!?!
 
-If all you want to do is ReLU, you are in luck! You can do very fast ReLU (fastness not confirmed)
+If all you want to do is ReLU, you are in luck! You can do very fast ReLU (at least 30 MEGAReLUs/sec confirmed)
 
 Requires your Python to be signed with `ane/lib/sign_python.sh` to add the `com.apple.ane.iokit-user-access` entitlement.
 
@@ -101,6 +101,8 @@ b = a.relu()
 print(b.cpu())
 ```
 
+Warning: do not rely on the ANE port. It segfaults sometimes. So if you were doing something important with tinygrad and wanted to use the ANE, you might have a bad time.
+
 ### ImageNet inference
 
 Despite being tiny, tinygrad supports the full EfficientNet. Pass in a picture to discover what it is.
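For context, the README usage snippet this hunk's `@@ -101` header points into presumably matches the ANE test updated later in this commit; a sketch assembled from that test, not from the README itself:

```python
from tinygrad.tensor import Tensor

a = Tensor([-2, -1, 0, 1, 2]).ane()  # move the tensor to the Apple Neural Engine
print(a.cpu())
b = a.relu()                         # runs the patched relu.hwx program on the ANE
print(b.cpu())                       # negative entries come back as zero
```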
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
-
+import sys
 from hexdump import hexdump
-
 from macholib import MachO
 def get_macho(fn):
   # mod to make the header okay
@@ -85,7 +84,7 @@ def compare(x, y):
       ln2.append(a[1])
   return ''.join(ss)
 
-g = get_macho("model.hwx.golden")
+g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1])
 f1 = g.headers[0].commands[1][2][0].section_data
 f2 = a.headers[0].commands[1][2][0].section_data
 for i in range(0, len(f2), 0x300):
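With `sys` imported, the comparison baseline can presumably now be pointed at any file from the command line, e.g. `python3 hwx_parse.py other.hwx.golden` (invocation hypothetical), instead of always diffing against the baked-in `model.hwx.golden`.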
@@ -35,12 +35,8 @@
 	<key>InputHeight</key>
 	<integer>1</integer>
 	<key>InputWidth</key>
-	<integer>16</integer>
+	<integer>678</integer>
 
-	<key>InputPlaneStride</key>
-	<integer>64</integer>
-	<key>InputRowStride</key>
-	<integer>64</integer>
 	<key>InputType</key>
 	<string>Float16</string>
 </dict>
@@ -66,10 +62,6 @@
 <dict>
 	<key>Bottom</key>
 	<string>my_layer</string>
-	<key>OutputPlaneStride</key>
-	<integer>64</integer>
-	<key>OutputRowStride</key>
-	<integer>64</integer>
 </dict>
 
 </dict>
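The stride keys dropped here correspond to the three values that `compile_relu` (added later in this commit) patches directly into `relu.hwx` at offsets 0x260/0x264/0x268, so strides are presumably now computed per tensor size at run time rather than hard-coded in the plist.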
@@ -10,19 +10,22 @@
 #include "h11ane.h"
 using namespace H11ANE;
 
+//#define DEBUG printf
+#define DEBUG(x, ...)
+
 extern "C" {
 
 // global vars
 H11ANEDevice *dev = NULL;
 
 int MyH11ANEDeviceControllerNotification(H11ANEDeviceController *param_1, void *param_2, H11ANEDevice *param_3) {
-  printf("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3);
+  DEBUG("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3);
   dev = param_3;
   return 0;
 }
 
 int MyH11ANEDeviceMessageNotification(H11ANE::H11ANEDevice* dev, unsigned int param_1, void* param_2, void* param_3) {
-  printf("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3);
+  DEBUG("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3);
   return 0;
 }
 
@@ -36,20 +39,20 @@ int ANE_Open() {
   char empty[0x90] = {0};
   H11ANEDeviceInfoStruct dis = {0};
   ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty, UsageCompile, &dis);
-  printf("open 0x%x %p\n", ret, dev);
+  DEBUG("open 0x%x %p\n", ret, dev);
 
   ret = dev->ANE_PowerOn();
-  printf("power on: %d\n", ret);
+  DEBUG("power on: %d\n", ret);
 
   ret = dev->ANE_IsPowered();
-  printf("powered? %d\n", ret);
+  DEBUG("powered? %d\n", ret);
 
   return 0;
 }
 
 int stride_for_width(int width) {
   int ret = width*2;
-  ret += (64-ret) % 64;
+  ret += (64-(ret % 64))%64;
   return ret;
 }
 
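A note on the `stride_for_width` fix above: C's `%` truncates toward zero, so `(64-ret) % 64` goes negative as soon as `ret` exceeds 64, and the old formula rounded strides down instead of up to the next 64-byte boundary. A minimal Python sketch emulating C's modulo to show the difference (my illustration, not part of the commit):

```python
def c_mod(a, b):
  # C's % truncates toward zero: c_mod(-32, 64) == -32, while Python's -32 % 64 == 32
  return a - int(a / b) * b

def stride_old(width):  # the buggy formula
  ret = width * 2
  ret += c_mod(64 - ret, 64)
  return ret

def stride_new(width):  # the fix: always round up to a multiple of 64
  ret = width * 2
  ret += c_mod(64 - c_mod(ret, 64), 64)
  return ret

for w in [16, 48, 678]:
  print(w, stride_old(w), stride_new(w))
# 16  -> 64   64    (identical while width*2 <= 64)
# 48  -> 64   128   (old rounds 96 bytes down)
# 678 -> 1344 1408  (old rounds 1356 bytes down too)
```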
@@ -73,7 +76,7 @@ void *ANE_TensorCreate(int width, int height) {
 void* ANE_TensorData(void *out_surf) {
   void *ret = (void *)IOSurfaceGetBaseAddress((IOSurfaceRef)out_surf);
   //IOSurfaceUnlock((IOSurfaceRef)out_surf, 0, nil);
-  printf("TensorData %p -> %p\n", out_surf, ret);
+  DEBUG("TensorData %p -> %p\n", out_surf, ret);
   return ret;
 }
 
@@ -81,7 +84,7 @@ uint64_t ANE_Compile(char *iprog, int sz) {
   int ret;
   int cksum = 0;
   for (int i = 0; i < sz; i++) cksum += iprog[i];
-  printf("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz);
+  DEBUG("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz);
 
   char *prog = (char*)aligned_alloc(0x1000, sz);
   memcpy(prog, iprog, sz);
@@ -95,7 +98,7 @@ uint64_t ANE_Compile(char *iprog, int sz) {
   ret = dev->ANE_ProgramCreate(&mprog, out);
   uint64_t program_handle = out->program_handle;
   delete out;
-  printf("program create: %lx %lx\n", ret, program_handle);
+  DEBUG("program create: %lx %lx\n", ret, program_handle);
   // early failure
   if (ret != 0) return 0;
 
@@ -103,14 +106,14 @@ uint64_t ANE_Compile(char *iprog, int sz) {
   pas.program_handle = program_handle;
   pas.flags = 0x0000000100010001;
   ret = dev->ANE_ProgramPrepare(&pas);
-  printf("program prepare: %lx\n", ret);
+  DEBUG("program prepare: %lx\n", ret);
 
   return program_handle;
 }
 
 int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
   int ret;
-  printf("ANE_Run %p %p\n", in_surf, out_surf);
+  DEBUG("ANE_Run %p %p\n", in_surf, out_surf);
   H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct;
   memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct));
 
@@ -132,11 +135,11 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
 
   mach_port_t recvPort = 0;
   IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort);
-  printf("recv port: 0x%x\n", recvPort);
+  DEBUG("recv port: 0x%x\n", recvPort);
 
   // run program
   ret = dev->ANE_ProgramSendRequest(pras, recvPort);
-  printf("send 0x%x\n", ret);
+  DEBUG("send 0x%x\n", ret);
 
   struct {
     mach_msg_header_t header;
@@ -149,7 +152,7 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
     recvPort,
     MACH_MSG_TIMEOUT_NONE,
     MACH_PORT_NULL);
-  printf("got message: %d sz %d\n", ret, message.header.msgh_size);
+  DEBUG("got message: %d sz %d\n", ret, message.header.msgh_size);
   delete pras;
 
   return 0;
@@ -25,6 +25,7 @@ class ANETensor:
     self.shape = shape
     self.dtype = np.float16
     self.sz = int(np.prod(shape))
+    assert(self.sz <= 0x4000)
     self.tt = libane.ANE_TensorCreate(self.sz, 1)
     assert(self.tt is not None)
 
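For scale: the new assert caps an ANETensor at 0x4000 float16 elements, i.e. 32 KiB of data, and the benchmark added below sizes its input to exactly this limit.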
BIN ane/ops/relu.hwx (binary file not shown)

examples/benchmark.py (new file, 24 lines)
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+import numpy as np
+from tinygrad.tensor import Tensor
+import time
+
+# Tensor has max size of 0x4000 for now
+ba = Tensor(np.random.normal(size=(0x4000,)))
+for dev in ["CPU", "GPU", "ANE"]:
+  if dev == "GPU":
+    baa = ba.cuda()
+  elif dev == "ANE":
+    baa = ba.ane()
+  else:
+    baa = ba
+  for i in range(3):
+    st = time.time()
+    boaa = baa.relu()
+    et = time.time()
+    if i == 2:
+      print("%s can do at least %.2f MEGAReLUs/sec" % (dev, (np.prod(boaa.shape)/1e6)/(et-st)))
+  # decently reliable
+  assert(np.all(boaa.cpu().data >= 0))
+
+
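A side note on the benchmark's structure: it times only the third `relu()` because the first call per device pays one-time costs; on the ANE that includes compiling the patched `relu.hwx`, which `compile_wrapper`/`compile_relu` memoize with `@lru_cache`. A toy sketch of that warm-up effect (illustration only; `compile_op` and the sleep are stand-ins):

```python
import time
from functools import lru_cache

@lru_cache
def compile_op(sz):
  time.sleep(0.05)   # stand-in for an expensive ANE compile
  return b"program"

for i in range(3):
  st = time.time()
  compile_op(0x4000)  # cached after the first call
  print("iter %d: %.4fs" % (i, time.time() - st))
# iter 0 pays ~50ms; iters 1 and 2 are effectively free
```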
@@ -1,9 +1,11 @@
 #!/usr/bin/env python3
+import numpy as np
 from tinygrad.tensor import Tensor
 import time
 
 a = Tensor([-2,-1,0,1,2]).ane()
 print(a.cpu())
 b = a.relu()
 print(b.cpu())
+assert(np.all(b.cpu().data >= 0))
 
@@ -1,16 +1,37 @@
 from .tensor import Tensor, Function, register
 from functools import lru_cache
+import struct
 
 @lru_cache
 def compile_wrapper(ane, dat):
   return ane.compile(dat)
 
+def roundup(x, v):
+  return x + (v-x)%v
+
+def fill(dat, addrs, type, val, base=0x4000):
+  x = struct.pack(type, val)
+  for a in addrs:
+    dat[base+a:base+a+len(x)] = x
+  return dat
+
+@lru_cache
+def compile_relu(ane, sz):
+  dat = list(open("ane/ops/relu.hwx", "rb").read())
+  # TODO: make this all nice and once
+  # number of relus
+  dat = fill(dat, [0x128, 0x13C], "H", sz)
+  # number of engines? (max 0x100)
+  dat = fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8], "I", max(0x100, roundup(sz*2, 0x10)))
+  # strides?
+  dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz*2, 0x40))
+  return compile_wrapper(ane, bytes(dat))
+
 class ReLU(Function):
   @staticmethod
   def forward(ctx, input):
     ret = ctx.ane.tensor(input.shape)
-    comp = compile_wrapper(ctx.ane, open("ane/ops/relu.hwx", "rb").read())
-    ctx.ane.run(comp, input, ret)
+    ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret)
     return ret
 register('relu', ReLU, device=Tensor.ANE)
 
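To make the byte-patching in `compile_relu` concrete, here is the same `fill`/`roundup` pair run on a toy buffer (standalone sketch: `fmt` replaces the builtin-shadowing `type` parameter name, `base=0` replaces the real 0x4000 section offset, and the toy offsets are mine):

```python
import struct

def roundup(x, v):
  return x + (v-x) % v

def fill(dat, addrs, fmt, val, base=0):
  x = struct.pack(fmt, val)
  for a in addrs:
    dat[base+a:base+a+len(x)] = x
  return dat

sz = 0x4000                 # number of relus, the benchmark's max tensor size
dat = bytearray(0x20)       # toy buffer standing in for relu.hwx
fill(dat, [0x00, 0x08], "H", sz)             # little-endian uint16, two copies
fill(dat, [0x10], "I", roundup(sz*2, 0x40))  # 64-byte-aligned stride as uint32
print(dat.hex())            # shows sz=0x4000 at 0x00/0x08 and stride=0x8000 at 0x10
```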