30 MEGAReLUs. we need to lose 12 lines

This commit is contained in:
George Hotz
2020-12-12 17:07:34 -08:00
parent 49da969d25
commit a5aced8d47
9 changed files with 73 additions and 29 deletions

View File

@@ -89,7 +89,7 @@ from tinygrad.tensor import Tensor
### ANE Support?!?!
If all you want to do is ReLU, you are in luck! You can do very fast ReLU (fastness not confirmed)
If all you want to do is ReLU, you are in luck! You can do very fast ReLU (at least 30 MEGAReLUs/sec confirmed)
Requires your Python to be signed with `ane/lib/sign_python.sh` to add the `com.apple.ane.iokit-user-access` entitlement.
@@ -101,6 +101,8 @@ b = a.relu()
print(b.cpu())
```
Warning: do not rely on the ANE port. It segfaults sometimes. So if you were doing something important with tinygrad and wanted to use the ANE, you might have a bad time.
### ImageNet inference
Despite being tiny, tinygrad supports the full EfficientNet. Pass in a picture to discover what it is.

View File

@@ -1,7 +1,6 @@
#!/usr/bin/env python3
import sys
from hexdump import hexdump
from macholib import MachO
def get_macho(fn):
# mod to make the header okay
@@ -85,7 +84,7 @@ def compare(x, y):
ln2.append(a[1])
return ''.join(ss)
g = get_macho("model.hwx.golden")
g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1])
f1 = g.headers[0].commands[1][2][0].section_data
f2 = a.headers[0].commands[1][2][0].section_data
for i in range(0, len(f2), 0x300):

View File

@@ -35,12 +35,8 @@
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>16</integer>
<integer>678</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
@@ -66,10 +62,6 @@
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>

View File

@@ -10,19 +10,22 @@
#include "h11ane.h"
using namespace H11ANE;
//#define DEBUG printf
#define DEBUG(x, ...)
extern "C" {
// global vars
H11ANEDevice *dev = NULL;
// Controller callback fired when an ANE device appears; stashes the device
// pointer in the global `dev` so ANE_Open can use it later.
int MyH11ANEDeviceControllerNotification(H11ANEDeviceController *param_1, void *param_2, H11ANEDevice *param_3) {
  DEBUG("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3);
  dev = param_3;
  return 0;
}
// Per-device message callback registered in ANE_Open; only logs the message
// (no state is kept), so it always reports success.
int MyH11ANEDeviceMessageNotification(H11ANE::H11ANEDevice* dev, unsigned int param_1, void* param_2, void* param_3) {
  DEBUG("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3);
  return 0;
}
@@ -36,20 +39,20 @@ int ANE_Open() {
char empty[0x90] = {0};
H11ANEDeviceInfoStruct dis = {0};
ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty, UsageCompile, &dis);
printf("open 0x%x %p\n", ret, dev);
DEBUG("open 0x%x %p\n", ret, dev);
ret = dev->ANE_PowerOn();
printf("power on: %d\n", ret);
DEBUG("power on: %d\n", ret);
ret = dev->ANE_IsPowered();
printf("powered? %d\n", ret);
DEBUG("powered? %d\n", ret);
return 0;
}
// Row stride in bytes for `width` Float16 elements: width*2 rounded up to the
// next multiple of 64. The extra inner modulo keeps already-aligned sizes
// unchanged (the earlier `(64-ret) % 64` form under-padded widths > 32).
int stride_for_width(int width) {
  int ret = width*2;
  ret += (64-(ret % 64))%64;
  return ret;
}
@@ -73,7 +76,7 @@ void *ANE_TensorCreate(int width, int height) {
// Returns a CPU-visible pointer to the backing store of an ANE tensor
// (`out_surf` is the IOSurface created by ANE_TensorCreate).
void* ANE_TensorData(void *out_surf) {
  void *ret = (void *)IOSurfaceGetBaseAddress((IOSurfaceRef)out_surf);
  //IOSurfaceUnlock((IOSurfaceRef)out_surf, 0, nil);
  DEBUG("TensorData %p -> %p\n", out_surf, ret);
  return ret;
}
@@ -81,7 +84,7 @@ uint64_t ANE_Compile(char *iprog, int sz) {
int ret;
int cksum = 0;
for (int i = 0; i < sz; i++) cksum += iprog[i];
printf("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz);
DEBUG("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz);
char *prog = (char*)aligned_alloc(0x1000, sz);
memcpy(prog, iprog, sz);
@@ -95,7 +98,7 @@ uint64_t ANE_Compile(char *iprog, int sz) {
ret = dev->ANE_ProgramCreate(&mprog, out);
uint64_t program_handle = out->program_handle;
delete out;
printf("program create: %lx %lx\n", ret, program_handle);
DEBUG("program create: %lx %lx\n", ret, program_handle);
// early failure
if (ret != 0) return 0;
@@ -103,14 +106,14 @@ uint64_t ANE_Compile(char *iprog, int sz) {
pas.program_handle = program_handle;
pas.flags = 0x0000000100010001;
ret = dev->ANE_ProgramPrepare(&pas);
printf("program prepare: %lx\n", ret);
DEBUG("program prepare: %lx\n", ret);
return program_handle;
}
int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
int ret;
printf("ANE_Run %p %p\n", in_surf, out_surf);
DEBUG("ANE_Run %p %p\n", in_surf, out_surf);
H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct;
memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct));
@@ -132,11 +135,11 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
mach_port_t recvPort = 0;
IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort);
printf("recv port: 0x%x\n", recvPort);
DEBUG("recv port: 0x%x\n", recvPort);
// run program
ret = dev->ANE_ProgramSendRequest(pras, recvPort);
printf("send 0x%x\n", ret);
DEBUG("send 0x%x\n", ret);
struct {
mach_msg_header_t header;
@@ -149,7 +152,7 @@ int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf) {
recvPort,
MACH_MSG_TIMEOUT_NONE,
MACH_PORT_NULL);
printf("got message: %d sz %d\n", ret, message.header.msgh_size);
DEBUG("got message: %d sz %d\n", ret, message.header.msgh_size);
delete pras;
return 0;

View File

@@ -25,6 +25,7 @@ class ANETensor:
self.shape = shape
self.dtype = np.float16
self.sz = int(np.prod(shape))
assert(self.sz <= 0x4000)
self.tt = libane.ANE_TensorCreate(self.sz, 1)
assert(self.tt is not None)

Binary file not shown.

24
examples/benchmark.py Normal file
View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
# Benchmark elementwise ReLU throughput on each tinygrad backend.
import numpy as np
from tinygrad.tensor import Tensor
import time

# Tensor has max size of 0x4000 for now
source = Tensor(np.random.normal(size=(0x4000,)))
for device in ["CPU", "GPU", "ANE"]:
  if device == "GPU":
    t = source.cuda()
  elif device == "ANE":
    t = source.ane()
  else:
    t = source
  # warm up twice, report the third run
  for trial in range(3):
    start = time.time()
    result = t.relu()
    elapsed = time.time() - start
    if trial == 2:
      print("%s can do at least %.2f MEGAReLUs/sec" % (device, (np.prod(result.shape)/1e6)/elapsed))
  # decently reliable
  assert(np.all(result.cpu().data >= 0))

View File

@@ -1,9 +1,11 @@
#!/usr/bin/env python3
# Minimal smoke test for the ANE ReLU op.
import numpy as np
from tinygrad.tensor import Tensor
import time

x = Tensor([-2,-1,0,1,2]).ane()
print(x.cpu())
y = x.relu()
print(y.cpu())
# relu output must be elementwise non-negative
assert(np.all(y.cpu().data >= 0))

View File

@@ -1,16 +1,37 @@
from .tensor import Tensor, Function, register
from functools import lru_cache
import struct
@lru_cache
def compile_wrapper(ane, dat):
  """Compile `dat` on `ane`, memoized so identical programs compile only once."""
  compiled = ane.compile(dat)
  return compiled
def roundup(x, v):
  """Round x up to the next multiple of v (x is returned unchanged if already aligned)."""
  return ((x + v - 1) // v) * v
def fill(dat, addrs, type, val, base=0x4000):
  """Pack `val` with struct format `type` and splice the resulting bytes into
  `dat` at every offset in `addrs`, each relative to `base`.
  Mutates `dat` in place and returns it."""
  packed = struct.pack(type, val)
  width = len(packed)
  for off in addrs:
    start = base + off
    dat[start:start+width] = packed
  return dat
@lru_cache
def compile_relu(ane, sz):
  """Build and compile an ANE relu program specialized for `sz` elements.

  Reads the template relu.hwx, patches in the element count and the derived
  engine/stride parameters at fixed byte offsets, then compiles the result
  (cached per size via lru_cache).
  """
  # use a context manager so the template file handle is closed promptly
  with open("ane/ops/relu.hwx", "rb") as f:
    dat = list(f.read())
  # TODO: replace these hand-found byte offsets with a proper program encoder
  # number of relus
  dat = fill(dat, [0x128, 0x13C], "H", sz)
  # number of engines? (max 0x100)
  dat = fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8], "I", max(0x100, roundup(sz*2, 0x10)))
  # strides?
  dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz*2, 0x40))
  return compile_wrapper(ane, bytes(dat))
class ReLU(Function):
  """Elementwise ReLU executed on the Apple Neural Engine."""
  @staticmethod
  def forward(ctx, input):
    # allocate an output tensor of the same shape, then run the
    # size-specialized (and cached) relu program on the ANE
    ret = ctx.ane.tensor(input.shape)
    ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret)
    return ret

register('relu', ReLU, device=Tensor.ANE)