hotfix: remove accel from extra

This commit is contained in:
George Hotz
2025-06-08 08:20:34 -07:00
parent 1ad8062591
commit 3ece2e4bb5
78 changed files with 0 additions and 7843 deletions

View File

@@ -1,52 +0,0 @@
We have to figure out how to make the tinygrad ops match to hw.
Generic folded reduce may not work.
GPUs:
AMD:
RDNA2: https://developer.amd.com/wp-content/resources/RDNA2_Shader_ISA_November2020.pdf
We have RX6900XT with 80 CU, 40 WGP, and 1 "processor"
@ 1.825 GHz, there's 18,688 FP32 GFLOPS of compute. 10240 FLOPS/cycle, 128 per CU (32 FMAs per vALU, 2 per compute unit)
286 GFLOP for ENET=2 BS=64. At theoretical max, (286/18688)*1000 = 15.3 ms.
We observe about 10x factor off with pytorch.
We will focus on speed for AMD, since we have complete docs for that GPU.
Each "processor" has an "ultra threaded dispatch processor"
Each SIMD unit has 256 vector registers (or 1024?), and operates on 32 at once.
Ahh, I think there's 1024 total, but only 256 per wavefront
M1:
On M1 GPU, theoretical is 2.275 TFLOPS. https://www.notebookcheck.net/Apple-M1-GPU-Benchmarks-and-Specs.503610.0.html
We observe 2000ms for BS=8 (37 GFLOP). 37/2275 = 16.3 ms. tinygrad is over a factor of 100x off (similar on AMD GPU)
NOTE: the timer in the M1 OpenCL doesn't seem to be anywhere close to wall time.
Adreno:
TBD, no comma three here. Image > Buffer because the L1 cache is used. Would UBWC help on weights?
We have a good bit of work on this in hyperthneed. Let's get the disassembler out and make this fast.
TPUs:
These use really big systolic arrays and have a lot less flexibility.
IIRC, their vector math unit is similar to the GPU.

View File

@@ -1,5 +0,0 @@
This is where we scope out adding accelerators to tinygrad
ane -- Apple Neural Engine, in the M1 + newer iPhones
tpu -- Google's TPU, available for rent in Google Cloud

View File

@@ -1 +0,0 @@
run

View File

@@ -1,30 +0,0 @@
#!/usr/bin/env python3
# Hand-build a tiny CoreML network (elementwise add of two K-vectors) and run
# one prediction, with the hope that CoreML schedules it on the ANE.
import numpy as np
import coremltools as ct
from coremltools.models.neural_network import datatypes, NeuralNetworkBuilder

# KxK GEMM with bias
K = 64

# two K-length inputs, one K-length output
feats_a = [('image', datatypes.Array(K))]
feats_b = [('image2', datatypes.Array(K))]
feats_out = [('probs', datatypes.Array(K))]

# kept around for the commented-out inner-product / bias experiments below
weights = np.full((K, K), 3.0)
bias = np.ones(K)

builder = NeuralNetworkBuilder(feats_a + feats_b, feats_out)
#builder.add_inner_product(name='ip_layer', W=weights, b=None, input_channels=K, output_channels=K, has_bias=False, input_name='image', output_name='med')
#builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
builder.add_elementwise(name='element', input_names=['image', 'image2'], output_name='probs', mode='ADD')
#builder.add_bias(name='bias', b=bias, input_name='med', output_name='probs', shape_bias=(K,))
#builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')

# compile the spec
mlmodel = ct.models.MLModel(builder.spec)

# trigger the ANE!
ones = np.full(K, 1, dtype=np.float32)
twos = np.full(K, 2, dtype=np.float32)
out = mlmodel.predict({"image": ones, "image2": twos})
print(out)

mlmodel.save('test.mlmodel')

View File

@@ -1,36 +0,0 @@
// Minimal CoreML harness: compile test.mlmodel (built by the Python script)
// and run a single prediction, requesting all compute units so the work can
// be scheduled on the ANE.
import CoreML

// ANE?
let config = MLModelConfiguration()
config.computeUnits = .all

// CPU?
let opts = MLPredictionOptions()
opts.usesCPUOnly = false

// Feature provider for the model's two inputs ("image", "image2").
// Each is a 64-element float32 MLMultiArray with only element 0 set (1337).
class MNISTInput : MLFeatureProvider {
    var featureNames: Set<String> {
        get {
            return ["image", "image2"]
        }
    }
    func featureValue(for featureName: String) -> MLFeatureValue? {
        if (featureName == "image") {
            let tokenIDMultiArray = try? MLMultiArray(shape: [64], dataType: MLMultiArrayDataType.float32)
            tokenIDMultiArray?[0] = NSNumber(value: 1337)
            return MLFeatureValue(multiArray: tokenIDMultiArray!)
        }
        if (featureName == "image2") {
            let tokenIDMultiArray = try? MLMultiArray(shape: [64], dataType: MLMultiArrayDataType.float32)
            tokenIDMultiArray?[0] = NSNumber(value: 1337)
            return MLFeatureValue(multiArray: tokenIDMultiArray!)
        }
        // unknown feature name
        return nil
    }
}

// Compile .mlmodel -> .mlmodelc, load it, and run one prediction.
// NOTE(review): URL(string:) with a bare relative path; URL(fileURLWithPath:)
// would be the conventional choice — confirm before reuse.
let compiledUrl = try MLModel.compileModel(at: URL(string: "test.mlmodel")!)
let model = try MLModel(contentsOf: compiledUrl, configuration: config)
let out = try model.prediction(from: MNISTInput(), options: opts)
print(out.featureValue(for: "probs") as Any)

Binary file not shown.

View File

@@ -1,4 +0,0 @@
*.hwx
anecompiler.swap.*
context_switch_log.txt
debug/

View File

@@ -1 +0,0 @@
../lib/ane.py

View File

@@ -1 +0,0 @@
../lib/aneregs.json

View File

@@ -1,57 +0,0 @@
#import <Foundation/Foundation.h>
#include <os/log.h>
#include <stdio.h>

// Status code returned by the private ANECompiler framework (0 == success).
typedef unsigned int ANECStatus;

// Private entry point in ANECompiler.framework. Compiles the network
// described by the input plist; the completion block receives the status
// and, on failure, a dictionary of diagnostic details.
int ANECCompile(NSDictionary* param_1, NSDictionary* param_2,
                void (^param_3)(ANECStatus status,
                                NSDictionary* statusDictionary));

// Usage: a.out <network.plist> [debug]
//   argv[1]: network plist name, resolved relative to ./
//   argv[2]: any second argument enables debug compilation, writing
//            debug/model.hwx with the full debug mask.
int main(int argc, char* argv[])
{
    // guard argv[1]: without a plist name there is nothing to compile
    if (argc < 2) {
        fprintf(stderr, "usage: %s <network.plist> [debug]\n", argv[0]);
        return 1;
    }
    os_log(OS_LOG_DEFAULT, "start compiler");

    NSDictionary* iDictionary = @{
        @"NetworkPlistName" : [NSString stringWithCString:argv[1]
                                                 encoding:NSUTF8StringEncoding],
        @"NetworkPlistPath" : @"./",
    };
    NSArray* plistArray = @[ iDictionary ];

    NSMutableDictionary* optionsDictionary =
        [NSMutableDictionary dictionaryWithCapacity:4];
    NSMutableDictionary* flagsDictionary =
        [NSMutableDictionary dictionaryWithCapacity:4];
    optionsDictionary[@"InputNetworks"] = plistArray;
    optionsDictionary[@"OutputFilePath"] = @"./";

    // h11 (or anything?) works here too, and creates different outputs that
    // don't run
    flagsDictionary[@"TargetArchitecture"] = @"h13";

    if (argc > 2) {
        // debug build: dump compiler intermediates under debug/
        optionsDictionary[@"OutputFileName"] = @"debug/model.hwx";
        flagsDictionary[@"CompileANEProgramForDebugging"] = @YES;
        int debug_mask = 0x7fffffff;  // everything on
        flagsDictionary[@"DebugMask"] = @(debug_mask);
    } else {
        optionsDictionary[@"OutputFileName"] = @"model.hwx";
    }

    void (^simpleBlock)(ANECStatus status, NSDictionary* statusDictionary) =
        ^(ANECStatus status, NSDictionary* statusDictionary) {
          NSLog(@"status = %d\n", status);
          // when status != 0 dump the dictionary
          if (status)
              NSLog(@"%@", statusDictionary);
        };

    printf("hello\n");
    int ret = ANECCompile(optionsDictionary, flagsDictionary, simpleBlock);
    printf("compile: %d\n", ret);
    return ret;
}

View File

@@ -1,74 +0,0 @@
#include <os/log.h>
#include <stdio.h>
#import <CoreFoundation/CoreFoundation.h>
#include <string>
#include <iostream>

extern "C" {
// Private ANECompiler entry point (CoreFoundation flavor). Returns 0 on
// success; the third parameter is an optional status block and may be 0.
int ANECCompile(CFDictionaryRef param_1, CFDictionaryRef param_2, unsigned long param_3);
// Mangled enum-to-string helpers inside ANECompiler, handy for dumping the
// compiler's opcode tables (see the commented-out loop in main).
std::string _ZN21ZinIrEnumToStringUtil14OpCodeToStringE22ZinIrOpLayerOpCodeType(int op);
std::string _ZN21ZinIrEnumToStringUtil21NonLinearModeToStringE18ZinIrNonLinearMode(int op);
std::string _ZN19ZinMirCacheHintUtil17CacheHintToStringE15ZinMirCacheHint(int op);
std::string _ZN30ZinMirKernelSizeSplitterEngine16ConvKindToStringENS_8ConvKindE(int op);
/*void _Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE(
  unsigned long param_1, void *param_2,int param_3, std::basic_ostream<char> *param_4);
void debugregs(int a1, void *dat, int a2) {
  _Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE(a1, dat, a2, &std::cout);
}*/
}

// Usage: a.out <network.plist> [debug]
//   argv[1]: network plist name, resolved relative to ./
//   argv[2]: any second argument enables debug compilation, writing
//            debug/model.hwx with the full debug mask.
int main(int argc, char* argv[]) {
  // guard argv[1]: without a plist name there is nothing to compile
  if (argc < 2) {
    fprintf(stderr, "usage: %s <network.plist> [debug]\n", argv[0]);
    return 1;
  }
  os_log(OS_LOG_DEFAULT, "start compiler");

  /*for (int i = 0; i < 60; i++) {
    std::string tmp = _ZN21ZinIrEnumToStringUtil14OpCodeToStringE22ZinIrOpLayerOpCodeType(i);
    //std::string tmp = _ZN21ZinIrEnumToStringUtil21NonLinearModeToStringE18ZinIrNonLinearMode(i);
    printf("%2d: %s\n", i, tmp.c_str());
  }*/

  // input descriptor: which plist to compile and where to find it
  CFTypeRef ikeys[2];
  ikeys[0] = CFSTR("NetworkPlistName");
  ikeys[1] = CFSTR("NetworkPlistPath");
  CFTypeRef ivalues[2];
  ivalues[0] = CFStringCreateWithCString(kCFAllocatorDefault, argv[1], kCFStringEncodingUTF8);
  ivalues[1] = CFSTR("./");
  CFDictionaryRef iDictionary = CFDictionaryCreate(kCFAllocatorDefault, ikeys, ivalues, 2, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
  CFArrayRef array = CFArrayCreate(kCFAllocatorDefault, (const void**)&iDictionary, 1, &kCFTypeArrayCallBacks);

  CFMutableDictionaryRef optionsDictionary = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
  CFMutableDictionaryRef flagsDictionary = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
  CFDictionaryAddValue(optionsDictionary, CFSTR("InputNetworks"), array);
  CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFilePath"), CFSTR("./"));
  //CFDictionaryAddValue(optionsDictionary, CFSTR("OptionsFilePath"), CFSTR("good.options"));

  // h11 (or anything?) works here too, and creates different outputs that don't run
  CFDictionaryAddValue(flagsDictionary, CFSTR("TargetArchitecture"), CFSTR("h13"));

  if (argc > 2) {
    // debug build: dump compiler intermediates under debug/
    CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFileName"), CFSTR("debug/model.hwx"));
    //CFDictionaryAddValue(flagsDictionary, CFSTR("DebugDetailPrint"), kCFBooleanTrue);
    CFDictionaryAddValue(flagsDictionary, CFSTR("CompileANEProgramForDebugging"), kCFBooleanTrue);
    int debug_mask = 0x7fffffff;  // everything on
    // kCFNumberSInt32Type replaces the magic constant 3 used previously
    CFDictionaryAddValue(flagsDictionary, CFSTR("DebugMask"), CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &debug_mask));
  } else {
    CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFileName"), CFSTR("model.hwx"));
  }

  //CFDictionaryAddValue(flagsDictionary, CFSTR("DisableMergeScaleBias"), kCFBooleanTrue);
  //CFDictionaryAddValue(flagsDictionary, CFSTR("Externs"), CFSTR("swag"));
  //CFShow(optionsDictionary);
  //CFShow(flagsDictionary);

  printf("hello\n");
  int ret = ANECCompile(optionsDictionary, flagsDictionary, 0);
  printf("compile: %d\n", ret);
  return ret;
}

View File

@@ -1,7 +0,0 @@
#!/bin/bash -e
# Build the ANECompiler harness (compile.mm) against the private framework
# and run one debug compile, then dump what it logged.
g++ compile.mm -F /System/Library/PrivateFrameworks/ -framework ANECompiler -framework CoreFoundation
# start from a clean slate so a stale model.hwx can't mask a failed compile
rm -f model.hwx
./a.out net.plist debug
rm -f context_switch_log.txt
# show the harness's os_log output from the last minute
log show --process a.out --last 1m --info --debug

View File

@@ -1,17 +0,0 @@
#!/usr/bin/env python3
# Run the ANE compiler harness in debug mode and (eventually) visualize the
# register-allocation graph it dumps. Plotting is still disabled below.
import os
import sys
import networkx as nx
import pylab as plt
from networkx.drawing.nx_pydot import read_dot
# a.out is the compiled ANECompiler harness; argv[1] is the network plist.
# Passing "debug" makes it emit intermediates under debug/.
ret = os.system("./a.out "+sys.argv[1]+" debug")
# os.system returns the raw wait status; 0 means the compile succeeded
assert(ret == 0)
# dot file produced by the debug compile; consumed by the commented-out
# rendering experiments below — TODO confirm intended viewer before enabling
df = "debug/model.hwx.zinir_graph_after_reg_spill.dot"
#from graphviz import render
#render('dot', 'png', df)
#plt = Image(pdot.create_png()
#display(plt)

View File

@@ -1 +0,0 @@
Run compiler with debug in argv[2] to generate these files

View File

@@ -1,140 +0,0 @@
#!/usr/bin/env python3
import sys
from hexdump import hexdump
from macholib import MachO
from tinygrad.helpers import getenv
def get_macho(fn):
  """Parse the hwx file at `fn` with macholib.

  The hwx container is Mach-O-like but carries a magic macholib rejects, so
  the first four bytes are rewritten to MH_CIGAM_64 before parsing. The
  patched copy goes to a NamedTemporaryFile with delete=False because
  MachO.MachO() opens the file by path after the handle is closed.
  """
  # read via a context manager so the source handle is always closed
  with open(fn, "rb") as src:
    dat = src.read()
  # swap the magic for MH_CIGAM_64 so macholib accepts the header
  dat = b"\xcf\xfa\xed\xfe" + dat[4:]
  from tempfile import NamedTemporaryFile
  with NamedTemporaryFile(delete=False) as f:
    f.write(dat)
  return MachO.MachO(f.name)
# Parse the golden hwx and walk its load commands.
a = get_macho("model.hwx.golden")

# load commands
# each command is a (header, struct, data) tuple from macholib
for c in a.headers[0].commands:
  print("command", c[0], c[1])
  if c[0].cmd == 4:
    # presumably LC_THREAD-style payload — hexdump the raw bytes
    hexdump(c[2])
    pass
  if c[0].cmd == 6:
    print("name:", c[2].decode('utf-8'))
  if c[0].cmd == 8:
    print(c[2].decode('utf-8'))
  if c[0].cmd == 25:
    # 25 == LC_SEGMENT_64: dump each section's name, address, and size
    for section in c[2]:
      print(section.segname.strip(b'\0'), section.sectname.strip(b'\0'), hex(section.addr), hex(section.size), "@", hex(c[1].fileoff))
      #print(dir(section))
      if c[1].filesize > 0:
        if len(section.section_data) < 0x100:
          hexdump(section.section_data)
        else:
          print("in file, not dumping 0x%x" % len(section.section_data))

# this parser is wrong (fixed with 64-bit one)
from macholib import SymbolTable
sym = SymbolTable.SymbolTable(a)
syms = {}
for l in sym.nlists:
  print(l)
  if l[0].n_value != 0:
    # keep only symbols with a real address: name -> value
    syms[l[1]] = l[0].n_value
for k,v in syms.items():
  print(k, hex(v))

# **** document what we know ***
# build `aneb`, the set of register-file byte offsets we understand;
# ANE_Struct entries are (type, offset, name) — TODO confirm against ane.py
from ane import ANE_Struct, ANE
ane = ANE()
aneb = set()
for typ, num, nam in ANE_Struct:
  ltyp = {"u32": 4, "u16": 2, "u8": 1}[typ]
  for l in range(num, num+ltyp):
    aneb.add(l)
# we understand these too
for l in range(0x34, 0xF4):
  aneb.add(l)
from termcolor import colored
def compare(x, y):
  # Side-by-side colored hexdump of two byte strings x and y.
  # Per byte: green/red where they differ, yellow inside the "highlight"
  # runs derived from y's own contents, white for offsets in the global
  # `aneb` set (bytes we understand), plain otherwise. Returns the full
  # dump as one string. Relies on globals `aneb` and termcolor's `colored`.
  ss = []     # completed output lines
  ln = []     # current 16-byte row for x
  ln2 = []    # current 16-byte row for y
  # round the longer length up to a multiple of 0x10
  ll = (max(len(x), len(y)) + 0xF)//0x10 * 0x10
  highlight = False
  # first highlight offset; subsequent ones are chained off y's bytes
  # (next = current + y[current] + 8) — presumably walking a record list
  # inside the blob; TODO confirm against the hwx layout
  next_highlight = 0x2b
  # note: range(ll+1) runs one index past the end so the final row flushes
  for i in range(ll+1):
    if i == next_highlight:
      highlight = True
      if i < len(y):
        next_highlight += y[i]+8
      else:
        next_highlight = None
    else:
      highlight = False
    # hex text for this byte in each input, "--" past the end
    a = "%02X" % x[i] if i < len(x) else "--", \
        "%02X" % y[i] if i < len(y) else "--"
    # join a 16-entry row into four space-separated groups of four
    def fj(x):
      ss = []
      for i in range(0, 0x10, 4):
        ss.append(' '.join(x[i:i+4]))
      return ' '.join(ss)
    # flush the previous full row before appending byte i
    if i!=0 and i%0x10 == 0:
      ss.append("%8X: " % (i-0x10)+fj(ln)+" | "+fj(ln2)+"\n")
      ln = []
      ln2 = []
    if a[0] != a[1] and a[0] != "--" and a[1] != "--":
      # real byte difference
      ln.append(colored(a[0], 'green'))
      ln2.append(colored(a[1], 'red'))
    else:
      if highlight:
        ln.append(colored(a[0], 'yellow'))
        ln2.append(colored(a[1], 'yellow'))
      else:
        if i in aneb:
          # offset documented in ANE_Struct / known ranges
          ln.append(colored(a[0], 'white'))
          ln2.append(colored(a[1], 'white'))
        else:
          ln.append(a[0])
          ln2.append(a[1])
  return ''.join(ss)
import json
# register name -> (offset, ?, ?) map dumped from the ANE driver
# NOTE(review): file handle from open() is never closed — fine for a
# one-shot script, but worth fixing if this gets reused
aneregs = dict(json.load(open("aneregs.json")))

# second file to diff against the golden one (argv[1] overrides)
g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1])

# section_data of the first section of load command 1 — presumably the
# op stream; TODO confirm which segment this indexes
f1 = g.headers[0].commands[1][2][0].section_data
f2 = a.headers[0].commands[1][2][0].section_data

# ops appear to be 0x300 bytes each; diff them op by op
for i in range(0, len(f2), 0x300):
  print("===== op %d =====" % (i//0x300))
  if len(f1) < 0x300:
    # golden file has a single short op: compare it against every op of f2
    c1, c2 = f1, f2[i:i+0x300]
  else:
    c1, c2 = f1[i:i+0x300], f2[i:i+0x300]
  # decode each op's register fields (project ane module)
  dbg1 = ane.debug(c1, 16)
  dbg2 = ane.debug(c2, 16)
  if getenv("PRINTALL"):
    # dump every known register, changed or not
    for k in dbg2:
      if k in aneregs:
        rr = aneregs[k] if k in aneregs else (-1,-1,-1)
        print("0x%3x %d %2d" % tuple(rr), k, dbg1[k], "->", dbg2[k])
  else:
    # only show registers whose decoded value changed
    for k in dbg1:
      if dbg1[k] != dbg2[k]:
        rr = aneregs[k] if k in aneregs else (-1,-1,-1)
        print("0x%3x %d %2d" % tuple(rr), k, dbg1[k], "->", dbg2[k])
  # raw colored hexdump of the two ops
  print(compare(c1, c2))

#open("/tmp/data.section", "wb").write(f2)
#print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read()))

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.0 KiB

View File

@@ -1,127 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>Units</key>
<array>
<string>probs_tmp_0</string>
<string>probs</string>
</array>
<key>Weights</key>
<array>
<string>model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>image</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>1</integer>
</dict>
<key>probs</key>
<dict>
<key>Bottom</key>
<string>probs_tmp_0</string>
<key>Name</key>
<string>probs</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BiasScaleGroupData</key>
<dict>
<key>BiasCount</key>
<integer>2</integer>
<key>BiasIndex</key>
<integer>1</integer>
<key>BiasOffset</key>
<integer>0</integer>
<key>BiasType</key>
<string>Float16</string>
</dict>
</dict>
<key>Type</key>
<string>GOC</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>probs</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
<key>probs_tmp_0</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>probs_tmp_0</string>
<key>OutputChannels</key>
<integer>2</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelGroupReuse</key>
<false/>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelMode</key>
<string>Dense</string>
<key>KernelOffset</key>
<integer>192</integer>
<key>KernelType</key>
<string>Float32</string>
<key>KernelWidth</key>
<integer>1</integer>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,196 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>A</key>
<dict>
<key>BatchSize</key>
<integer>7</integer>
<key>InputBatchStride</key>
<integer>64</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputDepth</key>
<integer>1</integer>
<key>InputDepthStride</key>
<integer>448</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>1</integer>
</dict>
<key>A_broadcasted_output</key>
<dict>
<key>Bottom</key>
<string>A</string>
<key>Name</key>
<string>A_broadcasted_output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BroadcastInfo</key>
<array>
<dict>
<key>Dimension</key>
<string>Width</string>
<key>Size</key>
<integer>2</integer>
</dict>
</array>
</dict>
<key>Type</key>
<string>Broadcast</string>
<key>UnescapedBottom</key>
<string>A</string>
<key>UnescapedName</key>
<string>A_broadcasted_output</string>
</dict>
<key>B</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>2</integer>
</dict>
<key>B_broadcasted_output</key>
<dict>
<key>Bottom</key>
<string>B</string>
<key>Name</key>
<string>B_broadcasted_output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BroadcastInfo</key>
<array>
<dict>
<key>Dimension</key>
<string>Depth</string>
<key>Size</key>
<integer>1</integer>
</dict>
<dict>
<key>Dimension</key>
<string>Batch</string>
<key>Size</key>
<integer>7</integer>
</dict>
<dict>
<key>Dimension</key>
<string>Channel</string>
<key>Size</key>
<integer>1</integer>
</dict>
<dict>
<key>Dimension</key>
<string>Height</string>
<key>Size</key>
<integer>1</integer>
</dict>
</array>
</dict>
<key>Type</key>
<string>Broadcast</string>
<key>UnescapedBottom</key>
<string>B</string>
<key>UnescapedName</key>
<string>B_broadcasted_output</string>
</dict>
<key>Inputs</key>
<array>
<string>B</string>
<string>A</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>A_broadcasted_output</string>
<string>B_broadcasted_output</string>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpy5yeqxdi.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>output</key>
<dict>
<key>Bottom</key>
<array>
<string>A_broadcasted_output</string>
<string>B_broadcasted_output</string>
</array>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Scale</key>
<integer>15360</integer>
<key>Type</key>
<string>Min</string>
</dict>
<key>Type</key>
<string>ScaledElementWise</string>
<key>UnescapedBottom</key>
<array>
<string>A_broadcasted_output</string>
<string>B_broadcasted_output</string>
</array>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputBatchStride</key>
<integer>64</integer>
<key>OutputDepthStride</key>
<integer>448</integer>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,128 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>input_1</string>
<string>input_0</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp0yvkl2ux.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>input_0</key>
<dict>
<key>BatchSize</key>
<integer>2</integer>
<key>InputBatchStride</key>
<integer>512</integer>
<key>InputChannels</key>
<integer>4</integer>
<key>InputDepth</key>
<integer>4</integer>
<key>InputDepthStride</key>
<integer>1024</integer>
<key>InputHeight</key>
<integer>2</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>128</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>3</integer>
</dict>
<key>input_1</key>
<dict>
<key>BatchSize</key>
<integer>2</integer>
<key>InputBatchStride</key>
<integer>256</integer>
<key>InputChannels</key>
<integer>2</integer>
<key>InputDepth</key>
<integer>4</integer>
<key>InputDepthStride</key>
<integer>512</integer>
<key>InputHeight</key>
<integer>2</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>128</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>3</integer>
</dict>
<key>output</key>
<dict>
<key>Bottom</key>
<array>
<string>input_0</string>
<string>input_1</string>
</array>
<key>Name</key>
<string>output</string>
<key>OutputChannels</key>
<integer>6</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Dimension</key>
<string>Channel</string>
</dict>
<key>Type</key>
<string>Concat</string>
<key>UnescapedBottom</key>
<array>
<string>input_0</string>
<string>input_1</string>
</array>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputBatchStride</key>
<integer>768</integer>
<key>OutputDepthStride</key>
<integer>1536</integer>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>128</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,135 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>Units</key>
<array>
<string>probs_tmp_0</string>
<string>probs</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmph2sg50xi.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>image</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>64</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>1</integer>
</dict>
<key>probs</key>
<dict>
<key>Bottom</key>
<string>probs_tmp_0</string>
<key>Name</key>
<string>probs</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BiasScaleGroupData</key>
<dict>
<key>BiasCount</key>
<integer>64</integer>
<key>BiasIndex</key>
<integer>1</integer>
<key>BiasOffset</key>
<integer>0</integer>
<key>BiasType</key>
<string>Float16</string>
</dict>
</dict>
<key>Type</key>
<string>GOC</string>
<key>UnescapedBottom</key>
<string>probs_tmp_0</string>
<key>UnescapedName</key>
<string>probs</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>probs</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
<key>probs_tmp_0</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>probs_tmp_0</string>
<key>OutputChannels</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelGroupReuse</key>
<false/>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelMode</key>
<string>Dense</string>
<key>KernelOffset</key>
<integer>384</integer>
<key>KernelType</key>
<string>Float32</string>
<key>KernelWidth</key>
<integer>1</integer>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
<key>UnescapedBottom</key>
<string>image</string>
<key>UnescapedName</key>
<string>probs_tmp_0</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,86 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>data</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpm7rb6ba9.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>data</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>6</integer>
</dict>
<key>output</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BiasScalar</key>
<integer>16354</integer>
<key>ScaleScalar</key>
<integer>20544</integer>
</dict>
<key>Type</key>
<string>GOC</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,166 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>data</string>
</array>
<key>Outputs</key>
<array>
<string>out_2@output</string>
<string>out_1@output</string>
<string>out_0@output</string>
</array>
<key>Units</key>
<array>
<string>out_0</string>
<string>out_1</string>
<string>out_2</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp_c4fweo3.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>data</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>9</integer>
<key>InputHeight</key>
<integer>2</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>128</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>2</integer>
</dict>
<key>out_0</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>out_0</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Dimension</key>
<string>Channel</string>
<key>Offset</key>
<integer>0</integer>
<key>Size</key>
<integer>3</integer>
</dict>
<key>Type</key>
<string>InputView</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>out_0</string>
</dict>
<key>out_0@output</key>
<dict>
<key>Bottom</key>
<string>out_0</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>128</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
<key>out_1</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>out_1</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Dimension</key>
<string>Channel</string>
<key>Offset</key>
<integer>3</integer>
<key>Size</key>
<integer>3</integer>
</dict>
<key>Type</key>
<string>InputView</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>out_1</string>
</dict>
<key>out_1@output</key>
<dict>
<key>Bottom</key>
<string>out_1</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>128</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
<key>out_2</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>out_2</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Dimension</key>
<string>Channel</string>
<key>Offset</key>
<integer>6</integer>
<key>Size</key>
<integer>3</integer>
</dict>
<key>Type</key>
<string>InputView</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>out_2</string>
</dict>
<key>out_2@output</key>
<dict>
<key>Bottom</key>
<string>out_2</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>128</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,84 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>data</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpwvvanb0c.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>data</key>
<dict>
<key>BatchSize</key>
<integer>7</integer>
<key>InputChannels</key>
<integer>7</integer>
<key>InputHeight</key>
<integer>7</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>448</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>7</integer>
</dict>
<key>output</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Type</key>
<string>Exp2</string>
</dict>
<key>Type</key>
<string>Neuron</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>448</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,92 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>data</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpcwj7kqrw.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>data</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>2</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>128</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>5</integer>
</dict>
<key>output</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>ReshapedBatch</key>
<integer>1</integer>
<key>ReshapedChannel</key>
<integer>10</integer>
<key>ReshapedDepth</key>
<integer>1</integer>
<key>ReshapedHeight</key>
<integer>1</integer>
<key>ReshapedWidth</key>
<integer>1</integer>
</dict>
<key>Type</key>
<string>Reshape</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,140 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>A</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>5</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>320</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>7</integer>
</dict>
<key>B</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>7</integer>
</dict>
<key>B_broadcasted_output</key>
<dict>
<key>Bottom</key>
<string>B</string>
<key>Name</key>
<string>B_broadcasted_output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BroadcastInfo</key>
<array>
<dict>
<key>Dimension</key>
<string>Height</string>
<key>Size</key>
<integer>5</integer>
</dict>
</array>
</dict>
<key>Type</key>
<string>Broadcast</string>
<key>UnescapedBottom</key>
<string>B</string>
<key>UnescapedName</key>
<string>B_broadcasted_output</string>
</dict>
<key>Inputs</key>
<array>
<string>B</string>
<string>A</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>B_broadcasted_output</string>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp40ksdbf5.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>output</key>
<dict>
<key>Bottom</key>
<array>
<string>A</string>
<string>B_broadcasted_output</string>
</array>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Scale</key>
<integer>15360</integer>
<key>Type</key>
<string>Min</string>
</dict>
<key>Type</key>
<string>ScaledElementWise</string>
<key>UnescapedBottom</key>
<array>
<string>A</string>
<string>B_broadcasted_output</string>
</array>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>320</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,112 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>image2</string>
<string>image</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>Units</key>
<array>
<string>probs</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpkp9irqtj.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>image</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>64</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>1</integer>
</dict>
<key>image2</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>64</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>1</integer>
</dict>
<key>probs</key>
<dict>
<key>Bottom</key>
<array>
<string>image</string>
<string>image2</string>
</array>
<key>Name</key>
<string>probs</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Scale</key>
<integer>15360</integer>
<key>Type</key>
<string>Add</string>
</dict>
<key>Type</key>
<string>ScaledElementWise</string>
<key>UnescapedBottom</key>
<array>
<string>image</string>
<string>image2</string>
</array>
<key>UnescapedName</key>
<string>probs</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>probs</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,78 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>input_1</string>
<string>input_0</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp0yvkl2ux.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>input_0</key>
<dict>
<key>InputChannels</key>
<integer>16384</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>input_1</key>
<dict>
<key>InputChannels</key>
<integer>16</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>output</key>
<dict>
<key>Bottom</key>
<array>
<string>input_0</string>
<string>input_1</string>
</array>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Type</key>
<string>Concat</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,94 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../twos.weights</string>
</array>
<key>image</key>
<dict>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float16</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,130 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float16</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Type</key>
<string>Sign</string>
</dict>
<key>Type</key>
<string>Neuron</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,135 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min_uint8.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>3</integer>
<key>InputDepth</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputBatchStride</key>
<integer>256</integer>
<key>InputDepthStride</key>
<integer>256</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>UInt8</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>UInt8</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelDepth</key>
<integer>1</integer>
<key>PadTop</key>
<integer>0</integer>
<key>PadBot</key>
<integer>0</integer>
<key>PadLeft</key>
<integer>0</integer>
<key>PadRight</key>
<integer>0</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>UInt8</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputBatchStride</key>
<integer>256</integer>
<key>OutputDepthStride</key>
<integer>256</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,154 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
<string>zalt@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
<key>zalt@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,154 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
<string>aalt@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
<key>aalt@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,143 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,87 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>77</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Type</key>
<string>Sigmoid</string>
</dict>
<key>Type</key>
<string>Neuron</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Type</key>
<string>Sigmoid</string>
</dict>
<key>Type</key>
<string>Neuron</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,90 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- dd if=/dev/zero of=/tmp/zero bs=1024 count=16384 -->
<!-- 1024x512 is the max that gets compiled into one op -->
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>Units</key>
<array>
<string>probs</string>
</array>
<key>Weights</key>
<array>
<string>/tmp/zero</string>
</array>
<key>image</key>
<dict>
<key>BatchSize</key>
<integer>512</integer>
<key>InputChannels</key>
<integer>512</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>probs</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>probs</string>
<key>OutputChannels</key>
<integer>512</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelType</key>
<string>Float16</string>
<key>KernelMode</key>
<string>Dense</string>
<key>KernelOffset</key>
<integer>0</integer>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>probs</string>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,79 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BiasScalar</key>
<integer>16354</integer>
<key>ScaleScalar</key>
<integer>20544</integer>
</dict>
<key>Type</key>
<string>GOC</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,69 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>77</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Type</key>
<string>Sigmoid</string>
</dict>
<key>Type</key>
<string>Neuron</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,221 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
<string>my_layer_3</string>
<string>my_layer_4</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_3</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
<key>Name</key>
<string>my_layer_3</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_4</key>
<dict>
<key>Bottom</key>
<string>my_layer_3</string>
<key>Name</key>
<string>my_layer_4</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_4</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,85 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>ReshapedBatch</key>
<integer>1</integer>
<key>ReshapedChannel</key>
<integer>3</integer>
<key>ReshapedDepth</key>
<integer>1</integer>
<key>ReshapedHeight</key>
<integer>1</integer>
<key>ReshapedWidth</key>
<integer>1</integer>
</dict>
<key>Type</key>
<string>Reshape</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,36 +0,0 @@
#!/usr/bin/env python3
# Probe the layout of the 0x300-byte ANE op descriptor: recover the bit
# width and (byte, bit) position of every named register field by asking
# the ANE debug decoder what changes when individual bits are flipped.
# Requires real ANE access via the project-local `ane` module.
from ane import ANE
ane = ANE()

# Pass 1: decode an all-ones descriptor. A contiguous field of width i
# decodes to (1 << i) - 1, so this recovers each field's bit length.
lens = {}
dat = b"\xff"*0x300
ret = ane.debug(dat, 16)
for k,v in ret.items():
  found = None
  for i in range(33):
    #print(v, (1 << i) - 1)
    if v == (1 << i) - 1:
      found = i
      break
  #print(k, hex(v), found)
  # NOTE(review): `found` stays None when v is not of the form 2**i - 1;
  # the "%2d" format below would then raise -- presumably all fields decode
  # cleanly on real hardware. TODO confirm.
  lens[k] = found

# Pass 2: set one bit at a time and see which field decodes to 1. That
# gives the byte offset and bit position of every field's LSB.
pos = []
dat = b"\x00"*0x300
for i in range(0x300):
  for j in range(8):
    dat = b"\x00"*i
    dat += bytes([1 << j])
    dat += b"\x00"*(0x300-len(dat))
    ret = ane.debug(dat, 16)
    for k,v in ret.items():
      if v == 1:
        print("0x%3x %d %2d" % (i, j, lens[k]), k)
        pos.append((k, (i,j, lens[k])))

# Persist the field -> (byte offset, bit, width) map for later tooling.
import json
jpos = json.dumps(pos, indent=2)
with open("aneregs.json", "w") as f:
  f.write(jpos)

View File

@@ -1,4 +0,0 @@
#!/bin/bash -e
# Build the ANE test harness (test.mm) against the private ANEServices
# framework, then codesign it with the entitlement file so the kernel
# grants com.apple.ane.iokit-user-access.
clang++ test.mm -F /System/Library/PrivateFrameworks/ -framework ANEServices -framework IOSurface -framework Foundation -framework IOKit
# NOTE(review): "Taylor Swift Child 2" is a local signing identity --
# replace with a certificate present in your own keychain.
codesign --entitlements entitlements.xml -s "Taylor Swift Child 2" a.out

View File

@@ -1,9 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>com.apple.ane.iokit-user-access</key><true/>
</dict>
</plist>

View File

@@ -1,150 +0,0 @@
// Reverse-engineered declarations for Apple's private ANEServices
// framework (H11ANE). Struct sizes, field order, and packing mirror the
// shipped binary's ABI -- do not reorder or resize anything here.

// Usage mode passed to H11ANEDeviceOpen.
enum ANEDeviceUsageType {
  UsageNoProgram,
  UsageWithProgram, // used in running process
  UsageCompile // used in aned
};

// Open parameters for H11ANEDeviceOpen. The junk/empty arrays pad the
// struct to the size the framework expects; contents beyond the named
// fields are unknown.
struct H11ANEDeviceInfoStruct {
  uint64_t program_handle;
  uint64_t program_auth_code;
  uint64_t sleep_timer;
  uint64_t junk[0x100];
};

// Opaque status blob filled in by ANE_GetStatus.
struct H11ANEStatusStruct {
  uint64_t junk[0x100];
};

// Input to ANE_ProgramCreate: a pointer to an hwx image in memory.
struct H11ANEProgramCreateArgsStruct {
  void *program;
  uint64_t program_length;
  uint64_t empty[4];
  char has_signature;
};

// Output of ANE_ProgramCreate; program_handle identifies the loaded
// program in later prepare/request calls.
struct H11ANEProgramCreateArgsStructOutput {
  uint64_t program_handle;
  int unknown[0x2000];
};

struct H11ANEProgramPrepareArgsStruct {
  uint64_t program_handle;
  uint64_t flags;
  uint64_t empty[0x100];
};

// Request blob for ANE_ProgramSendRequest; field meanings are partially
// known (see offsets used in test.mm).
struct H11ANEProgramRequestArgsStruct {
  uint64_t args[0x1000];
};

namespace H11ANE {
  class H11ANEDevice;

  // Discovers ANE devices via IOKit and reports them through the
  // constructor-supplied callback after SetupDeviceController().
  class H11ANEDeviceController {
  public:
    H11ANEDeviceController(
      int (*callback)(H11ANE::H11ANEDeviceController*, void*, H11ANE::H11ANEDevice*),
      void *arg);
    int SetupDeviceController();
  private: // size is 0x50
    CFArrayRef array_ref;
    mach_port_t *master_port;
    IONotificationPortRef port_ref;
    CFRunLoopSourceRef source_ref;
    int (*callback)(H11ANE::H11ANEDeviceController*, void*, H11ANE::H11ANEDevice*);
    void *callback_arg;
    CFRunLoopRef run_loop_ref;
    io_iterator_t io_iterator;
    pthread_t thread_self;
    uint64_t unused;
  };

  // we should switch to the IOKit kernel interface, it's likely a lot more stable
  // actually this probably isn't true. ANEServices is normal dynamic links
  // https://googleprojectzero.blogspot.com/2020/11/oops-i-missed-it-again.html

  // H11ANEInDirectPathClient
  //   _ANE_DeviceOpen
  //   _ANE_DeviceClose
  //   _ANE_ProgramSendRequest

  // * if they need kernel debugger attached
  // H11ANEInUserClient
  //   _ANE_DeviceOpen
  //   _ANE_DeviceClose
  //   _ANE_ProgramSendRequest
  //   _ANE_ProgramCreate
  //   _ANE_ProgramPrepare
  //   _ANE_ProgramUnprepare
  //   _ANE_ProgramDestroy
  //   _ANE_GetStatus
  //   _ANE_PowerOn
  //   _ANE_PowerOff
  //   _ANE_IsPowered
  // * _ANE_LoadFirmware
  // * _ANE_ForgetFirmware
  // * _ANE_SendCommand
  //   _ANE_SetPowerManagement
  //   _ANE_GetTime
  // * _ANE_SetDriverLoggingFlags
  // * _ANE_ShowSharedMemoryAllocations
  // * _ANE_SetDARTCacheTTL
  // * _ANE_SetFirmwareBootArg
  // * _ANE_SetThrottlingPercentage
  // * _ANE_AddPersistentClient
  // * _ANE_RemovePersistentClient
  // * _ANE_CreateClientLoggingSession
  // * _ANE_TerminateClientLoggingSession
  //   _ANE_GetDriverLoggingFlags
  // * _ANE_FlushInactiveDARTMappings
  //   _ANE_GetVersion
  //   _ANE_RegisterFirmwareWorkProcessor
  //   _ANE_UnregisterFirmwareWorkProcessor
  // * _ANE_GetFirmwareWorkProcessorItem
  //   _ANE_CompleteFirmwareWorkProcessorItem
  //   _ANE_ReleaseFirmwareWorkProcessorBuffers
  // * _ANE_ReadANERegister
  // * _ANE_WriteANERegister
  //   _ANE_ProgramCreateInstance

  // note, this is not the raw IOKit class, it's in ANEServices.framework
  class H11ANEDevice {
  public:
    H11ANEDevice(H11ANE::H11ANEDeviceController *param_1, unsigned int param_2);

    unsigned long H11ANEDeviceOpen(
      int (*callback)(H11ANE::H11ANEDevice*, unsigned int, void*, void*),
      void *param_2, ANEDeviceUsageType param_3, H11ANEDeviceInfoStruct *param_4);

    void EnableDeviceMessages();

    int ANE_AddPersistentClient();
    int ANE_GetStatus(H11ANEStatusStruct *param_1);

    // power management
    int ANE_IsPowered();
    int ANE_PowerOn();
    int ANE_PowerOff();

    // logging (e00002c7 error, needs PE_i_can_has_debugger)
    int ANE_CreateClientLoggingSession(unsigned int log_iosurface);
    int ANE_TerminateClientLoggingSession(unsigned int log_iosurface);
    int ANE_GetDriverLoggingFlags(unsigned int *flags);
    int ANE_SetDriverLoggingFlags(unsigned int flags);

    // program creation
    int ANE_ProgramCreate(H11ANEProgramCreateArgsStruct*,
      H11ANEProgramCreateArgsStructOutput*);
    int ANE_ProgramPrepare(H11ANEProgramPrepareArgsStruct*);
    int ANE_ProgramSendRequest(H11ANEProgramRequestArgsStruct*, mach_port_t);

    // need PE_i_can_has_debugger
    int ANE_ReadANERegister(unsigned int param_1, unsigned int *param_2);
    int ANE_ForgetFirmware();

  private: // size is 0x88
    unsigned char unknown[0x88];
  };
};

View File

@@ -1,184 +0,0 @@
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#import <IOSurface/IOSurfaceRef.h>
#import <Foundation/Foundation.h>
#import <CoreFoundation/CoreFoundation.h>
// Dump `l` bytes starting at `vdat` to stdout as two-digit hex values,
// 16 bytes per line, followed by a trailing newline.
void hexdump(void *vdat, int l) {
  const unsigned char *p = (const unsigned char *)vdat;
  for (int off = 0; off < l; off++) {
    if (off && !(off & 0xf)) printf("\n");
    printf("%02X ", p[off]);
  }
  printf("\n");
}
#include "h11ane.h"
using namespace H11ANE;
H11ANEDevice *device = NULL;
// Device-discovery callback passed to H11ANEDeviceController: stashes the
// discovered device in the global `device` for main() to pick up.
int MyH11ANEDeviceControllerNotification(H11ANEDeviceController *param_1, void *param_2, H11ANEDevice *param_3) {
  printf("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3);
  device = param_3;
  return 0;
}
// Device message callback (enabled via EnableDeviceMessages); just logs
// what arrives -- message semantics are unknown.
int MyH11ANEDeviceMessageNotification(H11ANE::H11ANEDevice* dev, unsigned int param_1, void* param_2, void* param_3) {
  printf("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3);
  return 0;
}
// End-to-end ANE smoke test: open the device, load a precompiled hwx
// program, wire up IOSurface input/output buffers, run one request, and
// hexdump the result. The call ordering mirrors what Espresso does and
// is load-bearing -- see the backtraces in the notes file.
int main() {
  int ret;
  printf("hello %d\n", getpid());

  // Discover the ANE device; the controller callback fills the global.
  H11ANEDeviceController dc(MyH11ANEDeviceControllerNotification, NULL);
  dc.SetupDeviceController();
  assert(device != NULL);
  H11ANEDevice *dev = device;
  dev->EnableDeviceMessages();

  // First open in compile/setup mode to create and prepare the program.
  char empty[0x90] = {0};
  H11ANEDeviceInfoStruct dis = {0};
  //dis.nothing = 0x87c15a20a;
  //dis.sleep_timer = 5000;
  ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty, UsageCompile, &dis);
  printf("open 0x%x %p\n", ret, dev);

  /*ret = dev->ANE_AddPersistentClient();
  printf("add persistent %x\n", ret);*/

  H11ANEStatusStruct blah = {0};
  ret = dev->ANE_GetStatus(&blah);
  printf("get status %x\n", ret);

  // this isn't callable anymore, it requires debugger
  ret = dev->ANE_PowerOn();
  printf("power on: %x\n", ret);

  ret = dev->ANE_IsPowered();
  printf("powered? %d\n", ret);
  /*if (ret == 0) {
    printf("POWER ON FAILED\n");
    return -1;
  }*/

  // Load the precompiled hwx image from disk into a page-aligned buffer.
  // NOTE(review): fread may return less than 0xc000; sz is only printed,
  // not validated.
  H11ANEProgramCreateArgsStruct mprog = {0};
  mprog.program_length = 0xc000;
  char *prog = (char*)aligned_alloc(0x1000, mprog.program_length);
  mprog.program = prog;
  FILE *f = fopen("../2_compile/model.hwx", "rb");
  assert(f);
  int sz = fread(prog, 1, mprog.program_length, f);
  printf("read %x %p\n", sz, prog);
  fclose(f);

  // Create + prepare the program; program_handle identifies it from now on.
  H11ANEProgramCreateArgsStructOutput *out = new H11ANEProgramCreateArgsStructOutput;
  memset(out, 0, sizeof(H11ANEProgramCreateArgsStructOutput));
  ret = dev->ANE_ProgramCreate(&mprog, out);
  uint64_t program_handle = out->program_handle;
  printf("program create: %lx %lx\n", ret, program_handle);

  H11ANEProgramPrepareArgsStruct pas = {0};
  pas.program_handle = program_handle;
  pas.flags = 0x0000000100010001;
  //pas.flags = 0x0000000102010001;
  ret = dev->ANE_ProgramPrepare(&pas);
  printf("program prepare: %lx\n", ret);

  // input buffer
  NSDictionary* dict = [NSDictionary dictionaryWithObjectsAndKeys:
    [NSNumber numberWithInt:16], kIOSurfaceWidth,
    [NSNumber numberWithInt:16], kIOSurfaceHeight,
    [NSNumber numberWithInt:1], kIOSurfaceBytesPerElement,
    [NSNumber numberWithInt:64], kIOSurfaceBytesPerRow,
    [NSNumber numberWithInt:1278226536], kIOSurfacePixelFormat,
    nil];
  IOSurfaceRef in_surf = IOSurfaceCreate((CFDictionaryRef)dict);
  int in_surf_id = IOSurfaceGetID(in_surf);
  printf("we have surface %p with id 0x%x\n", in_surf, in_surf_id);

  // load inputs
  IOSurfaceLock(in_surf, 0, nil);
  unsigned char *inp = (unsigned char *)IOSurfaceGetBaseAddress(in_surf);
  for (int i = 0; i < 16; i++) inp[i] = (i+1)*0x10;
  /*inp[0] = 0x39;
  inp[1] = 0x65;*/
  hexdump(inp, 0x20);
  IOSurfaceUnlock(in_surf, 0, nil);

  // output buffer
  NSDictionary* odict = [NSDictionary dictionaryWithObjectsAndKeys:
    [NSNumber numberWithInt:16], kIOSurfaceWidth,
    [NSNumber numberWithInt:16], kIOSurfaceHeight,
    [NSNumber numberWithInt:1], kIOSurfaceBytesPerElement,
    [NSNumber numberWithInt:64], kIOSurfaceBytesPerRow,
    [NSNumber numberWithInt:1278226536], kIOSurfacePixelFormat,
    nil];
  IOSurfaceRef out_surf = IOSurfaceCreate((CFDictionaryRef)odict);
  int out_surf_id = IOSurfaceGetID(out_surf);
  printf("we have surface %p with id 0x%x\n", out_surf, out_surf_id);

  // Build the request blob. Offsets come from reversing; the IOSurface
  // IDs live in the high 32 bits of their slots.
  H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct;
  memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct));

  // TODO: make real struct
  pras->args[0] = program_handle;
  pras->args[4] = 0x0000002100000003;

  // inputs
  pras->args[0x28/8] = 1;
  pras->args[0x128/8] = (long long)in_surf_id<<32LL;

  // outputs
  pras->args[0x528/8] = 1;
  // 0x628 = outputBufferSurfaceId
  pras->args[0x628/8] = (long long)out_surf_id<<32LL;

  // Completion is delivered as a mach message on this port.
  mach_port_t recvPort = 0;
  IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort);
  printf("recv port: 0x%x\n", recvPort);

  // *** reopen with other client ***
  H11ANEDeviceController dc2(MyH11ANEDeviceControllerNotification, NULL);
  dc2.SetupDeviceController();
  assert(device != NULL);
  dev = device;
  dev->EnableDeviceMessages();

  char empty2[0x90] = {0};
  dis.program_handle = program_handle;
  dis.program_auth_code = 0;
  ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty2, UsageWithProgram, &dis);
  printf("reopen 0x%x %p\n", ret, dev);

  // run program (i think we need the other client for this)
  ret = dev->ANE_ProgramSendRequest(pras, recvPort);
  printf("send 0x%x\n", ret);

  // Block until the completion message arrives.
  struct {
    mach_msg_header_t header;
    char data[256];
  } message;
  ret = mach_msg(&message.header,
    MACH_RCV_MSG,
    0, sizeof(message),
    recvPort,
    MACH_MSG_TIMEOUT_NONE,
    MACH_PORT_NULL);
  printf("got message: %d sz %d\n", ret, message.header.msgh_size);

  // Dump the first 0x100 bytes of the output surface.
  unsigned char *dat = (unsigned char *)IOSurfaceGetBaseAddress(out_surf);
  printf("%p\n", dat);
  hexdump(dat, 0x100);
}

View File

@@ -1,98 +0,0 @@
# The Apple Neural Engine
The Apple Neural Engine is a fancy DMA Engine that is based around convolutions. We don't have all the details worked out yet, but we can do some things with it. At its core, it runs through 0x300 ops in an hwx file. See `aneregs` for the registers used in each op.
It operates out of RAM or its 4MB L2 cache. The L2 "cache" appears to be manually managed, and only applies to the input and output, not the weights. The weights are usually included in the program, and it's unclear where they are copied to.
The 16 cores likely refer to the 16 wide Kernel DMA engine. They claim 11 TOPS total, which would be 687.5 GOPS/core. Perhaps it's a 32x32 MAC running at 335 MHz. That clock speed matches the cycle count time ratio from the debug perf stats.
It works with 5D tensors; you specify the stride for the last 4 dimensions. All strides must be a multiple of 0x40 bytes
* Column (width) -- aneRegs.Common.InDim.Win / aneRegs.Common.OutDim.Wout
* Row (height) -- aneRegs.Common.InDim.Hin / aneRegs.Common.OutDim.Hout
* Plane (channels) -- aneRegs.Common.Cin.Cin / aneRegs.Common.Cout.Cout
* Depth
* Group (batch) -- aneRegs.Common.GroupConvCfg.NumGroups
It works with 3 data types
* UInt8
* Int8
* Float16
The ops have several parts
* Header -- The base addresses for the DMA engines
* KernelDMASrc -- 16x wide DMA engine for the weights/bias/nonlinearity
* Common -- Specifies the parameters for the convolution
* TileDMASrc -- Input DMA engine
* L2 -- Use the L2 cache for Source/Result instead of RAM
* NE -- Configure Kernel/MAC/Post
* TileDMADst -- Output DMA engine
It can work with 8 base addresses for the DMA streams per OP
* 2x Read, both used for things like sum
* 1x Write
* 1x T?
* 4x Kernel, though only the first one seems used
## Normal Flow for ANE Usage
* Keras/ONNX model -> coremltools
* CoreML model -> Espresso
* net.plist -> ANECompiler
* model.hwx -> ANEServices
* AppleH11ANEInterface, an IOKit interface to the kernel
## hwx file?
This is a Mach-O file. We haven't figured out all the details, but the ops are at 0x4000. See `hwx_parse.py`
## amfid
Sadly, disabling amfi breaks things like vscode. Instead, you can patch it at runtime:
```
# MacOS 12.4
smol :: ~/fun/tinygrad » sha1sum /usr/libexec/amfid
0f7e7f7e41408f83d7ebc7564a3828f41cb2ab58 /usr/libexec/amfid
# with patching +0x8e38
(lldb) image list
[ 0] 04B6DF6C-6068-3F18-81A7-978985574387 0x0000000102ad0000 /usr/libexec/amfid
(lldb) p *(unsigned int *)0x102ad8e38=0xd2800000
```
This disables the entitlement check, then you don't need a bootarg. I wish Apple made a better way to do this.
## Extracting ANEServices.framework
```
# install xcode, then select it as the active developer directory:
sudo xcode-select --switch /Applications/Xcode.app
# xcode also contains ANEServices.tbd
brew install keith/formulae/dyld-shared-cache-extractor
dyld-shared-cache-extractor /System/Library/dyld/dyld_shared_cache_arm64e /tmp/libraries
cp /tmp/libraries/System/Library/PrivateFrameworks/ANECompiler.framework/Versions/A/ANECompiler .
cp /tmp/libraries/System/Library/PrivateFrameworks/ANEServices.framework/Versions/A/ANEServices .
cp /tmp/libraries/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/Versions/A/AppleNeuralEngine .
```
## Other work
```
# sadly also relies on ZinIrRegBitPrintOutDebug
https://github.com/antgroup-arclab/ANETools.git
# sadly looks like we do actually need a direct connection to run hwx files, aned is at the espresso level
* frame #0: 0x00000001c250fecc AppleNeuralEngine`-[_ANEDaemonConnection loadModel:sandboxExtension:options:qos:withReply:]
(lldb) po $x2
_ANEModel: { modelURL=file:///var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/test_F48D9B88-A68D-476F-ADC8-32BDAF9A2498.mlmodelc/ : key={"isegment":0,"inputs":{"image":{"shape":[1,1,1,64,1]},"image2":{"shape":[1,1,1,64,1]}},"outputs":{"probs":{"shape":[1,1,1,64,1]}}} : string_id=0x00000000 : program=(null) : state=1 : programHandle=0 : intermediateBufferHandle=0 : queueDepth=0 : attr={
} : perfStatsMask=0}
```
## Choices
* Disable amfid (breaks vscode)
* Patch amfid to allow restricted entitlements
* Sign with a "provisioning profile" to allow the entitlement
* Patch the ANE kext to not require a special entitlement (this is ideal, as we don't need to resign python)

View File

@@ -1,367 +0,0 @@
kernel driver: AppleH11ANEInterface
requires entitlement: com.apple.ane.iokit-user-access
compiler is run in ANE_ProgramCreate_gated
2 helper processes:
/usr/libexec/aned
ANECompilerService
Espresso:
Contains ANECompilerEngine
AppleNeuralEngine: Objective-C interface called by Espresso
ANEServices: communication with the device
ANECompiler: compile plist into hwx file
com.apple.ANECompilerService.allow in AppleNeuralEngine?
Called from ANECompilerService.xpc in AppleNeuralEngine.framework
== Model Flow ==
Keras/ONNX model
|
| 1_build
| (coremltools, open source)
v
CoreML model
|
| TODO: automate this
| Grabbed plist from lldbing ANECompilerService during 1_build
| (Espresso)
v
net.plist
|
| 2_compile
| (AppleNeuralEngine, ANECompiler)
v
model.hwx
|
| 3_run
| (AppleNeuralEngine, ANEServices, AppleH11ANEInterface)
v
<run on neural engine>
TODO: Write a nice plist grabber
DONE: Write a call to the compiler with plist+weights
DONE: Write an hwx runner
== Tracing the Compiler ==
ANECCompileProcedure
ZinAneCreateIr
ZinParseNeuronUnit
ZinAneCoreCompile
ZinAneCodeGeneration
ZinIrCodegenHandleKernels
ZinIrTargetH13::CodegenTds
ZinIrCacheHintTable
ZinIrCodegenHandleTds_v7
ZinIrCodegenHandleTdsMakeList<7u>
ZinAneInstruction
ZinAneTd<7u>::HandleEngineLayer
ZinAneInstruction::HandleTdHeader
HandleNELayer<7u>
ZinAneInstruction::HandleCommonConfig
ZinAneInstruction::HandleCommonConfigCommonOpcodes
ZinIrCodegenHandleTds<7u>
0x1bb93ae00 <-- this is the store of the first byte in the hwx
CalculateSizeInBytesFromRegCount (x*4+4)
0xf @ 0x128-0x168 (base 0x1003047b0)
0x1b @ 0x16c-0x1dc
0x11 @ 0x1e0-0x228
0x3 @ 0x22c-0x23c
0x4 @ 0x240-0x254
0x6 @ 0x258-0x274(end)
AddReloc (this is gold! x4 goes in the hwx)
ZinAneTd<7u>::HandleEngineLayer
rbreak ^ZinAneInstruction*
weeee ZinIrRegBitPrintOutDebug_7u_
print (void)debugregs(0, 0x0000000100211030+8, 3)
== min.plist ==
Types: GOC, Conv, Broadcast, ScaledElementWise, Reshape, InputView, Neuron, Concat
ops have length 0x300, seems like one basic op repeated
header 0x0-0x1c
u32 0x1c = next op offset
u16 0x20 = output address?
== section break 0x2c (weights) ==
reloc 0x2c-0x74 = K2DBE6976FEB616E6867A2E3853FC37D0F101C4C51BA4A80C103359643338C0C1_ne_0
K2DBE6976FEB616E6867A2E3853FC37D0F101C4C51BA4A80C103359643338C0C1_ne_1
16 output channel parallel:
u32[16] 0x34-0x74 = 0x80 | 1 if used
u32[16] 0x74-0xB4 = <channel data offset>
u32[16] 0xB4-0xF4 = <channel data length>
== section break 0x128 (conv) ==
u16 0x128 = InputWidth
u16 0x12A = InputHeight
u16 0x12C = InputDepth
u32 0x130 = (OutputType * 0x10) | InputType
u32 0x134 = InputChannels
u32 0x138 = OutputChannels
u16 0x13C = OutputWidth
u16 0x13E = OutputHeight
u16 0x140 = OutputDepth
u16 0x144 = 0xa000 | (KernelHeight * 0x20) | KernelWidth
u16 0x146 = 0x5000 | (PadTop * 0x40) | (PadLeft * 2)
u16 0x14C = BatchSize
u32 0x150 = OutputHeight?
== section break 0x16c (input) ==
reloc 0x16c-0x174 = image
u32 0x178 = InputRowStride
u32 0x17C = InputPlaneStride
u32 0x180 = InputDepthStride
u32 0x184 = InputBatchStride
u8 0x1A7 = InputInterleave
== section break 0x1e0 ==
u8 0x1E5 = InputInterleave
u32 0x1F4 = InputChannels * 0x10
u32 0x1F8 = InputDepth * InputChannels * 0x10
u8 0x211 = OutputInterleave
u32 0x220 = OutputChannels * 0x10
u32 0x224 = OutputDepth * OutputChannels * 0x10
== section break 0x22c (scaling) ==
u16 0x230 = BiasScalar
u16 0x232 = ScaleScalar
== section break 0x240 ==
u8 0x240 = 0x80 | KernelType
u8 0x241 = 4 * hasbatch
u16 0x246 = 0x10 | 2 * neuron?
== section break 0x258 (output) ==
reloc 0x258-0x25c = probs@output/src
u32 0x260 = OutputRowStride
u32 0x264 = OutputPlaneStride
u32 0x268 = OutputDepthStride
u32 0x26C = OutputBatchStride
u8 0x273 = OutputInterleave
== Zin Constants ==
kZinIrOpCodeConv = 0?
kZinIrOpCodePool = 1
kZinIrOpCodeElementWiseOp = 6
kZinIrOpCodeConcat = 8
kZinIrOpCodeFlattenComposite
kZinIrOpCodeNEConvOp
kZinIrOpCodeTranspose
0: CONV
1: POOL
2: SCALE_BIAS
3: TERNARY_DYNAMIC_GOC
4: BINARY_DYNAMIC_GOC
5: ACTIVATION
6: EW
7: SCALED_EW
8: CONCAT
9: SPLIT
10: COPY
11: FLATTEN
12: UNFLATTEN
13: CROSS_CORRELATION
14: KERNEL_RASTERIZER
15: ARG_MIN_MAX
16: MATRIX_MULT
17: BROADCAST
18: FLATTEN_COMPOSITE
19: UNFLATTEN_COMPOSITE
20: KERNEL_RASTERIZER_COMPOSITE
21: CROSS_CORRELATION_COMPOSITE
22: LIVE_IN
23: CONST_IN
24: LIVE_OUT
25: REDUCTION
26: ALIAS
27: Typecast
28: RESHAPE
29: VIEW
30: TRANSPOSE
31: SPACE_TO_BATCH
32: BATCH_TO_SPACE
33: SOFTMAX
34: INSTANCE_NORM
35: L2_NORM
36: MINMAX_NORM
37: LRN
38: COST_VOLUME
39: PIXEL_SHUFFLE
40: FPS
41: RS
42: PEFUSED_ELEMENTWISE
43: PEFUSED_POOL
44: PEFUSED_GOC
45: NEFUSED_CONV
46: NEFUSED_POOL
47: NEFUSED_EW
48: NEFUSED_BYPASS
# guessing from the hwx
kZinTensorFormatUInt8 = 0
kZinTensorFormatInt8 = 1
kZinTensorFormatFloat16 = 2
kZinTensorFormatInvalid
Zin (plist format) ---(ZinAneCoreCompile)---> Mir (hwx format)?
ZinAneCodeGeneration?
ZinIrStatus GetKernelFormat<6u>(ZinKernelFormat param_1,ane_ne_kernel_cfg_kernel_fmt *param_2)
List of allowed numbers
NeuronTypes (changes the LUT):
Tanh
Log2
Exp2
Sign = ZinMirActivationV7::GetSignLut
...many more in ANECompiler
Investigate:
ZinMirActivationV7::PrintLut(ZinMirActivationV7 *this,ane_nonlinear_lut_v7up_t *param_1
0: NONE
1: RELU
2: SIGMOID
3: SIGMOID_HIGH_PRECISION
4: TANH
5: CLAMPED_RELU
6: PRELU
7: DIRAC
8: INT
9: FRAC
10: SQRT
11: RSQRT
12: INV
13: SQR
14: LOG2
15: EXP2
16: ELU
17: SIGN
18: EQUAL_ZERO
19: NON_ZERO
20: LESS_THAN_ZERO
21: LESS_EQUAL_ZERO
22: GREATER_EQUAL_ZERO
23: GREATER_THAN_ZERO
24: CUSTOM_LUT
25: C_DIM_CONCAT
26: C_DIM_STRIDED_CONCAT
27: H_DIM_CONCAT
28: W_DIM_CONCAT
29: D_DIM_CONCAT
30: N_DIM_CONCAT
31: H_DIM_STRIDED_CONCAT
CacheHint
0: ALLOC
1: NOALLOC
2: DROP
3: DEPRI
conv kinds
0: regular
1: channelwise
2: grouped
== plist exploration ==
Float16 -> UInt8 for output works, Float32 doesn't
Same for input
All weights must be float
Index 0: model.espresso.weights @ 192 is weights
Index 1: net.additional.weights @ 0 is bias
Float16 -> Float32 for bias works
It's possible the compiler is Float32 -> Float16 converting, and the engine only supports Float16 + UInt8
== call to the compiler (in dmesg!) ==
[54476.282258]: H11ANEIn: ANE_ProgramCreate_gated:, ZinComputeProgramMake, get Mcache size: 0x0
[54476.282259]: H11ANEIn: ANE_ProgramCreate_gated:,Program Identifier:ANEC v1
zin_ane_compiler v4.2.1
-t h13
--fdram-allocator=ffreuse
--fdram-tensor-priority=sizethenliverange
--fl2-allocator=ffreuse
--fl3-allocator=ffreuse
--fl2-cache-mode=resident
--fsignature=ident
--memcache-strategy=
[54476.282262]: --memcache-size=4194304
--fspatial-split=disabled
--fkernel-rewind=enabled
--Wl-undefined=fvmlib
-i /Library/Caches/com.apple.aned/tmp/Python/DB7E897E7F4D5D27501A998428B6D3863AFD96CEA82DAF2207A75394E6BAC44C/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/net.plist
-o /Library/Caches/com.apple.aned/20A2411/Python/C9981871BC59572E74AFA3014B183EA37567EE9A2A08328446CE4A2B754E109D/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/model.hwx.tmp
== ANECCompile (in ANECompiler framework) ==
ANECCompile(__CFDictionary *param_1, __CFDictionary *param_2, unsigned long param_3)
param_1:
{
InputNetworks = (
{
NetworkPlistName = "net.plist";
NetworkPlistPath = "/Library/Caches/com.apple.aned/tmp/run/A2ACB9D5AA31B301563A4F62885BA379E62B0E1240E95C6902A93900FE0A9B54/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/";
}
);
OutputFileName = "model.hwx.tmp";
OutputFilePath = "/Library/Caches/com.apple.aned/20A2411/run/E68910CD1994681121EEDAFAE1BC524AA8E84CF80C42AFC0C7DE2C082C58BDFD/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/";
}
param_2:
{
TargetArchitecture = h13;
}
== Backtrace of device open ==
* frame #0: 0x00000001a68fac54 ANEServices`H11ANEDeviceOpen
frame #1: 0x00000001a78405b8 AppleNeuralEngine`__29-[_ANEDeviceController start]_block_invoke + 436
frame #2: 0x0000000193c84420 libdispatch.dylib`_dispatch_client_callout + 20
frame #3: 0x0000000193c92a98 libdispatch.dylib`_dispatch_lane_barrier_sync_invoke_and_complete + 60
frame #4: 0x00000001a78403e8 AppleNeuralEngine`-[_ANEDeviceController start] + 136
...
frame #23: 0x00000001a64a4f38 Espresso`Espresso::ANERuntimeEngine::compiler::build_segment(std::__1::shared_ptr<Espresso::abstract_batch> const&, int, Espresso::net_compiler_segment_based::segment_t const&) + 2080
...
frame #31: 0x000000019ab6099c CoreML`-[MLNeuralNetworkEngine rebuildPlan:] + 1640
== Backtrace of run? ==
* frame #0: 0x00000001a68f9108 ANEServices`H11ANEProgramProcessRequestDirect
frame #1: 0x00000001a7839694 AppleNeuralEngine`-[_ANEProgramForEvaluation processRequest:qos:qIndex:modelStringID:options:error:] + 1904
frame #2: 0x00000001a7843ba4 AppleNeuralEngine`-[_ANEClient doEvaluateDirectWithModel:options:request:qos:error:] + 1236
frame #3: 0x00000001a7842034 AppleNeuralEngine`-[_ANEClient evaluateWithModel:options:request:qos:error:] + 104
frame #4: 0x00000001a64a2988 Espresso`Espresso::ANERuntimeEngine::compiler::__forward_segment(std::__1::shared_ptr<Espresso::abstract_batch> const&, int, Espresso::net_compiler_segment_based::segment_t const&) + 2008
frame #5: 0x00000001a6414548 Espresso`Espresso::net_compiler_segment_based::__forward(std::__1::shared_ptr<Espresso::abstract_batch> const&) + 992
frame #6: 0x00000001a67e2e3c Espresso`EspressoLight::espresso_plan::dispatch_task_on_compute_batch(std::__1::shared_ptr<Espresso::abstract_batch> const&, std::__1::shared_ptr<EspressoLight::plan_task_t> const&) + 612
frame #7: 0x00000001a67ebab0 Espresso`EspressoLight::espresso_plan::execute_sync() + 356
frame #8: 0x00000001a67f26fc Espresso`espresso_plan_execute_sync + 120
frame #9: 0x000000019ab674b8 CoreML`-[MLNeuralNetworkEngine executePlan:error:] + 136
frame #10: 0x000000019ab6799c CoreML`-[MLNeuralNetworkEngine evaluateInputs:bufferIndex:options:error:] + 368

View File

@@ -1,102 +0,0 @@
import ctypes
from subprocess import CalledProcessError, check_output

from hexdump import hexdump
def get_pid(name):
  """Return the PID of the first running process matching `name`.

  Uses `pgrep`. Returns None when no process matches, when pgrep is not
  available, or when its output cannot be parsed.
  """
  try:
    output = check_output(["pgrep", name])
    # pgrep prints one PID per line when several processes match; the
    # original int(output) crashed in that case -- take the first.
    return int(output.split()[0])
  except (CalledProcessError, OSError, ValueError, IndexError):
    # CalledProcessError: pgrep exits nonzero when nothing matches.
    # OSError: pgrep binary missing. Narrowed from a bare `except:`,
    # which also swallowed KeyboardInterrupt/SystemExit.
    return None
# Grab a task port for the running amfid so we can read/patch its memory.
# task_for_pid requires appropriate privileges (SIP/debugger entitlement).
from ctypes.util import find_library
libc = ctypes.CDLL(find_library('c'))
amfid_pid = get_pid("amfid")
task = ctypes.c_uint32()
mytask = libc.mach_task_self()
ret = libc.task_for_pid(mytask, ctypes.c_int(amfid_pid), ctypes.pointer(task))
print(amfid_pid, ret, task, mytask)
#myport = libc.mach_task_self()
# ctypes mirror of the Mach vm_region_submap_short_info_data_64 struct
# returned by mach_vm_region_recurse. Field order, types, and _pack_
# must match the kernel ABI exactly -- do not reorder.
class vm_region_submap_short_info_data_64(ctypes.Structure):
  _pack_ = 1
  _fields_ = [
    ("protection", ctypes.c_uint32),
    ("max_protection", ctypes.c_uint32),
    ("inheritance", ctypes.c_uint32),
    ("offset", ctypes.c_ulonglong),
    ("user_tag", ctypes.c_uint32),
    ("ref_count", ctypes.c_uint32),
    ("shadow_depth", ctypes.c_uint16),
    ("external_pager", ctypes.c_byte),
    ("share_mode", ctypes.c_byte),
    ("is_submap", ctypes.c_uint32),
    ("behavior", ctypes.c_uint32),
    ("object_id", ctypes.c_uint32),
    ("user_wired_count", ctypes.c_uint32),
  ]
# mach_vm_region_recurse wants the info size in 32-bit words.
submap_info_size = ctypes.sizeof(vm_region_submap_short_info_data_64) // 4

# Query the first VM region of amfid: its base address reveals the ASLR
# slide of the amfid binary.
address = ctypes.c_ulong(0)
mapsize = ctypes.c_ulong(0)
count = ctypes.c_uint32(submap_info_size)
sub_info = vm_region_submap_short_info_data_64()
depth = 0
c_depth = ctypes.c_uint32(depth)
for i in range(1):
  ret = libc.mach_vm_region_recurse(task,
    ctypes.pointer(address), ctypes.pointer(mapsize),
    ctypes.pointer(c_depth), ctypes.pointer(sub_info),
    ctypes.pointer(count))
  print("aslr", hex(ret), hex(address.value), mapsize, count, sub_info.protection)
  #address.value += mapsize.value
#exit(0)

# +0x8e38 is the entitlement-check instruction in this exact amfid build
# (see README: MacOS 12.4, sha1 0f7e7f...). The patch bytes are
# 0xd2800000 little-endian, arm64 `mov x0, #0`.
patch_address = address.value + 0x8e38
patch = b"\x00\x00\x80\xd2"

# Read back the original 4 bytes at the patch site for inspection.
pdata = ctypes.c_void_p(0)
data_cnt = ctypes.c_uint32(0)
ret = libc.mach_vm_read(task, ctypes.c_ulong(patch_address), 4, ctypes.pointer(pdata), ctypes.pointer(data_cnt))
buf = ctypes.string_at(pdata.value, data_cnt.value)
hexdump(buf)

#ret = libc.mach_vm_wire(mytask, task, patch_address, 4, 3)
#print(ret)
#exit(0)

"""
ret = libc.mach_vm_read(task, address, mapsize, ctypes.pointer(pdata), ctypes.pointer(data_cnt))
buf = ctypes.string_at(pdata.value, data_cnt.value)
hexdump(buf)
ret = libc.mach_vm_deallocate(task, address, mapsize)
print("mach_vm_deallocate", ret)
ret = libc.mach_vm_allocate(task, ctypes.pointer(address), mapsize, 0)
print("mach_vm_allocate", ret)
"""

# Make the page writable (3 = VM_PROT_READ|WRITE), write the patch, then
# restore r-x (5 = VM_PROT_READ|EXECUTE).
ret = libc.mach_vm_protect(task, ctypes.c_ulong(patch_address), 4, True, 3)
print("protect", ret)

longptr = ctypes.POINTER(ctypes.c_ulong)
#shellcodePtr = ctypes.cast(buf, longptr)
#ret = libc.mach_vm_write(task, address, shellcodePtr, len(buf))
#print("write", ret)

# NOTE(review): length is len(buf) (bytes read above), not len(patch) --
# they should both be 4, but this silently depends on the earlier read.
shellcodePtr = ctypes.cast(patch, longptr)
ret = libc.mach_vm_write(task, ctypes.c_ulong(patch_address), shellcodePtr, len(buf))
print("write", ret)

#libc.mach_vm_write.argtypes = [ctypes.c_uint32, ctypes.c_ulong, longptr, ctypes.c_uint32]
#libc.mach_vm_write.restype = ctypes.c_uint32
#ret = libc.mach_vm_write(task, ctypes.c_ulong(patch_address), shellcodePtr, len(patch))

ret = libc.mach_vm_protect(task, ctypes.c_ulong(patch_address), 4, False, 5)
print("protect", ret)

View File

@@ -1,220 +0,0 @@
// ZinIrRegBitPrintOutDebug_7u_
Task_ID: 0
header = 10*4 = 0x28
aneTD.Header[0].TID = 0
aneTD.Header[0].NID = 0
aneTD.Header[0].LNID = 1
aneTD.Header[0].EON = 1
aneTD.Header[1].ExeCycles = 0
aneTD.Header[1].NextSize = 0
aneTD.Header[2].LogEvents = 1058
aneTD.Header[3].Exceptions = 0
aneTD.Header[4].DebugLogEvents = 16775274
aneTD.Header[5].DebugExceptions = 0
aneTD.Header[6].DisallowAbort = 0
aneTD.Header[6].TDSkip = 0
aneTD.Header[6].KPC = 0
aneTD.Header[6].SPL = 0
aneTD.Header[6].TSR = 0
aneTD.Header[6].SPC = 0
aneTD.Header[6].DPC = 0
aneTD.Header[6].TSE = 0
aneTD.Header[6].NextPriority = 0
aneTD.Header[6].TDE = 0
aneTD.Header[6].SrcLoc = 1
aneTD.Header[6].DstLoc = 1
aneTD.Header[6].TQDis = 0
aneTD.Header[7].NextPointer = 0
aneTD.Header[8].RBase0 = 5
aneTD.Header[8].RBE0 = 1
aneTD.Header[8].RBase1 = 0
aneTD.Header[8].RBE1 = 0
aneTD.Header[8].WBase = 4
aneTD.Header[8].WBE = 1
aneTD.Header[8].TBase = 0
aneTD.Header[8].TBE = 0
aneTD.Header[8].ENE = 1
aneTD.Header[9].KBase0 = 1
aneTD.Header[9].KBE0 = 1
aneTD.Header[9].KBase1 = 0
aneTD.Header[9].KBE1 = 0
aneTD.Header[9].KBase2 = 0
aneTD.Header[9].KBE2 = 0
aneTD.Header[9].KBase3 = 0
aneTD.Header[9].KBE3 = 0
0x28 = 00 F8 01 F4 = 0x1F800
+0x30
aneRegs.KernelDMASrc.CoeffBaseAddr[0].Addr = 0
aneRegs.KernelDMASrc.CoeffBfrSize[0].MemBfrSize = 2
aneRegs.KernelDMASrc.CoeffDMAConfig[0].CacheHint = 2
aneRegs.KernelDMASrc.CoeffDMAConfig[0].CrH = 0
aneRegs.KernelDMASrc.CoeffDMAConfig[0].En = 1
aneRegs.KernelDMASrc.CoeffDMAConfig[0].PrefetchParticipateEn = 0
aneRegs.KernelDMASrc.CoeffBaseAddr[1].Addr = 0
aneRegs.KernelDMASrc.CoeffBfrSize[1].MemBfrSize = 1
aneRegs.KernelDMASrc.CoeffDMAConfig[1].CacheHint = 2
aneRegs.KernelDMASrc.CoeffDMAConfig[1].CrH = 0
aneRegs.KernelDMASrc.CoeffDMAConfig[1].En = 0
aneRegs.KernelDMASrc.CoeffDMAConfig[1].PrefetchParticipateEn = 0
aneRegs.KernelDMASrc.CoeffBaseAddr[2].Addr = 0
aneRegs.KernelDMASrc.CoeffBfrSize[2].MemBfrSize = 1
aneRegs.KernelDMASrc.CoeffDMAConfig[2].CacheHint = 2
aneRegs.KernelDMASrc.CoeffDMAConfig[2].CrH = 0
aneRegs.KernelDMASrc.CoeffDMAConfig[2].En = 0
aneRegs.KernelDMASrc.CoeffDMAConfig[2].PrefetchParticipateEn = 0
# there's 13 more of these
aneRegs.KernelDMASrc.Spare0.Spare = 0
aneRegs.KernelDMASrc.Spare1.Spare = 0
0x124 = 00 00 00 3C = 0
+0x1d4
aneRegs.Common.Cfg.AccDoubleBufEn = 1
aneRegs.Common.Cfg.ActiveNE = 0
aneRegs.Common.Cfg.ContextSwitchIn = 0
aneRegs.Common.Cfg.ContextSwitchOut = 0
aneRegs.Common.Cfg.ShMax = 1
aneRegs.Common.Cfg.ShMin = 0
aneRegs.Common.Cfg.ShPref = 1
aneRegs.Common.Cfg.SmallSourceMode = 0
aneRegs.Common.ChCfg.InFmt = 2
aneRegs.Common.ChCfg.OutFmt = 2
aneRegs.Common.Cin.Cin = 1
aneRegs.Common.ConvCfg.Kh = 1
aneRegs.Common.ConvCfg.Kw = 1
aneRegs.Common.ConvCfg.OCGSize = 0
aneRegs.Common.ConvCfg.Ox = 1
aneRegs.Common.ConvCfg.Oy = 1
aneRegs.Common.ConvCfg.Px = 0
aneRegs.Common.ConvCfg.Py = 0
aneRegs.Common.ConvCfg.Sx = 1
aneRegs.Common.ConvCfg.Sy = 1
aneRegs.Common.Cout.Cout = 1
aneRegs.Common.DPE.Category = 0
aneRegs.Common.GroupConvCfg.ElemMultMode = 0
aneRegs.Common.GroupConvCfg.NumGroups = 1
aneRegs.Common.GroupConvCfg.UnicastCin = 1
aneRegs.Common.GroupConvCfg.UnicastEn = 1
aneRegs.Common.InDim.Hin = 1
aneRegs.Common.InDim.Win = 77
aneRegs.Common.OutDim.Hout = 1
aneRegs.Common.OutDim.Wout = 77
aneRegs.Common.Spare0.Spare = 0
aneRegs.Common.Spare1.Spare = 0
aneRegs.Common.TaskInfo.NID = 1
aneRegs.Common.TaskInfo.TaskID = 0
aneRegs.Common.TaskInfo.TaskQ = 0
aneRegs.Common.TileCfg.TileHeight = 1
0x168 = 00 38 01 6C = 0x13800
+0x220
aneRegs.TileDMASrc.BaseAddr.Addr = 0
aneRegs.TileDMASrc.DMAConfig.CacheHint = 2
aneRegs.TileDMASrc.DMAConfig.CacheHintNoReuse = 12
aneRegs.TileDMASrc.DMAConfig.CacheHintReuse = 14
aneRegs.TileDMASrc.DMAConfig.CrH = 0
aneRegs.TileDMASrc.DMAConfig.DependencyMode = 0
aneRegs.TileDMASrc.DMAConfig.En = 1
aneRegs.TileDMASrc.Fmt.CmpVec = 0
aneRegs.TileDMASrc.DepthStride.Stride = 3
aneRegs.TileDMASrc.Fmt.FmtMode = 1
aneRegs.TileDMASrc.Fmt.Interleave = 1
aneRegs.TileDMASrc.Fmt.MemFmt = 2
aneRegs.TileDMASrc.Fmt.OffsetCh = 0
aneRegs.TileDMASrc.Fmt.Shift = 0
aneRegs.TileDMASrc.Fmt.Truncate = 3
aneRegs.TileDMASrc.GroupStride.Stride = 0
aneRegs.TileDMASrc.PixelOffset[0].Offset = 0
aneRegs.TileDMASrc.PixelOffset[1].Offset = 0
aneRegs.TileDMASrc.PixelOffset[2].Offset = 0
aneRegs.TileDMASrc.PixelOffset[3].Offset = 0
aneRegs.TileDMASrc.PlaneStride.PlaneStride = 3
aneRegs.TileDMASrc.RowStride.Stride = 3
aneRegs.TileDMASrc.Spare0.Spare = 0
aneRegs.TileDMASrc.Spare1.Spare = 0
0x1dc = 00 48 00 44 = 0x4800
+0x29c
aneRegs.L2.ResultBase.Addr = 10
aneRegs.L2.ResultCfg.AliasConvRslt = 0
aneRegs.L2.ResultCfg.AliasConvSrc = 0
aneRegs.L2.ResultCfg.AliasPlanarRslt = 0
aneRegs.L2.ResultCfg.AliasPlanarSrc = 0
aneRegs.L2.ResultCfg.ResultType = 2
aneRegs.L2.ResultCfg.DMACmpVec = 0
aneRegs.L2.ResultCfg.DMAFmt = 1
aneRegs.L2.ResultCfg.DMAInterleave = 1
aneRegs.L2.ResultCfg.DMAOffsetCh = 0
aneRegs.L2.ResultCfg.L2BfrMode = 1
aneRegs.L2.ConvResultChannelStride.Stride = 0
aneRegs.L2.ConvResultRowStride.Stride = 0
aneRegs.L2.L2Cfg.InputReLU = 0
aneRegs.L2.L2Cfg.PaddingMode = 0
aneRegs.L2.Spare0.Spare = 0
aneRegs.L2.Spare1.Spare = 0
aneRegs.L2.SourceBase.Addr = 0
aneRegs.L2.SourceCfg.AliasConvRslt = 0
aneRegs.L2.SourceCfg.AliasConvSrc = 0
aneRegs.L2.SourceCfg.AliasPlanarRslt = 0
aneRegs.L2.SourceCfg.AliasPlanarSrc = 0
aneRegs.L2.SourceCfg.DMACmpVec = 0
aneRegs.L2.SourceCfg.DMAFmt = 1
aneRegs.L2.SourceCfg.DMAInterleave = 1
aneRegs.L2.SourceCfg.DMAOffsetCh = 0
aneRegs.L2.SourceCfg.Dependent = 0
aneRegs.L2.SourceCfg.SourceType = 2
aneRegs.L2.SourceChannelStride.Stride = 10
aneRegs.L2.SourceRowStride.Stride = 10
0x228 = 00 88 00 0C = 0x8800
+0x2f0
0x23C = 00 C8 00 10 = 0xC800
+0x30c
aneRegs.NE.AccBias.AccBias = 0
aneRegs.NE.AccBias.AccBiasShift = 0
aneRegs.NE.KernelCfg.GroupKernelReuse = 0
aneRegs.NE.KernelCfg.KernelFmt = 0
aneRegs.NE.KernelCfg.PalettizedBits = 8
aneRegs.NE.KernelCfg.PalettizedEn = 0
aneRegs.NE.KernelCfg.SparseFmt = 0
aneRegs.NE.MACCfg.BiasMode = 0
aneRegs.NE.MACCfg.BinaryPoint = 0
aneRegs.NE.MACCfg.KernelMode = 1
aneRegs.NE.MACCfg.MatrixBiasEn = 0
aneRegs.NE.MACCfg.NonlinearMode = 2
aneRegs.NE.MACCfg.OpMode = 4
aneRegs.NE.MACCfg.PostScaleMode = 0
aneRegs.NE.MatrixVectorBias.MatrixVectorBias = 0
aneRegs.NE.PostScale.PostRightShift = 0
aneRegs.NE.PostScale.PostScale = 15360
aneRegs.NE.Spare0.Spare = 0
aneRegs.NE.Spare1.Spare = 0
0x254 = 00 78 01 18 = 0x17800
+0x32c
aneRegs.TileDMADst.BaseAddr.Addr = 0
aneRegs.TileDMADst.DepthStride.DepthStride = 3
aneRegs.TileDMADst.DMAConfig.BypassEOW = 0
aneRegs.TileDMADst.DMAConfig.CacheHint = 3
aneRegs.TileDMADst.DMAConfig.CrH = 0
aneRegs.TileDMADst.DMAConfig.En = 1
aneRegs.TileDMADst.DMAConfig.L2BfrMode = 1
aneRegs.TileDMADst.Fmt.CmpVec = 0
aneRegs.TileDMADst.Fmt.CmpVecFill = 0
aneRegs.TileDMADst.Fmt.FmtMode = 1
aneRegs.TileDMADst.Fmt.Interleave = 1
aneRegs.TileDMADst.Fmt.MemFmt = 2
aneRegs.TileDMADst.Fmt.OffsetCh = 0
aneRegs.TileDMADst.Fmt.Shift = 0
aneRegs.TileDMADst.Fmt.Truncate = 3
aneRegs.TileDMADst.Fmt.ZeroPadFirst = 1
aneRegs.TileDMADst.Fmt.ZeroPadLast = 1
aneRegs.TileDMADst.GroupStride.GroupStride = 0
aneRegs.TileDMADst.PlaneStride.PlaneStride = 3
aneRegs.TileDMADst.RowStride.RowStride = 3
aneRegs.TileDMADst.Spare0.Spare = 0
aneRegs.TileDMADst.Spare1.Spare = 0

View File

@@ -1 +0,0 @@
libane.dylib

View File

@@ -1,210 +0,0 @@
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sstream>
#import <IOSurface/IOSurfaceRef.h>
#import <Foundation/Foundation.h>
#import <CoreFoundation/CoreFoundation.h>
#include "h11ane.h"
using namespace H11ANE;
//#define DEBUG printf
#define DEBUG(x, ...)
extern "C" {
// global vars
H11ANEDevice *dev = NULL;
// Device-controller callback: stashes the discovered ANE device in the
// global `dev` so ANE_Open can use it. Always reports success.
int MyH11ANEDeviceControllerNotification(H11ANEDeviceController *param_1, void *param_2, H11ANEDevice *param_3) {
  DEBUG("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3);
  dev = param_3;
  return 0;
}
// Device message callback: log-only; messages are otherwise ignored.
int MyH11ANEDeviceMessageNotification(H11ANE::H11ANEDevice* dev, unsigned int param_1, void* param_2, void* param_3) {
  DEBUG("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3);
  return 0;
}
// Bring up the first H11ANE device: discover it via the controller callback,
// open it for compilation use, and power it on. Always returns 0; asserts
// (crashes) if no device was discovered.
int ANE_Open() {
  int ret;
  H11ANEDeviceController dc(MyH11ANEDeviceControllerNotification, NULL);
  dc.SetupDeviceController();
  // The callback above must have populated the global `dev` by now.
  assert(dev != NULL);
  dev->EnableDeviceMessages();

  // NOTE(review): layout of this 0x90-byte scratch buffer is unknown; the
  // private API appears to only need zeroed storage -- confirm.
  char empty[0x90] = {0};
  H11ANEDeviceInfoStruct dis = {0};
  ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty, UsageCompile, &dis);
  DEBUG("open 0x%x %p\n", ret, dev);

  ret = dev->ANE_PowerOn();
  DEBUG("power on: %d\n", ret);
  ret = dev->ANE_IsPowered();
  DEBUG("powered? %d\n", ret);
  return 0;
}
// Bytes per row for `width` float16 elements, rounded up to the 64-byte
// row alignment the IOSurface is created with.
int stride_for_width(int width) {
  int bytes = width * 2;
  int rem = bytes % 64;
  return (rem == 0) ? bytes : bytes + (64 - rem);
}
// Allocate a width x height, 2-bytes-per-element IOSurface the ANE can DMA
// to/from, lock it for CPU access, and return it as an opaque pointer.
// Pixel format 1278226536 is fourcc 'L00h' -- presumably single-channel
// half-float; confirm against IOSurface format docs.
void *ANE_TensorCreate(int width, int height) {
  // all float16
  // input buffer
  NSDictionary* dict = [NSDictionary dictionaryWithObjectsAndKeys:
    [NSNumber numberWithInt:width], kIOSurfaceWidth,
    [NSNumber numberWithInt:height], kIOSurfaceHeight,
    [NSNumber numberWithInt:2], kIOSurfaceBytesPerElement,
    [NSNumber numberWithInt:stride_for_width(width)], kIOSurfaceBytesPerRow,
    [NSNumber numberWithInt:1278226536], kIOSurfacePixelFormat,
    nil];
  IOSurfaceRef in_surf = IOSurfaceCreate((CFDictionaryRef)dict);
  // Locked once and never unlocked: the surface stays CPU-mapped for its
  // lifetime so ANE_TensorData can hand out its base address.
  IOSurfaceLock((IOSurfaceRef)in_surf, 0, nil);
  return (void *)in_surf;
}
// CPU-visible base address of a (locked) tensor surface created by
// ANE_TensorCreate; writes through it are visible to the ANE.
void* ANE_TensorData(void *out_surf) {
  void *ret = (void *)IOSurfaceGetBaseAddress((IOSurfaceRef)out_surf);
  //IOSurfaceUnlock((IOSurfaceRef)out_surf, 0, nil);
  DEBUG("TensorData %p -> %p\n", out_surf, ret);
  return ret;
}
// Copy a compiled hwx program into page-aligned memory, register it with the
// driver, and prepare it for execution. Returns the driver's program handle,
// or 0 if creation failed.
uint64_t ANE_Compile(char *iprog, int sz) {
  int ret;
  // Checksum is debug-logging only.
  int cksum = 0;
  for (int i = 0; i < sz; i++) cksum += iprog[i];
  DEBUG("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz);

  // NOTE(review): `prog` is never freed. Unclear whether the driver keeps
  // referencing these pages after ANE_ProgramCreate, so freeing may be
  // unsafe -- confirm before "fixing" this leak.
  char *prog = (char*)aligned_alloc(0x1000, sz);
  memcpy(prog, iprog, sz);

  H11ANEProgramCreateArgsStruct mprog = {0};
  mprog.program = prog;
  mprog.program_length = sz;

  H11ANEProgramCreateArgsStructOutput *out = new H11ANEProgramCreateArgsStructOutput;
  memset(out, 0, sizeof(H11ANEProgramCreateArgsStructOutput));
  ret = dev->ANE_ProgramCreate(&mprog, out);
  uint64_t program_handle = out->program_handle;
  delete out;
  DEBUG("program create: %lx %lx\n", ret, program_handle);
  // early failure
  if (ret != 0) return 0;

  // Magic flags value reverse engineered from traces -- meaning unknown.
  H11ANEProgramPrepareArgsStruct pas = {0};
  pas.program_handle = program_handle;
  pas.flags = 0x0000000100010001;
  ret = dev->ANE_ProgramPrepare(&pas);
  DEBUG("program prepare: %lx\n", ret);

  return program_handle;
}
// Submit one request for a prepared program (input/output and optional
// weights IOSurfaces) and block until the driver's completion message
// arrives on a fresh mach receive port. The args-struct offsets below were
// reverse engineered from traces. Always returns 0.
int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf, void *w_surf) {
  int ret;
  DEBUG("ANE_Run %p %p\n", in_surf, out_surf);

  H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct;
  memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct));

  // TODO: make real struct
  pras->args[0] = program_handle;
  pras->args[4] = 0x0000002100000003;

  // inputs
  int in_surf_id = IOSurfaceGetID((IOSurfaceRef)in_surf);
  int out_surf_id = IOSurfaceGetID((IOSurfaceRef)out_surf);
  // 0x28/8 looks like the input-buffer count/flags word: 1 input, or a
  // flagged 2 when a weights surface is also passed.
  if (w_surf != NULL) {
    pras->args[0x28/8] = 0x0000010000000002;
    int w_surf_id = IOSurfaceGetID((IOSurfaceRef)w_surf);
    pras->args[0x130/8] = (long long)w_surf_id;
  } else {
    pras->args[0x28/8] = 1;
  }
  // Surface IDs live in the high 32 bits of their slots.
  pras->args[0x128/8] = (long long)in_surf_id<<32LL;

  // outputs
  pras->args[0x528/8] = 1;
  // 0x628 = outputBufferSurfaceId
  pras->args[0x628/8] = (long long)out_surf_id<<32LL;

  // Per-request receive port for the async completion message.
  mach_port_t recvPort = 0;
  IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort);
  DEBUG("recv port: 0x%x\n", recvPort);

  // run program
  ret = dev->ANE_ProgramSendRequest(pras, recvPort);
  DEBUG("send 0x%x\n", ret);

  // Block until the driver posts completion.
  struct {
    mach_msg_header_t header;
    char data[256];
  } message;
  ret = mach_msg(&message.header,
    MACH_RCV_MSG,
    0, sizeof(message),
    recvPort,
    MACH_MSG_TIMEOUT_NONE,
    MACH_PORT_NULL);
  DEBUG("got message: %d sz %d\n", ret, message.header.msgh_size);

  delete pras;
  return 0;
}
// Private entry point in ANECompiler.framework (forward declaration).
int ANECCompile(CFDictionaryRef param_1, CFDictionaryRef param_2, unsigned long param_3);

// Compile a network plist (looked up relative to "./") into model.hwx using
// the private ANECCompile API; `debug` enables maximum compiler logging.
// NOTE(review): the CF objects created here are never CFRelease'd.
int ANE_CompilePlist(char *path, bool debug=false) {
  CFTypeRef ikeys[2];
  ikeys[0] = CFSTR("NetworkPlistName");
  ikeys[1] = CFSTR("NetworkPlistPath");

  CFTypeRef ivalues[2];
  ivalues[0] = CFStringCreateWithCString(kCFAllocatorDefault, path, kCFStringEncodingUTF8);
  ivalues[1] = CFSTR("./");

  CFDictionaryRef iDictionary = CFDictionaryCreate(kCFAllocatorDefault, ikeys, ivalues, 2, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
  CFArrayRef array = CFArrayCreate(kCFAllocatorDefault, (const void**)&iDictionary, 1, &kCFTypeArrayCallBacks);

  CFMutableDictionaryRef optionsDictionary = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
  CFMutableDictionaryRef flagsDictionary = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);

  // h11 (or anything?) works here too, and creates different outputs that don't run
  CFDictionaryAddValue(flagsDictionary, CFSTR("TargetArchitecture"), CFSTR("h13"));
  CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFileName"), CFSTR("model.hwx"));

  if (debug) {
    CFDictionaryAddValue(flagsDictionary, CFSTR("CompileANEProgramForDebugging"), kCFBooleanTrue);
    int debug_mask = 0x7fffffff;
    CFDictionaryAddValue(flagsDictionary, CFSTR("DebugMask"), CFNumberCreate(kCFAllocatorDefault, 3, &debug_mask));
  }

  return ANECCompile(optionsDictionary, flagsDictionary, 0);
}
/*void _Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE(
unsigned long param_1, void *param_2,int param_3, std::basic_ostream<char> *param_4);
char *ANE_RegDebug(int a1, void *dat, int a2) {
std::ostringstream ss;
_Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE(a1, dat, a2, &ss);
std::string cppstr = ss.str();
const char *str = cppstr.c_str();
char *ret = (char *)malloc(strlen(str)+1);
strcpy(ret, str);
return ret;
}*/
}

View File

@@ -1,222 +0,0 @@
#!/usr/bin/env python3
from pathlib import Path
from ctypes import *
import json
import collections
import numpy as np
import faulthandler
import struct
faulthandler.enable()
basedir = Path(__file__).resolve().parent
libane = None
aneregs = None
def init_libane():
  """Load libane.dylib, declare its ctypes signatures, and load the register
  bit-layout table (aneregs.json) used by ANE.pack/unpack."""
  global libane, aneregs
  libane = cdll.LoadLibrary((basedir / "libane.dylib").as_posix())

  libane.ANE_Compile.argtypes = [c_char_p, c_int]
  libane.ANE_Compile.restype = c_void_p
  libane.ANE_TensorCreate.restype = c_void_p
  libane.ANE_TensorData.argtypes = [c_void_p]
  libane.ANE_TensorData.restype = POINTER(c_uint16)
  libane.ANE_Run.argtypes = [c_void_p]*4
  libane.ANE_Run.restype = c_int
  #libane.ANE_RegDebug.restype = c_char_p

  with open(basedir / "aneregs.json") as f:
    aneregs = json.load(f)
ANE_Struct = [
# aneTD.Header
("u32", 0x1C, "NextCommandOffset"),
# KernelDMASrc @ section @ 0x2C len 0xF4
# reloc 0x2c-0x34?? = weights
# u32[16] 0x34-0x74 = 0x80 | 1 if used
# u32[16] 0x74-0xB4 = <channel data offset>
# u32[16] 0xB4-0xF4 = <channel data length>
# Common @ section @ 0x128 len 0x3C (conv)
("u16", 0x128, "InputWidth"),
("u16", 0x12A, "InputHeight"),
("u16", 0x12C, "InputDepth"),
("u32", 0x130, "InputOutputType"), # (OutputType * 0x10) | InputType
# UInt8 = 0, Int8 = 1, Float16 = 2
("u32", 0x134, "InputChannels"),
("u32", 0x138, "OutputChannels"),
("u16", 0x13C, "OutputWidth"),
("u16", 0x13E, "OutputHeight"),
("u16", 0x140, "OutputDepth"),
("u16", 0x144, "KernelSize"), # 0xa000 | (KernelHeight * 0x20) | KernelWidth
("u16", 0x146, "Padding"), # 0x5000 | (PadTop * 0x40) | (PadLeft * 2)
("u16", 0x14C, "BatchSize"),
# TileDMASrc @ section @ 0x16C len 0x6C (input)
# reloc 0x16c-0x174 = image
("u32", 0x178, "InputRowStride"),
("u32", 0x17C, "InputPlaneStride"),
("u32", 0x180, "InputDepthStride"),
("u32", 0x184, "InputBatchStride"),
("u8", 0x1A7, "InputInterleave"),
# L2 @ section @ 0x1E0 len 0x44
# [0x1ec, 0x1f0, 0x1f4, 0x1f8, 0x214] = number of engines
# [0x1f0, 0x1f4, 0x1f8, 0x214] = engines for inconv?
# [0x21c, 0x220, 0x224] = engines for outconv?
# NE @ section @ 0x22c len 0xC (scaling)
("u16", 0x230, "BiasScalar"),
("u16", 0x232, "ScaleScalar"),
# section @ 0x240 len 0x10
("u16", 0x246, "NeuronType"), # 0x10 = copy, 0x11 = ReLU, 0x12 = custom
("u32", 0x250, "PostScale"),
# TileDMADst @ section @ 0x258 len 0x18
# HandleTileDmaDstConfig
# 0x258 -- *(uint *)(this + 0x334) = *(uint *)(this + 0x334) & 0xfffffc3f | 0xc0;
# (GetCacheHintRegisterValue & 0xf) << 6;
("u32", 0x25C, "OutputOffset"), # offset into output buffer to write at?
# 0x260 -- *(uint *)(this + 0x33c) = *(uint *)(this + 0x33c) & 0x3f | (int)uVar10 << 6;
("u32", 0x260, "OutputRowStride"),
("u32", 0x264, "OutputPlaneStride"),
("u32", 0x268, "OutputDepthStride"),
("u32", 0x26C, "OutputBatchStride"),
# 0x270 -- *(uint *)(this + 0x34c) = *(uint *)(this + 0x34c) & 0xf0ffffff | 0x1000000;
# uVar6 = *(uint *)(this + 0x34c) & 0xffffcfcc | 0x2031;
# (ZinTensorDescriptorDmaInterleave & 0xf) << 0x18;
("u8", 0x273, "OutputInterleave"), # i also have this at 0x211?
]
# Name -> (struct format char, byte offset) lookup derived from ANE_Struct;
# used by ANE.filln to patch named fields in a hwx blob.
ANE_Struct_Dict = {
  nam: ({"u32": "I", "u16": "H", "u8": "B"}[typ], num)
  for typ, num, nam in ANE_Struct
}
class ANETensor:
  """A flat float16 tensor backed by an IOSurface the ANE can DMA to/from."""
  def __init__(self, *shape):
    self.shape = shape
    self.dtype = np.float16
    self.sz = int(np.prod(shape))
    # 0x4000-element cap -- presumably a single-surface/tile limit; confirm.
    assert(self.sz <= 0x4000)
    # Surface is allocated flat (sz x 1); shape is tracked only Python-side.
    self.tt = libane.ANE_TensorCreate(self.sz, 1)
    assert(self.tt is not None)

  def data(self):
    """numpy float16 view aliasing the surface memory; writes through it are
    visible to the ANE (no copy)."""
    data = libane.ANE_TensorData(self.tt)
    assert(data is not None)
    #print(hex(addressof(data.contents)))
    buf = np.ctypeslib.as_array(data, shape=(self.sz,))
    ret = np.frombuffer(buf, dtype=self.dtype)
    #print(ret.data)
    return ret
class ANE:
  """Python driver wrapper: compile/run hwx programs and pack/unpack the
  reverse-engineered register blob at 0x4000..0x4300 of a hwx file."""
  def __init__(self):
    init_libane()
    libane.ANE_Open()

  def compile(self, dat):
    """Register a hwx blob with the driver; returns an opaque program handle."""
    ret = libane.ANE_Compile(create_string_buffer(dat), len(dat))
    assert(ret is not None)
    return ret

  def run(self, prog, tin, tout, tweights=None):
    """Run a compiled program with input/output (and optional weights) ANETensors."""
    libane.ANE_Run(prog, tin.tt, tout.tt, tweights.tt if tweights is not None else 0)

  def tensor(self, shape):
    # NOTE(review): passes the shape tuple as a single positional arg, so
    # ANETensor.shape becomes (shape,); np.prod still gives the right size.
    return ANETensor(shape)

  def unpack(self, dat):
    """Decode a register blob into {name: value} using aneregs.json, whose
    entries are (name, [byte_offset, bit_offset, bit_size])."""
    dat = struct.unpack("Q"*(len(dat)//8), dat)
    ret = {}
    for k,v in aneregs:
      by,bi,sz = v
      # Convert byte+bit position into a qword index plus bit offset.
      bi += (by%8)*8
      by //= 8
      rv = (dat[by] >> bi) & ((1 << sz)-1)
      ret[k] = rv
    return ret

  def pack(self, pk, dat):
    """Inverse of unpack: write {name: value} fields back into the blob."""
    dat = list(struct.unpack("Q"*(len(dat)//8), dat))
    for k,v in aneregs:
      by,bi,sz = v
      bi += (by%8)*8
      by //= 8
      # Clear the field's bits, then OR in the new value.
      dat[by] &= ~(((1 << sz)-1) << bi)
      dat[by] |= pk[k] << bi
    dat = struct.pack("Q"*len(dat), *dat)
    return dat

  def debug(self, dat, mems=0):
    """Pretty-decode a register blob via libane's ANE_RegDebug by re-laying
    the packed sections out at the offsets the debug printer expects."""
    # Section offsets and lengths in the printer's expected layout.
    add = [0x30, 0x1d4, 0x220, 0x29c, 0x2f0, 0x30c, 0x32c]
    lens = [244, 60, 108, 68, 12, 16, 24]
    ptr = 0x2b
    ddat = dat[0:0x28]
    for a, pm in zip(add, lens):
      #assert pm == dat[ptr]
      # Zero-pad up to the section start, then copy the section body.
      ddat += b"\x00" * (a-len(ddat))
      ddat += dat[ptr+1:ptr+1+pm+4]
      ptr += pm+8
    ddat += b"\x00" * 0x100
    # Parse "name = value" lines out of the decoder's text output.
    ret = collections.OrderedDict()
    for ln in libane.ANE_RegDebug(0, create_string_buffer(ddat), mems).decode('utf-8').strip().split("\n"):
      lnn = ln.split(" = ")
      if len(lnn) == 2:
        ret[lnn[0]] = int(lnn[1])
    return ret

  def filln(self, dat, nvdict, base=0x4000):
    """Patch named fields (see ANE_Struct) into a mutable hwx byte list."""
    for n,v in nvdict.items():
      styp, num = ANE_Struct_Dict[n]
      dat = self.fill(dat, [num], styp, v)
    return dat

  def fill(self, dat, addrs, type, val, base=0x4000):
    """Write struct-packed `val` at each base+addr offset of the byte list."""
    x = struct.pack(type, val)
    for a in addrs:
      dat[base+a:base+a+len(x)] = x
    return dat
if __name__ == "__main__":
ane = ANE()
tin = ANETensor(16)
tout = ANETensor(16)
tind = tin.data()
toutd = tout.data()
tind[0:4] = [-1,1,-2,2]
print("** before **")
print(tind)
print(toutd)
dat = open("../ops/relu.hwx", "rb").read()
md = dat[0x4000:0x4300]
dd = ane.unpack(md)
mdf = ane.pack(dd, md)
assert(md == mdf)
comp = ane.compile(dat)
ret = ane.run(comp, tin, tout)
print("** after **")
print(tind)
print(toutd)

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +0,0 @@
#!/bin/bash
clang++ ane.mm --shared -F /System/Library/PrivateFrameworks/ -framework ANEServices -framework IOSurface -framework Foundation -framework IOKit -framework ANECompiler -o libane.dylib

View File

@@ -1 +0,0 @@
../3_run/entitlements.xml

View File

@@ -1 +0,0 @@
../3_run/h11ane.h

View File

@@ -1,3 +0,0 @@
#!/bin/bash
codesign --force --entitlements entitlements.xml -s "Taylor Swift Child" /opt/homebrew/Cellar/python@3.9/3.9.1_1/Frameworks/Python.framework/Versions/3.9/Resources/Python.app/Contents/MacOS/Python

View File

@@ -1,91 +0,0 @@
#!/usr/bin/env python3
import time
from ane import ANE, ANETensor
def benchmark(ane):
  """Time 1000 runs of the prebuilt gemm.hwx program and print gigaops/sec."""
  tin = ANETensor(512*0x20)
  tout = ANETensor(512*0x20)
  dat = open("../ops/gemm.hwx", "rb").read()
  # Dump the decoded register state of the program being benchmarked.
  for k,v in ane.debug(dat[0x4000:0x4300], 16).items():
    print(k,v)
  comp = ane.compile(dat)
  st = time.time()
  for i in range(1000):
    ret = ane.run(comp, tin, tout)
  et = time.time()
  ts = (et-st)
  # NOTE(review): assumes each run is a 512x512 GEMM (2 ops per MAC) --
  # confirm against what gemm.hwx actually computes.
  ops = 1000*512*512*2
  print("%.2f ms, %.2f gigaops/sec" % (ts*1000, ops*1e-9/ts))
if __name__ == "__main__":
ane = ANE()
# 0x20 per row
tin = ANETensor(0x60)
tout = ANETensor(0x60)
tw = ANETensor(0x60)
tind = tin.data()
toutd = tout.data()
twd = tw.data()
#tind[0:4] = [-1,1,-2,2]
tind[0] = 1
tind[0x20] = -2
tind[0x40] = 3
# toutd[0] = \
# tind[0] * twd[0] + \
# tind[0x20] + twd[1] + \
# tind[0x40] + twd[2]
twd[0] = 4
twd[1] = 0x100
twd[0x20] = 5
twd[0x21] = 5
twd[0x22] = 5
twd[0x40] = 12
print("** before **")
print(tind)
print(toutd)
#benchmark(ane)
#exit(0)
"""
dat = list(open("../ops/sum.hwx", "rb").read())
dat = bytes(dat)
for k,v in ane.debug(dat[0x4000:0x4300], 16).items():
print(k,v)
comp = ane.compile(dat)
ret = ane.run(comp, tin, tout, tw)
"""
datb = open("../ops/sum.hwx", "rb").read()
dat = open("../ops/conv.hwx", "rb").read()
dd = ane.unpack(dat[0x4000:0x4300])
# use the 3rd arg as the weights
dd["aneTD.Header[9].KBase0"] = 6
dd["aneRegs.NE.PostScale.PostScale"] = 0x3c00
#dd["aneRegs.L2.L2Cfg.InputReLU"] = 1
#dd["aneRegs.NE.MACCfg.NonlinearMode"] = 1
#dd["aneRegs.TileDMADst.Fmt.MemFmt"] = 0
#dd["aneRegs.L2.ResultBase.Addr"] = 0
#dd["aneRegs.Common.ChCfg.InFmt"] = 1
#dd["aneRegs.TileDMADst.Fmt.ZeroPadFirst"] = 0
#dd["aneRegs.TileDMADst.DMAConfig.En"] = 0
for k,v in dd.items():
print(k,v)
dat = datb[:0x4000] + ane.pack(dd, dat[0x4000:0x4300]) + datb[0x4300:]
comp = ane.compile(dat)
ret = ane.run(comp, tin, tout, tw)
print("** after **")
print(tind)
print(toutd)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,39 +0,0 @@
from functools import lru_cache
from .tensor import Device, Function, register
@lru_cache
def compile_wrapper(ane, dat):
  # dat must be bytes (hashable) so lru_cache can key on it; identical hwx
  # blobs are only compiled once per ane instance.
  return ane.compile(dat)
def roundup(x, v):
  """Round x up to the next multiple of v (unchanged if already aligned)."""
  return (x + v - 1) // v * v
@lru_cache
def compile_relu(ane, sz):
  """Patch the relu.hwx template for an sz-element float16 tensor and
  compile it; cached per (ane, sz)."""
  dat = list(open("accel/ane/ops/relu.hwx", "rb").read())
  # TODO: make this all nice and once
  # number of engines? (max 0x100)
  l2_stride = max(0x100, roundup(sz*2, 0x10))
  # 0x1ec = L2.SourceChannelStride.Stride, 0x1f0 = L2.SourceRowStride.Stride
  # 0x1f4, 0x1f8?
  # 0x214 = L2.ResultBase.Addr
  dat = ane.fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8, 0x214], "I", l2_stride)
  # DMA strides: one row of sz float16s, rounded to 64-byte alignment.
  stride = roundup(sz*2, 0x40)
  dat = ane.filln(dat, {
    "NeuronType": 0x11, # 0x10 makes this a copy, 0x11 = ReLU, 0x12 = crash
    "InputWidth": sz, "OutputWidth": sz,
    "InputRowStride": stride, "InputPlaneStride": stride, "InputDepthStride": stride,
    "OutputRowStride": stride, "OutputPlaneStride": stride, "OutputDepthStride": stride,
  })
  return compile_wrapper(ane, bytes(dat))
class ReLU(Function):
  """tinygrad Function running elementwise ReLU on the Apple Neural Engine."""
  def forward(ctx, input):
    # Allocate an output ANETensor of the same shape and run the patched
    # relu.hwx program on the device.
    ret = ctx.ane.tensor(input.shape)
    ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret)
    return ret

  def backward(ctx, grad_output):
    # FIXME(review): this is not the ReLU gradient (grad_output masked where
    # input > 0) -- returning 0 kills all gradients through this op.
    return 0

register('relu', ReLU, device=Device.ANE)

View File

@@ -1 +0,0 @@
a.out

View File

@@ -1,2 +0,0 @@
source /opt/intel/oneapi/compiler/latest/env/vars.sh
sycl-ls

View File

@@ -1,57 +0,0 @@
import time
onnx_path = "/tmp/my.onnx"
N = 2048
CNT = 400
"""
import torch
import torch.nn as nn
#dtype = torch.bfloat16
dtype = torch.float32
class MatMul(nn.Module):
def __init__(self):
super().__init__()
self.a = nn.Linear(N, N, bias=False)
def forward(self, x):
x = x.to(dtype)
for i in range(CNT): x = self.a(x).relu()
return x.to(torch.float32)
torch_model = MatMul().to(dtype)
torch.onnx.export(torch_model, torch.randn(N, N), onnx_path)
"""
"""
import onnx
from tinygrad.tensor import Tensor
from extra.onnx import get_run_onnx
out = get_run_onnx(onnx.load(onnx_path))({"onnx::MatMul_0": Tensor.zeros(N, N)})
for x in out.values(): x.realize()
"""
from openvino.runtime import Core

# Enumerate OpenVINO devices so we can see what 'GPU.0' maps to.
core = Core()
devices = core.available_devices
for device in devices:
  device_name = core.get_property(device, "FULL_DEVICE_NAME")
  print(f"{device}: {device_name}")

# Compile the exported ONNX matmul chain for the first GPU and fill every
# input tensor with the constant 2.
model = core.read_model(onnx_path)
compiled_model = core.compile_model(model, device_name='GPU.0')
print(compiled_model)
ireq = compiled_model.create_infer_request()
for model_input in compiled_model.inputs:
  tensor = ireq.get_tensor(model_input)
  tensor.data[:] = 2
  print(tensor)
print("request")

# Two warm-up inferences before timing.
ireq.infer()
ireq.infer()
print("did one")

REPS = 20
st = time.perf_counter()
for i in range(REPS): ireq.infer()
et = time.perf_counter() - st
# CNT chained NxN matmuls per inference, 2 flops per MAC.
print(f"{et*1000:.2f} ms {(CNT*N*N*N*REPS*2/et)*1e-9:.2f} GFLOPS")

View File

@@ -1,3 +0,0 @@
#!/bin/bash -e
/opt/intel/oneapi/compiler/latest/linux/bin-llvm/clang++ joint_matrix_bfloat16.cpp -fsycl
SYCL_PI_TRACE=1 ./a.out

View File

@@ -1,173 +0,0 @@
//==-------- joint_matrix_bfloat16.cpp - DPC++ joint_matrix----------- ----==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix
// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
#include <cstdint>
#include <cstring>
#include <iostream>

#include <sycl/sycl.hpp>
using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;
using bfloat16 = sycl::ext::oneapi::bfloat16;
//#define SG_SZ 16
#define SG_SZ 8
#define TM 8
#define TN SG_SZ
//#define TK 16
#define TK 16
#define BF16_EPSILON 0.00781250
// Minimal non-owning matrix handle: a typed pointer plus compile-time
// dimensions carried in the template parameters (never dereferenced here).
template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
private:
  T *mat;

public:
  T *get_data() { return mat; }
  void set_data(T *data) { mat = data; }
  big_matrix(T *data) : mat(data) {}
};
// C (MxN fp32) += A (MxK bf16) * B (KxN bf16, pre-packed VNNI as K/2 x 2N),
// using joint_matrix tiles of TM x TN and accumulating over K in TK steps.
// One subgroup of SG_SZ work-items computes one output tile.
template <typename T1, typename T2, size_t M, size_t N, size_t K>
void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A, big_matrix<T2, K / 2, N * 2> &B) {
  size_t NDRangeM = M / TM;
  size_t NDRangeN = N / TN;
  buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, K));
  buffer<bfloat16, 2> bufB(B.get_data(), range<2>(K, N));
  buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));

  auto program = [&](handler &cgh) {
    auto accC = bufC.get_access<access::mode::read_write>(cgh);
    auto accA = bufA.get_access<access::mode::read_write>(cgh);
    auto accB = bufB.get_access<access::mode::read_write>(cgh);

    cgh.parallel_for<class imatrix>(
      nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
      [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
      {
        // The submatrix API has to be accessed by all the workitems in a
        // subgroup these functions will be called once by the subgroup no
        // code divergence between the workitems
        const auto global_idx = spmd_item.get_global_id(0);
        const auto global_idy = spmd_item.get_global_id(1);
        const auto sg_startx = global_idx - spmd_item.get_local_id(0);
        const auto sg_starty = global_idy - spmd_item.get_local_id(1);

        sub_group sg = spmd_item.get_sub_group();
        joint_matrix<sub_group, bfloat16, use::a, TM, TK, layout::row_major> sub_a;
        // For B, we assume B has been already VNNIed.
        joint_matrix<sub_group, bfloat16, use::b, TK, TN, ext::intel::experimental::matrix::layout::packed> sub_b;
        joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;

        // Load the C tile once, accumulate over the K dimension, store once.
        joint_matrix_load(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major);
        for (int k = 0; k < K / TK; k += 1) { //
          joint_matrix_load(sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, K);
          joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, N * 2);
          sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
        }
        joint_matrix_store(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major);
      }); // parallel for
  };

  // Time kernel submission separately from completion.
  queue q;
  auto start = std::chrono::steady_clock::now();
  auto e = q.submit(program);
  auto submit = std::chrono::steady_clock::now();
  e.wait();
  auto end = std::chrono::steady_clock::now();
  std::cout << "submit: " << std::chrono::duration_cast<std::chrono::milliseconds>(submit - start).count() << " ms" << std::endl;
  std::cout << "compute: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - submit).count() << " ms" << std::endl;
  // ahh, freeing is slow
}
//#define SCALE 1024
//#define SCALE 64
#define SCALE 256
static constexpr size_t MATRIX_M = TM * SCALE;
static constexpr size_t MATRIX_N = TN * SCALE;
static constexpr size_t MATRIX_K = TK * SCALE;
bfloat16 A[MATRIX_M][MATRIX_K];
bfloat16 B[MATRIX_K / 2][MATRIX_N * 2];
float C[MATRIX_M][MATRIX_N];
float D[MATRIX_M][MATRIX_N];
// Widen a bfloat16 to float: bfloat16 is exactly the upper 16 bits of an
// IEEE-754 binary32, so place its raw bits in the high half of a uint32 and
// reinterpret.
float make_fp32(bfloat16 x) {
  // The original read 4 bytes through an int* from a 2-byte object --
  // a strict-aliasing violation and an out-of-bounds read that only worked
  // by accident on little-endian. memcpy exactly the 16 valid bits instead.
  uint16_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  uint32_t y = static_cast<uint32_t>(bits) << 16;
  float res;
  std::memcpy(&res, &y, sizeof(res));
  return res;
}
// Scalar reference GEMM over the same VNNI-packed data: each int in A_mem
// and B_mem holds a pair of adjacent-K bfloat16s, so the inner i-loop
// accumulates both halves of the pair into the fp32 C element.
void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N,
                         int K) {
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++) {
      for (int k = 0; k < K; k++) {
        // Because B was assumed VNNIed
        bfloat16 *va = (bfloat16 *)(A_mem + m * K + k);
        bfloat16 *vb = (bfloat16 *)(B_mem + k * N + n);
        float acc = *((float *)(C_mem + m * N + n));
        for (int i = 0; i < 2; i++) {
          acc += (make_fp32(va[i]) * make_fp32(vb[i]));
        }
        *((float *)(C_mem + m * N + n)) = acc;
      }
    }
}
int main() {
  // Initialize A[i][j] = i + j and B (already in VNNI layout) = 2i + 3j,
  // with C and D seeded to 1 so the accumulate path is exercised.
  for (int i = 0; i < MATRIX_M; i++) {
    for (int j = 0; j < MATRIX_K; j++) {
      A[i][j] = bfloat16(1.0f * (i + j));
    }
  }
  for (int i = 0; i < MATRIX_K / 2; i++) {
    for (int j = 0; j < MATRIX_N * 2; j++) {
      B[i][j] = bfloat16(2.0f * i + 3.0f * j);
    }
  }
  for (int i = 0; i < MATRIX_M; i++) {
    for (int j = 0; j < MATRIX_N; j++) {
      C[i][j] = 1.0;
      D[i][j] = 1.0;
    }
  }

  std::cout << "M" << MATRIX_M << "N" << MATRIX_N << "K" << MATRIX_K << std::endl;
  big_matrix<float, MATRIX_M, MATRIX_N> MC((float *)&C);
  big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
  big_matrix<bfloat16, MATRIX_M, MATRIX_K> MA((bfloat16 *)&A);
  big_matrix<bfloat16, MATRIX_K / 2, MATRIX_N * 2> MB((bfloat16 *)&B);
  matrix_multiply(MC, MA, MB);

  // Scalar reference verification disabled -- it is very slow at SCALE=256.
  /*start = std::chrono::steady_clock::now();
  matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, MATRIX_N, MATRIX_K / 2);
  end = std::chrono::steady_clock::now();
  std::cout << "Elapsed time in milliseconds (reference): " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
  bool res = true;
  for (int i = 0; i < MATRIX_M; i++) {
    for (int j = 0; j < MATRIX_N; j++) {
      if ((fabs(C[i][j]) - fabs(D[i][j])) > BF16_EPSILON)
        res = false;
    }
  }
  std::cout << (res ? "passed" : "failed") << std::endl;
  return !res;*/
  return 0;
}

View File

@@ -1,127 +0,0 @@
Google's TPU
--------------------------------------------------------------------
We document the Google TPU v2/v3 in order to support it in tinygrad without the XLA compiler.
## Creating a Google Cloud TPU VM
This costs $4.50/hr for a TPUv2-8 machine, the cheapest VM.
```bash
gcloud alpha compute tpus tpu-vm create test --zone=us-central1-b --accelerator-type=v2-8 --version=v2-alpha
gcloud alpha compute tpus tpu-vm ssh test --zone us-central1-b
# and for when you are done
gcloud alpha compute tpus tpu-vm delete test --zone us-central1-b
gcloud alpha compute tpus tpu-vm list --zone us-central1-b
```
Aside from the usual VM stuff, there are 4 accelerators on the PCI-E bus. (v2-8 is 4 chips with 2 cores each)
```
# lspci
00:04.0 Unassigned class [ff00]: Google, Inc. Device 0027
00:05.0 Unassigned class [ff00]: Google, Inc. Device 0027
00:06.0 Unassigned class [ff00]: Google, Inc. Device 0027
00:07.0 Unassigned class [ff00]: Google, Inc. Device 0027
```
They show up in `/sys/class/accel` (tons of files here) and the driver lives in `/lib/libtpu.so`. The devices are in `/dev/accel[0-3]`, and a bunch of stuff is mmaped. They are "ba16c7433" chips.
We grab the minimal TPU [example from TensorFlow](https://github.com/tensorflow/tensorflow/blob/695b4c93d5da7277eb845937b79b66f9f363ed94/tensorflow/compiler/xla/python/tpu_driver/client/libtpu_client.c). When the compiler runs, it produces tons of great logs in `/tmp/tpu_logs`
```bash
cd tfexample
gcc -o libtpu_client libtpu_client.c -ltpu
TPU_VLOG_LEVEL=99 ./libtpu_client
```
From these logs, we find the "LLO Instructions"
## VLIW Instruction (322b VLIW bundle)
```
spare : 0 (0,1)
vex_mxu : 0 (1,1)
* 1 misc slot
msc_targ : 0 (2,3)
msc_opnd : 0 (5,3)
msc_op : 0 (8,5)
msc_pred : 31 (13,5)
* 2 matrix slots (push, pop)
vres_dest : 28 (18,2)
vres_op : 28 (20,2)
vres_pred : 31 (22,5)
vex_source : 28 (27,2)
vex_subop : 24 (29,3)
vex_op : 24 (32,3)
vex_pred : 31 (35,5)
* 4 vector slots (2 for load/store)
vld_ttu : 30 (40,1)
vld_stride : 24 (41,3)
vld_offset : 24 (44,2)
vld_base : 24 (46,2)
vld_submsk : 24 (48,3)
vld_dest : 0 (51,5)
vld_op : 0 (56,2)
vld_pred : 31 (58,5)
vst_ttu : 30 (63,1)
vst_iar : 30 (64,1)
vst_value_two : 24 (65,3)
vst_offset : 24 (68,2)
vst_base : 24 (70,2)
vst_value_one : 24 (72,3)
vst_source : 0 (75,5)
vst_op : 0 (80,5)
vst_pred : 31 (85,5)
* 4 vector slots (2 for ALU)
v1_dest : 0 (90,5)
v1_y_vreg : 0 (95,5)
v1_y_src : 0 (100,5)
v1_x : 0 (105,5)
v1_op : 0 (110,6)
v1_pred : 31 (116,5)
v0_dest : 0 (121,5)
v0_y_vreg : 0 (126,5)
v0_y_src : 0 (131,5)
v0_x : 0 (136,5)
v0_op : 0 (141,6)
v0_pred : 31 (147,5)
* 3 scalar registers copied in to the vector units?
vs2 : 0 (152,5)
vs1 : 0 (157,5)
vs0 : 0 (162,5)
* 6 immediates (16-bit each, two can be merged for 32)
imm_5 : 0 (167,16)
imm_4 : 0 (183,16)
imm_3 : 0 (199,16)
imm_2 : 0 (215,16)
imm_1 : 0 (231,16)
imm_0 : 0 (247,16)
* ttu? what's a ttu?
ttu_set_btr : 0 (263,1)
ttu_iterate : 0 (264,1)
ttu_row : 0 (265,3)
* 2 scalar slots
s1_dest : 0 (268,5)
s1_y : 0 (273,6)
s1_x : 0 (279,5)
s1_op : 0 (284,6)
s1_pred : 31 (290,5)
s0_dest : 0 (295,5)
s0_y : 0 (300,6)
s0_x : 0 (306,5)
s0_op : 0 (311,6)
s0_pred : 15 (317,5)
```
## Running a Program (WIP)
Our goal is to run a program on TPU without the driver.
```
...
openat(AT_FDCWD, "/dev/accel3", O_RDWR) = 184
mmap(NULL, 27799736, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_LOCKED, 184, 0) = 0x7f59a74b3000
# size is 0x1a830b8, aka 28MB
```

View File

@@ -1,303 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTPU_H_
#define TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTPU_H_
#include <stdbool.h>
#include <stdint.h>
#define TPUDRIVER_CAPI_EXPORT __attribute__((visibility("default")))
#ifdef __cplusplus
extern "C" {
#endif
// ------------------- TPU Driver Support -----------------------
struct TpuDriverFn;
typedef struct TpuDriver TpuDriver;
typedef struct TpuEvent TpuEvent;
typedef struct TpuBufferHandleInternal TpuBufferHandleInternal;
typedef struct TpuCompiledProgramHandleInternal
TpuCompiledProgramHandleInternal;
typedef struct TpuLoadedProgramHandleInternal TpuLoadedProgramHandleInternal;
/* Handle to a device buffer; allocation completion is signaled via `event`. */
typedef struct TpuBufferHandle {
  TpuBufferHandleInternal* internal_handle;
  TpuEvent* event;
  int64_t size_in_bytes;
} TpuBufferHandle;
/* Result of compiling an HLO program (not yet loaded onto a core). */
typedef struct TpuCompiledProgramHandle {
  TpuCompiledProgramHandleInternal* internal_handle;
  TpuEvent* event;
} TpuCompiledProgramHandle;
/* A compiled program that has been loaded onto a specific TPU core. */
typedef struct TpuLoadedProgramHandle {
  TpuLoadedProgramHandleInternal* internal_handle;
  TpuEvent* event;
} TpuLoadedProgramHandle;
// HloProto is a serialized xla::HloProto buffer.
typedef struct HloProto {
  void* buffer;
  int32_t size;
} HloProto;
// DeviceAssignment is a serialized xla::DeviceAssignmentProto buffer.
typedef struct DeviceAssignment {
  void* bytes;
  int32_t size;
} DeviceAssignment;
/* Status report; a nonzero `code` indicates failure (see msg for details). */
typedef struct TpuStatus {
  int32_t code;
  char* msg;
} TpuStatus;
/* Serialized program shape plus the status of the query that produced it. */
typedef struct CompiledProgramShape {
  struct TpuStatus* status;
  void* bytes;
  int32_t size;
} CompiledProgramShape;
/* Opaque serialized allocation-shape buffer. */
typedef struct TpuAllocationShape {
  void* bytes;
  int32_t size;
} TpuAllocationShape;
/* Opaque serialized system-information buffer (from QuerySystemInfo). */
typedef struct TpuSystemInfo {
  void* bytes;
  int32_t size;
} TpuSystemInfo;
typedef void(PrototypeTpuDriver_Initialize)(struct TpuDriverFn* driver_fn,
bool initialize);
typedef struct TpuDriver*(PrototypeTpuDriver_Open)(const char* worker);
typedef void(PrototypeTpuDriver_Close)(struct TpuDriver* driver);
typedef struct TpuStatus*(PrototypeTpuDriver_Reset)(struct TpuDriver* driver);
typedef struct TpuSystemInfo*(PrototypeTpuDriver_QuerySystemInfo)(
struct TpuDriver* driver);
typedef void(PrototypeTpuDriver_FreeSystemInfo)(struct TpuSystemInfo* info);
// TODO(frankchn): Make this not a hard-coded constant.
const int32_t MemoryRegion_HBM = 1;
typedef int64_t(PrototypeTpuDriver_ComputeLinearizedBytesFromShape)(
struct TpuDriver* driver, const struct TpuAllocationShape shape);
typedef struct TpuStatus*(PrototypeTpuDriver_LinearizeShape)(
struct TpuDriver* driver, void* dst, const void* src,
const struct TpuAllocationShape shape);
typedef struct TpuStatus*(PrototypeTpuDriver_DelinearizeShape)(
struct TpuDriver* driver, void* dst, const void* src,
const struct TpuAllocationShape shape);
typedef struct TpuCompiledProgramHandle*(PrototypeTpuDriver_CompileProgram)(
struct TpuDriver* driver, const struct HloProto hlo_proto,
int32_t num_replicas, int32_t eventc, struct TpuEvent** eventv);
typedef struct TpuCompiledProgramHandle*(
PrototypeTpuDriver_CompileProgramFromText)(struct TpuDriver* driver,
const char* hlo_text,
int32_t num_replicas,
int32_t eventc,
struct TpuEvent** eventv);
/* Note: We are not responsible for freeing the event within the
* TpuCompiledProgramHandle. You have to call FreeEvent separately to ensure
* that memory does not leak.
*/
typedef void(PrototypeTpuDriver_FreeCompiledProgramHandle)(
struct TpuCompiledProgramHandle* handle);
typedef struct TpuLoadedProgramHandle*(PrototypeTpuDriver_LoadProgram)(
struct TpuDriver* driver, int32_t core_id,
const struct TpuCompiledProgramHandle* compiled_program_handle,
int32_t eventc, struct TpuEvent** eventv);
/* Note: We are not responsible for freeing the event within the
* TpuLoadedProgramHandle. You have to call FreeEvent separately to ensure that
* memory does not leak.
*/
typedef struct TpuEvent*(PrototypeTpuDriver_UnloadProgram)(
struct TpuDriver* driver,
struct TpuLoadedProgramHandle* loaded_program_handle, int32_t eventc,
struct TpuEvent** eventv);
typedef struct TpuEvent*(PrototypeTpuDriver_ExecuteProgram)(
struct TpuDriver* driver, struct TpuLoadedProgramHandle* handle,
int32_t inputc, struct TpuBufferHandle** input_buffer_handle,
int32_t outputc, struct TpuBufferHandle** output_buffer_handle,
struct DeviceAssignment device_assignment, int32_t eventc,
struct TpuEvent** eventv);
typedef struct TpuBufferHandle*(PrototypeTpuDriver_AllocateTuple)(
struct TpuDriver* driver, int32_t core_id, int32_t memory_region,
int32_t bufferc, struct TpuBufferHandle** buffer_handle, int32_t eventc,
struct TpuEvent** eventv);
typedef struct TpuBufferHandle*(PrototypeTpuDriver_Allocate)(
struct TpuDriver* driver, int32_t core_id, int32_t memory_region,
int64_t num_bytes, int32_t eventc, struct TpuEvent** eventv);
typedef struct TpuBufferHandle*(PrototypeTpuDriver_AllocateShape)(
struct TpuDriver* driver, int32_t core_id, int32_t memory_region,
const struct TpuAllocationShape shape, int32_t eventc,
struct TpuEvent** eventv);
/* Note: We are not responsible for freeing the event within the
* TpuBufferHandle. You have to call FreeEvent separately to ensure that memory
* does not leak.
*/
typedef struct TpuEvent*(PrototypeTpuDriver_Deallocate)(
struct TpuDriver* driver, struct TpuBufferHandle* buffer_handle,
int32_t eventc, struct TpuEvent** eventv);
typedef struct TpuEvent*(PrototypeTpuDriver_TransferToDevice)(
struct TpuDriver* driver, const void* src, struct TpuBufferHandle* dst,
int32_t eventc, struct TpuEvent** eventv);
typedef struct TpuEvent*(PrototypeTpuDriver_TransferFromDevice)(
struct TpuDriver* driver, struct TpuBufferHandle* src, void* dst,
int32_t eventc, struct TpuEvent** eventv);
typedef struct TpuEvent*(PrototypeTpuDriver_TransferFromDeviceToDevice)(
struct TpuDriver* driver, struct TpuBufferHandle* src,
struct TpuBufferHandle* dst, int32_t eventc, struct TpuEvent** eventv);
typedef struct CompiledProgramShape*(
PrototypeTpuDriver_GetCompiledProgramShape)(
struct TpuCompiledProgramHandle* handle);
typedef void(PrototypeTpuDriver_FreeCompiledProgramShape)(
struct CompiledProgramShape* shape);
typedef void(PrototypeTpuDriver_EventAddCallback)(
struct TpuEvent* event,
void (*callback_fn)(struct TpuStatus*, void* additional_info),
void* additional_info);
typedef struct TpuStatus*(PrototypeTpuDriver_EventAwait)(struct TpuEvent* event,
int64_t timeout_in_us);
typedef void(PrototypeTpuDriver_FreeEvent)(struct TpuEvent* event);
typedef void(PrototypeTpuDriver_FreeStatus)(struct TpuStatus* status);
typedef const char*(PrototypeTpuDriver_Version)();
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Initialize TpuDriver_Initialize;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Open TpuDriver_Open;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Close TpuDriver_Close;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Reset TpuDriver_Reset;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_QuerySystemInfo
TpuDriver_QuerySystemInfo;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeSystemInfo
TpuDriver_FreeSystemInfo;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_ComputeLinearizedBytesFromShape
TpuDriver_ComputeLinearizedBytesFromShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_LinearizeShape
TpuDriver_LinearizeShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_DelinearizeShape
TpuDriver_DelinearizeShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_CompileProgram
TpuDriver_CompileProgram;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_CompileProgramFromText
TpuDriver_CompileProgramFromText;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeCompiledProgramHandle
TpuDriver_FreeCompiledProgramHandle;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_LoadProgram
TpuDriver_LoadProgram;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_UnloadProgram
TpuDriver_UnloadProgram;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_ExecuteProgram
TpuDriver_ExecuteProgram;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_AllocateTuple
TpuDriver_AllocateTuple;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Allocate TpuDriver_Allocate;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_AllocateShape
TpuDriver_AllocateShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Deallocate TpuDriver_Deallocate;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_TransferToDevice
TpuDriver_TransferToDevice;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_TransferFromDevice
TpuDriver_TransferFromDevice;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_TransferFromDeviceToDevice
TpuDriver_TransferFromDeviceToDevice;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_GetCompiledProgramShape
TpuDriver_GetCompiledProgramShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeCompiledProgramShape
TpuDriver_FreeCompiledProgramShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_EventAddCallback
TpuDriver_EventAddCallback;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_EventAwait TpuDriver_EventAwait;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeEvent TpuDriver_FreeEvent;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeStatus TpuDriver_FreeStatus;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Version TpuDriver_Version;
#ifdef __cplusplus
}
#endif
/* Dispatch table of driver entry points. TpuDriver_Initialize fills this in,
 * so a client that loads libtpu.so dynamically (see libtpu_client.c) can call
 * the whole API through one struct instead of resolving each symbol itself. */
struct TpuDriverFn {
  PrototypeTpuDriver_Open* TpuDriver_Open;              // NOLINT
  PrototypeTpuDriver_Close* TpuDriver_Close;            // NOLINT
  PrototypeTpuDriver_Reset* TpuDriver_Reset;            // NOLINT
  PrototypeTpuDriver_ComputeLinearizedBytesFromShape*
      TpuDriver_ComputeLinearizedBytesFromShape;        // NOLINT
  PrototypeTpuDriver_QuerySystemInfo* TpuDriver_QuerySystemInfo;  // NOLINT
  PrototypeTpuDriver_FreeSystemInfo* TpuDriver_FreeSystemInfo;    // NOLINT
  PrototypeTpuDriver_LinearizeShape* TpuDriver_LinearizeShape;    // NOLINT
  PrototypeTpuDriver_DelinearizeShape* TpuDriver_DelinearizeShape;  // NOLINT
  PrototypeTpuDriver_CompileProgram* TpuDriver_CompileProgram;    // NOLINT
  PrototypeTpuDriver_CompileProgramFromText*
      TpuDriver_CompileProgramFromText;                 // NOLINT
  PrototypeTpuDriver_FreeCompiledProgramHandle*
      TpuDriver_FreeCompiledProgramHandle;              // NOLINT
  PrototypeTpuDriver_LoadProgram* TpuDriver_LoadProgram;    // NOLINT
  PrototypeTpuDriver_UnloadProgram* TpuDriver_UnloadProgram;  // NOLINT
  PrototypeTpuDriver_ExecuteProgram* TpuDriver_ExecuteProgram;  // NOLINT
  PrototypeTpuDriver_AllocateTuple* TpuDriver_AllocateTuple;  // NOLINT
  PrototypeTpuDriver_Allocate* TpuDriver_Allocate;      // NOLINT
  PrototypeTpuDriver_AllocateShape* TpuDriver_AllocateShape;  // NOLINT
  PrototypeTpuDriver_Deallocate* TpuDriver_Deallocate;  // NOLINT
  PrototypeTpuDriver_TransferToDevice* TpuDriver_TransferToDevice;  // NOLINT
  PrototypeTpuDriver_TransferFromDevice*
      TpuDriver_TransferFromDevice;                     // NOLINT
  PrototypeTpuDriver_TransferFromDeviceToDevice*
      TpuDriver_TransferFromDeviceToDevice;             // NOLINT
  PrototypeTpuDriver_GetCompiledProgramShape*
      TpuDriver_GetCompiledProgramShape;                // NOLINT
  PrototypeTpuDriver_FreeCompiledProgramShape*
      TpuDriver_FreeCompiledProgramShape;               // NOLINT
  PrototypeTpuDriver_EventAddCallback* TpuDriver_EventAddCallback;  // NOLINT
  PrototypeTpuDriver_EventAwait* TpuDriver_EventAwait;  // NOLINT
  PrototypeTpuDriver_FreeEvent* TpuDriver_FreeEvent;    // NOLINT
  PrototypeTpuDriver_FreeStatus* TpuDriver_FreeStatus;  // NOLINT
  PrototypeTpuDriver_Version* TpuDriver_Version;        // NOLINT
};
#endif // TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTPU_H_

View File

@@ -1,159 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Before you start, make sure libtpu.so, libtpu.h and libtpu_client.c are in
// the same working directory.
//
// To compile: gcc -o libtpu_client libtpu_client.c -ldl
// To run: sudo ./libtpu_client
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include "libtpu.h"
/* Debug helper: dump `len` bytes at `dat` as hex, 16 per row. The body is
 * intentionally commented out so normal runs stay quiet; re-enable the block
 * below when debugging driver handles. */
void hexdump(void *dat, int len) {
  (void)dat;  /* silence -Wunused-parameter while the dump is disabled */
  (void)len;
  /*unsigned char *cdat = (unsigned char*)dat;
  for (int i = 0; i < len; i++) {
    if (i!=0 && i%0x10 == 0) printf("\n");
    printf("%2.2X ", cdat[i]);
  }
  printf("\n");*/
}
/* End-to-end libtpu smoke test: compile a tiny HLO module that adds two
 * s32[256] vectors, load it on core 0, run it, and print the result.
 * Each driver call returns/consumes TpuEvents; the eventc/eventv pairs chain
 * every step after the steps it depends on.
 * NOTE(review): as in the upstream example, most events, the compiled-program
 * handle, and the driver itself are never freed/closed — acceptable for a
 * one-shot demo that exits immediately. */
int main(int argc, char** argv) {
  struct TpuDriverFn driver_fn;
  TpuDriver_Initialize(&driver_fn, true);
  fprintf(stdout, "------ Going to Query Version ------\n");
  fprintf(stdout, "TPU Driver Version: %s\n", driver_fn.TpuDriver_Version());
  fprintf(stdout, "------ Going to Open a TPU Driver ------\n");
  struct TpuDriver* driver = driver_fn.TpuDriver_Open("local://");
  fprintf(stdout, "------ Going to Query for System Information ------\n");
  struct TpuSystemInfo* info = driver_fn.TpuDriver_QuerySystemInfo(driver);
  driver_fn.TpuDriver_FreeSystemInfo(info);
  // An example of simple program to sum two parameters.
  const char* hlo_module_text = R"(HloModule add_vec_module
ENTRY %add_vec (a: s32[256], b: s32[256]) -> s32[256] {
%a = s32[256] parameter(0)
%b = s32[256] parameter(1)
ROOT %sum = s32[256] add(%a, %b)
}
)";
  fprintf(stdout, "------ Going to Compile a TPU program ------\n");
  struct TpuCompiledProgramHandle* cph =
      driver_fn.TpuDriver_CompileProgramFromText(driver, hlo_module_text,
      /*num_replicas=*/1, /*eventc=*/0, /*eventv*/NULL);
  //hexdump(cph->internal_handle, 0x100);
  TpuEvent* compile_events[] = {cph->event};
  fprintf(stdout, "------ Going to Load a TPU program ------\n");
  struct TpuLoadedProgramHandle* lph =
      driver_fn.TpuDriver_LoadProgram(driver, /*core_id=*/0, cph,
      /*eventc=*/1, /*eventv=*/compile_events);
  const int size = 1024;
  fprintf(stdout, "------ Going to Allocate a TPU Buffer ------\n");
  struct TpuBufferHandle* buf_a_handle =
      driver_fn.TpuDriver_Allocate(driver, /*core-id=*/0, /*memory_region=*/1,
      /*bytes=*/size, /*eventc=*/0, /*eventv=*/NULL);
  fprintf(stdout, "------ Going to Allocate a TPU Buffer ------\n");
  struct TpuBufferHandle* buf_b_handle =
      driver_fn.TpuDriver_Allocate(driver, /*core-id=*/0, /*memory_region=*/1,
      /*bytes=*/size, /*eventc=*/0, /*eventv=*/NULL);
  fprintf(stdout, "------ Going to Allocate a TPU Buffer ------\n");
  struct TpuBufferHandle* buf_sum_handle =
      driver_fn.TpuDriver_Allocate(driver, /*core-id=*/0, /*memory_region=*/1,
      /*bytes=*/size, /*eventc=*/0, /*eventv=*/NULL);
  char a_src[size], b_src[size], sum_src[size];
  for (int i = 0; i < size; ++i) {
    a_src[i] = 1;
    b_src[i] = 2;
    sum_src[i] = 0;
  }
  TpuEvent* allocate_buf_a_events[] = {buf_a_handle->event};
  fprintf(stdout, "------ Going to Transfer To Device ------\n");
  struct TpuEvent* transfer_ev1 =
      driver_fn.TpuDriver_TransferToDevice(driver, a_src, buf_a_handle,
      /*eventc=*/1, /*eventv=*/allocate_buf_a_events);
  // BUG FIX: this previously waited on buf_a's allocation event, so the
  // transfer into buf_b could race its own allocation.
  TpuEvent* allocate_buf_b_events[] = {buf_b_handle->event};
  fprintf(stdout, "------ Going to Transfer To Device ------\n");
  struct TpuEvent* transfer_ev2 =
      driver_fn.TpuDriver_TransferToDevice(driver, b_src, buf_b_handle,
      /*eventc=*/1, /*eventv=*/allocate_buf_b_events);
  //getchar();
  fprintf(stdout, "------ Going to Execute a TPU program ------\n");
  DeviceAssignment device_assignment = {NULL, 0};
  TpuBufferHandle* input_buffer_handle[] = {buf_a_handle, buf_b_handle};
  TpuBufferHandle* output_buffer_handle[] = {buf_sum_handle};
  TpuEvent* transfer_events[] = {transfer_ev1, transfer_ev2};
  struct TpuEvent* execute_event =
      driver_fn.TpuDriver_ExecuteProgram(driver, lph,
      /*inputc=*/2, /*input_buffer_handle=*/input_buffer_handle,
      /*outputc=*/1, /*output_buffer_handle=*/output_buffer_handle,
      device_assignment,
      /*eventc=*/2, /*eventv*/transfer_events);
  fprintf(stdout, "------ Going to Transfer From Device ------\n");
  TpuEvent* execute_events[] = {execute_event};
  struct TpuEvent* transfer_sum_event =
      driver_fn.TpuDriver_TransferFromDevice(driver, buf_sum_handle, sum_src,
      /*eventc=*/1, /*eventv=*/execute_events);
  // Block until the readback finishes; check the status code, not the pointer.
  TpuStatus* status = driver_fn.TpuDriver_EventAwait(transfer_sum_event,
      10000000);
  if (status->code != 0) {
    fprintf(stdout, "Transfer Event Await: Code: %d, Message: %s\n",
        status->code, status->msg);
  }
  fprintf(stdout, "------ Going to Unload a TPU program ------\n");
  struct TpuEvent* unload_program_event = driver_fn.TpuDriver_UnloadProgram(
      driver, lph, /*eventc=*/1, /*eventv=*/execute_events);
  fprintf(stdout, "------ Going to Deallocate a TPU Buffer ------\n");
  struct TpuEvent* dealloc_ev1 = driver_fn.TpuDriver_Deallocate(driver,
      buf_a_handle, /*eventc=*/0, /*eventv=*/NULL);
  driver_fn.TpuDriver_FreeEvent(dealloc_ev1);
  fprintf(stdout, "------ Going to Deallocate a TPU Buffer ------\n");
  struct TpuEvent* dealloc_ev2 = driver_fn.TpuDriver_Deallocate(driver,
      buf_b_handle, /*eventc=*/0, /*eventv=*/NULL);
  driver_fn.TpuDriver_FreeEvent(dealloc_ev2);
  fprintf(stdout, "------ Going to Deallocate a TPU Buffer ------\n");
  struct TpuEvent* dealloc_ev3 = driver_fn.TpuDriver_Deallocate(driver,
      buf_sum_handle, /*eventc=*/0, /*eventv=*/NULL);
  driver_fn.TpuDriver_FreeEvent(dealloc_ev3);
  fprintf(stdout, "sum:\n");
  for (size_t i = 0; i < size; ++i) {
    fprintf(stdout, "%d ", sum_src[i]);
  }
  exit(EXIT_SUCCESS);
}