hotfix: remove accel from extra

This commit is contained in:
George Hotz
2025-06-08 08:20:34 -07:00
parent 1ad8062591
commit 3ece2e4bb5
78 changed files with 0 additions and 7843 deletions

View File

@@ -1,52 +0,0 @@
We have to figure out how to make the tinygrad ops match to hw.
Generic folded reduce may not work.
GPUs:
AMD:
RDNA2: https://developer.amd.com/wp-content/resources/RDNA2_Shader_ISA_November2020.pdf
We have RX6900XT with 80 CU, 40 WGP, and 1 "processor"
@ 1.825 GHz, there's 18,688 FP32 GFLOPS of compute. 10240 FLOPS/cycle, 128 per CU (32 FMAs per vALU, 2 per compute unit)
286 GFLOP for ENET=2 BS=64. At theoretical max, (286/18688)*1000 = 15.3 ms.
We observe about 10x factor off with pytorch.
We will focus on speed for AMD, since we have complete docs for that GPU.
Each "processor" has an "ultra threaded dispatch processor"
Each SIMD unit has 256 vector registers (or 1024?), and operates on 32 at once.
Ahh, I think there's 1024 total, but only 256 per wavefront
M1:
On M1 GPU, theoretical is 2.275 TFLOPS. https://www.notebookcheck.net/Apple-M1-GPU-Benchmarks-and-Specs.503610.0.html
We observe 2000ms for BS=8 (37 GFLOP). 37/2275 = 16.3 ms. tinygrad is over a factor of 100x off (similar on AMD GPU)
NOTE: the timer in the M1 OpenCL doesn't seem to be anywhere close to wall time.
Adreno:
TBD, no comma three here. Image > Buffer because the L1 cache is used. Would UBWC help on weights?
We have a good bit of work on this in hyperthneed. Let's get the disassembler out and make this fast.
TPUs:
These use really big systolic arrays and have a lot less flexibility.
IIRC, their vector math unit is similar to the GPU.

View File

@@ -1,5 +0,0 @@
This is where we scope out adding accelerators to tinygrad
ane -- Apple Neural Engine, in the M1 + newer iPhones
tpu -- Google's TPU, available for rent in Google Cloud

View File

@@ -1 +0,0 @@
run

View File

@@ -1,30 +0,0 @@
#!/usr/bin/env python3
# Hand-build a tiny CoreML network (elementwise add of two K-vectors) and run
# one prediction, with the hope that CoreML schedules it on the ANE.
import numpy as np
import coremltools as ct
from coremltools.models.neural_network import datatypes, NeuralNetworkBuilder

# KxK GEMM with bias
K = 64

# two K-length inputs, one K-length output
feats_a = [('image', datatypes.Array(K))]
feats_b = [('image2', datatypes.Array(K))]
feats_out = [('probs', datatypes.Array(K))]

# kept around for the commented-out inner-product / bias experiments below
weights = np.full((K, K), 3.0)
bias = np.ones(K)

builder = NeuralNetworkBuilder(feats_a + feats_b, feats_out)
#builder.add_inner_product(name='ip_layer', W=weights, b=None, input_channels=K, output_channels=K, has_bias=False, input_name='image', output_name='med')
#builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
builder.add_elementwise(name='element', input_names=['image', 'image2'], output_name='probs', mode='ADD')
#builder.add_bias(name='bias', b=bias, input_name='med', output_name='probs', shape_bias=(K,))
#builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')

# compile the spec
mlmodel = ct.models.MLModel(builder.spec)

# trigger the ANE!
ones = np.full(K, 1, dtype=np.float32)
twos = np.full(K, 2, dtype=np.float32)
out = mlmodel.predict({"image": ones, "image2": twos})
print(out)

mlmodel.save('test.mlmodel')

View File

@@ -1,36 +0,0 @@
// Minimal CoreML harness: compile test.mlmodel (built by the Python script)
// and run a single prediction, requesting all compute units so the work can
// be scheduled on the ANE.
import CoreML

// ANE?
let config = MLModelConfiguration()
config.computeUnits = .all

// CPU?
let opts = MLPredictionOptions()
opts.usesCPUOnly = false

// Feature provider for the model's two inputs ("image", "image2").
// Each is a 64-element float32 MLMultiArray with only element 0 set (1337).
class MNISTInput : MLFeatureProvider {
    var featureNames: Set<String> {
        get {
            return ["image", "image2"]
        }
    }
    func featureValue(for featureName: String) -> MLFeatureValue? {
        if (featureName == "image") {
            let tokenIDMultiArray = try? MLMultiArray(shape: [64], dataType: MLMultiArrayDataType.float32)
            tokenIDMultiArray?[0] = NSNumber(value: 1337)
            return MLFeatureValue(multiArray: tokenIDMultiArray!)
        }
        if (featureName == "image2") {
            let tokenIDMultiArray = try? MLMultiArray(shape: [64], dataType: MLMultiArrayDataType.float32)
            tokenIDMultiArray?[0] = NSNumber(value: 1337)
            return MLFeatureValue(multiArray: tokenIDMultiArray!)
        }
        // unknown feature name
        return nil
    }
}

// Compile .mlmodel -> .mlmodelc, load it, and run one prediction.
// NOTE(review): URL(string:) with a bare relative path; URL(fileURLWithPath:)
// would be the conventional choice — confirm before reuse.
let compiledUrl = try MLModel.compileModel(at: URL(string: "test.mlmodel")!)
let model = try MLModel(contentsOf: compiledUrl, configuration: config)
let out = try model.prediction(from: MNISTInput(), options: opts)
print(out.featureValue(for: "probs") as Any)

Binary file not shown.

View File

@@ -1,4 +0,0 @@
*.hwx
anecompiler.swap.*
context_switch_log.txt
debug/

View File

@@ -1 +0,0 @@
../lib/ane.py

View File

@@ -1 +0,0 @@
../lib/aneregs.json

View File

@@ -1,57 +0,0 @@
#import <Foundation/Foundation.h>
#include <os/log.h>
#include <stdio.h>

// Status code returned by the private ANECompiler framework (0 == success).
typedef unsigned int ANECStatus;

// Private entry point in ANECompiler.framework. Compiles the network
// described by the input plist; the completion block receives the status
// and, on failure, a dictionary of diagnostic details.
int ANECCompile(NSDictionary* param_1, NSDictionary* param_2,
                void (^param_3)(ANECStatus status,
                                NSDictionary* statusDictionary));

// Usage: a.out <network.plist> [debug]
//   argv[1]: network plist name, resolved relative to ./
//   argv[2]: any second argument enables debug compilation, writing
//            debug/model.hwx with the full debug mask.
int main(int argc, char* argv[])
{
    // guard argv[1]: without a plist name there is nothing to compile
    if (argc < 2) {
        fprintf(stderr, "usage: %s <network.plist> [debug]\n", argv[0]);
        return 1;
    }
    os_log(OS_LOG_DEFAULT, "start compiler");

    NSDictionary* iDictionary = @{
        @"NetworkPlistName" : [NSString stringWithCString:argv[1]
                                                 encoding:NSUTF8StringEncoding],
        @"NetworkPlistPath" : @"./",
    };
    NSArray* plistArray = @[ iDictionary ];

    NSMutableDictionary* optionsDictionary =
        [NSMutableDictionary dictionaryWithCapacity:4];
    NSMutableDictionary* flagsDictionary =
        [NSMutableDictionary dictionaryWithCapacity:4];
    optionsDictionary[@"InputNetworks"] = plistArray;
    optionsDictionary[@"OutputFilePath"] = @"./";

    // h11 (or anything?) works here too, and creates different outputs that
    // don't run
    flagsDictionary[@"TargetArchitecture"] = @"h13";

    if (argc > 2) {
        // debug build: dump compiler intermediates under debug/
        optionsDictionary[@"OutputFileName"] = @"debug/model.hwx";
        flagsDictionary[@"CompileANEProgramForDebugging"] = @YES;
        int debug_mask = 0x7fffffff;  // everything on
        flagsDictionary[@"DebugMask"] = @(debug_mask);
    } else {
        optionsDictionary[@"OutputFileName"] = @"model.hwx";
    }

    void (^simpleBlock)(ANECStatus status, NSDictionary* statusDictionary) =
        ^(ANECStatus status, NSDictionary* statusDictionary) {
          NSLog(@"status = %d\n", status);
          // when status != 0 dump the dictionary
          if (status)
              NSLog(@"%@", statusDictionary);
        };

    printf("hello\n");
    int ret = ANECCompile(optionsDictionary, flagsDictionary, simpleBlock);
    printf("compile: %d\n", ret);
    return ret;
}

View File

@@ -1,74 +0,0 @@
#include <os/log.h>
#include <stdio.h>
#import <CoreFoundation/CoreFoundation.h>
#include <string>
#include <iostream>

extern "C" {
// Private ANECompiler entry point (CoreFoundation flavor). Returns 0 on
// success; the third parameter is an optional status block and may be 0.
int ANECCompile(CFDictionaryRef param_1, CFDictionaryRef param_2, unsigned long param_3);
// Mangled enum-to-string helpers inside ANECompiler, handy for dumping the
// compiler's opcode tables (see the commented-out loop in main).
std::string _ZN21ZinIrEnumToStringUtil14OpCodeToStringE22ZinIrOpLayerOpCodeType(int op);
std::string _ZN21ZinIrEnumToStringUtil21NonLinearModeToStringE18ZinIrNonLinearMode(int op);
std::string _ZN19ZinMirCacheHintUtil17CacheHintToStringE15ZinMirCacheHint(int op);
std::string _ZN30ZinMirKernelSizeSplitterEngine16ConvKindToStringENS_8ConvKindE(int op);
/*void _Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE(
  unsigned long param_1, void *param_2,int param_3, std::basic_ostream<char> *param_4);
void debugregs(int a1, void *dat, int a2) {
  _Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE(a1, dat, a2, &std::cout);
}*/
}

// Usage: a.out <network.plist> [debug]
//   argv[1]: network plist name, resolved relative to ./
//   argv[2]: any second argument enables debug compilation, writing
//            debug/model.hwx with the full debug mask.
int main(int argc, char* argv[]) {
  // guard argv[1]: without a plist name there is nothing to compile
  if (argc < 2) {
    fprintf(stderr, "usage: %s <network.plist> [debug]\n", argv[0]);
    return 1;
  }
  os_log(OS_LOG_DEFAULT, "start compiler");

  /*for (int i = 0; i < 60; i++) {
    std::string tmp = _ZN21ZinIrEnumToStringUtil14OpCodeToStringE22ZinIrOpLayerOpCodeType(i);
    //std::string tmp = _ZN21ZinIrEnumToStringUtil21NonLinearModeToStringE18ZinIrNonLinearMode(i);
    printf("%2d: %s\n", i, tmp.c_str());
  }*/

  // input descriptor: which plist to compile and where to find it
  CFTypeRef ikeys[2];
  ikeys[0] = CFSTR("NetworkPlistName");
  ikeys[1] = CFSTR("NetworkPlistPath");
  CFTypeRef ivalues[2];
  ivalues[0] = CFStringCreateWithCString(kCFAllocatorDefault, argv[1], kCFStringEncodingUTF8);
  ivalues[1] = CFSTR("./");
  CFDictionaryRef iDictionary = CFDictionaryCreate(kCFAllocatorDefault, ikeys, ivalues, 2, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
  CFArrayRef array = CFArrayCreate(kCFAllocatorDefault, (const void**)&iDictionary, 1, &kCFTypeArrayCallBacks);

  CFMutableDictionaryRef optionsDictionary = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
  CFMutableDictionaryRef flagsDictionary = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
  CFDictionaryAddValue(optionsDictionary, CFSTR("InputNetworks"), array);
  CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFilePath"), CFSTR("./"));
  //CFDictionaryAddValue(optionsDictionary, CFSTR("OptionsFilePath"), CFSTR("good.options"));

  // h11 (or anything?) works here too, and creates different outputs that don't run
  CFDictionaryAddValue(flagsDictionary, CFSTR("TargetArchitecture"), CFSTR("h13"));

  if (argc > 2) {
    // debug build: dump compiler intermediates under debug/
    CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFileName"), CFSTR("debug/model.hwx"));
    //CFDictionaryAddValue(flagsDictionary, CFSTR("DebugDetailPrint"), kCFBooleanTrue);
    CFDictionaryAddValue(flagsDictionary, CFSTR("CompileANEProgramForDebugging"), kCFBooleanTrue);
    int debug_mask = 0x7fffffff;  // everything on
    // kCFNumberSInt32Type replaces the magic constant 3 used previously
    CFDictionaryAddValue(flagsDictionary, CFSTR("DebugMask"), CFNumberCreate(kCFAllocatorDefault, kCFNumberSInt32Type, &debug_mask));
  } else {
    CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFileName"), CFSTR("model.hwx"));
  }

  //CFDictionaryAddValue(flagsDictionary, CFSTR("DisableMergeScaleBias"), kCFBooleanTrue);
  //CFDictionaryAddValue(flagsDictionary, CFSTR("Externs"), CFSTR("swag"));
  //CFShow(optionsDictionary);
  //CFShow(flagsDictionary);

  printf("hello\n");
  int ret = ANECCompile(optionsDictionary, flagsDictionary, 0);
  printf("compile: %d\n", ret);
  return ret;
}

View File

@@ -1,7 +0,0 @@
#!/bin/bash -e
# Build the ANECompiler harness (compile.mm) against the private framework
# and run one debug compile, then dump what it logged.
g++ compile.mm -F /System/Library/PrivateFrameworks/ -framework ANECompiler -framework CoreFoundation
# start from a clean slate so a stale model.hwx can't mask a failed compile
rm -f model.hwx
./a.out net.plist debug
rm -f context_switch_log.txt
# show the harness's os_log output from the last minute
log show --process a.out --last 1m --info --debug

View File

@@ -1,17 +0,0 @@
#!/usr/bin/env python3
# Run the ANE compiler harness in debug mode and (eventually) visualize the
# register-allocation graph it dumps. Plotting is still disabled below.
import os
import sys
import networkx as nx
import pylab as plt
from networkx.drawing.nx_pydot import read_dot
# a.out is the compiled ANECompiler harness; argv[1] is the network plist.
# Passing "debug" makes it emit intermediates under debug/.
ret = os.system("./a.out "+sys.argv[1]+" debug")
# os.system returns the raw wait status; 0 means the compile succeeded
assert(ret == 0)
# dot file produced by the debug compile; consumed by the commented-out
# rendering experiments below — TODO confirm intended viewer before enabling
df = "debug/model.hwx.zinir_graph_after_reg_spill.dot"
#from graphviz import render
#render('dot', 'png', df)
#plt = Image(pdot.create_png()
#display(plt)

View File

@@ -1 +0,0 @@
Run compiler with debug in argv[2] to generate these files

View File

@@ -1,140 +0,0 @@
#!/usr/bin/env python3
import sys
from hexdump import hexdump
from macholib import MachO
from tinygrad.helpers import getenv
def get_macho(fn):
  """Parse the hwx file at `fn` with macholib.

  The hwx container is Mach-O-like but carries a magic macholib rejects, so
  the first four bytes are rewritten to MH_CIGAM_64 before parsing. The
  patched copy goes to a NamedTemporaryFile with delete=False because
  MachO.MachO() opens the file by path after the handle is closed.
  """
  # read via a context manager so the source handle is always closed
  with open(fn, "rb") as src:
    dat = src.read()
  # swap the magic for MH_CIGAM_64 so macholib accepts the header
  dat = b"\xcf\xfa\xed\xfe" + dat[4:]
  from tempfile import NamedTemporaryFile
  with NamedTemporaryFile(delete=False) as f:
    f.write(dat)
  return MachO.MachO(f.name)
# Parse the golden hwx and walk its load commands.
a = get_macho("model.hwx.golden")

# load commands
# each command is a (header, struct, data) tuple from macholib
for c in a.headers[0].commands:
  print("command", c[0], c[1])
  if c[0].cmd == 4:
    # presumably LC_THREAD-style payload — hexdump the raw bytes
    hexdump(c[2])
    pass
  if c[0].cmd == 6:
    print("name:", c[2].decode('utf-8'))
  if c[0].cmd == 8:
    print(c[2].decode('utf-8'))
  if c[0].cmd == 25:
    # 25 == LC_SEGMENT_64: dump each section's name, address, and size
    for section in c[2]:
      print(section.segname.strip(b'\0'), section.sectname.strip(b'\0'), hex(section.addr), hex(section.size), "@", hex(c[1].fileoff))
      #print(dir(section))
      if c[1].filesize > 0:
        if len(section.section_data) < 0x100:
          hexdump(section.section_data)
        else:
          print("in file, not dumping 0x%x" % len(section.section_data))

# this parser is wrong (fixed with 64-bit one)
from macholib import SymbolTable
sym = SymbolTable.SymbolTable(a)
syms = {}
for l in sym.nlists:
  print(l)
  if l[0].n_value != 0:
    # keep only symbols with a real address: name -> value
    syms[l[1]] = l[0].n_value
for k,v in syms.items():
  print(k, hex(v))

# **** document what we know ***
# build `aneb`, the set of register-file byte offsets we understand;
# ANE_Struct entries are (type, offset, name) — TODO confirm against ane.py
from ane import ANE_Struct, ANE
ane = ANE()
aneb = set()
for typ, num, nam in ANE_Struct:
  ltyp = {"u32": 4, "u16": 2, "u8": 1}[typ]
  for l in range(num, num+ltyp):
    aneb.add(l)
# we understand these too
for l in range(0x34, 0xF4):
  aneb.add(l)
from termcolor import colored
def compare(x, y):
  # Side-by-side colored hexdump of two byte strings x and y.
  # Per byte: green/red where they differ, yellow inside the "highlight"
  # runs derived from y's own contents, white for offsets in the global
  # `aneb` set (bytes we understand), plain otherwise. Returns the full
  # dump as one string. Relies on globals `aneb` and termcolor's `colored`.
  ss = []     # completed output lines
  ln = []     # current 16-byte row for x
  ln2 = []    # current 16-byte row for y
  # round the longer length up to a multiple of 0x10
  ll = (max(len(x), len(y)) + 0xF)//0x10 * 0x10
  highlight = False
  # first highlight offset; subsequent ones are chained off y's bytes
  # (next = current + y[current] + 8) — presumably walking a record list
  # inside the blob; TODO confirm against the hwx layout
  next_highlight = 0x2b
  # note: range(ll+1) runs one index past the end so the final row flushes
  for i in range(ll+1):
    if i == next_highlight:
      highlight = True
      if i < len(y):
        next_highlight += y[i]+8
      else:
        next_highlight = None
    else:
      highlight = False
    # hex text for this byte in each input, "--" past the end
    a = "%02X" % x[i] if i < len(x) else "--", \
        "%02X" % y[i] if i < len(y) else "--"
    # join a 16-entry row into four space-separated groups of four
    def fj(x):
      ss = []
      for i in range(0, 0x10, 4):
        ss.append(' '.join(x[i:i+4]))
      return ' '.join(ss)
    # flush the previous full row before appending byte i
    if i!=0 and i%0x10 == 0:
      ss.append("%8X: " % (i-0x10)+fj(ln)+" | "+fj(ln2)+"\n")
      ln = []
      ln2 = []
    if a[0] != a[1] and a[0] != "--" and a[1] != "--":
      # real byte difference
      ln.append(colored(a[0], 'green'))
      ln2.append(colored(a[1], 'red'))
    else:
      if highlight:
        ln.append(colored(a[0], 'yellow'))
        ln2.append(colored(a[1], 'yellow'))
      else:
        if i in aneb:
          # offset documented in ANE_Struct / known ranges
          ln.append(colored(a[0], 'white'))
          ln2.append(colored(a[1], 'white'))
        else:
          ln.append(a[0])
          ln2.append(a[1])
  return ''.join(ss)
import json
# register name -> (offset, ?, ?) map dumped from the ANE driver
# NOTE(review): file handle from open() is never closed — fine for a
# one-shot script, but worth fixing if this gets reused
aneregs = dict(json.load(open("aneregs.json")))

# second file to diff against the golden one (argv[1] overrides)
g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1])

# section_data of the first section of load command 1 — presumably the
# op stream; TODO confirm which segment this indexes
f1 = g.headers[0].commands[1][2][0].section_data
f2 = a.headers[0].commands[1][2][0].section_data

# ops appear to be 0x300 bytes each; diff them op by op
for i in range(0, len(f2), 0x300):
  print("===== op %d =====" % (i//0x300))
  if len(f1) < 0x300:
    # golden file has a single short op: compare it against every op of f2
    c1, c2 = f1, f2[i:i+0x300]
  else:
    c1, c2 = f1[i:i+0x300], f2[i:i+0x300]
  # decode each op's register fields (project ane module)
  dbg1 = ane.debug(c1, 16)
  dbg2 = ane.debug(c2, 16)
  if getenv("PRINTALL"):
    # dump every known register, changed or not
    for k in dbg2:
      if k in aneregs:
        rr = aneregs[k] if k in aneregs else (-1,-1,-1)
        print("0x%3x %d %2d" % tuple(rr), k, dbg1[k], "->", dbg2[k])
  else:
    # only show registers whose decoded value changed
    for k in dbg1:
      if dbg1[k] != dbg2[k]:
        rr = aneregs[k] if k in aneregs else (-1,-1,-1)
        print("0x%3x %d %2d" % tuple(rr), k, dbg1[k], "->", dbg2[k])
  # raw colored hexdump of the two ops
  print(compare(c1, c2))

#open("/tmp/data.section", "wb").write(f2)
#print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read()))

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.0 KiB

View File

@@ -1,127 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>Units</key>
<array>
<string>probs_tmp_0</string>
<string>probs</string>
</array>
<key>Weights</key>
<array>
<string>model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>image</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>1</integer>
</dict>
<key>probs</key>
<dict>
<key>Bottom</key>
<string>probs_tmp_0</string>
<key>Name</key>
<string>probs</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BiasScaleGroupData</key>
<dict>
<key>BiasCount</key>
<integer>2</integer>
<key>BiasIndex</key>
<integer>1</integer>
<key>BiasOffset</key>
<integer>0</integer>
<key>BiasType</key>
<string>Float16</string>
</dict>
</dict>
<key>Type</key>
<string>GOC</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>probs</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
<key>probs_tmp_0</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>probs_tmp_0</string>
<key>OutputChannels</key>
<integer>2</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelGroupReuse</key>
<false/>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelMode</key>
<string>Dense</string>
<key>KernelOffset</key>
<integer>192</integer>
<key>KernelType</key>
<string>Float32</string>
<key>KernelWidth</key>
<integer>1</integer>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,196 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>A</key>
<dict>
<key>BatchSize</key>
<integer>7</integer>
<key>InputBatchStride</key>
<integer>64</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputDepth</key>
<integer>1</integer>
<key>InputDepthStride</key>
<integer>448</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>1</integer>
</dict>
<key>A_broadcasted_output</key>
<dict>
<key>Bottom</key>
<string>A</string>
<key>Name</key>
<string>A_broadcasted_output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BroadcastInfo</key>
<array>
<dict>
<key>Dimension</key>
<string>Width</string>
<key>Size</key>
<integer>2</integer>
</dict>
</array>
</dict>
<key>Type</key>
<string>Broadcast</string>
<key>UnescapedBottom</key>
<string>A</string>
<key>UnescapedName</key>
<string>A_broadcasted_output</string>
</dict>
<key>B</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>2</integer>
</dict>
<key>B_broadcasted_output</key>
<dict>
<key>Bottom</key>
<string>B</string>
<key>Name</key>
<string>B_broadcasted_output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BroadcastInfo</key>
<array>
<dict>
<key>Dimension</key>
<string>Depth</string>
<key>Size</key>
<integer>1</integer>
</dict>
<dict>
<key>Dimension</key>
<string>Batch</string>
<key>Size</key>
<integer>7</integer>
</dict>
<dict>
<key>Dimension</key>
<string>Channel</string>
<key>Size</key>
<integer>1</integer>
</dict>
<dict>
<key>Dimension</key>
<string>Height</string>
<key>Size</key>
<integer>1</integer>
</dict>
</array>
</dict>
<key>Type</key>
<string>Broadcast</string>
<key>UnescapedBottom</key>
<string>B</string>
<key>UnescapedName</key>
<string>B_broadcasted_output</string>
</dict>
<key>Inputs</key>
<array>
<string>B</string>
<string>A</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>A_broadcasted_output</string>
<string>B_broadcasted_output</string>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpy5yeqxdi.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>output</key>
<dict>
<key>Bottom</key>
<array>
<string>A_broadcasted_output</string>
<string>B_broadcasted_output</string>
</array>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Scale</key>
<integer>15360</integer>
<key>Type</key>
<string>Min</string>
</dict>
<key>Type</key>
<string>ScaledElementWise</string>
<key>UnescapedBottom</key>
<array>
<string>A_broadcasted_output</string>
<string>B_broadcasted_output</string>
</array>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputBatchStride</key>
<integer>64</integer>
<key>OutputDepthStride</key>
<integer>448</integer>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,128 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>input_1</string>
<string>input_0</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp0yvkl2ux.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>input_0</key>
<dict>
<key>BatchSize</key>
<integer>2</integer>
<key>InputBatchStride</key>
<integer>512</integer>
<key>InputChannels</key>
<integer>4</integer>
<key>InputDepth</key>
<integer>4</integer>
<key>InputDepthStride</key>
<integer>1024</integer>
<key>InputHeight</key>
<integer>2</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>128</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>3</integer>
</dict>
<key>input_1</key>
<dict>
<key>BatchSize</key>
<integer>2</integer>
<key>InputBatchStride</key>
<integer>256</integer>
<key>InputChannels</key>
<integer>2</integer>
<key>InputDepth</key>
<integer>4</integer>
<key>InputDepthStride</key>
<integer>512</integer>
<key>InputHeight</key>
<integer>2</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>128</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>3</integer>
</dict>
<key>output</key>
<dict>
<key>Bottom</key>
<array>
<string>input_0</string>
<string>input_1</string>
</array>
<key>Name</key>
<string>output</string>
<key>OutputChannels</key>
<integer>6</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Dimension</key>
<string>Channel</string>
</dict>
<key>Type</key>
<string>Concat</string>
<key>UnescapedBottom</key>
<array>
<string>input_0</string>
<string>input_1</string>
</array>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputBatchStride</key>
<integer>768</integer>
<key>OutputDepthStride</key>
<integer>1536</integer>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>128</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,135 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>Units</key>
<array>
<string>probs_tmp_0</string>
<string>probs</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmph2sg50xi.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>image</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>64</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>1</integer>
</dict>
<key>probs</key>
<dict>
<key>Bottom</key>
<string>probs_tmp_0</string>
<key>Name</key>
<string>probs</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BiasScaleGroupData</key>
<dict>
<key>BiasCount</key>
<integer>64</integer>
<key>BiasIndex</key>
<integer>1</integer>
<key>BiasOffset</key>
<integer>0</integer>
<key>BiasType</key>
<string>Float16</string>
</dict>
</dict>
<key>Type</key>
<string>GOC</string>
<key>UnescapedBottom</key>
<string>probs_tmp_0</string>
<key>UnescapedName</key>
<string>probs</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>probs</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
<key>probs_tmp_0</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>probs_tmp_0</string>
<key>OutputChannels</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelGroupReuse</key>
<false/>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelMode</key>
<string>Dense</string>
<key>KernelOffset</key>
<integer>384</integer>
<key>KernelType</key>
<string>Float32</string>
<key>KernelWidth</key>
<integer>1</integer>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
<key>UnescapedBottom</key>
<string>image</string>
<key>UnescapedName</key>
<string>probs_tmp_0</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,86 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>data</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpm7rb6ba9.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>data</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>6</integer>
</dict>
<key>output</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BiasScalar</key>
<integer>16354</integer>
<key>ScaleScalar</key>
<integer>20544</integer>
</dict>
<key>Type</key>
<string>GOC</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,166 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>data</string>
</array>
<key>Outputs</key>
<array>
<string>out_2@output</string>
<string>out_1@output</string>
<string>out_0@output</string>
</array>
<key>Units</key>
<array>
<string>out_0</string>
<string>out_1</string>
<string>out_2</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp_c4fweo3.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>data</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>9</integer>
<key>InputHeight</key>
<integer>2</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>128</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>2</integer>
</dict>
<key>out_0</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>out_0</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Dimension</key>
<string>Channel</string>
<key>Offset</key>
<integer>0</integer>
<key>Size</key>
<integer>3</integer>
</dict>
<key>Type</key>
<string>InputView</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>out_0</string>
</dict>
<key>out_0@output</key>
<dict>
<key>Bottom</key>
<string>out_0</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>128</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
<key>out_1</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>out_1</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Dimension</key>
<string>Channel</string>
<key>Offset</key>
<integer>3</integer>
<key>Size</key>
<integer>3</integer>
</dict>
<key>Type</key>
<string>InputView</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>out_1</string>
</dict>
<key>out_1@output</key>
<dict>
<key>Bottom</key>
<string>out_1</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>128</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
<key>out_2</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>out_2</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Dimension</key>
<string>Channel</string>
<key>Offset</key>
<integer>6</integer>
<key>Size</key>
<integer>3</integer>
</dict>
<key>Type</key>
<string>InputView</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>out_2</string>
</dict>
<key>out_2@output</key>
<dict>
<key>Bottom</key>
<string>out_2</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>128</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,84 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>data</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpwvvanb0c.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>data</key>
<dict>
<key>BatchSize</key>
<integer>7</integer>
<key>InputChannels</key>
<integer>7</integer>
<key>InputHeight</key>
<integer>7</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>448</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>7</integer>
</dict>
<key>output</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Type</key>
<string>Exp2</string>
</dict>
<key>Type</key>
<string>Neuron</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>448</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,92 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>data</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpcwj7kqrw.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>data</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>2</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>128</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>5</integer>
</dict>
<key>output</key>
<dict>
<key>Bottom</key>
<string>data</string>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>ReshapedBatch</key>
<integer>1</integer>
<key>ReshapedChannel</key>
<integer>10</integer>
<key>ReshapedDepth</key>
<integer>1</integer>
<key>ReshapedHeight</key>
<integer>1</integer>
<key>ReshapedWidth</key>
<integer>1</integer>
</dict>
<key>Type</key>
<string>Reshape</string>
<key>UnescapedBottom</key>
<string>data</string>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,140 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>A</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>5</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>320</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>7</integer>
</dict>
<key>B</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>7</integer>
</dict>
<key>B_broadcasted_output</key>
<dict>
<key>Bottom</key>
<string>B</string>
<key>Name</key>
<string>B_broadcasted_output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BroadcastInfo</key>
<array>
<dict>
<key>Dimension</key>
<string>Height</string>
<key>Size</key>
<integer>5</integer>
</dict>
</array>
</dict>
<key>Type</key>
<string>Broadcast</string>
<key>UnescapedBottom</key>
<string>B</string>
<key>UnescapedName</key>
<string>B_broadcasted_output</string>
</dict>
<key>Inputs</key>
<array>
<string>B</string>
<string>A</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>B_broadcasted_output</string>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp40ksdbf5.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>output</key>
<dict>
<key>Bottom</key>
<array>
<string>A</string>
<string>B_broadcasted_output</string>
</array>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Scale</key>
<integer>15360</integer>
<key>Type</key>
<string>Min</string>
</dict>
<key>Type</key>
<string>ScaledElementWise</string>
<key>UnescapedBottom</key>
<array>
<string>A</string>
<string>B_broadcasted_output</string>
</array>
<key>UnescapedName</key>
<string>output</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>320</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,112 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>image2</string>
<string>image</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>Units</key>
<array>
<string>probs</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmpkp9irqtj.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>image</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>64</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>1</integer>
</dict>
<key>image2</key>
<dict>
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>64</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
<key>InputWidth</key>
<integer>1</integer>
</dict>
<key>probs</key>
<dict>
<key>Bottom</key>
<array>
<string>image</string>
<string>image2</string>
</array>
<key>Name</key>
<string>probs</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Scale</key>
<integer>15360</integer>
<key>Type</key>
<string>Add</string>
</dict>
<key>Type</key>
<string>ScaledElementWise</string>
<key>UnescapedBottom</key>
<array>
<string>image</string>
<string>image2</string>
</array>
<key>UnescapedName</key>
<string>probs</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>probs</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,78 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>input_1</string>
<string>input_0</string>
</array>
<key>Outputs</key>
<array>
<string>output@output</string>
</array>
<key>Units</key>
<array>
<string>output</string>
</array>
<key>Weights</key>
<array>
<string>/private/var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/tmp0yvkl2ux.mlmodelc/model.espresso.weights</string>
<string>net.additional.weights</string>
</array>
<key>input_0</key>
<dict>
<key>InputChannels</key>
<integer>16384</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>input_1</key>
<dict>
<key>InputChannels</key>
<integer>16</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>output</key>
<dict>
<key>Bottom</key>
<array>
<string>input_0</string>
<string>input_1</string>
</array>
<key>Name</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
<key>Type</key>
<string>Concat</string>
</dict>
<key>output@output</key>
<dict>
<key>Bottom</key>
<string>output</string>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,94 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../twos.weights</string>
</array>
<key>image</key>
<dict>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float16</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,130 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float16</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Type</key>
<string>Sign</string>
</dict>
<key>Type</key>
<string>Neuron</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,135 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min_uint8.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>3</integer>
<key>InputDepth</key>
<integer>1</integer>
<key>InputInterleave</key>
<integer>1</integer>
<key>InputBatchStride</key>
<integer>256</integer>
<key>InputDepthStride</key>
<integer>256</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>UInt8</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>UInt8</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelDepth</key>
<integer>1</integer>
<key>PadTop</key>
<integer>0</integer>
<key>PadBot</key>
<integer>0</integer>
<key>PadLeft</key>
<integer>0</integer>
<key>PadRight</key>
<integer>0</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>UInt8</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputInterleave</key>
<integer>1</integer>
<key>OutputBatchStride</key>
<integer>256</integer>
<key>OutputDepthStride</key>
<integer>256</integer>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,154 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
<string>zalt@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
<key>zalt@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,154 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
<string>aalt@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
<key>aalt@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,143 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,87 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>77</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Type</key>
<string>Sigmoid</string>
</dict>
<key>Type</key>
<string>Neuron</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Type</key>
<string>Sigmoid</string>
</dict>
<key>Type</key>
<string>Neuron</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,90 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- dd if=/dev/zero of=/tmp/zero bs=1024 count=16384 -->
<!-- 1024x512 is the max that gets compiled into one op -->
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>Units</key>
<array>
<string>probs</string>
</array>
<key>Weights</key>
<array>
<string>/tmp/zero</string>
</array>
<key>image</key>
<dict>
<key>BatchSize</key>
<integer>512</integer>
<key>InputChannels</key>
<integer>512</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>probs</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>probs</string>
<key>OutputChannels</key>
<integer>512</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelType</key>
<string>Float16</string>
<key>KernelMode</key>
<string>Dense</string>
<key>KernelOffset</key>
<integer>0</integer>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>probs</string>
<key>OutputType</key>
<string>Float16</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,79 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>BiasScalar</key>
<integer>16354</integer>
<key>ScaleScalar</key>
<integer>20544</integer>
</dict>
<key>Type</key>
<string>GOC</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,69 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>1</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>77</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>Type</key>
<string>Sigmoid</string>
</dict>
<key>Type</key>
<string>Neuron</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,221 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
<string>my_layer_2</string>
<string>my_layer_3</string>
<string>my_layer_4</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<!-- Global array of weights -->
<key>Weights</key>
<array>
<string>../min.weights</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_2</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>Name</key>
<string>my_layer_2</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_3</key>
<dict>
<key>Bottom</key>
<string>my_layer_2</string>
<key>Name</key>
<string>my_layer_3</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>my_layer_4</key>
<dict>
<key>Bottom</key>
<string>my_layer_3</string>
<key>Name</key>
<string>my_layer_4</string>
<key>OutputChannels</key>
<integer>3</integer>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>KernelHeight</key>
<integer>1</integer>
<key>KernelWidth</key>
<integer>1</integer>
<key>KernelIndex</key>
<integer>0</integer>
<key>KernelOffset</key>
<integer>0</integer>
<key>KernelType</key>
<string>Float32</string>
<key>Step</key>
<array>
<integer>1</integer>
<integer>1</integer>
</array>
<key>Type</key>
<string>Conv</string>
</dict>
<key>Type</key>
<string>Conv</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer_4</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,85 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Networks</key>
<array>
<string>net</string>
</array>
<key>Version</key>
<string>1.0.9</string>
<key>net</key>
<dict>
<!-- This is a one layer network -->
<key>Inputs</key>
<array>
<string>image</string>
</array>
<key>Units</key>
<array>
<string>my_layer</string>
</array>
<key>Outputs</key>
<array>
<string>probs@output</string>
</array>
<key>image</key>
<dict>
<!-- default is 1 -->
<key>BatchSize</key>
<integer>1</integer>
<key>InputChannels</key>
<integer>3</integer>
<key>InputHeight</key>
<integer>1</integer>
<key>InputWidth</key>
<integer>1</integer>
<key>InputPlaneStride</key>
<integer>64</integer>
<key>InputRowStride</key>
<integer>64</integer>
<key>InputType</key>
<string>Float16</string>
</dict>
<key>my_layer</key>
<dict>
<key>Bottom</key>
<string>image</string>
<key>Name</key>
<string>my_layer</string>
<key>OutputType</key>
<string>Float16</string>
<key>Params</key>
<dict>
<key>ReshapedBatch</key>
<integer>1</integer>
<key>ReshapedChannel</key>
<integer>3</integer>
<key>ReshapedDepth</key>
<integer>1</integer>
<key>ReshapedHeight</key>
<integer>1</integer>
<key>ReshapedWidth</key>
<integer>1</integer>
</dict>
<key>Type</key>
<string>Reshape</string>
</dict>
<key>probs@output</key>
<dict>
<key>Bottom</key>
<string>my_layer</string>
<key>OutputPlaneStride</key>
<integer>64</integer>
<key>OutputRowStride</key>
<integer>64</integer>
</dict>
</dict>
</dict>
</plist>

View File

@@ -1,36 +0,0 @@
#!/usr/bin/env python3
# Probe the layout of the 0x300-byte ANE op descriptor: recover the bit
# width and (byte, bit) position of every named register field by asking
# the ANE debug decoder what changes when individual bits are flipped.
# Requires real ANE access via the project-local `ane` module.
from ane import ANE
ane = ANE()

# Pass 1: decode an all-ones descriptor. A contiguous field of width i
# decodes to (1 << i) - 1, so this recovers each field's bit length.
lens = {}
dat = b"\xff"*0x300
ret = ane.debug(dat, 16)
for k,v in ret.items():
  found = None
  for i in range(33):
    #print(v, (1 << i) - 1)
    if v == (1 << i) - 1:
      found = i
      break
  #print(k, hex(v), found)
  # NOTE(review): `found` stays None when v is not of the form 2**i - 1;
  # the "%2d" format below would then raise -- presumably all fields decode
  # cleanly on real hardware. TODO confirm.
  lens[k] = found

# Pass 2: set one bit at a time and see which field decodes to 1. That
# gives the byte offset and bit position of every field's LSB.
pos = []
dat = b"\x00"*0x300
for i in range(0x300):
  for j in range(8):
    dat = b"\x00"*i
    dat += bytes([1 << j])
    dat += b"\x00"*(0x300-len(dat))
    ret = ane.debug(dat, 16)
    for k,v in ret.items():
      if v == 1:
        print("0x%3x %d %2d" % (i, j, lens[k]), k)
        pos.append((k, (i,j, lens[k])))

# Persist the field -> (byte offset, bit, width) map for later tooling.
import json
jpos = json.dumps(pos, indent=2)
with open("aneregs.json", "w") as f:
  f.write(jpos)

View File

@@ -1,4 +0,0 @@
#!/bin/bash -e
# Build the ANE test harness (test.mm) against the private ANEServices
# framework, then codesign it with the entitlement file so the kernel
# grants com.apple.ane.iokit-user-access.
clang++ test.mm -F /System/Library/PrivateFrameworks/ -framework ANEServices -framework IOSurface -framework Foundation -framework IOKit
# NOTE(review): "Taylor Swift Child 2" is a local signing identity --
# replace with a certificate present in your own keychain.
codesign --entitlements entitlements.xml -s "Taylor Swift Child 2" a.out

View File

@@ -1,9 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>com.apple.ane.iokit-user-access</key><true/>
</dict>
</plist>

View File

@@ -1,150 +0,0 @@
// Reverse-engineered declarations for Apple's private ANEServices
// framework (H11ANE). Struct sizes, field order, and packing mirror the
// shipped binary's ABI -- do not reorder or resize anything here.

// Usage mode passed to H11ANEDeviceOpen.
enum ANEDeviceUsageType {
  UsageNoProgram,
  UsageWithProgram, // used in running process
  UsageCompile // used in aned
};

// Open parameters for H11ANEDeviceOpen. The junk/empty arrays pad the
// struct to the size the framework expects; contents beyond the named
// fields are unknown.
struct H11ANEDeviceInfoStruct {
  uint64_t program_handle;
  uint64_t program_auth_code;
  uint64_t sleep_timer;
  uint64_t junk[0x100];
};

// Opaque status blob filled in by ANE_GetStatus.
struct H11ANEStatusStruct {
  uint64_t junk[0x100];
};

// Input to ANE_ProgramCreate: a pointer to an hwx image in memory.
struct H11ANEProgramCreateArgsStruct {
  void *program;
  uint64_t program_length;
  uint64_t empty[4];
  char has_signature;
};

// Output of ANE_ProgramCreate; program_handle identifies the loaded
// program in later prepare/request calls.
struct H11ANEProgramCreateArgsStructOutput {
  uint64_t program_handle;
  int unknown[0x2000];
};

struct H11ANEProgramPrepareArgsStruct {
  uint64_t program_handle;
  uint64_t flags;
  uint64_t empty[0x100];
};

// Request blob for ANE_ProgramSendRequest; field meanings are partially
// known (see offsets used in test.mm).
struct H11ANEProgramRequestArgsStruct {
  uint64_t args[0x1000];
};

namespace H11ANE {
  class H11ANEDevice;

  // Discovers ANE devices via IOKit and reports them through the
  // constructor-supplied callback after SetupDeviceController().
  class H11ANEDeviceController {
  public:
    H11ANEDeviceController(
      int (*callback)(H11ANE::H11ANEDeviceController*, void*, H11ANE::H11ANEDevice*),
      void *arg);
    int SetupDeviceController();
  private: // size is 0x50
    CFArrayRef array_ref;
    mach_port_t *master_port;
    IONotificationPortRef port_ref;
    CFRunLoopSourceRef source_ref;
    int (*callback)(H11ANE::H11ANEDeviceController*, void*, H11ANE::H11ANEDevice*);
    void *callback_arg;
    CFRunLoopRef run_loop_ref;
    io_iterator_t io_iterator;
    pthread_t thread_self;
    uint64_t unused;
  };

  // we should switch to the IOKit kernel interface, it's likely a lot more stable
  // actually this probably isn't true. ANEServices is normal dynamic links
  // https://googleprojectzero.blogspot.com/2020/11/oops-i-missed-it-again.html

  // H11ANEInDirectPathClient
  //   _ANE_DeviceOpen
  //   _ANE_DeviceClose
  //   _ANE_ProgramSendRequest

  // * if they need kernel debugger attached
  // H11ANEInUserClient
  //   _ANE_DeviceOpen
  //   _ANE_DeviceClose
  //   _ANE_ProgramSendRequest
  //   _ANE_ProgramCreate
  //   _ANE_ProgramPrepare
  //   _ANE_ProgramUnprepare
  //   _ANE_ProgramDestroy
  //   _ANE_GetStatus
  //   _ANE_PowerOn
  //   _ANE_PowerOff
  //   _ANE_IsPowered
  // * _ANE_LoadFirmware
  // * _ANE_ForgetFirmware
  // * _ANE_SendCommand
  //   _ANE_SetPowerManagement
  //   _ANE_GetTime
  // * _ANE_SetDriverLoggingFlags
  // * _ANE_ShowSharedMemoryAllocations
  // * _ANE_SetDARTCacheTTL
  // * _ANE_SetFirmwareBootArg
  // * _ANE_SetThrottlingPercentage
  // * _ANE_AddPersistentClient
  // * _ANE_RemovePersistentClient
  // * _ANE_CreateClientLoggingSession
  // * _ANE_TerminateClientLoggingSession
  //   _ANE_GetDriverLoggingFlags
  // * _ANE_FlushInactiveDARTMappings
  //   _ANE_GetVersion
  //   _ANE_RegisterFirmwareWorkProcessor
  //   _ANE_UnregisterFirmwareWorkProcessor
  // * _ANE_GetFirmwareWorkProcessorItem
  //   _ANE_CompleteFirmwareWorkProcessorItem
  //   _ANE_ReleaseFirmwareWorkProcessorBuffers
  // * _ANE_ReadANERegister
  // * _ANE_WriteANERegister
  //   _ANE_ProgramCreateInstance

  // note, this is not the raw IOKit class, it's in ANEServices.framework
  class H11ANEDevice {
  public:
    H11ANEDevice(H11ANE::H11ANEDeviceController *param_1, unsigned int param_2);

    unsigned long H11ANEDeviceOpen(
      int (*callback)(H11ANE::H11ANEDevice*, unsigned int, void*, void*),
      void *param_2, ANEDeviceUsageType param_3, H11ANEDeviceInfoStruct *param_4);

    void EnableDeviceMessages();

    int ANE_AddPersistentClient();
    int ANE_GetStatus(H11ANEStatusStruct *param_1);

    // power management
    int ANE_IsPowered();
    int ANE_PowerOn();
    int ANE_PowerOff();

    // logging (e00002c7 error, needs PE_i_can_has_debugger)
    int ANE_CreateClientLoggingSession(unsigned int log_iosurface);
    int ANE_TerminateClientLoggingSession(unsigned int log_iosurface);
    int ANE_GetDriverLoggingFlags(unsigned int *flags);
    int ANE_SetDriverLoggingFlags(unsigned int flags);

    // program creation
    int ANE_ProgramCreate(H11ANEProgramCreateArgsStruct*,
      H11ANEProgramCreateArgsStructOutput*);
    int ANE_ProgramPrepare(H11ANEProgramPrepareArgsStruct*);
    int ANE_ProgramSendRequest(H11ANEProgramRequestArgsStruct*, mach_port_t);

    // need PE_i_can_has_debugger
    int ANE_ReadANERegister(unsigned int param_1, unsigned int *param_2);
    int ANE_ForgetFirmware();

  private: // size is 0x88
    unsigned char unknown[0x88];
  };
};

View File

@@ -1,184 +0,0 @@
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#import <IOSurface/IOSurfaceRef.h>
#import <Foundation/Foundation.h>
#import <CoreFoundation/CoreFoundation.h>
// Dump `l` bytes starting at `vdat` to stdout as two-digit hex values,
// 16 bytes per line, followed by a trailing newline.
void hexdump(void *vdat, int l) {
  const unsigned char *p = (const unsigned char *)vdat;
  for (int off = 0; off < l; off++) {
    if (off && !(off & 0xf)) printf("\n");
    printf("%02X ", p[off]);
  }
  printf("\n");
}
#include "h11ane.h"
using namespace H11ANE;
H11ANEDevice *device = NULL;
// Device-discovery callback passed to H11ANEDeviceController: stashes the
// discovered device in the global `device` for main() to pick up.
int MyH11ANEDeviceControllerNotification(H11ANEDeviceController *param_1, void *param_2, H11ANEDevice *param_3) {
  printf("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3);
  device = param_3;
  return 0;
}
// Device message callback (enabled via EnableDeviceMessages); just logs
// what arrives -- message semantics are unknown.
int MyH11ANEDeviceMessageNotification(H11ANE::H11ANEDevice* dev, unsigned int param_1, void* param_2, void* param_3) {
  printf("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3);
  return 0;
}
// End-to-end ANE smoke test: open the device, load a precompiled hwx
// program, wire up IOSurface input/output buffers, run one request, and
// hexdump the result. The call ordering mirrors what Espresso does and
// is load-bearing -- see the backtraces in the notes file.
int main() {
  int ret;
  printf("hello %d\n", getpid());

  // Discover the ANE device; the controller callback fills the global.
  H11ANEDeviceController dc(MyH11ANEDeviceControllerNotification, NULL);
  dc.SetupDeviceController();
  assert(device != NULL);
  H11ANEDevice *dev = device;
  dev->EnableDeviceMessages();

  // First open in compile/setup mode to create and prepare the program.
  char empty[0x90] = {0};
  H11ANEDeviceInfoStruct dis = {0};
  //dis.nothing = 0x87c15a20a;
  //dis.sleep_timer = 5000;
  ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty, UsageCompile, &dis);
  printf("open 0x%x %p\n", ret, dev);

  /*ret = dev->ANE_AddPersistentClient();
  printf("add persistent %x\n", ret);*/

  H11ANEStatusStruct blah = {0};
  ret = dev->ANE_GetStatus(&blah);
  printf("get status %x\n", ret);

  // this isn't callable anymore, it requires debugger
  ret = dev->ANE_PowerOn();
  printf("power on: %x\n", ret);

  ret = dev->ANE_IsPowered();
  printf("powered? %d\n", ret);
  /*if (ret == 0) {
    printf("POWER ON FAILED\n");
    return -1;
  }*/

  // Load the precompiled hwx image from disk into a page-aligned buffer.
  // NOTE(review): fread may return less than 0xc000; sz is only printed,
  // not validated.
  H11ANEProgramCreateArgsStruct mprog = {0};
  mprog.program_length = 0xc000;
  char *prog = (char*)aligned_alloc(0x1000, mprog.program_length);
  mprog.program = prog;
  FILE *f = fopen("../2_compile/model.hwx", "rb");
  assert(f);
  int sz = fread(prog, 1, mprog.program_length, f);
  printf("read %x %p\n", sz, prog);
  fclose(f);

  // Create + prepare the program; program_handle identifies it from now on.
  H11ANEProgramCreateArgsStructOutput *out = new H11ANEProgramCreateArgsStructOutput;
  memset(out, 0, sizeof(H11ANEProgramCreateArgsStructOutput));
  ret = dev->ANE_ProgramCreate(&mprog, out);
  uint64_t program_handle = out->program_handle;
  printf("program create: %lx %lx\n", ret, program_handle);

  H11ANEProgramPrepareArgsStruct pas = {0};
  pas.program_handle = program_handle;
  pas.flags = 0x0000000100010001;
  //pas.flags = 0x0000000102010001;
  ret = dev->ANE_ProgramPrepare(&pas);
  printf("program prepare: %lx\n", ret);

  // input buffer
  NSDictionary* dict = [NSDictionary dictionaryWithObjectsAndKeys:
    [NSNumber numberWithInt:16], kIOSurfaceWidth,
    [NSNumber numberWithInt:16], kIOSurfaceHeight,
    [NSNumber numberWithInt:1], kIOSurfaceBytesPerElement,
    [NSNumber numberWithInt:64], kIOSurfaceBytesPerRow,
    [NSNumber numberWithInt:1278226536], kIOSurfacePixelFormat,
    nil];
  IOSurfaceRef in_surf = IOSurfaceCreate((CFDictionaryRef)dict);
  int in_surf_id = IOSurfaceGetID(in_surf);
  printf("we have surface %p with id 0x%x\n", in_surf, in_surf_id);

  // load inputs
  IOSurfaceLock(in_surf, 0, nil);
  unsigned char *inp = (unsigned char *)IOSurfaceGetBaseAddress(in_surf);
  for (int i = 0; i < 16; i++) inp[i] = (i+1)*0x10;
  /*inp[0] = 0x39;
  inp[1] = 0x65;*/
  hexdump(inp, 0x20);
  IOSurfaceUnlock(in_surf, 0, nil);

  // output buffer
  NSDictionary* odict = [NSDictionary dictionaryWithObjectsAndKeys:
    [NSNumber numberWithInt:16], kIOSurfaceWidth,
    [NSNumber numberWithInt:16], kIOSurfaceHeight,
    [NSNumber numberWithInt:1], kIOSurfaceBytesPerElement,
    [NSNumber numberWithInt:64], kIOSurfaceBytesPerRow,
    [NSNumber numberWithInt:1278226536], kIOSurfacePixelFormat,
    nil];
  IOSurfaceRef out_surf = IOSurfaceCreate((CFDictionaryRef)odict);
  int out_surf_id = IOSurfaceGetID(out_surf);
  printf("we have surface %p with id 0x%x\n", out_surf, out_surf_id);

  // Build the request blob. Offsets come from reversing; the IOSurface
  // IDs live in the high 32 bits of their slots.
  H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct;
  memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct));

  // TODO: make real struct
  pras->args[0] = program_handle;
  pras->args[4] = 0x0000002100000003;

  // inputs
  pras->args[0x28/8] = 1;
  pras->args[0x128/8] = (long long)in_surf_id<<32LL;

  // outputs
  pras->args[0x528/8] = 1;
  // 0x628 = outputBufferSurfaceId
  pras->args[0x628/8] = (long long)out_surf_id<<32LL;

  // Completion is delivered as a mach message on this port.
  mach_port_t recvPort = 0;
  IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort);
  printf("recv port: 0x%x\n", recvPort);

  // *** reopen with other client ***
  H11ANEDeviceController dc2(MyH11ANEDeviceControllerNotification, NULL);
  dc2.SetupDeviceController();
  assert(device != NULL);
  dev = device;
  dev->EnableDeviceMessages();

  char empty2[0x90] = {0};
  dis.program_handle = program_handle;
  dis.program_auth_code = 0;
  ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty2, UsageWithProgram, &dis);
  printf("reopen 0x%x %p\n", ret, dev);

  // run program (i think we need the other client for this)
  ret = dev->ANE_ProgramSendRequest(pras, recvPort);
  printf("send 0x%x\n", ret);

  // Block until the completion message arrives.
  struct {
    mach_msg_header_t header;
    char data[256];
  } message;
  ret = mach_msg(&message.header,
    MACH_RCV_MSG,
    0, sizeof(message),
    recvPort,
    MACH_MSG_TIMEOUT_NONE,
    MACH_PORT_NULL);
  printf("got message: %d sz %d\n", ret, message.header.msgh_size);

  // Dump the first 0x100 bytes of the output surface.
  unsigned char *dat = (unsigned char *)IOSurfaceGetBaseAddress(out_surf);
  printf("%p\n", dat);
  hexdump(dat, 0x100);
}

View File

@@ -1,98 +0,0 @@
# The Apple Neural Engine
The Apple Neural Engine is a fancy DMA Engine that is based around convolutions. We don't have all the details worked out yet, but we can do some things with it. At its core, it runs through 0x300 ops in an hwx file. See `aneregs` for the registers used in each op.
It operates out of RAM or its 4MB L2 cache. The L2 "cache" appears to be manually managed, and only applies to the input and output, not the weights. The weights are usually included in the program, and it's unclear where they are copied to.
The 16 cores likely refer to the 16 wide Kernel DMA engine. They claim 11 TOPS total, which would be 687.5 GOPS/core. Perhaps it's a 32x32 MAC running at 335 MHz. That clock speed matches the cycle count time ratio from the debug perf stats.
It works with 5D tensors; you specify the stride for the last 4 dimensions. All strides must be a multiple of 0x40 bytes
* Column (width) -- aneRegs.Common.InDim.Win / aneRegs.Common.OutDim.Wout
* Row (height) -- aneRegs.Common.InDim.Hin / aneRegs.Common.OutDim.Hout
* Plane (channels) -- aneRegs.Common.Cin.Cin / aneRegs.Common.Cout.Cout
* Depth
* Group (batch) -- aneRegs.Common.GroupConvCfg.NumGroups
It works with 3 data types
* UInt8
* Int8
* Float16
The ops have several parts
* Header -- The base addresses for the DMA engines
* KernelDMASrc -- 16x wide DMA engine for the weights/bias/nonlinearity
* Common -- Specifies the parameters for the convolution
* TileDMASrc -- Input DMA engine
* L2 -- Use the L2 cache for Source/Result instead of RAM
* NE -- Configure Kernel/MAC/Post
* TileDMADst -- Output DMA engine
It can work with 8 base addresses for the DMA streams per OP
* 2x Read, both used for things like sum
* 1x Write
* 1x T?
* 4x Kernel, though only the first one seems used
## Normal Flow for ANE Usage
* Keras/ONNX model -> coremltools
* CoreML model -> Espresso
* net.plist -> ANECompiler
* model.hwx -> ANEServices
* AppleH11ANEInterface, an IOKit interface to the kernel
## hwx file?
This is a Mach-O file. We haven't figured out all the details, but the ops are at 0x4000. See `hwx_parse.py`
## amfid
Sadly, disabling amfi breaks things like vscode. Instead, you can patch it at runtime:
```
# MacOS 12.4
smol :: ~/fun/tinygrad » sha1sum /usr/libexec/amfid
0f7e7f7e41408f83d7ebc7564a3828f41cb2ab58 /usr/libexec/amfid
# with patching +0x8e38
(lldb) image list
[ 0] 04B6DF6C-6068-3F18-81A7-978985574387 0x0000000102ad0000 /usr/libexec/amfid
(lldb) p *(unsigned int *)0x102ad8e38=0xd2800000
```
This disables the entitlement check, then you don't need a bootarg. I wish Apple made a better way to do this.
## Extracting ANEServices.framework
```
# install xcode, then select it as the active developer directory:
sudo xcode-select --switch /Applications/Xcode.app
# xcode also contains ANEServices.tbd
brew install keith/formulae/dyld-shared-cache-extractor
dyld-shared-cache-extractor /System/Library/dyld/dyld_shared_cache_arm64e /tmp/libraries
cp /tmp/libraries/System/Library/PrivateFrameworks/ANECompiler.framework/Versions/A/ANECompiler .
cp /tmp/libraries/System/Library/PrivateFrameworks/ANEServices.framework/Versions/A/ANEServices .
cp /tmp/libraries/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/Versions/A/AppleNeuralEngine .
```
## Other work
```
# sadly also relies on ZinIrRegBitPrintOutDebug
https://github.com/antgroup-arclab/ANETools.git
# sadly looks like we do actually need a direct connection to run hwx files, aned is at the espresso level
* frame #0: 0x00000001c250fecc AppleNeuralEngine`-[_ANEDaemonConnection loadModel:sandboxExtension:options:qos:withReply:]
(lldb) po $x2
_ANEModel: { modelURL=file:///var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/test_F48D9B88-A68D-476F-ADC8-32BDAF9A2498.mlmodelc/ : key={"isegment":0,"inputs":{"image":{"shape":[1,1,1,64,1]},"image2":{"shape":[1,1,1,64,1]}},"outputs":{"probs":{"shape":[1,1,1,64,1]}}} : string_id=0x00000000 : program=(null) : state=1 : programHandle=0 : intermediateBufferHandle=0 : queueDepth=0 : attr={
} : perfStatsMask=0}
```
## Choices
* Disable amfid (breaks vscode)
* Patch amfid to allow restricted entitlements
* Sign with a "provisioning profile" to allow the entitlement
* Patch the ANE kext to not require a special entitlement (this is ideal, as we don't need to resign python)

View File

@@ -1,367 +0,0 @@
kernel driver: AppleH11ANEInterface
requires entitlement: com.apple.ane.iokit-user-access
compiler is run in ANE_ProgramCreate_gated
2 helper processes:
/usr/libexec/aned
ANECompilerService
Espresso:
Contains ANECompilerEngine
AppleNeuralEngine: Objective-C interface called by Espresso
ANEServices: communication with the device
ANECompiler: compile plist into hwx file
com.apple.ANECompilerService.allow in AppleNeuralEngine?
Called from ANECompilerService.xpc in AppleNeuralEngine.framework
== Model Flow ==
Keras/ONNX model
|
| 1_build
| (coremltools, open source)
v
CoreML model
|
| TODO: automate this
| Grabbed plist from lldbing ANECompilerService during 1_build
| (Espresso)
v
net.plist
|
| 2_compile
| (AppleNeuralEngine, ANECompiler)
v
model.hwx
|
| 3_run
| (AppleNeuralEngine, ANEServices, AppleH11ANEInterface)
v
<run on neural engine>
TODO: Write a nice plist grabber
DONE: Write a call to the compiler with plist+weights
DONE: Write an hwx runner
== Tracing the Compiler ==
ANECCompileProcedure
ZinAneCreateIr
ZinParseNeuronUnit
ZinAneCoreCompile
ZinAneCodeGeneration
ZinIrCodegenHandleKernels
ZinIrTargetH13::CodegenTds
ZinIrCacheHintTable
ZinIrCodegenHandleTds_v7
ZinIrCodegenHandleTdsMakeList<7u>
ZinAneInstruction
ZinAneTd<7u>::HandleEngineLayer
ZinAneInstruction::HandleTdHeader
HandleNELayer<7u>
ZinAneInstruction::HandleCommonConfig
ZinAneInstruction::HandleCommonConfigCommonOpcodes
ZinIrCodegenHandleTds<7u>
0x1bb93ae00 <-- this is the store of the first byte in the hwx
CalculateSizeInBytesFromRegCount (x*4+4)
0xf @ 0x128-0x168 (base 0x1003047b0)
0x1b @ 0x16c-0x1dc
0x11 @ 0x1e0-0x228
0x3 @ 0x22c-0x23c
0x4 @ 0x240-0x254
0x6 @ 0x258-0x274(end)
AddReloc (this is gold! x4 goes in the hwx)
ZinAneTd<7u>::HandleEngineLayer
rbreak ^ZinAneInstruction*
weeee ZinIrRegBitPrintOutDebug_7u_
print (void)debugregs(0, 0x0000000100211030+8, 3)
== min.plist ==
Types: GOC, Conv, Broadcast, ScaledElementWise, Reshape, InputView, Neuron, Concat
ops have length 0x300, seems like one basic op repeated
header 0x0-0x1c
u32 0x1c = next op offset
u16 0x20 = output address?
== section break 0x2c (weights) ==
reloc 0x2c-0x74 = K2DBE6976FEB616E6867A2E3853FC37D0F101C4C51BA4A80C103359643338C0C1_ne_0
K2DBE6976FEB616E6867A2E3853FC37D0F101C4C51BA4A80C103359643338C0C1_ne_1
16 output channel parallel:
u32[16] 0x34-0x74 = 0x80 | 1 if used
u32[16] 0x74-0xB4 = <channel data offset>
u32[16] 0xB4-0xF4 = <channel data length>
== section break 0x128 (conv) ==
u16 0x128 = InputWidth
u16 0x12A = InputHeight
u16 0x12C = InputDepth
u32 0x130 = (OutputType * 0x10) | InputType
u32 0x134 = InputChannels
u32 0x138 = OutputChannels
u16 0x13C = OutputWidth
u16 0x13E = OutputHeight
u16 0x140 = OutputDepth
u16 0x144 = 0xa000 | (KernelHeight * 0x20) | KernelWidth
u16 0x146 = 0x5000 | (PadTop * 0x40) | (PadLeft * 2)
u16 0x14C = BatchSize
u32 0x150 = OutputHeight?
== section break 0x16c (input) ==
reloc 0x16c-0x174 = image
u32 0x178 = InputRowStride
u32 0x17C = InputPlaneStride
u32 0x180 = InputDepthStride
u32 0x184 = InputBatchStride
u8 0x1A7 = InputInterleave
== section break 0x1e0 ==
u8 0x1E5 = InputInterleave
u32 0x1F4 = InputChannels * 0x10
u32 0x1F8 = InputDepth * InputChannels * 0x10
u8 0x211 = OutputInterleave
u32 0x220 = OutputChannels * 0x10
u32 0x224 = OutputDepth * OutputChannels * 0x10
== section break 0x22c (scaling) ==
u16 0x230 = BiasScalar
u16 0x232 = ScaleScalar
== section break 0x240 ==
u8 0x240 = 0x80 | KernelType
u8 0x241 = 4 * hasbatch
u16 0x246 = 0x10 | 2 * neuron?
== section break 0x258 (output) ==
reloc 0x258-0x25c = probs@output/src
u32 0x260 = OutputRowStride
u32 0x264 = OutputPlaneStride
u32 0x268 = OutputDepthStride
u32 0x26C = OutputBatchStride
u8 0x273 = OutputInterleave
== Zin Constants ==
kZinIrOpCodeConv = 0?
kZinIrOpCodePool = 1
kZinIrOpCodeElementWiseOp = 6
kZinIrOpCodeConcat = 8
kZinIrOpCodeFlattenComposite
kZinIrOpCodeNEConvOp
kZinIrOpCodeTranspose
0: CONV
1: POOL
2: SCALE_BIAS
3: TERNARY_DYNAMIC_GOC
4: BINARY_DYNAMIC_GOC
5: ACTIVATION
6: EW
7: SCALED_EW
8: CONCAT
9: SPLIT
10: COPY
11: FLATTEN
12: UNFLATTEN
13: CROSS_CORRELATION
14: KERNEL_RASTERIZER
15: ARG_MIN_MAX
16: MATRIX_MULT
17: BROADCAST
18: FLATTEN_COMPOSITE
19: UNFLATTEN_COMPOSITE
20: KERNEL_RASTERIZER_COMPOSITE
21: CROSS_CORRELATION_COMPOSITE
22: LIVE_IN
23: CONST_IN
24: LIVE_OUT
25: REDUCTION
26: ALIAS
27: Typecast
28: RESHAPE
29: VIEW
30: TRANSPOSE
31: SPACE_TO_BATCH
32: BATCH_TO_SPACE
33: SOFTMAX
34: INSTANCE_NORM
35: L2_NORM
36: MINMAX_NORM
37: LRN
38: COST_VOLUME
39: PIXEL_SHUFFLE
40: FPS
41: RS
42: PEFUSED_ELEMENTWISE
43: PEFUSED_POOL
44: PEFUSED_GOC
45: NEFUSED_CONV
46: NEFUSED_POOL
47: NEFUSED_EW
48: NEFUSED_BYPASS
# guessing from the hwx
kZinTensorFormatUInt8 = 0
kZinTensorFormatInt8 = 1
kZinTensorFormatFloat16 = 2
kZinTensorFormatInvalid
Zin (plist format) ---(ZinAneCoreCompile)---> Mir (hwx format)?
ZinAneCodeGeneration?
ZinIrStatus GetKernelFormat<6u>(ZinKernelFormat param_1,ane_ne_kernel_cfg_kernel_fmt *param_2)
List of allowed numbers
NeuronTypes (changes the LUT):
Tanh
Log2
Exp2
Sign = ZinMirActivationV7::GetSignLut
...many more in ANECompiler
Investigate:
ZinMirActivationV7::PrintLut(ZinMirActivationV7 *this,ane_nonlinear_lut_v7up_t *param_1
0: NONE
1: RELU
2: SIGMOID
3: SIGMOID_HIGH_PRECISION
4: TANH
5: CLAMPED_RELU
6: PRELU
7: DIRAC
8: INT
9: FRAC
10: SQRT
11: RSQRT
12: INV
13: SQR
14: LOG2
15: EXP2
16: ELU
17: SIGN
18: EQUAL_ZERO
19: NON_ZERO
20: LESS_THAN_ZERO
21: LESS_EQUAL_ZERO
22: GREATER_EQUAL_ZERO
23: GREATER_THAN_ZERO
24: CUSTOM_LUT
25: C_DIM_CONCAT
26: C_DIM_STRIDED_CONCAT
27: H_DIM_CONCAT
28: W_DIM_CONCAT
29: D_DIM_CONCAT
30: N_DIM_CONCAT
31: H_DIM_STRIDED_CONCAT
CacheHint
0: ALLOC
1: NOALLOC
2: DROP
3: DEPRI
conv kinds
0: regular
1: channelwise
2: grouped
== plist exploration ==
Float16 -> UInt8 for output works, Float32 doesn't
Same for input
All weights must be float
Index 0: model.espresso.weights @ 192 is weights
Index 1: net.additional.weights @ 0 is bias
Float16 -> Float32 for bias works
It's possible the compiler is Float32 -> Float16 converting, and the engine only supports Float16 + UInt8
== call to the compiler (in dmesg!) ==
[54476.282258]: H11ANEIn: ANE_ProgramCreate_gated:, ZinComputeProgramMake, get Mcache size: 0x0
[54476.282259]: H11ANEIn: ANE_ProgramCreate_gated:,Program Identifier:ANEC v1
zin_ane_compiler v4.2.1
-t h13
--fdram-allocator=ffreuse
--fdram-tensor-priority=sizethenliverange
--fl2-allocator=ffreuse
--fl3-allocator=ffreuse
--fl2-cache-mode=resident
--fsignature=ident
--memcache-strategy=
[54476.282262]: --memcache-size=4194304
--fspatial-split=disabled
--fkernel-rewind=enabled
--Wl-undefined=fvmlib
-i /Library/Caches/com.apple.aned/tmp/Python/DB7E897E7F4D5D27501A998428B6D3863AFD96CEA82DAF2207A75394E6BAC44C/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/net.plist
-o /Library/Caches/com.apple.aned/20A2411/Python/C9981871BC59572E74AFA3014B183EA37567EE9A2A08328446CE4A2B754E109D/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/model.hwx.tmp
== ANECCompile (in ANECompiler framework) ==
ANECCompile(__CFDictionary *param_1, __CFDictionary *param_2, unsigned long param_3)
param_1:
{
InputNetworks = (
{
NetworkPlistName = "net.plist";
NetworkPlistPath = "/Library/Caches/com.apple.aned/tmp/run/A2ACB9D5AA31B301563A4F62885BA379E62B0E1240E95C6902A93900FE0A9B54/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/";
}
);
OutputFileName = "model.hwx.tmp";
OutputFilePath = "/Library/Caches/com.apple.aned/20A2411/run/E68910CD1994681121EEDAFAE1BC524AA8E84CF80C42AFC0C7DE2C082C58BDFD/37C083FF396EB5948979EE20FD0457483E4ACE840AD23391A129BB83CFBC9C63/";
}
param_2:
{
TargetArchitecture = h13;
}
== Backtrace of device open ==
* frame #0: 0x00000001a68fac54 ANEServices`H11ANEDeviceOpen
frame #1: 0x00000001a78405b8 AppleNeuralEngine`__29-[_ANEDeviceController start]_block_invoke + 436
frame #2: 0x0000000193c84420 libdispatch.dylib`_dispatch_client_callout + 20
frame #3: 0x0000000193c92a98 libdispatch.dylib`_dispatch_lane_barrier_sync_invoke_and_complete + 60
frame #4: 0x00000001a78403e8 AppleNeuralEngine`-[_ANEDeviceController start] + 136
...
frame #23: 0x00000001a64a4f38 Espresso`Espresso::ANERuntimeEngine::compiler::build_segment(std::__1::shared_ptr<Espresso::abstract_batch> const&, int, Espresso::net_compiler_segment_based::segment_t const&) + 2080
...
frame #31: 0x000000019ab6099c CoreML`-[MLNeuralNetworkEngine rebuildPlan:] + 1640
== Backtrace of run? ==
* frame #0: 0x00000001a68f9108 ANEServices`H11ANEProgramProcessRequestDirect
frame #1: 0x00000001a7839694 AppleNeuralEngine`-[_ANEProgramForEvaluation processRequest:qos:qIndex:modelStringID:options:error:] + 1904
frame #2: 0x00000001a7843ba4 AppleNeuralEngine`-[_ANEClient doEvaluateDirectWithModel:options:request:qos:error:] + 1236
frame #3: 0x00000001a7842034 AppleNeuralEngine`-[_ANEClient evaluateWithModel:options:request:qos:error:] + 104
frame #4: 0x00000001a64a2988 Espresso`Espresso::ANERuntimeEngine::compiler::__forward_segment(std::__1::shared_ptr<Espresso::abstract_batch> const&, int, Espresso::net_compiler_segment_based::segment_t const&) + 2008
frame #5: 0x00000001a6414548 Espresso`Espresso::net_compiler_segment_based::__forward(std::__1::shared_ptr<Espresso::abstract_batch> const&) + 992
frame #6: 0x00000001a67e2e3c Espresso`EspressoLight::espresso_plan::dispatch_task_on_compute_batch(std::__1::shared_ptr<Espresso::abstract_batch> const&, std::__1::shared_ptr<EspressoLight::plan_task_t> const&) + 612
frame #7: 0x00000001a67ebab0 Espresso`EspressoLight::espresso_plan::execute_sync() + 356
frame #8: 0x00000001a67f26fc Espresso`espresso_plan_execute_sync + 120
frame #9: 0x000000019ab674b8 CoreML`-[MLNeuralNetworkEngine executePlan:error:] + 136
frame #10: 0x000000019ab6799c CoreML`-[MLNeuralNetworkEngine evaluateInputs:bufferIndex:options:error:] + 368

View File

@@ -1,102 +0,0 @@
import ctypes
from subprocess import CalledProcessError, check_output

from hexdump import hexdump
def get_pid(name):
  """Return the PID of the first running process matching `name`.

  Uses `pgrep`. Returns None when no process matches, when pgrep is not
  available, or when its output cannot be parsed.
  """
  try:
    output = check_output(["pgrep", name])
    # pgrep prints one PID per line when several processes match; the
    # original int(output) crashed in that case -- take the first.
    return int(output.split()[0])
  except (CalledProcessError, OSError, ValueError, IndexError):
    # CalledProcessError: pgrep exits nonzero when nothing matches.
    # OSError: pgrep binary missing. Narrowed from a bare `except:`,
    # which also swallowed KeyboardInterrupt/SystemExit.
    return None
# Grab a task port for the running amfid so we can read/patch its memory.
# task_for_pid requires appropriate privileges (SIP/debugger entitlement).
from ctypes.util import find_library
libc = ctypes.CDLL(find_library('c'))
amfid_pid = get_pid("amfid")
task = ctypes.c_uint32()
mytask = libc.mach_task_self()
ret = libc.task_for_pid(mytask, ctypes.c_int(amfid_pid), ctypes.pointer(task))
print(amfid_pid, ret, task, mytask)
#myport = libc.mach_task_self()
# ctypes mirror of the Mach vm_region_submap_short_info_data_64 struct
# returned by mach_vm_region_recurse. Field order, types, and _pack_
# must match the kernel ABI exactly -- do not reorder.
class vm_region_submap_short_info_data_64(ctypes.Structure):
  _pack_ = 1
  _fields_ = [
    ("protection", ctypes.c_uint32),
    ("max_protection", ctypes.c_uint32),
    ("inheritance", ctypes.c_uint32),
    ("offset", ctypes.c_ulonglong),
    ("user_tag", ctypes.c_uint32),
    ("ref_count", ctypes.c_uint32),
    ("shadow_depth", ctypes.c_uint16),
    ("external_pager", ctypes.c_byte),
    ("share_mode", ctypes.c_byte),
    ("is_submap", ctypes.c_uint32),
    ("behavior", ctypes.c_uint32),
    ("object_id", ctypes.c_uint32),
    ("user_wired_count", ctypes.c_uint32),
  ]
# mach_vm_region_recurse wants the info size in 32-bit words.
submap_info_size = ctypes.sizeof(vm_region_submap_short_info_data_64) // 4

# Query the first VM region of amfid: its base address reveals the ASLR
# slide of the amfid binary.
address = ctypes.c_ulong(0)
mapsize = ctypes.c_ulong(0)
count = ctypes.c_uint32(submap_info_size)
sub_info = vm_region_submap_short_info_data_64()
depth = 0
c_depth = ctypes.c_uint32(depth)
for i in range(1):
  ret = libc.mach_vm_region_recurse(task,
    ctypes.pointer(address), ctypes.pointer(mapsize),
    ctypes.pointer(c_depth), ctypes.pointer(sub_info),
    ctypes.pointer(count))
  print("aslr", hex(ret), hex(address.value), mapsize, count, sub_info.protection)
  #address.value += mapsize.value
#exit(0)

# +0x8e38 is the entitlement-check instruction in this exact amfid build
# (see README: MacOS 12.4, sha1 0f7e7f...). The patch bytes are
# 0xd2800000 little-endian, arm64 `mov x0, #0`.
patch_address = address.value + 0x8e38
patch = b"\x00\x00\x80\xd2"

# Read back the original 4 bytes at the patch site for inspection.
pdata = ctypes.c_void_p(0)
data_cnt = ctypes.c_uint32(0)
ret = libc.mach_vm_read(task, ctypes.c_ulong(patch_address), 4, ctypes.pointer(pdata), ctypes.pointer(data_cnt))
buf = ctypes.string_at(pdata.value, data_cnt.value)
hexdump(buf)

#ret = libc.mach_vm_wire(mytask, task, patch_address, 4, 3)
#print(ret)
#exit(0)

"""
ret = libc.mach_vm_read(task, address, mapsize, ctypes.pointer(pdata), ctypes.pointer(data_cnt))
buf = ctypes.string_at(pdata.value, data_cnt.value)
hexdump(buf)
ret = libc.mach_vm_deallocate(task, address, mapsize)
print("mach_vm_deallocate", ret)
ret = libc.mach_vm_allocate(task, ctypes.pointer(address), mapsize, 0)
print("mach_vm_allocate", ret)
"""

# Make the page writable (3 = VM_PROT_READ|WRITE), write the patch, then
# restore r-x (5 = VM_PROT_READ|EXECUTE).
ret = libc.mach_vm_protect(task, ctypes.c_ulong(patch_address), 4, True, 3)
print("protect", ret)

longptr = ctypes.POINTER(ctypes.c_ulong)
#shellcodePtr = ctypes.cast(buf, longptr)
#ret = libc.mach_vm_write(task, address, shellcodePtr, len(buf))
#print("write", ret)

# NOTE(review): length is len(buf) (bytes read above), not len(patch) --
# they should both be 4, but this silently depends on the earlier read.
shellcodePtr = ctypes.cast(patch, longptr)
ret = libc.mach_vm_write(task, ctypes.c_ulong(patch_address), shellcodePtr, len(buf))
print("write", ret)

#libc.mach_vm_write.argtypes = [ctypes.c_uint32, ctypes.c_ulong, longptr, ctypes.c_uint32]
#libc.mach_vm_write.restype = ctypes.c_uint32
#ret = libc.mach_vm_write(task, ctypes.c_ulong(patch_address), shellcodePtr, len(patch))

ret = libc.mach_vm_protect(task, ctypes.c_ulong(patch_address), 4, False, 5)
print("protect", ret)

View File

@@ -1,220 +0,0 @@
// ZinIrRegBitPrintOutDebug_7u_
Task_ID: 0
header = 10*4 = 0x28
aneTD.Header[0].TID = 0
aneTD.Header[0].NID = 0
aneTD.Header[0].LNID = 1
aneTD.Header[0].EON = 1
aneTD.Header[1].ExeCycles = 0
aneTD.Header[1].NextSize = 0
aneTD.Header[2].LogEvents = 1058
aneTD.Header[3].Exceptions = 0
aneTD.Header[4].DebugLogEvents = 16775274
aneTD.Header[5].DebugExceptions = 0
aneTD.Header[6].DisallowAbort = 0
aneTD.Header[6].TDSkip = 0
aneTD.Header[6].KPC = 0
aneTD.Header[6].SPL = 0
aneTD.Header[6].TSR = 0
aneTD.Header[6].SPC = 0
aneTD.Header[6].DPC = 0
aneTD.Header[6].TSE = 0
aneTD.Header[6].NextPriority = 0
aneTD.Header[6].TDE = 0
aneTD.Header[6].SrcLoc = 1
aneTD.Header[6].DstLoc = 1
aneTD.Header[6].TQDis = 0
aneTD.Header[7].NextPointer = 0
aneTD.Header[8].RBase0 = 5
aneTD.Header[8].RBE0 = 1
aneTD.Header[8].RBase1 = 0
aneTD.Header[8].RBE1 = 0
aneTD.Header[8].WBase = 4
aneTD.Header[8].WBE = 1
aneTD.Header[8].TBase = 0
aneTD.Header[8].TBE = 0
aneTD.Header[8].ENE = 1
aneTD.Header[9].KBase0 = 1
aneTD.Header[9].KBE0 = 1
aneTD.Header[9].KBase1 = 0
aneTD.Header[9].KBE1 = 0
aneTD.Header[9].KBase2 = 0
aneTD.Header[9].KBE2 = 0
aneTD.Header[9].KBase3 = 0
aneTD.Header[9].KBE3 = 0
0x28 = 00 F8 01 F4 = 0x1F800
+0x30
aneRegs.KernelDMASrc.CoeffBaseAddr[0].Addr = 0
aneRegs.KernelDMASrc.CoeffBfrSize[0].MemBfrSize = 2
aneRegs.KernelDMASrc.CoeffDMAConfig[0].CacheHint = 2
aneRegs.KernelDMASrc.CoeffDMAConfig[0].CrH = 0
aneRegs.KernelDMASrc.CoeffDMAConfig[0].En = 1
aneRegs.KernelDMASrc.CoeffDMAConfig[0].PrefetchParticipateEn = 0
aneRegs.KernelDMASrc.CoeffBaseAddr[1].Addr = 0
aneRegs.KernelDMASrc.CoeffBfrSize[1].MemBfrSize = 1
aneRegs.KernelDMASrc.CoeffDMAConfig[1].CacheHint = 2
aneRegs.KernelDMASrc.CoeffDMAConfig[1].CrH = 0
aneRegs.KernelDMASrc.CoeffDMAConfig[1].En = 0
aneRegs.KernelDMASrc.CoeffDMAConfig[1].PrefetchParticipateEn = 0
aneRegs.KernelDMASrc.CoeffBaseAddr[2].Addr = 0
aneRegs.KernelDMASrc.CoeffBfrSize[2].MemBfrSize = 1
aneRegs.KernelDMASrc.CoeffDMAConfig[2].CacheHint = 2
aneRegs.KernelDMASrc.CoeffDMAConfig[2].CrH = 0
aneRegs.KernelDMASrc.CoeffDMAConfig[2].En = 0
aneRegs.KernelDMASrc.CoeffDMAConfig[2].PrefetchParticipateEn = 0
# there's 13 more of these
aneRegs.KernelDMASrc.Spare0.Spare = 0
aneRegs.KernelDMASrc.Spare1.Spare = 0
0x124 = 00 00 00 3C = 0
+0x1d4
aneRegs.Common.Cfg.AccDoubleBufEn = 1
aneRegs.Common.Cfg.ActiveNE = 0
aneRegs.Common.Cfg.ContextSwitchIn = 0
aneRegs.Common.Cfg.ContextSwitchOut = 0
aneRegs.Common.Cfg.ShMax = 1
aneRegs.Common.Cfg.ShMin = 0
aneRegs.Common.Cfg.ShPref = 1
aneRegs.Common.Cfg.SmallSourceMode = 0
aneRegs.Common.ChCfg.InFmt = 2
aneRegs.Common.ChCfg.OutFmt = 2
aneRegs.Common.Cin.Cin = 1
aneRegs.Common.ConvCfg.Kh = 1
aneRegs.Common.ConvCfg.Kw = 1
aneRegs.Common.ConvCfg.OCGSize = 0
aneRegs.Common.ConvCfg.Ox = 1
aneRegs.Common.ConvCfg.Oy = 1
aneRegs.Common.ConvCfg.Px = 0
aneRegs.Common.ConvCfg.Py = 0
aneRegs.Common.ConvCfg.Sx = 1
aneRegs.Common.ConvCfg.Sy = 1
aneRegs.Common.Cout.Cout = 1
aneRegs.Common.DPE.Category = 0
aneRegs.Common.GroupConvCfg.ElemMultMode = 0
aneRegs.Common.GroupConvCfg.NumGroups = 1
aneRegs.Common.GroupConvCfg.UnicastCin = 1
aneRegs.Common.GroupConvCfg.UnicastEn = 1
aneRegs.Common.InDim.Hin = 1
aneRegs.Common.InDim.Win = 77
aneRegs.Common.OutDim.Hout = 1
aneRegs.Common.OutDim.Wout = 77
aneRegs.Common.Spare0.Spare = 0
aneRegs.Common.Spare1.Spare = 0
aneRegs.Common.TaskInfo.NID = 1
aneRegs.Common.TaskInfo.TaskID = 0
aneRegs.Common.TaskInfo.TaskQ = 0
aneRegs.Common.TileCfg.TileHeight = 1
0x168 = 00 38 01 6C = 0x13800
+0x220
aneRegs.TileDMASrc.BaseAddr.Addr = 0
aneRegs.TileDMASrc.DMAConfig.CacheHint = 2
aneRegs.TileDMASrc.DMAConfig.CacheHintNoReuse = 12
aneRegs.TileDMASrc.DMAConfig.CacheHintReuse = 14
aneRegs.TileDMASrc.DMAConfig.CrH = 0
aneRegs.TileDMASrc.DMAConfig.DependencyMode = 0
aneRegs.TileDMASrc.DMAConfig.En = 1
aneRegs.TileDMASrc.Fmt.CmpVec = 0
aneRegs.TileDMASrc.DepthStride.Stride = 3
aneRegs.TileDMASrc.Fmt.FmtMode = 1
aneRegs.TileDMASrc.Fmt.Interleave = 1
aneRegs.TileDMASrc.Fmt.MemFmt = 2
aneRegs.TileDMASrc.Fmt.OffsetCh = 0
aneRegs.TileDMASrc.Fmt.Shift = 0
aneRegs.TileDMASrc.Fmt.Truncate = 3
aneRegs.TileDMASrc.GroupStride.Stride = 0
aneRegs.TileDMASrc.PixelOffset[0].Offset = 0
aneRegs.TileDMASrc.PixelOffset[1].Offset = 0
aneRegs.TileDMASrc.PixelOffset[2].Offset = 0
aneRegs.TileDMASrc.PixelOffset[3].Offset = 0
aneRegs.TileDMASrc.PlaneStride.PlaneStride = 3
aneRegs.TileDMASrc.RowStride.Stride = 3
aneRegs.TileDMASrc.Spare0.Spare = 0
aneRegs.TileDMASrc.Spare1.Spare = 0
0x1dc = 00 48 00 44 = 0x4800
+0x29c
aneRegs.L2.ResultBase.Addr = 10
aneRegs.L2.ResultCfg.AliasConvRslt = 0
aneRegs.L2.ResultCfg.AliasConvSrc = 0
aneRegs.L2.ResultCfg.AliasPlanarRslt = 0
aneRegs.L2.ResultCfg.AliasPlanarSrc = 0
aneRegs.L2.ResultCfg.ResultType = 2
aneRegs.L2.ResultCfg.DMACmpVec = 0
aneRegs.L2.ResultCfg.DMAFmt = 1
aneRegs.L2.ResultCfg.DMAInterleave = 1
aneRegs.L2.ResultCfg.DMAOffsetCh = 0
aneRegs.L2.ResultCfg.L2BfrMode = 1
aneRegs.L2.ConvResultChannelStride.Stride = 0
aneRegs.L2.ConvResultRowStride.Stride = 0
aneRegs.L2.L2Cfg.InputReLU = 0
aneRegs.L2.L2Cfg.PaddingMode = 0
aneRegs.L2.Spare0.Spare = 0
aneRegs.L2.Spare1.Spare = 0
aneRegs.L2.SourceBase.Addr = 0
aneRegs.L2.SourceCfg.AliasConvRslt = 0
aneRegs.L2.SourceCfg.AliasConvSrc = 0
aneRegs.L2.SourceCfg.AliasPlanarRslt = 0
aneRegs.L2.SourceCfg.AliasPlanarSrc = 0
aneRegs.L2.SourceCfg.DMACmpVec = 0
aneRegs.L2.SourceCfg.DMAFmt = 1
aneRegs.L2.SourceCfg.DMAInterleave = 1
aneRegs.L2.SourceCfg.DMAOffsetCh = 0
aneRegs.L2.SourceCfg.Dependent = 0
aneRegs.L2.SourceCfg.SourceType = 2
aneRegs.L2.SourceChannelStride.Stride = 10
aneRegs.L2.SourceRowStride.Stride = 10
0x228 = 00 88 00 0C = 0x8800
+0x2f0
0x23C = 00 C8 00 10 = 0xC800
+0x30c
aneRegs.NE.AccBias.AccBias = 0
aneRegs.NE.AccBias.AccBiasShift = 0
aneRegs.NE.KernelCfg.GroupKernelReuse = 0
aneRegs.NE.KernelCfg.KernelFmt = 0
aneRegs.NE.KernelCfg.PalettizedBits = 8
aneRegs.NE.KernelCfg.PalettizedEn = 0
aneRegs.NE.KernelCfg.SparseFmt = 0
aneRegs.NE.MACCfg.BiasMode = 0
aneRegs.NE.MACCfg.BinaryPoint = 0
aneRegs.NE.MACCfg.KernelMode = 1
aneRegs.NE.MACCfg.MatrixBiasEn = 0
aneRegs.NE.MACCfg.NonlinearMode = 2
aneRegs.NE.MACCfg.OpMode = 4
aneRegs.NE.MACCfg.PostScaleMode = 0
aneRegs.NE.MatrixVectorBias.MatrixVectorBias = 0
aneRegs.NE.PostScale.PostRightShift = 0
aneRegs.NE.PostScale.PostScale = 15360
aneRegs.NE.Spare0.Spare = 0
aneRegs.NE.Spare1.Spare = 0
0x254 = 00 78 01 18 = 0x17800
+0x32c
aneRegs.TileDMADst.BaseAddr.Addr = 0
aneRegs.TileDMADst.DepthStride.DepthStride = 3
aneRegs.TileDMADst.DMAConfig.BypassEOW = 0
aneRegs.TileDMADst.DMAConfig.CacheHint = 3
aneRegs.TileDMADst.DMAConfig.CrH = 0
aneRegs.TileDMADst.DMAConfig.En = 1
aneRegs.TileDMADst.DMAConfig.L2BfrMode = 1
aneRegs.TileDMADst.Fmt.CmpVec = 0
aneRegs.TileDMADst.Fmt.CmpVecFill = 0
aneRegs.TileDMADst.Fmt.FmtMode = 1
aneRegs.TileDMADst.Fmt.Interleave = 1
aneRegs.TileDMADst.Fmt.MemFmt = 2
aneRegs.TileDMADst.Fmt.OffsetCh = 0
aneRegs.TileDMADst.Fmt.Shift = 0
aneRegs.TileDMADst.Fmt.Truncate = 3
aneRegs.TileDMADst.Fmt.ZeroPadFirst = 1
aneRegs.TileDMADst.Fmt.ZeroPadLast = 1
aneRegs.TileDMADst.GroupStride.GroupStride = 0
aneRegs.TileDMADst.PlaneStride.PlaneStride = 3
aneRegs.TileDMADst.RowStride.RowStride = 3
aneRegs.TileDMADst.Spare0.Spare = 0
aneRegs.TileDMADst.Spare1.Spare = 0

View File

@@ -1 +0,0 @@
libane.dylib

View File

@@ -1,210 +0,0 @@
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sstream>
#import <IOSurface/IOSurfaceRef.h>
#import <Foundation/Foundation.h>
#import <CoreFoundation/CoreFoundation.h>
#include "h11ane.h"
using namespace H11ANE;
//#define DEBUG printf
#define DEBUG(x, ...)
extern "C" {
// global vars
H11ANEDevice *dev = NULL;
// Device-controller callback: stashes the discovered ANE device in the
// global `dev` so ANE_Open can use it. Always reports success.
int MyH11ANEDeviceControllerNotification(H11ANEDeviceController *param_1, void *param_2, H11ANEDevice *param_3) {
  DEBUG("MyH11ANEDeviceControllerNotification %p %p %p\n", param_1, param_2, param_3);
  dev = param_3;
  return 0;
}
// Device message callback: log-only; messages are otherwise ignored.
int MyH11ANEDeviceMessageNotification(H11ANE::H11ANEDevice* dev, unsigned int param_1, void* param_2, void* param_3) {
  DEBUG("MyH11ANEDeviceMessageNotification %d %p %p\n", param_1, param_2, param_3);
  return 0;
}
// Bring up the first H11ANE device: discover it via the controller callback,
// open it for compilation use, and power it on. Always returns 0; asserts
// (crashes) if no device was discovered.
int ANE_Open() {
  int ret;
  H11ANEDeviceController dc(MyH11ANEDeviceControllerNotification, NULL);
  dc.SetupDeviceController();
  // The callback above must have populated the global `dev` by now.
  assert(dev != NULL);
  dev->EnableDeviceMessages();

  // NOTE(review): layout of this 0x90-byte scratch buffer is unknown; the
  // private API appears to only need zeroed storage -- confirm.
  char empty[0x90] = {0};
  H11ANEDeviceInfoStruct dis = {0};
  ret = dev->H11ANEDeviceOpen(MyH11ANEDeviceMessageNotification, empty, UsageCompile, &dis);
  DEBUG("open 0x%x %p\n", ret, dev);

  ret = dev->ANE_PowerOn();
  DEBUG("power on: %d\n", ret);
  ret = dev->ANE_IsPowered();
  DEBUG("powered? %d\n", ret);
  return 0;
}
// Bytes per row for `width` float16 elements, rounded up to the 64-byte
// row alignment the IOSurface is created with.
int stride_for_width(int width) {
  int bytes = width * 2;
  int rem = bytes % 64;
  return (rem == 0) ? bytes : bytes + (64 - rem);
}
// Allocate a width x height, 2-bytes-per-element IOSurface the ANE can DMA
// to/from, lock it for CPU access, and return it as an opaque pointer.
// Pixel format 1278226536 is fourcc 'L00h' -- presumably single-channel
// half-float; confirm against IOSurface format docs.
void *ANE_TensorCreate(int width, int height) {
  // all float16
  // input buffer
  NSDictionary* dict = [NSDictionary dictionaryWithObjectsAndKeys:
    [NSNumber numberWithInt:width], kIOSurfaceWidth,
    [NSNumber numberWithInt:height], kIOSurfaceHeight,
    [NSNumber numberWithInt:2], kIOSurfaceBytesPerElement,
    [NSNumber numberWithInt:stride_for_width(width)], kIOSurfaceBytesPerRow,
    [NSNumber numberWithInt:1278226536], kIOSurfacePixelFormat,
    nil];
  IOSurfaceRef in_surf = IOSurfaceCreate((CFDictionaryRef)dict);
  // Locked once and never unlocked: the surface stays CPU-mapped for its
  // lifetime so ANE_TensorData can hand out its base address.
  IOSurfaceLock((IOSurfaceRef)in_surf, 0, nil);
  return (void *)in_surf;
}
// CPU-visible base address of a (locked) tensor surface created by
// ANE_TensorCreate; writes through it are visible to the ANE.
void* ANE_TensorData(void *out_surf) {
  void *ret = (void *)IOSurfaceGetBaseAddress((IOSurfaceRef)out_surf);
  //IOSurfaceUnlock((IOSurfaceRef)out_surf, 0, nil);
  DEBUG("TensorData %p -> %p\n", out_surf, ret);
  return ret;
}
// Copy a compiled hwx program into page-aligned memory, register it with the
// driver, and prepare it for execution. Returns the driver's program handle,
// or 0 if creation failed.
uint64_t ANE_Compile(char *iprog, int sz) {
  int ret;
  // Checksum is debug-logging only.
  int cksum = 0;
  for (int i = 0; i < sz; i++) cksum += iprog[i];
  DEBUG("ANE_Compile %p with checksum %x size %d\n", iprog, cksum, sz);

  // NOTE(review): `prog` is never freed. Unclear whether the driver keeps
  // referencing these pages after ANE_ProgramCreate, so freeing may be
  // unsafe -- confirm before "fixing" this leak.
  char *prog = (char*)aligned_alloc(0x1000, sz);
  memcpy(prog, iprog, sz);

  H11ANEProgramCreateArgsStruct mprog = {0};
  mprog.program = prog;
  mprog.program_length = sz;

  H11ANEProgramCreateArgsStructOutput *out = new H11ANEProgramCreateArgsStructOutput;
  memset(out, 0, sizeof(H11ANEProgramCreateArgsStructOutput));
  ret = dev->ANE_ProgramCreate(&mprog, out);
  uint64_t program_handle = out->program_handle;
  delete out;
  DEBUG("program create: %lx %lx\n", ret, program_handle);
  // early failure
  if (ret != 0) return 0;

  // Magic flags value reverse engineered from traces -- meaning unknown.
  H11ANEProgramPrepareArgsStruct pas = {0};
  pas.program_handle = program_handle;
  pas.flags = 0x0000000100010001;
  ret = dev->ANE_ProgramPrepare(&pas);
  DEBUG("program prepare: %lx\n", ret);

  return program_handle;
}
// Submit one request for a prepared program (input/output and optional
// weights IOSurfaces) and block until the driver's completion message
// arrives on a fresh mach receive port. The args-struct offsets below were
// reverse engineered from traces. Always returns 0.
int ANE_Run(uint64_t program_handle, void *in_surf, void *out_surf, void *w_surf) {
  int ret;
  DEBUG("ANE_Run %p %p\n", in_surf, out_surf);

  H11ANEProgramRequestArgsStruct *pras = new H11ANEProgramRequestArgsStruct;
  memset(pras, 0, sizeof(H11ANEProgramRequestArgsStruct));

  // TODO: make real struct
  pras->args[0] = program_handle;
  pras->args[4] = 0x0000002100000003;

  // inputs
  int in_surf_id = IOSurfaceGetID((IOSurfaceRef)in_surf);
  int out_surf_id = IOSurfaceGetID((IOSurfaceRef)out_surf);
  // 0x28/8 looks like the input-buffer count/flags word: 1 input, or a
  // flagged 2 when a weights surface is also passed.
  if (w_surf != NULL) {
    pras->args[0x28/8] = 0x0000010000000002;
    int w_surf_id = IOSurfaceGetID((IOSurfaceRef)w_surf);
    pras->args[0x130/8] = (long long)w_surf_id;
  } else {
    pras->args[0x28/8] = 1;
  }
  // Surface IDs live in the high 32 bits of their slots.
  pras->args[0x128/8] = (long long)in_surf_id<<32LL;

  // outputs
  pras->args[0x528/8] = 1;
  // 0x628 = outputBufferSurfaceId
  pras->args[0x628/8] = (long long)out_surf_id<<32LL;

  // Per-request receive port for the async completion message.
  mach_port_t recvPort = 0;
  IOCreateReceivePort(kOSAsyncCompleteMessageID, &recvPort);
  DEBUG("recv port: 0x%x\n", recvPort);

  // run program
  ret = dev->ANE_ProgramSendRequest(pras, recvPort);
  DEBUG("send 0x%x\n", ret);

  // Block until the driver posts completion.
  struct {
    mach_msg_header_t header;
    char data[256];
  } message;
  ret = mach_msg(&message.header,
    MACH_RCV_MSG,
    0, sizeof(message),
    recvPort,
    MACH_MSG_TIMEOUT_NONE,
    MACH_PORT_NULL);
  DEBUG("got message: %d sz %d\n", ret, message.header.msgh_size);

  delete pras;
  return 0;
}
// Private entry point in ANECompiler.framework (forward declaration).
int ANECCompile(CFDictionaryRef param_1, CFDictionaryRef param_2, unsigned long param_3);

// Compile a network plist (looked up relative to "./") into model.hwx using
// the private ANECCompile API; `debug` enables maximum compiler logging.
// NOTE(review): the CF objects created here are never CFRelease'd.
int ANE_CompilePlist(char *path, bool debug=false) {
  CFTypeRef ikeys[2];
  ikeys[0] = CFSTR("NetworkPlistName");
  ikeys[1] = CFSTR("NetworkPlistPath");

  CFTypeRef ivalues[2];
  ivalues[0] = CFStringCreateWithCString(kCFAllocatorDefault, path, kCFStringEncodingUTF8);
  ivalues[1] = CFSTR("./");

  CFDictionaryRef iDictionary = CFDictionaryCreate(kCFAllocatorDefault, ikeys, ivalues, 2, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
  CFArrayRef array = CFArrayCreate(kCFAllocatorDefault, (const void**)&iDictionary, 1, &kCFTypeArrayCallBacks);

  CFMutableDictionaryRef optionsDictionary = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
  CFMutableDictionaryRef flagsDictionary = CFDictionaryCreateMutable(kCFAllocatorDefault, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);

  // h11 (or anything?) works here too, and creates different outputs that don't run
  CFDictionaryAddValue(flagsDictionary, CFSTR("TargetArchitecture"), CFSTR("h13"));
  CFDictionaryAddValue(optionsDictionary, CFSTR("OutputFileName"), CFSTR("model.hwx"));

  if (debug) {
    CFDictionaryAddValue(flagsDictionary, CFSTR("CompileANEProgramForDebugging"), kCFBooleanTrue);
    int debug_mask = 0x7fffffff;
    CFDictionaryAddValue(flagsDictionary, CFSTR("DebugMask"), CFNumberCreate(kCFAllocatorDefault, 3, &debug_mask));
  }

  return ANECCompile(optionsDictionary, flagsDictionary, 0);
}
/*void _Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE(
unsigned long param_1, void *param_2,int param_3, std::basic_ostream<char> *param_4);
char *ANE_RegDebug(int a1, void *dat, int a2) {
std::ostringstream ss;
_Z24ZinIrRegBitPrintOutDebugILj7EE11ZinIrStatusjRN11ZinHWTraitsIXT_EE6HwTypeEiRNSt3__113basic_ostreamIcNS5_11char_traitsIcEEEE(a1, dat, a2, &ss);
std::string cppstr = ss.str();
const char *str = cppstr.c_str();
char *ret = (char *)malloc(strlen(str)+1);
strcpy(ret, str);
return ret;
}*/
}

View File

@@ -1,222 +0,0 @@
#!/usr/bin/env python3
from pathlib import Path
from ctypes import *
import json
import collections
import numpy as np
import faulthandler
import struct
faulthandler.enable()
basedir = Path(__file__).resolve().parent
libane = None
aneregs = None
def init_libane():
  """Load libane.dylib, declare its ctypes signatures, and load the register
  bit-layout table (aneregs.json) used by ANE.pack/unpack."""
  global libane, aneregs
  libane = cdll.LoadLibrary((basedir / "libane.dylib").as_posix())

  libane.ANE_Compile.argtypes = [c_char_p, c_int]
  libane.ANE_Compile.restype = c_void_p
  libane.ANE_TensorCreate.restype = c_void_p
  libane.ANE_TensorData.argtypes = [c_void_p]
  libane.ANE_TensorData.restype = POINTER(c_uint16)
  libane.ANE_Run.argtypes = [c_void_p]*4
  libane.ANE_Run.restype = c_int
  #libane.ANE_RegDebug.restype = c_char_p

  with open(basedir / "aneregs.json") as f:
    aneregs = json.load(f)
ANE_Struct = [
# aneTD.Header
("u32", 0x1C, "NextCommandOffset"),
# KernelDMASrc @ section @ 0x2C len 0xF4
# reloc 0x2c-0x34?? = weights
# u32[16] 0x34-0x74 = 0x80 | 1 if used
# u32[16] 0x74-0xB4 = <channel data offset>
# u32[16] 0xB4-0xF4 = <channel data length>
# Common @ section @ 0x128 len 0x3C (conv)
("u16", 0x128, "InputWidth"),
("u16", 0x12A, "InputHeight"),
("u16", 0x12C, "InputDepth"),
("u32", 0x130, "InputOutputType"), # (OutputType * 0x10) | InputType
# UInt8 = 0, Int8 = 1, Float16 = 2
("u32", 0x134, "InputChannels"),
("u32", 0x138, "OutputChannels"),
("u16", 0x13C, "OutputWidth"),
("u16", 0x13E, "OutputHeight"),
("u16", 0x140, "OutputDepth"),
("u16", 0x144, "KernelSize"), # 0xa000 | (KernelHeight * 0x20) | KernelWidth
("u16", 0x146, "Padding"), # 0x5000 | (PadTop * 0x40) | (PadLeft * 2)
("u16", 0x14C, "BatchSize"),
# TileDMASrc @ section @ 0x16C len 0x6C (input)
# reloc 0x16c-0x174 = image
("u32", 0x178, "InputRowStride"),
("u32", 0x17C, "InputPlaneStride"),
("u32", 0x180, "InputDepthStride"),
("u32", 0x184, "InputBatchStride"),
("u8", 0x1A7, "InputInterleave"),
# L2 @ section @ 0x1E0 len 0x44
# [0x1ec, 0x1f0, 0x1f4, 0x1f8, 0x214] = number of engines
# [0x1f0, 0x1f4, 0x1f8, 0x214] = engines for inconv?
# [0x21c, 0x220, 0x224] = engines for outconv?
# NE @ section @ 0x22c len 0xC (scaling)
("u16", 0x230, "BiasScalar"),
("u16", 0x232, "ScaleScalar"),
# section @ 0x240 len 0x10
("u16", 0x246, "NeuronType"), # 0x10 = copy, 0x11 = ReLU, 0x12 = custom
("u32", 0x250, "PostScale"),
# TileDMADst @ section @ 0x258 len 0x18
# HandleTileDmaDstConfig
# 0x258 -- *(uint *)(this + 0x334) = *(uint *)(this + 0x334) & 0xfffffc3f | 0xc0;
# (GetCacheHintRegisterValue & 0xf) << 6;
("u32", 0x25C, "OutputOffset"), # offset into output buffer to write at?
# 0x260 -- *(uint *)(this + 0x33c) = *(uint *)(this + 0x33c) & 0x3f | (int)uVar10 << 6;
("u32", 0x260, "OutputRowStride"),
("u32", 0x264, "OutputPlaneStride"),
("u32", 0x268, "OutputDepthStride"),
("u32", 0x26C, "OutputBatchStride"),
# 0x270 -- *(uint *)(this + 0x34c) = *(uint *)(this + 0x34c) & 0xf0ffffff | 0x1000000;
# uVar6 = *(uint *)(this + 0x34c) & 0xffffcfcc | 0x2031;
# (ZinTensorDescriptorDmaInterleave & 0xf) << 0x18;
("u8", 0x273, "OutputInterleave"), # i also have this at 0x211?
]
# Name -> (struct format char, byte offset) lookup derived from ANE_Struct;
# used by ANE.filln to patch named fields in a hwx blob.
ANE_Struct_Dict = {
  nam: ({"u32": "I", "u16": "H", "u8": "B"}[typ], num)
  for typ, num, nam in ANE_Struct
}
class ANETensor:
  """A flat float16 tensor backed by an IOSurface the ANE can DMA to/from."""
  def __init__(self, *shape):
    self.shape = shape
    self.dtype = np.float16
    self.sz = int(np.prod(shape))
    # 0x4000-element cap -- presumably a single-surface/tile limit; confirm.
    assert(self.sz <= 0x4000)
    # Surface is allocated flat (sz x 1); shape is tracked only Python-side.
    self.tt = libane.ANE_TensorCreate(self.sz, 1)
    assert(self.tt is not None)

  def data(self):
    """numpy float16 view aliasing the surface memory; writes through it are
    visible to the ANE (no copy)."""
    data = libane.ANE_TensorData(self.tt)
    assert(data is not None)
    #print(hex(addressof(data.contents)))
    buf = np.ctypeslib.as_array(data, shape=(self.sz,))
    ret = np.frombuffer(buf, dtype=self.dtype)
    #print(ret.data)
    return ret
class ANE:
  """Python driver wrapper: compile/run hwx programs and pack/unpack the
  reverse-engineered register blob at 0x4000..0x4300 of a hwx file."""
  def __init__(self):
    init_libane()
    libane.ANE_Open()

  def compile(self, dat):
    """Register a hwx blob with the driver; returns an opaque program handle."""
    ret = libane.ANE_Compile(create_string_buffer(dat), len(dat))
    assert(ret is not None)
    return ret

  def run(self, prog, tin, tout, tweights=None):
    """Run a compiled program with input/output (and optional weights) ANETensors."""
    libane.ANE_Run(prog, tin.tt, tout.tt, tweights.tt if tweights is not None else 0)

  def tensor(self, shape):
    # NOTE(review): passes the shape tuple as a single positional arg, so
    # ANETensor.shape becomes (shape,); np.prod still gives the right size.
    return ANETensor(shape)

  def unpack(self, dat):
    """Decode a register blob into {name: value} using aneregs.json, whose
    entries are (name, [byte_offset, bit_offset, bit_size])."""
    dat = struct.unpack("Q"*(len(dat)//8), dat)
    ret = {}
    for k,v in aneregs:
      by,bi,sz = v
      # Convert byte+bit position into a qword index plus bit offset.
      bi += (by%8)*8
      by //= 8
      rv = (dat[by] >> bi) & ((1 << sz)-1)
      ret[k] = rv
    return ret

  def pack(self, pk, dat):
    """Inverse of unpack: write {name: value} fields back into the blob."""
    dat = list(struct.unpack("Q"*(len(dat)//8), dat))
    for k,v in aneregs:
      by,bi,sz = v
      bi += (by%8)*8
      by //= 8
      # Clear the field's bits, then OR in the new value.
      dat[by] &= ~(((1 << sz)-1) << bi)
      dat[by] |= pk[k] << bi
    dat = struct.pack("Q"*len(dat), *dat)
    return dat

  def debug(self, dat, mems=0):
    """Pretty-decode a register blob via libane's ANE_RegDebug by re-laying
    the packed sections out at the offsets the debug printer expects."""
    # Section offsets and lengths in the printer's expected layout.
    add = [0x30, 0x1d4, 0x220, 0x29c, 0x2f0, 0x30c, 0x32c]
    lens = [244, 60, 108, 68, 12, 16, 24]
    ptr = 0x2b
    ddat = dat[0:0x28]
    for a, pm in zip(add, lens):
      #assert pm == dat[ptr]
      # Zero-pad up to the section start, then copy the section body.
      ddat += b"\x00" * (a-len(ddat))
      ddat += dat[ptr+1:ptr+1+pm+4]
      ptr += pm+8
    ddat += b"\x00" * 0x100
    # Parse "name = value" lines out of the decoder's text output.
    ret = collections.OrderedDict()
    for ln in libane.ANE_RegDebug(0, create_string_buffer(ddat), mems).decode('utf-8').strip().split("\n"):
      lnn = ln.split(" = ")
      if len(lnn) == 2:
        ret[lnn[0]] = int(lnn[1])
    return ret

  def filln(self, dat, nvdict, base=0x4000):
    """Patch named fields (see ANE_Struct) into a mutable hwx byte list."""
    for n,v in nvdict.items():
      styp, num = ANE_Struct_Dict[n]
      dat = self.fill(dat, [num], styp, v)
    return dat

  def fill(self, dat, addrs, type, val, base=0x4000):
    """Write struct-packed `val` at each base+addr offset of the byte list."""
    x = struct.pack(type, val)
    for a in addrs:
      dat[base+a:base+a+len(x)] = x
    return dat
if __name__ == "__main__":
ane = ANE()
tin = ANETensor(16)
tout = ANETensor(16)
tind = tin.data()
toutd = tout.data()
tind[0:4] = [-1,1,-2,2]
print("** before **")
print(tind)
print(toutd)
dat = open("../ops/relu.hwx", "rb").read()
md = dat[0x4000:0x4300]
dd = ane.unpack(md)
mdf = ane.pack(dd, md)
assert(md == mdf)
comp = ane.compile(dat)
ret = ane.run(comp, tin, tout)
print("** after **")
print(tind)
print(toutd)

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +0,0 @@
#!/bin/bash
clang++ ane.mm --shared -F /System/Library/PrivateFrameworks/ -framework ANEServices -framework IOSurface -framework Foundation -framework IOKit -framework ANECompiler -o libane.dylib

View File

@@ -1 +0,0 @@
../3_run/entitlements.xml

View File

@@ -1 +0,0 @@
../3_run/h11ane.h

View File

@@ -1,3 +0,0 @@
#!/bin/bash
codesign --force --entitlements entitlements.xml -s "Taylor Swift Child" /opt/homebrew/Cellar/python@3.9/3.9.1_1/Frameworks/Python.framework/Versions/3.9/Resources/Python.app/Contents/MacOS/Python

View File

@@ -1,91 +0,0 @@
#!/usr/bin/env python3
import time
from ane import ANE, ANETensor
def benchmark(ane):
  """Time 1000 runs of the prebuilt gemm.hwx program and print gigaops/sec."""
  tin = ANETensor(512*0x20)
  tout = ANETensor(512*0x20)
  dat = open("../ops/gemm.hwx", "rb").read()
  # Dump the decoded register state of the program being benchmarked.
  for k,v in ane.debug(dat[0x4000:0x4300], 16).items():
    print(k,v)
  comp = ane.compile(dat)
  st = time.time()
  for i in range(1000):
    ret = ane.run(comp, tin, tout)
  et = time.time()
  ts = (et-st)
  # NOTE(review): assumes each run is a 512x512 GEMM (2 ops per MAC) --
  # confirm against what gemm.hwx actually computes.
  ops = 1000*512*512*2
  print("%.2f ms, %.2f gigaops/sec" % (ts*1000, ops*1e-9/ts))
if __name__ == "__main__":
ane = ANE()
# 0x20 per row
tin = ANETensor(0x60)
tout = ANETensor(0x60)
tw = ANETensor(0x60)
tind = tin.data()
toutd = tout.data()
twd = tw.data()
#tind[0:4] = [-1,1,-2,2]
tind[0] = 1
tind[0x20] = -2
tind[0x40] = 3
# toutd[0] = \
# tind[0] * twd[0] + \
# tind[0x20] + twd[1] + \
# tind[0x40] + twd[2]
twd[0] = 4
twd[1] = 0x100
twd[0x20] = 5
twd[0x21] = 5
twd[0x22] = 5
twd[0x40] = 12
print("** before **")
print(tind)
print(toutd)
#benchmark(ane)
#exit(0)
"""
dat = list(open("../ops/sum.hwx", "rb").read())
dat = bytes(dat)
for k,v in ane.debug(dat[0x4000:0x4300], 16).items():
print(k,v)
comp = ane.compile(dat)
ret = ane.run(comp, tin, tout, tw)
"""
datb = open("../ops/sum.hwx", "rb").read()
dat = open("../ops/conv.hwx", "rb").read()
dd = ane.unpack(dat[0x4000:0x4300])
# use the 3rd arg as the weights
dd["aneTD.Header[9].KBase0"] = 6
dd["aneRegs.NE.PostScale.PostScale"] = 0x3c00
#dd["aneRegs.L2.L2Cfg.InputReLU"] = 1
#dd["aneRegs.NE.MACCfg.NonlinearMode"] = 1
#dd["aneRegs.TileDMADst.Fmt.MemFmt"] = 0
#dd["aneRegs.L2.ResultBase.Addr"] = 0
#dd["aneRegs.Common.ChCfg.InFmt"] = 1
#dd["aneRegs.TileDMADst.Fmt.ZeroPadFirst"] = 0
#dd["aneRegs.TileDMADst.DMAConfig.En"] = 0
for k,v in dd.items():
print(k,v)
dat = datb[:0x4000] + ane.pack(dd, dat[0x4000:0x4300]) + datb[0x4300:]
comp = ane.compile(dat)
ret = ane.run(comp, tin, tout, tw)
print("** after **")
print(tind)
print(toutd)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,39 +0,0 @@
from functools import lru_cache
from .tensor import Device, Function, register
@lru_cache
def compile_wrapper(ane, dat):
  # dat must be bytes (hashable) so lru_cache can key on it; identical hwx
  # blobs are only compiled once per ane instance.
  return ane.compile(dat)
def roundup(x, v):
  """Round x up to the next multiple of v (unchanged if already aligned)."""
  return (x + v - 1) // v * v
@lru_cache
def compile_relu(ane, sz):
  """Patch the relu.hwx template for an sz-element float16 tensor and
  compile it; cached per (ane, sz)."""
  dat = list(open("accel/ane/ops/relu.hwx", "rb").read())
  # TODO: make this all nice and once
  # number of engines? (max 0x100)
  l2_stride = max(0x100, roundup(sz*2, 0x10))
  # 0x1ec = L2.SourceChannelStride.Stride, 0x1f0 = L2.SourceRowStride.Stride
  # 0x1f4, 0x1f8?
  # 0x214 = L2.ResultBase.Addr
  dat = ane.fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8, 0x214], "I", l2_stride)
  # DMA strides: one row of sz float16s, rounded to 64-byte alignment.
  stride = roundup(sz*2, 0x40)
  dat = ane.filln(dat, {
    "NeuronType": 0x11, # 0x10 makes this a copy, 0x11 = ReLU, 0x12 = crash
    "InputWidth": sz, "OutputWidth": sz,
    "InputRowStride": stride, "InputPlaneStride": stride, "InputDepthStride": stride,
    "OutputRowStride": stride, "OutputPlaneStride": stride, "OutputDepthStride": stride,
  })
  return compile_wrapper(ane, bytes(dat))
class ReLU(Function):
  """tinygrad Function running elementwise ReLU on the Apple Neural Engine."""
  def forward(ctx, input):
    # Allocate an output ANETensor of the same shape and run the patched
    # relu.hwx program on the device.
    ret = ctx.ane.tensor(input.shape)
    ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret)
    return ret

  def backward(ctx, grad_output):
    # FIXME(review): this is not the ReLU gradient (grad_output masked where
    # input > 0) -- returning 0 kills all gradients through this op.
    return 0

register('relu', ReLU, device=Device.ANE)

View File

@@ -1 +0,0 @@
a.out

View File

@@ -1,2 +0,0 @@
source /opt/intel/oneapi/compiler/latest/env/vars.sh
sycl-ls

View File

@@ -1,57 +0,0 @@
import time
onnx_path = "/tmp/my.onnx"
N = 2048
CNT = 400
"""
import torch
import torch.nn as nn
#dtype = torch.bfloat16
dtype = torch.float32
class MatMul(nn.Module):
def __init__(self):
super().__init__()
self.a = nn.Linear(N, N, bias=False)
def forward(self, x):
x = x.to(dtype)
for i in range(CNT): x = self.a(x).relu()
return x.to(torch.float32)
torch_model = MatMul().to(dtype)
torch.onnx.export(torch_model, torch.randn(N, N), onnx_path)
"""
"""
import onnx
from tinygrad.tensor import Tensor
from extra.onnx import get_run_onnx
out = get_run_onnx(onnx.load(onnx_path))({"onnx::MatMul_0": Tensor.zeros(N, N)})
for x in out.values(): x.realize()
"""
from openvino.runtime import Core

# Enumerate OpenVINO devices so we can see what 'GPU.0' maps to.
core = Core()
devices = core.available_devices
for device in devices:
  device_name = core.get_property(device, "FULL_DEVICE_NAME")
  print(f"{device}: {device_name}")

# Compile the exported ONNX matmul chain for the first GPU and fill every
# input tensor with the constant 2.
model = core.read_model(onnx_path)
compiled_model = core.compile_model(model, device_name='GPU.0')
print(compiled_model)
ireq = compiled_model.create_infer_request()
for model_input in compiled_model.inputs:
  tensor = ireq.get_tensor(model_input)
  tensor.data[:] = 2
  print(tensor)
print("request")

# Two warm-up inferences before timing.
ireq.infer()
ireq.infer()
print("did one")

REPS = 20
st = time.perf_counter()
for i in range(REPS): ireq.infer()
et = time.perf_counter() - st
# CNT chained NxN matmuls per inference, 2 flops per MAC.
print(f"{et*1000:.2f} ms {(CNT*N*N*N*REPS*2/et)*1e-9:.2f} GFLOPS")

View File

@@ -1,3 +0,0 @@
#!/bin/bash -e
/opt/intel/oneapi/compiler/latest/linux/bin-llvm/clang++ joint_matrix_bfloat16.cpp -fsycl
SYCL_PI_TRACE=1 ./a.out

View File

@@ -1,173 +0,0 @@
//==-------- joint_matrix_bfloat16.cpp - DPC++ joint_matrix----------- ----==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix
// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
#include <cstdint>
#include <cstring>
#include <iostream>

#include <sycl/sycl.hpp>
using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;
using bfloat16 = sycl::ext::oneapi::bfloat16;
//#define SG_SZ 16
#define SG_SZ 8
#define TM 8
#define TN SG_SZ
//#define TK 16
#define TK 16
#define BF16_EPSILON 0.00781250
// Minimal non-owning matrix handle: a typed pointer plus compile-time
// dimensions carried in the template parameters (never dereferenced here).
template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
private:
  T *mat;

public:
  T *get_data() { return mat; }
  void set_data(T *data) { mat = data; }
  big_matrix(T *data) : mat(data) {}
};
// C (MxN fp32) += A (MxK bf16) * B (KxN bf16, pre-packed VNNI as K/2 x 2N),
// using joint_matrix tiles of TM x TN and accumulating over K in TK steps.
// One subgroup of SG_SZ work-items computes one output tile.
template <typename T1, typename T2, size_t M, size_t N, size_t K>
void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A, big_matrix<T2, K / 2, N * 2> &B) {
  size_t NDRangeM = M / TM;
  size_t NDRangeN = N / TN;
  buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, K));
  buffer<bfloat16, 2> bufB(B.get_data(), range<2>(K, N));
  buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));

  auto program = [&](handler &cgh) {
    auto accC = bufC.get_access<access::mode::read_write>(cgh);
    auto accA = bufA.get_access<access::mode::read_write>(cgh);
    auto accB = bufB.get_access<access::mode::read_write>(cgh);

    cgh.parallel_for<class imatrix>(
      nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
      [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
      {
        // The submatrix API has to be accessed by all the workitems in a
        // subgroup these functions will be called once by the subgroup no
        // code divergence between the workitems
        const auto global_idx = spmd_item.get_global_id(0);
        const auto global_idy = spmd_item.get_global_id(1);
        const auto sg_startx = global_idx - spmd_item.get_local_id(0);
        const auto sg_starty = global_idy - spmd_item.get_local_id(1);

        sub_group sg = spmd_item.get_sub_group();
        joint_matrix<sub_group, bfloat16, use::a, TM, TK, layout::row_major> sub_a;
        // For B, we assume B has been already VNNIed.
        joint_matrix<sub_group, bfloat16, use::b, TK, TN, ext::intel::experimental::matrix::layout::packed> sub_b;
        joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;

        // Load the C tile once, accumulate over the K dimension, store once.
        joint_matrix_load(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major);
        for (int k = 0; k < K / TK; k += 1) { //
          joint_matrix_load(sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, K);
          joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, N * 2);
          sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
        }
        joint_matrix_store(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major);
      }); // parallel for
  };

  // Time kernel submission separately from completion.
  queue q;
  auto start = std::chrono::steady_clock::now();
  auto e = q.submit(program);
  auto submit = std::chrono::steady_clock::now();
  e.wait();
  auto end = std::chrono::steady_clock::now();
  std::cout << "submit: " << std::chrono::duration_cast<std::chrono::milliseconds>(submit - start).count() << " ms" << std::endl;
  std::cout << "compute: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - submit).count() << " ms" << std::endl;
  // ahh, freeing is slow
}
//#define SCALE 1024
//#define SCALE 64
#define SCALE 256
static constexpr size_t MATRIX_M = TM * SCALE;
static constexpr size_t MATRIX_N = TN * SCALE;
static constexpr size_t MATRIX_K = TK * SCALE;
bfloat16 A[MATRIX_M][MATRIX_K];
bfloat16 B[MATRIX_K / 2][MATRIX_N * 2];
float C[MATRIX_M][MATRIX_N];
float D[MATRIX_M][MATRIX_N];
// Widen a bfloat16 to float: bfloat16 is exactly the upper 16 bits of an
// IEEE-754 binary32, so place its raw bits in the high half of a uint32 and
// reinterpret.
float make_fp32(bfloat16 x) {
  // The original read 4 bytes through an int* from a 2-byte object --
  // a strict-aliasing violation and an out-of-bounds read that only worked
  // by accident on little-endian. memcpy exactly the 16 valid bits instead.
  uint16_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  uint32_t y = static_cast<uint32_t>(bits) << 16;
  float res;
  std::memcpy(&res, &y, sizeof(res));
  return res;
}
// Scalar reference GEMM over the same VNNI-packed data: each int in A_mem
// and B_mem holds a pair of adjacent-K bfloat16s, so the inner i-loop
// accumulates both halves of the pair into the fp32 C element.
void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N,
                         int K) {
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++) {
      for (int k = 0; k < K; k++) {
        // Because B was assumed VNNIed
        bfloat16 *va = (bfloat16 *)(A_mem + m * K + k);
        bfloat16 *vb = (bfloat16 *)(B_mem + k * N + n);
        float acc = *((float *)(C_mem + m * N + n));
        for (int i = 0; i < 2; i++) {
          acc += (make_fp32(va[i]) * make_fp32(vb[i]));
        }
        *((float *)(C_mem + m * N + n)) = acc;
      }
    }
}
int main() {
  // Initialize A[i][j] = i + j and B (already in VNNI layout) = 2i + 3j,
  // with C and D seeded to 1 so the accumulate path is exercised.
  for (int i = 0; i < MATRIX_M; i++) {
    for (int j = 0; j < MATRIX_K; j++) {
      A[i][j] = bfloat16(1.0f * (i + j));
    }
  }
  for (int i = 0; i < MATRIX_K / 2; i++) {
    for (int j = 0; j < MATRIX_N * 2; j++) {
      B[i][j] = bfloat16(2.0f * i + 3.0f * j);
    }
  }
  for (int i = 0; i < MATRIX_M; i++) {
    for (int j = 0; j < MATRIX_N; j++) {
      C[i][j] = 1.0;
      D[i][j] = 1.0;
    }
  }

  std::cout << "M" << MATRIX_M << "N" << MATRIX_N << "K" << MATRIX_K << std::endl;
  big_matrix<float, MATRIX_M, MATRIX_N> MC((float *)&C);
  big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
  big_matrix<bfloat16, MATRIX_M, MATRIX_K> MA((bfloat16 *)&A);
  big_matrix<bfloat16, MATRIX_K / 2, MATRIX_N * 2> MB((bfloat16 *)&B);
  matrix_multiply(MC, MA, MB);

  // Scalar reference verification disabled -- it is very slow at SCALE=256.
  /*start = std::chrono::steady_clock::now();
  matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, MATRIX_N, MATRIX_K / 2);
  end = std::chrono::steady_clock::now();
  std::cout << "Elapsed time in milliseconds (reference): " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
  bool res = true;
  for (int i = 0; i < MATRIX_M; i++) {
    for (int j = 0; j < MATRIX_N; j++) {
      if ((fabs(C[i][j]) - fabs(D[i][j])) > BF16_EPSILON)
        res = false;
    }
  }
  std::cout << (res ? "passed" : "failed") << std::endl;
  return !res;*/
  return 0;
}

View File

@@ -1,127 +0,0 @@
Google's TPU
--------------------------------------------------------------------
We document the Google TPU v2/v3 in order to support it in tinygrad without the XLA compiler.
## Creating a Google Cloud TPU VM
This costs $4.50/hr for a TPUv2-8 machine, the cheapest VM.
```bash
gcloud alpha compute tpus tpu-vm create test --zone=us-central1-b --accelerator-type=v2-8 --version=v2-alpha
gcloud alpha compute tpus tpu-vm ssh test --zone us-central1-b
# and for when you are done
gcloud alpha compute tpus tpu-vm delete test --zone us-central1-b
gcloud alpha compute tpus tpu-vm list --zone us-central1-b
```
Aside from the usual VM stuff, there are 4 accelerators on the PCI-E bus. (v2-8 is 4 chips with 2 cores each)
```
# lspci
00:04.0 Unassigned class [ff00]: Google, Inc. Device 0027
00:05.0 Unassigned class [ff00]: Google, Inc. Device 0027
00:06.0 Unassigned class [ff00]: Google, Inc. Device 0027
00:07.0 Unassigned class [ff00]: Google, Inc. Device 0027
```
They show up in `/sys/class/accel` (tons of files here) and the driver lives in `/lib/libtpu.so`. The devices are in `/dev/accel[0-3]`, and a bunch of stuff is mmaped. They are "ba16c7433" chips.
We grab the minimal TPU [example from TensorFlow](https://github.com/tensorflow/tensorflow/blob/695b4c93d5da7277eb845937b79b66f9f363ed94/tensorflow/compiler/xla/python/tpu_driver/client/libtpu_client.c). When the compiler runs, it produces tons of great logs in `/tmp/tpu_logs`
```bash
cd tfexample
gcc -o libtpu_client libtpu_client.c -ltpu
TPU_VLOG_LEVEL=99 ./libtpu_client
```
From these logs, we find the "LLO Instructions"
## VLIW Instruction (322b VLIW bundle)
```
spare : 0 (0,1)
vex_mxu : 0 (1,1)
* 1 misc slot
msc_targ : 0 (2,3)
msc_opnd : 0 (5,3)
msc_op : 0 (8,5)
msc_pred : 31 (13,5)
* 2 matrix slots (push, pop)
vres_dest : 28 (18,2)
vres_op : 28 (20,2)
vres_pred : 31 (22,5)
vex_source : 28 (27,2)
vex_subop : 24 (29,3)
vex_op : 24 (32,3)
vex_pred : 31 (35,5)
* 4 vector slots (2 for load/store)
vld_ttu : 30 (40,1)
vld_stride : 24 (41,3)
vld_offset : 24 (44,2)
vld_base : 24 (46,2)
vld_submsk : 24 (48,3)
vld_dest : 0 (51,5)
vld_op : 0 (56,2)
vld_pred : 31 (58,5)
vst_ttu : 30 (63,1)
vst_iar : 30 (64,1)
vst_value_two : 24 (65,3)
vst_offset : 24 (68,2)
vst_base : 24 (70,2)
vst_value_one : 24 (72,3)
vst_source : 0 (75,5)
vst_op : 0 (80,5)
vst_pred : 31 (85,5)
* 4 vector slots (2 for ALU)
v1_dest : 0 (90,5)
v1_y_vreg : 0 (95,5)
v1_y_src : 0 (100,5)
v1_x : 0 (105,5)
v1_op : 0 (110,6)
v1_pred : 31 (116,5)
v0_dest : 0 (121,5)
v0_y_vreg : 0 (126,5)
v0_y_src : 0 (131,5)
v0_x : 0 (136,5)
v0_op : 0 (141,6)
v0_pred : 31 (147,5)
* 3 scalar registers copied in to the vector units?
vs2 : 0 (152,5)
vs1 : 0 (157,5)
vs0 : 0 (162,5)
* 6 immediates (16-bit each, two can be merged for 32)
imm_5 : 0 (167,16)
imm_4 : 0 (183,16)
imm_3 : 0 (199,16)
imm_2 : 0 (215,16)
imm_1 : 0 (231,16)
imm_0 : 0 (247,16)
* ttu? what's a ttu?
ttu_set_btr : 0 (263,1)
ttu_iterate : 0 (264,1)
ttu_row : 0 (265,3)
* 2 scalar slots
s1_dest : 0 (268,5)
s1_y : 0 (273,6)
s1_x : 0 (279,5)
s1_op : 0 (284,6)
s1_pred : 31 (290,5)
s0_dest : 0 (295,5)
s0_y : 0 (300,6)
s0_x : 0 (306,5)
s0_op : 0 (311,6)
s0_pred : 15 (317,5)
```
## Running a Program (WIP)
Our goal is to run a program on TPU without the driver.
```
...
openat(AT_FDCWD, "/dev/accel3", O_RDWR) = 184
mmap(NULL, 27799736, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_LOCKED, 184, 0) = 0x7f59a74b3000
# size is 0x1a830b8, aka 28MB
```

View File

@@ -1,303 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTPU_H_
#define TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTPU_H_
#include <stdbool.h>
#include <stdint.h>
#define TPUDRIVER_CAPI_EXPORT __attribute__((visibility("default")))
#ifdef __cplusplus
extern "C" {
#endif
// ------------------- TPU Driver Support -----------------------
struct TpuDriverFn;
typedef struct TpuDriver TpuDriver;
typedef struct TpuEvent TpuEvent;
typedef struct TpuBufferHandleInternal TpuBufferHandleInternal;
typedef struct TpuCompiledProgramHandleInternal
TpuCompiledProgramHandleInternal;
typedef struct TpuLoadedProgramHandleInternal TpuLoadedProgramHandleInternal;
/* Handle to a device buffer; allocation completion is signaled via `event`. */
typedef struct TpuBufferHandle {
  TpuBufferHandleInternal* internal_handle;
  TpuEvent* event;
  int64_t size_in_bytes;
} TpuBufferHandle;
/* Result of compiling an HLO program (not yet loaded onto a core). */
typedef struct TpuCompiledProgramHandle {
  TpuCompiledProgramHandleInternal* internal_handle;
  TpuEvent* event;
} TpuCompiledProgramHandle;
/* A compiled program that has been loaded onto a specific TPU core. */
typedef struct TpuLoadedProgramHandle {
  TpuLoadedProgramHandleInternal* internal_handle;
  TpuEvent* event;
} TpuLoadedProgramHandle;
// HloProto is a serialized xla::HloProto buffer.
typedef struct HloProto {
  void* buffer;
  int32_t size;
} HloProto;
// DeviceAssignment is a serialized xla::DeviceAssignmentProto buffer.
typedef struct DeviceAssignment {
  void* bytes;
  int32_t size;
} DeviceAssignment;
/* Status report; a nonzero `code` indicates failure (see msg for details). */
typedef struct TpuStatus {
  int32_t code;
  char* msg;
} TpuStatus;
/* Serialized program shape plus the status of the query that produced it. */
typedef struct CompiledProgramShape {
  struct TpuStatus* status;
  void* bytes;
  int32_t size;
} CompiledProgramShape;
/* Opaque serialized allocation-shape buffer. */
typedef struct TpuAllocationShape {
  void* bytes;
  int32_t size;
} TpuAllocationShape;
/* Opaque serialized system-information buffer (from QuerySystemInfo). */
typedef struct TpuSystemInfo {
  void* bytes;
  int32_t size;
} TpuSystemInfo;
typedef void(PrototypeTpuDriver_Initialize)(struct TpuDriverFn* driver_fn,
bool initialize);
typedef struct TpuDriver*(PrototypeTpuDriver_Open)(const char* worker);
typedef void(PrototypeTpuDriver_Close)(struct TpuDriver* driver);
typedef struct TpuStatus*(PrototypeTpuDriver_Reset)(struct TpuDriver* driver);
typedef struct TpuSystemInfo*(PrototypeTpuDriver_QuerySystemInfo)(
struct TpuDriver* driver);
typedef void(PrototypeTpuDriver_FreeSystemInfo)(struct TpuSystemInfo* info);
// TODO(frankchn): Make this not a hard-coded constant.
const int32_t MemoryRegion_HBM = 1;
typedef int64_t(PrototypeTpuDriver_ComputeLinearizedBytesFromShape)(
struct TpuDriver* driver, const struct TpuAllocationShape shape);
typedef struct TpuStatus*(PrototypeTpuDriver_LinearizeShape)(
struct TpuDriver* driver, void* dst, const void* src,
const struct TpuAllocationShape shape);
typedef struct TpuStatus*(PrototypeTpuDriver_DelinearizeShape)(
struct TpuDriver* driver, void* dst, const void* src,
const struct TpuAllocationShape shape);
typedef struct TpuCompiledProgramHandle*(PrototypeTpuDriver_CompileProgram)(
struct TpuDriver* driver, const struct HloProto hlo_proto,
int32_t num_replicas, int32_t eventc, struct TpuEvent** eventv);
typedef struct TpuCompiledProgramHandle*(
PrototypeTpuDriver_CompileProgramFromText)(struct TpuDriver* driver,
const char* hlo_text,
int32_t num_replicas,
int32_t eventc,
struct TpuEvent** eventv);
/* Note: We are not responsible for freeing the event within the
* TpuCompiledProgramHandle. You have to call FreeEvent separately to ensure
* that memory does not leak.
*/
typedef void(PrototypeTpuDriver_FreeCompiledProgramHandle)(
struct TpuCompiledProgramHandle* handle);
typedef struct TpuLoadedProgramHandle*(PrototypeTpuDriver_LoadProgram)(
struct TpuDriver* driver, int32_t core_id,
const struct TpuCompiledProgramHandle* compiled_program_handle,
int32_t eventc, struct TpuEvent** eventv);
/* Note: We are not responsible for freeing the event within the
* TpuLoadedProgramHandle. You have to call FreeEvent separately to ensure that
* memory does not leak.
*/
typedef struct TpuEvent*(PrototypeTpuDriver_UnloadProgram)(
struct TpuDriver* driver,
struct TpuLoadedProgramHandle* loaded_program_handle, int32_t eventc,
struct TpuEvent** eventv);
typedef struct TpuEvent*(PrototypeTpuDriver_ExecuteProgram)(
struct TpuDriver* driver, struct TpuLoadedProgramHandle* handle,
int32_t inputc, struct TpuBufferHandle** input_buffer_handle,
int32_t outputc, struct TpuBufferHandle** output_buffer_handle,
struct DeviceAssignment device_assignment, int32_t eventc,
struct TpuEvent** eventv);
typedef struct TpuBufferHandle*(PrototypeTpuDriver_AllocateTuple)(
struct TpuDriver* driver, int32_t core_id, int32_t memory_region,
int32_t bufferc, struct TpuBufferHandle** buffer_handle, int32_t eventc,
struct TpuEvent** eventv);
typedef struct TpuBufferHandle*(PrototypeTpuDriver_Allocate)(
struct TpuDriver* driver, int32_t core_id, int32_t memory_region,
int64_t num_bytes, int32_t eventc, struct TpuEvent** eventv);
typedef struct TpuBufferHandle*(PrototypeTpuDriver_AllocateShape)(
struct TpuDriver* driver, int32_t core_id, int32_t memory_region,
const struct TpuAllocationShape shape, int32_t eventc,
struct TpuEvent** eventv);
/* Note: We are not responsible for freeing the event within the
* TpuBufferHandle. You have to call FreeEvent separately to ensure that memory
* does not leak.
*/
typedef struct TpuEvent*(PrototypeTpuDriver_Deallocate)(
struct TpuDriver* driver, struct TpuBufferHandle* buffer_handle,
int32_t eventc, struct TpuEvent** eventv);
typedef struct TpuEvent*(PrototypeTpuDriver_TransferToDevice)(
struct TpuDriver* driver, const void* src, struct TpuBufferHandle* dst,
int32_t eventc, struct TpuEvent** eventv);
typedef struct TpuEvent*(PrototypeTpuDriver_TransferFromDevice)(
struct TpuDriver* driver, struct TpuBufferHandle* src, void* dst,
int32_t eventc, struct TpuEvent** eventv);
typedef struct TpuEvent*(PrototypeTpuDriver_TransferFromDeviceToDevice)(
struct TpuDriver* driver, struct TpuBufferHandle* src,
struct TpuBufferHandle* dst, int32_t eventc, struct TpuEvent** eventv);
typedef struct CompiledProgramShape*(
PrototypeTpuDriver_GetCompiledProgramShape)(
struct TpuCompiledProgramHandle* handle);
typedef void(PrototypeTpuDriver_FreeCompiledProgramShape)(
struct CompiledProgramShape* shape);
typedef void(PrototypeTpuDriver_EventAddCallback)(
struct TpuEvent* event,
void (*callback_fn)(struct TpuStatus*, void* additional_info),
void* additional_info);
typedef struct TpuStatus*(PrototypeTpuDriver_EventAwait)(struct TpuEvent* event,
int64_t timeout_in_us);
typedef void(PrototypeTpuDriver_FreeEvent)(struct TpuEvent* event);
typedef void(PrototypeTpuDriver_FreeStatus)(struct TpuStatus* status);
typedef const char*(PrototypeTpuDriver_Version)();
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Initialize TpuDriver_Initialize;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Open TpuDriver_Open;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Close TpuDriver_Close;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Reset TpuDriver_Reset;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_QuerySystemInfo
TpuDriver_QuerySystemInfo;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeSystemInfo
TpuDriver_FreeSystemInfo;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_ComputeLinearizedBytesFromShape
TpuDriver_ComputeLinearizedBytesFromShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_LinearizeShape
TpuDriver_LinearizeShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_DelinearizeShape
TpuDriver_DelinearizeShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_CompileProgram
TpuDriver_CompileProgram;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_CompileProgramFromText
TpuDriver_CompileProgramFromText;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeCompiledProgramHandle
TpuDriver_FreeCompiledProgramHandle;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_LoadProgram
TpuDriver_LoadProgram;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_UnloadProgram
TpuDriver_UnloadProgram;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_ExecuteProgram
TpuDriver_ExecuteProgram;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_AllocateTuple
TpuDriver_AllocateTuple;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Allocate TpuDriver_Allocate;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_AllocateShape
TpuDriver_AllocateShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Deallocate TpuDriver_Deallocate;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_TransferToDevice
TpuDriver_TransferToDevice;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_TransferFromDevice
TpuDriver_TransferFromDevice;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_TransferFromDeviceToDevice
TpuDriver_TransferFromDeviceToDevice;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_GetCompiledProgramShape
TpuDriver_GetCompiledProgramShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeCompiledProgramShape
TpuDriver_FreeCompiledProgramShape;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_EventAddCallback
TpuDriver_EventAddCallback;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_EventAwait TpuDriver_EventAwait;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeEvent TpuDriver_FreeEvent;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_FreeStatus TpuDriver_FreeStatus;
TPUDRIVER_CAPI_EXPORT extern PrototypeTpuDriver_Version TpuDriver_Version;
#ifdef __cplusplus
}
#endif
/* Dispatch table of driver entry points. TpuDriver_Initialize fills this in,
 * so a client that loads libtpu.so dynamically (see libtpu_client.c) can call
 * the whole API through one struct instead of resolving each symbol itself. */
struct TpuDriverFn {
  PrototypeTpuDriver_Open* TpuDriver_Open;              // NOLINT
  PrototypeTpuDriver_Close* TpuDriver_Close;            // NOLINT
  PrototypeTpuDriver_Reset* TpuDriver_Reset;            // NOLINT
  PrototypeTpuDriver_ComputeLinearizedBytesFromShape*
      TpuDriver_ComputeLinearizedBytesFromShape;        // NOLINT
  PrototypeTpuDriver_QuerySystemInfo* TpuDriver_QuerySystemInfo;  // NOLINT
  PrototypeTpuDriver_FreeSystemInfo* TpuDriver_FreeSystemInfo;    // NOLINT
  PrototypeTpuDriver_LinearizeShape* TpuDriver_LinearizeShape;    // NOLINT
  PrototypeTpuDriver_DelinearizeShape* TpuDriver_DelinearizeShape;  // NOLINT
  PrototypeTpuDriver_CompileProgram* TpuDriver_CompileProgram;    // NOLINT
  PrototypeTpuDriver_CompileProgramFromText*
      TpuDriver_CompileProgramFromText;                 // NOLINT
  PrototypeTpuDriver_FreeCompiledProgramHandle*
      TpuDriver_FreeCompiledProgramHandle;              // NOLINT
  PrototypeTpuDriver_LoadProgram* TpuDriver_LoadProgram;    // NOLINT
  PrototypeTpuDriver_UnloadProgram* TpuDriver_UnloadProgram;  // NOLINT
  PrototypeTpuDriver_ExecuteProgram* TpuDriver_ExecuteProgram;  // NOLINT
  PrototypeTpuDriver_AllocateTuple* TpuDriver_AllocateTuple;  // NOLINT
  PrototypeTpuDriver_Allocate* TpuDriver_Allocate;      // NOLINT
  PrototypeTpuDriver_AllocateShape* TpuDriver_AllocateShape;  // NOLINT
  PrototypeTpuDriver_Deallocate* TpuDriver_Deallocate;  // NOLINT
  PrototypeTpuDriver_TransferToDevice* TpuDriver_TransferToDevice;  // NOLINT
  PrototypeTpuDriver_TransferFromDevice*
      TpuDriver_TransferFromDevice;                     // NOLINT
  PrototypeTpuDriver_TransferFromDeviceToDevice*
      TpuDriver_TransferFromDeviceToDevice;             // NOLINT
  PrototypeTpuDriver_GetCompiledProgramShape*
      TpuDriver_GetCompiledProgramShape;                // NOLINT
  PrototypeTpuDriver_FreeCompiledProgramShape*
      TpuDriver_FreeCompiledProgramShape;               // NOLINT
  PrototypeTpuDriver_EventAddCallback* TpuDriver_EventAddCallback;  // NOLINT
  PrototypeTpuDriver_EventAwait* TpuDriver_EventAwait;  // NOLINT
  PrototypeTpuDriver_FreeEvent* TpuDriver_FreeEvent;    // NOLINT
  PrototypeTpuDriver_FreeStatus* TpuDriver_FreeStatus;  // NOLINT
  PrototypeTpuDriver_Version* TpuDriver_Version;        // NOLINT
};
#endif // TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTPU_H_

View File

@@ -1,159 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Before you start, make sure libtpu.so, libtpu.h and libtpu_client.c are in
// the same working directory.
//
// To compile: gcc -o libtpu_client libtpu_client.c -ldl
// To run: sudo ./libtpu_client
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include "libtpu.h"
/* Debug helper: dump `len` bytes at `dat` as hex, 16 per row. The body is
 * intentionally commented out so normal runs stay quiet; re-enable the block
 * below when debugging driver handles. */
void hexdump(void *dat, int len) {
  (void)dat;  /* silence -Wunused-parameter while the dump is disabled */
  (void)len;
  /*unsigned char *cdat = (unsigned char*)dat;
  for (int i = 0; i < len; i++) {
    if (i!=0 && i%0x10 == 0) printf("\n");
    printf("%2.2X ", cdat[i]);
  }
  printf("\n");*/
}
/* End-to-end libtpu smoke test: compile a tiny HLO module that adds two
 * s32[256] vectors, load it on core 0, run it, and print the result.
 * Each driver call returns/consumes TpuEvents; the eventc/eventv pairs chain
 * every step after the steps it depends on.
 * NOTE(review): as in the upstream example, most events, the compiled-program
 * handle, and the driver itself are never freed/closed — acceptable for a
 * one-shot demo that exits immediately. */
int main(int argc, char** argv) {
  struct TpuDriverFn driver_fn;
  TpuDriver_Initialize(&driver_fn, true);
  fprintf(stdout, "------ Going to Query Version ------\n");
  fprintf(stdout, "TPU Driver Version: %s\n", driver_fn.TpuDriver_Version());
  fprintf(stdout, "------ Going to Open a TPU Driver ------\n");
  struct TpuDriver* driver = driver_fn.TpuDriver_Open("local://");
  fprintf(stdout, "------ Going to Query for System Information ------\n");
  struct TpuSystemInfo* info = driver_fn.TpuDriver_QuerySystemInfo(driver);
  driver_fn.TpuDriver_FreeSystemInfo(info);
  // An example of simple program to sum two parameters.
  const char* hlo_module_text = R"(HloModule add_vec_module
ENTRY %add_vec (a: s32[256], b: s32[256]) -> s32[256] {
%a = s32[256] parameter(0)
%b = s32[256] parameter(1)
ROOT %sum = s32[256] add(%a, %b)
}
)";
  fprintf(stdout, "------ Going to Compile a TPU program ------\n");
  struct TpuCompiledProgramHandle* cph =
      driver_fn.TpuDriver_CompileProgramFromText(driver, hlo_module_text,
      /*num_replicas=*/1, /*eventc=*/0, /*eventv*/NULL);
  //hexdump(cph->internal_handle, 0x100);
  TpuEvent* compile_events[] = {cph->event};
  fprintf(stdout, "------ Going to Load a TPU program ------\n");
  struct TpuLoadedProgramHandle* lph =
      driver_fn.TpuDriver_LoadProgram(driver, /*core_id=*/0, cph,
      /*eventc=*/1, /*eventv=*/compile_events);
  const int size = 1024;
  fprintf(stdout, "------ Going to Allocate a TPU Buffer ------\n");
  struct TpuBufferHandle* buf_a_handle =
      driver_fn.TpuDriver_Allocate(driver, /*core-id=*/0, /*memory_region=*/1,
      /*bytes=*/size, /*eventc=*/0, /*eventv=*/NULL);
  fprintf(stdout, "------ Going to Allocate a TPU Buffer ------\n");
  struct TpuBufferHandle* buf_b_handle =
      driver_fn.TpuDriver_Allocate(driver, /*core-id=*/0, /*memory_region=*/1,
      /*bytes=*/size, /*eventc=*/0, /*eventv=*/NULL);
  fprintf(stdout, "------ Going to Allocate a TPU Buffer ------\n");
  struct TpuBufferHandle* buf_sum_handle =
      driver_fn.TpuDriver_Allocate(driver, /*core-id=*/0, /*memory_region=*/1,
      /*bytes=*/size, /*eventc=*/0, /*eventv=*/NULL);
  char a_src[size], b_src[size], sum_src[size];
  for (int i = 0; i < size; ++i) {
    a_src[i] = 1;
    b_src[i] = 2;
    sum_src[i] = 0;
  }
  TpuEvent* allocate_buf_a_events[] = {buf_a_handle->event};
  fprintf(stdout, "------ Going to Transfer To Device ------\n");
  struct TpuEvent* transfer_ev1 =
      driver_fn.TpuDriver_TransferToDevice(driver, a_src, buf_a_handle,
      /*eventc=*/1, /*eventv=*/allocate_buf_a_events);
  // BUG FIX: this previously waited on buf_a's allocation event, so the
  // transfer into buf_b could race its own allocation.
  TpuEvent* allocate_buf_b_events[] = {buf_b_handle->event};
  fprintf(stdout, "------ Going to Transfer To Device ------\n");
  struct TpuEvent* transfer_ev2 =
      driver_fn.TpuDriver_TransferToDevice(driver, b_src, buf_b_handle,
      /*eventc=*/1, /*eventv=*/allocate_buf_b_events);
  //getchar();
  fprintf(stdout, "------ Going to Execute a TPU program ------\n");
  DeviceAssignment device_assignment = {NULL, 0};
  TpuBufferHandle* input_buffer_handle[] = {buf_a_handle, buf_b_handle};
  TpuBufferHandle* output_buffer_handle[] = {buf_sum_handle};
  TpuEvent* transfer_events[] = {transfer_ev1, transfer_ev2};
  struct TpuEvent* execute_event =
      driver_fn.TpuDriver_ExecuteProgram(driver, lph,
      /*inputc=*/2, /*input_buffer_handle=*/input_buffer_handle,
      /*outputc=*/1, /*output_buffer_handle=*/output_buffer_handle,
      device_assignment,
      /*eventc=*/2, /*eventv*/transfer_events);
  fprintf(stdout, "------ Going to Transfer From Device ------\n");
  TpuEvent* execute_events[] = {execute_event};
  struct TpuEvent* transfer_sum_event =
      driver_fn.TpuDriver_TransferFromDevice(driver, buf_sum_handle, sum_src,
      /*eventc=*/1, /*eventv=*/execute_events);
  // Block until the readback finishes; check the status code, not the pointer.
  TpuStatus* status = driver_fn.TpuDriver_EventAwait(transfer_sum_event,
      10000000);
  if (status->code != 0) {
    fprintf(stdout, "Transfer Event Await: Code: %d, Message: %s\n",
        status->code, status->msg);
  }
  fprintf(stdout, "------ Going to Unload a TPU program ------\n");
  struct TpuEvent* unload_program_event = driver_fn.TpuDriver_UnloadProgram(
      driver, lph, /*eventc=*/1, /*eventv=*/execute_events);
  fprintf(stdout, "------ Going to Deallocate a TPU Buffer ------\n");
  struct TpuEvent* dealloc_ev1 = driver_fn.TpuDriver_Deallocate(driver,
      buf_a_handle, /*eventc=*/0, /*eventv=*/NULL);
  driver_fn.TpuDriver_FreeEvent(dealloc_ev1);
  fprintf(stdout, "------ Going to Deallocate a TPU Buffer ------\n");
  struct TpuEvent* dealloc_ev2 = driver_fn.TpuDriver_Deallocate(driver,
      buf_b_handle, /*eventc=*/0, /*eventv=*/NULL);
  driver_fn.TpuDriver_FreeEvent(dealloc_ev2);
  fprintf(stdout, "------ Going to Deallocate a TPU Buffer ------\n");
  struct TpuEvent* dealloc_ev3 = driver_fn.TpuDriver_Deallocate(driver,
      buf_sum_handle, /*eventc=*/0, /*eventv=*/NULL);
  driver_fn.TpuDriver_FreeEvent(dealloc_ev3);
  fprintf(stdout, "sum:\n");
  for (size_t i = 0; i < size; ++i) {
    fprintf(stdout, "%d ", sum_src[i]);
  }
  exit(EXIT_SUCCESS);
}