fixed pylint, formatted python files with black on localhost

Iain Wong
2020-12-17 13:36:48 -05:00
parent 799ad5ba17
commit 07e2b88466
30 changed files with 1688 additions and 819 deletions

View File

@@ -34,13 +34,21 @@ jobs:
run: python -m pytest -s -v
linter:
name: Indentation Linter
runs-on: ubuntu-latest
name: Indentation Linter
runs-on: ubuntu-latest
steps:
- name: Checkout Code
uses: actions/checkout@v2
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install pylint
- name: Lint with pylint
run: |
python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
steps:
- name: Checkout Code
uses: actions/checkout@v2
- name: Install PyLint
run: sudo apt-get install pylint
- name: Validate indentation is 2 spaces
run: if [[ $(pylint --jobs=0 --indent-string=' ' * | grep "Bad indentation") ]]; then exit 1; fi
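The new job runs pylint with every check disabled except W0311 (bad-indentation) and a two-space indent string (tinygrad's house style; the whitespace inside the quotes may be collapsed in this view). A minimal sketch of a file that should trip the check, with a hypothetical name:

# bad_indent.py -- four-space body under the two-space policy
def f(x):
    return x + 1  # W0311: Bad indentation. Found 4 spaces, expected 2

Checked locally with the same command the workflow runs:

python -m pylint --disable=all -e W0311 --jobs=0 --indent-string='  ' bad_indent.py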

View File

@@ -3,24 +3,34 @@ import numpy as np
import coremltools as ct
from coremltools.models.neural_network import datatypes, NeuralNetworkBuilder
input_features = [('image', datatypes.Array(3))]
output_features = [('probs', datatypes.Array(3))]
input_features = [("image", datatypes.Array(3))]
output_features = [("probs", datatypes.Array(3))]
weights = np.zeros((3, 3)) + 3
bias = np.ones(3)
builder = NeuralNetworkBuilder(input_features, output_features)
builder.add_inner_product(name='ip_layer', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='image', output_name='med')
#builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
#builder.add_elementwise(name='element', input_names=['med', 'med'], output_name='probs', mode='ADD')
builder.add_bias(name='bias', b=bias, input_name='med', output_name='probs', shape_bias=(3,))
#builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')
builder.add_inner_product(
name="ip_layer",
W=weights,
b=None,
input_channels=3,
output_channels=3,
has_bias=False,
input_name="image",
output_name="med",
)
# builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
# builder.add_elementwise(name='element', input_names=['med', 'med'], output_name='probs', mode='ADD')
builder.add_bias(
name="bias", b=bias, input_name="med", output_name="probs", shape_bias=(3,)
)
# builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')
# compile the spec
mlmodel = ct.models.MLModel(builder.spec)
# trigger the ANE!
out = mlmodel.predict({"image": np.array([1337,0,0], dtype=np.float32)})
out = mlmodel.predict({"image": np.array([1337, 0, 0], dtype=np.float32)})
print(out)
mlmodel.save('test.mlmodel')
mlmodel.save("test.mlmodel")

View File

@@ -2,17 +2,21 @@
import sys
from hexdump import hexdump
from macholib import MachO
def get_macho(fn):
# mod to make the header okay
# MH_CIGAM_64 is good
dat = open(fn, "rb").read()
dat = b"\xcf\xfa\xed\xfe"+dat[4:]
dat = b"\xcf\xfa\xed\xfe" + dat[4:]
from tempfile import NamedTemporaryFile
with NamedTemporaryFile(delete=False) as f:
f.write(dat)
f.close()
return MachO.MachO(f.name)
a = get_macho("model.hwx")
# load commands
@@ -21,14 +25,22 @@ for c in a.headers[0].commands:
if c[0].cmd == 25:
print(c[1])
for section in c[2]:
print(section.segname.strip(b'\0'), section.sectname.strip(b'\0'), hex(section.addr), hex(section.size), "@", hex(c[1].fileoff))
#print(dir(section))
print(
section.segname.strip(b"\0"),
section.sectname.strip(b"\0"),
hex(section.addr),
hex(section.size),
"@",
hex(c[1].fileoff),
)
# print(dir(section))
if c[1].filesize > 0:
hexdump(section.section_data)
# this parser is wrong (fixed with 64-bit one)
from macholib import SymbolTable
sym = SymbolTable.SymbolTable(a)
sym = SymbolTable.SymbolTable(a)
syms = {}
for l in sym.nlists:
@@ -36,64 +48,65 @@ for l in sym.nlists:
if l[0].n_value != 0:
syms[l[1]] = l[0].n_value
for k,v in syms.items():
for k, v in syms.items():
print(k, hex(v))
from termcolor import colored
def compare(x, y):
ss = []
ln = []
ln2 = []
ll = (max(len(x), len(y)) + 0xF)//0x10 * 0x10
ll = (max(len(x), len(y)) + 0xF) // 0x10 * 0x10
highlight = False
next_highlight = 0x2b
for i in range(ll+1):
next_highlight = 0x2B
for i in range(ll + 1):
if i == next_highlight:
highlight = True
if i < len(y):
next_highlight += y[i]+8
next_highlight += y[i] + 8
else:
next_highlight = None
else:
highlight = False
a = "%02X" % x[i] if i < len(x) else "--", \
"%02X" % y[i] if i < len(y) else "--"
a = "%02X" % x[i] if i < len(x) else "--", "%02X" % y[i] if i < len(y) else "--"
def fj(x):
ss = []
for i in range(0, 0x10, 4):
ss.append(' '.join(x[i:i+4]))
return ' '.join(ss)
ss.append(" ".join(x[i : i + 4]))
return " ".join(ss)
if i!=0 and i%0x10 == 0:
ss.append("%8X: " % (i-0x10)+fj(ln)+" | "+fj(ln2)+"\n")
if i != 0 and i % 0x10 == 0:
ss.append("%8X: " % (i - 0x10) + fj(ln) + " | " + fj(ln2) + "\n")
ln = []
ln2 = []
if a[0] != a[1] and a[0] != "--" and a[1] != "--":
ln.append(colored(a[0], 'green'))
ln2.append(colored(a[1], 'red'))
ln.append(colored(a[0], "green"))
ln2.append(colored(a[1], "red"))
else:
if highlight:
ln.append(colored(a[0], 'yellow'))
ln2.append(colored(a[1], 'yellow'))
ln.append(colored(a[0], "yellow"))
ln2.append(colored(a[1], "yellow"))
else:
ln.append(a[0])
ln2.append(a[1])
return ''.join(ss)
return "".join(ss)
g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1])
f1 = g.headers[0].commands[1][2][0].section_data
f2 = a.headers[0].commands[1][2][0].section_data
for i in range(0, len(f2), 0x300):
print("===== op %d =====" % (i//0x300))
print("===== op %d =====" % (i // 0x300))
if len(f1) < 0x300:
print(compare(f1, f2[i:i+0x300]))
print(compare(f1, f2[i : i + 0x300]))
else:
print(compare(f1[i:i+0x300], f2[i:i+0x300]))
#open("/tmp/data.section", "wb").write(f2)
#print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read()))
print(compare(f1[i : i + 0x300], f2[i : i + 0x300]))
# open("/tmp/data.section", "wb").write(f2)
# print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read()))
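compare() renders two buffers side by side, coloring mismatched byte pairs green/red; starting at offset 0x2B it also steps a yellow highlight forward by y[i] + 8 each time, presumably walking a length-prefixed structure in the hwx data. A minimal sketch on illustrative inputs (not real hwx bytes), assuming termcolor is available:

x = bytes(range(16)) * 2
y = bytes(range(16)) + b"\xff" * 16
print(compare(x, y))  # first row matches, second row prints green/red pairs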

View File

@@ -3,11 +3,12 @@ import os
from ctypes import *
import numpy as np
import faulthandler
faulthandler.enable()
libane = cdll.LoadLibrary(os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"libane.dylib"))
libane = cdll.LoadLibrary(
os.path.join(os.path.dirname(os.path.abspath(__file__)), "libane.dylib")
)
libane.ANE_Compile.argtypes = [c_char_p, c_int]
libane.ANE_Compile.restype = c_void_p
@@ -17,34 +18,36 @@ libane.ANE_TensorCreate.restype = c_void_p
libane.ANE_TensorData.argtypes = [c_void_p]
libane.ANE_TensorData.restype = POINTER(c_uint16)
libane.ANE_Run.argtypes = [c_void_p]*3
libane.ANE_Run.argtypes = [c_void_p] * 3
libane.ANE_Run.restype = c_int
class ANETensor:
def __init__(self, *shape):
self.shape = shape
self.dtype = np.float16
self.sz = int(np.prod(shape))
assert(self.sz <= 0x4000)
assert self.sz <= 0x4000
self.tt = libane.ANE_TensorCreate(self.sz, 1)
assert(self.tt is not None)
assert self.tt is not None
def data(self):
data = libane.ANE_TensorData(self.tt)
assert(data is not None)
#print(hex(addressof(data.contents)))
assert data is not None
# print(hex(addressof(data.contents)))
buf = np.ctypeslib.as_array(data, shape=(self.sz,))
ret = np.frombuffer(buf, dtype=self.dtype)
#print(ret.data)
# print(ret.data)
return ret
class ANE:
def __init__(self):
libane.ANE_Open()
def compile(self, dat):
ret = libane.ANE_Compile(create_string_buffer(dat), len(dat))
assert(ret is not None)
assert ret is not None
return ret
def run(self, prog, tin, tout):
@@ -53,6 +56,7 @@ class ANE:
def tensor(self, shape):
return ANETensor(shape)
if __name__ == "__main__":
ane = ANE()
@@ -62,7 +66,7 @@ if __name__ == "__main__":
tind = tin.data()
toutd = tout.data()
tind[0:4] = [-1,1,-2,2]
tind[0:4] = [-1, 1, -2, 2]
print(tind)
print(toutd)
@@ -71,4 +75,3 @@ if __name__ == "__main__":
print(tind)
print(toutd)
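ANE_TensorData hands back a pointer to raw uint16 words, which data() reinterprets as float16 via np.frombuffer. The same reinterpretation in pure numpy, with hand-picked bit patterns (illustrative values):

import numpy as np

raw = np.array([0x3C00, 0xC000, 0x0000], dtype=np.uint16)  # float16 bit patterns
print(np.frombuffer(raw.tobytes(), dtype=np.float16))  # [ 1. -2.  0.]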

View File

@@ -17,8 +17,9 @@ for dev in ["CPU", "GPU", "ANE"]:
boaa = baa.relu()
et = time.time()
if i == 2:
print("%s can do at least %.2f MEGAReLUs/sec" % (dev, (np.prod(boaa.shape)/1e6)/(et-st)))
print(
"%s can do at least %.2f MEGAReLUs/sec"
% (dev, (np.prod(boaa.shape) / 1e6) / (et - st))
)
# decently reliable
assert(np.all(boaa.cpu().data >= 0))
assert np.all(boaa.cpu().data >= 0)

View File

@@ -3,24 +3,29 @@
# a rough copy of
# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py
import os
GPU = os.getenv("GPU", None) is not None
import sys
import io
import time
import numpy as np
np.set_printoptions(suppress=True)
from tinygrad.tensor import Tensor
from extra.utils import fetch, get_parameters
from extra.efficientnet import EfficientNet
def infer(model, img):
# preprocess image
aspect_ratio = img.size[0] / img.size[1]
img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
img = img.resize(
(int(224 * max(aspect_ratio, 1.0)), int(224 * max(1.0 / aspect_ratio, 1.0)))
)
img = np.array(img)
y0,x0=(np.asarray(img.shape)[:2]-224)//2
retimg = img = img[y0:y0+224, x0:x0+224]
y0, x0 = (np.asarray(img.shape)[:2] - 224) // 2
retimg = img = img[y0 : y0 + 224, x0 : x0 + 224]
# if you want to look at the image
"""
@@ -30,11 +35,11 @@ def infer(model, img):
"""
# low level preprocess
img = np.moveaxis(img, [2,0,1], [0,1,2])
img = img.astype(np.float32)[:3].reshape(1,3,224,224)
img = np.moveaxis(img, [2, 0, 1], [0, 1, 2])
img = img.astype(np.float32)[:3].reshape(1, 3, 224, 224)
img /= 255.0
img -= np.array([0.485, 0.456, 0.406]).reshape((1,-1,1,1))
img /= np.array([0.229, 0.224, 0.225]).reshape((1,-1,1,1))
img -= np.array([0.485, 0.456, 0.406]).reshape((1, -1, 1, 1))
img /= np.array([0.229, 0.224, 0.225]).reshape((1, -1, 1, 1))
# run the net
if GPU:
@@ -50,6 +55,7 @@ def infer(model, img):
"""
return out, retimg
if __name__ == "__main__":
# instantiate my net
model = EfficientNet(int(os.getenv("NUM", "0")))
@@ -59,38 +65,42 @@ if __name__ == "__main__":
# category labels
import ast
lbls = fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt")
lbls = ast.literal_eval(lbls.decode('utf-8'))
lbls = fetch(
"https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt"
)
lbls = ast.literal_eval(lbls.decode("utf-8"))
# load image and preprocess
from PIL import Image
url = sys.argv[1]
if url == 'webcam':
if url == "webcam":
import cv2
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
while 1:
_ = cap.grab() # discard one frame to circumvent capture buffering
_ = cap.grab() # discard one frame to circumvent capture buffering
ret, frame = cap.read()
img = Image.fromarray(frame[:, :, [2,1,0]])
img = Image.fromarray(frame[:, :, [2, 1, 0]])
out, retimg = infer(model, img)
print(np.argmax(out.data), np.max(out.data), lbls[np.argmax(out.data)])
SCALE = 3
simg = cv2.resize(retimg, (224*SCALE, 224*SCALE))
simg = cv2.resize(retimg, (224 * SCALE, 224 * SCALE))
retimg = cv2.cvtColor(simg, cv2.COLOR_RGB2BGR)
cv2.imshow('capture', retimg)
if cv2.waitKey(1) & 0xFF == ord('q'):
cv2.imshow("capture", retimg)
if cv2.waitKey(1) & 0xFF == ord("q"):
break
cap.release()
cv2.destroyAllWindows()
else:
if url.startswith('http'):
if url.startswith("http"):
img = Image.open(io.BytesIO(fetch(url)))
else:
img = Image.open(url)
st = time.time()
out, _ = infer(model, img)
print(np.argmax(out.data), np.max(out.data), lbls[np.argmax(out.data)])
print("did inference in %.2f s" % (time.time()-st))
#print("NOT", np.argmin(out.data), np.min(out.data), lbls[np.argmin(out.data)])
print("did inference in %.2f s" % (time.time() - st))
# print("NOT", np.argmin(out.data), np.min(out.data), lbls[np.argmin(out.data)])

View File

@@ -3,8 +3,9 @@ import os
import sys
import numpy as np
from tqdm import tqdm
sys.path.append(os.getcwd())
sys.path.append(os.path.join(os.getcwd(), 'test'))
sys.path.append(os.path.join(os.getcwd(), "test"))
from tinygrad.tensor import Tensor, Function, register
from extra.utils import get_parameters
@@ -12,7 +13,10 @@ import tinygrad.optim as optim
from test_mnist import X_train
from torchvision.utils import make_grid, save_image
import torch
GPU = os.getenv("GPU") is not None
class LinearGen:
def __init__(self):
lv = 128
@@ -28,6 +32,7 @@ class LinearGen:
x = x.dot(self.l4).tanh()
return x
class LinearDisc:
def __init__(self):
in_sh = 784
@@ -39,108 +44,112 @@ class LinearDisc:
def forward(self, x, train=True):
x = x.dot(self.l1).leakyrelu(0.2)
if train:
x = x.dropout(0.3)
x = x.dropout(0.3)
x = x.dot(self.l2).leakyrelu(0.2)
if train:
x = x.dropout(0.3)
x = x.dropout(0.3)
x = x.dot(self.l3).leakyrelu(0.2)
if train:
x = x.dropout(0.3)
x = x.dropout(0.3)
x = x.dot(self.l4).logsoftmax()
return x
if __name__ == "__main__":
generator = LinearGen()
discriminator = LinearDisc()
batch_size = 512
k = 1
epochs = 300
generator_params = get_parameters(generator)
discriminator_params = get_parameters(discriminator)
gen_loss = []
disc_loss = []
output_folder = "outputs"
os.makedirs(output_folder, exist_ok=True)
train_data_size = len(X_train)
ds_noise = Tensor(np.random.randn(64,128).astype(np.float32), gpu=GPU, requires_grad=False)
n_steps = int(train_data_size/batch_size)
if GPU:
[x.cuda_() for x in generator_params+discriminator_params]
# optimizers
optim_g = optim.Adam(generator_params,lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
optim_d = optim.Adam(discriminator_params,lr=0.0002, b1=0.5)
generator = LinearGen()
discriminator = LinearDisc()
batch_size = 512
k = 1
epochs = 300
generator_params = get_parameters(generator)
discriminator_params = get_parameters(discriminator)
gen_loss = []
disc_loss = []
output_folder = "outputs"
os.makedirs(output_folder, exist_ok=True)
train_data_size = len(X_train)
ds_noise = Tensor(
np.random.randn(64, 128).astype(np.float32), gpu=GPU, requires_grad=False
)
n_steps = int(train_data_size / batch_size)
if GPU:
[x.cuda_() for x in generator_params + discriminator_params]
# optimizers
optim_g = optim.Adam(generator_params, lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
optim_d = optim.Adam(discriminator_params, lr=0.0002, b1=0.5)
def regularization_l2(model, a=1e-4):
#TODO: l2 reg loss
pass
def regularization_l2(model, a=1e-4):
# TODO: l2 reg loss
pass
def generator_batch():
idx = np.random.randint(0, X_train.shape[0], size=(batch_size))
image_b = X_train[idx].reshape(-1, 28*28).astype(np.float32)/255.
image_b = (image_b - 0.5)/0.5
return Tensor(image_b, gpu=GPU)
def generator_batch():
idx = np.random.randint(0, X_train.shape[0], size=(batch_size))
image_b = X_train[idx].reshape(-1, 28 * 28).astype(np.float32) / 255.0
image_b = (image_b - 0.5) / 0.5
return Tensor(image_b, gpu=GPU)
def real_label(bs):
y = np.zeros((bs,2), np.float32)
y[range(bs), [1]*bs] = -2.0
real_labels = Tensor(y, gpu=GPU)
return real_labels
def real_label(bs):
y = np.zeros((bs, 2), np.float32)
y[range(bs), [1] * bs] = -2.0
real_labels = Tensor(y, gpu=GPU)
return real_labels
def fake_label(bs):
y = np.zeros((bs,2), np.float32)
y[range(bs), [0]*bs] = -2.0 # Can we do label smoothing? i.e. -2.0 changed to -1.98789.
fake_labels = Tensor(y, gpu=GPU)
return fake_labels
def fake_label(bs):
y = np.zeros((bs, 2), np.float32)
y[
range(bs), [0] * bs
] = -2.0  # Can we do label smoothing? i.e. -2.0 changed to -1.98789.
fake_labels = Tensor(y, gpu=GPU)
return fake_labels
def train_discriminator(optimizer, data_real, data_fake):
real_labels = real_label(batch_size)
fake_labels = fake_label(batch_size)
def train_discriminator(optimizer, data_real, data_fake):
real_labels = real_label(batch_size)
fake_labels = fake_label(batch_size)
optimizer.zero_grad()
optimizer.zero_grad()
output_real = discriminator.forward(data_real)
loss_real = (output_real * real_labels).mean()
output_real = discriminator.forward(data_real)
loss_real = (output_real * real_labels).mean()
output_fake = discriminator.forward(data_fake)
loss_fake = (output_fake * fake_labels).mean()
output_fake = discriminator.forward(data_fake)
loss_fake = (output_fake * fake_labels).mean()
loss_real.backward()
loss_fake.backward()
optimizer.step()
return loss_real.cpu().data + loss_fake.cpu().data
loss_real.backward()
loss_fake.backward()
optimizer.step()
return loss_real.cpu().data + loss_fake.cpu().data
def train_generator(optimizer, data_fake):
real_labels = real_label(batch_size)
optimizer.zero_grad()
output = discriminator.forward(data_fake)
loss = (output * real_labels).mean()
loss.backward()
optimizer.step()
return loss.cpu().data
for epoch in tqdm(range(epochs)):
loss_g = 0.0
loss_d = 0.0
print(f"Epoch {epoch} of {epochs}")
for i in tqdm(range(n_steps)):
image = generator_batch()
for step in range(k): # Try with k = 5 or 7.
noise = Tensor(np.random.randn(batch_size,128), gpu=GPU)
data_fake = generator.forward(noise).detach()
data_real = image
loss_d_step = train_discriminator(optim_d, data_real, data_fake)
loss_d += loss_d_step
noise = Tensor(np.random.randn(batch_size,128), gpu=GPU)
data_fake = generator.forward(noise)
loss_g_step = train_generator(optim_g, data_fake)
loss_g += loss_g_step
fake_images = generator.forward(ds_noise).detach().cpu().data
fake_images = (fake_images.reshape(-1, 1, 28, 28)+ 1) / 2 # 0 - 1 range.
fake_images = make_grid(torch.tensor(fake_images))
save_image(fake_images, os.path.join(output_folder,f"image_{epoch}.jpg"))
epoch_loss_g = loss_g / n_steps
epoch_loss_d = loss_d / n_steps
print(f"EPOCH: Generator loss: {epoch_loss_g}, Discriminator loss: {epoch_loss_d}")
else:
print("Training Completed!")
def train_generator(optimizer, data_fake):
real_labels = real_label(batch_size)
optimizer.zero_grad()
output = discriminator.forward(data_fake)
loss = (output * real_labels).mean()
loss.backward()
optimizer.step()
return loss.cpu().data
for epoch in tqdm(range(epochs)):
loss_g = 0.0
loss_d = 0.0
print(f"Epoch {epoch} of {epochs}")
for i in tqdm(range(n_steps)):
image = generator_batch()
for step in range(k): # Try with k = 5 or 7.
noise = Tensor(np.random.randn(batch_size, 128), gpu=GPU)
data_fake = generator.forward(noise).detach()
data_real = image
loss_d_step = train_discriminator(optim_d, data_real, data_fake)
loss_d += loss_d_step
noise = Tensor(np.random.randn(batch_size, 128), gpu=GPU)
data_fake = generator.forward(noise)
loss_g_step = train_generator(optim_g, data_fake)
loss_g += loss_g_step
fake_images = generator.forward(ds_noise).detach().cpu().data
fake_images = (fake_images.reshape(-1, 1, 28, 28) + 1) / 2 # 0 - 1 range.
fake_images = make_grid(torch.tensor(fake_images))
save_image(fake_images, os.path.join(output_folder, f"image_{epoch}.jpg"))
epoch_loss_g = loss_g / n_steps
epoch_loss_d = loss_d / n_steps
print(f"EPOCH: Generator loss: {epoch_loss_g}, Discriminator loss: {epoch_loss_d}")
else:
print("Training Completed!")

View File

@@ -1,9 +1,10 @@
#!/usr/bin/env python
#inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
# inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
import os
import sys
sys.path.append(os.getcwd())
sys.path.append(os.path.join(os.getcwd(), 'test'))
sys.path.append(os.path.join(os.getcwd(), "test"))
import numpy as np
from tinygrad.tensor import Tensor, GPU
@@ -13,126 +14,139 @@ from test_mnist import fetch_mnist
from extra.training import train, evaluate
import tinygrad.optim as optim
from extra.augment import augment_img
GPU = os.getenv("GPU", None) is not None
QUICK = os.getenv("QUICK", None) is not None
DEBUG = os.getenv("DEBUG", None) is not None
class SqueezeExciteBlock2D:
def __init__(self, filters):
self.filters = filters
self.weight1 = Tensor.uniform(self.filters, self.filters//32)
self.bias1 = Tensor.uniform(1,self.filters//32)
self.weight2 = Tensor.uniform(self.filters//32, self.filters)
self.weight1 = Tensor.uniform(self.filters, self.filters // 32)
self.bias1 = Tensor.uniform(1, self.filters // 32)
self.weight2 = Tensor.uniform(self.filters // 32, self.filters)
self.bias2 = Tensor.uniform(1, self.filters)
def __call__(self, input):
se = input.avg_pool2d(kernel_size=(input.shape[2], input.shape[3])) #GlobalAveragePool2D
se = input.avg_pool2d(
kernel_size=(input.shape[2], input.shape[3])
) # GlobalAveragePool2D
se = se.reshape(shape=(-1, self.filters))
se = se.dot(self.weight1) + self.bias1
se = se.relu()
se = se.relu()
se = se.dot(self.weight2) + self.bias2
se = se.sigmoid().reshape(shape=(-1,self.filters,1,1)) #for broadcasting
se = se.sigmoid().reshape(shape=(-1, self.filters, 1, 1)) # for broadcasting
se = input.mul(se)
return se
class ConvBlock:
def __init__(self, h, w, inp, filters=128, conv=3):
self.h, self.w = h, w
self.inp = inp
#init weights
self.cweights = [Tensor.uniform(filters, inp if i==0 else filters, conv, conv) for i in range(3)]
# init weights
self.cweights = [
Tensor.uniform(filters, inp if i == 0 else filters, conv, conv) for i in range(3)
]
self.cbiases = [Tensor.uniform(1, filters, 1, 1) for i in range(3)]
#init layers
# init layers
self._bn = BatchNorm2D(128, training=True)
self._seb = SqueezeExciteBlock2D(filters)
def __call__(self, input):
x = input.reshape(shape=(-1, self.inp, self.w, self.h))
x = input.reshape(shape=(-1, self.inp, self.w, self.h))
for cweight, cbias in zip(self.cweights, self.cbiases):
x = x.pad2d(padding=[1,1,1,1]).conv2d(cweight).add(cbias).relu()
x = x.pad2d(padding=[1, 1, 1, 1]).conv2d(cweight).add(cbias).relu()
x = self._bn(x)
x = self._seb(x)
return x
class BigConvNet:
def __init__(self):
self.conv = [ConvBlock(28,28,1), ConvBlock(28,28,128), ConvBlock(14,14,128)]
self.weight1 = Tensor.uniform(128,10)
self.weight2 = Tensor.uniform(128,10)
self.conv = [ConvBlock(28, 28, 1), ConvBlock(28, 28, 128), ConvBlock(14, 14, 128)]
self.weight1 = Tensor.uniform(128, 10)
self.weight2 = Tensor.uniform(128, 10)
def parameters(self):
if DEBUG: #keeping this for a moment
if DEBUG: # keeping this for a moment
pars = [par for par in get_parameters(self) if par.requires_grad]
no_pars = 0
for par in pars:
print(par.shape)
no_pars += np.prod(par.shape)
print('no of parameters', no_pars)
print("no of parameters", no_pars)
return pars
else:
return get_parameters(self)
def save(self, filename):
with open(filename+'.npy', 'wb') as f:
with open(filename + ".npy", "wb") as f:
for par in get_parameters(self):
#if par.requires_grad:
# if par.requires_grad:
np.save(f, par.cpu().data)
def load(self, filename):
with open(filename+'.npy', 'rb') as f:
for par in get_parameters(self):
#if par.requires_grad:
with open(filename + ".npy", "rb") as f:
for par in get_parameters(self):
# if par.requires_grad:
try:
par.cpu().data[:] = np.load(f)
if GPU:
par.cuda()
except:
print('Could not load parameter')
print("Could not load parameter")
def forward(self, x):
x = self.conv[0](x)
x = self.conv[1](x)
x = x.avg_pool2d(kernel_size=(2,2))
x = x.avg_pool2d(kernel_size=(2, 2))
x = self.conv[2](x)
x1 = x.avg_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
x2 = x.max_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
x1 = x.avg_pool2d(kernel_size=(14, 14)).reshape(shape=(-1, 128)) # global
x2 = x.max_pool2d(kernel_size=(14, 14)).reshape(shape=(-1, 128)) # global
xo = x1.dot(self.weight1) + x2.dot(self.weight2)
return xo.logsoftmax()
if __name__ == "__main__":
lrs = [1e-4, 1e-5] if QUICK else [1e-3, 1e-4, 1e-5, 1e-5]
epochss = [2, 1] if QUICK else [13, 3, 3, 1]
epochss = [2, 1] if QUICK else [13, 3, 3, 1]
BS = 32
lmbd = 0.00025
lossfn = lambda out,y: out.mul(y).mean() + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
lossfn = (
lambda out, y: out.mul(y).mean()
+ lmbd * (model.weight1.abs() + model.weight2.abs()).sum()
)
X_train, Y_train, X_test, Y_test = fetch_mnist()
steps = len(X_train)//BS
steps = len(X_train) // BS
np.random.seed(1337)
if QUICK:
steps = 1
X_test, Y_test = X_test[:BS], Y_test[:BS]
model = BigConvNet()
if len(sys.argv) > 1:
try:
model.load(sys.argv[1])
print('Loaded weights "'+sys.argv[1]+'", evaluating...')
print('Loaded weights "' + sys.argv[1] + '", evaluating...')
evaluate(model, X_test, Y_test, BS=BS)
except:
print('could not load weights "'+sys.argv[1]+'".')
print('could not load weights "' + sys.argv[1] + '".')
if GPU:
params = get_parameters(model)
[x.cuda_() for x in params]
for lr, epochs in zip(lrs, epochss):
optimizer = optim.Adam(model.parameters(), lr=lr)
for epoch in range(1,epochs+1):
#first epoch without augmentation
for epoch in range(1, epochs + 1):
# first epoch without augmentation
X_aug = X_train if epoch == 1 else augment_img(X_train)
train(model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, gpu=GPU, BS=BS)
train(
model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, gpu=GPU, BS=BS
)
accuracy = evaluate(model, X_test, Y_test, BS=BS)
model.save('examples/checkpoint'+str("%.0f" % (accuracy*1.0e6)))
model.save("examples/checkpoint" + str("%.0f" % (accuracy * 1.0e6)))

View File

@@ -10,13 +10,14 @@ import io
import tarfile
import pickle
class TinyConvNet:
def __init__(self, classes=10):
conv = 3
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.uniform(inter_chan,3,conv,conv)
self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
self.l1 = Tensor.uniform(out_chan*6*6, classes)
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.uniform(inter_chan, 3, conv, conv)
self.c2 = Tensor.uniform(out_chan, inter_chan, conv, conv)
self.l1 = Tensor.uniform(out_chan * 6 * 6, classes)
def forward(self, x):
x = x.conv2d(self.c1).relu().max_pool2d()
@@ -24,13 +25,20 @@ class TinyConvNet:
x = x.reshape(shape=[x.shape[0], -1])
return x.dot(self.l1).logsoftmax()
def load_cifar():
tt = tarfile.open(fileobj=io.BytesIO(fetch('https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')), mode='r:gz')
db = pickle.load(tt.extractfile('cifar-10-batches-py/data_batch_1'), encoding="bytes")
X = db[b'data'].reshape((-1, 3, 32, 32))
Y = np.array(db[b'labels'])
tt = tarfile.open(
fileobj=io.BytesIO(
fetch("https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")
),
mode="r:gz",
)
db = pickle.load(tt.extractfile("cifar-10-batches-py/data_batch_1"), encoding="bytes")
X = db[b"data"].reshape((-1, 3, 32, 32))
Y = np.array(db[b"labels"])
return X, Y
if __name__ == "__main__":
X_train, Y_train = load_cifar()
classes = 10
@@ -40,7 +48,7 @@ if __name__ == "__main__":
TRANSFER = os.getenv("TRANSFER") is not None
if TINY:
model = TinyConvNet(classes)
elif TRANSFER:
elif TRANSFER:
model = EfficientNet(int(os.getenv("NUM", "0")), classes, has_se=True)
model.load_weights_from_torch()
else:
@@ -50,21 +58,21 @@ if __name__ == "__main__":
print("parameters", len(parameters))
optimizer = optim.Adam(parameters, lr=0.001)
#BS, steps = 16, 32
# BS, steps = 16, 32
BS, steps = 64 if TINY else 16, 2048
for i in (t := trange(steps)):
samp = np.random.randint(0, X_train.shape[0], size=(BS))
img = X_train[samp].astype(np.float32)
st = time.time()
out = model.forward(Tensor(img))
fp_time = (time.time()-st)*1000.0
fp_time = (time.time() - st) * 1000.0
Y = Y_train[samp]
y = np.zeros((BS,classes), np.float32)
y[range(y.shape[0]),Y] = -classes
y = np.zeros((BS, classes), np.float32)
y[range(y.shape[0]), Y] = -classes
y = Tensor(y)
loss = out.logsoftmax().mul(y).mean()
@@ -72,25 +80,32 @@ if __name__ == "__main__":
st = time.time()
loss.backward()
bp_time = (time.time()-st)*1000.0
bp_time = (time.time() - st) * 1000.0
st = time.time()
optimizer.step()
opt_time = (time.time()-st)*1000.0
opt_time = (time.time() - st) * 1000.0
#print(out.cpu().data)
# print(out.cpu().data)
st = time.time()
loss = loss.cpu().data
cat = np.argmax(out.cpu().data, axis=1)
accuracy = (cat == Y).mean()
finish_time = (time.time()-st)*1000.0
finish_time = (time.time() - st) * 1000.0
# printing
t.set_description("loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f" %
(loss, accuracy,
fp_time, bp_time, opt_time, finish_time,
fp_time + bp_time + opt_time + finish_time))
t.set_description(
"loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f"
% (
loss,
accuracy,
fp_time,
bp_time,
opt_time,
finish_time,
fp_time + bp_time + opt_time + finish_time,
)
)
del out, y, loss

View File

@@ -3,9 +3,8 @@ import numpy as np
from tinygrad.tensor import Tensor
import time
a = Tensor([-2,-1,0,1,2]).ane()
a = Tensor([-2, -1, 0, 1, 2]).ane()
print(a.cpu())
b = a.relu()
print(b.cpu())
assert(np.all(b.cpu().data >= 0))
assert np.all(b.cpu().data >= 0)

View File

@@ -2,39 +2,47 @@ import numpy as np
from PIL import Image
import os
import sys
sys.path.append(os.getcwd())
sys.path.append(os.path.join(os.getcwd(), 'test'))
sys.path.append(os.path.join(os.getcwd(), "test"))
from test_mnist import fetch_mnist
from tqdm import trange
def augment_img(X, rotate=10, px=3):
Xaug = np.zeros_like(X)
for i in trange(len(X)):
im = Image.fromarray(X[i])
im = im.rotate(np.random.randint(-rotate,rotate), resample=Image.BICUBIC)
im = im.rotate(np.random.randint(-rotate, rotate), resample=Image.BICUBIC)
w, h = X.shape[1:]
#upper left, lower left, lower right, upper right
quad = np.random.randint(-px,px,size=(8)) + np.array([0,0,0,h,w,h,w,0])
# upper left, lower left, lower right, upper right
quad = np.random.randint(-px, px, size=(8)) + np.array([0, 0, 0, h, w, h, w, 0])
im = im.transform((w, h), Image.QUAD, quad, resample=Image.BICUBIC)
Xaug[i] = im
return Xaug
if __name__ == "__main__":
from test_mnist import fetch_mnist
import matplotlib.pyplot as plt
X_train, Y_train, X_test, Y_test = fetch_mnist()
X = np.vstack([X_train[:1]]*10+[X_train[1:2]]*10)
fig, a = plt.subplots(2,len(X))
X = np.vstack([X_train[:1]] * 10 + [X_train[1:2]] * 10)
fig, a = plt.subplots(2, len(X))
Xaug = augment_img(X)
for i in range(len(X)):
a[0][i].imshow(X[i], cmap='gray')
a[1][i].imshow(Xaug[i],cmap='gray')
a[0][i].axis('off')
a[1][i].axis('off')
a[0][i].imshow(X[i], cmap="gray")
a[1][i].imshow(Xaug[i], cmap="gray")
a[0][i].axis("off")
a[1][i].axis("off")
plt.show()
#create some nice gifs for doc?!
# create some nice gifs for doc?!
for i in range(10):
im = Image.fromarray(X_train[7353+i])
im_aug = [Image.fromarray(x) for x in augment_img(np.array([X_train[7353+i]]*100))]
im.save("aug"+str(i)+".gif", save_all=True, append_images=im_aug, duration=100, loop=0)
im = Image.fromarray(X_train[7353 + i])
im_aug = [
Image.fromarray(x) for x in augment_img(np.array([X_train[7353 + i]] * 100))
]
im.save(
"aug" + str(i) + ".gif", save_all=True, append_images=im_aug, duration=100, loop=0
)

View File

@@ -6,6 +6,7 @@ from extra.utils import fetch
USE_TORCH = False
def fake_torch_load(b0):
import io
import pickle
@@ -23,9 +24,9 @@ def fake_torch_load(b0):
class HackTensor:
def __new__(cls, *args):
#print(args)
# print(args)
ident, storage_type, obj_key, location, obj_size, view_metadata = args[0]
assert ident == 'storage'
assert ident == "storage"
ret = np.zeros(obj_size, dtype=storage_type)
key_prelookup[obj_key] = (storage_type, obj_size, ret, args[2], args[3])
@@ -33,10 +34,10 @@ def fake_torch_load(b0):
class MyPickle(pickle.Unpickler):
def find_class(self, module, name):
#print(module, name)
if name == 'FloatStorage':
# print(module, name)
if name == "FloatStorage":
return np.float32
if name == 'LongStorage':
if name == "LongStorage":
return np.int64
if module == "torch._utils" or module == "torch":
return HackTensor
@@ -51,7 +52,7 @@ def fake_torch_load(b0):
# create key_lookup
key_lookup = pickle.load(fb0)
key_real = [None] * len(key_lookup)
for k,v in key_prelookup.items():
for k, v in key_prelookup.items():
key_real[key_lookup.index(k)] = v
# read in the actual data
@@ -64,13 +65,23 @@ def fake_torch_load(b0):
np_array.shape = np_shape
# numpy stores its strides in bytes
real_strides = tuple([x*bytes_size for x in np_strides])
real_strides = tuple([x * bytes_size for x in np_strides])
np_array.strides = real_strides
return ret
class MBConvBlock:
def __init__(self, kernel_size, strides, expand_ratio, input_filters, output_filters, se_ratio, has_se):
def __init__(
self,
kernel_size,
strides,
expand_ratio,
input_filters,
output_filters,
se_ratio,
has_se,
):
oup = expand_ratio * input_filters
if expand_ratio != 1:
self._expand_conv = Tensor.uniform(oup, input_filters, 1, 1)
@@ -79,10 +90,10 @@ class MBConvBlock:
self._expand_conv = None
self.strides = strides
if strides == (2,2):
self.pad = [(kernel_size-1)//2-1, (kernel_size-1)//2]*2
if strides == (2, 2):
self.pad = [(kernel_size - 1) // 2 - 1, (kernel_size - 1) // 2] * 2
else:
self.pad = [(kernel_size-1)//2]*4
self.pad = [(kernel_size - 1) // 2] * 4
self._depthwise_conv = Tensor.uniform(oup, 1, kernel_size, kernel_size)
self._bn1 = BatchNorm2D(oup)
@@ -103,14 +114,22 @@ class MBConvBlock:
if self._expand_conv:
x = self._bn0(x.conv2d(self._expand_conv)).swish()
x = x.pad2d(padding=self.pad)
x = x.conv2d(self._depthwise_conv, stride=self.strides, groups=self._depthwise_conv.shape[0])
x = x.conv2d(
self._depthwise_conv, stride=self.strides, groups=self._depthwise_conv.shape[0]
)
x = self._bn1(x).swish()
# has_se
if self.has_se:
x_squeezed = x.avg_pool2d(kernel_size=x.shape[2:4])
x_squeezed = x_squeezed.conv2d(self._se_reduce).add(self._se_reduce_bias.reshape(shape=[1, -1, 1, 1])).swish()
x_squeezed = x_squeezed.conv2d(self._se_expand).add(self._se_expand_bias.reshape(shape=[1, -1, 1, 1]))
x_squeezed = (
x_squeezed.conv2d(self._se_reduce)
.add(self._se_reduce_bias.reshape(shape=[1, -1, 1, 1]))
.swish()
)
x_squeezed = x_squeezed.conv2d(self._se_expand).add(
self._se_expand_bias.reshape(shape=[1, -1, 1, 1])
)
x = x.mul(x_squeezed.sigmoid())
x = self._bn2(x.conv2d(self._project_conv))
@@ -118,21 +137,22 @@ class MBConvBlock:
x = x.add(inputs)
return x
class EfficientNet:
def __init__(self, number=0, classes=1000, has_se=True):
self.number = number
global_params = [
# width, depth
(1.0, 1.0), # b0
(1.0, 1.1), # b1
(1.1, 1.2), # b2
(1.2, 1.4), # b3
(1.4, 1.8), # b4
(1.6, 2.2), # b5
(1.8, 2.6), # b6
(2.0, 3.1), # b7
(2.2, 3.6), # b8
(4.3, 5.3), # l2
(1.0, 1.0), # b0
(1.0, 1.1), # b1
(1.1, 1.2), # b2
(1.2, 1.4), # b3
(1.4, 1.8), # b4
(1.6, 2.2), # b5
(1.8, 2.6), # b6
(2.0, 3.1), # b7
(2.2, 3.6), # b8
(4.3, 5.3), # l2
][number]
def round_filters(filters):
@@ -140,7 +160,7 @@ class EfficientNet:
divisor = 8
filters *= multiplier
new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
new_filters += divisor
return int(new_filters)
@@ -151,13 +171,13 @@ class EfficientNet:
self._conv_stem = Tensor.uniform(out_channels, 3, 3, 3)
self._bn0 = BatchNorm2D(out_channels)
blocks_args = [
[1, 3, (1,1), 1, 32, 16, 0.25],
[2, 3, (2,2), 6, 16, 24, 0.25],
[2, 5, (2,2), 6, 24, 40, 0.25],
[3, 3, (2,2), 6, 40, 80, 0.25],
[3, 5, (1,1), 6, 80, 112, 0.25],
[4, 5, (2,2), 6, 112, 192, 0.25],
[1, 3, (1,1), 6, 192, 320, 0.25],
[1, 3, (1, 1), 1, 32, 16, 0.25],
[2, 3, (2, 2), 6, 16, 24, 0.25],
[2, 5, (2, 2), 6, 24, 40, 0.25],
[3, 3, (2, 2), 6, 40, 80, 0.25],
[3, 5, (1, 1), 6, 80, 112, 0.25],
[4, 5, (2, 2), 6, 112, 192, 0.25],
[1, 3, (1, 1), 6, 192, 320, 0.25],
]
self._blocks = []
# num_repeats, kernel_size, strides, expand_ratio, input_filters, output_filters, se_ratio
@@ -168,7 +188,7 @@ class EfficientNet:
for n in range(round_repeats(b[0])):
self._blocks.append(MBConvBlock(*args, has_se=has_se))
args[3] = args[4]
args[1] = (1,1)
args[1] = (1, 1)
in_channels = round_filters(320)
out_channels = round_filters(1280)
@@ -178,43 +198,52 @@ class EfficientNet:
self._fc_bias = Tensor.zeros(classes)
def forward(self, x):
x = x.pad2d(padding=(0,1,0,1))
x = x.pad2d(padding=(0, 1, 0, 1))
x = self._bn0(x.conv2d(self._conv_stem, stride=2)).swish()
#print(x.shape, x.data[:, 0, 0, 0])
# print(x.shape, x.data[:, 0, 0, 0])
for block in self._blocks:
x = block(x)
x = self._bn1(x.conv2d(self._conv_head)).swish()
x = x.avg_pool2d(kernel_size=x.shape[2:4])
x = x.reshape(shape=(-1, x.shape[1]))
#x = x.dropout(0.2)
return x.dot(self._fc).add(self._fc_bias.reshape(shape=[1,-1]))
# x = x.dropout(0.2)
return x.dot(self._fc).add(self._fc_bias.reshape(shape=[1, -1]))
def load_weights_from_torch(self):
# load b0
# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/utils.py#L551
if self.number == 0:
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth")
b0 = fetch(
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth"
)
elif self.number == 2:
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth")
b0 = fetch(
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth"
)
elif self.number == 4:
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth")
b0 = fetch(
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth"
)
elif self.number == 7:
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth")
b0 = fetch(
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth"
)
else:
raise Exception("no pretrained weights")
if USE_TORCH:
import io
import torch
b0 = torch.load(io.BytesIO(b0))
else:
b0 = fake_torch_load(b0)
for k,v in b0.items():
if '_blocks.' in k:
for k, v in b0.items():
if "_blocks." in k:
k = "%s[%s].%s" % tuple(k.split(".", 2))
mk = "self."+k
#print(k, v.shape)
mk = "self." + k
# print(k, v.shape)
try:
mv = eval(mk)
except AttributeError:
@@ -223,10 +252,9 @@ class EfficientNet:
except AttributeError:
mv = eval(mk.replace(".bias", "_bias"))
vnp = v.numpy().astype(np.float32) if USE_TORCH else v
vnp = vnp if k != '_fc.weight' else vnp.T
vnp = vnp if k != "_fc.weight" else vnp.T
if mv.shape == vnp.shape or vnp.shape == ():
mv.data[:] = vnp
else:
print("MISMATCH SHAPE IN %s, %r %r" % (k, mv.shape, vnp.shape))

View File

@@ -1,17 +1,19 @@
import numpy as np
from tinygrad.tensor import Tensor
def mask_like(like, mask_inx, mask_value = 1.0):
def mask_like(like, mask_inx, mask_value=1.0):
mask = np.zeros_like(like).reshape(-1)
mask[mask_inx] = mask_value
return mask.reshape(like.shape)
def jacobian(func, input):
output = func(input)
ji = input.data.reshape(-1).shape[-1]
jo = output.data.reshape(-1).shape[-1]
J = np.zeros((jo,ji), dtype=np.float32)
J = np.zeros((jo, ji), dtype=np.float32)
for o in range(jo):
input.grad = None
@@ -19,14 +21,15 @@ def jacobian(func, input):
# tinygrad doesn't support slicing, tiny-hack to select
# the needed scalar and backpropagate only through it
o_scalar = Tensor(mask_like(output.data, o, 1.)).mul(output).sum()
o_scalar = Tensor(mask_like(output.data, o, 1.0)).mul(output).sum()
o_scalar.backward()
for i, grad in enumerate(input.grad.data.reshape(-1)):
J[o,i] = grad
J[o, i] = grad
return J
def numerical_jacobian(func, input, eps = 1e-6):
def numerical_jacobian(func, input, eps=1e-6):
output = func(input)
ji = input.data.reshape(-1).shape[-1]
@@ -34,17 +37,18 @@ def numerical_jacobian(func, input, eps = 1e-6):
NJ = np.zeros((jo, ji), dtype=np.float32)
for i in range(ji):
eps_perturb = mask_like(input.data, i, mask_value = eps)
eps_perturb = mask_like(input.data, i, mask_value=eps)
output_perturb_add = func(Tensor(input.data + eps_perturb)).data.reshape(-1)
output_perturb_sub = func(Tensor(input.data - eps_perturb)).data.reshape(-1)
grad_approx = ((output_perturb_add) - (output_perturb_sub)) / (2*eps)
grad_approx = ((output_perturb_add) - (output_perturb_sub)) / (2 * eps)
NJ[:,i] = grad_approx
NJ[:, i] = grad_approx
return NJ
def gradcheck(func, input, eps = 1e-06, atol = 1e-5, rtol = 0.001):
def gradcheck(func, input, eps=1e-06, atol=1e-5, rtol=0.001):
NJ = numerical_jacobian(func, input, eps)
J = jacobian(func, input)
return np.allclose(J, NJ, atol=atol, rtol=rtol)
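A usage sketch for gradcheck, assuming the tinygrad Tensor ops exercised elsewhere in this commit (dot, logsoftmax):

import numpy as np
from tinygrad.tensor import Tensor

np.random.seed(1337)
W = Tensor(np.random.randn(5, 3).astype(np.float32))
x = Tensor(np.random.randn(1, 5).astype(np.float32))
# True when the analytic and numerical jacobians agree within atol/rtol
print(gradcheck(lambda t: t.dot(W).logsoftmax(), x))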

View File

@@ -4,26 +4,40 @@ from tqdm import trange
from extra.utils import get_parameters
from tinygrad.tensor import Tensor, GPU, Device
def train(model, X_train, Y_train, optim, steps, num_classes=None, BS=128, device=Device.CPU, lossfn = lambda out,y: out.mul(y).mean()):
if device == Device.GPU: [x.gpu_() for x in get_parameters([model, optim])]
elif device == Device.ANE: [x.ane_() for x in get_parameters([model, optim])]
if num_classes is None: num_classes = Y_train.max().astype(int)+1
def train(
model,
X_train,
Y_train,
optim,
steps,
num_classes=None,
BS=128,
device=Device.CPU,
lossfn=lambda out, y: out.mul(y).mean(),
):
if device == Device.GPU:
[x.gpu_() for x in get_parameters([model, optim])]
elif device == Device.ANE:
[x.ane_() for x in get_parameters([model, optim])]
if num_classes is None:
num_classes = Y_train.max().astype(int) + 1
losses, accuracies = [], []
for i in (t := trange(steps, disable=os.getenv('CI') is not None)):
for i in (t := trange(steps, disable=os.getenv("CI") is not None)):
samp = np.random.randint(0, X_train.shape[0], size=(BS))
x = Tensor(X_train[samp].reshape((-1, 28*28)).astype(np.float32), device=device)
x = Tensor(X_train[samp].reshape((-1, 28 * 28)).astype(np.float32), device=device)
Y = Y_train[samp]
y = np.zeros((len(samp),num_classes), np.float32)
y = np.zeros((len(samp), num_classes), np.float32)
# correct loss for NLL, torch NLL loss returns one per row
y[range(y.shape[0]),Y] = -1.0*num_classes
y[range(y.shape[0]), Y] = -1.0 * num_classes
y = Tensor(y, device=device)
# network
out = model.forward(x)
# NLL loss function
loss = lossfn(out, y)
loss = lossfn(out, y)
optim.zero_grad()
loss.backward()
optim.step()
@@ -37,15 +51,26 @@ def train(model, X_train, Y_train, optim, steps, num_classes=None, BS=128, devic
accuracies.append(accuracy)
t.set_description("loss %.2f accuracy %.2f" % (loss, accuracy))
def evaluate(model, X_test, Y_test, num_classes=None, device=Device.CPU, BS=128):
def numpy_eval(num_classes):
Y_test_preds_out = np.zeros((len(Y_test),num_classes))
for i in trange(len(Y_test)//BS, disable=os.getenv('CI') is not None):
Y_test_preds_out[i*BS:(i+1)*BS] = model.forward(Tensor(X_test[i*BS:(i+1)*BS].reshape((-1, 28*28)).astype(np.float32), device=device)).cpu().data
Y_test_preds_out = np.zeros((len(Y_test), num_classes))
for i in trange(len(Y_test) // BS, disable=os.getenv("CI") is not None):
Y_test_preds_out[i * BS : (i + 1) * BS] = (
model.forward(
Tensor(
X_test[i * BS : (i + 1) * BS].reshape((-1, 28 * 28)).astype(np.float32),
device=device,
)
)
.cpu()
.data
)
Y_test_preds = np.argmax(Y_test_preds_out, axis=1)
return (Y_test == Y_test_preds).mean()
if num_classes is None: num_classes = Y_test.max().astype(int)+1
if num_classes is None:
num_classes = Y_test.max().astype(int) + 1
accuracy = numpy_eval(num_classes)
print("test set accuracy is %f" % accuracy)
return accuracy
return accuracy
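A sketch tying train and evaluate together, mirroring how test_mnist drives them (TinyBobNet and the MNIST arrays come from that test module):

import tinygrad.optim as optim

model = TinyBobNet()
optimizer = optim.RMSprop(model.parameters(), lr=0.0002)
train(model, X_train, Y_train, optimizer, steps=1000)
evaluate(model, X_test, Y_test)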

View File

@@ -1,19 +1,22 @@
from tinygrad.tensor import Tensor
def fetch(url):
import requests, os, hashlib, tempfile
fp = os.path.join(tempfile.gettempdir(), hashlib.md5(url.encode('utf-8')).hexdigest())
fp = os.path.join(tempfile.gettempdir(), hashlib.md5(url.encode("utf-8")).hexdigest())
if os.path.isfile(fp) and os.stat(fp).st_size > 0:
with open(fp, "rb") as f:
dat = f.read()
else:
print("fetching %s" % url)
dat = requests.get(url).content
with open(fp+".tmp", "wb") as f:
with open(fp + ".tmp", "wb") as f:
f.write(dat)
os.rename(fp+".tmp", fp)
os.rename(fp + ".tmp", fp)
return dat
def get_parameters(obj):
parameters = []
if isinstance(obj, Tensor):
@@ -21,8 +24,7 @@ def get_parameters(obj):
elif isinstance(obj, list):
for x in obj:
parameters.extend(get_parameters(x))
elif hasattr(obj, '__dict__'):
for k,v in obj.__dict__.items():
elif hasattr(obj, "__dict__"):
for k, v in obj.__dict__.items():
parameters.extend(get_parameters(v))
return parameters
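fetch keys its on-disk cache on the md5 of the URL under tempfile.gettempdir(), writing to a .tmp file and renaming only on success so an interrupted download never leaves a truncated cache entry. A sketch (URL hypothetical):

dat = fetch("https://example.com/weights.bin")  # first call downloads and caches
dat = fetch("https://example.com/weights.bin")  # second call reads the cached copy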

View File

@@ -4,29 +4,31 @@ import os
from setuptools import setup
directory = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(directory, 'README.md'), encoding='utf-8') as f:
with open(os.path.join(directory, "README.md"), encoding="utf-8") as f:
long_description = f.read()
setup(name='tinygrad',
version='0.3.0',
description='You like pytorch? You like micrograd? You love tinygrad! ❤️',
author='George Hotz',
license='MIT',
long_description=long_description,
long_description_content_type='text/markdown',
packages = ['tinygrad'],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License"
],
install_requires=['numpy', 'requests'],
python_requires='>=3.8',
extras_require={
'gpu': ["pyopencl", "six"],
'testing': [
"pytest",
"torch",
"tqdm",
],
},
include_package_data=True)
setup(
name="tinygrad",
version="0.3.0",
description="You like pytorch? You like micrograd? You love tinygrad! heart",
author="George Hotz",
license="MIT",
long_description=long_description,
long_description_content_type="text/markdown",
packages=["tinygrad"],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
],
install_requires=["numpy", "requests"],
python_requires=">=3.8",
extras_require={
"gpu": ["pyopencl", "six"],
"testing": [
"pytest",
"torch",
"tqdm",
],
},
include_package_data=True,
)

View File

@@ -1,3 +1,3 @@
import os
ANE = os.environ.get('ANE', False)
ANE = os.environ.get("ANE", False)

View File

@@ -4,43 +4,48 @@ import unittest
from tinygrad.tensor import Tensor, GPU, Device
from .config import ANE
def tensors_allocated():
return sum([isinstance(x, Tensor) for x in gc.get_objects()])
class TestGC(unittest.TestCase):
device = Device.CPU
def test_gc(self):
a = Tensor.zeros(4,4, device=self.device)
b = Tensor.zeros(4,4, device=self.device)
(a*b).mean().backward()
assert(tensors_allocated() > 0)
del a,b
assert(tensors_allocated() == 0)
a = Tensor.zeros(4, 4, device=self.device)
b = Tensor.zeros(4, 4, device=self.device)
(a * b).mean().backward()
assert tensors_allocated() > 0
del a, b
assert tensors_allocated() == 0
def test_gc_complex(self):
a = Tensor.zeros(4,4, device=self.device)
b = Tensor.zeros(4,4, device=self.device)
assert(tensors_allocated() == 2)
(a*b).mean().backward()
assert(tensors_allocated() == 4)
a = Tensor.zeros(4, 4, device=self.device)
b = Tensor.zeros(4, 4, device=self.device)
assert tensors_allocated() == 2
(a * b).mean().backward()
assert tensors_allocated() == 4
del b
assert(tensors_allocated() == 2)
b = Tensor.zeros(4,4, device=self.device)
assert tensors_allocated() == 2
b = Tensor.zeros(4, 4, device=self.device)
print(tensors_allocated())
(a*b).mean().backward()
(a * b).mean().backward()
print(tensors_allocated())
assert(tensors_allocated() == 4)
assert tensors_allocated() == 4
del b
assert(tensors_allocated() == 2)
assert tensors_allocated() == 2
@unittest.skipUnless(GPU, "Requires GPU")
class TestGCGPU(TestGC):
device = Device.GPU
device = Device.GPU
@unittest.skipUnless(ANE, "Requires ANE")
class TestGCANE(TestGC):
device=Device.ANE
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -11,19 +11,28 @@ from .config import ANE
# mnist loader
def fetch_mnist():
import gzip
parse = lambda dat: np.frombuffer(gzip.decompress(dat), dtype=np.uint8).copy()
X_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))[0x10:].reshape((-1, 28, 28))
Y_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))[8:]
X_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"))[0x10:].reshape((-1, 28, 28))
Y_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"))[8:]
X_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))[
0x10:
].reshape((-1, 28, 28))
Y_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))[
8:
]
X_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"))[
0x10:
].reshape((-1, 28, 28))
Y_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"))[
8:
]
return X_train, Y_train, X_test, Y_test
# load the mnist dataset
X_train, Y_train, X_test, Y_test = fetch_mnist()
# create a model
class TinyBobNet:
def __init__(self):
self.l1 = Tensor.uniform(784, 128)
self.l2 = Tensor.uniform(128, 10)
@@ -34,27 +43,29 @@ class TinyBobNet:
def forward(self, x):
return x.dot(self.l1).relu().dot(self.l2).logsoftmax()
# create a model with a conv layer
class TinyConvNet:
def __init__(self):
# https://keras.io/examples/vision/mnist_convnet/
conv = 3
#inter_chan, out_chan = 32, 64
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.uniform(inter_chan,1,conv,conv)
self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
self.l1 = Tensor.uniform(out_chan*5*5, 10)
# inter_chan, out_chan = 32, 64
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.uniform(inter_chan, 1, conv, conv)
self.c2 = Tensor.uniform(out_chan, inter_chan, conv, conv)
self.l1 = Tensor.uniform(out_chan * 5 * 5, 10)
def parameters(self):
return get_parameters(self)
def forward(self, x):
x = x.reshape(shape=(-1, 1, 28, 28)) # hacks
x = x.reshape(shape=(-1, 1, 28, 28)) # hacks
x = x.conv2d(self.c1).relu().max_pool2d()
x = x.conv2d(self.c2).relu().max_pool2d()
x = x.reshape(shape=[x.shape[0], -1])
return x.dot(self.l1).logsoftmax()
class TestMNIST(unittest.TestCase):
device = Device.CPU
@@ -76,16 +87,19 @@ class TestMNIST(unittest.TestCase):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.RMSprop(model.parameters(), lr=0.0002)
train(model, X_train, Y_train, optimizer, steps=1000, device=self.device)
train(model, X_train, Y_train, optimizer, steps=1000, device=self.device)
assert evaluate(model, X_test, Y_test, device=self.device) > 0.95
@unittest.skipUnless(GPU, "Requires GPU")
class TestMNISTGPU(TestMNIST):
device = Device.GPU
@unittest.skipUnless(ANE, "Requires ANE")
class TestMNISTANE(TestMNIST):
device=Device.ANE
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -7,21 +7,25 @@ import torch
from tinygrad.tensor import Tensor, GPU, Device
from .config import ANE
def start_profile():
import time
pr = cProfile.Profile(timer=lambda: int(time.time()*1e9), timeunit=1e-6)
pr = cProfile.Profile(timer=lambda: int(time.time() * 1e9), timeunit=1e-6)
pr.enable()
return pr
def stop_profile(pr, sort='cumtime'):
def stop_profile(pr, sort="cumtime"):
pr.disable()
ps = pstats.Stats(pr)
ps.strip_dirs()
ps.sort_stats(sort)
ps.print_stats(0.2)
class TestConvSpeed(unittest.TestCase):
device= Device.CPU
device = Device.CPU
def test_mnist(self):
# https://keras.io/examples/vision/mnist_convnet/
@@ -34,12 +38,12 @@ class TestConvSpeed(unittest.TestCase):
conv = 3
inter_chan, out_chan = 32, 64
c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)
c1 = torch.randn(inter_chan, 1, conv, conv, requires_grad=True)
c2 = torch.randn(out_chan, inter_chan, conv, conv, requires_grad=True)
l1 = torch.randn(out_chan * 5 * 5, 10, requires_grad=True)
c2d = torch.nn.functional.conv2d
mp = torch.nn.MaxPool2d((2,2))
mp = torch.nn.MaxPool2d((2, 2))
lsm = torch.nn.LogSoftmax(dim=1)
cnt = 5
@@ -47,19 +51,19 @@ class TestConvSpeed(unittest.TestCase):
for i in range(cnt):
et0 = time.time()
x = torch.randn(128, 1, 28, 28, requires_grad=True)
x = mp(c2d(x,c1).relu())
x = mp(c2d(x,c2).relu())
x = mp(c2d(x, c1).relu())
x = mp(c2d(x, c2).relu())
x = x.reshape(x.shape[0], -1)
out = lsm(x.matmul(l1))
out = out.mean()
et1 = time.time()
out.backward()
et2 = time.time()
fpt += (et1-et0)
bpt += (et2-et1)
fpt += et1 - et0
bpt += et2 - et1
fpt_baseline = (fpt*1000/cnt)
bpt_baseline = (bpt*1000/cnt)
fpt_baseline = fpt * 1000 / cnt
bpt_baseline = bpt * 1000 / cnt
print("torch forward pass: %.3f ms" % fpt_baseline)
print("torch backward pass: %.3f ms" % bpt_baseline)
@@ -71,7 +75,7 @@ class TestConvSpeed(unittest.TestCase):
cnt = 5
fpt, bpt = 0.0, 0.0
for i in range(1+cnt):
for i in range(1 + cnt):
et0 = time.time()
x = Tensor.randn(128, 1, 28, 28, device=self.device)
x = x.conv2d(c1).relu().avg_pool2d()
@@ -85,24 +89,31 @@ class TestConvSpeed(unittest.TestCase):
if i == 0:
pr = start_profile()
else:
fpt += (et1-et0)
bpt += (et2-et1)
fpt += et1 - et0
bpt += et2 - et1
stop_profile(pr, sort="time")
fpt = fpt * 1000 / cnt
bpt = bpt * 1000 / cnt
print(
"forward pass: %.3f ms, %.2fx off baseline %.3f ms"
% (fpt, fpt / fpt_baseline, fpt_baseline)
)
print(
"backward pass: %.3f ms, %.2fx off baseline %.3f ms"
% (bpt, bpt / bpt_baseline, bpt_baseline)
)
stop_profile(pr, sort='time')
fpt = (fpt*1000/cnt)
bpt = (bpt*1000/cnt)
print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))
@unittest.skipUnless(GPU, "Requires GPU")
class TestConvSpeedGPU(TestConvSpeed):
device = Device.GPU
@unittest.skipUnless(ANE, "Requires ANE")
class TestConvSpeedANE(TestConvSpeed):
device=Device.ANE
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
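start_profile/stop_profile wrap cProfile with an integer nanosecond timer; print_stats(0.2) keeps only the top 20% of rows after sorting. A usage sketch:

pr = start_profile()
# ... workload to be profiled goes here ...
stop_profile(pr, sort="time")  # sort by internal time, print the top 20% of rows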

View File

@@ -7,6 +7,7 @@ from extra.utils import get_parameters
import torch
from .config import ANE
class TestNN(unittest.TestCase):
device = Device.CPU
@@ -30,8 +31,12 @@ class TestNN(unittest.TestCase):
tbn.running_mean[:] = torch.tensor(bn.running_mean.data)
tbn.running_var[:] = torch.tensor(bn.running_var.data)
np.testing.assert_allclose(bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5)
np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
np.testing.assert_allclose(
bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5
)
np.testing.assert_allclose(
bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5
)
# trial
inn = Tensor.randn(2, sz, 3, 3, device=self.device)
@@ -45,35 +50,42 @@ class TestNN(unittest.TestCase):
# close
np.testing.assert_allclose(outt.data, toutt.detach().numpy(), rtol=5e-5)
np.testing.assert_allclose(bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5)
np.testing.assert_allclose(
bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5
)
# TODO: this is failing
#np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
# np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
def test_batchnorm2d_training(self):
self.test_batchnorm2d(True)
@unittest.skipUnless(GPU, "Requires GPU")
class TestNNGPU(TestNN):
device = Device.GPU
@unittest.skip("Tests not added")
def test_batchnorm2d(self): pass
def test_batchnorm2d(self):
pass
@unittest.skip("Tests not added")
def test_batchnorm2d_training(self): pass
def test_batchnorm2d_training(self):
pass
@unittest.skipUnless(ANE, "Requires ANE")
class TestNNANE(TestNN):
device=Device.ANE
device = Device.ANE
@unittest.skip("Tests not added")
def test_batchnorm2d(self): pass
def test_batchnorm2d(self):
pass
@unittest.skip("Tests not added")
def test_batchnorm2d_training(self): pass
def test_batchnorm2d_training(self):
pass
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -7,13 +7,24 @@ import functools
from tinygrad.tensor import Tensor, GPU, Device
from .config import ANE
def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=0, rtol=1e-6, grad_atol=0, grad_rtol=1e-6, device=Device.CPU, forward_only=False):
def helper_test_op(
shps,
torch_fxn,
tinygrad_fxn,
atol=0,
rtol=1e-6,
grad_atol=0,
grad_rtol=1e-6,
device=Device.CPU,
forward_only=False,
):
torch.manual_seed(0)
ts = [torch.rand(x, requires_grad=True) for x in shps]
tst = [Tensor(x.detach().numpy()) for x in ts]
if device==Device.GPU:
if device == Device.GPU:
tst = [x.gpu() for x in tst]
elif device==Device.ANE:
elif device == Device.ANE:
tst = [x.ane() for x in tst]
out = torch_fxn(*ts)
@@ -26,142 +37,291 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=0, rtol=1e-6, grad_atol=0
ret.mean().backward()
for t, tt in zip(ts, tst):
np.testing.assert_allclose(t.grad, tt.cpu().grad.data, atol=grad_atol, rtol=grad_rtol)
np.testing.assert_allclose(
t.grad, tt.cpu().grad.data, atol=grad_atol, rtol=grad_rtol
)
# speed
torch_fp = timeit.Timer(functools.partial(torch_fxn, *ts)).timeit(5) * 1000/5
tinygrad_fp = timeit.Timer(functools.partial(tinygrad_fxn, *tst)).timeit(5) * 1000/5
torch_fp = timeit.Timer(functools.partial(torch_fxn, *ts)).timeit(5) * 1000 / 5
tinygrad_fp = timeit.Timer(functools.partial(tinygrad_fxn, *tst)).timeit(5) * 1000 / 5
if not forward_only:
torch_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), torch_fxn, ts)).timeit(5) * 1000/5
tinygrad_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), tinygrad_fxn, tst)).timeit(5) * 1000/5
torch_fbp = (
timeit.Timer(
functools.partial(lambda f, x: f(*x).mean().backward(), torch_fxn, ts)
).timeit(5)
* 1000
/ 5
)
tinygrad_fbp = (
timeit.Timer(
functools.partial(lambda f, x: f(*x).mean().backward(), tinygrad_fxn, tst)
).timeit(5)
* 1000
/ 5
)
else:
torch_fbp, tinygrad_fbp = np.nan, np.nan
print("testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms" % (shps, torch_fp, tinygrad_fp, torch_fbp-torch_fp, tinygrad_fbp-tinygrad_fp))
print(
"testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms"
% (shps, torch_fp, tinygrad_fp, torch_fbp - torch_fp, tinygrad_fbp - tinygrad_fp)
)
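
Aside: what helper_test_op verifies, as a minimal standalone sketch for one op — assuming tinygrad's Tensor exposes .relu(), .mean(), .backward(), .cpu(), and .data the way the tests above use them:

import numpy as np
import torch
from tinygrad.tensor import Tensor

# run the same op through torch and tinygrad, then compare outputs and gradients
torch.manual_seed(0)
t = torch.rand(4, 4, requires_grad=True)
tt = Tensor(t.detach().numpy())
out, tout = t.relu(), tt.relu()
np.testing.assert_allclose(out.detach().numpy(), tout.cpu().data, rtol=1e-6)
out.mean().backward()
tout.mean().backward()
np.testing.assert_allclose(t.grad, tt.cpu().grad.data, rtol=1e-6)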
class TestOps(unittest.TestCase):
device=Device.CPU
device = Device.CPU
def test_add(self):
helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add, device=self.device)
helper_test_op(
[(45, 65), (45, 65)], lambda x, y: x + y, Tensor.add, device=self.device
)
def test_sub(self):
helper_test_op([(45,65), (45,65)], lambda x,y: x-y, Tensor.sub, device=self.device)
helper_test_op(
[(45, 65), (45, 65)], lambda x, y: x - y, Tensor.sub, device=self.device
)
def test_mul(self):
helper_test_op([(45,65), (45,65)], lambda x,y: x*y, Tensor.mul, device=self.device)
helper_test_op(
[(45, 65), (45, 65)], lambda x, y: x * y, Tensor.mul, device=self.device
)
def test_div(self):
helper_test_op([(45,65), (45,65)], lambda x,y: x/y, Tensor.div, device=self.device)
helper_test_op(
[(45, 65), (45, 65)], lambda x, y: x / y, Tensor.div, device=self.device
)
def test_pow(self):
helper_test_op([(45,65), (45,65)], lambda x,y: x**y, Tensor.pow, device=self.device)
helper_test_op(
[(45, 65), (45, 65)], lambda x, y: x ** y, Tensor.pow, device=self.device
)
def test_sqrt(self):
helper_test_op([(45,65)], lambda x: x.sqrt(), Tensor.sqrt, device=self.device)
helper_test_op([(45, 65)], lambda x: x.sqrt(), Tensor.sqrt, device=self.device)
def test_relu(self):
helper_test_op([(45,65)], lambda x: x.relu(), Tensor.relu, device=self.device)
helper_test_op([(45, 65)], lambda x: x.relu(), Tensor.relu, device=self.device)
def test_leakyrelu(self):
helper_test_op([(45,65)], lambda x: torch.nn.functional.leaky_relu(x,0.01), Tensor.leakyrelu, device=self.device)
helper_test_op(
[(45, 65)],
lambda x: torch.nn.functional.leaky_relu(x, 0.01),
Tensor.leakyrelu,
device=self.device,
)
def test_abs(self):
helper_test_op([(45,65)], lambda x: torch.abs(x), Tensor.abs, device=self.device)
helper_test_op([(45, 65)], lambda x: torch.abs(x), Tensor.abs, device=self.device)
def test_sigmoid(self):
helper_test_op([(45,65)], lambda x: x.sigmoid(), Tensor.sigmoid, device=self.device)
helper_test_op(
[(45, 65)], lambda x: x.sigmoid(), Tensor.sigmoid, device=self.device
)
def test_dot(self):
helper_test_op([(45,65), (65,100)], lambda x,y: x.matmul(y), Tensor.dot, device=self.device)
helper_test_op(
[(45, 65), (65, 100)], lambda x, y: x.matmul(y), Tensor.dot, device=self.device
)
def test_sum(self):
helper_test_op([(45,3)], lambda x: x.sum(), Tensor.sum, device=self.device)
helper_test_op([(45, 3)], lambda x: x.sum(), Tensor.sum, device=self.device)
def test_sum_axis(self):
helper_test_op([(3,4,5,6)], lambda x: x.sum(axis=(1,2)), lambda x: Tensor.sum(x, axis=(1,2)), device=self.device)
helper_test_op(
[(3, 4, 5, 6)],
lambda x: x.sum(axis=(1, 2)),
lambda x: Tensor.sum(x, axis=(1, 2)),
device=self.device,
)
def test_mean_axis(self):
helper_test_op([(3,4,5,6)], lambda x: x.mean(axis=(1,2)), lambda x: Tensor.mean(x, axis=(1,2)), device=self.device)
helper_test_op(
[(3, 4, 5, 6)],
lambda x: x.mean(axis=(1, 2)),
lambda x: Tensor.mean(x, axis=(1, 2)),
device=self.device,
)
def test_logsoftmax(self):
helper_test_op([(45,65)], lambda x: torch.nn.LogSoftmax(dim=1)(x), Tensor.logsoftmax, atol=1e-7, grad_atol=1e-7, device=self.device)
helper_test_op(
[(45, 65)],
lambda x: torch.nn.LogSoftmax(dim=1)(x),
Tensor.logsoftmax,
atol=1e-7,
grad_atol=1e-7,
device=self.device,
)
def test_tanh(self):
helper_test_op([(45,65)], lambda x: x.tanh(), Tensor.tanh, atol=1e-6, grad_atol=1e-6, device=self.device)
helper_test_op(
[(45, 65)],
lambda x: x.tanh(),
Tensor.tanh,
atol=1e-6,
grad_atol=1e-6,
device=self.device,
)
def test_topo_sort(self):
helper_test_op([(45,65)], lambda x: (x+x)*x, lambda x: x.add(x).mul(x), atol=1e-6, grad_atol=1e-6, device=self.device)
helper_test_op(
[(45, 65)],
lambda x: (x + x) * x,
lambda x: x.add(x).mul(x),
atol=1e-6,
grad_atol=1e-6,
device=self.device,
)
def test_scalar_mul(self):
helper_test_op([(45,65)], lambda x: x*2, lambda x: x*2, device=self.device)
helper_test_op([(45, 65)], lambda x: x * 2, lambda x: x * 2, device=self.device)
def test_scalar_rmul(self):
helper_test_op([(45,65)], lambda x: 2*x, lambda x: 2*x, device=self.device)
helper_test_op([(45, 65)], lambda x: 2 * x, lambda x: 2 * x, device=self.device)
def test_scalar_sub(self):
helper_test_op([(45,65)], lambda x: x-2, lambda x: x-2, device=self.device)
helper_test_op([(45, 65)], lambda x: x - 2, lambda x: x - 2, device=self.device)
def test_scalar_rsub(self):
helper_test_op([(45,65)], lambda x: 2-x, lambda x: 2-x, device=self.device)
helper_test_op([(45, 65)], lambda x: 2 - x, lambda x: 2 - x, device=self.device)
def test_broadcast_full(self):
for torch_op, tinygrad_op in [(torch.add, Tensor.add), (torch.sub, Tensor.sub), (torch.mul, Tensor.mul),
(torch.div, Tensor.div), (torch.pow, Tensor.pow)]:
for shapes in [((5,13,24,16), (5,1,24,1)), ((1,3,1,7,1), (2,1,5,1,8))]:
for torch_op, tinygrad_op in [
(torch.add, Tensor.add),
(torch.sub, Tensor.sub),
(torch.mul, Tensor.mul),
(torch.div, Tensor.div),
(torch.pow, Tensor.pow),
]:
for shapes in [
((5, 13, 24, 16), (5, 1, 24, 1)),
((1, 3, 1, 7, 1), (2, 1, 5, 1, 8)),
]:
with self.subTest(op=torch_op.__name__, shapes=shapes):
helper_test_op(shapes, torch_op, tinygrad_op, device=self.device)
def test_broadcast_partial(self):
for torch_op, tinygrad_op in [(torch.add, Tensor.add), (torch.sub, Tensor.sub), (torch.mul, Tensor.mul),
(torch.div, Tensor.div), (torch.pow, Tensor.pow)]:
for shapes in [((1,32,32,32), (1,32,1,1)), ((5,13,24,16,2), (1,13,24,1,1)),
((4,1), (4,5)), ((1,4), (5,4))]:
for torch_op, tinygrad_op in [
(torch.add, Tensor.add),
(torch.sub, Tensor.sub),
(torch.mul, Tensor.mul),
(torch.div, Tensor.div),
(torch.pow, Tensor.pow),
]:
for shapes in [
((1, 32, 32, 32), (1, 32, 1, 1)),
((5, 13, 24, 16, 2), (1, 13, 24, 1, 1)),
((4, 1), (4, 5)),
((1, 4), (5, 4)),
]:
with self.subTest(op=torch_op.__name__, shapes=shapes):
# NOTE: backward pass questionable off-CPU (ANE?), so test forward only there
helper_test_op(shapes, torch_op, tinygrad_op, device=self.device, forward_only=self.device!=Device.CPU)
helper_test_op(
shapes,
torch_op,
tinygrad_op,
device=self.device,
forward_only=self.device != Device.CPU,
)
def test_pad2d(self):
helper_test_op([(3,3,3,3)], lambda x: torch.nn.functional.pad(x, (1,2,3,4)), lambda x: x.pad2d(padding=(1,2,3,4)), device=self.device)
helper_test_op(
[(3, 3, 3, 3)],
lambda x: torch.nn.functional.pad(x, (1, 2, 3, 4)),
lambda x: x.pad2d(padding=(1, 2, 3, 4)),
device=self.device,
)
def test_reshape(self):
helper_test_op([(4,3,6,6)], lambda x: torch.reshape(x, (-1,3,6,6)), lambda x: x.reshape(shape=(-1,3,6,6)), device=self.device)
helper_test_op([(4,3,6,6)], lambda x: torch.reshape(x, (-1,1,6,6)), lambda x: x.reshape(shape=(-1,1,6,6)), device=self.device)
helper_test_op(
[(4, 3, 6, 6)],
lambda x: torch.reshape(x, (-1, 3, 6, 6)),
lambda x: x.reshape(shape=(-1, 3, 6, 6)),
device=self.device,
)
helper_test_op(
[(4, 3, 6, 6)],
lambda x: torch.reshape(x, (-1, 1, 6, 6)),
lambda x: x.reshape(shape=(-1, 1, 6, 6)),
device=self.device,
)
def test_detach(self):
helper_test_op([(4,3,6,6)], lambda x: x.detach(), lambda x: x.detach(), device=self.device, forward_only=True)
helper_test_op(
[(4, 3, 6, 6)],
lambda x: x.detach(),
lambda x: x.detach(),
device=self.device,
forward_only=True,
)
def test_conv2d(self):
for bs in [1,8]:
for cin in [1,3]:
for groups in [1,3] if cin == 3 else [1]:
for H in [1,2,5]:
for W in [1,2,3,5]:
with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W):
helper_test_op([(bs,cin,11,28), (6,cin//groups,H,W)],
lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(),
lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), device=self.device, grad_rtol=1e-5)
for bs in [1, 8]:
for cin in [1, 3]:
for groups in [1, 3] if cin == 3 else [1]:
for H in [1, 2, 5]:
for W in [1, 2, 3, 5]:
with self.subTest(
batch_size=bs, channels=cin, groups=groups, height=H, width=W
):
helper_test_op(
[(bs, cin, 11, 28), (6, cin // groups, H, W)],
lambda x, w: torch.nn.functional.conv2d(x, w, groups=groups).relu(),
lambda x, w: Tensor.conv2d(x, w, groups=groups).relu(),
device=self.device,
grad_rtol=1e-5,
)
def test_strided_conv2d(self):
bs = 4
cin = 3
H,W = 3,3
H, W = 3, 3
with self.subTest(stride := 2):
helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
lambda x,w: torch.nn.functional.conv2d(x,w,stride=2).relu(),
lambda x,w: Tensor.conv2d(x,w,stride=stride).relu(), device=self.device)
with self.subTest(stride := (2,1)):
helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
lambda x,w: torch.nn.functional.conv2d(x,w,stride=stride).relu(),
lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), device=self.device)
helper_test_op(
[(bs, cin, 11, 28), (4, cin, H, W)],
lambda x, w: torch.nn.functional.conv2d(x, w, stride=2).relu(),
lambda x, w: Tensor.conv2d(x, w, stride=stride).relu(),
device=self.device,
)
with self.subTest(stride := (2, 1)):
helper_test_op(
[(bs, cin, 11, 28), (4, cin, H, W)],
lambda x, w: torch.nn.functional.conv2d(x, w, stride=stride).relu(),
lambda x, w: Tensor.conv2d(x, w, stride=(2, 1)).relu(),
device=self.device,
)
def test_maxpool2d(self):
for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1)]:
for ksz in [(2, 2), (3, 3), (3, 2), (5, 5), (5, 1)]:
with self.subTest(kernel_size=ksz):
helper_test_op([(32,2,110,28)],
helper_test_op(
[(32, 2, 110, 28)],
lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz),
lambda x: Tensor.max_pool2d(x, kernel_size=ksz), device=self.device)
lambda x: Tensor.max_pool2d(x, kernel_size=ksz),
device=self.device,
)
def test_avgpool2d(self):
shape = (32,2,111,28)
for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1), shape[2:]]:
shape = (32, 2, 111, 28)
for ksz in [(2, 2), (3, 3), (3, 2), (5, 5), (5, 1), shape[2:]]:
with self.subTest(kernel_size=ksz):
helper_test_op([shape],
helper_test_op(
[shape],
lambda x: torch.nn.functional.avg_pool2d(x, kernel_size=ksz),
lambda x: Tensor.avg_pool2d(x, kernel_size=ksz), device=self.device)
lambda x: Tensor.avg_pool2d(x, kernel_size=ksz),
device=self.device,
)
@unittest.skipUnless(GPU, "Requires GPU")
class TestOpsGPU(TestOps):
device=Device.GPU
device = Device.GPU
@unittest.skipUnless(ANE, "Requires ANE")
class TestOpsANE(TestOps):
device=Device.ANE
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main(verbosity=2)

View File

@@ -6,20 +6,24 @@ from tinygrad.optim import Adam, SGD, RMSprop
from extra.utils import get_parameters
from .config import ANE
x_init = np.random.randn(1,3).astype(np.float32)
W_init = np.random.randn(3,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)
x_init = np.random.randn(1, 3).astype(np.float32)
W_init = np.random.randn(3, 3).astype(np.float32)
m_init = np.random.randn(1, 3).astype(np.float32)
def step_tinygrad(optim, kwargs={}, device=Device.CPU):
net = TinyNet()
optim = optim([net.x, net.W], **kwargs)
if device==Device.GPU: [x.gpu_() for x in get_parameters([net, optim])]
elif device==Device.ANE: [x.ane_() for x in get_parameters([net, optim])]
if device == Device.GPU:
[x.gpu_() for x in get_parameters([net, optim])]
elif device == Device.ANE:
[x.ane_() for x in get_parameters([net, optim])]
out = net.forward()
out.backward()
optim.step()
return net.x.cpu().data, net.W.cpu().data
def step_pytorch(optim, kwargs={}):
net = TorchNet()
optim = optim([net.x, net.W], **kwargs)
@@ -29,7 +33,7 @@ def step_pytorch(optim, kwargs={}):
return net.x.detach().numpy(), net.W.detach().numpy()
class TinyNet():
class TinyNet:
def __init__(self):
self.x = Tensor(x_init.copy())
self.W = Tensor(W_init.copy())
@@ -42,7 +46,7 @@ class TinyNet():
return out
class TorchNet():
class TorchNet:
def __init__(self):
self.x = torch.tensor(x_init.copy(), requires_grad=True)
self.W = torch.tensor(W_init.copy(), requires_grad=True)
@@ -59,19 +63,23 @@ class TestOptim(unittest.TestCase):
device = Device.CPU
def test_adam(self):
for x,y in zip(step_tinygrad(Adam, device=self.device),
step_pytorch(torch.optim.Adam)):
for x, y in zip(
step_tinygrad(Adam, device=self.device), step_pytorch(torch.optim.Adam)
):
np.testing.assert_allclose(x, y, atol=1e-4)
def test_sgd(self):
for x,y in zip(step_tinygrad(SGD, kwargs={'lr': 0.001}, device=self.device),
step_pytorch(torch.optim.SGD, kwargs={'lr': 0.001})):
for x, y in zip(
step_tinygrad(SGD, kwargs={"lr": 0.001}, device=self.device),
step_pytorch(torch.optim.SGD, kwargs={"lr": 0.001}),
):
np.testing.assert_allclose(x, y, atol=1e-5)
def test_rmsprop(self):
for x,y in zip(step_tinygrad(RMSprop, kwargs={'lr': 0.001, 'decay': 0.99}, device=self.device),
step_pytorch(torch.optim.RMSprop,
kwargs={'lr': 0.001, 'alpha': 0.99})):
for x, y in zip(
step_tinygrad(RMSprop, kwargs={"lr": 0.001, "decay": 0.99}, device=self.device),
step_pytorch(torch.optim.RMSprop, kwargs={"lr": 0.001, "alpha": 0.99}),
):
np.testing.assert_allclose(x, y, atol=1e-5)
@@ -79,10 +87,11 @@ class TestOptim(unittest.TestCase):
class TestOptimGPU(TestOptim):
device = Device.GPU
@unittest.skipUnless(ANE, "Requires ANE")
class TestOptimANE(TestOptim):
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -6,11 +6,12 @@ from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
from .config import ANE
x_init = np.random.randn(1,3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(3,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)
x_init = np.random.randn(1, 3).astype(np.float32)
U_init = np.random.randn(3, 3).astype(np.float32)
V_init = np.random.randn(3, 3).astype(np.float32)
W_init = np.random.randn(3, 3).astype(np.float32)
m_init = np.random.randn(1, 3).astype(np.float32)
class TestTinygrad(unittest.TestCase):
device = Device.CPU
@@ -36,7 +37,7 @@ class TestTinygrad(unittest.TestCase):
out.backward()
return out.detach().numpy(), x.grad, W.grad
for x,y in zip(test_tinygrad(), test_pytorch()):
for x, y in zip(test_tinygrad(), test_pytorch()):
np.testing.assert_allclose(x, y, atol=1e-5)
def test_backward_pass_diamond_model(self):
@@ -64,7 +65,7 @@ class TestTinygrad(unittest.TestCase):
out.backward()
return out.detach().numpy(), u.grad, v.grad, w.grad
for x,y in zip(test_tinygrad(), test_pytorch()):
for x, y in zip(test_tinygrad(), test_pytorch()):
np.testing.assert_allclose(x, y, atol=1e-5)
def test_jacobian(self):
@@ -73,7 +74,9 @@ class TestTinygrad(unittest.TestCase):
torch_x = torch.tensor(x, requires_grad=True)
torch_W = torch.tensor(W, requires_grad=True)
torch_func = lambda x: torch.nn.functional.log_softmax(x.matmul(torch_W).relu(), dim=1)
torch_func = lambda x: torch.nn.functional.log_softmax(
x.matmul(torch_W).relu(), dim=1
)
PJ = torch.autograd.functional.jacobian(torch_func, torch_x).squeeze().numpy()
tiny_x = Tensor(x, device=self.device)
@@ -82,8 +85,8 @@ class TestTinygrad(unittest.TestCase):
J = jacobian(tiny_func, tiny_x)
NJ = numerical_jacobian(tiny_func, tiny_x)
np.testing.assert_allclose(PJ, J, atol = 1e-5)
np.testing.assert_allclose(PJ, NJ, atol = 1e-5)
np.testing.assert_allclose(PJ, J, atol=1e-5)
np.testing.assert_allclose(PJ, NJ, atol=1e-5)
def test_gradcheck(self):
W = np.random.RandomState(1337).random((10, 5))
@@ -96,7 +99,7 @@ class TestTinygrad(unittest.TestCase):
self.assertTrue(gradcheck(tiny_func, tiny_x))
# coarse approximation: with a "big" eps, the model's non-linearities make the finite-difference check fail
self.assertFalse(gradcheck(tiny_func, tiny_x, eps = 0.1))
self.assertFalse(gradcheck(tiny_func, tiny_x, eps=0.1))
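
The numerical side of jacobian/gradcheck boils down to finite differences; a generic central-difference sketch (not necessarily extra.gradcheck's exact implementation):

import numpy as np

def num_jacobian(f, x, eps=1e-6):
    # J[o, i] ~= d f(x)[o] / d x[i], estimated by central differences
    y = f(x)
    J = np.zeros((y.size, x.size))
    for i in range(x.size):
        d = np.zeros(x.size)
        d[i] = eps
        d = d.reshape(x.shape)
        J[:, i] = ((f(x + d) - f(x - d)) / (2 * eps)).ravel()
    return J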
@unittest.skipUnless(GPU, "Requires GPU")
@@ -104,14 +107,18 @@ class TestTinygradGPU(TestTinygrad):
device = Device.GPU
@unittest.skip("float64 not supported on GPU")
def test_jacobian(self): pass
def test_jacobian(self):
pass
@unittest.skip("float64 not supported on GPU")
def test_gradcheck(self): pass
def test_gradcheck(self):
pass
@unittest.skipUnless(ANE, "Requires ANE")
class TestOpsANE(TestTinygrad):
device=Device.ANE
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -1,23 +1,38 @@
from tinygrad.tensor import Tensor
class BatchNorm2D:
def __init__(self, sz, eps=1e-5, track_running_stats=False, training=False, momentum=0.1):
self.eps, self.track_running_stats, self.training, self.momentum = eps, track_running_stats, training, momentum
def __init__(
self, sz, eps=1e-5, track_running_stats=False, training=False, momentum=0.1
):
self.eps, self.track_running_stats, self.training, self.momentum = (
eps,
track_running_stats,
training,
momentum,
)
self.weight, self.bias = Tensor.ones(sz), Tensor.zeros(sz)
self.running_mean, self.running_var = Tensor.zeros(sz, requires_grad=False), Tensor.ones(sz, requires_grad=False)
self.running_mean, self.running_var = (
Tensor.zeros(sz, requires_grad=False),
Tensor.ones(sz, requires_grad=False),
)
self.num_batches_tracked = Tensor.zeros(1, requires_grad=False)
def __call__(self, x):
if self.track_running_stats or self.training:
batch_mean = x.mean(axis=(0,2,3))
y = (x - batch_mean.reshape(shape=[1, -1, 1, 1]))
batch_var = (y*y).mean(axis=(0,2,3))
batch_mean = x.mean(axis=(0, 2, 3))
y = x - batch_mean.reshape(shape=[1, -1, 1, 1])
batch_var = (y * y).mean(axis=(0, 2, 3))
if self.track_running_stats:
self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var
self.running_mean = (
1 - self.momentum
) * self.running_mean + self.momentum * batch_mean
self.running_var = (
1 - self.momentum
) * self.running_var + self.momentum * batch_var
self.num_batches_tracked += 1
if self.training:
@@ -26,6 +41,9 @@ class BatchNorm2D:
return self.normalize(x, self.running_mean, self.running_var)
def normalize(self, x, mean, var):
x = (x - mean.reshape(shape=[1, -1, 1, 1])) * self.weight.reshape(shape=[1, -1, 1, 1])
return x.div(var.add(self.eps).reshape(shape=[1, -1, 1, 1])**0.5) + self.bias.reshape(shape=[1, -1, 1, 1])
x = (x - mean.reshape(shape=[1, -1, 1, 1])) * self.weight.reshape(
shape=[1, -1, 1, 1]
)
return x.div(
var.add(self.eps).reshape(shape=[1, -1, 1, 1]) ** 0.5
) + self.bias.reshape(shape=[1, -1, 1, 1])
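
A NumPy sketch of what normalize computes for an (N, C, H, W) input, with per-channel statistics broadcast back via reshape to (1, C, 1, 1); weight and bias are at their defaults here:

import numpy as np

eps = 1e-5
x = np.random.randn(2, 3, 4, 4).astype(np.float32)
mean = x.mean(axis=(0, 2, 3))
y = x - mean.reshape(1, -1, 1, 1)
var = (y * y).mean(axis=(0, 2, 3))
weight, bias = np.ones(3, np.float32), np.zeros(3, np.float32)
out = y * weight.reshape(1, -1, 1, 1)
out = out / np.sqrt(var.reshape(1, -1, 1, 1) + eps) + bias.reshape(1, -1, 1, 1)
# out is ~zero-mean, unit-variance per channel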

View File

@@ -2,19 +2,23 @@ from .tensor import Tensor, Function, register
from functools import lru_cache
import struct
@lru_cache
def compile_wrapper(ane, dat):
return ane.compile(dat)
def roundup(x, v):
return x + (v-x)%v
return x + (v - x) % v
def fill(dat, addrs, type, val, base=0x4000):
x = struct.pack(type, val)
for a in addrs:
dat[base+a:base+a+len(x)] = x
dat[base + a : base + a + len(x)] = x
return dat
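
What roundup and fill are doing, as a tiny standalone sketch — the offsets below are made up for illustration; the real calls above patch fixed fields in the hwx blob at base 0x4000:

import struct

# roundup(x, v) rounds x up to the next multiple of v, e.g. roundup(5, 4) == 8
dat = list(bytes(0x20))
val = struct.pack("<H", 1337)                  # little-endian uint16
for a in (0x04, 0x10):                         # hypothetical offsets, base omitted
    dat[a : a + len(val)] = val
assert bytes(dat)[0x04:0x06] == b"\x39\x05"    # 0x0539 == 1337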
@lru_cache
def compile_relu(ane, sz):
dat = list(open("ane/ops/relu.hwx", "rb").read())
@@ -22,16 +26,18 @@ def compile_relu(ane, sz):
# number of relus
dat = fill(dat, [0x128, 0x13C], "H", sz)
# number of engines? (max 0x100)
dat = fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8], "I", max(0x100, roundup(sz*2, 0x10)))
dat = fill(dat, [0x1EC, 0x1F0, 0x1F4, 0x1F8], "I", max(0x100, roundup(sz * 2, 0x10)))
# strides?
dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz*2, 0x40))
dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz * 2, 0x40))
return compile_wrapper(ane, bytes(dat))
class ReLU(Function):
@staticmethod
def forward(ctx, input):
ret = ctx.ane.tensor(input.shape)
ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret)
return ret
register('relu', ReLU, device=Tensor.ANE)
register("relu", ReLU, device=Tensor.ANE)

View File

@@ -5,44 +5,58 @@ from .tensor import Function, register
# ************* basic ops *************
def unbroadcast(out, in_sh):
# the adjoint of broadcast is sum: sum over every axis where 1 == in_sh[i] < out.shape[i]
sum_axis = tuple([i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1]) if in_sh != (1,) else None
sum_axis = (
tuple([i for i in range(len(in_sh)) if in_sh[i] == 1 and out.shape[i] > 1])
if in_sh != (1,)
else None
)
return out.sum(axis=sum_axis).reshape(in_sh)
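
A quick demonstration of why this is the right adjoint — summing the gradient over the broadcast axes recovers the pre-broadcast shape:

import numpy as np

out = np.zeros((5, 13, 24, 16)) + np.ones((5, 1, 24, 1))    # broadcast add
grad = np.ones_like(out)                                     # upstream gradient
in_sh = (5, 1, 24, 1)
sum_axis = tuple(i for i in range(len(in_sh)) if in_sh[i] == 1 and grad.shape[i] > 1)
g = grad.sum(axis=sum_axis).reshape(in_sh)                   # back to (5, 1, 24, 1)
assert g.shape == in_sh and g[0, 0, 0, 0] == 13 * 16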
class Add(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x.shape, y.shape)
return x+y
return x + y
@staticmethod
def backward(ctx, grad_output):
shape_x, shape_y = ctx.saved_tensors
return unbroadcast(grad_output, shape_x), unbroadcast(grad_output, shape_y)
register('add', Add)
register("add", Add)
class Sub(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x.shape, y.shape)
return x-y
return x - y
@staticmethod
def backward(ctx, grad_output):
shape_x, shape_y = ctx.saved_tensors
return unbroadcast(grad_output, shape_x), unbroadcast(-grad_output, shape_y)
register('sub', Sub)
register("sub", Sub)
class Mul(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x, y)
return x*y
return x * y
@staticmethod
def backward(ctx, grad_output):
x,y = ctx.saved_tensors
return unbroadcast(y*grad_output, x.shape), unbroadcast(x*grad_output, y.shape)
register('mul', Mul)
x, y = ctx.saved_tensors
return unbroadcast(y * grad_output, x.shape), unbroadcast(x * grad_output, y.shape)
register("mul", Mul)
class Pow(Function):
@staticmethod
@@ -52,27 +66,37 @@ class Pow(Function):
@staticmethod
def backward(ctx, grad_output):
x,y = ctx.saved_tensors
return unbroadcast(y * (x**(y-1.0)) * grad_output, x.shape), \
unbroadcast((x**y) * np.log(x) * grad_output, y.shape)
register('pow', Pow)
x, y = ctx.saved_tensors
return unbroadcast(y * (x ** (y - 1.0)) * grad_output, x.shape), unbroadcast(
(x ** y) * np.log(x) * grad_output, y.shape
)
register("pow", Pow)
class Sum(Function):
@staticmethod
def forward(ctx, input,axis=None):
def forward(ctx, input, axis=None):
ctx.save_for_backward(input, axis)
return np.array([input.sum()]) if axis is None else input.sum(axis=axis)
@staticmethod
def backward(ctx, grad_output):
input, axis = ctx.saved_tensors
shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
shape = [
1 if axis is None or i in axis else input.shape[i]
for i in range(len(input.shape))
]
return grad_output.reshape(shape) + np.zeros_like(input)
register('sum', Sum)
register("sum", Sum)
# ************* GEMM *************
class Dot(Function):
@staticmethod
def forward(ctx, input, weight):
@@ -85,21 +109,27 @@ class Dot(Function):
grad_input = grad_output.dot(weight.T)
grad_weight = input.T.dot(grad_output)
return grad_input, grad_weight
register('dot', Dot)
register("dot", Dot)
# ************* simple ops *************
class Pad2D(Function):
@staticmethod
def forward(ctx, x, padding=None):
ctx.save_for_backward(padding)
return np.pad(x, ((0,0), (0,0), tuple(padding[2:4]), tuple(padding[0:2])))
return np.pad(x, ((0, 0), (0, 0), tuple(padding[2:4]), tuple(padding[0:2])))
@staticmethod
def backward(ctx, grad_output):
padding, = ctx.saved_tensors
return grad_output[..., padding[2]:-padding[3], padding[0]:-padding[1]]
register('pad2d', Pad2D)
(padding,) = ctx.saved_tensors
return grad_output[..., padding[2] : -padding[3], padding[0] : -padding[1]]
register("pad2d", Pad2D)
class Reshape(Function):
@staticmethod
@@ -109,13 +139,16 @@ class Reshape(Function):
@staticmethod
def backward(ctx, grad_output):
in_shape, = ctx.saved_tensors
(in_shape,) = ctx.saved_tensors
return grad_output.reshape(in_shape)
register('reshape', Reshape)
register("reshape", Reshape)
# ************* activation ops *************
class ReLU(Function):
@staticmethod
def forward(ctx, input):
@@ -124,123 +157,145 @@ class ReLU(Function):
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
(input,) = ctx.saved_tensors
return grad_output * (input >= 0)
register('relu', ReLU)
register("relu", ReLU)
def _exp_normalize(x, axis=None):
y = np.exp(x - x.max(axis=axis, keepdims=True))
return y / y.sum(axis=axis, keepdims=True)
y = np.exp(x - x.max(axis=axis, keepdims=True))
return y / y.sum(axis=axis, keepdims=True)
class Sigmoid(Function):
@staticmethod
def forward(ctx, input):
with np.warnings.catch_warnings():
np.warnings.filterwarnings('ignore')
ret = np.where(input >= 0,
1/(1 + np.exp(-input)),
np.exp(input)/(1 + np.exp(input))
np.warnings.filterwarnings("ignore")
ret = np.where(
input >= 0, 1 / (1 + np.exp(-input)), np.exp(input) / (1 + np.exp(input))
)
ctx.save_for_backward(ret)
return ret
@staticmethod
def backward(ctx, grad_output):
ret, = ctx.saved_tensors
(ret,) = ctx.saved_tensors
return grad_output * (ret * (1 - ret))
register('sigmoid', Sigmoid)
register("sigmoid", Sigmoid)
class LogSoftmax(Function):
@staticmethod
def forward(ctx, input):
softmax = _exp_normalize(input, axis=1)
ctx.save_for_backward(softmax)
return np.log(softmax)
return np.log(softmax)
@staticmethod
def backward(ctx, grad_output):
softmax, = ctx.saved_tensors
return grad_output - grad_output.sum(axis=1, keepdims=True)*softmax
register('logsoftmax', LogSoftmax)
(softmax,) = ctx.saved_tensors
return grad_output - grad_output.sum(axis=1, keepdims=True) * softmax
register("logsoftmax", LogSoftmax)
# ************* conv ops *************
class Conv2D(Function):
@staticmethod
def forward(ctx, x, w, stride=1, groups=1):
if type(ctx.stride) == int:
ctx.stride = (ctx.stride, ctx.stride)
cout,cin,H,W = w.shape
ys,xs = ctx.stride
bs,cin_ = x.shape[0], x.shape[1]
oy,ox = (x.shape[2]-(H-ys))//ys, (x.shape[3]-(W-xs))//xs
assert cin*ctx.groups == cin_
cout, cin, H, W = w.shape
ys, xs = ctx.stride
bs, cin_ = x.shape[0], x.shape[1]
oy, ox = (x.shape[2] - (H - ys)) // ys, (x.shape[3] - (W - xs)) // xs
assert cin * ctx.groups == cin_
assert cout % ctx.groups == 0
rcout = cout//ctx.groups
rcout = cout // ctx.groups
gx = x.reshape(bs,ctx.groups,cin,x.shape[2],x.shape[3])
tx = np.lib.stride_tricks.as_strided(gx,
gx = x.reshape(bs, ctx.groups, cin, x.shape[2], x.shape[3])
tx = np.lib.stride_tricks.as_strided(
gx,
shape=(bs, ctx.groups, cin, oy, ox, H, W),
strides=(*gx.strides[0:3], gx.strides[3]*ys, gx.strides[4]*xs, *gx.strides[3:5]),
strides=(
*gx.strides[0:3],
gx.strides[3] * ys,
gx.strides[4] * xs,
*gx.strides[3:5],
),
writeable=False,
)
tw = w.reshape(ctx.groups, rcout, cin, H, W)
ctx.save_for_backward(tx, tw, x.shape)
ret = np.zeros((bs,ctx.groups,oy,ox,rcout),dtype=x.dtype)
ret = np.zeros((bs, ctx.groups, oy, ox, rcout), dtype=x.dtype)
for g in range(ctx.groups):
#ijYXyx,kjyx -> iYXk ->ikYX
ret[:,g] += np.tensordot(tx[:,g], tw[g], ((1,4,5),(1,2,3)))
return np.moveaxis(ret,4,2).reshape(bs, cout, oy, ox)
# ijYXyx,kjyx -> iYXk ->ikYX
ret[:, g] += np.tensordot(tx[:, g], tw[g], ((1, 4, 5), (1, 2, 3)))
return np.moveaxis(ret, 4, 2).reshape(bs, cout, oy, ox)
@staticmethod
def backward(ctx, grad_output):
bs,_,oy,ox = grad_output.shape
bs, _, oy, ox = grad_output.shape
tx, tw, x_shape = ctx.saved_tensors
_,rcout,cin,H,W = tw.shape
ys,xs = ctx.stride
OY,OX = x_shape[2:4]
_, rcout, cin, H, W = tw.shape
ys, xs = ctx.stride
OY, OX = x_shape[2:4]
ggg = grad_output.reshape(bs,ctx.groups,rcout,oy,ox)
ggg = grad_output.reshape(bs, ctx.groups, rcout, oy, ox)
gdw = np.zeros((ctx.groups,rcout,cin,H,W), dtype=tx.dtype)
gdw = np.zeros((ctx.groups, rcout, cin, H, W), dtype=tx.dtype)
for g in range(ctx.groups):
#'ikYX,ijYXyx -> kjyx'
gdw[g] += np.tensordot(ggg[:,g], tx[:,g], ((0,2,3),(0,2,3)))
gdw[g] += np.tensordot(ggg[:, g], tx[:, g], ((0, 2, 3), (0, 2, 3)))
# needs to be optimized
gdx = np.zeros((bs,ctx.groups,cin,OY,OX), dtype=tx.dtype)
for k in range(oy*ox):
Y, X = k//ox, k%ox
iY,iX = Y*ys, X*xs
#gdx[:,:,: , iY:iY+H, iX:iX+W] += np.einsum('igk,gkjyx->igjyx', ggg[:,:,:,Y,X], tw)
gdx = np.zeros((bs, ctx.groups, cin, OY, OX), dtype=tx.dtype)
for k in range(oy * ox):
Y, X = k // ox, k % ox
iY, iX = Y * ys, X * xs
# gdx[:,:,: , iY:iY+H, iX:iX+W] += np.einsum('igk,gkjyx->igjyx', ggg[:,:,:,Y,X], tw)
for g in range(ctx.groups):
tg = np.dot(ggg[:,g,:,Y,X].reshape(bs, -1), tw[g].reshape(rcout, -1))
gdx[:, g, :, iY:iY+H, iX:iX+W] += tg.reshape((bs, cin, H, W))
tg = np.dot(ggg[:, g, :, Y, X].reshape(bs, -1), tw[g].reshape(rcout, -1))
gdx[:, g, :, iY : iY + H, iX : iX + W] += tg.reshape((bs, cin, H, W))
return gdx.reshape((bs, ctx.groups*cin, OY, OX)), gdw.reshape((ctx.groups*rcout, cin, H, W))
register('conv2d', Conv2D)
return gdx.reshape((bs, ctx.groups * cin, OY, OX)), gdw.reshape(
(ctx.groups * rcout, cin, H, W)
)
register("conv2d", Conv2D)
# ************* pooling ops *************
def stack_for_pool(x, py, px):
my, mx = (x.shape[2]//py)*py, (x.shape[3]//px)*px
my, mx = (x.shape[2] // py) * py, (x.shape[3] // px) * px
xup = x[:, :, :my, :mx]
stack = [xup[:, :, k//px::py, k%px::px][None] for k in range(py*px)]
stack = [xup[:, :, k // px :: py, k % px :: px][None] for k in range(py * px)]
return np.concatenate(stack, axis=0)
def unstack_for_pool(fxn, s, py, px):
my, mx = (s[2]//py)*py, (s[3]//px)*px
for k in range(py*px):
Y, X = k//px, k%px
ll = fxn(Y*px+X)
my, mx = (s[2] // py) * py, (s[3] // px) * px
for k in range(py * px):
Y, X = k // px, k % px
ll = fxn(Y * px + X)
if X == 0 and Y == 0:
ret = np.zeros(s, dtype=ll.dtype)
ret[:, :, Y:my:py, X:mx:px] = ll
return ret
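
How stack_for_pool makes pooling a single reduction: each of the py*px positions inside a window becomes one slice along a new leading axis, so max pooling is just .max(axis=0). A 4x4 example:

import numpy as np

x = np.arange(16).reshape(1, 1, 4, 4)
py = px = 2
my, mx = (x.shape[2] // py) * py, (x.shape[3] // px) * px
xup = x[:, :, :my, :mx]
stack = np.concatenate(
    [xup[:, :, k // px :: py, k % px :: px][None] for k in range(py * px)], axis=0
)
stack.max(axis=0)   # -> [[5, 7], [13, 15]]: the 2x2 max pool of x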
class MaxPool2D(Function):
@staticmethod
def forward(ctx, x, kernel_size=(2, 2)):
@@ -251,9 +306,14 @@ class MaxPool2D(Function):
@staticmethod
def backward(ctx, grad_output):
idxs,s = ctx.saved_tensors
return unstack_for_pool(lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size)
register('max_pool2d', MaxPool2D)
idxs, s = ctx.saved_tensors
return unstack_for_pool(
lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size
)
register("max_pool2d", MaxPool2D)
class AvgPool2D(Function):
@staticmethod
@@ -264,8 +324,9 @@ class AvgPool2D(Function):
@staticmethod
def backward(ctx, grad_output):
s, = ctx.saved_tensors
(s,) = ctx.saved_tensors
py, px = ctx.kernel_size
return unstack_for_pool(lambda idx: grad_output/py/px, s, py, px)
register('avg_pool2d', AvgPool2D)
return unstack_for_pool(lambda idx: grad_output / py / px, s, py, px)
register("avg_pool2d", AvgPool2D)

View File

@@ -3,131 +3,229 @@ from .tensor import Function, register, GPUBuffer, Tensor, Device
import pyopencl as cl
import functools
def buffer_new(ctx, shape, zero=False):
return GPUBuffer(shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32))
return GPUBuffer(
shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32)
)
@functools.lru_cache()
def clbuild(cl_ctx, name, prg):
return cl.Program(cl_ctx, prg).build().__getattr__(name)
def uint2(x, y):
return np.array((x,y), dtype=cl.cltypes.uint2)
return np.array((x, y), dtype=cl.cltypes.uint2)
i32 = np.int32
def subsample_op(ctx, input, kernel_size, stride, iter_op, result_op, decls=''):
def subsample_op(ctx, input, kernel_size, stride, iter_op, result_op, decls=""):
py, px = stride
N, C, Yin, Xin = input.shape
Yout, Xout = (Yin-kernel_size[0])//py+1, (Xin-kernel_size[1])//px+1
Yout, Xout = (Yin - kernel_size[0]) // py + 1, (Xin - kernel_size[1]) // px + 1
ret = buffer_new(ctx, (N, C, Yout, Xout), zero=True)
subsample = clbuild(ctx.cl_ctx, "subsample", """
subsample = clbuild(
ctx.cl_ctx,
"subsample",
"""
__kernel void subsample(__global float *output, __global const float *input, uint2 osize, uint2 isize,
uint2 ksz, uint2 stride) {
int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
"""+decls+""";
"""
+ decls
+ """;
for (uint j=0; j<ksz.y; ++j) {
for (uint i=0; i<ksz.x; ++i) {
int iid = (gid.x*stride.x+i) + isize.x*((gid.y*stride.y+j) + isize.y*gid.z);
if (gid.x*stride.x+i < isize.x && gid.y*stride.y+j < isize.y) {
"""+iter_op+""";
"""
+ iter_op
+ """;
}
}
}
output[oid] = """+result_op+""";
}""")
subsample(ctx.cl_queue, (N*C, Yout, Xout), None,
ret.cl, input.cl, uint2(Xout, Yout), uint2(Xin, Yin),
uint2(*kernel_size[::-1]), uint2(px, py))
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
output[oid] = """
+ result_op
+ """;
}""",
)
subsample(
ctx.cl_queue,
(N * C, Yout, Xout),
None,
ret.cl,
input.cl,
uint2(Xout, Yout),
uint2(Xin, Yin),
uint2(*kernel_size[::-1]),
uint2(px, py),
)
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
return ret
def supersample_op(ctx, input, out_shape, kernel_size, result_op, decls='', input2=None):
def supersample_op(
ctx, input, out_shape, kernel_size, result_op, decls="", input2=None
):
(N, C, Yin, Xin), (Yout, Xout) = input.shape, out_shape[2:]
py,px = kernel_size
py, px = kernel_size
ret = buffer_new(ctx, out_shape, zero=True)
supsample = clbuild(ctx.cl_ctx, "supsample", """
supsample = clbuild(
ctx.cl_ctx,
"supsample",
"""
__kernel void supsample(__global float *output, __global const float *input, __global const void *input2,
uint2 osize, uint2 isize, uint2 ksz) {
int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
int iid = (gid.x/ksz.x) + isize.x*((gid.y/ksz.y) + isize.y*gid.z);
"""+decls+""";
"""
+ decls
+ """;
if (gid.x/ksz.x < isize.x && gid.y/ksz.y < isize.y) {
output[oid] = """+result_op+""";
output[oid] = """
+ result_op
+ """;
}
}""")
supsample(ctx.cl_queue, (N*C, Yout, Xout), None,
ret.cl, input.cl, input2.cl if input2 is not None else input2,
uint2(Xout, Yout), uint2(Xin, Yin), uint2(px, py))
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
}""",
)
supsample(
ctx.cl_queue,
(N * C, Yout, Xout),
None,
ret.cl,
input.cl,
input2.cl if input2 is not None else input2,
uint2(Xout, Yout),
uint2(Xin, Yin),
uint2(px, py),
)
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
return ret
@functools.lru_cache()
def get_binop_prg(cl_ctx, code, complist):
ndims = len(complist)
args = "".join([", int d%d" % i for i in range(ndims)]) + "".join([", int p%d" % i for i in range(ndims-1)])
compute_idx_rets = ["\n int idx_ret"+str(i)+" = (gid0 / "+("p%d"%i if i < ndims-1 else "1")+") % d"+str(i)+";" for i in range(ndims)]
idx_exprs = ["0", "0"] # [idx_x, idx_y]
args = "".join([", int d%d" % i for i in range(ndims)]) + "".join(
[", int p%d" % i for i in range(ndims - 1)]
)
compute_idx_rets = [
"\n int idx_ret"
+ str(i)
+ " = (gid0 / "
+ ("p%d" % i if i < ndims - 1 else "1")
+ ") % d"
+ str(i)
+ ";"
for i in range(ndims)
]
idx_exprs = ["0", "0"] # [idx_x, idx_y]
for i in range(ndims):
for j in range(2):
if complist[i][j]:
idx_exprs[j] = "idx_ret%d + d%d*(%s)" % (i, i, idx_exprs[j])
return cl.Program(cl_ctx, """__kernel void binop(__global const float *x_g, __global const float *y_g, __global float *res_g"""+args+""") {
int gid0 = get_global_id(0);"""+"".join(compute_idx_rets)+"""
float a = x_g["""+idx_exprs[0]+"""];
float b = y_g["""+idx_exprs[1]+"""];
res_g[gid0] = """+code+""";\n}""").build()
return cl.Program(
cl_ctx,
"""__kernel void binop(__global const float *x_g, __global const float *y_g, __global float *res_g"""
+ args
+ """) {
int gid0 = get_global_id(0);"""
+ "".join(compute_idx_rets)
+ """
float a = x_g["""
+ idx_exprs[0]
+ """];
float b = y_g["""
+ idx_exprs[1]
+ """];
res_g[gid0] = """
+ code
+ """;\n}""",
).build()
def binary_op(ctx, code, x, y):
n_dims = max(len(x.shape), len(y.shape))
shape_x, shape_y = np.ones(n_dims, dtype=np.int32), np.ones(n_dims, dtype=np.int32)
shape_x[:len(x.shape)] = np.array(x.shape, dtype=np.int32)
shape_y[:len(y.shape)] = np.array(y.shape, dtype=np.int32)
shape_x[: len(x.shape)] = np.array(x.shape, dtype=np.int32)
shape_y[: len(y.shape)] = np.array(y.shape, dtype=np.int32)
if not np.all((shape_x == 1) | (shape_y == 1) | (shape_x == shape_y)):
raise Exception(f"binary op unbroadcastable shape mismatch: {x.shape} vs {y.shape}")
shape_ret = np.maximum(shape_x, shape_y)
dimlist, complist = [], [] # note: len(dimlist) may be less than n_dims
dimlist, complist = [], [] # note: len(dimlist) may be less than n_dims
def push(dim, comp):
if len(complist) > 0 and complist[-1] == comp:
dimlist[-1] *= dim
elif comp != (False, False):
dimlist.append(dim); complist.append(comp)
for i in range(n_dims): # group together any adjacent dimensions that we can to simplify broadcasting
dimlist.append(dim)
complist.append(comp)
for i in range(
n_dims
): # group together any adjacent dimensions that we can to simplify broadcasting
push(i32(max(shape_x[i], shape_y[i])), (shape_x[i] > 1, shape_y[i] > 1))
prg = get_binop_prg(ctx.cl_ctx, code, tuple(complist))
ret = buffer_new(ctx, shape_ret, zero=True)
prod_list = np.array(dimlist, dtype=i32)[-1::-1].cumprod(dtype=i32)[-1::-1] # take cumprod from back to front
prg.binop(ctx.cl_queue, [prod_list[0]] if len(dimlist) > 0 else [1], None, x.cl, y.cl, ret.cl, *dimlist, *(prod_list[1:]))
prod_list = np.array(dimlist, dtype=i32)[-1::-1].cumprod(dtype=i32)[
-1::-1
] # take cumprod from back to front
prg.binop(
ctx.cl_queue,
[prod_list[0]] if len(dimlist) > 0 else [1],
None,
x.cl,
y.cl,
ret.cl,
*dimlist,
*(prod_list[1:]),
)
return ret
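
The dimlist/complist logic above merges adjacent dimensions that broadcast the same way, which shrinks the index arithmetic the kernel has to do; a host-side sketch of that grouping:

shape_x, shape_y = (1, 32, 32, 32), (1, 32, 1, 1)
dimlist, complist = [], []
for sx, sy in zip(shape_x, shape_y):
    comp = (sx > 1, sy > 1)           # does each operand vary along this dim?
    if complist and complist[-1] == comp:
        dimlist[-1] *= max(sx, sy)    # same pattern as previous dim: merge
    elif comp != (False, False):
        dimlist.append(max(sx, sy))
        complist.append(comp)
# dimlist == [32, 1024], complist == [(True, True), (True, False)]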
def unary_op(ctx, code, x):
ret = buffer_new(ctx, x.shape)
unop = clbuild(ctx.cl_ctx, "unop", """
unop = clbuild(
ctx.cl_ctx,
"unop",
"""
__kernel void unop(__global const float *a_g, __global float *res_g) {
int gid = get_global_id(0);
float a = a_g[gid];
res_g[gid] = """+code+""";
}""")
res_g[gid] = """
+ code
+ """;
}""",
)
unop(ctx.cl_queue, [np.prod(ret.shape)], None, x.cl, ret.cl)
return ret
def reduce_op(ctx, code, code2, inp, axis=None):
if axis is None:
# full reduce
osize = [1]*len(inp.shape)
osize = [1] * len(inp.shape)
else:
osize = np.array(inp.shape)
osize[list(axis)] = 1
osize[list(axis)] = 1
ret = buffer_new(ctx, osize)
if axis is None:
ret.shape = (1,)
# TODO: this is insanely slow
reduce = clbuild(ctx.cl_ctx, "reduce", """
reduce = clbuild(
ctx.cl_ctx,
"reduce",
"""
__kernel void reduce(__global const float *a_g, int sz, __global float *res_g, int prod, int n_dims,
__global const int *shape_x, __global const int *shape_ret) {
int gid = get_global_id(0);
@@ -148,80 +246,129 @@ def reduce_op(ctx, code, code2, inp, axis=None):
}
}
float a = a_g[idx];
"""+code+""";
"""
+ code
+ """;
}
res_g[gid] = """+code2+""";
}""")
buffer_np = lambda x: cl.Buffer(ctx.cl_ctx,
cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x)
reduce(ctx.cl_queue, [np.prod(osize)], None, inp.cl,
i32(np.prod(inp.shape)//np.prod(osize)), ret.cl,
i32(np.prod(osize)), i32(len(osize)),
res_g[gid] = """
+ code2
+ """;
}""",
)
buffer_np = lambda x: cl.Buffer(
ctx.cl_ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x
)
reduce(
ctx.cl_queue,
[np.prod(osize)],
None,
inp.cl,
i32(np.prod(inp.shape) // np.prod(osize)),
ret.cl,
i32(np.prod(osize)),
i32(len(osize)),
buffer_np(np.array(inp.shape, dtype=np.int32)),
buffer_np(np.array(osize, dtype=np.int32)))
buffer_np(np.array(osize, dtype=np.int32)),
)
return ret
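
Semantically, reduce_op with code "out += a" and code2 "out" is a keepdims sum over the given axes (callers like Sum.forward drop the kept size-1 dims afterwards); the NumPy equivalent:

import numpy as np

x = np.random.randn(3, 4, 5).astype(np.float32)
axis = (1, 2)
osize = np.array(x.shape)
osize[list(axis)] = 1
ref = x.sum(axis=axis, keepdims=True)   # shape (3, 1, 1), matching osize
assert ref.shape == tuple(osize)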
def unbroadcast(ctx, out, in_sh):
sum_axis = [i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1] if in_sh != (1,) else None
sum_axis = (
[i for i in range(len(in_sh)) if in_sh[i] == 1 and out.shape[i] > 1]
if in_sh != (1,)
else None
)
return reduce_op(ctx, "out += a", "out", out, sum_axis)
# ***** now for the ops themselves *****
class Add(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x.shape, y.shape)
return binary_op(ctx, 'a+b', x, y)
return binary_op(ctx, "a+b", x, y)
@staticmethod
def backward(ctx, grad_output):
grad_x, grad_y = grad_output, grad_output
shape_x, shape_y = ctx.saved_tensors
return unbroadcast(ctx, grad_x, shape_x), unbroadcast(ctx, grad_y, shape_y),
register('add', Add, device=Device.GPU)
return (
unbroadcast(ctx, grad_x, shape_x),
unbroadcast(ctx, grad_y, shape_y),
)
register("add", Add, device=Device.GPU)
class Sub(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x.shape, y.shape)
return binary_op(ctx, 'a-b', x, y)
return binary_op(ctx, "a-b", x, y)
@staticmethod
def backward(ctx, grad_output):
grad_x, grad_y = grad_output, unary_op(ctx, '-a', grad_output)
grad_x, grad_y = grad_output, unary_op(ctx, "-a", grad_output)
shape_x, shape_y = ctx.saved_tensors
return unbroadcast(ctx, grad_x, shape_x), unbroadcast(ctx, grad_y, shape_y),
register('sub', Sub, device=Device.GPU)
return (
unbroadcast(ctx, grad_x, shape_x),
unbroadcast(ctx, grad_y, shape_y),
)
register("sub", Sub, device=Device.GPU)
class Mul(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x, y)
return binary_op(ctx, 'a*b', x, y)
return binary_op(ctx, "a*b", x, y)
@staticmethod
def backward(ctx, grad_output):
x,y = ctx.saved_tensors
grad_x = binary_op(ctx, 'a*b', y, grad_output)
grad_y = binary_op(ctx, 'a*b', x, grad_output)
return unbroadcast(ctx, grad_x, x.shape), unbroadcast(ctx, grad_y, y.shape),
register('mul', Mul, device=Device.GPU)
x, y = ctx.saved_tensors
grad_x = binary_op(ctx, "a*b", y, grad_output)
grad_y = binary_op(ctx, "a*b", x, grad_output)
return (
unbroadcast(ctx, grad_x, x.shape),
unbroadcast(ctx, grad_y, y.shape),
)
register("mul", Mul, device=Device.GPU)
class Pow(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x, y)
return binary_op(ctx, 'pow(a,b)', x, y)
return binary_op(ctx, "pow(a,b)", x, y)
@staticmethod
def backward(ctx, grad_output):
x,y = ctx.saved_tensors
grad_x = binary_op(ctx, 'a*b', grad_output,
binary_op(ctx, 'b * (pow((float)a, (float)(b-1.0)))', x, y))
grad_y = binary_op(ctx, 'a*b', grad_output,
binary_op(ctx, 'pow(a, (float)b) * log(a);', x, y))
return unbroadcast(ctx, grad_x, x.shape), unbroadcast(ctx, grad_y, y.shape),
register('pow', Pow, device=Device.GPU)
x, y = ctx.saved_tensors
grad_x = binary_op(
ctx,
"a*b",
grad_output,
binary_op(ctx, "b * (pow((float)a, (float)(b-1.0)))", x, y),
)
grad_y = binary_op(
ctx, "a*b", grad_output, binary_op(ctx, "pow(a, (float)b) * log(a);", x, y)
)
return (
unbroadcast(ctx, grad_x, x.shape),
unbroadcast(ctx, grad_y, y.shape),
)
register("pow", Pow, device=Device.GPU)
class Sum(Function):
@staticmethod
@@ -229,16 +376,24 @@ class Sum(Function):
ctx.save_for_backward(input, axis)
ret = reduce_op(ctx, "out += a", "out", input, axis=axis)
if axis is not None:
ret.shape = tuple([input.shape[i] for i in range(len(input.shape)) if i not in axis])
ret.shape = tuple(
[input.shape[i] for i in range(len(input.shape)) if i not in axis]
)
return ret
@staticmethod
def backward(ctx, grad_output):
input, axis = ctx.saved_tensors
shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
shape = [
1 if axis is None or i in axis else input.shape[i]
for i in range(len(input.shape))
]
output = GPUBuffer(shape, hostbuf=grad_output)
return binary_op(ctx, 'a+b', output, buffer_new(ctx, input.shape, zero=True))
register('sum', Sum, device=Device.GPU)
return binary_op(ctx, "a+b", output, buffer_new(ctx, input.shape, zero=True))
register("sum", Sum, device=Device.GPU)
class Dot(Function):
@staticmethod
@@ -247,7 +402,10 @@ class Dot(Function):
isize, msize, osize = i32(input.shape[0]), i32(input.shape[1]), i32(weight.shape[1])
ret = buffer_new(ctx, (isize, osize))
matmul = clbuild(ctx.cl_ctx, "matmul", """
matmul = clbuild(
ctx.cl_ctx,
"matmul",
"""
__kernel void matmul(
__global const float *input, __global const float *weight, __global float *res,
int is0, int is1, int msize, int ws0, int ws1, int osize
@@ -261,13 +419,25 @@ class Dot(Function):
}
res[X * osize + Y] = ret;
}""")
}""",
)
ctx.save_for_backward(input, weight, matmul)
# (isize,msize) x (msize,osize) = (isize,osize)
matmul(ctx.cl_queue, [isize, osize], None,
input.cl, weight.cl, ret.cl,
msize, i32(1), msize, i32(1), osize, osize)
matmul(
ctx.cl_queue,
[isize, osize],
None,
input.cl,
weight.cl,
ret.cl,
msize,
i32(1),
msize,
i32(1),
osize,
osize,
)
return ret
@staticmethod
@@ -279,28 +449,56 @@ class Dot(Function):
grad_weight = buffer_new(ctx, weight.shape)
# (isize,osize) x (msize,osize) = (isize,msize)
matmul(ctx.cl_queue, [isize, msize], None,
grad_output.cl, weight.cl, grad_input.cl,
osize, i32(1), osize, osize, i32(1), msize)
matmul(
ctx.cl_queue,
[isize, msize],
None,
grad_output.cl,
weight.cl,
grad_input.cl,
osize,
i32(1),
osize,
osize,
i32(1),
msize,
)
# (isize,msize) x (isize,osize) = (msize,osize)
matmul(ctx.cl_queue, [msize, osize], None,
input.cl, grad_output.cl, grad_weight.cl,
i32(1), msize, isize, i32(1), osize, osize)
matmul(
ctx.cl_queue,
[msize, osize],
None,
input.cl,
grad_output.cl,
grad_weight.cl,
i32(1),
msize,
isize,
i32(1),
osize,
osize,
)
return grad_input, grad_weight
register('dot', Dot, device=Device.GPU)
register("dot", Dot, device=Device.GPU)
# ************* simple ops *************
class Pad2D(Function):
@staticmethod
def forward(ctx, x, padding=None):
bs,cin,iy,ix = x.shape
oy,ox = iy+padding[2]+padding[3], ix+padding[0]+padding[1]
bs, cin, iy, ix = x.shape
oy, ox = iy + padding[2] + padding[3], ix + padding[0] + padding[1]
ret = buffer_new(ctx, (bs, cin, oy, ox), zero=True)
pad2d = clbuild(ctx.cl_ctx, "pad2d", """
pad2d = clbuild(
ctx.cl_ctx,
"pad2d",
"""
__kernel void pad2d(__global const float *input, __global float *output,
int ipx, int ipy, int py, int px, int oy, int ox, int iy, int ix) {
int BC = get_global_id(0);
@@ -311,13 +509,24 @@ class Pad2D(Function):
int optr = BC*oy*ox + (Y+py)*ox + px + X;
output[optr] = input[iptr];
}""")
}""",
)
ctx.save_for_backward(padding, pad2d)
pad2d(ctx.cl_queue, [bs*cin, iy, ix], None,
x.cl, ret.cl,
i32(0), i32(0), i32(padding[2]), i32(padding[0]),
i32(oy), i32(ox), i32(iy), i32(ix)
)
pad2d(
ctx.cl_queue,
[bs * cin, iy, ix],
None,
x.cl,
ret.cl,
i32(0),
i32(0),
i32(padding[2]),
i32(padding[0]),
i32(oy),
i32(ox),
i32(iy),
i32(ix),
)
return ret
@staticmethod
@@ -326,13 +535,26 @@ class Pad2D(Function):
bs, cin, iy, ix = grad_output.shape
oy, ox = iy - padding[2] - padding[3], ix - padding[0] - padding[1]
ret = buffer_new(ctx, (bs, cin, oy, ox))
pad2d(ctx.cl_queue, [bs*cin, oy, ox], None,
grad_output.cl, ret.cl,
i32(padding[2]), i32(padding[0]), i32(0), i32(0),
i32(oy), i32(ox), i32(iy), i32(ix)
)
pad2d(
ctx.cl_queue,
[bs * cin, oy, ox],
None,
grad_output.cl,
ret.cl,
i32(padding[2]),
i32(padding[0]),
i32(0),
i32(0),
i32(oy),
i32(ox),
i32(iy),
i32(ix),
)
return ret
register('pad2d', Pad2D, device=Device.GPU)
register("pad2d", Pad2D, device=Device.GPU)
class Reshape(Function):
@staticmethod
@@ -345,105 +567,152 @@ class Reshape(Function):
@staticmethod
def backward(ctx, grad_output):
in_shape, = ctx.saved_tensors
(in_shape,) = ctx.saved_tensors
return GPUBuffer(in_shape, hostbuf=grad_output)
register('reshape', Reshape, device=Device.GPU)
register("reshape", Reshape, device=Device.GPU)
# ************* activation ops *************
class ReLU(Function):
@staticmethod
def forward(ctx, input):
ctx.save_for_backward(input)
return unary_op(ctx, 'max(a, (float)0.)', input)
return unary_op(ctx, "max(a, (float)0.)", input)
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
return binary_op(ctx, 'a * (b >= 0)', grad_output, input)
register('relu', ReLU, device=Device.GPU)
(input,) = ctx.saved_tensors
return binary_op(ctx, "a * (b >= 0)", grad_output, input)
register("relu", ReLU, device=Device.GPU)
class Sigmoid(Function):
@staticmethod
def forward(ctx, input):
ret = unary_op(ctx, '1./(1+exp(-a))', input)
ret = unary_op(ctx, "1./(1+exp(-a))", input)
ctx.save_for_backward(ret)
return ret
@staticmethod
def backward(ctx, grad_output):
ret, = ctx.saved_tensors
return binary_op(ctx, 'a * (b * (1 - b));', grad_output, ret)
register('sigmoid', Sigmoid, device=Device.GPU)
(ret,) = ctx.saved_tensors
return binary_op(ctx, "a * (b * (1 - b));", grad_output, ret)
register("sigmoid", Sigmoid, device=Device.GPU)
class AvgPool2D(Function):
@staticmethod
def forward(ctx, input, kernel_size=(2, 2)):
ret = subsample_op(ctx, input, kernel_size, kernel_size, iter_op="sumval += input[iid]",
result_op="sumval / (ksz.x * ksz.y)", decls="float sumval=0.f")
ret = subsample_op(
ctx,
input,
kernel_size,
kernel_size,
iter_op="sumval += input[iid]",
result_op="sumval / (ksz.x * ksz.y)",
decls="float sumval=0.f",
)
ctx.save_for_backward(input.shape)
return ret
@staticmethod
def backward(ctx, grad_output):
orig_shape, = ctx.saved_tensors
return supersample_op(ctx, grad_output, orig_shape, ctx.kernel_size,
result_op="input[iid] / (ksz.x * ksz.y)")
register('avg_pool2d', AvgPool2D, device=Device.GPU)
(orig_shape,) = ctx.saved_tensors
return supersample_op(
ctx,
grad_output,
orig_shape,
ctx.kernel_size,
result_op="input[iid] / (ksz.x * ksz.y)",
)
register("avg_pool2d", AvgPool2D, device=Device.GPU)
class MaxPool2D(Function):
@staticmethod
def forward(ctx, input, kernel_size=(2, 2)):
idxs = subsample_op(ctx, input, kernel_size, kernel_size,
idxs = subsample_op(
ctx,
input,
kernel_size,
kernel_size,
iter_op="if (input[iid]>maxval) { maxval = input[iid]; maxidx = j * ksz.x + i; }",
result_op="(float)maxidx", decls="float maxval=-FLT_MAX; int maxidx=0")
result_op="(float)maxidx",
decls="float maxval=-FLT_MAX; int maxidx=0",
)
ctx.save_for_backward(idxs, input.shape)
return subsample_op(ctx, input, kernel_size, kernel_size,
return subsample_op(
ctx,
input,
kernel_size,
kernel_size,
iter_op="maxval = max(maxval, input[iid])",
result_op="maxval", decls="float maxval = -FLT_MAX")
result_op="maxval",
decls="float maxval = -FLT_MAX",
)
@staticmethod
def backward(ctx, grad_output):
idxs, orig_shape = ctx.saved_tensors
return supersample_op(ctx, grad_output, orig_shape, ctx.kernel_size,
return supersample_op(
ctx,
grad_output,
orig_shape,
ctx.kernel_size,
result_op="(maxidx == kernidx) * input[iid]",
decls="int maxidx=((__global float*)input2)[iid]; int kernidx=(gid.x%ksz.x) + ksz.x*(gid.y%ksz.y)",
input2=idxs)
register('max_pool2d', MaxPool2D, device=Device.GPU)
input2=idxs,
)
register("max_pool2d", MaxPool2D, device=Device.GPU)
class LogSoftmax(Function):
@staticmethod
def forward(ctx, input):
# TODO: stability?
lsum = reduce_op(ctx, "out += exp(a)", "log(out)", input, axis=[1])
output = binary_op(ctx, 'a-b', input, lsum)
output = binary_op(ctx, "a-b", input, lsum)
ctx.save_for_backward(output)
return output
@staticmethod
def backward(ctx, grad_output):
output, = ctx.saved_tensors
(output,) = ctx.saved_tensors
lsum = reduce_op(ctx, "out += a", "out", grad_output, axis=[1])
texp = binary_op(ctx, "exp(a) * b", output, lsum)
return binary_op(ctx, "a - b", grad_output, texp)
register('logsoftmax', LogSoftmax, device=Device.GPU)
register("logsoftmax", LogSoftmax, device=Device.GPU)
# ************* conv ops *************
class Conv2D(Function):
@staticmethod
def forward(ctx, x, w, stride=1, groups=1):
if type(ctx.stride) == int:
ctx.stride = (ctx.stride, ctx.stride)
cout,cin,H,W = w.shape
ys,xs = ctx.stride
bs,cin_,iy,ix = x.shape
oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs
assert cin*ctx.groups == cin_
cout, cin, H, W = w.shape
ys, xs = ctx.stride
bs, cin_, iy, ix = x.shape
oy, ox = (iy - (H - ys)) // ys, (ix - (W - xs)) // xs
assert cin * ctx.groups == cin_
assert cout % ctx.groups == 0
rcout = cout//ctx.groups
rcout = cout // ctx.groups
ctx.save_for_backward(x,w)
ctx.save_for_backward(x, w)
# output buffer
ret = buffer_new(ctx, (bs, cout, oy, ox))
@@ -452,7 +721,10 @@ class Conv2D(Function):
# weight = (groups, rcout, cin, H, W)
# output = (bs, groups, rcout, oy, ox)
conv = clbuild(ctx.cl_ctx, "conv", """
conv = clbuild(
ctx.cl_ctx,
"conv",
"""
__kernel void conv(__global const float *input, __global const float *weight, __global float *output,
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs) {
@@ -475,26 +747,41 @@ class Conv2D(Function):
}
}
output[B*groups*rcout*oy*ox + g*rcout*oy*ox + c*oy*ox + Y*ox + X] = acc;
}""")
}""",
)
conv(ctx.cl_queue, [bs*groups*rcout, oy, ox], None,
x.cl, w.cl, ret.cl,
i32(H), i32(W), i32(groups), i32(rcout), i32(cin),
i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs)
conv(
ctx.cl_queue,
[bs * groups * rcout, oy, ox],
None,
x.cl,
w.cl,
ret.cl,
i32(H),
i32(W),
i32(groups),
i32(rcout),
i32(cin),
i32(oy),
i32(ox),
i32(iy),
i32(ix),
i32(ys),
i32(xs),
)
return ret
@staticmethod
def backward(ctx, grad_output):
bs,_,oy,ox = grad_output.shape
bs, _, oy, ox = grad_output.shape
x, w = ctx.saved_tensors
cout,cin,H,W = w.shape
ys,xs = ctx.stride
bs,cin_,iy,ix = x.shape
oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs
assert cin*ctx.groups == cin_
cout, cin, H, W = w.shape
ys, xs = ctx.stride
bs, cin_, iy, ix = x.shape
oy, ox = (iy - (H - ys)) // ys, (ix - (W - xs)) // xs
assert cin * ctx.groups == cin_
assert cout % ctx.groups == 0
rcout = cout//ctx.groups
rcout = cout // ctx.groups
dx = buffer_new(ctx, (bs, cin_, iy, ix), zero=True)
dw = buffer_new(ctx, (cout, cin, H, W))
@@ -503,7 +790,10 @@ class Conv2D(Function):
# tensw = (groups*rcout, cin, H, W)
# ggg = (bs, groups*rout, oy, ox)
convw = clbuild(ctx.cl_ctx, "convw", """
convw = clbuild(
ctx.cl_ctx,
"convw",
"""
__kernel void convw(__global const float *tensx, __global const float *ggg, __global float *dw,
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) {
@@ -523,8 +813,12 @@ class Conv2D(Function):
}
}
dw[get_global_id(0)*H*W + y*W + x] = acc;
}""")
convx = clbuild(ctx.cl_ctx, "convx", """
}""",
)
convx = clbuild(
ctx.cl_ctx,
"convx",
"""
__kernel void convx(__global const float *tensw, __global const float *ggg, __global float *dx,
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) {
@@ -547,10 +841,36 @@ class Conv2D(Function):
}
}
}
""")
""",
)
conv_args = i32(H), i32(W), i32(ctx.groups), i32(rcout), i32(cin), i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs), i32(bs)
convw(ctx.cl_queue, [ctx.groups*rcout*cin, H, W], None, x.cl, grad_output.cl, dw.cl, *conv_args)
convx(ctx.cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args)
conv_args = (
i32(H),
i32(W),
i32(ctx.groups),
i32(rcout),
i32(cin),
i32(oy),
i32(ox),
i32(iy),
i32(ix),
i32(ys),
i32(xs),
i32(bs),
)
convw(
ctx.cl_queue,
[ctx.groups * rcout * cin, H, W],
None,
x.cl,
grad_output.cl,
dw.cl,
*conv_args,
)
convx(
ctx.cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args
)
return dx, dw
register('conv2d', Conv2D, device=Device.GPU)
register("conv2d", Conv2D, device=Device.GPU)

View File

@@ -3,6 +3,7 @@
import numpy as np
from tinygrad.tensor import Tensor
class Optimizer:
def __init__(self, params):
self.params = [x for x in params if x.requires_grad == True]
@@ -11,6 +12,7 @@ class Optimizer:
for param in self.params:
param.grad = None
class SGD(Optimizer):
def __init__(self, params, lr=0.001):
super().__init__(params)
@@ -20,30 +22,53 @@ class SGD(Optimizer):
for t in self.params:
t -= t.grad * self.lr
class RMSprop(Optimizer):
def __init__(self, params, lr=0.001, decay=0.9, eps=1e-8):
super().__init__(params)
self.lr, self.decay, self.eps = lr, decay, eps
self.v = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params]
self.v = [
Tensor(
np.zeros(t.shape, dtype=np.float32),
device=params[0].device,
requires_grad=False,
)
for t in self.params
]
def step(self):
for i, t in enumerate(self.params):
self.v[i] = self.decay * self.v[i] + (1.0 - self.decay) * t.grad * t.grad
t -= (t.grad * self.lr).div(self.v[i].sqrt() + self.eps)
class Adam(Optimizer):
def __init__(self, params, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
super().__init__(params)
self.lr, self.b1, self.b2, self.eps, self.t = lr, b1, b2, eps, 0
self.m = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params]
self.v = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params]
self.m = [
Tensor(
np.zeros(t.shape, dtype=np.float32),
device=params[0].device,
requires_grad=False,
)
for t in self.params
]
self.v = [
Tensor(
np.zeros(t.shape, dtype=np.float32),
device=params[0].device,
requires_grad=False,
)
for t in self.params
]
def step(self):
self.t = self.t + 1
a = self.lr * ((1.0 - self.b2**self.t)**0.5) / (1.0 - self.b1**self.t)
for i,t in enumerate(self.params):
a = self.lr * ((1.0 - self.b2 ** self.t) ** 0.5) / (1.0 - self.b1 ** self.t)
for i, t in enumerate(self.params):
self.m[i] = self.b1 * self.m[i] + (1.0 - self.b1) * t.grad
self.v[i] = self.b2 * self.v[i] + (1.0 - self.b2) * t.grad * t.grad
t -= a * self.m[i].div(self.v[i].sqrt() + self.eps)
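
A hand-computed first Adam step matching the code above; with t = 1 the bias correction folds into a, and the step comes out to roughly lr * sign(grad):

import numpy as np

lr, b1, b2, eps = 0.001, 0.9, 0.999, 1e-8
g = np.array([0.5, -1.0])     # gradient after the first backward pass
m = (1 - b1) * g              # m and v start at zero
v = (1 - b2) * g * g
t = 1
a = lr * ((1.0 - b2 ** t) ** 0.5) / (1.0 - b1 ** t)
step = a * m / (np.sqrt(v) + eps)   # -> approx [0.001, -0.001]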

View File

@@ -10,32 +10,43 @@ from collections import defaultdict
DEBUG = os.getenv("DEBUG", None) is not None
if DEBUG:
import atexit, time
debug_counts, debug_times = defaultdict(int), defaultdict(float)
def print_debug_exit():
for name, _ in sorted(debug_times.items(), key=lambda x: -x[1]):
print(f"{name:>20} : {debug_counts[name]:>6} {debug_times[name]:>10.2f} ms")
atexit.register(print_debug_exit)
class ProfileOp:
def __init__(self, name, x, backward=False):
self.name = ("back_" if backward else "")+name
self.name = ("back_" if backward else "") + name
self.x = x
def __enter__(self):
if DEBUG: self.st = time.time()
if DEBUG:
self.st = time.time()
def __exit__(self, *junk):
if DEBUG:
if cl_queue is not None:
cl_queue.finish()
et = (time.time()-self.st)*1000.
et = (time.time() - self.st) * 1000.0
debug_counts[self.name] += 1
debug_times[self.name] += et
print(f"{self.name:>20} : {et:>7.2f} ms {[y.shape for y in self.x]}")
# **** GPU functions ****
cl_ctx, cl_queue = None, None
def require_init_gpu():
if not GPU: raise Exception("No GPU Support, install pyopencl")
if not GPU:
raise Exception("No GPU Support, install pyopencl")
global cl_ctx, cl_queue
if cl_queue is None:
devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
@@ -45,28 +56,45 @@ def require_init_gpu():
# this is an in-order command queue
cl_queue = cl.CommandQueue(cl_ctx)
class GPUBuffer:
def __init__(self, shape, hostbuf=None):
self.shape, self.dtype = tuple(shape), np.float32
self.cl = hostbuf.cl if isinstance(hostbuf, GPUBuffer) else \
cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0), 4*np.prod(shape),
hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None)
self.cl = (
hostbuf.cl
if isinstance(hostbuf, GPUBuffer)
else cl.Buffer(
cl_ctx,
cl.mem_flags.READ_WRITE
| (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0),
4 * np.prod(shape),
hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None,
)
)
def __repr__(self):
return f"<GPUBuffer with shape {self.shape!r}>"
# **** ANE functions ****
ane = None
def require_init_ane():
global ane
if ane is None:
import ane.lib.ane, tinygrad.ops_ane
ane = ane.lib.ane.ANE()
# **** start with two base classes, Tensor and Function ****
class Device: CPU, GPU, ANE = 0, 1, 2
class Device:
CPU, GPU, ANE = 0, 1, 2
class Tensor:
did_float_warning = False
@@ -110,7 +138,12 @@ class Tensor:
@classmethod
def uniform(cls, *shape, **kwargs):
return cls((np.random.uniform(-1., 1., size=shape)/np.sqrt(np.prod(shape))).astype(np.float32), **kwargs)
return cls(
(np.random.uniform(-1.0, 1.0, size=shape) / np.sqrt(np.prod(shape))).astype(
np.float32
),
**kwargs,
)
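The divisor is the square root of the total element count, i.e. uniform(-1, 1) / sqrt(prod(shape)) — a Xavier-flavored init keyed to layer size. For example:

from tinygrad.tensor import Tensor

w = Tensor.uniform(784, 128)
print(w.shape)            # (784, 128)
print(abs(w.data).max())  # bounded by 1 / sqrt(784 * 128) ~= 0.00316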
@classmethod
def eye(cls, dim, **kwargs):
@@ -130,18 +163,21 @@ class Tensor:
# fill in the first grad with one
# this is "implicit gradient creation"
self.grad = Tensor(np.ones(self.shape, dtype=self.dtype), device=self.device, requires_grad=False)
self.grad = Tensor(
np.ones(self.shape, dtype=self.dtype), device=self.device, requires_grad=False
)
for t0 in reversed(self.deepwalk(set(), [])):
assert (t0.grad is not None)
assert t0.grad is not None
with ProfileOp(t0._ctx.__class__.__name__, [t0.grad], backward=True):
grads = t0._ctx.backward(t0._ctx, t0.grad.data)
if len(t0._ctx.parents) == 1:
grads = [grads]
for t,g in zip(t0._ctx.parents, grads):
for t, g in zip(t0._ctx.parents, grads):
if g is not None:
assert g.shape == t.shape, \
f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
assert (
g.shape == t.shape
), f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
gt = Tensor(g, device=self.device, requires_grad=False)
t.grad = gt if t.grad is None else (t.grad + gt)
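Concretely: backward() seeds the output's grad with ones, walks the graph in reverse topological order via deepwalk, and accumulates into each parent's grad. A small check:

from tinygrad.tensor import Tensor

x = Tensor([[1.0, -2.0, 3.0]])
y = x.relu().sum()  # (1,)-shaped output
y.backward()        # implicit ones seed, then ReLU's mask flows back
print(x.grad.data)  # [[1. 0. 1.]]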
@@ -150,19 +186,21 @@ class Tensor:
@staticmethod
def _move_data(data, device):
if isinstance(data, GPUBuffer):
if device == Device.GPU: return data
if device == Device.GPU:
return data
old = data
data = np.empty(old.shape, dtype=np.float32)
with ProfileOp("toCPU", [data]):
cl.enqueue_copy(cl_queue, data, old.cl, is_blocking=True)
cl.enqueue_copy(cl_queue, data, old.cl, is_blocking=True)
elif "ANETensor" in str(type(data)):
if device == Device.ANE: return data
if device == Device.ANE:
return data
with ProfileOp("toCPU", [data]):
data = data.data().astype(np.float32)
data = data.data().astype(np.float32)
if not isinstance(data, np.ndarray):
data = np.array(data, dtype=np.float32)
data = np.array(data, dtype=np.float32)
if data.dtype != np.float32 and not Tensor.did_float_warning:
# warning? float64 is actually needed for numerical jacobian
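Both the copying and in-place movers (defined just below) funnel through _move_data; only whether the Tensor object is reused differs. A sketch, assuming pyopencl is available:

from tinygrad.tensor import Tensor

t = Tensor([[1.0, 2.0]])
g = t.gpu()      # copy: new Tensor backed by a GPUBuffer
print(g.is_gpu)  # True
t.gpu_()         # in place: t.data replaced, and t.grad (if set) follows
back = g.cpu()   # toCPU path above: blocking enqueue_copy into a fresh ndarray
print(back.data)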
@@ -184,14 +222,17 @@ class Tensor:
def to_(self, device):
self.data, self.device = self._move_data(self.data, device), device
if self.grad: self.grad.to_(device)
if self.grad:
self.grad.to_(device)
def to(self, device):
ret = Tensor(self.data, device)
if self.grad: ret.grad = self.grad.to(device)
if self.grad:
ret.grad = self.grad.to(device)
return ret
def _is(self, device): return self.device == device
def _is(self, device):
return self.device == device
def detach(self):
return Tensor(self.data, device=self.device)
@@ -203,7 +244,7 @@ class Tensor:
def mean(self, axis=None):
out = self.sum(axis=axis)
coeff = np.prod(out.shape)/np.prod(self.shape)
coeff = np.prod(out.shape) / np.prod(self.shape)
return out * coeff
def sqrt(self):
@@ -219,15 +260,18 @@ class Tensor:
return 2.0 * ((2.0 * self).sigmoid()) - 1.0
def leakyrelu(self, neg_slope=0.01):
return self.relu() - (-neg_slope*self).relu()
return self.relu() - (-neg_slope * self).relu()
def dropout(self, p=0.5):
_mask = np.asarray(np.random.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype)
_mask = np.asarray(
np.random.binomial(1, 1.0 - p, size=self.shape), dtype=self.dtype
)
ret = self * Tensor(_mask, requires_grad=False, device=self.device)
return ret.div(1.0 - p)
def abs(self):
return self.relu() + (-1.0*self).relu()
return self.relu() + (-1.0 * self).relu()
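Both compose out of relu rather than getting their own ops: abs(x) = relu(x) + relu(-x), and dropout rescales survivors by 1 / (1 - p) so expected activations match eval time. For instance:

import numpy as np
from tinygrad.tensor import Tensor

t = Tensor(np.full((1, 4), 2.0, dtype=np.float32))
print(t.abs().data)           # [[2. 2. 2. 2.]]
print(t.dropout(p=0.5).data)  # entries are 0.0 or 4.0 (= 2.0 / (1 - 0.5)), mask random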
# An instantiation of the Function is the Context
class Function:
@@ -239,7 +283,7 @@ class Function:
self.saved_tensors.extend(x)
def apply(self, *x, **kwargs):
ctx = self(*x) # self - operation, i.e. 'add', 'sub', etc.
ctx = self(*x)  # self - operation, i.e. 'add', 'sub', etc.
# use default params
params = signature(self.forward).parameters
for p in params.values():
@@ -249,40 +293,66 @@ class Function:
for k, v in kwargs.items():
setattr(ctx, k, v)
with ProfileOp(ctx.__class__.__name__, x):
ret = Tensor(self.forward(ctx, *[t.data for t in x], **kwargs),
device=ctx.device, requires_grad=any([t.requires_grad for t in x]))
ret = Tensor(
self.forward(ctx, *[t.data for t in x], **kwargs),
device=ctx.device,
requires_grad=any([t.requires_grad for t in x]),
)
if ret.requires_grad:
ret._ctx = ctx
return ret
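Putting the pieces together: a hypothetical op (Square is not in the codebase) written against this machinery. forward/backward take ctx explicitly, matching how apply and Tensor.backward invoke them:

import numpy as np
from tinygrad.tensor import Function, Tensor, register

class Square(Function):
    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return input * input

    @staticmethod
    def backward(ctx, grad_output):
        (input,) = ctx.saved_tensors
        return 2.0 * input * grad_output

register("square", Square)  # Device.CPU by default; also adds Tensor.square
t = Tensor(np.array([[3.0]], dtype=np.float32))
out = t.square()
out.backward()
print(out.data, t.grad.data)  # [[9.]] [[6.]]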
def register(name, fxn, device=Device.CPU):
Tensor.ops[device][name] = fxn
def dispatch(*x, **kwargs):
tt = [arg for arg in x if isinstance(arg, Tensor)][0]
x = [Tensor(np.array([arg], dtype=tt.dtype), device=tt.device, requires_grad=False) if not isinstance(arg, Tensor) else arg for arg in x]
x = [
Tensor(np.array([arg], dtype=tt.dtype), device=tt.device, requires_grad=False)
if not isinstance(arg, Tensor)
else arg
for arg in x
]
f = (Tensor.ops[tt.device])[name]
f.cl_ctx, f.cl_queue, f.ane, f.device = cl_ctx, cl_queue, ane, tt.device
return f.apply(f, *x, **kwargs)
setattr(Tensor, name, dispatch)
# TODO: div is a second class op, so it doesn't work here
if name in ['add', 'sub', 'mul', 'pow']:
if name in ["add", "sub", "mul", "pow"]:
setattr(Tensor, f"__{name}__", dispatch)
setattr(Tensor, f"__i{name}__", lambda self,x: self.assign(dispatch(self,x)))
setattr(Tensor, f"__r{name}__", lambda self,x: dispatch(x,self))
setattr(Tensor, f"__i{name}__", lambda self, x: self.assign(dispatch(self, x)))
setattr(Tensor, f"__r{name}__", lambda self, x: dispatch(x, self))
for device in [device for device in Device.__dict__.keys() if device[0] != "_"]:
setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, Device.__dict__[device]))
setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, Device.__dict__[device]))
setattr(Tensor, f"is_{device.lower()}", property(functools.partialmethod(Tensor._is, Device.__dict__[device])))
setattr(
Tensor,
f"{device.lower()}",
functools.partialmethod(Tensor.to, Device.__dict__[device]),
)
setattr(
Tensor,
f"{device.lower()}_",
functools.partialmethod(Tensor.to_, Device.__dict__[device]),
)
setattr(
Tensor,
f"is_{device.lower()}",
property(functools.partialmethod(Tensor._is, Device.__dict__[device])),
)
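For each Device member this synthesizes three accessors — e.g. t.gpu(), t.gpu_(), and the t.is_gpu property. Quick check on CPU:

from tinygrad.tensor import Tensor

t = Tensor([[1.0]])
print(t.is_cpu, t.is_gpu)  # True False
c = t.cpu()                # new Tensor object; the ndarray is reused on a CPU->CPU move
print(c.is_cpu)            # True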
# this registers all the operations
import tinygrad.ops_cpu
try:
import pyopencl as cl
# TODO: move this import to require_init_gpu?
import tinygrad.ops_gpu
GPU = True
except ImportError:
# no GPU support
GPU = False
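Downstream code can branch on the resulting flag instead of re-attempting the import:

from tinygrad.tensor import GPU, Tensor

t = Tensor([[1.0]])
if GPU:  # pyopencl imported cleanly
    t = t.gpu()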