Mirror of https://github.com/tinygrad/tinygrad.git, synced 2026-01-09 15:08:02 -05:00
fixed pylint, formatted python files with cblack on localhost
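For reference, a minimal sketch of the local workflow the commit message describes, assuming cblack (a black fork that indents with 2 spaces) and pylint are installed from pip; the pylint flags are the ones added to the CI workflow in the diff below, while the exact cblack invocation is an assumption:

pip install cblack pylint
cblack .   # reformat the Python files in place (invocation assumed; check cblack --help)
python -m pylint --disable=all -e W0311 --jobs=0 --indent-string='  ' **/*.py   # report bad-indentation warnings only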
26  .github/workflows/test.yml  (vendored)
@@ -34,13 +34,21 @@ jobs:
run: python -m pytest -s -v

linter:
name: Indentation Linter
runs-on: ubuntu-latest
name: Indentation Linter
runs-on: ubuntu-latest

steps:
- name: Checkout Code
uses: actions/checkout@v2
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install pylint
- name: Lint with pylint
run: |
python -m pylint --disable=all -e W0311 --jobs=0 --indent-string='  ' **/*.py

steps:
- name: Checkout Code
uses: actions/checkout@v2
- name: Install PyLint
run: sudo apt-get install pylint
- name: Validate indentation is 2 lines
run: if [[ $(pylint --jobs=0 --indent-string='  ' * | grep "Bad indentation") ]]; then exit 1; fi

@@ -3,24 +3,34 @@ import numpy as np
import coremltools as ct
from coremltools.models.neural_network import datatypes, NeuralNetworkBuilder

input_features = [('image', datatypes.Array(3))]
output_features = [('probs', datatypes.Array(3))]
input_features = [("image", datatypes.Array(3))]
output_features = [("probs", datatypes.Array(3))]

weights = np.zeros((3, 3)) + 3
bias = np.ones(3)

builder = NeuralNetworkBuilder(input_features, output_features)
builder.add_inner_product(name='ip_layer', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='image', output_name='med')
#builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
#builder.add_elementwise(name='element', input_names=['med', 'med'], output_name='probs', mode='ADD')
builder.add_bias(name='bias', b=bias, input_name='med', output_name='probs', shape_bias=(3,))
#builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')
builder.add_inner_product(
name="ip_layer",
W=weights,
b=None,
input_channels=3,
output_channels=3,
has_bias=False,
input_name="image",
output_name="med",
)
# builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
# builder.add_elementwise(name='element', input_names=['med', 'med'], output_name='probs', mode='ADD')
builder.add_bias(
name="bias", b=bias, input_name="med", output_name="probs", shape_bias=(3,)
)
# builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')

# compile the spec
mlmodel = ct.models.MLModel(builder.spec)

# trigger the ANE!
out = mlmodel.predict({"image": np.array([1337,0,0], dtype=np.float32)})
out = mlmodel.predict({"image": np.array([1337, 0, 0], dtype=np.float32)})
print(out)
mlmodel.save('test.mlmodel')

mlmodel.save("test.mlmodel")

@@ -2,17 +2,21 @@
|
||||
import sys
|
||||
from hexdump import hexdump
|
||||
from macholib import MachO
|
||||
|
||||
|
||||
def get_macho(fn):
|
||||
# mod to make the header okay
|
||||
# MH_CIGAM_64 is good
|
||||
dat = open(fn, "rb").read()
|
||||
dat = b"\xcf\xfa\xed\xfe"+dat[4:]
|
||||
dat = b"\xcf\xfa\xed\xfe" + dat[4:]
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
with NamedTemporaryFile(delete=False) as f:
|
||||
f.write(dat)
|
||||
f.close()
|
||||
return MachO.MachO(f.name)
|
||||
|
||||
|
||||
a = get_macho("model.hwx")
|
||||
|
||||
# load commands
|
||||
@@ -21,14 +25,22 @@ for c in a.headers[0].commands:
|
||||
if c[0].cmd == 25:
|
||||
print(c[1])
|
||||
for section in c[2]:
|
||||
print(section.segname.strip(b'\0'), section.sectname.strip(b'\0'), hex(section.addr), hex(section.size), "@", hex(c[1].fileoff))
|
||||
#print(dir(section))
|
||||
print(
|
||||
section.segname.strip(b"\0"),
|
||||
section.sectname.strip(b"\0"),
|
||||
hex(section.addr),
|
||||
hex(section.size),
|
||||
"@",
|
||||
hex(c[1].fileoff),
|
||||
)
|
||||
# print(dir(section))
|
||||
if c[1].filesize > 0:
|
||||
hexdump(section.section_data)
|
||||
|
||||
# this parser is wrong (fixed with 64-bit one)
|
||||
from macholib import SymbolTable
|
||||
sym = SymbolTable.SymbolTable(a)
|
||||
|
||||
sym = SymbolTable.SymbolTable(a)
|
||||
|
||||
syms = {}
|
||||
for l in sym.nlists:
|
||||
@@ -36,64 +48,65 @@ for l in sym.nlists:
|
||||
if l[0].n_value != 0:
|
||||
syms[l[1]] = l[0].n_value
|
||||
|
||||
for k,v in syms.items():
|
||||
for k, v in syms.items():
|
||||
print(k, hex(v))
|
||||
|
||||
|
||||
|
||||
from termcolor import colored
|
||||
|
||||
|
||||
def compare(x, y):
|
||||
ss = []
|
||||
ln = []
|
||||
ln2 = []
|
||||
|
||||
ll = (max(len(x), len(y)) + 0xF)//0x10 * 0x10
|
||||
ll = (max(len(x), len(y)) + 0xF) // 0x10 * 0x10
|
||||
|
||||
highlight = False
|
||||
next_highlight = 0x2b
|
||||
for i in range(ll+1):
|
||||
next_highlight = 0x2B
|
||||
for i in range(ll + 1):
|
||||
if i == next_highlight:
|
||||
highlight = True
|
||||
if i < len(y):
|
||||
next_highlight += y[i]+8
|
||||
next_highlight += y[i] + 8
|
||||
else:
|
||||
next_highlight = None
|
||||
else:
|
||||
highlight = False
|
||||
a = "%02X" % x[i] if i < len(x) else "--", \
|
||||
"%02X" % y[i] if i < len(y) else "--"
|
||||
a = "%02X" % x[i] if i < len(x) else "--", "%02X" % y[i] if i < len(y) else "--"
|
||||
|
||||
def fj(x):
|
||||
ss = []
|
||||
for i in range(0, 0x10, 4):
|
||||
ss.append(' '.join(x[i:i+4]))
|
||||
return ' '.join(ss)
|
||||
ss.append(" ".join(x[i : i + 4]))
|
||||
return " ".join(ss)
|
||||
|
||||
if i!=0 and i%0x10 == 0:
|
||||
ss.append("%8X: " % (i-0x10)+fj(ln)+" | "+fj(ln2)+"\n")
|
||||
if i != 0 and i % 0x10 == 0:
|
||||
ss.append("%8X: " % (i - 0x10) + fj(ln) + " | " + fj(ln2) + "\n")
|
||||
ln = []
|
||||
ln2 = []
|
||||
if a[0] != a[1] and a[0] != "--" and a[1] != "--":
|
||||
ln.append(colored(a[0], 'green'))
|
||||
ln2.append(colored(a[1], 'red'))
|
||||
ln.append(colored(a[0], "green"))
|
||||
ln2.append(colored(a[1], "red"))
|
||||
else:
|
||||
if highlight:
|
||||
ln.append(colored(a[0], 'yellow'))
|
||||
ln2.append(colored(a[1], 'yellow'))
|
||||
ln.append(colored(a[0], "yellow"))
|
||||
ln2.append(colored(a[1], "yellow"))
|
||||
else:
|
||||
ln.append(a[0])
|
||||
ln2.append(a[1])
|
||||
return ''.join(ss)
|
||||
return "".join(ss)
|
||||
|
||||
|
||||
g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1])
|
||||
f1 = g.headers[0].commands[1][2][0].section_data
|
||||
f2 = a.headers[0].commands[1][2][0].section_data
|
||||
for i in range(0, len(f2), 0x300):
|
||||
print("===== op %d =====" % (i//0x300))
|
||||
print("===== op %d =====" % (i // 0x300))
|
||||
if len(f1) < 0x300:
|
||||
print(compare(f1, f2[i:i+0x300]))
|
||||
print(compare(f1, f2[i : i + 0x300]))
|
||||
else:
|
||||
print(compare(f1[i:i+0x300], f2[i:i+0x300]))
|
||||
|
||||
#open("/tmp/data.section", "wb").write(f2)
|
||||
#print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read()))
|
||||
print(compare(f1[i : i + 0x300], f2[i : i + 0x300]))
|
||||
|
||||
# open("/tmp/data.section", "wb").write(f2)
|
||||
# print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read()))
|
||||
|
||||
@@ -3,11 +3,12 @@ import os
|
||||
from ctypes import *
|
||||
import numpy as np
|
||||
import faulthandler
|
||||
|
||||
faulthandler.enable()
|
||||
|
||||
libane = cdll.LoadLibrary(os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)),
|
||||
"libane.dylib"))
|
||||
libane = cdll.LoadLibrary(
|
||||
os.path.join(os.path.dirname(os.path.abspath(__file__)), "libane.dylib")
|
||||
)
|
||||
|
||||
libane.ANE_Compile.argtypes = [c_char_p, c_int]
|
||||
libane.ANE_Compile.restype = c_void_p
|
||||
@@ -17,34 +18,36 @@ libane.ANE_TensorCreate.restype = c_void_p
|
||||
libane.ANE_TensorData.argtypes = [c_void_p]
|
||||
libane.ANE_TensorData.restype = POINTER(c_uint16)
|
||||
|
||||
libane.ANE_Run.argtypes = [c_void_p]*3
|
||||
libane.ANE_Run.argtypes = [c_void_p] * 3
|
||||
libane.ANE_Run.restype = c_int
|
||||
|
||||
|
||||
class ANETensor:
|
||||
def __init__(self, *shape):
|
||||
self.shape = shape
|
||||
self.dtype = np.float16
|
||||
self.sz = int(np.prod(shape))
|
||||
assert(self.sz <= 0x4000)
|
||||
assert self.sz <= 0x4000
|
||||
self.tt = libane.ANE_TensorCreate(self.sz, 1)
|
||||
assert(self.tt is not None)
|
||||
assert self.tt is not None
|
||||
|
||||
def data(self):
|
||||
data = libane.ANE_TensorData(self.tt)
|
||||
assert(data is not None)
|
||||
#print(hex(addressof(data.contents)))
|
||||
assert data is not None
|
||||
# print(hex(addressof(data.contents)))
|
||||
buf = np.ctypeslib.as_array(data, shape=(self.sz,))
|
||||
ret = np.frombuffer(buf, dtype=self.dtype)
|
||||
#print(ret.data)
|
||||
# print(ret.data)
|
||||
return ret
|
||||
|
||||
|
||||
class ANE:
|
||||
def __init__(self):
|
||||
libane.ANE_Open()
|
||||
|
||||
def compile(self, dat):
|
||||
ret = libane.ANE_Compile(create_string_buffer(dat), len(dat))
|
||||
assert(ret is not None)
|
||||
assert ret is not None
|
||||
return ret
|
||||
|
||||
def run(self, prog, tin, tout):
|
||||
@@ -53,6 +56,7 @@ class ANE:
|
||||
def tensor(self, shape):
|
||||
return ANETensor(shape)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ane = ANE()
|
||||
|
||||
@@ -62,7 +66,7 @@ if __name__ == "__main__":
|
||||
tind = tin.data()
|
||||
toutd = tout.data()
|
||||
|
||||
tind[0:4] = [-1,1,-2,2]
|
||||
tind[0:4] = [-1, 1, -2, 2]
|
||||
print(tind)
|
||||
print(toutd)
|
||||
|
||||
@@ -71,4 +75,3 @@ if __name__ == "__main__":
|
||||
|
||||
print(tind)
|
||||
print(toutd)
|
||||
|
||||
|
||||
@@ -17,8 +17,9 @@ for dev in ["CPU", "GPU", "ANE"]:
boaa = baa.relu()
et = time.time()
if i == 2:
print("%s can do at least %.2f MEGAReLUs/sec" % (dev, (np.prod(boaa.shape)/1e6)/(et-st)))
print(
"%s can do at least %.2f MEGAReLUs/sec"
% (dev, (np.prod(boaa.shape) / 1e6) / (et - st))
)
# decently reliable
assert(np.all(boaa.cpu().data >= 0))

assert np.all(boaa.cpu().data >= 0)

@@ -3,24 +3,29 @@
|
||||
# a rough copy of
|
||||
# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py
|
||||
import os
|
||||
|
||||
GPU = os.getenv("GPU", None) is not None
|
||||
import sys
|
||||
import io
|
||||
import time
|
||||
import numpy as np
|
||||
|
||||
np.set_printoptions(suppress=True)
|
||||
from tinygrad.tensor import Tensor
|
||||
from extra.utils import fetch, get_parameters
|
||||
from extra.efficientnet import EfficientNet
|
||||
|
||||
|
||||
def infer(model, img):
|
||||
# preprocess image
|
||||
aspect_ratio = img.size[0] / img.size[1]
|
||||
img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
|
||||
img = img.resize(
|
||||
(int(224 * max(aspect_ratio, 1.0)), int(224 * max(1.0 / aspect_ratio, 1.0)))
|
||||
)
|
||||
|
||||
img = np.array(img)
|
||||
y0,x0=(np.asarray(img.shape)[:2]-224)//2
|
||||
retimg = img = img[y0:y0+224, x0:x0+224]
|
||||
y0, x0 = (np.asarray(img.shape)[:2] - 224) // 2
|
||||
retimg = img = img[y0 : y0 + 224, x0 : x0 + 224]
|
||||
|
||||
# if you want to look at the image
|
||||
"""
|
||||
@@ -30,11 +35,11 @@ def infer(model, img):
|
||||
"""
|
||||
|
||||
# low level preprocess
|
||||
img = np.moveaxis(img, [2,0,1], [0,1,2])
|
||||
img = img.astype(np.float32)[:3].reshape(1,3,224,224)
|
||||
img = np.moveaxis(img, [2, 0, 1], [0, 1, 2])
|
||||
img = img.astype(np.float32)[:3].reshape(1, 3, 224, 224)
|
||||
img /= 255.0
|
||||
img -= np.array([0.485, 0.456, 0.406]).reshape((1,-1,1,1))
|
||||
img /= np.array([0.229, 0.224, 0.225]).reshape((1,-1,1,1))
|
||||
img -= np.array([0.485, 0.456, 0.406]).reshape((1, -1, 1, 1))
|
||||
img /= np.array([0.229, 0.224, 0.225]).reshape((1, -1, 1, 1))
|
||||
|
||||
# run the net
|
||||
if GPU:
|
||||
@@ -50,6 +55,7 @@ def infer(model, img):
|
||||
"""
|
||||
return out, retimg
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# instantiate my net
|
||||
model = EfficientNet(int(os.getenv("NUM", "0")))
|
||||
@@ -59,38 +65,42 @@ if __name__ == "__main__":
|
||||
|
||||
# category labels
|
||||
import ast
|
||||
lbls = fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt")
|
||||
lbls = ast.literal_eval(lbls.decode('utf-8'))
|
||||
|
||||
lbls = fetch(
|
||||
"https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt"
|
||||
)
|
||||
lbls = ast.literal_eval(lbls.decode("utf-8"))
|
||||
|
||||
# load image and preprocess
|
||||
from PIL import Image
|
||||
|
||||
url = sys.argv[1]
|
||||
if url == 'webcam':
|
||||
if url == "webcam":
|
||||
import cv2
|
||||
|
||||
cap = cv2.VideoCapture(0)
|
||||
cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
|
||||
while 1:
|
||||
_ = cap.grab() # discard one frame to circumvent capture buffering
|
||||
_ = cap.grab() # discard one frame to circumvent capture buffering
|
||||
ret, frame = cap.read()
|
||||
img = Image.fromarray(frame[:, :, [2,1,0]])
|
||||
img = Image.fromarray(frame[:, :, [2, 1, 0]])
|
||||
out, retimg = infer(model, img)
|
||||
print(np.argmax(out.data), np.max(out.data), lbls[np.argmax(out.data)])
|
||||
SCALE = 3
|
||||
simg = cv2.resize(retimg, (224*SCALE, 224*SCALE))
|
||||
simg = cv2.resize(retimg, (224 * SCALE, 224 * SCALE))
|
||||
retimg = cv2.cvtColor(simg, cv2.COLOR_RGB2BGR)
|
||||
cv2.imshow('capture', retimg)
|
||||
if cv2.waitKey(1) & 0xFF == ord('q'):
|
||||
cv2.imshow("capture", retimg)
|
||||
if cv2.waitKey(1) & 0xFF == ord("q"):
|
||||
break
|
||||
cap.release()
|
||||
cv2.destroyAllWindows()
|
||||
else:
|
||||
if url.startswith('http'):
|
||||
if url.startswith("http"):
|
||||
img = Image.open(io.BytesIO(fetch(url)))
|
||||
else:
|
||||
img = Image.open(url)
|
||||
st = time.time()
|
||||
out, _ = infer(model, img)
|
||||
print(np.argmax(out.data), np.max(out.data), lbls[np.argmax(out.data)])
|
||||
print("did inference in %.2f s" % (time.time()-st))
|
||||
#print("NOT", np.argmin(out.data), np.min(out.data), lbls[np.argmin(out.data)])
|
||||
|
||||
print("did inference in %.2f s" % (time.time() - st))
|
||||
# print("NOT", np.argmin(out.data), np.min(out.data), lbls[np.argmin(out.data)])
|
||||
|
||||
@@ -3,8 +3,9 @@ import os
|
||||
import sys
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
sys.path.append(os.getcwd())
|
||||
sys.path.append(os.path.join(os.getcwd(), 'test'))
|
||||
sys.path.append(os.path.join(os.getcwd(), "test"))
|
||||
|
||||
from tinygrad.tensor import Tensor, Function, register
|
||||
from extra.utils import get_parameters
|
||||
@@ -12,7 +13,10 @@ import tinygrad.optim as optim
|
||||
from test_mnist import X_train
|
||||
from torchvision.utils import make_grid, save_image
|
||||
import torch
|
||||
|
||||
GPU = os.getenv("GPU") is not None
|
||||
|
||||
|
||||
class LinearGen:
|
||||
def __init__(self):
|
||||
lv = 128
|
||||
@@ -28,6 +32,7 @@ class LinearGen:
|
||||
x = x.dot(self.l4).tanh()
|
||||
return x
|
||||
|
||||
|
||||
class LinearDisc:
|
||||
def __init__(self):
|
||||
in_sh = 784
|
||||
@@ -39,108 +44,112 @@ class LinearDisc:
|
||||
def forward(self, x, train=True):
|
||||
x = x.dot(self.l1).leakyrelu(0.2)
|
||||
if train:
|
||||
x = x.dropout(0.3)
|
||||
x = x.dropout(0.3)
|
||||
x = x.dot(self.l2).leakyrelu(0.2)
|
||||
if train:
|
||||
x = x.dropout(0.3)
|
||||
x = x.dropout(0.3)
|
||||
x = x.dot(self.l3).leakyrelu(0.2)
|
||||
if train:
|
||||
x = x.dropout(0.3)
|
||||
x = x.dropout(0.3)
|
||||
x = x.dot(self.l4).logsoftmax()
|
||||
return x
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
generator = LinearGen()
|
||||
discriminator = LinearDisc()
|
||||
batch_size = 512
|
||||
k = 1
|
||||
epochs = 300
|
||||
generator_params = get_parameters(generator)
|
||||
discriminator_params = get_parameters(discriminator)
|
||||
gen_loss = []
|
||||
disc_loss = []
|
||||
output_folder = "outputs"
|
||||
os.makedirs(output_folder, exist_ok=True)
|
||||
train_data_size = len(X_train)
|
||||
ds_noise = Tensor(np.random.randn(64,128).astype(np.float32), gpu=GPU, requires_grad=False)
|
||||
n_steps = int(train_data_size/batch_size)
|
||||
if GPU:
|
||||
[x.cuda_() for x in generator_params+discriminator_params]
|
||||
# optimizers
|
||||
optim_g = optim.Adam(generator_params,lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
|
||||
optim_d = optim.Adam(discriminator_params,lr=0.0002, b1=0.5)
|
||||
generator = LinearGen()
|
||||
discriminator = LinearDisc()
|
||||
batch_size = 512
|
||||
k = 1
|
||||
epochs = 300
|
||||
generator_params = get_parameters(generator)
|
||||
discriminator_params = get_parameters(discriminator)
|
||||
gen_loss = []
|
||||
disc_loss = []
|
||||
output_folder = "outputs"
|
||||
os.makedirs(output_folder, exist_ok=True)
|
||||
train_data_size = len(X_train)
|
||||
ds_noise = Tensor(
|
||||
np.random.randn(64, 128).astype(np.float32), gpu=GPU, requires_grad=False
|
||||
)
|
||||
n_steps = int(train_data_size / batch_size)
|
||||
if GPU:
|
||||
[x.cuda_() for x in generator_params + discriminator_params]
|
||||
# optimizers
|
||||
optim_g = optim.Adam(generator_params, lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
|
||||
optim_d = optim.Adam(discriminator_params, lr=0.0002, b1=0.5)
|
||||
|
||||
def regularization_l2(model, a=1e-4):
|
||||
#TODO: l2 reg loss
|
||||
pass
|
||||
def regularization_l2(model, a=1e-4):
|
||||
# TODO: l2 reg loss
|
||||
pass
|
||||
|
||||
def generator_batch():
|
||||
idx = np.random.randint(0, X_train.shape[0], size=(batch_size))
|
||||
image_b = X_train[idx].reshape(-1, 28*28).astype(np.float32)/255.
|
||||
image_b = (image_b - 0.5)/0.5
|
||||
return Tensor(image_b, gpu=GPU)
|
||||
def generator_batch():
|
||||
idx = np.random.randint(0, X_train.shape[0], size=(batch_size))
|
||||
image_b = X_train[idx].reshape(-1, 28 * 28).astype(np.float32) / 255.0
|
||||
image_b = (image_b - 0.5) / 0.5
|
||||
return Tensor(image_b, gpu=GPU)
|
||||
|
||||
def real_label(bs):
|
||||
y = np.zeros((bs,2), np.float32)
|
||||
y[range(bs), [1]*bs] = -2.0
|
||||
real_labels = Tensor(y, gpu=GPU)
|
||||
return real_labels
|
||||
def real_label(bs):
|
||||
y = np.zeros((bs, 2), np.float32)
|
||||
y[range(bs), [1] * bs] = -2.0
|
||||
real_labels = Tensor(y, gpu=GPU)
|
||||
return real_labels
|
||||
|
||||
def fake_label(bs):
|
||||
y = np.zeros((bs,2), np.float32)
|
||||
y[range(bs), [0]*bs] = -2.0 # Can we do label smoothin? i.e -2.0 changed to -1.98789.
|
||||
fake_labels = Tensor(y, gpu=GPU)
|
||||
return fake_labels
|
||||
def fake_label(bs):
|
||||
y = np.zeros((bs, 2), np.float32)
|
||||
y[
|
||||
range(bs), [0] * bs
|
||||
] = -2.0 # Can we do label smoothin? i.e -2.0 changed to -1.98789.
|
||||
fake_labels = Tensor(y, gpu=GPU)
|
||||
return fake_labels
|
||||
|
||||
def train_discriminator(optimizer, data_real, data_fake):
|
||||
real_labels = real_label(batch_size)
|
||||
fake_labels = fake_label(batch_size)
|
||||
def train_discriminator(optimizer, data_real, data_fake):
|
||||
real_labels = real_label(batch_size)
|
||||
fake_labels = fake_label(batch_size)
|
||||
|
||||
optimizer.zero_grad()
|
||||
optimizer.zero_grad()
|
||||
|
||||
output_real = discriminator.forward(data_real)
|
||||
loss_real = (output_real * real_labels).mean()
|
||||
output_real = discriminator.forward(data_real)
|
||||
loss_real = (output_real * real_labels).mean()
|
||||
|
||||
output_fake = discriminator.forward(data_fake)
|
||||
loss_fake = (output_fake * fake_labels).mean()
|
||||
output_fake = discriminator.forward(data_fake)
|
||||
loss_fake = (output_fake * fake_labels).mean()
|
||||
|
||||
loss_real.backward()
|
||||
loss_fake.backward()
|
||||
optimizer.step()
|
||||
return loss_real.cpu().data + loss_fake.cpu().data
|
||||
loss_real.backward()
|
||||
loss_fake.backward()
|
||||
optimizer.step()
|
||||
return loss_real.cpu().data + loss_fake.cpu().data
|
||||
|
||||
def train_generator(optimizer, data_fake):
|
||||
real_labels = real_label(batch_size)
|
||||
optimizer.zero_grad()
|
||||
output = discriminator.forward(data_fake)
|
||||
loss = (output * real_labels).mean()
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
return loss.cpu().data
|
||||
|
||||
for epoch in tqdm(range(epochs)):
|
||||
loss_g = 0.0
|
||||
loss_d = 0.0
|
||||
print(f"Epoch {epoch} of {epochs}")
|
||||
for i in tqdm(range(n_steps)):
|
||||
image = generator_batch()
|
||||
for step in range(k): # Try with k = 5 or 7.
|
||||
noise = Tensor(np.random.randn(batch_size,128), gpu=GPU)
|
||||
data_fake = generator.forward(noise).detach()
|
||||
data_real = image
|
||||
loss_d_step = train_discriminator(optim_d, data_real, data_fake)
|
||||
loss_d += loss_d_step
|
||||
noise = Tensor(np.random.randn(batch_size,128), gpu=GPU)
|
||||
data_fake = generator.forward(noise)
|
||||
loss_g_step = train_generator(optim_g, data_fake)
|
||||
loss_g += loss_g_step
|
||||
fake_images = generator.forward(ds_noise).detach().cpu().data
|
||||
fake_images = (fake_images.reshape(-1, 1, 28, 28)+ 1) / 2 # 0 - 1 range.
|
||||
fake_images = make_grid(torch.tensor(fake_images))
|
||||
save_image(fake_images, os.path.join(output_folder,f"image_{epoch}.jpg"))
|
||||
epoch_loss_g = loss_g / n_steps
|
||||
epoch_loss_d = loss_d / n_steps
|
||||
print(f"EPOCH: Generator loss: {epoch_loss_g}, Discriminator loss: {epoch_loss_d}")
|
||||
else:
|
||||
print("Training Completed!")
|
||||
def train_generator(optimizer, data_fake):
|
||||
real_labels = real_label(batch_size)
|
||||
optimizer.zero_grad()
|
||||
output = discriminator.forward(data_fake)
|
||||
loss = (output * real_labels).mean()
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
return loss.cpu().data
|
||||
|
||||
for epoch in tqdm(range(epochs)):
|
||||
loss_g = 0.0
|
||||
loss_d = 0.0
|
||||
print(f"Epoch {epoch} of {epochs}")
|
||||
for i in tqdm(range(n_steps)):
|
||||
image = generator_batch()
|
||||
for step in range(k): # Try with k = 5 or 7.
|
||||
noise = Tensor(np.random.randn(batch_size, 128), gpu=GPU)
|
||||
data_fake = generator.forward(noise).detach()
|
||||
data_real = image
|
||||
loss_d_step = train_discriminator(optim_d, data_real, data_fake)
|
||||
loss_d += loss_d_step
|
||||
noise = Tensor(np.random.randn(batch_size, 128), gpu=GPU)
|
||||
data_fake = generator.forward(noise)
|
||||
loss_g_step = train_generator(optim_g, data_fake)
|
||||
loss_g += loss_g_step
|
||||
fake_images = generator.forward(ds_noise).detach().cpu().data
|
||||
fake_images = (fake_images.reshape(-1, 1, 28, 28) + 1) / 2 # 0 - 1 range.
|
||||
fake_images = make_grid(torch.tensor(fake_images))
|
||||
save_image(fake_images, os.path.join(output_folder, f"image_{epoch}.jpg"))
|
||||
epoch_loss_g = loss_g / n_steps
|
||||
epoch_loss_d = loss_d / n_steps
|
||||
print(f"EPOCH: Generator loss: {epoch_loss_g}, Discriminator loss: {epoch_loss_d}")
|
||||
else:
|
||||
print("Training Completed!")
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
#!/usr/bin/env python
|
||||
#inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
|
||||
# inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.append(os.getcwd())
|
||||
sys.path.append(os.path.join(os.getcwd(), 'test'))
|
||||
sys.path.append(os.path.join(os.getcwd(), "test"))
|
||||
|
||||
import numpy as np
|
||||
from tinygrad.tensor import Tensor, GPU
|
||||
@@ -13,126 +14,139 @@ from test_mnist import fetch_mnist
|
||||
from extra.training import train, evaluate
|
||||
import tinygrad.optim as optim
|
||||
from extra.augment import augment_img
|
||||
|
||||
GPU = os.getenv("GPU", None) is not None
|
||||
QUICK = os.getenv("QUICK", None) is not None
|
||||
DEBUG = os.getenv("DEBUG", None) is not None
|
||||
|
||||
|
||||
class SqueezeExciteBlock2D:
|
||||
def __init__(self, filters):
|
||||
self.filters = filters
|
||||
self.weight1 = Tensor.uniform(self.filters, self.filters//32)
|
||||
self.bias1 = Tensor.uniform(1,self.filters//32)
|
||||
self.weight2 = Tensor.uniform(self.filters//32, self.filters)
|
||||
self.weight1 = Tensor.uniform(self.filters, self.filters // 32)
|
||||
self.bias1 = Tensor.uniform(1, self.filters // 32)
|
||||
self.weight2 = Tensor.uniform(self.filters // 32, self.filters)
|
||||
self.bias2 = Tensor.uniform(1, self.filters)
|
||||
|
||||
def __call__(self, input):
|
||||
se = input.avg_pool2d(kernel_size=(input.shape[2], input.shape[3])) #GlobalAveragePool2D
|
||||
se = input.avg_pool2d(
|
||||
kernel_size=(input.shape[2], input.shape[3])
|
||||
) # GlobalAveragePool2D
|
||||
se = se.reshape(shape=(-1, self.filters))
|
||||
se = se.dot(self.weight1) + self.bias1
|
||||
se = se.relu()
|
||||
se = se.relu()
|
||||
se = se.dot(self.weight2) + self.bias2
|
||||
se = se.sigmoid().reshape(shape=(-1,self.filters,1,1)) #for broadcasting
|
||||
se = se.sigmoid().reshape(shape=(-1, self.filters, 1, 1)) # for broadcasting
|
||||
se = input.mul(se)
|
||||
return se
|
||||
|
||||
|
||||
class ConvBlock:
|
||||
def __init__(self, h, w, inp, filters=128, conv=3):
|
||||
self.h, self.w = h, w
|
||||
self.inp = inp
|
||||
#init weights
|
||||
self.cweights = [Tensor.uniform(filters, inp if i==0 else filters, conv, conv) for i in range(3)]
|
||||
# init weights
|
||||
self.cweights = [
|
||||
Tensor.uniform(filters, inp if i == 0 else filters, conv, conv) for i in range(3)
|
||||
]
|
||||
self.cbiases = [Tensor.uniform(1, filters, 1, 1) for i in range(3)]
|
||||
#init layers
|
||||
# init layers
|
||||
self._bn = BatchNorm2D(128, training=True)
|
||||
self._seb = SqueezeExciteBlock2D(filters)
|
||||
|
||||
|
||||
def __call__(self, input):
|
||||
x = input.reshape(shape=(-1, self.inp, self.w, self.h))
|
||||
x = input.reshape(shape=(-1, self.inp, self.w, self.h))
|
||||
for cweight, cbias in zip(self.cweights, self.cbiases):
|
||||
x = x.pad2d(padding=[1,1,1,1]).conv2d(cweight).add(cbias).relu()
|
||||
x = x.pad2d(padding=[1, 1, 1, 1]).conv2d(cweight).add(cbias).relu()
|
||||
x = self._bn(x)
|
||||
x = self._seb(x)
|
||||
return x
|
||||
|
||||
|
||||
class BigConvNet:
|
||||
def __init__(self):
|
||||
self.conv = [ConvBlock(28,28,1), ConvBlock(28,28,128), ConvBlock(14,14,128)]
|
||||
self.weight1 = Tensor.uniform(128,10)
|
||||
self.weight2 = Tensor.uniform(128,10)
|
||||
self.conv = [ConvBlock(28, 28, 1), ConvBlock(28, 28, 128), ConvBlock(14, 14, 128)]
|
||||
self.weight1 = Tensor.uniform(128, 10)
|
||||
self.weight2 = Tensor.uniform(128, 10)
|
||||
|
||||
def parameters(self):
|
||||
if DEBUG: #keeping this for a moment
|
||||
if DEBUG: # keeping this for a moment
|
||||
pars = [par for par in get_parameters(self) if par.requires_grad]
|
||||
no_pars = 0
|
||||
for par in pars:
|
||||
print(par.shape)
|
||||
no_pars += np.prod(par.shape)
|
||||
print('no of parameters', no_pars)
|
||||
print("no of parameters", no_pars)
|
||||
return pars
|
||||
else:
|
||||
return get_parameters(self)
|
||||
|
||||
def save(self, filename):
|
||||
with open(filename+'.npy', 'wb') as f:
|
||||
with open(filename + ".npy", "wb") as f:
|
||||
for par in get_parameters(self):
|
||||
#if par.requires_grad:
|
||||
# if par.requires_grad:
|
||||
np.save(f, par.cpu().data)
|
||||
|
||||
def load(self, filename):
|
||||
with open(filename+'.npy', 'rb') as f:
|
||||
for par in get_parameters(self):
|
||||
#if par.requires_grad:
|
||||
with open(filename + ".npy", "rb") as f:
|
||||
for par in get_parameters(self):
|
||||
# if par.requires_grad:
|
||||
try:
|
||||
par.cpu().data[:] = np.load(f)
|
||||
if GPU:
|
||||
par.cuda()
|
||||
except:
|
||||
print('Could not load parameter')
|
||||
print("Could not load parameter")
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv[0](x)
|
||||
x = self.conv[1](x)
|
||||
x = x.avg_pool2d(kernel_size=(2,2))
|
||||
x = x.avg_pool2d(kernel_size=(2, 2))
|
||||
x = self.conv[2](x)
|
||||
x1 = x.avg_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
|
||||
x2 = x.max_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
|
||||
x1 = x.avg_pool2d(kernel_size=(14, 14)).reshape(shape=(-1, 128)) # global
|
||||
x2 = x.max_pool2d(kernel_size=(14, 14)).reshape(shape=(-1, 128)) # global
|
||||
xo = x1.dot(self.weight1) + x2.dot(self.weight2)
|
||||
return xo.logsoftmax()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
lrs = [1e-4, 1e-5] if QUICK else [1e-3, 1e-4, 1e-5, 1e-5]
|
||||
epochss = [2, 1] if QUICK else [13, 3, 3, 1]
|
||||
epochss = [2, 1] if QUICK else [13, 3, 3, 1]
|
||||
BS = 32
|
||||
|
||||
lmbd = 0.00025
|
||||
lossfn = lambda out,y: out.mul(y).mean() + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
|
||||
lossfn = (
|
||||
lambda out, y: out.mul(y).mean()
|
||||
+ lmbd * (model.weight1.abs() + model.weight2.abs()).sum()
|
||||
)
|
||||
X_train, Y_train, X_test, Y_test = fetch_mnist()
|
||||
steps = len(X_train)//BS
|
||||
steps = len(X_train) // BS
|
||||
np.random.seed(1337)
|
||||
if QUICK:
|
||||
steps = 1
|
||||
X_test, Y_test = X_test[:BS], Y_test[:BS]
|
||||
|
||||
|
||||
model = BigConvNet()
|
||||
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
try:
|
||||
model.load(sys.argv[1])
|
||||
print('Loaded weights "'+sys.argv[1]+'", evaluating...')
|
||||
print('Loaded weights "' + sys.argv[1] + '", evaluating...')
|
||||
evaluate(model, X_test, Y_test, BS=BS)
|
||||
except:
|
||||
print('could not load weights "'+sys.argv[1]+'".')
|
||||
|
||||
print('could not load weights "' + sys.argv[1] + '".')
|
||||
|
||||
if GPU:
|
||||
params = get_parameters(model)
|
||||
[x.cuda_() for x in params]
|
||||
|
||||
for lr, epochs in zip(lrs, epochss):
|
||||
optimizer = optim.Adam(model.parameters(), lr=lr)
|
||||
for epoch in range(1,epochs+1):
|
||||
#first epoch without augmentation
|
||||
for epoch in range(1, epochs + 1):
|
||||
# first epoch without augmentation
|
||||
X_aug = X_train if epoch == 1 else augment_img(X_train)
|
||||
train(model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, gpu=GPU, BS=BS)
|
||||
train(
|
||||
model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, gpu=GPU, BS=BS
|
||||
)
|
||||
accuracy = evaluate(model, X_test, Y_test, BS=BS)
|
||||
model.save('examples/checkpoint'+str("%.0f" % (accuracy*1.0e6)))
|
||||
model.save("examples/checkpoint" + str("%.0f" % (accuracy * 1.0e6)))
|
||||
|
||||
@@ -10,13 +10,14 @@ import io
|
||||
import tarfile
|
||||
import pickle
|
||||
|
||||
|
||||
class TinyConvNet:
|
||||
def __init__(self, classes=10):
|
||||
conv = 3
|
||||
inter_chan, out_chan = 8, 16 # for speed
|
||||
self.c1 = Tensor.uniform(inter_chan,3,conv,conv)
|
||||
self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
|
||||
self.l1 = Tensor.uniform(out_chan*6*6, classes)
|
||||
inter_chan, out_chan = 8, 16 # for speed
|
||||
self.c1 = Tensor.uniform(inter_chan, 3, conv, conv)
|
||||
self.c2 = Tensor.uniform(out_chan, inter_chan, conv, conv)
|
||||
self.l1 = Tensor.uniform(out_chan * 6 * 6, classes)
|
||||
|
||||
def forward(self, x):
|
||||
x = x.conv2d(self.c1).relu().max_pool2d()
|
||||
@@ -24,13 +25,20 @@ class TinyConvNet:
|
||||
x = x.reshape(shape=[x.shape[0], -1])
|
||||
return x.dot(self.l1).logsoftmax()
|
||||
|
||||
|
||||
def load_cifar():
|
||||
tt = tarfile.open(fileobj=io.BytesIO(fetch('https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')), mode='r:gz')
|
||||
db = pickle.load(tt.extractfile('cifar-10-batches-py/data_batch_1'), encoding="bytes")
|
||||
X = db[b'data'].reshape((-1, 3, 32, 32))
|
||||
Y = np.array(db[b'labels'])
|
||||
tt = tarfile.open(
|
||||
fileobj=io.BytesIO(
|
||||
fetch("https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")
|
||||
),
|
||||
mode="r:gz",
|
||||
)
|
||||
db = pickle.load(tt.extractfile("cifar-10-batches-py/data_batch_1"), encoding="bytes")
|
||||
X = db[b"data"].reshape((-1, 3, 32, 32))
|
||||
Y = np.array(db[b"labels"])
|
||||
return X, Y
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
X_train, Y_train = load_cifar()
|
||||
classes = 10
|
||||
@@ -40,7 +48,7 @@ if __name__ == "__main__":
|
||||
TRANSFER = os.getenv("TRANSFER") is not None
|
||||
if TINY:
|
||||
model = TinyConvNet(classes)
|
||||
elif TRANSFER:
|
||||
elif TRANSFER:
|
||||
model = EfficientNet(int(os.getenv("NUM", "0")), classes, has_se=True)
|
||||
model.load_weights_from_torch()
|
||||
else:
|
||||
@@ -50,21 +58,21 @@ if __name__ == "__main__":
|
||||
print("parameters", len(parameters))
|
||||
optimizer = optim.Adam(parameters, lr=0.001)
|
||||
|
||||
#BS, steps = 16, 32
|
||||
# BS, steps = 16, 32
|
||||
BS, steps = 64 if TINY else 16, 2048
|
||||
|
||||
for i in (t := trange(steps)):
|
||||
for i in (t := trange(steps)) :
|
||||
samp = np.random.randint(0, X_train.shape[0], size=(BS))
|
||||
|
||||
img = X_train[samp].astype(np.float32)
|
||||
|
||||
st = time.time()
|
||||
out = model.forward(Tensor(img))
|
||||
fp_time = (time.time()-st)*1000.0
|
||||
fp_time = (time.time() - st) * 1000.0
|
||||
|
||||
Y = Y_train[samp]
|
||||
y = np.zeros((BS,classes), np.float32)
|
||||
y[range(y.shape[0]),Y] = -classes
|
||||
y = np.zeros((BS, classes), np.float32)
|
||||
y[range(y.shape[0]), Y] = -classes
|
||||
y = Tensor(y)
|
||||
loss = out.logsoftmax().mul(y).mean()
|
||||
|
||||
@@ -72,25 +80,32 @@ if __name__ == "__main__":
|
||||
|
||||
st = time.time()
|
||||
loss.backward()
|
||||
bp_time = (time.time()-st)*1000.0
|
||||
bp_time = (time.time() - st) * 1000.0
|
||||
|
||||
st = time.time()
|
||||
optimizer.step()
|
||||
opt_time = (time.time()-st)*1000.0
|
||||
opt_time = (time.time() - st) * 1000.0
|
||||
|
||||
#print(out.cpu().data)
|
||||
# print(out.cpu().data)
|
||||
|
||||
st = time.time()
|
||||
loss = loss.cpu().data
|
||||
cat = np.argmax(out.cpu().data, axis=1)
|
||||
accuracy = (cat == Y).mean()
|
||||
finish_time = (time.time()-st)*1000.0
|
||||
finish_time = (time.time() - st) * 1000.0
|
||||
|
||||
# printing
|
||||
t.set_description("loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f" %
|
||||
(loss, accuracy,
|
||||
fp_time, bp_time, opt_time, finish_time,
|
||||
fp_time + bp_time + opt_time + finish_time))
|
||||
t.set_description(
|
||||
"loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f"
|
||||
% (
|
||||
loss,
|
||||
accuracy,
|
||||
fp_time,
|
||||
bp_time,
|
||||
opt_time,
|
||||
finish_time,
|
||||
fp_time + bp_time + opt_time + finish_time,
|
||||
)
|
||||
)
|
||||
|
||||
del out, y, loss
|
||||
|
||||
|
||||
@@ -3,9 +3,8 @@ import numpy as np
from tinygrad.tensor import Tensor
import time

a = Tensor([-2,-1,0,1,2]).ane()
a = Tensor([-2, -1, 0, 1, 2]).ane()
print(a.cpu())
b = a.relu()
print(b.cpu())
assert(np.all(b.cpu().data >= 0))

assert np.all(b.cpu().data >= 0)

@@ -2,39 +2,47 @@ import numpy as np
|
||||
from PIL import Image
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.append(os.getcwd())
|
||||
sys.path.append(os.path.join(os.getcwd(), 'test'))
|
||||
sys.path.append(os.path.join(os.getcwd(), "test"))
|
||||
from test_mnist import fetch_mnist
|
||||
from tqdm import trange
|
||||
|
||||
|
||||
def augment_img(X, rotate=10, px=3):
|
||||
Xaug = np.zeros_like(X)
|
||||
for i in trange(len(X)):
|
||||
im = Image.fromarray(X[i])
|
||||
im = im.rotate(np.random.randint(-rotate,rotate), resample=Image.BICUBIC)
|
||||
im = im.rotate(np.random.randint(-rotate, rotate), resample=Image.BICUBIC)
|
||||
w, h = X.shape[1:]
|
||||
#upper left, lower left, lower right, upper right
|
||||
quad = np.random.randint(-px,px,size=(8)) + np.array([0,0,0,h,w,h,w,0])
|
||||
# upper left, lower left, lower right, upper right
|
||||
quad = np.random.randint(-px, px, size=(8)) + np.array([0, 0, 0, h, w, h, w, 0])
|
||||
im = im.transform((w, h), Image.QUAD, quad, resample=Image.BICUBIC)
|
||||
Xaug[i] = im
|
||||
return Xaug
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from test_mnist import fetch_mnist
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
X_train, Y_train, X_test, Y_test = fetch_mnist()
|
||||
X = np.vstack([X_train[:1]]*10+[X_train[1:2]]*10)
|
||||
fig, a = plt.subplots(2,len(X))
|
||||
X = np.vstack([X_train[:1]] * 10 + [X_train[1:2]] * 10)
|
||||
fig, a = plt.subplots(2, len(X))
|
||||
Xaug = augment_img(X)
|
||||
for i in range(len(X)):
|
||||
a[0][i].imshow(X[i], cmap='gray')
|
||||
a[1][i].imshow(Xaug[i],cmap='gray')
|
||||
a[0][i].axis('off')
|
||||
a[1][i].axis('off')
|
||||
a[0][i].imshow(X[i], cmap="gray")
|
||||
a[1][i].imshow(Xaug[i], cmap="gray")
|
||||
a[0][i].axis("off")
|
||||
a[1][i].axis("off")
|
||||
plt.show()
|
||||
|
||||
#create some nice gifs for doc?!
|
||||
# create some nice gifs for doc?!
|
||||
for i in range(10):
|
||||
im = Image.fromarray(X_train[7353+i])
|
||||
im_aug = [Image.fromarray(x) for x in augment_img(np.array([X_train[7353+i]]*100))]
|
||||
im.save("aug"+str(i)+".gif", save_all=True, append_images=im_aug, duration=100, loop=0)
|
||||
im = Image.fromarray(X_train[7353 + i])
|
||||
im_aug = [
|
||||
Image.fromarray(x) for x in augment_img(np.array([X_train[7353 + i]] * 100))
|
||||
]
|
||||
im.save(
|
||||
"aug" + str(i) + ".gif", save_all=True, append_images=im_aug, duration=100, loop=0
|
||||
)
|
||||
|
||||
@@ -6,6 +6,7 @@ from extra.utils import fetch
|
||||
|
||||
USE_TORCH = False
|
||||
|
||||
|
||||
def fake_torch_load(b0):
|
||||
import io
|
||||
import pickle
|
||||
@@ -23,9 +24,9 @@ def fake_torch_load(b0):
|
||||
|
||||
class HackTensor:
|
||||
def __new__(cls, *args):
|
||||
#print(args)
|
||||
# print(args)
|
||||
ident, storage_type, obj_key, location, obj_size, view_metadata = args[0]
|
||||
assert ident == 'storage'
|
||||
assert ident == "storage"
|
||||
|
||||
ret = np.zeros(obj_size, dtype=storage_type)
|
||||
key_prelookup[obj_key] = (storage_type, obj_size, ret, args[2], args[3])
|
||||
@@ -33,10 +34,10 @@ def fake_torch_load(b0):
|
||||
|
||||
class MyPickle(pickle.Unpickler):
|
||||
def find_class(self, module, name):
|
||||
#print(module, name)
|
||||
if name == 'FloatStorage':
|
||||
# print(module, name)
|
||||
if name == "FloatStorage":
|
||||
return np.float32
|
||||
if name == 'LongStorage':
|
||||
if name == "LongStorage":
|
||||
return np.int64
|
||||
if module == "torch._utils" or module == "torch":
|
||||
return HackTensor
|
||||
@@ -51,7 +52,7 @@ def fake_torch_load(b0):
|
||||
# create key_lookup
|
||||
key_lookup = pickle.load(fb0)
|
||||
key_real = [None] * len(key_lookup)
|
||||
for k,v in key_prelookup.items():
|
||||
for k, v in key_prelookup.items():
|
||||
key_real[key_lookup.index(k)] = v
|
||||
|
||||
# read in the actual data
|
||||
@@ -64,13 +65,23 @@ def fake_torch_load(b0):
|
||||
np_array.shape = np_shape
|
||||
|
||||
# numpy stores its strides in bytes
|
||||
real_strides = tuple([x*bytes_size for x in np_strides])
|
||||
real_strides = tuple([x * bytes_size for x in np_strides])
|
||||
np_array.strides = real_strides
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
class MBConvBlock:
|
||||
def __init__(self, kernel_size, strides, expand_ratio, input_filters, output_filters, se_ratio, has_se):
|
||||
def __init__(
|
||||
self,
|
||||
kernel_size,
|
||||
strides,
|
||||
expand_ratio,
|
||||
input_filters,
|
||||
output_filters,
|
||||
se_ratio,
|
||||
has_se,
|
||||
):
|
||||
oup = expand_ratio * input_filters
|
||||
if expand_ratio != 1:
|
||||
self._expand_conv = Tensor.uniform(oup, input_filters, 1, 1)
|
||||
@@ -79,10 +90,10 @@ class MBConvBlock:
|
||||
self._expand_conv = None
|
||||
|
||||
self.strides = strides
|
||||
if strides == (2,2):
|
||||
self.pad = [(kernel_size-1)//2-1, (kernel_size-1)//2]*2
|
||||
if strides == (2, 2):
|
||||
self.pad = [(kernel_size - 1) // 2 - 1, (kernel_size - 1) // 2] * 2
|
||||
else:
|
||||
self.pad = [(kernel_size-1)//2]*4
|
||||
self.pad = [(kernel_size - 1) // 2] * 4
|
||||
|
||||
self._depthwise_conv = Tensor.uniform(oup, 1, kernel_size, kernel_size)
|
||||
self._bn1 = BatchNorm2D(oup)
|
||||
@@ -103,14 +114,22 @@ class MBConvBlock:
|
||||
if self._expand_conv:
|
||||
x = self._bn0(x.conv2d(self._expand_conv)).swish()
|
||||
x = x.pad2d(padding=self.pad)
|
||||
x = x.conv2d(self._depthwise_conv, stride=self.strides, groups=self._depthwise_conv.shape[0])
|
||||
x = x.conv2d(
|
||||
self._depthwise_conv, stride=self.strides, groups=self._depthwise_conv.shape[0]
|
||||
)
|
||||
x = self._bn1(x).swish()
|
||||
|
||||
# has_se
|
||||
if self.has_se:
|
||||
x_squeezed = x.avg_pool2d(kernel_size=x.shape[2:4])
|
||||
x_squeezed = x_squeezed.conv2d(self._se_reduce).add(self._se_reduce_bias.reshape(shape=[1, -1, 1, 1])).swish()
|
||||
x_squeezed = x_squeezed.conv2d(self._se_expand).add(self._se_expand_bias.reshape(shape=[1, -1, 1, 1]))
|
||||
x_squeezed = (
|
||||
x_squeezed.conv2d(self._se_reduce)
|
||||
.add(self._se_reduce_bias.reshape(shape=[1, -1, 1, 1]))
|
||||
.swish()
|
||||
)
|
||||
x_squeezed = x_squeezed.conv2d(self._se_expand).add(
|
||||
self._se_expand_bias.reshape(shape=[1, -1, 1, 1])
|
||||
)
|
||||
x = x.mul(x_squeezed.sigmoid())
|
||||
|
||||
x = self._bn2(x.conv2d(self._project_conv))
|
||||
@@ -118,21 +137,22 @@ class MBConvBlock:
|
||||
x = x.add(inputs)
|
||||
return x
|
||||
|
||||
|
||||
class EfficientNet:
|
||||
def __init__(self, number=0, classes=1000, has_se=True):
|
||||
self.number = number
|
||||
global_params = [
|
||||
# width, depth
|
||||
(1.0, 1.0), # b0
|
||||
(1.0, 1.1), # b1
|
||||
(1.1, 1.2), # b2
|
||||
(1.2, 1.4), # b3
|
||||
(1.4, 1.8), # b4
|
||||
(1.6, 2.2), # b5
|
||||
(1.8, 2.6), # b6
|
||||
(2.0, 3.1), # b7
|
||||
(2.2, 3.6), # b8
|
||||
(4.3, 5.3), # l2
|
||||
(1.0, 1.0), # b0
|
||||
(1.0, 1.1), # b1
|
||||
(1.1, 1.2), # b2
|
||||
(1.2, 1.4), # b3
|
||||
(1.4, 1.8), # b4
|
||||
(1.6, 2.2), # b5
|
||||
(1.8, 2.6), # b6
|
||||
(2.0, 3.1), # b7
|
||||
(2.2, 3.6), # b8
|
||||
(4.3, 5.3), # l2
|
||||
][number]
|
||||
|
||||
def round_filters(filters):
|
||||
@@ -140,7 +160,7 @@ class EfficientNet:
|
||||
divisor = 8
|
||||
filters *= multiplier
|
||||
new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
|
||||
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
|
||||
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
|
||||
new_filters += divisor
|
||||
return int(new_filters)
|
||||
|
||||
@@ -151,13 +171,13 @@ class EfficientNet:
|
||||
self._conv_stem = Tensor.uniform(out_channels, 3, 3, 3)
|
||||
self._bn0 = BatchNorm2D(out_channels)
|
||||
blocks_args = [
|
||||
[1, 3, (1,1), 1, 32, 16, 0.25],
|
||||
[2, 3, (2,2), 6, 16, 24, 0.25],
|
||||
[2, 5, (2,2), 6, 24, 40, 0.25],
|
||||
[3, 3, (2,2), 6, 40, 80, 0.25],
|
||||
[3, 5, (1,1), 6, 80, 112, 0.25],
|
||||
[4, 5, (2,2), 6, 112, 192, 0.25],
|
||||
[1, 3, (1,1), 6, 192, 320, 0.25],
|
||||
[1, 3, (1, 1), 1, 32, 16, 0.25],
|
||||
[2, 3, (2, 2), 6, 16, 24, 0.25],
|
||||
[2, 5, (2, 2), 6, 24, 40, 0.25],
|
||||
[3, 3, (2, 2), 6, 40, 80, 0.25],
|
||||
[3, 5, (1, 1), 6, 80, 112, 0.25],
|
||||
[4, 5, (2, 2), 6, 112, 192, 0.25],
|
||||
[1, 3, (1, 1), 6, 192, 320, 0.25],
|
||||
]
|
||||
self._blocks = []
|
||||
# num_repeats, kernel_size, strides, expand_ratio, input_filters, output_filters, se_ratio
|
||||
@@ -168,7 +188,7 @@ class EfficientNet:
|
||||
for n in range(round_repeats(b[0])):
|
||||
self._blocks.append(MBConvBlock(*args, has_se=has_se))
|
||||
args[3] = args[4]
|
||||
args[1] = (1,1)
|
||||
args[1] = (1, 1)
|
||||
|
||||
in_channels = round_filters(320)
|
||||
out_channels = round_filters(1280)
|
||||
@@ -178,43 +198,52 @@ class EfficientNet:
|
||||
self._fc_bias = Tensor.zeros(classes)
|
||||
|
||||
def forward(self, x):
|
||||
x = x.pad2d(padding=(0,1,0,1))
|
||||
x = x.pad2d(padding=(0, 1, 0, 1))
|
||||
x = self._bn0(x.conv2d(self._conv_stem, stride=2)).swish()
|
||||
#print(x.shape, x.data[:, 0, 0, 0])
|
||||
# print(x.shape, x.data[:, 0, 0, 0])
|
||||
for block in self._blocks:
|
||||
x = block(x)
|
||||
x = self._bn1(x.conv2d(self._conv_head)).swish()
|
||||
x = x.avg_pool2d(kernel_size=x.shape[2:4])
|
||||
x = x.reshape(shape=(-1, x.shape[1]))
|
||||
#x = x.dropout(0.2)
|
||||
return x.dot(self._fc).add(self._fc_bias.reshape(shape=[1,-1]))
|
||||
# x = x.dropout(0.2)
|
||||
return x.dot(self._fc).add(self._fc_bias.reshape(shape=[1, -1]))
|
||||
|
||||
def load_weights_from_torch(self):
|
||||
# load b0
|
||||
# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/utils.py#L551
|
||||
if self.number == 0:
|
||||
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth")
|
||||
b0 = fetch(
|
||||
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth"
|
||||
)
|
||||
elif self.number == 2:
|
||||
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth")
|
||||
b0 = fetch(
|
||||
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth"
|
||||
)
|
||||
elif self.number == 4:
|
||||
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth")
|
||||
b0 = fetch(
|
||||
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth"
|
||||
)
|
||||
elif self.number == 7:
|
||||
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth")
|
||||
b0 = fetch(
|
||||
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth"
|
||||
)
|
||||
else:
|
||||
raise Exception("no pretrained weights")
|
||||
|
||||
if USE_TORCH:
|
||||
import io
|
||||
import torch
|
||||
|
||||
b0 = torch.load(io.BytesIO(b0))
|
||||
else:
|
||||
b0 = fake_torch_load(b0)
|
||||
|
||||
for k,v in b0.items():
|
||||
if '_blocks.' in k:
|
||||
for k, v in b0.items():
|
||||
if "_blocks." in k:
|
||||
k = "%s[%s].%s" % tuple(k.split(".", 2))
|
||||
mk = "self."+k
|
||||
#print(k, v.shape)
|
||||
mk = "self." + k
|
||||
# print(k, v.shape)
|
||||
try:
|
||||
mv = eval(mk)
|
||||
except AttributeError:
|
||||
@@ -223,10 +252,9 @@ class EfficientNet:
|
||||
except AttributeError:
|
||||
mv = eval(mk.replace(".bias", "_bias"))
|
||||
vnp = v.numpy().astype(np.float32) if USE_TORCH else v
|
||||
vnp = vnp if k != '_fc.weight' else vnp.T
|
||||
vnp = vnp if k != "_fc.weight" else vnp.T
|
||||
|
||||
if mv.shape == vnp.shape or vnp.shape == ():
|
||||
mv.data[:] = vnp
|
||||
else:
|
||||
print("MISMATCH SHAPE IN %s, %r %r" % (k, mv.shape, vnp.shape))
|
||||
|
||||
|
||||
@@ -1,17 +1,19 @@
|
||||
import numpy as np
|
||||
from tinygrad.tensor import Tensor
|
||||
|
||||
def mask_like(like, mask_inx, mask_value = 1.0):
|
||||
|
||||
def mask_like(like, mask_inx, mask_value=1.0):
|
||||
mask = np.zeros_like(like).reshape(-1)
|
||||
mask[mask_inx] = mask_value
|
||||
return mask.reshape(like.shape)
|
||||
|
||||
|
||||
def jacobian(func, input):
|
||||
output = func(input)
|
||||
|
||||
ji = input.data.reshape(-1).shape[-1]
|
||||
jo = output.data.reshape(-1).shape[-1]
|
||||
J = np.zeros((jo,ji), dtype=np.float32)
|
||||
J = np.zeros((jo, ji), dtype=np.float32)
|
||||
|
||||
for o in range(jo):
|
||||
input.grad = None
|
||||
@@ -19,14 +21,15 @@ def jacobian(func, input):
|
||||
|
||||
# tinygrad doesn't support slicing, tiny-hack to select
|
||||
# the needed scalar an backpropagate only through it
|
||||
o_scalar = Tensor(mask_like(output.data, o, 1.)).mul(output).sum()
|
||||
o_scalar = Tensor(mask_like(output.data, o, 1.0)).mul(output).sum()
|
||||
o_scalar.backward()
|
||||
|
||||
for i, grad in enumerate(input.grad.data.reshape(-1)):
|
||||
J[o,i] = grad
|
||||
J[o, i] = grad
|
||||
return J
|
||||
|
||||
def numerical_jacobian(func, input, eps = 1e-6):
|
||||
|
||||
def numerical_jacobian(func, input, eps=1e-6):
|
||||
output = func(input)
|
||||
|
||||
ji = input.data.reshape(-1).shape[-1]
|
||||
@@ -34,17 +37,18 @@ def numerical_jacobian(func, input, eps = 1e-6):
|
||||
NJ = np.zeros((jo, ji), dtype=np.float32)
|
||||
|
||||
for i in range(ji):
|
||||
eps_perturb = mask_like(input.data, i, mask_value = eps)
|
||||
eps_perturb = mask_like(input.data, i, mask_value=eps)
|
||||
|
||||
output_perturb_add = func(Tensor(input.data + eps_perturb)).data.reshape(-1)
|
||||
output_perturb_sub = func(Tensor(input.data - eps_perturb)).data.reshape(-1)
|
||||
|
||||
grad_approx = ((output_perturb_add) - (output_perturb_sub)) / (2*eps)
|
||||
grad_approx = ((output_perturb_add) - (output_perturb_sub)) / (2 * eps)
|
||||
|
||||
NJ[:,i] = grad_approx
|
||||
NJ[:, i] = grad_approx
|
||||
return NJ
|
||||
|
||||
def gradcheck(func, input, eps = 1e-06, atol = 1e-5, rtol = 0.001):
|
||||
|
||||
def gradcheck(func, input, eps=1e-06, atol=1e-5, rtol=0.001):
|
||||
NJ = numerical_jacobian(func, input, eps)
|
||||
J = jacobian(func, input)
|
||||
return np.allclose(J, NJ, atol=atol, rtol=rtol)
|
||||
|
||||
@@ -4,26 +4,40 @@ from tqdm import trange
|
||||
from extra.utils import get_parameters
|
||||
from tinygrad.tensor import Tensor, GPU, Device
|
||||
|
||||
def train(model, X_train, Y_train, optim, steps, num_classes=None, BS=128, device=Device.CPU, lossfn = lambda out,y: out.mul(y).mean()):
|
||||
if device == Device.GPU: [x.gpu_() for x in get_parameters([model, optim])]
|
||||
elif device == Device.ANE: [x.ane_() for x in get_parameters([model, optim])]
|
||||
if num_classes is None: num_classes = Y_train.max().astype(int)+1
|
||||
|
||||
def train(
|
||||
model,
|
||||
X_train,
|
||||
Y_train,
|
||||
optim,
|
||||
steps,
|
||||
num_classes=None,
|
||||
BS=128,
|
||||
device=Device.CPU,
|
||||
lossfn=lambda out, y: out.mul(y).mean(),
|
||||
):
|
||||
if device == Device.GPU:
|
||||
[x.gpu_() for x in get_parameters([model, optim])]
|
||||
elif device == Device.ANE:
|
||||
[x.ane_() for x in get_parameters([model, optim])]
|
||||
if num_classes is None:
|
||||
num_classes = Y_train.max().astype(int) + 1
|
||||
losses, accuracies = [], []
|
||||
for i in (t := trange(steps, disable=os.getenv('CI') is not None)):
|
||||
for i in (t := trange(steps, disable=os.getenv("CI") is not None)) :
|
||||
samp = np.random.randint(0, X_train.shape[0], size=(BS))
|
||||
|
||||
x = Tensor(X_train[samp].reshape((-1, 28*28)).astype(np.float32), device=device)
|
||||
x = Tensor(X_train[samp].reshape((-1, 28 * 28)).astype(np.float32), device=device)
|
||||
Y = Y_train[samp]
|
||||
y = np.zeros((len(samp),num_classes), np.float32)
|
||||
y = np.zeros((len(samp), num_classes), np.float32)
|
||||
# correct loss for NLL, torch NLL loss returns one per row
|
||||
y[range(y.shape[0]),Y] = -1.0*num_classes
|
||||
y[range(y.shape[0]), Y] = -1.0 * num_classes
|
||||
y = Tensor(y, device=device)
|
||||
|
||||
# network
|
||||
out = model.forward(x)
|
||||
|
||||
# NLL loss function
|
||||
loss = lossfn(out, y)
|
||||
loss = lossfn(out, y)
|
||||
optim.zero_grad()
|
||||
loss.backward()
|
||||
optim.step()
|
||||
@@ -37,15 +51,26 @@ def train(model, X_train, Y_train, optim, steps, num_classes=None, BS=128, devic
|
||||
accuracies.append(accuracy)
|
||||
t.set_description("loss %.2f accuracy %.2f" % (loss, accuracy))
|
||||
|
||||
|
||||
def evaluate(model, X_test, Y_test, num_classes=None, device=Device.CPU, BS=128):
|
||||
def numpy_eval(num_classes):
|
||||
Y_test_preds_out = np.zeros((len(Y_test),num_classes))
|
||||
for i in trange(len(Y_test)//BS, disable=os.getenv('CI') is not None):
|
||||
Y_test_preds_out[i*BS:(i+1)*BS] = model.forward(Tensor(X_test[i*BS:(i+1)*BS].reshape((-1, 28*28)).astype(np.float32), device=device)).cpu().data
|
||||
Y_test_preds_out = np.zeros((len(Y_test), num_classes))
|
||||
for i in trange(len(Y_test) // BS, disable=os.getenv("CI") is not None):
|
||||
Y_test_preds_out[i * BS : (i + 1) * BS] = (
|
||||
model.forward(
|
||||
Tensor(
|
||||
X_test[i * BS : (i + 1) * BS].reshape((-1, 28 * 28)).astype(np.float32),
|
||||
device=device,
|
||||
)
|
||||
)
|
||||
.cpu()
|
||||
.data
|
||||
)
|
||||
Y_test_preds = np.argmax(Y_test_preds_out, axis=1)
|
||||
return (Y_test == Y_test_preds).mean()
|
||||
|
||||
if num_classes is None: num_classes = Y_test.max().astype(int)+1
|
||||
if num_classes is None:
|
||||
num_classes = Y_test.max().astype(int) + 1
|
||||
accuracy = numpy_eval(num_classes)
|
||||
print("test set accuracy is %f" % accuracy)
|
||||
return accuracy
|
||||
return accuracy
|
||||
|
||||
@@ -1,19 +1,22 @@
|
||||
from tinygrad.tensor import Tensor
|
||||
|
||||
|
||||
def fetch(url):
|
||||
import requests, os, hashlib, tempfile
|
||||
fp = os.path.join(tempfile.gettempdir(), hashlib.md5(url.encode('utf-8')).hexdigest())
|
||||
|
||||
fp = os.path.join(tempfile.gettempdir(), hashlib.md5(url.encode("utf-8")).hexdigest())
|
||||
if os.path.isfile(fp) and os.stat(fp).st_size > 0:
|
||||
with open(fp, "rb") as f:
|
||||
dat = f.read()
|
||||
else:
|
||||
print("fetching %s" % url)
|
||||
dat = requests.get(url).content
|
||||
with open(fp+".tmp", "wb") as f:
|
||||
with open(fp + ".tmp", "wb") as f:
|
||||
f.write(dat)
|
||||
os.rename(fp+".tmp", fp)
|
||||
os.rename(fp + ".tmp", fp)
|
||||
return dat
|
||||
|
||||
|
||||
def get_parameters(obj):
|
||||
parameters = []
|
||||
if isinstance(obj, Tensor):
|
||||
@@ -21,8 +24,7 @@ def get_parameters(obj):
|
||||
elif isinstance(obj, list):
|
||||
for x in obj:
|
||||
parameters.extend(get_parameters(x))
|
||||
elif hasattr(obj, '__dict__'):
|
||||
for k,v in obj.__dict__.items():
|
||||
elif hasattr(obj, "__dict__"):
|
||||
for k, v in obj.__dict__.items():
|
||||
parameters.extend(get_parameters(v))
|
||||
return parameters
|
||||
|
||||
|
||||
50  setup.py
@@ -4,29 +4,31 @@ import os
from setuptools import setup

directory = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(directory, 'README.md'), encoding='utf-8') as f:
with open(os.path.join(directory, "README.md"), encoding="utf-8") as f:
long_description = f.read()

setup(name='tinygrad',
version='0.3.0',
description='You like pytorch? You like micrograd? You love tinygrad! heart',
author='George Hotz',
license='MIT',
long_description=long_description,
long_description_content_type='text/markdown',
packages = ['tinygrad'],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License"
],
install_requires=['numpy', 'requests'],
python_requires='>=3.8',
extras_require={
'gpu': ["pyopencl", "six"],
'testing': [
"pytest",
"torch",
"tqdm",
],
},
include_package_data=True)
setup(
name="tinygrad",
version="0.3.0",
description="You like pytorch? You like micrograd? You love tinygrad! heart",
author="George Hotz",
license="MIT",
long_description=long_description,
long_description_content_type="text/markdown",
packages=["tinygrad"],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
],
install_requires=["numpy", "requests"],
python_requires=">=3.8",
extras_require={
"gpu": ["pyopencl", "six"],
"testing": [
"pytest",
"torch",
"tqdm",
],
},
include_package_data=True,
)

@@ -1,3 +1,3 @@
import os

ANE = os.environ.get('ANE', False)
ANE = os.environ.get("ANE", False)

@@ -4,43 +4,48 @@ import unittest
from tinygrad.tensor import Tensor, GPU, Device
from .config import ANE

def tensors_allocated():
return sum([isinstance(x, Tensor) for x in gc.get_objects()])

class TestGC(unittest.TestCase):
device = Device.CPU

def test_gc(self):
a = Tensor.zeros(4,4, device=self.device)
b = Tensor.zeros(4,4, device=self.device)
(a*b).mean().backward()
assert(tensors_allocated() > 0)
del a,b
assert(tensors_allocated() == 0)
a = Tensor.zeros(4, 4, device=self.device)
b = Tensor.zeros(4, 4, device=self.device)
(a * b).mean().backward()
assert tensors_allocated() > 0
del a, b
assert tensors_allocated() == 0

def test_gc_complex(self):
a = Tensor.zeros(4,4, device=self.device)
b = Tensor.zeros(4,4, device=self.device)
assert(tensors_allocated() == 2)
(a*b).mean().backward()
assert(tensors_allocated() == 4)
a = Tensor.zeros(4, 4, device=self.device)
b = Tensor.zeros(4, 4, device=self.device)
assert tensors_allocated() == 2
(a * b).mean().backward()
assert tensors_allocated() == 4
del b
assert(tensors_allocated() == 2)
b = Tensor.zeros(4,4, device=self.device)
assert tensors_allocated() == 2
b = Tensor.zeros(4, 4, device=self.device)
print(tensors_allocated())
(a*b).mean().backward()
(a * b).mean().backward()
print(tensors_allocated())
assert(tensors_allocated() == 4)
assert tensors_allocated() == 4
del b
assert(tensors_allocated() == 2)
assert tensors_allocated() == 2

@unittest.skipUnless(GPU, "Requires GPU")
class TestGCGPU(TestGC):
device = Device.GPU
device = Device.GPU

@unittest.skipUnless(ANE, "Requires ANE")
class TestGCANE(TestGC):
device=Device.ANE
device = Device.ANE

if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
@@ -11,19 +11,28 @@ from .config import ANE
# mnist loader
def fetch_mnist():
import gzip

parse = lambda dat: np.frombuffer(gzip.decompress(dat), dtype=np.uint8).copy()
X_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))[0x10:].reshape((-1, 28, 28))
Y_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))[8:]
X_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"))[0x10:].reshape((-1, 28, 28))
Y_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"))[8:]
X_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))[
0x10:
].reshape((-1, 28, 28))
Y_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))[
8:
]
X_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"))[
0x10:
].reshape((-1, 28, 28))
Y_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"))[
8:
]
return X_train, Y_train, X_test, Y_test

# load the mnist dataset
X_train, Y_train, X_test, Y_test = fetch_mnist()

# create a model
class TinyBobNet:

def __init__(self):
self.l1 = Tensor.uniform(784, 128)
self.l2 = Tensor.uniform(128, 10)
@@ -34,27 +43,29 @@ class TinyBobNet:
def forward(self, x):
return x.dot(self.l1).relu().dot(self.l2).logsoftmax()

# create a model with a conv layer
class TinyConvNet:
def __init__(self):
# https://keras.io/examples/vision/mnist_convnet/
conv = 3
#inter_chan, out_chan = 32, 64
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.uniform(inter_chan,1,conv,conv)
self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
self.l1 = Tensor.uniform(out_chan*5*5, 10)
# inter_chan, out_chan = 32, 64
inter_chan, out_chan = 8, 16  # for speed
self.c1 = Tensor.uniform(inter_chan, 1, conv, conv)
self.c2 = Tensor.uniform(out_chan, inter_chan, conv, conv)
self.l1 = Tensor.uniform(out_chan * 5 * 5, 10)

def parameters(self):
return get_parameters(self)

def forward(self, x):
x = x.reshape(shape=(-1, 1, 28, 28)) # hacks
x = x.reshape(shape=(-1, 1, 28, 28))  # hacks
x = x.conv2d(self.c1).relu().max_pool2d()
x = x.conv2d(self.c2).relu().max_pool2d()
x = x.reshape(shape=[x.shape[0], -1])
return x.dot(self.l1).logsoftmax()

class TestMNIST(unittest.TestCase):
device = Device.CPU

@@ -76,16 +87,19 @@ class TestMNIST(unittest.TestCase):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.RMSprop(model.parameters(), lr=0.0002)
train(model, X_train, Y_train, optimizer, steps=1000, device=self.device)
train(model, X_train, Y_train, optimizer, steps=1000, device=self.device)
assert evaluate(model, X_test, Y_test, device=self.device) > 0.95

@unittest.skipUnless(GPU, "Requires GPU")
class TestMNISTGPU(TestMNIST):
device = Device.GPU

@unittest.skipUnless(ANE, "Requires ANE")
class TestMNISTANE(TestMNIST):
device=Device.ANE
device = Device.ANE

if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
@@ -7,21 +7,25 @@ import torch
from tinygrad.tensor import Tensor, GPU, Device
from .config import ANE

def start_profile():
import time
pr = cProfile.Profile(timer=lambda: int(time.time()*1e9), timeunit=1e-6)

pr = cProfile.Profile(timer=lambda: int(time.time() * 1e9), timeunit=1e-6)
pr.enable()
return pr

def stop_profile(pr, sort='cumtime'):

def stop_profile(pr, sort="cumtime"):
pr.disable()
ps = pstats.Stats(pr)
ps.strip_dirs()
ps.sort_stats(sort)
ps.print_stats(0.2)

class TestConvSpeed(unittest.TestCase):
device= Device.CPU
device = Device.CPU

def test_mnist(self):
# https://keras.io/examples/vision/mnist_convnet/
@@ -34,12 +38,12 @@ class TestConvSpeed(unittest.TestCase):

conv = 3
inter_chan, out_chan = 32, 64
c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)
c1 = torch.randn(inter_chan, 1, conv, conv, requires_grad=True)
c2 = torch.randn(out_chan, inter_chan, conv, conv, requires_grad=True)
l1 = torch.randn(out_chan * 5 * 5, 10, requires_grad=True)

c2d = torch.nn.functional.conv2d
mp = torch.nn.MaxPool2d((2,2))
mp = torch.nn.MaxPool2d((2, 2))
lsm = torch.nn.LogSoftmax(dim=1)

cnt = 5
@@ -47,19 +51,19 @@ class TestConvSpeed(unittest.TestCase):
for i in range(cnt):
et0 = time.time()
x = torch.randn(128, 1, 28, 28, requires_grad=True)
x = mp(c2d(x,c1).relu())
x = mp(c2d(x,c2).relu())
x = mp(c2d(x, c1).relu())
x = mp(c2d(x, c2).relu())
x = x.reshape(x.shape[0], -1)
out = lsm(x.matmul(l1))
out = out.mean()
et1 = time.time()
out.backward()
et2 = time.time()
fpt += (et1-et0)
bpt += (et2-et1)
fpt += et1 - et0
bpt += et2 - et1

fpt_baseline = (fpt*1000/cnt)
bpt_baseline = (bpt*1000/cnt)
fpt_baseline = fpt * 1000 / cnt
bpt_baseline = bpt * 1000 / cnt
print("torch forward pass: %.3f ms" % fpt_baseline)
print("torch backward pass: %.3f ms" % bpt_baseline)

@@ -71,7 +75,7 @@ class TestConvSpeed(unittest.TestCase):

cnt = 5
fpt, bpt = 0.0, 0.0
for i in range(1+cnt):
for i in range(1 + cnt):
et0 = time.time()
x = Tensor.randn(128, 1, 28, 28, device=self.device)
x = x.conv2d(c1).relu().avg_pool2d()
@@ -85,24 +89,31 @@ class TestConvSpeed(unittest.TestCase):
if i == 0:
pr = start_profile()
else:
fpt += (et1-et0)
bpt += (et2-et1)
fpt += et1 - et0
bpt += et2 - et1

stop_profile(pr, sort="time")
fpt = fpt * 1000 / cnt
bpt = bpt * 1000 / cnt
print(
"forward pass: %.3f ms, %.2fx off baseline %.3f ms"
% (fpt, fpt / fpt_baseline, fpt_baseline)
)
print(
"backward pass: %.3f ms, %.2fx off baseline %.3f ms"
% (bpt, bpt / bpt_baseline, bpt_baseline)
)

stop_profile(pr, sort='time')
fpt = (fpt*1000/cnt)
bpt = (bpt*1000/cnt)
print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))

@unittest.skipUnless(GPU, "Requires GPU")
class TestConvSpeedGPU(TestConvSpeed):
device = Device.GPU

@unittest.skipUnless(ANE, "Requires ANE")
class TestConvSpeedANE(TestConvSpeed):
device=Device.ANE
device = Device.ANE

if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
@@ -7,6 +7,7 @@ from extra.utils import get_parameters
|
||||
import torch
|
||||
from .config import ANE
|
||||
|
||||
|
||||
class TestNN(unittest.TestCase):
|
||||
device = Device.CPU
|
||||
|
||||
@@ -30,8 +31,12 @@ class TestNN(unittest.TestCase):
|
||||
tbn.running_mean[:] = torch.tensor(bn.running_mean.data)
|
||||
tbn.running_var[:] = torch.tensor(bn.running_var.data)
|
||||
|
||||
np.testing.assert_allclose(bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5)
|
||||
np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
|
||||
np.testing.assert_allclose(
|
||||
bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5
|
||||
)
|
||||
np.testing.assert_allclose(
|
||||
bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5
|
||||
)
|
||||
|
||||
# trial
|
||||
inn = Tensor.randn(2, sz, 3, 3, device=self.device)
|
||||
@@ -45,35 +50,42 @@ class TestNN(unittest.TestCase):
|
||||
# close
|
||||
np.testing.assert_allclose(outt.data, toutt.detach().numpy(), rtol=5e-5)
|
||||
|
||||
np.testing.assert_allclose(bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5)
|
||||
np.testing.assert_allclose(
|
||||
bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5
|
||||
)
|
||||
|
||||
# TODO: this is failing
|
||||
#np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
|
||||
# np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
|
||||
|
||||
def test_batchnorm2d_training(self):
|
||||
self.test_batchnorm2d(True)
|
||||
|
||||
|
||||
@unittest.skipUnless(GPU, "Requires GPU")
|
||||
class TestNNGPU(TestNN):
|
||||
device = Device.GPU
|
||||
|
||||
@unittest.skip("Tests not added")
|
||||
def test_batchnorm2d(self): pass
|
||||
def test_batchnorm2d(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Tests not added")
|
||||
def test_batchnorm2d_training(self): pass
|
||||
def test_batchnorm2d_training(self):
|
||||
pass
|
||||
|
||||
|
||||
@unittest.skipUnless(ANE, "Requires ANE")
|
||||
class TestNNANE(TestNN):
|
||||
device=Device.ANE
|
||||
device = Device.ANE
|
||||
|
||||
@unittest.skip("Tests not added")
|
||||
def test_batchnorm2d(self): pass
|
||||
def test_batchnorm2d(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Tests not added")
|
||||
def test_batchnorm2d_training(self): pass
|
||||
def test_batchnorm2d_training(self):
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
304
test/test_ops.py
@@ -7,13 +7,24 @@ import functools
|
||||
from tinygrad.tensor import Tensor, GPU, Device
|
||||
from .config import ANE
|
||||
|
||||
def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=0, rtol=1e-6, grad_atol=0, grad_rtol=1e-6, device=Device.CPU, forward_only=False):
|
||||
|
||||
def helper_test_op(
|
||||
shps,
|
||||
torch_fxn,
|
||||
tinygrad_fxn,
|
||||
atol=0,
|
||||
rtol=1e-6,
|
||||
grad_atol=0,
|
||||
grad_rtol=1e-6,
|
||||
device=Device.CPU,
|
||||
forward_only=False,
|
||||
):
|
||||
torch.manual_seed(0)
|
||||
ts = [torch.rand(x, requires_grad=True) for x in shps]
|
||||
tst = [Tensor(x.detach().numpy()) for x in ts]
|
||||
if device==Device.GPU:
|
||||
if device == Device.GPU:
|
||||
tst = [x.gpu() for x in tst]
|
||||
elif device==Device.ANE:
|
||||
elif device == Device.ANE:
|
||||
tst = [x.ane() for x in tst]
|
||||
|
||||
out = torch_fxn(*ts)
|
||||
@@ -26,142 +37,291 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=0, rtol=1e-6, grad_atol=0
|
||||
ret.mean().backward()
|
||||
|
||||
for t, tt in zip(ts, tst):
|
||||
np.testing.assert_allclose(t.grad, tt.cpu().grad.data, atol=grad_atol, rtol=grad_rtol)
|
||||
np.testing.assert_allclose(
|
||||
t.grad, tt.cpu().grad.data, atol=grad_atol, rtol=grad_rtol
|
||||
)
|
||||
|
||||
# speed
|
||||
torch_fp = timeit.Timer(functools.partial(torch_fxn, *ts)).timeit(5) * 1000/5
|
||||
tinygrad_fp = timeit.Timer(functools.partial(tinygrad_fxn, *tst)).timeit(5) * 1000/5
|
||||
torch_fp = timeit.Timer(functools.partial(torch_fxn, *ts)).timeit(5) * 1000 / 5
|
||||
tinygrad_fp = timeit.Timer(functools.partial(tinygrad_fxn, *tst)).timeit(5) * 1000 / 5
|
||||
|
||||
if not forward_only:
|
||||
torch_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), torch_fxn, ts)).timeit(5) * 1000/5
|
||||
tinygrad_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), tinygrad_fxn, tst)).timeit(5) * 1000/5
|
||||
torch_fbp = (
|
||||
timeit.Timer(
|
||||
functools.partial(lambda f, x: f(*x).mean().backward(), torch_fxn, ts)
|
||||
).timeit(5)
|
||||
* 1000
|
||||
/ 5
|
||||
)
|
||||
tinygrad_fbp = (
|
||||
timeit.Timer(
|
||||
functools.partial(lambda f, x: f(*x).mean().backward(), tinygrad_fxn, tst)
|
||||
).timeit(5)
|
||||
* 1000
|
||||
/ 5
|
||||
)
|
||||
else:
|
||||
torch_fbp, tinygrad_fbp = np.nan, np.nan
|
||||
|
||||
print("testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms" % (shps, torch_fp, tinygrad_fp, torch_fbp-torch_fp, tinygrad_fbp-tinygrad_fp))
|
||||
print(
|
||||
"testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms"
|
||||
% (shps, torch_fp, tinygrad_fp, torch_fbp - torch_fp, tinygrad_fbp - tinygrad_fp)
|
||||
)
|
||||
|
||||
|
||||
class TestOps(unittest.TestCase):
|
||||
device=Device.CPU
|
||||
device = Device.CPU
|
||||
|
||||
def test_add(self):
|
||||
helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add, device=self.device)
|
||||
helper_test_op(
|
||||
[(45, 65), (45, 65)], lambda x, y: x + y, Tensor.add, device=self.device
|
||||
)
|
||||
|
||||
def test_sub(self):
|
||||
helper_test_op([(45,65), (45,65)], lambda x,y: x-y, Tensor.sub, device=self.device)
|
||||
helper_test_op(
|
||||
[(45, 65), (45, 65)], lambda x, y: x - y, Tensor.sub, device=self.device
|
||||
)
|
||||
|
||||
def test_mul(self):
|
||||
helper_test_op([(45,65), (45,65)], lambda x,y: x*y, Tensor.mul, device=self.device)
|
||||
helper_test_op(
|
||||
[(45, 65), (45, 65)], lambda x, y: x * y, Tensor.mul, device=self.device
|
||||
)
|
||||
|
||||
def test_div(self):
|
||||
helper_test_op([(45,65), (45,65)], lambda x,y: x/y, Tensor.div, device=self.device)
|
||||
helper_test_op(
|
||||
[(45, 65), (45, 65)], lambda x, y: x / y, Tensor.div, device=self.device
|
||||
)
|
||||
|
||||
def test_pow(self):
|
||||
helper_test_op([(45,65), (45,65)], lambda x,y: x**y, Tensor.pow, device=self.device)
|
||||
helper_test_op(
|
||||
[(45, 65), (45, 65)], lambda x, y: x ** y, Tensor.pow, device=self.device
|
||||
)
|
||||
|
||||
def test_sqrt(self):
|
||||
helper_test_op([(45,65)], lambda x: x.sqrt(), Tensor.sqrt, device=self.device)
|
||||
helper_test_op([(45, 65)], lambda x: x.sqrt(), Tensor.sqrt, device=self.device)
|
||||
|
||||
def test_relu(self):
|
||||
helper_test_op([(45,65)], lambda x: x.relu(), Tensor.relu, device=self.device)
|
||||
helper_test_op([(45, 65)], lambda x: x.relu(), Tensor.relu, device=self.device)
|
||||
|
||||
def test_leakyrelu(self):
|
||||
helper_test_op([(45,65)], lambda x: torch.nn.functional.leaky_relu(x,0.01), Tensor.leakyrelu, device=self.device)
|
||||
helper_test_op(
|
||||
[(45, 65)],
|
||||
lambda x: torch.nn.functional.leaky_relu(x, 0.01),
|
||||
Tensor.leakyrelu,
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
def test_abs(self):
|
||||
helper_test_op([(45,65)], lambda x: torch.abs(x), Tensor.abs, device=self.device)
|
||||
helper_test_op([(45, 65)], lambda x: torch.abs(x), Tensor.abs, device=self.device)
|
||||
|
||||
def test_sigmoid(self):
|
||||
helper_test_op([(45,65)], lambda x: x.sigmoid(), Tensor.sigmoid, device=self.device)
|
||||
helper_test_op(
|
||||
[(45, 65)], lambda x: x.sigmoid(), Tensor.sigmoid, device=self.device
|
||||
)
|
||||
|
||||
def test_dot(self):
|
||||
helper_test_op([(45,65), (65,100)], lambda x,y: x.matmul(y), Tensor.dot, device=self.device)
|
||||
helper_test_op(
|
||||
[(45, 65), (65, 100)], lambda x, y: x.matmul(y), Tensor.dot, device=self.device
|
||||
)
|
||||
|
||||
def test_sum(self):
|
||||
helper_test_op([(45,3)], lambda x: x.sum(), Tensor.sum, device=self.device)
|
||||
helper_test_op([(45, 3)], lambda x: x.sum(), Tensor.sum, device=self.device)
|
||||
|
||||
def test_sum_axis(self):
|
||||
helper_test_op([(3,4,5,6)], lambda x: x.sum(axis=(1,2)), lambda x: Tensor.sum(x, axis=(1,2)), device=self.device)
|
||||
helper_test_op(
|
||||
[(3, 4, 5, 6)],
|
||||
lambda x: x.sum(axis=(1, 2)),
|
||||
lambda x: Tensor.sum(x, axis=(1, 2)),
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
def test_mean_axis(self):
|
||||
helper_test_op([(3,4,5,6)], lambda x: x.mean(axis=(1,2)), lambda x: Tensor.mean(x, axis=(1,2)), device=self.device)
|
||||
helper_test_op(
|
||||
[(3, 4, 5, 6)],
|
||||
lambda x: x.mean(axis=(1, 2)),
|
||||
lambda x: Tensor.mean(x, axis=(1, 2)),
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
def test_logsoftmax(self):
|
||||
helper_test_op([(45,65)], lambda x: torch.nn.LogSoftmax(dim=1)(x), Tensor.logsoftmax, atol=1e-7, grad_atol=1e-7, device=self.device)
|
||||
helper_test_op(
|
||||
[(45, 65)],
|
||||
lambda x: torch.nn.LogSoftmax(dim=1)(x),
|
||||
Tensor.logsoftmax,
|
||||
atol=1e-7,
|
||||
grad_atol=1e-7,
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
def test_tanh(self):
|
||||
helper_test_op([(45,65)], lambda x: x.tanh(), Tensor.tanh, atol=1e-6, grad_atol=1e-6, device=self.device)
|
||||
helper_test_op(
|
||||
[(45, 65)],
|
||||
lambda x: x.tanh(),
|
||||
Tensor.tanh,
|
||||
atol=1e-6,
|
||||
grad_atol=1e-6,
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
def test_topo_sort(self):
|
||||
helper_test_op([(45,65)], lambda x: (x+x)*x, lambda x: x.add(x).mul(x), atol=1e-6, grad_atol=1e-6, device=self.device)
|
||||
helper_test_op(
|
||||
[(45, 65)],
|
||||
lambda x: (x + x) * x,
|
||||
lambda x: x.add(x).mul(x),
|
||||
atol=1e-6,
|
||||
grad_atol=1e-6,
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
def test_scalar_mul(self):
|
||||
helper_test_op([(45,65)], lambda x: x*2, lambda x: x*2, device=self.device)
|
||||
helper_test_op([(45, 65)], lambda x: x * 2, lambda x: x * 2, device=self.device)
|
||||
|
||||
def test_scalar_rmul(self):
|
||||
helper_test_op([(45,65)], lambda x: 2*x, lambda x: 2*x, device=self.device)
|
||||
helper_test_op([(45, 65)], lambda x: 2 * x, lambda x: 2 * x, device=self.device)
|
||||
|
||||
def test_scalar_sub(self):
|
||||
helper_test_op([(45,65)], lambda x: x-2, lambda x: x-2, device=self.device)
|
||||
helper_test_op([(45, 65)], lambda x: x - 2, lambda x: x - 2, device=self.device)
|
||||
|
||||
def test_scalar_rsub(self):
|
||||
helper_test_op([(45,65)], lambda x: 2-x, lambda x: 2-x, device=self.device)
|
||||
helper_test_op([(45, 65)], lambda x: 2 - x, lambda x: 2 - x, device=self.device)
|
||||
|
||||
def test_broadcast_full(self):
|
||||
for torch_op, tinygrad_op in [(torch.add, Tensor.add), (torch.sub, Tensor.sub), (torch.mul, Tensor.mul),
|
||||
(torch.div, Tensor.div), (torch.pow, Tensor.pow)]:
|
||||
for shapes in [((5,13,24,16), (5,1,24,1)), ((1,3,1,7,1), (2,1,5,1,8))]:
|
||||
for torch_op, tinygrad_op in [
|
||||
(torch.add, Tensor.add),
|
||||
(torch.sub, Tensor.sub),
|
||||
(torch.mul, Tensor.mul),
|
||||
(torch.div, Tensor.div),
|
||||
(torch.pow, Tensor.pow),
|
||||
]:
|
||||
for shapes in [
|
||||
((5, 13, 24, 16), (5, 1, 24, 1)),
|
||||
((1, 3, 1, 7, 1), (2, 1, 5, 1, 8)),
|
||||
]:
|
||||
with self.subTest(op=torch_op.__name__, shapes=shapes):
|
||||
helper_test_op(shapes, torch_op, tinygrad_op, device=self.device)
|
||||
|
||||
|
||||
def test_broadcast_partial(self):
|
||||
for torch_op, tinygrad_op in [(torch.add, Tensor.add), (torch.sub, Tensor.sub), (torch.mul, Tensor.mul),
|
||||
(torch.div, Tensor.div), (torch.pow, Tensor.pow)]:
|
||||
for shapes in [((1,32,32,32), (1,32,1,1)), ((5,13,24,16,2), (1,13,24,1,1)),
|
||||
((4,1), (4,5)), ((1,4), (5,4))]:
|
||||
for torch_op, tinygrad_op in [
|
||||
(torch.add, Tensor.add),
|
||||
(torch.sub, Tensor.sub),
|
||||
(torch.mul, Tensor.mul),
|
||||
(torch.div, Tensor.div),
|
||||
(torch.pow, Tensor.pow),
|
||||
]:
|
||||
for shapes in [
|
||||
((1, 32, 32, 32), (1, 32, 1, 1)),
|
||||
((5, 13, 24, 16, 2), (1, 13, 24, 1, 1)),
|
||||
((4, 1), (4, 5)),
|
||||
((1, 4), (5, 4)),
|
||||
]:
|
||||
with self.subTest(op=torch_op.__name__, shapes=shapes):
|
||||
# NOTE: ANE backwards?
|
||||
helper_test_op(shapes, torch_op, tinygrad_op, device=self.device, forward_only=self.device!=Device.CPU)
|
||||
helper_test_op(
|
||||
shapes,
|
||||
torch_op,
|
||||
tinygrad_op,
|
||||
device=self.device,
|
||||
forward_only=self.device != Device.CPU,
|
||||
)
|
||||
|
||||
def test_pad2d(self):
|
||||
helper_test_op([(3,3,3,3)], lambda x: torch.nn.functional.pad(x, (1,2,3,4)), lambda x: x.pad2d(padding=(1,2,3,4)), device=self.device)
|
||||
helper_test_op(
|
||||
[(3, 3, 3, 3)],
|
||||
lambda x: torch.nn.functional.pad(x, (1, 2, 3, 4)),
|
||||
lambda x: x.pad2d(padding=(1, 2, 3, 4)),
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
def test_reshape(self):
|
||||
helper_test_op([(4,3,6,6)], lambda x: torch.reshape(x, (-1,3,6,6)), lambda x: x.reshape(shape=(-1,3,6,6)), device=self.device)
|
||||
helper_test_op([(4,3,6,6)], lambda x: torch.reshape(x, (-1,1,6,6)), lambda x: x.reshape(shape=(-1,1,6,6)), device=self.device)
|
||||
helper_test_op(
|
||||
[(4, 3, 6, 6)],
|
||||
lambda x: torch.reshape(x, (-1, 3, 6, 6)),
|
||||
lambda x: x.reshape(shape=(-1, 3, 6, 6)),
|
||||
device=self.device,
|
||||
)
|
||||
helper_test_op(
|
||||
[(4, 3, 6, 6)],
|
||||
lambda x: torch.reshape(x, (-1, 1, 6, 6)),
|
||||
lambda x: x.reshape(shape=(-1, 1, 6, 6)),
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
def test_detach(self):
|
||||
helper_test_op([(4,3,6,6)], lambda x: x.detach(), lambda x: x.detach(), device=self.device, forward_only=True)
|
||||
helper_test_op(
|
||||
[(4, 3, 6, 6)],
|
||||
lambda x: x.detach(),
|
||||
lambda x: x.detach(),
|
||||
device=self.device,
|
||||
forward_only=True,
|
||||
)
|
||||
|
||||
def test_conv2d(self):
|
||||
for bs in [1,8]:
|
||||
for cin in [1,3]:
|
||||
for groups in [1,3] if cin == 3 else [1]:
|
||||
for H in [1,2,5]:
|
||||
for W in [1,2,3,5]:
|
||||
with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W):
|
||||
helper_test_op([(bs,cin,11,28), (6,cin//groups,H,W)],
|
||||
lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(),
|
||||
lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), device=self.device, grad_rtol=1e-5)
|
||||
for bs in [1, 8]:
|
||||
for cin in [1, 3]:
|
||||
for groups in [1, 3] if cin == 3 else [1]:
|
||||
for H in [1, 2, 5]:
|
||||
for W in [1, 2, 3, 5]:
|
||||
with self.subTest(
|
||||
batch_size=bs, channels=cin, groups=groups, height=H, width=W
|
||||
):
|
||||
helper_test_op(
|
||||
[(bs, cin, 11, 28), (6, cin // groups, H, W)],
|
||||
lambda x, w: torch.nn.functional.conv2d(x, w, groups=groups).relu(),
|
||||
lambda x, w: Tensor.conv2d(x, w, groups=groups).relu(),
|
||||
device=self.device,
|
||||
grad_rtol=1e-5,
|
||||
)
|
||||
|
||||
def test_strided_conv2d(self):
|
||||
bs = 4
|
||||
cin = 3
|
||||
H,W = 3,3
|
||||
H, W = 3, 3
|
||||
with self.subTest(stride := 2):
|
||||
helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
|
||||
lambda x,w: torch.nn.functional.conv2d(x,w,stride=2).relu(),
|
||||
lambda x,w: Tensor.conv2d(x,w,stride=stride).relu(), device=self.device)
|
||||
with self.subTest(stride := (2,1)):
|
||||
helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
|
||||
lambda x,w: torch.nn.functional.conv2d(x,w,stride=stride).relu(),
|
||||
lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), device=self.device)
|
||||
helper_test_op(
|
||||
[(bs, cin, 11, 28), (4, cin, H, W)],
|
||||
lambda x, w: torch.nn.functional.conv2d(x, w, stride=2).relu(),
|
||||
lambda x, w: Tensor.conv2d(x, w, stride=stride).relu(),
|
||||
device=self.device,
|
||||
)
|
||||
with self.subTest(stride := (2, 1)):
|
||||
helper_test_op(
|
||||
[(bs, cin, 11, 28), (4, cin, H, W)],
|
||||
lambda x, w: torch.nn.functional.conv2d(x, w, stride=stride).relu(),
|
||||
lambda x, w: Tensor.conv2d(x, w, stride=(2, 1)).relu(),
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
def test_maxpool2d(self):
|
||||
for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1)]:
|
||||
for ksz in [(2, 2), (3, 3), (3, 2), (5, 5), (5, 1)]:
|
||||
with self.subTest(kernel_size=ksz):
|
||||
helper_test_op([(32,2,110,28)],
|
||||
helper_test_op(
|
||||
[(32, 2, 110, 28)],
|
||||
lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz),
|
||||
lambda x: Tensor.max_pool2d(x, kernel_size=ksz), device=self.device)
|
||||
lambda x: Tensor.max_pool2d(x, kernel_size=ksz),
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
def test_avgpool2d(self):
|
||||
shape = (32,2,111,28)
|
||||
for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1), shape[2:]]:
|
||||
shape = (32, 2, 111, 28)
|
||||
for ksz in [(2, 2), (3, 3), (3, 2), (5, 5), (5, 1), shape[2:]]:
|
||||
with self.subTest(kernel_size=ksz):
|
||||
helper_test_op([shape],
|
||||
helper_test_op(
|
||||
[shape],
|
||||
lambda x: torch.nn.functional.avg_pool2d(x, kernel_size=ksz),
|
||||
lambda x: Tensor.avg_pool2d(x, kernel_size=ksz), device=self.device)
|
||||
lambda x: Tensor.avg_pool2d(x, kernel_size=ksz),
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
|
||||
@unittest.skipUnless(GPU, "Requires GPU")
|
||||
class TestOpsGPU(TestOps):
|
||||
device=Device.GPU
|
||||
device = Device.GPU
|
||||
|
||||
|
||||
@unittest.skipUnless(ANE, "Requires ANE")
|
||||
class TestOpsANE(TestOps):
|
||||
device=Device.ANE
|
||||
device = Device.ANE
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
|
||||
|
||||
@@ -6,20 +6,24 @@ from tinygrad.optim import Adam, SGD, RMSprop
|
||||
from extra.utils import get_parameters
|
||||
from .config import ANE
|
||||
|
||||
x_init = np.random.randn(1,3).astype(np.float32)
|
||||
W_init = np.random.randn(3,3).astype(np.float32)
|
||||
m_init = np.random.randn(1,3).astype(np.float32)
|
||||
x_init = np.random.randn(1, 3).astype(np.float32)
|
||||
W_init = np.random.randn(3, 3).astype(np.float32)
|
||||
m_init = np.random.randn(1, 3).astype(np.float32)
|
||||
|
||||
|
||||
def step_tinygrad(optim, kwargs={}, device=Device.CPU):
|
||||
net = TinyNet()
|
||||
optim = optim([net.x, net.W], **kwargs)
|
||||
if device==Device.GPU: [x.gpu_() for x in get_parameters([net, optim])]
|
||||
elif device==Device.ANE: [x.ane_() for x in get_parameters([net, optim])]
|
||||
if device == Device.GPU:
|
||||
[x.gpu_() for x in get_parameters([net, optim])]
|
||||
elif device == Device.ANE:
|
||||
[x.ane_() for x in get_parameters([net, optim])]
|
||||
out = net.forward()
|
||||
out.backward()
|
||||
optim.step()
|
||||
return net.x.cpu().data, net.W.cpu().data
|
||||
|
||||
|
||||
def step_pytorch(optim, kwargs={}):
|
||||
net = TorchNet()
|
||||
optim = optim([net.x, net.W], **kwargs)
|
||||
@@ -29,7 +33,7 @@ def step_pytorch(optim, kwargs={}):
|
||||
return net.x.detach().numpy(), net.W.detach().numpy()
|
||||
|
||||
|
||||
class TinyNet():
|
||||
class TinyNet:
|
||||
def __init__(self):
|
||||
self.x = Tensor(x_init.copy())
|
||||
self.W = Tensor(W_init.copy())
|
||||
@@ -42,7 +46,7 @@ class TinyNet():
|
||||
return out
|
||||
|
||||
|
||||
class TorchNet():
|
||||
class TorchNet:
|
||||
def __init__(self):
|
||||
self.x = torch.tensor(x_init.copy(), requires_grad=True)
|
||||
self.W = torch.tensor(W_init.copy(), requires_grad=True)
|
||||
@@ -59,19 +63,23 @@ class TestOptim(unittest.TestCase):
|
||||
device = Device.CPU
|
||||
|
||||
def test_adam(self):
|
||||
for x,y in zip(step_tinygrad(Adam, device=self.device),
|
||||
step_pytorch(torch.optim.Adam)):
|
||||
for x, y in zip(
|
||||
step_tinygrad(Adam, device=self.device), step_pytorch(torch.optim.Adam)
|
||||
):
|
||||
np.testing.assert_allclose(x, y, atol=1e-4)
|
||||
|
||||
def test_sgd(self):
|
||||
for x,y in zip(step_tinygrad(SGD, kwargs={'lr': 0.001}, device=self.device),
|
||||
step_pytorch(torch.optim.SGD, kwargs={'lr': 0.001})):
|
||||
for x, y in zip(
|
||||
step_tinygrad(SGD, kwargs={"lr": 0.001}, device=self.device),
|
||||
step_pytorch(torch.optim.SGD, kwargs={"lr": 0.001}),
|
||||
):
|
||||
np.testing.assert_allclose(x, y, atol=1e-5)
|
||||
|
||||
def test_rmsprop(self):
|
||||
for x,y in zip(step_tinygrad(RMSprop, kwargs={'lr': 0.001, 'decay': 0.99}, device=self.device),
|
||||
step_pytorch(torch.optim.RMSprop,
|
||||
kwargs={'lr': 0.001, 'alpha': 0.99})):
|
||||
for x, y in zip(
|
||||
step_tinygrad(RMSprop, kwargs={"lr": 0.001, "decay": 0.99}, device=self.device),
|
||||
step_pytorch(torch.optim.RMSprop, kwargs={"lr": 0.001, "alpha": 0.99}),
|
||||
):
|
||||
np.testing.assert_allclose(x, y, atol=1e-5)
|
||||
|
||||
|
||||
@@ -79,10 +87,11 @@ class TestOptim(unittest.TestCase):
|
||||
class TestOptimGPU(TestOptim):
|
||||
device = Device.GPU
|
||||
|
||||
|
||||
@unittest.skipUnless(ANE, "Requires ANE")
|
||||
class TestOptimANE(TestOptim):
|
||||
device = Device.ANE
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -6,11 +6,12 @@ from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
|
||||
from .config import ANE
|
||||
|
||||
|
||||
x_init = np.random.randn(1,3).astype(np.float32)
|
||||
U_init = np.random.randn(3,3).astype(np.float32)
|
||||
V_init = np.random.randn(3,3).astype(np.float32)
|
||||
W_init = np.random.randn(3,3).astype(np.float32)
|
||||
m_init = np.random.randn(1,3).astype(np.float32)
|
||||
x_init = np.random.randn(1, 3).astype(np.float32)
|
||||
U_init = np.random.randn(3, 3).astype(np.float32)
|
||||
V_init = np.random.randn(3, 3).astype(np.float32)
|
||||
W_init = np.random.randn(3, 3).astype(np.float32)
|
||||
m_init = np.random.randn(1, 3).astype(np.float32)
|
||||
|
||||
|
||||
class TestTinygrad(unittest.TestCase):
|
||||
device = Device.CPU
|
||||
@@ -36,7 +37,7 @@ class TestTinygrad(unittest.TestCase):
|
||||
out.backward()
|
||||
return out.detach().numpy(), x.grad, W.grad
|
||||
|
||||
for x,y in zip(test_tinygrad(), test_pytorch()):
|
||||
for x, y in zip(test_tinygrad(), test_pytorch()):
|
||||
np.testing.assert_allclose(x, y, atol=1e-5)
|
||||
|
||||
def test_backward_pass_diamond_model(self):
|
||||
@@ -64,7 +65,7 @@ class TestTinygrad(unittest.TestCase):
|
||||
out.backward()
|
||||
return out.detach().numpy(), u.grad, v.grad, w.grad
|
||||
|
||||
for x,y in zip(test_tinygrad(), test_pytorch()):
|
||||
for x, y in zip(test_tinygrad(), test_pytorch()):
|
||||
np.testing.assert_allclose(x, y, atol=1e-5)
|
||||
|
||||
def test_jacobian(self):
|
||||
@@ -73,7 +74,9 @@ class TestTinygrad(unittest.TestCase):
|
||||
|
||||
torch_x = torch.tensor(x, requires_grad=True)
|
||||
torch_W = torch.tensor(W, requires_grad=True)
|
||||
torch_func = lambda x: torch.nn.functional.log_softmax(x.matmul(torch_W).relu(), dim=1)
|
||||
torch_func = lambda x: torch.nn.functional.log_softmax(
|
||||
x.matmul(torch_W).relu(), dim=1
|
||||
)
|
||||
PJ = torch.autograd.functional.jacobian(torch_func, torch_x).squeeze().numpy()
|
||||
|
||||
tiny_x = Tensor(x, device=self.device)
|
||||
@@ -82,8 +85,8 @@ class TestTinygrad(unittest.TestCase):
|
||||
J = jacobian(tiny_func, tiny_x)
|
||||
NJ = numerical_jacobian(tiny_func, tiny_x)
|
||||
|
||||
np.testing.assert_allclose(PJ, J, atol = 1e-5)
|
||||
np.testing.assert_allclose(PJ, NJ, atol = 1e-5)
|
||||
np.testing.assert_allclose(PJ, J, atol=1e-5)
|
||||
np.testing.assert_allclose(PJ, NJ, atol=1e-5)
|
||||
|
||||
def test_gradcheck(self):
|
||||
W = np.random.RandomState(1337).random((10, 5))
|
||||
@@ -96,7 +99,7 @@ class TestTinygrad(unittest.TestCase):
|
||||
self.assertTrue(gradcheck(tiny_func, tiny_x))
|
||||
|
||||
# coarse approx. since a "big" eps and the non-linearities of the model
|
||||
self.assertFalse(gradcheck(tiny_func, tiny_x, eps = 0.1))
|
||||
self.assertFalse(gradcheck(tiny_func, tiny_x, eps=0.1))
|
||||
|
||||
|
||||
@unittest.skipUnless(GPU, "Requires GPU")
|
||||
@@ -104,14 +107,18 @@ class TestTinygradGPU(TestTinygrad):
|
||||
device = Device.GPU
|
||||
|
||||
@unittest.skip("float64 not supported on GPU")
|
||||
def test_jacobian(self): pass
|
||||
def test_jacobian(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("float64 not supported on GPU")
|
||||
def test_gradcheck(self): pass
|
||||
def test_gradcheck(self):
|
||||
pass
|
||||
|
||||
|
||||
@unittest.skipUnless(ANE, "Requires ANE")
|
||||
class TestOpsANE(TestTinygrad):
|
||||
device=Device.ANE
|
||||
device = Device.ANE
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -1,23 +1,38 @@
|
||||
from tinygrad.tensor import Tensor
|
||||
|
||||
|
||||
class BatchNorm2D:
|
||||
def __init__(self, sz, eps=1e-5, track_running_stats=False, training=False, momentum=0.1):
|
||||
self.eps, self.track_running_stats, self.training, self.momentum = eps, track_running_stats, training, momentum
|
||||
def __init__(
|
||||
self, sz, eps=1e-5, track_running_stats=False, training=False, momentum=0.1
|
||||
):
|
||||
self.eps, self.track_running_stats, self.training, self.momentum = (
|
||||
eps,
|
||||
track_running_stats,
|
||||
training,
|
||||
momentum,
|
||||
)
|
||||
|
||||
self.weight, self.bias = Tensor.ones(sz), Tensor.zeros(sz)
|
||||
|
||||
self.running_mean, self.running_var = Tensor.zeros(sz, requires_grad=False), Tensor.ones(sz, requires_grad=False)
|
||||
self.running_mean, self.running_var = (
|
||||
Tensor.zeros(sz, requires_grad=False),
|
||||
Tensor.ones(sz, requires_grad=False),
|
||||
)
|
||||
self.num_batches_tracked = Tensor.zeros(1, requires_grad=False)
|
||||
|
||||
def __call__(self, x):
|
||||
if self.track_running_stats or self.training:
|
||||
batch_mean = x.mean(axis=(0,2,3))
|
||||
y = (x - batch_mean.reshape(shape=[1, -1, 1, 1]))
|
||||
batch_var = (y*y).mean(axis=(0,2,3))
|
||||
batch_mean = x.mean(axis=(0, 2, 3))
|
||||
y = x - batch_mean.reshape(shape=[1, -1, 1, 1])
|
||||
batch_var = (y * y).mean(axis=(0, 2, 3))
|
||||
|
||||
if self.track_running_stats:
|
||||
self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
|
||||
self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var
|
||||
self.running_mean = (
|
||||
1 - self.momentum
|
||||
) * self.running_mean + self.momentum * batch_mean
|
||||
self.running_var = (
|
||||
1 - self.momentum
|
||||
) * self.running_var + self.momentum * batch_var
|
||||
self.num_batches_tracked += 1
|
||||
|
||||
if self.training:
|
||||
@@ -26,6 +41,9 @@ class BatchNorm2D:
|
||||
return self.normalize(x, self.running_mean, self.running_var)
|
||||
|
||||
def normalize(self, x, mean, var):
|
||||
x = (x - mean.reshape(shape=[1, -1, 1, 1])) * self.weight.reshape(shape=[1, -1, 1, 1])
|
||||
return x.div(var.add(self.eps).reshape(shape=[1, -1, 1, 1])**0.5) + self.bias.reshape(shape=[1, -1, 1, 1])
|
||||
|
||||
x = (x - mean.reshape(shape=[1, -1, 1, 1])) * self.weight.reshape(
|
||||
shape=[1, -1, 1, 1]
|
||||
)
|
||||
return x.div(
|
||||
var.add(self.eps).reshape(shape=[1, -1, 1, 1]) ** 0.5
|
||||
) + self.bias.reshape(shape=[1, -1, 1, 1])
|
||||
|
||||
@@ -2,19 +2,23 @@ from .tensor import Tensor, Function, register
|
||||
from functools import lru_cache
|
||||
import struct
|
||||
|
||||
|
||||
@lru_cache
|
||||
def compile_wrapper(ane, dat):
|
||||
return ane.compile(dat)
|
||||
|
||||
|
||||
def roundup(x, v):
|
||||
return x + (v-x)%v
|
||||
return x + (v - x) % v
|
||||
|
||||
|
||||
def fill(dat, addrs, type, val, base=0x4000):
|
||||
x = struct.pack(type, val)
|
||||
for a in addrs:
|
||||
dat[base+a:base+a+len(x)] = x
|
||||
dat[base + a : base + a + len(x)] = x
|
||||
return dat
|
||||
|
||||
|
||||
@lru_cache
|
||||
def compile_relu(ane, sz):
|
||||
dat = list(open("ane/ops/relu.hwx", "rb").read())
|
||||
@@ -22,16 +26,18 @@ def compile_relu(ane, sz):
|
||||
# number of relus
|
||||
dat = fill(dat, [0x128, 0x13C], "H", sz)
|
||||
# number of engines? (max 0x100)
|
||||
dat = fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8], "I", max(0x100, roundup(sz*2, 0x10)))
|
||||
dat = fill(dat, [0x1EC, 0x1F0, 0x1F4, 0x1F8], "I", max(0x100, roundup(sz * 2, 0x10)))
|
||||
# strides?
|
||||
dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz*2, 0x40))
|
||||
dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz * 2, 0x40))
|
||||
return compile_wrapper(ane, bytes(dat))
|
||||
|
||||
|
||||
class ReLU(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input):
|
||||
ret = ctx.ane.tensor(input.shape)
|
||||
ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret)
|
||||
return ret
|
||||
register('relu', ReLU, device=Tensor.ANE)
|
||||
|
||||
|
||||
register("relu", ReLU, device=Tensor.ANE)
|
||||
|
||||
@@ -5,44 +5,58 @@ from .tensor import Function, register
|
||||
# ************* basic ops *************
|
||||
def unbroadcast(out, in_sh):
|
||||
# adjoint operation to broadcast is sum. Need to sum all axis with 1 = in_sh[i] < out.shape[i]
|
||||
sum_axis = tuple([i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1]) if in_sh != (1,) else None
|
||||
sum_axis = (
|
||||
tuple([i for i in range(len(in_sh)) if in_sh[i] == 1 and out.shape[i] > 1])
|
||||
if in_sh != (1,)
|
||||
else None
|
||||
)
|
||||
return out.sum(axis=sum_axis).reshape(in_sh)
|
||||
|
||||
|
||||
class Add(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
ctx.save_for_backward(x.shape, y.shape)
|
||||
return x+y
|
||||
return x + y
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
shape_x, shape_y = ctx.saved_tensors
|
||||
return unbroadcast(grad_output, shape_x), unbroadcast(grad_output, shape_y)
|
||||
register('add', Add)
|
||||
|
||||
|
||||
register("add", Add)
|
||||
|
||||
|
||||
class Sub(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
ctx.save_for_backward(x.shape, y.shape)
|
||||
return x-y
|
||||
return x - y
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
shape_x, shape_y = ctx.saved_tensors
|
||||
return unbroadcast(grad_output, shape_x), unbroadcast(-grad_output, shape_y)
|
||||
register('sub', Sub)
|
||||
|
||||
|
||||
register("sub", Sub)
|
||||
|
||||
|
||||
class Mul(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
ctx.save_for_backward(x, y)
|
||||
return x*y
|
||||
return x * y
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
x,y = ctx.saved_tensors
|
||||
return unbroadcast(y*grad_output, x.shape), unbroadcast(x*grad_output, y.shape)
|
||||
register('mul', Mul)
|
||||
x, y = ctx.saved_tensors
|
||||
return unbroadcast(y * grad_output, x.shape), unbroadcast(x * grad_output, y.shape)
|
||||
|
||||
|
||||
register("mul", Mul)
|
||||
|
||||
|
||||
class Pow(Function):
|
||||
@staticmethod
|
||||
@@ -52,27 +66,37 @@ class Pow(Function):
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
x,y = ctx.saved_tensors
|
||||
return unbroadcast(y * (x**(y-1.0)) * grad_output, x.shape), \
|
||||
unbroadcast((x**y) * np.log(x) * grad_output, y.shape)
|
||||
register('pow', Pow)
|
||||
x, y = ctx.saved_tensors
|
||||
return unbroadcast(y * (x ** (y - 1.0)) * grad_output, x.shape), unbroadcast(
|
||||
(x ** y) * np.log(x) * grad_output, y.shape
|
||||
)
|
||||
|
||||
|
||||
register("pow", Pow)
|
||||
|
||||
|
||||
class Sum(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input,axis=None):
|
||||
def forward(ctx, input, axis=None):
|
||||
ctx.save_for_backward(input, axis)
|
||||
return np.array([input.sum()]) if axis is None else input.sum(axis=axis)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
input, axis = ctx.saved_tensors
|
||||
shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
|
||||
shape = [
|
||||
1 if axis is None or i in axis else input.shape[i]
|
||||
for i in range(len(input.shape))
|
||||
]
|
||||
return grad_output.reshape(shape) + np.zeros_like(input)
|
||||
register('sum', Sum)
|
||||
|
||||
|
||||
register("sum", Sum)
|
||||
|
||||
|
||||
# ************* GEMM *************
|
||||
|
||||
|
||||
class Dot(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input, weight):
|
||||
@@ -85,21 +109,27 @@ class Dot(Function):
|
||||
grad_input = grad_output.dot(weight.T)
|
||||
grad_weight = input.T.dot(grad_output)
|
||||
return grad_input, grad_weight
|
||||
register('dot', Dot)
|
||||
|
||||
|
||||
register("dot", Dot)
|
||||
|
||||
# ************* simple ops *************
|
||||
|
||||
|
||||
class Pad2D(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, padding=None):
|
||||
ctx.save_for_backward(padding)
|
||||
return np.pad(x, ((0,0), (0,0), tuple(padding[2:4]), tuple(padding[0:2])))
|
||||
return np.pad(x, ((0, 0), (0, 0), tuple(padding[2:4]), tuple(padding[0:2])))
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
padding, = ctx.saved_tensors
|
||||
return grad_output[..., padding[2]:-padding[3], padding[0]:-padding[1]]
|
||||
register('pad2d', Pad2D)
|
||||
(padding,) = ctx.saved_tensors
|
||||
return grad_output[..., padding[2] : -padding[3], padding[0] : -padding[1]]
|
||||
|
||||
|
||||
register("pad2d", Pad2D)
|
||||
|
||||
|
||||
class Reshape(Function):
|
||||
@staticmethod
|
||||
@@ -109,13 +139,16 @@ class Reshape(Function):
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
in_shape, = ctx.saved_tensors
|
||||
(in_shape,) = ctx.saved_tensors
|
||||
return grad_output.reshape(in_shape)
|
||||
register('reshape', Reshape)
|
||||
|
||||
|
||||
register("reshape", Reshape)
|
||||
|
||||
|
||||
# ************* activation ops *************
|
||||
|
||||
|
||||
class ReLU(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input):
|
||||
@@ -124,123 +157,145 @@ class ReLU(Function):
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
input, = ctx.saved_tensors
|
||||
(input,) = ctx.saved_tensors
|
||||
return grad_output * (input >= 0)
|
||||
register('relu', ReLU)
|
||||
|
||||
|
||||
register("relu", ReLU)
|
||||
|
||||
|
||||
def _exp_normalize(x, axis=None):
|
||||
y = np.exp(x - x.max(axis=axis, keepdims=True))
|
||||
return y / y.sum(axis=axis, keepdims=True)
|
||||
y = np.exp(x - x.max(axis=axis, keepdims=True))
|
||||
return y / y.sum(axis=axis, keepdims=True)
|
||||
|
||||
|
||||
class Sigmoid(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input):
|
||||
with np.warnings.catch_warnings():
|
||||
np.warnings.filterwarnings('ignore')
|
||||
ret = np.where(input >= 0,
|
||||
1/(1 + np.exp(-input)),
|
||||
np.exp(input)/(1 + np.exp(input))
|
||||
np.warnings.filterwarnings("ignore")
|
||||
ret = np.where(
|
||||
input >= 0, 1 / (1 + np.exp(-input)), np.exp(input) / (1 + np.exp(input))
|
||||
)
|
||||
ctx.save_for_backward(ret)
|
||||
return ret
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
ret, = ctx.saved_tensors
|
||||
(ret,) = ctx.saved_tensors
|
||||
return grad_output * (ret * (1 - ret))
|
||||
register('sigmoid', Sigmoid)
|
||||
|
||||
|
||||
register("sigmoid", Sigmoid)
|
||||
|
||||
|
||||
class LogSoftmax(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input):
|
||||
softmax = _exp_normalize(input, axis=1)
|
||||
ctx.save_for_backward(softmax)
|
||||
return np.log(softmax)
|
||||
return np.log(softmax)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
softmax, = ctx.saved_tensors
|
||||
return grad_output - grad_output.sum(axis=1, keepdims=True)*softmax
|
||||
register('logsoftmax', LogSoftmax)
|
||||
(softmax,) = ctx.saved_tensors
|
||||
return grad_output - grad_output.sum(axis=1, keepdims=True) * softmax
|
||||
|
||||
|
||||
register("logsoftmax", LogSoftmax)
|
||||
|
||||
|
||||
# ************* conv ops *************
|
||||
|
||||
|
||||
class Conv2D(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, w, stride=1, groups=1):
|
||||
if type(ctx.stride) == int:
|
||||
ctx.stride = (ctx.stride, ctx.stride)
|
||||
cout,cin,H,W = w.shape
|
||||
ys,xs = ctx.stride
|
||||
bs,cin_ = x.shape[0], x.shape[1]
|
||||
oy,ox = (x.shape[2]-(H-ys))//ys, (x.shape[3]-(W-xs))//xs
|
||||
assert cin*ctx.groups == cin_
|
||||
cout, cin, H, W = w.shape
|
||||
ys, xs = ctx.stride
|
||||
bs, cin_ = x.shape[0], x.shape[1]
|
||||
oy, ox = (x.shape[2] - (H - ys)) // ys, (x.shape[3] - (W - xs)) // xs
|
||||
assert cin * ctx.groups == cin_
|
||||
assert cout % ctx.groups == 0
|
||||
rcout = cout//ctx.groups
|
||||
rcout = cout // ctx.groups
|
||||
|
||||
gx = x.reshape(bs,ctx.groups,cin,x.shape[2],x.shape[3])
|
||||
tx = np.lib.stride_tricks.as_strided(gx,
|
||||
gx = x.reshape(bs, ctx.groups, cin, x.shape[2], x.shape[3])
|
||||
tx = np.lib.stride_tricks.as_strided(
|
||||
gx,
|
||||
shape=(bs, ctx.groups, cin, oy, ox, H, W),
|
||||
strides=(*gx.strides[0:3], gx.strides[3]*ys, gx.strides[4]*xs, *gx.strides[3:5]),
|
||||
strides=(
|
||||
*gx.strides[0:3],
|
||||
gx.strides[3] * ys,
|
||||
gx.strides[4] * xs,
|
||||
*gx.strides[3:5],
|
||||
),
|
||||
writeable=False,
|
||||
)
|
||||
tw = w.reshape(ctx.groups, rcout, cin, H, W)
|
||||
ctx.save_for_backward(tx, tw, x.shape)
|
||||
|
||||
ret = np.zeros((bs,ctx.groups,oy,ox,rcout),dtype=x.dtype)
|
||||
ret = np.zeros((bs, ctx.groups, oy, ox, rcout), dtype=x.dtype)
|
||||
for g in range(ctx.groups):
|
||||
#ijYXyx,kjyx -> iYXk ->ikYX
|
||||
ret[:,g] += np.tensordot(tx[:,g], tw[g], ((1,4,5),(1,2,3)))
|
||||
return np.moveaxis(ret,4,2).reshape(bs, cout, oy, ox)
|
||||
# ijYXyx,kjyx -> iYXk ->ikYX
|
||||
ret[:, g] += np.tensordot(tx[:, g], tw[g], ((1, 4, 5), (1, 2, 3)))
|
||||
return np.moveaxis(ret, 4, 2).reshape(bs, cout, oy, ox)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
bs,_,oy,ox = grad_output.shape
|
||||
bs, _, oy, ox = grad_output.shape
|
||||
tx, tw, x_shape = ctx.saved_tensors
|
||||
_,rcout,cin,H,W = tw.shape
|
||||
ys,xs = ctx.stride
|
||||
OY,OX = x_shape[2:4]
|
||||
_, rcout, cin, H, W = tw.shape
|
||||
ys, xs = ctx.stride
|
||||
OY, OX = x_shape[2:4]
|
||||
|
||||
ggg = grad_output.reshape(bs,ctx.groups,rcout,oy,ox)
|
||||
ggg = grad_output.reshape(bs, ctx.groups, rcout, oy, ox)
|
||||
|
||||
gdw = np.zeros((ctx.groups,rcout,cin,H,W), dtype=tx.dtype)
|
||||
gdw = np.zeros((ctx.groups, rcout, cin, H, W), dtype=tx.dtype)
|
||||
for g in range(ctx.groups):
|
||||
#'ikYX,ijYXyx -> kjyx'
|
||||
gdw[g] += np.tensordot(ggg[:,g], tx[:,g], ((0,2,3),(0,2,3)))
|
||||
gdw[g] += np.tensordot(ggg[:, g], tx[:, g], ((0, 2, 3), (0, 2, 3)))
|
||||
|
||||
# needs to be optimized
|
||||
gdx = np.zeros((bs,ctx.groups,cin,OY,OX), dtype=tx.dtype)
|
||||
for k in range(oy*ox):
|
||||
Y, X = k//ox, k%ox
|
||||
iY,iX = Y*ys, X*xs
|
||||
#gdx[:,:,: , iY:iY+H, iX:iX+W] += np.einsum('igk,gkjyx->igjyx', ggg[:,:,:,Y,X], tw)
|
||||
gdx = np.zeros((bs, ctx.groups, cin, OY, OX), dtype=tx.dtype)
|
||||
for k in range(oy * ox):
|
||||
Y, X = k // ox, k % ox
|
||||
iY, iX = Y * ys, X * xs
|
||||
# gdx[:,:,: , iY:iY+H, iX:iX+W] += np.einsum('igk,gkjyx->igjyx', ggg[:,:,:,Y,X], tw)
|
||||
for g in range(ctx.groups):
|
||||
tg = np.dot(ggg[:,g,:,Y,X].reshape(bs, -1), tw[g].reshape(rcout, -1))
|
||||
gdx[:, g, :, iY:iY+H, iX:iX+W] += tg.reshape((bs, cin, H, W))
|
||||
tg = np.dot(ggg[:, g, :, Y, X].reshape(bs, -1), tw[g].reshape(rcout, -1))
|
||||
gdx[:, g, :, iY : iY + H, iX : iX + W] += tg.reshape((bs, cin, H, W))
|
||||
|
||||
return gdx.reshape((bs, ctx.groups*cin, OY, OX)), gdw.reshape((ctx.groups*rcout, cin, H, W))
|
||||
register('conv2d', Conv2D)
|
||||
return gdx.reshape((bs, ctx.groups * cin, OY, OX)), gdw.reshape(
|
||||
(ctx.groups * rcout, cin, H, W)
|
||||
)
|
||||
|
||||
|
||||
register("conv2d", Conv2D)
|
||||
|
||||
|
||||
# ************* pooling ops *************
|
||||
|
||||
|
||||
def stack_for_pool(x, py, px):
|
||||
my, mx = (x.shape[2]//py)*py, (x.shape[3]//px)*px
|
||||
my, mx = (x.shape[2] // py) * py, (x.shape[3] // px) * px
|
||||
xup = x[:, :, :my, :mx]
|
||||
stack = [xup[:, :, k//px::py, k%px::px][None] for k in range(py*px)]
|
||||
stack = [xup[:, :, k // px :: py, k % px :: px][None] for k in range(py * px)]
|
||||
return np.concatenate(stack, axis=0)
|
||||
|
||||
|
||||
def unstack_for_pool(fxn, s, py, px):
|
||||
my, mx = (s[2]//py)*py, (s[3]//px)*px
|
||||
for k in range(py*px):
|
||||
Y, X = k//px, k%px
|
||||
ll = fxn(Y*px+X)
|
||||
my, mx = (s[2] // py) * py, (s[3] // px) * px
|
||||
for k in range(py * px):
|
||||
Y, X = k // px, k % px
|
||||
ll = fxn(Y * px + X)
|
||||
if X == 0 and Y == 0:
|
||||
ret = np.zeros(s, dtype=ll.dtype)
|
||||
ret[:, :, Y:my:py, X:mx:px] = ll
|
||||
return ret
|
||||
|
||||
|
||||
class MaxPool2D(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, kernel_size=(2, 2)):
|
||||
@@ -251,9 +306,14 @@ class MaxPool2D(Function):
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
idxs,s = ctx.saved_tensors
|
||||
return unstack_for_pool(lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size)
|
||||
register('max_pool2d', MaxPool2D)
|
||||
idxs, s = ctx.saved_tensors
|
||||
return unstack_for_pool(
|
||||
lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size
|
||||
)
|
||||
|
||||
|
||||
register("max_pool2d", MaxPool2D)
|
||||
|
||||
|
||||
class AvgPool2D(Function):
|
||||
@staticmethod
|
||||
@@ -264,8 +324,9 @@ class AvgPool2D(Function):
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
s, = ctx.saved_tensors
|
||||
(s,) = ctx.saved_tensors
|
||||
py, px = ctx.kernel_size
|
||||
return unstack_for_pool(lambda idx: grad_output/py/px, s, py, px)
|
||||
register('avg_pool2d', AvgPool2D)
|
||||
return unstack_for_pool(lambda idx: grad_output / py / px, s, py, px)
|
||||
|
||||
|
||||
register("avg_pool2d", AvgPool2D)
|
||||
|
||||
@@ -3,131 +3,229 @@ from .tensor import Function, register, GPUBuffer, Tensor, Device
|
||||
import pyopencl as cl
|
||||
import functools
|
||||
|
||||
|
||||
def buffer_new(ctx, shape, zero=False):
|
||||
return GPUBuffer(shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32))
|
||||
return GPUBuffer(
|
||||
shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32)
|
||||
)
|
||||
|
||||
|
||||
@functools.lru_cache()
|
||||
def clbuild(cl_ctx, name, prg):
|
||||
return cl.Program(cl_ctx, prg).build().__getattr__(name)
|
||||
|
||||
|
||||
def uint2(x, y):
|
||||
return np.array((x,y), dtype=cl.cltypes.uint2)
|
||||
return np.array((x, y), dtype=cl.cltypes.uint2)
|
||||
|
||||
|
||||
i32 = np.int32
|
||||
|
||||
def subsample_op(ctx, input, kernel_size, stride, iter_op, result_op, decls=''):
|
||||
|
||||
def subsample_op(ctx, input, kernel_size, stride, iter_op, result_op, decls=""):
|
||||
py, px = stride
|
||||
N, C, Yin, Xin = input.shape
|
||||
Yout, Xout = (Yin-kernel_size[0])//py+1, (Xin-kernel_size[1])//px+1
|
||||
Yout, Xout = (Yin - kernel_size[0]) // py + 1, (Xin - kernel_size[1]) // px + 1
|
||||
ret = buffer_new(ctx, (N, C, Yout, Xout), zero=True)
|
||||
subsample = clbuild(ctx.cl_ctx, "subsample", """
|
||||
subsample = clbuild(
|
||||
ctx.cl_ctx,
|
||||
"subsample",
|
||||
"""
|
||||
__kernel void subsample(__global float *output, __global const float *input, uint2 osize, uint2 isize,
|
||||
uint2 ksz, uint2 stride) {
|
||||
int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
|
||||
int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
|
||||
"""+decls+""";
|
||||
"""
|
||||
+ decls
|
||||
+ """;
|
||||
for (uint j=0; j<ksz.y; ++j) {
|
||||
for (uint i=0; i<ksz.x; ++i) {
|
||||
int iid = (gid.x*stride.x+i) + isize.x*((gid.y*stride.y+j) + isize.y*gid.z);
|
||||
if (gid.x*stride.x+i < isize.x && gid.y*stride.y+j < isize.y) {
|
||||
"""+iter_op+""";
|
||||
"""
|
||||
+ iter_op
|
||||
+ """;
|
||||
}
|
||||
}
|
||||
}
|
||||
output[oid] = """+result_op+""";
|
||||
}""")
|
||||
subsample(ctx.cl_queue, (N*C, Yout, Xout), None,
|
||||
ret.cl, input.cl, uint2(Xout, Yout), uint2(Xin, Yin),
|
||||
uint2(*kernel_size[::-1]), uint2(px, py))
|
||||
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
|
||||
output[oid] = """
|
||||
+ result_op
|
||||
+ """;
|
||||
}""",
|
||||
)
|
||||
subsample(
|
||||
ctx.cl_queue,
|
||||
(N * C, Yout, Xout),
|
||||
None,
|
||||
ret.cl,
|
||||
input.cl,
|
||||
uint2(Xout, Yout),
|
||||
uint2(Xin, Yin),
|
||||
uint2(*kernel_size[::-1]),
|
||||
uint2(px, py),
|
||||
)
|
||||
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
|
||||
return ret
|
||||
|
||||
def supersample_op(ctx, input, out_shape, kernel_size, result_op, decls='', input2=None):
|
||||
|
||||
def supersample_op(
|
||||
ctx, input, out_shape, kernel_size, result_op, decls="", input2=None
|
||||
):
|
||||
(N, C, Yin, Xin), (Yout, Xout) = input.shape, out_shape[2:]
|
||||
py,px = kernel_size
|
||||
py, px = kernel_size
|
||||
ret = buffer_new(ctx, out_shape, zero=True)
|
||||
supsample = clbuild(ctx.cl_ctx, "supsample", """
|
||||
supsample = clbuild(
|
||||
ctx.cl_ctx,
|
||||
"supsample",
|
||||
"""
|
||||
__kernel void supsample(__global float *output, __global const float *input, __global const void *input2,
|
||||
uint2 osize, uint2 isize, uint2 ksz) {
|
||||
int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
|
||||
int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
|
||||
int iid = (gid.x/ksz.x) + isize.x*((gid.y/ksz.y) + isize.y*gid.z);
|
||||
"""+decls+""";
|
||||
"""
|
||||
+ decls
|
||||
+ """;
|
||||
if (gid.x/ksz.x < isize.x && gid.y/ksz.y < isize.y) {
|
||||
output[oid] = """+result_op+""";
|
||||
output[oid] = """
|
||||
+ result_op
|
||||
+ """;
|
||||
}
|
||||
}""")
|
||||
supsample(ctx.cl_queue, (N*C, Yout, Xout), None,
|
||||
ret.cl, input.cl, input2.cl if input2 is not None else input2,
|
||||
uint2(Xout, Yout), uint2(Xin, Yin), uint2(px, py))
|
||||
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
|
||||
}""",
|
||||
)
|
||||
supsample(
|
||||
ctx.cl_queue,
|
||||
(N * C, Yout, Xout),
|
||||
None,
|
||||
ret.cl,
|
||||
input.cl,
|
||||
input2.cl if input2 is not None else input2,
|
||||
uint2(Xout, Yout),
|
||||
uint2(Xin, Yin),
|
||||
uint2(px, py),
|
||||
)
|
||||
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
|
||||
return ret
|
||||
|
||||
|
||||
@functools.lru_cache()
|
||||
def get_binop_prg(cl_ctx, code, complist):
|
||||
ndims = len(complist)
|
||||
args = "".join([", int d%d" % i for i in range(ndims)]) + "".join([", int p%d" % i for i in range(ndims-1)])
|
||||
compute_idx_rets = ["\n int idx_ret"+str(i)+" = (gid0 / "+("p%d"%i if i < ndims-1 else "1")+") % d"+str(i)+";" for i in range(ndims)]
|
||||
|
||||
idx_exprs = ["0", "0"] # [idx_x, idx_y]
|
||||
args = "".join([", int d%d" % i for i in range(ndims)]) + "".join(
|
||||
[", int p%d" % i for i in range(ndims - 1)]
|
||||
)
|
||||
compute_idx_rets = [
|
||||
"\n int idx_ret"
|
||||
+ str(i)
|
||||
+ " = (gid0 / "
|
||||
+ ("p%d" % i if i < ndims - 1 else "1")
|
||||
+ ") % d"
|
||||
+ str(i)
|
||||
+ ";"
|
||||
for i in range(ndims)
|
||||
]
|
||||
|
||||
idx_exprs = ["0", "0"] # [idx_x, idx_y]
|
||||
for i in range(ndims):
|
||||
for j in range(2):
|
||||
if complist[i][j]:
|
||||
idx_exprs[j] = "idx_ret%d + d%d*(%s)" % (i, i, idx_exprs[j])
|
||||
|
||||
return cl.Program(cl_ctx, """__kernel void binop(__global const float *x_g, __global const float *y_g, __global float *res_g"""+args+""") {
|
||||
int gid0 = get_global_id(0);"""+"".join(compute_idx_rets)+"""
|
||||
float a = x_g["""+idx_exprs[0]+"""];
|
||||
float b = y_g["""+idx_exprs[1]+"""];
|
||||
res_g[gid0] = """+code+""";\n}""").build()
|
||||
|
||||
return cl.Program(
|
||||
cl_ctx,
|
||||
"""__kernel void binop(__global const float *x_g, __global const float *y_g, __global float *res_g"""
|
||||
+ args
|
||||
+ """) {
|
||||
int gid0 = get_global_id(0);"""
|
||||
+ "".join(compute_idx_rets)
|
||||
+ """
|
||||
float a = x_g["""
|
||||
+ idx_exprs[0]
|
||||
+ """];
|
||||
float b = y_g["""
|
||||
+ idx_exprs[1]
|
||||
+ """];
|
||||
res_g[gid0] = """
|
||||
+ code
|
||||
+ """;\n}""",
|
||||
).build()
|
||||
|
||||
|
||||
def binary_op(ctx, code, x, y):
|
||||
n_dims = max(len(x.shape), len(y.shape))
|
||||
shape_x, shape_y = np.ones(n_dims, dtype=np.int32), np.ones(n_dims, dtype=np.int32)
|
||||
shape_x[:len(x.shape)] = np.array(x.shape, dtype=np.int32)
|
||||
shape_y[:len(y.shape)] = np.array(y.shape, dtype=np.int32)
|
||||
shape_x[: len(x.shape)] = np.array(x.shape, dtype=np.int32)
|
||||
shape_y[: len(y.shape)] = np.array(y.shape, dtype=np.int32)
|
||||
if not np.all((shape_x == 1) | (shape_y == 1) | (shape_x == shape_y)):
|
||||
raise Exception(f"binary op unbroadcastable shape mismatch: {x.shape} vs {y.shape}")
|
||||
shape_ret = np.maximum(shape_x, shape_y)
|
||||
|
||||
dimlist, complist = [], [] # note: len(dimlist) may be less than n_dims
|
||||
|
||||
dimlist, complist = [], [] # note: len(dimlist) may be less than n_dims
|
||||
|
||||
def push(dim, comp):
|
||||
if len(complist) > 0 and complist[-1] == comp:
|
||||
dimlist[-1] *= dim
|
||||
elif comp != (False, False):
|
||||
dimlist.append(dim); complist.append(comp)
|
||||
for i in range(n_dims): # group together any adjacent dimensions that we can to simplify broadcasting
|
||||
dimlist.append(dim)
|
||||
complist.append(comp)
|
||||
|
||||
for i in range(
|
||||
n_dims
|
||||
): # group together any adjacent dimensions that we can to simplify broadcasting
|
||||
push(i32(max(shape_x[i], shape_y[i])), (shape_x[i] > 1, shape_y[i] > 1))
|
||||
|
||||
|
||||
prg = get_binop_prg(ctx.cl_ctx, code, tuple(complist))
|
||||
ret = buffer_new(ctx, shape_ret, zero=True)
|
||||
prod_list = np.array(dimlist, dtype=i32)[-1::-1].cumprod(dtype=i32)[-1::-1] # take cumprod from back to front
|
||||
prg.binop(ctx.cl_queue, [prod_list[0]] if len(dimlist) > 0 else [1], None, x.cl, y.cl, ret.cl, *dimlist, *(prod_list[1:]))
|
||||
prod_list = np.array(dimlist, dtype=i32)[-1::-1].cumprod(dtype=i32)[
|
||||
-1::-1
|
||||
] # take cumprod from back to front
|
||||
prg.binop(
|
||||
ctx.cl_queue,
|
||||
[prod_list[0]] if len(dimlist) > 0 else [1],
|
||||
None,
|
||||
x.cl,
|
||||
y.cl,
|
||||
ret.cl,
|
||||
*dimlist,
|
||||
*(prod_list[1:]),
|
||||
)
|
||||
return ret
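# --- illustration, not part of the diff: a NumPy-only sketch of the dimension grouping
# --- that push() above performs. collapse_broadcast_dims is a made-up name for the
# --- example; shapes are right-padded with 1s the same way binary_op pads shape_x/shape_y.
import numpy as np

def collapse_broadcast_dims(shape_x, shape_y):
  n = max(len(shape_x), len(shape_y))
  sx = tuple(shape_x) + (1,) * (n - len(shape_x))
  sy = tuple(shape_y) + (1,) * (n - len(shape_y))
  dimlist, complist = [], []
  for dx, dy in zip(sx, sy):
    comp = (dx > 1, dy > 1)          # which operand actually varies along this axis
    if complist and complist[-1] == comp:
      dimlist[-1] *= max(dx, dy)     # same pattern as the previous axis: merge them
    elif comp != (False, False):     # both size 1: the axis contributes nothing
      dimlist.append(max(dx, dy))
      complist.append(comp)
  return dimlist, complist

# (2,3,1) vs (2,3,4): the first two axes share the (True, True) pattern and collapse
# into a single axis of length 6, so the kernel only needs two index terms.
print(collapse_broadcast_dims((2, 3, 1), (2, 3, 4)))  # ([6, 4], [(True, True), (False, True)])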
|
||||
|
||||
|
||||
def unary_op(ctx, code, x):
  ret = buffer_new(ctx, x.shape)
  unop = clbuild(ctx.cl_ctx, "unop", """
  unop = clbuild(
    ctx.cl_ctx,
    "unop",
    """
  __kernel void unop(__global const float *a_g, __global float *res_g) {
    int gid = get_global_id(0);
    float a = a_g[gid];
    res_g[gid] = """+code+""";
  }""")
    res_g[gid] = """
    + code
    + """;
  }""",
  )
  unop(ctx.cl_queue, [np.prod(ret.shape)], None, x.cl, ret.cl)
  return ret
|
||||
|
||||
|
||||
def reduce_op(ctx, code, code2, inp, axis=None):
|
||||
if axis is None:
|
||||
# full reduce
|
||||
osize = [1]*len(inp.shape)
|
||||
osize = [1] * len(inp.shape)
|
||||
else:
|
||||
osize = np.array(inp.shape)
|
||||
osize[list(axis)] = 1
|
||||
osize[list(axis)] = 1
|
||||
ret = buffer_new(ctx, osize)
|
||||
if axis is None:
|
||||
ret.shape = (1,)
|
||||
|
||||
# TODO: this is insanely slow
|
||||
reduce = clbuild(ctx.cl_ctx, "reduce", """
|
||||
reduce = clbuild(
|
||||
ctx.cl_ctx,
|
||||
"reduce",
|
||||
"""
|
||||
__kernel void reduce(__global const float *a_g, int sz, __global float *res_g, int prod, int n_dims,
|
||||
__global const int *shape_x, __global const int *shape_ret) {
|
||||
int gid = get_global_id(0);
|
||||
@@ -148,80 +246,129 @@ def reduce_op(ctx, code, code2, inp, axis=None):
|
||||
}
|
||||
}
|
||||
float a = a_g[idx];
|
||||
"""+code+""";
|
||||
"""
|
||||
+ code
|
||||
+ """;
|
||||
}
|
||||
res_g[gid] = """+code2+""";
|
||||
}""")
|
||||
buffer_np = lambda x: cl.Buffer(ctx.cl_ctx,
|
||||
cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x)
|
||||
reduce(ctx.cl_queue, [np.prod(osize)], None, inp.cl,
|
||||
i32(np.prod(inp.shape)//np.prod(osize)), ret.cl,
|
||||
i32(np.prod(osize)), i32(len(osize)),
|
||||
res_g[gid] = """
|
||||
+ code2
|
||||
+ """;
|
||||
}""",
|
||||
)
|
||||
buffer_np = lambda x: cl.Buffer(
|
||||
ctx.cl_ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x
|
||||
)
|
||||
reduce(
|
||||
ctx.cl_queue,
|
||||
[np.prod(osize)],
|
||||
None,
|
||||
inp.cl,
|
||||
i32(np.prod(inp.shape) // np.prod(osize)),
|
||||
ret.cl,
|
||||
i32(np.prod(osize)),
|
||||
i32(len(osize)),
|
||||
buffer_np(np.array(inp.shape, dtype=np.int32)),
|
||||
buffer_np(np.array(osize, dtype=np.int32)))
|
||||
buffer_np(np.array(osize, dtype=np.int32)),
|
||||
)
|
||||
return ret
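# --- illustration, not part of the diff: what the generated reduce kernel computes for
# --- the common (code="out += a", code2="out") case, written with plain NumPy.
import numpy as np

x = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
out = x.sum(axis=1, keepdims=True)   # "out += a" accumulated over the reduced axis
print(out.shape)                     # (2, 1, 4): reduced axes stay as size 1, like osize
full = x.sum()                       # axis=None: the full reduce; ret.shape becomes (1,)
print(full)                          # 276.0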
|
||||
|
||||
|
||||
def unbroadcast(ctx, out, in_sh):
|
||||
sum_axis = [i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1] if in_sh != (1,) else None
|
||||
sum_axis = (
|
||||
[i for i in range(len(in_sh)) if in_sh[i] == 1 and out.shape[i] > 1]
|
||||
if in_sh != (1,)
|
||||
else None
|
||||
)
|
||||
return reduce_op(ctx, "out += a", "out", out, sum_axis)
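# --- illustration, not part of the diff: unbroadcast() sums a gradient back down to the
# --- shape an operand had before broadcasting; unbroadcast_np is a NumPy-only stand-in.
import numpy as np

def unbroadcast_np(grad, in_shape):
  if in_shape == (1,):
    return np.array([grad.sum()])
  axes = tuple(i for i, s in enumerate(in_shape) if s == 1 and grad.shape[i] > 1)
  return grad.sum(axis=axes, keepdims=True)

g = np.ones((2, 3, 4), dtype=np.float32)
print(unbroadcast_np(g, (2, 1, 4)).shape)  # (2, 1, 4): axis 1 was broadcast, so it is summed
print(unbroadcast_np(g, (1,)))             # [24.]: scalar-like operands get a full reduce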
|
||||
|
||||
|
||||
# ***** now for the ops themselves *****
|
||||
|
||||
|
||||
class Add(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
ctx.save_for_backward(x.shape, y.shape)
|
||||
return binary_op(ctx, 'a+b', x, y)
|
||||
return binary_op(ctx, "a+b", x, y)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
grad_x, grad_y = grad_output, grad_output
|
||||
shape_x, shape_y = ctx.saved_tensors
|
||||
return unbroadcast(ctx, grad_x, shape_x), unbroadcast(ctx, grad_y, shape_y),
|
||||
register('add', Add, device=Device.GPU)
|
||||
return (
|
||||
unbroadcast(ctx, grad_x, shape_x),
|
||||
unbroadcast(ctx, grad_y, shape_y),
|
||||
)
|
||||
|
||||
|
||||
register("add", Add, device=Device.GPU)
|
||||
|
||||
|
||||
class Sub(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
ctx.save_for_backward(x.shape, y.shape)
|
||||
return binary_op(ctx, 'a-b', x, y)
|
||||
return binary_op(ctx, "a-b", x, y)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
grad_x, grad_y = grad_output, unary_op(ctx, '-a', grad_output)
|
||||
grad_x, grad_y = grad_output, unary_op(ctx, "-a", grad_output)
|
||||
shape_x, shape_y = ctx.saved_tensors
|
||||
return unbroadcast(ctx, grad_x, shape_x), unbroadcast(ctx, grad_y, shape_y),
|
||||
register('sub', Sub, device=Device.GPU)
|
||||
return (
|
||||
unbroadcast(ctx, grad_x, shape_x),
|
||||
unbroadcast(ctx, grad_y, shape_y),
|
||||
)
|
||||
|
||||
|
||||
register("sub", Sub, device=Device.GPU)
|
||||
|
||||
|
||||
class Mul(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
ctx.save_for_backward(x, y)
|
||||
return binary_op(ctx, 'a*b', x, y)
|
||||
return binary_op(ctx, "a*b", x, y)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
x,y = ctx.saved_tensors
|
||||
grad_x = binary_op(ctx, 'a*b', y, grad_output)
|
||||
grad_y = binary_op(ctx, 'a*b', x, grad_output)
|
||||
return unbroadcast(ctx, grad_x, x.shape), unbroadcast(ctx, grad_y, y.shape),
|
||||
register('mul', Mul, device=Device.GPU)
|
||||
x, y = ctx.saved_tensors
|
||||
grad_x = binary_op(ctx, "a*b", y, grad_output)
|
||||
grad_y = binary_op(ctx, "a*b", x, grad_output)
|
||||
return (
|
||||
unbroadcast(ctx, grad_x, x.shape),
|
||||
unbroadcast(ctx, grad_y, y.shape),
|
||||
)
|
||||
|
||||
|
||||
register("mul", Mul, device=Device.GPU)
|
||||
|
||||
|
||||
class Pow(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, y):
|
||||
ctx.save_for_backward(x, y)
|
||||
return binary_op(ctx, 'pow(a,b)', x, y)
|
||||
return binary_op(ctx, "pow(a,b)", x, y)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
x,y = ctx.saved_tensors
|
||||
grad_x = binary_op(ctx, 'a*b', grad_output,
|
||||
binary_op(ctx, 'b * (pow((float)a, (float)(b-1.0)))', x, y))
|
||||
grad_y = binary_op(ctx, 'a*b', grad_output,
|
||||
binary_op(ctx, 'pow(a, (float)b) * log(a);', x, y))
|
||||
return unbroadcast(ctx, grad_x, x.shape), unbroadcast(ctx, grad_y, y.shape),
|
||||
register('pow', Pow, device=Device.GPU)
|
||||
x, y = ctx.saved_tensors
|
||||
grad_x = binary_op(
|
||||
ctx,
|
||||
"a*b",
|
||||
grad_output,
|
||||
binary_op(ctx, "b * (pow((float)a, (float)(b-1.0)))", x, y),
|
||||
)
|
||||
grad_y = binary_op(
|
||||
ctx, "a*b", grad_output, binary_op(ctx, "pow(a, (float)b) * log(a);", x, y)
|
||||
)
|
||||
return (
|
||||
unbroadcast(ctx, grad_x, x.shape),
|
||||
unbroadcast(ctx, grad_y, y.shape),
|
||||
)
|
||||
|
||||
|
||||
register("pow", Pow, device=Device.GPU)
|
||||
|
||||
|
||||
class Sum(Function):
|
||||
@staticmethod
|
||||
@@ -229,16 +376,24 @@ class Sum(Function):
|
||||
ctx.save_for_backward(input, axis)
|
||||
ret = reduce_op(ctx, "out += a", "out", input, axis=axis)
|
||||
if axis is not None:
|
||||
ret.shape = tuple([input.shape[i] for i in range(len(input.shape)) if i not in axis])
|
||||
ret.shape = tuple(
|
||||
[input.shape[i] for i in range(len(input.shape)) if i not in axis]
|
||||
)
|
||||
return ret
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
input, axis = ctx.saved_tensors
|
||||
shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
|
||||
shape = [
|
||||
1 if axis is None or i in axis else input.shape[i]
|
||||
for i in range(len(input.shape))
|
||||
]
|
||||
output = GPUBuffer(shape, hostbuf=grad_output)
|
||||
return binary_op(ctx, 'a+b', output, buffer_new(ctx, input.shape, zero=True))
|
||||
register('sum', Sum, device=Device.GPU)
|
||||
return binary_op(ctx, "a+b", output, buffer_new(ctx, input.shape, zero=True))
|
||||
|
||||
|
||||
register("sum", Sum, device=Device.GPU)
|
||||
|
||||
|
||||
class Dot(Function):
|
||||
@staticmethod
|
||||
@@ -247,7 +402,10 @@ class Dot(Function):
|
||||
isize, msize, osize = i32(input.shape[0]), i32(input.shape[1]), i32(weight.shape[1])
|
||||
ret = buffer_new(ctx, (isize, osize))
|
||||
|
||||
matmul = clbuild(ctx.cl_ctx, "matmul", """
|
||||
matmul = clbuild(
|
||||
ctx.cl_ctx,
|
||||
"matmul",
|
||||
"""
|
||||
__kernel void matmul(
|
||||
__global const float *input, __global const float *weight, __global float *res,
|
||||
int is0, int is1, int msize, int ws0, int ws1, int osize
|
||||
@@ -261,13 +419,25 @@ class Dot(Function):
|
||||
}
|
||||
|
||||
res[X * osize + Y] = ret;
|
||||
}""")
|
||||
}""",
|
||||
)
|
||||
ctx.save_for_backward(input, weight, matmul)
|
||||
|
||||
# (isize,msize) x (msize,osize) = (isize,osize)
|
||||
matmul(ctx.cl_queue, [isize, osize], None,
|
||||
input.cl, weight.cl, ret.cl,
|
||||
msize, i32(1), msize, i32(1), osize, osize)
|
||||
matmul(
|
||||
ctx.cl_queue,
|
||||
[isize, osize],
|
||||
None,
|
||||
input.cl,
|
||||
weight.cl,
|
||||
ret.cl,
|
||||
msize,
|
||||
i32(1),
|
||||
msize,
|
||||
i32(1),
|
||||
osize,
|
||||
osize,
|
||||
)
|
||||
return ret
|
||||
|
||||
@staticmethod
|
||||
@@ -279,28 +449,56 @@ class Dot(Function):
|
||||
grad_weight = buffer_new(ctx, weight.shape)
|
||||
|
||||
# (isize,osize) x (msize,osize) = (isize,msize)
|
||||
matmul(ctx.cl_queue, [isize, msize], None,
|
||||
grad_output.cl, weight.cl, grad_input.cl,
|
||||
osize, i32(1), osize, osize, i32(1), msize)
|
||||
matmul(
|
||||
ctx.cl_queue,
|
||||
[isize, msize],
|
||||
None,
|
||||
grad_output.cl,
|
||||
weight.cl,
|
||||
grad_input.cl,
|
||||
osize,
|
||||
i32(1),
|
||||
osize,
|
||||
osize,
|
||||
i32(1),
|
||||
msize,
|
||||
)
|
||||
|
||||
# (isize,msize) x (isize,osize) = (msize,osize)
|
||||
matmul(ctx.cl_queue, [msize, osize], None,
|
||||
input.cl, grad_output.cl, grad_weight.cl,
|
||||
i32(1), msize, isize, i32(1), osize, osize)
|
||||
matmul(
|
||||
ctx.cl_queue,
|
||||
[msize, osize],
|
||||
None,
|
||||
input.cl,
|
||||
grad_output.cl,
|
||||
grad_weight.cl,
|
||||
i32(1),
|
||||
msize,
|
||||
isize,
|
||||
i32(1),
|
||||
osize,
|
||||
osize,
|
||||
)
|
||||
|
||||
return grad_input, grad_weight
|
||||
register('dot', Dot, device=Device.GPU)
|
||||
|
||||
|
||||
register("dot", Dot, device=Device.GPU)
|
||||
|
||||
# ************* simple ops *************
|
||||
|
||||
|
||||
class Pad2D(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, padding=None):
|
||||
bs,cin,iy,ix = x.shape
|
||||
oy,ox = iy+padding[2]+padding[3], ix+padding[0]+padding[1]
|
||||
bs, cin, iy, ix = x.shape
|
||||
oy, ox = iy + padding[2] + padding[3], ix + padding[0] + padding[1]
|
||||
ret = buffer_new(ctx, (bs, cin, oy, ox), zero=True)
|
||||
|
||||
pad2d = clbuild(ctx.cl_ctx, "pad2d", """
|
||||
pad2d = clbuild(
|
||||
ctx.cl_ctx,
|
||||
"pad2d",
|
||||
"""
|
||||
__kernel void pad2d(__global const float *input, __global float *output,
|
||||
int ipx, int ipy, int py, int px, int oy, int ox, int iy, int ix) {
|
||||
int BC = get_global_id(0);
|
||||
@@ -311,13 +509,24 @@ class Pad2D(Function):
|
||||
int optr = BC*oy*ox + (Y+py)*ox + px + X;
|
||||
|
||||
output[optr] = input[iptr];
|
||||
}""")
|
||||
}""",
|
||||
)
|
||||
ctx.save_for_backward(padding, pad2d)
|
||||
pad2d(ctx.cl_queue, [bs*cin, iy, ix], None,
|
||||
x.cl, ret.cl,
|
||||
i32(0), i32(0), i32(padding[2]), i32(padding[0]),
|
||||
i32(oy), i32(ox), i32(iy), i32(ix)
|
||||
)
|
||||
pad2d(
|
||||
ctx.cl_queue,
|
||||
[bs * cin, iy, ix],
|
||||
None,
|
||||
x.cl,
|
||||
ret.cl,
|
||||
i32(0),
|
||||
i32(0),
|
||||
i32(padding[2]),
|
||||
i32(padding[0]),
|
||||
i32(oy),
|
||||
i32(ox),
|
||||
i32(iy),
|
||||
i32(ix),
|
||||
)
|
||||
return ret
|
||||
|
||||
@staticmethod
|
||||
@@ -326,13 +535,26 @@ class Pad2D(Function):
|
||||
bs, cin, iy, ix = grad_output.shape
|
||||
oy, ox = iy - padding[2] - padding[3], ix - padding[0] - padding[1]
|
||||
ret = buffer_new(ctx, (bs, cin, oy, ox))
|
||||
pad2d(ctx.cl_queue, [bs*cin, oy, ox], None,
|
||||
grad_output.cl, ret.cl,
|
||||
i32(padding[2]), i32(padding[0]), i32(0), i32(0),
|
||||
i32(oy), i32(ox), i32(iy), i32(ix)
|
||||
)
|
||||
pad2d(
|
||||
ctx.cl_queue,
|
||||
[bs * cin, oy, ox],
|
||||
None,
|
||||
grad_output.cl,
|
||||
ret.cl,
|
||||
i32(padding[2]),
|
||||
i32(padding[0]),
|
||||
i32(0),
|
||||
i32(0),
|
||||
i32(oy),
|
||||
i32(ox),
|
||||
i32(iy),
|
||||
i32(ix),
|
||||
)
|
||||
return ret
|
||||
register('pad2d', Pad2D, device=Device.GPU)
|
||||
|
||||
|
||||
register("pad2d", Pad2D, device=Device.GPU)
|
||||
|
||||
|
||||
class Reshape(Function):
|
||||
@staticmethod
|
||||
@@ -345,105 +567,152 @@ class Reshape(Function):
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
in_shape, = ctx.saved_tensors
|
||||
(in_shape,) = ctx.saved_tensors
|
||||
return GPUBuffer(in_shape, hostbuf=grad_output)
|
||||
register('reshape', Reshape, device=Device.GPU)
|
||||
|
||||
|
||||
register("reshape", Reshape, device=Device.GPU)
|
||||
|
||||
# ************* activation ops *************
|
||||
|
||||
|
||||
class ReLU(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input):
|
||||
ctx.save_for_backward(input)
|
||||
return unary_op(ctx, 'max(a, (float)0.)', input)
|
||||
return unary_op(ctx, "max(a, (float)0.)", input)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
input, = ctx.saved_tensors
|
||||
return binary_op(ctx, 'a * (b >= 0)', grad_output, input)
|
||||
register('relu', ReLU, device=Device.GPU)
|
||||
(input,) = ctx.saved_tensors
|
||||
return binary_op(ctx, "a * (b >= 0)", grad_output, input)
|
||||
|
||||
|
||||
register("relu", ReLU, device=Device.GPU)
|
||||
|
||||
|
||||
class Sigmoid(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input):
|
||||
ret = unary_op(ctx, '1./(1+exp(-a))', input)
|
||||
ret = unary_op(ctx, "1./(1+exp(-a))", input)
|
||||
ctx.save_for_backward(ret)
|
||||
return ret
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
ret, = ctx.saved_tensors
|
||||
return binary_op(ctx, 'a * (b * (1 - b));', grad_output, ret)
|
||||
register('sigmoid', Sigmoid, device=Device.GPU)
|
||||
(ret,) = ctx.saved_tensors
|
||||
return binary_op(ctx, "a * (b * (1 - b));", grad_output, ret)
|
||||
|
||||
|
||||
register("sigmoid", Sigmoid, device=Device.GPU)
|
||||
|
||||
|
||||
class AvgPool2D(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input, kernel_size=(2, 2)):
|
||||
ret = subsample_op(ctx, input, kernel_size, kernel_size, iter_op="sumval += input[iid]",
|
||||
result_op="sumval / (ksz.x * ksz.y)", decls="float sumval=0.f")
|
||||
ret = subsample_op(
|
||||
ctx,
|
||||
input,
|
||||
kernel_size,
|
||||
kernel_size,
|
||||
iter_op="sumval += input[iid]",
|
||||
result_op="sumval / (ksz.x * ksz.y)",
|
||||
decls="float sumval=0.f",
|
||||
)
|
||||
ctx.save_for_backward(input.shape)
|
||||
return ret
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
orig_shape, = ctx.saved_tensors
|
||||
return supersample_op(ctx, grad_output, orig_shape, ctx.kernel_size,
|
||||
result_op="input[iid] / (ksz.x * ksz.y)")
|
||||
register('avg_pool2d', AvgPool2D, device=Device.GPU)
|
||||
(orig_shape,) = ctx.saved_tensors
|
||||
return supersample_op(
|
||||
ctx,
|
||||
grad_output,
|
||||
orig_shape,
|
||||
ctx.kernel_size,
|
||||
result_op="input[iid] / (ksz.x * ksz.y)",
|
||||
)
|
||||
|
||||
|
||||
register("avg_pool2d", AvgPool2D, device=Device.GPU)
|
||||
|
||||
|
||||
class MaxPool2D(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input, kernel_size=(2, 2)):
|
||||
idxs = subsample_op(ctx, input, kernel_size, kernel_size,
|
||||
idxs = subsample_op(
|
||||
ctx,
|
||||
input,
|
||||
kernel_size,
|
||||
kernel_size,
|
||||
iter_op="if (input[iid]>maxval) { maxval = input[iid]; maxidx = j * ksz.x + i; }",
|
||||
result_op="(float)maxidx", decls="float maxval=-FLT_MAX; int maxidx=0")
|
||||
result_op="(float)maxidx",
|
||||
decls="float maxval=-FLT_MAX; int maxidx=0",
|
||||
)
|
||||
ctx.save_for_backward(idxs, input.shape)
|
||||
return subsample_op(ctx, input, kernel_size, kernel_size,
|
||||
return subsample_op(
|
||||
ctx,
|
||||
input,
|
||||
kernel_size,
|
||||
kernel_size,
|
||||
iter_op="maxval = max(maxval, input[iid])",
|
||||
result_op="maxval", decls="float maxval = -FLT_MAX")
|
||||
result_op="maxval",
|
||||
decls="float maxval = -FLT_MAX",
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
idxs, orig_shape = ctx.saved_tensors
|
||||
return supersample_op(ctx, grad_output, orig_shape, ctx.kernel_size,
|
||||
return supersample_op(
|
||||
ctx,
|
||||
grad_output,
|
||||
orig_shape,
|
||||
ctx.kernel_size,
|
||||
result_op="(maxidx == kernidx) * input[iid]",
|
||||
decls="int maxidx=((__global float*)input2)[iid]; int kernidx=(gid.x%ksz.x) + ksz.x*(gid.y%ksz.y)",
|
||||
input2=idxs)
|
||||
register('max_pool2d', MaxPool2D, device=Device.GPU)
|
||||
input2=idxs,
|
||||
)
|
||||
|
||||
|
||||
register("max_pool2d", MaxPool2D, device=Device.GPU)
|
||||
|
||||
|
||||
class LogSoftmax(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input):
|
||||
# TODO: stability?
|
||||
lsum = reduce_op(ctx, "out += exp(a)", "log(out)", input, axis=[1])
|
||||
output = binary_op(ctx, 'a-b', input, lsum)
|
||||
output = binary_op(ctx, "a-b", input, lsum)
|
||||
ctx.save_for_backward(output)
|
||||
return output
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
output, = ctx.saved_tensors
|
||||
(output,) = ctx.saved_tensors
|
||||
lsum = reduce_op(ctx, "out += a", "out", grad_output, axis=[1])
|
||||
texp = binary_op(ctx, "exp(a) * b", output, lsum)
|
||||
return binary_op(ctx, "a - b", grad_output, texp)
|
||||
register('logsoftmax', LogSoftmax, device=Device.GPU)
|
||||
|
||||
|
||||
register("logsoftmax", LogSoftmax, device=Device.GPU)
|
||||
|
||||
# ************* conv ops *************
|
||||
|
||||
|
||||
class Conv2D(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, x, w, stride=1, groups=1):
|
||||
if type(ctx.stride) == int:
|
||||
ctx.stride = (ctx.stride, ctx.stride)
|
||||
cout,cin,H,W = w.shape
|
||||
ys,xs = ctx.stride
|
||||
bs,cin_,iy,ix = x.shape
|
||||
oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs
|
||||
assert cin*ctx.groups == cin_
|
||||
cout, cin, H, W = w.shape
|
||||
ys, xs = ctx.stride
|
||||
bs, cin_, iy, ix = x.shape
|
||||
oy, ox = (iy - (H - ys)) // ys, (ix - (W - xs)) // xs
|
||||
assert cin * ctx.groups == cin_
|
||||
assert cout % ctx.groups == 0
|
||||
rcout = cout//ctx.groups
|
||||
rcout = cout // ctx.groups
|
||||
|
||||
ctx.save_for_backward(x,w)
|
||||
ctx.save_for_backward(x, w)
|
||||
|
||||
# output buffer
|
||||
ret = buffer_new(ctx, (bs, cout, oy, ox))
|
||||
@@ -452,7 +721,10 @@ class Conv2D(Function):
|
||||
# weight = (groups, rcout, cin, H, W)
|
||||
# output = (bs, groups, rcout, oy, ox)
|
||||
|
||||
conv = clbuild(ctx.cl_ctx, "conv", """
|
||||
conv = clbuild(
|
||||
ctx.cl_ctx,
|
||||
"conv",
|
||||
"""
|
||||
__kernel void conv(__global const float *input, __global const float *weight, __global float *output,
|
||||
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs) {
|
||||
|
||||
@@ -475,26 +747,41 @@ class Conv2D(Function):
|
||||
}
|
||||
}
|
||||
output[B*groups*rcout*oy*ox + g*rcout*oy*ox + c*oy*ox + Y*ox + X] = acc;
|
||||
}""")
|
||||
}""",
|
||||
)
|
||||
|
||||
conv(ctx.cl_queue, [bs*groups*rcout, oy, ox], None,
|
||||
x.cl, w.cl, ret.cl,
|
||||
i32(H), i32(W), i32(groups), i32(rcout), i32(cin),
|
||||
i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs)
|
||||
conv(
|
||||
ctx.cl_queue,
|
||||
[bs * groups * rcout, oy, ox],
|
||||
None,
|
||||
x.cl,
|
||||
w.cl,
|
||||
ret.cl,
|
||||
i32(H),
|
||||
i32(W),
|
||||
i32(groups),
|
||||
i32(rcout),
|
||||
i32(cin),
|
||||
i32(oy),
|
||||
i32(ox),
|
||||
i32(iy),
|
||||
i32(ix),
|
||||
i32(ys),
|
||||
i32(xs),
|
||||
)
|
||||
return ret
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
bs,_,oy,ox = grad_output.shape
|
||||
bs, _, oy, ox = grad_output.shape
|
||||
x, w = ctx.saved_tensors
|
||||
cout,cin,H,W = w.shape
|
||||
ys,xs = ctx.stride
|
||||
bs,cin_,iy,ix = x.shape
|
||||
oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs
|
||||
assert cin*ctx.groups == cin_
|
||||
cout, cin, H, W = w.shape
|
||||
ys, xs = ctx.stride
|
||||
bs, cin_, iy, ix = x.shape
|
||||
oy, ox = (iy - (H - ys)) // ys, (ix - (W - xs)) // xs
|
||||
assert cin * ctx.groups == cin_
|
||||
assert cout % ctx.groups == 0
|
||||
rcout = cout//ctx.groups
|
||||
rcout = cout // ctx.groups
|
||||
|
||||
dx = buffer_new(ctx, (bs, cin_, iy, ix), zero=True)
|
||||
dw = buffer_new(ctx, (cout, cin, H, W))
|
||||
@@ -503,7 +790,10 @@ class Conv2D(Function):
|
||||
# tensw = (groups*rcout, cin, H, W)
|
||||
# ggg = (bs, groups*rout, oy, ox)
|
||||
|
||||
convw = clbuild(ctx.cl_ctx, "convw", """
|
||||
convw = clbuild(
|
||||
ctx.cl_ctx,
|
||||
"convw",
|
||||
"""
|
||||
__kernel void convw(__global const float *tensx, __global const float *ggg, __global float *dw,
|
||||
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) {
|
||||
|
||||
@@ -523,8 +813,12 @@ class Conv2D(Function):
|
||||
}
|
||||
}
|
||||
dw[get_global_id(0)*H*W + y*W + x] = acc;
|
||||
}""")
|
||||
convx = clbuild(ctx.cl_ctx, "convx", """
|
||||
}""",
|
||||
)
|
||||
convx = clbuild(
|
||||
ctx.cl_ctx,
|
||||
"convx",
|
||||
"""
|
||||
__kernel void convx(__global const float *tensw, __global const float *ggg, __global float *dx,
|
||||
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) {
|
||||
|
||||
@@ -547,10 +841,36 @@ class Conv2D(Function):
|
||||
}
|
||||
}
|
||||
}
|
||||
""")
|
||||
""",
|
||||
)
|
||||
|
||||
conv_args = i32(H), i32(W), i32(ctx.groups), i32(rcout), i32(cin), i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs), i32(bs)
|
||||
convw(ctx.cl_queue, [ctx.groups*rcout*cin, H, W], None, x.cl, grad_output.cl, dw.cl, *conv_args)
|
||||
convx(ctx.cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args)
|
||||
conv_args = (
|
||||
i32(H),
|
||||
i32(W),
|
||||
i32(ctx.groups),
|
||||
i32(rcout),
|
||||
i32(cin),
|
||||
i32(oy),
|
||||
i32(ox),
|
||||
i32(iy),
|
||||
i32(ix),
|
||||
i32(ys),
|
||||
i32(xs),
|
||||
i32(bs),
|
||||
)
|
||||
convw(
|
||||
ctx.cl_queue,
|
||||
[ctx.groups * rcout * cin, H, W],
|
||||
None,
|
||||
x.cl,
|
||||
grad_output.cl,
|
||||
dw.cl,
|
||||
*conv_args,
|
||||
)
|
||||
convx(
|
||||
ctx.cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args
|
||||
)
|
||||
return dx, dw
|
||||
register('conv2d', Conv2D, device=Device.GPU)
|
||||
|
||||
|
||||
register("conv2d", Conv2D, device=Device.GPU)
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
import numpy as np
|
||||
from tinygrad.tensor import Tensor
|
||||
|
||||
|
||||
class Optimizer:
|
||||
def __init__(self, params):
|
||||
self.params = [x for x in params if x.requires_grad == True]
|
||||
@@ -11,6 +12,7 @@ class Optimizer:
|
||||
for param in self.params:
|
||||
param.grad = None
|
||||
|
||||
|
||||
class SGD(Optimizer):
|
||||
def __init__(self, params, lr=0.001):
|
||||
super().__init__(params)
|
||||
@@ -20,30 +22,53 @@ class SGD(Optimizer):
|
||||
for t in self.params:
|
||||
t -= t.grad * self.lr
|
||||
|
||||
|
||||
class RMSprop(Optimizer):
|
||||
def __init__(self, params, lr=0.001, decay=0.9, eps=1e-8):
|
||||
super().__init__(params)
|
||||
self.lr, self.decay, self.eps = lr, decay, eps
|
||||
|
||||
self.v = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params]
|
||||
self.v = [
|
||||
Tensor(
|
||||
np.zeros(t.shape, dtype=np.float32),
|
||||
device=params[0].device,
|
||||
requires_grad=False,
|
||||
)
|
||||
for t in self.params
|
||||
]
|
||||
|
||||
def step(self):
|
||||
for i, t in enumerate(self.params):
|
||||
self.v[i] = self.decay * self.v[i] + (1.0 - self.decay) * t.grad * t.grad
|
||||
t -= (t.grad * self.lr).div(self.v[i].sqrt() + self.eps)
|
||||
|
||||
|
||||
class Adam(Optimizer):
|
||||
def __init__(self, params, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
|
||||
super().__init__(params)
|
||||
self.lr, self.b1, self.b2, self.eps, self.t = lr, b1, b2, eps, 0
|
||||
|
||||
self.m = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params]
|
||||
self.v = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params]
|
||||
self.m = [
|
||||
Tensor(
|
||||
np.zeros(t.shape, dtype=np.float32),
|
||||
device=params[0].device,
|
||||
requires_grad=False,
|
||||
)
|
||||
for t in self.params
|
||||
]
|
||||
self.v = [
|
||||
Tensor(
|
||||
np.zeros(t.shape, dtype=np.float32),
|
||||
device=params[0].device,
|
||||
requires_grad=False,
|
||||
)
|
||||
for t in self.params
|
||||
]
|
||||
|
||||
def step(self):
|
||||
self.t = self.t + 1
|
||||
a = self.lr * ((1.0 - self.b2**self.t)**0.5) / (1.0 - self.b1**self.t)
|
||||
for i,t in enumerate(self.params):
|
||||
a = self.lr * ((1.0 - self.b2 ** self.t) ** 0.5) / (1.0 - self.b1 ** self.t)
|
||||
for i, t in enumerate(self.params):
|
||||
self.m[i] = self.b1 * self.m[i] + (1.0 - self.b1) * t.grad
|
||||
self.v[i] = self.b2 * self.v[i] + (1.0 - self.b2) * t.grad * t.grad
|
||||
t -= a * self.m[i].div(self.v[i].sqrt() + self.eps)
|
||||
|
||||
@@ -10,32 +10,43 @@ from collections import defaultdict
|
||||
DEBUG = os.getenv("DEBUG", None) is not None
|
||||
if DEBUG:
|
||||
import atexit, time
|
||||
|
||||
debug_counts, debug_times = defaultdict(int), defaultdict(float)
|
||||
|
||||
def print_debug_exit():
|
||||
for name, _ in sorted(debug_times.items(), key=lambda x: -x[1]):
|
||||
print(f"{name:>20} : {debug_counts[name]:>6} {debug_times[name]:>10.2f} ms")
|
||||
|
||||
atexit.register(print_debug_exit)
|
||||
|
||||
|
||||
class ProfileOp:
|
||||
def __init__(self, name, x, backward=False):
|
||||
self.name = ("back_" if backward else "")+name
|
||||
self.name = ("back_" if backward else "") + name
|
||||
self.x = x
|
||||
|
||||
def __enter__(self):
|
||||
if DEBUG: self.st = time.time()
|
||||
if DEBUG:
|
||||
self.st = time.time()
|
||||
|
||||
def __exit__(self, *junk):
|
||||
if DEBUG:
|
||||
if cl_queue is not None:
|
||||
cl_queue.finish()
|
||||
et = (time.time()-self.st)*1000.
|
||||
et = (time.time() - self.st) * 1000.0
|
||||
debug_counts[self.name] += 1
|
||||
debug_times[self.name] += et
|
||||
print(f"{self.name:>20} : {et:>7.2f} ms {[y.shape for y in self.x]}")
|
||||
|
||||
|
||||
# **** GPU functions ****
|
||||
|
||||
cl_ctx, cl_queue = None, None
|
||||
|
||||
|
||||
def require_init_gpu():
|
||||
if not GPU: raise Exception("No GPU Support, install pyopencl")
|
||||
if not GPU:
|
||||
raise Exception("No GPU Support, install pyopencl")
|
||||
global cl_ctx, cl_queue
|
||||
if cl_queue is None:
|
||||
devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
|
||||
@@ -45,28 +56,45 @@ def require_init_gpu():
|
||||
# this is an in-order command queue
|
||||
cl_queue = cl.CommandQueue(cl_ctx)
|
||||
|
||||
|
||||
class GPUBuffer:
|
||||
def __init__(self, shape, hostbuf=None):
|
||||
self.shape, self.dtype = tuple(shape), np.float32
|
||||
self.cl = hostbuf.cl if isinstance(hostbuf, GPUBuffer) else \
|
||||
cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0), 4*np.prod(shape),
|
||||
hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None)
|
||||
self.cl = (
|
||||
hostbuf.cl
|
||||
if isinstance(hostbuf, GPUBuffer)
|
||||
else cl.Buffer(
|
||||
cl_ctx,
|
||||
cl.mem_flags.READ_WRITE
|
||||
| (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0),
|
||||
4 * np.prod(shape),
|
||||
hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None,
|
||||
)
|
||||
)
|
||||
|
||||
def __repr__(self):
|
||||
return f"<GPUBuffer with shape {self.shape!r}>"
|
||||
|
||||
|
||||
# **** ANE functions ****
|
||||
|
||||
ane = None
|
||||
|
||||
|
||||
def require_init_ane():
|
||||
global ane
|
||||
if ane is None:
|
||||
import ane.lib.ane, tinygrad.ops_ane
|
||||
|
||||
ane = ane.lib.ane.ANE()
|
||||
|
||||
|
||||
# **** start with two base classes, Tensor and Function ****

class Device: CPU, GPU, ANE = 0, 1, 2

class Device:
  CPU, GPU, ANE = 0, 1, 2


class Tensor:
|
||||
did_float_warning = False
|
||||
@@ -110,7 +138,12 @@ class Tensor:
|
||||
|
||||
@classmethod
|
||||
def uniform(cls, *shape, **kwargs):
|
||||
return cls((np.random.uniform(-1., 1., size=shape)/np.sqrt(np.prod(shape))).astype(np.float32), **kwargs)
|
||||
return cls(
|
||||
(np.random.uniform(-1.0, 1.0, size=shape) / np.sqrt(np.prod(shape))).astype(
|
||||
np.float32
|
||||
),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def eye(cls, dim, **kwargs):
|
||||
@@ -130,18 +163,21 @@ class Tensor:
|
||||
|
||||
# fill in the first grad with one
|
||||
# this is "implicit gradient creation"
|
||||
self.grad = Tensor(np.ones(self.shape, dtype=self.dtype), device=self.device, requires_grad=False)
|
||||
self.grad = Tensor(
|
||||
np.ones(self.shape, dtype=self.dtype), device=self.device, requires_grad=False
|
||||
)
|
||||
|
||||
for t0 in reversed(self.deepwalk(set(), [])):
|
||||
assert (t0.grad is not None)
|
||||
assert t0.grad is not None
|
||||
with ProfileOp(t0._ctx.__class__.__name__, [t0.grad], backward=True):
|
||||
grads = t0._ctx.backward(t0._ctx, t0.grad.data)
|
||||
if len(t0._ctx.parents) == 1:
|
||||
grads = [grads]
|
||||
for t,g in zip(t0._ctx.parents, grads):
|
||||
for t, g in zip(t0._ctx.parents, grads):
|
||||
if g is not None:
|
||||
assert g.shape == t.shape, \
|
||||
f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
|
||||
assert (
|
||||
g.shape == t.shape
|
||||
), f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
|
||||
gt = Tensor(g, device=self.device, requires_grad=False)
|
||||
t.grad = gt if t.grad is None else (t.grad + gt)
|
||||
|
||||
@@ -150,19 +186,21 @@ class Tensor:
|
||||
@staticmethod
|
||||
def _move_data(data, device):
|
||||
if isinstance(data, GPUBuffer):
|
||||
if device == Device.GPU: return data
|
||||
if device == Device.GPU:
|
||||
return data
|
||||
old = data
|
||||
data = np.empty(old.shape, dtype=np.float32)
|
||||
with ProfileOp("toCPU", [data]):
|
||||
cl.enqueue_copy(cl_queue, data, old.cl, is_blocking=True)
|
||||
cl.enqueue_copy(cl_queue, data, old.cl, is_blocking=True)
|
||||
|
||||
elif "ANETensor" in str(type(data)):
|
||||
if device == Device.ANE: return data
|
||||
if device == Device.ANE:
|
||||
return data
|
||||
with ProfileOp("toCPU", [data]):
|
||||
data = data.data().astype(np.float32)
|
||||
data = data.data().astype(np.float32)
|
||||
|
||||
if not isinstance(data, np.ndarray):
|
||||
data = np.array(data, dtype=np.float32)
|
||||
data = np.array(data, dtype=np.float32)
|
||||
|
||||
if data.dtype != np.float32 and not Tensor.did_float_warning:
|
||||
# warning? float64 is actually needed for numerical jacobian
|
||||
@@ -184,14 +222,17 @@ class Tensor:
|
||||
|
||||
def to_(self, device):
|
||||
self.data, self.device = self._move_data(self.data, device), device
|
||||
if self.grad: self.grad.to_(device)
|
||||
if self.grad:
|
||||
self.grad.to_(device)
|
||||
|
||||
def to(self, device):
|
||||
ret = Tensor(self.data, device)
|
||||
if self.grad: ret.grad = self.grad.to(device)
|
||||
if self.grad:
|
||||
ret.grad = self.grad.to(device)
|
||||
return ret
|
||||
|
||||
def _is(self, device): return self.device == device
|
||||
def _is(self, device):
|
||||
return self.device == device
|
||||
|
||||
def detach(self):
|
||||
return Tensor(self.data, device=self.device)
|
||||
@@ -203,7 +244,7 @@ class Tensor:
|
||||
|
||||
def mean(self, axis=None):
|
||||
out = self.sum(axis=axis)
|
||||
coeff = np.prod(out.shape)/np.prod(self.shape)
|
||||
coeff = np.prod(out.shape) / np.prod(self.shape)
|
||||
return out * coeff
|
||||
|
||||
def sqrt(self):
|
||||
@@ -219,15 +260,18 @@ class Tensor:
|
||||
return 2.0 * ((2.0 * self).sigmoid()) - 1.0
|
||||
|
||||
def leakyrelu(self, neg_slope=0.01):
|
||||
return self.relu() - (-neg_slope*self).relu()
|
||||
return self.relu() - (-neg_slope * self).relu()
|
||||
|
||||
def dropout(self, p=0.5):
|
||||
_mask = np.asarray(np.random.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype)
|
||||
_mask = np.asarray(
|
||||
np.random.binomial(1, 1.0 - p, size=self.shape), dtype=self.dtype
|
||||
)
|
||||
ret = self * Tensor(_mask, requires_grad=False, device=self.device)
|
||||
return ret.div(1.0 - p)
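# --- illustration, not part of the diff: the div(1.0 - p) above is inverted dropout;
# --- scaling the kept activations keeps the expected value of the output unchanged.
import numpy as np

np.random.seed(0)
p = 0.5
x = np.ones(100000, dtype=np.float32)
mask = np.random.binomial(1, 1.0 - p, size=x.shape).astype(np.float32)
print((x * mask / (1.0 - p)).mean())   # ~1.0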
|
||||
|
||||
def abs(self):
|
||||
return self.relu() + (-1.0*self).relu()
|
||||
return self.relu() + (-1.0 * self).relu()
|
||||
|
||||
|
||||
# An instantiation of the Function is the Context
|
||||
class Function:
|
||||
@@ -239,7 +283,7 @@ class Function:
|
||||
self.saved_tensors.extend(x)
|
||||
|
||||
def apply(self, *x, **kwargs):
|
||||
ctx = self(*x) # self - operation i.e 'add', 'sub', etc.
|
||||
ctx = self(*x) # self - operation i.e 'add', 'sub', etc.
|
||||
# use default params
|
||||
params = signature(self.forward).parameters
|
||||
for p in params.values():
|
||||
@@ -249,40 +293,66 @@ class Function:
|
||||
for k, v in kwargs.items():
|
||||
setattr(ctx, k, v)
|
||||
with ProfileOp(ctx.__class__.__name__, x):
|
||||
ret = Tensor(self.forward(ctx, *[t.data for t in x], **kwargs),
|
||||
device=ctx.device, requires_grad=any([t.requires_grad for t in x]))
|
||||
ret = Tensor(
|
||||
self.forward(ctx, *[t.data for t in x], **kwargs),
|
||||
device=ctx.device,
|
||||
requires_grad=any([t.requires_grad for t in x]),
|
||||
)
|
||||
if ret.requires_grad:
|
||||
ret._ctx = ctx
|
||||
return ret
|
||||
|
||||
|
||||
def register(name, fxn, device=Device.CPU):
|
||||
Tensor.ops[device][name] = fxn
|
||||
|
||||
def dispatch(*x, **kwargs):
|
||||
tt = [arg for arg in x if isinstance(arg, Tensor)][0]
|
||||
x = [Tensor(np.array([arg], dtype=tt.dtype), device=tt.device, requires_grad=False) if not isinstance(arg, Tensor) else arg for arg in x]
|
||||
x = [
|
||||
Tensor(np.array([arg], dtype=tt.dtype), device=tt.device, requires_grad=False)
|
||||
if not isinstance(arg, Tensor)
|
||||
else arg
|
||||
for arg in x
|
||||
]
|
||||
f = (Tensor.ops[tt.device])[name]
|
||||
f.cl_ctx, f.cl_queue, f.ane, f.device = cl_ctx, cl_queue, ane, tt.device
|
||||
return f.apply(f, *x, **kwargs)
|
||||
|
||||
setattr(Tensor, name, dispatch)
|
||||
# TODO: div is a second class op, so it doesn't work here
|
||||
if name in ['add', 'sub', 'mul', 'pow']:
|
||||
if name in ["add", "sub", "mul", "pow"]:
|
||||
setattr(Tensor, f"__{name}__", dispatch)
|
||||
setattr(Tensor, f"__i{name}__", lambda self,x: self.assign(dispatch(self,x)))
|
||||
setattr(Tensor, f"__r{name}__", lambda self,x: dispatch(x,self))
|
||||
setattr(Tensor, f"__i{name}__", lambda self, x: self.assign(dispatch(self, x)))
|
||||
setattr(Tensor, f"__r{name}__", lambda self, x: dispatch(x, self))
|
||||
|
||||
|
||||
for device in [device for device in Device.__dict__.keys() if device[0] != "_"]:
|
||||
setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, Device.__dict__[device]))
|
||||
setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, Device.__dict__[device]))
|
||||
setattr(Tensor, f"is_{device.lower()}", property(functools.partialmethod(Tensor._is, Device.__dict__[device])))
|
||||
setattr(
|
||||
Tensor,
|
||||
f"{device.lower()}",
|
||||
functools.partialmethod(Tensor.to, Device.__dict__[device]),
|
||||
)
|
||||
setattr(
|
||||
Tensor,
|
||||
f"{device.lower()}_",
|
||||
functools.partialmethod(Tensor.to_, Device.__dict__[device]),
|
||||
)
|
||||
setattr(
|
||||
Tensor,
|
||||
f"is_{device.lower()}",
|
||||
property(functools.partialmethod(Tensor._is, Device.__dict__[device])),
|
||||
)
|
||||
|
||||
# this registers all the operations
import tinygrad.ops_cpu

try:
  import pyopencl as cl

  # TODO: move this import to require_init_gpu?
  import tinygrad.ops_gpu

  GPU = True
except ImportError:
  # no GPU support
  GPU = False