diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 78c2c36239..c06ba50eeb 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -34,13 +34,21 @@ jobs:
     run: python -m pytest -s -v

   linter:
-    name: Indentation Linter
-    runs-on: ubuntu-latest
+    name: Indentation Linter
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v2
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.8
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install pylint
+    - name: Lint with pylint
+      run: |
+        python -m pylint --disable=all -e W0311 --jobs=0 --indent-string='  ' **/*.py
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v2
-    - name: Install PyLint
-      run: sudo apt-get install pylint
-    - name: Validate indentation is 2 lines
-      run: if [[ $(pylint --jobs=0 --indent-string='  ' * | grep "Bad indentation") ]]; then exit 1; fi
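For contributors who want to run the same check before pushing, the CI step above can be mirrored locally. A minimal sketch, assuming pylint is installed and this is run from the repo root (the recursive glob is a stand-in for the shell's **/*.py):

import glob
import subprocess

# enable only W0311 ("bad-indentation"); --indent-string selects 2-space indents,
# matching the "Lint with pylint" step in the workflow above
files = glob.glob("**/*.py", recursive=True)
subprocess.run(
  ["python", "-m", "pylint", "--disable=all", "-e", "W0311",
   "--jobs=0", "--indent-string=  ", *files],
  check=False,
)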
diff --git a/ane/1_build/coreml_ane.py b/ane/1_build/coreml_ane.py
index f3c8dcbfd1..e4c89266ae 100755
--- a/ane/1_build/coreml_ane.py
+++ b/ane/1_build/coreml_ane.py
@@ -3,24 +3,34 @@
 import numpy as np
 import coremltools as ct
 from coremltools.models.neural_network import datatypes, NeuralNetworkBuilder

-input_features = [('image', datatypes.Array(3))]
-output_features = [('probs', datatypes.Array(3))]
+input_features = [("image", datatypes.Array(3))]
+output_features = [("probs", datatypes.Array(3))]

 weights = np.zeros((3, 3)) + 3
 bias = np.ones(3)

 builder = NeuralNetworkBuilder(input_features, output_features)
-builder.add_inner_product(name='ip_layer', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='image', output_name='med')
-#builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
-#builder.add_elementwise(name='element', input_names=['med', 'med'], output_name='probs', mode='ADD')
-builder.add_bias(name='bias', b=bias, input_name='med', output_name='probs', shape_bias=(3,))
-#builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')
+builder.add_inner_product(
+    name="ip_layer",
+    W=weights,
+    b=None,
+    input_channels=3,
+    output_channels=3,
+    has_bias=False,
+    input_name="image",
+    output_name="med",
+)
+# builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
+# builder.add_elementwise(name='element', input_names=['med', 'med'], output_name='probs', mode='ADD')
+builder.add_bias(
+    name="bias", b=bias, input_name="med", output_name="probs", shape_bias=(3,)
+)
+# builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')

 # compile the spec
 mlmodel = ct.models.MLModel(builder.spec)

 # trigger the ANE!
-out = mlmodel.predict({"image": np.array([1337,0,0], dtype=np.float32)})
+out = mlmodel.predict({"image": np.array([1337, 0, 0], dtype=np.float32)})
 print(out)
-mlmodel.save('test.mlmodel')
-
+mlmodel.save("test.mlmodel")
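As a quick sanity check of the script above, the saved test.mlmodel can be reloaded and queried; "image" and "probs" are the feature names declared through NeuralNetworkBuilder, and the input values here are arbitrary:

import numpy as np
import coremltools as ct

# reload the model written by coreml_ane.py and run one prediction
mlmodel = ct.models.MLModel("test.mlmodel")
out = mlmodel.predict({"image": np.array([1.0, 2.0, 3.0], dtype=np.float32)})
print(out["probs"])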
-#open("/tmp/data.section", "wb").write(f2) -#print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read())) + print(compare(f1[i : i + 0x300], f2[i : i + 0x300])) +# open("/tmp/data.section", "wb").write(f2) +# print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read())) diff --git a/ane/lib/ane.py b/ane/lib/ane.py index def815fabb..fa50dc4ec3 100755 --- a/ane/lib/ane.py +++ b/ane/lib/ane.py @@ -3,11 +3,12 @@ import os from ctypes import * import numpy as np import faulthandler + faulthandler.enable() -libane = cdll.LoadLibrary(os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "libane.dylib")) +libane = cdll.LoadLibrary( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "libane.dylib") +) libane.ANE_Compile.argtypes = [c_char_p, c_int] libane.ANE_Compile.restype = c_void_p @@ -17,34 +18,36 @@ libane.ANE_TensorCreate.restype = c_void_p libane.ANE_TensorData.argtypes = [c_void_p] libane.ANE_TensorData.restype = POINTER(c_uint16) -libane.ANE_Run.argtypes = [c_void_p]*3 +libane.ANE_Run.argtypes = [c_void_p] * 3 libane.ANE_Run.restype = c_int + class ANETensor: def __init__(self, *shape): self.shape = shape self.dtype = np.float16 self.sz = int(np.prod(shape)) - assert(self.sz <= 0x4000) + assert self.sz <= 0x4000 self.tt = libane.ANE_TensorCreate(self.sz, 1) - assert(self.tt is not None) + assert self.tt is not None def data(self): data = libane.ANE_TensorData(self.tt) - assert(data is not None) - #print(hex(addressof(data.contents))) + assert data is not None + # print(hex(addressof(data.contents))) buf = np.ctypeslib.as_array(data, shape=(self.sz,)) ret = np.frombuffer(buf, dtype=self.dtype) - #print(ret.data) + # print(ret.data) return ret + class ANE: def __init__(self): libane.ANE_Open() def compile(self, dat): ret = libane.ANE_Compile(create_string_buffer(dat), len(dat)) - assert(ret is not None) + assert ret is not None return ret def run(self, prog, tin, tout): @@ -53,6 +56,7 @@ class ANE: def tensor(self, shape): return ANETensor(shape) + if __name__ == "__main__": ane = ANE() @@ -62,7 +66,7 @@ if __name__ == "__main__": tind = tin.data() toutd = tout.data() - tind[0:4] = [-1,1,-2,2] + tind[0:4] = [-1, 1, -2, 2] print(tind) print(toutd) @@ -71,4 +75,3 @@ if __name__ == "__main__": print(tind) print(toutd) - diff --git a/examples/benchmark.py b/examples/benchmark.py index 2788915055..5a673db647 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -17,8 +17,9 @@ for dev in ["CPU", "GPU", "ANE"]: boaa = baa.relu() et = time.time() if i == 2: - print("%s can do at least %.2f MEGAReLUs/sec" % (dev, (np.prod(boaa.shape)/1e6)/(et-st))) + print( + "%s can do at least %.2f MEGAReLUs/sec" + % (dev, (np.prod(boaa.shape) / 1e6) / (et - st)) + ) # decently reliable - assert(np.all(boaa.cpu().data >= 0)) - - + assert np.all(boaa.cpu().data >= 0) diff --git a/examples/efficientnet.py b/examples/efficientnet.py index 1f73cfdf32..66165ab869 100644 --- a/examples/efficientnet.py +++ b/examples/efficientnet.py @@ -3,24 +3,29 @@ # a rough copy of # https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py import os + GPU = os.getenv("GPU", None) is not None import sys import io import time import numpy as np + np.set_printoptions(suppress=True) from tinygrad.tensor import Tensor from extra.utils import fetch, get_parameters from extra.efficientnet import EfficientNet + def infer(model, img): # preprocess image aspect_ratio = img.size[0] / img.size[1] - img = 
diff --git a/examples/efficientnet.py b/examples/efficientnet.py
index 1f73cfdf32..66165ab869 100644
--- a/examples/efficientnet.py
+++ b/examples/efficientnet.py
@@ -3,24 +3,29 @@
 # a rough copy of
 # https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py
 import os
+
 GPU = os.getenv("GPU", None) is not None
 import sys
 import io
 import time
 import numpy as np
+
 np.set_printoptions(suppress=True)

 from tinygrad.tensor import Tensor
 from extra.utils import fetch, get_parameters
 from extra.efficientnet import EfficientNet

+
 def infer(model, img):
   # preprocess image
   aspect_ratio = img.size[0] / img.size[1]
-  img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
+  img = img.resize(
+      (int(224 * max(aspect_ratio, 1.0)), int(224 * max(1.0 / aspect_ratio, 1.0)))
+  )

   img = np.array(img)
-  y0,x0=(np.asarray(img.shape)[:2]-224)//2
-  retimg = img = img[y0:y0+224, x0:x0+224]
+  y0, x0 = (np.asarray(img.shape)[:2] - 224) // 2
+  retimg = img = img[y0 : y0 + 224, x0 : x0 + 224]

   # if you want to look at the image
   """
@@ -30,11 +35,11 @@ def infer(model, img):
   """

   # low level preprocess
-  img = np.moveaxis(img, [2,0,1], [0,1,2])
-  img = img.astype(np.float32)[:3].reshape(1,3,224,224)
+  img = np.moveaxis(img, [2, 0, 1], [0, 1, 2])
+  img = img.astype(np.float32)[:3].reshape(1, 3, 224, 224)
   img /= 255.0
-  img -= np.array([0.485, 0.456, 0.406]).reshape((1,-1,1,1))
-  img /= np.array([0.229, 0.224, 0.225]).reshape((1,-1,1,1))
+  img -= np.array([0.485, 0.456, 0.406]).reshape((1, -1, 1, 1))
+  img /= np.array([0.229, 0.224, 0.225]).reshape((1, -1, 1, 1))

   # run the net
   if GPU:
@@ -50,6 +55,7 @@ def infer(model, img):
   """
   return out, retimg

+
 if __name__ == "__main__":
   # instantiate my net
   model = EfficientNet(int(os.getenv("NUM", "0")))
@@ -59,38 +65,42 @@ if __name__ == "__main__":

   # category labels
   import ast
-  lbls = fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt")
-  lbls = ast.literal_eval(lbls.decode('utf-8'))
+
+  lbls = fetch(
+      "https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt"
+  )
+  lbls = ast.literal_eval(lbls.decode("utf-8"))

   # load image and preprocess
   from PIL import Image
+
   url = sys.argv[1]
-  if url == 'webcam':
+  if url == "webcam":
     import cv2
+
     cap = cv2.VideoCapture(0)
     cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
     while 1:
-      _ = cap.grab() # discard one frame to circumvent capture buffering
+      _ = cap.grab()  # discard one frame to circumvent capture buffering
       ret, frame = cap.read()
-      img = Image.fromarray(frame[:, :, [2,1,0]])
+      img = Image.fromarray(frame[:, :, [2, 1, 0]])
       out, retimg = infer(model, img)
       print(np.argmax(out.data), np.max(out.data), lbls[np.argmax(out.data)])
       SCALE = 3
-      simg = cv2.resize(retimg, (224*SCALE, 224*SCALE))
+      simg = cv2.resize(retimg, (224 * SCALE, 224 * SCALE))
       retimg = cv2.cvtColor(simg, cv2.COLOR_RGB2BGR)
-      cv2.imshow('capture', retimg)
-      if cv2.waitKey(1) & 0xFF == ord('q'):
+      cv2.imshow("capture", retimg)
+      if cv2.waitKey(1) & 0xFF == ord("q"):
         break
     cap.release()
     cv2.destroyAllWindows()
   else:
-    if url.startswith('http'):
+    if url.startswith("http"):
       img = Image.open(io.BytesIO(fetch(url)))
     else:
       img = Image.open(url)
     st = time.time()
     out, _ = infer(model, img)
     print(np.argmax(out.data), np.max(out.data), lbls[np.argmax(out.data)])
-    print("did inference in %.2f s" % (time.time()-st))
-    #print("NOT", np.argmin(out.data), np.min(out.data), lbls[np.argmin(out.data)])
-
+    print("did inference in %.2f s" % (time.time() - st))
+    # print("NOT", np.argmin(out.data), np.min(out.data), lbls[np.argmin(out.data)])
diff --git a/examples/mnist_gan.py b/examples/mnist_gan.py
index 67e4b7ba7f..eab20c6a6d 100644
--- a/examples/mnist_gan.py
+++ b/examples/mnist_gan.py
@@ -3,8 +3,9 @@ import os
 import sys
 import numpy as np
 from tqdm import tqdm
+
 sys.path.append(os.getcwd())
-sys.path.append(os.path.join(os.getcwd(), 'test'))
+sys.path.append(os.path.join(os.getcwd(), "test"))

 from tinygrad.tensor import Tensor, Function, register
 from extra.utils import get_parameters
@@ -12,7 +13,10 @@ import tinygrad.optim as optim
 from test_mnist import X_train
 from torchvision.utils import make_grid, save_image
 import torch
+
 GPU = os.getenv("GPU") is not None
+
+
 class LinearGen:
   def __init__(self):
     lv = 128
@@ -28,6 +32,7 @@ class LinearGen:
     x = x.dot(self.l4).tanh()
     return x

+
 class LinearDisc:
   def __init__(self):
     in_sh = 784
@@ -39,108 +44,112 @@ class LinearDisc:
   def forward(self, x, train=True):
     x = x.dot(self.l1).leakyrelu(0.2)
     if train:
-        x = x.dropout(0.3)
+      x = x.dropout(0.3)
     x = x.dot(self.l2).leakyrelu(0.2)
     if train:
-        x = x.dropout(0.3)
+      x = x.dropout(0.3)
     x = x.dot(self.l3).leakyrelu(0.2)
     if train:
-        x = x.dropout(0.3)
+      x = x.dropout(0.3)
     x = x.dot(self.l4).logsoftmax()
     return x

+
 if __name__ == "__main__":
-    generator = LinearGen()
-    discriminator = LinearDisc()
-    batch_size = 512
-    k = 1
-    epochs = 300
-    generator_params = get_parameters(generator)
-    discriminator_params = get_parameters(discriminator)
-    gen_loss = []
-    disc_loss = []
-    output_folder = "outputs"
-    os.makedirs(output_folder, exist_ok=True)
-    train_data_size = len(X_train)
-    ds_noise = Tensor(np.random.randn(64,128).astype(np.float32), gpu=GPU, requires_grad=False)
-    n_steps = int(train_data_size/batch_size)
-    if GPU:
-        [x.cuda_() for x in generator_params+discriminator_params]
-    # optimizers
-    optim_g = optim.Adam(generator_params,lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
-    optim_d = optim.Adam(discriminator_params,lr=0.0002, b1=0.5)
+  generator = LinearGen()
+  discriminator = LinearDisc()
+  batch_size = 512
+  k = 1
+  epochs = 300
+  generator_params = get_parameters(generator)
+  discriminator_params = get_parameters(discriminator)
+  gen_loss = []
+  disc_loss = []
+  output_folder = "outputs"
+  os.makedirs(output_folder, exist_ok=True)
+  train_data_size = len(X_train)
+  ds_noise = Tensor(
+      np.random.randn(64, 128).astype(np.float32), gpu=GPU, requires_grad=False
+  )
+  n_steps = int(train_data_size / batch_size)
+  if GPU:
+    [x.cuda_() for x in generator_params + discriminator_params]
+  # optimizers
+  optim_g = optim.Adam(generator_params, lr=0.0002, b1=0.5)  # 0.0002 for equilibrium!
+  optim_d = optim.Adam(discriminator_params, lr=0.0002, b1=0.5)

-    def regularization_l2(model, a=1e-4):
-        #TODO: l2 reg loss
-        pass
+  def regularization_l2(model, a=1e-4):
+    # TODO: l2 reg loss
+    pass

-    def generator_batch():
-        idx = np.random.randint(0, X_train.shape[0], size=(batch_size))
-        image_b = X_train[idx].reshape(-1, 28*28).astype(np.float32)/255.
-        image_b = (image_b - 0.5)/0.5
-        return Tensor(image_b, gpu=GPU)
+  def generator_batch():
+    idx = np.random.randint(0, X_train.shape[0], size=(batch_size))
+    image_b = X_train[idx].reshape(-1, 28 * 28).astype(np.float32) / 255.0
+    image_b = (image_b - 0.5) / 0.5
+    return Tensor(image_b, gpu=GPU)

-    def real_label(bs):
-        y = np.zeros((bs,2), np.float32)
-        y[range(bs), [1]*bs] = -2.0
-        real_labels = Tensor(y, gpu=GPU)
-        return real_labels
+  def real_label(bs):
+    y = np.zeros((bs, 2), np.float32)
+    y[range(bs), [1] * bs] = -2.0
+    real_labels = Tensor(y, gpu=GPU)
+    return real_labels

-    def fake_label(bs):
-        y = np.zeros((bs,2), np.float32)
-        y[range(bs), [0]*bs] = -2.0 # Can we do label smoothin? i.e -2.0 changed to -1.98789.
+  def fake_label(bs):
+    y = np.zeros((bs, 2), np.float32)
+    y[
+        range(bs), [0] * bs
+    ] = -2.0  # Can we do label smoothing? i.e. -2.0 changed to -1.98789.
+    fake_labels = Tensor(y, gpu=GPU)
+    return fake_labels

-    def train_discriminator(optimizer, data_real, data_fake):
-        real_labels = real_label(batch_size)
-        fake_labels = fake_label(batch_size)
+  def train_discriminator(optimizer, data_real, data_fake):
+    real_labels = real_label(batch_size)
+    fake_labels = fake_label(batch_size)

-        optimizer.zero_grad()
+    optimizer.zero_grad()

-        output_real = discriminator.forward(data_real)
-        loss_real = (output_real * real_labels).mean()
+    output_real = discriminator.forward(data_real)
+    loss_real = (output_real * real_labels).mean()

-        output_fake = discriminator.forward(data_fake)
-        loss_fake = (output_fake * fake_labels).mean()
+    output_fake = discriminator.forward(data_fake)
+    loss_fake = (output_fake * fake_labels).mean()

-        loss_real.backward()
-        loss_fake.backward()
-        optimizer.step()
-        return loss_real.cpu().data + loss_fake.cpu().data
+    loss_real.backward()
+    loss_fake.backward()
+    optimizer.step()
+    return loss_real.cpu().data + loss_fake.cpu().data

-    def train_generator(optimizer, data_fake):
-        real_labels = real_label(batch_size)
-        optimizer.zero_grad()
-        output = discriminator.forward(data_fake)
-        loss = (output * real_labels).mean()
-        loss.backward()
-        optimizer.step()
-        return loss.cpu().data
-
-    for epoch in tqdm(range(epochs)):
-        loss_g = 0.0
-        loss_d = 0.0
-        print(f"Epoch {epoch} of {epochs}")
-        for i in tqdm(range(n_steps)):
-            image = generator_batch()
-            for step in range(k): # Try with k = 5 or 7.
-                noise = Tensor(np.random.randn(batch_size,128), gpu=GPU)
-                data_fake = generator.forward(noise).detach()
-                data_real = image
-                loss_d_step = train_discriminator(optim_d, data_real, data_fake)
-                loss_d += loss_d_step
-            noise = Tensor(np.random.randn(batch_size,128), gpu=GPU)
-            data_fake = generator.forward(noise)
-            loss_g_step = train_generator(optim_g, data_fake)
-            loss_g += loss_g_step
-        fake_images = generator.forward(ds_noise).detach().cpu().data
-        fake_images = (fake_images.reshape(-1, 1, 28, 28)+ 1) / 2 # 0 - 1 range.
-        fake_images = make_grid(torch.tensor(fake_images))
-        save_image(fake_images, os.path.join(output_folder,f"image_{epoch}.jpg"))
-        epoch_loss_g = loss_g / n_steps
-        epoch_loss_d = loss_d / n_steps
-        print(f"EPOCH: Generator loss: {epoch_loss_g}, Discriminator loss: {epoch_loss_d}")
-    else:
-        print("Training Completed!")
+  def train_generator(optimizer, data_fake):
+    real_labels = real_label(batch_size)
+    optimizer.zero_grad()
+    output = discriminator.forward(data_fake)
+    loss = (output * real_labels).mean()
+    loss.backward()
+    optimizer.step()
+    return loss.cpu().data

+  for epoch in tqdm(range(epochs)):
+    loss_g = 0.0
+    loss_d = 0.0
+    print(f"Epoch {epoch} of {epochs}")
+    for i in tqdm(range(n_steps)):
+      image = generator_batch()
+      for step in range(k):  # Try with k = 5 or 7.
+        noise = Tensor(np.random.randn(batch_size, 128), gpu=GPU)
+        data_fake = generator.forward(noise).detach()
+        data_real = image
+        loss_d_step = train_discriminator(optim_d, data_real, data_fake)
+        loss_d += loss_d_step
+      noise = Tensor(np.random.randn(batch_size, 128), gpu=GPU)
+      data_fake = generator.forward(noise)
+      loss_g_step = train_generator(optim_g, data_fake)
+      loss_g += loss_g_step
+    fake_images = generator.forward(ds_noise).detach().cpu().data
+    fake_images = (fake_images.reshape(-1, 1, 28, 28) + 1) / 2  # 0 - 1 range.
+    fake_images = make_grid(torch.tensor(fake_images))
+    save_image(fake_images, os.path.join(output_folder, f"image_{epoch}.jpg"))
+    epoch_loss_g = loss_g / n_steps
+    epoch_loss_d = loss_d / n_steps
+    print(f"EPOCH: Generator loss: {epoch_loss_g}, Discriminator loss: {epoch_loss_d}")
+  else:
+    print("Training Completed!")
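A note on the -2.0 targets built by real_label/fake_label above: with logsoftmax outputs, a one-hot row holding -num_classes makes mul().mean() recover the negative log-likelihood of the labeled class, because the -num_classes factor cancels the division by num_classes inside the mean. A small numpy check of that identity:

import numpy as np

logp = np.log(np.array([[0.7, 0.3]], dtype=np.float32))  # logsoftmax-style output
y = np.zeros((1, 2), np.float32)
y[0, 1] = -2.0  # -num_classes at the true class, as in real_label/fake_label
print((logp * y).mean(), -np.log(0.3))  # both ~1.204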
diff --git a/examples/serious_mnist.py b/examples/serious_mnist.py
index 56cc432c15..b1612e5249 100644
--- a/examples/serious_mnist.py
+++ b/examples/serious_mnist.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
-#inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
+# inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
 import os
 import sys
+
 sys.path.append(os.getcwd())
-sys.path.append(os.path.join(os.getcwd(), 'test'))
+sys.path.append(os.path.join(os.getcwd(), "test"))

 import numpy as np
 from tinygrad.tensor import Tensor, GPU
@@ -13,126 +14,139 @@ from test_mnist import fetch_mnist
 from extra.training import train, evaluate
 import tinygrad.optim as optim
 from extra.augment import augment_img
+
 GPU = os.getenv("GPU", None) is not None
 QUICK = os.getenv("QUICK", None) is not None
 DEBUG = os.getenv("DEBUG", None) is not None

+
 class SqueezeExciteBlock2D:
   def __init__(self, filters):
     self.filters = filters
-    self.weight1 = Tensor.uniform(self.filters, self.filters//32)
-    self.bias1 = Tensor.uniform(1,self.filters//32)
-    self.weight2 = Tensor.uniform(self.filters//32, self.filters)
+    self.weight1 = Tensor.uniform(self.filters, self.filters // 32)
+    self.bias1 = Tensor.uniform(1, self.filters // 32)
+    self.weight2 = Tensor.uniform(self.filters // 32, self.filters)
     self.bias2 = Tensor.uniform(1, self.filters)

   def __call__(self, input):
-    se = input.avg_pool2d(kernel_size=(input.shape[2], input.shape[3])) #GlobalAveragePool2D
+    se = input.avg_pool2d(
+        kernel_size=(input.shape[2], input.shape[3])
+    )  # GlobalAveragePool2D
     se = se.reshape(shape=(-1, self.filters))
     se = se.dot(self.weight1) + self.bias1
-    se = se.relu() 
+    se = se.relu()
     se = se.dot(self.weight2) + self.bias2
-    se = se.sigmoid().reshape(shape=(-1,self.filters,1,1)) #for broadcasting
+    se = se.sigmoid().reshape(shape=(-1, self.filters, 1, 1))  # for broadcasting
     se = input.mul(se)
     return se

+
 class ConvBlock:
   def __init__(self, h, w, inp, filters=128, conv=3):
     self.h, self.w = h, w
     self.inp = inp
-    #init weights
-    self.cweights = [Tensor.uniform(filters, inp if i==0 else filters, conv, conv) for i in range(3)]
+    # init weights
+    self.cweights = [
+        Tensor.uniform(filters, inp if i == 0 else filters, conv, conv) for i in range(3)
+    ]
     self.cbiases = [Tensor.uniform(1, filters, 1, 1) for i in range(3)]
-    #init layers
+    # init layers
     self._bn = BatchNorm2D(128, training=True)
     self._seb = SqueezeExciteBlock2D(filters)
-    
+
   def __call__(self, input):
-    x = input.reshape(shape=(-1, self.inp, self.w, self.h)) 
+    x = input.reshape(shape=(-1, self.inp, self.w, self.h))
     for cweight, cbias in zip(self.cweights, self.cbiases):
-      x = x.pad2d(padding=[1,1,1,1]).conv2d(cweight).add(cbias).relu()
+      x = x.pad2d(padding=[1, 1, 1, 1]).conv2d(cweight).add(cbias).relu()
     x = self._bn(x)
     x = self._seb(x)
     return x

+
 class BigConvNet:
   def __init__(self):
-    self.conv = [ConvBlock(28,28,1), ConvBlock(28,28,128), ConvBlock(14,14,128)]
-    self.weight1 = Tensor.uniform(128,10)
-    self.weight2 = Tensor.uniform(128,10)
+    self.conv = [ConvBlock(28, 28, 1), ConvBlock(28, 28, 128), ConvBlock(14, 14, 128)]
+    self.weight1 = Tensor.uniform(128, 10)
+    self.weight2 = Tensor.uniform(128, 10)

   def parameters(self):
-    if DEBUG: #keeping this for a moment
+    if DEBUG:  # keeping this for a moment
       pars = [par for par in get_parameters(self) if par.requires_grad]
       no_pars = 0
       for par in pars:
         print(par.shape)
         no_pars += np.prod(par.shape)
-      print('no of parameters', no_pars)
+      print("no of parameters", no_pars)
       return pars
     else:
       return get_parameters(self)

   def save(self, filename):
-    with open(filename+'.npy', 'wb') as f:
+    with open(filename + ".npy", "wb") as f:
       for par in get_parameters(self):
-        #if par.requires_grad:
+        # if par.requires_grad:
         np.save(f, par.cpu().data)

   def load(self, filename):
-    with open(filename+'.npy', 'rb') as f:
-      for par in get_parameters(self):
-        #if par.requires_grad:
+    with open(filename + ".npy", "rb") as f:
+      for par in get_parameters(self):
+        # if par.requires_grad:
         try:
           par.cpu().data[:] = np.load(f)
           if GPU:
             par.cuda()
         except:
-          print('Could not load parameter')
+          print("Could not load parameter")

   def forward(self, x):
     x = self.conv[0](x)
     x = self.conv[1](x)
-    x = x.avg_pool2d(kernel_size=(2,2))
+    x = x.avg_pool2d(kernel_size=(2, 2))
     x = self.conv[2](x)
-    x1 = x.avg_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
-    x2 = x.max_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
+    x1 = x.avg_pool2d(kernel_size=(14, 14)).reshape(shape=(-1, 128))  # global
+    x2 = x.max_pool2d(kernel_size=(14, 14)).reshape(shape=(-1, 128))  # global
     xo = x1.dot(self.weight1) + x2.dot(self.weight2)
     return xo.logsoftmax()

+
 if __name__ == "__main__":
   lrs = [1e-4, 1e-5] if QUICK else [1e-3, 1e-4, 1e-5, 1e-5]
-  epochss = [2, 1] if QUICK else [13, 3, 3, 1]
+  epochss = [2, 1] if QUICK else [13, 3, 3, 1]
   BS = 32
   lmbd = 0.00025
-  lossfn = lambda out,y: out.mul(y).mean() + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
+  lossfn = (
+      lambda out, y: out.mul(y).mean()
+      + lmbd * (model.weight1.abs() + model.weight2.abs()).sum()
+  )

   X_train, Y_train, X_test, Y_test = fetch_mnist()
-  steps = len(X_train)//BS
+  steps = len(X_train) // BS
   np.random.seed(1337)
   if QUICK:
     steps = 1
     X_test, Y_test = X_test[:BS], Y_test[:BS]
-  
+
   model = BigConvNet()
-  
+
   if len(sys.argv) > 1:
     try:
       model.load(sys.argv[1])
-      print('Loaded weights "'+sys.argv[1]+'", evaluating...')
+      print('Loaded weights "' + sys.argv[1] + '", evaluating...')
       evaluate(model, X_test, Y_test, BS=BS)
     except:
-      print('could not load weights "'+sys.argv[1]+'".')
-  
+      print('could not load weights "' + sys.argv[1] + '".')
+
   if GPU:
     params = get_parameters(model)
     [x.cuda_() for x in params]

   for lr, epochs in zip(lrs, epochss):
     optimizer = optim.Adam(model.parameters(), lr=lr)
-    for epoch in range(1,epochs+1):
-      #first epoch without augmentation
+    for epoch in range(1, epochs + 1):
+      # first epoch without augmentation
       X_aug = X_train if epoch == 1 else augment_img(X_train)
-      train(model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, gpu=GPU, BS=BS)
+      train(
+          model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, gpu=GPU, BS=BS
+      )
       accuracy = evaluate(model, X_test, Y_test, BS=BS)
-      model.save('examples/checkpoint'+str("%.0f" % (accuracy*1.0e6)))
+      model.save("examples/checkpoint" + str("%.0f" % (accuracy * 1.0e6)))
diff --git a/examples/train_efficientnet.py b/examples/train_efficientnet.py
index e806f5ef3d..2760ff0412 100644
--- a/examples/train_efficientnet.py
+++ b/examples/train_efficientnet.py
@@ -10,13 +10,14 @@ import io
 import tarfile
 import pickle

+
 class TinyConvNet:
   def __init__(self, classes=10):
     conv = 3
-    inter_chan, out_chan = 8, 16 # for speed
-    self.c1 = Tensor.uniform(inter_chan,3,conv,conv)
-    self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
-    self.l1 = Tensor.uniform(out_chan*6*6, classes)
+    inter_chan, out_chan = 8, 16  # for speed
+    self.c1 = Tensor.uniform(inter_chan, 3, conv, conv)
+    self.c2 = Tensor.uniform(out_chan, inter_chan, conv, conv)
+    self.l1 = Tensor.uniform(out_chan * 6 * 6, classes)

   def forward(self, x):
     x = x.conv2d(self.c1).relu().max_pool2d()
@@ -24,13 +25,20 @@ class TinyConvNet:
     x = x.reshape(shape=[x.shape[0], -1])
     return x.dot(self.l1).logsoftmax()

+
 def load_cifar():
-  tt = tarfile.open(fileobj=io.BytesIO(fetch('https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')), mode='r:gz')
-  db = pickle.load(tt.extractfile('cifar-10-batches-py/data_batch_1'), encoding="bytes")
-  X = db[b'data'].reshape((-1, 3, 32, 32))
-  Y = np.array(db[b'labels'])
+  tt = tarfile.open(
+      fileobj=io.BytesIO(
+          fetch("https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")
+      ),
+      mode="r:gz",
+  )
+  db = pickle.load(tt.extractfile("cifar-10-batches-py/data_batch_1"), encoding="bytes")
+  X = db[b"data"].reshape((-1, 3, 32, 32))
+  Y = np.array(db[b"labels"])
   return X, Y

+
 if __name__ == "__main__":
   X_train, Y_train = load_cifar()
   classes = 10
@@ -40,7 +48,7 @@ if __name__ == "__main__":
   TRANSFER = os.getenv("TRANSFER") is not None
   if TINY:
     model = TinyConvNet(classes)
-  elif TRANSFER: 
+  elif TRANSFER:
     model = EfficientNet(int(os.getenv("NUM", "0")), classes, has_se=True)
     model.load_weights_from_torch()
   else:
@@ -50,21 +58,21 @@ if __name__ == "__main__":
   print("parameters", len(parameters))
   optimizer = optim.Adam(parameters, lr=0.001)

-  #BS, steps = 16, 32
+  # BS, steps = 16, 32
   BS, steps = 64 if TINY else 16, 2048

-  for i in (t := trange(steps)):
+  for i in (t := trange(steps)) :
     samp = np.random.randint(0, X_train.shape[0], size=(BS))
     img = X_train[samp].astype(np.float32)

     st = time.time()
     out = model.forward(Tensor(img))
-    fp_time = (time.time()-st)*1000.0
+    fp_time = (time.time() - st) * 1000.0

     Y = Y_train[samp]
-    y = np.zeros((BS,classes), np.float32)
-    y[range(y.shape[0]),Y] = -classes
+    y = np.zeros((BS, classes), np.float32)
+    y[range(y.shape[0]), Y] = -classes
     y = Tensor(y)

     loss = out.logsoftmax().mul(y).mean()
@@ -72,25 +80,32 @@ if __name__ == "__main__":

     st = time.time()
     loss.backward()
-    bp_time = (time.time()-st)*1000.0
+    bp_time = (time.time() - st) * 1000.0

     st = time.time()
     optimizer.step()
-    opt_time = (time.time()-st)*1000.0
+    opt_time = (time.time() - st) * 1000.0

-    #print(out.cpu().data)
+    # print(out.cpu().data)

     st = time.time()
     loss = loss.cpu().data
     cat = np.argmax(out.cpu().data, axis=1)
     accuracy = (cat == Y).mean()
-    finish_time = (time.time()-st)*1000.0
+    finish_time = (time.time() - st) * 1000.0

     # printing
-    t.set_description("loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f" %
-      (loss, accuracy,
-       fp_time, bp_time, opt_time, finish_time,
-       fp_time + bp_time + opt_time + finish_time))
+    t.set_description(
+        "loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f"
+        % (
+            loss,
+            accuracy,
+            fp_time,
+            bp_time,
+            opt_time,
+            finish_time,
+            fp_time + bp_time + opt_time + finish_time,
+        )
+    )

     del out, y, loss
-
diff --git a/examples/use_ane.py b/examples/use_ane.py
index 9c4bb5bd2c..ee78ca16ff 100755
--- a/examples/use_ane.py
+++ b/examples/use_ane.py
@@ -3,9 +3,8 @@ import numpy as np
 from tinygrad.tensor import Tensor
 import time

-a = Tensor([-2,-1,0,1,2]).ane()
+a = Tensor([-2, -1, 0, 1, 2]).ane()
 print(a.cpu())

 b = a.relu()
 print(b.cpu())

-assert(np.all(b.cpu().data >= 0))
-
+assert np.all(b.cpu().data >= 0)
diff --git a/extra/augment.py b/extra/augment.py
index c68205717c..658ac3ae69 100644
--- a/extra/augment.py
+++ b/extra/augment.py
@@ -2,39 +2,47 @@ import numpy as np
 from PIL import Image
 import os
 import sys
+
 sys.path.append(os.getcwd())
-sys.path.append(os.path.join(os.getcwd(), 'test'))
+sys.path.append(os.path.join(os.getcwd(), "test"))
 from test_mnist import fetch_mnist
 from tqdm import trange

+
 def augment_img(X, rotate=10, px=3):
   Xaug = np.zeros_like(X)
   for i in trange(len(X)):
     im = Image.fromarray(X[i])
-    im = im.rotate(np.random.randint(-rotate,rotate), resample=Image.BICUBIC)
+    im = im.rotate(np.random.randint(-rotate, rotate), resample=Image.BICUBIC)
     w, h = X.shape[1:]
-    #upper left, lower left, lower right, upper right
-    quad = np.random.randint(-px,px,size=(8)) + np.array([0,0,0,h,w,h,w,0])
+    # upper left, lower left, lower right, upper right
+    quad = np.random.randint(-px, px, size=(8)) + np.array([0, 0, 0, h, w, h, w, 0])
     im = im.transform((w, h), Image.QUAD, quad, resample=Image.BICUBIC)
     Xaug[i] = im
   return Xaug

+
 if __name__ == "__main__":
   from test_mnist import fetch_mnist
   import matplotlib.pyplot as plt
+
   X_train, Y_train, X_test, Y_test = fetch_mnist()
-  X = np.vstack([X_train[:1]]*10+[X_train[1:2]]*10)
-  fig, a = plt.subplots(2,len(X))
+  X = np.vstack([X_train[:1]] * 10 + [X_train[1:2]] * 10)
+  fig, a = plt.subplots(2, len(X))
   Xaug = augment_img(X)
   for i in range(len(X)):
-    a[0][i].imshow(X[i], cmap='gray')
-    a[1][i].imshow(Xaug[i],cmap='gray')
-    a[0][i].axis('off')
-    a[1][i].axis('off')
+    a[0][i].imshow(X[i], cmap="gray")
+    a[1][i].imshow(Xaug[i], cmap="gray")
+    a[0][i].axis("off")
+    a[1][i].axis("off")
   plt.show()

-  #create some nice gifs for doc?!
+  # create some nice gifs for doc?!
   for i in range(10):
-    im = Image.fromarray(X_train[7353+i])
-    im_aug = [Image.fromarray(x) for x in augment_img(np.array([X_train[7353+i]]*100))]
-    im.save("aug"+str(i)+".gif", save_all=True, append_images=im_aug, duration=100, loop=0)
+    im = Image.fromarray(X_train[7353 + i])
+    im_aug = [
+        Image.fromarray(x) for x in augment_img(np.array([X_train[7353 + i]] * 100))
+    ]
+    im.save(
+        "aug" + str(i) + ".gif", save_all=True, append_images=im_aug, duration=100, loop=0
+    )
diff --git a/extra/efficientnet.py b/extra/efficientnet.py
index cd32acfab6..d936aa0855 100644
--- a/extra/efficientnet.py
+++ b/extra/efficientnet.py
@@ -6,6 +6,7 @@ from extra.utils import fetch

 USE_TORCH = False

+
 def fake_torch_load(b0):
   import io
   import pickle
@@ -23,9 +24,9 @@ def fake_torch_load(b0):

   class HackTensor:
     def __new__(cls, *args):
-      #print(args)
+      # print(args)
       ident, storage_type, obj_key, location, obj_size, view_metadata = args[0]
-      assert ident == 'storage'
+      assert ident == "storage"

       ret = np.zeros(obj_size, dtype=storage_type)
       key_prelookup[obj_key] = (storage_type, obj_size, ret, args[2], args[3])
@@ -33,10 +34,10 @@ def fake_torch_load(b0):

   class MyPickle(pickle.Unpickler):
     def find_class(self, module, name):
-      #print(module, name)
-      if name == 'FloatStorage':
+      # print(module, name)
+      if name == "FloatStorage":
         return np.float32
-      if name == 'LongStorage':
+      if name == "LongStorage":
         return np.int64
       if module == "torch._utils" or module == "torch":
         return HackTensor
@@ -51,7 +52,7 @@ def fake_torch_load(b0):
   # create key_lookup
   key_lookup = pickle.load(fb0)
   key_real = [None] * len(key_lookup)
-  for k,v in key_prelookup.items():
+  for k, v in key_prelookup.items():
     key_real[key_lookup.index(k)] = v

   # read in the actual data
@@ -64,13 +65,23 @@
     np_array.shape = np_shape

     # numpy stores its strides in bytes
-    real_strides = tuple([x*bytes_size for x in np_strides])
+    real_strides = tuple([x * bytes_size for x in np_strides])
     np_array.strides = real_strides

   return ret

+
 class MBConvBlock:
-  def __init__(self, kernel_size, strides, expand_ratio, input_filters, output_filters, se_ratio, has_se):
+  def __init__(
+      self,
+      kernel_size,
+      strides,
+      expand_ratio,
+      input_filters,
+      output_filters,
+      se_ratio,
+      has_se,
+  ):
     oup = expand_ratio * input_filters
     if expand_ratio != 1:
       self._expand_conv = Tensor.uniform(oup, input_filters, 1, 1)
@@ -79,10 +90,10 @@ class MBConvBlock:
       self._expand_conv = None

     self.strides = strides
-    if strides == (2,2):
-      self.pad = [(kernel_size-1)//2-1, (kernel_size-1)//2]*2
+    if strides == (2, 2):
+      self.pad = [(kernel_size - 1) // 2 - 1, (kernel_size - 1) // 2] * 2
     else:
-      self.pad = [(kernel_size-1)//2]*4
+      self.pad = [(kernel_size - 1) // 2] * 4

     self._depthwise_conv = Tensor.uniform(oup, 1, kernel_size, kernel_size)
     self._bn1 = BatchNorm2D(oup)
@@ -103,14 +114,22 @@ class MBConvBlock:
     if self._expand_conv:
       x = self._bn0(x.conv2d(self._expand_conv)).swish()
     x = x.pad2d(padding=self.pad)
-    x = x.conv2d(self._depthwise_conv, stride=self.strides, groups=self._depthwise_conv.shape[0])
+    x = x.conv2d(
+        self._depthwise_conv, stride=self.strides, groups=self._depthwise_conv.shape[0]
+    )
     x = self._bn1(x).swish()

     # has_se
     if self.has_se:
       x_squeezed = x.avg_pool2d(kernel_size=x.shape[2:4])
-      x_squeezed = x_squeezed.conv2d(self._se_reduce).add(self._se_reduce_bias.reshape(shape=[1, -1, 1, 1])).swish()
-      x_squeezed = x_squeezed.conv2d(self._se_expand).add(self._se_expand_bias.reshape(shape=[1, -1, 1, 1]))
+      x_squeezed = (
+          x_squeezed.conv2d(self._se_reduce)
+          .add(self._se_reduce_bias.reshape(shape=[1, -1, 1, 1]))
+          .swish()
+      )
+      x_squeezed = x_squeezed.conv2d(self._se_expand).add(
+          self._se_expand_bias.reshape(shape=[1, -1, 1, 1])
+      )
       x = x.mul(x_squeezed.sigmoid())

     x = self._bn2(x.conv2d(self._project_conv))
@@ -118,21 +137,22 @@ class MBConvBlock:
       x = x.add(inputs)
     return x

+
 class EfficientNet:
   def __init__(self, number=0, classes=1000, has_se=True):
     self.number = number
     global_params = [
       # width, depth
-      (1.0, 1.0), # b0
-      (1.0, 1.1), # b1
-      (1.1, 1.2), # b2
-      (1.2, 1.4), # b3
-      (1.4, 1.8), # b4
-      (1.6, 2.2), # b5
-      (1.8, 2.6), # b6
-      (2.0, 3.1), # b7
-      (2.2, 3.6), # b8
-      (4.3, 5.3), # l2
+      (1.0, 1.0),  # b0
+      (1.0, 1.1),  # b1
+      (1.1, 1.2),  # b2
+      (1.2, 1.4),  # b3
+      (1.4, 1.8),  # b4
+      (1.6, 2.2),  # b5
+      (1.8, 2.6),  # b6
+      (2.0, 3.1),  # b7
+      (2.2, 3.6),  # b8
+      (4.3, 5.3),  # l2
     ][number]

     def round_filters(filters):
@@ -140,7 +160,7 @@ class EfficientNet:
       divisor = 8
       filters *= multiplier
       new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
-      if new_filters < 0.9 * filters: # prevent rounding by more than 10%
+      if new_filters < 0.9 * filters:  # prevent rounding by more than 10%
         new_filters += divisor
       return int(new_filters)

@@ -151,13 +171,13 @@ class EfficientNet:
     self._conv_stem = Tensor.uniform(out_channels, 3, 3, 3)
     self._bn0 = BatchNorm2D(out_channels)
     blocks_args = [
-      [1, 3, (1,1), 1, 32, 16, 0.25],
-      [2, 3, (2,2), 6, 16, 24, 0.25],
-      [2, 5, (2,2), 6, 24, 40, 0.25],
-      [3, 3, (2,2), 6, 40, 80, 0.25],
-      [3, 5, (1,1), 6, 80, 112, 0.25],
-      [4, 5, (2,2), 6, 112, 192, 0.25],
-      [1, 3, (1,1), 6, 192, 320, 0.25],
+      [1, 3, (1, 1), 1, 32, 16, 0.25],
+      [2, 3, (2, 2), 6, 16, 24, 0.25],
+      [2, 5, (2, 2), 6, 24, 40, 0.25],
+      [3, 3, (2, 2), 6, 40, 80, 0.25],
+      [3, 5, (1, 1), 6, 80, 112, 0.25],
+      [4, 5, (2, 2), 6, 112, 192, 0.25],
+      [1, 3, (1, 1), 6, 192, 320, 0.25],
     ]
     self._blocks = []
     # num_repeats, kernel_size, strides, expand_ratio, input_filters, output_filters, se_ratio
@@ -168,7 +188,7 @@ class EfficientNet:
       for n in range(round_repeats(b[0])):
         self._blocks.append(MBConvBlock(*args, has_se=has_se))
         args[3] = args[4]
-        args[1] = (1,1)
+        args[1] = (1, 1)

     in_channels = round_filters(320)
     out_channels = round_filters(1280)
@@ -178,43 +198,52 @@ class EfficientNet:
     self._fc_bias = Tensor.zeros(classes)

   def forward(self, x):
-    x = x.pad2d(padding=(0,1,0,1))
+    x = x.pad2d(padding=(0, 1, 0, 1))
     x = self._bn0(x.conv2d(self._conv_stem, stride=2)).swish()
-    #print(x.shape, x.data[:, 0, 0, 0])
+    # print(x.shape, x.data[:, 0, 0, 0])
     for block in self._blocks:
       x = block(x)
     x = self._bn1(x.conv2d(self._conv_head)).swish()
     x = x.avg_pool2d(kernel_size=x.shape[2:4])
     x = x.reshape(shape=(-1, x.shape[1]))
-    #x = x.dropout(0.2)
-    return x.dot(self._fc).add(self._fc_bias.reshape(shape=[1,-1]))
+    # x = x.dropout(0.2)
+    return x.dot(self._fc).add(self._fc_bias.reshape(shape=[1, -1]))

   def load_weights_from_torch(self):
     # load b0
     # https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/utils.py#L551
     if self.number == 0:
-      b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth")
+      b0 = fetch(
+          "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth"
+      )
     elif self.number == 2:
-      b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth")
+      b0 = fetch(
+          "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth"
+      )
     elif self.number == 4:
-      b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth")
+      b0 = fetch(
+          "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth"
+      )
     elif self.number == 7:
-      b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth")
+      b0 = fetch(
+          "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth"
+      )
     else:
       raise Exception("no pretrained weights")

     if USE_TORCH:
       import io
       import torch
+
       b0 = torch.load(io.BytesIO(b0))
     else:
       b0 = fake_torch_load(b0)

-    for k,v in b0.items():
-      if '_blocks.' in k:
+    for k, v in b0.items():
+      if "_blocks." in k:
         k = "%s[%s].%s" % tuple(k.split(".", 2))
-      mk = "self."+k
-      #print(k, v.shape)
+      mk = "self." + k
+      # print(k, v.shape)
       try:
         mv = eval(mk)
       except AttributeError:
@@ -223,10 +252,9 @@ class EfficientNet:
           mv = eval(mk.replace(".bias", "_bias"))
       vnp = v.numpy().astype(np.float32) if USE_TORCH else v
-      vnp = vnp if k != '_fc.weight' else vnp.T
+      vnp = vnp if k != "_fc.weight" else vnp.T

       if mv.shape == vnp.shape or vnp.shape == ():
         mv.data[:] = vnp
       else:
         print("MISMATCH SHAPE IN %s, %r %r" % (k, mv.shape, vnp.shape))
-
diff --git a/extra/gradcheck.py b/extra/gradcheck.py
index 4ebf04db50..603ac0f2b5 100644
--- a/extra/gradcheck.py
+++ b/extra/gradcheck.py
@@ -1,17 +1,19 @@
 import numpy as np
 from tinygrad.tensor import Tensor

-def mask_like(like, mask_inx, mask_value = 1.0):
+
+def mask_like(like, mask_inx, mask_value=1.0):
   mask = np.zeros_like(like).reshape(-1)
   mask[mask_inx] = mask_value
   return mask.reshape(like.shape)

+
 def jacobian(func, input):
   output = func(input)

   ji = input.data.reshape(-1).shape[-1]
   jo = output.data.reshape(-1).shape[-1]
-  J = np.zeros((jo,ji), dtype=np.float32)
+  J = np.zeros((jo, ji), dtype=np.float32)

   for o in range(jo):
     input.grad = None
@@ -19,14 +21,15 @@ def jacobian(func, input):

     # tinygrad doesn't support slicing, tiny-hack to select
     # the needed scalar and backpropagate only through it
-    o_scalar = Tensor(mask_like(output.data, o, 1.)).mul(output).sum()
+    o_scalar = Tensor(mask_like(output.data, o, 1.0)).mul(output).sum()
     o_scalar.backward()

     for i, grad in enumerate(input.grad.data.reshape(-1)):
-      J[o,i] = grad
+      J[o, i] = grad
   return J

-def numerical_jacobian(func, input, eps = 1e-6):
+
+def numerical_jacobian(func, input, eps=1e-6):
   output = func(input)

   ji = input.data.reshape(-1).shape[-1]
@@ -34,17 +37,18 @@ def numerical_jacobian(func, input, eps = 1e-6):
   NJ = np.zeros((jo, ji), dtype=np.float32)

   for i in range(ji):
-    eps_perturb = mask_like(input.data, i, mask_value = eps)
+    eps_perturb = mask_like(input.data, i, mask_value=eps)

     output_perturb_add = func(Tensor(input.data + eps_perturb)).data.reshape(-1)
     output_perturb_sub = func(Tensor(input.data - eps_perturb)).data.reshape(-1)

-    grad_approx = ((output_perturb_add) - (output_perturb_sub)) / (2*eps)
+    grad_approx = ((output_perturb_add) - (output_perturb_sub)) / (2 * eps)

-    NJ[:,i] = grad_approx
+    NJ[:, i] = grad_approx
   return NJ

-def gradcheck(func, input, eps = 1e-06, atol = 1e-5, rtol = 0.001):
+
+def gradcheck(func, input, eps=1e-06, atol=1e-5, rtol=0.001):
   NJ = numerical_jacobian(func, input, eps)
   J = jacobian(func, input)
   return np.allclose(J, NJ, atol=atol, rtol=rtol)
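For reference, a minimal sketch of how gradcheck is typically invoked; the function and shapes are illustrative (mirroring the pattern used in tinygrad's tests), not part of the patch:

import numpy as np
from tinygrad.tensor import Tensor
from extra.gradcheck import gradcheck

np.random.seed(1337)
x = Tensor(np.random.randn(1, 10).astype(np.float32))
W = Tensor(np.random.randn(10, 5).astype(np.float32))
func = lambda t: t.dot(W).relu().logsoftmax()
print(gradcheck(func, x))  # True when backward() matches the numerical jacobian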
diff --git a/extra/training.py b/extra/training.py
index 24fc84acb4..8e00b8ffb4 100644
--- a/extra/training.py
+++ b/extra/training.py
@@ -4,26 +4,40 @@ from tqdm import trange
 from extra.utils import get_parameters
 from tinygrad.tensor import Tensor, GPU, Device

-def train(model, X_train, Y_train, optim, steps, num_classes=None, BS=128, device=Device.CPU, lossfn = lambda out,y: out.mul(y).mean()):
-  if device == Device.GPU: [x.gpu_() for x in get_parameters([model, optim])]
-  elif device == Device.ANE: [x.ane_() for x in get_parameters([model, optim])]
-  if num_classes is None: num_classes = Y_train.max().astype(int)+1
+
+def train(
+    model,
+    X_train,
+    Y_train,
+    optim,
+    steps,
+    num_classes=None,
+    BS=128,
+    device=Device.CPU,
+    lossfn=lambda out, y: out.mul(y).mean(),
+):
+  if device == Device.GPU:
+    [x.gpu_() for x in get_parameters([model, optim])]
+  elif device == Device.ANE:
+    [x.ane_() for x in get_parameters([model, optim])]
+  if num_classes is None:
+    num_classes = Y_train.max().astype(int) + 1
   losses, accuracies = [], []
-  for i in (t := trange(steps, disable=os.getenv('CI') is not None)):
+  for i in (t := trange(steps, disable=os.getenv("CI") is not None)) :
     samp = np.random.randint(0, X_train.shape[0], size=(BS))

-    x = Tensor(X_train[samp].reshape((-1, 28*28)).astype(np.float32), device=device)
+    x = Tensor(X_train[samp].reshape((-1, 28 * 28)).astype(np.float32), device=device)
     Y = Y_train[samp]
-    y = np.zeros((len(samp),num_classes), np.float32)
+    y = np.zeros((len(samp), num_classes), np.float32)
     # correct loss for NLL, torch NLL loss returns one per row
-    y[range(y.shape[0]),Y] = -1.0*num_classes
+    y[range(y.shape[0]), Y] = -1.0 * num_classes
     y = Tensor(y, device=device)

     # network
     out = model.forward(x)

     # NLL loss function
-    loss = lossfn(out, y) 
+    loss = lossfn(out, y)
     optim.zero_grad()
     loss.backward()
     optim.step()
@@ -37,15 +51,26 @@ def train(model, X_train, Y_train, optim, steps, num_classes=None, BS=128, devic
     accuracies.append(accuracy)
     t.set_description("loss %.2f accuracy %.2f" % (loss, accuracy))

+
 def evaluate(model, X_test, Y_test, num_classes=None, device=Device.CPU, BS=128):
   def numpy_eval(num_classes):
-    Y_test_preds_out = np.zeros((len(Y_test),num_classes))
-    for i in trange(len(Y_test)//BS, disable=os.getenv('CI') is not None):
-      Y_test_preds_out[i*BS:(i+1)*BS] = model.forward(Tensor(X_test[i*BS:(i+1)*BS].reshape((-1, 28*28)).astype(np.float32), device=device)).cpu().data
+    Y_test_preds_out = np.zeros((len(Y_test), num_classes))
+    for i in trange(len(Y_test) // BS, disable=os.getenv("CI") is not None):
+      Y_test_preds_out[i * BS : (i + 1) * BS] = (
+          model.forward(
+              Tensor(
+                  X_test[i * BS : (i + 1) * BS].reshape((-1, 28 * 28)).astype(np.float32),
+                  device=device,
+              )
+          )
+          .cpu()
+          .data
+      )
     Y_test_preds = np.argmax(Y_test_preds_out, axis=1)
     return (Y_test == Y_test_preds).mean()

-  if num_classes is None: num_classes = Y_test.max().astype(int)+1
+  if num_classes is None:
+    num_classes = Y_test.max().astype(int) + 1
   accuracy = numpy_eval(num_classes)
   print("test set accuracy is %f" % accuracy)
-  return accuracy
+  return accuracy
diff --git a/extra/utils.py b/extra/utils.py
index 01d20b05e8..e8a822bb0e 100644
--- a/extra/utils.py
+++ b/extra/utils.py
@@ -1,19 +1,22 @@
 from tinygrad.tensor import Tensor

+
 def fetch(url):
   import requests, os, hashlib, tempfile
-  fp = os.path.join(tempfile.gettempdir(), hashlib.md5(url.encode('utf-8')).hexdigest())
+
+  fp = os.path.join(tempfile.gettempdir(), hashlib.md5(url.encode("utf-8")).hexdigest())
   if os.path.isfile(fp) and os.stat(fp).st_size > 0:
     with open(fp, "rb") as f:
       dat = f.read()
   else:
     print("fetching %s" % url)
     dat = requests.get(url).content
-    with open(fp+".tmp", "wb") as f:
+    with open(fp + ".tmp", "wb") as f:
       f.write(dat)
-    os.rename(fp+".tmp", fp)
+    os.rename(fp + ".tmp", fp)
   return dat

+
 def get_parameters(obj):
   parameters = []
   if isinstance(obj, Tensor):
@@ -21,8 +24,7 @@ def get_parameters(obj):
   elif isinstance(obj, list):
     for x in obj:
       parameters.extend(get_parameters(x))
-  elif hasattr(obj, '__dict__'):
-    for k,v in obj.__dict__.items():
+  elif hasattr(obj, "__dict__"):
+    for k, v in obj.__dict__.items():
       parameters.extend(get_parameters(v))
   return parameters
-
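fetch above caches each download in the system temp directory under the md5 of the URL, so repeated calls are served from disk. A small illustrative use (the CIFAR URL is the one train_efficientnet.py fetches):

from extra.utils import fetch

dat = fetch("https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")
print(len(dat))  # a second call returns the cached bytes without re-downloading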
encoding="utf-8") as f: long_description = f.read() -setup(name='tinygrad', - version='0.3.0', - description='You like pytorch? You like micrograd? You love tinygrad! heart', - author='George Hotz', - license='MIT', - long_description=long_description, - long_description_content_type='text/markdown', - packages = ['tinygrad'], - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License" - ], - install_requires=['numpy', 'requests'], - python_requires='>=3.8', - extras_require={ - 'gpu': ["pyopencl", "six"], - 'testing': [ - "pytest", - "torch", - "tqdm", - ], - }, - include_package_data=True) +setup( + name="tinygrad", + version="0.3.0", + description="You like pytorch? You like micrograd? You love tinygrad! heart", + author="George Hotz", + license="MIT", + long_description=long_description, + long_description_content_type="text/markdown", + packages=["tinygrad"], + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + ], + install_requires=["numpy", "requests"], + python_requires=">=3.8", + extras_require={ + "gpu": ["pyopencl", "six"], + "testing": [ + "pytest", + "torch", + "tqdm", + ], + }, + include_package_data=True, +) diff --git a/test/config.py b/test/config.py index ab20e8b39e..990bbebef7 100644 --- a/test/config.py +++ b/test/config.py @@ -1,3 +1,3 @@ import os -ANE = os.environ.get('ANE', False) +ANE = os.environ.get("ANE", False) diff --git a/test/test_gc.py b/test/test_gc.py index 2dc007d34b..2e7d84df42 100644 --- a/test/test_gc.py +++ b/test/test_gc.py @@ -4,43 +4,48 @@ import unittest from tinygrad.tensor import Tensor, GPU, Device from .config import ANE + def tensors_allocated(): return sum([isinstance(x, Tensor) for x in gc.get_objects()]) - + + class TestGC(unittest.TestCase): device = Device.CPU def test_gc(self): - a = Tensor.zeros(4,4, device=self.device) - b = Tensor.zeros(4,4, device=self.device) - (a*b).mean().backward() - assert(tensors_allocated() > 0) - del a,b - assert(tensors_allocated() == 0) + a = Tensor.zeros(4, 4, device=self.device) + b = Tensor.zeros(4, 4, device=self.device) + (a * b).mean().backward() + assert tensors_allocated() > 0 + del a, b + assert tensors_allocated() == 0 def test_gc_complex(self): - a = Tensor.zeros(4,4, device=self.device) - b = Tensor.zeros(4,4, device=self.device) - assert(tensors_allocated() == 2) - (a*b).mean().backward() - assert(tensors_allocated() == 4) + a = Tensor.zeros(4, 4, device=self.device) + b = Tensor.zeros(4, 4, device=self.device) + assert tensors_allocated() == 2 + (a * b).mean().backward() + assert tensors_allocated() == 4 del b - assert(tensors_allocated() == 2) - b = Tensor.zeros(4,4, device=self.device) + assert tensors_allocated() == 2 + b = Tensor.zeros(4, 4, device=self.device) print(tensors_allocated()) - (a*b).mean().backward() + (a * b).mean().backward() print(tensors_allocated()) - assert(tensors_allocated() == 4) + assert tensors_allocated() == 4 del b - assert(tensors_allocated() == 2) + assert tensors_allocated() == 2 + @unittest.skipUnless(GPU, "Requires GPU") class TestGCGPU(TestGC): - device = Device.GPU + device = Device.GPU + @unittest.skipUnless(ANE, "Requires ANE") class TestGCANE(TestGC): - device=Device.ANE + device = Device.ANE -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/test/test_mnist.py b/test/test_mnist.py index 8edae37f4d..35ed13c50e 100644 --- a/test/test_mnist.py +++ b/test/test_mnist.py @@ -11,19 +11,28 @@ from .config import ANE # mnist loader def 
diff --git a/test/test_mnist.py b/test/test_mnist.py
index 8edae37f4d..35ed13c50e 100644
--- a/test/test_mnist.py
+++ b/test/test_mnist.py
@@ -11,19 +11,28 @@ from .config import ANE

 # mnist loader
 def fetch_mnist():
   import gzip
+
   parse = lambda dat: np.frombuffer(gzip.decompress(dat), dtype=np.uint8).copy()
-  X_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))[0x10:].reshape((-1, 28, 28))
-  Y_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))[8:]
-  X_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"))[0x10:].reshape((-1, 28, 28))
-  Y_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"))[8:]
+  X_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))[
+      0x10:
+  ].reshape((-1, 28, 28))
+  Y_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))[
+      8:
+  ]
+  X_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"))[
+      0x10:
+  ].reshape((-1, 28, 28))
+  Y_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"))[
+      8:
+  ]
   return X_train, Y_train, X_test, Y_test

+
 # load the mnist dataset
 X_train, Y_train, X_test, Y_test = fetch_mnist()

+
 # create a model
 class TinyBobNet:
-
   def __init__(self):
     self.l1 = Tensor.uniform(784, 128)
     self.l2 = Tensor.uniform(128, 10)
@@ -34,27 +43,29 @@ class TinyBobNet:
   def forward(self, x):
     return x.dot(self.l1).relu().dot(self.l2).logsoftmax()

+
 # create a model with a conv layer
 class TinyConvNet:
   def __init__(self):
     # https://keras.io/examples/vision/mnist_convnet/
     conv = 3
-    #inter_chan, out_chan = 32, 64
-    inter_chan, out_chan = 8, 16 # for speed
-    self.c1 = Tensor.uniform(inter_chan,1,conv,conv)
-    self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
-    self.l1 = Tensor.uniform(out_chan*5*5, 10)
+    # inter_chan, out_chan = 32, 64
+    inter_chan, out_chan = 8, 16  # for speed
+    self.c1 = Tensor.uniform(inter_chan, 1, conv, conv)
+    self.c2 = Tensor.uniform(out_chan, inter_chan, conv, conv)
+    self.l1 = Tensor.uniform(out_chan * 5 * 5, 10)

   def parameters(self):
     return get_parameters(self)

   def forward(self, x):
-    x = x.reshape(shape=(-1, 1, 28, 28)) # hacks
+    x = x.reshape(shape=(-1, 1, 28, 28))  # hacks
     x = x.conv2d(self.c1).relu().max_pool2d()
     x = x.conv2d(self.c2).relu().max_pool2d()
     x = x.reshape(shape=[x.shape[0], -1])
     return x.dot(self.l1).logsoftmax()

+
 class TestMNIST(unittest.TestCase):
   device = Device.CPU

@@ -76,16 +87,19 @@ class TestMNIST(unittest.TestCase):
     np.random.seed(1337)
     model = TinyBobNet()
     optimizer = optim.RMSprop(model.parameters(), lr=0.0002)
-    train(model, X_train, Y_train, optimizer, steps=1000, device=self.device) 
+    train(model, X_train, Y_train, optimizer, steps=1000, device=self.device)
     assert evaluate(model, X_test, Y_test, device=self.device) > 0.95

+
 @unittest.skipUnless(GPU, "Requires GPU")
 class TestMNISTGPU(TestMNIST):
   device = Device.GPU

+
 @unittest.skipUnless(ANE, "Requires ANE")
 class TestMNISTANE(TestMNIST):
-  device=Device.ANE
+  device = Device.ANE

-if __name__ == '__main__':
+
+if __name__ == "__main__":
   unittest.main()
diff --git a/test/test_net_speed.py b/test/test_net_speed.py
index 8b1ed84a16..bcf60b2f13 100644
--- a/test/test_net_speed.py
+++ b/test/test_net_speed.py
@@ -7,21 +7,25 @@ import torch
 from tinygrad.tensor import Tensor, GPU, Device
 from .config import ANE

+
 def start_profile():
   import time
-  pr = cProfile.Profile(timer=lambda: int(time.time()*1e9), timeunit=1e-6)
+
+  pr = cProfile.Profile(timer=lambda: int(time.time() * 1e9), timeunit=1e-6)
   pr.enable()
   return pr

-def stop_profile(pr, sort='cumtime'):
+
+def stop_profile(pr, sort="cumtime"):
   pr.disable()

   ps = pstats.Stats(pr)
   ps.strip_dirs()
   ps.sort_stats(sort)
   ps.print_stats(0.2)

+
 class TestConvSpeed(unittest.TestCase):
-  device= Device.CPU
+  device = Device.CPU

   def test_mnist(self):
     # https://keras.io/examples/vision/mnist_convnet/
@@ -34,12 +38,12 @@ class TestConvSpeed(unittest.TestCase):

     conv = 3
     inter_chan, out_chan = 32, 64
-    c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
-    c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
-    l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)
+    c1 = torch.randn(inter_chan, 1, conv, conv, requires_grad=True)
+    c2 = torch.randn(out_chan, inter_chan, conv, conv, requires_grad=True)
+    l1 = torch.randn(out_chan * 5 * 5, 10, requires_grad=True)

     c2d = torch.nn.functional.conv2d
-    mp = torch.nn.MaxPool2d((2,2))
+    mp = torch.nn.MaxPool2d((2, 2))
     lsm = torch.nn.LogSoftmax(dim=1)

     cnt = 5
@@ -47,19 +51,19 @@ class TestConvSpeed(unittest.TestCase):
     for i in range(cnt):
       et0 = time.time()
       x = torch.randn(128, 1, 28, 28, requires_grad=True)
-      x = mp(c2d(x,c1).relu())
-      x = mp(c2d(x,c2).relu())
+      x = mp(c2d(x, c1).relu())
+      x = mp(c2d(x, c2).relu())
       x = x.reshape(x.shape[0], -1)
       out = lsm(x.matmul(l1))
       out = out.mean()
       et1 = time.time()
       out.backward()
       et2 = time.time()
-      fpt += (et1-et0)
-      bpt += (et2-et1)
+      fpt += et1 - et0
+      bpt += et2 - et1

-    fpt_baseline = (fpt*1000/cnt)
-    bpt_baseline = (bpt*1000/cnt)
+    fpt_baseline = fpt * 1000 / cnt
+    bpt_baseline = bpt * 1000 / cnt
     print("torch forward pass: %.3f ms" % fpt_baseline)
     print("torch backward pass: %.3f ms" % bpt_baseline)
@@ -71,7 +75,7 @@ class TestConvSpeed(unittest.TestCase):

     cnt = 5
     fpt, bpt = 0.0, 0.0
-    for i in range(1+cnt):
+    for i in range(1 + cnt):
       et0 = time.time()
       x = Tensor.randn(128, 1, 28, 28, device=self.device)
       x = x.conv2d(c1).relu().avg_pool2d()
@@ -85,24 +89,31 @@ class TestConvSpeed(unittest.TestCase):
       if i == 0:
         pr = start_profile()
       else:
-        fpt += (et1-et0)
-        bpt += (et2-et1)
+        fpt += et1 - et0
+        bpt += et2 - et1
+
+    stop_profile(pr, sort="time")
+    fpt = fpt * 1000 / cnt
+    bpt = bpt * 1000 / cnt
+    print(
+        "forward pass: %.3f ms, %.2fx off baseline %.3f ms"
+        % (fpt, fpt / fpt_baseline, fpt_baseline)
+    )
+    print(
+        "backward pass: %.3f ms, %.2fx off baseline %.3f ms"
+        % (bpt, bpt / bpt_baseline, bpt_baseline)
+    )

-    stop_profile(pr, sort='time')
-    fpt = (fpt*1000/cnt)
-    bpt = (bpt*1000/cnt)
-    print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
-    print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))

 @unittest.skipUnless(GPU, "Requires GPU")
 class TestConvSpeedGPU(TestConvSpeed):
   device = Device.GPU

+
 @unittest.skipUnless(ANE, "Requires ANE")
 class TestConvSpeedANE(TestConvSpeed):
-  device=Device.ANE
+  device = Device.ANE

-if __name__ == '__main__':
+
+if __name__ == "__main__":
   unittest.main()
-
diff --git a/test/test_nn.py b/test/test_nn.py
index ba00c7340b..6273ba479f 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -7,6 +7,7 @@ from extra.utils import get_parameters
 import torch
 from .config import ANE

+
 class TestNN(unittest.TestCase):
   device = Device.CPU

@@ -30,8 +31,12 @@ class TestNN(unittest.TestCase):
     tbn.running_mean[:] = torch.tensor(bn.running_mean.data)
     tbn.running_var[:] = torch.tensor(bn.running_var.data)

-    np.testing.assert_allclose(bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5)
-    np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
+    np.testing.assert_allclose(
+        bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5
tbn.running_mean.detach().numpy(), rtol=1e-5 + ) + np.testing.assert_allclose( + bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5 + ) # trial inn = Tensor.randn(2, sz, 3, 3, device=self.device) @@ -45,35 +50,42 @@ class TestNN(unittest.TestCase): # close np.testing.assert_allclose(outt.data, toutt.detach().numpy(), rtol=5e-5) - np.testing.assert_allclose(bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5) + np.testing.assert_allclose( + bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5 + ) # TODO: this is failing - #np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5) + # np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5) def test_batchnorm2d_training(self): self.test_batchnorm2d(True) + @unittest.skipUnless(GPU, "Requires GPU") class TestNNGPU(TestNN): device = Device.GPU @unittest.skip("Tests not added") - def test_batchnorm2d(self): pass + def test_batchnorm2d(self): + pass @unittest.skip("Tests not added") - def test_batchnorm2d_training(self): pass + def test_batchnorm2d_training(self): + pass @unittest.skipUnless(ANE, "Requires ANE") class TestNNANE(TestNN): - device=Device.ANE + device = Device.ANE @unittest.skip("Tests not added") - def test_batchnorm2d(self): pass + def test_batchnorm2d(self): + pass @unittest.skip("Tests not added") - def test_batchnorm2d_training(self): pass + def test_batchnorm2d_training(self): + pass -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/test/test_ops.py b/test/test_ops.py index 4e283f87e2..afaffa5125 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -7,13 +7,24 @@ import functools from tinygrad.tensor import Tensor, GPU, Device from .config import ANE -def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=0, rtol=1e-6, grad_atol=0, grad_rtol=1e-6, device=Device.CPU, forward_only=False): + +def helper_test_op( + shps, + torch_fxn, + tinygrad_fxn, + atol=0, + rtol=1e-6, + grad_atol=0, + grad_rtol=1e-6, + device=Device.CPU, + forward_only=False, +): torch.manual_seed(0) ts = [torch.rand(x, requires_grad=True) for x in shps] tst = [Tensor(x.detach().numpy()) for x in ts] - if device==Device.GPU: + if device == Device.GPU: tst = [x.gpu() for x in tst] - elif device==Device.ANE: + elif device == Device.ANE: tst = [x.ane() for x in tst] out = torch_fxn(*ts) @@ -26,142 +37,291 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=0, rtol=1e-6, grad_atol=0 ret.mean().backward() for t, tt in zip(ts, tst): - np.testing.assert_allclose(t.grad, tt.cpu().grad.data, atol=grad_atol, rtol=grad_rtol) + np.testing.assert_allclose( + t.grad, tt.cpu().grad.data, atol=grad_atol, rtol=grad_rtol + ) # speed - torch_fp = timeit.Timer(functools.partial(torch_fxn, *ts)).timeit(5) * 1000/5 - tinygrad_fp = timeit.Timer(functools.partial(tinygrad_fxn, *tst)).timeit(5) * 1000/5 + torch_fp = timeit.Timer(functools.partial(torch_fxn, *ts)).timeit(5) * 1000 / 5 + tinygrad_fp = timeit.Timer(functools.partial(tinygrad_fxn, *tst)).timeit(5) * 1000 / 5 if not forward_only: - torch_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), torch_fxn, ts)).timeit(5) * 1000/5 - tinygrad_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), tinygrad_fxn, tst)).timeit(5) * 1000/5 + torch_fbp = ( + timeit.Timer( + functools.partial(lambda f, x: f(*x).mean().backward(), torch_fxn, ts) + ).timeit(5) + * 1000 + / 5 + ) + tinygrad_fbp = ( + timeit.Timer( + functools.partial(lambda f, 
x: f(*x).mean().backward(), tinygrad_fxn, tst) + ).timeit(5) + * 1000 + / 5 + ) else: torch_fbp, tinygrad_fbp = np.nan, np.nan - print("testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms" % (shps, torch_fp, tinygrad_fp, torch_fbp-torch_fp, tinygrad_fbp-tinygrad_fp)) + print( + "testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms" + % (shps, torch_fp, tinygrad_fp, torch_fbp - torch_fp, tinygrad_fbp - tinygrad_fp) + ) + class TestOps(unittest.TestCase): - device=Device.CPU + device = Device.CPU def test_add(self): - helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add, device=self.device) + helper_test_op( + [(45, 65), (45, 65)], lambda x, y: x + y, Tensor.add, device=self.device + ) + def test_sub(self): - helper_test_op([(45,65), (45,65)], lambda x,y: x-y, Tensor.sub, device=self.device) + helper_test_op( + [(45, 65), (45, 65)], lambda x, y: x - y, Tensor.sub, device=self.device + ) + def test_mul(self): - helper_test_op([(45,65), (45,65)], lambda x,y: x*y, Tensor.mul, device=self.device) + helper_test_op( + [(45, 65), (45, 65)], lambda x, y: x * y, Tensor.mul, device=self.device + ) + def test_div(self): - helper_test_op([(45,65), (45,65)], lambda x,y: x/y, Tensor.div, device=self.device) + helper_test_op( + [(45, 65), (45, 65)], lambda x, y: x / y, Tensor.div, device=self.device + ) + def test_pow(self): - helper_test_op([(45,65), (45,65)], lambda x,y: x**y, Tensor.pow, device=self.device) + helper_test_op( + [(45, 65), (45, 65)], lambda x, y: x ** y, Tensor.pow, device=self.device + ) + def test_sqrt(self): - helper_test_op([(45,65)], lambda x: x.sqrt(), Tensor.sqrt, device=self.device) + helper_test_op([(45, 65)], lambda x: x.sqrt(), Tensor.sqrt, device=self.device) + def test_relu(self): - helper_test_op([(45,65)], lambda x: x.relu(), Tensor.relu, device=self.device) + helper_test_op([(45, 65)], lambda x: x.relu(), Tensor.relu, device=self.device) + def test_leakyrelu(self): - helper_test_op([(45,65)], lambda x: torch.nn.functional.leaky_relu(x,0.01), Tensor.leakyrelu, device=self.device) + helper_test_op( + [(45, 65)], + lambda x: torch.nn.functional.leaky_relu(x, 0.01), + Tensor.leakyrelu, + device=self.device, + ) + def test_abs(self): - helper_test_op([(45,65)], lambda x: torch.abs(x), Tensor.abs, device=self.device) + helper_test_op([(45, 65)], lambda x: torch.abs(x), Tensor.abs, device=self.device) + def test_sigmoid(self): - helper_test_op([(45,65)], lambda x: x.sigmoid(), Tensor.sigmoid, device=self.device) + helper_test_op( + [(45, 65)], lambda x: x.sigmoid(), Tensor.sigmoid, device=self.device + ) + def test_dot(self): - helper_test_op([(45,65), (65,100)], lambda x,y: x.matmul(y), Tensor.dot, device=self.device) + helper_test_op( + [(45, 65), (65, 100)], lambda x, y: x.matmul(y), Tensor.dot, device=self.device + ) + def test_sum(self): - helper_test_op([(45,3)], lambda x: x.sum(), Tensor.sum, device=self.device) + helper_test_op([(45, 3)], lambda x: x.sum(), Tensor.sum, device=self.device) + def test_sum_axis(self): - helper_test_op([(3,4,5,6)], lambda x: x.sum(axis=(1,2)), lambda x: Tensor.sum(x, axis=(1,2)), device=self.device) + helper_test_op( + [(3, 4, 5, 6)], + lambda x: x.sum(axis=(1, 2)), + lambda x: Tensor.sum(x, axis=(1, 2)), + device=self.device, + ) + def test_mean_axis(self): - helper_test_op([(3,4,5,6)], lambda x: x.mean(axis=(1,2)), lambda x: Tensor.mean(x, axis=(1,2)), device=self.device) + helper_test_op( + [(3, 4, 5, 6)], + lambda x: x.mean(axis=(1, 2)), + lambda x: Tensor.mean(x, axis=(1, 2)), + device=self.device, + 
) + def test_logsoftmax(self): - helper_test_op([(45,65)], lambda x: torch.nn.LogSoftmax(dim=1)(x), Tensor.logsoftmax, atol=1e-7, grad_atol=1e-7, device=self.device) + helper_test_op( + [(45, 65)], + lambda x: torch.nn.LogSoftmax(dim=1)(x), + Tensor.logsoftmax, + atol=1e-7, + grad_atol=1e-7, + device=self.device, + ) + def test_tanh(self): - helper_test_op([(45,65)], lambda x: x.tanh(), Tensor.tanh, atol=1e-6, grad_atol=1e-6, device=self.device) + helper_test_op( + [(45, 65)], + lambda x: x.tanh(), + Tensor.tanh, + atol=1e-6, + grad_atol=1e-6, + device=self.device, + ) + def test_topo_sort(self): - helper_test_op([(45,65)], lambda x: (x+x)*x, lambda x: x.add(x).mul(x), atol=1e-6, grad_atol=1e-6, device=self.device) + helper_test_op( + [(45, 65)], + lambda x: (x + x) * x, + lambda x: x.add(x).mul(x), + atol=1e-6, + grad_atol=1e-6, + device=self.device, + ) def test_scalar_mul(self): - helper_test_op([(45,65)], lambda x: x*2, lambda x: x*2, device=self.device) + helper_test_op([(45, 65)], lambda x: x * 2, lambda x: x * 2, device=self.device) + def test_scalar_rmul(self): - helper_test_op([(45,65)], lambda x: 2*x, lambda x: 2*x, device=self.device) + helper_test_op([(45, 65)], lambda x: 2 * x, lambda x: 2 * x, device=self.device) def test_scalar_sub(self): - helper_test_op([(45,65)], lambda x: x-2, lambda x: x-2, device=self.device) + helper_test_op([(45, 65)], lambda x: x - 2, lambda x: x - 2, device=self.device) + def test_scalar_rsub(self): - helper_test_op([(45,65)], lambda x: 2-x, lambda x: 2-x, device=self.device) + helper_test_op([(45, 65)], lambda x: 2 - x, lambda x: 2 - x, device=self.device) def test_broadcast_full(self): - for torch_op, tinygrad_op in [(torch.add, Tensor.add), (torch.sub, Tensor.sub), (torch.mul, Tensor.mul), - (torch.div, Tensor.div), (torch.pow, Tensor.pow)]: - for shapes in [((5,13,24,16), (5,1,24,1)), ((1,3,1,7,1), (2,1,5,1,8))]: + for torch_op, tinygrad_op in [ + (torch.add, Tensor.add), + (torch.sub, Tensor.sub), + (torch.mul, Tensor.mul), + (torch.div, Tensor.div), + (torch.pow, Tensor.pow), + ]: + for shapes in [ + ((5, 13, 24, 16), (5, 1, 24, 1)), + ((1, 3, 1, 7, 1), (2, 1, 5, 1, 8)), + ]: with self.subTest(op=torch_op.__name__, shapes=shapes): helper_test_op(shapes, torch_op, tinygrad_op, device=self.device) - def test_broadcast_partial(self): - for torch_op, tinygrad_op in [(torch.add, Tensor.add), (torch.sub, Tensor.sub), (torch.mul, Tensor.mul), - (torch.div, Tensor.div), (torch.pow, Tensor.pow)]: - for shapes in [((1,32,32,32), (1,32,1,1)), ((5,13,24,16,2), (1,13,24,1,1)), - ((4,1), (4,5)), ((1,4), (5,4))]: + for torch_op, tinygrad_op in [ + (torch.add, Tensor.add), + (torch.sub, Tensor.sub), + (torch.mul, Tensor.mul), + (torch.div, Tensor.div), + (torch.pow, Tensor.pow), + ]: + for shapes in [ + ((1, 32, 32, 32), (1, 32, 1, 1)), + ((5, 13, 24, 16, 2), (1, 13, 24, 1, 1)), + ((4, 1), (4, 5)), + ((1, 4), (5, 4)), + ]: with self.subTest(op=torch_op.__name__, shapes=shapes): # NOTE: ANE backwards? 
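# broadcast gradients are only checked on CPU below: forward_only is set for GPU/ANE, hence the NOTE above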
- helper_test_op(shapes, torch_op, tinygrad_op, device=self.device, forward_only=self.device!=Device.CPU) + helper_test_op( + shapes, + torch_op, + tinygrad_op, + device=self.device, + forward_only=self.device != Device.CPU, + ) def test_pad2d(self): - helper_test_op([(3,3,3,3)], lambda x: torch.nn.functional.pad(x, (1,2,3,4)), lambda x: x.pad2d(padding=(1,2,3,4)), device=self.device) + helper_test_op( + [(3, 3, 3, 3)], + lambda x: torch.nn.functional.pad(x, (1, 2, 3, 4)), + lambda x: x.pad2d(padding=(1, 2, 3, 4)), + device=self.device, + ) def test_reshape(self): - helper_test_op([(4,3,6,6)], lambda x: torch.reshape(x, (-1,3,6,6)), lambda x: x.reshape(shape=(-1,3,6,6)), device=self.device) - helper_test_op([(4,3,6,6)], lambda x: torch.reshape(x, (-1,1,6,6)), lambda x: x.reshape(shape=(-1,1,6,6)), device=self.device) + helper_test_op( + [(4, 3, 6, 6)], + lambda x: torch.reshape(x, (-1, 3, 6, 6)), + lambda x: x.reshape(shape=(-1, 3, 6, 6)), + device=self.device, + ) + helper_test_op( + [(4, 3, 6, 6)], + lambda x: torch.reshape(x, (-1, 1, 6, 6)), + lambda x: x.reshape(shape=(-1, 1, 6, 6)), + device=self.device, + ) def test_detach(self): - helper_test_op([(4,3,6,6)], lambda x: x.detach(), lambda x: x.detach(), device=self.device, forward_only=True) + helper_test_op( + [(4, 3, 6, 6)], + lambda x: x.detach(), + lambda x: x.detach(), + device=self.device, + forward_only=True, + ) def test_conv2d(self): - for bs in [1,8]: - for cin in [1,3]: - for groups in [1,3] if cin == 3 else [1]: - for H in [1,2,5]: - for W in [1,2,3,5]: - with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W): - helper_test_op([(bs,cin,11,28), (6,cin//groups,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(), - lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), device=self.device, grad_rtol=1e-5) + for bs in [1, 8]: + for cin in [1, 3]: + for groups in [1, 3] if cin == 3 else [1]: + for H in [1, 2, 5]: + for W in [1, 2, 3, 5]: + with self.subTest( + batch_size=bs, channels=cin, groups=groups, height=H, width=W + ): + helper_test_op( + [(bs, cin, 11, 28), (6, cin // groups, H, W)], + lambda x, w: torch.nn.functional.conv2d(x, w, groups=groups).relu(), + lambda x, w: Tensor.conv2d(x, w, groups=groups).relu(), + device=self.device, + grad_rtol=1e-5, + ) def test_strided_conv2d(self): bs = 4 cin = 3 - H,W = 3,3 + H, W = 3, 3 with self.subTest(stride := 2): - helper_test_op([(bs,cin,11,28), (4,cin,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,stride=2).relu(), - lambda x,w: Tensor.conv2d(x,w,stride=stride).relu(), device=self.device) - with self.subTest(stride := (2,1)): - helper_test_op([(bs,cin,11,28), (4,cin,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,stride=stride).relu(), - lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), device=self.device) + helper_test_op( + [(bs, cin, 11, 28), (4, cin, H, W)], + lambda x, w: torch.nn.functional.conv2d(x, w, stride=2).relu(), + lambda x, w: Tensor.conv2d(x, w, stride=stride).relu(), + device=self.device, + ) + with self.subTest(stride := (2, 1)): + helper_test_op( + [(bs, cin, 11, 28), (4, cin, H, W)], + lambda x, w: torch.nn.functional.conv2d(x, w, stride=stride).relu(), + lambda x, w: Tensor.conv2d(x, w, stride=(2, 1)).relu(), + device=self.device, + ) def test_maxpool2d(self): - for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1)]: + for ksz in [(2, 2), (3, 3), (3, 2), (5, 5), (5, 1)]: with self.subTest(kernel_size=ksz): - helper_test_op([(32,2,110,28)], + helper_test_op( + [(32, 2, 110, 28)], lambda x: 
torch.nn.functional.max_pool2d(x, kernel_size=ksz), - lambda x: Tensor.max_pool2d(x, kernel_size=ksz), device=self.device) + lambda x: Tensor.max_pool2d(x, kernel_size=ksz), + device=self.device, + ) def test_avgpool2d(self): - shape = (32,2,111,28) - for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1), shape[2:]]: + shape = (32, 2, 111, 28) + for ksz in [(2, 2), (3, 3), (3, 2), (5, 5), (5, 1), shape[2:]]: with self.subTest(kernel_size=ksz): - helper_test_op([shape], + helper_test_op( + [shape], lambda x: torch.nn.functional.avg_pool2d(x, kernel_size=ksz), - lambda x: Tensor.avg_pool2d(x, kernel_size=ksz), device=self.device) + lambda x: Tensor.avg_pool2d(x, kernel_size=ksz), + device=self.device, + ) + @unittest.skipUnless(GPU, "Requires GPU") class TestOpsGPU(TestOps): - device=Device.GPU + device = Device.GPU + @unittest.skipUnless(ANE, "Requires ANE") class TestOpsANE(TestOps): - device=Device.ANE + device = Device.ANE -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main(verbosity=2) - diff --git a/test/test_optim.py b/test/test_optim.py index 99ddcff57c..e4111c80bd 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -6,20 +6,24 @@ from tinygrad.optim import Adam, SGD, RMSprop from extra.utils import get_parameters from .config import ANE -x_init = np.random.randn(1,3).astype(np.float32) -W_init = np.random.randn(3,3).astype(np.float32) -m_init = np.random.randn(1,3).astype(np.float32) +x_init = np.random.randn(1, 3).astype(np.float32) +W_init = np.random.randn(3, 3).astype(np.float32) +m_init = np.random.randn(1, 3).astype(np.float32) + def step_tinygrad(optim, kwargs={}, device=Device.CPU): net = TinyNet() optim = optim([net.x, net.W], **kwargs) - if device==Device.GPU: [x.gpu_() for x in get_parameters([net, optim])] - elif device==Device.ANE: [x.ane_() for x in get_parameters([net, optim])] + if device == Device.GPU: + [x.gpu_() for x in get_parameters([net, optim])] + elif device == Device.ANE: + [x.ane_() for x in get_parameters([net, optim])] out = net.forward() out.backward() optim.step() return net.x.cpu().data, net.W.cpu().data + def step_pytorch(optim, kwargs={}): net = TorchNet() optim = optim([net.x, net.W], **kwargs) @@ -29,7 +33,7 @@ def step_pytorch(optim, kwargs={}): return net.x.detach().numpy(), net.W.detach().numpy() -class TinyNet(): +class TinyNet: def __init__(self): self.x = Tensor(x_init.copy()) self.W = Tensor(W_init.copy()) @@ -42,7 +46,7 @@ class TinyNet(): return out -class TorchNet(): +class TorchNet: def __init__(self): self.x = torch.tensor(x_init.copy(), requires_grad=True) self.W = torch.tensor(W_init.copy(), requires_grad=True) @@ -59,19 +63,23 @@ class TestOptim(unittest.TestCase): device = Device.CPU def test_adam(self): - for x,y in zip(step_tinygrad(Adam, device=self.device), - step_pytorch(torch.optim.Adam)): + for x, y in zip( + step_tinygrad(Adam, device=self.device), step_pytorch(torch.optim.Adam) + ): np.testing.assert_allclose(x, y, atol=1e-4) def test_sgd(self): - for x,y in zip(step_tinygrad(SGD, kwargs={'lr': 0.001}, device=self.device), - step_pytorch(torch.optim.SGD, kwargs={'lr': 0.001})): + for x, y in zip( + step_tinygrad(SGD, kwargs={"lr": 0.001}, device=self.device), + step_pytorch(torch.optim.SGD, kwargs={"lr": 0.001}), + ): np.testing.assert_allclose(x, y, atol=1e-5) def test_rmsprop(self): - for x,y in zip(step_tinygrad(RMSprop, kwargs={'lr': 0.001, 'decay': 0.99}, device=self.device), - step_pytorch(torch.optim.RMSprop, - kwargs={'lr': 0.001, 'alpha': 0.99})): + for x, y in zip( + step_tinygrad(RMSprop, 
kwargs={"lr": 0.001, "decay": 0.99}, device=self.device), + step_pytorch(torch.optim.RMSprop, kwargs={"lr": 0.001, "alpha": 0.99}), + ): np.testing.assert_allclose(x, y, atol=1e-5) @@ -79,10 +87,11 @@ class TestOptim(unittest.TestCase): class TestOptimGPU(TestOptim): device = Device.GPU + @unittest.skipUnless(ANE, "Requires ANE") class TestOptimANE(TestOptim): device = Device.ANE -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/test/test_tensor.py b/test/test_tensor.py index f54527b084..a81a10e011 100644 --- a/test/test_tensor.py +++ b/test/test_tensor.py @@ -6,11 +6,12 @@ from extra.gradcheck import numerical_jacobian, jacobian, gradcheck from .config import ANE -x_init = np.random.randn(1,3).astype(np.float32) -U_init = np.random.randn(3,3).astype(np.float32) -V_init = np.random.randn(3,3).astype(np.float32) -W_init = np.random.randn(3,3).astype(np.float32) -m_init = np.random.randn(1,3).astype(np.float32) +x_init = np.random.randn(1, 3).astype(np.float32) +U_init = np.random.randn(3, 3).astype(np.float32) +V_init = np.random.randn(3, 3).astype(np.float32) +W_init = np.random.randn(3, 3).astype(np.float32) +m_init = np.random.randn(1, 3).astype(np.float32) + class TestTinygrad(unittest.TestCase): device = Device.CPU @@ -36,7 +37,7 @@ class TestTinygrad(unittest.TestCase): out.backward() return out.detach().numpy(), x.grad, W.grad - for x,y in zip(test_tinygrad(), test_pytorch()): + for x, y in zip(test_tinygrad(), test_pytorch()): np.testing.assert_allclose(x, y, atol=1e-5) def test_backward_pass_diamond_model(self): @@ -64,7 +65,7 @@ class TestTinygrad(unittest.TestCase): out.backward() return out.detach().numpy(), u.grad, v.grad, w.grad - for x,y in zip(test_tinygrad(), test_pytorch()): + for x, y in zip(test_tinygrad(), test_pytorch()): np.testing.assert_allclose(x, y, atol=1e-5) def test_jacobian(self): @@ -73,7 +74,9 @@ class TestTinygrad(unittest.TestCase): torch_x = torch.tensor(x, requires_grad=True) torch_W = torch.tensor(W, requires_grad=True) - torch_func = lambda x: torch.nn.functional.log_softmax(x.matmul(torch_W).relu(), dim=1) + torch_func = lambda x: torch.nn.functional.log_softmax( + x.matmul(torch_W).relu(), dim=1 + ) PJ = torch.autograd.functional.jacobian(torch_func, torch_x).squeeze().numpy() tiny_x = Tensor(x, device=self.device) @@ -82,8 +85,8 @@ class TestTinygrad(unittest.TestCase): J = jacobian(tiny_func, tiny_x) NJ = numerical_jacobian(tiny_func, tiny_x) - np.testing.assert_allclose(PJ, J, atol = 1e-5) - np.testing.assert_allclose(PJ, NJ, atol = 1e-5) + np.testing.assert_allclose(PJ, J, atol=1e-5) + np.testing.assert_allclose(PJ, NJ, atol=1e-5) def test_gradcheck(self): W = np.random.RandomState(1337).random((10, 5)) @@ -96,7 +99,7 @@ class TestTinygrad(unittest.TestCase): self.assertTrue(gradcheck(tiny_func, tiny_x)) # coarse approx. 
since a "big" eps interacts with the non-linearities of the model - self.assertFalse(gradcheck(tiny_func, tiny_x, eps = 0.1)) + self.assertFalse(gradcheck(tiny_func, tiny_x, eps=0.1)) @@ -104,14 +107,18 @@ class TestTinygradGPU(TestTinygrad): device = Device.GPU @unittest.skip("float64 not supported on GPU") - def test_jacobian(self): pass + def test_jacobian(self): + pass @unittest.skip("float64 not supported on GPU") - def test_gradcheck(self): pass + def test_gradcheck(self): + pass + @unittest.skipUnless(ANE, "Requires ANE") class TestOpsANE(TestTinygrad): - device=Device.ANE + device = Device.ANE -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/tinygrad/nn.py b/tinygrad/nn.py index 17a99d5941..0e1cc704d7 100644 --- a/tinygrad/nn.py +++ b/tinygrad/nn.py @@ -1,23 +1,38 @@ from tinygrad.tensor import Tensor + class BatchNorm2D: - def __init__(self, sz, eps=1e-5, track_running_stats=False, training=False, momentum=0.1): - self.eps, self.track_running_stats, self.training, self.momentum = eps, track_running_stats, training, momentum + def __init__( + self, sz, eps=1e-5, track_running_stats=False, training=False, momentum=0.1 + ): + self.eps, self.track_running_stats, self.training, self.momentum = ( + eps, + track_running_stats, + training, + momentum, + ) self.weight, self.bias = Tensor.ones(sz), Tensor.zeros(sz) - self.running_mean, self.running_var = Tensor.zeros(sz, requires_grad=False), Tensor.ones(sz, requires_grad=False) + self.running_mean, self.running_var = ( + Tensor.zeros(sz, requires_grad=False), + Tensor.ones(sz, requires_grad=False), + ) self.num_batches_tracked = Tensor.zeros(1, requires_grad=False) def __call__(self, x): if self.track_running_stats or self.training: - batch_mean = x.mean(axis=(0,2,3)) - y = (x - batch_mean.reshape(shape=[1, -1, 1, 1])) - batch_var = (y*y).mean(axis=(0,2,3)) + batch_mean = x.mean(axis=(0, 2, 3)) + y = x - batch_mean.reshape(shape=[1, -1, 1, 1]) + batch_var = (y * y).mean(axis=(0, 2, 3)) if self.track_running_stats: - self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean - self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var + self.running_mean = ( + 1 - self.momentum + ) * self.running_mean + self.momentum * batch_mean + self.running_var = ( + 1 - self.momentum + ) * self.running_var + self.momentum * batch_var self.num_batches_tracked += 1 if self.training: @@ -26,6 +41,9 @@ class BatchNorm2D: return self.normalize(x, self.running_mean, self.running_var) def normalize(self, x, mean, var): - x = (x - mean.reshape(shape=[1, -1, 1, 1])) * self.weight.reshape(shape=[1, -1, 1, 1]) - return x.div(var.add(self.eps).reshape(shape=[1, -1, 1, 1])**0.5) + self.bias.reshape(shape=[1, -1, 1, 1]) + x = (x - mean.reshape(shape=[1, -1, 1, 1])) * self.weight.reshape( + shape=[1, -1, 1, 1] + ) + return x.div( + var.add(self.eps).reshape(shape=[1, -1, 1, 1]) ** 0.5 + ) + self.bias.reshape(shape=[1, -1, 1, 1]) diff --git a/tinygrad/ops_ane.py b/tinygrad/ops_ane.py index ba1c9a2d66..c7bb23c09f 100644 --- a/tinygrad/ops_ane.py +++ b/tinygrad/ops_ane.py @@ -2,19 +2,23 @@ from .tensor import Tensor, Function, register from functools import lru_cache import struct + @lru_cache def compile_wrapper(ane, dat): return ane.compile(dat) + def roundup(x, v): - return x + (v-x)%v + def fill(dat, addrs, type, val, base=0x4000): x = struct.pack(type, val) for a in addrs: - dat[base+a:base+a+len(x)] = x + dat[base
+ a : base + a + len(x)] = x return dat + @lru_cache def compile_relu(ane, sz): dat = list(open("ane/ops/relu.hwx", "rb").read()) @@ -22,16 +26,18 @@ def compile_relu(ane, sz): # number of relus dat = fill(dat, [0x128, 0x13C], "H", sz) # number of engines? (max 0x100) - dat = fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8], "I", max(0x100, roundup(sz*2, 0x10))) + dat = fill(dat, [0x1EC, 0x1F0, 0x1F4, 0x1F8], "I", max(0x100, roundup(sz * 2, 0x10))) # strides? - dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz*2, 0x40)) + dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz * 2, 0x40)) return compile_wrapper(ane, bytes(dat)) + class ReLU(Function): @staticmethod def forward(ctx, input): ret = ctx.ane.tensor(input.shape) ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret) return ret -register('relu', ReLU, device=Tensor.ANE) + +register("relu", ReLU, device=Tensor.ANE) diff --git a/tinygrad/ops_cpu.py b/tinygrad/ops_cpu.py index 87b253dbc0..c3661c1df2 100644 --- a/tinygrad/ops_cpu.py +++ b/tinygrad/ops_cpu.py @@ -5,44 +5,58 @@ from .tensor import Function, register # ************* basic ops ************* def unbroadcast(out, in_sh): # adjoint operation to broadcast is sum. Need to sum all axis with 1 = in_sh[i] < out.shape[i] - sum_axis = tuple([i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1]) if in_sh != (1,) else None + sum_axis = ( + tuple([i for i in range(len(in_sh)) if in_sh[i] == 1 and out.shape[i] > 1]) + if in_sh != (1,) + else None + ) return out.sum(axis=sum_axis).reshape(in_sh) + class Add(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - return x+y + return x + y @staticmethod def backward(ctx, grad_output): shape_x, shape_y = ctx.saved_tensors return unbroadcast(grad_output, shape_x), unbroadcast(grad_output, shape_y) -register('add', Add) + + +register("add", Add) + class Sub(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - return x-y + return x - y @staticmethod def backward(ctx, grad_output): shape_x, shape_y = ctx.saved_tensors return unbroadcast(grad_output, shape_x), unbroadcast(-grad_output, shape_y) -register('sub', Sub) + + +register("sub", Sub) + class Mul(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x, y) - return x*y + return x * y @staticmethod def backward(ctx, grad_output): - x,y = ctx.saved_tensors - return unbroadcast(y*grad_output, x.shape), unbroadcast(x*grad_output, y.shape) -register('mul', Mul) + x, y = ctx.saved_tensors + return unbroadcast(y * grad_output, x.shape), unbroadcast(x * grad_output, y.shape) + + +register("mul", Mul) + class Pow(Function): @staticmethod @@ -52,27 +66,37 @@ class Pow(Function): @staticmethod def backward(ctx, grad_output): - x,y = ctx.saved_tensors - return unbroadcast(y * (x**(y-1.0)) * grad_output, x.shape), \ - unbroadcast((x**y) * np.log(x) * grad_output, y.shape) -register('pow', Pow) + x, y = ctx.saved_tensors + return unbroadcast(y * (x ** (y - 1.0)) * grad_output, x.shape), unbroadcast( + (x ** y) * np.log(x) * grad_output, y.shape + ) + + +register("pow", Pow) + class Sum(Function): @staticmethod - def forward(ctx, input,axis=None): + def forward(ctx, input, axis=None): ctx.save_for_backward(input, axis) return np.array([input.sum()]) if axis is None else input.sum(axis=axis) @staticmethod def backward(ctx, grad_output): input, axis = ctx.saved_tensors - shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))] + shape = [ + 1 if axis is None or i in axis 
else input.shape[i] + for i in range(len(input.shape)) + ] return grad_output.reshape(shape) + np.zeros_like(input) -register('sum', Sum) + + +register("sum", Sum) # ************* GEMM ************* + class Dot(Function): @staticmethod def forward(ctx, input, weight): @@ -85,21 +109,27 @@ class Dot(Function): grad_input = grad_output.dot(weight.T) grad_weight = input.T.dot(grad_output) return grad_input, grad_weight -register('dot', Dot) + + +register("dot", Dot) # ************* simple ops ************* + class Pad2D(Function): @staticmethod def forward(ctx, x, padding=None): ctx.save_for_backward(padding) - return np.pad(x, ((0,0), (0,0), tuple(padding[2:4]), tuple(padding[0:2]))) + return np.pad(x, ((0, 0), (0, 0), tuple(padding[2:4]), tuple(padding[0:2]))) @staticmethod def backward(ctx, grad_output): - padding, = ctx.saved_tensors - return grad_output[..., padding[2]:-padding[3], padding[0]:-padding[1]] -register('pad2d', Pad2D) + (padding,) = ctx.saved_tensors + return grad_output[..., padding[2] : -padding[3], padding[0] : -padding[1]] + + +register("pad2d", Pad2D) + class Reshape(Function): @staticmethod @@ -109,13 +139,16 @@ class Reshape(Function): @staticmethod def backward(ctx, grad_output): - in_shape, = ctx.saved_tensors + (in_shape,) = ctx.saved_tensors return grad_output.reshape(in_shape) -register('reshape', Reshape) + + +register("reshape", Reshape) # ************* activation ops ************* + class ReLU(Function): @staticmethod def forward(ctx, input): @@ -124,123 +157,145 @@ class ReLU(Function): @staticmethod def backward(ctx, grad_output): - input, = ctx.saved_tensors + (input,) = ctx.saved_tensors return grad_output * (input >= 0) -register('relu', ReLU) + + +register("relu", ReLU) + def _exp_normalize(x, axis=None): - y = np.exp(x - x.max(axis=axis, keepdims=True)) - return y / y.sum(axis=axis, keepdims=True) + y = np.exp(x - x.max(axis=axis, keepdims=True)) + return y / y.sum(axis=axis, keepdims=True) + class Sigmoid(Function): @staticmethod def forward(ctx, input): with np.warnings.catch_warnings(): - np.warnings.filterwarnings('ignore') - ret = np.where(input >= 0, - 1/(1 + np.exp(-input)), - np.exp(input)/(1 + np.exp(input)) + np.warnings.filterwarnings("ignore") + ret = np.where( + input >= 0, 1 / (1 + np.exp(-input)), np.exp(input) / (1 + np.exp(input)) ) ctx.save_for_backward(ret) return ret @staticmethod def backward(ctx, grad_output): - ret, = ctx.saved_tensors + (ret,) = ctx.saved_tensors return grad_output * (ret * (1 - ret)) -register('sigmoid', Sigmoid) + + +register("sigmoid", Sigmoid) + class LogSoftmax(Function): @staticmethod def forward(ctx, input): softmax = _exp_normalize(input, axis=1) ctx.save_for_backward(softmax) - return np.log(softmax) + return np.log(softmax) @staticmethod def backward(ctx, grad_output): - softmax, = ctx.saved_tensors - return grad_output - grad_output.sum(axis=1, keepdims=True)*softmax -register('logsoftmax', LogSoftmax) + (softmax,) = ctx.saved_tensors + return grad_output - grad_output.sum(axis=1, keepdims=True) * softmax + + +register("logsoftmax", LogSoftmax) # ************* conv ops ************* + class Conv2D(Function): @staticmethod def forward(ctx, x, w, stride=1, groups=1): if type(ctx.stride) == int: ctx.stride = (ctx.stride, ctx.stride) - cout,cin,H,W = w.shape - ys,xs = ctx.stride - bs,cin_ = x.shape[0], x.shape[1] - oy,ox = (x.shape[2]-(H-ys))//ys, (x.shape[3]-(W-xs))//xs - assert cin*ctx.groups == cin_ + cout, cin, H, W = w.shape + ys, xs = ctx.stride + bs, cin_ = x.shape[0], x.shape[1] + oy, ox = 
(x.shape[2] - (H - ys)) // ys, (x.shape[3] - (W - xs)) // xs + assert cin * ctx.groups == cin_ assert cout % ctx.groups == 0 - rcout = cout//ctx.groups + rcout = cout // ctx.groups - gx = x.reshape(bs,ctx.groups,cin,x.shape[2],x.shape[3]) - tx = np.lib.stride_tricks.as_strided(gx, + gx = x.reshape(bs, ctx.groups, cin, x.shape[2], x.shape[3]) + tx = np.lib.stride_tricks.as_strided( + gx, shape=(bs, ctx.groups, cin, oy, ox, H, W), - strides=(*gx.strides[0:3], gx.strides[3]*ys, gx.strides[4]*xs, *gx.strides[3:5]), + strides=( + *gx.strides[0:3], + gx.strides[3] * ys, + gx.strides[4] * xs, + *gx.strides[3:5], + ), writeable=False, ) tw = w.reshape(ctx.groups, rcout, cin, H, W) ctx.save_for_backward(tx, tw, x.shape) - ret = np.zeros((bs,ctx.groups,oy,ox,rcout),dtype=x.dtype) + ret = np.zeros((bs, ctx.groups, oy, ox, rcout), dtype=x.dtype) for g in range(ctx.groups): - #ijYXyx,kjyx -> iYXk ->ikYX - ret[:,g] += np.tensordot(tx[:,g], tw[g], ((1,4,5),(1,2,3))) - return np.moveaxis(ret,4,2).reshape(bs, cout, oy, ox) + # ijYXyx,kjyx -> iYXk ->ikYX + ret[:, g] += np.tensordot(tx[:, g], tw[g], ((1, 4, 5), (1, 2, 3))) + return np.moveaxis(ret, 4, 2).reshape(bs, cout, oy, ox) @staticmethod def backward(ctx, grad_output): - bs,_,oy,ox = grad_output.shape + bs, _, oy, ox = grad_output.shape tx, tw, x_shape = ctx.saved_tensors - _,rcout,cin,H,W = tw.shape - ys,xs = ctx.stride - OY,OX = x_shape[2:4] + _, rcout, cin, H, W = tw.shape + ys, xs = ctx.stride + OY, OX = x_shape[2:4] - ggg = grad_output.reshape(bs,ctx.groups,rcout,oy,ox) + ggg = grad_output.reshape(bs, ctx.groups, rcout, oy, ox) - gdw = np.zeros((ctx.groups,rcout,cin,H,W), dtype=tx.dtype) + gdw = np.zeros((ctx.groups, rcout, cin, H, W), dtype=tx.dtype) for g in range(ctx.groups): #'ikYX,ijYXyx -> kjyx' - gdw[g] += np.tensordot(ggg[:,g], tx[:,g], ((0,2,3),(0,2,3))) + gdw[g] += np.tensordot(ggg[:, g], tx[:, g], ((0, 2, 3), (0, 2, 3))) # needs to be optimized - gdx = np.zeros((bs,ctx.groups,cin,OY,OX), dtype=tx.dtype) - for k in range(oy*ox): - Y, X = k//ox, k%ox - iY,iX = Y*ys, X*xs - #gdx[:,:,: , iY:iY+H, iX:iX+W] += np.einsum('igk,gkjyx->igjyx', ggg[:,:,:,Y,X], tw) + gdx = np.zeros((bs, ctx.groups, cin, OY, OX), dtype=tx.dtype) + for k in range(oy * ox): + Y, X = k // ox, k % ox + iY, iX = Y * ys, X * xs + # gdx[:,:,: , iY:iY+H, iX:iX+W] += np.einsum('igk,gkjyx->igjyx', ggg[:,:,:,Y,X], tw) for g in range(ctx.groups): - tg = np.dot(ggg[:,g,:,Y,X].reshape(bs, -1), tw[g].reshape(rcout, -1)) - gdx[:, g, :, iY:iY+H, iX:iX+W] += tg.reshape((bs, cin, H, W)) + tg = np.dot(ggg[:, g, :, Y, X].reshape(bs, -1), tw[g].reshape(rcout, -1)) + gdx[:, g, :, iY : iY + H, iX : iX + W] += tg.reshape((bs, cin, H, W)) - return gdx.reshape((bs, ctx.groups*cin, OY, OX)), gdw.reshape((ctx.groups*rcout, cin, H, W)) -register('conv2d', Conv2D) + return gdx.reshape((bs, ctx.groups * cin, OY, OX)), gdw.reshape( + (ctx.groups * rcout, cin, H, W) + ) + + +register("conv2d", Conv2D) # ************* pooling ops ************* + def stack_for_pool(x, py, px): - my, mx = (x.shape[2]//py)*py, (x.shape[3]//px)*px + my, mx = (x.shape[2] // py) * py, (x.shape[3] // px) * px xup = x[:, :, :my, :mx] - stack = [xup[:, :, k//px::py, k%px::px][None] for k in range(py*px)] + stack = [xup[:, :, k // px :: py, k % px :: px][None] for k in range(py * px)] return np.concatenate(stack, axis=0) + def unstack_for_pool(fxn, s, py, px): - my, mx = (s[2]//py)*py, (s[3]//px)*px - for k in range(py*px): - Y, X = k//px, k%px - ll = fxn(Y*px+X) + my, mx = (s[2] // py) * py, (s[3] // px) * px + for k in 
range(py * px): + Y, X = k // px, k % px + ll = fxn(Y * px + X) if X == 0 and Y == 0: ret = np.zeros(s, dtype=ll.dtype) ret[:, :, Y:my:py, X:mx:px] = ll return ret + class MaxPool2D(Function): @staticmethod def forward(ctx, x, kernel_size=(2, 2)): @@ -251,9 +306,14 @@ class MaxPool2D(Function): @staticmethod def backward(ctx, grad_output): - idxs,s = ctx.saved_tensors - return unstack_for_pool(lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size) -register('max_pool2d', MaxPool2D) + idxs, s = ctx.saved_tensors + return unstack_for_pool( + lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size + ) + + +register("max_pool2d", MaxPool2D) + class AvgPool2D(Function): @staticmethod @@ -264,8 +324,9 @@ class AvgPool2D(Function): @staticmethod def backward(ctx, grad_output): - s, = ctx.saved_tensors + (s,) = ctx.saved_tensors py, px = ctx.kernel_size - return unstack_for_pool(lambda idx: grad_output/py/px, s, py, px) -register('avg_pool2d', AvgPool2D) + return unstack_for_pool(lambda idx: grad_output / py / px, s, py, px) + +register("avg_pool2d", AvgPool2D) diff --git a/tinygrad/ops_gpu.py b/tinygrad/ops_gpu.py index 269dbb49ac..422a3df1e7 100644 --- a/tinygrad/ops_gpu.py +++ b/tinygrad/ops_gpu.py @@ -3,131 +3,229 @@ from .tensor import Function, register, GPUBuffer, Tensor, Device import pyopencl as cl import functools + def buffer_new(ctx, shape, zero=False): - return GPUBuffer(shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32)) + return GPUBuffer( + shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32) + ) + @functools.lru_cache() def clbuild(cl_ctx, name, prg): return cl.Program(cl_ctx, prg).build().__getattr__(name) + def uint2(x, y): - return np.array((x,y), dtype=cl.cltypes.uint2) + return np.array((x, y), dtype=cl.cltypes.uint2) + + i32 = np.int32 -def subsample_op(ctx, input, kernel_size, stride, iter_op, result_op, decls=''): + +def subsample_op(ctx, input, kernel_size, stride, iter_op, result_op, decls=""): py, px = stride N, C, Yin, Xin = input.shape - Yout, Xout = (Yin-kernel_size[0])//py+1, (Xin-kernel_size[1])//px+1 + Yout, Xout = (Yin - kernel_size[0]) // py + 1, (Xin - kernel_size[1]) // px + 1 ret = buffer_new(ctx, (N, C, Yout, Xout), zero=True) - subsample = clbuild(ctx.cl_ctx, "subsample", """ + subsample = clbuild( + ctx.cl_ctx, + "subsample", + """ __kernel void subsample(__global float *output, __global const float *input, uint2 osize, uint2 isize, uint2 ksz, uint2 stride) { int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0)); int oid = gid.x + osize.x*(gid.y + osize.y*gid.z); - """+decls+"""; + """ + + decls + + """; for (uint j=0; j 0 and complist[-1] == comp: dimlist[-1] *= dim elif comp != (False, False): - dimlist.append(dim); complist.append(comp) - for i in range(n_dims): # group together any adjacent dimensions that we can to simplify broadcasting + dimlist.append(dim) + complist.append(comp) + + for i in range( + n_dims + ): # group together any adjacent dimensions that we can to simplify broadcasting push(i32(max(shape_x[i], shape_y[i])), (shape_x[i] > 1, shape_y[i] > 1)) - + prg = get_binop_prg(ctx.cl_ctx, code, tuple(complist)) ret = buffer_new(ctx, shape_ret, zero=True) - prod_list = np.array(dimlist, dtype=i32)[-1::-1].cumprod(dtype=i32)[-1::-1] # take cumprod from back to front - prg.binop(ctx.cl_queue, [prod_list[0]] if len(dimlist) > 0 else [1], None, x.cl, y.cl, ret.cl, *dimlist, *(prod_list[1:])) + prod_list = np.array(dimlist, dtype=i32)[-1::-1].cumprod(dtype=i32)[ + -1::-1 + ] # 
take cumprod from back to front + prg.binop( + ctx.cl_queue, + [prod_list[0]] if len(dimlist) > 0 else [1], + None, + x.cl, + y.cl, + ret.cl, + *dimlist, + *(prod_list[1:]), + ) return ret + def unary_op(ctx, code, x): ret = buffer_new(ctx, x.shape) - unop = clbuild(ctx.cl_ctx, "unop", """ + unop = clbuild( + ctx.cl_ctx, + "unop", + """ __kernel void unop(__global const float *a_g, __global float *res_g) { int gid = get_global_id(0); float a = a_g[gid]; - res_g[gid] = """+code+"""; - }""") + res_g[gid] = """ + + code + + """; + }""", + ) unop(ctx.cl_queue, [np.prod(ret.shape)], None, x.cl, ret.cl) return ret + def reduce_op(ctx, code, code2, inp, axis=None): if axis is None: # full reduce - osize = [1]*len(inp.shape) + osize = [1] * len(inp.shape) else: osize = np.array(inp.shape) - osize[list(axis)] = 1 + osize[list(axis)] = 1 ret = buffer_new(ctx, osize) if axis is None: ret.shape = (1,) # TODO: this is insanely slow - reduce = clbuild(ctx.cl_ctx, "reduce", """ + reduce = clbuild( + ctx.cl_ctx, + "reduce", + """ __kernel void reduce(__global const float *a_g, int sz, __global float *res_g, int prod, int n_dims, __global const int *shape_x, __global const int *shape_ret) { int gid = get_global_id(0); @@ -148,80 +246,129 @@ def reduce_op(ctx, code, code2, inp, axis=None): } } float a = a_g[idx]; - """+code+"""; + """ + + code + + """; } - res_g[gid] = """+code2+"""; - }""") - buffer_np = lambda x: cl.Buffer(ctx.cl_ctx, - cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x) - reduce(ctx.cl_queue, [np.prod(osize)], None, inp.cl, - i32(np.prod(inp.shape)//np.prod(osize)), ret.cl, - i32(np.prod(osize)), i32(len(osize)), + res_g[gid] = """ + + code2 + + """; + }""", + ) + buffer_np = lambda x: cl.Buffer( + ctx.cl_ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x + ) + reduce( + ctx.cl_queue, + [np.prod(osize)], + None, + inp.cl, + i32(np.prod(inp.shape) // np.prod(osize)), + ret.cl, + i32(np.prod(osize)), + i32(len(osize)), buffer_np(np.array(inp.shape, dtype=np.int32)), - buffer_np(np.array(osize, dtype=np.int32))) + buffer_np(np.array(osize, dtype=np.int32)), + ) return ret + def unbroadcast(ctx, out, in_sh): - sum_axis = [i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1] if in_sh != (1,) else None + sum_axis = ( + [i for i in range(len(in_sh)) if in_sh[i] == 1 and out.shape[i] > 1] + if in_sh != (1,) + else None + ) return reduce_op(ctx, "out += a", "out", out, sum_axis) + # ***** now for the ops themselves ***** + class Add(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - return binary_op(ctx, 'a+b', x, y) + return binary_op(ctx, "a+b", x, y) @staticmethod def backward(ctx, grad_output): grad_x, grad_y = grad_output, grad_output shape_x, shape_y = ctx.saved_tensors - return unbroadcast(ctx, grad_x, shape_x), unbroadcast(ctx, grad_y, shape_y), -register('add', Add, device=Device.GPU) + return ( + unbroadcast(ctx, grad_x, shape_x), + unbroadcast(ctx, grad_y, shape_y), + ) + + +register("add", Add, device=Device.GPU) + class Sub(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - return binary_op(ctx, 'a-b', x, y) + return binary_op(ctx, "a-b", x, y) @staticmethod def backward(ctx, grad_output): - grad_x, grad_y = grad_output, unary_op(ctx, '-a', grad_output) + grad_x, grad_y = grad_output, unary_op(ctx, "-a", grad_output) shape_x, shape_y = ctx.saved_tensors - return unbroadcast(ctx, grad_x, shape_x), unbroadcast(ctx, grad_y, shape_y), -register('sub', Sub, 
device=Device.GPU) + return ( + unbroadcast(ctx, grad_x, shape_x), + unbroadcast(ctx, grad_y, shape_y), + ) + + +register("sub", Sub, device=Device.GPU) + class Mul(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x, y) - return binary_op(ctx, 'a*b', x, y) + return binary_op(ctx, "a*b", x, y) @staticmethod def backward(ctx, grad_output): - x,y = ctx.saved_tensors - grad_x = binary_op(ctx, 'a*b', y, grad_output) - grad_y = binary_op(ctx, 'a*b', x, grad_output) - return unbroadcast(ctx, grad_x, x.shape), unbroadcast(ctx, grad_y, y.shape), -register('mul', Mul, device=Device.GPU) + x, y = ctx.saved_tensors + grad_x = binary_op(ctx, "a*b", y, grad_output) + grad_y = binary_op(ctx, "a*b", x, grad_output) + return ( + unbroadcast(ctx, grad_x, x.shape), + unbroadcast(ctx, grad_y, y.shape), + ) + + +register("mul", Mul, device=Device.GPU) + class Pow(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x, y) - return binary_op(ctx, 'pow(a,b)', x, y) + return binary_op(ctx, "pow(a,b)", x, y) @staticmethod def backward(ctx, grad_output): - x,y = ctx.saved_tensors - grad_x = binary_op(ctx, 'a*b', grad_output, - binary_op(ctx, 'b * (pow((float)a, (float)(b-1.0)))', x, y)) - grad_y = binary_op(ctx, 'a*b', grad_output, - binary_op(ctx, 'pow(a, (float)b) * log(a);', x, y)) - return unbroadcast(ctx, grad_x, x.shape), unbroadcast(ctx, grad_y, y.shape), -register('pow', Pow, device=Device.GPU) + x, y = ctx.saved_tensors + grad_x = binary_op( + ctx, + "a*b", + grad_output, + binary_op(ctx, "b * (pow((float)a, (float)(b-1.0)))", x, y), + ) + grad_y = binary_op( + ctx, "a*b", grad_output, binary_op(ctx, "pow(a, (float)b) * log(a);", x, y) + ) + return ( + unbroadcast(ctx, grad_x, x.shape), + unbroadcast(ctx, grad_y, y.shape), + ) + + +register("pow", Pow, device=Device.GPU) + class Sum(Function): @staticmethod @@ -229,16 +376,24 @@ class Sum(Function): ctx.save_for_backward(input, axis) ret = reduce_op(ctx, "out += a", "out", input, axis=axis) if axis is not None: - ret.shape = tuple([input.shape[i] for i in range(len(input.shape)) if i not in axis]) + ret.shape = tuple( + [input.shape[i] for i in range(len(input.shape)) if i not in axis] + ) return ret @staticmethod def backward(ctx, grad_output): input, axis = ctx.saved_tensors - shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))] + shape = [ + 1 if axis is None or i in axis else input.shape[i] + for i in range(len(input.shape)) + ] output = GPUBuffer(shape, hostbuf=grad_output) - return binary_op(ctx, 'a+b', output, buffer_new(ctx, input.shape, zero=True)) -register('sum', Sum, device=Device.GPU) + return binary_op(ctx, "a+b", output, buffer_new(ctx, input.shape, zero=True)) + + +register("sum", Sum, device=Device.GPU) + class Dot(Function): @staticmethod @@ -247,7 +402,10 @@ class Dot(Function): isize, msize, osize = i32(input.shape[0]), i32(input.shape[1]), i32(weight.shape[1]) ret = buffer_new(ctx, (isize, osize)) - matmul = clbuild(ctx.cl_ctx, "matmul", """ + matmul = clbuild( + ctx.cl_ctx, + "matmul", + """ __kernel void matmul( __global const float *input, __global const float *weight, __global float *res, int is0, int is1, int msize, int ws0, int ws1, int osize @@ -261,13 +419,25 @@ class Dot(Function): } res[X * osize + Y] = ret; - }""") + }""", + ) ctx.save_for_backward(input, weight, matmul) # (isize,msize) x (msize,osize) = (isize,osize) - matmul(ctx.cl_queue, [isize, osize], None, - input.cl, weight.cl, ret.cl, - msize, i32(1), msize, i32(1), osize, osize) + 
matmul( + ctx.cl_queue, + [isize, osize], + None, + input.cl, + weight.cl, + ret.cl, + msize, + i32(1), + msize, + i32(1), + osize, + osize, + ) return ret @staticmethod @@ -279,28 +449,56 @@ class Dot(Function): grad_weight = buffer_new(ctx, weight.shape) # (isize,osize) x (msize,osize) = (isize,msize) - matmul(ctx.cl_queue, [isize, msize], None, - grad_output.cl, weight.cl, grad_input.cl, - osize, i32(1), osize, osize, i32(1), msize) + matmul( + ctx.cl_queue, + [isize, msize], + None, + grad_output.cl, + weight.cl, + grad_input.cl, + osize, + i32(1), + osize, + osize, + i32(1), + msize, + ) # (isize,msize) x (isize,osize) = (msize,osize) - matmul(ctx.cl_queue, [msize, osize], None, - input.cl, grad_output.cl, grad_weight.cl, - i32(1), msize, isize, i32(1), osize, osize) + matmul( + ctx.cl_queue, + [msize, osize], + None, + input.cl, + grad_output.cl, + grad_weight.cl, + i32(1), + msize, + isize, + i32(1), + osize, + osize, + ) return grad_input, grad_weight -register('dot', Dot, device=Device.GPU) + + +register("dot", Dot, device=Device.GPU) # ************* simple ops ************* + class Pad2D(Function): @staticmethod def forward(ctx, x, padding=None): - bs,cin,iy,ix = x.shape - oy,ox = iy+padding[2]+padding[3], ix+padding[0]+padding[1] + bs, cin, iy, ix = x.shape + oy, ox = iy + padding[2] + padding[3], ix + padding[0] + padding[1] ret = buffer_new(ctx, (bs, cin, oy, ox), zero=True) - pad2d = clbuild(ctx.cl_ctx, "pad2d", """ + pad2d = clbuild( + ctx.cl_ctx, + "pad2d", + """ __kernel void pad2d(__global const float *input, __global float *output, int ipx, int ipy, int py, int px, int oy, int ox, int iy, int ix) { int BC = get_global_id(0); @@ -311,13 +509,24 @@ class Pad2D(Function): int optr = BC*oy*ox + (Y+py)*ox + px + X; output[optr] = input[iptr]; - }""") + }""", + ) ctx.save_for_backward(padding, pad2d) - pad2d(ctx.cl_queue, [bs*cin, iy, ix], None, - x.cl, ret.cl, - i32(0), i32(0), i32(padding[2]), i32(padding[0]), - i32(oy), i32(ox), i32(iy), i32(ix) - ) + pad2d( + ctx.cl_queue, + [bs * cin, iy, ix], + None, + x.cl, + ret.cl, + i32(0), + i32(0), + i32(padding[2]), + i32(padding[0]), + i32(oy), + i32(ox), + i32(iy), + i32(ix), + ) return ret @staticmethod @@ -326,13 +535,26 @@ class Pad2D(Function): bs, cin, iy, ix = grad_output.shape oy, ox = iy - padding[2] - padding[3], ix - padding[0] - padding[1] ret = buffer_new(ctx, (bs, cin, oy, ox)) - pad2d(ctx.cl_queue, [bs*cin, oy, ox], None, - grad_output.cl, ret.cl, - i32(padding[2]), i32(padding[0]), i32(0), i32(0), - i32(oy), i32(ox), i32(iy), i32(ix) - ) + pad2d( + ctx.cl_queue, + [bs * cin, oy, ox], + None, + grad_output.cl, + ret.cl, + i32(padding[2]), + i32(padding[0]), + i32(0), + i32(0), + i32(oy), + i32(ox), + i32(iy), + i32(ix), + ) return ret -register('pad2d', Pad2D, device=Device.GPU) + + +register("pad2d", Pad2D, device=Device.GPU) + class Reshape(Function): @staticmethod @@ -345,105 +567,152 @@ class Reshape(Function): @staticmethod def backward(ctx, grad_output): - in_shape, = ctx.saved_tensors + (in_shape,) = ctx.saved_tensors return GPUBuffer(in_shape, hostbuf=grad_output) -register('reshape', Reshape, device=Device.GPU) + + +register("reshape", Reshape, device=Device.GPU) # ************* activation ops ************* + class ReLU(Function): @staticmethod def forward(ctx, input): ctx.save_for_backward(input) - return unary_op(ctx, 'max(a, (float)0.)', input) + return unary_op(ctx, "max(a, (float)0.)", input) @staticmethod def backward(ctx, grad_output): - input, = ctx.saved_tensors - return binary_op(ctx, 'a * (b >= 
0)', grad_output, input) -register('relu', ReLU, device=Device.GPU) + (input,) = ctx.saved_tensors + return binary_op(ctx, "a * (b >= 0)", grad_output, input) + + +register("relu", ReLU, device=Device.GPU) + class Sigmoid(Function): @staticmethod def forward(ctx, input): - ret = unary_op(ctx, '1./(1+exp(-a))', input) + ret = unary_op(ctx, "1./(1+exp(-a))", input) ctx.save_for_backward(ret) return ret @staticmethod def backward(ctx, grad_output): - ret, = ctx.saved_tensors - return binary_op(ctx, 'a * (b * (1 - b));', grad_output, ret) -register('sigmoid', Sigmoid, device=Device.GPU) + (ret,) = ctx.saved_tensors + return binary_op(ctx, "a * (b * (1 - b));", grad_output, ret) + + +register("sigmoid", Sigmoid, device=Device.GPU) + class AvgPool2D(Function): @staticmethod def forward(ctx, input, kernel_size=(2, 2)): - ret = subsample_op(ctx, input, kernel_size, kernel_size, iter_op="sumval += input[iid]", - result_op="sumval / (ksz.x * ksz.y)", decls="float sumval=0.f") + ret = subsample_op( + ctx, + input, + kernel_size, + kernel_size, + iter_op="sumval += input[iid]", + result_op="sumval / (ksz.x * ksz.y)", + decls="float sumval=0.f", + ) ctx.save_for_backward(input.shape) return ret @staticmethod def backward(ctx, grad_output): - orig_shape, = ctx.saved_tensors - return supersample_op(ctx, grad_output, orig_shape, ctx.kernel_size, - result_op="input[iid] / (ksz.x * ksz.y)") -register('avg_pool2d', AvgPool2D, device=Device.GPU) + (orig_shape,) = ctx.saved_tensors + return supersample_op( + ctx, + grad_output, + orig_shape, + ctx.kernel_size, + result_op="input[iid] / (ksz.x * ksz.y)", + ) + + +register("avg_pool2d", AvgPool2D, device=Device.GPU) + class MaxPool2D(Function): @staticmethod def forward(ctx, input, kernel_size=(2, 2)): - idxs = subsample_op(ctx, input, kernel_size, kernel_size, + idxs = subsample_op( + ctx, + input, + kernel_size, + kernel_size, iter_op="if (input[iid]>maxval) { maxval = input[iid]; maxidx = j * ksz.x + i; }", - result_op="(float)maxidx", decls="float maxval=-FLT_MAX; int maxidx=0") + result_op="(float)maxidx", + decls="float maxval=-FLT_MAX; int maxidx=0", + ) ctx.save_for_backward(idxs, input.shape) - return subsample_op(ctx, input, kernel_size, kernel_size, + return subsample_op( + ctx, + input, + kernel_size, + kernel_size, iter_op="maxval = max(maxval, input[iid])", - result_op="maxval", decls="float maxval = -FLT_MAX") + result_op="maxval", + decls="float maxval = -FLT_MAX", + ) @staticmethod def backward(ctx, grad_output): idxs, orig_shape = ctx.saved_tensors - return supersample_op(ctx, grad_output, orig_shape, ctx.kernel_size, + return supersample_op( + ctx, + grad_output, + orig_shape, + ctx.kernel_size, result_op="(maxidx == kernidx) * input[iid]", decls="int maxidx=((__global float*)input2)[iid]; int kernidx=(gid.x%ksz.x) + ksz.x*(gid.y%ksz.y)", - input2=idxs) -register('max_pool2d', MaxPool2D, device=Device.GPU) + input2=idxs, + ) + + +register("max_pool2d", MaxPool2D, device=Device.GPU) + class LogSoftmax(Function): @staticmethod def forward(ctx, input): # TODO: stability? 
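# summing exp(a) without first subtracting the row max can overflow for large inputs; the CPU path's _exp_normalize applies that log-sum-exp shift, this kernel does not yet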
lsum = reduce_op(ctx, "out += exp(a)", "log(out)", input, axis=[1]) - output = binary_op(ctx, 'a-b', input, lsum) + output = binary_op(ctx, "a-b", input, lsum) ctx.save_for_backward(output) return output @staticmethod def backward(ctx, grad_output): - output, = ctx.saved_tensors + (output,) = ctx.saved_tensors lsum = reduce_op(ctx, "out += a", "out", grad_output, axis=[1]) texp = binary_op(ctx, "exp(a) * b", output, lsum) return binary_op(ctx, "a - b", grad_output, texp) -register('logsoftmax', LogSoftmax, device=Device.GPU) + + +register("logsoftmax", LogSoftmax, device=Device.GPU) # ************* conv ops ************* + class Conv2D(Function): @staticmethod def forward(ctx, x, w, stride=1, groups=1): if type(ctx.stride) == int: ctx.stride = (ctx.stride, ctx.stride) - cout,cin,H,W = w.shape - ys,xs = ctx.stride - bs,cin_,iy,ix = x.shape - oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs - assert cin*ctx.groups == cin_ + cout, cin, H, W = w.shape + ys, xs = ctx.stride + bs, cin_, iy, ix = x.shape + oy, ox = (iy - (H - ys)) // ys, (ix - (W - xs)) // xs + assert cin * ctx.groups == cin_ assert cout % ctx.groups == 0 - rcout = cout//ctx.groups + rcout = cout // ctx.groups - ctx.save_for_backward(x,w) + ctx.save_for_backward(x, w) # output buffer ret = buffer_new(ctx, (bs, cout, oy, ox)) @@ -452,7 +721,10 @@ class Conv2D(Function): # weight = (groups, rcout, cin, H, W) # output = (bs, groups, rcout, oy, ox) - conv = clbuild(ctx.cl_ctx, "conv", """ + conv = clbuild( + ctx.cl_ctx, + "conv", + """ __kernel void conv(__global const float *input, __global const float *weight, __global float *output, int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs) { @@ -475,26 +747,41 @@ class Conv2D(Function): } } output[B*groups*rcout*oy*ox + g*rcout*oy*ox + c*oy*ox + Y*ox + X] = acc; - }""") + }""", + ) - conv(ctx.cl_queue, [bs*groups*rcout, oy, ox], None, - x.cl, w.cl, ret.cl, - i32(H), i32(W), i32(groups), i32(rcout), i32(cin), - i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs) + conv( + ctx.cl_queue, + [bs * groups * rcout, oy, ox], + None, + x.cl, + w.cl, + ret.cl, + i32(H), + i32(W), + i32(groups), + i32(rcout), + i32(cin), + i32(oy), + i32(ox), + i32(iy), + i32(ix), + i32(ys), + i32(xs), ) return ret @staticmethod def backward(ctx, grad_output): - bs,_,oy,ox = grad_output.shape + bs, _, oy, ox = grad_output.shape x, w = ctx.saved_tensors - cout,cin,H,W = w.shape - ys,xs = ctx.stride - bs,cin_,iy,ix = x.shape - oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs - assert cin*ctx.groups == cin_ + cout, cin, H, W = w.shape + ys, xs = ctx.stride + bs, cin_, iy, ix = x.shape + oy, ox = (iy - (H - ys)) // ys, (ix - (W - xs)) // xs + assert cin * ctx.groups == cin_ assert cout % ctx.groups == 0 - rcout = cout//ctx.groups + rcout = cout // ctx.groups dx = buffer_new(ctx, (bs, cin_, iy, ix), zero=True) dw = buffer_new(ctx, (cout, cin, H, W)) @@ -503,7 +790,10 @@ class Conv2D(Function): # tensw = (groups*rcout, cin, H, W) # ggg = (bs, groups*rout, oy, ox) - convw = clbuild(ctx.cl_ctx, "convw", """ + convw = clbuild( + ctx.cl_ctx, + "convw", + """ __kernel void convw(__global const float *tensx, __global const float *ggg, __global float *dw, int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) { @@ -523,8 +813,12 @@ class Conv2D(Function): } } dw[get_global_id(0)*H*W + y*W + x] = acc; - }""") - convx = clbuild(ctx.cl_ctx, "convx", """ + }""", + ) + convx = clbuild( + ctx.cl_ctx, + "convx", + """ __kernel void convx(__global const float 
*tensw, __global const float *ggg, __global float *dx, int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) { @@ -547,10 +841,36 @@ class Conv2D(Function): } } } - """) + """, + ) - conv_args = i32(H), i32(W), i32(ctx.groups), i32(rcout), i32(cin), i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs), i32(bs) - convw(ctx.cl_queue, [ctx.groups*rcout*cin, H, W], None, x.cl, grad_output.cl, dw.cl, *conv_args) - convx(ctx.cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args) + conv_args = ( + i32(H), + i32(W), + i32(ctx.groups), + i32(rcout), + i32(cin), + i32(oy), + i32(ox), + i32(iy), + i32(ix), + i32(ys), + i32(xs), + i32(bs), + ) + convw( + ctx.cl_queue, + [ctx.groups * rcout * cin, H, W], + None, + x.cl, + grad_output.cl, + dw.cl, + *conv_args, + ) + convx( + ctx.cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args + ) return dx, dw -register('conv2d', Conv2D, device=Device.GPU) + + +register("conv2d", Conv2D, device=Device.GPU) diff --git a/tinygrad/optim.py b/tinygrad/optim.py index 5edf6976e7..127a6223ae 100644 --- a/tinygrad/optim.py +++ b/tinygrad/optim.py @@ -3,6 +3,7 @@ import numpy as np from tinygrad.tensor import Tensor + class Optimizer: def __init__(self, params): self.params = [x for x in params if x.requires_grad == True] @@ -11,6 +12,7 @@ class Optimizer: for param in self.params: param.grad = None + class SGD(Optimizer): def __init__(self, params, lr=0.001): super().__init__(params) @@ -20,30 +22,53 @@ class SGD(Optimizer): for t in self.params: t -= t.grad * self.lr + class RMSprop(Optimizer): def __init__(self, params, lr=0.001, decay=0.9, eps=1e-8): super().__init__(params) self.lr, self.decay, self.eps = lr, decay, eps - self.v = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params] + self.v = [ + Tensor( + np.zeros(t.shape, dtype=np.float32), + device=params[0].device, + requires_grad=False, + ) + for t in self.params + ] def step(self): for i, t in enumerate(self.params): self.v[i] = self.decay * self.v[i] + (1.0 - self.decay) * t.grad * t.grad t -= (t.grad * self.lr).div(self.v[i].sqrt() + self.eps) + class Adam(Optimizer): def __init__(self, params, lr=0.001, b1=0.9, b2=0.999, eps=1e-8): super().__init__(params) self.lr, self.b1, self.b2, self.eps, self.t = lr, b1, b2, eps, 0 - self.m = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params] - self.v = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params] + self.m = [ + Tensor( + np.zeros(t.shape, dtype=np.float32), + device=params[0].device, + requires_grad=False, + ) + for t in self.params + ] + self.v = [ + Tensor( + np.zeros(t.shape, dtype=np.float32), + device=params[0].device, + requires_grad=False, + ) + for t in self.params + ] def step(self): self.t = self.t + 1 - a = self.lr * ((1.0 - self.b2**self.t)**0.5) / (1.0 - self.b1**self.t) - for i,t in enumerate(self.params): + a = self.lr * ((1.0 - self.b2 ** self.t) ** 0.5) / (1.0 - self.b1 ** self.t) + for i, t in enumerate(self.params): self.m[i] = self.b1 * self.m[i] + (1.0 - self.b1) * t.grad self.v[i] = self.b2 * self.v[i] + (1.0 - self.b2) * t.grad * t.grad t -= a * self.m[i].div(self.v[i].sqrt() + self.eps) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 14c5643c16..806d1d9ea7 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -10,32 +10,43 @@ from collections 
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 14c5643c16..806d1d9ea7 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -10,32 +10,43 @@ from collections import defaultdict

 DEBUG = os.getenv("DEBUG", None) is not None
 if DEBUG:
   import atexit, time
+
   debug_counts, debug_times = defaultdict(int), defaultdict(float)
+
   def print_debug_exit():
     for name, _ in sorted(debug_times.items(), key=lambda x: -x[1]):
       print(f"{name:>20} : {debug_counts[name]:>6} {debug_times[name]:>10.2f} ms")
+
   atexit.register(print_debug_exit)

+
 class ProfileOp:
   def __init__(self, name, x, backward=False):
-    self.name = ("back_" if backward else "")+name
+    self.name = ("back_" if backward else "") + name
     self.x = x
+
   def __enter__(self):
-    if DEBUG: self.st = time.time()
+    if DEBUG:
+      self.st = time.time()
+
   def __exit__(self, *junk):
     if DEBUG:
       if cl_queue is not None:
         cl_queue.finish()
-      et = (time.time()-self.st)*1000.
+      et = (time.time() - self.st) * 1000.0
       debug_counts[self.name] += 1
       debug_times[self.name] += et
       print(f"{self.name:>20} : {et:>7.2f} ms {[y.shape for y in self.x]}")

+
 # **** GPU functions ****

 cl_ctx, cl_queue = None, None
+
+
 def require_init_gpu():
-  if not GPU: raise Exception("No GPU Support, install pyopencl")
+  if not GPU:
+    raise Exception("No GPU Support, install pyopencl")
   global cl_ctx, cl_queue
   if cl_queue is None:
     devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
@@ -45,28 +56,45 @@ def require_init_gpu():
     # this is an in-order command queue
     cl_queue = cl.CommandQueue(cl_ctx)

+
 class GPUBuffer:
   def __init__(self, shape, hostbuf=None):
     self.shape, self.dtype = tuple(shape), np.float32
-    self.cl = hostbuf.cl if isinstance(hostbuf, GPUBuffer) else \
-      cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0), 4*np.prod(shape),
-        hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None)
+    self.cl = (
+      hostbuf.cl
+      if isinstance(hostbuf, GPUBuffer)
+      else cl.Buffer(
+        cl_ctx,
+        cl.mem_flags.READ_WRITE
+        | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0),
+        4 * np.prod(shape),
+        hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None,
+      )
+    )

   def __repr__(self):
     return f"<GPUBuffer with shape {self.shape!r}>"

+
 # **** ANE functions ****

 ane = None
+
+
 def require_init_ane():
   global ane
   if ane is None:
     import ane.lib.ane, tinygrad.ops_ane
+
     ane = ane.lib.ane.ANE()

+
 # **** start with two base classes, Tensor and Function ****

-class Device: CPU, GPU, ANE = 0, 1, 2
+
+class Device:
+  CPU, GPU, ANE = 0, 1, 2
+

 class Tensor:
   did_float_warning = False
@@ -110,7 +138,12 @@ class Tensor:

   @classmethod
   def uniform(cls, *shape, **kwargs):
-    return cls((np.random.uniform(-1., 1., size=shape)/np.sqrt(np.prod(shape))).astype(np.float32), **kwargs)
+    return cls(
+      (np.random.uniform(-1.0, 1.0, size=shape) / np.sqrt(np.prod(shape))).astype(
+        np.float32
+      ),
+      **kwargs,
+    )

   @classmethod
   def eye(cls, dim, **kwargs):
@@ -130,18 +163,21 @@ class Tensor:

     # fill in the first grad with one
     # this is "implicit gradient creation"
-    self.grad = Tensor(np.ones(self.shape, dtype=self.dtype), device=self.device, requires_grad=False)
+    self.grad = Tensor(
+      np.ones(self.shape, dtype=self.dtype), device=self.device, requires_grad=False
+    )

     for t0 in reversed(self.deepwalk(set(), [])):
-      assert (t0.grad is not None)
+      assert t0.grad is not None
       with ProfileOp(t0._ctx.__class__.__name__, [t0.grad], backward=True):
         grads = t0._ctx.backward(t0._ctx, t0.grad.data)
       if len(t0._ctx.parents) == 1:
         grads = [grads]
-      for t,g in zip(t0._ctx.parents, grads):
+      for t, g in zip(t0._ctx.parents, grads):
         if g is not None:
-          assert g.shape == t.shape, \
-            f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
{g.shape!r} != {t.shape!r}" + assert ( + g.shape == t.shape + ), f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}" gt = Tensor(g, device=self.device, requires_grad=False) t.grad = gt if t.grad is None else (t.grad + gt) @@ -150,19 +186,21 @@ class Tensor: @staticmethod def _move_data(data, device): if isinstance(data, GPUBuffer): - if device == Device.GPU: return data + if device == Device.GPU: + return data old = data data = np.empty(old.shape, dtype=np.float32) with ProfileOp("toCPU", [data]): - cl.enqueue_copy(cl_queue, data, old.cl, is_blocking=True) + cl.enqueue_copy(cl_queue, data, old.cl, is_blocking=True) elif "ANETensor" in str(type(data)): - if device == Device.ANE: return data + if device == Device.ANE: + return data with ProfileOp("toCPU", [data]): - data = data.data().astype(np.float32) + data = data.data().astype(np.float32) if not isinstance(data, np.ndarray): - data = np.array(data, dtype=np.float32) + data = np.array(data, dtype=np.float32) if data.dtype != np.float32 and not Tensor.did_float_warning: # warning? float64 is actually needed for numerical jacobian @@ -184,14 +222,17 @@ class Tensor: def to_(self, device): self.data, self.device = self._move_data(self.data, device), device - if self.grad: self.grad.to_(device) + if self.grad: + self.grad.to_(device) def to(self, device): ret = Tensor(self.data, device) - if self.grad: ret.grad = self.grad.to(device) + if self.grad: + ret.grad = self.grad.to(device) return ret - def _is(self, device): return self.device == device + def _is(self, device): + return self.device == device def detach(self): return Tensor(self.data, device=self.device) @@ -203,7 +244,7 @@ class Tensor: def mean(self, axis=None): out = self.sum(axis=axis) - coeff = np.prod(out.shape)/np.prod(self.shape) + coeff = np.prod(out.shape) / np.prod(self.shape) return out * coeff def sqrt(self): @@ -219,15 +260,18 @@ class Tensor: return 2.0 * ((2.0 * self).sigmoid()) - 1.0 def leakyrelu(self, neg_slope=0.01): - return self.relu() - (-neg_slope*self).relu() + return self.relu() - (-neg_slope * self).relu() def dropout(self, p=0.5): - _mask = np.asarray(np.random.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype) + _mask = np.asarray( + np.random.binomial(1, 1.0 - p, size=self.shape), dtype=self.dtype + ) ret = self * Tensor(_mask, requires_grad=False, device=self.device) return ret.div(1.0 - p) def abs(self): - return self.relu() + (-1.0*self).relu() + return self.relu() + (-1.0 * self).relu() + # An instantiation of the Function is the Context class Function: @@ -239,7 +283,7 @@ class Function: self.saved_tensors.extend(x) def apply(self, *x, **kwargs): - ctx = self(*x) # self - operation i.e 'add', 'sub', etc. + ctx = self(*x) # self - operation i.e 'add', 'sub', etc. 
     # use default params
     params = signature(self.forward).parameters
     for p in params.values():
@@ -249,40 +293,66 @@ class Function:
       for k, v in kwargs.items():
         setattr(ctx, k, v)
     with ProfileOp(ctx.__class__.__name__, x):
-      ret = Tensor(self.forward(ctx, *[t.data for t in x], **kwargs),
-        device=ctx.device, requires_grad=any([t.requires_grad for t in x]))
+      ret = Tensor(
+        self.forward(ctx, *[t.data for t in x], **kwargs),
+        device=ctx.device,
+        requires_grad=any([t.requires_grad for t in x]),
+      )
     if ret.requires_grad:
       ret._ctx = ctx
     return ret

+
 def register(name, fxn, device=Device.CPU):
   Tensor.ops[device][name] = fxn
+
   def dispatch(*x, **kwargs):
     tt = [arg for arg in x if isinstance(arg, Tensor)][0]
-    x = [Tensor(np.array([arg], dtype=tt.dtype), device=tt.device, requires_grad=False) if not isinstance(arg, Tensor) else arg for arg in x]
+    x = [
+      Tensor(np.array([arg], dtype=tt.dtype), device=tt.device, requires_grad=False)
+      if not isinstance(arg, Tensor)
+      else arg
+      for arg in x
+    ]
     f = (Tensor.ops[tt.device])[name]
     f.cl_ctx, f.cl_queue, f.ane, f.device = cl_ctx, cl_queue, ane, tt.device
     return f.apply(f, *x, **kwargs)
+
   setattr(Tensor, name, dispatch)
   # TODO: div is a second class op, so it doesn't work here
-  if name in ['add', 'sub', 'mul', 'pow']:
+  if name in ["add", "sub", "mul", "pow"]:
     setattr(Tensor, f"__{name}__", dispatch)
-    setattr(Tensor, f"__i{name}__", lambda self,x: self.assign(dispatch(self,x)))
-    setattr(Tensor, f"__r{name}__", lambda self,x: dispatch(x,self))
+    setattr(Tensor, f"__i{name}__", lambda self, x: self.assign(dispatch(self, x)))
+    setattr(Tensor, f"__r{name}__", lambda self, x: dispatch(x, self))
+

 for device in [device for device in Device.__dict__.keys() if device[0] != "_"]:
-  setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, Device.__dict__[device]))
-  setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, Device.__dict__[device]))
-  setattr(Tensor, f"is_{device.lower()}", property(functools.partialmethod(Tensor._is, Device.__dict__[device])))
+  setattr(
+    Tensor,
+    f"{device.lower()}",
+    functools.partialmethod(Tensor.to, Device.__dict__[device]),
+  )
+  setattr(
+    Tensor,
+    f"{device.lower()}_",
+    functools.partialmethod(Tensor.to_, Device.__dict__[device]),
+  )
+  setattr(
+    Tensor,
+    f"is_{device.lower()}",
+    property(functools.partialmethod(Tensor._is, Device.__dict__[device])),
+  )

 # this registers all the operations
 import tinygrad.ops_cpu
+
 try:
   import pyopencl as cl
+
   # TODO: move this import to require_init_gpu?
   import tinygrad.ops_gpu
+
   GPU = True
 except ImportError:
   # no GPU support
   GPU = False
-
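None of the hunks in this patch should change behavior, only layout, so the public surface of tinygrad stays the same. A minimal smoke test of that surface, assuming a checkout at this revision on the import path (the shapes and op choices here are arbitrary):

import numpy as np
from tinygrad.tensor import Tensor, GPU

# ops like relu/logsoftmax are attached to Tensor by register()
x = Tensor(np.random.uniform(-1.0, 1.0, size=(4, 3)).astype(np.float32))
out = x.relu().logsoftmax().mean()
out.backward()
print(x.grad.shape)  # (4, 3), same shape as x per the assert in backward()

# per-device helpers (cpu()/gpu()/is_gpu etc.) are generated from Device
if GPU:
  y = x.gpu()  # copy to the GPU; y.is_gpu is True

Running something like this before and after the reformat (and diffing the numeric output) is a quick way to verify the patch is behavior-preserving.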