fixed pylint, formatted python files with black on localhost

Iain Wong
2020-12-17 13:36:48 -05:00
parent 799ad5ba17
commit 07e2b88466
30 changed files with 1688 additions and 819 deletions

View File

@@ -34,13 +34,21 @@ jobs:
run: python -m pytest -s -v
linter:
name: Indentation Linter
runs-on: ubuntu-latest
name: Indentation Linter
runs-on: ubuntu-latest
steps:
- name: Checkout Code
uses: actions/checkout@v2
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install pylint
- name: Lint with pylint
run: |
python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
steps:
- name: Checkout Code
uses: actions/checkout@v2
- name: Install PyLint
run: sudo apt-get install pylint
- name: Validate indentation is 2 spaces
run: if [[ $(pylint --jobs=0 --indent-string=' ' * | grep "Bad indentation") ]]; then exit 1; fi
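The new job runs pylint with every check disabled except W0311 (bad-indentation) and a two-space indent string (tinygrad's house style; the whitespace inside the quotes may be collapsed in this view). A minimal sketch of a file that should trip the check, with a hypothetical name:

# bad_indent.py -- four-space body under the two-space policy
def f(x):
    return x + 1  # W0311: Bad indentation. Found 4 spaces, expected 2

Checked locally with the same command the workflow runs:

python -m pylint --disable=all -e W0311 --jobs=0 --indent-string='  ' bad_indent.py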

View File

@@ -3,24 +3,34 @@ import numpy as np
import coremltools as ct
from coremltools.models.neural_network import datatypes, NeuralNetworkBuilder
input_features = [('image', datatypes.Array(3))]
output_features = [('probs', datatypes.Array(3))]
input_features = [("image", datatypes.Array(3))]
output_features = [("probs", datatypes.Array(3))]
weights = np.zeros((3, 3)) + 3
bias = np.ones(3)
builder = NeuralNetworkBuilder(input_features, output_features)
builder.add_inner_product(name='ip_layer', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='image', output_name='med')
#builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
#builder.add_elementwise(name='element', input_names=['med', 'med'], output_name='probs', mode='ADD')
builder.add_bias(name='bias', b=bias, input_name='med', output_name='probs', shape_bias=(3,))
#builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')
builder.add_inner_product(
name="ip_layer",
W=weights,
b=None,
input_channels=3,
output_channels=3,
has_bias=False,
input_name="image",
output_name="med",
)
# builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
# builder.add_elementwise(name='element', input_names=['med', 'med'], output_name='probs', mode='ADD')
builder.add_bias(
name="bias", b=bias, input_name="med", output_name="probs", shape_bias=(3,)
)
# builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')
# compile the spec
mlmodel = ct.models.MLModel(builder.spec)
# trigger the ANE!
out = mlmodel.predict({"image": np.array([1337,0,0], dtype=np.float32)})
out = mlmodel.predict({"image": np.array([1337, 0, 0], dtype=np.float32)})
print(out)
mlmodel.save('test.mlmodel')
mlmodel.save("test.mlmodel")

View File

@@ -2,17 +2,21 @@
import sys
from hexdump import hexdump
from macholib import MachO
def get_macho(fn):
# mod to make the header okay
# MH_CIGAM_64 is good
dat = open(fn, "rb").read()
dat = b"\xcf\xfa\xed\xfe"+dat[4:]
dat = b"\xcf\xfa\xed\xfe" + dat[4:]
from tempfile import NamedTemporaryFile
with NamedTemporaryFile(delete=False) as f:
f.write(dat)
f.close()
return MachO.MachO(f.name)
a = get_macho("model.hwx")
# load commands
@@ -21,14 +25,22 @@ for c in a.headers[0].commands:
if c[0].cmd == 25:
print(c[1])
for section in c[2]:
print(section.segname.strip(b'\0'), section.sectname.strip(b'\0'), hex(section.addr), hex(section.size), "@", hex(c[1].fileoff))
#print(dir(section))
print(
section.segname.strip(b"\0"),
section.sectname.strip(b"\0"),
hex(section.addr),
hex(section.size),
"@",
hex(c[1].fileoff),
)
# print(dir(section))
if c[1].filesize > 0:
hexdump(section.section_data)
# this parser is wrong (fixed with 64-bit one)
from macholib import SymbolTable
sym = SymbolTable.SymbolTable(a)
sym = SymbolTable.SymbolTable(a)
syms = {}
for l in sym.nlists:
@@ -36,64 +48,65 @@ for l in sym.nlists:
if l[0].n_value != 0:
syms[l[1]] = l[0].n_value
for k,v in syms.items():
for k, v in syms.items():
print(k, hex(v))
from termcolor import colored
def compare(x, y):
ss = []
ln = []
ln2 = []
ll = (max(len(x), len(y)) + 0xF)//0x10 * 0x10
ll = (max(len(x), len(y)) + 0xF) // 0x10 * 0x10
highlight = False
next_highlight = 0x2b
for i in range(ll+1):
next_highlight = 0x2B
for i in range(ll + 1):
if i == next_highlight:
highlight = True
if i < len(y):
next_highlight += y[i]+8
next_highlight += y[i] + 8
else:
next_highlight = None
else:
highlight = False
a = "%02X" % x[i] if i < len(x) else "--", \
"%02X" % y[i] if i < len(y) else "--"
a = "%02X" % x[i] if i < len(x) else "--", "%02X" % y[i] if i < len(y) else "--"
def fj(x):
ss = []
for i in range(0, 0x10, 4):
ss.append(' '.join(x[i:i+4]))
return ' '.join(ss)
ss.append(" ".join(x[i : i + 4]))
return " ".join(ss)
if i!=0 and i%0x10 == 0:
ss.append("%8X: " % (i-0x10)+fj(ln)+" | "+fj(ln2)+"\n")
if i != 0 and i % 0x10 == 0:
ss.append("%8X: " % (i - 0x10) + fj(ln) + " | " + fj(ln2) + "\n")
ln = []
ln2 = []
if a[0] != a[1] and a[0] != "--" and a[1] != "--":
ln.append(colored(a[0], 'green'))
ln2.append(colored(a[1], 'red'))
ln.append(colored(a[0], "green"))
ln2.append(colored(a[1], "red"))
else:
if highlight:
ln.append(colored(a[0], 'yellow'))
ln2.append(colored(a[1], 'yellow'))
ln.append(colored(a[0], "yellow"))
ln2.append(colored(a[1], "yellow"))
else:
ln.append(a[0])
ln2.append(a[1])
return ''.join(ss)
return "".join(ss)
g = get_macho("model.hwx.golden" if len(sys.argv) < 2 else sys.argv[1])
f1 = g.headers[0].commands[1][2][0].section_data
f2 = a.headers[0].commands[1][2][0].section_data
for i in range(0, len(f2), 0x300):
print("===== op %d =====" % (i//0x300))
print("===== op %d =====" % (i // 0x300))
if len(f1) < 0x300:
print(compare(f1, f2[i:i+0x300]))
print(compare(f1, f2[i : i + 0x300]))
else:
print(compare(f1[i:i+0x300], f2[i:i+0x300]))
#open("/tmp/data.section", "wb").write(f2)
#print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read()))
print(compare(f1[i : i + 0x300], f2[i : i + 0x300]))
# open("/tmp/data.section", "wb").write(f2)
# print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read()))
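compare() renders two buffers side by side, coloring mismatched byte pairs green/red; starting at offset 0x2B it also steps a yellow highlight forward by y[i] + 8 each time, presumably walking a length-prefixed structure in the hwx data. A minimal sketch on illustrative inputs (not real hwx bytes), assuming termcolor is available:

x = bytes(range(16)) * 2
y = bytes(range(16)) + b"\xff" * 16
print(compare(x, y))  # first row matches, second row prints green/red pairs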

View File

@@ -3,11 +3,12 @@ import os
from ctypes import *
import numpy as np
import faulthandler
faulthandler.enable()
libane = cdll.LoadLibrary(os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"libane.dylib"))
libane = cdll.LoadLibrary(
os.path.join(os.path.dirname(os.path.abspath(__file__)), "libane.dylib")
)
libane.ANE_Compile.argtypes = [c_char_p, c_int]
libane.ANE_Compile.restype = c_void_p
@@ -17,34 +18,36 @@ libane.ANE_TensorCreate.restype = c_void_p
libane.ANE_TensorData.argtypes = [c_void_p]
libane.ANE_TensorData.restype = POINTER(c_uint16)
libane.ANE_Run.argtypes = [c_void_p]*3
libane.ANE_Run.argtypes = [c_void_p] * 3
libane.ANE_Run.restype = c_int
class ANETensor:
def __init__(self, *shape):
self.shape = shape
self.dtype = np.float16
self.sz = int(np.prod(shape))
assert(self.sz <= 0x4000)
assert self.sz <= 0x4000
self.tt = libane.ANE_TensorCreate(self.sz, 1)
assert(self.tt is not None)
assert self.tt is not None
def data(self):
data = libane.ANE_TensorData(self.tt)
assert(data is not None)
#print(hex(addressof(data.contents)))
assert data is not None
# print(hex(addressof(data.contents)))
buf = np.ctypeslib.as_array(data, shape=(self.sz,))
ret = np.frombuffer(buf, dtype=self.dtype)
#print(ret.data)
# print(ret.data)
return ret
class ANE:
def __init__(self):
libane.ANE_Open()
def compile(self, dat):
ret = libane.ANE_Compile(create_string_buffer(dat), len(dat))
assert(ret is not None)
assert ret is not None
return ret
def run(self, prog, tin, tout):
@@ -53,6 +56,7 @@ class ANE:
def tensor(self, shape):
return ANETensor(shape)
if __name__ == "__main__":
ane = ANE()
@@ -62,7 +66,7 @@ if __name__ == "__main__":
tind = tin.data()
toutd = tout.data()
tind[0:4] = [-1,1,-2,2]
tind[0:4] = [-1, 1, -2, 2]
print(tind)
print(toutd)
@@ -71,4 +75,3 @@ if __name__ == "__main__":
print(tind)
print(toutd)
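ANE_TensorData hands back a pointer to raw uint16 words, which data() reinterprets as float16 via np.frombuffer. The same reinterpretation in pure numpy, with hand-picked bit patterns (illustrative values):

import numpy as np

raw = np.array([0x3C00, 0xC000, 0x0000], dtype=np.uint16)  # float16 bit patterns
print(np.frombuffer(raw.tobytes(), dtype=np.float16))  # [ 1. -2.  0.]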

View File

@@ -17,8 +17,9 @@ for dev in ["CPU", "GPU", "ANE"]:
boaa = baa.relu()
et = time.time()
if i == 2:
print("%s can do at least %.2f MEGAReLUs/sec" % (dev, (np.prod(boaa.shape)/1e6)/(et-st)))
print(
"%s can do at least %.2f MEGAReLUs/sec"
% (dev, (np.prod(boaa.shape) / 1e6) / (et - st))
)
# decently reliable
assert(np.all(boaa.cpu().data >= 0))
assert np.all(boaa.cpu().data >= 0)

View File

@@ -3,24 +3,29 @@
# a rough copy of
# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py
import os
GPU = os.getenv("GPU", None) is not None
import sys
import io
import time
import numpy as np
np.set_printoptions(suppress=True)
from tinygrad.tensor import Tensor
from extra.utils import fetch, get_parameters
from extra.efficientnet import EfficientNet
def infer(model, img):
# preprocess image
aspect_ratio = img.size[0] / img.size[1]
img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
img = img.resize(
(int(224 * max(aspect_ratio, 1.0)), int(224 * max(1.0 / aspect_ratio, 1.0)))
)
img = np.array(img)
y0,x0=(np.asarray(img.shape)[:2]-224)//2
retimg = img = img[y0:y0+224, x0:x0+224]
y0, x0 = (np.asarray(img.shape)[:2] - 224) // 2
retimg = img = img[y0 : y0 + 224, x0 : x0 + 224]
# if you want to look at the image
"""
@@ -30,11 +35,11 @@ def infer(model, img):
"""
# low level preprocess
img = np.moveaxis(img, [2,0,1], [0,1,2])
img = img.astype(np.float32)[:3].reshape(1,3,224,224)
img = np.moveaxis(img, [2, 0, 1], [0, 1, 2])
img = img.astype(np.float32)[:3].reshape(1, 3, 224, 224)
img /= 255.0
img -= np.array([0.485, 0.456, 0.406]).reshape((1,-1,1,1))
img /= np.array([0.229, 0.224, 0.225]).reshape((1,-1,1,1))
img -= np.array([0.485, 0.456, 0.406]).reshape((1, -1, 1, 1))
img /= np.array([0.229, 0.224, 0.225]).reshape((1, -1, 1, 1))
# run the net
if GPU:
@@ -50,6 +55,7 @@ def infer(model, img):
"""
return out, retimg
if __name__ == "__main__":
# instantiate my net
model = EfficientNet(int(os.getenv("NUM", "0")))
@@ -59,38 +65,42 @@ if __name__ == "__main__":
# category labels
import ast
lbls = fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt")
lbls = ast.literal_eval(lbls.decode('utf-8'))
lbls = fetch(
"https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt"
)
lbls = ast.literal_eval(lbls.decode("utf-8"))
# load image and preprocess
from PIL import Image
url = sys.argv[1]
if url == 'webcam':
if url == "webcam":
import cv2
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
while 1:
_ = cap.grab() # discard one frame to circumvent capture buffering
_ = cap.grab() # discard one frame to circumvent capture buffering
ret, frame = cap.read()
img = Image.fromarray(frame[:, :, [2,1,0]])
img = Image.fromarray(frame[:, :, [2, 1, 0]])
out, retimg = infer(model, img)
print(np.argmax(out.data), np.max(out.data), lbls[np.argmax(out.data)])
SCALE = 3
simg = cv2.resize(retimg, (224*SCALE, 224*SCALE))
simg = cv2.resize(retimg, (224 * SCALE, 224 * SCALE))
retimg = cv2.cvtColor(simg, cv2.COLOR_RGB2BGR)
cv2.imshow('capture', retimg)
if cv2.waitKey(1) & 0xFF == ord('q'):
cv2.imshow("capture", retimg)
if cv2.waitKey(1) & 0xFF == ord("q"):
break
cap.release()
cv2.destroyAllWindows()
else:
if url.startswith('http'):
if url.startswith("http"):
img = Image.open(io.BytesIO(fetch(url)))
else:
img = Image.open(url)
st = time.time()
out, _ = infer(model, img)
print(np.argmax(out.data), np.max(out.data), lbls[np.argmax(out.data)])
print("did inference in %.2f s" % (time.time()-st))
#print("NOT", np.argmin(out.data), np.min(out.data), lbls[np.argmin(out.data)])
print("did inference in %.2f s" % (time.time() - st))
# print("NOT", np.argmin(out.data), np.min(out.data), lbls[np.argmin(out.data)])

View File

@@ -3,8 +3,9 @@ import os
import sys
import numpy as np
from tqdm import tqdm
sys.path.append(os.getcwd())
sys.path.append(os.path.join(os.getcwd(), 'test'))
sys.path.append(os.path.join(os.getcwd(), "test"))
from tinygrad.tensor import Tensor, Function, register
from extra.utils import get_parameters
@@ -12,7 +13,10 @@ import tinygrad.optim as optim
from test_mnist import X_train
from torchvision.utils import make_grid, save_image
import torch
GPU = os.getenv("GPU") is not None
class LinearGen:
def __init__(self):
lv = 128
@@ -28,6 +32,7 @@ class LinearGen:
x = x.dot(self.l4).tanh()
return x
class LinearDisc:
def __init__(self):
in_sh = 784
@@ -39,108 +44,112 @@ class LinearDisc:
def forward(self, x, train=True):
x = x.dot(self.l1).leakyrelu(0.2)
if train:
x = x.dropout(0.3)
x = x.dropout(0.3)
x = x.dot(self.l2).leakyrelu(0.2)
if train:
x = x.dropout(0.3)
x = x.dropout(0.3)
x = x.dot(self.l3).leakyrelu(0.2)
if train:
x = x.dropout(0.3)
x = x.dropout(0.3)
x = x.dot(self.l4).logsoftmax()
return x
if __name__ == "__main__":
generator = LinearGen()
discriminator = LinearDisc()
batch_size = 512
k = 1
epochs = 300
generator_params = get_parameters(generator)
discriminator_params = get_parameters(discriminator)
gen_loss = []
disc_loss = []
output_folder = "outputs"
os.makedirs(output_folder, exist_ok=True)
train_data_size = len(X_train)
ds_noise = Tensor(np.random.randn(64,128).astype(np.float32), gpu=GPU, requires_grad=False)
n_steps = int(train_data_size/batch_size)
if GPU:
[x.cuda_() for x in generator_params+discriminator_params]
# optimizers
optim_g = optim.Adam(generator_params,lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
optim_d = optim.Adam(discriminator_params,lr=0.0002, b1=0.5)
generator = LinearGen()
discriminator = LinearDisc()
batch_size = 512
k = 1
epochs = 300
generator_params = get_parameters(generator)
discriminator_params = get_parameters(discriminator)
gen_loss = []
disc_loss = []
output_folder = "outputs"
os.makedirs(output_folder, exist_ok=True)
train_data_size = len(X_train)
ds_noise = Tensor(
np.random.randn(64, 128).astype(np.float32), gpu=GPU, requires_grad=False
)
n_steps = int(train_data_size / batch_size)
if GPU:
[x.cuda_() for x in generator_params + discriminator_params]
# optimizers
optim_g = optim.Adam(generator_params, lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
optim_d = optim.Adam(discriminator_params, lr=0.0002, b1=0.5)
def regularization_l2(model, a=1e-4):
#TODO: l2 reg loss
pass
def regularization_l2(model, a=1e-4):
# TODO: l2 reg loss
pass
def generator_batch():
idx = np.random.randint(0, X_train.shape[0], size=(batch_size))
image_b = X_train[idx].reshape(-1, 28*28).astype(np.float32)/255.
image_b = (image_b - 0.5)/0.5
return Tensor(image_b, gpu=GPU)
def generator_batch():
idx = np.random.randint(0, X_train.shape[0], size=(batch_size))
image_b = X_train[idx].reshape(-1, 28 * 28).astype(np.float32) / 255.0
image_b = (image_b - 0.5) / 0.5
return Tensor(image_b, gpu=GPU)
def real_label(bs):
y = np.zeros((bs,2), np.float32)
y[range(bs), [1]*bs] = -2.0
real_labels = Tensor(y, gpu=GPU)
return real_labels
def real_label(bs):
y = np.zeros((bs, 2), np.float32)
y[range(bs), [1] * bs] = -2.0
real_labels = Tensor(y, gpu=GPU)
return real_labels
def fake_label(bs):
y = np.zeros((bs,2), np.float32)
y[range(bs), [0]*bs] = -2.0 # Can we do label smoothing? i.e. -2.0 changed to -1.98789.
fake_labels = Tensor(y, gpu=GPU)
return fake_labels
def fake_label(bs):
y = np.zeros((bs, 2), np.float32)
y[
range(bs), [0] * bs
] = -2.0  # Can we do label smoothing? i.e. -2.0 changed to -1.98789.
fake_labels = Tensor(y, gpu=GPU)
return fake_labels
def train_discriminator(optimizer, data_real, data_fake):
real_labels = real_label(batch_size)
fake_labels = fake_label(batch_size)
def train_discriminator(optimizer, data_real, data_fake):
real_labels = real_label(batch_size)
fake_labels = fake_label(batch_size)
optimizer.zero_grad()
optimizer.zero_grad()
output_real = discriminator.forward(data_real)
loss_real = (output_real * real_labels).mean()
output_real = discriminator.forward(data_real)
loss_real = (output_real * real_labels).mean()
output_fake = discriminator.forward(data_fake)
loss_fake = (output_fake * fake_labels).mean()
output_fake = discriminator.forward(data_fake)
loss_fake = (output_fake * fake_labels).mean()
loss_real.backward()
loss_fake.backward()
optimizer.step()
return loss_real.cpu().data + loss_fake.cpu().data
loss_real.backward()
loss_fake.backward()
optimizer.step()
return loss_real.cpu().data + loss_fake.cpu().data
def train_generator(optimizer, data_fake):
real_labels = real_label(batch_size)
optimizer.zero_grad()
output = discriminator.forward(data_fake)
loss = (output * real_labels).mean()
loss.backward()
optimizer.step()
return loss.cpu().data
for epoch in tqdm(range(epochs)):
loss_g = 0.0
loss_d = 0.0
print(f"Epoch {epoch} of {epochs}")
for i in tqdm(range(n_steps)):
image = generator_batch()
for step in range(k): # Try with k = 5 or 7.
noise = Tensor(np.random.randn(batch_size,128), gpu=GPU)
data_fake = generator.forward(noise).detach()
data_real = image
loss_d_step = train_discriminator(optim_d, data_real, data_fake)
loss_d += loss_d_step
noise = Tensor(np.random.randn(batch_size,128), gpu=GPU)
data_fake = generator.forward(noise)
loss_g_step = train_generator(optim_g, data_fake)
loss_g += loss_g_step
fake_images = generator.forward(ds_noise).detach().cpu().data
fake_images = (fake_images.reshape(-1, 1, 28, 28)+ 1) / 2 # 0 - 1 range.
fake_images = make_grid(torch.tensor(fake_images))
save_image(fake_images, os.path.join(output_folder,f"image_{epoch}.jpg"))
epoch_loss_g = loss_g / n_steps
epoch_loss_d = loss_d / n_steps
print(f"EPOCH: Generator loss: {epoch_loss_g}, Discriminator loss: {epoch_loss_d}")
else:
print("Training Completed!")
def train_generator(optimizer, data_fake):
real_labels = real_label(batch_size)
optimizer.zero_grad()
output = discriminator.forward(data_fake)
loss = (output * real_labels).mean()
loss.backward()
optimizer.step()
return loss.cpu().data
for epoch in tqdm(range(epochs)):
loss_g = 0.0
loss_d = 0.0
print(f"Epoch {epoch} of {epochs}")
for i in tqdm(range(n_steps)):
image = generator_batch()
for step in range(k): # Try with k = 5 or 7.
noise = Tensor(np.random.randn(batch_size, 128), gpu=GPU)
data_fake = generator.forward(noise).detach()
data_real = image
loss_d_step = train_discriminator(optim_d, data_real, data_fake)
loss_d += loss_d_step
noise = Tensor(np.random.randn(batch_size, 128), gpu=GPU)
data_fake = generator.forward(noise)
loss_g_step = train_generator(optim_g, data_fake)
loss_g += loss_g_step
fake_images = generator.forward(ds_noise).detach().cpu().data
fake_images = (fake_images.reshape(-1, 1, 28, 28) + 1) / 2 # 0 - 1 range.
fake_images = make_grid(torch.tensor(fake_images))
save_image(fake_images, os.path.join(output_folder, f"image_{epoch}.jpg"))
epoch_loss_g = loss_g / n_steps
epoch_loss_d = loss_d / n_steps
print(f"EPOCH: Generator loss: {epoch_loss_g}, Discriminator loss: {epoch_loss_d}")
else:
print("Training Completed!")

View File

@@ -1,9 +1,10 @@
#!/usr/bin/env python
#inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
# inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
import os
import sys
sys.path.append(os.getcwd())
sys.path.append(os.path.join(os.getcwd(), 'test'))
sys.path.append(os.path.join(os.getcwd(), "test"))
import numpy as np
from tinygrad.tensor import Tensor, GPU
@@ -13,126 +14,139 @@ from test_mnist import fetch_mnist
from extra.training import train, evaluate
import tinygrad.optim as optim
from extra.augment import augment_img
GPU = os.getenv("GPU", None) is not None
QUICK = os.getenv("QUICK", None) is not None
DEBUG = os.getenv("DEBUG", None) is not None
class SqueezeExciteBlock2D:
def __init__(self, filters):
self.filters = filters
self.weight1 = Tensor.uniform(self.filters, self.filters//32)
self.bias1 = Tensor.uniform(1,self.filters//32)
self.weight2 = Tensor.uniform(self.filters//32, self.filters)
self.weight1 = Tensor.uniform(self.filters, self.filters // 32)
self.bias1 = Tensor.uniform(1, self.filters // 32)
self.weight2 = Tensor.uniform(self.filters // 32, self.filters)
self.bias2 = Tensor.uniform(1, self.filters)
def __call__(self, input):
se = input.avg_pool2d(kernel_size=(input.shape[2], input.shape[3])) #GlobalAveragePool2D
se = input.avg_pool2d(
kernel_size=(input.shape[2], input.shape[3])
) # GlobalAveragePool2D
se = se.reshape(shape=(-1, self.filters))
se = se.dot(self.weight1) + self.bias1
se = se.relu()
se = se.relu()
se = se.dot(self.weight2) + self.bias2
se = se.sigmoid().reshape(shape=(-1,self.filters,1,1)) #for broadcasting
se = se.sigmoid().reshape(shape=(-1, self.filters, 1, 1)) # for broadcasting
se = input.mul(se)
return se
class ConvBlock:
def __init__(self, h, w, inp, filters=128, conv=3):
self.h, self.w = h, w
self.inp = inp
#init weights
self.cweights = [Tensor.uniform(filters, inp if i==0 else filters, conv, conv) for i in range(3)]
# init weights
self.cweights = [
Tensor.uniform(filters, inp if i == 0 else filters, conv, conv) for i in range(3)
]
self.cbiases = [Tensor.uniform(1, filters, 1, 1) for i in range(3)]
#init layers
# init layers
self._bn = BatchNorm2D(128, training=True)
self._seb = SqueezeExciteBlock2D(filters)
def __call__(self, input):
x = input.reshape(shape=(-1, self.inp, self.w, self.h))
x = input.reshape(shape=(-1, self.inp, self.w, self.h))
for cweight, cbias in zip(self.cweights, self.cbiases):
x = x.pad2d(padding=[1,1,1,1]).conv2d(cweight).add(cbias).relu()
x = x.pad2d(padding=[1, 1, 1, 1]).conv2d(cweight).add(cbias).relu()
x = self._bn(x)
x = self._seb(x)
return x
class BigConvNet:
def __init__(self):
self.conv = [ConvBlock(28,28,1), ConvBlock(28,28,128), ConvBlock(14,14,128)]
self.weight1 = Tensor.uniform(128,10)
self.weight2 = Tensor.uniform(128,10)
self.conv = [ConvBlock(28, 28, 1), ConvBlock(28, 28, 128), ConvBlock(14, 14, 128)]
self.weight1 = Tensor.uniform(128, 10)
self.weight2 = Tensor.uniform(128, 10)
def parameters(self):
if DEBUG: #keeping this for a moment
if DEBUG: # keeping this for a moment
pars = [par for par in get_parameters(self) if par.requires_grad]
no_pars = 0
for par in pars:
print(par.shape)
no_pars += np.prod(par.shape)
print('no of parameters', no_pars)
print("no of parameters", no_pars)
return pars
else:
return get_parameters(self)
def save(self, filename):
with open(filename+'.npy', 'wb') as f:
with open(filename + ".npy", "wb") as f:
for par in get_parameters(self):
#if par.requires_grad:
# if par.requires_grad:
np.save(f, par.cpu().data)
def load(self, filename):
with open(filename+'.npy', 'rb') as f:
for par in get_parameters(self):
#if par.requires_grad:
with open(filename + ".npy", "rb") as f:
for par in get_parameters(self):
# if par.requires_grad:
try:
par.cpu().data[:] = np.load(f)
if GPU:
par.cuda()
except:
print('Could not load parameter')
print("Could not load parameter")
def forward(self, x):
x = self.conv[0](x)
x = self.conv[1](x)
x = x.avg_pool2d(kernel_size=(2,2))
x = x.avg_pool2d(kernel_size=(2, 2))
x = self.conv[2](x)
x1 = x.avg_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
x2 = x.max_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
x1 = x.avg_pool2d(kernel_size=(14, 14)).reshape(shape=(-1, 128)) # global
x2 = x.max_pool2d(kernel_size=(14, 14)).reshape(shape=(-1, 128)) # global
xo = x1.dot(self.weight1) + x2.dot(self.weight2)
return xo.logsoftmax()
if __name__ == "__main__":
lrs = [1e-4, 1e-5] if QUICK else [1e-3, 1e-4, 1e-5, 1e-5]
epochss = [2, 1] if QUICK else [13, 3, 3, 1]
epochss = [2, 1] if QUICK else [13, 3, 3, 1]
BS = 32
lmbd = 0.00025
lossfn = lambda out,y: out.mul(y).mean() + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
lossfn = (
lambda out, y: out.mul(y).mean()
+ lmbd * (model.weight1.abs() + model.weight2.abs()).sum()
)
X_train, Y_train, X_test, Y_test = fetch_mnist()
steps = len(X_train)//BS
steps = len(X_train) // BS
np.random.seed(1337)
if QUICK:
steps = 1
X_test, Y_test = X_test[:BS], Y_test[:BS]
model = BigConvNet()
if len(sys.argv) > 1:
try:
model.load(sys.argv[1])
print('Loaded weights "'+sys.argv[1]+'", evaluating...')
print('Loaded weights "' + sys.argv[1] + '", evaluating...')
evaluate(model, X_test, Y_test, BS=BS)
except:
print('could not load weights "'+sys.argv[1]+'".')
print('could not load weights "' + sys.argv[1] + '".')
if GPU:
params = get_parameters(model)
[x.cuda_() for x in params]
for lr, epochs in zip(lrs, epochss):
optimizer = optim.Adam(model.parameters(), lr=lr)
for epoch in range(1,epochs+1):
#first epoch without augmentation
for epoch in range(1, epochs + 1):
# first epoch without augmentation
X_aug = X_train if epoch == 1 else augment_img(X_train)
train(model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, gpu=GPU, BS=BS)
train(
model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, gpu=GPU, BS=BS
)
accuracy = evaluate(model, X_test, Y_test, BS=BS)
model.save('examples/checkpoint'+str("%.0f" % (accuracy*1.0e6)))
model.save("examples/checkpoint" + str("%.0f" % (accuracy * 1.0e6)))

View File

@@ -10,13 +10,14 @@ import io
import tarfile
import pickle
class TinyConvNet:
def __init__(self, classes=10):
conv = 3
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.uniform(inter_chan,3,conv,conv)
self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
self.l1 = Tensor.uniform(out_chan*6*6, classes)
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.uniform(inter_chan, 3, conv, conv)
self.c2 = Tensor.uniform(out_chan, inter_chan, conv, conv)
self.l1 = Tensor.uniform(out_chan * 6 * 6, classes)
def forward(self, x):
x = x.conv2d(self.c1).relu().max_pool2d()
@@ -24,13 +25,20 @@ class TinyConvNet:
x = x.reshape(shape=[x.shape[0], -1])
return x.dot(self.l1).logsoftmax()
def load_cifar():
tt = tarfile.open(fileobj=io.BytesIO(fetch('https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')), mode='r:gz')
db = pickle.load(tt.extractfile('cifar-10-batches-py/data_batch_1'), encoding="bytes")
X = db[b'data'].reshape((-1, 3, 32, 32))
Y = np.array(db[b'labels'])
tt = tarfile.open(
fileobj=io.BytesIO(
fetch("https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")
),
mode="r:gz",
)
db = pickle.load(tt.extractfile("cifar-10-batches-py/data_batch_1"), encoding="bytes")
X = db[b"data"].reshape((-1, 3, 32, 32))
Y = np.array(db[b"labels"])
return X, Y
if __name__ == "__main__":
X_train, Y_train = load_cifar()
classes = 10
@@ -40,7 +48,7 @@ if __name__ == "__main__":
TRANSFER = os.getenv("TRANSFER") is not None
if TINY:
model = TinyConvNet(classes)
elif TRANSFER:
elif TRANSFER:
model = EfficientNet(int(os.getenv("NUM", "0")), classes, has_se=True)
model.load_weights_from_torch()
else:
@@ -50,21 +58,21 @@ if __name__ == "__main__":
print("parameters", len(parameters))
optimizer = optim.Adam(parameters, lr=0.001)
#BS, steps = 16, 32
# BS, steps = 16, 32
BS, steps = 64 if TINY else 16, 2048
for i in (t := trange(steps)):
samp = np.random.randint(0, X_train.shape[0], size=(BS))
img = X_train[samp].astype(np.float32)
st = time.time()
out = model.forward(Tensor(img))
fp_time = (time.time()-st)*1000.0
fp_time = (time.time() - st) * 1000.0
Y = Y_train[samp]
y = np.zeros((BS,classes), np.float32)
y[range(y.shape[0]),Y] = -classes
y = np.zeros((BS, classes), np.float32)
y[range(y.shape[0]), Y] = -classes
y = Tensor(y)
loss = out.logsoftmax().mul(y).mean()
@@ -72,25 +80,32 @@ if __name__ == "__main__":
st = time.time()
loss.backward()
bp_time = (time.time()-st)*1000.0
bp_time = (time.time() - st) * 1000.0
st = time.time()
optimizer.step()
opt_time = (time.time()-st)*1000.0
opt_time = (time.time() - st) * 1000.0
#print(out.cpu().data)
# print(out.cpu().data)
st = time.time()
loss = loss.cpu().data
cat = np.argmax(out.cpu().data, axis=1)
accuracy = (cat == Y).mean()
finish_time = (time.time()-st)*1000.0
finish_time = (time.time() - st) * 1000.0
# printing
t.set_description("loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f" %
(loss, accuracy,
fp_time, bp_time, opt_time, finish_time,
fp_time + bp_time + opt_time + finish_time))
t.set_description(
"loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f"
% (
loss,
accuracy,
fp_time,
bp_time,
opt_time,
finish_time,
fp_time + bp_time + opt_time + finish_time,
)
)
del out, y, loss

View File

@@ -3,9 +3,8 @@ import numpy as np
from tinygrad.tensor import Tensor
import time
a = Tensor([-2,-1,0,1,2]).ane()
a = Tensor([-2, -1, 0, 1, 2]).ane()
print(a.cpu())
b = a.relu()
print(b.cpu())
assert(np.all(b.cpu().data >= 0))
assert np.all(b.cpu().data >= 0)

View File

@@ -2,39 +2,47 @@ import numpy as np
from PIL import Image
import os
import sys
sys.path.append(os.getcwd())
sys.path.append(os.path.join(os.getcwd(), 'test'))
sys.path.append(os.path.join(os.getcwd(), "test"))
from test_mnist import fetch_mnist
from tqdm import trange
def augment_img(X, rotate=10, px=3):
Xaug = np.zeros_like(X)
for i in trange(len(X)):
im = Image.fromarray(X[i])
im = im.rotate(np.random.randint(-rotate,rotate), resample=Image.BICUBIC)
im = im.rotate(np.random.randint(-rotate, rotate), resample=Image.BICUBIC)
w, h = X.shape[1:]
#upper left, lower left, lower right, upper right
quad = np.random.randint(-px,px,size=(8)) + np.array([0,0,0,h,w,h,w,0])
# upper left, lower left, lower right, upper right
quad = np.random.randint(-px, px, size=(8)) + np.array([0, 0, 0, h, w, h, w, 0])
im = im.transform((w, h), Image.QUAD, quad, resample=Image.BICUBIC)
Xaug[i] = im
return Xaug
if __name__ == "__main__":
from test_mnist import fetch_mnist
import matplotlib.pyplot as plt
X_train, Y_train, X_test, Y_test = fetch_mnist()
X = np.vstack([X_train[:1]]*10+[X_train[1:2]]*10)
fig, a = plt.subplots(2,len(X))
X = np.vstack([X_train[:1]] * 10 + [X_train[1:2]] * 10)
fig, a = plt.subplots(2, len(X))
Xaug = augment_img(X)
for i in range(len(X)):
a[0][i].imshow(X[i], cmap='gray')
a[1][i].imshow(Xaug[i],cmap='gray')
a[0][i].axis('off')
a[1][i].axis('off')
a[0][i].imshow(X[i], cmap="gray")
a[1][i].imshow(Xaug[i], cmap="gray")
a[0][i].axis("off")
a[1][i].axis("off")
plt.show()
#create some nice gifs for doc?!
# create some nice gifs for doc?!
for i in range(10):
im = Image.fromarray(X_train[7353+i])
im_aug = [Image.fromarray(x) for x in augment_img(np.array([X_train[7353+i]]*100))]
im.save("aug"+str(i)+".gif", save_all=True, append_images=im_aug, duration=100, loop=0)
im = Image.fromarray(X_train[7353 + i])
im_aug = [
Image.fromarray(x) for x in augment_img(np.array([X_train[7353 + i]] * 100))
]
im.save(
"aug" + str(i) + ".gif", save_all=True, append_images=im_aug, duration=100, loop=0
)

View File

@@ -6,6 +6,7 @@ from extra.utils import fetch
USE_TORCH = False
def fake_torch_load(b0):
import io
import pickle
@@ -23,9 +24,9 @@ def fake_torch_load(b0):
class HackTensor:
def __new__(cls, *args):
#print(args)
# print(args)
ident, storage_type, obj_key, location, obj_size, view_metadata = args[0]
assert ident == 'storage'
assert ident == "storage"
ret = np.zeros(obj_size, dtype=storage_type)
key_prelookup[obj_key] = (storage_type, obj_size, ret, args[2], args[3])
@@ -33,10 +34,10 @@ def fake_torch_load(b0):
class MyPickle(pickle.Unpickler):
def find_class(self, module, name):
#print(module, name)
if name == 'FloatStorage':
# print(module, name)
if name == "FloatStorage":
return np.float32
if name == 'LongStorage':
if name == "LongStorage":
return np.int64
if module == "torch._utils" or module == "torch":
return HackTensor
@@ -51,7 +52,7 @@ def fake_torch_load(b0):
# create key_lookup
key_lookup = pickle.load(fb0)
key_real = [None] * len(key_lookup)
for k,v in key_prelookup.items():
for k, v in key_prelookup.items():
key_real[key_lookup.index(k)] = v
# read in the actual data
@@ -64,13 +65,23 @@ def fake_torch_load(b0):
np_array.shape = np_shape
# numpy stores its strides in bytes
real_strides = tuple([x*bytes_size for x in np_strides])
real_strides = tuple([x * bytes_size for x in np_strides])
np_array.strides = real_strides
return ret
class MBConvBlock:
def __init__(self, kernel_size, strides, expand_ratio, input_filters, output_filters, se_ratio, has_se):
def __init__(
self,
kernel_size,
strides,
expand_ratio,
input_filters,
output_filters,
se_ratio,
has_se,
):
oup = expand_ratio * input_filters
if expand_ratio != 1:
self._expand_conv = Tensor.uniform(oup, input_filters, 1, 1)
@@ -79,10 +90,10 @@ class MBConvBlock:
self._expand_conv = None
self.strides = strides
if strides == (2,2):
self.pad = [(kernel_size-1)//2-1, (kernel_size-1)//2]*2
if strides == (2, 2):
self.pad = [(kernel_size - 1) // 2 - 1, (kernel_size - 1) // 2] * 2
else:
self.pad = [(kernel_size-1)//2]*4
self.pad = [(kernel_size - 1) // 2] * 4
self._depthwise_conv = Tensor.uniform(oup, 1, kernel_size, kernel_size)
self._bn1 = BatchNorm2D(oup)
@@ -103,14 +114,22 @@ class MBConvBlock:
if self._expand_conv:
x = self._bn0(x.conv2d(self._expand_conv)).swish()
x = x.pad2d(padding=self.pad)
x = x.conv2d(self._depthwise_conv, stride=self.strides, groups=self._depthwise_conv.shape[0])
x = x.conv2d(
self._depthwise_conv, stride=self.strides, groups=self._depthwise_conv.shape[0]
)
x = self._bn1(x).swish()
# has_se
if self.has_se:
x_squeezed = x.avg_pool2d(kernel_size=x.shape[2:4])
x_squeezed = x_squeezed.conv2d(self._se_reduce).add(self._se_reduce_bias.reshape(shape=[1, -1, 1, 1])).swish()
x_squeezed = x_squeezed.conv2d(self._se_expand).add(self._se_expand_bias.reshape(shape=[1, -1, 1, 1]))
x_squeezed = (
x_squeezed.conv2d(self._se_reduce)
.add(self._se_reduce_bias.reshape(shape=[1, -1, 1, 1]))
.swish()
)
x_squeezed = x_squeezed.conv2d(self._se_expand).add(
self._se_expand_bias.reshape(shape=[1, -1, 1, 1])
)
x = x.mul(x_squeezed.sigmoid())
x = self._bn2(x.conv2d(self._project_conv))
@@ -118,21 +137,22 @@ class MBConvBlock:
x = x.add(inputs)
return x
class EfficientNet:
def __init__(self, number=0, classes=1000, has_se=True):
self.number = number
global_params = [
# width, depth
(1.0, 1.0), # b0
(1.0, 1.1), # b1
(1.1, 1.2), # b2
(1.2, 1.4), # b3
(1.4, 1.8), # b4
(1.6, 2.2), # b5
(1.8, 2.6), # b6
(2.0, 3.1), # b7
(2.2, 3.6), # b8
(4.3, 5.3), # l2
(1.0, 1.0), # b0
(1.0, 1.1), # b1
(1.1, 1.2), # b2
(1.2, 1.4), # b3
(1.4, 1.8), # b4
(1.6, 2.2), # b5
(1.8, 2.6), # b6
(2.0, 3.1), # b7
(2.2, 3.6), # b8
(4.3, 5.3), # l2
][number]
def round_filters(filters):
@@ -140,7 +160,7 @@ class EfficientNet:
divisor = 8
filters *= multiplier
new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
new_filters += divisor
return int(new_filters)
@@ -151,13 +171,13 @@ class EfficientNet:
self._conv_stem = Tensor.uniform(out_channels, 3, 3, 3)
self._bn0 = BatchNorm2D(out_channels)
blocks_args = [
[1, 3, (1,1), 1, 32, 16, 0.25],
[2, 3, (2,2), 6, 16, 24, 0.25],
[2, 5, (2,2), 6, 24, 40, 0.25],
[3, 3, (2,2), 6, 40, 80, 0.25],
[3, 5, (1,1), 6, 80, 112, 0.25],
[4, 5, (2,2), 6, 112, 192, 0.25],
[1, 3, (1,1), 6, 192, 320, 0.25],
[1, 3, (1, 1), 1, 32, 16, 0.25],
[2, 3, (2, 2), 6, 16, 24, 0.25],
[2, 5, (2, 2), 6, 24, 40, 0.25],
[3, 3, (2, 2), 6, 40, 80, 0.25],
[3, 5, (1, 1), 6, 80, 112, 0.25],
[4, 5, (2, 2), 6, 112, 192, 0.25],
[1, 3, (1, 1), 6, 192, 320, 0.25],
]
self._blocks = []
# num_repeats, kernel_size, strides, expand_ratio, input_filters, output_filters, se_ratio
@@ -168,7 +188,7 @@ class EfficientNet:
for n in range(round_repeats(b[0])):
self._blocks.append(MBConvBlock(*args, has_se=has_se))
args[3] = args[4]
args[1] = (1,1)
args[1] = (1, 1)
in_channels = round_filters(320)
out_channels = round_filters(1280)
@@ -178,43 +198,52 @@ class EfficientNet:
self._fc_bias = Tensor.zeros(classes)
def forward(self, x):
x = x.pad2d(padding=(0,1,0,1))
x = x.pad2d(padding=(0, 1, 0, 1))
x = self._bn0(x.conv2d(self._conv_stem, stride=2)).swish()
#print(x.shape, x.data[:, 0, 0, 0])
# print(x.shape, x.data[:, 0, 0, 0])
for block in self._blocks:
x = block(x)
x = self._bn1(x.conv2d(self._conv_head)).swish()
x = x.avg_pool2d(kernel_size=x.shape[2:4])
x = x.reshape(shape=(-1, x.shape[1]))
#x = x.dropout(0.2)
return x.dot(self._fc).add(self._fc_bias.reshape(shape=[1,-1]))
# x = x.dropout(0.2)
return x.dot(self._fc).add(self._fc_bias.reshape(shape=[1, -1]))
def load_weights_from_torch(self):
# load b0
# https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/utils.py#L551
if self.number == 0:
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth")
b0 = fetch(
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth"
)
elif self.number == 2:
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth")
b0 = fetch(
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth"
)
elif self.number == 4:
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth")
b0 = fetch(
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth"
)
elif self.number == 7:
b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth")
b0 = fetch(
"https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth"
)
else:
raise Exception("no pretrained weights")
if USE_TORCH:
import io
import torch
b0 = torch.load(io.BytesIO(b0))
else:
b0 = fake_torch_load(b0)
for k,v in b0.items():
if '_blocks.' in k:
for k, v in b0.items():
if "_blocks." in k:
k = "%s[%s].%s" % tuple(k.split(".", 2))
mk = "self."+k
#print(k, v.shape)
mk = "self." + k
# print(k, v.shape)
try:
mv = eval(mk)
except AttributeError:
@@ -223,10 +252,9 @@ class EfficientNet:
except AttributeError:
mv = eval(mk.replace(".bias", "_bias"))
vnp = v.numpy().astype(np.float32) if USE_TORCH else v
vnp = vnp if k != '_fc.weight' else vnp.T
vnp = vnp if k != "_fc.weight" else vnp.T
if mv.shape == vnp.shape or vnp.shape == ():
mv.data[:] = vnp
else:
print("MISMATCH SHAPE IN %s, %r %r" % (k, mv.shape, vnp.shape))

View File

@@ -1,17 +1,19 @@
import numpy as np
from tinygrad.tensor import Tensor
def mask_like(like, mask_inx, mask_value = 1.0):
def mask_like(like, mask_inx, mask_value=1.0):
mask = np.zeros_like(like).reshape(-1)
mask[mask_inx] = mask_value
return mask.reshape(like.shape)
def jacobian(func, input):
output = func(input)
ji = input.data.reshape(-1).shape[-1]
jo = output.data.reshape(-1).shape[-1]
J = np.zeros((jo,ji), dtype=np.float32)
J = np.zeros((jo, ji), dtype=np.float32)
for o in range(jo):
input.grad = None
@@ -19,14 +21,15 @@ def jacobian(func, input):
# tinygrad doesn't support slicing, tiny-hack to select
# the needed scalar and backpropagate only through it
o_scalar = Tensor(mask_like(output.data, o, 1.)).mul(output).sum()
o_scalar = Tensor(mask_like(output.data, o, 1.0)).mul(output).sum()
o_scalar.backward()
for i, grad in enumerate(input.grad.data.reshape(-1)):
J[o,i] = grad
J[o, i] = grad
return J
def numerical_jacobian(func, input, eps = 1e-6):
def numerical_jacobian(func, input, eps=1e-6):
output = func(input)
ji = input.data.reshape(-1).shape[-1]
@@ -34,17 +37,18 @@ def numerical_jacobian(func, input, eps = 1e-6):
NJ = np.zeros((jo, ji), dtype=np.float32)
for i in range(ji):
eps_perturb = mask_like(input.data, i, mask_value = eps)
eps_perturb = mask_like(input.data, i, mask_value=eps)
output_perturb_add = func(Tensor(input.data + eps_perturb)).data.reshape(-1)
output_perturb_sub = func(Tensor(input.data - eps_perturb)).data.reshape(-1)
grad_approx = ((output_perturb_add) - (output_perturb_sub)) / (2*eps)
grad_approx = ((output_perturb_add) - (output_perturb_sub)) / (2 * eps)
NJ[:,i] = grad_approx
NJ[:, i] = grad_approx
return NJ
def gradcheck(func, input, eps = 1e-06, atol = 1e-5, rtol = 0.001):
def gradcheck(func, input, eps=1e-06, atol=1e-5, rtol=0.001):
NJ = numerical_jacobian(func, input, eps)
J = jacobian(func, input)
return np.allclose(J, NJ, atol=atol, rtol=rtol)
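A usage sketch for gradcheck, assuming the tinygrad Tensor ops exercised elsewhere in this commit (dot, logsoftmax):

import numpy as np
from tinygrad.tensor import Tensor

np.random.seed(1337)
W = Tensor(np.random.randn(5, 3).astype(np.float32))
x = Tensor(np.random.randn(1, 5).astype(np.float32))
# True when the analytic and numerical jacobians agree within atol/rtol
print(gradcheck(lambda t: t.dot(W).logsoftmax(), x))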

View File

@@ -4,26 +4,40 @@ from tqdm import trange
from extra.utils import get_parameters
from tinygrad.tensor import Tensor, GPU, Device
def train(model, X_train, Y_train, optim, steps, num_classes=None, BS=128, device=Device.CPU, lossfn = lambda out,y: out.mul(y).mean()):
if device == Device.GPU: [x.gpu_() for x in get_parameters([model, optim])]
elif device == Device.ANE: [x.ane_() for x in get_parameters([model, optim])]
if num_classes is None: num_classes = Y_train.max().astype(int)+1
def train(
model,
X_train,
Y_train,
optim,
steps,
num_classes=None,
BS=128,
device=Device.CPU,
lossfn=lambda out, y: out.mul(y).mean(),
):
if device == Device.GPU:
[x.gpu_() for x in get_parameters([model, optim])]
elif device == Device.ANE:
[x.ane_() for x in get_parameters([model, optim])]
if num_classes is None:
num_classes = Y_train.max().astype(int) + 1
losses, accuracies = [], []
for i in (t := trange(steps, disable=os.getenv('CI') is not None)):
for i in (t := trange(steps, disable=os.getenv("CI") is not None)):
samp = np.random.randint(0, X_train.shape[0], size=(BS))
x = Tensor(X_train[samp].reshape((-1, 28*28)).astype(np.float32), device=device)
x = Tensor(X_train[samp].reshape((-1, 28 * 28)).astype(np.float32), device=device)
Y = Y_train[samp]
y = np.zeros((len(samp),num_classes), np.float32)
y = np.zeros((len(samp), num_classes), np.float32)
# correct loss for NLL, torch NLL loss returns one per row
y[range(y.shape[0]),Y] = -1.0*num_classes
y[range(y.shape[0]), Y] = -1.0 * num_classes
y = Tensor(y, device=device)
# network
out = model.forward(x)
# NLL loss function
loss = lossfn(out, y)
loss = lossfn(out, y)
optim.zero_grad()
loss.backward()
optim.step()
@@ -37,15 +51,26 @@ def train(model, X_train, Y_train, optim, steps, num_classes=None, BS=128, devic
accuracies.append(accuracy)
t.set_description("loss %.2f accuracy %.2f" % (loss, accuracy))
def evaluate(model, X_test, Y_test, num_classes=None, device=Device.CPU, BS=128):
def numpy_eval(num_classes):
Y_test_preds_out = np.zeros((len(Y_test),num_classes))
for i in trange(len(Y_test)//BS, disable=os.getenv('CI') is not None):
Y_test_preds_out[i*BS:(i+1)*BS] = model.forward(Tensor(X_test[i*BS:(i+1)*BS].reshape((-1, 28*28)).astype(np.float32), device=device)).cpu().data
Y_test_preds_out = np.zeros((len(Y_test), num_classes))
for i in trange(len(Y_test) // BS, disable=os.getenv("CI") is not None):
Y_test_preds_out[i * BS : (i + 1) * BS] = (
model.forward(
Tensor(
X_test[i * BS : (i + 1) * BS].reshape((-1, 28 * 28)).astype(np.float32),
device=device,
)
)
.cpu()
.data
)
Y_test_preds = np.argmax(Y_test_preds_out, axis=1)
return (Y_test == Y_test_preds).mean()
if num_classes is None: num_classes = Y_test.max().astype(int)+1
if num_classes is None:
num_classes = Y_test.max().astype(int) + 1
accuracy = numpy_eval(num_classes)
print("test set accuracy is %f" % accuracy)
return accuracy
return accuracy
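A sketch tying train and evaluate together, mirroring how test_mnist drives them (TinyBobNet and the MNIST arrays come from that test module):

import tinygrad.optim as optim

model = TinyBobNet()
optimizer = optim.RMSprop(model.parameters(), lr=0.0002)
train(model, X_train, Y_train, optimizer, steps=1000)
evaluate(model, X_test, Y_test)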

View File

@@ -1,19 +1,22 @@
from tinygrad.tensor import Tensor
def fetch(url):
import requests, os, hashlib, tempfile
fp = os.path.join(tempfile.gettempdir(), hashlib.md5(url.encode('utf-8')).hexdigest())
fp = os.path.join(tempfile.gettempdir(), hashlib.md5(url.encode("utf-8")).hexdigest())
if os.path.isfile(fp) and os.stat(fp).st_size > 0:
with open(fp, "rb") as f:
dat = f.read()
else:
print("fetching %s" % url)
dat = requests.get(url).content
with open(fp+".tmp", "wb") as f:
with open(fp + ".tmp", "wb") as f:
f.write(dat)
os.rename(fp+".tmp", fp)
os.rename(fp + ".tmp", fp)
return dat
def get_parameters(obj):
parameters = []
if isinstance(obj, Tensor):
@@ -21,8 +24,7 @@ def get_parameters(obj):
elif isinstance(obj, list):
for x in obj:
parameters.extend(get_parameters(x))
elif hasattr(obj, '__dict__'):
for k,v in obj.__dict__.items():
elif hasattr(obj, "__dict__"):
for k, v in obj.__dict__.items():
parameters.extend(get_parameters(v))
return parameters
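fetch keys its on-disk cache on the md5 of the URL under tempfile.gettempdir(), writing to a .tmp file and renaming only on success so an interrupted download never leaves a truncated cache entry. A sketch (URL hypothetical):

dat = fetch("https://example.com/weights.bin")  # first call downloads and caches
dat = fetch("https://example.com/weights.bin")  # second call reads the cached copy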

View File

@@ -4,29 +4,31 @@ import os
from setuptools import setup
directory = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(directory, 'README.md'), encoding='utf-8') as f:
with open(os.path.join(directory, "README.md"), encoding="utf-8") as f:
long_description = f.read()
setup(name='tinygrad',
version='0.3.0',
description='You like pytorch? You like micrograd? You love tinygrad! ❤️',
author='George Hotz',
license='MIT',
long_description=long_description,
long_description_content_type='text/markdown',
packages = ['tinygrad'],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License"
],
install_requires=['numpy', 'requests'],
python_requires='>=3.8',
extras_require={
'gpu': ["pyopencl", "six"],
'testing': [
"pytest",
"torch",
"tqdm",
],
},
include_package_data=True)
setup(
name="tinygrad",
version="0.3.0",
description="You like pytorch? You like micrograd? You love tinygrad! heart",
author="George Hotz",
license="MIT",
long_description=long_description,
long_description_content_type="text/markdown",
packages=["tinygrad"],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
],
install_requires=["numpy", "requests"],
python_requires=">=3.8",
extras_require={
"gpu": ["pyopencl", "six"],
"testing": [
"pytest",
"torch",
"tqdm",
],
},
include_package_data=True,
)

View File

@@ -1,3 +1,3 @@
import os
ANE = os.environ.get('ANE', False)
ANE = os.environ.get("ANE", False)

View File

@@ -4,43 +4,48 @@ import unittest
from tinygrad.tensor import Tensor, GPU, Device
from .config import ANE
def tensors_allocated():
return sum([isinstance(x, Tensor) for x in gc.get_objects()])
class TestGC(unittest.TestCase):
device = Device.CPU
def test_gc(self):
a = Tensor.zeros(4,4, device=self.device)
b = Tensor.zeros(4,4, device=self.device)
(a*b).mean().backward()
assert(tensors_allocated() > 0)
del a,b
assert(tensors_allocated() == 0)
a = Tensor.zeros(4, 4, device=self.device)
b = Tensor.zeros(4, 4, device=self.device)
(a * b).mean().backward()
assert tensors_allocated() > 0
del a, b
assert tensors_allocated() == 0
def test_gc_complex(self):
a = Tensor.zeros(4,4, device=self.device)
b = Tensor.zeros(4,4, device=self.device)
assert(tensors_allocated() == 2)
(a*b).mean().backward()
assert(tensors_allocated() == 4)
a = Tensor.zeros(4, 4, device=self.device)
b = Tensor.zeros(4, 4, device=self.device)
assert tensors_allocated() == 2
(a * b).mean().backward()
assert tensors_allocated() == 4
del b
assert(tensors_allocated() == 2)
b = Tensor.zeros(4,4, device=self.device)
assert tensors_allocated() == 2
b = Tensor.zeros(4, 4, device=self.device)
print(tensors_allocated())
(a*b).mean().backward()
(a * b).mean().backward()
print(tensors_allocated())
assert(tensors_allocated() == 4)
assert tensors_allocated() == 4
del b
assert(tensors_allocated() == 2)
assert tensors_allocated() == 2
@unittest.skipUnless(GPU, "Requires GPU")
class TestGCGPU(TestGC):
device = Device.GPU
device = Device.GPU
@unittest.skipUnless(ANE, "Requires ANE")
class TestGCANE(TestGC):
device=Device.ANE
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -11,19 +11,28 @@ from .config import ANE
# mnist loader
def fetch_mnist():
import gzip
parse = lambda dat: np.frombuffer(gzip.decompress(dat), dtype=np.uint8).copy()
X_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))[0x10:].reshape((-1, 28, 28))
Y_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))[8:]
X_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"))[0x10:].reshape((-1, 28, 28))
Y_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"))[8:]
X_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))[
0x10:
].reshape((-1, 28, 28))
Y_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))[
8:
]
X_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"))[
0x10:
].reshape((-1, 28, 28))
Y_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"))[
8:
]
return X_train, Y_train, X_test, Y_test
# load the mnist dataset
X_train, Y_train, X_test, Y_test = fetch_mnist()
# create a model
class TinyBobNet:
def __init__(self):
self.l1 = Tensor.uniform(784, 128)
self.l2 = Tensor.uniform(128, 10)
@@ -34,27 +43,29 @@ class TinyBobNet:
def forward(self, x):
return x.dot(self.l1).relu().dot(self.l2).logsoftmax()
# create a model with a conv layer
class TinyConvNet:
def __init__(self):
# https://keras.io/examples/vision/mnist_convnet/
conv = 3
#inter_chan, out_chan = 32, 64
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.uniform(inter_chan,1,conv,conv)
self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
self.l1 = Tensor.uniform(out_chan*5*5, 10)
# inter_chan, out_chan = 32, 64
inter_chan, out_chan = 8, 16 # for speed
self.c1 = Tensor.uniform(inter_chan, 1, conv, conv)
self.c2 = Tensor.uniform(out_chan, inter_chan, conv, conv)
self.l1 = Tensor.uniform(out_chan * 5 * 5, 10)
def parameters(self):
return get_parameters(self)
def forward(self, x):
x = x.reshape(shape=(-1, 1, 28, 28)) # hacks
x = x.reshape(shape=(-1, 1, 28, 28)) # hacks
x = x.conv2d(self.c1).relu().max_pool2d()
x = x.conv2d(self.c2).relu().max_pool2d()
x = x.reshape(shape=[x.shape[0], -1])
return x.dot(self.l1).logsoftmax()
class TestMNIST(unittest.TestCase):
device = Device.CPU
@@ -76,16 +87,19 @@ class TestMNIST(unittest.TestCase):
np.random.seed(1337)
model = TinyBobNet()
optimizer = optim.RMSprop(model.parameters(), lr=0.0002)
train(model, X_train, Y_train, optimizer, steps=1000, device=self.device)
train(model, X_train, Y_train, optimizer, steps=1000, device=self.device)
assert evaluate(model, X_test, Y_test, device=self.device) > 0.95
@unittest.skipUnless(GPU, "Requires GPU")
class TestMNISTGPU(TestMNIST):
device = Device.GPU
@unittest.skipUnless(ANE, "Requires ANE")
class TestMNISTANE(TestMNIST):
device=Device.ANE
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -7,21 +7,25 @@ import torch
from tinygrad.tensor import Tensor, GPU, Device
from .config import ANE
def start_profile():
import time
pr = cProfile.Profile(timer=lambda: int(time.time()*1e9), timeunit=1e-6)
pr = cProfile.Profile(timer=lambda: int(time.time() * 1e9), timeunit=1e-6)
pr.enable()
return pr
def stop_profile(pr, sort='cumtime'):
def stop_profile(pr, sort="cumtime"):
pr.disable()
ps = pstats.Stats(pr)
ps.strip_dirs()
ps.sort_stats(sort)
ps.print_stats(0.2)
class TestConvSpeed(unittest.TestCase):
device= Device.CPU
device = Device.CPU
def test_mnist(self):
# https://keras.io/examples/vision/mnist_convnet/
@@ -34,12 +38,12 @@ class TestConvSpeed(unittest.TestCase):
conv = 3
inter_chan, out_chan = 32, 64
c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)
c1 = torch.randn(inter_chan, 1, conv, conv, requires_grad=True)
c2 = torch.randn(out_chan, inter_chan, conv, conv, requires_grad=True)
l1 = torch.randn(out_chan * 5 * 5, 10, requires_grad=True)
c2d = torch.nn.functional.conv2d
mp = torch.nn.MaxPool2d((2,2))
mp = torch.nn.MaxPool2d((2, 2))
lsm = torch.nn.LogSoftmax(dim=1)
cnt = 5
@@ -47,19 +51,19 @@ class TestConvSpeed(unittest.TestCase):
for i in range(cnt):
et0 = time.time()
x = torch.randn(128, 1, 28, 28, requires_grad=True)
x = mp(c2d(x,c1).relu())
x = mp(c2d(x,c2).relu())
x = mp(c2d(x, c1).relu())
x = mp(c2d(x, c2).relu())
x = x.reshape(x.shape[0], -1)
out = lsm(x.matmul(l1))
out = out.mean()
et1 = time.time()
out.backward()
et2 = time.time()
fpt += (et1-et0)
bpt += (et2-et1)
fpt += et1 - et0
bpt += et2 - et1
fpt_baseline = (fpt*1000/cnt)
bpt_baseline = (bpt*1000/cnt)
fpt_baseline = fpt * 1000 / cnt
bpt_baseline = bpt * 1000 / cnt
print("torch forward pass: %.3f ms" % fpt_baseline)
print("torch backward pass: %.3f ms" % bpt_baseline)
@@ -71,7 +75,7 @@ class TestConvSpeed(unittest.TestCase):
cnt = 5
fpt, bpt = 0.0, 0.0
for i in range(1+cnt):
for i in range(1 + cnt):
et0 = time.time()
x = Tensor.randn(128, 1, 28, 28, device=self.device)
x = x.conv2d(c1).relu().avg_pool2d()
@@ -85,24 +89,31 @@ class TestConvSpeed(unittest.TestCase):
if i == 0:
pr = start_profile()
else:
fpt += (et1-et0)
bpt += (et2-et1)
fpt += et1 - et0
bpt += et2 - et1
stop_profile(pr, sort="time")
fpt = fpt * 1000 / cnt
bpt = bpt * 1000 / cnt
print(
"forward pass: %.3f ms, %.2fx off baseline %.3f ms"
% (fpt, fpt / fpt_baseline, fpt_baseline)
)
print(
"backward pass: %.3f ms, %.2fx off baseline %.3f ms"
% (bpt, bpt / bpt_baseline, bpt_baseline)
)
stop_profile(pr, sort='time')
fpt = (fpt*1000/cnt)
bpt = (bpt*1000/cnt)
print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))
@unittest.skipUnless(GPU, "Requires GPU")
class TestConvSpeedGPU(TestConvSpeed):
device = Device.GPU
@unittest.skipUnless(ANE, "Requires ANE")
class TestConvSpeedANE(TestConvSpeed):
device=Device.ANE
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
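start_profile/stop_profile wrap cProfile with an integer nanosecond timer; print_stats(0.2) keeps only the top 20% of rows after sorting. A usage sketch:

pr = start_profile()
# ... workload to be profiled goes here ...
stop_profile(pr, sort="time")  # sort by internal time, print the top 20% of rows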

View File

@@ -7,6 +7,7 @@ from extra.utils import get_parameters
import torch
from .config import ANE
class TestNN(unittest.TestCase):
device = Device.CPU
@@ -30,8 +31,12 @@ class TestNN(unittest.TestCase):
tbn.running_mean[:] = torch.tensor(bn.running_mean.data)
tbn.running_var[:] = torch.tensor(bn.running_var.data)
np.testing.assert_allclose(bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5)
np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
np.testing.assert_allclose(
bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5
)
np.testing.assert_allclose(
bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5
)
# trial
inn = Tensor.randn(2, sz, 3, 3, device=self.device)
@@ -45,35 +50,42 @@ class TestNN(unittest.TestCase):
# close
np.testing.assert_allclose(outt.data, toutt.detach().numpy(), rtol=5e-5)
np.testing.assert_allclose(bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5)
np.testing.assert_allclose(
bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5
)
# TODO: this is failing
#np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
# np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
def test_batchnorm2d_training(self):
self.test_batchnorm2d(True)
@unittest.skipUnless(GPU, "Requires GPU")
class TestNNGPU(TestNN):
device = Device.GPU
@unittest.skip("Tests not added")
def test_batchnorm2d(self): pass
def test_batchnorm2d(self):
pass
@unittest.skip("Tests not added")
def test_batchnorm2d_training(self): pass
def test_batchnorm2d_training(self):
pass
@unittest.skipUnless(ANE, "Requires ANE")
class TestNNANE(TestNN):
device=Device.ANE
device = Device.ANE
@unittest.skip("Tests not added")
def test_batchnorm2d(self): pass
def test_batchnorm2d(self):
pass
@unittest.skip("Tests not added")
def test_batchnorm2d_training(self): pass
def test_batchnorm2d_training(self):
pass
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -7,13 +7,24 @@ import functools
from tinygrad.tensor import Tensor, GPU, Device
from .config import ANE
def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=0, rtol=1e-6, grad_atol=0, grad_rtol=1e-6, device=Device.CPU, forward_only=False):
def helper_test_op(
shps,
torch_fxn,
tinygrad_fxn,
atol=0,
rtol=1e-6,
grad_atol=0,
grad_rtol=1e-6,
device=Device.CPU,
forward_only=False,
):
torch.manual_seed(0)
ts = [torch.rand(x, requires_grad=True) for x in shps]
tst = [Tensor(x.detach().numpy()) for x in ts]
if device==Device.GPU:
if device == Device.GPU:
tst = [x.gpu() for x in tst]
elif device==Device.ANE:
elif device == Device.ANE:
tst = [x.ane() for x in tst]
out = torch_fxn(*ts)
@@ -26,142 +37,291 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=0, rtol=1e-6, grad_atol=0
ret.mean().backward()
for t, tt in zip(ts, tst):
np.testing.assert_allclose(t.grad, tt.cpu().grad.data, atol=grad_atol, rtol=grad_rtol)
np.testing.assert_allclose(
t.grad, tt.cpu().grad.data, atol=grad_atol, rtol=grad_rtol
)
# speed
torch_fp = timeit.Timer(functools.partial(torch_fxn, *ts)).timeit(5) * 1000/5
tinygrad_fp = timeit.Timer(functools.partial(tinygrad_fxn, *tst)).timeit(5) * 1000/5
torch_fp = timeit.Timer(functools.partial(torch_fxn, *ts)).timeit(5) * 1000 / 5
tinygrad_fp = timeit.Timer(functools.partial(tinygrad_fxn, *tst)).timeit(5) * 1000 / 5
if not forward_only:
torch_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), torch_fxn, ts)).timeit(5) * 1000/5
tinygrad_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), tinygrad_fxn, tst)).timeit(5) * 1000/5
torch_fbp = (
timeit.Timer(
functools.partial(lambda f, x: f(*x).mean().backward(), torch_fxn, ts)
).timeit(5)
* 1000
/ 5
)
tinygrad_fbp = (
timeit.Timer(
functools.partial(lambda f, x: f(*x).mean().backward(), tinygrad_fxn, tst)
).timeit(5)
* 1000
/ 5
)
else:
torch_fbp, tinygrad_fbp = np.nan, np.nan
print("testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms" % (shps, torch_fp, tinygrad_fp, torch_fbp-torch_fp, tinygrad_fbp-tinygrad_fp))
print(
"testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms"
% (shps, torch_fp, tinygrad_fp, torch_fbp - torch_fp, tinygrad_fbp - tinygrad_fp)
)
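
Aside: what helper_test_op verifies, as a minimal standalone sketch for one op — assuming tinygrad's Tensor exposes .relu(), .mean(), .backward(), .cpu(), and .data the way the tests above use them:

import numpy as np
import torch
from tinygrad.tensor import Tensor

# run the same op through torch and tinygrad, then compare outputs and gradients
torch.manual_seed(0)
t = torch.rand(4, 4, requires_grad=True)
tt = Tensor(t.detach().numpy())
out, tout = t.relu(), tt.relu()
np.testing.assert_allclose(out.detach().numpy(), tout.cpu().data, rtol=1e-6)
out.mean().backward()
tout.mean().backward()
np.testing.assert_allclose(t.grad, tt.cpu().grad.data, rtol=1e-6)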
class TestOps(unittest.TestCase):
device=Device.CPU
device = Device.CPU
def test_add(self):
helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add, device=self.device)
helper_test_op(
[(45, 65), (45, 65)], lambda x, y: x + y, Tensor.add, device=self.device
)
def test_sub(self):
helper_test_op([(45,65), (45,65)], lambda x,y: x-y, Tensor.sub, device=self.device)
helper_test_op(
[(45, 65), (45, 65)], lambda x, y: x - y, Tensor.sub, device=self.device
)
def test_mul(self):
helper_test_op([(45,65), (45,65)], lambda x,y: x*y, Tensor.mul, device=self.device)
helper_test_op(
[(45, 65), (45, 65)], lambda x, y: x * y, Tensor.mul, device=self.device
)
def test_div(self):
helper_test_op([(45,65), (45,65)], lambda x,y: x/y, Tensor.div, device=self.device)
helper_test_op(
[(45, 65), (45, 65)], lambda x, y: x / y, Tensor.div, device=self.device
)
def test_pow(self):
helper_test_op([(45,65), (45,65)], lambda x,y: x**y, Tensor.pow, device=self.device)
helper_test_op(
[(45, 65), (45, 65)], lambda x, y: x ** y, Tensor.pow, device=self.device
)
def test_sqrt(self):
helper_test_op([(45,65)], lambda x: x.sqrt(), Tensor.sqrt, device=self.device)
helper_test_op([(45, 65)], lambda x: x.sqrt(), Tensor.sqrt, device=self.device)
def test_relu(self):
helper_test_op([(45,65)], lambda x: x.relu(), Tensor.relu, device=self.device)
helper_test_op([(45, 65)], lambda x: x.relu(), Tensor.relu, device=self.device)
def test_leakyrelu(self):
helper_test_op([(45,65)], lambda x: torch.nn.functional.leaky_relu(x,0.01), Tensor.leakyrelu, device=self.device)
helper_test_op(
[(45, 65)],
lambda x: torch.nn.functional.leaky_relu(x, 0.01),
Tensor.leakyrelu,
device=self.device,
)
def test_abs(self):
helper_test_op([(45,65)], lambda x: torch.abs(x), Tensor.abs, device=self.device)
helper_test_op([(45, 65)], lambda x: torch.abs(x), Tensor.abs, device=self.device)
def test_sigmoid(self):
helper_test_op([(45,65)], lambda x: x.sigmoid(), Tensor.sigmoid, device=self.device)
helper_test_op(
[(45, 65)], lambda x: x.sigmoid(), Tensor.sigmoid, device=self.device
)
def test_dot(self):
helper_test_op([(45,65), (65,100)], lambda x,y: x.matmul(y), Tensor.dot, device=self.device)
helper_test_op(
[(45, 65), (65, 100)], lambda x, y: x.matmul(y), Tensor.dot, device=self.device
)
def test_sum(self):
helper_test_op([(45,3)], lambda x: x.sum(), Tensor.sum, device=self.device)
helper_test_op([(45, 3)], lambda x: x.sum(), Tensor.sum, device=self.device)
def test_sum_axis(self):
helper_test_op([(3,4,5,6)], lambda x: x.sum(axis=(1,2)), lambda x: Tensor.sum(x, axis=(1,2)), device=self.device)
helper_test_op(
[(3, 4, 5, 6)],
lambda x: x.sum(axis=(1, 2)),
lambda x: Tensor.sum(x, axis=(1, 2)),
device=self.device,
)
def test_mean_axis(self):
helper_test_op([(3,4,5,6)], lambda x: x.mean(axis=(1,2)), lambda x: Tensor.mean(x, axis=(1,2)), device=self.device)
helper_test_op(
[(3, 4, 5, 6)],
lambda x: x.mean(axis=(1, 2)),
lambda x: Tensor.mean(x, axis=(1, 2)),
device=self.device,
)
def test_logsoftmax(self):
helper_test_op([(45,65)], lambda x: torch.nn.LogSoftmax(dim=1)(x), Tensor.logsoftmax, atol=1e-7, grad_atol=1e-7, device=self.device)
helper_test_op(
[(45, 65)],
lambda x: torch.nn.LogSoftmax(dim=1)(x),
Tensor.logsoftmax,
atol=1e-7,
grad_atol=1e-7,
device=self.device,
)
def test_tanh(self):
helper_test_op([(45,65)], lambda x: x.tanh(), Tensor.tanh, atol=1e-6, grad_atol=1e-6, device=self.device)
helper_test_op(
[(45, 65)],
lambda x: x.tanh(),
Tensor.tanh,
atol=1e-6,
grad_atol=1e-6,
device=self.device,
)
def test_topo_sort(self):
helper_test_op([(45,65)], lambda x: (x+x)*x, lambda x: x.add(x).mul(x), atol=1e-6, grad_atol=1e-6, device=self.device)
helper_test_op(
[(45, 65)],
lambda x: (x + x) * x,
lambda x: x.add(x).mul(x),
atol=1e-6,
grad_atol=1e-6,
device=self.device,
)
def test_scalar_mul(self):
helper_test_op([(45,65)], lambda x: x*2, lambda x: x*2, device=self.device)
helper_test_op([(45, 65)], lambda x: x * 2, lambda x: x * 2, device=self.device)
def test_scalar_rmul(self):
helper_test_op([(45,65)], lambda x: 2*x, lambda x: 2*x, device=self.device)
helper_test_op([(45, 65)], lambda x: 2 * x, lambda x: 2 * x, device=self.device)
def test_scalar_sub(self):
helper_test_op([(45,65)], lambda x: x-2, lambda x: x-2, device=self.device)
helper_test_op([(45, 65)], lambda x: x - 2, lambda x: x - 2, device=self.device)
def test_scalar_rsub(self):
helper_test_op([(45,65)], lambda x: 2-x, lambda x: 2-x, device=self.device)
helper_test_op([(45, 65)], lambda x: 2 - x, lambda x: 2 - x, device=self.device)
def test_broadcast_full(self):
for torch_op, tinygrad_op in [(torch.add, Tensor.add), (torch.sub, Tensor.sub), (torch.mul, Tensor.mul),
(torch.div, Tensor.div), (torch.pow, Tensor.pow)]:
for shapes in [((5,13,24,16), (5,1,24,1)), ((1,3,1,7,1), (2,1,5,1,8))]:
for torch_op, tinygrad_op in [
(torch.add, Tensor.add),
(torch.sub, Tensor.sub),
(torch.mul, Tensor.mul),
(torch.div, Tensor.div),
(torch.pow, Tensor.pow),
]:
for shapes in [
((5, 13, 24, 16), (5, 1, 24, 1)),
((1, 3, 1, 7, 1), (2, 1, 5, 1, 8)),
]:
with self.subTest(op=torch_op.__name__, shapes=shapes):
helper_test_op(shapes, torch_op, tinygrad_op, device=self.device)
def test_broadcast_partial(self):
for torch_op, tinygrad_op in [(torch.add, Tensor.add), (torch.sub, Tensor.sub), (torch.mul, Tensor.mul),
(torch.div, Tensor.div), (torch.pow, Tensor.pow)]:
for shapes in [((1,32,32,32), (1,32,1,1)), ((5,13,24,16,2), (1,13,24,1,1)),
((4,1), (4,5)), ((1,4), (5,4))]:
for torch_op, tinygrad_op in [
(torch.add, Tensor.add),
(torch.sub, Tensor.sub),
(torch.mul, Tensor.mul),
(torch.div, Tensor.div),
(torch.pow, Tensor.pow),
]:
for shapes in [
((1, 32, 32, 32), (1, 32, 1, 1)),
((5, 13, 24, 16, 2), (1, 13, 24, 1, 1)),
((4, 1), (4, 5)),
((1, 4), (5, 4)),
]:
with self.subTest(op=torch_op.__name__, shapes=shapes):
# NOTE: backward pass questionable off-CPU (ANE?), so test forward only there
helper_test_op(shapes, torch_op, tinygrad_op, device=self.device, forward_only=self.device!=Device.CPU)
helper_test_op(
shapes,
torch_op,
tinygrad_op,
device=self.device,
forward_only=self.device != Device.CPU,
)
def test_pad2d(self):
helper_test_op([(3,3,3,3)], lambda x: torch.nn.functional.pad(x, (1,2,3,4)), lambda x: x.pad2d(padding=(1,2,3,4)), device=self.device)
helper_test_op(
[(3, 3, 3, 3)],
lambda x: torch.nn.functional.pad(x, (1, 2, 3, 4)),
lambda x: x.pad2d(padding=(1, 2, 3, 4)),
device=self.device,
)
def test_reshape(self):
helper_test_op([(4,3,6,6)], lambda x: torch.reshape(x, (-1,3,6,6)), lambda x: x.reshape(shape=(-1,3,6,6)), device=self.device)
helper_test_op([(4,3,6,6)], lambda x: torch.reshape(x, (-1,1,6,6)), lambda x: x.reshape(shape=(-1,1,6,6)), device=self.device)
helper_test_op(
[(4, 3, 6, 6)],
lambda x: torch.reshape(x, (-1, 3, 6, 6)),
lambda x: x.reshape(shape=(-1, 3, 6, 6)),
device=self.device,
)
helper_test_op(
[(4, 3, 6, 6)],
lambda x: torch.reshape(x, (-1, 1, 6, 6)),
lambda x: x.reshape(shape=(-1, 1, 6, 6)),
device=self.device,
)
def test_detach(self):
helper_test_op([(4,3,6,6)], lambda x: x.detach(), lambda x: x.detach(), device=self.device, forward_only=True)
helper_test_op(
[(4, 3, 6, 6)],
lambda x: x.detach(),
lambda x: x.detach(),
device=self.device,
forward_only=True,
)
def test_conv2d(self):
for bs in [1,8]:
for cin in [1,3]:
for groups in [1,3] if cin == 3 else [1]:
for H in [1,2,5]:
for W in [1,2,3,5]:
with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W):
helper_test_op([(bs,cin,11,28), (6,cin//groups,H,W)],
lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(),
lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), device=self.device, grad_rtol=1e-5)
for bs in [1, 8]:
for cin in [1, 3]:
for groups in [1, 3] if cin == 3 else [1]:
for H in [1, 2, 5]:
for W in [1, 2, 3, 5]:
with self.subTest(
batch_size=bs, channels=cin, groups=groups, height=H, width=W
):
helper_test_op(
[(bs, cin, 11, 28), (6, cin // groups, H, W)],
lambda x, w: torch.nn.functional.conv2d(x, w, groups=groups).relu(),
lambda x, w: Tensor.conv2d(x, w, groups=groups).relu(),
device=self.device,
grad_rtol=1e-5,
)
def test_strided_conv2d(self):
bs = 4
cin = 3
H,W = 3,3
H, W = 3, 3
with self.subTest(stride := 2):
helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
lambda x,w: torch.nn.functional.conv2d(x,w,stride=2).relu(),
lambda x,w: Tensor.conv2d(x,w,stride=stride).relu(), device=self.device)
with self.subTest(stride := (2,1)):
helper_test_op([(bs,cin,11,28), (4,cin,H,W)],
lambda x,w: torch.nn.functional.conv2d(x,w,stride=stride).relu(),
lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), device=self.device)
helper_test_op(
[(bs, cin, 11, 28), (4, cin, H, W)],
lambda x, w: torch.nn.functional.conv2d(x, w, stride=2).relu(),
lambda x, w: Tensor.conv2d(x, w, stride=stride).relu(),
device=self.device,
)
with self.subTest(stride := (2, 1)):
helper_test_op(
[(bs, cin, 11, 28), (4, cin, H, W)],
lambda x, w: torch.nn.functional.conv2d(x, w, stride=stride).relu(),
lambda x, w: Tensor.conv2d(x, w, stride=(2, 1)).relu(),
device=self.device,
)
def test_maxpool2d(self):
for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1)]:
for ksz in [(2, 2), (3, 3), (3, 2), (5, 5), (5, 1)]:
with self.subTest(kernel_size=ksz):
helper_test_op([(32,2,110,28)],
helper_test_op(
[(32, 2, 110, 28)],
lambda x: torch.nn.functional.max_pool2d(x, kernel_size=ksz),
lambda x: Tensor.max_pool2d(x, kernel_size=ksz), device=self.device)
lambda x: Tensor.max_pool2d(x, kernel_size=ksz),
device=self.device,
)
def test_avgpool2d(self):
shape = (32,2,111,28)
for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1), shape[2:]]:
shape = (32, 2, 111, 28)
for ksz in [(2, 2), (3, 3), (3, 2), (5, 5), (5, 1), shape[2:]]:
with self.subTest(kernel_size=ksz):
helper_test_op([shape],
helper_test_op(
[shape],
lambda x: torch.nn.functional.avg_pool2d(x, kernel_size=ksz),
lambda x: Tensor.avg_pool2d(x, kernel_size=ksz), device=self.device)
lambda x: Tensor.avg_pool2d(x, kernel_size=ksz),
device=self.device,
)
@unittest.skipUnless(GPU, "Requires GPU")
class TestOpsGPU(TestOps):
device=Device.GPU
device = Device.GPU
@unittest.skipUnless(ANE, "Requires ANE")
class TestOpsANE(TestOps):
device=Device.ANE
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main(verbosity=2)

View File

@@ -6,20 +6,24 @@ from tinygrad.optim import Adam, SGD, RMSprop
from extra.utils import get_parameters
from .config import ANE
x_init = np.random.randn(1,3).astype(np.float32)
W_init = np.random.randn(3,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)
x_init = np.random.randn(1, 3).astype(np.float32)
W_init = np.random.randn(3, 3).astype(np.float32)
m_init = np.random.randn(1, 3).astype(np.float32)
def step_tinygrad(optim, kwargs={}, device=Device.CPU):
net = TinyNet()
optim = optim([net.x, net.W], **kwargs)
if device==Device.GPU: [x.gpu_() for x in get_parameters([net, optim])]
elif device==Device.ANE: [x.ane_() for x in get_parameters([net, optim])]
if device == Device.GPU:
[x.gpu_() for x in get_parameters([net, optim])]
elif device == Device.ANE:
[x.ane_() for x in get_parameters([net, optim])]
out = net.forward()
out.backward()
optim.step()
return net.x.cpu().data, net.W.cpu().data
def step_pytorch(optim, kwargs={}):
net = TorchNet()
optim = optim([net.x, net.W], **kwargs)
@@ -29,7 +33,7 @@ def step_pytorch(optim, kwargs={}):
return net.x.detach().numpy(), net.W.detach().numpy()
class TinyNet():
class TinyNet:
def __init__(self):
self.x = Tensor(x_init.copy())
self.W = Tensor(W_init.copy())
@@ -42,7 +46,7 @@ class TinyNet():
return out
class TorchNet():
class TorchNet:
def __init__(self):
self.x = torch.tensor(x_init.copy(), requires_grad=True)
self.W = torch.tensor(W_init.copy(), requires_grad=True)
@@ -59,19 +63,23 @@ class TestOptim(unittest.TestCase):
device = Device.CPU
def test_adam(self):
for x,y in zip(step_tinygrad(Adam, device=self.device),
step_pytorch(torch.optim.Adam)):
for x, y in zip(
step_tinygrad(Adam, device=self.device), step_pytorch(torch.optim.Adam)
):
np.testing.assert_allclose(x, y, atol=1e-4)
def test_sgd(self):
for x,y in zip(step_tinygrad(SGD, kwargs={'lr': 0.001}, device=self.device),
step_pytorch(torch.optim.SGD, kwargs={'lr': 0.001})):
for x, y in zip(
step_tinygrad(SGD, kwargs={"lr": 0.001}, device=self.device),
step_pytorch(torch.optim.SGD, kwargs={"lr": 0.001}),
):
np.testing.assert_allclose(x, y, atol=1e-5)
def test_rmsprop(self):
for x,y in zip(step_tinygrad(RMSprop, kwargs={'lr': 0.001, 'decay': 0.99}, device=self.device),
step_pytorch(torch.optim.RMSprop,
kwargs={'lr': 0.001, 'alpha': 0.99})):
for x, y in zip(
step_tinygrad(RMSprop, kwargs={"lr": 0.001, "decay": 0.99}, device=self.device),
step_pytorch(torch.optim.RMSprop, kwargs={"lr": 0.001, "alpha": 0.99}),
):
np.testing.assert_allclose(x, y, atol=1e-5)
@@ -79,10 +87,11 @@ class TestOptim(unittest.TestCase):
class TestOptimGPU(TestOptim):
device = Device.GPU
@unittest.skipUnless(ANE, "Requires ANE")
class TestOptimANE(TestOptim):
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -6,11 +6,12 @@ from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
from .config import ANE
x_init = np.random.randn(1,3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(3,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)
x_init = np.random.randn(1, 3).astype(np.float32)
U_init = np.random.randn(3, 3).astype(np.float32)
V_init = np.random.randn(3, 3).astype(np.float32)
W_init = np.random.randn(3, 3).astype(np.float32)
m_init = np.random.randn(1, 3).astype(np.float32)
class TestTinygrad(unittest.TestCase):
device = Device.CPU
@@ -36,7 +37,7 @@ class TestTinygrad(unittest.TestCase):
out.backward()
return out.detach().numpy(), x.grad, W.grad
for x,y in zip(test_tinygrad(), test_pytorch()):
for x, y in zip(test_tinygrad(), test_pytorch()):
np.testing.assert_allclose(x, y, atol=1e-5)
def test_backward_pass_diamond_model(self):
@@ -64,7 +65,7 @@ class TestTinygrad(unittest.TestCase):
out.backward()
return out.detach().numpy(), u.grad, v.grad, w.grad
for x,y in zip(test_tinygrad(), test_pytorch()):
for x, y in zip(test_tinygrad(), test_pytorch()):
np.testing.assert_allclose(x, y, atol=1e-5)
def test_jacobian(self):
@@ -73,7 +74,9 @@ class TestTinygrad(unittest.TestCase):
torch_x = torch.tensor(x, requires_grad=True)
torch_W = torch.tensor(W, requires_grad=True)
torch_func = lambda x: torch.nn.functional.log_softmax(x.matmul(torch_W).relu(), dim=1)
torch_func = lambda x: torch.nn.functional.log_softmax(
x.matmul(torch_W).relu(), dim=1
)
PJ = torch.autograd.functional.jacobian(torch_func, torch_x).squeeze().numpy()
tiny_x = Tensor(x, device=self.device)
@@ -82,8 +85,8 @@ class TestTinygrad(unittest.TestCase):
J = jacobian(tiny_func, tiny_x)
NJ = numerical_jacobian(tiny_func, tiny_x)
np.testing.assert_allclose(PJ, J, atol = 1e-5)
np.testing.assert_allclose(PJ, NJ, atol = 1e-5)
np.testing.assert_allclose(PJ, J, atol=1e-5)
np.testing.assert_allclose(PJ, NJ, atol=1e-5)
def test_gradcheck(self):
W = np.random.RandomState(1337).random((10, 5))
@@ -96,7 +99,7 @@ class TestTinygrad(unittest.TestCase):
self.assertTrue(gradcheck(tiny_func, tiny_x))
# coarse approximation: with a "big" eps, the model's non-linearities make the finite-difference check fail
self.assertFalse(gradcheck(tiny_func, tiny_x, eps = 0.1))
self.assertFalse(gradcheck(tiny_func, tiny_x, eps=0.1))
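
The numerical side of jacobian/gradcheck boils down to finite differences; a generic central-difference sketch (not necessarily extra.gradcheck's exact implementation):

import numpy as np

def num_jacobian(f, x, eps=1e-6):
    # J[o, i] ~= d f(x)[o] / d x[i], estimated by central differences
    y = f(x)
    J = np.zeros((y.size, x.size))
    for i in range(x.size):
        d = np.zeros(x.size)
        d[i] = eps
        d = d.reshape(x.shape)
        J[:, i] = ((f(x + d) - f(x - d)) / (2 * eps)).ravel()
    return J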
@unittest.skipUnless(GPU, "Requires GPU")
@@ -104,14 +107,18 @@ class TestTinygradGPU(TestTinygrad):
device = Device.GPU
@unittest.skip("float64 not supported on GPU")
def test_jacobian(self): pass
def test_jacobian(self):
pass
@unittest.skip("float64 not supported on GPU")
def test_gradcheck(self): pass
def test_gradcheck(self):
pass
@unittest.skipUnless(ANE, "Requires ANE")
class TestOpsANE(TestTinygrad):
device=Device.ANE
device = Device.ANE
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -1,23 +1,38 @@
from tinygrad.tensor import Tensor
class BatchNorm2D:
def __init__(self, sz, eps=1e-5, track_running_stats=False, training=False, momentum=0.1):
self.eps, self.track_running_stats, self.training, self.momentum = eps, track_running_stats, training, momentum
def __init__(
self, sz, eps=1e-5, track_running_stats=False, training=False, momentum=0.1
):
self.eps, self.track_running_stats, self.training, self.momentum = (
eps,
track_running_stats,
training,
momentum,
)
self.weight, self.bias = Tensor.ones(sz), Tensor.zeros(sz)
self.running_mean, self.running_var = Tensor.zeros(sz, requires_grad=False), Tensor.ones(sz, requires_grad=False)
self.running_mean, self.running_var = (
Tensor.zeros(sz, requires_grad=False),
Tensor.ones(sz, requires_grad=False),
)
self.num_batches_tracked = Tensor.zeros(1, requires_grad=False)
def __call__(self, x):
if self.track_running_stats or self.training:
batch_mean = x.mean(axis=(0,2,3))
y = (x - batch_mean.reshape(shape=[1, -1, 1, 1]))
batch_var = (y*y).mean(axis=(0,2,3))
batch_mean = x.mean(axis=(0, 2, 3))
y = x - batch_mean.reshape(shape=[1, -1, 1, 1])
batch_var = (y * y).mean(axis=(0, 2, 3))
if self.track_running_stats:
self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var
self.running_mean = (
1 - self.momentum
) * self.running_mean + self.momentum * batch_mean
self.running_var = (
1 - self.momentum
) * self.running_var + self.momentum * batch_var
self.num_batches_tracked += 1
if self.training:
@@ -26,6 +41,9 @@ class BatchNorm2D:
return self.normalize(x, self.running_mean, self.running_var)
def normalize(self, x, mean, var):
x = (x - mean.reshape(shape=[1, -1, 1, 1])) * self.weight.reshape(shape=[1, -1, 1, 1])
return x.div(var.add(self.eps).reshape(shape=[1, -1, 1, 1])**0.5) + self.bias.reshape(shape=[1, -1, 1, 1])
x = (x - mean.reshape(shape=[1, -1, 1, 1])) * self.weight.reshape(
shape=[1, -1, 1, 1]
)
return x.div(
var.add(self.eps).reshape(shape=[1, -1, 1, 1]) ** 0.5
) + self.bias.reshape(shape=[1, -1, 1, 1])
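
A NumPy sketch of what normalize computes for an (N, C, H, W) input, with per-channel statistics broadcast back via reshape to (1, C, 1, 1); weight and bias are at their defaults here:

import numpy as np

eps = 1e-5
x = np.random.randn(2, 3, 4, 4).astype(np.float32)
mean = x.mean(axis=(0, 2, 3))
y = x - mean.reshape(1, -1, 1, 1)
var = (y * y).mean(axis=(0, 2, 3))
weight, bias = np.ones(3, np.float32), np.zeros(3, np.float32)
out = y * weight.reshape(1, -1, 1, 1)
out = out / np.sqrt(var.reshape(1, -1, 1, 1) + eps) + bias.reshape(1, -1, 1, 1)
# out is ~zero-mean, unit-variance per channel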

View File

@@ -2,19 +2,23 @@ from .tensor import Tensor, Function, register
from functools import lru_cache
import struct
@lru_cache
def compile_wrapper(ane, dat):
return ane.compile(dat)
def roundup(x, v):
return x + (v-x)%v
return x + (v - x) % v
def fill(dat, addrs, type, val, base=0x4000):
x = struct.pack(type, val)
for a in addrs:
dat[base+a:base+a+len(x)] = x
dat[base + a : base + a + len(x)] = x
return dat
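
What roundup and fill are doing, as a tiny standalone sketch — the offsets below are made up for illustration; the real calls above patch fixed fields in the hwx blob at base 0x4000:

import struct

# roundup(x, v) rounds x up to the next multiple of v, e.g. roundup(5, 4) == 8
dat = list(bytes(0x20))
val = struct.pack("<H", 1337)                  # little-endian uint16
for a in (0x04, 0x10):                         # hypothetical offsets, base omitted
    dat[a : a + len(val)] = val
assert bytes(dat)[0x04:0x06] == b"\x39\x05"    # 0x0539 == 1337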
@lru_cache
def compile_relu(ane, sz):
dat = list(open("ane/ops/relu.hwx", "rb").read())
@@ -22,16 +26,18 @@ def compile_relu(ane, sz):
# number of relus
dat = fill(dat, [0x128, 0x13C], "H", sz)
# number of engines? (max 0x100)
dat = fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8], "I", max(0x100, roundup(sz*2, 0x10)))
dat = fill(dat, [0x1EC, 0x1F0, 0x1F4, 0x1F8], "I", max(0x100, roundup(sz * 2, 0x10)))
# strides?
dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz*2, 0x40))
dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz * 2, 0x40))
return compile_wrapper(ane, bytes(dat))
class ReLU(Function):
@staticmethod
def forward(ctx, input):
ret = ctx.ane.tensor(input.shape)
ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret)
return ret
register('relu', ReLU, device=Tensor.ANE)
register("relu", ReLU, device=Tensor.ANE)

View File

@@ -5,44 +5,58 @@ from .tensor import Function, register
# ************* basic ops *************
def unbroadcast(out, in_sh):
# the adjoint of broadcast is sum: sum over every axis where 1 == in_sh[i] < out.shape[i]
sum_axis = tuple([i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1]) if in_sh != (1,) else None
sum_axis = (
tuple([i for i in range(len(in_sh)) if in_sh[i] == 1 and out.shape[i] > 1])
if in_sh != (1,)
else None
)
return out.sum(axis=sum_axis).reshape(in_sh)
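
A quick demonstration of why this is the right adjoint — summing the gradient over the broadcast axes recovers the pre-broadcast shape:

import numpy as np

out = np.zeros((5, 13, 24, 16)) + np.ones((5, 1, 24, 1))    # broadcast add
grad = np.ones_like(out)                                     # upstream gradient
in_sh = (5, 1, 24, 1)
sum_axis = tuple(i for i in range(len(in_sh)) if in_sh[i] == 1 and grad.shape[i] > 1)
g = grad.sum(axis=sum_axis).reshape(in_sh)                   # back to (5, 1, 24, 1)
assert g.shape == in_sh and g[0, 0, 0, 0] == 13 * 16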
class Add(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x.shape, y.shape)
return x+y
return x + y
@staticmethod
def backward(ctx, grad_output):
shape_x, shape_y = ctx.saved_tensors
return unbroadcast(grad_output, shape_x), unbroadcast(grad_output, shape_y)
register('add', Add)
register("add", Add)
class Sub(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x.shape, y.shape)
return x-y
return x - y
@staticmethod
def backward(ctx, grad_output):
shape_x, shape_y = ctx.saved_tensors
return unbroadcast(grad_output, shape_x), unbroadcast(-grad_output, shape_y)
register('sub', Sub)
register("sub", Sub)
class Mul(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x, y)
return x*y
return x * y
@staticmethod
def backward(ctx, grad_output):
x,y = ctx.saved_tensors
return unbroadcast(y*grad_output, x.shape), unbroadcast(x*grad_output, y.shape)
register('mul', Mul)
x, y = ctx.saved_tensors
return unbroadcast(y * grad_output, x.shape), unbroadcast(x * grad_output, y.shape)
register("mul", Mul)
class Pow(Function):
@staticmethod
@@ -52,27 +66,37 @@ class Pow(Function):
@staticmethod
def backward(ctx, grad_output):
x,y = ctx.saved_tensors
return unbroadcast(y * (x**(y-1.0)) * grad_output, x.shape), \
unbroadcast((x**y) * np.log(x) * grad_output, y.shape)
register('pow', Pow)
x, y = ctx.saved_tensors
return unbroadcast(y * (x ** (y - 1.0)) * grad_output, x.shape), unbroadcast(
(x ** y) * np.log(x) * grad_output, y.shape
)
register("pow", Pow)
class Sum(Function):
@staticmethod
def forward(ctx, input,axis=None):
def forward(ctx, input, axis=None):
ctx.save_for_backward(input, axis)
return np.array([input.sum()]) if axis is None else input.sum(axis=axis)
@staticmethod
def backward(ctx, grad_output):
input, axis = ctx.saved_tensors
shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
shape = [
1 if axis is None or i in axis else input.shape[i]
for i in range(len(input.shape))
]
return grad_output.reshape(shape) + np.zeros_like(input)
register('sum', Sum)
register("sum", Sum)
# ************* GEMM *************
class Dot(Function):
@staticmethod
def forward(ctx, input, weight):
@@ -85,21 +109,27 @@ class Dot(Function):
grad_input = grad_output.dot(weight.T)
grad_weight = input.T.dot(grad_output)
return grad_input, grad_weight
register('dot', Dot)
register("dot", Dot)
# ************* simple ops *************
class Pad2D(Function):
@staticmethod
def forward(ctx, x, padding=None):
ctx.save_for_backward(padding)
return np.pad(x, ((0,0), (0,0), tuple(padding[2:4]), tuple(padding[0:2])))
return np.pad(x, ((0, 0), (0, 0), tuple(padding[2:4]), tuple(padding[0:2])))
@staticmethod
def backward(ctx, grad_output):
padding, = ctx.saved_tensors
return grad_output[..., padding[2]:-padding[3], padding[0]:-padding[1]]
register('pad2d', Pad2D)
(padding,) = ctx.saved_tensors
return grad_output[..., padding[2] : -padding[3], padding[0] : -padding[1]]
register("pad2d", Pad2D)
class Reshape(Function):
@staticmethod
@@ -109,13 +139,16 @@ class Reshape(Function):
@staticmethod
def backward(ctx, grad_output):
in_shape, = ctx.saved_tensors
(in_shape,) = ctx.saved_tensors
return grad_output.reshape(in_shape)
register('reshape', Reshape)
register("reshape", Reshape)
# ************* activation ops *************
class ReLU(Function):
@staticmethod
def forward(ctx, input):
@@ -124,123 +157,145 @@ class ReLU(Function):
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
(input,) = ctx.saved_tensors
return grad_output * (input >= 0)
register('relu', ReLU)
register("relu", ReLU)
def _exp_normalize(x, axis=None):
y = np.exp(x - x.max(axis=axis, keepdims=True))
return y / y.sum(axis=axis, keepdims=True)
y = np.exp(x - x.max(axis=axis, keepdims=True))
return y / y.sum(axis=axis, keepdims=True)
class Sigmoid(Function):
@staticmethod
def forward(ctx, input):
with np.warnings.catch_warnings():
np.warnings.filterwarnings('ignore')
ret = np.where(input >= 0,
1/(1 + np.exp(-input)),
np.exp(input)/(1 + np.exp(input))
np.warnings.filterwarnings("ignore")
ret = np.where(
input >= 0, 1 / (1 + np.exp(-input)), np.exp(input) / (1 + np.exp(input))
)
ctx.save_for_backward(ret)
return ret
@staticmethod
def backward(ctx, grad_output):
ret, = ctx.saved_tensors
(ret,) = ctx.saved_tensors
return grad_output * (ret * (1 - ret))
register('sigmoid', Sigmoid)
register("sigmoid", Sigmoid)
class LogSoftmax(Function):
@staticmethod
def forward(ctx, input):
softmax = _exp_normalize(input, axis=1)
ctx.save_for_backward(softmax)
return np.log(softmax)
return np.log(softmax)
@staticmethod
def backward(ctx, grad_output):
softmax, = ctx.saved_tensors
return grad_output - grad_output.sum(axis=1, keepdims=True)*softmax
register('logsoftmax', LogSoftmax)
(softmax,) = ctx.saved_tensors
return grad_output - grad_output.sum(axis=1, keepdims=True) * softmax
register("logsoftmax", LogSoftmax)
# ************* conv ops *************
class Conv2D(Function):
@staticmethod
def forward(ctx, x, w, stride=1, groups=1):
if type(ctx.stride) == int:
ctx.stride = (ctx.stride, ctx.stride)
cout,cin,H,W = w.shape
ys,xs = ctx.stride
bs,cin_ = x.shape[0], x.shape[1]
oy,ox = (x.shape[2]-(H-ys))//ys, (x.shape[3]-(W-xs))//xs
assert cin*ctx.groups == cin_
cout, cin, H, W = w.shape
ys, xs = ctx.stride
bs, cin_ = x.shape[0], x.shape[1]
oy, ox = (x.shape[2] - (H - ys)) // ys, (x.shape[3] - (W - xs)) // xs
assert cin * ctx.groups == cin_
assert cout % ctx.groups == 0
rcout = cout//ctx.groups
rcout = cout // ctx.groups
gx = x.reshape(bs,ctx.groups,cin,x.shape[2],x.shape[3])
tx = np.lib.stride_tricks.as_strided(gx,
gx = x.reshape(bs, ctx.groups, cin, x.shape[2], x.shape[3])
tx = np.lib.stride_tricks.as_strided(
gx,
shape=(bs, ctx.groups, cin, oy, ox, H, W),
strides=(*gx.strides[0:3], gx.strides[3]*ys, gx.strides[4]*xs, *gx.strides[3:5]),
strides=(
*gx.strides[0:3],
gx.strides[3] * ys,
gx.strides[4] * xs,
*gx.strides[3:5],
),
writeable=False,
)
tw = w.reshape(ctx.groups, rcout, cin, H, W)
ctx.save_for_backward(tx, tw, x.shape)
ret = np.zeros((bs,ctx.groups,oy,ox,rcout),dtype=x.dtype)
ret = np.zeros((bs, ctx.groups, oy, ox, rcout), dtype=x.dtype)
for g in range(ctx.groups):
#ijYXyx,kjyx -> iYXk ->ikYX
ret[:,g] += np.tensordot(tx[:,g], tw[g], ((1,4,5),(1,2,3)))
return np.moveaxis(ret,4,2).reshape(bs, cout, oy, ox)
# ijYXyx,kjyx -> iYXk ->ikYX
ret[:, g] += np.tensordot(tx[:, g], tw[g], ((1, 4, 5), (1, 2, 3)))
return np.moveaxis(ret, 4, 2).reshape(bs, cout, oy, ox)
@staticmethod
def backward(ctx, grad_output):
bs,_,oy,ox = grad_output.shape
bs, _, oy, ox = grad_output.shape
tx, tw, x_shape = ctx.saved_tensors
_,rcout,cin,H,W = tw.shape
ys,xs = ctx.stride
OY,OX = x_shape[2:4]
_, rcout, cin, H, W = tw.shape
ys, xs = ctx.stride
OY, OX = x_shape[2:4]
ggg = grad_output.reshape(bs,ctx.groups,rcout,oy,ox)
ggg = grad_output.reshape(bs, ctx.groups, rcout, oy, ox)
gdw = np.zeros((ctx.groups,rcout,cin,H,W), dtype=tx.dtype)
gdw = np.zeros((ctx.groups, rcout, cin, H, W), dtype=tx.dtype)
for g in range(ctx.groups):
#'ikYX,ijYXyx -> kjyx'
gdw[g] += np.tensordot(ggg[:,g], tx[:,g], ((0,2,3),(0,2,3)))
gdw[g] += np.tensordot(ggg[:, g], tx[:, g], ((0, 2, 3), (0, 2, 3)))
# needs to be optimized
gdx = np.zeros((bs,ctx.groups,cin,OY,OX), dtype=tx.dtype)
for k in range(oy*ox):
Y, X = k//ox, k%ox
iY,iX = Y*ys, X*xs
#gdx[:,:,: , iY:iY+H, iX:iX+W] += np.einsum('igk,gkjyx->igjyx', ggg[:,:,:,Y,X], tw)
gdx = np.zeros((bs, ctx.groups, cin, OY, OX), dtype=tx.dtype)
for k in range(oy * ox):
Y, X = k // ox, k % ox
iY, iX = Y * ys, X * xs
# gdx[:,:,: , iY:iY+H, iX:iX+W] += np.einsum('igk,gkjyx->igjyx', ggg[:,:,:,Y,X], tw)
for g in range(ctx.groups):
tg = np.dot(ggg[:,g,:,Y,X].reshape(bs, -1), tw[g].reshape(rcout, -1))
gdx[:, g, :, iY:iY+H, iX:iX+W] += tg.reshape((bs, cin, H, W))
tg = np.dot(ggg[:, g, :, Y, X].reshape(bs, -1), tw[g].reshape(rcout, -1))
gdx[:, g, :, iY : iY + H, iX : iX + W] += tg.reshape((bs, cin, H, W))
return gdx.reshape((bs, ctx.groups*cin, OY, OX)), gdw.reshape((ctx.groups*rcout, cin, H, W))
register('conv2d', Conv2D)
return gdx.reshape((bs, ctx.groups * cin, OY, OX)), gdw.reshape(
(ctx.groups * rcout, cin, H, W)
)
register("conv2d", Conv2D)
# ************* pooling ops *************
def stack_for_pool(x, py, px):
my, mx = (x.shape[2]//py)*py, (x.shape[3]//px)*px
my, mx = (x.shape[2] // py) * py, (x.shape[3] // px) * px
xup = x[:, :, :my, :mx]
stack = [xup[:, :, k//px::py, k%px::px][None] for k in range(py*px)]
stack = [xup[:, :, k // px :: py, k % px :: px][None] for k in range(py * px)]
return np.concatenate(stack, axis=0)
def unstack_for_pool(fxn, s, py, px):
my, mx = (s[2]//py)*py, (s[3]//px)*px
for k in range(py*px):
Y, X = k//px, k%px
ll = fxn(Y*px+X)
my, mx = (s[2] // py) * py, (s[3] // px) * px
for k in range(py * px):
Y, X = k // px, k % px
ll = fxn(Y * px + X)
if X == 0 and Y == 0:
ret = np.zeros(s, dtype=ll.dtype)
ret[:, :, Y:my:py, X:mx:px] = ll
return ret
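
How stack_for_pool makes pooling a single reduction: each of the py*px positions inside a window becomes one slice along a new leading axis, so max pooling is just .max(axis=0). A 4x4 example:

import numpy as np

x = np.arange(16).reshape(1, 1, 4, 4)
py = px = 2
my, mx = (x.shape[2] // py) * py, (x.shape[3] // px) * px
xup = x[:, :, :my, :mx]
stack = np.concatenate(
    [xup[:, :, k // px :: py, k % px :: px][None] for k in range(py * px)], axis=0
)
stack.max(axis=0)   # -> [[5, 7], [13, 15]]: the 2x2 max pool of x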
class MaxPool2D(Function):
@staticmethod
def forward(ctx, x, kernel_size=(2, 2)):
@@ -251,9 +306,14 @@ class MaxPool2D(Function):
@staticmethod
def backward(ctx, grad_output):
idxs,s = ctx.saved_tensors
return unstack_for_pool(lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size)
register('max_pool2d', MaxPool2D)
idxs, s = ctx.saved_tensors
return unstack_for_pool(
lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size
)
register("max_pool2d", MaxPool2D)
class AvgPool2D(Function):
@staticmethod
@@ -264,8 +324,9 @@ class AvgPool2D(Function):
@staticmethod
def backward(ctx, grad_output):
s, = ctx.saved_tensors
(s,) = ctx.saved_tensors
py, px = ctx.kernel_size
return unstack_for_pool(lambda idx: grad_output/py/px, s, py, px)
register('avg_pool2d', AvgPool2D)
return unstack_for_pool(lambda idx: grad_output / py / px, s, py, px)
register("avg_pool2d", AvgPool2D)

View File

@@ -3,131 +3,229 @@ from .tensor import Function, register, GPUBuffer, Tensor, Device
import pyopencl as cl
import functools
def buffer_new(ctx, shape, zero=False):
return GPUBuffer(shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32))
return GPUBuffer(
shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32)
)
@functools.lru_cache()
def clbuild(cl_ctx, name, prg):
return cl.Program(cl_ctx, prg).build().__getattr__(name)
def uint2(x, y):
return np.array((x,y), dtype=cl.cltypes.uint2)
return np.array((x, y), dtype=cl.cltypes.uint2)
i32 = np.int32
def subsample_op(ctx, input, kernel_size, stride, iter_op, result_op, decls=''):
def subsample_op(ctx, input, kernel_size, stride, iter_op, result_op, decls=""):
py, px = stride
N, C, Yin, Xin = input.shape
Yout, Xout = (Yin-kernel_size[0])//py+1, (Xin-kernel_size[1])//px+1
Yout, Xout = (Yin - kernel_size[0]) // py + 1, (Xin - kernel_size[1]) // px + 1
ret = buffer_new(ctx, (N, C, Yout, Xout), zero=True)
subsample = clbuild(ctx.cl_ctx, "subsample", """
subsample = clbuild(
ctx.cl_ctx,
"subsample",
"""
__kernel void subsample(__global float *output, __global const float *input, uint2 osize, uint2 isize,
uint2 ksz, uint2 stride) {
int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
"""+decls+""";
"""
+ decls
+ """;
for (uint j=0; j<ksz.y; ++j) {
for (uint i=0; i<ksz.x; ++i) {
int iid = (gid.x*stride.x+i) + isize.x*((gid.y*stride.y+j) + isize.y*gid.z);
if (gid.x*stride.x+i < isize.x && gid.y*stride.y+j < isize.y) {
"""+iter_op+""";
"""
+ iter_op
+ """;
}
}
}
output[oid] = """+result_op+""";
}""")
subsample(ctx.cl_queue, (N*C, Yout, Xout), None,
ret.cl, input.cl, uint2(Xout, Yout), uint2(Xin, Yin),
uint2(*kernel_size[::-1]), uint2(px, py))
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
output[oid] = """
+ result_op
+ """;
}""",
)
subsample(
ctx.cl_queue,
(N * C, Yout, Xout),
None,
ret.cl,
input.cl,
uint2(Xout, Yout),
uint2(Xin, Yin),
uint2(*kernel_size[::-1]),
uint2(px, py),
)
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
return ret
def supersample_op(ctx, input, out_shape, kernel_size, result_op, decls='', input2=None):
def supersample_op(
ctx, input, out_shape, kernel_size, result_op, decls="", input2=None
):
(N, C, Yin, Xin), (Yout, Xout) = input.shape, out_shape[2:]
py,px = kernel_size
py, px = kernel_size
ret = buffer_new(ctx, out_shape, zero=True)
supsample = clbuild(ctx.cl_ctx, "supsample", """
supsample = clbuild(
ctx.cl_ctx,
"supsample",
"""
__kernel void supsample(__global float *output, __global const float *input, __global const void *input2,
uint2 osize, uint2 isize, uint2 ksz) {
int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0));
int oid = gid.x + osize.x*(gid.y + osize.y*gid.z);
int iid = (gid.x/ksz.x) + isize.x*((gid.y/ksz.y) + isize.y*gid.z);
"""+decls+""";
"""
+ decls
+ """;
if (gid.x/ksz.x < isize.x && gid.y/ksz.y < isize.y) {
output[oid] = """+result_op+""";
output[oid] = """
+ result_op
+ """;
}
}""")
supsample(ctx.cl_queue, (N*C, Yout, Xout), None,
ret.cl, input.cl, input2.cl if input2 is not None else input2,
uint2(Xout, Yout), uint2(Xin, Yin), uint2(px, py))
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
}""",
)
supsample(
ctx.cl_queue,
(N * C, Yout, Xout),
None,
ret.cl,
input.cl,
input2.cl if input2 is not None else input2,
uint2(Xout, Yout),
uint2(Xin, Yin),
uint2(px, py),
)
ctx.data = np.empty((N, C, Yout, Xout)) # set shape expectation on tensor instance
return ret
@functools.lru_cache()
def get_binop_prg(cl_ctx, code, complist):
ndims = len(complist)
args = "".join([", int d%d" % i for i in range(ndims)]) + "".join([", int p%d" % i for i in range(ndims-1)])
compute_idx_rets = ["\n int idx_ret"+str(i)+" = (gid0 / "+("p%d"%i if i < ndims-1 else "1")+") % d"+str(i)+";" for i in range(ndims)]
idx_exprs = ["0", "0"] # [idx_x, idx_y]
args = "".join([", int d%d" % i for i in range(ndims)]) + "".join(
[", int p%d" % i for i in range(ndims - 1)]
)
compute_idx_rets = [
"\n int idx_ret"
+ str(i)
+ " = (gid0 / "
+ ("p%d" % i if i < ndims - 1 else "1")
+ ") % d"
+ str(i)
+ ";"
for i in range(ndims)
]
idx_exprs = ["0", "0"] # [idx_x, idx_y]
for i in range(ndims):
for j in range(2):
if complist[i][j]:
idx_exprs[j] = "idx_ret%d + d%d*(%s)" % (i, i, idx_exprs[j])
return cl.Program(cl_ctx, """__kernel void binop(__global const float *x_g, __global const float *y_g, __global float *res_g"""+args+""") {
int gid0 = get_global_id(0);"""+"".join(compute_idx_rets)+"""
float a = x_g["""+idx_exprs[0]+"""];
float b = y_g["""+idx_exprs[1]+"""];
res_g[gid0] = """+code+""";\n}""").build()
return cl.Program(
cl_ctx,
"""__kernel void binop(__global const float *x_g, __global const float *y_g, __global float *res_g"""
+ args
+ """) {
int gid0 = get_global_id(0);"""
+ "".join(compute_idx_rets)
+ """
float a = x_g["""
+ idx_exprs[0]
+ """];
float b = y_g["""
+ idx_exprs[1]
+ """];
res_g[gid0] = """
+ code
+ """;\n}""",
).build()
def binary_op(ctx, code, x, y):
n_dims = max(len(x.shape), len(y.shape))
shape_x, shape_y = np.ones(n_dims, dtype=np.int32), np.ones(n_dims, dtype=np.int32)
shape_x[:len(x.shape)] = np.array(x.shape, dtype=np.int32)
shape_y[:len(y.shape)] = np.array(y.shape, dtype=np.int32)
shape_x[: len(x.shape)] = np.array(x.shape, dtype=np.int32)
shape_y[: len(y.shape)] = np.array(y.shape, dtype=np.int32)
if not np.all((shape_x == 1) | (shape_y == 1) | (shape_x == shape_y)):
raise Exception(f"binary op unbroadcastable shape mismatch: {x.shape} vs {y.shape}")
shape_ret = np.maximum(shape_x, shape_y)
dimlist, complist = [], [] # note: len(dimlist) may be less than n_dims
dimlist, complist = [], [] # note: len(dimlist) may be less than n_dims
def push(dim, comp):
if len(complist) > 0 and complist[-1] == comp:
dimlist[-1] *= dim
elif comp != (False, False):
dimlist.append(dim); complist.append(comp)
for i in range(n_dims): # group together any adjacent dimensions that we can to simplify broadcasting
dimlist.append(dim)
complist.append(comp)
for i in range(
n_dims
): # group together any adjacent dimensions that we can to simplify broadcasting
push(i32(max(shape_x[i], shape_y[i])), (shape_x[i] > 1, shape_y[i] > 1))
prg = get_binop_prg(ctx.cl_ctx, code, tuple(complist))
ret = buffer_new(ctx, shape_ret, zero=True)
prod_list = np.array(dimlist, dtype=i32)[-1::-1].cumprod(dtype=i32)[-1::-1] # take cumprod from back to front
prg.binop(ctx.cl_queue, [prod_list[0]] if len(dimlist) > 0 else [1], None, x.cl, y.cl, ret.cl, *dimlist, *(prod_list[1:]))
prod_list = np.array(dimlist, dtype=i32)[-1::-1].cumprod(dtype=i32)[
-1::-1
] # take cumprod from back to front
prg.binop(
ctx.cl_queue,
[prod_list[0]] if len(dimlist) > 0 else [1],
None,
x.cl,
y.cl,
ret.cl,
*dimlist,
*(prod_list[1:]),
)
return ret
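
The dimlist/complist logic above merges adjacent dimensions that broadcast the same way, which shrinks the index arithmetic the kernel has to do; a host-side sketch of that grouping:

shape_x, shape_y = (1, 32, 32, 32), (1, 32, 1, 1)
dimlist, complist = [], []
for sx, sy in zip(shape_x, shape_y):
    comp = (sx > 1, sy > 1)           # does each operand vary along this dim?
    if complist and complist[-1] == comp:
        dimlist[-1] *= max(sx, sy)    # same pattern as previous dim: merge
    elif comp != (False, False):
        dimlist.append(max(sx, sy))
        complist.append(comp)
# dimlist == [32, 1024], complist == [(True, True), (True, False)]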
def unary_op(ctx, code, x):
ret = buffer_new(ctx, x.shape)
unop = clbuild(ctx.cl_ctx, "unop", """
unop = clbuild(
ctx.cl_ctx,
"unop",
"""
__kernel void unop(__global const float *a_g, __global float *res_g) {
int gid = get_global_id(0);
float a = a_g[gid];
res_g[gid] = """+code+""";
}""")
res_g[gid] = """
+ code
+ """;
}""",
)
unop(ctx.cl_queue, [np.prod(ret.shape)], None, x.cl, ret.cl)
return ret
def reduce_op(ctx, code, code2, inp, axis=None):
if axis is None:
# full reduce
osize = [1]*len(inp.shape)
osize = [1] * len(inp.shape)
else:
osize = np.array(inp.shape)
osize[list(axis)] = 1
osize[list(axis)] = 1
ret = buffer_new(ctx, osize)
if axis is None:
ret.shape = (1,)
# TODO: this is insanely slow
reduce = clbuild(ctx.cl_ctx, "reduce", """
reduce = clbuild(
ctx.cl_ctx,
"reduce",
"""
__kernel void reduce(__global const float *a_g, int sz, __global float *res_g, int prod, int n_dims,
__global const int *shape_x, __global const int *shape_ret) {
int gid = get_global_id(0);
@@ -148,80 +246,129 @@ def reduce_op(ctx, code, code2, inp, axis=None):
}
}
float a = a_g[idx];
"""+code+""";
"""
+ code
+ """;
}
res_g[gid] = """+code2+""";
}""")
buffer_np = lambda x: cl.Buffer(ctx.cl_ctx,
cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x)
reduce(ctx.cl_queue, [np.prod(osize)], None, inp.cl,
i32(np.prod(inp.shape)//np.prod(osize)), ret.cl,
i32(np.prod(osize)), i32(len(osize)),
res_g[gid] = """
+ code2
+ """;
}""",
)
buffer_np = lambda x: cl.Buffer(
ctx.cl_ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x
)
reduce(
ctx.cl_queue,
[np.prod(osize)],
None,
inp.cl,
i32(np.prod(inp.shape) // np.prod(osize)),
ret.cl,
i32(np.prod(osize)),
i32(len(osize)),
buffer_np(np.array(inp.shape, dtype=np.int32)),
buffer_np(np.array(osize, dtype=np.int32)))
buffer_np(np.array(osize, dtype=np.int32)),
)
return ret
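
Semantically, reduce_op with code "out += a" and code2 "out" is a keepdims sum over the given axes (callers like Sum.forward drop the kept size-1 dims afterwards); the NumPy equivalent:

import numpy as np

x = np.random.randn(3, 4, 5).astype(np.float32)
axis = (1, 2)
osize = np.array(x.shape)
osize[list(axis)] = 1
ref = x.sum(axis=axis, keepdims=True)   # shape (3, 1, 1), matching osize
assert ref.shape == tuple(osize)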
def unbroadcast(ctx, out, in_sh):
sum_axis = [i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1] if in_sh != (1,) else None
sum_axis = (
[i for i in range(len(in_sh)) if in_sh[i] == 1 and out.shape[i] > 1]
if in_sh != (1,)
else None
)
return reduce_op(ctx, "out += a", "out", out, sum_axis)
# ***** now for the ops themselves *****
class Add(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x.shape, y.shape)
return binary_op(ctx, 'a+b', x, y)
return binary_op(ctx, "a+b", x, y)
@staticmethod
def backward(ctx, grad_output):
grad_x, grad_y = grad_output, grad_output
shape_x, shape_y = ctx.saved_tensors
return unbroadcast(ctx, grad_x, shape_x), unbroadcast(ctx, grad_y, shape_y),
register('add', Add, device=Device.GPU)
return (
unbroadcast(ctx, grad_x, shape_x),
unbroadcast(ctx, grad_y, shape_y),
)
register("add", Add, device=Device.GPU)
class Sub(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x.shape, y.shape)
return binary_op(ctx, 'a-b', x, y)
return binary_op(ctx, "a-b", x, y)
@staticmethod
def backward(ctx, grad_output):
grad_x, grad_y = grad_output, unary_op(ctx, '-a', grad_output)
grad_x, grad_y = grad_output, unary_op(ctx, "-a", grad_output)
shape_x, shape_y = ctx.saved_tensors
return unbroadcast(ctx, grad_x, shape_x), unbroadcast(ctx, grad_y, shape_y),
register('sub', Sub, device=Device.GPU)
return (
unbroadcast(ctx, grad_x, shape_x),
unbroadcast(ctx, grad_y, shape_y),
)
register("sub", Sub, device=Device.GPU)
class Mul(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x, y)
return binary_op(ctx, 'a*b', x, y)
return binary_op(ctx, "a*b", x, y)
@staticmethod
def backward(ctx, grad_output):
x,y = ctx.saved_tensors
grad_x = binary_op(ctx, 'a*b', y, grad_output)
grad_y = binary_op(ctx, 'a*b', x, grad_output)
return unbroadcast(ctx, grad_x, x.shape), unbroadcast(ctx, grad_y, y.shape),
register('mul', Mul, device=Device.GPU)
x, y = ctx.saved_tensors
grad_x = binary_op(ctx, "a*b", y, grad_output)
grad_y = binary_op(ctx, "a*b", x, grad_output)
return (
unbroadcast(ctx, grad_x, x.shape),
unbroadcast(ctx, grad_y, y.shape),
)
register("mul", Mul, device=Device.GPU)
class Pow(Function):
@staticmethod
def forward(ctx, x, y):
ctx.save_for_backward(x, y)
return binary_op(ctx, 'pow(a,b)', x, y)
return binary_op(ctx, "pow(a,b)", x, y)
@staticmethod
def backward(ctx, grad_output):
x,y = ctx.saved_tensors
grad_x = binary_op(ctx, 'a*b', grad_output,
binary_op(ctx, 'b * (pow((float)a, (float)(b-1.0)))', x, y))
grad_y = binary_op(ctx, 'a*b', grad_output,
binary_op(ctx, 'pow(a, (float)b) * log(a);', x, y))
return unbroadcast(ctx, grad_x, x.shape), unbroadcast(ctx, grad_y, y.shape),
register('pow', Pow, device=Device.GPU)
x, y = ctx.saved_tensors
grad_x = binary_op(
ctx,
"a*b",
grad_output,
binary_op(ctx, "b * (pow((float)a, (float)(b-1.0)))", x, y),
)
grad_y = binary_op(
ctx, "a*b", grad_output, binary_op(ctx, "pow(a, (float)b) * log(a);", x, y)
)
return (
unbroadcast(ctx, grad_x, x.shape),
unbroadcast(ctx, grad_y, y.shape),
)
register("pow", Pow, device=Device.GPU)
class Sum(Function):
@staticmethod
@@ -229,16 +376,24 @@ class Sum(Function):
ctx.save_for_backward(input, axis)
ret = reduce_op(ctx, "out += a", "out", input, axis=axis)
if axis is not None:
ret.shape = tuple([input.shape[i] for i in range(len(input.shape)) if i not in axis])
ret.shape = tuple(
[input.shape[i] for i in range(len(input.shape)) if i not in axis]
)
return ret
@staticmethod
def backward(ctx, grad_output):
input, axis = ctx.saved_tensors
shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))]
shape = [
1 if axis is None or i in axis else input.shape[i]
for i in range(len(input.shape))
]
output = GPUBuffer(shape, hostbuf=grad_output)
return binary_op(ctx, 'a+b', output, buffer_new(ctx, input.shape, zero=True))
register('sum', Sum, device=Device.GPU)
return binary_op(ctx, "a+b", output, buffer_new(ctx, input.shape, zero=True))
register("sum", Sum, device=Device.GPU)
class Dot(Function):
@staticmethod
@@ -247,7 +402,10 @@ class Dot(Function):
isize, msize, osize = i32(input.shape[0]), i32(input.shape[1]), i32(weight.shape[1])
ret = buffer_new(ctx, (isize, osize))
matmul = clbuild(ctx.cl_ctx, "matmul", """
matmul = clbuild(
ctx.cl_ctx,
"matmul",
"""
__kernel void matmul(
__global const float *input, __global const float *weight, __global float *res,
int is0, int is1, int msize, int ws0, int ws1, int osize
@@ -261,13 +419,25 @@ class Dot(Function):
}
res[X * osize + Y] = ret;
}""")
}""",
)
ctx.save_for_backward(input, weight, matmul)
# (isize,msize) x (msize,osize) = (isize,osize)
matmul(ctx.cl_queue, [isize, osize], None,
input.cl, weight.cl, ret.cl,
msize, i32(1), msize, i32(1), osize, osize)
matmul(
ctx.cl_queue,
[isize, osize],
None,
input.cl,
weight.cl,
ret.cl,
msize,
i32(1),
msize,
i32(1),
osize,
osize,
)
return ret
@staticmethod
@@ -279,28 +449,56 @@ class Dot(Function):
grad_weight = buffer_new(ctx, weight.shape)
# (isize,osize) x (msize,osize) = (isize,msize)
matmul(ctx.cl_queue, [isize, msize], None,
grad_output.cl, weight.cl, grad_input.cl,
osize, i32(1), osize, osize, i32(1), msize)
matmul(
ctx.cl_queue,
[isize, msize],
None,
grad_output.cl,
weight.cl,
grad_input.cl,
osize,
i32(1),
osize,
osize,
i32(1),
msize,
)
# (isize,msize) x (isize,osize) = (msize,osize)
matmul(ctx.cl_queue, [msize, osize], None,
input.cl, grad_output.cl, grad_weight.cl,
i32(1), msize, isize, i32(1), osize, osize)
matmul(
ctx.cl_queue,
[msize, osize],
None,
input.cl,
grad_output.cl,
grad_weight.cl,
i32(1),
msize,
isize,
i32(1),
osize,
osize,
)
return grad_input, grad_weight
register('dot', Dot, device=Device.GPU)
register("dot", Dot, device=Device.GPU)
# ************* simple ops *************
class Pad2D(Function):
@staticmethod
def forward(ctx, x, padding=None):
bs,cin,iy,ix = x.shape
oy,ox = iy+padding[2]+padding[3], ix+padding[0]+padding[1]
bs, cin, iy, ix = x.shape
oy, ox = iy + padding[2] + padding[3], ix + padding[0] + padding[1]
ret = buffer_new(ctx, (bs, cin, oy, ox), zero=True)
pad2d = clbuild(ctx.cl_ctx, "pad2d", """
pad2d = clbuild(
ctx.cl_ctx,
"pad2d",
"""
__kernel void pad2d(__global const float *input, __global float *output,
int ipx, int ipy, int py, int px, int oy, int ox, int iy, int ix) {
int BC = get_global_id(0);
@@ -311,13 +509,24 @@ class Pad2D(Function):
int optr = BC*oy*ox + (Y+py)*ox + px + X;
output[optr] = input[iptr];
}""")
}""",
)
ctx.save_for_backward(padding, pad2d)
pad2d(ctx.cl_queue, [bs*cin, iy, ix], None,
x.cl, ret.cl,
i32(0), i32(0), i32(padding[2]), i32(padding[0]),
i32(oy), i32(ox), i32(iy), i32(ix)
)
pad2d(
ctx.cl_queue,
[bs * cin, iy, ix],
None,
x.cl,
ret.cl,
i32(0),
i32(0),
i32(padding[2]),
i32(padding[0]),
i32(oy),
i32(ox),
i32(iy),
i32(ix),
)
return ret
@staticmethod
@@ -326,13 +535,26 @@ class Pad2D(Function):
bs, cin, iy, ix = grad_output.shape
oy, ox = iy - padding[2] - padding[3], ix - padding[0] - padding[1]
ret = buffer_new(ctx, (bs, cin, oy, ox))
pad2d(ctx.cl_queue, [bs*cin, oy, ox], None,
grad_output.cl, ret.cl,
i32(padding[2]), i32(padding[0]), i32(0), i32(0),
i32(oy), i32(ox), i32(iy), i32(ix)
)
pad2d(
ctx.cl_queue,
[bs * cin, oy, ox],
None,
grad_output.cl,
ret.cl,
i32(padding[2]),
i32(padding[0]),
i32(0),
i32(0),
i32(oy),
i32(ox),
i32(iy),
i32(ix),
)
return ret
register('pad2d', Pad2D, device=Device.GPU)
register("pad2d", Pad2D, device=Device.GPU)
class Reshape(Function):
@staticmethod
@@ -345,105 +567,152 @@ class Reshape(Function):
@staticmethod
def backward(ctx, grad_output):
in_shape, = ctx.saved_tensors
(in_shape,) = ctx.saved_tensors
return GPUBuffer(in_shape, hostbuf=grad_output)
register('reshape', Reshape, device=Device.GPU)
register("reshape", Reshape, device=Device.GPU)
# ************* activation ops *************
class ReLU(Function):
@staticmethod
def forward(ctx, input):
ctx.save_for_backward(input)
return unary_op(ctx, 'max(a, (float)0.)', input)
return unary_op(ctx, "max(a, (float)0.)", input)
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
return binary_op(ctx, 'a * (b >= 0)', grad_output, input)
register('relu', ReLU, device=Device.GPU)
(input,) = ctx.saved_tensors
return binary_op(ctx, "a * (b >= 0)", grad_output, input)
register("relu", ReLU, device=Device.GPU)
class Sigmoid(Function):
@staticmethod
def forward(ctx, input):
ret = unary_op(ctx, '1./(1+exp(-a))', input)
ret = unary_op(ctx, "1./(1+exp(-a))", input)
ctx.save_for_backward(ret)
return ret
@staticmethod
def backward(ctx, grad_output):
ret, = ctx.saved_tensors
return binary_op(ctx, 'a * (b * (1 - b));', grad_output, ret)
register('sigmoid', Sigmoid, device=Device.GPU)
(ret,) = ctx.saved_tensors
return binary_op(ctx, "a * (b * (1 - b));", grad_output, ret)
register("sigmoid", Sigmoid, device=Device.GPU)
class AvgPool2D(Function):
@staticmethod
def forward(ctx, input, kernel_size=(2, 2)):
ret = subsample_op(ctx, input, kernel_size, kernel_size, iter_op="sumval += input[iid]",
result_op="sumval / (ksz.x * ksz.y)", decls="float sumval=0.f")
ret = subsample_op(
ctx,
input,
kernel_size,
kernel_size,
iter_op="sumval += input[iid]",
result_op="sumval / (ksz.x * ksz.y)",
decls="float sumval=0.f",
)
ctx.save_for_backward(input.shape)
return ret
@staticmethod
def backward(ctx, grad_output):
orig_shape, = ctx.saved_tensors
return supersample_op(ctx, grad_output, orig_shape, ctx.kernel_size,
result_op="input[iid] / (ksz.x * ksz.y)")
register('avg_pool2d', AvgPool2D, device=Device.GPU)
(orig_shape,) = ctx.saved_tensors
return supersample_op(
ctx,
grad_output,
orig_shape,
ctx.kernel_size,
result_op="input[iid] / (ksz.x * ksz.y)",
)
register("avg_pool2d", AvgPool2D, device=Device.GPU)
class MaxPool2D(Function):
@staticmethod
def forward(ctx, input, kernel_size=(2, 2)):
idxs = subsample_op(ctx, input, kernel_size, kernel_size,
idxs = subsample_op(
ctx,
input,
kernel_size,
kernel_size,
iter_op="if (input[iid]>maxval) { maxval = input[iid]; maxidx = j * ksz.x + i; }",
result_op="(float)maxidx", decls="float maxval=-FLT_MAX; int maxidx=0")
result_op="(float)maxidx",
decls="float maxval=-FLT_MAX; int maxidx=0",
)
ctx.save_for_backward(idxs, input.shape)
return subsample_op(ctx, input, kernel_size, kernel_size,
return subsample_op(
ctx,
input,
kernel_size,
kernel_size,
iter_op="maxval = max(maxval, input[iid])",
result_op="maxval", decls="float maxval = -FLT_MAX")
result_op="maxval",
decls="float maxval = -FLT_MAX",
)
@staticmethod
def backward(ctx, grad_output):
idxs, orig_shape = ctx.saved_tensors
return supersample_op(ctx, grad_output, orig_shape, ctx.kernel_size,
return supersample_op(
ctx,
grad_output,
orig_shape,
ctx.kernel_size,
result_op="(maxidx == kernidx) * input[iid]",
decls="int maxidx=((__global float*)input2)[iid]; int kernidx=(gid.x%ksz.x) + ksz.x*(gid.y%ksz.y)",
input2=idxs)
register('max_pool2d', MaxPool2D, device=Device.GPU)
input2=idxs,
)
register("max_pool2d", MaxPool2D, device=Device.GPU)
class LogSoftmax(Function):
@staticmethod
def forward(ctx, input):
# TODO: stability?
lsum = reduce_op(ctx, "out += exp(a)", "log(out)", input, axis=[1])
output = binary_op(ctx, 'a-b', input, lsum)
output = binary_op(ctx, "a-b", input, lsum)
ctx.save_for_backward(output)
return output
@staticmethod
def backward(ctx, grad_output):
output, = ctx.saved_tensors
(output,) = ctx.saved_tensors
lsum = reduce_op(ctx, "out += a", "out", grad_output, axis=[1])
texp = binary_op(ctx, "exp(a) * b", output, lsum)
return binary_op(ctx, "a - b", grad_output, texp)
register('logsoftmax', LogSoftmax, device=Device.GPU)
register("logsoftmax", LogSoftmax, device=Device.GPU)
# ************* conv ops *************
class Conv2D(Function):
@staticmethod
def forward(ctx, x, w, stride=1, groups=1):
if type(ctx.stride) == int:
ctx.stride = (ctx.stride, ctx.stride)
cout,cin,H,W = w.shape
ys,xs = ctx.stride
bs,cin_,iy,ix = x.shape
oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs
assert cin*ctx.groups == cin_
cout, cin, H, W = w.shape
ys, xs = ctx.stride
bs, cin_, iy, ix = x.shape
oy, ox = (iy - (H - ys)) // ys, (ix - (W - xs)) // xs
assert cin * ctx.groups == cin_
assert cout % ctx.groups == 0
rcout = cout//ctx.groups
rcout = cout // ctx.groups
ctx.save_for_backward(x,w)
ctx.save_for_backward(x, w)
# output buffer
ret = buffer_new(ctx, (bs, cout, oy, ox))
@@ -452,7 +721,10 @@ class Conv2D(Function):
# weight = (groups, rcout, cin, H, W)
# output = (bs, groups, rcout, oy, ox)
conv = clbuild(ctx.cl_ctx, "conv", """
conv = clbuild(
ctx.cl_ctx,
"conv",
"""
__kernel void conv(__global const float *input, __global const float *weight, __global float *output,
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs) {
@@ -475,26 +747,41 @@ class Conv2D(Function):
}
}
output[B*groups*rcout*oy*ox + g*rcout*oy*ox + c*oy*ox + Y*ox + X] = acc;
}""")
}""",
)
conv(ctx.cl_queue, [bs*groups*rcout, oy, ox], None,
x.cl, w.cl, ret.cl,
i32(H), i32(W), i32(groups), i32(rcout), i32(cin),
i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs)
conv(
ctx.cl_queue,
[bs * groups * rcout, oy, ox],
None,
x.cl,
w.cl,
ret.cl,
i32(H),
i32(W),
i32(groups),
i32(rcout),
i32(cin),
i32(oy),
i32(ox),
i32(iy),
i32(ix),
i32(ys),
i32(xs),
)
return ret
@staticmethod
def backward(ctx, grad_output):
bs,_,oy,ox = grad_output.shape
bs, _, oy, ox = grad_output.shape
x, w = ctx.saved_tensors
cout,cin,H,W = w.shape
ys,xs = ctx.stride
bs,cin_,iy,ix = x.shape
oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs
assert cin*ctx.groups == cin_
cout, cin, H, W = w.shape
ys, xs = ctx.stride
bs, cin_, iy, ix = x.shape
oy, ox = (iy - (H - ys)) // ys, (ix - (W - xs)) // xs
assert cin * ctx.groups == cin_
assert cout % ctx.groups == 0
rcout = cout//ctx.groups
rcout = cout // ctx.groups
dx = buffer_new(ctx, (bs, cin_, iy, ix), zero=True)
dw = buffer_new(ctx, (cout, cin, H, W))
@@ -503,7 +790,10 @@ class Conv2D(Function):
# tensw = (groups*rcout, cin, H, W)
# ggg = (bs, groups*rout, oy, ox)
convw = clbuild(ctx.cl_ctx, "convw", """
convw = clbuild(
ctx.cl_ctx,
"convw",
"""
__kernel void convw(__global const float *tensx, __global const float *ggg, __global float *dw,
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) {
@@ -523,8 +813,12 @@ class Conv2D(Function):
}
}
dw[get_global_id(0)*H*W + y*W + x] = acc;
}""")
convx = clbuild(ctx.cl_ctx, "convx", """
}""",
)
convx = clbuild(
ctx.cl_ctx,
"convx",
"""
__kernel void convx(__global const float *tensw, __global const float *ggg, __global float *dx,
int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) {
@@ -547,10 +841,36 @@ class Conv2D(Function):
}
}
}
""")
""",
)
conv_args = i32(H), i32(W), i32(ctx.groups), i32(rcout), i32(cin), i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs), i32(bs)
convw(ctx.cl_queue, [ctx.groups*rcout*cin, H, W], None, x.cl, grad_output.cl, dw.cl, *conv_args)
convx(ctx.cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args)
conv_args = (
i32(H),
i32(W),
i32(ctx.groups),
i32(rcout),
i32(cin),
i32(oy),
i32(ox),
i32(iy),
i32(ix),
i32(ys),
i32(xs),
i32(bs),
)
convw(
ctx.cl_queue,
[ctx.groups * rcout * cin, H, W],
None,
x.cl,
grad_output.cl,
dw.cl,
*conv_args,
)
convx(
ctx.cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args
)
return dx, dw
register('conv2d', Conv2D, device=Device.GPU)
register("conv2d", Conv2D, device=Device.GPU)

View File

@@ -3,6 +3,7 @@
import numpy as np
from tinygrad.tensor import Tensor
class Optimizer:
def __init__(self, params):
self.params = [x for x in params if x.requires_grad == True]
@@ -11,6 +12,7 @@ class Optimizer:
for param in self.params:
param.grad = None
class SGD(Optimizer):
def __init__(self, params, lr=0.001):
super().__init__(params)
@@ -20,30 +22,53 @@ class SGD(Optimizer):
for t in self.params:
t -= t.grad * self.lr
class RMSprop(Optimizer):
def __init__(self, params, lr=0.001, decay=0.9, eps=1e-8):
super().__init__(params)
self.lr, self.decay, self.eps = lr, decay, eps
self.v = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params]
self.v = [
Tensor(
np.zeros(t.shape, dtype=np.float32),
device=params[0].device,
requires_grad=False,
)
for t in self.params
]
def step(self):
for i, t in enumerate(self.params):
self.v[i] = self.decay * self.v[i] + (1.0 - self.decay) * t.grad * t.grad
t -= (t.grad * self.lr).div(self.v[i].sqrt() + self.eps)
class Adam(Optimizer):
def __init__(self, params, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
super().__init__(params)
self.lr, self.b1, self.b2, self.eps, self.t = lr, b1, b2, eps, 0
self.m = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params]
self.v = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params]
self.m = [
Tensor(
np.zeros(t.shape, dtype=np.float32),
device=params[0].device,
requires_grad=False,
)
for t in self.params
]
self.v = [
Tensor(
np.zeros(t.shape, dtype=np.float32),
device=params[0].device,
requires_grad=False,
)
for t in self.params
]
def step(self):
self.t = self.t + 1
a = self.lr * ((1.0 - self.b2**self.t)**0.5) / (1.0 - self.b1**self.t)
for i,t in enumerate(self.params):
a = self.lr * ((1.0 - self.b2 ** self.t) ** 0.5) / (1.0 - self.b1 ** self.t)
for i, t in enumerate(self.params):
self.m[i] = self.b1 * self.m[i] + (1.0 - self.b1) * t.grad
self.v[i] = self.b2 * self.v[i] + (1.0 - self.b2) * t.grad * t.grad
t -= a * self.m[i].div(self.v[i].sqrt() + self.eps)
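
A hand-computed first Adam step matching the code above; with t = 1 the bias correction folds into a, and the step comes out to roughly lr * sign(grad):

import numpy as np

lr, b1, b2, eps = 0.001, 0.9, 0.999, 1e-8
g = np.array([0.5, -1.0])     # gradient after the first backward pass
m = (1 - b1) * g              # m and v start at zero
v = (1 - b2) * g * g
t = 1
a = lr * ((1.0 - b2 ** t) ** 0.5) / (1.0 - b1 ** t)
step = a * m / (np.sqrt(v) + eps)   # -> approx [0.001, -0.001]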

View File

@@ -10,32 +10,43 @@ from collections import defaultdict
DEBUG = os.getenv("DEBUG", None) is not None
if DEBUG:
import atexit, time
debug_counts, debug_times = defaultdict(int), defaultdict(float)
def print_debug_exit():
for name, _ in sorted(debug_times.items(), key=lambda x: -x[1]):
print(f"{name:>20} : {debug_counts[name]:>6} {debug_times[name]:>10.2f} ms")
atexit.register(print_debug_exit)
class ProfileOp:
def __init__(self, name, x, backward=False):
self.name = ("back_" if backward else "")+name
self.name = ("back_" if backward else "") + name
self.x = x
def __enter__(self):
if DEBUG: self.st = time.time()
if DEBUG:
self.st = time.time()
def __exit__(self, *junk):
if DEBUG:
if cl_queue is not None:
cl_queue.finish()
et = (time.time()-self.st)*1000.
et = (time.time() - self.st) * 1000.0
debug_counts[self.name] += 1
debug_times[self.name] += et
print(f"{self.name:>20} : {et:>7.2f} ms {[y.shape for y in self.x]}")
# **** GPU functions ****
cl_ctx, cl_queue = None, None
def require_init_gpu():
if not GPU: raise Exception("No GPU Support, install pyopencl")
if not GPU:
raise Exception("No GPU Support, install pyopencl")
global cl_ctx, cl_queue
if cl_queue is None:
devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
@@ -45,28 +56,45 @@ def require_init_gpu():
# this is an in-order command queue
cl_queue = cl.CommandQueue(cl_ctx)
class GPUBuffer:
def __init__(self, shape, hostbuf=None):
self.shape, self.dtype = tuple(shape), np.float32
self.cl = hostbuf.cl if isinstance(hostbuf, GPUBuffer) else \
cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0), 4*np.prod(shape),
hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None)
self.cl = (
hostbuf.cl
if isinstance(hostbuf, GPUBuffer)
else cl.Buffer(
cl_ctx,
cl.mem_flags.READ_WRITE
| (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0),
4 * np.prod(shape),
hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None,
)
)
def __repr__(self):
return f"<GPUBuffer with shape {self.shape!r}>"
# **** ANE functions ****
ane = None
def require_init_ane():
global ane
if ane is None:
import ane.lib.ane, tinygrad.ops_ane
ane = ane.lib.ane.ANE()
# **** start with two base classes, Tensor and Function ****
class Device: CPU, GPU, ANE = 0, 1, 2
class Device:
CPU, GPU, ANE = 0, 1, 2
class Tensor:
did_float_warning = False
@@ -110,7 +138,12 @@ class Tensor:
@classmethod
def uniform(cls, *shape, **kwargs):
return cls((np.random.uniform(-1., 1., size=shape)/np.sqrt(np.prod(shape))).astype(np.float32), **kwargs)
return cls(
(np.random.uniform(-1.0, 1.0, size=shape) / np.sqrt(np.prod(shape))).astype(
np.float32
),
**kwargs,
)
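The divisor is the square root of the total element count, i.e. uniform(-1, 1) / sqrt(prod(shape)) — a Xavier-flavored init keyed to layer size. For example:

from tinygrad.tensor import Tensor

w = Tensor.uniform(784, 128)
print(w.shape)            # (784, 128)
print(abs(w.data).max())  # bounded by 1 / sqrt(784 * 128) ~= 0.00316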
@classmethod
def eye(cls, dim, **kwargs):
@@ -130,18 +163,21 @@ class Tensor:
# fill in the first grad with one
# this is "implicit gradient creation"
self.grad = Tensor(np.ones(self.shape, dtype=self.dtype), device=self.device, requires_grad=False)
self.grad = Tensor(
np.ones(self.shape, dtype=self.dtype), device=self.device, requires_grad=False
)
for t0 in reversed(self.deepwalk(set(), [])):
assert (t0.grad is not None)
assert t0.grad is not None
with ProfileOp(t0._ctx.__class__.__name__, [t0.grad], backward=True):
grads = t0._ctx.backward(t0._ctx, t0.grad.data)
if len(t0._ctx.parents) == 1:
grads = [grads]
for t,g in zip(t0._ctx.parents, grads):
for t, g in zip(t0._ctx.parents, grads):
if g is not None:
assert g.shape == t.shape, \
f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
assert (
g.shape == t.shape
), f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
gt = Tensor(g, device=self.device, requires_grad=False)
t.grad = gt if t.grad is None else (t.grad + gt)
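Concretely: backward() seeds the output's grad with ones, walks the graph in reverse topological order via deepwalk, and accumulates into each parent's grad. A small check:

from tinygrad.tensor import Tensor

x = Tensor([[1.0, -2.0, 3.0]])
y = x.relu().sum()  # (1,)-shaped output
y.backward()        # implicit ones seed, then ReLU's mask flows back
print(x.grad.data)  # [[1. 0. 1.]]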
@@ -150,19 +186,21 @@ class Tensor:
@staticmethod
def _move_data(data, device):
if isinstance(data, GPUBuffer):
if device == Device.GPU: return data
if device == Device.GPU:
return data
old = data
data = np.empty(old.shape, dtype=np.float32)
with ProfileOp("toCPU", [data]):
cl.enqueue_copy(cl_queue, data, old.cl, is_blocking=True)
cl.enqueue_copy(cl_queue, data, old.cl, is_blocking=True)
elif "ANETensor" in str(type(data)):
if device == Device.ANE: return data
if device == Device.ANE:
return data
with ProfileOp("toCPU", [data]):
data = data.data().astype(np.float32)
data = data.data().astype(np.float32)
if not isinstance(data, np.ndarray):
data = np.array(data, dtype=np.float32)
data = np.array(data, dtype=np.float32)
if data.dtype != np.float32 and not Tensor.did_float_warning:
# warning? float64 is actually needed for numerical jacobian
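Both the copying and in-place movers (defined just below) funnel through _move_data; only whether the Tensor object is reused differs. A sketch, assuming pyopencl is available:

from tinygrad.tensor import Tensor

t = Tensor([[1.0, 2.0]])
g = t.gpu()      # copy: new Tensor backed by a GPUBuffer
print(g.is_gpu)  # True
t.gpu_()         # in place: t.data replaced, and t.grad (if set) follows
back = g.cpu()   # toCPU path above: blocking enqueue_copy into a fresh ndarray
print(back.data)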
@@ -184,14 +222,17 @@ class Tensor:
def to_(self, device):
self.data, self.device = self._move_data(self.data, device), device
if self.grad: self.grad.to_(device)
if self.grad:
self.grad.to_(device)
def to(self, device):
ret = Tensor(self.data, device)
if self.grad: ret.grad = self.grad.to(device)
if self.grad:
ret.grad = self.grad.to(device)
return ret
def _is(self, device): return self.device == device
def _is(self, device):
return self.device == device
def detach(self):
return Tensor(self.data, device=self.device)
@@ -203,7 +244,7 @@ class Tensor:
def mean(self, axis=None):
out = self.sum(axis=axis)
coeff = np.prod(out.shape)/np.prod(self.shape)
coeff = np.prod(out.shape) / np.prod(self.shape)
return out * coeff
def sqrt(self):
@@ -219,15 +260,18 @@ class Tensor:
return 2.0 * ((2.0 * self).sigmoid()) - 1.0
def leakyrelu(self, neg_slope=0.01):
return self.relu() - (-neg_slope*self).relu()
return self.relu() - (-neg_slope * self).relu()
def dropout(self, p=0.5):
_mask = np.asarray(np.random.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype)
_mask = np.asarray(
np.random.binomial(1, 1.0 - p, size=self.shape), dtype=self.dtype
)
ret = self * Tensor(_mask, requires_grad=False, device=self.device)
return ret.div(1.0 - p)
def abs(self):
return self.relu() + (-1.0*self).relu()
return self.relu() + (-1.0 * self).relu()
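Both compose out of relu rather than getting their own ops: abs(x) = relu(x) + relu(-x), and dropout rescales survivors by 1 / (1 - p) so expected activations match eval time. For instance:

import numpy as np
from tinygrad.tensor import Tensor

t = Tensor(np.full((1, 4), 2.0, dtype=np.float32))
print(t.abs().data)           # [[2. 2. 2. 2.]]
print(t.dropout(p=0.5).data)  # entries are 0.0 or 4.0 (= 2.0 / (1 - 0.5)), mask random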
# An instantiation of the Function is the Context
class Function:
@@ -239,7 +283,7 @@ class Function:
self.saved_tensors.extend(x)
def apply(self, *x, **kwargs):
ctx = self(*x) # self - operation, i.e. 'add', 'sub', etc.
ctx = self(*x)  # self - operation, i.e. 'add', 'sub', etc.
# use default params
params = signature(self.forward).parameters
for p in params.values():
@@ -249,40 +293,66 @@ class Function:
for k, v in kwargs.items():
setattr(ctx, k, v)
with ProfileOp(ctx.__class__.__name__, x):
ret = Tensor(self.forward(ctx, *[t.data for t in x], **kwargs),
device=ctx.device, requires_grad=any([t.requires_grad for t in x]))
ret = Tensor(
self.forward(ctx, *[t.data for t in x], **kwargs),
device=ctx.device,
requires_grad=any([t.requires_grad for t in x]),
)
if ret.requires_grad:
ret._ctx = ctx
return ret
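Putting the pieces together: a hypothetical op (Square is not in the codebase) written against this machinery. forward/backward take ctx explicitly, matching how apply and Tensor.backward invoke them:

import numpy as np
from tinygrad.tensor import Function, Tensor, register

class Square(Function):
    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)
        return input * input

    @staticmethod
    def backward(ctx, grad_output):
        (input,) = ctx.saved_tensors
        return 2.0 * input * grad_output

register("square", Square)  # Device.CPU by default; also adds Tensor.square
t = Tensor(np.array([[3.0]], dtype=np.float32))
out = t.square()
out.backward()
print(out.data, t.grad.data)  # [[9.]] [[6.]]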
def register(name, fxn, device=Device.CPU):
Tensor.ops[device][name] = fxn
def dispatch(*x, **kwargs):
tt = [arg for arg in x if isinstance(arg, Tensor)][0]
x = [Tensor(np.array([arg], dtype=tt.dtype), device=tt.device, requires_grad=False) if not isinstance(arg, Tensor) else arg for arg in x]
x = [
Tensor(np.array([arg], dtype=tt.dtype), device=tt.device, requires_grad=False)
if not isinstance(arg, Tensor)
else arg
for arg in x
]
f = (Tensor.ops[tt.device])[name]
f.cl_ctx, f.cl_queue, f.ane, f.device = cl_ctx, cl_queue, ane, tt.device
return f.apply(f, *x, **kwargs)
setattr(Tensor, name, dispatch)
# TODO: div is a second class op, so it doesn't work here
if name in ['add', 'sub', 'mul', 'pow']:
if name in ["add", "sub", "mul", "pow"]:
setattr(Tensor, f"__{name}__", dispatch)
setattr(Tensor, f"__i{name}__", lambda self,x: self.assign(dispatch(self,x)))
setattr(Tensor, f"__r{name}__", lambda self,x: dispatch(x,self))
setattr(Tensor, f"__i{name}__", lambda self, x: self.assign(dispatch(self, x)))
setattr(Tensor, f"__r{name}__", lambda self, x: dispatch(x, self))
for device in [device for device in Device.__dict__.keys() if device[0] != "_"]:
setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, Device.__dict__[device]))
setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, Device.__dict__[device]))
setattr(Tensor, f"is_{device.lower()}", property(functools.partialmethod(Tensor._is, Device.__dict__[device])))
setattr(
Tensor,
f"{device.lower()}",
functools.partialmethod(Tensor.to, Device.__dict__[device]),
)
setattr(
Tensor,
f"{device.lower()}_",
functools.partialmethod(Tensor.to_, Device.__dict__[device]),
)
setattr(
Tensor,
f"is_{device.lower()}",
property(functools.partialmethod(Tensor._is, Device.__dict__[device])),
)
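For each Device member this synthesizes three accessors — e.g. t.gpu(), t.gpu_(), and the t.is_gpu property. Quick check on CPU:

from tinygrad.tensor import Tensor

t = Tensor([[1.0]])
print(t.is_cpu, t.is_gpu)  # True False
c = t.cpu()                # new Tensor object; the ndarray is reused on a CPU->CPU move
print(c.is_cpu)            # True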
# this registers all the operations
import tinygrad.ops_cpu
try:
import pyopencl as cl
# TODO: move this import to require_init_gpu?
import tinygrad.ops_gpu
GPU = True
except ImportError:
# no GPU support
GPU = False
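Downstream code can branch on the resulting flag instead of re-attempting the import:

from tinygrad.tensor import GPU, Tensor

t = Tensor([[1.0]])
if GPU:  # pyopencl imported cleanly
    t = t.gpu()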