diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 78c2c36239..c06ba50eeb 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -34,13 +34,21 @@ jobs:
     run: python -m pytest -s -v

   linter:
-    name: Indentation Linter
-    runs-on: ubuntu-latest
+    name: Indentation Linter
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v2
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.8
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install pylint
+    - name: Lint with pylint
+      run: |
+        python -m pylint --disable=all -e W0311 --jobs=0 --indent-string='  ' **/*.py
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v2
-    - name: Install PyLint
-      run: sudo apt-get install pylint
-    - name: Validate indentation is 2 lines
-      run: if [[ $(pylint --jobs=0 --indent-string='  ' * | grep "Bad indentation") ]]; then exit 1; fi
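For contributors who want to run the same check before pushing, the CI step above can be mirrored locally. A minimal sketch, assuming pylint is installed and this is run from the repo root (the recursive glob is a stand-in for the shell's **/*.py):

import glob
import subprocess

# enable only W0311 ("bad-indentation"); --indent-string selects 2-space indents,
# matching the "Lint with pylint" step in the workflow above
files = glob.glob("**/*.py", recursive=True)
subprocess.run(
  ["python", "-m", "pylint", "--disable=all", "-e", "W0311",
   "--jobs=0", "--indent-string=  ", *files],
  check=False,
)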
diff --git a/ane/1_build/coreml_ane.py b/ane/1_build/coreml_ane.py
index f3c8dcbfd1..e4c89266ae 100755
--- a/ane/1_build/coreml_ane.py
+++ b/ane/1_build/coreml_ane.py
@@ -3,24 +3,34 @@
 import numpy as np
 import coremltools as ct
 from coremltools.models.neural_network import datatypes, NeuralNetworkBuilder

-input_features = [('image', datatypes.Array(3))]
-output_features = [('probs', datatypes.Array(3))]
+input_features = [("image", datatypes.Array(3))]
+output_features = [("probs", datatypes.Array(3))]

 weights = np.zeros((3, 3)) + 3
 bias = np.ones(3)

 builder = NeuralNetworkBuilder(input_features, output_features)
-builder.add_inner_product(name='ip_layer', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='image', output_name='med')
-#builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
-#builder.add_elementwise(name='element', input_names=['med', 'med'], output_name='probs', mode='ADD')
-builder.add_bias(name='bias', b=bias, input_name='med', output_name='probs', shape_bias=(3,))
-#builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')
+builder.add_inner_product(
+    name="ip_layer",
+    W=weights,
+    b=None,
+    input_channels=3,
+    output_channels=3,
+    has_bias=False,
+    input_name="image",
+    output_name="med",
+)
+# builder.add_inner_product(name='ip_layer_2', W=weights, b=None, input_channels=3, output_channels=3, has_bias=False, input_name='med', output_name='probs')
+# builder.add_elementwise(name='element', input_names=['med', 'med'], output_name='probs', mode='ADD')
+builder.add_bias(
+    name="bias", b=bias, input_name="med", output_name="probs", shape_bias=(3,)
+)
+# builder.add_activation(name='act_layer', non_linearity='SIGMOID', input_name='med', output_name='probs')

 # compile the spec
 mlmodel = ct.models.MLModel(builder.spec)

 # trigger the ANE!
-out = mlmodel.predict({"image": np.array([1337,0,0], dtype=np.float32)})
+out = mlmodel.predict({"image": np.array([1337, 0, 0], dtype=np.float32)})
 print(out)
-mlmodel.save('test.mlmodel')
-
+mlmodel.save("test.mlmodel")
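As a quick sanity check of the script above, the saved test.mlmodel can be reloaded and queried; "image" and "probs" are the feature names declared through NeuralNetworkBuilder, and the input values here are arbitrary:

import numpy as np
import coremltools as ct

# reload the model written by coreml_ane.py and run one prediction
mlmodel = ct.models.MLModel("test.mlmodel")
out = mlmodel.predict({"image": np.array([1.0, 2.0, 3.0], dtype=np.float32)})
print(out["probs"])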
-#open("/tmp/data.section", "wb").write(f2) -#print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read())) + print(compare(f1[i : i + 0x300], f2[i : i + 0x300])) +# open("/tmp/data.section", "wb").write(f2) +# print(compare(open("model.hwx.golden", "rb").read(), open("model.hwx", "rb").read())) diff --git a/ane/lib/ane.py b/ane/lib/ane.py index def815fabb..fa50dc4ec3 100755 --- a/ane/lib/ane.py +++ b/ane/lib/ane.py @@ -3,11 +3,12 @@ import os from ctypes import * import numpy as np import faulthandler + faulthandler.enable() -libane = cdll.LoadLibrary(os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "libane.dylib")) +libane = cdll.LoadLibrary( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "libane.dylib") +) libane.ANE_Compile.argtypes = [c_char_p, c_int] libane.ANE_Compile.restype = c_void_p @@ -17,34 +18,36 @@ libane.ANE_TensorCreate.restype = c_void_p libane.ANE_TensorData.argtypes = [c_void_p] libane.ANE_TensorData.restype = POINTER(c_uint16) -libane.ANE_Run.argtypes = [c_void_p]*3 +libane.ANE_Run.argtypes = [c_void_p] * 3 libane.ANE_Run.restype = c_int + class ANETensor: def __init__(self, *shape): self.shape = shape self.dtype = np.float16 self.sz = int(np.prod(shape)) - assert(self.sz <= 0x4000) + assert self.sz <= 0x4000 self.tt = libane.ANE_TensorCreate(self.sz, 1) - assert(self.tt is not None) + assert self.tt is not None def data(self): data = libane.ANE_TensorData(self.tt) - assert(data is not None) - #print(hex(addressof(data.contents))) + assert data is not None + # print(hex(addressof(data.contents))) buf = np.ctypeslib.as_array(data, shape=(self.sz,)) ret = np.frombuffer(buf, dtype=self.dtype) - #print(ret.data) + # print(ret.data) return ret + class ANE: def __init__(self): libane.ANE_Open() def compile(self, dat): ret = libane.ANE_Compile(create_string_buffer(dat), len(dat)) - assert(ret is not None) + assert ret is not None return ret def run(self, prog, tin, tout): @@ -53,6 +56,7 @@ class ANE: def tensor(self, shape): return ANETensor(shape) + if __name__ == "__main__": ane = ANE() @@ -62,7 +66,7 @@ if __name__ == "__main__": tind = tin.data() toutd = tout.data() - tind[0:4] = [-1,1,-2,2] + tind[0:4] = [-1, 1, -2, 2] print(tind) print(toutd) @@ -71,4 +75,3 @@ if __name__ == "__main__": print(tind) print(toutd) - diff --git a/examples/benchmark.py b/examples/benchmark.py index 2788915055..5a673db647 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -17,8 +17,9 @@ for dev in ["CPU", "GPU", "ANE"]: boaa = baa.relu() et = time.time() if i == 2: - print("%s can do at least %.2f MEGAReLUs/sec" % (dev, (np.prod(boaa.shape)/1e6)/(et-st))) + print( + "%s can do at least %.2f MEGAReLUs/sec" + % (dev, (np.prod(boaa.shape) / 1e6) / (et - st)) + ) # decently reliable - assert(np.all(boaa.cpu().data >= 0)) - - + assert np.all(boaa.cpu().data >= 0) diff --git a/examples/efficientnet.py b/examples/efficientnet.py index 1f73cfdf32..66165ab869 100644 --- a/examples/efficientnet.py +++ b/examples/efficientnet.py @@ -3,24 +3,29 @@ # a rough copy of # https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py import os + GPU = os.getenv("GPU", None) is not None import sys import io import time import numpy as np + np.set_printoptions(suppress=True) from tinygrad.tensor import Tensor from extra.utils import fetch, get_parameters from extra.efficientnet import EfficientNet + def infer(model, img): # preprocess image aspect_ratio = img.size[0] / img.size[1] - img = 
diff --git a/examples/efficientnet.py b/examples/efficientnet.py
index 1f73cfdf32..66165ab869 100644
--- a/examples/efficientnet.py
+++ b/examples/efficientnet.py
@@ -3,24 +3,29 @@
 # a rough copy of
 # https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py
 import os
+
 GPU = os.getenv("GPU", None) is not None
 import sys
 import io
 import time
 import numpy as np
+
 np.set_printoptions(suppress=True)

 from tinygrad.tensor import Tensor
 from extra.utils import fetch, get_parameters
 from extra.efficientnet import EfficientNet

+
 def infer(model, img):
   # preprocess image
   aspect_ratio = img.size[0] / img.size[1]
-  img = img.resize((int(224*max(aspect_ratio,1.0)), int(224*max(1.0/aspect_ratio,1.0))))
+  img = img.resize(
+      (int(224 * max(aspect_ratio, 1.0)), int(224 * max(1.0 / aspect_ratio, 1.0)))
+  )

   img = np.array(img)
-  y0,x0=(np.asarray(img.shape)[:2]-224)//2
-  retimg = img = img[y0:y0+224, x0:x0+224]
+  y0, x0 = (np.asarray(img.shape)[:2] - 224) // 2
+  retimg = img = img[y0 : y0 + 224, x0 : x0 + 224]

   # if you want to look at the image
   """
@@ -30,11 +35,11 @@ def infer(model, img):
   """

   # low level preprocess
-  img = np.moveaxis(img, [2,0,1], [0,1,2])
-  img = img.astype(np.float32)[:3].reshape(1,3,224,224)
+  img = np.moveaxis(img, [2, 0, 1], [0, 1, 2])
+  img = img.astype(np.float32)[:3].reshape(1, 3, 224, 224)
   img /= 255.0
-  img -= np.array([0.485, 0.456, 0.406]).reshape((1,-1,1,1))
-  img /= np.array([0.229, 0.224, 0.225]).reshape((1,-1,1,1))
+  img -= np.array([0.485, 0.456, 0.406]).reshape((1, -1, 1, 1))
+  img /= np.array([0.229, 0.224, 0.225]).reshape((1, -1, 1, 1))

   # run the net
   if GPU:
@@ -50,6 +55,7 @@ def infer(model, img):
   """
   return out, retimg

+
 if __name__ == "__main__":
   # instantiate my net
   model = EfficientNet(int(os.getenv("NUM", "0")))
@@ -59,38 +65,42 @@ if __name__ == "__main__":

   # category labels
   import ast
-  lbls = fetch("https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt")
-  lbls = ast.literal_eval(lbls.decode('utf-8'))
+
+  lbls = fetch(
+      "https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/238f720ff059c1f82f368259d1ca4ffa5dd8f9f5/imagenet1000_clsidx_to_labels.txt"
+  )
+  lbls = ast.literal_eval(lbls.decode("utf-8"))

   # load image and preprocess
   from PIL import Image
+
   url = sys.argv[1]
-  if url == 'webcam':
+  if url == "webcam":
     import cv2
+
     cap = cv2.VideoCapture(0)
     cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
     while 1:
-      _ = cap.grab() # discard one frame to circumvent capture buffering
+      _ = cap.grab()  # discard one frame to circumvent capture buffering
       ret, frame = cap.read()
-      img = Image.fromarray(frame[:, :, [2,1,0]])
+      img = Image.fromarray(frame[:, :, [2, 1, 0]])
       out, retimg = infer(model, img)
       print(np.argmax(out.data), np.max(out.data), lbls[np.argmax(out.data)])
       SCALE = 3
-      simg = cv2.resize(retimg, (224*SCALE, 224*SCALE))
+      simg = cv2.resize(retimg, (224 * SCALE, 224 * SCALE))
       retimg = cv2.cvtColor(simg, cv2.COLOR_RGB2BGR)
-      cv2.imshow('capture', retimg)
-      if cv2.waitKey(1) & 0xFF == ord('q'):
+      cv2.imshow("capture", retimg)
+      if cv2.waitKey(1) & 0xFF == ord("q"):
         break
     cap.release()
     cv2.destroyAllWindows()
   else:
-    if url.startswith('http'):
+    if url.startswith("http"):
       img = Image.open(io.BytesIO(fetch(url)))
     else:
       img = Image.open(url)
     st = time.time()
     out, _ = infer(model, img)
     print(np.argmax(out.data), np.max(out.data), lbls[np.argmax(out.data)])
-    print("did inference in %.2f s" % (time.time()-st))
-    #print("NOT", np.argmin(out.data), np.min(out.data), lbls[np.argmin(out.data)])
-
+    print("did inference in %.2f s" % (time.time() - st))
+    # print("NOT", np.argmin(out.data), np.min(out.data), lbls[np.argmin(out.data)])
diff --git a/examples/mnist_gan.py b/examples/mnist_gan.py
index 67e4b7ba7f..eab20c6a6d 100644
--- a/examples/mnist_gan.py
+++ b/examples/mnist_gan.py
@@ -3,8 +3,9 @@ import os
 import sys
 import numpy as np
 from tqdm import tqdm
+
 sys.path.append(os.getcwd())
-sys.path.append(os.path.join(os.getcwd(), 'test'))
+sys.path.append(os.path.join(os.getcwd(), "test"))

 from tinygrad.tensor import Tensor, Function, register
 from extra.utils import get_parameters
@@ -12,7 +13,10 @@ import tinygrad.optim as optim
 from test_mnist import X_train
 from torchvision.utils import make_grid, save_image
 import torch
+
 GPU = os.getenv("GPU") is not None
+
+
 class LinearGen:
   def __init__(self):
     lv = 128
@@ -28,6 +32,7 @@ class LinearGen:
     x = x.dot(self.l4).tanh()
     return x

+
 class LinearDisc:
   def __init__(self):
     in_sh = 784
@@ -39,108 +44,112 @@ class LinearDisc:
   def forward(self, x, train=True):
     x = x.dot(self.l1).leakyrelu(0.2)
     if train:
-        x = x.dropout(0.3)
+      x = x.dropout(0.3)
     x = x.dot(self.l2).leakyrelu(0.2)
     if train:
-        x = x.dropout(0.3)
+      x = x.dropout(0.3)
     x = x.dot(self.l3).leakyrelu(0.2)
     if train:
-        x = x.dropout(0.3)
+      x = x.dropout(0.3)
     x = x.dot(self.l4).logsoftmax()
     return x

+
 if __name__ == "__main__":
-    generator = LinearGen()
-    discriminator = LinearDisc()
-    batch_size = 512
-    k = 1
-    epochs = 300
-    generator_params = get_parameters(generator)
-    discriminator_params = get_parameters(discriminator)
-    gen_loss = []
-    disc_loss = []
-    output_folder = "outputs"
-    os.makedirs(output_folder, exist_ok=True)
-    train_data_size = len(X_train)
-    ds_noise = Tensor(np.random.randn(64,128).astype(np.float32), gpu=GPU, requires_grad=False)
-    n_steps = int(train_data_size/batch_size)
-    if GPU:
-        [x.cuda_() for x in generator_params+discriminator_params]
-    # optimizers
-    optim_g = optim.Adam(generator_params,lr=0.0002, b1=0.5) # 0.0002 for equilibrium!
-    optim_d = optim.Adam(discriminator_params,lr=0.0002, b1=0.5)
+  generator = LinearGen()
+  discriminator = LinearDisc()
+  batch_size = 512
+  k = 1
+  epochs = 300
+  generator_params = get_parameters(generator)
+  discriminator_params = get_parameters(discriminator)
+  gen_loss = []
+  disc_loss = []
+  output_folder = "outputs"
+  os.makedirs(output_folder, exist_ok=True)
+  train_data_size = len(X_train)
+  ds_noise = Tensor(
+      np.random.randn(64, 128).astype(np.float32), gpu=GPU, requires_grad=False
+  )
+  n_steps = int(train_data_size / batch_size)
+  if GPU:
+    [x.cuda_() for x in generator_params + discriminator_params]
+  # optimizers
+  optim_g = optim.Adam(generator_params, lr=0.0002, b1=0.5)  # 0.0002 for equilibrium!
+  optim_d = optim.Adam(discriminator_params, lr=0.0002, b1=0.5)

-    def regularization_l2(model, a=1e-4):
-        #TODO: l2 reg loss
-        pass
+  def regularization_l2(model, a=1e-4):
+    # TODO: l2 reg loss
+    pass

-    def generator_batch():
-        idx = np.random.randint(0, X_train.shape[0], size=(batch_size))
-        image_b = X_train[idx].reshape(-1, 28*28).astype(np.float32)/255.
-        image_b = (image_b - 0.5)/0.5
-        return Tensor(image_b, gpu=GPU)
+  def generator_batch():
+    idx = np.random.randint(0, X_train.shape[0], size=(batch_size))
+    image_b = X_train[idx].reshape(-1, 28 * 28).astype(np.float32) / 255.0
+    image_b = (image_b - 0.5) / 0.5
+    return Tensor(image_b, gpu=GPU)

-    def real_label(bs):
-        y = np.zeros((bs,2), np.float32)
-        y[range(bs), [1]*bs] = -2.0
-        real_labels = Tensor(y, gpu=GPU)
-        return real_labels
+  def real_label(bs):
+    y = np.zeros((bs, 2), np.float32)
+    y[range(bs), [1] * bs] = -2.0
+    real_labels = Tensor(y, gpu=GPU)
+    return real_labels

-    def fake_label(bs):
-        y = np.zeros((bs,2), np.float32)
-        y[range(bs), [0]*bs] = -2.0 # Can we do label smoothin? i.e -2.0 changed to -1.98789.
+  def fake_label(bs):
+    y = np.zeros((bs, 2), np.float32)
+    y[
+        range(bs), [0] * bs
+    ] = -2.0  # Can we do label smoothing? i.e. -2.0 changed to -1.98789.
+    fake_labels = Tensor(y, gpu=GPU)
+    return fake_labels

-    def train_discriminator(optimizer, data_real, data_fake):
-        real_labels = real_label(batch_size)
-        fake_labels = fake_label(batch_size)
+  def train_discriminator(optimizer, data_real, data_fake):
+    real_labels = real_label(batch_size)
+    fake_labels = fake_label(batch_size)

-        optimizer.zero_grad()
+    optimizer.zero_grad()

-        output_real = discriminator.forward(data_real)
-        loss_real = (output_real * real_labels).mean()
+    output_real = discriminator.forward(data_real)
+    loss_real = (output_real * real_labels).mean()

-        output_fake = discriminator.forward(data_fake)
-        loss_fake = (output_fake * fake_labels).mean()
+    output_fake = discriminator.forward(data_fake)
+    loss_fake = (output_fake * fake_labels).mean()

-        loss_real.backward()
-        loss_fake.backward()
-        optimizer.step()
-        return loss_real.cpu().data + loss_fake.cpu().data
+    loss_real.backward()
+    loss_fake.backward()
+    optimizer.step()
+    return loss_real.cpu().data + loss_fake.cpu().data

-    def train_generator(optimizer, data_fake):
-        real_labels = real_label(batch_size)
-        optimizer.zero_grad()
-        output = discriminator.forward(data_fake)
-        loss = (output * real_labels).mean()
-        loss.backward()
-        optimizer.step()
-        return loss.cpu().data
-
-    for epoch in tqdm(range(epochs)):
-        loss_g = 0.0
-        loss_d = 0.0
-        print(f"Epoch {epoch} of {epochs}")
-        for i in tqdm(range(n_steps)):
-            image = generator_batch()
-            for step in range(k): # Try with k = 5 or 7.
-                noise = Tensor(np.random.randn(batch_size,128), gpu=GPU)
-                data_fake = generator.forward(noise).detach()
-                data_real = image
-                loss_d_step = train_discriminator(optim_d, data_real, data_fake)
-                loss_d += loss_d_step
-            noise = Tensor(np.random.randn(batch_size,128), gpu=GPU)
-            data_fake = generator.forward(noise)
-            loss_g_step = train_generator(optim_g, data_fake)
-            loss_g += loss_g_step
-        fake_images = generator.forward(ds_noise).detach().cpu().data
-        fake_images = (fake_images.reshape(-1, 1, 28, 28)+ 1) / 2 # 0 - 1 range.
-        fake_images = make_grid(torch.tensor(fake_images))
-        save_image(fake_images, os.path.join(output_folder,f"image_{epoch}.jpg"))
-        epoch_loss_g = loss_g / n_steps
-        epoch_loss_d = loss_d / n_steps
-        print(f"EPOCH: Generator loss: {epoch_loss_g}, Discriminator loss: {epoch_loss_d}")
-    else:
-        print("Training Completed!")
+  def train_generator(optimizer, data_fake):
+    real_labels = real_label(batch_size)
+    optimizer.zero_grad()
+    output = discriminator.forward(data_fake)
+    loss = (output * real_labels).mean()
+    loss.backward()
+    optimizer.step()
+    return loss.cpu().data

+  for epoch in tqdm(range(epochs)):
+    loss_g = 0.0
+    loss_d = 0.0
+    print(f"Epoch {epoch} of {epochs}")
+    for i in tqdm(range(n_steps)):
+      image = generator_batch()
+      for step in range(k):  # Try with k = 5 or 7.
+        noise = Tensor(np.random.randn(batch_size, 128), gpu=GPU)
+        data_fake = generator.forward(noise).detach()
+        data_real = image
+        loss_d_step = train_discriminator(optim_d, data_real, data_fake)
+        loss_d += loss_d_step
+      noise = Tensor(np.random.randn(batch_size, 128), gpu=GPU)
+      data_fake = generator.forward(noise)
+      loss_g_step = train_generator(optim_g, data_fake)
+      loss_g += loss_g_step
+    fake_images = generator.forward(ds_noise).detach().cpu().data
+    fake_images = (fake_images.reshape(-1, 1, 28, 28) + 1) / 2  # 0 - 1 range.
+    fake_images = make_grid(torch.tensor(fake_images))
+    save_image(fake_images, os.path.join(output_folder, f"image_{epoch}.jpg"))
+    epoch_loss_g = loss_g / n_steps
+    epoch_loss_d = loss_d / n_steps
+    print(f"EPOCH: Generator loss: {epoch_loss_g}, Discriminator loss: {epoch_loss_d}")
+  else:
+    print("Training Completed!")
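A note on the -2.0 targets built by real_label/fake_label above: with logsoftmax outputs, a one-hot row holding -num_classes makes mul().mean() recover the negative log-likelihood of the labeled class, because the -num_classes factor cancels the division by num_classes inside the mean. A small numpy check of that identity:

import numpy as np

logp = np.log(np.array([[0.7, 0.3]], dtype=np.float32))  # logsoftmax-style output
y = np.zeros((1, 2), np.float32)
y[0, 1] = -2.0  # -num_classes at the true class, as in real_label/fake_label
print((logp * y).mean(), -np.log(0.3))  # both ~1.204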
diff --git a/examples/serious_mnist.py b/examples/serious_mnist.py
index 56cc432c15..b1612e5249 100644
--- a/examples/serious_mnist.py
+++ b/examples/serious_mnist.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
-#inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
+# inspired by https://github.com/Matuzas77/MNIST-0.17/blob/master/MNIST_final_solution.ipynb
 import os
 import sys
+
 sys.path.append(os.getcwd())
-sys.path.append(os.path.join(os.getcwd(), 'test'))
+sys.path.append(os.path.join(os.getcwd(), "test"))

 import numpy as np
 from tinygrad.tensor import Tensor, GPU
@@ -13,126 +14,139 @@ from test_mnist import fetch_mnist
 from extra.training import train, evaluate
 import tinygrad.optim as optim
 from extra.augment import augment_img
+
 GPU = os.getenv("GPU", None) is not None
 QUICK = os.getenv("QUICK", None) is not None
 DEBUG = os.getenv("DEBUG", None) is not None

+
 class SqueezeExciteBlock2D:
   def __init__(self, filters):
     self.filters = filters
-    self.weight1 = Tensor.uniform(self.filters, self.filters//32)
-    self.bias1 = Tensor.uniform(1,self.filters//32)
-    self.weight2 = Tensor.uniform(self.filters//32, self.filters)
+    self.weight1 = Tensor.uniform(self.filters, self.filters // 32)
+    self.bias1 = Tensor.uniform(1, self.filters // 32)
+    self.weight2 = Tensor.uniform(self.filters // 32, self.filters)
     self.bias2 = Tensor.uniform(1, self.filters)

   def __call__(self, input):
-    se = input.avg_pool2d(kernel_size=(input.shape[2], input.shape[3])) #GlobalAveragePool2D
+    se = input.avg_pool2d(
+        kernel_size=(input.shape[2], input.shape[3])
+    )  # GlobalAveragePool2D
     se = se.reshape(shape=(-1, self.filters))
     se = se.dot(self.weight1) + self.bias1
-    se = se.relu() 
+    se = se.relu()
     se = se.dot(self.weight2) + self.bias2
-    se = se.sigmoid().reshape(shape=(-1,self.filters,1,1)) #for broadcasting
+    se = se.sigmoid().reshape(shape=(-1, self.filters, 1, 1))  # for broadcasting
     se = input.mul(se)
     return se

+
 class ConvBlock:
   def __init__(self, h, w, inp, filters=128, conv=3):
     self.h, self.w = h, w
     self.inp = inp
-    #init weights
-    self.cweights = [Tensor.uniform(filters, inp if i==0 else filters, conv, conv) for i in range(3)]
+    # init weights
+    self.cweights = [
+        Tensor.uniform(filters, inp if i == 0 else filters, conv, conv) for i in range(3)
+    ]
     self.cbiases = [Tensor.uniform(1, filters, 1, 1) for i in range(3)]
-    #init layers
+    # init layers
     self._bn = BatchNorm2D(128, training=True)
     self._seb = SqueezeExciteBlock2D(filters)
-    
+
   def __call__(self, input):
-    x = input.reshape(shape=(-1, self.inp, self.w, self.h)) 
+    x = input.reshape(shape=(-1, self.inp, self.w, self.h))
     for cweight, cbias in zip(self.cweights, self.cbiases):
-      x = x.pad2d(padding=[1,1,1,1]).conv2d(cweight).add(cbias).relu()
+      x = x.pad2d(padding=[1, 1, 1, 1]).conv2d(cweight).add(cbias).relu()
     x = self._bn(x)
     x = self._seb(x)
     return x

+
 class BigConvNet:
   def __init__(self):
-    self.conv = [ConvBlock(28,28,1), ConvBlock(28,28,128), ConvBlock(14,14,128)]
-    self.weight1 = Tensor.uniform(128,10)
-    self.weight2 = Tensor.uniform(128,10)
+    self.conv = [ConvBlock(28, 28, 1), ConvBlock(28, 28, 128), ConvBlock(14, 14, 128)]
+    self.weight1 = Tensor.uniform(128, 10)
+    self.weight2 = Tensor.uniform(128, 10)

   def parameters(self):
-    if DEBUG: #keeping this for a moment
+    if DEBUG:  # keeping this for a moment
       pars = [par for par in get_parameters(self) if par.requires_grad]
       no_pars = 0
       for par in pars:
         print(par.shape)
         no_pars += np.prod(par.shape)
-      print('no of parameters', no_pars)
+      print("no of parameters", no_pars)
       return pars
     else:
       return get_parameters(self)

   def save(self, filename):
-    with open(filename+'.npy', 'wb') as f:
+    with open(filename + ".npy", "wb") as f:
       for par in get_parameters(self):
-        #if par.requires_grad:
+        # if par.requires_grad:
         np.save(f, par.cpu().data)

   def load(self, filename):
-    with open(filename+'.npy', 'rb') as f:
-      for par in get_parameters(self):
-        #if par.requires_grad:
+    with open(filename + ".npy", "rb") as f:
+      for par in get_parameters(self):
+        # if par.requires_grad:
         try:
           par.cpu().data[:] = np.load(f)
           if GPU:
             par.cuda()
         except:
-          print('Could not load parameter')
+          print("Could not load parameter")

   def forward(self, x):
     x = self.conv[0](x)
     x = self.conv[1](x)
-    x = x.avg_pool2d(kernel_size=(2,2))
+    x = x.avg_pool2d(kernel_size=(2, 2))
     x = self.conv[2](x)
-    x1 = x.avg_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
-    x2 = x.max_pool2d(kernel_size=(14,14)).reshape(shape=(-1,128)) #global
+    x1 = x.avg_pool2d(kernel_size=(14, 14)).reshape(shape=(-1, 128))  # global
+    x2 = x.max_pool2d(kernel_size=(14, 14)).reshape(shape=(-1, 128))  # global
     xo = x1.dot(self.weight1) + x2.dot(self.weight2)
     return xo.logsoftmax()

+
 if __name__ == "__main__":
   lrs = [1e-4, 1e-5] if QUICK else [1e-3, 1e-4, 1e-5, 1e-5]
-  epochss = [2, 1] if QUICK else [13, 3, 3, 1]
+  epochss = [2, 1] if QUICK else [13, 3, 3, 1]
   BS = 32
   lmbd = 0.00025
-  lossfn = lambda out,y: out.mul(y).mean() + lmbd*(model.weight1.abs() + model.weight2.abs()).sum()
+  lossfn = (
+      lambda out, y: out.mul(y).mean()
+      + lmbd * (model.weight1.abs() + model.weight2.abs()).sum()
+  )

   X_train, Y_train, X_test, Y_test = fetch_mnist()
-  steps = len(X_train)//BS
+  steps = len(X_train) // BS
   np.random.seed(1337)
   if QUICK:
     steps = 1
     X_test, Y_test = X_test[:BS], Y_test[:BS]
-  
+
   model = BigConvNet()
-  
+
   if len(sys.argv) > 1:
     try:
       model.load(sys.argv[1])
-      print('Loaded weights "'+sys.argv[1]+'", evaluating...')
+      print('Loaded weights "' + sys.argv[1] + '", evaluating...')
       evaluate(model, X_test, Y_test, BS=BS)
     except:
-      print('could not load weights "'+sys.argv[1]+'".')
-  
+      print('could not load weights "' + sys.argv[1] + '".')
+
   if GPU:
     params = get_parameters(model)
     [x.cuda_() for x in params]

   for lr, epochs in zip(lrs, epochss):
     optimizer = optim.Adam(model.parameters(), lr=lr)
-    for epoch in range(1,epochs+1):
-      #first epoch without augmentation
+    for epoch in range(1, epochs + 1):
+      # first epoch without augmentation
       X_aug = X_train if epoch == 1 else augment_img(X_train)
-      train(model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, gpu=GPU, BS=BS)
+      train(
+          model, X_aug, Y_train, optimizer, steps=steps, lossfn=lossfn, gpu=GPU, BS=BS
+      )
       accuracy = evaluate(model, X_test, Y_test, BS=BS)
-      model.save('examples/checkpoint'+str("%.0f" % (accuracy*1.0e6)))
+      model.save("examples/checkpoint" + str("%.0f" % (accuracy * 1.0e6)))
diff --git a/examples/train_efficientnet.py b/examples/train_efficientnet.py
index e806f5ef3d..2760ff0412 100644
--- a/examples/train_efficientnet.py
+++ b/examples/train_efficientnet.py
@@ -10,13 +10,14 @@ import io
 import tarfile
 import pickle

+
 class TinyConvNet:
   def __init__(self, classes=10):
     conv = 3
-    inter_chan, out_chan = 8, 16 # for speed
-    self.c1 = Tensor.uniform(inter_chan,3,conv,conv)
-    self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
-    self.l1 = Tensor.uniform(out_chan*6*6, classes)
+    inter_chan, out_chan = 8, 16  # for speed
+    self.c1 = Tensor.uniform(inter_chan, 3, conv, conv)
+    self.c2 = Tensor.uniform(out_chan, inter_chan, conv, conv)
+    self.l1 = Tensor.uniform(out_chan * 6 * 6, classes)

   def forward(self, x):
     x = x.conv2d(self.c1).relu().max_pool2d()
@@ -24,13 +25,20 @@ class TinyConvNet:
     x = x.reshape(shape=[x.shape[0], -1])
     return x.dot(self.l1).logsoftmax()

+
 def load_cifar():
-  tt = tarfile.open(fileobj=io.BytesIO(fetch('https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')), mode='r:gz')
-  db = pickle.load(tt.extractfile('cifar-10-batches-py/data_batch_1'), encoding="bytes")
-  X = db[b'data'].reshape((-1, 3, 32, 32))
-  Y = np.array(db[b'labels'])
+  tt = tarfile.open(
+      fileobj=io.BytesIO(
+          fetch("https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")
+      ),
+      mode="r:gz",
+  )
+  db = pickle.load(tt.extractfile("cifar-10-batches-py/data_batch_1"), encoding="bytes")
+  X = db[b"data"].reshape((-1, 3, 32, 32))
+  Y = np.array(db[b"labels"])
   return X, Y

+
 if __name__ == "__main__":
   X_train, Y_train = load_cifar()
   classes = 10
@@ -40,7 +48,7 @@ if __name__ == "__main__":
   TRANSFER = os.getenv("TRANSFER") is not None
   if TINY:
     model = TinyConvNet(classes)
-  elif TRANSFER: 
+  elif TRANSFER:
     model = EfficientNet(int(os.getenv("NUM", "0")), classes, has_se=True)
     model.load_weights_from_torch()
   else:
@@ -50,21 +58,21 @@ if __name__ == "__main__":
   print("parameters", len(parameters))
   optimizer = optim.Adam(parameters, lr=0.001)

-  #BS, steps = 16, 32
+  # BS, steps = 16, 32
   BS, steps = 64 if TINY else 16, 2048

-  for i in (t := trange(steps)):
+  for i in (t := trange(steps)) :
     samp = np.random.randint(0, X_train.shape[0], size=(BS))
     img = X_train[samp].astype(np.float32)

     st = time.time()
     out = model.forward(Tensor(img))
-    fp_time = (time.time()-st)*1000.0
+    fp_time = (time.time() - st) * 1000.0

     Y = Y_train[samp]
-    y = np.zeros((BS,classes), np.float32)
-    y[range(y.shape[0]),Y] = -classes
+    y = np.zeros((BS, classes), np.float32)
+    y[range(y.shape[0]), Y] = -classes
     y = Tensor(y)

     loss = out.logsoftmax().mul(y).mean()
@@ -72,25 +80,32 @@ if __name__ == "__main__":

     st = time.time()
     loss.backward()
-    bp_time = (time.time()-st)*1000.0
+    bp_time = (time.time() - st) * 1000.0

     st = time.time()
     optimizer.step()
-    opt_time = (time.time()-st)*1000.0
+    opt_time = (time.time() - st) * 1000.0

-    #print(out.cpu().data)
+    # print(out.cpu().data)

     st = time.time()
     loss = loss.cpu().data
     cat = np.argmax(out.cpu().data, axis=1)
     accuracy = (cat == Y).mean()
-    finish_time = (time.time()-st)*1000.0
+    finish_time = (time.time() - st) * 1000.0

     # printing
-    t.set_description("loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f" %
-      (loss, accuracy,
-       fp_time, bp_time, opt_time, finish_time,
-       fp_time + bp_time + opt_time + finish_time))
+    t.set_description(
+        "loss %.2f accuracy %.2f -- %.2f + %.2f + %.2f + %.2f = %.2f"
+        % (
+            loss,
+            accuracy,
+            fp_time,
+            bp_time,
+            opt_time,
+            finish_time,
+            fp_time + bp_time + opt_time + finish_time,
+        )
+    )

     del out, y, loss
-
diff --git a/examples/use_ane.py b/examples/use_ane.py
index 9c4bb5bd2c..ee78ca16ff 100755
--- a/examples/use_ane.py
+++ b/examples/use_ane.py
@@ -3,9 +3,8 @@ import numpy as np
 from tinygrad.tensor import Tensor
 import time

-a = Tensor([-2,-1,0,1,2]).ane()
+a = Tensor([-2, -1, 0, 1, 2]).ane()
 print(a.cpu())

 b = a.relu()
 print(b.cpu())

-assert(np.all(b.cpu().data >= 0))
-
+assert np.all(b.cpu().data >= 0)
diff --git a/extra/augment.py b/extra/augment.py
index c68205717c..658ac3ae69 100644
--- a/extra/augment.py
+++ b/extra/augment.py
@@ -2,39 +2,47 @@ import numpy as np
 from PIL import Image
 import os
 import sys
+
 sys.path.append(os.getcwd())
-sys.path.append(os.path.join(os.getcwd(), 'test'))
+sys.path.append(os.path.join(os.getcwd(), "test"))
 from test_mnist import fetch_mnist
 from tqdm import trange

+
 def augment_img(X, rotate=10, px=3):
   Xaug = np.zeros_like(X)
   for i in trange(len(X)):
     im = Image.fromarray(X[i])
-    im = im.rotate(np.random.randint(-rotate,rotate), resample=Image.BICUBIC)
+    im = im.rotate(np.random.randint(-rotate, rotate), resample=Image.BICUBIC)
     w, h = X.shape[1:]
-    #upper left, lower left, lower right, upper right
-    quad = np.random.randint(-px,px,size=(8)) + np.array([0,0,0,h,w,h,w,0])
+    # upper left, lower left, lower right, upper right
+    quad = np.random.randint(-px, px, size=(8)) + np.array([0, 0, 0, h, w, h, w, 0])
     im = im.transform((w, h), Image.QUAD, quad, resample=Image.BICUBIC)
     Xaug[i] = im
   return Xaug

+
 if __name__ == "__main__":
   from test_mnist import fetch_mnist
   import matplotlib.pyplot as plt
+
   X_train, Y_train, X_test, Y_test = fetch_mnist()
-  X = np.vstack([X_train[:1]]*10+[X_train[1:2]]*10)
-  fig, a = plt.subplots(2,len(X))
+  X = np.vstack([X_train[:1]] * 10 + [X_train[1:2]] * 10)
+  fig, a = plt.subplots(2, len(X))
   Xaug = augment_img(X)
   for i in range(len(X)):
-    a[0][i].imshow(X[i], cmap='gray')
-    a[1][i].imshow(Xaug[i],cmap='gray')
-    a[0][i].axis('off')
-    a[1][i].axis('off')
+    a[0][i].imshow(X[i], cmap="gray")
+    a[1][i].imshow(Xaug[i], cmap="gray")
+    a[0][i].axis("off")
+    a[1][i].axis("off")
   plt.show()

-  #create some nice gifs for doc?!
+  # create some nice gifs for doc?!
   for i in range(10):
-    im = Image.fromarray(X_train[7353+i])
-    im_aug = [Image.fromarray(x) for x in augment_img(np.array([X_train[7353+i]]*100))]
-    im.save("aug"+str(i)+".gif", save_all=True, append_images=im_aug, duration=100, loop=0)
+    im = Image.fromarray(X_train[7353 + i])
+    im_aug = [
+        Image.fromarray(x) for x in augment_img(np.array([X_train[7353 + i]] * 100))
+    ]
+    im.save(
+        "aug" + str(i) + ".gif", save_all=True, append_images=im_aug, duration=100, loop=0
+    )
diff --git a/extra/efficientnet.py b/extra/efficientnet.py
index cd32acfab6..d936aa0855 100644
--- a/extra/efficientnet.py
+++ b/extra/efficientnet.py
@@ -6,6 +6,7 @@ from extra.utils import fetch

 USE_TORCH = False

+
 def fake_torch_load(b0):
   import io
   import pickle
@@ -23,9 +24,9 @@ def fake_torch_load(b0):

   class HackTensor:
     def __new__(cls, *args):
-      #print(args)
+      # print(args)
       ident, storage_type, obj_key, location, obj_size, view_metadata = args[0]
-      assert ident == 'storage'
+      assert ident == "storage"

       ret = np.zeros(obj_size, dtype=storage_type)
       key_prelookup[obj_key] = (storage_type, obj_size, ret, args[2], args[3])
@@ -33,10 +34,10 @@ def fake_torch_load(b0):

   class MyPickle(pickle.Unpickler):
     def find_class(self, module, name):
-      #print(module, name)
-      if name == 'FloatStorage':
+      # print(module, name)
+      if name == "FloatStorage":
         return np.float32
-      if name == 'LongStorage':
+      if name == "LongStorage":
         return np.int64
       if module == "torch._utils" or module == "torch":
         return HackTensor
@@ -51,7 +52,7 @@ def fake_torch_load(b0):
   # create key_lookup
   key_lookup = pickle.load(fb0)
   key_real = [None] * len(key_lookup)
-  for k,v in key_prelookup.items():
+  for k, v in key_prelookup.items():
     key_real[key_lookup.index(k)] = v

   # read in the actual data
@@ -64,13 +65,23 @@
     np_array.shape = np_shape

     # numpy stores its strides in bytes
-    real_strides = tuple([x*bytes_size for x in np_strides])
+    real_strides = tuple([x * bytes_size for x in np_strides])
     np_array.strides = real_strides

   return ret

+
 class MBConvBlock:
-  def __init__(self, kernel_size, strides, expand_ratio, input_filters, output_filters, se_ratio, has_se):
+  def __init__(
+      self,
+      kernel_size,
+      strides,
+      expand_ratio,
+      input_filters,
+      output_filters,
+      se_ratio,
+      has_se,
+  ):
     oup = expand_ratio * input_filters
     if expand_ratio != 1:
       self._expand_conv = Tensor.uniform(oup, input_filters, 1, 1)
@@ -79,10 +90,10 @@ class MBConvBlock:
       self._expand_conv = None

     self.strides = strides
-    if strides == (2,2):
-      self.pad = [(kernel_size-1)//2-1, (kernel_size-1)//2]*2
+    if strides == (2, 2):
+      self.pad = [(kernel_size - 1) // 2 - 1, (kernel_size - 1) // 2] * 2
     else:
-      self.pad = [(kernel_size-1)//2]*4
+      self.pad = [(kernel_size - 1) // 2] * 4

     self._depthwise_conv = Tensor.uniform(oup, 1, kernel_size, kernel_size)
     self._bn1 = BatchNorm2D(oup)
@@ -103,14 +114,22 @@ class MBConvBlock:
     if self._expand_conv:
       x = self._bn0(x.conv2d(self._expand_conv)).swish()
     x = x.pad2d(padding=self.pad)
-    x = x.conv2d(self._depthwise_conv, stride=self.strides, groups=self._depthwise_conv.shape[0])
+    x = x.conv2d(
+        self._depthwise_conv, stride=self.strides, groups=self._depthwise_conv.shape[0]
+    )
     x = self._bn1(x).swish()

     # has_se
     if self.has_se:
       x_squeezed = x.avg_pool2d(kernel_size=x.shape[2:4])
-      x_squeezed = x_squeezed.conv2d(self._se_reduce).add(self._se_reduce_bias.reshape(shape=[1, -1, 1, 1])).swish()
-      x_squeezed = x_squeezed.conv2d(self._se_expand).add(self._se_expand_bias.reshape(shape=[1, -1, 1, 1]))
+      x_squeezed = (
+          x_squeezed.conv2d(self._se_reduce)
+          .add(self._se_reduce_bias.reshape(shape=[1, -1, 1, 1]))
+          .swish()
+      )
+      x_squeezed = x_squeezed.conv2d(self._se_expand).add(
+          self._se_expand_bias.reshape(shape=[1, -1, 1, 1])
+      )
       x = x.mul(x_squeezed.sigmoid())

     x = self._bn2(x.conv2d(self._project_conv))
@@ -118,21 +137,22 @@ class MBConvBlock:
       x = x.add(inputs)
     return x

+
 class EfficientNet:
   def __init__(self, number=0, classes=1000, has_se=True):
     self.number = number
     global_params = [
       # width, depth
-      (1.0, 1.0), # b0
-      (1.0, 1.1), # b1
-      (1.1, 1.2), # b2
-      (1.2, 1.4), # b3
-      (1.4, 1.8), # b4
-      (1.6, 2.2), # b5
-      (1.8, 2.6), # b6
-      (2.0, 3.1), # b7
-      (2.2, 3.6), # b8
-      (4.3, 5.3), # l2
+      (1.0, 1.0),  # b0
+      (1.0, 1.1),  # b1
+      (1.1, 1.2),  # b2
+      (1.2, 1.4),  # b3
+      (1.4, 1.8),  # b4
+      (1.6, 2.2),  # b5
+      (1.8, 2.6),  # b6
+      (2.0, 3.1),  # b7
+      (2.2, 3.6),  # b8
+      (4.3, 5.3),  # l2
     ][number]

     def round_filters(filters):
@@ -140,7 +160,7 @@ class EfficientNet:
       divisor = 8
       filters *= multiplier
       new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
-      if new_filters < 0.9 * filters: # prevent rounding by more than 10%
+      if new_filters < 0.9 * filters:  # prevent rounding by more than 10%
         new_filters += divisor
       return int(new_filters)

@@ -151,13 +171,13 @@ class EfficientNet:
     self._conv_stem = Tensor.uniform(out_channels, 3, 3, 3)
     self._bn0 = BatchNorm2D(out_channels)
     blocks_args = [
-      [1, 3, (1,1), 1, 32, 16, 0.25],
-      [2, 3, (2,2), 6, 16, 24, 0.25],
-      [2, 5, (2,2), 6, 24, 40, 0.25],
-      [3, 3, (2,2), 6, 40, 80, 0.25],
-      [3, 5, (1,1), 6, 80, 112, 0.25],
-      [4, 5, (2,2), 6, 112, 192, 0.25],
-      [1, 3, (1,1), 6, 192, 320, 0.25],
+      [1, 3, (1, 1), 1, 32, 16, 0.25],
+      [2, 3, (2, 2), 6, 16, 24, 0.25],
+      [2, 5, (2, 2), 6, 24, 40, 0.25],
+      [3, 3, (2, 2), 6, 40, 80, 0.25],
+      [3, 5, (1, 1), 6, 80, 112, 0.25],
+      [4, 5, (2, 2), 6, 112, 192, 0.25],
+      [1, 3, (1, 1), 6, 192, 320, 0.25],
     ]
     self._blocks = []
     # num_repeats, kernel_size, strides, expand_ratio, input_filters, output_filters, se_ratio
@@ -168,7 +188,7 @@ class EfficientNet:
       for n in range(round_repeats(b[0])):
         self._blocks.append(MBConvBlock(*args, has_se=has_se))
         args[3] = args[4]
-        args[1] = (1,1)
+        args[1] = (1, 1)

     in_channels = round_filters(320)
     out_channels = round_filters(1280)
@@ -178,43 +198,52 @@ class EfficientNet:
     self._fc_bias = Tensor.zeros(classes)

   def forward(self, x):
-    x = x.pad2d(padding=(0,1,0,1))
+    x = x.pad2d(padding=(0, 1, 0, 1))
     x = self._bn0(x.conv2d(self._conv_stem, stride=2)).swish()
-    #print(x.shape, x.data[:, 0, 0, 0])
+    # print(x.shape, x.data[:, 0, 0, 0])
     for block in self._blocks:
       x = block(x)
     x = self._bn1(x.conv2d(self._conv_head)).swish()
     x = x.avg_pool2d(kernel_size=x.shape[2:4])
     x = x.reshape(shape=(-1, x.shape[1]))
-    #x = x.dropout(0.2)
-    return x.dot(self._fc).add(self._fc_bias.reshape(shape=[1,-1]))
+    # x = x.dropout(0.2)
+    return x.dot(self._fc).add(self._fc_bias.reshape(shape=[1, -1]))

   def load_weights_from_torch(self):
     # load b0
     # https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/utils.py#L551
     if self.number == 0:
-      b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth")
+      b0 = fetch(
+          "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth"
+      )
     elif self.number == 2:
-      b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth")
+      b0 = fetch(
+          "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth"
+      )
     elif self.number == 4:
-      b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth")
+      b0 = fetch(
+          "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth"
+      )
     elif self.number == 7:
-      b0 = fetch("https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth")
+      b0 = fetch(
+          "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth"
+      )
     else:
       raise Exception("no pretrained weights")

     if USE_TORCH:
       import io
       import torch
+
       b0 = torch.load(io.BytesIO(b0))
     else:
       b0 = fake_torch_load(b0)

-    for k,v in b0.items():
-      if '_blocks.' in k:
+    for k, v in b0.items():
+      if "_blocks." in k:
         k = "%s[%s].%s" % tuple(k.split(".", 2))
-      mk = "self."+k
-      #print(k, v.shape)
+      mk = "self." + k
+      # print(k, v.shape)
       try:
         mv = eval(mk)
       except AttributeError:
@@ -223,10 +252,9 @@ class EfficientNet:
           mv = eval(mk.replace(".bias", "_bias"))
       vnp = v.numpy().astype(np.float32) if USE_TORCH else v
-      vnp = vnp if k != '_fc.weight' else vnp.T
+      vnp = vnp if k != "_fc.weight" else vnp.T

       if mv.shape == vnp.shape or vnp.shape == ():
         mv.data[:] = vnp
       else:
         print("MISMATCH SHAPE IN %s, %r %r" % (k, mv.shape, vnp.shape))
-
diff --git a/extra/gradcheck.py b/extra/gradcheck.py
index 4ebf04db50..603ac0f2b5 100644
--- a/extra/gradcheck.py
+++ b/extra/gradcheck.py
@@ -1,17 +1,19 @@
 import numpy as np
 from tinygrad.tensor import Tensor

-def mask_like(like, mask_inx, mask_value = 1.0):
+
+def mask_like(like, mask_inx, mask_value=1.0):
   mask = np.zeros_like(like).reshape(-1)
   mask[mask_inx] = mask_value
   return mask.reshape(like.shape)

+
 def jacobian(func, input):
   output = func(input)

   ji = input.data.reshape(-1).shape[-1]
   jo = output.data.reshape(-1).shape[-1]
-  J = np.zeros((jo,ji), dtype=np.float32)
+  J = np.zeros((jo, ji), dtype=np.float32)

   for o in range(jo):
     input.grad = None
@@ -19,14 +21,15 @@ def jacobian(func, input):

     # tinygrad doesn't support slicing, tiny-hack to select
     # the needed scalar and backpropagate only through it
-    o_scalar = Tensor(mask_like(output.data, o, 1.)).mul(output).sum()
+    o_scalar = Tensor(mask_like(output.data, o, 1.0)).mul(output).sum()
     o_scalar.backward()

     for i, grad in enumerate(input.grad.data.reshape(-1)):
-      J[o,i] = grad
+      J[o, i] = grad
   return J

-def numerical_jacobian(func, input, eps = 1e-6):
+
+def numerical_jacobian(func, input, eps=1e-6):
   output = func(input)

   ji = input.data.reshape(-1).shape[-1]
@@ -34,17 +37,18 @@ def numerical_jacobian(func, input, eps = 1e-6):
   NJ = np.zeros((jo, ji), dtype=np.float32)

   for i in range(ji):
-    eps_perturb = mask_like(input.data, i, mask_value = eps)
+    eps_perturb = mask_like(input.data, i, mask_value=eps)

     output_perturb_add = func(Tensor(input.data + eps_perturb)).data.reshape(-1)
     output_perturb_sub = func(Tensor(input.data - eps_perturb)).data.reshape(-1)

-    grad_approx = ((output_perturb_add) - (output_perturb_sub)) / (2*eps)
+    grad_approx = ((output_perturb_add) - (output_perturb_sub)) / (2 * eps)

-    NJ[:,i] = grad_approx
+    NJ[:, i] = grad_approx
   return NJ

-def gradcheck(func, input, eps = 1e-06, atol = 1e-5, rtol = 0.001):
+
+def gradcheck(func, input, eps=1e-06, atol=1e-5, rtol=0.001):
   NJ = numerical_jacobian(func, input, eps)
   J = jacobian(func, input)
   return np.allclose(J, NJ, atol=atol, rtol=rtol)
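For reference, a minimal sketch of how gradcheck is typically invoked; the function and shapes are illustrative (mirroring the pattern used in tinygrad's tests), not part of the patch:

import numpy as np
from tinygrad.tensor import Tensor
from extra.gradcheck import gradcheck

np.random.seed(1337)
x = Tensor(np.random.randn(1, 10).astype(np.float32))
W = Tensor(np.random.randn(10, 5).astype(np.float32))
func = lambda t: t.dot(W).relu().logsoftmax()
print(gradcheck(func, x))  # True when backward() matches the numerical jacobian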
diff --git a/extra/training.py b/extra/training.py
index 24fc84acb4..8e00b8ffb4 100644
--- a/extra/training.py
+++ b/extra/training.py
@@ -4,26 +4,40 @@ from tqdm import trange
 from extra.utils import get_parameters
 from tinygrad.tensor import Tensor, GPU, Device

-def train(model, X_train, Y_train, optim, steps, num_classes=None, BS=128, device=Device.CPU, lossfn = lambda out,y: out.mul(y).mean()):
-  if device == Device.GPU: [x.gpu_() for x in get_parameters([model, optim])]
-  elif device == Device.ANE: [x.ane_() for x in get_parameters([model, optim])]
-  if num_classes is None: num_classes = Y_train.max().astype(int)+1
+
+def train(
+    model,
+    X_train,
+    Y_train,
+    optim,
+    steps,
+    num_classes=None,
+    BS=128,
+    device=Device.CPU,
+    lossfn=lambda out, y: out.mul(y).mean(),
+):
+  if device == Device.GPU:
+    [x.gpu_() for x in get_parameters([model, optim])]
+  elif device == Device.ANE:
+    [x.ane_() for x in get_parameters([model, optim])]
+  if num_classes is None:
+    num_classes = Y_train.max().astype(int) + 1
   losses, accuracies = [], []
-  for i in (t := trange(steps, disable=os.getenv('CI') is not None)):
+  for i in (t := trange(steps, disable=os.getenv("CI") is not None)) :
     samp = np.random.randint(0, X_train.shape[0], size=(BS))

-    x = Tensor(X_train[samp].reshape((-1, 28*28)).astype(np.float32), device=device)
+    x = Tensor(X_train[samp].reshape((-1, 28 * 28)).astype(np.float32), device=device)
     Y = Y_train[samp]
-    y = np.zeros((len(samp),num_classes), np.float32)
+    y = np.zeros((len(samp), num_classes), np.float32)
     # correct loss for NLL, torch NLL loss returns one per row
-    y[range(y.shape[0]),Y] = -1.0*num_classes
+    y[range(y.shape[0]), Y] = -1.0 * num_classes
     y = Tensor(y, device=device)

     # network
     out = model.forward(x)

     # NLL loss function
-    loss = lossfn(out, y) 
+    loss = lossfn(out, y)
     optim.zero_grad()
     loss.backward()
     optim.step()
@@ -37,15 +51,26 @@ def train(model, X_train, Y_train, optim, steps, num_classes=None, BS=128, devic
     accuracies.append(accuracy)
     t.set_description("loss %.2f accuracy %.2f" % (loss, accuracy))

+
 def evaluate(model, X_test, Y_test, num_classes=None, device=Device.CPU, BS=128):
   def numpy_eval(num_classes):
-    Y_test_preds_out = np.zeros((len(Y_test),num_classes))
-    for i in trange(len(Y_test)//BS, disable=os.getenv('CI') is not None):
-      Y_test_preds_out[i*BS:(i+1)*BS] = model.forward(Tensor(X_test[i*BS:(i+1)*BS].reshape((-1, 28*28)).astype(np.float32), device=device)).cpu().data
+    Y_test_preds_out = np.zeros((len(Y_test), num_classes))
+    for i in trange(len(Y_test) // BS, disable=os.getenv("CI") is not None):
+      Y_test_preds_out[i * BS : (i + 1) * BS] = (
+          model.forward(
+              Tensor(
+                  X_test[i * BS : (i + 1) * BS].reshape((-1, 28 * 28)).astype(np.float32),
+                  device=device,
+              )
+          )
+          .cpu()
+          .data
+      )
     Y_test_preds = np.argmax(Y_test_preds_out, axis=1)
     return (Y_test == Y_test_preds).mean()

-  if num_classes is None: num_classes = Y_test.max().astype(int)+1
+  if num_classes is None:
+    num_classes = Y_test.max().astype(int) + 1
   accuracy = numpy_eval(num_classes)
   print("test set accuracy is %f" % accuracy)
-  return accuracy
+  return accuracy
diff --git a/extra/utils.py b/extra/utils.py
index 01d20b05e8..e8a822bb0e 100644
--- a/extra/utils.py
+++ b/extra/utils.py
@@ -1,19 +1,22 @@
 from tinygrad.tensor import Tensor

+
 def fetch(url):
   import requests, os, hashlib, tempfile
-  fp = os.path.join(tempfile.gettempdir(), hashlib.md5(url.encode('utf-8')).hexdigest())
+
+  fp = os.path.join(tempfile.gettempdir(), hashlib.md5(url.encode("utf-8")).hexdigest())
   if os.path.isfile(fp) and os.stat(fp).st_size > 0:
     with open(fp, "rb") as f:
       dat = f.read()
   else:
     print("fetching %s" % url)
     dat = requests.get(url).content
-    with open(fp+".tmp", "wb") as f:
+    with open(fp + ".tmp", "wb") as f:
       f.write(dat)
-    os.rename(fp+".tmp", fp)
+    os.rename(fp + ".tmp", fp)
   return dat

+
 def get_parameters(obj):
   parameters = []
   if isinstance(obj, Tensor):
@@ -21,8 +24,7 @@ def get_parameters(obj):
   elif isinstance(obj, list):
     for x in obj:
       parameters.extend(get_parameters(x))
-  elif hasattr(obj, '__dict__'):
-    for k,v in obj.__dict__.items():
+  elif hasattr(obj, "__dict__"):
+    for k, v in obj.__dict__.items():
       parameters.extend(get_parameters(v))
   return parameters
-
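fetch above caches each download in the system temp directory under the md5 of the URL, so repeated calls are served from disk. A small illustrative use (the CIFAR URL is the one train_efficientnet.py fetches):

from extra.utils import fetch

dat = fetch("https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")
print(len(dat))  # a second call returns the cached bytes without re-downloading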
encoding="utf-8") as f: long_description = f.read() -setup(name='tinygrad', - version='0.3.0', - description='You like pytorch? You like micrograd? You love tinygrad! heart', - author='George Hotz', - license='MIT', - long_description=long_description, - long_description_content_type='text/markdown', - packages = ['tinygrad'], - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License" - ], - install_requires=['numpy', 'requests'], - python_requires='>=3.8', - extras_require={ - 'gpu': ["pyopencl", "six"], - 'testing': [ - "pytest", - "torch", - "tqdm", - ], - }, - include_package_data=True) +setup( + name="tinygrad", + version="0.3.0", + description="You like pytorch? You like micrograd? You love tinygrad! heart", + author="George Hotz", + license="MIT", + long_description=long_description, + long_description_content_type="text/markdown", + packages=["tinygrad"], + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + ], + install_requires=["numpy", "requests"], + python_requires=">=3.8", + extras_require={ + "gpu": ["pyopencl", "six"], + "testing": [ + "pytest", + "torch", + "tqdm", + ], + }, + include_package_data=True, +) diff --git a/test/config.py b/test/config.py index ab20e8b39e..990bbebef7 100644 --- a/test/config.py +++ b/test/config.py @@ -1,3 +1,3 @@ import os -ANE = os.environ.get('ANE', False) +ANE = os.environ.get("ANE", False) diff --git a/test/test_gc.py b/test/test_gc.py index 2dc007d34b..2e7d84df42 100644 --- a/test/test_gc.py +++ b/test/test_gc.py @@ -4,43 +4,48 @@ import unittest from tinygrad.tensor import Tensor, GPU, Device from .config import ANE + def tensors_allocated(): return sum([isinstance(x, Tensor) for x in gc.get_objects()]) - + + class TestGC(unittest.TestCase): device = Device.CPU def test_gc(self): - a = Tensor.zeros(4,4, device=self.device) - b = Tensor.zeros(4,4, device=self.device) - (a*b).mean().backward() - assert(tensors_allocated() > 0) - del a,b - assert(tensors_allocated() == 0) + a = Tensor.zeros(4, 4, device=self.device) + b = Tensor.zeros(4, 4, device=self.device) + (a * b).mean().backward() + assert tensors_allocated() > 0 + del a, b + assert tensors_allocated() == 0 def test_gc_complex(self): - a = Tensor.zeros(4,4, device=self.device) - b = Tensor.zeros(4,4, device=self.device) - assert(tensors_allocated() == 2) - (a*b).mean().backward() - assert(tensors_allocated() == 4) + a = Tensor.zeros(4, 4, device=self.device) + b = Tensor.zeros(4, 4, device=self.device) + assert tensors_allocated() == 2 + (a * b).mean().backward() + assert tensors_allocated() == 4 del b - assert(tensors_allocated() == 2) - b = Tensor.zeros(4,4, device=self.device) + assert tensors_allocated() == 2 + b = Tensor.zeros(4, 4, device=self.device) print(tensors_allocated()) - (a*b).mean().backward() + (a * b).mean().backward() print(tensors_allocated()) - assert(tensors_allocated() == 4) + assert tensors_allocated() == 4 del b - assert(tensors_allocated() == 2) + assert tensors_allocated() == 2 + @unittest.skipUnless(GPU, "Requires GPU") class TestGCGPU(TestGC): - device = Device.GPU + device = Device.GPU + @unittest.skipUnless(ANE, "Requires ANE") class TestGCANE(TestGC): - device=Device.ANE + device = Device.ANE -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/test/test_mnist.py b/test/test_mnist.py index 8edae37f4d..35ed13c50e 100644 --- a/test/test_mnist.py +++ b/test/test_mnist.py @@ -11,19 +11,28 @@ from .config import ANE # mnist loader def 
diff --git a/test/test_mnist.py b/test/test_mnist.py
index 8edae37f4d..35ed13c50e 100644
--- a/test/test_mnist.py
+++ b/test/test_mnist.py
@@ -11,19 +11,28 @@ from .config import ANE

 # mnist loader
 def fetch_mnist():
   import gzip
+
   parse = lambda dat: np.frombuffer(gzip.decompress(dat), dtype=np.uint8).copy()
-  X_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))[0x10:].reshape((-1, 28, 28))
-  Y_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))[8:]
-  X_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"))[0x10:].reshape((-1, 28, 28))
-  Y_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"))[8:]
+  X_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))[
+      0x10:
+  ].reshape((-1, 28, 28))
+  Y_train = parse(fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))[
+      8:
+  ]
+  X_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"))[
+      0x10:
+  ].reshape((-1, 28, 28))
+  Y_test = parse(fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"))[
+      8:
+  ]
   return X_train, Y_train, X_test, Y_test

+
 # load the mnist dataset
 X_train, Y_train, X_test, Y_test = fetch_mnist()

+
 # create a model
 class TinyBobNet:
-
   def __init__(self):
     self.l1 = Tensor.uniform(784, 128)
     self.l2 = Tensor.uniform(128, 10)
@@ -34,27 +43,29 @@ class TinyBobNet:
   def forward(self, x):
     return x.dot(self.l1).relu().dot(self.l2).logsoftmax()

+
 # create a model with a conv layer
 class TinyConvNet:
   def __init__(self):
     # https://keras.io/examples/vision/mnist_convnet/
     conv = 3
-    #inter_chan, out_chan = 32, 64
-    inter_chan, out_chan = 8, 16 # for speed
-    self.c1 = Tensor.uniform(inter_chan,1,conv,conv)
-    self.c2 = Tensor.uniform(out_chan,inter_chan,conv,conv)
-    self.l1 = Tensor.uniform(out_chan*5*5, 10)
+    # inter_chan, out_chan = 32, 64
+    inter_chan, out_chan = 8, 16  # for speed
+    self.c1 = Tensor.uniform(inter_chan, 1, conv, conv)
+    self.c2 = Tensor.uniform(out_chan, inter_chan, conv, conv)
+    self.l1 = Tensor.uniform(out_chan * 5 * 5, 10)

   def parameters(self):
     return get_parameters(self)

   def forward(self, x):
-    x = x.reshape(shape=(-1, 1, 28, 28)) # hacks
+    x = x.reshape(shape=(-1, 1, 28, 28))  # hacks
     x = x.conv2d(self.c1).relu().max_pool2d()
     x = x.conv2d(self.c2).relu().max_pool2d()
     x = x.reshape(shape=[x.shape[0], -1])
     return x.dot(self.l1).logsoftmax()

+
 class TestMNIST(unittest.TestCase):
   device = Device.CPU

@@ -76,16 +87,19 @@ class TestMNIST(unittest.TestCase):
     np.random.seed(1337)
     model = TinyBobNet()
     optimizer = optim.RMSprop(model.parameters(), lr=0.0002)
-    train(model, X_train, Y_train, optimizer, steps=1000, device=self.device) 
+    train(model, X_train, Y_train, optimizer, steps=1000, device=self.device)
     assert evaluate(model, X_test, Y_test, device=self.device) > 0.95

+
 @unittest.skipUnless(GPU, "Requires GPU")
 class TestMNISTGPU(TestMNIST):
   device = Device.GPU

+
 @unittest.skipUnless(ANE, "Requires ANE")
 class TestMNISTANE(TestMNIST):
-  device=Device.ANE
+  device = Device.ANE

-if __name__ == '__main__':
+
+if __name__ == "__main__":
   unittest.main()
diff --git a/test/test_net_speed.py b/test/test_net_speed.py
index 8b1ed84a16..bcf60b2f13 100644
--- a/test/test_net_speed.py
+++ b/test/test_net_speed.py
@@ -7,21 +7,25 @@ import torch
 from tinygrad.tensor import Tensor, GPU, Device
 from .config import ANE

+
 def start_profile():
   import time
-  pr = cProfile.Profile(timer=lambda: int(time.time()*1e9), timeunit=1e-6)
+
+  pr = cProfile.Profile(timer=lambda: int(time.time() * 1e9), timeunit=1e-6)
   pr.enable()
   return pr

-def stop_profile(pr, sort='cumtime'):
+
+def stop_profile(pr, sort="cumtime"):
   pr.disable()

   ps = pstats.Stats(pr)
   ps.strip_dirs()
   ps.sort_stats(sort)
   ps.print_stats(0.2)

+
 class TestConvSpeed(unittest.TestCase):
-  device= Device.CPU
+  device = Device.CPU

   def test_mnist(self):
     # https://keras.io/examples/vision/mnist_convnet/
@@ -34,12 +38,12 @@ class TestConvSpeed(unittest.TestCase):

     conv = 3
     inter_chan, out_chan = 32, 64
-    c1 = torch.randn(inter_chan,1,conv,conv, requires_grad=True)
-    c2 = torch.randn(out_chan,inter_chan,conv,conv, requires_grad=True)
-    l1 = torch.randn(out_chan*5*5, 10, requires_grad=True)
+    c1 = torch.randn(inter_chan, 1, conv, conv, requires_grad=True)
+    c2 = torch.randn(out_chan, inter_chan, conv, conv, requires_grad=True)
+    l1 = torch.randn(out_chan * 5 * 5, 10, requires_grad=True)

     c2d = torch.nn.functional.conv2d
-    mp = torch.nn.MaxPool2d((2,2))
+    mp = torch.nn.MaxPool2d((2, 2))
     lsm = torch.nn.LogSoftmax(dim=1)

     cnt = 5
@@ -47,19 +51,19 @@ class TestConvSpeed(unittest.TestCase):
     for i in range(cnt):
       et0 = time.time()
       x = torch.randn(128, 1, 28, 28, requires_grad=True)
-      x = mp(c2d(x,c1).relu())
-      x = mp(c2d(x,c2).relu())
+      x = mp(c2d(x, c1).relu())
+      x = mp(c2d(x, c2).relu())
       x = x.reshape(x.shape[0], -1)
       out = lsm(x.matmul(l1))
       out = out.mean()
       et1 = time.time()
       out.backward()
       et2 = time.time()
-      fpt += (et1-et0)
-      bpt += (et2-et1)
+      fpt += et1 - et0
+      bpt += et2 - et1

-    fpt_baseline = (fpt*1000/cnt)
-    bpt_baseline = (bpt*1000/cnt)
+    fpt_baseline = fpt * 1000 / cnt
+    bpt_baseline = bpt * 1000 / cnt
     print("torch forward pass: %.3f ms" % fpt_baseline)
     print("torch backward pass: %.3f ms" % bpt_baseline)
@@ -71,7 +75,7 @@ class TestConvSpeed(unittest.TestCase):

     cnt = 5
     fpt, bpt = 0.0, 0.0
-    for i in range(1+cnt):
+    for i in range(1 + cnt):
       et0 = time.time()
       x = Tensor.randn(128, 1, 28, 28, device=self.device)
       x = x.conv2d(c1).relu().avg_pool2d()
@@ -85,24 +89,31 @@ class TestConvSpeed(unittest.TestCase):
       if i == 0:
         pr = start_profile()
       else:
-        fpt += (et1-et0)
-        bpt += (et2-et1)
+        fpt += et1 - et0
+        bpt += et2 - et1
+
+    stop_profile(pr, sort="time")
+    fpt = fpt * 1000 / cnt
+    bpt = bpt * 1000 / cnt
+    print(
+        "forward pass: %.3f ms, %.2fx off baseline %.3f ms"
+        % (fpt, fpt / fpt_baseline, fpt_baseline)
+    )
+    print(
+        "backward pass: %.3f ms, %.2fx off baseline %.3f ms"
+        % (bpt, bpt / bpt_baseline, bpt_baseline)
+    )

-    stop_profile(pr, sort='time')
-    fpt = (fpt*1000/cnt)
-    bpt = (bpt*1000/cnt)
-    print("forward pass: %.3f ms, %.2fx off baseline %.3f ms" % (fpt, fpt/fpt_baseline, fpt_baseline))
-    print("backward pass: %.3f ms, %.2fx off baseline %.3f ms" % (bpt, bpt/bpt_baseline, bpt_baseline))

 @unittest.skipUnless(GPU, "Requires GPU")
 class TestConvSpeedGPU(TestConvSpeed):
   device = Device.GPU

+
 @unittest.skipUnless(ANE, "Requires ANE")
 class TestConvSpeedANE(TestConvSpeed):
-  device=Device.ANE
+  device = Device.ANE

-if __name__ == '__main__':
+
+if __name__ == "__main__":
   unittest.main()
-
diff --git a/test/test_nn.py b/test/test_nn.py
index ba00c7340b..6273ba479f 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -7,6 +7,7 @@ from extra.utils import get_parameters
 import torch
 from .config import ANE

+
 class TestNN(unittest.TestCase):
   device = Device.CPU

@@ -30,8 +31,12 @@ class TestNN(unittest.TestCase):
     tbn.running_mean[:] = torch.tensor(bn.running_mean.data)
     tbn.running_var[:] = torch.tensor(bn.running_var.data)

-    np.testing.assert_allclose(bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5)
-    np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5)
+    np.testing.assert_allclose(
+        bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5
tbn.running_mean.detach().numpy(), rtol=1e-5 + ) + np.testing.assert_allclose( + bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5 + ) # trial inn = Tensor.randn(2, sz, 3, 3, device=self.device) @@ -45,35 +50,42 @@ class TestNN(unittest.TestCase): # close np.testing.assert_allclose(outt.data, toutt.detach().numpy(), rtol=5e-5) - np.testing.assert_allclose(bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5) + np.testing.assert_allclose( + bn.running_mean.data, tbn.running_mean.detach().numpy(), rtol=1e-5 + ) # TODO: this is failing - #np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5) + # np.testing.assert_allclose(bn.running_var.data, tbn.running_var.detach().numpy(), rtol=1e-5) def test_batchnorm2d_training(self): self.test_batchnorm2d(True) + @unittest.skipUnless(GPU, "Requires GPU") class TestNNGPU(TestNN): device = Device.GPU @unittest.skip("Tests not added") - def test_batchnorm2d(self): pass + def test_batchnorm2d(self): + pass @unittest.skip("Tests not added") - def test_batchnorm2d_training(self): pass + def test_batchnorm2d_training(self): + pass @unittest.skipUnless(ANE, "Requires ANE") class TestNNANE(TestNN): - device=Device.ANE + device = Device.ANE @unittest.skip("Tests not added") - def test_batchnorm2d(self): pass + def test_batchnorm2d(self): + pass @unittest.skip("Tests not added") - def test_batchnorm2d_training(self): pass + def test_batchnorm2d_training(self): + pass -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/test/test_ops.py b/test/test_ops.py index 4e283f87e2..afaffa5125 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -7,13 +7,24 @@ import functools from tinygrad.tensor import Tensor, GPU, Device from .config import ANE -def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=0, rtol=1e-6, grad_atol=0, grad_rtol=1e-6, device=Device.CPU, forward_only=False): + +def helper_test_op( + shps, + torch_fxn, + tinygrad_fxn, + atol=0, + rtol=1e-6, + grad_atol=0, + grad_rtol=1e-6, + device=Device.CPU, + forward_only=False, +): torch.manual_seed(0) ts = [torch.rand(x, requires_grad=True) for x in shps] tst = [Tensor(x.detach().numpy()) for x in ts] - if device==Device.GPU: + if device == Device.GPU: tst = [x.gpu() for x in tst] - elif device==Device.ANE: + elif device == Device.ANE: tst = [x.ane() for x in tst] out = torch_fxn(*ts) @@ -26,142 +37,291 @@ def helper_test_op(shps, torch_fxn, tinygrad_fxn, atol=0, rtol=1e-6, grad_atol=0 ret.mean().backward() for t, tt in zip(ts, tst): - np.testing.assert_allclose(t.grad, tt.cpu().grad.data, atol=grad_atol, rtol=grad_rtol) + np.testing.assert_allclose( + t.grad, tt.cpu().grad.data, atol=grad_atol, rtol=grad_rtol + ) # speed - torch_fp = timeit.Timer(functools.partial(torch_fxn, *ts)).timeit(5) * 1000/5 - tinygrad_fp = timeit.Timer(functools.partial(tinygrad_fxn, *tst)).timeit(5) * 1000/5 + torch_fp = timeit.Timer(functools.partial(torch_fxn, *ts)).timeit(5) * 1000 / 5 + tinygrad_fp = timeit.Timer(functools.partial(tinygrad_fxn, *tst)).timeit(5) * 1000 / 5 if not forward_only: - torch_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), torch_fxn, ts)).timeit(5) * 1000/5 - tinygrad_fbp = timeit.Timer(functools.partial(lambda f,x: f(*x).mean().backward(), tinygrad_fxn, tst)).timeit(5) * 1000/5 + torch_fbp = ( + timeit.Timer( + functools.partial(lambda f, x: f(*x).mean().backward(), torch_fxn, ts) + ).timeit(5) + * 1000 + / 5 + ) + tinygrad_fbp = ( + timeit.Timer( + functools.partial(lambda f, 
x: f(*x).mean().backward(), tinygrad_fxn, tst) + ).timeit(5) + * 1000 + / 5 + ) else: torch_fbp, tinygrad_fbp = np.nan, np.nan - print("testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms" % (shps, torch_fp, tinygrad_fp, torch_fbp-torch_fp, tinygrad_fbp-tinygrad_fp)) + print( + "testing %30r torch/tinygrad fp: %.2f / %.2f ms bp: %.2f / %.2f ms" + % (shps, torch_fp, tinygrad_fp, torch_fbp - torch_fp, tinygrad_fbp - tinygrad_fp) + ) + class TestOps(unittest.TestCase): - device=Device.CPU + device = Device.CPU def test_add(self): - helper_test_op([(45,65), (45,65)], lambda x,y: x+y, Tensor.add, device=self.device) + helper_test_op( + [(45, 65), (45, 65)], lambda x, y: x + y, Tensor.add, device=self.device + ) + def test_sub(self): - helper_test_op([(45,65), (45,65)], lambda x,y: x-y, Tensor.sub, device=self.device) + helper_test_op( + [(45, 65), (45, 65)], lambda x, y: x - y, Tensor.sub, device=self.device + ) + def test_mul(self): - helper_test_op([(45,65), (45,65)], lambda x,y: x*y, Tensor.mul, device=self.device) + helper_test_op( + [(45, 65), (45, 65)], lambda x, y: x * y, Tensor.mul, device=self.device + ) + def test_div(self): - helper_test_op([(45,65), (45,65)], lambda x,y: x/y, Tensor.div, device=self.device) + helper_test_op( + [(45, 65), (45, 65)], lambda x, y: x / y, Tensor.div, device=self.device + ) + def test_pow(self): - helper_test_op([(45,65), (45,65)], lambda x,y: x**y, Tensor.pow, device=self.device) + helper_test_op( + [(45, 65), (45, 65)], lambda x, y: x ** y, Tensor.pow, device=self.device + ) + def test_sqrt(self): - helper_test_op([(45,65)], lambda x: x.sqrt(), Tensor.sqrt, device=self.device) + helper_test_op([(45, 65)], lambda x: x.sqrt(), Tensor.sqrt, device=self.device) + def test_relu(self): - helper_test_op([(45,65)], lambda x: x.relu(), Tensor.relu, device=self.device) + helper_test_op([(45, 65)], lambda x: x.relu(), Tensor.relu, device=self.device) + def test_leakyrelu(self): - helper_test_op([(45,65)], lambda x: torch.nn.functional.leaky_relu(x,0.01), Tensor.leakyrelu, device=self.device) + helper_test_op( + [(45, 65)], + lambda x: torch.nn.functional.leaky_relu(x, 0.01), + Tensor.leakyrelu, + device=self.device, + ) + def test_abs(self): - helper_test_op([(45,65)], lambda x: torch.abs(x), Tensor.abs, device=self.device) + helper_test_op([(45, 65)], lambda x: torch.abs(x), Tensor.abs, device=self.device) + def test_sigmoid(self): - helper_test_op([(45,65)], lambda x: x.sigmoid(), Tensor.sigmoid, device=self.device) + helper_test_op( + [(45, 65)], lambda x: x.sigmoid(), Tensor.sigmoid, device=self.device + ) + def test_dot(self): - helper_test_op([(45,65), (65,100)], lambda x,y: x.matmul(y), Tensor.dot, device=self.device) + helper_test_op( + [(45, 65), (65, 100)], lambda x, y: x.matmul(y), Tensor.dot, device=self.device + ) + def test_sum(self): - helper_test_op([(45,3)], lambda x: x.sum(), Tensor.sum, device=self.device) + helper_test_op([(45, 3)], lambda x: x.sum(), Tensor.sum, device=self.device) + def test_sum_axis(self): - helper_test_op([(3,4,5,6)], lambda x: x.sum(axis=(1,2)), lambda x: Tensor.sum(x, axis=(1,2)), device=self.device) + helper_test_op( + [(3, 4, 5, 6)], + lambda x: x.sum(axis=(1, 2)), + lambda x: Tensor.sum(x, axis=(1, 2)), + device=self.device, + ) + def test_mean_axis(self): - helper_test_op([(3,4,5,6)], lambda x: x.mean(axis=(1,2)), lambda x: Tensor.mean(x, axis=(1,2)), device=self.device) + helper_test_op( + [(3, 4, 5, 6)], + lambda x: x.mean(axis=(1, 2)), + lambda x: Tensor.mean(x, axis=(1, 2)), + device=self.device, + 
) + def test_logsoftmax(self): - helper_test_op([(45,65)], lambda x: torch.nn.LogSoftmax(dim=1)(x), Tensor.logsoftmax, atol=1e-7, grad_atol=1e-7, device=self.device) + helper_test_op( + [(45, 65)], + lambda x: torch.nn.LogSoftmax(dim=1)(x), + Tensor.logsoftmax, + atol=1e-7, + grad_atol=1e-7, + device=self.device, + ) + def test_tanh(self): - helper_test_op([(45,65)], lambda x: x.tanh(), Tensor.tanh, atol=1e-6, grad_atol=1e-6, device=self.device) + helper_test_op( + [(45, 65)], + lambda x: x.tanh(), + Tensor.tanh, + atol=1e-6, + grad_atol=1e-6, + device=self.device, + ) + def test_topo_sort(self): - helper_test_op([(45,65)], lambda x: (x+x)*x, lambda x: x.add(x).mul(x), atol=1e-6, grad_atol=1e-6, device=self.device) + helper_test_op( + [(45, 65)], + lambda x: (x + x) * x, + lambda x: x.add(x).mul(x), + atol=1e-6, + grad_atol=1e-6, + device=self.device, + ) def test_scalar_mul(self): - helper_test_op([(45,65)], lambda x: x*2, lambda x: x*2, device=self.device) + helper_test_op([(45, 65)], lambda x: x * 2, lambda x: x * 2, device=self.device) + def test_scalar_rmul(self): - helper_test_op([(45,65)], lambda x: 2*x, lambda x: 2*x, device=self.device) + helper_test_op([(45, 65)], lambda x: 2 * x, lambda x: 2 * x, device=self.device) def test_scalar_sub(self): - helper_test_op([(45,65)], lambda x: x-2, lambda x: x-2, device=self.device) + helper_test_op([(45, 65)], lambda x: x - 2, lambda x: x - 2, device=self.device) + def test_scalar_rsub(self): - helper_test_op([(45,65)], lambda x: 2-x, lambda x: 2-x, device=self.device) + helper_test_op([(45, 65)], lambda x: 2 - x, lambda x: 2 - x, device=self.device) def test_broadcast_full(self): - for torch_op, tinygrad_op in [(torch.add, Tensor.add), (torch.sub, Tensor.sub), (torch.mul, Tensor.mul), - (torch.div, Tensor.div), (torch.pow, Tensor.pow)]: - for shapes in [((5,13,24,16), (5,1,24,1)), ((1,3,1,7,1), (2,1,5,1,8))]: + for torch_op, tinygrad_op in [ + (torch.add, Tensor.add), + (torch.sub, Tensor.sub), + (torch.mul, Tensor.mul), + (torch.div, Tensor.div), + (torch.pow, Tensor.pow), + ]: + for shapes in [ + ((5, 13, 24, 16), (5, 1, 24, 1)), + ((1, 3, 1, 7, 1), (2, 1, 5, 1, 8)), + ]: with self.subTest(op=torch_op.__name__, shapes=shapes): helper_test_op(shapes, torch_op, tinygrad_op, device=self.device) - def test_broadcast_partial(self): - for torch_op, tinygrad_op in [(torch.add, Tensor.add), (torch.sub, Tensor.sub), (torch.mul, Tensor.mul), - (torch.div, Tensor.div), (torch.pow, Tensor.pow)]: - for shapes in [((1,32,32,32), (1,32,1,1)), ((5,13,24,16,2), (1,13,24,1,1)), - ((4,1), (4,5)), ((1,4), (5,4))]: + for torch_op, tinygrad_op in [ + (torch.add, Tensor.add), + (torch.sub, Tensor.sub), + (torch.mul, Tensor.mul), + (torch.div, Tensor.div), + (torch.pow, Tensor.pow), + ]: + for shapes in [ + ((1, 32, 32, 32), (1, 32, 1, 1)), + ((5, 13, 24, 16, 2), (1, 13, 24, 1, 1)), + ((4, 1), (4, 5)), + ((1, 4), (5, 4)), + ]: with self.subTest(op=torch_op.__name__, shapes=shapes): # NOTE: ANE backwards? 
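# broadcast gradients are only checked on CPU below: forward_only is set for GPU/ANE, hence the NOTE above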
- helper_test_op(shapes, torch_op, tinygrad_op, device=self.device, forward_only=self.device!=Device.CPU) + helper_test_op( + shapes, + torch_op, + tinygrad_op, + device=self.device, + forward_only=self.device != Device.CPU, + ) def test_pad2d(self): - helper_test_op([(3,3,3,3)], lambda x: torch.nn.functional.pad(x, (1,2,3,4)), lambda x: x.pad2d(padding=(1,2,3,4)), device=self.device) + helper_test_op( + [(3, 3, 3, 3)], + lambda x: torch.nn.functional.pad(x, (1, 2, 3, 4)), + lambda x: x.pad2d(padding=(1, 2, 3, 4)), + device=self.device, + ) def test_reshape(self): - helper_test_op([(4,3,6,6)], lambda x: torch.reshape(x, (-1,3,6,6)), lambda x: x.reshape(shape=(-1,3,6,6)), device=self.device) - helper_test_op([(4,3,6,6)], lambda x: torch.reshape(x, (-1,1,6,6)), lambda x: x.reshape(shape=(-1,1,6,6)), device=self.device) + helper_test_op( + [(4, 3, 6, 6)], + lambda x: torch.reshape(x, (-1, 3, 6, 6)), + lambda x: x.reshape(shape=(-1, 3, 6, 6)), + device=self.device, + ) + helper_test_op( + [(4, 3, 6, 6)], + lambda x: torch.reshape(x, (-1, 1, 6, 6)), + lambda x: x.reshape(shape=(-1, 1, 6, 6)), + device=self.device, + ) def test_detach(self): - helper_test_op([(4,3,6,6)], lambda x: x.detach(), lambda x: x.detach(), device=self.device, forward_only=True) + helper_test_op( + [(4, 3, 6, 6)], + lambda x: x.detach(), + lambda x: x.detach(), + device=self.device, + forward_only=True, + ) def test_conv2d(self): - for bs in [1,8]: - for cin in [1,3]: - for groups in [1,3] if cin == 3 else [1]: - for H in [1,2,5]: - for W in [1,2,3,5]: - with self.subTest(batch_size=bs, channels=cin, groups=groups, height=H, width=W): - helper_test_op([(bs,cin,11,28), (6,cin//groups,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups).relu(), - lambda x,w: Tensor.conv2d(x,w,groups=groups).relu(), device=self.device, grad_rtol=1e-5) + for bs in [1, 8]: + for cin in [1, 3]: + for groups in [1, 3] if cin == 3 else [1]: + for H in [1, 2, 5]: + for W in [1, 2, 3, 5]: + with self.subTest( + batch_size=bs, channels=cin, groups=groups, height=H, width=W + ): + helper_test_op( + [(bs, cin, 11, 28), (6, cin // groups, H, W)], + lambda x, w: torch.nn.functional.conv2d(x, w, groups=groups).relu(), + lambda x, w: Tensor.conv2d(x, w, groups=groups).relu(), + device=self.device, + grad_rtol=1e-5, + ) def test_strided_conv2d(self): bs = 4 cin = 3 - H,W = 3,3 + H, W = 3, 3 with self.subTest(stride := 2): - helper_test_op([(bs,cin,11,28), (4,cin,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,stride=2).relu(), - lambda x,w: Tensor.conv2d(x,w,stride=stride).relu(), device=self.device) - with self.subTest(stride := (2,1)): - helper_test_op([(bs,cin,11,28), (4,cin,H,W)], - lambda x,w: torch.nn.functional.conv2d(x,w,stride=stride).relu(), - lambda x,w: Tensor.conv2d(x,w,stride=(2,1)).relu(), device=self.device) + helper_test_op( + [(bs, cin, 11, 28), (4, cin, H, W)], + lambda x, w: torch.nn.functional.conv2d(x, w, stride=2).relu(), + lambda x, w: Tensor.conv2d(x, w, stride=stride).relu(), + device=self.device, + ) + with self.subTest(stride := (2, 1)): + helper_test_op( + [(bs, cin, 11, 28), (4, cin, H, W)], + lambda x, w: torch.nn.functional.conv2d(x, w, stride=stride).relu(), + lambda x, w: Tensor.conv2d(x, w, stride=(2, 1)).relu(), + device=self.device, + ) def test_maxpool2d(self): - for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1)]: + for ksz in [(2, 2), (3, 3), (3, 2), (5, 5), (5, 1)]: with self.subTest(kernel_size=ksz): - helper_test_op([(32,2,110,28)], + helper_test_op( + [(32, 2, 110, 28)], lambda x: 
torch.nn.functional.max_pool2d(x, kernel_size=ksz), - lambda x: Tensor.max_pool2d(x, kernel_size=ksz), device=self.device) + lambda x: Tensor.max_pool2d(x, kernel_size=ksz), + device=self.device, + ) def test_avgpool2d(self): - shape = (32,2,111,28) - for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1), shape[2:]]: + shape = (32, 2, 111, 28) + for ksz in [(2, 2), (3, 3), (3, 2), (5, 5), (5, 1), shape[2:]]: with self.subTest(kernel_size=ksz): - helper_test_op([shape], + helper_test_op( + [shape], lambda x: torch.nn.functional.avg_pool2d(x, kernel_size=ksz), - lambda x: Tensor.avg_pool2d(x, kernel_size=ksz), device=self.device) + lambda x: Tensor.avg_pool2d(x, kernel_size=ksz), + device=self.device, + ) + @unittest.skipUnless(GPU, "Requires GPU") class TestOpsGPU(TestOps): - device=Device.GPU + device = Device.GPU + @unittest.skipUnless(ANE, "Requires ANE") class TestOpsANE(TestOps): - device=Device.ANE + device = Device.ANE -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main(verbosity=2) - diff --git a/test/test_optim.py b/test/test_optim.py index 99ddcff57c..e4111c80bd 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -6,20 +6,24 @@ from tinygrad.optim import Adam, SGD, RMSprop from extra.utils import get_parameters from .config import ANE -x_init = np.random.randn(1,3).astype(np.float32) -W_init = np.random.randn(3,3).astype(np.float32) -m_init = np.random.randn(1,3).astype(np.float32) +x_init = np.random.randn(1, 3).astype(np.float32) +W_init = np.random.randn(3, 3).astype(np.float32) +m_init = np.random.randn(1, 3).astype(np.float32) + def step_tinygrad(optim, kwargs={}, device=Device.CPU): net = TinyNet() optim = optim([net.x, net.W], **kwargs) - if device==Device.GPU: [x.gpu_() for x in get_parameters([net, optim])] - elif device==Device.ANE: [x.ane_() for x in get_parameters([net, optim])] + if device == Device.GPU: + [x.gpu_() for x in get_parameters([net, optim])] + elif device == Device.ANE: + [x.ane_() for x in get_parameters([net, optim])] out = net.forward() out.backward() optim.step() return net.x.cpu().data, net.W.cpu().data + def step_pytorch(optim, kwargs={}): net = TorchNet() optim = optim([net.x, net.W], **kwargs) @@ -29,7 +33,7 @@ def step_pytorch(optim, kwargs={}): return net.x.detach().numpy(), net.W.detach().numpy() -class TinyNet(): +class TinyNet: def __init__(self): self.x = Tensor(x_init.copy()) self.W = Tensor(W_init.copy()) @@ -42,7 +46,7 @@ class TinyNet(): return out -class TorchNet(): +class TorchNet: def __init__(self): self.x = torch.tensor(x_init.copy(), requires_grad=True) self.W = torch.tensor(W_init.copy(), requires_grad=True) @@ -59,19 +63,23 @@ class TestOptim(unittest.TestCase): device = Device.CPU def test_adam(self): - for x,y in zip(step_tinygrad(Adam, device=self.device), - step_pytorch(torch.optim.Adam)): + for x, y in zip( + step_tinygrad(Adam, device=self.device), step_pytorch(torch.optim.Adam) + ): np.testing.assert_allclose(x, y, atol=1e-4) def test_sgd(self): - for x,y in zip(step_tinygrad(SGD, kwargs={'lr': 0.001}, device=self.device), - step_pytorch(torch.optim.SGD, kwargs={'lr': 0.001})): + for x, y in zip( + step_tinygrad(SGD, kwargs={"lr": 0.001}, device=self.device), + step_pytorch(torch.optim.SGD, kwargs={"lr": 0.001}), + ): np.testing.assert_allclose(x, y, atol=1e-5) def test_rmsprop(self): - for x,y in zip(step_tinygrad(RMSprop, kwargs={'lr': 0.001, 'decay': 0.99}, device=self.device), - step_pytorch(torch.optim.RMSprop, - kwargs={'lr': 0.001, 'alpha': 0.99})): + for x, y in zip( + step_tinygrad(RMSprop, 
kwargs={"lr": 0.001, "decay": 0.99}, device=self.device), + step_pytorch(torch.optim.RMSprop, kwargs={"lr": 0.001, "alpha": 0.99}), + ): np.testing.assert_allclose(x, y, atol=1e-5) @@ -79,10 +87,11 @@ class TestOptim(unittest.TestCase): class TestOptimGPU(TestOptim): device = Device.GPU + @unittest.skipUnless(ANE, "Requires ANE") class TestOptimANE(TestOptim): device = Device.ANE -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/test/test_tensor.py b/test/test_tensor.py index f54527b084..a81a10e011 100644 --- a/test/test_tensor.py +++ b/test/test_tensor.py @@ -6,11 +6,12 @@ from extra.gradcheck import numerical_jacobian, jacobian, gradcheck from .config import ANE -x_init = np.random.randn(1,3).astype(np.float32) -U_init = np.random.randn(3,3).astype(np.float32) -V_init = np.random.randn(3,3).astype(np.float32) -W_init = np.random.randn(3,3).astype(np.float32) -m_init = np.random.randn(1,3).astype(np.float32) +x_init = np.random.randn(1, 3).astype(np.float32) +U_init = np.random.randn(3, 3).astype(np.float32) +V_init = np.random.randn(3, 3).astype(np.float32) +W_init = np.random.randn(3, 3).astype(np.float32) +m_init = np.random.randn(1, 3).astype(np.float32) + class TestTinygrad(unittest.TestCase): device = Device.CPU @@ -36,7 +37,7 @@ class TestTinygrad(unittest.TestCase): out.backward() return out.detach().numpy(), x.grad, W.grad - for x,y in zip(test_tinygrad(), test_pytorch()): + for x, y in zip(test_tinygrad(), test_pytorch()): np.testing.assert_allclose(x, y, atol=1e-5) def test_backward_pass_diamond_model(self): @@ -64,7 +65,7 @@ class TestTinygrad(unittest.TestCase): out.backward() return out.detach().numpy(), u.grad, v.grad, w.grad - for x,y in zip(test_tinygrad(), test_pytorch()): + for x, y in zip(test_tinygrad(), test_pytorch()): np.testing.assert_allclose(x, y, atol=1e-5) def test_jacobian(self): @@ -73,7 +74,9 @@ class TestTinygrad(unittest.TestCase): torch_x = torch.tensor(x, requires_grad=True) torch_W = torch.tensor(W, requires_grad=True) - torch_func = lambda x: torch.nn.functional.log_softmax(x.matmul(torch_W).relu(), dim=1) + torch_func = lambda x: torch.nn.functional.log_softmax( + x.matmul(torch_W).relu(), dim=1 + ) PJ = torch.autograd.functional.jacobian(torch_func, torch_x).squeeze().numpy() tiny_x = Tensor(x, device=self.device) @@ -82,8 +85,8 @@ class TestTinygrad(unittest.TestCase): J = jacobian(tiny_func, tiny_x) NJ = numerical_jacobian(tiny_func, tiny_x) - np.testing.assert_allclose(PJ, J, atol = 1e-5) - np.testing.assert_allclose(PJ, NJ, atol = 1e-5) + np.testing.assert_allclose(PJ, J, atol=1e-5) + np.testing.assert_allclose(PJ, NJ, atol=1e-5) def test_gradcheck(self): W = np.random.RandomState(1337).random((10, 5)) @@ -96,7 +99,7 @@ class TestTinygrad(unittest.TestCase): self.assertTrue(gradcheck(tiny_func, tiny_x)) # coarse approx. 
since a "big" eps interacts with the non-linearities of the model - self.assertFalse(gradcheck(tiny_func, tiny_x, eps = 0.1)) + self.assertFalse(gradcheck(tiny_func, tiny_x, eps=0.1)) @@ -104,14 +107,18 @@ class TestTinygradGPU(TestTinygrad): device = Device.GPU @unittest.skip("float64 not supported on GPU") - def test_jacobian(self): pass + def test_jacobian(self): + pass @unittest.skip("float64 not supported on GPU") - def test_gradcheck(self): pass + def test_gradcheck(self): + pass + @unittest.skipUnless(ANE, "Requires ANE") class TestOpsANE(TestTinygrad): - device=Device.ANE + device = Device.ANE -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/tinygrad/nn.py b/tinygrad/nn.py index 17a99d5941..0e1cc704d7 100644 --- a/tinygrad/nn.py +++ b/tinygrad/nn.py @@ -1,23 +1,38 @@ from tinygrad.tensor import Tensor + class BatchNorm2D: - def __init__(self, sz, eps=1e-5, track_running_stats=False, training=False, momentum=0.1): - self.eps, self.track_running_stats, self.training, self.momentum = eps, track_running_stats, training, momentum + def __init__( + self, sz, eps=1e-5, track_running_stats=False, training=False, momentum=0.1 + ): + self.eps, self.track_running_stats, self.training, self.momentum = ( + eps, + track_running_stats, + training, + momentum, + ) self.weight, self.bias = Tensor.ones(sz), Tensor.zeros(sz) - self.running_mean, self.running_var = Tensor.zeros(sz, requires_grad=False), Tensor.ones(sz, requires_grad=False) + self.running_mean, self.running_var = ( + Tensor.zeros(sz, requires_grad=False), + Tensor.ones(sz, requires_grad=False), + ) self.num_batches_tracked = Tensor.zeros(1, requires_grad=False) def __call__(self, x): if self.track_running_stats or self.training: - batch_mean = x.mean(axis=(0,2,3)) - y = (x - batch_mean.reshape(shape=[1, -1, 1, 1])) - batch_var = (y*y).mean(axis=(0,2,3)) + batch_mean = x.mean(axis=(0, 2, 3)) + y = x - batch_mean.reshape(shape=[1, -1, 1, 1]) + batch_var = (y * y).mean(axis=(0, 2, 3)) if self.track_running_stats: - self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean - self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var + self.running_mean = ( + 1 - self.momentum + ) * self.running_mean + self.momentum * batch_mean + self.running_var = ( + 1 - self.momentum + ) * self.running_var + self.momentum * batch_var self.num_batches_tracked += 1 if self.training: @@ -26,6 +41,9 @@ class BatchNorm2D: return self.normalize(x, self.running_mean, self.running_var) def normalize(self, x, mean, var): - x = (x - mean.reshape(shape=[1, -1, 1, 1])) * self.weight.reshape(shape=[1, -1, 1, 1]) - return x.div(var.add(self.eps).reshape(shape=[1, -1, 1, 1])**0.5) + self.bias.reshape(shape=[1, -1, 1, 1]) + x = (x - mean.reshape(shape=[1, -1, 1, 1])) * self.weight.reshape( + shape=[1, -1, 1, 1] + ) + return x.div( + var.add(self.eps).reshape(shape=[1, -1, 1, 1]) ** 0.5 + ) + self.bias.reshape(shape=[1, -1, 1, 1]) diff --git a/tinygrad/ops_ane.py b/tinygrad/ops_ane.py index ba1c9a2d66..c7bb23c09f 100644 --- a/tinygrad/ops_ane.py +++ b/tinygrad/ops_ane.py @@ -2,19 +2,23 @@ from .tensor import Tensor, Function, register from functools import lru_cache import struct + @lru_cache def compile_wrapper(ane, dat): return ane.compile(dat) + def roundup(x, v): - return x + (v-x)%v + def fill(dat, addrs, type, val, base=0x4000): x = struct.pack(type, val) for a in addrs: - dat[base+a:base+a+len(x)] = x + dat[base
+ a : base + a + len(x)] = x return dat + @lru_cache def compile_relu(ane, sz): dat = list(open("ane/ops/relu.hwx", "rb").read()) @@ -22,16 +26,18 @@ def compile_relu(ane, sz): # number of relus dat = fill(dat, [0x128, 0x13C], "H", sz) # number of engines? (max 0x100) - dat = fill(dat, [0x1ec, 0x1f0, 0x1f4, 0x1f8], "I", max(0x100, roundup(sz*2, 0x10))) + dat = fill(dat, [0x1EC, 0x1F0, 0x1F4, 0x1F8], "I", max(0x100, roundup(sz * 2, 0x10))) # strides? - dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz*2, 0x40)) + dat = fill(dat, [0x260, 0x264, 0x268], "I", roundup(sz * 2, 0x40)) return compile_wrapper(ane, bytes(dat)) + class ReLU(Function): @staticmethod def forward(ctx, input): ret = ctx.ane.tensor(input.shape) ctx.ane.run(compile_relu(ctx.ane, input.sz), input, ret) return ret -register('relu', ReLU, device=Tensor.ANE) + +register("relu", ReLU, device=Tensor.ANE) diff --git a/tinygrad/ops_cpu.py b/tinygrad/ops_cpu.py index 87b253dbc0..c3661c1df2 100644 --- a/tinygrad/ops_cpu.py +++ b/tinygrad/ops_cpu.py @@ -5,44 +5,58 @@ from .tensor import Function, register # ************* basic ops ************* def unbroadcast(out, in_sh): # adjoint operation to broadcast is sum. Need to sum all axis with 1 = in_sh[i] < out.shape[i] - sum_axis = tuple([i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1]) if in_sh != (1,) else None + sum_axis = ( + tuple([i for i in range(len(in_sh)) if in_sh[i] == 1 and out.shape[i] > 1]) + if in_sh != (1,) + else None + ) return out.sum(axis=sum_axis).reshape(in_sh) + class Add(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - return x+y + return x + y @staticmethod def backward(ctx, grad_output): shape_x, shape_y = ctx.saved_tensors return unbroadcast(grad_output, shape_x), unbroadcast(grad_output, shape_y) -register('add', Add) + + +register("add", Add) + class Sub(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - return x-y + return x - y @staticmethod def backward(ctx, grad_output): shape_x, shape_y = ctx.saved_tensors return unbroadcast(grad_output, shape_x), unbroadcast(-grad_output, shape_y) -register('sub', Sub) + + +register("sub", Sub) + class Mul(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x, y) - return x*y + return x * y @staticmethod def backward(ctx, grad_output): - x,y = ctx.saved_tensors - return unbroadcast(y*grad_output, x.shape), unbroadcast(x*grad_output, y.shape) -register('mul', Mul) + x, y = ctx.saved_tensors + return unbroadcast(y * grad_output, x.shape), unbroadcast(x * grad_output, y.shape) + + +register("mul", Mul) + class Pow(Function): @staticmethod @@ -52,27 +66,37 @@ class Pow(Function): @staticmethod def backward(ctx, grad_output): - x,y = ctx.saved_tensors - return unbroadcast(y * (x**(y-1.0)) * grad_output, x.shape), \ - unbroadcast((x**y) * np.log(x) * grad_output, y.shape) -register('pow', Pow) + x, y = ctx.saved_tensors + return unbroadcast(y * (x ** (y - 1.0)) * grad_output, x.shape), unbroadcast( + (x ** y) * np.log(x) * grad_output, y.shape + ) + + +register("pow", Pow) + class Sum(Function): @staticmethod - def forward(ctx, input,axis=None): + def forward(ctx, input, axis=None): ctx.save_for_backward(input, axis) return np.array([input.sum()]) if axis is None else input.sum(axis=axis) @staticmethod def backward(ctx, grad_output): input, axis = ctx.saved_tensors - shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))] + shape = [ + 1 if axis is None or i in axis 
else input.shape[i] + for i in range(len(input.shape)) + ] return grad_output.reshape(shape) + np.zeros_like(input) -register('sum', Sum) + + +register("sum", Sum) # ************* GEMM ************* + class Dot(Function): @staticmethod def forward(ctx, input, weight): @@ -85,21 +109,27 @@ class Dot(Function): grad_input = grad_output.dot(weight.T) grad_weight = input.T.dot(grad_output) return grad_input, grad_weight -register('dot', Dot) + + +register("dot", Dot) # ************* simple ops ************* + class Pad2D(Function): @staticmethod def forward(ctx, x, padding=None): ctx.save_for_backward(padding) - return np.pad(x, ((0,0), (0,0), tuple(padding[2:4]), tuple(padding[0:2]))) + return np.pad(x, ((0, 0), (0, 0), tuple(padding[2:4]), tuple(padding[0:2]))) @staticmethod def backward(ctx, grad_output): - padding, = ctx.saved_tensors - return grad_output[..., padding[2]:-padding[3], padding[0]:-padding[1]] -register('pad2d', Pad2D) + (padding,) = ctx.saved_tensors + return grad_output[..., padding[2] : -padding[3], padding[0] : -padding[1]] + + +register("pad2d", Pad2D) + class Reshape(Function): @staticmethod @@ -109,13 +139,16 @@ class Reshape(Function): @staticmethod def backward(ctx, grad_output): - in_shape, = ctx.saved_tensors + (in_shape,) = ctx.saved_tensors return grad_output.reshape(in_shape) -register('reshape', Reshape) + + +register("reshape", Reshape) # ************* activation ops ************* + class ReLU(Function): @staticmethod def forward(ctx, input): @@ -124,123 +157,145 @@ class ReLU(Function): @staticmethod def backward(ctx, grad_output): - input, = ctx.saved_tensors + (input,) = ctx.saved_tensors return grad_output * (input >= 0) -register('relu', ReLU) + + +register("relu", ReLU) + def _exp_normalize(x, axis=None): - y = np.exp(x - x.max(axis=axis, keepdims=True)) - return y / y.sum(axis=axis, keepdims=True) + y = np.exp(x - x.max(axis=axis, keepdims=True)) + return y / y.sum(axis=axis, keepdims=True) + class Sigmoid(Function): @staticmethod def forward(ctx, input): with np.warnings.catch_warnings(): - np.warnings.filterwarnings('ignore') - ret = np.where(input >= 0, - 1/(1 + np.exp(-input)), - np.exp(input)/(1 + np.exp(input)) + np.warnings.filterwarnings("ignore") + ret = np.where( + input >= 0, 1 / (1 + np.exp(-input)), np.exp(input) / (1 + np.exp(input)) ) ctx.save_for_backward(ret) return ret @staticmethod def backward(ctx, grad_output): - ret, = ctx.saved_tensors + (ret,) = ctx.saved_tensors return grad_output * (ret * (1 - ret)) -register('sigmoid', Sigmoid) + + +register("sigmoid", Sigmoid) + class LogSoftmax(Function): @staticmethod def forward(ctx, input): softmax = _exp_normalize(input, axis=1) ctx.save_for_backward(softmax) - return np.log(softmax) + return np.log(softmax) @staticmethod def backward(ctx, grad_output): - softmax, = ctx.saved_tensors - return grad_output - grad_output.sum(axis=1, keepdims=True)*softmax -register('logsoftmax', LogSoftmax) + (softmax,) = ctx.saved_tensors + return grad_output - grad_output.sum(axis=1, keepdims=True) * softmax + + +register("logsoftmax", LogSoftmax) # ************* conv ops ************* + class Conv2D(Function): @staticmethod def forward(ctx, x, w, stride=1, groups=1): if type(ctx.stride) == int: ctx.stride = (ctx.stride, ctx.stride) - cout,cin,H,W = w.shape - ys,xs = ctx.stride - bs,cin_ = x.shape[0], x.shape[1] - oy,ox = (x.shape[2]-(H-ys))//ys, (x.shape[3]-(W-xs))//xs - assert cin*ctx.groups == cin_ + cout, cin, H, W = w.shape + ys, xs = ctx.stride + bs, cin_ = x.shape[0], x.shape[1] + oy, ox = 
(x.shape[2] - (H - ys)) // ys, (x.shape[3] - (W - xs)) // xs + assert cin * ctx.groups == cin_ assert cout % ctx.groups == 0 - rcout = cout//ctx.groups + rcout = cout // ctx.groups - gx = x.reshape(bs,ctx.groups,cin,x.shape[2],x.shape[3]) - tx = np.lib.stride_tricks.as_strided(gx, + gx = x.reshape(bs, ctx.groups, cin, x.shape[2], x.shape[3]) + tx = np.lib.stride_tricks.as_strided( + gx, shape=(bs, ctx.groups, cin, oy, ox, H, W), - strides=(*gx.strides[0:3], gx.strides[3]*ys, gx.strides[4]*xs, *gx.strides[3:5]), + strides=( + *gx.strides[0:3], + gx.strides[3] * ys, + gx.strides[4] * xs, + *gx.strides[3:5], + ), writeable=False, ) tw = w.reshape(ctx.groups, rcout, cin, H, W) ctx.save_for_backward(tx, tw, x.shape) - ret = np.zeros((bs,ctx.groups,oy,ox,rcout),dtype=x.dtype) + ret = np.zeros((bs, ctx.groups, oy, ox, rcout), dtype=x.dtype) for g in range(ctx.groups): - #ijYXyx,kjyx -> iYXk ->ikYX - ret[:,g] += np.tensordot(tx[:,g], tw[g], ((1,4,5),(1,2,3))) - return np.moveaxis(ret,4,2).reshape(bs, cout, oy, ox) + # ijYXyx,kjyx -> iYXk ->ikYX + ret[:, g] += np.tensordot(tx[:, g], tw[g], ((1, 4, 5), (1, 2, 3))) + return np.moveaxis(ret, 4, 2).reshape(bs, cout, oy, ox) @staticmethod def backward(ctx, grad_output): - bs,_,oy,ox = grad_output.shape + bs, _, oy, ox = grad_output.shape tx, tw, x_shape = ctx.saved_tensors - _,rcout,cin,H,W = tw.shape - ys,xs = ctx.stride - OY,OX = x_shape[2:4] + _, rcout, cin, H, W = tw.shape + ys, xs = ctx.stride + OY, OX = x_shape[2:4] - ggg = grad_output.reshape(bs,ctx.groups,rcout,oy,ox) + ggg = grad_output.reshape(bs, ctx.groups, rcout, oy, ox) - gdw = np.zeros((ctx.groups,rcout,cin,H,W), dtype=tx.dtype) + gdw = np.zeros((ctx.groups, rcout, cin, H, W), dtype=tx.dtype) for g in range(ctx.groups): #'ikYX,ijYXyx -> kjyx' - gdw[g] += np.tensordot(ggg[:,g], tx[:,g], ((0,2,3),(0,2,3))) + gdw[g] += np.tensordot(ggg[:, g], tx[:, g], ((0, 2, 3), (0, 2, 3))) # needs to be optimized - gdx = np.zeros((bs,ctx.groups,cin,OY,OX), dtype=tx.dtype) - for k in range(oy*ox): - Y, X = k//ox, k%ox - iY,iX = Y*ys, X*xs - #gdx[:,:,: , iY:iY+H, iX:iX+W] += np.einsum('igk,gkjyx->igjyx', ggg[:,:,:,Y,X], tw) + gdx = np.zeros((bs, ctx.groups, cin, OY, OX), dtype=tx.dtype) + for k in range(oy * ox): + Y, X = k // ox, k % ox + iY, iX = Y * ys, X * xs + # gdx[:,:,: , iY:iY+H, iX:iX+W] += np.einsum('igk,gkjyx->igjyx', ggg[:,:,:,Y,X], tw) for g in range(ctx.groups): - tg = np.dot(ggg[:,g,:,Y,X].reshape(bs, -1), tw[g].reshape(rcout, -1)) - gdx[:, g, :, iY:iY+H, iX:iX+W] += tg.reshape((bs, cin, H, W)) + tg = np.dot(ggg[:, g, :, Y, X].reshape(bs, -1), tw[g].reshape(rcout, -1)) + gdx[:, g, :, iY : iY + H, iX : iX + W] += tg.reshape((bs, cin, H, W)) - return gdx.reshape((bs, ctx.groups*cin, OY, OX)), gdw.reshape((ctx.groups*rcout, cin, H, W)) -register('conv2d', Conv2D) + return gdx.reshape((bs, ctx.groups * cin, OY, OX)), gdw.reshape( + (ctx.groups * rcout, cin, H, W) + ) + + +register("conv2d", Conv2D) # ************* pooling ops ************* + def stack_for_pool(x, py, px): - my, mx = (x.shape[2]//py)*py, (x.shape[3]//px)*px + my, mx = (x.shape[2] // py) * py, (x.shape[3] // px) * px xup = x[:, :, :my, :mx] - stack = [xup[:, :, k//px::py, k%px::px][None] for k in range(py*px)] + stack = [xup[:, :, k // px :: py, k % px :: px][None] for k in range(py * px)] return np.concatenate(stack, axis=0) + def unstack_for_pool(fxn, s, py, px): - my, mx = (s[2]//py)*py, (s[3]//px)*px - for k in range(py*px): - Y, X = k//px, k%px - ll = fxn(Y*px+X) + my, mx = (s[2] // py) * py, (s[3] // px) * px + for k in 
range(py * px): + Y, X = k // px, k % px + ll = fxn(Y * px + X) if X == 0 and Y == 0: ret = np.zeros(s, dtype=ll.dtype) ret[:, :, Y:my:py, X:mx:px] = ll return ret + class MaxPool2D(Function): @staticmethod def forward(ctx, x, kernel_size=(2, 2)): @@ -251,9 +306,14 @@ class MaxPool2D(Function): @staticmethod def backward(ctx, grad_output): - idxs,s = ctx.saved_tensors - return unstack_for_pool(lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size) -register('max_pool2d', MaxPool2D) + idxs, s = ctx.saved_tensors + return unstack_for_pool( + lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size + ) + + +register("max_pool2d", MaxPool2D) + class AvgPool2D(Function): @staticmethod @@ -264,8 +324,9 @@ class AvgPool2D(Function): @staticmethod def backward(ctx, grad_output): - s, = ctx.saved_tensors + (s,) = ctx.saved_tensors py, px = ctx.kernel_size - return unstack_for_pool(lambda idx: grad_output/py/px, s, py, px) -register('avg_pool2d', AvgPool2D) + return unstack_for_pool(lambda idx: grad_output / py / px, s, py, px) + +register("avg_pool2d", AvgPool2D) diff --git a/tinygrad/ops_gpu.py b/tinygrad/ops_gpu.py index 269dbb49ac..422a3df1e7 100644 --- a/tinygrad/ops_gpu.py +++ b/tinygrad/ops_gpu.py @@ -3,131 +3,229 @@ from .tensor import Function, register, GPUBuffer, Tensor, Device import pyopencl as cl import functools + def buffer_new(ctx, shape, zero=False): - return GPUBuffer(shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32)) + return GPUBuffer( + shape, hostbuf=None if not zero else np.zeros(shape, dtype=np.float32) + ) + @functools.lru_cache() def clbuild(cl_ctx, name, prg): return cl.Program(cl_ctx, prg).build().__getattr__(name) + def uint2(x, y): - return np.array((x,y), dtype=cl.cltypes.uint2) + return np.array((x, y), dtype=cl.cltypes.uint2) + + i32 = np.int32 -def subsample_op(ctx, input, kernel_size, stride, iter_op, result_op, decls=''): + +def subsample_op(ctx, input, kernel_size, stride, iter_op, result_op, decls=""): py, px = stride N, C, Yin, Xin = input.shape - Yout, Xout = (Yin-kernel_size[0])//py+1, (Xin-kernel_size[1])//px+1 + Yout, Xout = (Yin - kernel_size[0]) // py + 1, (Xin - kernel_size[1]) // px + 1 ret = buffer_new(ctx, (N, C, Yout, Xout), zero=True) - subsample = clbuild(ctx.cl_ctx, "subsample", """ + subsample = clbuild( + ctx.cl_ctx, + "subsample", + """ __kernel void subsample(__global float *output, __global const float *input, uint2 osize, uint2 isize, uint2 ksz, uint2 stride) { int3 gid = (int3)(get_global_id(2), get_global_id(1), get_global_id(0)); int oid = gid.x + osize.x*(gid.y + osize.y*gid.z); - """+decls+"""; + """ + + decls + + """; for (uint j=0; j 0 and complist[-1] == comp: dimlist[-1] *= dim elif comp != (False, False): - dimlist.append(dim); complist.append(comp) - for i in range(n_dims): # group together any adjacent dimensions that we can to simplify broadcasting + dimlist.append(dim) + complist.append(comp) + + for i in range( + n_dims + ): # group together any adjacent dimensions that we can to simplify broadcasting push(i32(max(shape_x[i], shape_y[i])), (shape_x[i] > 1, shape_y[i] > 1)) - + prg = get_binop_prg(ctx.cl_ctx, code, tuple(complist)) ret = buffer_new(ctx, shape_ret, zero=True) - prod_list = np.array(dimlist, dtype=i32)[-1::-1].cumprod(dtype=i32)[-1::-1] # take cumprod from back to front - prg.binop(ctx.cl_queue, [prod_list[0]] if len(dimlist) > 0 else [1], None, x.cl, y.cl, ret.cl, *dimlist, *(prod_list[1:])) + prod_list = np.array(dimlist, dtype=i32)[-1::-1].cumprod(dtype=i32)[ + -1::-1 + ] # 
take cumprod from back to front + prg.binop( + ctx.cl_queue, + [prod_list[0]] if len(dimlist) > 0 else [1], + None, + x.cl, + y.cl, + ret.cl, + *dimlist, + *(prod_list[1:]), + ) return ret + def unary_op(ctx, code, x): ret = buffer_new(ctx, x.shape) - unop = clbuild(ctx.cl_ctx, "unop", """ + unop = clbuild( + ctx.cl_ctx, + "unop", + """ __kernel void unop(__global const float *a_g, __global float *res_g) { int gid = get_global_id(0); float a = a_g[gid]; - res_g[gid] = """+code+"""; - }""") + res_g[gid] = """ + + code + + """; + }""", + ) unop(ctx.cl_queue, [np.prod(ret.shape)], None, x.cl, ret.cl) return ret + def reduce_op(ctx, code, code2, inp, axis=None): if axis is None: # full reduce - osize = [1]*len(inp.shape) + osize = [1] * len(inp.shape) else: osize = np.array(inp.shape) - osize[list(axis)] = 1 + osize[list(axis)] = 1 ret = buffer_new(ctx, osize) if axis is None: ret.shape = (1,) # TODO: this is insanely slow - reduce = clbuild(ctx.cl_ctx, "reduce", """ + reduce = clbuild( + ctx.cl_ctx, + "reduce", + """ __kernel void reduce(__global const float *a_g, int sz, __global float *res_g, int prod, int n_dims, __global const int *shape_x, __global const int *shape_ret) { int gid = get_global_id(0); @@ -148,80 +246,129 @@ def reduce_op(ctx, code, code2, inp, axis=None): } } float a = a_g[idx]; - """+code+"""; + """ + + code + + """; } - res_g[gid] = """+code2+"""; - }""") - buffer_np = lambda x: cl.Buffer(ctx.cl_ctx, - cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x) - reduce(ctx.cl_queue, [np.prod(osize)], None, inp.cl, - i32(np.prod(inp.shape)//np.prod(osize)), ret.cl, - i32(np.prod(osize)), i32(len(osize)), + res_g[gid] = """ + + code2 + + """; + }""", + ) + buffer_np = lambda x: cl.Buffer( + ctx.cl_ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=x + ) + reduce( + ctx.cl_queue, + [np.prod(osize)], + None, + inp.cl, + i32(np.prod(inp.shape) // np.prod(osize)), + ret.cl, + i32(np.prod(osize)), + i32(len(osize)), buffer_np(np.array(inp.shape, dtype=np.int32)), - buffer_np(np.array(osize, dtype=np.int32))) + buffer_np(np.array(osize, dtype=np.int32)), + ) return ret + def unbroadcast(ctx, out, in_sh): - sum_axis = [i for i in range(len(in_sh)) if in_sh[i]==1 and out.shape[i]>1] if in_sh != (1,) else None + sum_axis = ( + [i for i in range(len(in_sh)) if in_sh[i] == 1 and out.shape[i] > 1] + if in_sh != (1,) + else None + ) return reduce_op(ctx, "out += a", "out", out, sum_axis) + # ***** now for the ops themselves ***** + class Add(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - return binary_op(ctx, 'a+b', x, y) + return binary_op(ctx, "a+b", x, y) @staticmethod def backward(ctx, grad_output): grad_x, grad_y = grad_output, grad_output shape_x, shape_y = ctx.saved_tensors - return unbroadcast(ctx, grad_x, shape_x), unbroadcast(ctx, grad_y, shape_y), -register('add', Add, device=Device.GPU) + return ( + unbroadcast(ctx, grad_x, shape_x), + unbroadcast(ctx, grad_y, shape_y), + ) + + +register("add", Add, device=Device.GPU) + class Sub(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x.shape, y.shape) - return binary_op(ctx, 'a-b', x, y) + return binary_op(ctx, "a-b", x, y) @staticmethod def backward(ctx, grad_output): - grad_x, grad_y = grad_output, unary_op(ctx, '-a', grad_output) + grad_x, grad_y = grad_output, unary_op(ctx, "-a", grad_output) shape_x, shape_y = ctx.saved_tensors - return unbroadcast(ctx, grad_x, shape_x), unbroadcast(ctx, grad_y, shape_y), -register('sub', Sub, 
device=Device.GPU) + return ( + unbroadcast(ctx, grad_x, shape_x), + unbroadcast(ctx, grad_y, shape_y), + ) + + +register("sub", Sub, device=Device.GPU) + class Mul(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x, y) - return binary_op(ctx, 'a*b', x, y) + return binary_op(ctx, "a*b", x, y) @staticmethod def backward(ctx, grad_output): - x,y = ctx.saved_tensors - grad_x = binary_op(ctx, 'a*b', y, grad_output) - grad_y = binary_op(ctx, 'a*b', x, grad_output) - return unbroadcast(ctx, grad_x, x.shape), unbroadcast(ctx, grad_y, y.shape), -register('mul', Mul, device=Device.GPU) + x, y = ctx.saved_tensors + grad_x = binary_op(ctx, "a*b", y, grad_output) + grad_y = binary_op(ctx, "a*b", x, grad_output) + return ( + unbroadcast(ctx, grad_x, x.shape), + unbroadcast(ctx, grad_y, y.shape), + ) + + +register("mul", Mul, device=Device.GPU) + class Pow(Function): @staticmethod def forward(ctx, x, y): ctx.save_for_backward(x, y) - return binary_op(ctx, 'pow(a,b)', x, y) + return binary_op(ctx, "pow(a,b)", x, y) @staticmethod def backward(ctx, grad_output): - x,y = ctx.saved_tensors - grad_x = binary_op(ctx, 'a*b', grad_output, - binary_op(ctx, 'b * (pow((float)a, (float)(b-1.0)))', x, y)) - grad_y = binary_op(ctx, 'a*b', grad_output, - binary_op(ctx, 'pow(a, (float)b) * log(a);', x, y)) - return unbroadcast(ctx, grad_x, x.shape), unbroadcast(ctx, grad_y, y.shape), -register('pow', Pow, device=Device.GPU) + x, y = ctx.saved_tensors + grad_x = binary_op( + ctx, + "a*b", + grad_output, + binary_op(ctx, "b * (pow((float)a, (float)(b-1.0)))", x, y), + ) + grad_y = binary_op( + ctx, "a*b", grad_output, binary_op(ctx, "pow(a, (float)b) * log(a);", x, y) + ) + return ( + unbroadcast(ctx, grad_x, x.shape), + unbroadcast(ctx, grad_y, y.shape), + ) + + +register("pow", Pow, device=Device.GPU) + class Sum(Function): @staticmethod @@ -229,16 +376,24 @@ class Sum(Function): ctx.save_for_backward(input, axis) ret = reduce_op(ctx, "out += a", "out", input, axis=axis) if axis is not None: - ret.shape = tuple([input.shape[i] for i in range(len(input.shape)) if i not in axis]) + ret.shape = tuple( + [input.shape[i] for i in range(len(input.shape)) if i not in axis] + ) return ret @staticmethod def backward(ctx, grad_output): input, axis = ctx.saved_tensors - shape = [1 if axis is None or i in axis else input.shape[i] for i in range(len(input.shape))] + shape = [ + 1 if axis is None or i in axis else input.shape[i] + for i in range(len(input.shape)) + ] output = GPUBuffer(shape, hostbuf=grad_output) - return binary_op(ctx, 'a+b', output, buffer_new(ctx, input.shape, zero=True)) -register('sum', Sum, device=Device.GPU) + return binary_op(ctx, "a+b", output, buffer_new(ctx, input.shape, zero=True)) + + +register("sum", Sum, device=Device.GPU) + class Dot(Function): @staticmethod @@ -247,7 +402,10 @@ class Dot(Function): isize, msize, osize = i32(input.shape[0]), i32(input.shape[1]), i32(weight.shape[1]) ret = buffer_new(ctx, (isize, osize)) - matmul = clbuild(ctx.cl_ctx, "matmul", """ + matmul = clbuild( + ctx.cl_ctx, + "matmul", + """ __kernel void matmul( __global const float *input, __global const float *weight, __global float *res, int is0, int is1, int msize, int ws0, int ws1, int osize @@ -261,13 +419,25 @@ class Dot(Function): } res[X * osize + Y] = ret; - }""") + }""", + ) ctx.save_for_backward(input, weight, matmul) # (isize,msize) x (msize,osize) = (isize,osize) - matmul(ctx.cl_queue, [isize, osize], None, - input.cl, weight.cl, ret.cl, - msize, i32(1), msize, i32(1), osize, osize) + 
matmul( + ctx.cl_queue, + [isize, osize], + None, + input.cl, + weight.cl, + ret.cl, + msize, + i32(1), + msize, + i32(1), + osize, + osize, + ) return ret @staticmethod @@ -279,28 +449,56 @@ class Dot(Function): grad_weight = buffer_new(ctx, weight.shape) # (isize,osize) x (msize,osize) = (isize,msize) - matmul(ctx.cl_queue, [isize, msize], None, - grad_output.cl, weight.cl, grad_input.cl, - osize, i32(1), osize, osize, i32(1), msize) + matmul( + ctx.cl_queue, + [isize, msize], + None, + grad_output.cl, + weight.cl, + grad_input.cl, + osize, + i32(1), + osize, + osize, + i32(1), + msize, + ) # (isize,msize) x (isize,osize) = (msize,osize) - matmul(ctx.cl_queue, [msize, osize], None, - input.cl, grad_output.cl, grad_weight.cl, - i32(1), msize, isize, i32(1), osize, osize) + matmul( + ctx.cl_queue, + [msize, osize], + None, + input.cl, + grad_output.cl, + grad_weight.cl, + i32(1), + msize, + isize, + i32(1), + osize, + osize, + ) return grad_input, grad_weight -register('dot', Dot, device=Device.GPU) + + +register("dot", Dot, device=Device.GPU) # ************* simple ops ************* + class Pad2D(Function): @staticmethod def forward(ctx, x, padding=None): - bs,cin,iy,ix = x.shape - oy,ox = iy+padding[2]+padding[3], ix+padding[0]+padding[1] + bs, cin, iy, ix = x.shape + oy, ox = iy + padding[2] + padding[3], ix + padding[0] + padding[1] ret = buffer_new(ctx, (bs, cin, oy, ox), zero=True) - pad2d = clbuild(ctx.cl_ctx, "pad2d", """ + pad2d = clbuild( + ctx.cl_ctx, + "pad2d", + """ __kernel void pad2d(__global const float *input, __global float *output, int ipx, int ipy, int py, int px, int oy, int ox, int iy, int ix) { int BC = get_global_id(0); @@ -311,13 +509,24 @@ class Pad2D(Function): int optr = BC*oy*ox + (Y+py)*ox + px + X; output[optr] = input[iptr]; - }""") + }""", + ) ctx.save_for_backward(padding, pad2d) - pad2d(ctx.cl_queue, [bs*cin, iy, ix], None, - x.cl, ret.cl, - i32(0), i32(0), i32(padding[2]), i32(padding[0]), - i32(oy), i32(ox), i32(iy), i32(ix) - ) + pad2d( + ctx.cl_queue, + [bs * cin, iy, ix], + None, + x.cl, + ret.cl, + i32(0), + i32(0), + i32(padding[2]), + i32(padding[0]), + i32(oy), + i32(ox), + i32(iy), + i32(ix), + ) return ret @staticmethod @@ -326,13 +535,26 @@ class Pad2D(Function): bs, cin, iy, ix = grad_output.shape oy, ox = iy - padding[2] - padding[3], ix - padding[0] - padding[1] ret = buffer_new(ctx, (bs, cin, oy, ox)) - pad2d(ctx.cl_queue, [bs*cin, oy, ox], None, - grad_output.cl, ret.cl, - i32(padding[2]), i32(padding[0]), i32(0), i32(0), - i32(oy), i32(ox), i32(iy), i32(ix) - ) + pad2d( + ctx.cl_queue, + [bs * cin, oy, ox], + None, + grad_output.cl, + ret.cl, + i32(padding[2]), + i32(padding[0]), + i32(0), + i32(0), + i32(oy), + i32(ox), + i32(iy), + i32(ix), + ) return ret -register('pad2d', Pad2D, device=Device.GPU) + + +register("pad2d", Pad2D, device=Device.GPU) + class Reshape(Function): @staticmethod @@ -345,105 +567,152 @@ class Reshape(Function): @staticmethod def backward(ctx, grad_output): - in_shape, = ctx.saved_tensors + (in_shape,) = ctx.saved_tensors return GPUBuffer(in_shape, hostbuf=grad_output) -register('reshape', Reshape, device=Device.GPU) + + +register("reshape", Reshape, device=Device.GPU) # ************* activation ops ************* + class ReLU(Function): @staticmethod def forward(ctx, input): ctx.save_for_backward(input) - return unary_op(ctx, 'max(a, (float)0.)', input) + return unary_op(ctx, "max(a, (float)0.)", input) @staticmethod def backward(ctx, grad_output): - input, = ctx.saved_tensors - return binary_op(ctx, 'a * (b >= 
0)', grad_output, input) -register('relu', ReLU, device=Device.GPU) + (input,) = ctx.saved_tensors + return binary_op(ctx, "a * (b >= 0)", grad_output, input) + + +register("relu", ReLU, device=Device.GPU) + class Sigmoid(Function): @staticmethod def forward(ctx, input): - ret = unary_op(ctx, '1./(1+exp(-a))', input) + ret = unary_op(ctx, "1./(1+exp(-a))", input) ctx.save_for_backward(ret) return ret @staticmethod def backward(ctx, grad_output): - ret, = ctx.saved_tensors - return binary_op(ctx, 'a * (b * (1 - b));', grad_output, ret) -register('sigmoid', Sigmoid, device=Device.GPU) + (ret,) = ctx.saved_tensors + return binary_op(ctx, "a * (b * (1 - b));", grad_output, ret) + + +register("sigmoid", Sigmoid, device=Device.GPU) + class AvgPool2D(Function): @staticmethod def forward(ctx, input, kernel_size=(2, 2)): - ret = subsample_op(ctx, input, kernel_size, kernel_size, iter_op="sumval += input[iid]", - result_op="sumval / (ksz.x * ksz.y)", decls="float sumval=0.f") + ret = subsample_op( + ctx, + input, + kernel_size, + kernel_size, + iter_op="sumval += input[iid]", + result_op="sumval / (ksz.x * ksz.y)", + decls="float sumval=0.f", + ) ctx.save_for_backward(input.shape) return ret @staticmethod def backward(ctx, grad_output): - orig_shape, = ctx.saved_tensors - return supersample_op(ctx, grad_output, orig_shape, ctx.kernel_size, - result_op="input[iid] / (ksz.x * ksz.y)") -register('avg_pool2d', AvgPool2D, device=Device.GPU) + (orig_shape,) = ctx.saved_tensors + return supersample_op( + ctx, + grad_output, + orig_shape, + ctx.kernel_size, + result_op="input[iid] / (ksz.x * ksz.y)", + ) + + +register("avg_pool2d", AvgPool2D, device=Device.GPU) + class MaxPool2D(Function): @staticmethod def forward(ctx, input, kernel_size=(2, 2)): - idxs = subsample_op(ctx, input, kernel_size, kernel_size, + idxs = subsample_op( + ctx, + input, + kernel_size, + kernel_size, iter_op="if (input[iid]>maxval) { maxval = input[iid]; maxidx = j * ksz.x + i; }", - result_op="(float)maxidx", decls="float maxval=-FLT_MAX; int maxidx=0") + result_op="(float)maxidx", + decls="float maxval=-FLT_MAX; int maxidx=0", + ) ctx.save_for_backward(idxs, input.shape) - return subsample_op(ctx, input, kernel_size, kernel_size, + return subsample_op( + ctx, + input, + kernel_size, + kernel_size, iter_op="maxval = max(maxval, input[iid])", - result_op="maxval", decls="float maxval = -FLT_MAX") + result_op="maxval", + decls="float maxval = -FLT_MAX", + ) @staticmethod def backward(ctx, grad_output): idxs, orig_shape = ctx.saved_tensors - return supersample_op(ctx, grad_output, orig_shape, ctx.kernel_size, + return supersample_op( + ctx, + grad_output, + orig_shape, + ctx.kernel_size, result_op="(maxidx == kernidx) * input[iid]", decls="int maxidx=((__global float*)input2)[iid]; int kernidx=(gid.x%ksz.x) + ksz.x*(gid.y%ksz.y)", - input2=idxs) -register('max_pool2d', MaxPool2D, device=Device.GPU) + input2=idxs, + ) + + +register("max_pool2d", MaxPool2D, device=Device.GPU) + class LogSoftmax(Function): @staticmethod def forward(ctx, input): # TODO: stability? 
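# summing exp(a) without first subtracting the row max can overflow for large inputs; the CPU path's _exp_normalize applies that log-sum-exp shift, this kernel does not yet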
lsum = reduce_op(ctx, "out += exp(a)", "log(out)", input, axis=[1]) - output = binary_op(ctx, 'a-b', input, lsum) + output = binary_op(ctx, "a-b", input, lsum) ctx.save_for_backward(output) return output @staticmethod def backward(ctx, grad_output): - output, = ctx.saved_tensors + (output,) = ctx.saved_tensors lsum = reduce_op(ctx, "out += a", "out", grad_output, axis=[1]) texp = binary_op(ctx, "exp(a) * b", output, lsum) return binary_op(ctx, "a - b", grad_output, texp) -register('logsoftmax', LogSoftmax, device=Device.GPU) + + +register("logsoftmax", LogSoftmax, device=Device.GPU) # ************* conv ops ************* + class Conv2D(Function): @staticmethod def forward(ctx, x, w, stride=1, groups=1): if type(ctx.stride) == int: ctx.stride = (ctx.stride, ctx.stride) - cout,cin,H,W = w.shape - ys,xs = ctx.stride - bs,cin_,iy,ix = x.shape - oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs - assert cin*ctx.groups == cin_ + cout, cin, H, W = w.shape + ys, xs = ctx.stride + bs, cin_, iy, ix = x.shape + oy, ox = (iy - (H - ys)) // ys, (ix - (W - xs)) // xs + assert cin * ctx.groups == cin_ assert cout % ctx.groups == 0 - rcout = cout//ctx.groups + rcout = cout // ctx.groups - ctx.save_for_backward(x,w) + ctx.save_for_backward(x, w) # output buffer ret = buffer_new(ctx, (bs, cout, oy, ox)) @@ -452,7 +721,10 @@ class Conv2D(Function): # weight = (groups, rcout, cin, H, W) # output = (bs, groups, rcout, oy, ox) - conv = clbuild(ctx.cl_ctx, "conv", """ + conv = clbuild( + ctx.cl_ctx, + "conv", + """ __kernel void conv(__global const float *input, __global const float *weight, __global float *output, int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs) { @@ -475,26 +747,41 @@ class Conv2D(Function): } } output[B*groups*rcout*oy*ox + g*rcout*oy*ox + c*oy*ox + Y*ox + X] = acc; - }""") + }""", + ) - conv(ctx.cl_queue, [bs*groups*rcout, oy, ox], None, - x.cl, w.cl, ret.cl, - i32(H), i32(W), i32(groups), i32(rcout), i32(cin), - i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs) + conv( + ctx.cl_queue, + [bs * groups * rcout, oy, ox], + None, + x.cl, + w.cl, + ret.cl, + i32(H), + i32(W), + i32(groups), + i32(rcout), + i32(cin), + i32(oy), + i32(ox), + i32(iy), + i32(ix), + i32(ys), + i32(xs), ) return ret @staticmethod def backward(ctx, grad_output): - bs,_,oy,ox = grad_output.shape + bs, _, oy, ox = grad_output.shape x, w = ctx.saved_tensors - cout,cin,H,W = w.shape - ys,xs = ctx.stride - bs,cin_,iy,ix = x.shape - oy,ox = (iy-(H-ys))//ys, (ix-(W-xs))//xs - assert cin*ctx.groups == cin_ + cout, cin, H, W = w.shape + ys, xs = ctx.stride + bs, cin_, iy, ix = x.shape + oy, ox = (iy - (H - ys)) // ys, (ix - (W - xs)) // xs + assert cin * ctx.groups == cin_ assert cout % ctx.groups == 0 - rcout = cout//ctx.groups + rcout = cout // ctx.groups dx = buffer_new(ctx, (bs, cin_, iy, ix), zero=True) dw = buffer_new(ctx, (cout, cin, H, W)) @@ -503,7 +790,10 @@ class Conv2D(Function): # tensw = (groups*rcout, cin, H, W) # ggg = (bs, groups*rout, oy, ox) - convw = clbuild(ctx.cl_ctx, "convw", """ + convw = clbuild( + ctx.cl_ctx, + "convw", + """ __kernel void convw(__global const float *tensx, __global const float *ggg, __global float *dw, int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) { @@ -523,8 +813,12 @@ class Conv2D(Function): } } dw[get_global_id(0)*H*W + y*W + x] = acc; - }""") - convx = clbuild(ctx.cl_ctx, "convx", """ + }""", + ) + convx = clbuild( + ctx.cl_ctx, + "convx", + """ __kernel void convx(__global const float 
*tensw, __global const float *ggg, __global float *dx, int H, int W, int groups, int rcout, int cin, int oy, int ox, int iy, int ix, int ys, int xs, int bs) { @@ -547,10 +841,36 @@ class Conv2D(Function): } } } - """) + """, + ) - conv_args = i32(H), i32(W), i32(ctx.groups), i32(rcout), i32(cin), i32(oy), i32(ox), i32(iy), i32(ix), i32(ys), i32(xs), i32(bs) - convw(ctx.cl_queue, [ctx.groups*rcout*cin, H, W], None, x.cl, grad_output.cl, dw.cl, *conv_args) - convx(ctx.cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args) + conv_args = ( + i32(H), + i32(W), + i32(ctx.groups), + i32(rcout), + i32(cin), + i32(oy), + i32(ox), + i32(iy), + i32(ix), + i32(ys), + i32(xs), + i32(bs), + ) + convw( + ctx.cl_queue, + [ctx.groups * rcout * cin, H, W], + None, + x.cl, + grad_output.cl, + dw.cl, + *conv_args, + ) + convx( + ctx.cl_queue, [bs, ctx.groups, cin], None, w.cl, grad_output.cl, dx.cl, *conv_args + ) return dx, dw -register('conv2d', Conv2D, device=Device.GPU) + + +register("conv2d", Conv2D, device=Device.GPU) diff --git a/tinygrad/optim.py b/tinygrad/optim.py index 5edf6976e7..127a6223ae 100644 --- a/tinygrad/optim.py +++ b/tinygrad/optim.py @@ -3,6 +3,7 @@ import numpy as np from tinygrad.tensor import Tensor + class Optimizer: def __init__(self, params): self.params = [x for x in params if x.requires_grad == True] @@ -11,6 +12,7 @@ class Optimizer: for param in self.params: param.grad = None + class SGD(Optimizer): def __init__(self, params, lr=0.001): super().__init__(params) @@ -20,30 +22,53 @@ class SGD(Optimizer): for t in self.params: t -= t.grad * self.lr + class RMSprop(Optimizer): def __init__(self, params, lr=0.001, decay=0.9, eps=1e-8): super().__init__(params) self.lr, self.decay, self.eps = lr, decay, eps - self.v = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params] + self.v = [ + Tensor( + np.zeros(t.shape, dtype=np.float32), + device=params[0].device, + requires_grad=False, + ) + for t in self.params + ] def step(self): for i, t in enumerate(self.params): self.v[i] = self.decay * self.v[i] + (1.0 - self.decay) * t.grad * t.grad t -= (t.grad * self.lr).div(self.v[i].sqrt() + self.eps) + class Adam(Optimizer): def __init__(self, params, lr=0.001, b1=0.9, b2=0.999, eps=1e-8): super().__init__(params) self.lr, self.b1, self.b2, self.eps, self.t = lr, b1, b2, eps, 0 - self.m = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params] - self.v = [Tensor(np.zeros(t.shape, dtype=np.float32), device=params[0].device, requires_grad=False) for t in self.params] + self.m = [ + Tensor( + np.zeros(t.shape, dtype=np.float32), + device=params[0].device, + requires_grad=False, + ) + for t in self.params + ] + self.v = [ + Tensor( + np.zeros(t.shape, dtype=np.float32), + device=params[0].device, + requires_grad=False, + ) + for t in self.params + ] def step(self): self.t = self.t + 1 - a = self.lr * ((1.0 - self.b2**self.t)**0.5) / (1.0 - self.b1**self.t) - for i,t in enumerate(self.params): + a = self.lr * ((1.0 - self.b2 ** self.t) ** 0.5) / (1.0 - self.b1 ** self.t) + for i, t in enumerate(self.params): self.m[i] = self.b1 * self.m[i] + (1.0 - self.b1) * t.grad self.v[i] = self.b2 * self.v[i] + (1.0 - self.b2) * t.grad * t.grad t -= a * self.m[i].div(self.v[i].sqrt() + self.eps) diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py index 14c5643c16..806d1d9ea7 100644 --- a/tinygrad/tensor.py +++ b/tinygrad/tensor.py @@ -10,32 +10,43 @@ from collections 
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 14c5643c16..806d1d9ea7 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -10,32 +10,43 @@ from collections import defaultdict

 DEBUG = os.getenv("DEBUG", None) is not None
 if DEBUG:
   import atexit, time
+
   debug_counts, debug_times = defaultdict(int), defaultdict(float)
+
   def print_debug_exit():
     for name, _ in sorted(debug_times.items(), key=lambda x: -x[1]):
       print(f"{name:>20} : {debug_counts[name]:>6} {debug_times[name]:>10.2f} ms")
+
   atexit.register(print_debug_exit)

+
 class ProfileOp:
   def __init__(self, name, x, backward=False):
-    self.name = ("back_" if backward else "")+name
+    self.name = ("back_" if backward else "") + name
     self.x = x
+
   def __enter__(self):
-    if DEBUG: self.st = time.time()
+    if DEBUG:
+      self.st = time.time()
+
   def __exit__(self, *junk):
     if DEBUG:
       if cl_queue is not None:
         cl_queue.finish()
-      et = (time.time()-self.st)*1000.
+      et = (time.time() - self.st) * 1000.0
       debug_counts[self.name] += 1
       debug_times[self.name] += et
       print(f"{self.name:>20} : {et:>7.2f} ms {[y.shape for y in self.x]}")

+
 # **** GPU functions ****

 cl_ctx, cl_queue = None, None
+
+
 def require_init_gpu():
-  if not GPU: raise Exception("No GPU Support, install pyopencl")
+  if not GPU:
+    raise Exception("No GPU Support, install pyopencl")
   global cl_ctx, cl_queue
   if cl_queue is None:
     devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
@@ -45,28 +56,45 @@ def require_init_gpu():
     # this is an in-order command queue
     cl_queue = cl.CommandQueue(cl_ctx)

+
 class GPUBuffer:
   def __init__(self, shape, hostbuf=None):
     self.shape, self.dtype = tuple(shape), np.float32
-    self.cl = hostbuf.cl if isinstance(hostbuf, GPUBuffer) else \
-      cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0), 4*np.prod(shape),
-        hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None)
+    self.cl = (
+      hostbuf.cl
+      if isinstance(hostbuf, GPUBuffer)
+      else cl.Buffer(
+        cl_ctx,
+        cl.mem_flags.READ_WRITE
+        | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0),
+        4 * np.prod(shape),
+        hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None,
+      )
+    )

   def __repr__(self):
     return f"<GPUBuffer with shape {self.shape!r}>"

+
 # **** ANE functions ****

 ane = None
+
+
 def require_init_ane():
   global ane
   if ane is None:
     import ane.lib.ane, tinygrad.ops_ane
+
     ane = ane.lib.ane.ANE()

+
 # **** start with two base classes, Tensor and Function ****

-class Device: CPU, GPU, ANE = 0, 1, 2
+
+class Device:
+  CPU, GPU, ANE = 0, 1, 2
+

 class Tensor:
   did_float_warning = False
@@ -110,7 +138,12 @@ class Tensor:

   @classmethod
   def uniform(cls, *shape, **kwargs):
-    return cls((np.random.uniform(-1., 1., size=shape)/np.sqrt(np.prod(shape))).astype(np.float32), **kwargs)
+    return cls(
+      (np.random.uniform(-1.0, 1.0, size=shape) / np.sqrt(np.prod(shape))).astype(
+        np.float32
+      ),
+      **kwargs,
+    )

   @classmethod
   def eye(cls, dim, **kwargs):
@@ -130,18 +163,21 @@ class Tensor:

     # fill in the first grad with one
     # this is "implicit gradient creation"
-    self.grad = Tensor(np.ones(self.shape, dtype=self.dtype), device=self.device, requires_grad=False)
+    self.grad = Tensor(
+      np.ones(self.shape, dtype=self.dtype), device=self.device, requires_grad=False
+    )

     for t0 in reversed(self.deepwalk(set(), [])):
-      assert (t0.grad is not None)
+      assert t0.grad is not None
       with ProfileOp(t0._ctx.__class__.__name__, [t0.grad], backward=True):
         grads = t0._ctx.backward(t0._ctx, t0.grad.data)
       if len(t0._ctx.parents) == 1:
         grads = [grads]
-      for t,g in zip(t0._ctx.parents, grads):
+      for t, g in zip(t0._ctx.parents, grads):
         if g is not None:
-          assert g.shape == t.shape, \
-            f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
{g.shape!r} != {t.shape!r}" + assert ( + g.shape == t.shape + ), f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}" gt = Tensor(g, device=self.device, requires_grad=False) t.grad = gt if t.grad is None else (t.grad + gt) @@ -150,19 +186,21 @@ class Tensor: @staticmethod def _move_data(data, device): if isinstance(data, GPUBuffer): - if device == Device.GPU: return data + if device == Device.GPU: + return data old = data data = np.empty(old.shape, dtype=np.float32) with ProfileOp("toCPU", [data]): - cl.enqueue_copy(cl_queue, data, old.cl, is_blocking=True) + cl.enqueue_copy(cl_queue, data, old.cl, is_blocking=True) elif "ANETensor" in str(type(data)): - if device == Device.ANE: return data + if device == Device.ANE: + return data with ProfileOp("toCPU", [data]): - data = data.data().astype(np.float32) + data = data.data().astype(np.float32) if not isinstance(data, np.ndarray): - data = np.array(data, dtype=np.float32) + data = np.array(data, dtype=np.float32) if data.dtype != np.float32 and not Tensor.did_float_warning: # warning? float64 is actually needed for numerical jacobian @@ -184,14 +222,17 @@ class Tensor: def to_(self, device): self.data, self.device = self._move_data(self.data, device), device - if self.grad: self.grad.to_(device) + if self.grad: + self.grad.to_(device) def to(self, device): ret = Tensor(self.data, device) - if self.grad: ret.grad = self.grad.to(device) + if self.grad: + ret.grad = self.grad.to(device) return ret - def _is(self, device): return self.device == device + def _is(self, device): + return self.device == device def detach(self): return Tensor(self.data, device=self.device) @@ -203,7 +244,7 @@ class Tensor: def mean(self, axis=None): out = self.sum(axis=axis) - coeff = np.prod(out.shape)/np.prod(self.shape) + coeff = np.prod(out.shape) / np.prod(self.shape) return out * coeff def sqrt(self): @@ -219,15 +260,18 @@ class Tensor: return 2.0 * ((2.0 * self).sigmoid()) - 1.0 def leakyrelu(self, neg_slope=0.01): - return self.relu() - (-neg_slope*self).relu() + return self.relu() - (-neg_slope * self).relu() def dropout(self, p=0.5): - _mask = np.asarray(np.random.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype) + _mask = np.asarray( + np.random.binomial(1, 1.0 - p, size=self.shape), dtype=self.dtype + ) ret = self * Tensor(_mask, requires_grad=False, device=self.device) return ret.div(1.0 - p) def abs(self): - return self.relu() + (-1.0*self).relu() + return self.relu() + (-1.0 * self).relu() + # An instantiation of the Function is the Context class Function: @@ -239,7 +283,7 @@ class Function: self.saved_tensors.extend(x) def apply(self, *x, **kwargs): - ctx = self(*x) # self - operation i.e 'add', 'sub', etc. + ctx = self(*x) # self - operation i.e 'add', 'sub', etc. 
     # use default params
     params = signature(self.forward).parameters
     for p in params.values():
@@ -249,40 +293,66 @@ class Function:
       for k, v in kwargs.items():
         setattr(ctx, k, v)
     with ProfileOp(ctx.__class__.__name__, x):
-      ret = Tensor(self.forward(ctx, *[t.data for t in x], **kwargs),
-        device=ctx.device, requires_grad=any([t.requires_grad for t in x]))
+      ret = Tensor(
+        self.forward(ctx, *[t.data for t in x], **kwargs),
+        device=ctx.device,
+        requires_grad=any([t.requires_grad for t in x]),
+      )
     if ret.requires_grad:
       ret._ctx = ctx
     return ret

+
 def register(name, fxn, device=Device.CPU):
   Tensor.ops[device][name] = fxn
+
   def dispatch(*x, **kwargs):
     tt = [arg for arg in x if isinstance(arg, Tensor)][0]
-    x = [Tensor(np.array([arg], dtype=tt.dtype), device=tt.device, requires_grad=False) if not isinstance(arg, Tensor) else arg for arg in x]
+    x = [
+      Tensor(np.array([arg], dtype=tt.dtype), device=tt.device, requires_grad=False)
+      if not isinstance(arg, Tensor)
+      else arg
+      for arg in x
+    ]
     f = (Tensor.ops[tt.device])[name]
     f.cl_ctx, f.cl_queue, f.ane, f.device = cl_ctx, cl_queue, ane, tt.device
     return f.apply(f, *x, **kwargs)
+
   setattr(Tensor, name, dispatch)
   # TODO: div is a second class op, so it doesn't work here
-  if name in ['add', 'sub', 'mul', 'pow']:
+  if name in ["add", "sub", "mul", "pow"]:
     setattr(Tensor, f"__{name}__", dispatch)
-    setattr(Tensor, f"__i{name}__", lambda self,x: self.assign(dispatch(self,x)))
-    setattr(Tensor, f"__r{name}__", lambda self,x: dispatch(x,self))
+    setattr(Tensor, f"__i{name}__", lambda self, x: self.assign(dispatch(self, x)))
+    setattr(Tensor, f"__r{name}__", lambda self, x: dispatch(x, self))
+

 for device in [device for device in Device.__dict__.keys() if device[0] != "_"]:
-  setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, Device.__dict__[device]))
-  setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, Device.__dict__[device]))
-  setattr(Tensor, f"is_{device.lower()}", property(functools.partialmethod(Tensor._is, Device.__dict__[device])))
+  setattr(
+    Tensor,
+    f"{device.lower()}",
+    functools.partialmethod(Tensor.to, Device.__dict__[device]),
+  )
+  setattr(
+    Tensor,
+    f"{device.lower()}_",
+    functools.partialmethod(Tensor.to_, Device.__dict__[device]),
+  )
+  setattr(
+    Tensor,
+    f"is_{device.lower()}",
+    property(functools.partialmethod(Tensor._is, Device.__dict__[device])),
+  )

 # this registers all the operations
 import tinygrad.ops_cpu
+
 try:
   import pyopencl as cl
+
   # TODO: move this import to require_init_gpu?
   import tinygrad.ops_gpu
+
   GPU = True
 except ImportError:
   # no GPU support
   GPU = False
-
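None of the hunks in this patch should change behavior, only layout, so the public surface of tinygrad stays the same. A minimal smoke test of that surface, assuming a checkout at this revision on the import path (the shapes and op choices here are arbitrary):

import numpy as np
from tinygrad.tensor import Tensor, GPU

# ops like relu/logsoftmax are attached to Tensor by register()
x = Tensor(np.random.uniform(-1.0, 1.0, size=(4, 3)).astype(np.float32))
out = x.relu().logsoftmax().mean()
out.backward()
print(x.grad.shape)  # (4, 3), same shape as x per the assert in backward()

# per-device helpers (cpu()/gpu()/is_gpu etc.) are generated from Device
if GPU:
  y = x.gpu()  # copy to the GPU; y.is_gpu is True

Running something like this before and after the reformat (and diffing the numeric output) is a quick way to verify the patch is behavior-preserving.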