diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 8f20b2d1cc..83f137e024 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -23,7 +23,7 @@ jobs:
     - name: Repo line count
      run: python3 sz.py
    - name: Lint with pylint
-      run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string='  ' **/*.py
+      run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string='  ' **/*.py
    - name: Lint with flake8
      run: flake8 tinygrad/ --indent-size=2 --select=F,E112,E113,E203,E304,E502,E702,E703,E71,E72,E731,W191,W6 --statistics -j4
    - name: Lint tinygrad with pylint
diff --git a/examples/compile_efficientnet.py b/examples/compile_efficientnet.py
index 4d7c0a7b84..10cfd9fcb6 100644
--- a/examples/compile_efficientnet.py
+++ b/examples/compile_efficientnet.py
@@ -69,7 +69,7 @@ if __name__ == "__main__":
   # the functions
   cprog += list(functions.values())
 
-  # the net 
+  # the net
   cprog += ["void net() {"] + [f"{name}({', '.join(args)});" for (name, args, _global_size) in statements] + ["}"]
 
   cprog += ["""
diff --git a/examples/deep_deterministic_policy_gradient.py b/examples/deep_deterministic_policy_gradient.py
index c162aab391..906069f0db 100644
--- a/examples/deep_deterministic_policy_gradient.py
+++ b/examples/deep_deterministic_policy_gradient.py
@@ -114,8 +114,8 @@ class DeepDeterministicPolicyGradient:
       noise_stddev: The standard deviation of the exploration noise.
 
     Note:
-      In contrast to the original paper, actions are already included in the first layer 
-      of the Critic and we use a Gaussian distribution instead of an Ornstein Uhlenbeck 
+      In contrast to the original paper, actions are already included in the first layer
+      of the Critic and we use a Gaussian distribution instead of an Ornstein Uhlenbeck
       process for exploration noise.
""" @@ -203,7 +203,7 @@ class DeepDeterministicPolicyGradient: next_state_batch, done_batch, ) = self.memory.sample() - + target_actions = self.target_actor.forward(next_state_batch, self.max_action) y = reward_batch + self.gamma * self.target_critic.forward( next_state_batch, target_actions.detach() diff --git a/examples/yolov8-onnx.py b/examples/yolov8-onnx.py index 53c2c0a39c..3bab3c2956 100644 --- a/examples/yolov8-onnx.py +++ b/examples/yolov8-onnx.py @@ -7,7 +7,7 @@ from tinygrad.tensor import Tensor os.chdir("/tmp") if not os.path.isfile("yolov8n-seg.onnx"): - model = YOLO("yolov8n-seg.pt") + model = YOLO("yolov8n-seg.pt") model.export(format="onnx", imgsz=[480,640]) onnx_model = onnx.load(open("yolov8n-seg.onnx", "rb")) # TODO: move get example inputs to onnx diff --git a/examples/yolov8.py b/examples/yolov8.py index 956884a592..71da6bf25b 100644 --- a/examples/yolov8.py +++ b/examples/yolov8.py @@ -48,10 +48,10 @@ def box_area(box): def box_iou(box1, box2): lt = np.maximum(box1[:, None, :2], box2[:, :2]) rb = np.minimum(box1[:, None, 2:], box2[:, 2:]) - wh = np.clip(rb - lt, 0, None) - inter = wh[:, :, 0] * wh[:, :, 1] - area1 = box_area(box1)[:, None] - area2 = box_area(box2)[None, :] + wh = np.clip(rb - lt, 0, None) + inter = wh[:, :, 0] * wh[:, :, 1] + area1 = box_area(box1)[:, None] + area2 = box_area(box2)[None, :] iou = inter / (area1 + area2 - inter) return iou @@ -66,7 +66,7 @@ def compute_nms(boxes, scores, iou_threshold): inds = np.where(iou.squeeze() <= iou_threshold)[0] order = order[inds + 1] return np.array(keep) - + def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, agnostic=False, max_det=300, nc=0, max_wh=7680): prediction = prediction[0] if isinstance(prediction, (list, tuple)) else prediction bs, nc = prediction.shape[0], nc or (prediction.shape[1] - 4) @@ -86,7 +86,7 @@ def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, agnostic=Fa c = x[:, 5:6] * (0 if agnostic else max_wh) boxes, scores = x[:, :4] + c, x[:, 4] i = compute_nms(boxes, scores, iou_thres)[:max_det] - output[xi] = x[i] + output[xi] = x[i] return output def postprocess(preds, img, orig_imgs): @@ -102,7 +102,7 @@ def postprocess(preds, img, orig_imgs): pred[:, :4] = scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape) all_preds.append(pred) return all_preds - + def draw_bounding_boxes_and_save(orig_img_paths, output_img_paths, all_predictions, class_labels, iou_threshold=0.5): color_dict = {label: tuple((((i+1) * 50) % 256, ((i+1) * 100) % 256, ((i+1) * 150) % 256)) for i, label in enumerate(class_labels)} font = cv2.FONT_HERSHEY_SIMPLEX @@ -159,7 +159,7 @@ def draw_bounding_boxes_and_save(orig_img_paths, output_img_paths, all_predictio cv2.imwrite(output_img_path, orig_img) print(f'saved detections at {output_img_path}') -# utility functions for forward pass. +# utility functions for forward pass. 
 def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
   lt, rb = distance.chunk(2, dim)
   x1y1 = anchor_points - lt
@@ -167,7 +167,7 @@ def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
   if xywh:
     c_xy = (x1y1 + x2y2) / 2
     wh = x2y2 - x1y1
-    return c_xy.cat(wh, dim=1) 
+    return c_xy.cat(wh, dim=1)
   return x1y1.cat(x2y2, dim=1)
 
 def make_anchors(feats, strides, grid_cell_offset=0.5):
@@ -175,13 +175,13 @@ def make_anchors(feats, strides, grid_cell_offset=0.5):
   assert feats is not None
   for i, stride in enumerate(strides):
     _, _, h, w = feats[i].shape
-    sx = Tensor.arange(w) + grid_cell_offset 
-    sy = Tensor.arange(h) + grid_cell_offset 
-    
-    # this is np.meshgrid but in tinygrad 
+    sx = Tensor.arange(w) + grid_cell_offset
+    sy = Tensor.arange(h) + grid_cell_offset
+
+    # this is np.meshgrid but in tinygrad
     sx = sx.reshape(1, -1).repeat([h, 1]).reshape(-1)
     sy = sy.reshape(-1, 1).repeat([1, w]).reshape(-1)
-    
+
     anchor_points.append(Tensor.stack((sx, sy), -1).reshape(-1, 2))
     stride_tensor.append(Tensor.full((h * w), stride))
   anchor_points = anchor_points[0].cat(anchor_points[1], anchor_points[2])
@@ -244,32 +244,32 @@ class Upsample:
     (b, c), _lens = x.shape[:2], len(x.shape[2:])
     tmp = x.reshape([b, c, -1] + [1] * _lens) * Tensor.ones(*[1, 1, 1] + [self.scale_factor] * _lens)
     return tmp.reshape(list(x.shape) + [self.scale_factor] * _lens).permute([0, 1] + list(chain.from_iterable([[y+2, y+2+_lens] for y in range(_lens)]))).reshape([b, c] + [x * self.scale_factor for x in x.shape[2:]])
-  
+
 class Conv_Block():
   def __init__(self, c1, c2, kernel_size=1, stride=1, groups=1, dilation=1, padding=None):
     self.conv = Conv2d(c1,c2, kernel_size, stride, padding=autopad(kernel_size, padding, dilation), bias=False, groups=groups, dilation=dilation)
     self.bn = BatchNorm2d(c2, eps=0.001)
-  
+
   def __call__(self, x):
     return self.bn(self.conv(x)).silu()
-  
+
 class Bottleneck:
   def __init__(self, c1, c2 , shortcut: bool, g=1, kernels: list = (3,3), channel_factor=0.5):
     c_ = int(c2 * channel_factor)
     self.cv1 = Conv_Block(c1, c_, kernel_size=kernels[0], stride=1, padding=None)
     self.cv2 = Conv_Block(c_, c2, kernel_size=kernels[1], stride=1, padding=None, groups=g)
     self.residual = c1 == c2 and shortcut
-  
+
   def __call__(self, x):
     return x + self.cv2(self.cv1(x)) if self.residual else self.cv2(self.cv1(x))
 
 class C2f:
-  def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): 
-    self.c = int(c2 * e) 
+  def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
+    self.c = int(c2 * e)
     self.cv1 = Conv_Block(c1, 2 * self.c, 1,)
     self.cv2 = Conv_Block((2 + n) * self.c, c2, 1)
     self.bottleneck = [Bottleneck(self.c, self.c, shortcut, g, kernels=[(3, 3), (3, 3)], channel_factor=1.0) for _ in range(n)]
-  
+
   def __call__(self, x):
     y= list(self.cv1(x).chunk(2, 1))
     y.extend(m(y[-1]) for m in self.bottleneck)
@@ -282,17 +282,17 @@ class SPPF:
     c_ = c1 // 2 # hidden channels
     self.cv1 = Conv_Block(c1, c_, 1, 1, padding=None)
     self.cv2 = Conv_Block(c_ * 4, c2, 1, 1, padding=None)
-    
-    # TODO: this pads with 0s, whereas torch function pads with -infinity. This results in a < 2% difference in prediction which does not make a difference visually. 
+
+    # TODO: this pads with 0s, whereas torch function pads with -infinity. This results in a < 2% difference in prediction which does not make a difference visually.
     self.maxpool = lambda x : x.pad2d((k // 2, k // 2, k // 2, k // 2)).max_pool2d(kernel_size=k, stride=1)
-  
+
   def __call__(self, x):
     x = self.cv1(x)
     x2 = self.maxpool(x)
     x3 = self.maxpool(x2)
     x4 = self.maxpool(x3)
     return self.cv2(x.cat(x2, x3, x4, dim=1))
-  
+
 class DFL:
   def __init__(self, c1=16):
     self.conv = Conv2d(c1, 1, 1, bias=False)
@@ -303,19 +303,19 @@ class DFL:
   def __call__(self, x):
     b, c, a = x.shape # batch, channels, anchors
     return self.conv(x.reshape(b, 4, self.c1, a).transpose(2, 1).softmax(1)).reshape(b, 4, a)
-  
-#backbone 
+
+#backbone
 class Darknet:
-  def __init__(self, w, r, d): 
+  def __init__(self, w, r, d):
     self.b1 = [Conv_Block(c1=3, c2= int(64*w), kernel_size=3, stride=2, padding=1), Conv_Block(int(64*w), int(128*w), kernel_size=3, stride=2, padding=1)]
     self.b2 = [C2f(c1=int(128*w), c2=int(128*w), n=round(3*d), shortcut=True), Conv_Block(int(128*w), int(256*w), 3, 2, 1), C2f(int(256*w), int(256*w), round(6*d), True)]
     self.b3 = [Conv_Block(int(256*w), int(512*w), kernel_size=3, stride=2, padding=1), C2f(int(512*w), int(512*w), round(6*d), True)]
     self.b4 = [Conv_Block(int(512*w), int(512*w*r), kernel_size=3, stride=2, padding=1), C2f(int(512*w*r), int(512*w*r), round(3*d), True)]
     self.b5 = [SPPF(int(512*w*r), int(512*w*r), 5)]
-  
+
   def return_modules(self):
     return [*self.b1, *self.b2, *self.b3, *self.b4, *self.b5]
-  
+
   def __call__(self, x):
     x1 = x.sequential(self.b1)
     x2 = x1.sequential(self.b2)
@@ -334,10 +334,10 @@ class Yolov8NECK:
     self.n4 = C2f(c1=int(768*w), c2=int(512*w), n=round(3*d), shortcut=False)
     self.n5 = Conv_Block(c1=int(512* w), c2=int(512 * w), kernel_size=3, stride=2, padding=1)
     self.n6 = C2f(c1=int(512*w*(1+r)), c2=int(512*w*r), n=round(3*d), shortcut=False)
-  
+
   def return_modules(self):
     return [self.n1, self.n2, self.n3, self.n4, self.n5, self.n6]
-  
+
   def __call__(self, p3, p4, p5):
     x = self.n1(self.up(p5).cat(p4, dim=1))
     head_1 = self.n2(self.up(x).cat(p3, dim=1))
@@ -345,20 +345,20 @@ class Yolov8NECK:
     head_3 = self.n6(self.n5(head_2).cat(p5, dim=1))
     return [head_1, head_2, head_3]
 
-#task specific head. 
+#task specific head.
 class DetectionHead:
   def __init__(self, nc=80, filters=()):
-    self.ch = 16 
+    self.ch = 16
     self.nc = nc # number of classes
-    self.nl = len(filters) 
+    self.nl = len(filters)
     self.no = nc + self.ch * 4 #
     self.stride = [8, 16, 32]
     c1 = max(filters[0], self.nc)
     c2 = max((filters[0] // 4, self.ch * 4))
-    self.dfl = DFL(self.ch) 
+    self.dfl = DFL(self.ch)
     self.cv3 = [[Conv_Block(x, c1, 3), Conv_Block(c1, c1, 3), Conv2d(c1, self.nc, 1)] for x in filters]
     self.cv2 = [[Conv_Block(x, c2, 3), Conv_Block(c2, c2, 3), Conv2d(c2, 4 * self.ch, 1)] for x in filters]
-  
+
   def __call__(self, x):
     for i in range(self.nl):
       x[i] = (x[i].sequential(self.cv2[i]).cat(x[i].sequential(self.cv3[i]), dim=1))
@@ -369,7 +369,7 @@ class DetectionHead:
     dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides
     z = dbox.cat(cls.sigmoid(), dim=1)
     return z
-  
+
 class YOLOv8:
   def __init__(self, w, r, d, num_classes): #width_multiple, ratio_multiple, depth_multiple
     self.net = Darknet(w, r, d)
@@ -386,9 +386,9 @@ class YOLOv8:
     yolov8neck_modules = [12, 15, 16, 18, 19, 21]
     yolov8_head_weights = [(22, self.head)]
     return [*zip(backbone_modules, self.net.return_modules()), *zip(yolov8neck_modules, self.fpn.return_modules()), *yolov8_head_weights]
-  
+
 if __name__ == '__main__':
-  
+
   # usage : python3 yolov8.py "image_URL OR image_path" "v8 variant" (optional, n is default)
   if len(sys.argv) < 2:
     print("Error: Image URL or path not provided.")
@@ -397,7 +397,7 @@ if __name__ == '__main__':
   img_path = sys.argv[1]
   yolo_variant = sys.argv[2] if len(sys.argv) >= 3 else (print("No variant given, so choosing 'n' as the default. Yolov8 has different variants, you can choose from ['n', 's', 'm', 'l', 'x']") or 'n')
   print(f'running inference for YOLO version {yolo_variant}')
-  
+
   output_folder_path = './outputs_yolov8'
   if not os.path.exists(output_folder_path):
     os.makedirs(output_folder_path)
@@ -409,31 +409,31 @@ if __name__ == '__main__':
     print('Error in image loading. Check your image file.')
     sys.exit(1)
   pre_processed_image = preprocess(image)
-  
+
   # Different YOLOv8 variants use different w , r, and d multiples. For a list , refer to this yaml file (the scales section) https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/v8/yolov8.yaml
-  depth, width, ratio = get_variant_multiples(yolo_variant) 
-  yolo_infer = YOLOv8(w=width, r=ratio, d=depth, num_classes=80) 
-  
+  depth, width, ratio = get_variant_multiples(yolo_variant)
+  yolo_infer = YOLOv8(w=width, r=ratio, d=depth, num_classes=80)
+
   weights_location = Path(__file__).parent.parent / "weights" / f'yolov8{yolo_variant}.safetensors'
   download_file(f'https://gitlab.com/r3sist/yolov8_weights/-/raw/master/yolov8{yolo_variant}.safetensors', weights_location)
-  
+
   state_dict = safe_load(weights_location)
   load_state_dict(yolo_infer, state_dict)
-  
+
   st = time.time()
   predictions = yolo_infer(pre_processed_image)
   print(f'did inference in {int(round(((time.time() - st) * 1000)))}ms')
   post_predictions = postprocess(preds=predictions, img=pre_processed_image, orig_imgs=image)
-  
+
   #v8 and v3 have same 80 class names for Object Detection
   class_labels = fetch('https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names')
   class_labels = class_labels.decode('utf-8').split('\n')
   draw_bounding_boxes_and_save(orig_img_paths=image_location, output_img_paths=out_paths, all_predictions=post_predictions, class_labels=class_labels)
 
-# TODO for later: 
-# 1. Fix SPPF minor difference due to maxpool 
-# 2. AST exp overflow warning while on cpu 
-# 3. Make NMS faster 
+# TODO for later:
+# 1. Fix SPPF minor difference due to maxpool
+# 2. AST exp overflow warning while on cpu
+# 3. Make NMS faster
 # 4. Add video inference and webcam support
\ No newline at end of file
diff --git a/extra/lr_scheduler.py b/extra/lr_scheduler.py
index 997d92236e..25badc1743 100644
--- a/extra/lr_scheduler.py
+++ b/extra/lr_scheduler.py
@@ -7,9 +7,9 @@ class LR_Scheduler:
   def __init__(self, optimizer: Optimizer):
     self.optimizer = optimizer
     self.epoch_counter = Tensor([0], requires_grad=False)
-  
+
   def get_lr(self): pass
-  
+
   def step(self) -> None:
     self.epoch_counter.assign(self.epoch_counter + 1).realize()
     self.optimizer.lr.assign(self.get_lr()).realize()
@@ -19,7 +19,7 @@ class MultiStepLR(LR_Scheduler):
     super().__init__(optimizer)
     self.milestones = milestones
     self.gamma = gamma
-  
+
   def get_lr(self) -> Tensor:
     if self.epoch_counter.numpy()[0] not in self.milestones:
       return self.optimizer.lr
@@ -34,13 +34,13 @@ class ReduceLROnPlateau(LR_Scheduler):
     self.bad_epoch = 0
     if mode == "min": self.threshold *= -1
-  
+
   def is_better(self, current: float) -> bool:
     dynamic_threshold = self.best*(1+self.threshold) if self.threshold_mode == "rel" else self.best+self.threshold
     if self.mode == "min": return current < dynamic_threshold
     return current > dynamic_threshold
-  
+
   def step(self, current: float) -> None:
     self.epoch_counter.assign(self.epoch_counter + 1).realize()
     if self.is_better(current):
@@ -48,7 +48,7 @@ class ReduceLROnPlateau(LR_Scheduler):
       self.best = current
     else:
      self.bad_epoch += 1
-    
+
     if self.bad_epoch > self.patience:
       self.optimizer.lr *= self.factor
       self.bad_epoch = 0
@@ -74,12 +74,12 @@ class OneCycleLR(LR_Scheduler):
     self.pct_start = pct_start
     assert anneal_strategy == 'linear', 'only linear annealing supported'
     assert not cycle_momentum, 'cycle momentum not supported'
-    self.optimizer.lr.assign(self.get_lr()).realize() # update the initial LR 
+    self.optimizer.lr.assign(self.get_lr()).realize() # update the initial LR
 
   @staticmethod
   def _annealing_linear(start: Tensor, end: Tensor, pct: Tensor) -> Tensor: return ((end - start) * pct + start)
 
-  def get_lr(self) -> Tensor: 
+  def get_lr(self) -> Tensor:
     return (self.epoch_counter < self.total_steps*self.pct_start).where(
       self._annealing_linear(self.initial_lr, self.max_lr, self.epoch_counter/(self.total_steps*self.pct_start)),
       self._annealing_linear(self.max_lr, self.min_lr, (self.epoch_counter-(self.total_steps*self.pct_start))/(self.total_steps*(1-self.pct_start)))
diff --git a/extra/onnx.py b/extra/onnx.py
index 3dc64fea8d..7477771570 100644
--- a/extra/onnx.py
+++ b/extra/onnx.py
@@ -78,7 +78,7 @@ def get_run_onnx(onnx_model: ModelProto):
   attribute_dict = {}
   for num,n in enumerate(onnx_model.graph.node):
     attribute_dict[num] = attribute_to_dict(n.attribute)
-  
+
   onnx_model_version = onnx_model.opset_import[0].version
 
   def run_onnx(inputs={}, debug=False):
@@ -204,7 +204,7 @@ def get_run_onnx(onnx_model: ModelProto):
       assert len(n.output) <= len(ret), f"expected output size must be less than {len(ret)}, it's {n.output}"
      if debug: print([x.shape if isinstance(x, Tensor) else None for x in ret])
      if debug: print("outputs:")
-      for i in range(len(n.output)): 
+      for i in range(len(n.output)):
        if debug: print(f"\t{n.output[i]} - {ret[i]}")
        intermediate_tensors[n.output[i]] = ret[i]
      #print(ret[0].numpy().mean())
diff --git a/extra/onnx_ops.py b/extra/onnx_ops.py
index 377f340f65..b57a5ab4a4 100644
--- a/extra/onnx_ops.py
+++ b/extra/onnx_ops.py
@@ -209,7 +209,7 @@ def Or(x:Tensor, y:Tensor): return Where((x==y), x, Tensor.ones(*x.shape)).cast(
 def Xor(x:Tensor, y:Tensor): return Where((x==y), Tensor.zeros(*x.shape), Tensor.ones(*x.shape)).cast(dtypes.bool)
 def Not(x:Tensor): return Where((x==1), Tensor.zeros(*x.shape), Tensor.ones(*x.shape)).cast(dtypes.bool)
 
-def Trilu(x: Tensor, k: Union[Tensor, int]=0, upper=1): 
+def Trilu(x: Tensor, k: Union[Tensor, int]=0, upper=1):
   k = int(k.numpy().item()) if k is not 0 else 0 # onnx passes k as a tensor int64 with one element, default is 0
   return x.triu(k) if upper else x.tril(k)
 
@@ -242,13 +242,13 @@ def NegativeLogLikelihoodLoss(input, target, weight=None, ignore_index=None, red
   input = input.reshape((N, C, -1))
   target = target.reshape((N, -1))
   if weight is not None:
-    mask = target.unsqueeze(-1) == Tensor.arange(C,dtype=dtypes.int64).repeat((N, 1, 1)) 
+    mask = target.unsqueeze(-1) == Tensor.arange(C,dtype=dtypes.int64).repeat((N, 1, 1))
     weight = (mask * weight).sum(axis=-1)
   if ignore_index is not None:
     cond = (target == ignore_index)
-    weight = cond.where(0, weight) if weight is not None else cond.where(Tensor.zeros(*target.shape), 1) 
-  mask = target[:, None, :] == Tensor.arange(C).reshape([1, C] + [1]*(len(input.shape) -2)) 
-  loss = (-mask * input).sum(axis=1) * (1 if weight is None else weight) 
+    weight = cond.where(0, weight) if weight is not None else cond.where(Tensor.zeros(*target.shape), 1)
+  mask = target[:, None, :] == Tensor.arange(C).reshape([1, C] + [1]*(len(input.shape) -2))
+  loss = (-mask * input).sum(axis=1) * (1 if weight is None else weight)
   if reduction == "mean": return loss.mean() if weight is None else loss.sum() / weight.sum()
   elif reduction == "sum": return loss.sum()
   return loss.reshape(t_shape) if len(i_shape) != 3 else loss
@@ -259,7 +259,7 @@ def OneHot(indices, depth, values, axis=-1):
   if axis < 0: axis += rank + 1
   ls, rs = indices.shape[0:axis], indices.shape[axis: rank]
   cond = indices[:,None] == Tensor.arange(depth).reshape((1,) * len(ls) + (depth,) + (1,) * len(rs))
-  return cond.where(values[1], values[0]).cast(values.dtype) 
+  return cond.where(values[1], values[0]).cast(values.dtype)
 
 def Floor(x:Tensor): return x.floor()
 def Ceil(x:Tensor): return x.ceil()
diff --git a/extra/training.py b/extra/training.py
index f2dae8f620..4deace07a1 100644
--- a/extra/training.py
+++ b/extra/training.py
@@ -13,7 +13,7 @@ def sparse_categorical_crossentropy(out, Y):
   y = Tensor(y)
   return out.mul(y).mean()
 
-def train(model, X_train, Y_train, optim, steps, BS=128, lossfn=sparse_categorical_crossentropy, 
+def train(model, X_train, Y_train, optim, steps, BS=128, lossfn=sparse_categorical_crossentropy,
           transform=lambda x: x, target_transform=lambda x: x, noloss=False):
   Tensor.training = True
   losses, accuracies = [], []
@@ -41,9 +41,9 @@ def train(model, X_train, Y_train, optim, steps, BS=128, lossfn=sparse_categoric
       accuracies.append(accuracy)
       t.set_description("loss %.2f accuracy %.2f" % (loss, accuracy))
   return [losses, accuracies]
-  
-def evaluate(model, X_test, Y_test, num_classes=None, BS=128, return_predict=False, transform=lambda x: x, 
+
+def evaluate(model, X_test, Y_test, num_classes=None, BS=128, return_predict=False, transform=lambda x: x,
              target_transform=lambda y: y):
   Tensor.training = False
   def numpy_eval(Y_test, num_classes):
diff --git a/models/mask_rcnn.py b/models/mask_rcnn.py
index 33ea5fb18d..e91f70c66f 100644
--- a/models/mask_rcnn.py
+++ b/models/mask_rcnn.py
@@ -40,7 +40,7 @@ def topk(input_, k, dim=-1, largest=True, sorted=False):
   ind_part = np.argsort(input_, axis=dim)
   ind = np.take_along_axis(ind, ind_part, axis=dim)
   if largest: input_ *= -1
-  val = np.take_along_axis(input_, ind_part, axis=dim) 
+  val = np.take_along_axis(input_, ind_part, axis=dim)
   return Tensor(val), ind
 
 # This is very slow for large arrays, or indices
@@ -48,12 +48,12 @@ def _gather(array, indices):
   indices = indices.float().to(array.device)
   reshape_arg = [1]*array.ndim + [array.shape[-1]]
   return Tensor.where(
-    indices.unsqueeze(indices.ndim).expand(*indices.shape, array.shape[-1]) == Tensor.arange(array.shape[-1]).reshape(*reshape_arg).expand(*indices.shape, array.shape[-1]), 
+    indices.unsqueeze(indices.ndim).expand(*indices.shape, array.shape[-1]) == Tensor.arange(array.shape[-1]).reshape(*reshape_arg).expand(*indices.shape, array.shape[-1]),
     array, 0,
   ).sum(indices.ndim)
 
 # TODO: replace npgather with a faster gather using tinygrad only
-# NOTE: this blocks the gradient 
+# NOTE: this blocks the gradient
 def npgather(array,indices):
   if isinstance(array, Tensor): array = array.numpy()
   if isinstance(indices, Tensor): indices = indices.numpy()
@@ -98,7 +98,7 @@ def tensor_gather(tensor, indices):
 
   return ret
 
-class LastLevelMaxPool: 
+class LastLevelMaxPool:
   def __call__(self, x): return [Tensor.max_pool2d(x, 1, 2)]
 
@@ -853,7 +853,7 @@ def _bilinear_interpolate(
   w2 = outer_prod(hy, lx)
   w3 = outer_prod(ly, hx)
   w4 = outer_prod(ly, lx)
-  
+
   val = w1*v1 + w2*v2 + w3*v3 + w4*v4
   return val
 
@@ -861,41 +861,41 @@ def _bilinear_interpolate(
 def _roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned):
   orig_dtype = input.dtype
   _, _, height, width = input.shape
-  ph = Tensor.arange(pooled_height, device=input.device) 
-  pw = Tensor.arange(pooled_width, device=input.device) 
+  ph = Tensor.arange(pooled_height, device=input.device)
+  pw = Tensor.arange(pooled_width, device=input.device)
 
-  roi_batch_ind = rois[:, 0].cast(dtypes.int32).contiguous() 
+  roi_batch_ind = rois[:, 0].cast(dtypes.int32).contiguous()
   offset = 0.5 if aligned else 0.0
   roi_start_w = rois[:, 1] * spatial_scale - offset
   roi_start_h = rois[:, 2] * spatial_scale - offset
-  roi_end_w = rois[:, 3] * spatial_scale - offset 
+  roi_end_w = rois[:, 3] * spatial_scale - offset
   roi_end_h = rois[:, 4] * spatial_scale - offset
 
-  roi_width = roi_end_w - roi_start_w 
-  roi_height = roi_end_h - roi_start_h 
+  roi_width = roi_end_w - roi_start_w
+  roi_height = roi_end_h - roi_start_h
   if not aligned:
-    roi_width = roi_width.maximum(1.0) 
-    roi_height = roi_height.maximum(1.0) 
+    roi_width = roi_width.maximum(1.0)
+    roi_height = roi_height.maximum(1.0)
 
-  bin_size_h = roi_height / pooled_height 
-  bin_size_w = roi_width / pooled_width 
+  bin_size_h = roi_height / pooled_height
+  bin_size_w = roi_width / pooled_width
 
   exact_sampling = sampling_ratio > 0
 
-  roi_bin_grid_h = sampling_ratio if exact_sampling else (roi_height / pooled_height).ceil() 
+  roi_bin_grid_h = sampling_ratio if exact_sampling else (roi_height / pooled_height).ceil()
   roi_bin_grid_w = sampling_ratio if exact_sampling else (roi_width / pooled_width).ceil()
 
   if exact_sampling:
-    count = max(roi_bin_grid_h * roi_bin_grid_w, 1) 
-    iy = Tensor.arange(roi_bin_grid_h, device=input.device) 
-    ix = Tensor.arange(roi_bin_grid_w, device=input.device) 
+    count = max(roi_bin_grid_h * roi_bin_grid_w, 1)
+    iy = Tensor.arange(roi_bin_grid_h, device=input.device)
+    ix = Tensor.arange(roi_bin_grid_w, device=input.device)
     ymask = None
     xmask = None
   else:
     count = (roi_bin_grid_h * roi_bin_grid_w).maximum(1)
-    iy = Tensor.arange(height, device=input.device) 
-    ix = Tensor.arange(width, device=input.device) 
-    ymask = iy[None, :] < roi_bin_grid_h[:, None] 
-    xmask = ix[None, :] < roi_bin_grid_w[:, None] 
+    iy = Tensor.arange(height, device=input.device)
+    ix = Tensor.arange(width, device=input.device)
+    ymask = iy[None, :] < roi_bin_grid_h[:, None]
+    xmask = ix[None, :] < roi_bin_grid_w[:, None]
 
   def from_K(t): return t[:, None, None]
diff --git a/models/unet3d.py b/models/unet3d.py
index 289ed4c86b..555c622214 100644
--- a/models/unet3d.py
+++ b/models/unet3d.py
@@ -30,7 +30,7 @@ class UNet3D:
     self.input_block = DownsampleBlock(in_channels, filters[0], stride=1)
     self.downsample = [DownsampleBlock(i, o) for i, o in zip(inp, out)]
     self.bottleneck = DownsampleBlock(filters[-1], filters[-1])
-    self.upsample = [UpsampleBlock(filters[-1], filters[-1])] + [UpsampleBlock(i, o) for i, o in zip(out[::-1], inp[::-1])] 
+    self.upsample = [UpsampleBlock(filters[-1], filters[-1])] + [UpsampleBlock(i, o) for i, o in zip(out[::-1], inp[::-1])]
     self.output = {"conv": nn.Conv2d(filters[0], n_class, kernel_size=(1, 1, 1))}
 
   def __call__(self, x):
@@ -44,7 +44,7 @@ class UNet3D:
      x = upsample(x, skip)
    x = self.output["conv"](x)
    return x
-  
+
   def load_from_pretrained(self):
     fn = Path(__file__).parent.parent / "weights" / "unet-3d.ckpt"
     download_file("https://zenodo.org/record/5597155/files/3dunet_kits19_pytorch.ptc?download=1", fn)
diff --git a/test/test_conv.py b/test/test_conv.py
index a735613f9e..433a705345 100644
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -104,7 +104,7 @@ class TestConv(unittest.TestCase):
     x = x.conv2d(w, groups=32)
     out = x.numpy()
     Tensor.no_grad = False
-    
+
  def test_multiadd(self):
    w = Tensor.ones(32)
    x = Tensor.ones(32).relu()
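--
The only behavioral change in this patch is the first hunk, which enables pylint's C0303 (trailing-whitespace) check in the CI lint step alongside the existing W0311 (bad-indentation) check; every other hunk strips the trailing whitespace that C0303 would otherwise flag. As a sketch of reproducing the check locally before committing (assuming pylint is installed and the shell expands **/*.py recursively, e.g. bash with globstar), the same command the workflow runs is:

  python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string='  ' **/*.py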