def generate_anchors(input_size:Tuple[int, int], batch_size:int = 1, scales:Optional[Tuple[Tensor, ...]] = None,
                     aspect_ratios:Optional[Tuple[Tensor, ...]] = None) -> List[Tensor]:
  """Generate RetinaNet anchor boxes for an image of size `input_size` (H, W).

  Anchors are built for 5 feature-map levels with strides 8..128 (2**3 .. 2**7).
  With the defaults there are 3 scales x 3 aspect ratios = 9 anchors per grid cell.

  Args:
    input_size: (height, width) of the network input.
    batch_size: number of images; when > 1 the anchors are replicated per image.
    scales: optional per-level scale tensors; defaults to base sizes 32..512 with
      octave steps 2**0, 2**(1/3), 2**(2/3).
    aspect_ratios: optional per-level aspect-ratio tuples; defaults to (0.5, 1.0, 2.0).

  Returns:
    If batch_size <= 1: one (H_l*W_l*A, 4) x1y1x2y2 tensor per feature-map level.
    If batch_size > 1: one concatenated (sum_l H_l*W_l*A, 4) tensor per image.
  """
  def _compute_grid_sizes(input_size:Tuple[int, int]) -> np.ndarray:
    # feature-map sizes for levels P3..P7, i.e. ceil(input / stride) for strides 8..128
    return np.ceil(np.array(input_size)[None, :] / 2 ** np.arange(3, 8)[:, None])

  if scales is None:
    # base anchor sizes 2**5 .. 2**9, each expanded into three intra-octave scales
    scales = tuple(Tensor((i, int(i * 2 ** (1/3)), int(i * 2 ** (2/3)))) for i in 2 ** np.arange(5, 10))
  if aspect_ratios is None:
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(scales)
  aspect_ratios = tuple(Tensor(ar) for ar in aspect_ratios)
  grid_sizes = _compute_grid_sizes(input_size)

  assert len(scales) == len(aspect_ratios) == len(grid_sizes), "scales, aspect_ratios, and grid_sizes must have the same length"

  anchors_over_all_feature_maps = []
  for s, ar, gs in zip(scales, aspect_ratios, grid_sizes):
    # widths/heights for every (aspect_ratio, scale) pair; sqrt keeps the anchor area
    # constant across aspect ratios (h/w == ar, h*w == s**2)
    h_ratios = ar.sqrt()
    w_ratios = 1 / h_ratios
    ws = (w_ratios[:, None] * s[None, :]).reshape(-1)
    hs = (h_ratios[:, None] * s[None, :]).reshape(-1)
    # zero-centered base anchors in x1y1x2y2 form
    base_anchors = (Tensor.stack(-ws, -hs, ws, hs, dim=1) / 2).round()
    # shift the base anchors to every cell of this level's grid
    stride_h, stride_w = input_size[0] // gs[0], input_size[1] // gs[1]
    shifts_x, shifts_y = (Tensor.arange(gs[1]) * stride_w).meshgrid(Tensor.arange(gs[0]) * stride_h, indexing="xy")
    shifts_x, shifts_y = shifts_x.reshape(-1), shifts_y.reshape(-1)
    shifts = Tensor.stack(shifts_x, shifts_y, shifts_x, shifts_y, dim=1)
    anchors_over_all_feature_maps.append((shifts[:, None] + base_anchors[None, :]).reshape(-1, 4))

  if batch_size > 1:
    # anchors are identical for every image in the batch; concatenate the per-level
    # anchors into one (total_anchors, 4) tensor per image
    return [Tensor.cat(*anchors_over_all_feature_maps) for _ in range(batch_size)]
  return anchors_over_all_feature_maps
np.array(s), np.array(ar) - h_ratios = np.sqrt(ar) - w_ratios = 1 / h_ratios - ws = (w_ratios[:, None] * s[None, :]).reshape(-1) - hs = (h_ratios[:, None] * s[None, :]).reshape(-1) - base_anchors = (np.stack([-ws, -hs, ws, hs], axis=1) / 2).round() - stride_h, stride_w = input_size[0] // gs[0], input_size[1] // gs[1] - shifts_x, shifts_y = np.meshgrid(np.arange(gs[1]) * stride_w, np.arange(gs[0]) * stride_h) - shifts_x = shifts_x.reshape(-1) - shifts_y = shifts_y.reshape(-1) - shifts = np.stack([shifts_x, shifts_y, shifts_x, shifts_y], axis=1, dtype=np.float32) - anchors.append((shifts[:, None] + base_anchors[None, :]).reshape(-1, 4)) - return anchors - class RetinaNet: def __init__(self, backbone: ResNet, num_classes=264, num_anchors=9, scales=None, aspect_ratios=None): assert isinstance(backbone, ResNet) @@ -48,7 +31,6 @@ class RetinaNet: self.backbone = ResNetFPN(backbone) self.head = RetinaHead(self.backbone.out_channels, num_anchors=num_anchors, num_classes=num_classes) - self.anchor_gen = lambda input_size: generate_anchors(input_size, self.backbone.compute_grid_sizes(input_size), scales, aspect_ratios) def __call__(self, x:Tensor, y:Optional[Tensor] = None, matches:Optional[Tensor] = None): return self.forward(x, y=y, matches=matches) @@ -73,7 +55,7 @@ class RetinaNet: # predictions: (BS, (H1W1+...+HmWm)A, 4 + K) def postprocess_detections(self, predictions, input_size=(800, 800), image_sizes=None, orig_image_sizes=None, score_thresh=0.05, topk_candidates=1000, nms_thresh=0.5): - anchors = self.anchor_gen(input_size) + anchors = generate_anchors(input_size) grid_sizes = self.backbone.compute_grid_sizes(input_size) split_idx = np.cumsum([int(self.num_anchors * sz[0] * sz[1]) for sz in grid_sizes[:-1]]) detections = [] @@ -86,6 +68,8 @@ class RetinaNet: image_boxes, image_scores, image_labels = [], [], [] for offsets_per_level, scores_per_level, anchors_per_level in zip(offsets_per_image, scores_per_image, anchors): + anchors_per_level = 
anchors_per_level.numpy() + # remove low scoring boxes scores_per_level = scores_per_level.flatten() keep_idxs = scores_per_level > score_thresh