def generate_anchors(input_size:Tuple[int, int], batch_size:int = 1, scales:Optional[Tuple[Tensor, ...]] = None,
                     aspect_ratios:Optional[Tuple[Tensor, ...]] = None) -> List[Tensor]:
  """Generate RetinaNet anchor boxes for an image of size `input_size` (H, W).

  Anchors are built for 5 feature-map levels with strides 8..128 (2**3 .. 2**7).
  With the defaults there are 3 scales x 3 aspect ratios = 9 anchors per grid cell.

  Args:
    input_size: (height, width) of the network input.
    batch_size: number of images; when > 1 the anchors are replicated per image.
    scales: optional per-level scale tensors; defaults to base sizes 32..512 with
      octave steps 2**0, 2**(1/3), 2**(2/3).
    aspect_ratios: optional per-level aspect-ratio tuples; defaults to (0.5, 1.0, 2.0).

  Returns:
    If batch_size <= 1: one (H_l*W_l*A, 4) x1y1x2y2 tensor per feature-map level.
    If batch_size > 1: one concatenated (sum_l H_l*W_l*A, 4) tensor per image.
  """
  def _compute_grid_sizes(input_size:Tuple[int, int]) -> np.ndarray:
    # feature-map sizes for levels P3..P7, i.e. ceil(input / stride) for strides 8..128
    return np.ceil(np.array(input_size)[None, :] / 2 ** np.arange(3, 8)[:, None])

  if scales is None:
    # base anchor sizes 2**5 .. 2**9, each expanded into three intra-octave scales
    scales = tuple(Tensor((i, int(i * 2 ** (1/3)), int(i * 2 ** (2/3)))) for i in 2 ** np.arange(5, 10))
  if aspect_ratios is None:
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(scales)
  aspect_ratios = tuple(Tensor(ar) for ar in aspect_ratios)
  grid_sizes = _compute_grid_sizes(input_size)

  assert len(scales) == len(aspect_ratios) == len(grid_sizes), "scales, aspect_ratios, and grid_sizes must have the same length"

  anchors_over_all_feature_maps = []
  for s, ar, gs in zip(scales, aspect_ratios, grid_sizes):
    # widths/heights for every (aspect_ratio, scale) pair; sqrt keeps the anchor area
    # constant across aspect ratios (h/w == ar, h*w == s**2)
    h_ratios = ar.sqrt()
    w_ratios = 1 / h_ratios
    ws = (w_ratios[:, None] * s[None, :]).reshape(-1)
    hs = (h_ratios[:, None] * s[None, :]).reshape(-1)
    # zero-centered base anchors in x1y1x2y2 form
    base_anchors = (Tensor.stack(-ws, -hs, ws, hs, dim=1) / 2).round()
    # shift the base anchors to every cell of this level's grid
    stride_h, stride_w = input_size[0] // gs[0], input_size[1] // gs[1]
    shifts_x, shifts_y = (Tensor.arange(gs[1]) * stride_w).meshgrid(Tensor.arange(gs[0]) * stride_h, indexing="xy")
    shifts_x, shifts_y = shifts_x.reshape(-1), shifts_y.reshape(-1)
    shifts = Tensor.stack(shifts_x, shifts_y, shifts_x, shifts_y, dim=1)
    anchors_over_all_feature_maps.append((shifts[:, None] + base_anchors[None, :]).reshape(-1, 4))

  if batch_size > 1:
    # anchors are identical for every image in the batch; concatenate the per-level
    # anchors into one (total_anchors, 4) tensor per image
    return [Tensor.cat(*anchors_over_all_feature_maps) for _ in range(batch_size)]
  return anchors_over_all_feature_maps
np.array(s), np.array(ar) - h_ratios = np.sqrt(ar) - w_ratios = 1 / h_ratios - ws = (w_ratios[:, None] * s[None, :]).reshape(-1) - hs = (h_ratios[:, None] * s[None, :]).reshape(-1) - base_anchors = (np.stack([-ws, -hs, ws, hs], axis=1) / 2).round() - stride_h, stride_w = input_size[0] // gs[0], input_size[1] // gs[1] - shifts_x, shifts_y = np.meshgrid(np.arange(gs[1]) * stride_w, np.arange(gs[0]) * stride_h) - shifts_x = shifts_x.reshape(-1) - shifts_y = shifts_y.reshape(-1) - shifts = np.stack([shifts_x, shifts_y, shifts_x, shifts_y], axis=1, dtype=np.float32) - anchors.append((shifts[:, None] + base_anchors[None, :]).reshape(-1, 4)) - return anchors - class RetinaNet: def __init__(self, backbone: ResNet, num_classes=264, num_anchors=9, scales=None, aspect_ratios=None): assert isinstance(backbone, ResNet) @@ -48,7 +31,6 @@ class RetinaNet: self.backbone = ResNetFPN(backbone) self.head = RetinaHead(self.backbone.out_channels, num_anchors=num_anchors, num_classes=num_classes) - self.anchor_gen = lambda input_size: generate_anchors(input_size, self.backbone.compute_grid_sizes(input_size), scales, aspect_ratios) def __call__(self, x:Tensor, y:Optional[Tensor] = None, matches:Optional[Tensor] = None): return self.forward(x, y=y, matches=matches) @@ -73,7 +55,7 @@ class RetinaNet: # predictions: (BS, (H1W1+...+HmWm)A, 4 + K) def postprocess_detections(self, predictions, input_size=(800, 800), image_sizes=None, orig_image_sizes=None, score_thresh=0.05, topk_candidates=1000, nms_thresh=0.5): - anchors = self.anchor_gen(input_size) + anchors = generate_anchors(input_size) grid_sizes = self.backbone.compute_grid_sizes(input_size) split_idx = np.cumsum([int(self.num_anchors * sz[0] * sz[1]) for sz in grid_sizes[:-1]]) detections = [] @@ -86,6 +68,8 @@ class RetinaNet: image_boxes, image_scores, image_labels = [], [], [] for offsets_per_level, scores_per_level, anchors_per_level in zip(offsets_per_image, scores_per_image, anchors): + anchors_per_level = 
anchors_per_level.numpy() + # remove low scoring boxes scores_per_level = scores_per_level.flatten() keep_idxs = scores_per_level > score_thresh