diff --git a/examples/mlperf/dataloader.py b/examples/mlperf/dataloader.py index 990931f765..04a7ef5991 100644 --- a/examples/mlperf/dataloader.py +++ b/examples/mlperf/dataloader.py @@ -359,8 +359,13 @@ def batch_load_unet3d(preprocessed_dataset_dir:Path, batch_size:int=6, val:bool= def load_retinanet_data(base_dir:Path, val:bool, queue_in:Queue, queue_out:Queue, X:Tensor, Y_boxes:Tensor, Y_labels:Tensor, anchors:np.ndarray): from extra.datasets.openimages import image_load, prepare_target, random_horizontal_flip, resize from examples.mlperf.helpers import box_iou, find_matches + import torch while (data:=queue_in.get()) is not None: + np.random.seed(42) + random.seed(42) + torch.manual_seed(42) + idx, img, ann = data img_id = img["id"] img = image_load(base_dir, img["subset"], img["file_name"]) @@ -387,9 +392,9 @@ def batch_load_retinanet(dataset, val:bool, anchors:np.ndarray, base_dir:Path, b queue_in.put((idx, img, ann)) def _setup_shared_mem(shm_name:str, size:Tuple[int, ...], dtype:dtypes) -> Tuple[shared_memory.SharedMemory, Tensor]: - if os.path.exists(f"/dev/shm/{shm_name}"): os.unlink(f"/dev/shm/{shm_name}") + if os.path.exists(f"/Users/flata/Downloads/shm/{shm_name}"): os.unlink(f"/Users/flata/Downloads/shm/{shm_name}") shm = shared_memory.SharedMemory(name=shm_name, create=True, size=prod(size)) - shm_tensor = Tensor.empty(*size, dtype=dtype, device=f"disk:/dev/shm/{shm_name}") + shm_tensor = Tensor.empty(*size, dtype=dtype, device=f"disk:/Users/flata/Downloads/shm/{shm_name}") return shm, shm_tensor image_ids = sorted(dataset.imgs.keys()) diff --git a/test/external/external_test_datasets.py b/test/external/external_test_datasets.py index a33426b5e3..a7045640c8 100644 --- a/test/external/external_test_datasets.py +++ b/test/external/external_test_datasets.py @@ -1,10 +1,14 @@ from extra.datasets.kits19 import iterate, preprocess -from examples.mlperf.dataloader import batch_load_unet3d -from test.external.mlperf_retinanet.openimages import get_openimages +from extra.datasets.openimages import download_dataset +from examples.mlperf.dataloader import batch_load_unet3d, batch_load_retinanet +from test.external.mlperf_retinanet.openimages import get_openimages, postprocess_targets +from test.external.mlperf_retinanet.presets import DetectionPresetTrain, DetectionPresetEval +from test.external.mlperf_retinanet.transforms import GeneralizedRCNNTransform from test.external.mlperf_unet3d.kits19 import PytTrain, PytVal from tinygrad.helpers import temp from pathlib import Path from PIL import Image +from pycocotools.coco import COCO import json import nibabel as nib @@ -12,12 +16,14 @@ import numpy as np import os import random import tempfile +import torch import unittest class ExternalTestDatasets(unittest.TestCase): def _set_seed(self): np.random.seed(42) random.seed(42) + torch.manual_seed(42) class TestKiTS19Dataset(ExternalTestDatasets): def _create_samples(self, val, num_samples=2): @@ -54,7 +60,7 @@ class TestKiTS19Dataset(ExternalTestDatasets): if use_old_dataloader: dataset = iterate(list(Path(tempfile.gettempdir()).glob("case_*")), preprocessed_dir=preproc_pth, val=val, shuffle=shuffle, bs=batch_size) else: - dataset = iter(batch_load_unet3d(preproc_pth, batch_size=batch_size, val=val, shuffle=shuffle, seed=seed)) + dataset = batch_load_unet3d(preproc_pth, batch_size=batch_size, val=val, shuffle=shuffle, seed=seed) return iter(dataset) @@ -80,6 +86,8 @@ class TestKiTS19Dataset(ExternalTestDatasets): class TestOpenImagesDataset(ExternalTestDatasets): def _create_samples(self, 
subset): + self._set_seed() + os.makedirs(Path(base_dir:=tempfile.gettempdir() + "/openimages") / f"{subset}/data", exist_ok=True) os.makedirs(base_dir / Path(f"{subset}/labels"), exist_ok=True) @@ -102,16 +110,35 @@ class TestOpenImagesDataset(ExternalTestDatasets): return base_dir, ann_file - def _create_ref_dataloader(self, subset): + def _create_ref_dataloader(self, subset, batch_size=1): base_dir, ann_file = self._create_samples(subset) - print(f"{base_dir=} {ann_file=}") + transforms = DetectionPresetTrain("hflip") + dataset = get_openimages(ann_file.stem, base_dir, subset, transforms) + return iter(dataset) - def _create_tinygrad_dataloader(self): - pass + def _create_tinygrad_dataloader(self, subset, anchors, batch_size=1): + base_dir, ann_file = self._create_samples(subset) + dataset = COCO(ann_file) + dataloader = batch_load_retinanet(dataset, subset == "validation", anchors, Path(base_dir), batch_size=batch_size) + return iter(dataloader) def test_training_set(self): - self._create_ref_dataloader("train") - assert 1==0 + img_size, img_mean, img_std, anchors = (800, 800), [0.485, 0.456, 0.406], [0.229, 0.224, 0.225], torch.ones((120087, 4)) + tinygrad_dataloader, ref_dataloader = self._create_tinygrad_dataloader("train", anchors.numpy()), self._create_ref_dataloader("train") + transform = GeneralizedRCNNTransform(img_size, img_mean, img_std) + + for ((tinygrad_img, tinygrad_boxes, tinygrad_labels, _), (ref_img, ref_tgt)) in zip(tinygrad_dataloader, ref_dataloader): + self._set_seed() + ref_tgt = [ref_tgt] + print(f"{ref_img=} {ref_tgt=}") + + ref_img, ref_tgt = transform(ref_img, [ref_tgt]) + ref_tgt = postprocess_targets(ref_tgt, anchors.unsqueeze(0)) + ref_boxes, ref_labels = ref_tgt[0]["boxes"], ref_tgt[0]["labels"] + + np.testing.assert_equal(tinygrad_img.numpy(), ref_img.tensors.transpose(1, 3).numpy()) + # print(f"{tinygrad_img.shape=} {tinygrad_boxes.shape=} {tinygrad_labels.shape=}") + # print(f"{ref_boxes.shape=} {ref_labels.shape=} {ref_img.tensors.shape=}") if __name__ == '__main__': unittest.main() diff --git a/test/external/mlperf_retinanet/boxes.py b/test/external/mlperf_retinanet/boxes.py new file mode 100644 index 0000000000..fa4a0d66ba --- /dev/null +++ b/test/external/mlperf_retinanet/boxes.py @@ -0,0 +1,197 @@ +# https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/boxes.py + +import torch +from torch import Tensor +from typing import Tuple +import torchvision +from torchvision.extension import _assert_has_ops + + +def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor: + """ + Performs non-maximum suppression (NMS) on the boxes according + to their intersection-over-union (IoU). + + NMS iteratively removes lower scoring boxes which have an + IoU greater than iou_threshold with another (higher scoring) + box. + + If multiple boxes have the exact same score and satisfy the IoU + criterion with respect to a reference box, the selected box is + not guaranteed to be the same between CPU and GPU. This is similar + to the behavior of argsort in PyTorch when repeated values are present. + + Args: + boxes (Tensor[N, 4])): boxes to perform NMS on. They + are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and + ``0 <= y1 < y2``. 
+ scores (Tensor[N]): scores for each one of the boxes + iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold + + Returns: + Tensor: int64 tensor with the indices of the elements that have been kept + by NMS, sorted in decreasing order of scores + """ + _assert_has_ops() + return torch.ops.torchvision.nms(boxes, scores, iou_threshold) + + +def batched_nms( + boxes: Tensor, + scores: Tensor, + idxs: Tensor, + iou_threshold: float, +) -> Tensor: + """ + Performs non-maximum suppression in a batched fashion. + + Each index value correspond to a category, and NMS + will not be applied between elements of different categories. + + Args: + boxes (Tensor[N, 4]): boxes where NMS will be performed. They + are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and + ``0 <= y1 < y2``. + scores (Tensor[N]): scores for each one of the boxes + idxs (Tensor[N]): indices of the categories for each one of the boxes. + iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold + + Returns: + Tensor: int64 tensor with the indices of the elements that have been kept by NMS, sorted + in decreasing order of scores + """ + # Benchmarks that drove the following thresholds are at + # https://github.com/pytorch/vision/issues/1311#issuecomment-781329339 + # Ideally for GPU we'd use a higher threshold + if boxes.numel() > 4_000 and not torchvision._is_tracing(): + return _batched_nms_vanilla(boxes, scores, idxs, iou_threshold) + else: + return _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold) + + +@torch.jit._script_if_tracing +def _batched_nms_coordinate_trick( + boxes: Tensor, + scores: Tensor, + idxs: Tensor, + iou_threshold: float, +) -> Tensor: + # strategy: in order to perform NMS independently per class, + # we add an offset to all the boxes. The offset is dependent + # only on the class idx, and is large enough so that boxes + # from different classes do not overlap + if boxes.numel() == 0: + return torch.empty((0,), dtype=torch.int64, device=boxes.device) + max_coordinate = boxes.max() + offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes)) + boxes_for_nms = boxes + offsets[:, None] + keep = nms(boxes_for_nms, scores, iou_threshold) + return keep + + +@torch.jit._script_if_tracing +def _batched_nms_vanilla( + boxes: Tensor, + scores: Tensor, + idxs: Tensor, + iou_threshold: float, +) -> Tensor: + # Based on Detectron2 implementation, just manually call nms() on each class independently + keep_mask = torch.zeros_like(scores, dtype=torch.bool) + for class_id in torch.unique(idxs): + curr_indices = torch.where(idxs == class_id)[0] + curr_keep_indices = nms(boxes[curr_indices], scores[curr_indices], iou_threshold) + keep_mask[curr_indices[curr_keep_indices]] = True + keep_indices = torch.where(keep_mask)[0] + return keep_indices[scores[keep_indices].sort(descending=True)[1]] + + +def clip_boxes_to_image(boxes: Tensor, size: Tuple[int, int]) -> Tensor: + """ + Clip boxes so that they lie inside an image of size `size`. + + Args: + boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format + with ``0 <= x1 < x2`` and ``0 <= y1 < y2``. 
+ size (Tuple[height, width]): size of the image + + Returns: + Tensor[N, 4]: clipped boxes + """ + dim = boxes.dim() + boxes_x = boxes[..., 0::2] + boxes_y = boxes[..., 1::2] + height, width = size + + if torchvision._is_tracing(): + boxes_x = torch.max(boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device)) + boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device)) + boxes_y = torch.max(boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device)) + boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device)) + else: + boxes_x = boxes_x.clamp(min=0, max=width) + boxes_y = boxes_y.clamp(min=0, max=height) + + clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim) + return clipped_boxes.reshape(boxes.shape) + + +def _upcast(t: Tensor) -> Tensor: + # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type + if t.is_floating_point(): + return t if t.dtype in (torch.float32, torch.float64) else t.float() + else: + return t if t.dtype in (torch.int32, torch.int64) else t.int() + + +def box_area(boxes: Tensor) -> Tensor: + """ + Computes the area of a set of bounding boxes, which are specified by their + (x1, y1, x2, y2) coordinates. + + Args: + boxes (Tensor[N, 4]): boxes for which the area will be computed. They + are expected to be in (x1, y1, x2, y2) format with + ``0 <= x1 < x2`` and ``0 <= y1 < y2``. + + Returns: + Tensor[N]: the area for each box + """ + boxes = _upcast(boxes) + return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + +# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py +# with slight modifications +def _box_inter_union(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]: + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + wh = _upcast(rb - lt).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + return inter, union + + +def box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor: + """ + Return intersection-over-union (Jaccard index) between two sets of boxes. + + Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with + ``0 <= x1 < x2`` and ``0 <= y1 < y2``. + + Args: + boxes1 (Tensor[N, 4]): first set of boxes + boxes2 (Tensor[M, 4]): second set of boxes + + Returns: + Tensor[N, M]: the NxM matrix containing the pairwise IoU values for every element in boxes1 and boxes2 + """ + inter, union = _box_inter_union(boxes1, boxes2) + iou = inter / union + return iou \ No newline at end of file diff --git a/test/external/mlperf_retinanet/image_list.py b/test/external/mlperf_retinanet/image_list.py new file mode 100644 index 0000000000..72959615fd --- /dev/null +++ b/test/external/mlperf_retinanet/image_list.py @@ -0,0 +1,27 @@ +# https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/image_list.py + +import torch +from torch import Tensor +from typing import List, Tuple + + +class ImageList(object): + """ + Structure that holds a list of images (of possibly + varying sizes) as a single tensor. 
+ This works by padding the images to the same size, + and storing in a field the original sizes of each image + """ + + def __init__(self, tensors: Tensor, image_sizes: List[Tuple[int, int]]): + """ + Args: + tensors (tensor) + image_sizes (list[tuple[int, int]]) + """ + self.tensors = tensors + self.image_sizes = image_sizes + + def to(self, device: torch.device) -> 'ImageList': + cast_tensor = self.tensors.to(device) + return ImageList(cast_tensor, self.image_sizes) \ No newline at end of file diff --git a/test/external/mlperf_retinanet/openimages.py b/test/external/mlperf_retinanet/openimages.py new file mode 100644 index 0000000000..9642824368 --- /dev/null +++ b/test/external/mlperf_retinanet/openimages.py @@ -0,0 +1,258 @@ +# https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/coco_utils.py + +import copy +import os +from PIL import Image + +import random +import numpy as np +import torch +import torch.utils.data +import torchvision + +from pycocotools import mask as coco_mask +from pycocotools.coco import COCO + +from test.external.mlperf_retinanet import transforms as T +from test.external.mlperf_retinanet.boxes import box_iou +from test.external.mlperf_retinanet.utils import Matcher + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __init__(self, filter_iscrowd=True): + self.filter_iscrowd = filter_iscrowd + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + + if self.filter_iscrowd: + anno = [obj for obj in anno if obj['iscrowd'] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + target["image_id"] = image_id + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] for obj in anno]) + target["area"] = area + target["iscrowd"] = iscrowd + + return image, target + + +def _coco_remove_images_without_annotations(dataset, cat_list=None): + def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) + + def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + min_keypoints_per_image = 10 + + def 
_has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + # keypoints task have a slight different critera for considering + # if an annotation is valid + if "keypoints" not in anno[0]: + return True + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) >= min_keypoints_per_image: + return True + return False + + assert isinstance(dataset, torchvision.datasets.CocoDetection) + ids = [] + for ds_idx, img_id in enumerate(dataset.ids): + ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = dataset.coco.loadAnns(ann_ids) + if cat_list: + anno = [obj for obj in anno if obj["category_id"] in cat_list] + if _has_valid_annotation(anno): + ids.append(ds_idx) + + dataset = torch.utils.data.Subset(dataset, ids) + return dataset + + +def convert_to_coco_api(ds): + coco_ds = COCO() + # annotation IDs need to start at 1, not 0, see torchvision issue #1530 + ann_id = 1 + dataset = {'images': [], 'categories': [], 'annotations': []} + categories = set() + for img_idx in range(len(ds)): + # find better way to get target + # targets = ds.get_annotations(img_idx) + img, targets = ds[img_idx] + image_id = targets["image_id"].item() + img_dict = {} + img_dict['id'] = image_id + img_dict['height'] = img.shape[-2] + img_dict['width'] = img.shape[-1] + dataset['images'].append(img_dict) + bboxes = targets["boxes"] + bboxes[:, 2:] -= bboxes[:, :2] + bboxes = bboxes.tolist() + labels = targets['labels'].tolist() + areas = targets['area'].tolist() + iscrowd = targets['iscrowd'].tolist() + num_objs = len(bboxes) + for i in range(num_objs): + ann = {} + ann['image_id'] = image_id + ann['bbox'] = bboxes[i] + ann['category_id'] = labels[i] + categories.add(labels[i]) + ann['area'] = areas[i] + ann['iscrowd'] = iscrowd[i] + ann['id'] = ann_id + dataset['annotations'].append(ann) + ann_id += 1 + dataset['categories'] = [{'id': i} for i in sorted(categories)] + coco_ds.dataset = dataset + coco_ds.createIndex() + return coco_ds + + +def get_coco_api_from_dataset(dataset): + for _ in range(10): + if isinstance(dataset, torchvision.datasets.CocoDetection): + break + if isinstance(dataset, torch.utils.data.Subset): + dataset = dataset.dataset + if isinstance(dataset, torchvision.datasets.CocoDetection): + return dataset.coco + return convert_to_coco_api(dataset) + + +class CocoDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, transforms): + super(CocoDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + + def __getitem__(self, idx): + img, target = super(CocoDetection, self).__getitem__(idx) + image_id = self.ids[idx] + target = dict(image_id=image_id, annotations=target) + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target + + +def get_coco(name, root, image_set, transforms, mode='instances'): + anno_file_template = "{}_{}2017.json" + PATHS = { + "train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))), + "val": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))), + } + + t = [ConvertCocoPolysToMask(filter_iscrowd=True)] + + if transforms is not None: + t.append(transforms) + transforms = T.Compose(t) + + img_folder, ann_file = PATHS[image_set] + img_folder = 
os.path.join(root, img_folder) + ann_file = os.path.join(root, ann_file) + + dataset = CocoDetection(img_folder, ann_file, transforms=transforms) + + if image_set == "train": + dataset = _coco_remove_images_without_annotations(dataset) + + return dataset + + +def get_openimages(name, root, image_set, transforms): + PATHS = { + "train": os.path.join(root, "train"), + "val": os.path.join(root, "validation"), + } + + t = [ConvertCocoPolysToMask(filter_iscrowd=False)] + + if transforms is not None: + t.append(transforms) + transforms = T.Compose(t) + + img_folder = os.path.join(PATHS[image_set], "data") + ann_file = os.path.join(PATHS[image_set], "labels", f"{name}.json") + + dataset = CocoDetection(img_folder, ann_file, transforms=transforms) + + return dataset + +# https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/retinanet.py#L401 +# NOTE: this applies the following filtering in https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/retinanet.py#L117 +# and https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/retinanet.py#L203 to match with +# tinygrad's dataloader implementation +def postprocess_targets(targets, anchors): + proposal_matcher, matched_idxs = Matcher(0.5, 0.4, allow_low_quality_matches=True), [] + for anchors_per_image, targets_per_image in zip(anchors, targets): + if targets_per_image['boxes'].numel() == 0: + matched_idxs.append(torch.full((anchors_per_image.size(0),), -1, dtype=torch.int64, + device=anchors_per_image.device)) + continue + + match_quality_matrix = box_iou(targets_per_image['boxes'], anchors_per_image) + matched_idxs.append(proposal_matcher(match_quality_matrix)) + + for targets_per_image, matched_idxs_per_image in zip(targets, matched_idxs): + foreground_idxs_per_image = matched_idxs_per_image >= 0 + targets_per_image["boxes"] = targets_per_image["boxes"][matched_idxs_per_image[foreground_idxs_per_image]] + targets_per_image["labels"] = targets_per_image["labels"][matched_idxs_per_image[foreground_idxs_per_image]] + + return targets \ No newline at end of file diff --git a/test/external/mlperf_retinanet/presets.py b/test/external/mlperf_retinanet/presets.py new file mode 100644 index 0000000000..e697b96d54 --- /dev/null +++ b/test/external/mlperf_retinanet/presets.py @@ -0,0 +1,39 @@ +# https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/presets.py + +from test.external.mlperf_retinanet import transforms as T + + +class DetectionPresetTrain: + def __init__(self, data_augmentation, hflip_prob=0.5, mean=(123., 117., 104.)): + if data_augmentation == 'hflip': + self.transforms = T.Compose([ + T.RandomHorizontalFlip(p=hflip_prob), + T.ToTensor(), + ]) + elif data_augmentation == 'ssd': + self.transforms = T.Compose([ + T.RandomPhotometricDistort(), + T.RandomZoomOut(fill=list(mean)), + T.RandomIoUCrop(), + T.RandomHorizontalFlip(p=hflip_prob), + T.ToTensor(), + ]) + elif data_augmentation == 'ssdlite': + self.transforms = T.Compose([ + T.RandomIoUCrop(), + T.RandomHorizontalFlip(p=hflip_prob), + T.ToTensor(), + ]) + else: + raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"') + + def __call__(self, img, target): + return self.transforms(img, target) + + +class DetectionPresetEval: + def __init__(self): + self.transforms = T.ToTensor() + + def __call__(self, img, target): + return 
self.transforms(img, target) \ No newline at end of file diff --git a/test/external/mlperf_retinanet/roi_heads.py b/test/external/mlperf_retinanet/roi_heads.py new file mode 100644 index 0000000000..f5e63d868c --- /dev/null +++ b/test/external/mlperf_retinanet/roi_heads.py @@ -0,0 +1,83 @@ +# https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/roi_heads.py + +import torch +import torchvision + +import torch.nn.functional as F +from torch import nn, Tensor + +from torchvision.ops import boxes as box_ops +from torchvision.ops import roi_align + +from typing import Optional, List, Dict, Tuple + +from test.external.mlperf_retinanet.utils import BoxCoder, Matcher + + +def expand_boxes(boxes, scale): + # type: (Tensor, float) -> Tensor + w_half = (boxes[:, 2] - boxes[:, 0]) * .5 + h_half = (boxes[:, 3] - boxes[:, 1]) * .5 + x_c = (boxes[:, 2] + boxes[:, 0]) * .5 + y_c = (boxes[:, 3] + boxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + boxes_exp = torch.zeros_like(boxes) + boxes_exp[:, 0] = x_c - w_half + boxes_exp[:, 2] = x_c + w_half + boxes_exp[:, 1] = y_c - h_half + boxes_exp[:, 3] = y_c + h_half + return boxes_exp + + +def expand_masks(mask, padding): + # type: (Tensor, int) -> Tuple[Tensor, float] + M = mask.shape[-1] + scale = float(M + 2 * padding) / M + padded_mask = F.pad(mask, (padding,) * 4) + return padded_mask, scale + + +def paste_mask_in_image(mask, box, im_h, im_w): + # type: (Tensor, Tensor, int, int) -> Tensor + TO_REMOVE = 1 + w = int(box[2] - box[0] + TO_REMOVE) + h = int(box[3] - box[1] + TO_REMOVE) + w = max(w, 1) + h = max(h, 1) + + # Set shape to [batchxCxHxW] + mask = mask.expand((1, 1, -1, -1)) + + # Resize mask + mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False) + mask = mask[0][0] + + im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device) + x_0 = max(box[0], 0) + x_1 = min(box[2] + 1, im_w) + y_0 = max(box[1], 0) + y_1 = min(box[3] + 1, im_h) + + im_mask[y_0:y_1, x_0:x_1] = mask[ + (y_0 - box[1]):(y_1 - box[1]), (x_0 - box[0]):(x_1 - box[0]) + ] + return im_mask + + +def paste_masks_in_image(masks, boxes, img_shape, padding=1): + # type: (Tensor, Tensor, Tuple[int, int], int) -> Tensor + masks, scale = expand_masks(masks, padding=padding) + boxes = expand_boxes(boxes, scale).to(dtype=torch.int64) + im_h, im_w = img_shape + res = [ + paste_mask_in_image(m[0], b, im_h, im_w) + for m, b in zip(masks, boxes) + ] + if len(res) > 0: + ret = torch.stack(res, dim=0)[:, None] + else: + ret = masks.new_empty((0, 1, im_h, im_w)) + return ret diff --git a/test/external/mlperf_retinanet/transforms.py b/test/external/mlperf_retinanet/transforms.py new file mode 100644 index 0000000000..e50c08d1ee --- /dev/null +++ b/test/external/mlperf_retinanet/transforms.py @@ -0,0 +1,517 @@ +# https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/transforms.py + +import torch +import torchvision + +from torch import nn, Tensor +from torchvision.transforms import functional as F +from torchvision.transforms import transforms as T +from typing import List, Tuple, Dict, Optional + + +def _flip_coco_person_keypoints(kps, width): + flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] + flipped_data = kps[:, flip_inds] + flipped_data[..., 0] = width - flipped_data[..., 0] + # Maintain COCO convention that if visibility == 0, then x, y = 0 + inds = flipped_data[..., 2] == 0 + flipped_data[inds] = 0 + return 
flipped_data + + +################################################################################ +# TODO(ahmadki): remove this block, and replace get_image_size with F.get_image_size +# once https://github.com/pytorch/vision/pull/4321 is public + +from PIL import Image, ImageOps, ImageEnhance +Image.MAX_IMAGE_PIXELS = None +from typing import Any + +try: + import accimage +except ImportError: + accimage = None + + +@torch.jit.unused +def _is_pil_image(img: Any) -> bool: + if accimage is not None: + return isinstance(img, (Image.Image, accimage.Image)) + else: + return isinstance(img, Image.Image) + +def get_image_size_tensor(img: Tensor) -> List[int]: + # Returns (w, h) of tensor image + _assert_image_tensor(img) + return [img.shape[-1], img.shape[-2]] + +@torch.jit.unused +def get_image_size_pil(img: Any) -> List[int]: + if _is_pil_image(img): + return list(img.size) + raise TypeError("Unexpected type {}".format(type(img))) + +def get_image_size(img: Tensor) -> List[int]: + """Returns the size of an image as [width, height]. + Args: + img (PIL Image or Tensor): The image to be checked. + Returns: + List[int]: The image size. + """ + if isinstance(img, torch.Tensor): + return get_image_size_tensor(img) + + return get_image_size_pil(img) + +def get_image_num_channels_tensor(img: Tensor) -> int: + _assert_image_tensor(img) + if img.ndim == 2: + return 1 + elif img.ndim > 2: + return img.shape[-3] + + raise TypeError(f"Input ndim should be 2 or more. Got {img.ndim}") + +@torch.jit.unused +def get_image_num_channels_pil(img: Any) -> int: + if _is_pil_image(img): + return len(img.getbands()) + raise TypeError("Unexpected type {}".format(type(img))) + +def get_image_num_channels(img: Tensor) -> int: + if isinstance(img, torch.Tensor): + return get_image_num_channels_tensor(img) + + return get_image_num_channels_pil(img) +################################################################################ + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + +class RandomHorizontalFlip(T.RandomHorizontalFlip): + def forward(self, image: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if torch.rand(1) < self.p: + image = F.hflip(image) + if target is not None: + width, _ = get_image_size(image) + target["boxes"][:, [0, 2]] = width - target["boxes"][:, [2, 0]] + if "masks" in target: + target["masks"] = target["masks"].flip(-1) + if "keypoints" in target: + keypoints = target["keypoints"] + keypoints = _flip_coco_person_keypoints(keypoints, width) + target["keypoints"] = keypoints + return image, target + + +class ToTensor(nn.Module): + def forward(self, image: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + image = F.to_tensor(image) + return image, target + + +class RandomIoUCrop(nn.Module): + def __init__(self, min_scale: float = 0.3, max_scale: float = 1.0, min_aspect_ratio: float = 0.5, + max_aspect_ratio: float = 2.0, sampler_options: Optional[List[float]] = None, trials: int = 40): + super().__init__() + # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174 + self.min_scale = min_scale + self.max_scale = max_scale + self.min_aspect_ratio = min_aspect_ratio + self.max_aspect_ratio = max_aspect_ratio + if sampler_options is None: + sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 
0.9, 1.0] + self.options = sampler_options + self.trials = trials + + def forward(self, image: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if target is None: + raise ValueError("The targets can't be None for this transform.") + + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension())) + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + orig_w, orig_h = get_image_size(image) + + while True: + # sample an option + idx = int(torch.randint(low=0, high=len(self.options), size=(1,))) + min_jaccard_overlap = self.options[idx] + if min_jaccard_overlap >= 1.0: # a value larger than 1 encodes the leave as-is option + return image, target + + for _ in range(self.trials): + # check the aspect ratio limitations + r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2) + new_w = int(orig_w * r[0]) + new_h = int(orig_h * r[1]) + aspect_ratio = new_w / new_h + if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio): + continue + + # check for 0 area crops + r = torch.rand(2) + left = int((orig_w - new_w) * r[0]) + top = int((orig_h - new_h) * r[1]) + right = left + new_w + bottom = top + new_h + if left == right or top == bottom: + continue + + # check for any valid boxes with centers within the crop area + cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2]) + cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3]) + is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom) + if not is_within_crop_area.any(): + continue + + # check at least 1 box with jaccard limitations + boxes = target["boxes"][is_within_crop_area] + ious = torchvision.ops.boxes.box_iou(boxes, torch.tensor([[left, top, right, bottom]], + dtype=boxes.dtype, device=boxes.device)) + if ious.max() < min_jaccard_overlap: + continue + + # keep only valid boxes and perform cropping + target["boxes"] = boxes + target["labels"] = target["labels"][is_within_crop_area] + target["boxes"][:, 0::2] -= left + target["boxes"][:, 1::2] -= top + target["boxes"][:, 0::2].clamp_(min=0, max=new_w) + target["boxes"][:, 1::2].clamp_(min=0, max=new_h) + image = F.crop(image, top, left, new_h, new_w) + + return image, target + + +class RandomZoomOut(nn.Module): + def __init__(self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1., 4.), p: float = 0.5): + super().__init__() + if fill is None: + fill = [0., 0., 0.] + self.fill = fill + self.side_range = side_range + if side_range[0] < 1. or side_range[0] > side_range[1]: + raise ValueError("Invalid canvas side range provided {}.".format(side_range)) + self.p = p + + @torch.jit.unused + def _get_fill_value(self, is_pil): + # type: (bool) -> int + # We fake the type to make it work on JIT + return tuple(int(x) for x in self.fill) if is_pil else 0 + + def forward(self, image: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError('image should be 2/3 dimensional. 
Got {} dimensions.'.format(image.ndimension())) + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + if torch.rand(1) < self.p: + return image, target + + orig_w, orig_h = get_image_size(image) + + r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) + canvas_width = int(orig_w * r) + canvas_height = int(orig_h * r) + + r = torch.rand(2) + left = int((canvas_width - orig_w) * r[0]) + top = int((canvas_height - orig_h) * r[1]) + right = canvas_width - (left + orig_w) + bottom = canvas_height - (top + orig_h) + + if torch.jit.is_scripting(): + fill = 0 + else: + fill = self._get_fill_value(_is_pil_image(image)) + + image = F.pad(image, [left, top, right, bottom], fill=fill) + if isinstance(image, torch.Tensor): + v = torch.tensor(self.fill, device=image.device, dtype=image.dtype).view(-1, 1, 1) + image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h):, :] = \ + image[..., :, (left + orig_w):] = v + + if target is not None: + target["boxes"][:, 0::2] += left + target["boxes"][:, 1::2] += top + + return image, target + + +class RandomPhotometricDistort(nn.Module): + def __init__(self, contrast: Tuple[float] = (0.5, 1.5), saturation: Tuple[float] = (0.5, 1.5), + hue: Tuple[float] = (-0.05, 0.05), brightness: Tuple[float] = (0.875, 1.125), p: float = 0.5): + super().__init__() + self._brightness = T.ColorJitter(brightness=brightness) + self._contrast = T.ColorJitter(contrast=contrast) + self._hue = T.ColorJitter(hue=hue) + self._saturation = T.ColorJitter(saturation=saturation) + self.p = p + + def forward(self, image: Tensor, + target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if isinstance(image, torch.Tensor): + if image.ndimension() not in {2, 3}: + raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension())) + elif image.ndimension() == 2: + image = image.unsqueeze(0) + + r = torch.rand(7) + + if r[0] < self.p: + image = self._brightness(image) + + contrast_before = r[1] < 0.5 + if contrast_before: + if r[2] < self.p: + image = self._contrast(image) + + if r[3] < self.p: + image = self._saturation(image) + + if r[4] < self.p: + image = self._hue(image) + + if not contrast_before: + if r[5] < self.p: + image = self._contrast(image) + + if r[6] < self.p: + channels = get_image_num_channels(image) + permutation = torch.randperm(channels) + + is_pil = _is_pil_image(image) + if is_pil: + image = F.to_tensor(image) + image = image[..., permutation, :, :] + if is_pil: + image = F.to_pil_image(image) + + return image, target + +import math +import torch +import torchvision + +from torch import nn, Tensor +from typing import List, Tuple, Dict, Optional + +from test.external.mlperf_retinanet.image_list import ImageList +from test.external.mlperf_retinanet.roi_heads import paste_masks_in_image + + +@torch.jit.unused +def _get_shape_onnx(image: Tensor) -> Tensor: + from torch.onnx import operators + return operators.shape_as_tensor(image)[-2:] + + +@torch.jit.unused +def _fake_cast_onnx(v: Tensor) -> float: + # ONNX requires a tensor but here we fake its type for JIT. 
+ return v + + +def _resize_image_and_masks(image: Tensor, + target: Optional[Dict[str, Tensor]] = None, + image_size: Optional[Tuple[int, int]] = None, + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if torchvision._is_tracing(): + im_shape = _get_shape_onnx(image) + else: + im_shape = torch.tensor(image.shape[-2:]) + + image = torch.nn.functional.interpolate(image[None], size=image_size, scale_factor=None, mode='bilinear', + recompute_scale_factor=None, align_corners=False)[0] + + if target is None: + return image, target + + if "masks" in target: + mask = target["masks"] + mask = torch.nn.functional.interpolate(mask[:, None].float(), size=image_size, scale_factor=None, + recompute_scale_factor=None)[:, 0].byte() + target["masks"] = mask + return image, target + +# https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/transform.py + +class GeneralizedRCNNTransform(nn.Module): + """ + Performs input / target transformation before feeding the data to a GeneralizedRCNN + model. + + The transformations it perform are: + - input normalization (mean subtraction and std division) + - input / target resizing to match image_size + + It returns a ImageList for the inputs, and a List[Dict[Tensor]] for the targets + """ + + def __init__(self, image_size: Optional[Tuple[int, int]], + image_mean: List[float], image_std: List[float],): + super(GeneralizedRCNNTransform, self).__init__() + self.image_size = image_size + self.image_mean = image_mean + self.image_std = image_std + + def forward(self, + images: List[Tensor], + targets: Optional[List[Dict[str, Tensor]]] = None + ) -> Tuple[ImageList, Optional[List[Dict[str, Tensor]]]]: + images = [img for img in images] + if targets is not None: + # make a copy of targets to avoid modifying it in-place + # once torchscript supports dict comprehension + # this can be simplified as follows + # targets = [{k: v for k,v in t.items()} for t in targets] + targets_copy: List[Dict[str, Tensor]] = [] + for t in targets: + data: Dict[str, Tensor] = {} + for k, v in t.items(): + data[k] = v + targets_copy.append(data) + targets = targets_copy + for i in range(len(images)): + image = images[i] + target_index = targets[i] if targets is not None else None + + if image.dim() != 3: + raise ValueError("images is expected to be a list of 3d tensors " + "of shape [C, H, W], got {}".format(image.shape)) + image = self.normalize(image) + image, target_index = self.resize(image, target_index) + images[i] = image + if targets is not None and target_index is not None: + targets[i] = target_index + + image_sizes = [img.shape[-2:] for img in images] + images = torch.stack(images) + image_sizes_list: List[Tuple[int, int]] = [] + for image_size in image_sizes: + assert len(image_size) == 2 + image_sizes_list.append((image_size[0], image_size[1])) + + image_list = ImageList(images, image_sizes_list) + return image_list, targets + + def normalize(self, image: Tensor) -> Tensor: + if not image.is_floating_point(): + raise TypeError( + f"Expected input images to be of floating type (in range [0, 1]), " + f"but found type {image.dtype} instead" + ) + dtype, device = image.dtype, image.device + mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device) + std = torch.as_tensor(self.image_std, dtype=dtype, device=device) + return (image - mean[:, None, None]) / std[:, None, None] + + def torch_choice(self, k: List[int]) -> int: + """ + Implements `random.choice` via torch ops so it can be compiled with + TorchScript. 
Remove if https://github.com/pytorch/pytorch/issues/25803 + is fixed. + """ + index = int(torch.empty(1).uniform_(0., float(len(k))).item()) + return k[index] + + def resize(self, + image: Tensor, + target: Optional[Dict[str, Tensor]] = None, + ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + h, w = image.shape[-2:] + image, target = _resize_image_and_masks(image, target, self.image_size) + + if target is None: + return image, target + + bbox = target["boxes"] + bbox = resize_boxes(bbox, (h, w), image.shape[-2:]) + target["boxes"] = bbox + + if "keypoints" in target: + keypoints = target["keypoints"] + keypoints = resize_keypoints(keypoints, (h, w), image.shape[-2:]) + target["keypoints"] = keypoints + return image, target + + def postprocess(self, + result: List[Dict[str, Tensor]], + image_shapes: List[Tuple[int, int]], + original_image_sizes: List[Tuple[int, int]] + ) -> List[Dict[str, Tensor]]: + if self.training: + return result + for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)): + boxes = pred["boxes"] + boxes = resize_boxes(boxes, im_s, o_im_s) + result[i]["boxes"] = boxes + if "masks" in pred: + masks = pred["masks"] + masks = paste_masks_in_image(masks, boxes, o_im_s) + result[i]["masks"] = masks + if "keypoints" in pred: + keypoints = pred["keypoints"] + keypoints = resize_keypoints(keypoints, im_s, o_im_s) + result[i]["keypoints"] = keypoints + return result + + def __repr__(self) -> str: + format_string = self.__class__.__name__ + '(' + _indent = '\n ' + format_string += "{0}Normalize(mean={1}, std={2})".format(_indent, self.image_mean, self.image_std) + format_string += "{0}Resize(height={1}, width={2}, mode='bilinear')".format(_indent, self.image_size[0], + self.image_size[1]) + format_string += '\n)' + return format_string + + +def resize_keypoints(keypoints: Tensor, original_size: List[int], new_size: List[int]) -> Tensor: + ratios = [ + torch.tensor(s, dtype=torch.float32, device=keypoints.device) / + torch.tensor(s_orig, dtype=torch.float32, device=keypoints.device) + for s, s_orig in zip(new_size, original_size) + ] + ratio_h, ratio_w = ratios + resized_data = keypoints.clone() + if torch._C._get_tracing_state(): + resized_data_0 = resized_data[:, :, 0] * ratio_w + resized_data_1 = resized_data[:, :, 1] * ratio_h + resized_data = torch.stack((resized_data_0, resized_data_1, resized_data[:, :, 2]), dim=2) + else: + resized_data[..., 0] *= ratio_w + resized_data[..., 1] *= ratio_h + return resized_data + + +def resize_boxes(boxes: Tensor, original_size: List[int], new_size: List[int]) -> Tensor: + ratios = [ + torch.tensor(s, dtype=torch.float32, device=boxes.device) / + torch.tensor(s_orig, dtype=torch.float32, device=boxes.device) + for s, s_orig in zip(new_size, original_size) + ] + ratio_height, ratio_width = ratios + xmin, ymin, xmax, ymax = boxes.unbind(1) + + xmin = xmin * ratio_width + xmax = xmax * ratio_width + ymin = ymin * ratio_height + ymax = ymax * ratio_height + return torch.stack((xmin, ymin, xmax, ymax), dim=1) \ No newline at end of file diff --git a/test/external/mlperf_retinanet/utils.py b/test/external/mlperf_retinanet/utils.py new file mode 100644 index 0000000000..693a8d805c --- /dev/null +++ b/test/external/mlperf_retinanet/utils.py @@ -0,0 +1,409 @@ +# https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/utils.py + +import math +import torch + +from collections import OrderedDict +from torch import Tensor, nn +from typing import List, Tuple, 
Dict + +from torchvision.ops.misc import FrozenBatchNorm2d + + +class IntermediateLayerGetter(nn.ModuleDict): + """ + Module wrapper that returns intermediate layers from a model + + It has a strong assumption that the modules have been registered + into the model in the same order as they are used. + This means that one should **not** reuse the same nn.Module + twice in the forward if you want this to work. + + Additionally, it is only able to query submodules that are directly + assigned to the model. So if `model` is passed, `model.feature1` can + be returned, but not `model.feature1.layer2`. + + Args: + model (nn.Module): model on which we will extract the features + return_layers (Dict[name, new_name]): a dict containing the names + of the modules for which the activations will be returned as + the key of the dict, and the value of the dict is the name + of the returned activation (which the user can specify). + + Examples:: + + >>> m = torchvision.models.resnet18(pretrained=True) + >>> # extract layer1 and layer3, giving as names `feat1` and feat2` + >>> new_m = torchvision.models._utils.IntermediateLayerGetter(m, + >>> {'layer1': 'feat1', 'layer3': 'feat2'}) + >>> out = new_m(torch.rand(1, 3, 224, 224)) + >>> print([(k, v.shape) for k, v in out.items()]) + >>> [('feat1', torch.Size([1, 64, 56, 56])), + >>> ('feat2', torch.Size([1, 256, 14, 14]))] + """ + _version = 2 + __annotations__ = { + "return_layers": Dict[str, str], + } + + def __init__(self, model: nn.Module, return_layers: Dict[str, str]) -> None: + if not set(return_layers).issubset([name for name, _ in model.named_children()]): + raise ValueError("return_layers are not present in model") + orig_return_layers = return_layers + return_layers = {str(k): str(v) for k, v in return_layers.items()} + layers = OrderedDict() + for name, module in model.named_children(): + layers[name] = module + if name in return_layers: + del return_layers[name] + if not return_layers: + break + + super(IntermediateLayerGetter, self).__init__(layers) + self.return_layers = orig_return_layers + + def forward(self, x): + out = OrderedDict() + for name, module in self.items(): + x = module(x) + if name in self.return_layers: + out_name = self.return_layers[name] + out[out_name] = x + return out + + +@torch.jit._script_if_tracing +def encode_boxes(reference_boxes, proposals, weights): + # type: (torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor + """ + Encode a set of proposals with respect to some + reference boxes + + Args: + reference_boxes (Tensor): reference boxes + proposals (Tensor): boxes to be encoded + weights (Tensor[4]): the weights for ``(x, y, w, h)`` + """ + + # perform some unpacking to make it JIT-fusion friendly + wx = weights[0] + wy = weights[1] + ww = weights[2] + wh = weights[3] + + proposals_x1 = proposals[:, 0].unsqueeze(1) + proposals_y1 = proposals[:, 1].unsqueeze(1) + proposals_x2 = proposals[:, 2].unsqueeze(1) + proposals_y2 = proposals[:, 3].unsqueeze(1) + + reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1) + reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1) + reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1) + reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1) + + # implementation starts here + ex_widths = proposals_x2 - proposals_x1 + ex_heights = proposals_y2 - proposals_y1 + ex_ctr_x = proposals_x1 + 0.5 * ex_widths + ex_ctr_y = proposals_y1 + 0.5 * ex_heights + + gt_widths = reference_boxes_x2 - reference_boxes_x1 + gt_heights = reference_boxes_y2 - reference_boxes_y1 + gt_ctr_x = 
reference_boxes_x1 + 0.5 * gt_widths + gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights + + targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = ww * torch.log(gt_widths / ex_widths) + targets_dh = wh * torch.log(gt_heights / ex_heights) + + targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) + return targets + + +class BoxCoder(object): + """ + This class encodes and decodes a set of bounding boxes into + the representation used for training the regressors. + """ + + def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)): + # type: (Tuple[float, float, float, float], float) -> None + """ + Args: + weights (4-element tuple) + bbox_xform_clip (float) + """ + self.weights = weights + self.bbox_xform_clip = bbox_xform_clip + + def encode(self, reference_boxes, proposals): + # type: (List[Tensor], List[Tensor]) -> List[Tensor] + boxes_per_image = [len(b) for b in reference_boxes] + reference_boxes = torch.cat(reference_boxes, dim=0) + proposals = torch.cat(proposals, dim=0) + targets = self.encode_single(reference_boxes, proposals) + return targets.split(boxes_per_image, 0) + + def encode_single(self, reference_boxes, proposals): + """ + Encode a set of proposals with respect to some + reference boxes + + Args: + reference_boxes (Tensor): reference boxes + proposals (Tensor): boxes to be encoded + """ + dtype = reference_boxes.dtype + device = reference_boxes.device + weights = torch.as_tensor(self.weights, dtype=dtype, device=device) + targets = encode_boxes(reference_boxes, proposals, weights) + + return targets + + def decode(self, rel_codes, boxes): + # type: (Tensor, List[Tensor]) -> Tensor + assert isinstance(boxes, (list, tuple)) + assert isinstance(rel_codes, torch.Tensor) + boxes_per_image = [b.size(0) for b in boxes] + concat_boxes = torch.cat(boxes, dim=0) + box_sum = 0 + for val in boxes_per_image: + box_sum += val + if box_sum > 0: + rel_codes = rel_codes.reshape(box_sum, -1) + pred_boxes = self.decode_single( + rel_codes, concat_boxes + ) + if box_sum > 0: + pred_boxes = pred_boxes.reshape(box_sum, -1, 4) + return pred_boxes + + def decode_single(self, rel_codes, boxes): + """ + From a set of original boxes and encoded relative box offsets, + get the decoded boxes. + + Args: + rel_codes (Tensor): encoded boxes + boxes (Tensor): reference boxes. + """ + + boxes = boxes.to(rel_codes.dtype) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = self.weights + dx = rel_codes[:, 0::4] / wx + dy = rel_codes[:, 1::4] / wy + dw = rel_codes[:, 2::4] / ww + dh = rel_codes[:, 3::4] / wh + + # Prevent sending too large values into torch.exp() + dw = torch.clamp(dw, max=self.bbox_xform_clip) + dh = torch.clamp(dh, max=self.bbox_xform_clip) + + pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] + pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] + pred_w = torch.exp(dw) * widths[:, None] + pred_h = torch.exp(dh) * heights[:, None] + + # Distance from center to box's corner. 
+ c_to_c_h = torch.tensor(0.5, dtype=pred_ctr_y.dtype, device=pred_h.device) * pred_h + c_to_c_w = torch.tensor(0.5, dtype=pred_ctr_x.dtype, device=pred_w.device) * pred_w + + pred_boxes1 = pred_ctr_x - c_to_c_w + pred_boxes2 = pred_ctr_y - c_to_c_h + pred_boxes3 = pred_ctr_x + c_to_c_w + pred_boxes4 = pred_ctr_y + c_to_c_h + pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4), dim=2).flatten(1) + return pred_boxes + + +class Matcher(object): + """ + This class assigns to each predicted "element" (e.g., a box) a ground-truth + element. Each predicted element will have exactly zero or one matches; each + ground-truth element may be assigned to zero or more predicted elements. + + Matching is based on the MxN match_quality_matrix, that characterizes how well + each (ground-truth, predicted)-pair match. For example, if the elements are + boxes, the matrix may contain box IoU overlap values. + + The matcher returns a tensor of size N containing the index of the ground-truth + element m that matches to prediction n. If there is no match, a negative value + is returned. + """ + + BELOW_LOW_THRESHOLD = -1 + BETWEEN_THRESHOLDS = -2 + + __annotations__ = { + 'BELOW_LOW_THRESHOLD': int, + 'BETWEEN_THRESHOLDS': int, + } + + def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False): + # type: (float, float, bool) -> None + """ + Args: + high_threshold (float): quality values greater than or equal to + this value are candidate matches. + low_threshold (float): a lower quality threshold used to stratify + matches into three levels: + 1) matches >= high_threshold + 2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold) + 3) BELOW_LOW_THRESHOLD matches in [0, low_threshold) + allow_low_quality_matches (bool): if True, produce additional matches + for predictions that have only low-quality match candidates. See + set_low_quality_matches_ for more details. + """ + self.BELOW_LOW_THRESHOLD = -1 + self.BETWEEN_THRESHOLDS = -2 + assert low_threshold <= high_threshold + self.high_threshold = high_threshold + self.low_threshold = low_threshold + self.allow_low_quality_matches = allow_low_quality_matches + + def __call__(self, match_quality_matrix): + """ + Args: + match_quality_matrix (Tensor[float]): an MxN tensor, containing the + pairwise quality between M ground-truth elements and N predicted elements. + + Returns: + matches (Tensor[int64]): an N tensor where N[i] is a matched gt in + [0, M - 1] or a negative value indicating that prediction i could not + be matched. 
+ """ + if match_quality_matrix.numel() == 0: + # empty targets or proposals not supported during training + if match_quality_matrix.shape[0] == 0: + raise ValueError( + "No ground-truth boxes available for one of the images " + "during training") + else: + raise ValueError( + "No proposal boxes available for one of the images " + "during training") + + # match_quality_matrix is M (gt) x N (predicted) + # Max over gt elements (dim 0) to find best gt candidate for each prediction + matched_vals, matches = match_quality_matrix.max(dim=0) + if self.allow_low_quality_matches: + all_matches = matches.clone() + else: + all_matches = None + + # Assign candidate matches with low quality to negative (unassigned) values + below_low_threshold = matched_vals < self.low_threshold + between_thresholds = (matched_vals >= self.low_threshold) & ( + matched_vals < self.high_threshold + ) + matches[below_low_threshold] = self.BELOW_LOW_THRESHOLD + matches[between_thresholds] = self.BETWEEN_THRESHOLDS + + if self.allow_low_quality_matches: + assert all_matches is not None + self.set_low_quality_matches_(matches, all_matches, match_quality_matrix) + + return matches + + def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix): + """ + Produce additional matches for predictions that have only low-quality matches. + Specifically, for each ground-truth find the set of predictions that have + maximum overlap with it (including ties); for each prediction in that set, if + it is unmatched, then match it to the ground-truth with which it has the highest + quality value. + """ + # For each gt, find the prediction with which it has highest quality + highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) + # Find highest quality match available, even if it is low, including ties + gt_pred_pairs_of_highest_quality = torch.where( + match_quality_matrix == highest_quality_foreach_gt[:, None] + ) + # Example gt_pred_pairs_of_highest_quality: + # tensor([[ 0, 39796], + # [ 1, 32055], + # [ 1, 32070], + # [ 2, 39190], + # [ 2, 40255], + # [ 3, 40390], + # [ 3, 41455], + # [ 4, 45470], + # [ 5, 45325], + # [ 5, 46390]]) + # Each row is a (gt index, prediction index) + # Note how gt items 1, 2, 3, and 5 each have two ties + + pred_inds_to_update = gt_pred_pairs_of_highest_quality[1] + matches[pred_inds_to_update] = all_matches[pred_inds_to_update] + + +class SSDMatcher(Matcher): + + def __init__(self, threshold): + super().__init__(threshold, threshold, allow_low_quality_matches=False) + + def __call__(self, match_quality_matrix): + matches = super().__call__(match_quality_matrix) + + # For each gt, find the prediction with which it has the highest quality + _, highest_quality_pred_foreach_gt = match_quality_matrix.max(dim=1) + matches[highest_quality_pred_foreach_gt] = torch.arange(highest_quality_pred_foreach_gt.size(0), + dtype=torch.int64, + device=highest_quality_pred_foreach_gt.device) + + return matches + + +def overwrite_eps(model, eps): + """ + This method overwrites the default eps values of all the + FrozenBatchNorm2d layers of the model with the provided value. + This is necessary to address the BC-breaking change introduced + by the bug-fix at pytorch/vision#2933. The overwrite is applied + only when the pretrained weights are loaded to maintain compatibility + with previous versions. + + Args: + model (nn.Module): The model on which we perform the overwrite. + eps (float): The new value of eps. 
+ """ + for module in model.modules(): + if isinstance(module, FrozenBatchNorm2d): + module.eps = eps + + +def retrieve_out_channels(model, size): + """ + This method retrieves the number of output channels of a specific model. + + Args: + model (nn.Module): The model for which we estimate the out_channels. + It should return a single Tensor or an OrderedDict[Tensor]. + size (Tuple[int, int]): The size (wxh) of the input. + + Returns: + out_channels (List[int]): A list of the output channels of the model. + """ + in_training = model.training + model.eval() + + with torch.no_grad(): + # Use dummy data to retrieve the feature map sizes to avoid hard-coding their values + device = next(model.parameters()).device + tmp_img = torch.zeros((1, 3, size[1], size[0]), device=device) + features = model(tmp_img) + if isinstance(features, torch.Tensor): + features = OrderedDict([('0', features)]) + out_channels = [x.size(1) for x in features.values()] + + if in_training: + model.train() + + return out_channels \ No newline at end of file