From ec146da5cf54f9417016e57b40add9e27d679aee Mon Sep 17 00:00:00 2001
From: Francis Lata
Date: Tue, 22 Oct 2024 02:48:11 -0700
Subject: [PATCH] trim dataloader-related code not needed from ref

---
 test/external/mlperf_retinanet/openimages.py | 139 +-----------
 test/external/mlperf_retinanet/presets.py    |  14 --
 test/external/mlperf_retinanet/transforms.py | 215 -------------------
 3 files changed, 2 insertions(+), 366 deletions(-)

diff --git a/test/external/mlperf_retinanet/openimages.py b/test/external/mlperf_retinanet/openimages.py
index 9642824368..f2b2df36ca 100644
--- a/test/external/mlperf_retinanet/openimages.py
+++ b/test/external/mlperf_retinanet/openimages.py
@@ -18,23 +18,6 @@ from test.external.mlperf_retinanet.boxes import box_iou
 from test.external.mlperf_retinanet.utils import Matcher
 
 
-def convert_coco_poly_to_mask(segmentations, height, width):
-    masks = []
-    for polygons in segmentations:
-        rles = coco_mask.frPyObjects(polygons, height, width)
-        mask = coco_mask.decode(rles)
-        if len(mask.shape) < 3:
-            mask = mask[..., None]
-        mask = torch.as_tensor(mask, dtype=torch.uint8)
-        mask = mask.any(dim=2)
-        masks.append(mask)
-    if masks:
-        masks = torch.stack(masks, dim=0)
-    else:
-        masks = torch.zeros((0, height, width), dtype=torch.uint8)
-    return masks
-
-
 class ConvertCocoPolysToMask(object):
     def __init__(self, filter_iscrowd=True):
         self.filter_iscrowd = filter_iscrowd
@@ -86,97 +69,6 @@ class ConvertCocoPolysToMask(object):
         return image, target
 
 
-def _coco_remove_images_without_annotations(dataset, cat_list=None):
-    def _has_only_empty_bbox(anno):
-        return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
-
-    def _count_visible_keypoints(anno):
-        return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)
-
-    min_keypoints_per_image = 10
-
-    def _has_valid_annotation(anno):
-        # if it's empty, there is no annotation
-        if len(anno) == 0:
-            return False
-        # if all boxes have close to zero area, there is no annotation
-        if _has_only_empty_bbox(anno):
-            return False
-        # keypoints task have a slight different critera for considering
-        # if an annotation is valid
-        if "keypoints" not in anno[0]:
-            return True
-        # for keypoint detection tasks, only consider valid images those
-        # containing at least min_keypoints_per_image
-        if _count_visible_keypoints(anno) >= min_keypoints_per_image:
-            return True
-        return False
-
-    assert isinstance(dataset, torchvision.datasets.CocoDetection)
-    ids = []
-    for ds_idx, img_id in enumerate(dataset.ids):
-        ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None)
-        anno = dataset.coco.loadAnns(ann_ids)
-        if cat_list:
-            anno = [obj for obj in anno if obj["category_id"] in cat_list]
-        if _has_valid_annotation(anno):
-            ids.append(ds_idx)
-
-    dataset = torch.utils.data.Subset(dataset, ids)
-    return dataset
-
-
-def convert_to_coco_api(ds):
-    coco_ds = COCO()
-    # annotation IDs need to start at 1, not 0, see torchvision issue #1530
-    ann_id = 1
-    dataset = {'images': [], 'categories': [], 'annotations': []}
-    categories = set()
-    for img_idx in range(len(ds)):
-        # find better way to get target
-        # targets = ds.get_annotations(img_idx)
-        img, targets = ds[img_idx]
-        image_id = targets["image_id"].item()
-        img_dict = {}
-        img_dict['id'] = image_id
-        img_dict['height'] = img.shape[-2]
-        img_dict['width'] = img.shape[-1]
-        dataset['images'].append(img_dict)
-        bboxes = targets["boxes"]
-        bboxes[:, 2:] -= bboxes[:, :2]
-        bboxes = bboxes.tolist()
-        labels = targets['labels'].tolist()
-        areas = targets['area'].tolist()
-        iscrowd = targets['iscrowd'].tolist()
-        num_objs = len(bboxes)
-        for i in range(num_objs):
-            ann = {}
-            ann['image_id'] = image_id
-            ann['bbox'] = bboxes[i]
-            ann['category_id'] = labels[i]
-            categories.add(labels[i])
-            ann['area'] = areas[i]
-            ann['iscrowd'] = iscrowd[i]
-            ann['id'] = ann_id
-            dataset['annotations'].append(ann)
-            ann_id += 1
-    dataset['categories'] = [{'id': i} for i in sorted(categories)]
-    coco_ds.dataset = dataset
-    coco_ds.createIndex()
-    return coco_ds
-
-
-def get_coco_api_from_dataset(dataset):
-    for _ in range(10):
-        if isinstance(dataset, torchvision.datasets.CocoDetection):
-            break
-        if isinstance(dataset, torch.utils.data.Subset):
-            dataset = dataset.dataset
-    if isinstance(dataset, torchvision.datasets.CocoDetection):
-        return dataset.coco
-    return convert_to_coco_api(dataset)
-
-
 class CocoDetection(torchvision.datasets.CocoDetection):
     def __init__(self, img_folder, ann_file, transforms):
         super(CocoDetection, self).__init__(img_folder, ann_file)
@@ -191,31 +83,6 @@ class CocoDetection(torchvision.datasets.CocoDetection):
         return img, target
 
 
-def get_coco(name, root, image_set, transforms, mode='instances'):
-    anno_file_template = "{}_{}2017.json"
-    PATHS = {
-        "train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))),
-        "val": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))),
-    }
-
-    t = [ConvertCocoPolysToMask(filter_iscrowd=True)]
-
-    if transforms is not None:
-        t.append(transforms)
-    transforms = T.Compose(t)
-
-    img_folder, ann_file = PATHS[image_set]
-    img_folder = os.path.join(root, img_folder)
-    ann_file = os.path.join(root, ann_file)
-
-    dataset = CocoDetection(img_folder, ann_file, transforms=transforms)
-
-    if image_set == "train":
-        dataset = _coco_remove_images_without_annotations(dataset)
-
-    return dataset
-
-
 def get_openimages(name, root, image_set, transforms):
     PATHS = {
         "train": os.path.join(root, "train"),
@@ -235,10 +102,8 @@ def get_openimages(name, root, image_set, transforms):
     return dataset
 
 
-# https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/retinanet.py#L401
-# NOTE: this applies the following filtering in https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/retinanet.py#L117
-# and https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/retinanet.py#L203 to match with
-# tinygrad's dataloader implementation
+# This applies the filtering in https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/retinanet.py#L117
+# and https://github.com/mlcommons/training/blob/cdd928d4596c142c15a7d86b2eeadbac718c8da2/single_stage_detector/ssd/model/retinanet.py#L203 to match with tinygrad's dataloader implementation.
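#
# A minimal, hypothetical usage sketch of the function below; the toy boxes and
# anchors are assumptions for illustration only, while postprocess_targets,
# Matcher, and box_iou come from this file:
#
#   import torch
#   targets = [{"boxes": torch.tensor([[10., 10., 50., 50.]]),  # one GT box, xyxy
#               "labels": torch.tensor([1])}]
#   anchors = [torch.tensor([[8., 8., 52., 52.],   # IoU with GT > 0.5 -> foreground match
#                            [0., 0., 4., 4.]])]   # IoU with GT < 0.4 -> background
#   out = postprocess_targets(targets, anchors)    # matches targets to anchors per image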
 def postprocess_targets(targets, anchors):
     proposal_matcher, matched_idxs = Matcher(0.5, 0.4, allow_low_quality_matches=True), []
     for anchors_per_image, targets_per_image in zip(anchors, targets):
diff --git a/test/external/mlperf_retinanet/presets.py b/test/external/mlperf_retinanet/presets.py
index e697b96d54..9a3aa09661 100644
--- a/test/external/mlperf_retinanet/presets.py
+++ b/test/external/mlperf_retinanet/presets.py
@@ -10,20 +10,6 @@ class DetectionPresetTrain:
                 T.RandomHorizontalFlip(p=hflip_prob),
                 T.ToTensor(),
             ])
-        elif data_augmentation == 'ssd':
-            self.transforms = T.Compose([
-                T.RandomPhotometricDistort(),
-                T.RandomZoomOut(fill=list(mean)),
-                T.RandomIoUCrop(),
-                T.RandomHorizontalFlip(p=hflip_prob),
-                T.ToTensor(),
-            ])
-        elif data_augmentation == 'ssdlite':
-            self.transforms = T.Compose([
-                T.RandomIoUCrop(),
-                T.RandomHorizontalFlip(p=hflip_prob),
-                T.ToTensor(),
-            ])
         else:
             raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"')
 
diff --git a/test/external/mlperf_retinanet/transforms.py b/test/external/mlperf_retinanet/transforms.py
index e50c08d1ee..e63c26d87d 100644
--- a/test/external/mlperf_retinanet/transforms.py
+++ b/test/external/mlperf_retinanet/transforms.py
@@ -63,28 +63,6 @@ def get_image_size(img: Tensor) -> List[int]:
     return get_image_size_pil(img)
 
 
-def get_image_num_channels_tensor(img: Tensor) -> int:
-    _assert_image_tensor(img)
-    if img.ndim == 2:
-        return 1
-    elif img.ndim > 2:
-        return img.shape[-3]
-
-    raise TypeError(f"Input ndim should be 2 or more. Got {img.ndim}")
-
-@torch.jit.unused
-def get_image_num_channels_pil(img: Any) -> int:
-    if _is_pil_image(img):
-        return len(img.getbands())
-    raise TypeError("Unexpected type {}".format(type(img)))
-
-def get_image_num_channels(img: Tensor) -> int:
-    if isinstance(img, torch.Tensor):
-        return get_image_num_channels_tensor(img)
-
-    return get_image_num_channels_pil(img)
-################################################################################
-
 class Compose(object):
     def __init__(self, transforms):
         self.transforms = transforms
@@ -119,193 +97,6 @@ class ToTensor(nn.Module):
         return image, target
 
 
-class RandomIoUCrop(nn.Module):
-    def __init__(self, min_scale: float = 0.3, max_scale: float = 1.0, min_aspect_ratio: float = 0.5,
-                 max_aspect_ratio: float = 2.0, sampler_options: Optional[List[float]] = None, trials: int = 40):
-        super().__init__()
-        # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174
-        self.min_scale = min_scale
-        self.max_scale = max_scale
-        self.min_aspect_ratio = min_aspect_ratio
-        self.max_aspect_ratio = max_aspect_ratio
-        if sampler_options is None:
-            sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
-        self.options = sampler_options
-        self.trials = trials
-
-    def forward(self, image: Tensor,
-                target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
-        if target is None:
-            raise ValueError("The targets can't be None for this transform.")
-
-        if isinstance(image, torch.Tensor):
-            if image.ndimension() not in {2, 3}:
-                raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension()))
-            elif image.ndimension() == 2:
-                image = image.unsqueeze(0)
-
-        orig_w, orig_h = get_image_size(image)
-
-        while True:
-            # sample an option
-            idx = int(torch.randint(low=0, high=len(self.options), size=(1,)))
-            min_jaccard_overlap = self.options[idx]
-            if min_jaccard_overlap >= 1.0:  # a value larger than 1 encodes the leave as-is option
-                return image, target
-
-            for _ in range(self.trials):
-                # check the aspect ratio limitations
-                r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2)
-                new_w = int(orig_w * r[0])
-                new_h = int(orig_h * r[1])
-                aspect_ratio = new_w / new_h
-                if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio):
-                    continue
-
-                # check for 0 area crops
-                r = torch.rand(2)
-                left = int((orig_w - new_w) * r[0])
-                top = int((orig_h - new_h) * r[1])
-                right = left + new_w
-                bottom = top + new_h
-                if left == right or top == bottom:
-                    continue
-
-                # check for any valid boxes with centers within the crop area
-                cx = 0.5 * (target["boxes"][:, 0] + target["boxes"][:, 2])
-                cy = 0.5 * (target["boxes"][:, 1] + target["boxes"][:, 3])
-                is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom)
-                if not is_within_crop_area.any():
-                    continue
-
-                # check at least 1 box with jaccard limitations
-                boxes = target["boxes"][is_within_crop_area]
-                ious = torchvision.ops.boxes.box_iou(boxes, torch.tensor([[left, top, right, bottom]],
-                                                                         dtype=boxes.dtype, device=boxes.device))
-                if ious.max() < min_jaccard_overlap:
-                    continue
-
-                # keep only valid boxes and perform cropping
-                target["boxes"] = boxes
-                target["labels"] = target["labels"][is_within_crop_area]
-                target["boxes"][:, 0::2] -= left
-                target["boxes"][:, 1::2] -= top
-                target["boxes"][:, 0::2].clamp_(min=0, max=new_w)
-                target["boxes"][:, 1::2].clamp_(min=0, max=new_h)
-                image = F.crop(image, top, left, new_h, new_w)
-
-                return image, target
-
-
-class RandomZoomOut(nn.Module):
-    def __init__(self, fill: Optional[List[float]] = None, side_range: Tuple[float, float] = (1., 4.), p: float = 0.5):
-        super().__init__()
-        if fill is None:
-            fill = [0., 0., 0.]
-        self.fill = fill
-        self.side_range = side_range
-        if side_range[0] < 1. or side_range[0] > side_range[1]:
-            raise ValueError("Invalid canvas side range provided {}.".format(side_range))
-        self.p = p
-
-    @torch.jit.unused
-    def _get_fill_value(self, is_pil):
-        # type: (bool) -> int
-        # We fake the type to make it work on JIT
-        return tuple(int(x) for x in self.fill) if is_pil else 0
-
-    def forward(self, image: Tensor,
-                target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
-        if isinstance(image, torch.Tensor):
-            if image.ndimension() not in {2, 3}:
-                raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension()))
-            elif image.ndimension() == 2:
-                image = image.unsqueeze(0)
-
-        if torch.rand(1) < self.p:
-            return image, target
-
-        orig_w, orig_h = get_image_size(image)
-
-        r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0])
-        canvas_width = int(orig_w * r)
-        canvas_height = int(orig_h * r)
-
-        r = torch.rand(2)
-        left = int((canvas_width - orig_w) * r[0])
-        top = int((canvas_height - orig_h) * r[1])
-        right = canvas_width - (left + orig_w)
-        bottom = canvas_height - (top + orig_h)
-
-        if torch.jit.is_scripting():
-            fill = 0
-        else:
-            fill = self._get_fill_value(_is_pil_image(image))
-
-        image = F.pad(image, [left, top, right, bottom], fill=fill)
-        if isinstance(image, torch.Tensor):
-            v = torch.tensor(self.fill, device=image.device, dtype=image.dtype).view(-1, 1, 1)
-            image[..., :top, :] = image[..., :, :left] = image[..., (top + orig_h):, :] = \
-                image[..., :, (left + orig_w):] = v
-
-        if target is not None:
-            target["boxes"][:, 0::2] += left
-            target["boxes"][:, 1::2] += top
-
-        return image, target
-
-
-class RandomPhotometricDistort(nn.Module):
-    def __init__(self, contrast: Tuple[float] = (0.5, 1.5), saturation: Tuple[float] = (0.5, 1.5),
-                 hue: Tuple[float] = (-0.05, 0.05), brightness: Tuple[float] = (0.875, 1.125), p: float = 0.5):
-        super().__init__()
-        self._brightness = T.ColorJitter(brightness=brightness)
-        self._contrast = T.ColorJitter(contrast=contrast)
-        self._hue = T.ColorJitter(hue=hue)
-        self._saturation = T.ColorJitter(saturation=saturation)
-        self.p = p
-
-    def forward(self, image: Tensor,
-                target: Optional[Dict[str, Tensor]] = None) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]:
-        if isinstance(image, torch.Tensor):
-            if image.ndimension() not in {2, 3}:
-                raise ValueError('image should be 2/3 dimensional. Got {} dimensions.'.format(image.ndimension()))
-            elif image.ndimension() == 2:
-                image = image.unsqueeze(0)
-
-        r = torch.rand(7)
-
-        if r[0] < self.p:
-            image = self._brightness(image)
-
-        contrast_before = r[1] < 0.5
-        if contrast_before:
-            if r[2] < self.p:
-                image = self._contrast(image)
-
-        if r[3] < self.p:
-            image = self._saturation(image)
-
-        if r[4] < self.p:
-            image = self._hue(image)
-
-        if not contrast_before:
-            if r[5] < self.p:
-                image = self._contrast(image)
-
-        if r[6] < self.p:
-            channels = get_image_num_channels(image)
-            permutation = torch.randperm(channels)
-
-            is_pil = _is_pil_image(image)
-            if is_pil:
-                image = F.to_tensor(image)
-            image = image[..., permutation, :, :]
-            if is_pil:
-                image = F.to_pil_image(image)
-
-        return image, target
-
 import math
 import torch
 import torchvision
@@ -323,12 +114,6 @@ def _get_shape_onnx(image: Tensor) -> Tensor:
     return operators.shape_as_tensor(image)[-2:]
 
 
-@torch.jit.unused
-def _fake_cast_onnx(v: Tensor) -> float:
-    # ONNX requires a tensor but here we fake its type for JIT.
-    return v
-
-
 def _resize_image_and_masks(image: Tensor, target: Optional[Dict[str, Tensor]] = None,
                             image_size: Optional[Tuple[int, int]] = None,