let transform_instance_annotations clip boxes
Summary: Because the inputs are already meaningful "annotations", it logically makes sense to clip them here instead of later.

Reviewed By: rbgirshick

Differential Revision: D22125857

fbshipit-source-id: beefab1adb74076ae153ffdf39f548c85736e951
ppwwyyxx authored and facebook-github-bot committed Jun 19, 2020
1 parent 3d1d01e commit 59d88d0
Showing 3 changed files with 15 additions and 8 deletions.
7 changes: 6 additions & 1 deletion detectron2/data/dataset_mapper.py
@@ -136,7 +136,12 @@ def __call__(self, dataset_dict):
             instances = utils.annotations_to_instances(
                 annos, image_shape, mask_format=self.mask_format
             )
-            # Create a tight bounding box from masks, useful when image is cropped
+
+            # After transforms such as cropping are applied, the bounding box may no longer
+            # tightly bound the object. As an example, imagine a triangle object
+            # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
+            # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
+            # the intersection of original bounding box and the cropping box.
             if self.crop_gen and instances.has("gt_masks"):
                 instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
             dataset_dict["instances"] = utils.filter_empty_instances(instances)
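
The triangle example in the new comment can be checked numerically. Below is a minimal numpy-only sketch (not detectron2 API; the 0.01 grid resolution is an arbitrary choice for illustration): it rasterizes the triangle, applies the crop, and compares the tight box of the cropped mask with the intersection of the original box and the crop box.

import numpy as np

# Rasterize the triangle [(0,0), (2,0), (0,2)] on a fine grid: the region x + y <= 2.
step = 0.01
xs, ys = np.meshgrid(np.arange(0, 2, step), np.arange(0, 2, step))
mask = (xs + ys) <= 2.0

# Crop with the box [(1,0), (2,2)] in XYXY format: keep only the part with x >= 1.
cropped = mask & (xs >= 1.0)

# Tight XYXY bounding box of the cropped mask.
y_idx, x_idx = np.nonzero(cropped)
tight = [xs[0, x_idx.min()], ys[y_idx.min(), 0], xs[0, x_idx.max()], ys[y_idx.max(), 0]]
print(np.round(tight))  # ~[1. 0. 2. 1.]

# Intersecting the original box [0, 0, 2, 2] with the crop box [1, 0, 2, 2] instead
# gives [1, 0, 2, 2], which is looser than the tight box above.
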
9 changes: 5 additions & 4 deletions detectron2/data/detection_utils.py
@@ -214,9 +214,11 @@ def transform_instance_annotations(
             transformed according to `transforms`.
             The "bbox_mode" field will be set to XYXY_ABS.
     """
+    # bbox is 1d (per-instance bounding box)
     bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
-    # Note that bbox is 1d (per-instance bounding box)
-    annotation["bbox"] = transforms.apply_box([bbox])[0]
+    # clip transformed bbox to image size
+    bbox = transforms.apply_box([bbox])[0].clip(min=0)
+    annotation["bbox"] = np.minimum(bbox, list(image_size + image_size)[::-1])
     annotation["bbox_mode"] = BoxMode.XYXY_ABS
 
     if "segmentation" in annotation:
@@ -310,8 +312,7 @@ def annotations_to_instances(annos, image_size, mask_format="polygon"):
"""
boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
target = Instances(image_size)
boxes = target.gt_boxes = Boxes(boxes)
boxes.clip(image_size)
target.gt_boxes = Boxes(boxes)

classes = [obj["category_id"] for obj in annos]
classes = torch.tensor(classes, dtype=torch.int64)
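
For reference, the new clipping in transform_instance_annotations can be traced by hand, assuming image_size follows detectron2's (height, width) convention: image_size + image_size concatenates the tuple to (h, w, h, w), and reversing it yields [w, h, w, h], the per-coordinate upper bounds for an XYXY box, while .clip(min=0) enforces the lower bound. A small numpy sketch with made-up numbers:

import numpy as np

image_size = (480, 640)                       # (height, width)
bbox = np.array([-20.0, 30.0, 700.0, 500.0])  # transformed XYXY box, partly outside the image

bbox = bbox.clip(min=0)                       # lower bound: no negative coordinates
# (480, 640) + (480, 640) -> (480, 640, 480, 640); reversed -> [640, 480, 640, 480]
upper = list(image_size + image_size)[::-1]   # per-coordinate upper bounds [w, h, w, h]
print(np.minimum(bbox, upper))                # [  0.  30. 640. 480.]
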
7 changes: 4 additions & 3 deletions tests/data/test_detection_utils.py
@@ -66,15 +66,16 @@ def test_crop(self):
         keypoints = np.random.rand(17, 3) * 50 + 15
         keypoints[:, 2] = 2
         anno = {
-            "bbox": np.asarray([10, 10, 200, 300]),
+            "bbox": np.asarray([10, 10, 200, 400]),
             "bbox_mode": BoxMode.XYXY_ABS,
             "keypoints": keypoints,
         }
 
         output = detection_utils.transform_instance_annotations(
-            copy.deepcopy(anno), transforms, (400, 400)
+            copy.deepcopy(anno), transforms, (10, 10)
         )
-        self.assertTrue((output["bbox"] == np.asarray([-290, -290, -100, 0])).all())
+        # box is shifted and cropped
+        self.assertTrue((output["bbox"] == np.asarray([0, 0, 0, 10])).all())
         # keypoints are no longer visible
         self.assertTrue((output["keypoints"][:, 2] == 0).all())

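
The updated expectation can also be verified by hand, assuming the crop built earlier in test_crop shifts coordinates by (-300, -300); the transform itself is defined above the shown hunk, but the old expectation [-290, -290, -100, 0] is consistent with such a shift. A short sketch of the arithmetic:

import numpy as np

bbox = np.array([10.0, 10.0, 200.0, 400.0])  # the new test box (XYXY)
shifted = bbox - 300.0                        # assumed crop shift: [-290, -290, -100, 100]

image_size = (10, 10)                         # (height, width) passed to the helper
clipped = shifted.clip(min=0)                 # [0, 0, 0, 100]
clipped = np.minimum(clipped, list(image_size + image_size)[::-1])
print(clipped)                                # [ 0.  0.  0. 10.], the new expectation
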
