Reviewed By: feichtenhofer
Differential Revision: D36550846
fbshipit-source-id: 7e387001763faf5310fb9a38bda4f07c9d3a8ba7

1 parent 9957012, commit c9cf7c9
Showing 12 changed files with 386 additions and 1 deletion.

projects/MViTv2/README.md
@@ -0,0 +1,142 @@
# MViTv2: Improved Multiscale Vision Transformers for Classification and Detection

Yanghao Li*, Chao-Yuan Wu*, Haoqi Fan, Karttikeya Mangalam, Bo Xiong, Jitendra Malik, Christoph Feichtenhofer*

[[`arXiv`](https://arxiv.org/abs/2203.16527)] [[`BibTeX`](#CitingMViTv2)]

In this repository, we provide detection configs and models for MViTv2 (CVPR 2022) in Detectron2. For image classification tasks, please refer to the [MViTv2 repo](https://github.com/facebookresearch/mvit).

## Results and Pretrained Models

### COCO

<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">pre-train</th>
<th valign="bottom">Method</th>
<th valign="bottom">epochs</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">#params</th>
<th valign="bottom">FLOPS</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: mask_rcnn_mvitv2_t_3x -->
<tr><td align="left"><a href="configs/mask_rcnn_mvitv2_t_3x.py">MViTV2-T</a></td>
<td align="center">IN1K</td>
<td align="center">Mask R-CNN</td>
<td align="center">36</td>
<td align="center">48.3</td>
<td align="center">43.8</td>
<td align="center">44M</td>
<td align="center">279G</td>
<td align="center">307611773</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/MViTv2/mask_rcnn_mvitv2_t_3x/f307611773/model_final_1a1c30.pkl">model</a></td>
</tr>
<!-- ROW: cascade_mask_rcnn_mvitv2_t_3x -->
<tr><td align="left"><a href="configs/cascade_mask_rcnn_mvitv2_t_3x.py">MViTV2-T</a></td>
<td align="center">IN1K</td>
<td align="center">Cascade Mask R-CNN</td>
<td align="center">36</td>
<td align="center">52.2</td>
<td align="center">45.0</td>
<td align="center">76M</td>
<td align="center">701G</td>
<td align="center">308344828</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/MViTv2/cascade_mask_rcnn_mvitv2_t_3x/f308344828/model_final_c6967a.pkl">model</a></td>
</tr>
<!-- ROW: cascade_mask_rcnn_mvitv2_s_3x -->
<tr><td align="left"><a href="configs/cascade_mask_rcnn_mvitv2_s_3x.py">MViTV2-S</a></td>
<td align="center">IN1K</td>
<td align="center">Cascade Mask R-CNN</td>
<td align="center">36</td>
<td align="center">53.2</td>
<td align="center">46.0</td>
<td align="center">87M</td>
<td align="center">748G</td>
<td align="center">308344647</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/MViTv2/cascade_mask_rcnn_mvitv2_s_3x/f308344647/model_final_279baf.pkl">model</a></td>
</tr>
<!-- ROW: cascade_mask_rcnn_mvitv2_b_3x -->
<tr><td align="left"><a href="configs/cascade_mask_rcnn_mvitv2_b_3x.py">MViTV2-B</a></td>
<td align="center">IN1K</td>
<td align="center">Cascade Mask R-CNN</td>
<td align="center">36</td>
<td align="center">54.1</td>
<td align="center">46.7</td>
<td align="center">103M</td>
<td align="center">814G</td>
<td align="center">308109448</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/MViTv2/cascade_mask_rcnn_mvitv2_b_3x/f308109448/model_final_421a91.pkl">model</a></td>
</tr>
<!-- ROW: cascade_mask_rcnn_mvitv2_b_in21k_3x -->
<tr><td align="left"><a href="configs/cascade_mask_rcnn_mvitv2_b_in21k_3x.py">MViTV2-B</a></td>
<td align="center">IN21K</td>
<td align="center">Cascade Mask R-CNN</td>
<td align="center">36</td>
<td align="center">54.9</td>
<td align="center">47.4</td>
<td align="center">103M</td>
<td align="center">814G</td>
<td align="center">309003202</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/MViTv2/cascade_mask_rcnn_mvitv2_b_in12k_3x/f309003202/model_final_be5168.pkl">model</a></td>
</tr>
<!-- ROW: cascade_mask_rcnn_mvitv2_l_in21k_lsj_50ep -->
<tr><td align="left"><a href="configs/cascade_mask_rcnn_mvitv2_l_in21k_lsj_50ep.py">MViTV2-L</a></td>
<td align="center">IN21K</td>
<td align="center">Cascade Mask R-CNN</td>
<td align="center">50</td>
<td align="center">55.8</td>
<td align="center">48.3</td>
<td align="center">270M</td>
<td align="center">1519G</td>
<td align="center">308099658</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/MViTv2/cascade_mask_rcnn_mvitv2_l_in12k_lsj_50ep/f308099658/model_final_c41c5a.pkl">model</a></td>
</tr>
<!-- ROW: cascade_mask_rcnn_mvitv2_h_in21k_lsj_3x -->
<tr><td align="left"><a href="configs/cascade_mask_rcnn_mvitv2_h_in21k_lsj_3x.py">MViTV2-H</a></td>
<td align="center">IN21K</td>
<td align="center">Cascade Mask R-CNN</td>
<td align="center">36</td>
<td align="center">56.1</td>
<td align="center">48.5</td>
<td align="center">718M</td>
<td align="center">3084G</td>
<td align="center">309013744</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/MViTv2/cascade_mask_rcnn_mvitv2_h_in12k_lsj_3x/f309013744/model_final_30d36b.pkl">model</a></td>
</tr>
</tbody></table>

Note that the above models were trained and measured on 8 nodes with 64 NVIDIA A100 GPUs in total. The ImageNet pre-trained model weights are obtained from the [MViTv2 repo](https://github.com/facebookresearch/mvit).

## Training
All configs can be trained with:

```
../../tools/lazyconfig_train_net.py --config-file configs/path/to/config.py
```
By default, we use 64 GPUs with a total batch size of 64 for training.
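
To train with fewer GPUs, the batch size and learning rate can be scaled down in a derived config. The snippet below is only a minimal sketch, not part of this commit: the file name, the 8x scaling factor, and the assumption that `optimizer.lr` is a plain float in the base config are illustrative.

```python
# hypothetical configs/mask_rcnn_mvitv2_t_3x_8gpu.py: scale the default
# 64-GPU / batch-64 recipe down to 8 GPUs with linear learning-rate scaling.
from .mask_rcnn_mvitv2_t_3x import model, dataloader, optimizer, lr_multiplier, train

dataloader.train.total_batch_size = 8  # default in these configs is 64
optimizer.lr = optimizer.lr / 8        # assumes optimizer.lr is numeric in the base config
```

Such a derived config would then be passed to the same command above, together with `--num-gpus 8`.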

## Evaluation
Model evaluation can be done similarly:
```
../../tools/lazyconfig_train_net.py --config-file configs/path/to/config.py --eval-only train.init_checkpoint=/path/to/model_checkpoint
```
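
The same configs and checkpoints can also be used directly from Python. The sketch below is illustrative only: the config path and checkpoint path are placeholders, and the zero tensor is just a stand-in input.

```python
import torch

from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import LazyConfig, instantiate

# Load a lazy config, build the model, and load a released checkpoint.
cfg = LazyConfig.load("projects/MViTv2/configs/mask_rcnn_mvitv2_t_3x.py")
model = instantiate(cfg.model)
DetectionCheckpointer(model).load("/path/to/model_checkpoint")
model.eval()

# Detectron2 detection models take a list of dicts with a CHW "image" tensor
# and return a list of dicts containing an "instances" field.
with torch.no_grad():
    predictions = model([{"image": torch.zeros(3, 800, 800)}])
```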

## <a name="CitingMViTv2"></a>Citing MViTv2

If you use MViTv2, please use the following BibTeX entry.

```BibTeX
@inproceedings{li2021improved,
  title={MViTv2: Improved multiscale vision transformers for classification and detection},
  author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph},
  booktitle={CVPR},
  year={2022}
}
```

projects/MViTv2/configs/cascade_mask_rcnn_mvitv2_b_3x.py
@@ -0,0 +1,8 @@
from .cascade_mask_rcnn_mvitv2_t_3x import model, dataloader, optimizer, lr_multiplier, train


model.backbone.bottom_up.depth = 24
model.backbone.bottom_up.last_block_indexes = (1, 4, 20, 23)
model.backbone.bottom_up.drop_path_rate = 0.4

train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in1k.pyth"

3 changes: 3 additions & 0 deletions
projects/MViTv2/configs/cascade_mask_rcnn_mvitv2_b_in21k_3x.py
@@ -0,0 +1,3 @@
from .cascade_mask_rcnn_mvitv2_b_3x import model, dataloader, optimizer, lr_multiplier, train

train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_B_in21k.pyth"

12 changes: 12 additions & 0 deletions
projects/MViTv2/configs/cascade_mask_rcnn_mvitv2_h_in21k_lsj_3x.py
@@ -0,0 +1,12 @@
from .cascade_mask_rcnn_mvitv2_b_3x import model, optimizer, train, lr_multiplier
from .common.coco_loader_lsj import dataloader


model.backbone.bottom_up.embed_dim = 192
model.backbone.bottom_up.depth = 80
model.backbone.bottom_up.num_heads = 3
model.backbone.bottom_up.last_block_indexes = (3, 11, 71, 79)
model.backbone.bottom_up.drop_path_rate = 0.6
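# Activation (gradient) checkpointing: recompute intermediate activations during the
# backward pass instead of storing them, trading extra compute for a much smaller
# memory footprint when training this very large backbone.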
model.backbone.bottom_up.use_act_checkpoint = True

train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_H_in21k.pyth"

31 changes: 31 additions & 0 deletions
projects/MViTv2/configs/cascade_mask_rcnn_mvitv2_l_in21k_lsj_50ep.py
@@ -0,0 +1,31 @@
from fvcore.common.param_scheduler import MultiStepParamScheduler

from detectron2.config import LazyCall as L
from detectron2.solver import WarmupParamScheduler

from .cascade_mask_rcnn_mvitv2_b_3x import model, optimizer, train
from .common.coco_loader_lsj import dataloader


model.backbone.bottom_up.embed_dim = 144
model.backbone.bottom_up.depth = 48
model.backbone.bottom_up.num_heads = 2
model.backbone.bottom_up.last_block_indexes = (1, 7, 43, 47)
model.backbone.bottom_up.drop_path_rate = 0.5

train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_L_in21k.pyth"

# Schedule
# 50ep = 184375 // 2 iters * 64 images/iter / 118000 images/ep
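# i.e. max_iter = 92187; 92187 iters * 64 images/iter ~ 5.9M images, and
# 5.9M / 118000 images per COCO epoch ~ 50 epochs (half of the 184375-iter,
# 100-epoch schedule at total batch size 64).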
train.max_iter = 184375 // 2
lr_multiplier = L(WarmupParamScheduler)(
    scheduler=L(MultiStepParamScheduler)(
        values=[1.0, 0.1, 0.01],
        milestones=[163889 // 2, 177546 // 2],
        num_updates=train.max_iter,
    ),
    warmup_length=250 / train.max_iter,
    warmup_factor=0.001,
)

optimizer.lr = 1e-4

projects/MViTv2/configs/cascade_mask_rcnn_mvitv2_s_3x.py
@@ -0,0 +1,7 @@
from .cascade_mask_rcnn_mvitv2_t_3x import model, dataloader, optimizer, lr_multiplier, train


model.backbone.bottom_up.depth = 16
model.backbone.bottom_up.last_block_indexes = (0, 2, 13, 15)

train.init_checkpoint = "detectron2://ImageNetPretrained/mvitv2/MViTv2_S_in1k.pyth"

projects/MViTv2/configs/cascade_mask_rcnn_mvitv2_t_3x.py
@@ -0,0 +1,48 @@
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.roi_heads import FastRCNNOutputLayers, FastRCNNConvFCHead, CascadeROIHeads
from detectron2.layers.batch_norm import NaiveSyncBatchNorm

from .mask_rcnn_mvitv2_t_3x import model, dataloader, optimizer, lr_multiplier, train


# arguments that don't exist for Cascade R-CNN
[model.roi_heads.pop(k) for k in ["box_head", "box_predictor", "proposal_matcher"]]

model.roi_heads.update(
    _target_=CascadeROIHeads,
    box_heads=[
        L(FastRCNNConvFCHead)(
            input_shape=ShapeSpec(channels=256, height=7, width=7),
            conv_dims=[256, 256, 256, 256],
            fc_dims=[1024],
            conv_norm=lambda c: NaiveSyncBatchNorm(c, stats_mode="N"),
        )
        for _ in range(3)
    ],
    box_predictors=[
        L(FastRCNNOutputLayers)(
            input_shape=ShapeSpec(channels=1024),
            test_score_thresh=0.05,
            box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
            cls_agnostic_bbox_reg=True,
            num_classes="${...num_classes}",
        )
        for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
    ],
    proposal_matchers=[
        L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False)
        for th in [0.5, 0.6, 0.7]
    ],
)

# Using NaiveSyncBatchNorm because heads may have empty input. That is not supported by
# torch.nn.SyncBatchNorm. We can remove this after
# https://github.com/pytorch/pytorch/issues/36530 is fixed.
model.roi_heads.mask_head.conv_norm = lambda c: NaiveSyncBatchNorm(c, stats_mode="N")

# 2conv in RPN:
# https://github.com/tensorflow/tpu/blob/b24729de804fdb751b06467d3dce0637fa652060/models/official/detection/modeling/architecture/heads.py#L95-L97  # noqa: E501, B950
model.proposal_generator.head.conv_dims = [-1, -1]

projects/MViTv2/configs/common/coco_loader.py
@@ -0,0 +1,59 @@
from omegaconf import OmegaConf

import detectron2.data.transforms as T
from detectron2.config import LazyCall as L
from detectron2.data import (
    DatasetMapper,
    build_detection_test_loader,
    build_detection_train_loader,
    get_detection_dataset_dicts,
)
from detectron2.evaluation import COCOEvaluator

dataloader = OmegaConf.create()

dataloader.train = L(build_detection_train_loader)(
    dataset=L(get_detection_dataset_dicts)(names="coco_2017_train"),
    mapper=L(DatasetMapper)(
        is_train=True,
        augmentations=[
            L(T.RandomApply)(
                tfm_or_aug=L(T.AugmentationList)(
                    augs=[
                        L(T.ResizeShortestEdge)(
                            short_edge_length=[400, 500, 600], sample_style="choice"
                        ),
                        L(T.RandomCrop)(crop_type="absolute_range", crop_size=(384, 600)),
                    ]
                ),
                prob=0.5,
            ),
            L(T.ResizeShortestEdge)(
                short_edge_length=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800),
                sample_style="choice",
                max_size=1333,
            ),
            L(T.RandomFlip)(horizontal=True),
        ],
        image_format="RGB",
        use_instance_mask=True,
    ),
    total_batch_size=16,
    num_workers=4,
)

dataloader.test = L(build_detection_test_loader)(
    dataset=L(get_detection_dataset_dicts)(names="coco_2017_val", filter_empty=False),
    mapper=L(DatasetMapper)(
        is_train=False,
        augmentations=[
            L(T.ResizeShortestEdge)(short_edge_length=800, max_size=1333),
        ],
        image_format="${...train.mapper.image_format}",
    ),
    num_workers=4,
)

dataloader.evaluator = L(COCOEvaluator)(
    dataset_name="${..test.dataset.names}",
)
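
For context, a minimal sketch of how a lazy `dataloader` config like the one above is turned into a real loader; the config path is illustrative, and COCO must be set up locally as described in the detectron2 dataset documentation.

```python
from detectron2.config import LazyConfig, instantiate

cfg = LazyConfig.load("projects/MViTv2/configs/mask_rcnn_mvitv2_t_3x.py")
train_loader = instantiate(cfg.dataloader.train)  # calls build_detection_train_loader(...)
batch = next(iter(train_loader))                  # a list of dicts with "image" and "instances"
```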

projects/MViTv2/configs/common/coco_loader_lsj.py
@@ -0,0 +1,19 @@
import detectron2.data.transforms as T
from detectron2 import model_zoo
from detectron2.config import LazyCall as L

from .coco_loader import dataloader

# Data using LSJ
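# Large Scale Jitter (LSJ): randomly rescale each image by a factor in [0.1, 2.0]
# relative to a fixed 1024x1024 target, then crop or pad to that fixed size.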
image_size = 1024
dataloader.train.mapper.augmentations = [
    L(T.RandomFlip)(horizontal=True),  # flip first
    L(T.ResizeScale)(
        min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size
    ),
    L(T.FixedSizeCrop)(crop_size=(image_size, image_size)),
]
dataloader.train.mapper.image_format = "RGB"
dataloader.train.total_batch_size = 64
# recompute boxes due to cropping
dataloader.train.mapper.recompute_boxes = True