diff --git a/LICENSE b/LICENSE index 6b28d560c..3b222bf0d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,399 +1,22 @@ -Attribution-NonCommercial 4.0 International -======================================================================= +MIT License -Creative Commons Corporation ("Creative Commons") is not a law firm and -does not provide legal services or legal advice. Distribution of -Creative Commons public licenses does not create a lawyer-client or -other relationship. Creative Commons makes its licenses and related -information available on an "as-is" basis. Creative Commons gives no -warranties regarding its licenses, any material licensed under their -terms and conditions, or any related information. Creative Commons -disclaims all liability for damages resulting from their use to the -fullest extent possible. +Copyright (c) Meta Platforms, Inc. and affiliates. -Using Creative Commons Public Licenses +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: -Creative Commons public licenses provide a standard set of terms and -conditions that creators and other rights holders may use to share -original works of authorship and other material subject to copyright -and certain other rights specified in the public license below. The -following considerations are for informational purposes only, are not -exhaustive, and do not form part of our licenses. +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. - Considerations for licensors: Our public licenses are - intended for use by those authorized to give the public - permission to use material in ways otherwise restricted by - copyright and certain other rights. Our licenses are - irrevocable. Licensors should read and understand the terms - and conditions of the license they choose before applying it. - Licensors should also secure all rights necessary before - applying our licenses so that the public can reuse the - material as expected. Licensors should clearly mark any - material not subject to the license. This includes other CC- - licensed material, or material used under an exception or - limitation to copyright. More considerations for licensors: - wiki.creativecommons.org/Considerations_for_licensors - - Considerations for the public: By using one of our public - licenses, a licensor grants the public permission to use the - licensed material under specified terms and conditions. If - the licensor's permission is not necessary for any reason--for - example, because of any applicable exception or limitation to - copyright--then that use is not regulated by the license. Our - licenses grant only permissions under copyright and certain - other rights that a licensor has authority to grant. Use of - the licensed material may still be restricted for other - reasons, including because others have copyright or other - rights in the material. A licensor may make special requests, - such as asking that all changes be marked or described. - Although not required by our licenses, you are encouraged to - respect those requests where reasonable. 
More_considerations - for the public: - wiki.creativecommons.org/Considerations_for_licensees - -======================================================================= - -Creative Commons Attribution-NonCommercial 4.0 International Public -License - -By exercising the Licensed Rights (defined below), You accept and agree -to be bound by the terms and conditions of this Creative Commons -Attribution-NonCommercial 4.0 International Public License ("Public -License"). To the extent this Public License may be interpreted as a -contract, You are granted the Licensed Rights in consideration of Your -acceptance of these terms and conditions, and the Licensor grants You -such rights in consideration of benefits the Licensor receives from -making the Licensed Material available under these terms and -conditions. - -Section 1 -- Definitions. - - a. Adapted Material means material subject to Copyright and Similar - Rights that is derived from or based upon the Licensed Material - and in which the Licensed Material is translated, altered, - arranged, transformed, or otherwise modified in a manner requiring - permission under the Copyright and Similar Rights held by the - Licensor. For purposes of this Public License, where the Licensed - Material is a musical work, performance, or sound recording, - Adapted Material is always produced where the Licensed Material is - synched in timed relation with a moving image. - - b. Adapter's License means the license You apply to Your Copyright - and Similar Rights in Your contributions to Adapted Material in - accordance with the terms and conditions of this Public License. - - c. Copyright and Similar Rights means copyright and/or similar rights - closely related to copyright including, without limitation, - performance, broadcast, sound recording, and Sui Generis Database - Rights, without regard to how the rights are labeled or - categorized. For purposes of this Public License, the rights - specified in Section 2(b)(1)-(2) are not Copyright and Similar - Rights. - d. Effective Technological Measures means those measures that, in the - absence of proper authority, may not be circumvented under laws - fulfilling obligations under Article 11 of the WIPO Copyright - Treaty adopted on December 20, 1996, and/or similar international - agreements. - - e. Exceptions and Limitations means fair use, fair dealing, and/or - any other exception or limitation to Copyright and Similar Rights - that applies to Your use of the Licensed Material. - - f. Licensed Material means the artistic or literary work, database, - or other material to which the Licensor applied this Public - License. - - g. Licensed Rights means the rights granted to You subject to the - terms and conditions of this Public License, which are limited to - all Copyright and Similar Rights that apply to Your use of the - Licensed Material and that the Licensor has authority to license. - - h. Licensor means the individual(s) or entity(ies) granting rights - under this Public License. - - i. NonCommercial means not primarily intended for or directed towards - commercial advantage or monetary compensation. For purposes of - this Public License, the exchange of the Licensed Material for - other material subject to Copyright and Similar Rights by digital - file-sharing or similar means is NonCommercial provided there is - no payment of monetary compensation in connection with the - exchange. - - j. 
Share means to provide material to the public by any means or - process that requires permission under the Licensed Rights, such - as reproduction, public display, public performance, distribution, - dissemination, communication, or importation, and to make material - available to the public including in ways that members of the - public may access the material from a place and at a time - individually chosen by them. - - k. Sui Generis Database Rights means rights other than copyright - resulting from Directive 96/9/EC of the European Parliament and of - the Council of 11 March 1996 on the legal protection of databases, - as amended and/or succeeded, as well as other essentially - equivalent rights anywhere in the world. - - l. You means the individual or entity exercising the Licensed Rights - under this Public License. Your has a corresponding meaning. - -Section 2 -- Scope. - - a. License grant. - - 1. Subject to the terms and conditions of this Public License, - the Licensor hereby grants You a worldwide, royalty-free, - non-sublicensable, non-exclusive, irrevocable license to - exercise the Licensed Rights in the Licensed Material to: - - a. reproduce and Share the Licensed Material, in whole or - in part, for NonCommercial purposes only; and - - b. produce, reproduce, and Share Adapted Material for - NonCommercial purposes only. - - 2. Exceptions and Limitations. For the avoidance of doubt, where - Exceptions and Limitations apply to Your use, this Public - License does not apply, and You do not need to comply with - its terms and conditions. - - 3. Term. The term of this Public License is specified in Section - 6(a). - - 4. Media and formats; technical modifications allowed. The - Licensor authorizes You to exercise the Licensed Rights in - all media and formats whether now known or hereafter created, - and to make technical modifications necessary to do so. The - Licensor waives and/or agrees not to assert any right or - authority to forbid You from making technical modifications - necessary to exercise the Licensed Rights, including - technical modifications necessary to circumvent Effective - Technological Measures. For purposes of this Public License, - simply making modifications authorized by this Section 2(a) - (4) never produces Adapted Material. - - 5. Downstream recipients. - - a. Offer from the Licensor -- Licensed Material. Every - recipient of the Licensed Material automatically - receives an offer from the Licensor to exercise the - Licensed Rights under the terms and conditions of this - Public License. - - b. No downstream restrictions. You may not offer or impose - any additional or different terms or conditions on, or - apply any Effective Technological Measures to, the - Licensed Material if doing so restricts exercise of the - Licensed Rights by any recipient of the Licensed - Material. - - 6. No endorsement. Nothing in this Public License constitutes or - may be construed as permission to assert or imply that You - are, or that Your use of the Licensed Material is, connected - with, or sponsored, endorsed, or granted official status by, - the Licensor or others designated to receive attribution as - provided in Section 3(a)(1)(A)(i). - - b. Other rights. - - 1. 
Moral rights, such as the right of integrity, are not - licensed under this Public License, nor are publicity, - privacy, and/or other similar personality rights; however, to - the extent possible, the Licensor waives and/or agrees not to - assert any such rights held by the Licensor to the limited - extent necessary to allow You to exercise the Licensed - Rights, but not otherwise. - - 2. Patent and trademark rights are not licensed under this - Public License. - - 3. To the extent possible, the Licensor waives any right to - collect royalties from You for the exercise of the Licensed - Rights, whether directly or through a collecting society - under any voluntary or waivable statutory or compulsory - licensing scheme. In all other cases the Licensor expressly - reserves any right to collect such royalties, including when - the Licensed Material is used other than for NonCommercial - purposes. - -Section 3 -- License Conditions. - -Your exercise of the Licensed Rights is expressly made subject to the -following conditions. - - a. Attribution. - - 1. If You Share the Licensed Material (including in modified - form), You must: - - a. retain the following if it is supplied by the Licensor - with the Licensed Material: - - i. identification of the creator(s) of the Licensed - Material and any others designated to receive - attribution, in any reasonable manner requested by - the Licensor (including by pseudonym if - designated); - - ii. a copyright notice; - - iii. a notice that refers to this Public License; - - iv. a notice that refers to the disclaimer of - warranties; - - v. a URI or hyperlink to the Licensed Material to the - extent reasonably practicable; - - b. indicate if You modified the Licensed Material and - retain an indication of any previous modifications; and - - c. indicate the Licensed Material is licensed under this - Public License, and include the text of, or the URI or - hyperlink to, this Public License. - - 2. You may satisfy the conditions in Section 3(a)(1) in any - reasonable manner based on the medium, means, and context in - which You Share the Licensed Material. For example, it may be - reasonable to satisfy the conditions by providing a URI or - hyperlink to a resource that includes the required - information. - - 3. If requested by the Licensor, You must remove any of the - information required by Section 3(a)(1)(A) to the extent - reasonably practicable. - - 4. If You Share Adapted Material You produce, the Adapter's - License You apply must not prevent recipients of the Adapted - Material from complying with this Public License. - -Section 4 -- Sui Generis Database Rights. - -Where the Licensed Rights include Sui Generis Database Rights that -apply to Your use of the Licensed Material: - - a. for the avoidance of doubt, Section 2(a)(1) grants You the right - to extract, reuse, reproduce, and Share all or a substantial - portion of the contents of the database for NonCommercial purposes - only; - - b. if You include all or a substantial portion of the database - contents in a database in which You have Sui Generis Database - Rights, then the database in which You have Sui Generis Database - Rights (but not its individual contents) is Adapted Material; and - - c. You must comply with the conditions in Section 3(a) if You Share - all or a substantial portion of the contents of the database. 
- -For the avoidance of doubt, this Section 4 supplements and does not -replace Your obligations under this Public License where the Licensed -Rights include other Copyright and Similar Rights. - -Section 5 -- Disclaimer of Warranties and Limitation of Liability. - - a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE - EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS - AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF - ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, - IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, - WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR - PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, - ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT - KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT - ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. - - b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE - TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, - NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, - INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, - COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR - USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN - ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR - DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR - IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. - - c. The disclaimer of warranties and limitation of liability provided - above shall be interpreted in a manner that, to the extent - possible, most closely approximates an absolute disclaimer and - waiver of all liability. - -Section 6 -- Term and Termination. - - a. This Public License applies for the term of the Copyright and - Similar Rights licensed here. However, if You fail to comply with - this Public License, then Your rights under this Public License - terminate automatically. - - b. Where Your right to use the Licensed Material has terminated under - Section 6(a), it reinstates: - - 1. automatically as of the date the violation is cured, provided - it is cured within 30 days of Your discovery of the - violation; or - - 2. upon express reinstatement by the Licensor. - - For the avoidance of doubt, this Section 6(b) does not affect any - right the Licensor may have to seek remedies for Your violations - of this Public License. - - c. For the avoidance of doubt, the Licensor may also offer the - Licensed Material under separate terms or conditions or stop - distributing the Licensed Material at any time; however, doing so - will not terminate this Public License. - - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public - License. - -Section 7 -- Other Terms and Conditions. - - a. The Licensor shall not be bound by any additional or different - terms or conditions communicated by You unless expressly agreed. - - b. Any arrangements, understandings, or agreements regarding the - Licensed Material not stated herein are separate from and - independent of the terms and conditions of this Public License. - -Section 8 -- Interpretation. - - a. For the avoidance of doubt, this Public License does not, and - shall not be interpreted to, reduce, limit, restrict, or impose - conditions on any use of the Licensed Material that could lawfully - be made without permission under this Public License. - - b. 
To the extent possible, if any provision of this Public License is - deemed unenforceable, it shall be automatically reformed to the - minimum extent necessary to make it enforceable. If the provision - cannot be reformed, it shall be severed from this Public License - without affecting the enforceability of the remaining terms and - conditions. - - c. No term or condition of this Public License will be waived and no - failure to comply consented to unless expressly agreed to by the - Licensor. - - d. Nothing in this Public License constitutes or may be interpreted - as a limitation upon, or waiver of, any privileges and immunities - that apply to the Licensor or You, including from the legal - processes of any jurisdiction or authority. - -======================================================================= - -Creative Commons is not a party to its public -licenses. Notwithstanding, Creative Commons may elect to apply one of -its public licenses to material it publishes and in those instances -will be considered the “Licensor.” The text of the Creative Commons -public licenses is dedicated to the public domain under the CC0 Public -Domain Dedication. Except for the limited purpose of indicating that -material is shared under a Creative Commons public license or as -otherwise permitted by the Creative Commons policies published at -creativecommons.org/policies, Creative Commons does not authorize the -use of the trademark "Creative Commons" or any other trademark or logo -of Creative Commons without its prior written consent including, -without limitation, in connection with any unauthorized modifications -to any of its public licenses or any other arrangements, -understandings, or agreements concerning use of licensed material. For -the avoidance of doubt, this paragraph does not form part of the -public licenses. - -Creative Commons may be contacted at creativecommons.org. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/detection/configs/Base-RCNN-C4-BN.yaml b/detection/configs/Base-RCNN-C4-BN.yaml index 5104c6a60..eaa5b376e 100644 --- a/detection/configs/Base-RCNN-C4-BN.yaml +++ b/detection/configs/Base-RCNN-C4-BN.yaml @@ -8,7 +8,8 @@ MODEL: BACKBONE: FREEZE_AT: 0 RESNETS: - NORM: "SyncBN" + # SyncBN seems to cause larger variance for unknown reasons + NORM: "naiveSyncBN" TEST: PRECISE_BN: ENABLED: True diff --git a/detection/convert-pretrain-to-detectron2.py b/detection/convert-pretrain-to-detectron2.py old mode 100755 new mode 100644 index b96ed9192..6e06b2d5e --- a/detection/convert-pretrain-to-detectron2.py +++ b/detection/convert-pretrain-to-detectron2.py @@ -1,10 +1,16 @@ #!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ import pickle as pkl import sys + import torch + if __name__ == "__main__": input = sys.argv[1] diff --git a/detection/train_net.py b/detection/train_net.py old mode 100755 new mode 100644 index 39e844c50..008724db4 --- a/detection/train_net.py +++ b/detection/train_net.py @@ -1,14 +1,23 @@ #!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + import os from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg -from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch +from detectron2.engine import ( + default_argument_parser, + default_setup, + DefaultTrainer, + launch, +) from detectron2.evaluation import COCOEvaluator, PascalVOCDetectionEvaluator from detectron2.layers import get_norm -from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads +from detectron2.modeling.roi_heads import Res5ROIHeads, ROI_HEADS_REGISTRY @ROI_HEADS_REGISTRY.register() @@ -17,12 +26,12 @@ class Res5ROIHeadsExtraNorm(Res5ROIHeads): As described in the MOCO paper, there is an extra BN layer following the res5 stage. """ - def _build_res5_block(self, cfg): - seq, out_channels = super()._build_res5_block(cfg) + + def __init__(self, cfg, input_shape): + super().__init__(cfg, input_shape) norm = cfg.MODEL.RESNETS.NORM - norm = get_norm(norm, out_channels) - seq.add_module("norm", norm) - return seq, out_channels + norm = get_norm(norm, self.res5[-1].out_channels) + self.res5.add_module("norm", norm) class Trainer(DefaultTrainer): @@ -62,7 +71,7 @@ def main(args): return trainer.train() -if __name__ == "__main__": +def invoke_main() -> None: args = default_argument_parser().parse_args() print("Command Line Args:", args) launch( @@ -73,3 +82,7 @@ def main(args): dist_url=args.dist_url, args=(args,), ) + + +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/main_lincls.py b/main_lincls.py old mode 100755 new mode 100644 index cb9d51675..23ba8839e --- a/main_lincls.py +++ b/main_lincls.py @@ -1,5 +1,10 @@ #!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + import argparse import builtins import os @@ -9,76 +14,145 @@ import warnings import torch -import torch.nn as nn -import torch.nn.parallel import torch.backends.cudnn as cudnn import torch.distributed as dist -import torch.optim import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.parallel +import torch.optim import torch.utils.data import torch.utils.data.distributed -import torchvision.transforms as transforms import torchvision.datasets as datasets import torchvision.models as models +import torchvision.transforms as transforms + -model_names = sorted(name for name in models.__dict__ - if name.islower() and not name.startswith("__") - and callable(models.__dict__[name])) - -parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') -parser.add_argument('data', metavar='DIR', - help='path to dataset') -parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50', - choices=model_names, - help='model architecture: ' + - ' | '.join(model_names) + - ' (default: resnet50)') -parser.add_argument('-j', '--workers', default=32, type=int, metavar='N', - help='number of data loading workers (default: 32)') -parser.add_argument('--epochs', default=100, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--start-epoch', default=0, type=int, metavar='N', - help='manual epoch number (useful on restarts)') -parser.add_argument('-b', '--batch-size', default=256, type=int, - metavar='N', - help='mini-batch size (default: 256), this is the total ' - 'batch size of all GPUs on the current node when ' - 'using Data Parallel or Distributed Data Parallel') -parser.add_argument('--lr', '--learning-rate', default=30., type=float, - metavar='LR', help='initial learning rate', dest='lr') -parser.add_argument('--schedule', default=[60, 80], nargs='*', type=int, - help='learning rate schedule (when to drop lr by a ratio)') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum') -parser.add_argument('--wd', '--weight-decay', default=0., type=float, - metavar='W', help='weight decay (default: 0.)', - dest='weight_decay') -parser.add_argument('-p', '--print-freq', default=10, type=int, - metavar='N', help='print frequency (default: 10)') -parser.add_argument('--resume', default='', type=str, metavar='PATH', - help='path to latest checkpoint (default: none)') -parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', - help='evaluate model on validation set') -parser.add_argument('--world-size', default=-1, type=int, - help='number of nodes for distributed training') -parser.add_argument('--rank', default=-1, type=int, - help='node rank for distributed training') -parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, - help='url used to set up distributed training') -parser.add_argument('--dist-backend', default='nccl', type=str, - help='distributed backend') -parser.add_argument('--seed', default=None, type=int, - help='seed for initializing training. ') -parser.add_argument('--gpu', default=None, type=int, - help='GPU id to use.') -parser.add_argument('--multiprocessing-distributed', action='store_true', - help='Use multi-processing distributed training to launch ' - 'N processes per node, which has N GPUs. 
This is the ' - 'fastest way to use PyTorch for either single node or ' - 'multi node data parallel training') - -parser.add_argument('--pretrained', default='', type=str, - help='path to moco pretrained checkpoint') +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument("data", metavar="DIR", help="path to dataset") +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", + "--workers", + default=32, + type=int, + metavar="N", + help="number of data loading workers (default: 32)", +) +parser.add_argument( + "--epochs", default=100, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "--start-epoch", + default=0, + type=int, + metavar="N", + help="manual epoch number (useful on restarts)", +) +parser.add_argument( + "-b", + "--batch-size", + default=256, + type=int, + metavar="N", + help="mini-batch size (default: 256), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "--lr", + "--learning-rate", + default=30.0, + type=float, + metavar="LR", + help="initial learning rate", + dest="lr", +) +parser.add_argument( + "--schedule", + default=[60, 80], + nargs="*", + type=int, + help="learning rate schedule (when to drop lr by a ratio)", +) +parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum") +parser.add_argument( + "--wd", + "--weight-decay", + default=0.0, + type=float, + metavar="W", + help="weight decay (default: 0.)", + dest="weight_decay", +) +parser.add_argument( + "-p", + "--print-freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", +) +parser.add_argument( + "--resume", + default="", + type=str, + metavar="PATH", + help="path to latest checkpoint (default: none)", +) +parser.add_argument( + "-e", + "--evaluate", + dest="evaluate", + action="store_true", + help="evaluate model on validation set", +) +parser.add_argument( + "--world-size", + default=-1, + type=int, + help="number of nodes for distributed training", +) +parser.add_argument( + "--rank", default=-1, type=int, help="node rank for distributed training" +) +parser.add_argument( + "--dist-url", + default="tcp://224.66.41.62:23456", + type=str, + help="url used to set up distributed training", +) +parser.add_argument( + "--dist-backend", default="nccl", type=str, help="distributed backend" +) +parser.add_argument( + "--seed", default=None, type=int, help="seed for initializing training. " +) +parser.add_argument("--gpu", default=None, type=int, help="GPU id to use.") +parser.add_argument( + "--multiprocessing-distributed", + action="store_true", + help="Use multi-processing distributed training to launch " + "N processes per node, which has N GPUs. This is the " + "fastest way to use PyTorch for either single node or " + "multi node data parallel training", +) + +parser.add_argument( + "--pretrained", default="", type=str, help="path to moco pretrained checkpoint" +) best_acc1 = 0 @@ -90,15 +164,19 @@ def main(): random.seed(args.seed) torch.manual_seed(args.seed) cudnn.deterministic = True - warnings.warn('You have chosen to seed training. 
' - 'This will turn on the CUDNN deterministic setting, ' - 'which can slow down your training considerably! ' - 'You may see unexpected behavior when restarting ' - 'from checkpoints.') + warnings.warn( + "You have chosen to seed training. " + "This will turn on the CUDNN deterministic setting, " + "which can slow down your training considerably! " + "You may see unexpected behavior when restarting " + "from checkpoints." + ) if args.gpu is not None: - warnings.warn('You have chosen a specific GPU. This will completely ' - 'disable data parallelism.') + warnings.warn( + "You have chosen a specific GPU. This will completely " + "disable data parallelism." + ) if args.dist_url == "env://" and args.world_size == -1: args.world_size = int(os.environ["WORLD_SIZE"]) @@ -124,8 +202,10 @@ def main_worker(gpu, ngpus_per_node, args): # suppress printing if not master if args.multiprocessing_distributed and args.gpu != 0: + def print_pass(*args): pass + builtins.print = print_pass if args.gpu is not None: @@ -138,15 +218,19 @@ def print_pass(*args): # For multiprocessing distributed training, rank needs to be the # global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu - dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) + dist.init_process_group( + backend=args.dist_backend, + init_method=args.dist_url, + world_size=args.world_size, + rank=args.rank, + ) # create model print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() # freeze all layers but the last fc for name, param in model.named_parameters(): - if name not in ['fc.weight', 'fc.bias']: + if name not in ["fc.weight", "fc.bias"]: param.requires_grad = False # init the fc layer model.fc.weight.data.normal_(mean=0.0, std=0.01) @@ -159,12 +243,14 @@ def print_pass(*args): checkpoint = torch.load(args.pretrained, map_location="cpu") # rename moco pre-trained keys - state_dict = checkpoint['state_dict'] + state_dict = checkpoint["state_dict"] for k in list(state_dict.keys()): # retain only encoder_q up to before the embedding layer - if k.startswith('module.encoder_q') and not k.startswith('module.encoder_q.fc'): + if k.startswith("module.encoder_q") and not k.startswith( + "module.encoder_q.fc" + ): # remove prefix - state_dict[k[len("module.encoder_q."):]] = state_dict[k] + state_dict[k[len("module.encoder_q.") :]] = state_dict[k] # delete renamed or unused k del state_dict[k] @@ -188,7 +274,9 @@ def print_pass(*args): # ourselves based on the total number of GPUs we have args.batch_size = int(args.batch_size / ngpus_per_node) args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.gpu] + ) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all @@ -199,7 +287,7 @@ def print_pass(*args): model = model.cuda(args.gpu) else: # DataParallel will divide and allocate batch_size to all available GPUs - if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + if args.arch.startswith("alexnet") or args.arch.startswith("vgg"): model.features = torch.nn.DataParallel(model.features) model.cuda() else: @@ -211,9 +299,9 @@ def print_pass(*args): # optimize only the linear classifier parameters = list(filter(lambda p: p.requires_grad, model.parameters())) assert len(parameters) == 2 # fc.weight, fc.bias - 
optimizer = torch.optim.SGD(parameters, args.lr, - momentum=args.momentum, - weight_decay=args.weight_decay) + optimizer = torch.optim.SGD( + parameters, args.lr, momentum=args.momentum, weight_decay=args.weight_decay + ) # optionally resume from a checkpoint if args.resume: @@ -223,36 +311,43 @@ def print_pass(*args): checkpoint = torch.load(args.resume) else: # Map model to be loaded to specified single gpu. - loc = 'cuda:{}'.format(args.gpu) + loc = "cuda:{}".format(args.gpu) checkpoint = torch.load(args.resume, map_location=loc) - args.start_epoch = checkpoint['epoch'] - best_acc1 = checkpoint['best_acc1'] + args.start_epoch = checkpoint["epoch"] + best_acc1 = checkpoint["best_acc1"] if args.gpu is not None: # best_acc1 may be from a checkpoint from a different GPU best_acc1 = best_acc1.to(args.gpu) - model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - print("=> loaded checkpoint '{}' (epoch {})" - .format(args.resume, checkpoint['epoch'])) + model.load_state_dict(checkpoint["state_dict"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + print( + "=> loaded checkpoint '{}' (epoch {})".format( + args.resume, checkpoint["epoch"] + ) + ) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True # Data loading code - traindir = os.path.join(args.data, 'train') - valdir = os.path.join(args.data, 'val') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) + traindir = os.path.join(args.data, "train") + valdir = os.path.join(args.data, "val") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) train_dataset = datasets.ImageFolder( traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) + transforms.Compose( + [ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ] + ), + ) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) @@ -260,18 +355,31 @@ def print_pass(*args): train_sampler = None train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), - num_workers=args.workers, pin_memory=True, sampler=train_sampler) + train_dataset, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + num_workers=args.workers, + pin_memory=True, + sampler=train_sampler, + ) val_loader = torch.utils.data.DataLoader( - datasets.ImageFolder(valdir, transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])), - batch_size=args.batch_size, shuffle=False, - num_workers=args.workers, pin_memory=True) + datasets.ImageFolder( + valdir, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ] + ), + ), + batch_size=args.batch_size, + shuffle=False, + num_workers=args.workers, + pin_memory=True, + ) if args.evaluate: validate(val_loader, model, criterion, args) @@ -292,29 +400,34 @@ def print_pass(*args): is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) - if not args.multiprocessing_distributed or (args.multiprocessing_distributed - and args.rank % ngpus_per_node == 0): - save_checkpoint({ - 'epoch': epoch + 1, - 'arch': args.arch, - 'state_dict': model.state_dict(), - 'best_acc1': best_acc1, - 'optimizer' : optimizer.state_dict(), - }, 
is_best) + if not args.multiprocessing_distributed or ( + args.multiprocessing_distributed and args.rank % ngpus_per_node == 0 + ): + save_checkpoint( + { + "epoch": epoch + 1, + "arch": args.arch, + "state_dict": model.state_dict(), + "best_acc1": best_acc1, + "optimizer": optimizer.state_dict(), + }, + is_best, + ) if epoch == args.start_epoch: sanity_check(model.state_dict(), args.pretrained) def train(train_loader, model, criterion, optimizer, epoch, args): - batch_time = AverageMeter('Time', ':6.3f') - data_time = AverageMeter('Data', ':6.3f') - losses = AverageMeter('Loss', ':.4e') - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') + batch_time = AverageMeter("Time", ":6.3f") + data_time = AverageMeter("Data", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") progress = ProgressMeter( len(train_loader), [batch_time, data_time, losses, top1, top5], - prefix="Epoch: [{}]".format(epoch)) + prefix="Epoch: [{}]".format(epoch), + ) """ Switch to eval mode: @@ -358,14 +471,13 @@ def train(train_loader, model, criterion, optimizer, epoch, args): def validate(val_loader, model, criterion, args): - batch_time = AverageMeter('Time', ':6.3f') - losses = AverageMeter('Loss', ':.4e') - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') + batch_time = AverageMeter("Time", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") progress = ProgressMeter( - len(val_loader), - [batch_time, losses, top1, top5], - prefix='Test: ') + len(val_loader), [batch_time, losses, top1, top5], prefix="Test: " + ) # switch to evaluate mode model.eval() @@ -395,16 +507,17 @@ def validate(val_loader, model, criterion, args): progress.display(i) # TODO: this should also be done with the ProgressMeter - print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' - .format(top1=top1, top5=top5)) + print( + " * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}".format(top1=top1, top5=top5) + ) return top1.avg -def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): torch.save(state, filename) if is_best: - shutil.copyfile(filename, 'model_best.pth.tar') + shutil.copyfile(filename, "model_best.pth.tar") def sanity_check(state_dict, pretrained_weights): @@ -414,26 +527,31 @@ def sanity_check(state_dict, pretrained_weights): """ print("=> loading '{}' for sanity check".format(pretrained_weights)) checkpoint = torch.load(pretrained_weights, map_location="cpu") - state_dict_pre = checkpoint['state_dict'] + state_dict_pre = checkpoint["state_dict"] for k in list(state_dict.keys()): # only ignore fc layer - if 'fc.weight' in k or 'fc.bias' in k: + if "fc.weight" in k or "fc.bias" in k: continue # name in pretrained model - k_pre = 'module.encoder_q.' + k[len('module.'):] \ - if k.startswith('module.') else 'module.encoder_q.' + k + k_pre = ( + "module.encoder_q." + k[len("module.") :] + if k.startswith("module.") + else "module.encoder_q." 
+ k + ) - assert ((state_dict[k].cpu() == state_dict_pre[k_pre]).all()), \ - '{} is changed in linear classifier training.'.format(k) + assert ( + state_dict[k].cpu() == state_dict_pre[k_pre] + ).all(), "{} is changed in linear classifier training.".format(k) print("=> sanity check passed.") -class AverageMeter(object): +class AverageMeter: """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): + + def __init__(self, name, fmt=":f"): self.name = name self.fmt = fmt self.reset() @@ -451,11 +569,11 @@ def update(self, val, n=1): self.avg = self.sum / self.count def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" return fmtstr.format(**self.__dict__) -class ProgressMeter(object): +class ProgressMeter: def __init__(self, num_batches, meters, prefix=""): self.batch_fmtstr = self._get_batch_fmtstr(num_batches) self.meters = meters @@ -464,21 +582,21 @@ def __init__(self, num_batches, meters, prefix=""): def display(self, batch): entries = [self.prefix + self.batch_fmtstr.format(batch)] entries += [str(meter) for meter in self.meters] - print('\t'.join(entries)) + print("\t".join(entries)) def _get_batch_fmtstr(self, num_batches): num_digits = len(str(num_batches // 1)) - fmt = '{:' + str(num_digits) + 'd}' - return '[' + fmt + '/' + fmt.format(num_batches) + ']' + fmt = "{:" + str(num_digits) + "d}" + return "[" + fmt + "/" + fmt.format(num_batches) + "]" def adjust_learning_rate(optimizer, epoch, args): """Decay the learning rate based on schedule""" lr = args.lr for milestone in args.schedule: - lr *= 0.1 if epoch >= milestone else 1. + lr *= 0.1 if epoch >= milestone else 1.0 for param_group in optimizer.param_groups: - param_group['lr'] = lr + param_group["lr"] = lr def accuracy(output, target, topk=(1,)): @@ -498,5 +616,5 @@ def accuracy(output, target, topk=(1,)): return res -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/main_moco.py b/main_moco.py old mode 100755 new mode 100644 index d7ea97359..45520c7cd --- a/main_moco.py +++ b/main_moco.py @@ -1,5 +1,11 @@ #!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + import argparse import builtins import math @@ -9,93 +15,166 @@ import time import warnings +import deeplearning.cross_image_ssl.moco.builder +import deeplearning.cross_image_ssl.moco.loader import torch -import torch.nn as nn -import torch.nn.parallel import torch.backends.cudnn as cudnn import torch.distributed as dist -import torch.optim import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.parallel +import torch.optim import torch.utils.data import torch.utils.data.distributed -import torchvision.transforms as transforms import torchvision.datasets as datasets import torchvision.models as models +import torchvision.transforms as transforms -import moco.loader -import moco.builder - -model_names = sorted(name for name in models.__dict__ - if name.islower() and not name.startswith("__") - and callable(models.__dict__[name])) - -parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') -parser.add_argument('data', metavar='DIR', - help='path to dataset') -parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50', - choices=model_names, - help='model architecture: ' + - ' | '.join(model_names) + - ' (default: resnet50)') -parser.add_argument('-j', '--workers', default=32, type=int, metavar='N', - help='number of data loading workers (default: 32)') -parser.add_argument('--epochs', default=200, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--start-epoch', default=0, type=int, metavar='N', - help='manual epoch number (useful on restarts)') -parser.add_argument('-b', '--batch-size', default=256, type=int, - metavar='N', - help='mini-batch size (default: 256), this is the total ' - 'batch size of all GPUs on the current node when ' - 'using Data Parallel or Distributed Data Parallel') -parser.add_argument('--lr', '--learning-rate', default=0.03, type=float, - metavar='LR', help='initial learning rate', dest='lr') -parser.add_argument('--schedule', default=[120, 160], nargs='*', type=int, - help='learning rate schedule (when to drop lr by 10x)') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum of SGD solver') -parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, - metavar='W', help='weight decay (default: 1e-4)', - dest='weight_decay') -parser.add_argument('-p', '--print-freq', default=10, type=int, - metavar='N', help='print frequency (default: 10)') -parser.add_argument('--resume', default='', type=str, metavar='PATH', - help='path to latest checkpoint (default: none)') -parser.add_argument('--world-size', default=-1, type=int, - help='number of nodes for distributed training') -parser.add_argument('--rank', default=-1, type=int, - help='node rank for distributed training') -parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, - help='url used to set up distributed training') -parser.add_argument('--dist-backend', default='nccl', type=str, - help='distributed backend') -parser.add_argument('--seed', default=None, type=int, - help='seed for initializing training. ') -parser.add_argument('--gpu', default=None, type=int, - help='GPU id to use.') -parser.add_argument('--multiprocessing-distributed', action='store_true', - help='Use multi-processing distributed training to launch ' - 'N processes per node, which has N GPUs. 
This is the ' - 'fastest way to use PyTorch for either single node or ' - 'multi node data parallel training') + +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument("data", metavar="DIR", help="path to dataset") +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet50", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet50)", +) +parser.add_argument( + "-j", + "--workers", + default=32, + type=int, + metavar="N", + help="number of data loading workers (default: 32)", +) +parser.add_argument( + "--epochs", default=200, type=int, metavar="N", help="number of total epochs to run" +) +parser.add_argument( + "--start-epoch", + default=0, + type=int, + metavar="N", + help="manual epoch number (useful on restarts)", +) +parser.add_argument( + "-b", + "--batch-size", + default=256, + type=int, + metavar="N", + help="mini-batch size (default: 256), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "--lr", + "--learning-rate", + default=0.03, + type=float, + metavar="LR", + help="initial learning rate", + dest="lr", +) +parser.add_argument( + "--schedule", + default=[120, 160], + nargs="*", + type=int, + help="learning rate schedule (when to drop lr by 10x)", +) +parser.add_argument( + "--momentum", default=0.9, type=float, metavar="M", help="momentum of SGD solver" +) +parser.add_argument( + "--wd", + "--weight-decay", + default=1e-4, + type=float, + metavar="W", + help="weight decay (default: 1e-4)", + dest="weight_decay", +) +parser.add_argument( + "-p", + "--print-freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", +) +parser.add_argument( + "--resume", + default="", + type=str, + metavar="PATH", + help="path to latest checkpoint (default: none)", +) +parser.add_argument( + "--world-size", + default=-1, + type=int, + help="number of nodes for distributed training", +) +parser.add_argument( + "--rank", default=-1, type=int, help="node rank for distributed training" +) +parser.add_argument( + "--dist-url", + default="tcp://224.66.41.62:23456", + type=str, + help="url used to set up distributed training", +) +parser.add_argument( + "--dist-backend", default="nccl", type=str, help="distributed backend" +) +parser.add_argument( + "--seed", default=None, type=int, help="seed for initializing training. " +) +parser.add_argument("--gpu", default=None, type=int, help="GPU id to use.") +parser.add_argument( + "--multiprocessing-distributed", + action="store_true", + help="Use multi-processing distributed training to launch " + "N processes per node, which has N GPUs. 
This is the " + "fastest way to use PyTorch for either single node or " + "multi node data parallel training", +) # moco specific configs: -parser.add_argument('--moco-dim', default=128, type=int, - help='feature dimension (default: 128)') -parser.add_argument('--moco-k', default=65536, type=int, - help='queue size; number of negative keys (default: 65536)') -parser.add_argument('--moco-m', default=0.999, type=float, - help='moco momentum of updating key encoder (default: 0.999)') -parser.add_argument('--moco-t', default=0.07, type=float, - help='softmax temperature (default: 0.07)') +parser.add_argument( + "--moco-dim", default=128, type=int, help="feature dimension (default: 128)" +) +parser.add_argument( + "--moco-k", + default=65536, + type=int, + help="queue size; number of negative keys (default: 65536)", +) +parser.add_argument( + "--moco-m", + default=0.999, + type=float, + help="moco momentum of updating key encoder (default: 0.999)", +) +parser.add_argument( + "--moco-t", default=0.07, type=float, help="softmax temperature (default: 0.07)" +) # options for moco v2 -parser.add_argument('--mlp', action='store_true', - help='use mlp head') -parser.add_argument('--aug-plus', action='store_true', - help='use moco v2 data augmentation') -parser.add_argument('--cos', action='store_true', - help='use cosine lr schedule') +parser.add_argument("--mlp", action="store_true", help="use mlp head") +parser.add_argument( + "--aug-plus", action="store_true", help="use moco v2 data augmentation" +) +parser.add_argument("--cos", action="store_true", help="use cosine lr schedule") def main(): @@ -105,15 +184,19 @@ def main(): random.seed(args.seed) torch.manual_seed(args.seed) cudnn.deterministic = True - warnings.warn('You have chosen to seed training. ' - 'This will turn on the CUDNN deterministic setting, ' - 'which can slow down your training considerably! ' - 'You may see unexpected behavior when restarting ' - 'from checkpoints.') + warnings.warn( + "You have chosen to seed training. " + "This will turn on the CUDNN deterministic setting, " + "which can slow down your training considerably! " + "You may see unexpected behavior when restarting " + "from checkpoints." + ) if args.gpu is not None: - warnings.warn('You have chosen a specific GPU. This will completely ' - 'disable data parallelism.') + warnings.warn( + "You have chosen a specific GPU. This will completely " + "disable data parallelism." 
+ ) if args.dist_url == "env://" and args.world_size == -1: args.world_size = int(os.environ["WORLD_SIZE"]) @@ -138,8 +221,10 @@ def main_worker(gpu, ngpus_per_node, args): # suppress printing if not master if args.multiprocessing_distributed and args.gpu != 0: + def print_pass(*args): pass + builtins.print = print_pass if args.gpu is not None: @@ -152,13 +237,22 @@ def print_pass(*args): # For multiprocessing distributed training, rank needs to be the # global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu - dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) + dist.init_process_group( + backend=args.dist_backend, + init_method=args.dist_url, + world_size=args.world_size, + rank=args.rank, + ) # create model print("=> creating model '{}'".format(args.arch)) - model = moco.builder.MoCo( + model = deeplearning.cross_image_ssl.moco.builder.MoCo( models.__dict__[args.arch], - args.moco_dim, args.moco_k, args.moco_m, args.moco_t, args.mlp) + args.moco_dim, + args.moco_k, + args.moco_m, + args.moco_t, + args.mlp, + ) print(model) if args.distributed: @@ -173,7 +267,9 @@ def print_pass(*args): # ourselves based on the total number of GPUs we have args.batch_size = int(args.batch_size / ngpus_per_node) args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.gpu] + ) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all @@ -192,9 +288,12 @@ def print_pass(*args): # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda(args.gpu) - optimizer = torch.optim.SGD(model.parameters(), args.lr, - momentum=args.momentum, - weight_decay=args.weight_decay) + optimizer = torch.optim.SGD( + model.parameters(), + args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) # optionally resume from a checkpoint if args.resume: @@ -204,49 +303,60 @@ def print_pass(*args): checkpoint = torch.load(args.resume) else: # Map model to be loaded to specified single gpu. 
- loc = 'cuda:{}'.format(args.gpu) + loc = "cuda:{}".format(args.gpu) checkpoint = torch.load(args.resume, map_location=loc) - args.start_epoch = checkpoint['epoch'] - model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - print("=> loaded checkpoint '{}' (epoch {})" - .format(args.resume, checkpoint['epoch'])) + args.start_epoch = checkpoint["epoch"] + model.load_state_dict(checkpoint["state_dict"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + print( + "=> loaded checkpoint '{}' (epoch {})".format( + args.resume, checkpoint["epoch"] + ) + ) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True # Data loading code - traindir = os.path.join(args.data, 'train') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) + traindir = os.path.join(args.data, "train") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) if args.aug_plus: # MoCo v2's aug: similar to SimCLR https://arxiv.org/abs/2002.05709 augmentation = [ - transforms.RandomResizedCrop(224, scale=(0.2, 1.)), - transforms.RandomApply([ - transforms.ColorJitter(0.4, 0.4, 0.4, 0.1) # not strengthened - ], p=0.8), + transforms.RandomResizedCrop(224, scale=(0.2, 1.0)), + transforms.RandomApply( + [transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], + p=0.8, # not strengthened + ), transforms.RandomGrayscale(p=0.2), - transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=0.5), + transforms.RandomApply( + [deeplearning.cross_image_ssl.moco.loader.GaussianBlur([0.1, 2.0])], + p=0.5, + ), transforms.RandomHorizontalFlip(), transforms.ToTensor(), - normalize + normalize, ] else: # MoCo v1's aug: the same as InstDisc https://arxiv.org/abs/1805.01978 augmentation = [ - transforms.RandomResizedCrop(224, scale=(0.2, 1.)), + transforms.RandomResizedCrop(224, scale=(0.2, 1.0)), transforms.RandomGrayscale(p=0.2), transforms.ColorJitter(0.4, 0.4, 0.4, 0.4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), - normalize + normalize, ] train_dataset = datasets.ImageFolder( traindir, - moco.loader.TwoCropsTransform(transforms.Compose(augmentation))) + deeplearning.cross_image_ssl.moco.loader.TwoCropsTransform( + transforms.Compose(augmentation) + ), + ) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) @@ -254,8 +364,14 @@ def print_pass(*args): train_sampler = None train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), - num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True) + train_dataset, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + num_workers=args.workers, + pin_memory=True, + sampler=train_sampler, + drop_last=True, + ) for epoch in range(args.start_epoch, args.epochs): if args.distributed: @@ -265,26 +381,32 @@ def print_pass(*args): # train for one epoch train(train_loader, model, criterion, optimizer, epoch, args) - if not args.multiprocessing_distributed or (args.multiprocessing_distributed - and args.rank % ngpus_per_node == 0): - save_checkpoint({ - 'epoch': epoch + 1, - 'arch': args.arch, - 'state_dict': model.state_dict(), - 'optimizer' : optimizer.state_dict(), - }, is_best=False, filename='checkpoint_{:04d}.pth.tar'.format(epoch)) + if not args.multiprocessing_distributed or ( + args.multiprocessing_distributed and args.rank % ngpus_per_node == 0 + ): + save_checkpoint( + { + "epoch": epoch + 1, 
+ "arch": args.arch, + "state_dict": model.state_dict(), + "optimizer": optimizer.state_dict(), + }, + is_best=False, + filename="checkpoint_{:04d}.pth.tar".format(epoch), + ) def train(train_loader, model, criterion, optimizer, epoch, args): - batch_time = AverageMeter('Time', ':6.3f') - data_time = AverageMeter('Data', ':6.3f') - losses = AverageMeter('Loss', ':.4e') - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') + batch_time = AverageMeter("Time", ":6.3f") + data_time = AverageMeter("Data", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") progress = ProgressMeter( len(train_loader), [batch_time, data_time, losses, top1, top5], - prefix="Epoch: [{}]".format(epoch)) + prefix="Epoch: [{}]".format(epoch), + ) # switch to train mode model.train() @@ -322,15 +444,16 @@ def train(train_loader, model, criterion, optimizer, epoch, args): progress.display(i) -def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): +def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"): torch.save(state, filename) if is_best: - shutil.copyfile(filename, 'model_best.pth.tar') + shutil.copyfile(filename, "model_best.pth.tar") -class AverageMeter(object): +class AverageMeter: """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): + + def __init__(self, name, fmt=":f"): self.name = name self.fmt = fmt self.reset() @@ -348,11 +471,11 @@ def update(self, val, n=1): self.avg = self.sum / self.count def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" return fmtstr.format(**self.__dict__) -class ProgressMeter(object): +class ProgressMeter: def __init__(self, num_batches, meters, prefix=""): self.batch_fmtstr = self._get_batch_fmtstr(num_batches) self.meters = meters @@ -361,24 +484,24 @@ def __init__(self, num_batches, meters, prefix=""): def display(self, batch): entries = [self.prefix + self.batch_fmtstr.format(batch)] entries += [str(meter) for meter in self.meters] - print('\t'.join(entries)) + print("\t".join(entries)) def _get_batch_fmtstr(self, num_batches): num_digits = len(str(num_batches // 1)) - fmt = '{:' + str(num_digits) + 'd}' - return '[' + fmt + '/' + fmt.format(num_batches) + ']' + fmt = "{:" + str(num_digits) + "d}" + return "[" + fmt + "/" + fmt.format(num_batches) + "]" def adjust_learning_rate(optimizer, epoch, args): """Decay the learning rate based on schedule""" lr = args.lr if args.cos: # cosine lr schedule - lr *= 0.5 * (1. + math.cos(math.pi * epoch / args.epochs)) + lr *= 0.5 * (1.0 + math.cos(math.pi * epoch / args.epochs)) else: # stepwise lr schedule for milestone in args.schedule: - lr *= 0.1 if epoch >= milestone else 1. + lr *= 0.1 if epoch >= milestone else 1.0 for param_group in optimizer.param_groups: - param_group['lr'] = lr + param_group["lr"] = lr def accuracy(output, target, topk=(1,)): @@ -398,5 +521,5 @@ def accuracy(output, target, topk=(1,)): return res -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/moco/__init__.py b/moco/__init__.py index 168f9979a..523441393 100644 --- a/moco/__init__.py +++ b/moco/__init__.py @@ -1 +1,4 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
diff --git a/moco/builder.py b/moco/builder.py
index 7d80fe996..7952a981c 100644
--- a/moco/builder.py
+++ b/moco/builder.py
@@ -1,4 +1,8 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
 
 import torch
 import torch.nn as nn
@@ -8,6 +12,7 @@ class MoCo(nn.Module):
     Build a MoCo model with: a query encoder, a key encoder, and a queue
     https://arxiv.org/abs/1911.05722
     """
+
     def __init__(self, base_encoder, dim=128, K=65536, m=0.999, T=0.07, mlp=False):
         """
         dim: feature dimension (default: 128)
@@ -28,10 +33,16 @@ def __init__(self, base_encoder, dim=128, K=65536, m=0.999, T=0.07, mlp=False):
         if mlp:  # hack: brute-force replacement
             dim_mlp = self.encoder_q.fc.weight.shape[1]
-            self.encoder_q.fc = nn.Sequential(nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_q.fc)
-            self.encoder_k.fc = nn.Sequential(nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_k.fc)
-
-        for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
+            self.encoder_q.fc = nn.Sequential(
+                nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_q.fc
+            )
+            self.encoder_k.fc = nn.Sequential(
+                nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_k.fc
+            )
+
+        for param_q, param_k in zip(
+            self.encoder_q.parameters(), self.encoder_k.parameters()
+        ):
             param_k.data.copy_(param_q.data)  # initialize
             param_k.requires_grad = False  # not update by gradient
@@ -46,8 +57,10 @@ def _momentum_update_key_encoder(self):
         """
         Momentum update of the key encoder
         """
-        for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
-            param_k.data = param_k.data * self.m + param_q.data * (1. - self.m)
+        for param_q, param_k in zip(
+            self.encoder_q.parameters(), self.encoder_k.parameters()
+        ):
+            param_k.data = param_k.data * self.m + param_q.data * (1.0 - self.m)
 
     @torch.no_grad()
     def _dequeue_and_enqueue(self, keys):
@@ -60,7 +73,7 @@ def _dequeue_and_enqueue(self, keys):
         assert self.K % batch_size == 0  # for simplicity
 
         # replace the keys at ptr (dequeue and enqueue)
-        self.queue[:, ptr:ptr + batch_size] = keys.T
+        self.queue[:, ptr : ptr + batch_size] = keys.T
         ptr = (ptr + batch_size) % self.K  # move pointer
 
         self.queue_ptr[0] = ptr
@@ -141,9 +154,9 @@ def forward(self, im_q, im_k):
         # compute logits
         # Einstein sum is more intuitive
         # positive logits: Nx1
-        l_pos = torch.einsum('nc,nc->n', [q, k]).unsqueeze(-1)
+        l_pos = torch.einsum("nc,nc->n", [q, k]).unsqueeze(-1)
         # negative logits: NxK
-        l_neg = torch.einsum('nc,ck->nk', [q, self.queue.clone().detach()])
+        l_neg = torch.einsum("nc,ck->nk", [q, self.queue.clone().detach()])
 
         # logits: Nx(1+K)
         logits = torch.cat([l_pos, l_neg], dim=1)
@@ -167,8 +180,9 @@ def concat_all_gather(tensor):
     Performs all_gather operation on the provided tensors.
     *** Warning ***: torch.distributed.all_gather has no gradient.
     """
-    tensors_gather = [torch.ones_like(tensor)
-        for _ in range(torch.distributed.get_world_size())]
+    tensors_gather = [
+        torch.ones_like(tensor) for _ in range(torch.distributed.get_world_size())
+    ]
     torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
 
     output = torch.cat(tensors_gather, dim=0)
diff --git a/moco/loader.py b/moco/loader.py
index 655aea5bf..d06abb2f7 100644
--- a/moco/loader.py
+++ b/moco/loader.py
@@ -1,7 +1,12 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
-from PIL import ImageFilter
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
 import random
+from PIL import ImageFilter
+
 
 class TwoCropsTransform:
     """Take two random crops of one image as the query and key."""
@@ -15,10 +20,10 @@ def __call__(self, x):
         return [q, k]
 
 
-class GaussianBlur(object):
+class GaussianBlur:
     """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709"""
 
-    def __init__(self, sigma=[.1, 2.]):
+    def __init__(self, sigma=[0.1, 2.0]):
        self.sigma = sigma
 
     def __call__(self, x):
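Aside on the builder.py mechanics touched above: the reformatted lines implement MoCo's two core updates, the momentum (EMA) copy of the query encoder into the key encoder, and the ring-buffer key queue. A toy-scale sketch of both, with dim=4, K=8, and a batch of 2 standing in for the real dim=128, K=65536:

import torch

m = 0.999  # momentum coefficient, as in MoCo(m=0.999)

# Momentum update: key parameters drift slowly toward query parameters.
param_q = torch.randn(4)
param_k = torch.randn(4)
param_k = param_k * m + param_q * (1.0 - m)

# Dequeue and enqueue: overwrite the oldest keys, then advance the pointer.
dim, K, batch = 4, 8, 2
queue = torch.randn(dim, K)
ptr = 0
keys = torch.randn(batch, dim)  # normalized key features in the real code
queue[:, ptr : ptr + batch] = keys.T
ptr = (ptr + batch) % K  # wraps around, so the queue acts as a ring buffer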