Commit: OSS Playground modularized model components (facebookarchive#2059)

Relatively independent feature. Tested and reviewed. Should be safe to merge.
Showing 20 changed files with 1,737 additions and 0 deletions.
caffe2/contrib/playground/AnyExpOnTerm.py:

```python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import json
import caffe2.contrib.playground.AnyExp as AnyExp

import logging
logging.basicConfig()
log = logging.getLogger("AnyExpOnTerm")
log.setLevel(logging.DEBUG)


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Any Experiment training.')
    parser.add_argument("--parameters-json", type=json.loads,
                        help='model options in json format', dest="params")

    args = parser.parse_args()
    opts = args.params['opts']
    opts = AnyExp.initOpts(opts)
    log.info('opts is: {}'.format(str(opts)))

    AnyExp.initDefaultModuleMap()

    opts['input']['datasets'] = AnyExp.aquireDatasets(opts)

    # defined this way so that AnyExp.trainFun(opts) can be replaced with
    # some other customized training function.
    ret = AnyExp.runShardedTrainLoop(opts, AnyExp.trainFun())

    log.info('ret is: {}'.format(str(ret)))
```
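For reference, `--parameters-json` leans on argparse applying `json.loads` to the raw argument string, so `args.params` arrives as an already-parsed nested dict. A minimal, self-contained illustration of that mechanism (the JSON fragment here is invented):

```python
import argparse
import json

# type=json.loads makes argparse parse the JSON string during argument parsing.
parser = argparse.ArgumentParser()
parser.add_argument("--parameters-json", type=json.loads, dest="params")

args = parser.parse_args(
    ['--parameters-json', '{"opts": {"distributed": {"num_shards": 1}}}'])
assert args.params["opts"]["distributed"]["num_shards"] == 1
```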
caffe2/contrib/playground/ModuleRegister.py:

```python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import inspect
import logging
logging.basicConfig()
log = logging.getLogger("ModuleRegister")
log.setLevel(logging.DEBUG)

MODULE_MAPS = []


def registerModuleMap(module_map):
    MODULE_MAPS.append(module_map)
    log.info("ModuleRegister get modules from ModuleMap content: {}".
             format(inspect.getsource(module_map)))


def constructTrainerClass(myTrainerClass, opts):

    log.info("ModuleRegister, myTrainerClass name is {}".
             format(myTrainerClass.__name__))
    log.info("ModuleRegister, myTrainerClass type is {}".
             format(type(myTrainerClass)))
    log.info("ModuleRegister, myTrainerClass dir is {}".
             format(dir(myTrainerClass)))

    myInitializeModelModule = getModule(opts['model']['model_name_py'])
    log.info("ModuleRegister, myInitializeModelModule dir is {}".
             format(dir(myInitializeModelModule)))

    myTrainerClass.init_model = myInitializeModelModule.init_model
    myTrainerClass.run_training_net = myInitializeModelModule.run_training_net
    myTrainerClass.fun_per_iter_b4RunNet = \
        myInitializeModelModule.fun_per_iter_b4RunNet
    myTrainerClass.fun_per_epoch_b4RunNet = \
        myInitializeModelModule.fun_per_epoch_b4RunNet

    myInputModule = getModule(opts['input']['input_name_py'])
    log.info("ModuleRegister, myInputModule {} dir is {}".
             format(opts['input']['input_name_py'], myInputModule.__name__))

    # Override input methods of the myTrainerClass class
    myTrainerClass.get_input_dataset = myInputModule.get_input_dataset
    myTrainerClass.get_model_input_fun = myInputModule.get_model_input_fun
    myTrainerClass.gen_input_builder_fun = myInputModule.gen_input_builder_fun

    # myForwardPassModule = GetForwardPassModule(opts)
    myForwardPassModule = getModule(opts['model']['forward_pass_py'])
    myTrainerClass.gen_forward_pass_builder_fun = \
        myForwardPassModule.gen_forward_pass_builder_fun

    myParamUpdateModule = getModule(opts['model']['parameter_update_py'])
    myTrainerClass.gen_param_update_builder_fun = \
        myParamUpdateModule.gen_param_update_builder_fun \
        if myParamUpdateModule is not None else None

    myOptimizerModule = getModule(opts['model']['optimizer_py'])
    myTrainerClass.gen_optimizer_fun = \
        myOptimizerModule.gen_optimizer_fun \
        if myOptimizerModule is not None else None

    myRendezvousModule = getModule(opts['model']['rendezvous_py'])
    myTrainerClass.gen_rendezvous_ctx = \
        myRendezvousModule.gen_rendezvous_ctx \
        if myRendezvousModule is not None else None

    # override output module
    myOutputModule = getModule(opts['output']['gen_output_py'])

    log.info("ModuleRegister, myOutputModule is {}".
             format(myOutputModule.__name__))
    myTrainerClass.fun_conclude_operator = myOutputModule.fun_conclude_operator
    myTrainerClass.assembleAllOutputs = myOutputModule.assembleAllOutputs

    return myTrainerClass


def getModule(moduleName):
    log.info("MODULE_MAPS content {}".format(str(MODULE_MAPS)))
    myModule = None
    for ModuleMap in MODULE_MAPS:
        log.info("iterate through MODULE_MAPS content {}".
                 format(str(ModuleMap)))
        for name, obj in inspect.getmembers(ModuleMap):
            log.info("iterate through MODULE_MAPS a name {}".format(str(name)))
            if name == moduleName:
                log.info("AnyExp get module {} with source:{}".
                         format(moduleName, inspect.getsource(obj)))
                myModule = obj
                return myModule
    return None


def getClassFromModule(moduleName, className):
    myClass = None
    for ModuleMap in MODULE_MAPS:
        for name, obj in inspect.getmembers(ModuleMap):
            if name == moduleName:
                log.info("ModuleRegistry from module {} get class {} of source:{}".
                         format(moduleName, className, inspect.getsource(obj)))
                myClass = getattr(obj, className)
                return myClass
    return None
```
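A usage sketch of the registry above. `my_module_map` is a hypothetical stand-in for whatever module map a project registers (the demo presumably registers its own through `AnyExp.initDefaultModuleMap()`):

```python
import caffe2.contrib.playground.ModuleRegister as ModuleRegister
import my_module_map  # hypothetical: any module whose members are your components

# Register once; getModule then resolves components by the string names in opts.
ModuleRegister.registerModuleMap(my_module_map)

# Returns the member of my_module_map named "caffe2_resnet50_default_forward",
# or None if no registered map defines that name.
forward_module = ModuleRegister.getModule("caffe2_resnet50_default_forward")
```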
caffe2/contrib/playground/README.md:

# Playground for Caffe2 Models

Playground was created to let modelers reuse the components of their models. It is built on Caffe2's data parallel model. Playground provides a framework that takes care of the regular trainer iteration procedure and abstracts out APIs through which users can plug in customized model components. Users can swap, exchange, and reuse these components without rewriting the whole training script. Once the components are in place, a parameterized launch command drives the experiments, which makes it convenient to create a large number of experiments with different components defined for each. It can be used as a tool to explore different model architectures and algorithms, such as optimizers, momentum, learning rates, or batch normalization parameters.

Playground project highlights:
1. Parameter driven: no need to create a py script for each experiment; just swap components using parameters. Highly customizable: add your own components and pass your own opts on the command line as needed.
2. All models follow a typical train/test epoch iteration, and many aspects can be customized, for example running epochs until a target loss is reached instead of for a predetermined number of iterations.
3. Customizable components and training metrics, also specified with parameters.
4. GPU and CPU training supported.
5. Parallel training on multiple hosts supported.
6. Checkpoints and pre-trained models help recover interrupted or failed experiments.

### Example Usage
Playground comes with a resnet50 example, located in the resnet50demo folder. To see how Playground works, do the following:

1. Make sure your Caffe2 build succeeds with the OpenCV and lmdb dependencies enabled.

2. Make sure your training/testing datasets are ready in folders accessible to the trainer / distributed trainers.

3. Specify a folder in which you would like to store your checkpointed model files.

4. Use this command to launch a training run, then verify that epochs are running, metrics are reported in the log, and model files are stored in your checkpoint folder:

```
$ python caffe2/contrib/playground/AnyExpOnTerm.py --parameters-json '{
"opts":{

    "input":{
        "input_name_py":"gfs_IN1k",
        "train_input_path":"/mnt/vol/gfsai-oregon/ai-group/datasets/imagenet_lmdb/ilsvrc12_train_lmdb/",
        "test_input_path":"/mnt/vol/gfsai-oregon/ai-group/datasets/imagenet_lmdb/ilsvrc12_val_lmdb",
        "scale_jitter_type": 1, "color_jitter": true, "color_lighting": true,
        "namespace": "aml", "table": "imagenet_data", "column_handle": "everstore_handle",
        "column_label": "label", "column_id": "image_id", "label_type": 0,
        "train_partition": {"ds": "2017-07-31", "config": "imagenet1k", "is_train": "1"},
        "test_partition": {"ds": "2017-07-31", "config": "imagenet1k", "is_train": "0"},
        "num_classes":1000, "loadsize" : 256, "imsize": 224, "decode_threads": 8, "datasets":[]},

    "model":{
        "model_name_py":"IN1k_resnet50",
        "forward_pass_py":"caffe2_resnet50_default_forward",
        "parameter_update_py":"explicit_resnet_param_update",
        "optimizer_py":"",
        "rendezvous_py":"rendezvous_filestore"},

    "model_param":{
        "pretrained_model":"", "reset_epoch":true, "memonger" : true, "cuda_nccl": true,
        "combine_spatial_bn":true, "max_concurrent_distributed_ops" : 16,
        "base_learning_rate":0.05, "bn_epsilon":0.00001, "bn_momentum":0.9, "custom_bn_init": true,
        "bn_init_gamma":1e-323, "weight_decay":1e-4, "weight_decay_bn":1e-323},

    "epoch_iter":{
        "num_train_sample_per_epoch":512,
        "num_test_sample": 250,
        "num_epochs":10,
        "num_epochs_per_flow_schedule":5,
        "num_train_iteration_per_test": 10,
        "batch_per_device":32,
        "num_test_iter":2},

    "distributed":{
        "num_shards":1,
        "num_gpus":2,
        "first_gpu_id":0,
        "num_cpus":4,
        "first_cpu_id":0},

    "output":{
        "gen_output_py":"output_generator",
        "gen_checkpoint_path_py":"gen_checkpoint_path",
        "checkpoint_folder":"/home/diyu/model_checkpoint/",
        "metrics":[
            {"name":"train_loss",
             "meter_py":"ComputeLoss",
             "meter_kargs":{"blob_name":"loss"},
             "is_train":true},
            {"name":"test_loss",
             "meter_py":"ComputeLoss",
             "meter_kargs":{"blob_name":"loss"},
             "is_train":false},
            {"name":"train_accuracy_top1",
             "meter_py":"ComputeTopKAccuracy",
             "meter_kargs":{"blob_name":["softmax", "label"], "topk":1},
             "is_train":true},
            {"name":"train_accuracy_top5",
             "meter_py":"ComputeTopKAccuracy",
             "meter_kargs":{"blob_name":["softmax", "label"], "topk":5},
             "is_train":true},
            {"name":"test_accuracy_top1",
             "meter_py":"ComputeTopKAccuracy",
             "meter_kargs":{"blob_name":["softmax", "label"], "topk":1},
             "is_train":false},
            {"name":"test_accuracy_top5",
             "meter_py":"ComputeTopKAccuracy",
             "meter_kargs":{"blob_name":["softmax", "label"], "topk":5},
             "is_train":false}],
        "plots":[
            {"x":"", "x_title":"", "ys":["train_loss", "test_loss"],
             "y_title":"train and test loss"},
            {"x":"epochs", "x_title":"epochs",
             "ys":["train_accuracy_top1","test_accuracy_top1",
                   "train_accuracy_top5","test_accuracy_top5"],
             "y_title":""}]}}

}'
```

5. Now you can switch to different components supplied in the resnet50demo folder, like so:

        "forward_pass_py":"caffe2_resnet50_default_forward" --> "explicit_resnet_forward" (which is a resnet100 model)

   and/or

        "parameter_update_py":"caffe2_resnet50_default_param_update" --> "explicit_resnet_param_update"

   Playground should then launch the training epochs and give you results.

### General Usage Guideline

1. Mandatory non-empty opts: input_name_py, datasets, model_name_py, forward_pass_py, (parameter_update_py or optimizer_py), rendezvous_py, memonger, all epoch_iter opts, all distributed opts, gen_output_py.

2. Mandatory but nullable opts: pretrained_model, max_concurrent_distributed_ops, combine_spatial_bn.

3. The rest of the opts are module dependent and can be changed or removed.

4. To specify any additional opts your modules need, add them directly to the command-line opts dictionary; there is no need to change any py code. Your modules are responsible for knowing how to handle these new opts. You access your own opts in this manner: self.opt['your_own_arg']['your_own_sub_arg'].
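For illustration, a hedged sketch; the method name and the 'my_schedule'/'warmup_rate' opts below are invented, and the accessor simply mirrors the pattern above:

```python
# Hypothetical method on your model/trainer class; 'my_schedule' and
# 'warmup_rate' are invented opts purely for illustration.
def my_custom_step(self):
    warmup_rate = self.opt['my_schedule']['warmup_rate']
    print('using warmup rate {}'.format(warmup_rate))
```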

5. Checkpointing is performed at the end of each epoch by default, and the generated model file path can be found in the log. Each checkpoint can be used as a pre-trained model to start a new experiment. Make sure the new experiment is compatible with the pre-trained model you specify: for example, a GPU experiment and a CPU experiment cannot share checkpoints because their blob names differ, and in general experiments with different blob names cannot share checkpoints.

6. The metrics and plots are reported when the experiment finishes running. Intermediate results are reported in the log of the CreateTrainerAndRunManyEpochs operator as the iterations proceed.

7. If num_gpus is specified, the trainer will try to use GPUs; if num_gpus = 0, the trainer will train on CPU. For GPU training, batch_per_device is typically 32; for CPU training, batch_per_device is normally set to 2, with num_cpus set higher, say 8 or 16, depending on your machine's configuration.
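As a back-of-the-envelope illustration using the demo opts above (a rough calculation, not Playground's internal bookkeeping):

```python
# Effective per-iteration batch and iterations per epoch for the demo opts.
batch_per_device = 32
num_gpus = 2
num_shards = 1
num_train_sample_per_epoch = 512

global_batch = batch_per_device * num_gpus * num_shards        # 64 images/iter
iters_per_epoch = num_train_sample_per_epoch // global_batch   # 8 iterations
print(global_batch, iters_per_epoch)
```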

8. If training on a single host, set "num_shards" to 1; with multiple hosts, specify your "num_shards" and start parallelized training from each shard, similar to the resnet50_trainer.py example in the caffe2/python/examples/ folder.

### Develop Your Own Components

1. Create a folder for your own experiment under caffe2/contrib/playground/ and go to this folder.

2. Create a base model file, for example IN1kResnet50.py. In this script, implement the init function; in it, instantiate your train/test models and assign them to self.train_model and self.test_model. In this base model class you can also choose to override other functions you'd like to customize; for example, if you want to iterate according to accuracy instead of a fixed number of loops, override list_of_epochs() and list_of_epoch_iters().
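A skeletal sketch of such a base model module. As ModuleRegister above shows, these module-level functions get bound onto the trainer class, so `self` is the trainer instance; `build_resnet50` is a hypothetical placeholder for your model-construction helper:

```python
# Skeleton of a base model module in the spirit of IN1kResnet50.py.
from caffe2.python import workspace

def init_model(self):
    # build_resnet50 is a hypothetical placeholder; the real demo constructs
    # its nets with data_parallel_model.
    self.train_model = build_resnet50(self.opts, is_test=False)
    self.test_model = build_resnet50(self.opts, is_test=True)

def run_training_net(self):
    # one pass over the already-created training net
    workspace.RunNet(self.train_model.net.Proto().name)
```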

3. Create component py scripts implementing the generator arguments of data_parallel_model.Parallelize(). There are four in total: input_builder_fun, forward_pass_builder_fun, one of param_update_builder_fun or optimizer_builder_fun, and rendezvous. This is where you can switch between different components. For example, for the demo IN1k_resnet50 experiments, two different forward functions were created: explicit_resnet_forward.py and caffe2_resnet50_default_forward.py. Both implement the API gen_forward_pass_builder_fun, which is an abstract method in the framework class AnyExp.py.
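For instance, a forward-pass component module would expose `gen_forward_pass_builder_fun`. The outer signature below is an assumption (ModuleRegister binds it onto the trainer class); the inner closure follows data_parallel_model's forward_pass_builder_fun contract of building the per-device ops and returning the loss blob(s):

```python
# Sketch of a forward-pass component module; body deliberately omitted.
def gen_forward_pass_builder_fun(self, model, dataset, is_train):
    def create_model_ops(model, loss_scale):
        # conv / bn / fc / softmax-loss construction goes here; see
        # caffe2_resnet50_default_forward.py in the demo for the real thing.
        return []
    return create_model_ops
```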

4. Next, import the module components you created into module_map.py. This import is needed to include these packages during the build. Give each imported module a module name: normally, if the module is just a simple file containing some functions, use the py script's file name; if the module contains a class and the class is what the consumer needs, name it with the class name, as with the meter classes like compute_loss. When launching your experiment, for each "xxx_py" term in opts fill in the name you chose in module_map.py; Playground will find your module and load it.
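A sketch of what module_map.py entries can look like; the import paths assume the demo layout described above and may differ in your tree:

```python
# module_map.py sketch: importing here is what makes components visible to
# ModuleRegister's name lookup.

# A plain function module keeps its script name:
import caffe2.contrib.playground.resnet50demo.caffe2_resnet50_default_forward \
    as caffe2_resnet50_default_forward

# A module consumed through its class is bound under the class name:
from caffe2.contrib.playground.compute_loss import ComputeLoss
```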

5. Create as many modules as you need. Then, when you run an experiment, specify the modules you want in the corresponding opts, and you can run your experiment with ease.

6. In the demo, the opts item "gen_output_py" uses output_generator.py, which provides a minimal way of generating the final experimental results, stored in the form of a dict. This lets users do whatever visualization they want with the data after training is finished.
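Conceptually, an output module boils down to the two hooks ModuleRegister binds, `fun_conclude_operator` and `assembleAllOutputs`; a minimal sketch with illustrative field names:

```python
# Minimal output-module sketch; field names are illustrative only.
def fun_conclude_operator(self):
    pass  # last-chance cleanup after training; nothing required here

def assembleAllOutputs(self):
    # hand back whatever callers should inspect or plot, as a plain dict
    return {
        'opts': getattr(self, 'opts', None),
        'metrics': getattr(self, 'metrics', {}),  # illustrative field
    }
```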

7. Customize your experimental results. A meter interface is provided for implementing your own metric calculators; see compute_loss.py and compute_topk_accuracy.py for examples. For training metrics, results are calculated right away in each iteration; for testing metrics, results are accumulated over the whole loop and calculated once the test iterations finish. Once your meter class is defined, you can specify which metrics to report in your opts['output']['metrics'] list. The names you give your metrics can later be used when you define your plots. Playground always records the throughput metrics secs_per_train and samples_per_sec.
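The meter base interface itself is not shown in this commit, so the sketch below only assumes the pattern the demo meters suggest: constructed from meter_kargs, updated each iteration, reduced at the end. The hook names here are assumptions, not Playground's confirmed API:

```python
# Hypothetical meter in the style of compute_loss.py; the real interface in
# Playground may use different hook names.
class ComputeBlobMean(object):
    def __init__(self, opts=None, blob_name=None):
        self.blob_name = blob_name
        self.values = []

    def Reset(self):            # assumed hook: called before a test loop
        self.values = []

    def Add(self, value):       # assumed hook: called once per iteration
        self.values.append(value)

    def Compute(self):          # assumed hook: reduce accumulated values
        return sum(self.values) / max(len(self.values), 1)
```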