Skip to content

Commit

Permalink
[tests] Switch to OverfitTeacher for some model tests. (facebookresea…
Browse files Browse the repository at this point in the history
…rch#3055)

* [tests] Switch to OverfitTeacher for some model tests.

* Black.

* Improve on distributed training.

* Screw this test.

* Try lowering parallelism.

* Looser requirements on test_distributed.

* Speed up drqa test.

* Faster dynamic batch test.

* Speed up deepcopies of opts.

* Fewer deepcopies.

* Verify data relaxation.

* Robust

* Lint.

* make hred more reliable

* try again.

* Speed up beam block test.

* Speed up more generation tests.

* Bring back one more retry.

* Whoops.

* Always kill process group, I guess.

* We can save the company some $

* Another test combo failure.

* Lint.

* Use constants.
  • Loading branch information
stephenroller authored Sep 15, 2020
1 parent 33c4e6b commit cd20fd3
Show file tree
Hide file tree
Showing 15 changed files with 348 additions and 547 deletions.
14 changes: 7 additions & 7 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ jobs:
unittests_osx:
<<: *osx_cpu37
working_directory: ~/ParlAI
parallelism: 8
parallelism: 2
steps:
- checkout
- <<: *fixgit
Expand Down Expand Up @@ -220,7 +220,7 @@ jobs:
unittests_36:
<<: *standard_cpu36
working_directory: ~/ParlAI
parallelism: 16
parallelism: 8
steps:
- checkout
- <<: *fixgit
Expand Down Expand Up @@ -252,7 +252,7 @@ jobs:
unittests_38:
<<: *standard_cpu38
working_directory: ~/ParlAI
parallelism: 16
parallelism: 2
steps:
- checkout
- <<: *fixgit
Expand Down Expand Up @@ -284,7 +284,7 @@ jobs:
unittests_37:
<<: *standard_cpu37
working_directory: ~/ParlAI
parallelism: 16
parallelism: 8
steps:
- checkout
- <<: *fixgit
Expand Down Expand Up @@ -316,7 +316,7 @@ jobs:
unittests_gpu14:
<<: *gpu
working_directory: ~/ParlAI
parallelism: 16
parallelism: 2
steps:
- checkout
- <<: *fixgit
Expand Down Expand Up @@ -355,7 +355,7 @@ jobs:
unittests_gpu15:
<<: *gpu
working_directory: ~/ParlAI
parallelism: 16
parallelism: 2
steps:
- checkout
- <<: *fixgit
Expand Down Expand Up @@ -394,7 +394,7 @@ jobs:
unittests_gpu16:
<<: *gpu
working_directory: ~/ParlAI
parallelism: 16
parallelism: 8
steps:
- checkout
- <<: *fixgit
Expand Down
4 changes: 2 additions & 2 deletions parlai/core/opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self, *args, **kwargs):
self.deepcopies = []

def __setitem__(self, key, val):
    """
    Set a key, recording where the assignment came from for debugging.

    ``limit=2`` keeps the stack formatting cheap by capturing only the two
    most recent frames; index ``-2`` selects the caller's frame (the line
    that performed the assignment).
    """
    # NOTE: the scraped diff showed both the pre- and post-change versions of
    # this line; only the post-change, limited-stack form is kept.
    loc = traceback.format_stack(limit=2)[-2]
    self.history.append((key, val, loc))
    super().__setitem__(key, val)

Expand All @@ -64,7 +64,7 @@ def __deepcopy__(self, memo):
Override deepcopy so that history is copied over to new object.
"""
# track location of deepcopy
loc = traceback.format_stack()[-3]
loc = traceback.format_stack(limit=3)[-3]
self.deepcopies.append(loc)
# copy all our children
memo = Opt({k: copy.deepcopy(v) for k, v in self.items()})
Expand Down
6 changes: 3 additions & 3 deletions parlai/core/worlds.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def clone(self):
"""
Create a duplicate of the world.
"""
return type(self)(opt=copy.deepcopy(self.opt), agents=None, shared=self.share())
return type(self)(opt=self.opt, agents=None, shared=self.share())

def _share_agents(self):
"""
Expand Down Expand Up @@ -519,14 +519,14 @@ def __init__(self, opt: Opt, agents=None, shared=None, default_world=None):
for index, k in enumerate(opt['task'].split(',')):
k = k.strip()
if k:
opt_singletask = copy.deepcopy(opt)
opt_singletask['task'] = k
if shared:
# Create worlds based on shared data.
s = shared['worlds'][index]
self.worlds.append(s['world_class'](s['opt'], None, s))
else:
# Agents are already specified.
opt_singletask = copy.deepcopy(opt)
opt_singletask['task'] = k
self.worlds.append(
create_task_world(
opt_singletask, agents, default_world=default_world
Expand Down
5 changes: 4 additions & 1 deletion parlai/scripts/train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,8 +475,11 @@ def validate(self):
self.save_model()
self.saved = True
if (
opt['validation_metric'] == 'accuracy'
opt['validation_metric_mode'] == 'max'
and self.best_valid >= opt['validation_cutoff']
) or (
opt['validation_metric_mode'] == 'min'
and self.best_valid <= opt['validation_cutoff']
):
logging.info('task solved! stopping.')
return True
Expand Down
116 changes: 49 additions & 67 deletions parlai/tasks/integration_tests/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,55 @@ def setup_data(self, fold):
yield (text, [text], 0, cands), True


class OverfitTeacher(CandidateTeacher, DialogTeacher):
    """
    Teacher that serves a tiny, fixed corpus so a model can overfit quickly.

    Regardless of the requested fold, data always comes from the train fold,
    so train/valid/test all see identical examples and a model can drive its
    loss toward zero in very few epochs.
    """

    @classmethod
    def add_cmdline_args(cls, argparser):
        # fixed: first parameter of a @classmethod is conventionally `cls`,
        # not `self` (the original shadowed the instance-method convention).
        argparser.add_argument('--corpus-size', default=4, type=int)

    def __init__(self, opt, shared=None):
        # number of distinct examples served; must be set before super().__init__
        # because the parent may call setup_data during construction
        self.corpussize = opt.get('corpus_size', 4)
        super().__init__(opt, shared)

    def setup_data(self, fold):
        # deliberately ignore `fold`: always load 'train' so every fold
        # yields the same data (the point of an overfit test)
        super()._setup_data('train')
        for i, text in enumerate(self.corpus[: self.corpussize]):
            cands = []
            for j in range(NUM_CANDIDATES):
                # rotate through the full corpus so candidate lists differ
                # per example while always containing the true label
                offset = (i + j) % len(self.corpus)
                cands.append(self.corpus[offset])
            # (query, labels, reward, label_candidates), episode_done
            yield (text, [text], 0, cands), True

    def num_examples(self):
        return self.corpussize

    def num_episodes(self):
        # one single-turn episode per example
        return self.corpussize


class OverfitMultiturnTeacher(CandidateTeacher, DialogTeacher):
    """
    Multiturn variant of the overfit teacher.

    Each corpus sentence is expanded into growing word prefixes
    ("a", "a b", "a b c", ...), every prefix labeled with the full sentence.
    As with OverfitTeacher, data always comes from the train fold so the
    model can memorize it.
    """

    @classmethod
    def add_cmdline_args(cls, argparser):
        # fixed: first parameter of a @classmethod is conventionally `cls`,
        # not `self` (the original shadowed the instance-method convention).
        argparser.add_argument('--corpus-size', default=4, type=int)

    def __init__(self, opt, shared=None):
        # number of corpus sentences used; set before super().__init__ because
        # the parent may call setup_data during construction
        self.corpussize = opt.get('corpus_size', 4)
        super().__init__(opt, shared)

    def setup_data(self, fold):
        # deliberately ignore `fold`: always load 'train' so every fold
        # yields the same data
        super()._setup_data('train')
        for text in self.corpus[: self.corpussize]:
            words = text.split(' ')
            for j in range(1, len(words) + 1):
                real_text = ' '.join(words[:j])
                # each prefix is its own single-turn episode
                yield (real_text, text), True

    def num_examples(self):
        # NOTE(review): assumes every corpus sentence has exactly EXAMPLE_SIZE
        # words, so each expands to EXAMPLE_SIZE prefix examples — confirm
        # against the corpus builder.
        return self.corpussize * EXAMPLE_SIZE

    def num_episodes(self):
        return self.corpussize * EXAMPLE_SIZE


class VariableLengthTeacher(CandidateTeacher):
def build_corpus(self):
corpus = super().build_corpus()
Expand Down Expand Up @@ -319,73 +368,6 @@ def setup_data(self, fold):
yield (t, [label], r, c + [label]), e


class BadExampleTeacher(CandidateTeacher):
    """
    Teacher which produces a variety of examples that upset verify_data.py.

    Useful for checking how models respond when the following assumptions are
    violated:

    0. text is empty string
    1. missing text
    2. label is empty string
    3. missing label
    4. label candidates is empty
    5. label candidates contains an empty string
    6. label isn't in the candidates
    7. missing label candidates

    Note: this test may come to outlive its purpose in the future. When failing
    this test, one should consider who is really at fault: the test, or the code.
    """

    # number of distinct corruption cases cycled through by the wrapper below
    NUM_CASES = 8

    def __init__(self, opt, shared=None):
        super().__init__(opt, shared)
        # gross hack: override data.get to force things the way we want; otherwise
        # we can't actually force some of these scenarios.
        self.data.get = self._wrapperfn(self.data.get)

    def _wrapperfn(self, oldget):
        """
        Wrap ``oldget`` so every fetched example is corrupted in rotation.

        The wrapper keeps a cycling counter as a function attribute
        (``newget.case``) and applies one of the NUM_CASES corruptions per
        call, deep-copying the item first so the underlying data is never
        mutated in place.
        """

        def newget(*args):
            item, eod = oldget(*args)
            # copy so the corruption doesn't leak back into shared data
            item = copy.deepcopy(item)
            # advance to the next corruption case (wraps around)
            newget.case = (newget.case + 1) % self.NUM_CASES
            case = newget.case
            if case == 0:
                # empty string input
                item.force_set('text', '')
            elif case == 1:
                # not text input
                del item['text']
            elif case == 2:
                # empty string label
                item.force_set('labels', [''])
            elif case == 3:
                # no label
                del item['labels']
            elif case == 4:
                # no label candidates
                item.force_set('label_candidates', [])
            elif case == 5:
                # extra empty string in labels
                item.force_set(
                    'label_candidates', list(item['label_candidates']) + ['']
                )
            elif case == 6:
                # label candidates doesn't have the label
                item.force_set('label_candidates', list(item['label_candidates']))
                item['label_candidates'].remove(item['labels'][0])
            elif case == 7:
                # no label candidates field
                del item['label_candidates']
            return item, eod

        # randomize the starting case. NOTE(review): randint's upper bound is
        # inclusive, so this can initialize to NUM_CASES itself — harmless
        # because the value is taken mod NUM_CASES before first use, but it
        # looks unintended (randint(0, NUM_CASES - 1) would be the usual form).
        newget.case = random.randint(0, self.NUM_CASES)
        return newget


class ImageTeacher(AbstractImageTeacher):
"""
Teacher which provides images and captions.
Expand Down
13 changes: 7 additions & 6 deletions tests/nightly/gpu/test_dialogpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,21 @@ class TestDialogptModel(unittest.TestCase):
Checks that DialoGPT gets a certain performance on the integration test task.
"""

@testing_utils.retry(ntries=3, log_retry=True)
def test_dialogpt(self):
valid, test = testing_utils.train_model(
dict(
task='integration_tests:nocandidate',
task='integration_tests:overfit',
model='hugging_face/dialogpt',
add_special_tokens=True,
add_start_token=True,
optimizer='sgd',
learningrate=1,
optimizer='adam',
learningrate=1e-3,
batchsize=4,
num_epochs=4,
num_epochs=50,
validation_every_n_epochs=5,
validation_metric='ppl',
short_final_eval=True,
validation_max_exs=12,
skip_generation=True,
)
)

Expand Down
3 changes: 2 additions & 1 deletion tests/nightly/gpu/test_drqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ class TestDrQAModel(unittest.TestCase):

def test_pretrained(self):
    """
    Evaluate the pretrained zoo DrQA model on SQuAD (test fold only).

    ``skip_valid=True`` and a larger batchsize keep this nightly GPU test
    fast; the thresholds below pin the expected SQuAD performance.
    """
    # NOTE: the scraped diff interleaved the pre-change call with the new
    # one; only the post-change form (batchsize=32, skip_valid) is kept.
    _, test = testing_utils.eval_model(
        dict(task='squad:index', model_file='zoo:drqa/squad/model', batchsize=32),
        skip_valid=True,
    )
    self.assertGreaterEqual(test['accuracy'], 0.68)
    self.assertGreaterEqual(test['f1'], 0.78)
Expand Down
Loading

0 comments on commit cd20fd3

Please sign in to comment.