Skip to content

Commit

Permalink
Update EmpatheticDialogues teachers (facebookresearch#2303)
Browse files Browse the repository at this point in the history
* Update ED teacher

* Update names

* Update SHA

* Move in ED unit test

* Also test PersonaTopicifierTeacher

* Some reversion

* Revert "Some reversion"

This reverts commit 352da68.

* Opt and tempdir

* Remembering default mode

* Test other model

* Streamline

* Remove persona topicifier teacher

* Fix classifier issue

* Overhaul ED test

* Test fix

* Space

* Comment clarifications
  • Loading branch information
EricMichaelSmith authored Dec 23, 2019
1 parent 9964bcf commit d57786d
Show file tree
Hide file tree
Showing 5 changed files with 546 additions and 89 deletions.
14 changes: 10 additions & 4 deletions parlai/tasks/empathetic_dialogues/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ Task: Empathetic Dialogues
===========================
Description: A dataset of 25k conversations grounded in emotional situations to facilitate training and evaluating dialogue systems. See https://arxiv.org/abs/1811.00207 for more information.
===========================
Dataset has been released under the CC BY-NC license.
EmpatheticDialogueTeacher returns examples like so:
Dataset has been released under the CC BY-NC license.

## EmpatheticDialoguesTeacher
Returns examples like so:
- [text]: context line (previous utterance by 'speaker')
- [labels]: label line (current utterance by 'listener')
with additional task specific fields:
Expand All @@ -13,7 +15,11 @@ Other optional fields:
- [prepend_ctx]: fasttext prediction on context line - or None
- [prepend_cand]: fasttext prediction on label line (candidate) - or None
- [deepmoji_ctx]: vector encoding from deepmoji penultimate layer - or None
- [deepmoji_cand]: vector encoding from deepmoji penultimate layer for label line (candidate) - or None
- [deepmoji_cand]: vector encoding from deepmoji penultimate layer for label line (candidate) - or None

Tags: #EmpatheticDialogues, #All, #ChitChat
## EmotionClassificationSituationTeacher
Classifier that returns the situation and emotion for each episode given by `EmpatheticDialoguesTeacher`. Examples:
- [text]: A 1-3 sentence description of the situation that the conversation is based on (equivalent to [situation] for `EmpatheticDialoguesTeacher`)
- [labels]: one of 32 emotion words (equivalent to [emotion] for `EmpatheticDialoguesTeacher`)

Tags: #EmpatheticDialogues, #All, #ChitChat
185 changes: 102 additions & 83 deletions parlai/tasks/empathetic_dialogues/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,31 +10,62 @@
import numpy as np


class EmpatheticDialogueTeacher(FixedDialogTeacher):
DEFAULT_TRAIN_EXPERIENCER_ONLY = False


class EmpatheticDialoguesTeacher(FixedDialogTeacher):
def __init__(self, opt, shared=None):
super().__init__(opt, shared)
self.opt = opt
self.datatype = opt.get('datatype', 'train').split(':')[0]
self.datapath = os.path.join(
self.opt['datapath'],
'empatheticdialogues',
'empatheticdialogues',
self.datatype + '.csv',
)
self.experiencer_side_only = (
opt.get('train_experiencer_only', DEFAULT_TRAIN_EXPERIENCER_ONLY)
and self.datatype == 'train'
) or self.datatype != 'train'
print(
f'[EmpatheticDialoguesTeacher] Only use experiencer side? '
f'{self.experiencer_side_only}, datatype: {self.datatype}'
)

if shared:
self.data = shared['data']
else:
build(opt)
fold = opt.get('datatype', 'train').split(':')[0]
self._setup_data(fold)
self._setup_data(self.datatype)

self.num_exs = sum([(len(d) + 1) // 2 for d in self.data])
self.num_exs = sum([len(d) for d in self.data])
self.num_eps = len(self.data)
self.reset()

@classmethod
def add_cmdline_args(cls, argparser):
agent = argparser.add_argument_group('EmpatheticDialogues teacher arguments')
agent.add_argument(
'--train-experiencer-only',
type='bool',
default=DEFAULT_TRAIN_EXPERIENCER_ONLY,
# i.e. do not include the other side of the conversation where the Listener
# (responder) utterance would be the text and the Speaker (experiencer)
# utterance would be the label
help='In the train set, only use Speaker (experiencer) utterances as text and Listener (responder) utterances as labels.',
)

def num_episodes(self):
return self.num_eps

def num_examples(self):
return self.num_exs

def _setup_data(self, fold):
self.turns = 0
def _setup_data(self, datatype):

if self.opt.get('deepmoji') is not None:
self.embed = np.load(self.opt['deepmoji'] + fold + ".npy")
self.embed = np.load(self.opt['deepmoji'] + datatype + ".npy")

if self.opt.get('fasttextloc') is not None and self.opt.get('prepend', -1) > 0:
try:
Expand All @@ -44,32 +75,37 @@ def _setup_data(self, fold):
ftpath = self.opt['fasttextloc']
ftmodel = fastText.FastText.load_model(ftpath)

fpath = os.path.join(
self.opt['datapath'],
'empatheticdialogues',
'empatheticdialogues',
fold + '.csv',
)
df = open(fpath).readlines()
df = open(self.datapath).readlines()

turn_idx = 1
responder_text_dialogue = []
experiencer_text_dialogue = []
self.data = []
dialog = []
for i in range(1, len(df)):

cparts = df[i - 1].strip().split(",")
sparts = df[i].strip().split(",")

if cparts[0] == sparts[0]:

# Check that the turn number has incremented correctly
turn_idx += 1
assert (
int(cparts[1]) + 1 == int(sparts[1]) and int(sparts[1]) == turn_idx
)

contextt = cparts[5].replace("_comma_", ",")
label = sparts[5].replace("_comma_", ",")
prompt = sparts[2]
sit = sparts[3].replace("_comma_", ",")
if len(sparts) == 9:
inline_label_candidates = [
cand.replace("_comma_", ",").replace("_pipe_", "|")
for cand in sparts[8].split('|')
]
if sparts[8] != '':
inline_label_candidates = [
cand.replace("_comma_", ",").replace("_pipe_", "|")
for cand in sparts[8].split('|')
]
else:
inline_label_candidates = []
elif len(sparts) == 8:
inline_label_candidates = []
else:
Expand All @@ -94,31 +130,46 @@ def _setup_data(self, fold):
for f in gettop:
ft_cand = f.split("_")[-1] + " " + ft_cand

dialog.append(
(
contextt,
label,
prompt,
sit,
context_emb,
cand_emb,
ft_ctx,
ft_cand,
inline_label_candidates,
)
)
dialogue_parts = [
contextt,
label,
prompt,
sit,
context_emb,
cand_emb,
ft_ctx,
ft_cand,
inline_label_candidates,
]

if int(sparts[1]) % 2 == 0:
# experiencer is the "text" and responder is the "label"
experiencer_text_dialogue.append(dialogue_parts)
else:
# responder is the "text" and experiencer is the "label"
responder_text_dialogue.append(dialogue_parts)

else:

if len(dialog) > 0:
self.data.append(dialog)
dialog = []
# We've finished the previous episode, so add it to the data
turn_idx = 1
if len(experiencer_text_dialogue) > 0:
self.data.append(experiencer_text_dialogue)
if len(responder_text_dialogue) > 0 and not self.experiencer_side_only:
self.data.append(responder_text_dialogue)
experiencer_text_dialogue = []
responder_text_dialogue = []

# Add in the final episode
if len(experiencer_text_dialogue) > 0:
self.data.append(experiencer_text_dialogue)
if len(responder_text_dialogue) > 0 and not self.experiencer_side_only:
self.data.append(responder_text_dialogue)

def get(self, episode_idx, entry_idx=0):
ep = self.data[episode_idx]
i = entry_idx * 2
ep_i = ep[i]
episode_done = i >= (len(ep) - 2)
ep_i = ep[entry_idx]
episode_done = entry_idx >= (len(ep) - 1)
action = {
'situation': ep_i[3],
'emotion': ep_i[2],
Expand All @@ -139,68 +190,36 @@ def share(self):
return shared


class EmotionClassificationTeacher(EmpatheticDialogueTeacher):
class EmotionClassificationSituationTeacher(EmpatheticDialoguesTeacher):
"""
Class for detecting the emotion based on the utterance.
Class for detecting the emotion based on the situation.
"""

@staticmethod
def add_cmdline_args(parser):
parser = parser.add_argument_group('Emotion Classification Args')
parser.add_argument(
'--single-turn',
type='bool',
default=True,
help='Single turn classification task',
)

def __init__(self, opt, shared=None):
opt['train_experiencer_only'] = True
# So that we only have one episode per train conversation
super().__init__(opt, shared)
self.single_turn = opt['single_turn']
if not shared and self.single_turn:
self._make_single_turn()
if not shared:
self._get_situations()

def num_episodes(self):
return len(self.data)

def num_examples(self):
if not self.single_turn:
return super().num_examples()
return len(self.data)

def _make_single_turn(self):
def _get_situations(self):
new_data = []
for ep in self.data:
for ex in ep:
new_data.append(ex)
new_data.append(ep[0])
self.data = new_data

def get(self, episode_idx, entry_idx=0):
if not self.single_turn:
# get the specific episode from the example
ep = self.data[episode_idx]
i = entry_idx * 2
ex = ep[i]
episode_done = i >= (len(ep) - 2)
else:
# each episode is a singular example, we use both sides of the
# conversation
ex = self.data[episode_idx]
episode_done = True

return {
'situation': ex[3],
'labels': [ex[2]],
'text': ex[0],
'next_utt': ex[1],
'prepend_ctx': ex[6],
'prepend_cand': ex[7],
'deepmoji_ctx': ex[4],
'deepmoji_cand': ex[5],
'episode_done': episode_done,
'label_candidates': ex[8],
}
ex = self.data[episode_idx]
episode_done = True

return {'labels': [ex[2]], 'text': ex[3], 'episode_done': episode_done}


class DefaultTeacher(EmpatheticDialogueTeacher):
class DefaultTeacher(EmpatheticDialoguesTeacher):
pass
2 changes: 1 addition & 1 deletion parlai/tasks/empathetic_dialogues/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
DownloadableFile(
'http://parl.ai/downloads/empatheticdialogues/empatheticdialogues.tar.gz',
'empatheticdialogues.tar.gz',
'240c492cb6199a315722f716bfcc14f13ea6605f1cec67349153b606be92f6f2',
'56f234d77b7dd1f005fd365bb17769cfe346c3c84295b69bc069c8ccb83be03d',
)
]

Expand Down
2 changes: 1 addition & 1 deletion parlai/tasks/task_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -1004,7 +1004,7 @@
"to facilitate training and evaluating dialogue systems. See "
"https://arxiv.org/abs/1811.00207 for more information. \n"
"Dataset has been released under the CC BY-NC license. \n"
"EmpatheticDialogueTeacher returns examples like so: \n\n"
"EmpatheticDialoguesTeacher returns examples like so: \n\n"
" - [text]: context line (previous utterance by 'speaker') \n"
" - [labels]: label line (current utterance by 'listener') \n\n"
"with additional task specific fields: \n\n"
Expand Down
Loading

0 comments on commit d57786d

Please sign in to comment.