Skip to content

Commit

Permalink
Update EmpatheticDialogues teachers (facebookresearch#2303)
Browse files Browse the repository at this point in the history
* Update ED teacher

* Update names

* Update SHA

* Move in ED unit test

* Also test PersonaTopicifierTeacher

* Some reversion

* Revert "Some reversion"

This reverts commit 352da68.

* Opt and tempdir

* Remembering default mode

* Test other model

* Streamline

* Remove persona topicifier teacher

* Fix classifier issue

* Overhaul ED test

* Test fix

* Space

* Comment clarifications
  • Loading branch information
EricMichaelSmith authored Dec 23, 2019
1 parent 9964bcf commit d57786d
Show file tree
Hide file tree
Showing 5 changed files with 546 additions and 89 deletions.
14 changes: 10 additions & 4 deletions parlai/tasks/empathetic_dialogues/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ Task: Empathetic Dialogues
===========================
Description: A dataset of 25k conversations grounded in emotional situations to facilitate training and evaluating dialogue systems. See https://arxiv.org/abs/1811.00207 for more information.
===========================
Dataset has been released under the CC BY-NC license.
EmpatheticDialogueTeacher returns examples like so:
Dataset has been released under the CC BY-NC license.

## EmpatheticDialoguesTeacher
Returns examples like so:
- [text]: context line (previous utterance by 'speaker')
- [labels]: label line (current utterance by 'listener')
with additional task specific fields:
Expand All @@ -13,7 +15,11 @@ Other optional fields:
- [prepend_ctx]: fasttext prediction on context line - or None
- [prepend_cand]: fasttext prediction on label line (candidate) - or None
- [deepmoji_ctx]: vector encoding from deepmoji penultimate layer - or None
- [deepmoji_cand]: vector encoding from deepmoji penultimate layer for label line (candidate) - or None
- [deepmoji_cand]: vector encoding from deepmoji penultimate layer for label line (candidate) - or None

Tags: #EmpatheticDialogues, #All, #ChitChat
## EmotionClassificationSituationTeacher
Classifier that returns the situation and emotion for each episode given by `EmpatheticDialoguesTeacher`. Examples:
- [text]: A 1-3 sentence description of the situation that the conversation is based on (equivalent to [situation] for `EmpatheticDialoguesTeacher`)
- [labels]: one of 32 emotion words (equivalent to [emotion] for `EmpatheticDialoguesTeacher`)

Tags: #EmpatheticDialogues, #All, #ChitChat
185 changes: 102 additions & 83 deletions parlai/tasks/empathetic_dialogues/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,31 +10,62 @@
import numpy as np


class EmpatheticDialogueTeacher(FixedDialogTeacher):
DEFAULT_TRAIN_EXPERIENCER_ONLY = False


class EmpatheticDialoguesTeacher(FixedDialogTeacher):
def __init__(self, opt, shared=None):
super().__init__(opt, shared)
self.opt = opt
self.datatype = opt.get('datatype', 'train').split(':')[0]
self.datapath = os.path.join(
self.opt['datapath'],
'empatheticdialogues',
'empatheticdialogues',
self.datatype + '.csv',
)
self.experiencer_side_only = (
opt.get('train_experiencer_only', DEFAULT_TRAIN_EXPERIENCER_ONLY)
and self.datatype == 'train'
) or self.datatype != 'train'
print(
f'[EmpatheticDialoguesTeacher] Only use experiencer side? '
f'{self.experiencer_side_only}, datatype: {self.datatype}'
)

if shared:
self.data = shared['data']
else:
build(opt)
fold = opt.get('datatype', 'train').split(':')[0]
self._setup_data(fold)
self._setup_data(self.datatype)

self.num_exs = sum([(len(d) + 1) // 2 for d in self.data])
self.num_exs = sum([len(d) for d in self.data])
self.num_eps = len(self.data)
self.reset()

@classmethod
def add_cmdline_args(cls, argparser):
agent = argparser.add_argument_group('EmpatheticDialogues teacher arguments')
agent.add_argument(
'--train-experiencer-only',
type='bool',
default=DEFAULT_TRAIN_EXPERIENCER_ONLY,
# i.e. do not include the other side of the conversation where the Listener
# (responder) utterance would be the text and the Speaker (experiencer)
# utterance would be the label
help='In the train set, only use Speaker (experiencer) utterances as text and Listener (responder) utterances as labels.',
)

def num_episodes(self):
return self.num_eps

def num_examples(self):
return self.num_exs

def _setup_data(self, fold):
self.turns = 0
def _setup_data(self, datatype):

if self.opt.get('deepmoji') is not None:
self.embed = np.load(self.opt['deepmoji'] + fold + ".npy")
self.embed = np.load(self.opt['deepmoji'] + datatype + ".npy")

if self.opt.get('fasttextloc') is not None and self.opt.get('prepend', -1) > 0:
try:
Expand All @@ -44,32 +75,37 @@ def _setup_data(self, fold):
ftpath = self.opt['fasttextloc']
ftmodel = fastText.FastText.load_model(ftpath)

fpath = os.path.join(
self.opt['datapath'],
'empatheticdialogues',
'empatheticdialogues',
fold + '.csv',
)
df = open(fpath).readlines()
df = open(self.datapath).readlines()

turn_idx = 1
responder_text_dialogue = []
experiencer_text_dialogue = []
self.data = []
dialog = []
for i in range(1, len(df)):

cparts = df[i - 1].strip().split(",")
sparts = df[i].strip().split(",")

if cparts[0] == sparts[0]:

# Check that the turn number has incremented correctly
turn_idx += 1
assert (
int(cparts[1]) + 1 == int(sparts[1]) and int(sparts[1]) == turn_idx
)

contextt = cparts[5].replace("_comma_", ",")
label = sparts[5].replace("_comma_", ",")
prompt = sparts[2]
sit = sparts[3].replace("_comma_", ",")
if len(sparts) == 9:
inline_label_candidates = [
cand.replace("_comma_", ",").replace("_pipe_", "|")
for cand in sparts[8].split('|')
]
if sparts[8] != '':
inline_label_candidates = [
cand.replace("_comma_", ",").replace("_pipe_", "|")
for cand in sparts[8].split('|')
]
else:
inline_label_candidates = []
elif len(sparts) == 8:
inline_label_candidates = []
else:
Expand All @@ -94,31 +130,46 @@ def _setup_data(self, fold):
for f in gettop:
ft_cand = f.split("_")[-1] + " " + ft_cand

dialog.append(
(
contextt,
label,
prompt,
sit,
context_emb,
cand_emb,
ft_ctx,
ft_cand,
inline_label_candidates,
)
)
dialogue_parts = [
contextt,
label,
prompt,
sit,
context_emb,
cand_emb,
ft_ctx,
ft_cand,
inline_label_candidates,
]

if int(sparts[1]) % 2 == 0:
# experiencer is the "text" and responder is the "label"
experiencer_text_dialogue.append(dialogue_parts)
else:
# responder is the "text" and experiencer is the "label"
responder_text_dialogue.append(dialogue_parts)

else:

if len(dialog) > 0:
self.data.append(dialog)
dialog = []
# We've finished the previous episode, so add it to the data
turn_idx = 1
if len(experiencer_text_dialogue) > 0:
self.data.append(experiencer_text_dialogue)
if len(responder_text_dialogue) > 0 and not self.experiencer_side_only:
self.data.append(responder_text_dialogue)
experiencer_text_dialogue = []
responder_text_dialogue = []

# Add in the final episode
if len(experiencer_text_dialogue) > 0:
self.data.append(experiencer_text_dialogue)
if len(responder_text_dialogue) > 0 and not self.experiencer_side_only:
self.data.append(responder_text_dialogue)

def get(self, episode_idx, entry_idx=0):
ep = self.data[episode_idx]
i = entry_idx * 2
ep_i = ep[i]
episode_done = i >= (len(ep) - 2)
ep_i = ep[entry_idx]
episode_done = entry_idx >= (len(ep) - 1)
action = {
'situation': ep_i[3],
'emotion': ep_i[2],
Expand All @@ -139,68 +190,36 @@ def share(self):
return shared


class EmotionClassificationTeacher(EmpatheticDialogueTeacher):
class EmotionClassificationSituationTeacher(EmpatheticDialoguesTeacher):
"""
Class for detecting the emotion based on the utterance.
Class for detecting the emotion based on the situation.
"""

@staticmethod
def add_cmdline_args(parser):
parser = parser.add_argument_group('Emotion Classification Args')
parser.add_argument(
'--single-turn',
type='bool',
default=True,
help='Single turn classification task',
)

def __init__(self, opt, shared=None):
opt['train_experiencer_only'] = True
# So that we only have one episode per train conversation
super().__init__(opt, shared)
self.single_turn = opt['single_turn']
if not shared and self.single_turn:
self._make_single_turn()
if not shared:
self._get_situations()

def num_episodes(self):
return len(self.data)

def num_examples(self):
if not self.single_turn:
return super().num_examples()
return len(self.data)

def _make_single_turn(self):
def _get_situations(self):
new_data = []
for ep in self.data:
for ex in ep:
new_data.append(ex)
new_data.append(ep[0])
self.data = new_data

def get(self, episode_idx, entry_idx=0):
if not self.single_turn:
# get the specific episode from the example
ep = self.data[episode_idx]
i = entry_idx * 2
ex = ep[i]
episode_done = i >= (len(ep) - 2)
else:
# each episode is a singular example, we use both sides of the
# conversation
ex = self.data[episode_idx]
episode_done = True

return {
'situation': ex[3],
'labels': [ex[2]],
'text': ex[0],
'next_utt': ex[1],
'prepend_ctx': ex[6],
'prepend_cand': ex[7],
'deepmoji_ctx': ex[4],
'deepmoji_cand': ex[5],
'episode_done': episode_done,
'label_candidates': ex[8],
}
ex = self.data[episode_idx]
episode_done = True

return {'labels': [ex[2]], 'text': ex[3], 'episode_done': episode_done}


class DefaultTeacher(EmpatheticDialogueTeacher):
class DefaultTeacher(EmpatheticDialoguesTeacher):
pass
2 changes: 1 addition & 1 deletion parlai/tasks/empathetic_dialogues/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
DownloadableFile(
'http://parl.ai/downloads/empatheticdialogues/empatheticdialogues.tar.gz',
'empatheticdialogues.tar.gz',
'240c492cb6199a315722f716bfcc14f13ea6605f1cec67349153b606be92f6f2',
'56f234d77b7dd1f005fd365bb17769cfe346c3c84295b69bc069c8ccb83be03d',
)
]

Expand Down
2 changes: 1 addition & 1 deletion parlai/tasks/task_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -1004,7 +1004,7 @@
"to facilitate training and evaluating dialogue systems. See "
"https://arxiv.org/abs/1811.00207 for more information. \n"
"Dataset has been released under the CC BY-NC license. \n"
"EmpatheticDialogueTeacher returns examples like so: \n\n"
"EmpatheticDialoguesTeacher returns examples like so: \n\n"
" - [text]: context line (previous utterance by 'speaker') \n"
" - [labels]: label line (current utterance by 'listener') \n\n"
"with additional task specific fields: \n\n"
Expand Down
Loading

0 comments on commit d57786d

Please sign in to comment.