
Commit 5ded1fa

priyaramani and malfet authored

Fix data preprocessing for chatbot tutorial (#1992)

* Fix loading and preprocessing of data for chatbot tutorial
* update comment
* updated related sources
* Apply suggestions from code review: update download link

Co-authored-by: Nikita Shulga <[email protected]>

1 parent 47a91e0 commit 5ded1fa

File tree

3 files changed: +39 -51 lines changed


Makefile

+2-2
@@ -74,8 +74,8 @@ download:
 	cp $(DATADIR)/iris.data beginner_source/data/
 
 	# Download dataset for beginner_source/chatbot_tutorial.py
-	wget -N https://s3.amazonaws.com/pytorch-tutorial-assets/cornell_movie_dialogs_corpus.zip -P $(DATADIR)
-	unzip $(ZIPOPTS) $(DATADIR)/cornell_movie_dialogs_corpus.zip -d beginner_source/data/
+	wget -N https://s3.amazonaws.com/pytorch-tutorial-assets/cornell_movie_dialogs_corpus_v2.zip -P $(DATADIR)
+	unzip $(ZIPOPTS) $(DATADIR)/cornell_movie_dialogs_corpus_v2.zip -d beginner_source/data/
 
 	# Download dataset for beginner_source/audio_classifier_tutorial.py
 	wget -N https://s3.amazonaws.com/pytorch-tutorial-assets/UrbanSound8K.tar.gz -P $(DATADIR)
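For readers who want to fetch the corpus without make, here is a minimal Python sketch of the same download-and-extract step. The URL and the beginner_source/data/ destination come from the Makefile above; the helper name and the skip-if-present check are illustrative, not part of the repo:

    import os
    import urllib.request
    import zipfile

    URL = "https://s3.amazonaws.com/pytorch-tutorial-assets/cornell_movie_dialogs_corpus_v2.zip"

    def download_corpus(data_dir="data", dest="beginner_source/data"):
        os.makedirs(data_dir, exist_ok=True)
        os.makedirs(dest, exist_ok=True)
        zip_path = os.path.join(data_dir, "cornell_movie_dialogs_corpus_v2.zip")
        if not os.path.exists(zip_path):  # rough analogue of wget -N (skip if already fetched)
            urllib.request.urlretrieve(URL, zip_path)
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(dest)  # analogue of unzip -d beginner_source/data/

    if __name__ == "__main__":
        download_corpus()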

beginner_source/chatbot_tutorial.py

+35-47
@@ -85,7 +85,8 @@
 # ------------
 #
 # To start, Download the data ZIP file
-# `here <https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html>`__
+# `here <https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip>`__
+#
 # and put in a ``data/`` directory under the current directory.
 #
 # After that, let’s import some necessities.
@@ -110,6 +111,7 @@
 from io import open
 import itertools
 import math
+import json
 
 
 USE_CUDA = torch.cuda.is_available()
@@ -140,7 +142,7 @@
 # original format.
 #
 
-corpus_name = "cornell movie-dialogs corpus"
+corpus_name = "movie-corpus"
 corpus = os.path.join("data", corpus_name)
 
 def printLines(file, n=10):
@@ -149,7 +151,7 @@ def printLines(file, n=10):
     for line in lines[:n]:
         print(line)
 
-printLines(os.path.join(corpus, "movie_lines.txt"))
+printLines(os.path.join(corpus, "utterances.jsonl"))
 
 
 ######################################################################
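Before the loader rewrite in the next hunk, it helps to see the shape of one utterances.jsonl record. The record here is fabricated for illustration; only the keys are meaningful, and they are exactly the ones the new loadLinesAndConversations reads: id, speaker, conversation_id, meta.movie_id, and text:

    import json

    # Fabricated example of a single line in utterances.jsonl; only the keys matter.
    sample = ('{"id": "L1045", "speaker": "u0", "conversation_id": "L1044", '
              '"meta": {"movie_id": "m0"}, "text": "They do not!"}')

    record = json.loads(sample)
    print(record["id"], record["speaker"], record["text"])        # line fields
    print(record["conversation_id"], record["meta"]["movie_id"])  # conversation fields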
@@ -160,55 +162,47 @@ def printLines(file, n=10):
 # contains a tab-separated *query sentence* and a *response sentence* pair.
 #
 # The following functions facilitate the parsing of the raw
-# *movie_lines.txt* data file.
+# *utterances.jsonl* data file.
 #
-# - ``loadLines`` splits each line of the file into a dictionary of
-#   fields (lineID, characterID, movieID, character, text)
-# - ``loadConversations`` groups fields of lines from ``loadLines`` into
-#   conversations based on *movie_conversations.txt*
+# - ``loadLinesAndConversations`` splits each line of the file into a dictionary of
+#   lines with fields: lineID, characterID, and text and then groups them
+#   into conversations with fields: conversationID, movieID, and lines.
 # - ``extractSentencePairs`` extracts pairs of sentences from
 #   conversations
 #
 
-# Splits each line of the file into a dictionary of fields
-def loadLines(fileName, fields):
+# Splits each line of the file to create lines and conversations
+def loadLinesAndConversations(fileName):
     lines = {}
+    conversations = {}
     with open(fileName, 'r', encoding='iso-8859-1') as f:
         for line in f:
-            values = line.split(" +++$+++ ")
-            # Extract fields
+            lineJson = json.loads(line)
+            # Extract fields for line object
            lineObj = {}
-            for i, field in enumerate(fields):
-                lineObj[field] = values[i]
+            lineObj["lineID"] = lineJson["id"]
+            lineObj["characterID"] = lineJson["speaker"]
+            lineObj["text"] = lineJson["text"]
             lines[lineObj['lineID']] = lineObj
-    return lines
 
+            # Extract fields for conversation object
+            if lineJson["conversation_id"] not in conversations:
+                convObj = {}
+                convObj["conversationID"] = lineJson["conversation_id"]
+                convObj["movieID"] = lineJson["meta"]["movie_id"]
+                convObj["lines"] = [lineObj]
+            else:
+                convObj = conversations[lineJson["conversation_id"]]
+                convObj["lines"].insert(0, lineObj)
+            conversations[convObj["conversationID"]] = convObj
 
-# Groups fields of lines from `loadLines` into conversations based on *movie_conversations.txt*
-def loadConversations(fileName, lines, fields):
-    conversations = []
-    with open(fileName, 'r', encoding='iso-8859-1') as f:
-        for line in f:
-            values = line.split(" +++$+++ ")
-            # Extract fields
-            convObj = {}
-            for i, field in enumerate(fields):
-                convObj[field] = values[i]
-            # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]")
-            utterance_id_pattern = re.compile('L[0-9]+')
-            lineIds = utterance_id_pattern.findall(convObj["utteranceIDs"])
-            # Reassemble lines
-            convObj["lines"] = []
-            for lineId in lineIds:
-                convObj["lines"].append(lines[lineId])
-            conversations.append(convObj)
-    return conversations
+    return lines, conversations
 
 
 # Extracts pairs of sentences from conversations
 def extractSentencePairs(conversations):
     qa_pairs = []
-    for conversation in conversations:
+    for conversation in conversations.values():
         # Iterate over all the lines of the conversation
         for i in range(len(conversation["lines"]) - 1):  # We ignore the last line (no answer for it)
             inputLine = conversation["lines"][i]["text"].strip()
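A hedged end-to-end toy run of the two functions in this hunk: two invented records from one conversation are grouped, then paired as query and response. It assumes, as the insert(0, ...) above implies, that a conversation's utterances appear reply-first in the file:

    import json

    # Two fabricated utterances from one conversation, listed reply-first, which is
    # the order the insert(0, ...) logic in loadLinesAndConversations expects.
    raw = [
        '{"id": "L2", "speaker": "u9", "conversation_id": "L1",'
        ' "meta": {"movie_id": "m0"}, "text": "Fine, tomorrow then."}',
        '{"id": "L1", "speaker": "u8", "conversation_id": "L1",'
        ' "meta": {"movie_id": "m0"}, "text": "Can we meet tomorrow?"}',
    ]

    lines, conversations = {}, {}
    for line in raw:
        lineJson = json.loads(line)
        lineObj = {"lineID": lineJson["id"],
                   "characterID": lineJson["speaker"],
                   "text": lineJson["text"]}
        lines[lineObj["lineID"]] = lineObj
        if lineJson["conversation_id"] not in conversations:
            convObj = {"conversationID": lineJson["conversation_id"],
                       "movieID": lineJson["meta"]["movie_id"],
                       "lines": [lineObj]}
        else:
            convObj = conversations[lineJson["conversation_id"]]
            convObj["lines"].insert(0, lineObj)  # restore chronological order
        conversations[convObj["conversationID"]] = convObj

    # Pair each line with the one that follows it, as extractSentencePairs does.
    for conv in conversations.values():
        for i in range(len(conv["lines"]) - 1):
            print(conv["lines"][i]["text"], "->", conv["lines"][i + 1]["text"])
    # Expected output: Can we meet tomorrow? -> Fine, tomorrow then.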
@@ -231,18 +225,12 @@ def extractSentencePairs(conversations):
 # Unescape the delimiter
 delimiter = str(codecs.decode(delimiter, "unicode_escape"))
 
-# Initialize lines dict, conversations list, and field ids
+# Initialize lines dict and conversations dict
 lines = {}
-conversations = []
-MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
-MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
-
-# Load lines and process conversations
-print("\nProcessing corpus...")
-lines = loadLines(os.path.join(corpus, "movie_lines.txt"), MOVIE_LINES_FIELDS)
-print("\nLoading conversations...")
-conversations = loadConversations(os.path.join(corpus, "movie_conversations.txt"),
-                                  lines, MOVIE_CONVERSATIONS_FIELDS)
+conversations = {}
+# Load lines and conversations
+print("\nProcessing corpus into lines and conversations...")
+lines, conversations = loadLinesAndConversations(os.path.join(corpus, "utterances.jsonl"))
 
 # Write new csv file
 print("\nWriting newly formatted file...")
@@ -1341,7 +1329,7 @@ def evaluateInput(encoder, decoder, searcher, voc):
     for k, v in state.items():
         if isinstance(v, torch.Tensor):
             state[k] = v.cuda()
-
+
 # Run training iterations
 print("Starting Training!")
 trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,

beginner_source/colab.rst

+2-2
@@ -28,9 +28,9 @@ To fix this, we'll copy the required file into our Google Drive account.
 1. Log into Google Drive.
 2. In Google Drive, make a folder named **data**, with a subfolder named
    **cornell**.
-3. Visit the Cornell Movie Dialogs Corpus and download the ZIP file.
+3. Visit the Cornell Movie Dialogs Corpus and download the movie-corpus ZIP file.
 4. Unzip the file on your local machine.
-5. Copy the files **movie\_lines.txt** and **movie\_conversations.txt** to the **data/cornell** folder that you
+5. Copy the file **utterances.jsonl** to the **data/cornell** folder that you
    created in Google Drive.
 
 Now we'll need to edit the file in Colab to point to the file on
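For completeness, a minimal sketch of the Colab side of these steps, assuming the standard google.colab Drive mount; the /content/gdrive mount point and "My Drive" folder name are Colab conventions, and the path mirrors the data/cornell folder created in step 2:

    # Run inside Colab: mount Google Drive, then point the tutorial at utterances.jsonl.
    import os
    from google.colab import drive

    drive.mount('/content/gdrive')

    corpus = "/content/gdrive/My Drive/data/cornell"  # folder created in step 2
    print(os.path.exists(os.path.join(corpus, "utterances.jsonl")))  # expect: True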
