Python 3 compatibility:
- Use io.open instead of codecs.open/codecs.getwriter
- Use input() if raw_input is not available
- Use _pickle if cPickle is not available
- Use print_function from __future__
mkhon committed Jun 8, 2018
1 parent 5161946 commit 552adb8
Showing 10 changed files with 123 additions and 83 deletions.
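The four bullet points in the commit message reduce to a handful of import-time shims that keep a single code path working on both Python 2 and Python 3. The following is a minimal standalone sketch of those idioms, not code from the commit itself; the file name 'example.txt' is purely illustrative.

# Sketch of the compatibility idioms used throughout this commit (illustrative only).
from __future__ import print_function   # print() behaves the same on Python 2 and 3

from io import open                      # io.open accepts encoding= on both versions

try:
    import cPickle                       # Python 2: C-accelerated pickle
except ImportError:
    import _pickle as cPickle            # Python 3: same module under a new name

try:
    input = raw_input                    # Python 2: make input() return a plain string
except NameError:
    pass                                 # Python 3: input() already does this

# 'example.txt' is a hypothetical file name, not one used by the project.
with open('example.txt', 'w', encoding='utf-8') as f:
    f.write(u'unicode text\n')

print('pickling with %s' % cPickle.__name__)

Keeping the old names (cPickle, input) means the rest of the code base needs no changes at its call sites.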
4 changes: 2 additions & 2 deletions convert_to_readable.py
@@ -1,5 +1,5 @@
import sys
-import codecs
+from io import open
from data import EOS_TOKENS, PUNCTUATION_VOCABULARY

if __name__ == "__main__":
@@ -16,7 +16,7 @@

with_newlines = len(sys.argv) > 3 and bool(int(sys.argv[3]))

-with codecs.open(input_file, 'r', 'utf-8') as in_f, codecs.open(output_file, 'w', 'utf-8') as out_f:
+with open(input_file, 'r', encoding='utf-8') as in_f, open(output_file, 'w', encoding='utf-8') as out_f:
last_was_eos = True
first = True
for token in in_f.read().split():
31 changes: 19 additions & 12 deletions data.py
@@ -1,12 +1,19 @@
# coding: utf-8
-from __future__ import division
+from __future__ import division, print_function

import random
import os
import sys
import operator
-import cPickle
-import codecs
+try:
+    import cPickle
+except ImportError:
+    import _pickle as cPickle
+try:
+    input = raw_input
+except NameError:
+    pass
+from io import open
import fnmatch
import shutil

@@ -77,16 +84,16 @@ def write_vocabulary(vocabulary, file_name):
if UNK not in vocabulary:
vocabulary.append(UNK)

-print "Vocabulary size: %d" % len(vocabulary)
+print("Vocabulary size: %d" % len(vocabulary))

-with codecs.open(file_name, 'w', 'utf-8') as f:
+with open(file_name, 'w', encoding='utf-8') as f:
f.write("\n".join(vocabulary))

def iterable_to_dict(arr):
return dict((x.strip(), i) for (i, x) in enumerate(arr))

def read_vocabulary(file_name):
-with codecs.open(file_name, 'r', 'utf-8') as f:
+with open(file_name, 'r', encoding='utf-8') as f:
return iterable_to_dict(f.readlines())

def write_processed_dataset(input_files, output_file):
@@ -115,7 +122,7 @@ def write_processed_dataset(input_files, output_file):

for input_file in input_files:

-with codecs.open(input_file, 'r', 'utf-8') as text:
+with open(input_file, 'r', encoding='utf-8') as text:

for line in text:

@@ -196,7 +203,7 @@ def write_processed_dataset(input_files, output_file):

last_eos_idx = 0 # sequence always starts with a new sentence

-print "%.2f%% UNK-s in %s" % (num_unks / num_total * 100, output_file)
+print("%.2f%% UNK-s in %s" % (num_unks / num_total * 100, output_file))

dump(data, output_file)

@@ -224,15 +231,15 @@ def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, tra
train_txt_files.append(path)

if create_vocabulary and not pretrained_embeddings_path:
-with codecs.open(path, 'r', 'utf-8') as text:
+with open(path, 'r', encoding='utf-8') as text:
for line in text:
add_counts(word_counts, line)

if create_vocabulary:
if pretrained_embeddings_path:
vocabulary = []
embeddings = []
-with codecs.open(pretrained_embeddings_path, 'r', 'utf-8') as f:
+with open(pretrained_embeddings_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.split()
w = line[0]
@@ -261,7 +268,7 @@ def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, tra
if os.path.exists(DATA_PATH):

while True:
-resp = raw_input("Data path '%s' already exists. Do you want to:\n[r]eplace the files in existing data path?\n[e]xit?\n>" % DATA_PATH)
+resp = input("Data path '%s' already exists. Do you want to:\n[r]eplace the files in existing data path?\n[e]xit?\n>" % DATA_PATH)
resp = resp.lower().strip()
if resp not in ('r', 'e'):
continue
@@ -281,4 +288,4 @@ def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, tra
# Stage 2
if len(sys.argv) > 2:
path2 = sys.argv[2]
-create_dev_test_train_split_and_vocabulary(path2, False, TRAIN_FILE2, DEV_FILE2, TEST_FILE2)
+create_dev_test_train_split_and_vocabulary(path2, False, TRAIN_FILE2, DEV_FILE2, TEST_FILE2)
16 changes: 10 additions & 6 deletions demo_play_with_model.py
@@ -1,6 +1,6 @@
# coding: utf-8

-from __future__ import division
+from __future__ import division, print_function

from nltk.tokenize import word_tokenize

@@ -9,8 +9,8 @@

import theano
import sys
-import codecs
import re
+from io import open

import theano.tensor as T
import numpy as np
@@ -106,10 +106,10 @@ def punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuat

x = T.imatrix('x')

-print "Loading model parameters..."
+print("Loading model parameters...")
net, _ = models.load(model_file, 1, x)

-print "Building model..."
+print("Building model...")
predict = theano.function(inputs=[x], outputs=net.y)
word_vocabulary = net.x_vocabulary
punctuation_vocabulary = net.y_vocabulary
@@ -120,11 +120,15 @@ def punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuat
tokenizer = word_tokenize
untokenizer = lambda text: text.replace(" '", "'").replace(" n't", "n't").replace("can not", "cannot")

-with codecs.getwriter('utf-8')(sys.stdout) as f_out:
+with open(sys.stdout.fileno(), 'w', encoding='utf-8', closefd=False) as f_out:
while True:
-text = raw_input("\nTEXT: ").decode('utf-8')
+try:
+    text = raw_input("\nTEXT: ").decode('utf-8')
+except NameError:
+    text = input("\nTEXT: ")

words = [w for w in untokenizer(' '.join(tokenizer(text))).split()
if w not in punctuation_vocabulary and w not in human_readable_punctuation_vocabulary]

punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuation_vocabulary, reverse_word_vocabulary, words, f_out, show_unk)
+f_out.flush()
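The codecs.getwriter replacement above is worth a note: wrapping sys.stdout by its file descriptor with closefd=False lets the script write UTF-8 text through a context manager without closing the interpreter's real standard output when the block exits. A minimal standalone sketch of the same pattern (not the demo script itself):

# Standalone sketch: UTF-8 text wrapper around stdout for Python 2 and 3.
import sys
from io import open

# closefd=False: closing the wrapper must not close file descriptor 1.
with open(sys.stdout.fileno(), 'w', encoding='utf-8', closefd=False) as out:
    out.write(u'example output\n')
    out.flush()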
20 changes: 11 additions & 9 deletions error_calculator.py
@@ -4,10 +4,12 @@
Computes and prints the overall classification error and precision, recall, F-score over punctuations.
"""

+from __future__ import print_function
+
from numpy import nan
import data
-import codecs
import sys
+from io import open

MAPPING = {}#{"!EXCLAMATIONMARK": ".PERIOD", "?QUESTIONMARK": ".PERIOD", ":COLON": ".PERIOD", ";SEMICOLON": ".PERIOD"} # Can be used to estimate 2-class performance for example

@@ -32,7 +34,7 @@ def compute_error(target_paths, predicted_paths):
t_i = 0
p_i = 0

-with codecs.open(target_path, 'r', 'utf-8') as target, codecs.open(predicted_path, 'r', 'utf-8') as predicted:
+with open(target_path, 'r', encoding='utf-8') as target, open(predicted_path, 'r', encoding='utf-8') as predicted:

target_stream = target.read().split()
predicted_stream = predicted.read().split()
@@ -91,8 +93,8 @@ def compute_error(target_paths, predicted_paths):
overall_fp = 0.0
overall_fn = 0.0

-print "-"*46
-print "{:<16} {:<9} {:<9} {:<9}".format('PUNCTUATION','PRECISION','RECALL','F-SCORE')
+print("-"*46)
+print("{:<16} {:<9} {:<9} {:<9}".format('PUNCTUATION','PRECISION','RECALL','F-SCORE'))
for p in data.PUNCTUATION_VOCABULARY:

if p == data.SPACE:
@@ -106,14 +108,14 @@
precision = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_positives[p])) if p in false_positives else nan
recall = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_negatives[p])) if p in false_negatives else nan
f_score = (2. * precision * recall / (precision + recall)) if (precision + recall) > 0 else nan
-print u"{:<16} {:<9} {:<9} {:<9}".format(punctuation, round(precision,3)*100, round(recall,3)*100, round(f_score,3)*100).encode('utf-8')
-print "-"*46
+print(u"{:<16} {:<9} {:<9} {:<9}".format(punctuation, round(precision,3)*100, round(recall,3)*100, round(f_score,3)*100).encode('utf-8'))
+print("-"*46)
pre = overall_tp/(overall_tp+overall_fp) if overall_fp else nan
rec = overall_tp/(overall_tp+overall_fn) if overall_fn else nan
f1 = (2.*pre*rec)/(pre+rec) if (pre + rec) else nan
-print "{:<16} {:<9} {:<9} {:<9}".format("Overall", round(pre,3)*100, round(rec,3)*100, round(f1,3)*100)
-print "Err: %s%%" % round((100.0 - float(total_correct) / float(counter-1) * 100.0), 2)
-print "SER: %s%%" % round((substitutions + deletions + insertions) / (correct + substitutions + deletions) * 100, 1)
+print("{:<16} {:<9} {:<9} {:<9}".format("Overall", round(pre,3)*100, round(rec,3)*100, round(f1,3)*100))
+print("Err: %s%%" % round((100.0 - float(total_correct) / float(counter-1) * 100.0), 2))
+print("SER: %s%%" % round((substitutions + deletions + insertions) / (correct + substitutions + deletions) * 100, 1))


if __name__ == "__main__":
10 changes: 5 additions & 5 deletions example/dont_run_me_run_the_other_script_instead.py
@@ -1,11 +1,11 @@
# coding: utf-8

-from __future__ import division
+from __future__ import division, print_function
from nltk.tokenize import word_tokenize

import nltk
import os
-import codecs
+from io import open
import re
import sys

@@ -59,8 +59,8 @@ def process_line(line):

skipped = 0

-with codecs.open(sys.argv[2], 'w', 'utf-8') as out_txt:
-with codecs.open(sys.argv[1], 'r', 'utf-8') as text:
+with open(sys.argv[2], 'w', encoding='utf-8') as out_txt:
+with open(sys.argv[1], 'r', encoding='utf-8') as text:

for line in text:

@@ -75,4 +75,4 @@ def process_line(line):

out_txt.write(line + '\n')

-print "Skipped %d lines" % skipped
+print("Skipped %d lines" % skipped)
35 changes: 21 additions & 14 deletions main.py
@@ -1,5 +1,5 @@
# coding: utf-8
-from __future__ import division
+from __future__ import division, print_function

from collections import OrderedDict
from time import time
@@ -8,9 +8,16 @@
import data

import theano
-import cPickle
+try:
+    import cPickle
+except ImportError:
+    import _pickle as cPickle
import sys
import os.path
+try:
+    input = raw_input
+except NameError:
+    pass

import theano.tensor as T
import numpy as np
@@ -39,10 +46,10 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
P_batch = []

if len(dataset) < batch_size:
-print "WARNING: Not enough samples in '%s'. Reduce mini-batch size to %d or use a dataset with at least %d words." % (
+print("WARNING: Not enough samples in '%s'. Reduce mini-batch size to %d or use a dataset with at least %d words." % (
file_name,
len(dataset),
-MINIBATCH_SIZE * data.MAX_SEQUENCE_LEN)
+MINIBATCH_SIZE * data.MAX_SEQUENCE_LEN))

for subsequence in dataset:

@@ -88,7 +95,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):

model_file_name = "Model_%s_h%d_lr%s.pcl" % (model_name, num_hidden, learning_rate)

-print num_hidden, learning_rate, model_file_name
+print(num_hidden, learning_rate, model_file_name)

word_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE)
punctuation_vocabulary = data.iterable_to_dict(data.PUNCTUATION_VOCABULARY)
@@ -101,7 +108,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
if os.path.isfile(model_file_name):

while True:
-resp = raw_input("Found an existing model with the name %s. Do you want to:\n[c]ontinue training the existing model?\n[r]eplace the existing model and train a new one?\n[e]xit?\n>" % model_file_name)
+resp = input("Found an existing model with the name %s. Do you want to:\n[c]ontinue training the existing model?\n[r]eplace the existing model and train a new one?\n[e]xit?\n>" % model_file_name)
resp = resp.lower().strip()
if resp not in ('c', 'r', 'e'):
continue
@@ -112,7 +119,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
break

if continue_with_previous:
-print "Loading previous model state"
+print("Loading previous model state")

net, state = models.load(model_file_name, MINIBATCH_SIZE, x)
gsums, learning_rate, validation_ppl_history, starting_epoch, rng = state
@@ -122,7 +129,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
rng = np.random
rng.seed(1)

-print "Building model..."
+print("Building model...")
net = models.GRU(
rng=rng,
x=x,
@@ -170,7 +177,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
outputs=net.cost(y)
)

-print "Training..."
+print("Training...")
for epoch in range(starting_epoch, MAX_EPOCHS):
t0 = time()
total_neg_log_likelihood = 0
@@ -183,24 +190,24 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
if iteration % 100 == 0:
sys.stdout.write("PPL: %.4f; Speed: %.2f sps\n" % (np.exp(total_neg_log_likelihood / total_num_output_samples), total_num_output_samples / max(time() - t0, 1e-100)))
sys.stdout.flush()
-print "Total number of training labels: %d" % total_num_output_samples
+print("Total number of training labels: %d" % total_num_output_samples)

total_neg_log_likelihood = 0
total_num_output_samples = 0
for X, Y in get_minibatch(data.DEV_FILE, MINIBATCH_SIZE, shuffle=False):
total_neg_log_likelihood += validate_model(X, Y)
total_num_output_samples += np.prod(Y.shape)
-print "Total number of validation labels: %d" % total_num_output_samples
+print("Total number of validation labels: %d" % total_num_output_samples)

ppl = np.exp(total_neg_log_likelihood / total_num_output_samples)
validation_ppl_history.append(ppl)

-print "Validation perplexity is %s" % np.round(ppl, 4)
+print("Validation perplexity is %s" % np.round(ppl, 4))

if ppl <= best_ppl:
best_ppl = ppl
net.save(model_file_name, gsums=gsums, learning_rate=learning_rate, validation_ppl_history=validation_ppl_history, best_validation_ppl=best_ppl, epoch=epoch, random_state=rng.get_state())
elif best_ppl not in validation_ppl_history[-PATIENCE_EPOCHS:]:
-print "Finished!"
-print "Best validation perplexity was %s" % best_ppl
+print("Finished!")
+print("Best validation perplexity was %s" % best_ppl)
break
(The diffs for the remaining 4 changed files are not shown.)
