Python 3 compatibility:
- Use io.open instead of codecs.open/codecs.getwriter
- Use input() if raw_input is not available
- Use _pickle if cPickle is not available
- Use print_function from __future__
mkhon committed Jun 8, 2018
1 parent 5161946 commit 552adb8
Showing 10 changed files with 123 additions and 83 deletions.
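The four bullet points in the commit message reduce to a handful of import-time shims that keep a single code path working on both Python 2 and Python 3. The following is a minimal standalone sketch of those idioms, not code from the commit itself; the file name 'example.txt' is purely illustrative.

# Sketch of the compatibility idioms used throughout this commit (illustrative only).
from __future__ import print_function   # print() behaves the same on Python 2 and 3

from io import open                      # io.open accepts encoding= on both versions

try:
    import cPickle                       # Python 2: C-accelerated pickle
except ImportError:
    import _pickle as cPickle            # Python 3: same module under a new name

try:
    input = raw_input                    # Python 2: make input() return a plain string
except NameError:
    pass                                 # Python 3: input() already does this

# 'example.txt' is a hypothetical file name, not one used by the project.
with open('example.txt', 'w', encoding='utf-8') as f:
    f.write(u'unicode text\n')

print('pickling with %s' % cPickle.__name__)

Keeping the old names (cPickle, input) means the rest of the code base needs no changes at its call sites.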
4 changes: 2 additions & 2 deletions convert_to_readable.py
@@ -1,5 +1,5 @@
import sys
-import codecs
+from io import open
from data import EOS_TOKENS, PUNCTUATION_VOCABULARY

if __name__ == "__main__":
@@ -16,7 +16,7 @@

with_newlines = len(sys.argv) > 3 and bool(int(sys.argv[3]))

-with codecs.open(input_file, 'r', 'utf-8') as in_f, codecs.open(output_file, 'w', 'utf-8') as out_f:
+with open(input_file, 'r', encoding='utf-8') as in_f, open(output_file, 'w', encoding='utf-8') as out_f:
last_was_eos = True
first = True
for token in in_f.read().split():
31 changes: 19 additions & 12 deletions data.py
@@ -1,12 +1,19 @@
# coding: utf-8
-from __future__ import division
+from __future__ import division, print_function

import random
import os
import sys
import operator
-import cPickle
-import codecs
+try:
+    import cPickle
+except ImportError:
+    import _pickle as cPickle
+try:
+    input = raw_input
+except NameError:
+    pass
+from io import open
import fnmatch
import shutil

@@ -77,16 +84,16 @@ def write_vocabulary(vocabulary, file_name):
if UNK not in vocabulary:
vocabulary.append(UNK)

-print "Vocabulary size: %d" % len(vocabulary)
+print("Vocabulary size: %d" % len(vocabulary))

-with codecs.open(file_name, 'w', 'utf-8') as f:
+with open(file_name, 'w', encoding='utf-8') as f:
f.write("\n".join(vocabulary))

def iterable_to_dict(arr):
return dict((x.strip(), i) for (i, x) in enumerate(arr))

def read_vocabulary(file_name):
-with codecs.open(file_name, 'r', 'utf-8') as f:
+with open(file_name, 'r', encoding='utf-8') as f:
return iterable_to_dict(f.readlines())

def write_processed_dataset(input_files, output_file):
@@ -115,7 +122,7 @@ def write_processed_dataset(input_files, output_file):

for input_file in input_files:

-with codecs.open(input_file, 'r', 'utf-8') as text:
+with open(input_file, 'r', encoding='utf-8') as text:

for line in text:

@@ -196,7 +203,7 @@ def write_processed_dataset(input_files, output_file):

last_eos_idx = 0 # sequence always starts with a new sentence

-print "%.2f%% UNK-s in %s" % (num_unks / num_total * 100, output_file)
+print("%.2f%% UNK-s in %s" % (num_unks / num_total * 100, output_file))

dump(data, output_file)

@@ -224,15 +231,15 @@ def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, tra
train_txt_files.append(path)

if create_vocabulary and not pretrained_embeddings_path:
-with codecs.open(path, 'r', 'utf-8') as text:
+with open(path, 'r', encoding='utf-8') as text:
for line in text:
add_counts(word_counts, line)

if create_vocabulary:
if pretrained_embeddings_path:
vocabulary = []
embeddings = []
-with codecs.open(pretrained_embeddings_path, 'r', 'utf-8') as f:
+with open(pretrained_embeddings_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.split()
w = line[0]
@@ -261,7 +268,7 @@ def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, tra
if os.path.exists(DATA_PATH):

while True:
-resp = raw_input("Data path '%s' already exists. Do you want to:\n[r]eplace the files in existing data path?\n[e]xit?\n>" % DATA_PATH)
+resp = input("Data path '%s' already exists. Do you want to:\n[r]eplace the files in existing data path?\n[e]xit?\n>" % DATA_PATH)
resp = resp.lower().strip()
if resp not in ('r', 'e'):
continue
@@ -281,4 +288,4 @@ def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, tra
# Stage 2
if len(sys.argv) > 2:
path2 = sys.argv[2]
-create_dev_test_train_split_and_vocabulary(path2, False, TRAIN_FILE2, DEV_FILE2, TEST_FILE2)
+create_dev_test_train_split_and_vocabulary(path2, False, TRAIN_FILE2, DEV_FILE2, TEST_FILE2)
16 changes: 10 additions & 6 deletions demo_play_with_model.py
@@ -1,6 +1,6 @@
# coding: utf-8

-from __future__ import division
+from __future__ import division, print_function

from nltk.tokenize import word_tokenize

@@ -9,8 +9,8 @@

import theano
import sys
-import codecs
import re
+from io import open

import theano.tensor as T
import numpy as np
@@ -106,10 +106,10 @@ def punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuat

x = T.imatrix('x')

-print "Loading model parameters..."
+print("Loading model parameters...")
net, _ = models.load(model_file, 1, x)

-print "Building model..."
+print("Building model...")
predict = theano.function(inputs=[x], outputs=net.y)
word_vocabulary = net.x_vocabulary
punctuation_vocabulary = net.y_vocabulary
@@ -120,11 +120,15 @@ def punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuat
tokenizer = word_tokenize
untokenizer = lambda text: text.replace(" '", "'").replace(" n't", "n't").replace("can not", "cannot")

-with codecs.getwriter('utf-8')(sys.stdout) as f_out:
+with open(sys.stdout.fileno(), 'w', encoding='utf-8', closefd=False) as f_out:
while True:
-text = raw_input("\nTEXT: ").decode('utf-8')
+try:
+    text = raw_input("\nTEXT: ").decode('utf-8')
+except NameError:
+    text = input("\nTEXT: ")

words = [w for w in untokenizer(' '.join(tokenizer(text))).split()
if w not in punctuation_vocabulary and w not in human_readable_punctuation_vocabulary]

punctuate(predict, word_vocabulary, punctuation_vocabulary, reverse_punctuation_vocabulary, reverse_word_vocabulary, words, f_out, show_unk)
+f_out.flush()
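The codecs.getwriter replacement above is worth a note: wrapping sys.stdout by its file descriptor with closefd=False lets the script write UTF-8 text through a context manager without closing the interpreter's real standard output when the block exits. A minimal standalone sketch of the same pattern (not the demo script itself):

# Standalone sketch: UTF-8 text wrapper around stdout for Python 2 and 3.
import sys
from io import open

# closefd=False: closing the wrapper must not close file descriptor 1.
with open(sys.stdout.fileno(), 'w', encoding='utf-8', closefd=False) as out:
    out.write(u'example output\n')
    out.flush()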
20 changes: 11 additions & 9 deletions error_calculator.py
@@ -4,10 +4,12 @@
Computes and prints the overall classification error and precision, recall, F-score over punctuations.
"""

+from __future__ import print_function
+
from numpy import nan
import data
-import codecs
import sys
+from io import open

MAPPING = {}#{"!EXCLAMATIONMARK": ".PERIOD", "?QUESTIONMARK": ".PERIOD", ":COLON": ".PERIOD", ";SEMICOLON": ".PERIOD"} # Can be used to estimate 2-class performance for example

@@ -32,7 +34,7 @@ def compute_error(target_paths, predicted_paths):
t_i = 0
p_i = 0

-with codecs.open(target_path, 'r', 'utf-8') as target, codecs.open(predicted_path, 'r', 'utf-8') as predicted:
+with open(target_path, 'r', encoding='utf-8') as target, open(predicted_path, 'r', encoding='utf-8') as predicted:

target_stream = target.read().split()
predicted_stream = predicted.read().split()
@@ -91,8 +93,8 @@ def compute_error(target_paths, predicted_paths):
overall_fp = 0.0
overall_fn = 0.0

-print "-"*46
-print "{:<16} {:<9} {:<9} {:<9}".format('PUNCTUATION','PRECISION','RECALL','F-SCORE')
+print("-"*46)
+print("{:<16} {:<9} {:<9} {:<9}".format('PUNCTUATION','PRECISION','RECALL','F-SCORE'))
for p in data.PUNCTUATION_VOCABULARY:

if p == data.SPACE:
@@ -106,14 +108,14 @@
precision = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_positives[p])) if p in false_positives else nan
recall = (true_positives.get(p,0.) / (true_positives.get(p,0.) + false_negatives[p])) if p in false_negatives else nan
f_score = (2. * precision * recall / (precision + recall)) if (precision + recall) > 0 else nan
-print u"{:<16} {:<9} {:<9} {:<9}".format(punctuation, round(precision,3)*100, round(recall,3)*100, round(f_score,3)*100).encode('utf-8')
-print "-"*46
+print(u"{:<16} {:<9} {:<9} {:<9}".format(punctuation, round(precision,3)*100, round(recall,3)*100, round(f_score,3)*100).encode('utf-8'))
+print("-"*46)
pre = overall_tp/(overall_tp+overall_fp) if overall_fp else nan
rec = overall_tp/(overall_tp+overall_fn) if overall_fn else nan
f1 = (2.*pre*rec)/(pre+rec) if (pre + rec) else nan
-print "{:<16} {:<9} {:<9} {:<9}".format("Overall", round(pre,3)*100, round(rec,3)*100, round(f1,3)*100)
-print "Err: %s%%" % round((100.0 - float(total_correct) / float(counter-1) * 100.0), 2)
-print "SER: %s%%" % round((substitutions + deletions + insertions) / (correct + substitutions + deletions) * 100, 1)
+print("{:<16} {:<9} {:<9} {:<9}".format("Overall", round(pre,3)*100, round(rec,3)*100, round(f1,3)*100))
+print("Err: %s%%" % round((100.0 - float(total_correct) / float(counter-1) * 100.0), 2))
+print("SER: %s%%" % round((substitutions + deletions + insertions) / (correct + substitutions + deletions) * 100, 1))


if __name__ == "__main__":
10 changes: 5 additions & 5 deletions example/dont_run_me_run_the_other_script_instead.py
@@ -1,11 +1,11 @@
# coding: utf-8

-from __future__ import division
+from __future__ import division, print_function
from nltk.tokenize import word_tokenize

import nltk
import os
-import codecs
+from io import open
import re
import sys

@@ -59,8 +59,8 @@ def process_line(line):

skipped = 0

-with codecs.open(sys.argv[2], 'w', 'utf-8') as out_txt:
-with codecs.open(sys.argv[1], 'r', 'utf-8') as text:
+with open(sys.argv[2], 'w', encoding='utf-8') as out_txt:
+with open(sys.argv[1], 'r', encoding='utf-8') as text:

for line in text:

@@ -75,4 +75,4 @@ def process_line(line):

out_txt.write(line + '\n')

-print "Skipped %d lines" % skipped
+print("Skipped %d lines" % skipped)
35 changes: 21 additions & 14 deletions main.py
@@ -1,5 +1,5 @@
# coding: utf-8
-from __future__ import division
+from __future__ import division, print_function

from collections import OrderedDict
from time import time
@@ -8,9 +8,16 @@
import data

import theano
-import cPickle
+try:
+    import cPickle
+except ImportError:
+    import _pickle as cPickle
import sys
import os.path
+try:
+    input = raw_input
+except NameError:
+    pass

import theano.tensor as T
import numpy as np
@@ -39,10 +46,10 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
P_batch = []

if len(dataset) < batch_size:
-print "WARNING: Not enough samples in '%s'. Reduce mini-batch size to %d or use a dataset with at least %d words." % (
+print("WARNING: Not enough samples in '%s'. Reduce mini-batch size to %d or use a dataset with at least %d words." % (
file_name,
len(dataset),
-MINIBATCH_SIZE * data.MAX_SEQUENCE_LEN)
+MINIBATCH_SIZE * data.MAX_SEQUENCE_LEN))

for subsequence in dataset:

@@ -88,7 +95,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):

model_file_name = "Model_%s_h%d_lr%s.pcl" % (model_name, num_hidden, learning_rate)

-print num_hidden, learning_rate, model_file_name
+print(num_hidden, learning_rate, model_file_name)

word_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE)
punctuation_vocabulary = data.iterable_to_dict(data.PUNCTUATION_VOCABULARY)
@@ -101,7 +108,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
if os.path.isfile(model_file_name):

while True:
-resp = raw_input("Found an existing model with the name %s. Do you want to:\n[c]ontinue training the existing model?\n[r]eplace the existing model and train a new one?\n[e]xit?\n>" % model_file_name)
+resp = input("Found an existing model with the name %s. Do you want to:\n[c]ontinue training the existing model?\n[r]eplace the existing model and train a new one?\n[e]xit?\n>" % model_file_name)
resp = resp.lower().strip()
if resp not in ('c', 'r', 'e'):
continue
@@ -112,7 +119,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
break

if continue_with_previous:
-print "Loading previous model state"
+print("Loading previous model state")

net, state = models.load(model_file_name, MINIBATCH_SIZE, x)
gsums, learning_rate, validation_ppl_history, starting_epoch, rng = state
@@ -122,7 +129,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
rng = np.random
rng.seed(1)

-print "Building model..."
+print("Building model...")
net = models.GRU(
rng=rng,
x=x,
@@ -170,7 +177,7 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
outputs=net.cost(y)
)

-print "Training..."
+print("Training...")
for epoch in range(starting_epoch, MAX_EPOCHS):
t0 = time()
total_neg_log_likelihood = 0
@@ -183,24 +190,24 @@ def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
if iteration % 100 == 0:
sys.stdout.write("PPL: %.4f; Speed: %.2f sps\n" % (np.exp(total_neg_log_likelihood / total_num_output_samples), total_num_output_samples / max(time() - t0, 1e-100)))
sys.stdout.flush()
-print "Total number of training labels: %d" % total_num_output_samples
+print("Total number of training labels: %d" % total_num_output_samples)

total_neg_log_likelihood = 0
total_num_output_samples = 0
for X, Y in get_minibatch(data.DEV_FILE, MINIBATCH_SIZE, shuffle=False):
total_neg_log_likelihood += validate_model(X, Y)
total_num_output_samples += np.prod(Y.shape)
-print "Total number of validation labels: %d" % total_num_output_samples
+print("Total number of validation labels: %d" % total_num_output_samples)

ppl = np.exp(total_neg_log_likelihood / total_num_output_samples)
validation_ppl_history.append(ppl)

-print "Validation perplexity is %s" % np.round(ppl, 4)
+print("Validation perplexity is %s" % np.round(ppl, 4))

if ppl <= best_ppl:
best_ppl = ppl
net.save(model_file_name, gsums=gsums, learning_rate=learning_rate, validation_ppl_history=validation_ppl_history, best_validation_ppl=best_ppl, epoch=epoch, random_state=rng.get_state())
elif best_ppl not in validation_ppl_history[-PATIENCE_EPOCHS:]:
-print "Finished!"
-print "Best validation perplexity was %s" % best_ppl
+print("Finished!")
+print("Best validation perplexity was %s" % best_ppl)
break
(The diffs for the remaining 4 changed files are not shown.)
