tokenizer support
Petr Belohlavek committed Feb 6, 2016
1 parent d574e95 commit 704af88
Showing 3 changed files with 24 additions and 6 deletions.
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,2 +1,3 @@
 six >= 1.10.0
 unicodecsv >= 0.14.0
+nltk >= 3.1
23 changes: 20 additions & 3 deletions src/create_ubuntu_dataset.py
@@ -6,6 +6,7 @@
 from six.moves import urllib
 import tarfile
 import csv
+import nltk
 
 __author__ = 'rkadlec'

@@ -289,8 +290,14 @@ def create_eval_dataset(args, file_list_csv):
     w.writerow(header)
 
     for row in data_set:
-        translated_row = [row[0], row[1]]
-        translated_row.extend(row[2])
+        if args.tokenize:
+            translated_row = ["", ""]
+            translated_row[0] = " ".join(nltk.word_tokenize(row[0]))
+            translated_row[1] = " ".join(nltk.word_tokenize(row[1]))
+            translated_row.extend(map(lambda x: " ".join(nltk.word_tokenize(x)), row[2]))
+        else:
+            translated_row = [row[0], row[1]]
+            translated_row.extend(row[2])
         w.writerow(translated_row)
     print("Dataset stored in: {}".format(args.output))

@@ -311,7 +318,14 @@ def train_cmd(args):
     # header
     w.writerow(["Context", "Utterance", "Label"])
     for row in train_set:
-        w.writerow(row)
+        if args.tokenize:
+            tokenized_row = []
+            tokenized_row.append(" ".join(nltk.word_tokenize(row[0])))
+            tokenized_row.append(" ".join(nltk.word_tokenize(row[1])))
+            tokenized_row.append(row[2])
+        else:
+            tokenized_row = row
+        w.writerow(tokenized_row)
     print("Train data stored in: {}".format(args.output))
 
 def valid_cmd(args):
@@ -334,6 +348,9 @@ def test_cmd(args):
     parser.add_argument('-o', '--output', default=None,
                         help='output csv')
 
+    parser.add_argument('-t', '--tokenize', action='store_true',
+                        help='tokenize the output')
+
    subparsers = parser.add_subparsers(help='sub-command help')
 
    parser_train = subparsers.add_parser('train', help='trainset generator')
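
In short, with -t/--tokenize set, every context and utterance (and each candidate response in the eval sets) is passed through NLTK's word_tokenize and the resulting tokens are rejoined with single spaces, so downstream consumers can recover them with a plain whitespace split. A minimal sketch of that normalization, outside the dataset script (the example strings are mine, and NLTK's 'punkt' models must be downloaded once for word_tokenize to run):

    import nltk

    nltk.download('punkt')  # one-time: word_tokenize needs the Punkt models

    # Hypothetical (context, utterance) pair; not taken from the dataset.
    row = ["Hi, how are you?", "I'm fine, thanks!"]
    tokenized = [" ".join(nltk.word_tokenize(text)) for text in row]
    print(tokenized)
    # e.g. ['Hi , how are you ?', "I 'm fine , thanks !"]
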
6 changes: 3 additions & 3 deletions src/generate.sh
mode changed 100755 → 100644
@@ -1,3 +1,3 @@
-python create_ubuntu_dataset.py --output 'train.csv' 'train'
-python create_ubuntu_dataset.py --output 'test.csv' 'test'
-python create_ubuntu_dataset.py --output 'valid.csv' 'valid'
+python create_ubuntu_dataset.py -t --output 'train.csv' 'train'
+python create_ubuntu_dataset.py -t --output 'test.csv' 'test'
+python create_ubuntu_dataset.py -t --output 'valid.csv' 'valid'
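
Since generate.sh now passes -t for all three splits, the generated CSVs contain space-joined tokens. A minimal sketch of reading them back (file name from generate.sh, column layout from the train header in the diff above; assumes the script has already been run):

    import csv

    with open('train.csv') as f:
        reader = csv.reader(f)
        next(reader)  # skip the Context, Utterance, Label header row
        for context, utterance, label in reader:
            # A plain whitespace split recovers the NLTK tokens.
            print(len(context.split()), len(utterance.split()), label)
            break
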
