Skip to content

Commit

Permalink
remove whitespace characters
Browse files (browse the repository at this point in the history)
  • Loading branch information
Charles-Emmanuel Dias authored and committed Oct 14, 2017
1 parent 6f2f365 commit 16cee69
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion prepare_data.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -60,20 +60,24 @@ def __iter__(self):
def next(self):
    """Return the next tokenized document as a list of word strings.

    Advances the cursor ``self.x`` by one and returns the ``orth_`` text of
    every token in ``self.tok[self.x - 1]``, skipping tokens that consist
    solely of whitespace.

    Raises:
        StopIteration: when ``self.x`` has reached ``self.stop``. The cursor
            is reset to 0 first, so the following call starts again from
            the first document.
    """
    if self.x < self.stop:
        self.x += 1
        # Whitespace shouldn't be a word. The earlier check
        # (w.orth_ != " ") only rejected a single space, so tabs, newlines
        # and runs of spaces still came through as tokens; strip() leaves an
        # empty (falsy) string for all of them.
        return [w.orth_ for w in self.tok[self.x - 1] if w.orth_.strip()]
    else:
        self.x = 0
        raise StopIteration
# Python 3 looks up __next__ for iteration; expose the same function under it.
__next__ = next




# Report what is about to be built from the parsed command-line arguments.
print("Building dataset from : {}".format(args.input))
print("-> Building {} random splits".format(args.nb_splits))

nlp = spacy.load('en')

# Lazily pull the "reviewText" field out of every record of the input file.
review_texts = (x["reviewText"] for x in data_generator(args.input))
# Only the tokenizer of the loaded pipeline is run, in batches.
token_stream = nlp.tokenizer.pipe(review_texts, batch_size=10000, n_threads=8)
# Materialize every tokenized document while showing a progress bar.
tokenized = list(tqdm(token_stream, desc="Tokenizing"))



if args.create_emb:
w2vmodel = gensim.models.Word2Vec(TokIt(tokenized), size=args.emb_size, window=5, min_count=5, iter=args.epochs, max_vocab_size=args.dic_size, workers=4)
print(len(w2vmodel.wv.vocab))
Expand Down

0 comments on commit 16cee69

Please sign in to comment.