Skip to content

Commit 179924b

Browse files
author
Philipp Dowling
committed
added corpus preprocessor that removes punctuation
1 parent a6ac8a7 commit 179924b

File tree

2 files changed

+69
-3
lines changed

2 files changed

+69
-3
lines changed

resources/fix_corpus.py

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
__author__ = 'dowling'
2+
import datetime
3+
import logging
4+
5+
# Root-logger configuration: every record from DEBUG upwards goes both to a
# timestamped log file in the working directory and to a stream handler
# (StreamHandler's default stream), using one shared line format.
ln = logging.getLogger()
ln.setLevel(logging.DEBUG)

logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(name)-18s: %(message)s")

# One log file per run; the creation time is embedded in the file name.
fileHandler = logging.FileHandler("fix_corpus_log%s.txt" % datetime.datetime.now().isoformat())
consoleHandler = logging.StreamHandler()

fileHandler.setFormatter(logFormatter)
consoleHandler.setFormatter(logFormatter)

# Attach the file handler first, then the console handler.
ln.addHandler(fileHandler)
ln.addHandler(consoleHandler)
19+
20+
import re
21+
import string
22+
from gensim import utils
23+
24+
# Translation table for str.translate(): maps every character in
# string.punctuation to a single space and leaves all other characters alone.
# NOTE: string.maketrans exists only in Python 2.
replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
25+
26+
class PreprocessingLineSentence(object):
    """Iterate over a corpus file, yielding one list of unicode tokens per line.

    Every punctuation character is replaced by a space before the line is
    split on whitespace, except inside ``DBPEDIA_ID/...`` tokens, which are
    passed through unchanged. The file is re-opened on every iteration, so an
    instance can be iterated more than once.
    """

    # A DBPEDIA_ID token runs up to the next whitespace character. Compiled
    # once here instead of being looked up again for every line.
    _DBPEDIA_ID = re.compile(r"DBPEDIA_ID/\S+")

    def __init__(self, path_to_corpus):
        """Store the corpus location; nothing is opened until iteration.

        :param path_to_corpus: path passed to ``utils.smart_open``.
        """
        self.path = path_to_corpus

    def __iter__(self):
        """Yield each corpus line as a list of unicode tokens."""
        with utils.smart_open(self.path) as fin:
            for line_no, line in enumerate(fin):
                if line_no % 10000 == 0:
                    ln.debug("Processed %s lines", line_no)

                # replace all punctuation with a space, unless it's inside a DBPEDIA_ID
                line_parts = []
                start_at = 0
                for match in self._DBPEDIA_ID.finditer(line):
                    # Text between the previous ID (or line start) and this ID.
                    line_parts.append(line[start_at:match.start()].translate(replace_punctuation))
                    # The ID itself is kept verbatim.
                    line_parts.append(match.group(0))
                    start_at = match.end()

                # Remainder after the last ID. Slice to the real end of the
                # line rather than dropping the final character: a last line
                # without a trailing newline would otherwise lose a character,
                # and split() below discards any newline anyway.
                line_parts.append(line[start_at:].translate(replace_punctuation))

                yield utils.to_unicode("".join(line_parts)).split()
54+
55+
56+
def fix_corpus(path_to_corpus):
    """Write a punctuation-free copy of a corpus next to the original.

    The corpus is read through ``PreprocessingLineSentence`` and written to
    ``path_to_corpus + "_fixed"`` as UTF-8, one input line per output line,
    with the tokens of each line separated by single spaces.

    :param path_to_corpus: path of the corpus file to clean.
    """
    # Function-scope import so the block does not depend on a file-level one.
    import io

    fixed = PreprocessingLineSentence(path_to_corpus)
    # io.open with an explicit encoding accepts the unicode tokens directly,
    # so non-ASCII text cannot fail on an implicit ASCII encode.
    with io.open(path_to_corpus + "_fixed", "w", encoding="utf-8") as f:
        for tokens in fixed:
            # The iterator yields a list of tokens, not a string: it has to
            # be joined back into a line before it can be written.
            f.write(u" ".join(tokens) + u"\n")

resources/gensim/gensim_word2vec.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import datetime
22
import logging
3+
from gensim import utils
4+
import string
35

46
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(name)-18s: %(message)s")
57

@@ -21,15 +23,19 @@
2123
from optparse import OptionParser
2224

2325
import gensim
26+
import re
2427

2528
os.system("taskset -p 0xff %d" % os.getpid())
2629

2730

31+
32+
33+
34+
2835
def read_corpus(path_to_corpus, output_path, min_count=10, size=500, window=10):
    """Train a Word2Vec model on a corpus and save it to disk.

    :param path_to_corpus: corpus file, read through PreprocessingLineSentence.
    :param output_path: destination passed to ``model.save``.
    :param min_count: forwarded to ``gensim.models.Word2Vec``.
    :param size: forwarded to ``gensim.models.Word2Vec``.
    :param window: forwarded to ``gensim.models.Word2Vec``.
    """
    # One training worker per CPU reported by the OS.
    workers = multiprocessing.cpu_count()

    # NOTE(review): PreprocessingLineSentence is not defined or imported in
    # the visible part of this file -- confirm it is in scope before running.
    sentences = PreprocessingLineSentence(path_to_corpus)

    # sg=1 selects the skip-gram training algorithm (per gensim's Word2Vec docs).
    model = gensim.models.Word2Vec(
        sentences,
        min_count=min_count,
        size=size,
        window=window,
        sg=1,
        workers=workers,
    )
    model.save(output_path)
3440

3541

0 commit comments

Comments
 (0)