1
+ __author__ = 'dowling'
2
+ import datetime
3
+ import logging
4
+
5
# Root-logger configuration: every record (DEBUG and above) is sent both to a
# log file whose name embeds the start time and to a console stream handler,
# using one shared line format.
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(name)-18s: %(message)s")
ln = logging.getLogger()

fileHandler = logging.FileHandler("fix_corpus_log%s.txt" % datetime.datetime.now().isoformat())
consoleHandler = logging.StreamHandler()

# Attach the handlers in the same order as before: file first, console second.
for _handler in (fileHandler, consoleHandler):
    _handler.setFormatter(logFormatter)
    ln.addHandler(_handler)

ln.setLevel(logging.DEBUG)
19
+
20
+ import re
21
+ import string
22
+ from gensim import utils
23
+
24
# 256-entry byte translation table that maps every ASCII punctuation character
# to a single space and leaves all other bytes unchanged.
# string.maketrans only exists on Python 2; on Python 3 the equivalent table
# builder is bytes.maketrans, which requires bytes arguments.  The `or` only
# evaluates bytes.maketrans when string.maketrans is missing.
_make_table = getattr(string, "maketrans", None) or bytes.maketrans
replace_punctuation = _make_table(
    string.punctuation.encode("ascii"),
    b" " * len(string.punctuation),
)
25
+
26
class PreprocessingLineSentence(object):
    """Iterate over a corpus file, yielding each line as a list of tokens.

    Every punctuation character is replaced by a space, except inside
    ``DBPEDIA_ID/...`` tokens, which are copied through verbatim.  The file
    is re-opened on every iteration, so the object can be iterated repeatedly.
    """

    # An entity marker: the literal prefix followed by everything up to the
    # next whitespace character.  Compiled once instead of once per line.
    _DBPEDIA_ID = re.compile(r"DBPEDIA_ID/\S+")

    def __init__(self, path_to_corpus):
        """Store the corpus path; nothing is opened until iteration starts."""
        self.path = path_to_corpus

    def __iter__(self):
        """Yield, for each line of the corpus, its whitespace-separated tokens.

        Each cleaned line is passed through ``utils.to_unicode`` before being
        split.  Progress is logged at DEBUG level every 10000 lines.
        """
        with utils.smart_open(self.path) as fin:
            for line_no, line in enumerate(fin):
                if line_no % 10000 == 0:
                    ln.debug("Processed %s lines", line_no)

                # Replace all punctuation with a space, unless it is inside a
                # DBPEDIA_ID: alternate between translated plain text and
                # untouched entity markers.
                line_parts = []
                start_at = 0
                for match in self._DBPEDIA_ID.finditer(line):
                    plain = line[start_at:match.start()]
                    line_parts.append(plain.translate(replace_punctuation))
                    line_parts.append(match.group(0))
                    start_at = match.end()

                # Take the remainder up to the real end of the line.  Slicing
                # with -1 would drop the last character of a final line that
                # has no trailing newline; split() below discards the newline
                # (and any other surrounding whitespace) anyway.
                line_parts.append(line[start_at:].translate(replace_punctuation))

                yield utils.to_unicode("".join(line_parts)).split()
54
+
55
+
56
def fix_corpus(path_to_corpus):
    """Write a punctuation-cleaned copy of a corpus next to the original.

    The corpus is read through ``PreprocessingLineSentence`` and written to
    ``path_to_corpus + "_fixed"``, one output line per input line, with the
    tokens of each line separated by single spaces.

    :param path_to_corpus: path of the corpus file to clean.
    """
    # Local import so this function does not depend on the file's import block.
    import io

    fixed = PreprocessingLineSentence(path_to_corpus)
    # io.open with an explicit encoding accepts unicode text on both Python 2
    # and 3, so non-ASCII tokens do not fail on an implicit ASCII encode.
    with io.open(path_to_corpus + "_fixed", "w", encoding="utf-8") as f:
        for tokens in fixed:
            # The iterator yields a list of tokens, not a string; passing the
            # list straight to write() raises TypeError.
            f.write(u" ".join(tokens) + u"\n")
0 commit comments