Added multiprocessing support.

balusamy · Mar 13, 2011 · 49e6061 · 49e6061
1 parent f47a471
commit 49e6061
Showing 1 changed file with 24 additions and 4 deletions.
diff --git a/annotated_wikiextractor/annotated_wikiextractor.py b/annotated_wikiextractor/annotated_wikiextractor.py
@@ -3,6 +3,7 @@
 #  Version: 0.1 (Jan 26, 2010)
 #  Author: Joachim Daiber ([email protected])
 # =============================================================================
+from multiprocessing import Pool
 
 # =============================================================================
 #
@@ -54,6 +55,7 @@
 import wikiextractor
 
 prefix = 'http://en.wikipedia.org/wiki/'
+number_of_workers = 2
 keep_anchors = False
 
 """
@@ -109,10 +111,9 @@ def extract(self, wiki_document):
         for m in ms:              
             if urllib.quote("#") not in m.group(1) or keep_anchors:
                 annotations.append({
-                    "id"    :   m.group(1), 
-                    "label" :   m.group(2), 
-                    "from"  :   m.start() - deltaStringLength, 
-                    "to"    :   m.start() + len(m.group(2)) - deltaStringLength
+                    "uri"    :   m.group(1), 
+                    "surface_form" :   m.group(2), 
+                    "offset"  :   m.start() - deltaStringLength
                 })
 
             deltaStringLength += len(m.group(0)) - len(m.group(2))
@@ -126,6 +127,25 @@ def extract(self, wiki_document):
 
         #Return the AnnotatedWikiDocument
         return annotated_wiki_document
+
+
+def process_data(input_file, wiki_extractor, output_splitter):
+    """Send the lines of each <page>...</page> block of input_file to a worker pool.
+
+    Lines are decoded as UTF-8 and stripped; wiki_extractor and output_splitter go
+    unchanged to wikiextractor.process_page. Returns once all queued tasks finish."""
+    pool = Pool(processes=number_of_workers)
+    page = []
+    for line in input_file:
+        line = line.decode('utf-8').strip()
+        if line == '<page>':
+            page = []
+        elif line == '</page>':
+            pool.apply_async(wikiextractor.process_page, (page, wiki_extractor, output_splitter))
+        else:
+            page.append(line)
+    pool.close()  # join() is only valid after close()/terminate(); it raises otherwise
+    pool.join()
 
 def main():
     script_name = os.path.basename(sys.argv[0])