Skip to content

Commit

Permalink
fixed content processor
Browse files Browse the repository at this point in the history
Fixed the content processor so it properly gives 500 words to each pool.
Also removed punctuation from words.
  • Loading branch information
Ethan Blackburn committed Jan 8, 2013
1 parent 8169ee5 commit 3d0dcc3
Showing 1 changed file with 21 additions and 8 deletions.
29 changes: 21 additions & 8 deletions content_processor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from multiprocessing import Pool
import re, sys, logging
import re, sys, logging, string

from ready_queue import ready_queue

Expand All @@ -9,13 +9,21 @@ def rankKeywords(text):
invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
ranks = {}
text = text.split(' ')
exclude = set(string.punctuation)
for t in text:
#remove punctuation if attached to word
temp = t
t = ''
for i in range(len(temp)):
if(temp[i] not in exclude):
t += temp[i]
t = t.strip()
if t in invalid_keywords:
continue
if not ranks.has_key(t):
ranks[t] = 1
else:
ranks[t] += 1
ranks[t] += 1
return ranks

def stripPunctuation(text):
Expand Down Expand Up @@ -83,13 +91,18 @@ def processBody(self):
offset = 0
i = 0
l = []
while True:
cont = True
while cont:
#this divides the text into sets of 500 words
#set j to the index of the last letter of the 500th word
j = self.findnth(self.text[i:],' ',500)
offset += j
#if only 500 words or less are left
if j == -1:
break
l.append(self.text[i:j])
i = offset + j+1
cont = False
#Should append a string that contains 500 words for each loop(except the last loop) to l
#last loop should append a string with 500 words or less to l
l.append(self.text[i:i+j])
i += j+1
logger.debug("processing with %i threads" % len(l))
try:
if len(l) == 0:
Expand Down Expand Up @@ -136,4 +149,4 @@ def getDataDict(self):
for k,v in self.keywords.items():
if v < 3:
del self.keywords[k]
return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}
return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}

0 comments on commit 3d0dcc3

Please sign in to comment.