forked from dataiap/dataiap
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
85 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import sys | ||
sys.path.append('../resources/util/') | ||
from email_util import EmailWalker | ||
from mrjob.protocol import JSONValueProtocol | ||
import os | ||
|
||
def root_to_json(root_dir, output_file):
    """Walk one email root folder and write its emails as JSON lines.

    Every email yielded by ``EmailWalker(root_dir)`` has its ``date``
    attribute replaced by ``str(email.date)`` and is then serialized from
    its ``__dict__`` with ``JSONValueProtocol.write`` (key ``None``), one
    line per email.

    Args:
        root_dir: path handed to ``EmailWalker``.
        output_file: path of the file to create or overwrite.
    """
    walker = EmailWalker(root_dir)

    # The with-block (Python 2.6+) closes the output file even when the
    # walker or the serializer raises part-way through.
    with open(output_file, "w") as output:
        for email in walker:
            # NOTE(review): presumably the raw date object is not
            # JSON-serializable, hence the string conversion -- confirm
            # against EmailWalker.
            email.date = str(email.date)
            line = JSONValueProtocol.write(None, email.__dict__) + '\n'
            output.write(line)
|
||
def all_folders(root_container, output_dir):
    """Run ``root_to_json`` on every sub-directory of ``root_container``.

    For a sub-directory called ``name`` the JSON output is written to
    ``<output_dir>/name.json``. Entries that are not directories are
    skipped.

    Args:
        root_container: directory whose sub-directories are email roots.
        output_dir: directory that receives one ``.json`` file per root.
    """
    for entry in os.listdir(root_container):
        candidate = os.path.join(root_container, entry)
        if not os.path.isdir(candidate):
            continue
        destination = os.path.join(output_dir, entry + ".json")
        root_to_json(candidate, destination)
|
||
if __name__ == "__main__":
    # Command-line entry point: argv[1] selects the mode, argv[2] is the
    # input path and argv[3] is the output path.
    if len(sys.argv) == 4 and sys.argv[1] == "one_root":
        root_to_json(sys.argv[2], sys.argv[3])
    elif len(sys.argv) == 4 and sys.argv[1] == "many_roots":
        all_folders(sys.argv[2], sys.argv[3])
    else:
        # Single-argument print(...) calls produce the same output as the
        # Python 2 print statement and also parse under Python 3.
        print("one_root walks a single email root folder and emits a json file for it")
        print("many_roots walks all of the email roots in a directory and outputs a json file for each one")
        print("Arguments: [one_root|many_roots] [path_to_root|path_to_root_container] [output_file|output_directory]")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import sys | ||
from mrjob.protocol import JSONValueProtocol | ||
from mrjob.job import MRJob | ||
from term_tools import get_terms | ||
|
||
class MRWordCount(MRJob):
    """mrjob job that counts how often each term occurs in email texts.

    Both the input and the output protocol are ``JSONValueProtocol``.
    """

    INPUT_PROTOCOL = JSONValueProtocol
    OUTPUT_PROTOCOL = JSONValueProtocol

    def mapper(self, key, email):
        """Emit ``(term, 1)`` for every term of ``email['text']``.

        ``key`` is not used; terms come from ``get_terms``.
        """
        for token in get_terms(email['text']):
            yield token, 1

    def reducer(self, word, howmany):
        """Emit one record per term holding the sum of its counts.

        The output key is ``None``; the value is a dict with the keys
        ``'term'`` and ``'count'``.
        """
        total = sum(howmany)
        yield None, {'term': word, 'count': total}


if __name__ == '__main__':
    # Hand control to mrjob when the file is executed as a script.
    MRWordCount.run()
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import re | ||
import sys | ||
from collections import defaultdict | ||
from mrjob.protocol import JSONValueProtocol | ||
from term_tools import get_terms | ||
|
||
# Sequential (single-process) word count over a file of JSON-encoded emails.
# sys.argv[1] is the path of the input file, read one line per email.
words = defaultdict(int)

# The with-block closes the input file once it has been consumed, and the
# handle no longer shadows the ``input`` builtin.
with open(sys.argv[1]) as email_lines:
    for line in email_lines:
        # Index 1 of the decoded result is used as the email record.
        email = JSONValueProtocol.read(line)[1]
        for term in get_terms(email['text']):
            words[term] += 1

# "%s %s" formatting reproduces the space-separated output of the former
# ``print word, count`` statement and also parses under Python 3.
for word, count in words.items():
    print("%s %s" % (word, count))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import re | ||
|
||
# Lower-case words that get_terms() drops from its result.
STOPWORDS = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'])

# A term starts and ends with a letter; letters, apostrophes and hyphens are
# allowed in between, which makes three characters the shortest match.
word_regex = r"^[a-z][a-z'-]+[a-z]$"

# Compiled once at import time so get_terms() does not look the pattern up
# from its string form for every token.
_WORD_PATTERN = re.compile(word_regex)


def get_terms(s):
    """Return the list of terms found in the string ``s``.

    ``s`` is lower-cased and split on whitespace. A token is kept when it
    matches ``word_regex``, is longer than three characters and is not in
    ``STOPWORDS``. Order and duplicates are preserved.

    Args:
        s: text to tokenize.

    Returns:
        A list of term strings; always a real list, on Python 2 and 3.
    """
    return [
        term
        for term in s.lower().split()
        if _WORD_PATTERN.match(term) is not None
        and len(term) > 3
        and term not in STOPWORDS
    ]