Skip to content

Commit

Permalink
mr_wordcount checkin for wu
Browse files Browse the repository at this point in the history
  • Loading branch information
marcua committed Jan 12, 2012
1 parent c8040df commit 0d9cc92
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 0 deletions.
34 changes: 34 additions & 0 deletions day5/emails_to_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import sys
sys.path.append('../resources/util/')
from email_util import EmailWalker
from mrjob.protocol import JSONValueProtocol
import os

def root_to_json(root_dir, output_file):
    """Write every email found under ``root_dir`` to ``output_file``, one JSON document per line.

    Args:
        root_dir: path handed to ``EmailWalker``; iterating the walker yields
            the email objects to serialize.
        output_file: path of the file to create (or overwrite).

    Side effects:
        Each email's ``date`` attribute is replaced by its ``str()`` form
        before the email's ``__dict__`` is serialized.
    """
    walker = EmailWalker(root_dir)
    # The context manager closes the output file even when walking or
    # serializing an email raises part-way through.
    with open(output_file, "w") as output:
        for email in walker:
            # Stringify the date before handing the attribute dict to the
            # JSON protocol.
            email.date = str(email.date)
            line = JSONValueProtocol.write(None, email.__dict__) + '\n'
            output.write(line)

def all_folders(root_container, output_dir):
    """Convert each sub-directory of ``root_container`` into its own JSON file.

    For every directory entry ``NAME`` directly inside ``root_container``,
    ``root_to_json`` is called to write ``output_dir/NAME.json``. Entries that
    are not directories are ignored.
    """
    for entry in os.listdir(root_container):
        email_root = os.path.join(root_container, entry)
        if not os.path.isdir(email_root):
            continue
        json_path = os.path.join(output_dir, entry + ".json")
        root_to_json(email_root, json_path)

if __name__ == "__main__":
    # Command line: <mode> <input path> <output path>, where mode selects
    # between converting a single email root and converting every root inside
    # a container directory. Anything else prints the usage text.
    if len(sys.argv) == 4 and sys.argv[1] == "one_root":
        root_to_json(sys.argv[2], sys.argv[3])
    elif len(sys.argv) == 4 and sys.argv[1] == "many_roots":
        all_folders(sys.argv[2], sys.argv[3])
    else:
        # Single-argument print(...) produces the same output under the
        # Python 2 print statement and the Python 3 print function.
        print("one_root walks a single email root folder and emits a json file for it")
        print("many_roots walks all of the email roots in a directory and outputs a json file for each one")
        print("Arguments: [one_root|many_roots] [path_to_root|path_to_root_container] [output_file|output_directory]")

3 changes: 3 additions & 0 deletions day5/mapreduce.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
export AWS_ACCESS_KEY_ID='my_key_id'
export AWS_SECRET_ACCESS_KEY='my_access_id'
on windows: set VARIABLE=value
python -m mrjob.tools.emr.create_job_flow --num-ec2-instances=5
python -m mrjob.tools.emr.terminate_job_flow JOBFLOWID
Expand Down Expand Up @@ -211,6 +212,8 @@ def reducer(self, term, occurrences):
S3 data is stored in ** buckets **. Within a bucket you create, you can store as many files or folders as you'd like. The name of your bucket has to be unique across all of the people that store their stuff in S3. Want to make your own bucket? Let's do this!
"""


Expand Down
18 changes: 18 additions & 0 deletions day5/mr_wordcount.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import sys
from mrjob.protocol import JSONValueProtocol
from mrjob.job import MRJob
from term_tools import get_terms

class MRWordCount(MRJob):
    """MapReduce job that counts how many times each term occurs in email text.

    Both input and output use ``JSONValueProtocol``; the mapper reads the
    ``'text'`` entry of each incoming email value.
    """

    INPUT_PROTOCOL = JSONValueProtocol
    OUTPUT_PROTOCOL = JSONValueProtocol

    def mapper(self, key, email):
        """Emit ``(term, 1)`` for each term ``get_terms`` extracts from the email text."""
        body = email['text']
        for term in get_terms(body):
            yield (term, 1)

    def reducer(self, word, howmany):
        """Sum the occurrences of ``word`` and emit one record with a ``None`` key."""
        total = sum(howmany)
        record = {'term': word, 'count': total}
        yield (None, record)


if __name__ == '__main__':
    MRWordCount.run()
Binary file added day5/package.tar
Binary file not shown.
15 changes: 15 additions & 0 deletions day5/simple_wordcount.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import re
import sys
from collections import defaultdict
from mrjob.protocol import JSONValueProtocol
from term_tools import get_terms

# Count how often each term appears across all emails in the file named by the
# first command-line argument (one JSON-encoded email per line).
words = defaultdict(int)

# The context manager closes the input file once the tally is complete, and
# the handle no longer shadows the builtin ``input``.
with open(sys.argv[1]) as input_file:
    for line in input_file:
        # The decoded email is taken from index 1 of what read() returns.
        email = JSONValueProtocol.read(line)[1]
        for term in get_terms(email['text']):
            words[term] += 1

# One "term count" pair per line; the formatted single-argument print works
# under both the Python 2 print statement and the Python 3 print function.
for word, count in words.items():
    print("%s %s" % (word, count))
15 changes: 15 additions & 0 deletions day5/term_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import re

# Words that get_terms always discards, regardless of how often they occur.
STOPWORDS = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'])

# A candidate term starts and ends with a lowercase letter and has one or more
# letters, apostrophes or hyphens in between (so at least three characters).
word_regex = '^[a-z][a-z\'-]+[a-z]$'
# Compiled once at import time so get_terms does not look the pattern up again
# for every token.
_WORD_PATTERN = re.compile(word_regex)


def get_terms(s):
    """Return the list of countable terms in the string ``s``.

    The text is lowercased and split on whitespace. A token is kept only when
    it matches ``word_regex``, is longer than three characters, and is not in
    ``STOPWORDS``. Order and duplicates are preserved.

    Args:
        s: the text to tokenize.

    Returns:
        A list of lowercase term strings (empty when nothing qualifies). A
        real list is returned on every Python version, not a lazy ``filter``
        object.
    """
    return [
        term
        for term in s.lower().split()
        if _WORD_PATTERN.match(term) is not None
        and len(term) > 3
        and term not in STOPWORDS
    ]

0 comments on commit 0d9cc92

Please sign in to comment.