mr_wordcount checkin for wu

meklf · Jan 12, 2012 · 0d9cc92 · 0d9cc92
1 parent c8040df
commit 0d9cc92
Show file tree

Hide file tree

Showing 6 changed files with 85 additions and 0 deletions.
diff --git a/day5/emails_to_json.py b/day5/emails_to_json.py
@@ -0,0 +1,34 @@
+import sys
+sys.path.append('../resources/util/')
+from email_util import EmailWalker
+from mrjob.protocol import JSONValueProtocol
+import os
+
+def root_to_json(root_dir, output_file):
+    walker = EmailWalker(root_dir)
+    output = open(output_file, "w")
+
+    for email in walker:
+        email.date = str(email.date)
+        line = JSONValueProtocol.write(None, email.__dict__) + '\n'
+        output.write(line)
+
+    output.close()
+
+def all_folders(root_container, output_dir):
+    for file in os.listdir(root_container):
+        path = os.path.join(root_container, file)
+        if os.path.isdir(path):
+            output = os.path.join(output_dir, file+".json")
+            root_to_json(path, output)
+
+if __name__ == "__main__":
+    if len(sys.argv) == 4 and sys.argv[1] == "one_root":
+        root_to_json(sys.argv[2], sys.argv[3])
+    elif len(sys.argv) == 4 and sys.argv[1] == "many_roots":
+        all_folders(sys.argv[2], sys.argv[3])
+    else:
+        print "one_root walks a single email root folder and emits a json file for it"
+        print "many_roots walks all of the email roots in a directory and outputs a json file for each one"
+        print "Arguments: [one_root|many_roots] [path_to_root|path_to_root_container] [output_file|output_directory"
+
diff --git a/day5/mapreduce.py b/day5/mapreduce.py
@@ -1,6 +1,7 @@
 """
 export AWS_ACCESS_KEY_ID='my_key_id'
 export AWS_SECRET_ACCESS_KEY='my_access_id'
+On Windows, use `set VARIABLE=value` instead of `export VARIABLE='value'`.
 
 python -m mrjob.tools.emr.create_job_flow --num-ec2-instances=5
 python -m mrjob.tools.emr.terminate_job_flow JOBFLOWID
@@ -211,6 +212,8 @@ def reducer(self, term, occurrences):
 
 S3 data is stored in **buckets**.  Within a bucket you create, you can store as many files or folders as you'd like.  The name of your bucket has to be unique across all of the people that store their stuff in S3.  Want to make your own bucket?  Let's do this!
 
+
+
 """
 
 

diff --git a/day5/mr_wordcount.py b/day5/mr_wordcount.py
@@ -0,0 +1,18 @@
+import sys
+from mrjob.protocol import JSONValueProtocol
+from mrjob.job import MRJob
+from term_tools import get_terms
+
+class MRWordCount(MRJob):
+    INPUT_PROTOCOL = JSONValueProtocol
+    OUTPUT_PROTOCOL = JSONValueProtocol
+
+    def mapper(self, key, email):
+        for term in get_terms(email['text']):
+            yield term, 1
+
+    def reducer(self, word, howmany):
+        yield None, {'term': word, 'count': sum(howmany)}
+
+if __name__ == '__main__':
+        MRWordCount.run()
diff --git a/day5/package.tar b/day5/package.tar
diff --git a/day5/simple_wordcount.py b/day5/simple_wordcount.py
@@ -0,0 +1,15 @@
+import re
+import sys
+from collections import defaultdict
+from mrjob.protocol import JSONValueProtocol
+from term_tools import get_terms
+
+input = open(sys.argv[1])
+words = defaultdict(lambda: 0)
+for line in input:
+    email = JSONValueProtocol.read(line)[1]
+    for term in get_terms(email['text']):
+        words[term] += 1
+
+for word, count in words.items():
+    print word, count
diff --git a/day5/term_tools.py b/day5/term_tools.py
@@ -0,0 +1,15 @@
+import re
+
+STOPWORDS = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'])
+
+word_regex = '^[a-z][a-z\'-]+[a-z]$'
+def get_terms(s):
+    s = s.lower()
+    arr = s.split()
+    terms = []
+    for term in arr:
+        if re.match(word_regex, term) != None:
+            terms.append(term)
+    terms = filter(lambda term: len(term) > 3, terms)
+    terms = filter(lambda term: term not in STOPWORDS, terms)
+    return terms