Skip to content

Commit 0301cb8

Browse files
committed
Merge pull request #7 from SpikeTheMaster/master
Add Amazon and Steam parsing changes
2 parents d8faeea + 62ed335 commit 0301cb8

5 files changed

+203
-13
lines changed

README.md

+41-11
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ Elasticsearch For Beginners: Indexing your Gmail Inbox
22
=======================
33

44

5-
#### What's this all about?
5+
6+
#### What's this all about?
67

78
I recently looked at my Gmail inbox and noticed that I have well over 50k emails, taking up about 12GB of space, but there is no good way to tell which emails take up space, who I sent them to, who emails me, etc.
89

@@ -15,11 +16,11 @@ __Related tutorial:__ [Index and Search Hacker News using Elasticsearch and the
1516

1617
Set up [Elasticsearch](http://ohardt.us/es-install) and make sure it's running at [http://localhost:9200](http://localhost:9200)
1718

18-
I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Run `pip install tornado` to install Tornado.
19+
I use Python and [Tornado](https://github.com/tornadoweb/tornado/) for the scripts to import and query the data. Run `pip install tornado chardet` to install Tornado and chardet.
1920

2021

2122

22-
#### Aight, where do we start?
23+
#### Aight, where do we start?
2324

2425
First, go [here](http://ohardt.us/download-gmail-mailbox) and download your Gmail mailbox. Depending on the number of emails you have accumulated, this might take a while.
2526

@@ -100,7 +101,7 @@ for part in parts:
100101

101102
##### Index the data with Elasticsearch
102103

103-
The most simple aproach is a PUT request per item:
104+
The most simple approach is a PUT request per item:
104105

105106
```python
106107
def upload_item_to_es(item):
@@ -109,12 +110,12 @@ def upload_item_to_es(item):
109110
response = yield http_client.fetch(request)
110111
if not response.code in [200, 201]:
111112
print "\nfailed to add item %s" % item['message-id']
112-
113+
113114
```
114115

115116
However, Elasticsearch provides a better method for importing large chunks of data: [bulk indexing](http://ohardt.us/es-bulk-indexing)
116117
Instead of making an HTTP request per document and indexing each one individually, we batch them in chunks of e.g. 1000 documents and then index them.<br>
117-
Bulk messages are of the format:
118+
Bulk messages are of the format:
118119

119120
```
120121
cmd\n
@@ -195,11 +196,9 @@ You can also quickly query for certain fields via the `q` parameter. This exampl
195196
curl "localhost:9200/gmail/email/_search?pretty&q=from:[email protected]"
196197
```
197198

198-
199-
200199
##### Aggregation queries
201200

202-
Aggregation queries let us bucket data by a given key and count the number of messages per bucket.
201+
Aggregation queries let us bucket data by a given key and count the number of messages per bucket.
203202
For example, number of messages grouped by recipient:
204203

205204
```
@@ -255,7 +254,7 @@ Result:
255254
"doc_count" : 4285
256255
}, { "key" : "unread",
257256
"doc_count" : 510
258-
},
257+
},
259258
...
260259
]
261260
}
@@ -269,7 +268,7 @@ curl -s "localhost:9200/gmail/email/_search?pretty&search_type=count" -d '
269268
"years": {
270269
"date_histogram": {
271270
"field": "date_ts", "interval": "year"
272-
}}}}
271+
}}}}
273272
'
274273
```
275274

@@ -296,6 +295,37 @@ Result:
296295
}
297296
```
298297

298+
Write aggregation queries to work out how much you spent on Amazon/Steam:
299+
300+
```
301+
GET _search
302+
{
303+
"query": {
304+
"match_all": {}
305+
},
306+
"size": 0,
307+
"aggs": {
308+
"group_by_company": {
309+
"terms": {
310+
"field": "order_details.merchant"
311+
},
312+
"aggs": {
313+
"total_spent": {
314+
"sum": {
315+
"field": "order_details.order_total"
316+
}
317+
},
318+
"postage": {
319+
"sum": {
320+
"field": "order_details.postage"
321+
}
322+
}
323+
}
324+
}
325+
}
326+
}
327+
```
328+
299329

300330
#### Todo
301331

src/AmazonEmailParser.py

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import json
2+
import re
3+
4+
class AmazonEmailParser(object):
    """Extract order details from Amazon order-confirmation emails.

    ``parse`` adds an ``order_details`` dict to the email with the keys
    ``order_items``, ``order_total``, ``postage`` and ``merchant``.
    Emails that are not order confirmations are returned untouched.
    """

    # A monetary amount, optionally with thousands separators ("1,234.56").
    # The decimal point is escaped so that e.g. "12 34" is not an amount.
    _AMOUNT = r"(\d[\d,]*\.\d+)"

    # Item costs are floats, so reconcile against the total with a
    # half-penny tolerance instead of an exact comparison with zero.
    _TOLERANCE = 0.005

    def __init__(self):
        amount = self._AMOUNT
        self.orderTotalRE = re.compile(r"(?<=Order Total:) (?:.*?)" + amount)
        self.postageRE = re.compile(r"(?<=Postage & Packing:) (?:.*?)" + amount)
        # Accept one or two colons: the original pattern only matched the
        # (probably mistyped) "Delivery & Handling::" form.
        self.deliveryRE = re.compile(r"Delivery & Handling::? (?:.*?)" + amount)
        self.orderItemsRE = re.compile(r"==========\r\n\r\n")
        self.costRE = re.compile(amount)

    def canParse(self, email):
        """Return True if the email was sent by Amazon's auto-confirm address.

        A missing or non-container ``from`` field yields False.
        """
        try:
            return 'auto-confirm@amazon' in email['from']
        except (KeyError, TypeError):
            return False

    def parse(self, email):
        """Attach ``order_details`` to an order-confirmation email.

        :param email: dict-like message with at least a ``body`` entry.
        :returns: the same ``email`` object, with ``order_details`` added
            when the body contains 'Order Confirmation'.
        """
        body = email['body']

        if 'Order Confirmation' not in body:
            return email

        orderTotal = self._amount(self.orderTotalRE.search(body))
        # Postage appears under either of two labels, depending on the email.
        postage = self._amount(
            self.postageRE.search(body) or self.deliveryRE.search(body))

        items = self._parse_items(body)
        email['order_details'] = {
            "order_items": items,
            "order_total": orderTotal,
            "postage": postage,
            "merchant": "amazon",
        }

        remaining = orderTotal - sum(item["cost"] for item in items)
        if abs(remaining) > self._TOLERANCE:
            print("Warning order not parsed correctly, order items may be missing, or promotion may have been applied.")
            print(email['order_details'])
            print(body)

        return email

    @staticmethod
    def _amount(match):
        """Convert a regex match of ``_AMOUNT`` to a float; 0 when no match."""
        if not match:
            return 0
        return float(match.group(1).replace(',', ''))

    def _parse_items(self, body):
        """Return the list of item dicts found in the order section of body."""
        sections = self.orderItemsRE.split(body)
        if len(sections) < 2:
            # No "==========" separator: nothing that looks like an item list.
            return []

        paragraphs = sections[1].split('\r\n\r\n')
        # Drop the first and the last three paragraphs (non-item text).
        # Slicing, unlike pop(), is safe on short lists.
        paragraphs = paragraphs[1:-3]

        items = []
        for paragraph in paragraphs:
            if ('Your estimated delivery date is:' in paragraph
                    or 'Your order will be sent to:' in paragraph):
                continue

            lines = paragraph.replace('_', '').split('\r\n')
            if len(lines) < 4:
                continue

            price = self.costRE.search(lines[1].strip())
            if price is None:
                # No price on the second line, so this is not an item.
                continue

            items.append({
                "item": lines[0].strip(),
                "cost": self._amount(price),
                "condition": lines[2].rpartition(':')[2].strip(),
                "seller": lines[3].replace('Sold by', '').strip(),
            })
        return items

src/DelegatingEmailParser.py

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
class DelegatingEmailParser(object):
    """Dispatch an email to the first parser that claims it.

    Each parser must provide ``canParse(email)`` and ``parse(email)``.
    """

    def __init__(self, parsers=None):
        """Store the parsers to try, in priority order.

        :param parsers: iterable of parser objects; None means no parsers.
        """
        # Materialise the iterable: a generator would otherwise be exhausted
        # after the first email and every later email would go unparsed.
        self.parsers = list(parsers) if parsers is not None else []

    def parse(self, email):
        """Return the email as processed by the first matching parser.

        :param email: the message to hand to the parsers.
        :returns: the matching parser's result, or ``email`` itself when no
            parser matches or the matching parser returns None.
        """
        for parser in self.parsers:
            if parser.canParse(email):
                parsed = parser.parse(email)
                # A parser that returns nothing must not lose the message.
                return email if parsed is None else parsed

        return email

src/SteamEmailParser.py

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import json
2+
import re
3+
4+
class SteamEmailParser(object):
    """Extract order details from Steam purchase-receipt emails.

    ``parse`` adds an ``order_details`` dict to the email with the keys
    ``order_items``, ``order_total`` and ``merchant``. Emails that are not
    purchase receipts are returned untouched.
    """

    # A monetary amount, optionally with thousands separators ("1,234.56").
    # The decimal point is escaped so that e.g. "12 34" is not an amount.
    _AMOUNT = r"(\d[\d,]*\.\d+)"

    # Item costs are floats, so reconcile against the total with a
    # half-penny tolerance instead of an exact comparison with zero.
    _TOLERANCE = 0.005

    # NOTE(review): this address was restored from an obfuscated
    # "[email protected]" placeholder -- confirm it matches real receipts.
    _SENDER = 'noreply@steampowered.com'

    def __init__(self):
        self.orderTotalRE = re.compile(r"(?<=Total:)[ \t]+" + self._AMOUNT)
        self.orderItemsRE = re.compile(r"(?:\.\r\n)+")
        self.costRE = re.compile(self._AMOUNT)

    def canParse(self, email):
        """Return True if the email was sent by the Steam store address.

        A missing or non-container ``from`` field yields False.
        """
        try:
            return self._SENDER in email['from']
        except (KeyError, TypeError):
            return False

    def parse(self, email):
        """Attach ``order_details`` to a Steam purchase receipt.

        :param email: dict-like message with ``subject`` and ``body`` entries.
        :returns: the same ``email`` object, with ``order_details`` added
            when the subject contains 'Thank you' and the body 'purchase'.
        """
        body = email['body']

        if 'Thank you' not in email['subject'] or 'purchase' not in body:
            return email

        match = self.orderTotalRE.search(body)
        orderTotal = self._amount(match) if match else 0

        # This parser to get order total is currently broken, gift purchases
        # are not parsed.
        items = self._parse_items(body)
        email['order_details'] = {
            "order_items": items,
            "order_total": orderTotal,
            "merchant": "steam",
        }

        remaining = orderTotal - sum(item["cost"] for item in items)
        if abs(remaining) > self._TOLERANCE:
            print("Warning order not parsed correctly, order items may be missing, or promotion may have been applied.")
            print(email['order_details'])
            print(body)

        return email

    @staticmethod
    def _amount(match):
        """Convert a regex match of ``_AMOUNT`` to a float."""
        return float(match.group(1).replace(',', ''))

    def _parse_items(self, body):
        """Return the ``{"item", "cost"}`` dicts listed in the receipt body."""
        sections = self.orderItemsRE.split(body)
        if len(sections) < 3:
            # The item list is read from the third ".\r\n"-delimited section;
            # without it there is nothing to parse.
            return []

        items = []
        for line in sections[2].split('\r\n'):
            if '-------' in line:
                # A dashed rule ends the item list.
                break
            if ': ' not in line:
                continue

            name, _, price = line.rpartition(':')
            # search() rather than match() so a currency prefix is tolerated.
            found = self.costRE.search(price.strip())
            if found is None:
                continue

            items.append({"item": name.strip(), "cost": self._amount(found)})
        return items

src/index_emails.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
import email.utils
88
import mailbox
99
import email
10+
import quopri
11+
import chardet
12+
from DelegatingEmailParser import DelegatingEmailParser
13+
from AmazonEmailParser import AmazonEmailParser
14+
from SteamEmailParser import SteamEmailParser
1015
import logging
1116

1217
http_client = HTTPClient()
@@ -19,13 +24,12 @@ def delete_index():
1924
try:
2025
url = "%s/%s?refresh=true" % (tornado.options.options.es_url, tornado.options.options.index_name)
2126
request = HTTPRequest(url, method="DELETE", request_timeout=240)
27+
body = {"refresh": True}
2228
response = http_client.fetch(request)
2329
logging.info('Delete index done %s' % response.body)
2430
except:
2531
pass
2632

27-
28-
2933
def create_index():
3034

3135
schema = {
@@ -135,6 +139,9 @@ def load_from_file():
135139
upload_data = list()
136140
logging.info("Starting import from file %s" % tornado.options.options.infile)
137141
mbox = mailbox.UnixMailbox(open(tornado.options.options.infile, 'rb'), email.message_from_file)
142+
143+
emailParser = DelegatingEmailParser([AmazonEmailParser(), SteamEmailParser()])
144+
138145
for msg in mbox:
139146
count += 1
140147
if count < tornado.options.options.skip:

0 commit comments

Comments
 (0)