-
Notifications
You must be signed in to change notification settings - Fork 39
/
Copy pathmain.py
71 lines (49 loc) · 1.43 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import json
import time
import pandas as pd
from pymongo import MongoClient
from classes.business import Business
def get_reviews_for_business(bus_id, df):
    """
    INPUT: business id, pandas DataFrame with a business_id column
    OUTPUT: pandas DataFrame restricted to that business

    Select every row of df whose business_id equals bus_id. All
    columns are kept and the original row index is preserved; an
    empty DataFrame comes back when no row matches.
    """
    belongs_to_business = df["business_id"] == bus_id
    return df.loc[belongs_to_business]
def read_data(path='./raw_data/yelp_data/processed.csv'):
    """
    INPUT: path (optional) - file path or file-like object handed to
           pandas.read_csv; defaults to the processed Yelp CSV that
           this script has always loaded
    OUTPUT: pandas data frame from file
    """
    # The default keeps the original zero-argument call working unchanged.
    return pd.read_csv(path)
def main(start_index=21):
    """
    INPUT: start_index (optional) - position in the array of unique
           business ids at which processing starts; ids before it are
           skipped. Defaults to 21, the offset the script used before.
           NOTE(review): presumably the first 21 businesses were already
           summarized in an earlier run - confirm before changing.
    OUTPUT: None

    Load the review data, build an aspect-based summary for each
    remaining business and insert it into the `summaries` collection
    of the `yelptest2` Mongo database, printing progress and the time
    spent on each business.
    """
    client = MongoClient()
    try:
        db = client.yelptest2
        summaries_coll = db.summaries

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Loading data...")
        df = read_data()

        bus_ids = df.business_id.unique()[start_index:]

        for bus_id in bus_ids:
            print("Working on biz_id %s" % bus_id)
            start = time.time()

            biz = Business(get_reviews_for_business(bus_id, df))
            summary = biz.aspect_based_summary()

            # NOTE(review): Collection.insert is deprecated in PyMongo 3 and
            # removed in PyMongo 4 (insert_one replaces it). Kept as-is so
            # environments pinned to PyMongo 2.x keep working.
            summaries_coll.insert(summary)
            print("Inserted summary for %s into Mongo" % biz.business_name)

            elapsed = time.time() - start
            # %d truncates the elapsed time to whole seconds.
            print("Time elapsed: %d" % elapsed)
    finally:
        # Release the Mongo connection even if a business fails mid-run.
        client.close()
# Run the summarization pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
# import time
# OUTFILE = "test_summary.json"
# print "Loading data..."
# df = read_data()
# bus_id = df.business_id.iloc[4000]
# start = time.time()
# biz = Business(get_reviews_for_business(bus_id, df))
# summary = biz.aspect_based_summary()
# with open(OUTFILE, 'w') as f:
# f.write(json.dumps(summary))
# elapsed = time.time() - start
# print "Time elapsed: %d" % elapsed