Skip to content

Commit

Permalink
Adds functionality for changing the size of the dataset
Browse files (browse the repository at this point in the history)
  • Loading branch information
harpribot committed May 6, 2016
1 parent 181c403 commit 098a6c1
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
3 changes: 2 additions & 1 deletion extracter_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
inputfile = 'raw_data/foods.txt'
outputfile = 'extracted_data/review_summary.csv'

spider = Spider()
num_reviews = 100000
spider = Spider(num_reviews)
spider.crawl_for_reviews_and_summary(inputfile)
spider.save_review_summary_frame(outputfile)
12 changes: 10 additions & 2 deletions helpers/extracter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import pandas as pd

class Spider:
def __init__(self):
pass
def __init__(self,num_reviews):
self.num_reviews = num_reviews

def crawl_for_reviews_and_summary(self, input_file):
self.raw_data_file = input_file
Expand All @@ -14,9 +14,13 @@ def crawl_for_reviews_and_summary(self, input_file):
def __crawl_review(self):
review_list = []
print 'Crawling Reviews....'
num_lines = 0
with open(self.raw_data_file) as infile:
for line in infile:
if(line.startswith('review/text')):
if num_lines >= self.num_reviews:
break
num_lines += 1
_,review = line.split('/text: ')
review_list.append(review)

Expand All @@ -25,9 +29,13 @@ def __crawl_review(self):
def __crawl_summary(self):
summary_list = []
print 'Crawling Summary....'
num_lines = 0
with open(self.raw_data_file) as infile:
for line in infile:
if(line.startswith('review/summary')):
if num_lines >= self.num_reviews:
break
num_lines += 1
_,summary = line.split('/summary: ')
summary_list.append(summary)

Expand Down

0 comments on commit 098a6c1

Please sign in to comment.