From 098a6c146e8002013ec28a891441be6d26cb8a12 Mon Sep 17 00:00:00 2001 From: harpribot Date: Thu, 5 May 2016 19:02:39 -0500 Subject: [PATCH] Adds the ability to change the size of the dataset --- extracter_script.py | 3 ++- helpers/extracter.py | 12 ++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/extracter_script.py b/extracter_script.py index 863be74..fc097dc 100644 --- a/extracter_script.py +++ b/extracter_script.py @@ -4,6 +4,7 @@ inputfile = 'raw_data/foods.txt' outputfile = 'extracted_data/review_summary.csv' -spider = Spider() +num_reviews = 100000 +spider = Spider(num_reviews) spider.crawl_for_reviews_and_summary(inputfile) spider.save_review_summary_frame(outputfile) diff --git a/helpers/extracter.py b/helpers/extracter.py index 409a105..a666d1e 100644 --- a/helpers/extracter.py +++ b/helpers/extracter.py @@ -2,8 +2,8 @@ import pandas as pd class Spider: - def __init__(self): - pass + def __init__(self,num_reviews): + self.num_reviews = num_reviews def crawl_for_reviews_and_summary(self, input_file): self.raw_data_file = input_file @@ -14,9 +14,13 @@ def crawl_for_reviews_and_summary(self, input_file): def __crawl_review(self): review_list = [] print 'Crawling Reviews....' + num_lines = 0 with open(self.raw_data_file) as infile: for line in infile: if(line.startswith('review/text')): + if num_lines >= self.num_reviews: + break + num_lines += 1 _,review = line.split('/text: ') review_list.append(review) @@ -25,9 +29,13 @@ def __crawl_review(self): def __crawl_summary(self): summary_list = [] print 'Crawling Summary....' + num_lines = 0 with open(self.raw_data_file) as infile: for line in infile: if(line.startswith('review/summary')): + if num_lines >= self.num_reviews: + break + num_lines += 1 _,summary = line.split('/summary: ') summary_list.append(summary)