Updating documentation etc.
Fossj117 committed Jul 30, 2014
1 parent 548b8e2 commit 5a470d6
Showing 5 changed files with 16 additions and 73 deletions.
56 changes: 0 additions & 56 deletions TODO.md

This file was deleted.

8 changes: 4 additions & 4 deletions classes/business.py
@@ -145,10 +145,11 @@ def aspect_summary(self, aspect):
         map to a list of positive sentences (strings) and
         a list of negative sentences (strings) correspondingly.
-        Gets summary for a *particular* aspect.
+        Gets summary for a *particular* aspect. Summary includes primarily
+        the sorted positive/negative sentences mentioning this aspect.
         """
 
-        OPIN_THRESH = 0.7
+        OPIN_THRESH = 0.75
+        HARD_MIN_OPIN_THRESH = 0.6
 
         POS_THRESH = 0.85
@@ -198,7 +199,7 @@ def aspect_summary(self, aspect):
 
     def get_sents_by_aspect(self, aspect):
         """
-        INPUT:
+        INPUT: Business, string (aspect)
         OUTPUT: List of Sentence objects
         """
         return [sent for review in self for sent in review if sent.has_aspect(aspect)]
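
The one-liner above leans on an iteration protocol implied by this file: a Business iterates over its Review objects, and each Review iterates over its Sentence objects. Unrolled, the same logic reads (an equivalent sketch, not code from the commit):

    def get_sents_by_aspect(self, aspect):
        matches = []
        for review in self:        # Business yields Review objects
            for sent in review:    # each Review yields Sentence objects
                if sent.has_aspect(aspect):
                    matches.append(sent)
        return matches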
@@ -228,7 +229,6 @@ def filter_all_asps(self, asps):
         INPUT: Business
         OUTPUT: list of strings
         """
-        # TODO if needed
         # filter aspects that are too close to the restaurant's name?
         return asps
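
HARD_MIN_OPIN_THRESH is new in this commit, but the code that consumes it falls outside the visible hunk. One plausible reading, offered only as a sketch and not confirmed by this diff, is a two-tier filter: keep sentences whose opinion score clears OPIN_THRESH, and relax to the hard floor when the primary cutoff leaves too few sentences. The opinion_score attribute and min_count parameter below are assumptions:

    OPIN_THRESH = 0.75           # primary cutoff for opinionated sentences
    HARD_MIN_OPIN_THRESH = 0.6   # absolute floor if the primary cutoff is too strict

    def select_opinionated(sents, min_count=3):
        # Hypothetical helper: prefer sentences passing the primary threshold.
        strong = [s for s in sents if s.opinion_score >= OPIN_THRESH]
        if len(strong) >= min_count:
            return strong
        # Otherwise relax to the hard minimum rather than returning almost nothing.
        return [s for s in sents if s.opinion_score >= HARD_MIN_OPIN_THRESH]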

19 changes: 9 additions & 10 deletions classes/sentence.py
@@ -17,10 +17,6 @@ class Sentence(object):
     # Tokenizer for converting a raw string (sentence) to a list of strings (words)
     WORD_TOKENIZER = MyPottsTokenizer(preserve_case=False)
 
-    #STANFORD_POS_TAGGER = POSTagger(
-    #    '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/models/english-bidirectional-distsim.tagger',
-    #    '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar')
-
     # Lemmatizer
     LEMMATIZER = WordNetLemmatizer()

@@ -74,8 +70,6 @@ def pos_tag(self, tokenized_sent):
         the standard NLTK POS tagger.
         """
 
-        # Using Stanford tagger:
-        #return Sentence.STANFORD_POS_TAGGER.tag(tokenized_sent)
         return nltk.pos_tag(tokenized_sent)
 
     def lemmatize(self, pos_tagged_sent):
@@ -99,10 +93,11 @@ def lemmatize(self, pos_tagged_sent):
 
     def get_features(self, asarray = False):
         """
-        INPUT: Sentence
+        INPUT: Sentence, boolean
        OUTPUT: dict mapping string to ints/floats
 
-        Returns an (ordered) feature dict for this Sentence
+        Returns an (ordered) feature dict for this Sentence. If asarray is
+        True, returns an np feature array instead (unlabeled).
         """
 
         if not hasattr(self, 'features'):
@@ -118,13 +113,17 @@ def compute_aspects(self):
         """
         INPUT: Sentence
         OUTPUT: list of lists of strings (i.e. list of aspects)
+
+        Get the candidate aspects contained in this sentence.
         """
         return Sentence.ASP_EXTRACTOR.get_sent_aspects(self)
 
     def has_aspect(self, asp_string):
         """
         INPUT: Sentence, string (aspect)
         OUTPUT: boolean
+
+        Return true if this sentence contains the given aspect string.
         """
 
         # re-tokenize the aspect
@@ -138,8 +137,8 @@ def encode(self):
         INPUT: Sentence
         OUTPUT: dict of this sentence's data
 
-        Encodes this sentence and associated metadata
-        to insert into database.
+        Encodes this sentence and associated metadata, for
+        insertion into the database.
         """
         return {'text': self.raw,
                 'user': self.review.user_name
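
The body of has_aspect is truncated in the hunk above; only the "# re-tokenize the aspect" comment survives. A minimal sketch of what that comment suggests, where the tokenize call and the tokenized_sent attribute are assumptions rather than code from the commit:

    def has_aspect(self, asp_string):
        # Split the aspect with the same tokenizer applied to the sentence,
        # so multi-word aspects compare token-for-token.
        asp_toks = Sentence.WORD_TOKENIZER.tokenize(asp_string)
        # Treat the aspect as present if every one of its tokens appears.
        return all(tok in self.tokenized_sent for tok in asp_toks)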
4 changes: 2 additions & 2 deletions classes/transformers/asp_extractors.py
@@ -16,12 +16,12 @@ class SentenceAspectExtractor():
 
     CHUNKER = nltk.RegexpParser(GRAMMAR)
 
-    _my_stopword_additions = ["it's", "i'm", "star", "", "time", "night", "try", "friend", "sure", "times", "way", "friends"]
+    _my_stopword_additions = ["it's", "i'm", "star", "", "time", "night", "try", "sure", "times", "way", "friends"]
     STOPWORDS = set(stopwords.words('english') + _my_stopword_additions)
 
     PUNCT_RE = re.compile("^[\".:;!?')(/]$")
 
-    FORBIDDEN = {'great', 'good', 'time', 'friend'}
+    FORBIDDEN = {'great', 'good', 'time', 'friend', 'way', 'friends'}
 
     def __init__(self):
         pass
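
For context on how these constants fit together: CHUNKER applies a regular-expression grammar over POS tags to extract candidate noun phrases, which are then screened against STOPWORDS and FORBIDDEN. GRAMMAR itself is outside the hunk, so the pattern below is an assumed stand-in:

    import nltk

    GRAMMAR = "NP: {<JJ>*<NN.*>+}"   # assumed; the real GRAMMAR is not shown here
    CHUNKER = nltk.RegexpParser(GRAMMAR)

    tagged = [('the', 'DT'), ('garlic', 'NN'), ('noodles', 'NNS'),
              ('were', 'VBD'), ('amazing', 'JJ')]
    tree = CHUNKER.parse(tagged)

    # Collect the tokens under each NP chunk as a candidate aspect.
    candidates = [[tok for tok, pos in subtree.leaves()]
                  for subtree in tree.subtrees()
                  if subtree.label() == 'NP']
    print(candidates)   # [['garlic', 'noodles']]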
2 changes: 1 addition & 1 deletion main.py
@@ -30,7 +30,7 @@ def main():
 
     print "Loading data..."
     df = read_data()
-    bus_ids = df.business_id.unique()[-50:-48]
+    bus_ids = df.business_id.unique()[21:]
 
     for bus_id in bus_ids:

