diff --git a/TODO.md b/TODO.md deleted file mode 100644 index 7e0a662..0000000 --- a/TODO.md +++ /dev/null @@ -1,56 +0,0 @@ -## To-Do List -######(Updated: July 24) - -#### Details: - -* [X] Make it so that JSON written to database stores the data necessary for rendering the positive/negative display on webapp -* [X] Make it so that sentence objects in JSON store their probability of pos/neg - * Will sort by this / filter low-confidence examples in the front end display -* Figure out a good way to **bold** the aspect in the sentence - * Ask Ryan/Jon how to do this - -#### Must-Dos: - -* Figure out how to deploy to EC2 -* Populate full DB -* [X] Improve sentiment analysis - -#### Nice-to-Haves: - -* Create a fancier summary of the comments about an aspect -* Make the aspect-browsing UI more intuitive - * Hide/unhide the individual sentences?? - -### TODAY: - -1. Update my info online - * be sure to update github repo address. - * Update name of project & description. Match with resume. - -2. Fix bug with get_sents_by_asect (or sent.has_aspect) ==> retrieve using tokenized version or something--not raw string form… - -3. Finish up front-end stuff. Render the new information that I have... - -#### De-embarassement techniques -* Don't display really long sentences… -* But be careful about doing too much filtering --> this is what's leading to the empty aspects. Need to at least check at the end of filtering to see if there is still enough stuff left to display -* Filter aspects that are close to or contain the restaurant's name?? -* Maybe filter sentences with multiple aspects? -* More aspect stop-words: -"way", "minutes"(?) -* Maybe make the aspect-extraction a little more stringent?? - -ABSOLUTELY NEED TO DO sentence_by_aspect retrieval by TOKENS. 
otherwise "waiter" gets retrieved for wait - -Same thing with "way" and "always" - -#### Deploying: - -* A Record => point to publice EC2 IP -* CNAME -* Alias (URL Redirect) - - -**Y**elp S**um**marization **M**iner - - diff --git a/classes/business.py b/classes/business.py index d649df6..524cb24 100644 --- a/classes/business.py +++ b/classes/business.py @@ -145,10 +145,11 @@ def aspect_summary(self, aspect): map to a list of positive sentences (strings) and a list of negative sentences (strings) correspondingly. - Gets summary for a *particular* aspect. + Gets summary for a *particular* aspect. Summary includes primarily + the sorted positive/negative sentences mentioning this aspect. """ - OPIN_THRESH = 0.7 + OPIN_THRESH = 0.75 HARD_MIN_OPIN_THRESH = 0.6 POS_THRESH = 0.85 @@ -198,7 +199,7 @@ def aspect_summary(self, aspect): def get_sents_by_aspect(self, aspect): """ - INPUT: + INPUT: Business, string (aspect) OUTPUT: List of Sentence objects """ return [sent for review in self for sent in review if sent.has_aspect(aspect)] @@ -228,7 +229,6 @@ def filter_all_asps(self, asps): INPUT: Business OUTPUT: list of strings """ - # TODO if needed # filter aspects that are too close to the restaurant's name? return asps diff --git a/classes/sentence.py b/classes/sentence.py index 593784b..d501a63 100644 --- a/classes/sentence.py +++ b/classes/sentence.py @@ -17,10 +17,6 @@ class Sentence(object): # Tokenizer for converting a raw string (sentence) to a list of strings (words) WORD_TOKENIZER = MyPottsTokenizer(preserve_case=False) - #STANFORD_POS_TAGGER = POSTagger( - # '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/models/english-bidirectional-distsim.tagger', - # '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar') - # Lemmatizer LEMMATIZER = WordNetLemmatizer() @@ -74,8 +70,6 @@ def pos_tag(self, tokenized_sent): the standard NLTK POS tagger. 
""" - # Using Stanford tagger: - #return Sentence.STANFORD_POS_TAGGER.tag(tokenized_sent) return nltk.pos_tag(tokenized_sent) def lemmatize(self, pos_tagged_sent): @@ -99,10 +93,11 @@ def lemmatize(self, pos_tagged_sent): def get_features(self, asarray = False): """ - INPUT: Sentence + INPUT: Sentence, boolean OUTPUT: dict mapping string to ints/floats - Returns an (ordered) feature dict for this Sentence + Returns an (ordered) feature dict for this Sentence. If asarray is + True, returns an np feature array instead (unlabeled). """ if not hasattr(self, 'features'): @@ -118,6 +113,8 @@ def compute_aspects(self): """ INPUT: Sentence OUTPUT: list of lists of strings (i.e. list of aspects) + + Get the candidate aspects contained in this sentence. """ return Sentence.ASP_EXTRACTOR.get_sent_aspects(self) @@ -125,6 +122,8 @@ def has_aspect(self, asp_string): """ INPUT: Sentence, string (aspect) OUTPUT: boolean + + Return true if this sentence contains the given aspect string. """ # re-tokenize the aspect @@ -138,8 +137,8 @@ def encode(self): INPUT: Sentence OUTPUT: dict of this sentence's data - Encodes this sentence and associated metadata - to insert into database. + Encodes this sentence and associated metadata, for + insertion into the database. 
""" return {'text': self.raw, 'user': self.review.user_name diff --git a/classes/transformers/asp_extractors.py b/classes/transformers/asp_extractors.py index efa3816..b2b9f3b 100644 --- a/classes/transformers/asp_extractors.py +++ b/classes/transformers/asp_extractors.py @@ -16,12 +16,12 @@ class SentenceAspectExtractor(): CHUNKER = nltk.RegexpParser(GRAMMAR) - _my_stopword_additions = ["it's", "i'm", "star", "", "time", "night", "try", "friend", "sure", "times", "way", "friends"] + _my_stopword_additions = ["it's", "i'm", "star", "", "time", "night", "try", "sure", "times", "way", "friends"] STOPWORDS = set(stopwords.words('english') + _my_stopword_additions) PUNCT_RE = re.compile("^[\".:;!?')(/]$") - FORBIDDEN = {'great', 'good', 'time', 'friend'} + FORBIDDEN = {'great', 'good', 'time', 'friend', 'way', 'friends'} def __init__(self): pass diff --git a/main.py b/main.py index d9acd5e..c33bc41 100644 --- a/main.py +++ b/main.py @@ -30,7 +30,7 @@ def main(): print "Loading data..." df = read_data() - bus_ids = df.business_id.unique()[-50:-48] + bus_ids = df.business_id.unique()[21:] for bus_id in bus_ids: