Skip to content

Commit

Permalink
update class 15 materials with output
Browse files (browse the repository at this point in the history)
  • Loading branch information
justmarkham committed Oct 7, 2015
1 parent f5b8c3c commit 35697ab
Show file tree
Hide file tree
Showing 2 changed files with 1,014 additions and 123 deletions.
9 changes: 5 additions & 4 deletions code/15_natural_language_processing_nb.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
# - [autotldr](https://www.reddit.com/r/technology/comments/35brc8/21_million_people_still_use_aol_dialup/cr2zzj0)
# - **Natural Language Generation**: Generate text from data
# - [How a computer describes a sports match](http://www.bbc.com/news/technology-34204052)
# - [Publishers withdraw more than 120 gibberish papers](http://www.nature.com/news/publishers-withdraw-more-than-120-gibberish-papers-1.14763)
# - **Speech recognition and generation**: Speech-to-text, text-to-speech
# - [Google's Web Speech API demo](https://www.google.com/intl/en/chrome/demos/speech.html)
# - [Vocalware Text-to-Speech demo](https://www.vocalware.com/index/demo)
Expand Down Expand Up @@ -166,10 +167,10 @@

# calculate null accuracy
y_test_binary = np.where(y_test==5, 1, 0)
- y_test_binary.mean()
+ max(y_test_binary.mean(), 1 - y_test_binary.mean())


- # define a function that accepts a vectorizer and returns the accuracy
+ # define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
X_train_dtm = vect.fit_transform(X_train)
print 'Features: ', X_train_dtm.shape[1]
Expand Down Expand Up @@ -298,7 +299,7 @@ def split_into_lemmas(text):
return [word.lemmatize() for word in words]


- # use split_into_lemmas as the feature extraction function
+ # use split_into_lemmas as the feature extraction function (WARNING: SLOW!)
vect = CountVectorizer(analyzer=split_into_lemmas)
tokenize_test(vect)

Expand Down Expand Up @@ -409,7 +410,7 @@ def detect_sentiment(text):
yelp['sentiment'] = yelp.text.apply(detect_sentiment)


- # boxplot of sentiment grouped by stars
+ # box plot of sentiment grouped by stars
yelp.boxplot(column='sentiment', by='stars')


Expand Down
Loading

0 comments on commit 35697ab

Please sign in to comment.