# # Working with Text Data and Naive Bayes in scikit-learn
# ## Agenda
#
# **Working with text data**
#
# - Representing text as data
# - Reading SMS data
# - Vectorizing SMS data
# - Examining the tokens and their counts
# - Bonus: Calculating the "spamminess" of each token
#
# **Naive Bayes classification**
#
# - Building a Naive Bayes model
# - Comparing Naive Bayes with logistic regression
# ## Part 1: Representing text as data
#
# From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):
#
# > Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect **numerical feature vectors with a fixed size** rather than the **raw text documents with variable length**.
#
# We will use [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) to "convert text into a matrix of token counts":
from sklearn.feature_extraction.text import CountVectorizer
# start with a simple example
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']
# learn the 'vocabulary' of the training data
vect = CountVectorizer()
vect.fit(simple_train)
vect.get_feature_names_out()
# transform training data into a 'document-term matrix'
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm
# print the sparse matrix
print(simple_train_dtm)
# convert sparse matrix to a dense matrix
simple_train_dtm.toarray()
# examine the vocabulary and document-term matrix together
import pandas as pd
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())
# From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):
#
# > In this scheme, features and samples are defined as follows:
#
# > - Each individual token occurrence frequency (normalized or not) is treated as a **feature**.
# > - The vector of all the token frequencies for a given document is considered a multivariate **sample**.
#
# > A **corpus of documents** can thus be represented by a matrix with **one row per document** and **one column per token** (e.g. word) occurring in the corpus.
#
# > We call **vectorization** the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the **Bag of Words** or "Bag of n-grams" representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document.
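# side note: a minimal sketch showing that CountVectorizer's ngram_range parameter
# can count word pairs in addition to single words, which partially recovers the
# word-order information that plain bag of words ignores (vect_ngram is just an
# illustrative name)
vect_ngram = CountVectorizer(ngram_range=(1, 2))
vect_ngram.fit(simple_train)
vect_ngram.get_feature_names_out()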
# transform testing data into a document-term matrix (using existing vocabulary)
simple_test = ["please don't call me"]
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names_out())
# **Summary:**
#
# - `vect.fit(train)` learns the vocabulary of the training data
# - `vect.transform(train)` uses the fitted vocabulary to build a document-term matrix from the training data
# - `vect.transform(test)` uses the fitted vocabulary to build a document-term matrix from the testing data (and ignores tokens it hasn't seen before)
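# a quick sketch of that last point: the vectorizer's own analyzer and its fitted
# vocabulary_ can be used to see exactly which test tokens were ignored
analyzer = vect.build_analyzer()
[token for token in analyzer(simple_test[0]) if token not in vect.vocabulary_]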
# ## Part 2: Reading SMS data
# read tab-separated file
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv'
col_names = ['label', 'message']
sms = pd.read_table(url, sep='\t', header=None, names=col_names)
print(sms.shape)
sms.head(20)
sms.label.value_counts()
# convert label to a numeric variable
sms['label'] = sms.label.map({'ham':0, 'spam':1})
# define X and y
X = sms.message
y = sms.label
# split into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
# ## Part 3: Vectorizing SMS data
# instantiate the vectorizer
vect = CountVectorizer()
# learn training data vocabulary, then create document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_train_dtm
# alternative: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm
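# side note: the document-term matrix is stored as a SciPy sparse matrix; a quick
# sketch of how sparse it actually is (nnz counts the stored nonzero entries)
X_train_dtm.nnz / float(X_train_dtm.shape[0] * X_train_dtm.shape[1])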
# ## Part 4: Examining the tokens and their counts
# store token names
X_train_tokens = vect.get_feature_names_out()
# first 50 tokens
print(X_train_tokens[:50])
# last 50 tokens
print(X_train_tokens[-50:])
# view X_train_dtm as a dense matrix
X_train_dtm.toarray()
# count how many times EACH token appears across ALL messages in X_train_dtm
import numpy as np
X_train_counts = np.sum(X_train_dtm.toarray(), axis=0)
X_train_counts
X_train_counts.shape
# create a DataFrame of tokens with their counts
pd.DataFrame({'token':X_train_tokens, 'count':X_train_counts}).sort_values('count')
# ## Bonus: Calculating the "spamminess" of each token
# create separate DataFrames for ham and spam
sms_ham = sms[sms.label==0]
sms_spam = sms[sms.label==1]
# learn the vocabulary of ALL messages and save it
vect.fit(sms.message)
all_tokens = vect.get_feature_names_out()
# create document-term matrices for ham and spam
ham_dtm = vect.transform(sms_ham.message)
spam_dtm = vect.transform(sms_spam.message)
# count how many times EACH token appears across ALL ham messages
ham_counts = np.sum(ham_dtm.toarray(), axis=0)
# count how many times EACH token appears across ALL spam messages
spam_counts = np.sum(spam_dtm.toarray(), axis=0)
# create a DataFrame of tokens with their separate ham and spam counts
token_counts = pd.DataFrame({'token':all_tokens, 'ham':ham_counts, 'spam':spam_counts})
# add one to ham and spam counts to avoid dividing by zero (in the step that follows)
token_counts['ham'] = token_counts.ham + 1
token_counts['spam'] = token_counts.spam + 1
# calculate ratio of spam-to-ham for each token
token_counts['spam_ratio'] = token_counts.spam / token_counts.ham
token_counts.sort_values('spam_ratio')
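# the tokens most associated with spam are easier to see with a descending sort
# (same data, just reordered)
token_counts.sort_values('spam_ratio', ascending=False).head(10)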
# ## Part 5: Building a Naive Bayes model
#
# We will use [Multinomial Naive Bayes](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html):
#
# > The multinomial Naive Bayes classifier is suitable for classification with **discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.
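# side note: a minimal sketch of the last point above, showing that fractional
# tf-idf features can also be fed to MultinomialNB (the tfidf/X_train_tfidf names
# are illustrative, not part of this lesson's main workflow)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
MultinomialNB().fit(X_train_tfidf, y_train).predict(X_test_tfidf)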
# train a Naive Bayes model using X_train_dtm
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)
# calculate accuracy of class predictions
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))
# confusion matrix
print(metrics.confusion_matrix(y_test, y_pred_class))
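# scikit-learn's confusion matrix puts true labels on the rows and predicted labels
# on the columns, so with labels 0 (ham) and 1 (spam) the entries can be unpacked as
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_class).ravel()
print(tn, fp, fn, tp)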
# predict (poorly calibrated) probabilities
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob
# calculate AUC
print(metrics.roc_auc_score(y_test, y_pred_prob))
# print message text for the false positives
X_test[y_test < y_pred_class]
# print message text for the false negatives
X_test[y_test > y_pred_class]
# what do you notice about the false negatives?
X_test[3132]
# ## Part 6: Comparing Naive Bayes with logistic regression
# import/instantiate/fit
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm, y_train)
# class predictions and predicted probabilities
y_pred_class = logreg.predict(X_test_dtm)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
# calculate accuracy and AUC
print(metrics.accuracy_score(y_test, y_pred_class))
print(metrics.roc_auc_score(y_test, y_pred_prob))