# -*- coding: utf-8 -*-
# nltk_ex22.py clpoda 2017_0220
# Time-stamp: <Mon 2017 Mar 27 09:26:39 AM clpoda>
# Stand-alone program to test nltk.
#
# Ref: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
from __future__ import print_function  # for print(..., file=f) under py2
import pandas as pd
import numpy as np
from six.moves import range  # py2/py3-compatible range()
import os
pd.set_option('display.width', 120)
# Show OwnerUserId w/o trailing '.0'; don't show commas in large numbers.
pd.options.display.float_format = '{:.0f}'.format
datadir = '/data/datasets/'
tmpdir = 'indir/' # Relative to current dir
outdir = 'outdir/' # Relative to current dir
#D q_fname = 'Questions.csv'
#D a_fname = 'Answers.csv'
#D a_fname = 'a6_999999.csv'
#D a_fname = 'a5_99998.csv' # Bag has 7903 rows.
q_fname = 'q3_992.csv'
a_fname = 'a3_986.csv'
#D a_fname = 'q_with_a.csv' # O/p from fga*.py
#D a_fname = 'q_with_a.0211_1308.csv' # 2729 lines; O/p from fga*.py
#D a_fname = 'q_with_a.40_owners_a5_9998.csv' # 800 lines; O/p from fga*.py
#D q_fname = 'q2.csv'
#D a_fname = 'a2.csv'
#
# Choose tmpdir or datadir:
#D a_infile = tmpdir + a_fname
a_infile = datadir + a_fname
# Choose tmpdir or datadir:
#D q_infile = tmpdir + q_fname
q_infile = datadir + q_fname
print('=== Input files, q & a:\n' + q_infile + '\n' + a_infile + '\n')
# Step 1. Read data from file into dataframe.
# Build data frames.
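# Note: warn_bad_lines/error_bad_lines were deprecated in pandas 1.3 and
#   removed in 2.0; newer pandas uses on_bad_lines='skip' instead.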
df_all_ans = pd.read_csv(a_infile, encoding='latin-1', warn_bad_lines=False, error_bad_lines=False)
#D questions are not yet used.
#D df_all_ques = pd.read_csv(q_infile, encoding='latin-1', warn_bad_lines=False, error_bad_lines=False)
print('=== df_all_ans.head():\n', df_all_ans.head())
numlines = len(df_all_ans)
# Guard with max(1, ...) so small inputs don't yield a factor of 0,
# which would crash the modulo progress checks below.
progress_msg_factor = max(1, int(round(numlines / 10)))
print('\n=== Number of records in i/p data frame, df_all_ans: ' + str(numlines) + '\n')
# Step 2. Process the words of each input line.
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
#TBD nltk.download()
# Note: Running the full download once took 90 minutes; it fetched
#   text data sets, including stop words, into ~/nltk_data/.
# Are there any compatibility issues between py2 & py3 for nltk?
# You can use the NLTK web site & download only the parts you need.
# TBD: Check for current files on local disk before downloading;
#   a minimal guarded check is sketched below.
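# A minimal sketch of that check (assumes only the stop-word corpus is
# needed by this script): look for it locally, download only if missing.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')  # fetch just the stop-word corpus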
def convert_text_to_words(raw_q_a):
    # Convert a raw stackoverflow question or answer
    # to a string of words.
    # The input is a single string (a raw ques or ans entry), and
    # the output is a single string (a preprocessed ques or ans).
    #
    # 1. Remove HTML.
    q_a_text = BeautifulSoup(raw_q_a, "lxml").get_text()
    #
    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", q_a_text)
    #
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    #
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set.
    stops = set(stopwords.words("english"))
    #
    # 5. Remove stop words.
    meaningful_words = [w for w in words if w not in stops]
    #
    # 6. Join the words back into one string separated by space,
    # and return the result.
    return " ".join(meaningful_words)
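# Example (illustrative, not from the data set):
#   convert_text_to_words('<p>This is a Test!</p>') returns 'test'
#   after HTML stripping, non-letter removal, lowercasing, and
#   stop-word filtering.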
# Get the number of bodies based on that column's size
num_bodies = df_all_ans["Body"].size
#D print("=== Number of bodies: " + str(num_bodies))
print("=== For all ans: Cleaning and parsing the training set bodies...")
clean_ans_bodies_l = []
for i in range(0, num_bodies):
    clean_ans_bodies_l.append(convert_text_to_words(df_all_ans["Body"][i]))
    # Print a progress message; default is for every 10% of i/p data handled.
    if (i + 1) % progress_msg_factor == 0:
        # Reuse the entry just appended instead of cleaning it again.
        clean_q_a = clean_ans_bodies_l[-1]
        #D print("\n=== Body %d of %d" % (i + 1, num_bodies))
        #D print('  Original text: ' + df_all_ans['Body'][i])
        #D print('  Cleaned text: ' + clean_q_a)
# Write cleaned bodies to a file, one body per line, for visual review.
outfile = tmpdir + a_fname + '.out'
if os.path.exists(outfile):
    os.rename(outfile, outfile + '.bak')
    print('\nWARN: renamed o/p file w/ .bak; save it manually if needed: ' + outfile)
with open(outfile, 'w') as f:
    f.write('\n'.join(clean_ans_bodies_l))
# Step 3. Build a bag of words and their counts.
# This code uses ngrams instead of single words.
def make_bag_of_words(clean_ans_bodies_l):
    print("\nCreating the bag of words for word counts ...\n")
    from sklearn.feature_extraction.text import CountVectorizer
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    #
    # Fails w/ MemErr with max_features at 5000; OK at 100-200.
    # Include ngram_range to use ngrams; otherwise, use single words only.
    # This: token_pattern=r'\b\w+\b', includes 1-letter words.
    #
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 ngram_range=(3, 5),
                                 # token_pattern=r'\b\w+\b',
                                 max_features=200)
    # fit_transform() does two things: first, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list
    # of strings.
    train_data_features = vectorizer.fit_transform(clean_ans_bodies_l)
    # Numpy arrays are easy to work with, so convert the result to an
    # array.
    train_data_features = train_data_features.toarray()
    # Note, w/ q9999 data, got (727, 1550):
    print('Bag shape: rows (num of records), cols (num of features):')
    print(train_data_features.shape, '\n')
    # Take a look at the words in the vocabulary.
    # Note: get_feature_names() was removed in scikit-learn 1.2;
    #   newer versions use get_feature_names_out() instead.
    vocab = vectorizer.get_feature_names()
    # Sum up the counts of each vocabulary word.
    dist = np.sum(train_data_features, axis=0)
    return (vocab, dist)
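# Illustrative note (hypothetical input): with ngram_range=(3, 5), a cleaned
# body 'read csv file pandas' contributes the features 'read csv file',
# 'csv file pandas', and 'read csv file pandas'.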
print('=== make_bag_of_words(clean_ans_bodies_l)')
(vocab, dist) = make_bag_of_words(clean_ans_bodies_l)
# Sort and save vocabulary data to file w/ a specified suffix.
def sort_save_vocab(vocab, dist, suffix):
    # For each item in the bag of words, pair the vocabulary word with
    # the number of times it appears in the training set.
    count_tag_l = []
    for tag, count in zip(vocab, dist):
        count_tag_l.append((count, tag))
    # Sort the list of tuples by count.
    words_sorted_by_count = sorted(count_tag_l, key=lambda x: x[0])
    # Write sorted vocab to a file.
    outfile = tmpdir + a_fname + suffix
    if os.path.exists(outfile):
        os.rename(outfile, outfile + '.bak')
        print('\nWARN: renamed o/p file to *.bak; save it manually if needed: ' + outfile)
    with open(outfile, 'w') as f:
        for count, word in words_sorted_by_count:
            print(count, word, file=f)
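# Each line of the saved vocab file has the form '<count> <ngram>', least
# frequent first; e.g., a line might read '12 using following code'.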
sort_save_vocab(vocab, dist, '.vocab')
# Step 4. Sort data by score.
print('\n=== Sort data by Score for each record.\n')
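# sort_values() sorts ascending by default, so head() below shows the
# lowest scores and tail() the highest.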
df_score = df_all_ans.sort_values(['Score'])
df_score = df_score[['Id', 'Score']]
# Compute the number of records to use for computation and display.
rec_selection_ratio = 0.10 # Default 0.01?
num_selected_recs = int(numlines * rec_selection_ratio)
if num_selected_recs < 6:
    num_selected_recs = 5
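# For example, with 1,000 input records and rec_selection_ratio = 0.10,
# 100 records are selected; very small inputs are clamped to at least 5.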
print(' rec_selection_ratio, number of selected recs: ', rec_selection_ratio, num_selected_recs, '\n')
print('Lowest scoring records:')
print(df_score.head(), '\n') #D print(df_score.head(num_selected_recs), '\n')
print('Highest scoring records:')
print(df_score.tail(), '\n') #D print(df_score.tail(num_selected_recs), '\n')
# Step 5. Find most frequent words for top-scoring records.
print('\n=== Step 5. Find most freq words for top-scoring records.')
df_score_top_n = df_score[['Id']]
#D print(df_score_top_n.tail(20), '\n') #D print(df_score_top_n.tail(num_selected_recs), '\n')
# Use top_n records & count their words.
print("\nFor top ans: Cleaning and parsing the training set bodies...")
#D print("Number of bodies: " + str(num_bodies))
def find_freq_words(df_ids, top):
    # Return cleaned bodies for the highest- or lowest-scoring records
    # in df_ids, depending on the top flag.
    top_n_bodies = []
    # Convert dataframe to a list of Id's, to get the body of each Id.
    if top:  # Get the tail of the list, highest-score items.
        df_score_l = df_ids['Id'].tail(num_selected_recs).tolist()
    else:  # Get the head of the list, lowest-score items.
        df_score_l = df_ids['Id'].head(num_selected_recs).tolist()
    df8 = df_all_ans.set_index('Id')
    progress_count = 0
    for i in df_score_l:
        progress_count += 1
        top_n_bodies.append(convert_text_to_words(df8["Body"][i]))
        # Print a progress message for every 10% of i/p data handled.
        if (progress_count + 1) % progress_msg_factor == 0:
            # Reuse the entry just appended instead of cleaning it again.
            clean_q_a = top_n_bodies[-1]
            print("\nBody for Id %d" % i)
            print('  Original text:\n' + df8['Body'][i][:70])
            print('  Cleaned text:\n' + clean_q_a[:70])
    return top_n_bodies
top_n_bodies = find_freq_words(df_score_top_n, top=True)
print('=== make_bag_of_words(top_n_bodies)')
(vocab, dist) = make_bag_of_words(top_n_bodies)
sort_save_vocab(vocab, dist, '.vocab.hiscore')
# Step 6. Find most frequent words for bottom-scoring records.
print('\n=== Step 6. Find most freq words for bottom-scoring records.')
df_score_bot_n = df_score[['Id']]
#D print(df_score_bot_n.head(20), '\n') #D print(df_score_bot_n.head(num_selected_recs), '\n')
bot_n_bodies = find_freq_words(df_score_bot_n, top=False)
print('=== make_bag_of_words(bot_n_bodies)')
(vocab, dist) = make_bag_of_words(bot_n_bodies)
sort_save_vocab(vocab, dist, '.vocab.loscore')
'bye'