# -*- coding: utf-8 -*-
# Using ~/anaconda3/bin/python: Python 3.6.0 :: Anaconda 4.3.0 (64-bit)
# Using Python 3.4.5 :: Anaconda 4.3.0 (64-bit), since Tue2017_0710
"""nltk_ex25.py
Analyze text data using the Natural Language Toolkit, NLTK.
NLTK is a suite of Python s/w, data sets, and documents
to support the development of Natural Language Processing, NLP.
This module contains several functions that use NLTK software
to process text and produce output that might help to find
some good but 'hidden' records in the input. A 'hidden' record
has a low score and might not be prominently displayed in a
list of records based on scores.
One approach is to collect terms found in high-score records,
and look for low-score records that contain the same terms.
Count the number of matching terms, and assume that a high count
indicates a low-score record that might have useful content.
Return the o/p to the caller for further evaluation & processing.
These functions were built to work with the calling code in
fga_find_good_answers.py (which uses data from stackoverflow.com).
See that program for more details.
Usage:
import nltk_ex25
pydoc nltk_ex25
See fga_find_good_answers.py for an example that uses
the functions in this module.
Set the value of MAX_HI_SCORE_TERMS in the calling program.
It determines how many hi-score terms are used to search
for matching terms in low-score answers. A bigger number
will cause the program to run longer, and might find more
'hidden' answers that could be valuable.
Terms:
ngram: a string of one or more words.
----------------------------------------------------------
"""
# Ref: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from six.moves import range
#TBR from collections import Counter, defaultdict
#TBR import pprint
#TBR import csv

import config as cf
import util.misc as ut


def main():
    """Initialization:

    Download NLTK stopwords data if not found locally.
    """
    log_msg = cf.log_file + ' - Start logging for ' + \
        os.path.basename(__file__)
    cf.logger.info(log_msg)
    # Check for the stopwords data and download it if not found locally.
    nltk.download('stopwords')


def convert_text_to_words(raw_qa_s):
    """Convert and filter the input text string into a string of words.

    Convert a raw stackoverflow question or answer
    to a string of meaningful words for detailed analysis.

    The input is a single string of text.
    That content is processed in various ways, eg, remove HTML,
    remove non-letters, convert to lower case, and remove
    stop words that clutter the output.

    Return a single string of meaningful words.
    """
    # 1. Remove HTML.
    qa_text = ut.strip_html(raw_qa_s, "lxml")
    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", qa_text)
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    # 4. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set.
    stops = set(stopwords.words("english"))
    # Add more noise terms to stopwords.
    stops.add('th')
    # 5. Remove stop words.
    #ORG meaningful_words = [w for w in words if not w in stops]
    meaningful_words = [w for w in words if w not in stops]
    # 6. Join the words back into one string of words, each word
    #    separated by a space, and return the resulting string.
    return " ".join(meaningful_words)
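

# Illustrative example of convert_text_to_words() (made-up input, not
# project data):
#   convert_text_to_words('<p>How do I sort a list in Python?</p>')
# returns 'sort list python' after HTML stripping, removal of
# non-letters, lower-casing, and stop-word removal.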


def clean_raw_data(qagroup_poptop_df):
    """Clean and parse the input text.

    The input is a dataframe of one question and its related
    answers (one Q&A group). The question is both pop (popular)
    and top: it has several answers, and some answers are by owners
    with high reputation scores.

    Create a new column in the dataframe to hold the cleaned data.
    Process the text from one cell of a row of the dataframe
    into a string of meaningful words, and store that new string
    in the new cell for that row.
    Repeat for all the rows in the dataframe by using a loop.
    This updated dataframe is then used elsewhere.

    One use of this function reads text from the Body column of a row
    in the dataframe; converts it into words; and places them in the
    CleanBody cell of that row.

    Return a list of strings of cleaned text, one string for each
    row in the dataframe.
    """
    # Get the number of bodies based on that column's size.
    num_bodies = qagroup_poptop_df["Body"].size
    cf.logger.info("clean_raw_data(): Number of bodies: " + str(num_bodies))
    clean_qa_bodies_l = []
    qagroup_poptop_df["CleanBody"] = ""
    # Build a list that holds the cleaned text from each answer's body field.
    # Use it to find terms that match terms found in hi-score Answers.
    progress_i = ut.calc_progress(num_bodies)
    for i in range(0, num_bodies):
        clean_body_s = convert_text_to_words(qagroup_poptop_df["Body"][i])
        clean_qa_bodies_l.append(clean_body_s)
        # Add the cleaned text to the new CleanBody column of the df.
        qagroup_poptop_df.loc[i, "CleanBody"] = clean_body_s
        # Log a progress message; default is for every 10% of input data
        # handled.
        if (i + 1) % progress_i == 0:
            #D cf.logger.debug("Body %d of %d" % (i+1, num_bodies))
            #D cf.logger.debug('Original text: ' + qagroup_poptop_df['Body'][i])
            cf.logger.debug('[i], clean*(): Slice of cleaned text: [' +
                            str(i) + ']\n' + clean_body_s[:70])
    return clean_qa_bodies_l
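

# Illustrative effect of clean_raw_data() (made-up data): a Body cell of
#   '<p>Use the <code>sorted</code> builtin.</p>'
# yields a CleanBody cell of 'use sorted builtin', and the same string
# is appended to the returned list.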


def make_bag_of_words(clean_qa_bodies_l):
    r"""Extract numerical features from input text data.

    Later steps can use those features with machine learning
    algorithms to analyze the data further.

    The input data is a list of strings of cleaned text,
    for one Q&A group. Each string contains the words of the
    Body field of the question or an answer.

    Consider the entire list to be a corpus or collection
    of documents in the form of a matrix.
    Each string in the list is a row of the matrix;
    and each ngram (word or phrase) in each row occupies one column
    of the matrix.

    Use CountVectorizer() from scikit-learn to convert
    that matrix into numerical feature vectors.

    Return a tuple of two structures:
        vocab_l: a list of strings; each string is one ngram
            in the vocabulary.
        dist_a: an array with the total count of the occurrences
            of each ngram in the vocabulary list.

    Some details:
    To use phrases instead of single words in making the bag of words,
    specify a range, eg, "ngram_range=(3, 5)", in the arguments for
    CountVectorizer().
    To include 1-letter words in the bag, use this argument and pattern:
    "token_pattern=r'\b\w+\b'".
    Ref: http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
    """
    cf.logger.debug("Creating the bag of words for word counts ...")
    from sklearn.feature_extraction.text import CountVectorizer

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    # TBD, Fails w/ MemErr with max_features at 5000; ok at 200.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 ngram_range=(3, 5),
                                 # token_pattern=r'\b\w+\b',
                                 max_features=200)

    # fit_transform() does two things: First, it fits the model
    # and learns the vocabulary; second, it transforms the training data
    # into feature vectors. The input to fit_transform should be a list
    # of strings.
    train_data_features = vectorizer.fit_transform(clean_qa_bodies_l)

    # Numpy arrays are easy to work with, so convert the result to an array.
    train_data_features = train_data_features.toarray()
    # Note, w/ q9999 data, got (727, 1550):
    cf.logger.info('Bag shape: rows (num of records), cols (num of features):')
    cf.logger.info(train_data_features.shape)

    # Take a look at the words in the vocabulary.
    vocab_l = vectorizer.get_feature_names()
    # Sum up the counts of each vocabulary term in vocab_l.
    dist_a = np.sum(train_data_features, axis=0)
    return (vocab_l, dist_a)
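

# Minimal sketch of the bag-of-words step on toy data (single-word ngrams
# here, unlike the (3, 5) range used above, to keep the example small):
#   from sklearn.feature_extraction.text import CountVectorizer
#   v = CountVectorizer(analyzer="word", max_features=200)
#   feats = v.fit_transform(['sort list python', 'python dict sort'])
#   v.get_feature_names()        # ['dict', 'list', 'python', 'sort']
#   feats.toarray().sum(axis=0)  # array([1, 1, 2, 2])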


def sort_vocab(vocab_l, dist_a):
    """Sort the input vocabulary data by count.

    The two input structures are a list of strings of terms
    and an array of counts that correspond to the terms.
    Combine these two structures into a list of tuples;
    then sort that list by the count element.

    Return a list of tuples sorted by count: (count, term).
    """
    count_term_l = []
    for term, count in zip(vocab_l, dist_a):
        count_term_l.append((count, term))
    # Sort the list of tuples by count.
    words_sorted_by_count_l = sorted(count_term_l, key=lambda x: x[0])
    return words_sorted_by_count_l
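

# Minimal sketch of sort_vocab() behavior (made-up data):
#   sort_vocab(['list sort', 'python sort'], np.array([3, 7]))
# returns [(3, 'list sort'), (7, 'python sort')], i.e. (count, term)
# tuples in ascending order of count.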


def sort_qa_by_score(qagroup_poptop_df):
    """Return a dataframe of Id's and their Scores, sorted by score.

    Use one Q&A group dataframe as input.
    """
    ids_and_scores_df = qagroup_poptop_df.sort_values(['Score'])
    ids_and_scores_df = ids_and_scores_df[['Id', 'Score']]
    cf.logger.info('Lowest scoring Answers:')
    cf.logger.info(ids_and_scores_df.head())
    cf.logger.info('Highest scoring Answers:')
    cf.logger.info(ids_and_scores_df.tail())
    return ids_and_scores_df


def find_hi_score_terms_in_bodies(words_sorted_by_count_l,
                                  clean_qa_bodies_l,
                                  MAX_HI_SCORE_TERMS,
                                  qagroup_poptop_df):
    """Save terms that a record has in common with frequently-seen text.

    Important variables.

    clean_qa_bodies_df: A dataframe with cleaned text from Q&A
    bodies.

    HiScoreTerms: A column added to qagroup_poptop_df,
    holding terms that might indicate that this is a useful record.

    HSTCount: A column added to qagroup_poptop_df, holding
    the total count of the number of high score terms in a record.

    qagroup_poptop_df: A dataframe of one question
    and its related answers (one Q&A group). The question is
    both pop (popular) and top: it has several answers, and some
    answers are by owners with high reputation scores.

    words_sorted_by_count_l: A list of strings of ngrams found
    in the bodies of records; the list is sorted by the count
    of each ngram.

    Actions.

    Start with a list of ngrams from the bodies of a Q&A group,
    sorted by count.
    Select a subset of ngrams that appear most often.
    Compare each ngram of that subset with each ngram in the body
    of one row in the Q&A group dataframe. If there is a match,
    save the term and increment the total HSTCount
    for that row in the output dataframe.
    Use nested loops to check all frequently-seen ngrams against
    all rows of the Q&A group dataframe.

    Return the updated Q&A group dataframe for further investigation.
    """
    clean_qa_bodies_df = pd.DataFrame(clean_qa_bodies_l)
    cf.logger.debug("clean_qa_bodies_l, top:")
    cf.logger.debug(clean_qa_bodies_l[:1])
    cf.logger.debug("clean_qa_bodies_l, bottom:")
    cf.logger.debug(clean_qa_bodies_l[-1:])
    #TBF
    # p.1. Separate the terms from each other so it is clear what was
    # found and what was not found in the A's being checked.
    qagroup_poptop_df["HiScoreTerms"] = ""
    qagroup_poptop_df["HSTCount"] = 0
    cf.logger.info("Terms from hi-score Answers.")
    #TBD, Maybe collect these "count,w" data into a single list,
    # and write it in one step to the log file.
    count_lim = min(10, MAX_HI_SCORE_TERMS)
    for count, w in words_sorted_by_count_l[-count_lim:]:
        log_msg = "count, w: " + str(count) + " , " + w
        cf.logger.info(log_msg)
        tmp2_sr = clean_qa_bodies_df[0].str.contains(w)
        for index, row in tmp2_sr.iteritems():
            if row:
                #TBD, Maybe change string 'w' to a list or tuple for
                # better storage in the df?
                #OK qagroup_poptop_df.loc[index, "HiScoreTerms"] = \
                #    qagroup_poptop_df.loc[index, "HiScoreTerms"] + w + ' , '
                qagroup_poptop_df.loc[index, "HiScoreTerms"] += (w + ' , ')
                #TBD, increment hst counter each time one is found.
                #TBD, How to handle multiple instances of hst? count
                # each occurrence?
                #TBD, use a df func to simply count the number of
                # HiScoreTerms in each row?
                qagroup_poptop_df.loc[index, "HSTCount"] += 1
    #D cf.logger.debug("DBG, qagroup_poptop_df.head():")
    #D cf.logger.debug(qagroup_poptop_df.head())

    # Save possible valuable answers to a separate file for review.
    # Replace empty strings in HiScoreTerms cells with NaN,
    # to drop low value answers easily w/ dropna().
    qagroup_poptop_df['HiScoreTerms'].replace('', np.nan, inplace=True)

    # Save a new df with only rows that have data in the HiScoreTerms
    # column.
    # TBD, Remove this statement to fix bug that deletes many Q&A of value.
    # Consider other workarounds if this is a problem.
    # qagroup_poptop_df = qagroup_poptop_df.dropna(subset=['HiScoreTerms'])

    #D # Print partial data about interesting answers to check.
    #D print("\nCheck these low score Answers for useful data: ")
    #D for index, row in qagroup_poptop_df.iterrows():
    #D     if np.isnan(row['ParentId']):  # Found a question.
    #D         print('Id, Title: ', row['Id'], row['Title'])
    #D print(qagroup_poptop_df[['Id', 'Score', 'CreationDate']])
    #D print(qagroup_poptop_df[['Id', 'HiScoreTerms']])

    # Also write summary data to the log.
    cf.logger.info("Check low score Answers for useful data: ")
    cf.logger.info(qagroup_poptop_df[
        ['Id', 'Score', 'HSTCount', 'CreationDate', 'Title',
         'HiScoreTerms']])
    return qagroup_poptop_df
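

# Illustrative outcome of find_hi_score_terms_in_bodies() (made-up data):
# if 'virtualenv pip install' is among the count_lim most frequent ngrams,
# every row whose cleaned body contains that ngram gets
# 'virtualenv pip install , ' appended to its HiScoreTerms cell and its
# HSTCount incremented by 1. Rows that match several frequent ngrams
# accumulate a high HSTCount, flagging possibly valuable low-score answers.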


if __name__ == '__main__':
    main()