# TextmineThis_symptoms.py
import nltk
import scipy
import numpy
import re
import math
import hcluster
__author__ = "Henrik Groenholt Jensen"
class Textminer:
    """
    Text-mining helper for disease/symptom data: builds a term-document
    matrix from symptom distributions, applies TF-IDF weighting and scores
    documents against a comma-separated symptom query.
    @requires python-nltk along with the english nltk stopword-corpus
    @requires installation of the hcluster-library
    """
    def __init__(self):
        """
        Stateless - nothing to initialize.
        """
        pass
    def stem(self, data, stemmer=nltk.PorterStemmer()):
        """
        Simply stems words.
        @param data str A whitespace-separated string of terms.
        @param stemmer nltk.*Stemmer A stemmer from the nltk library
                       (PorterStemmer by default).
        @returns a string of lowercased, stemmed words.
        """
        if isinstance(data, str) or isinstance(data, unicode):
            terms = [stemmer.stem(term.strip().lower())
                     for term in data.split(' ') if term != '']
            return " ".join(terms)
        raise TypeError("stem() expects a string, got %s" % type(data))
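    # Usage sketch (hypothetical input; exact output depends on the chosen
    # stemmer, but the default PorterStemmer behaves roughly like this):
    #   >>> Textminer().stem("Coughing Fevers")
    #   'cough fever'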
    def createTermDoc(self, data, stemming=True):
        """
        Create a term-document matrix.
        A term-count is given by the coordinate:
            score = (symptom_hash[term], doc_hash[doc_id])
        These hashes are returned along with the matrix for later use and
        lookup.
        @param data list A list of tuples containing a document id,
                         the document's symptom distribution and its
                         title.
                         Example: [(id1,symptomlist1,title1),
                                   (id2,symptomlist2,title2),...]
        @param stemming bool Stem the words if True.
        @returns a dense numpy matrix (documents as rows, terms as
                 columns) along with the term, document and name hashes.
        """
        # Build the hashes and scores
        symptom_hash, doc_hash, score_hash, name_hash = \
            self._getHashesAndScores(data, stemming)
        m = len(doc_hash); n = len(symptom_hash)
        termDoc = numpy.matrix(numpy.zeros((m, n)))
        for doc, tscores in score_hash.items():
            for term, score in tscores.items():
                termDoc[doc_hash[doc], symptom_hash[term]] = score
        return termDoc, symptom_hash, doc_hash, name_hash
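    # Usage sketch with made-up data - each tuple is
    # (orpha-num, [(symptom, score), ...], title):
    #   >>> tm = Textminer()
    #   >>> data = [(1, [("fever", 2), ("rash", 1)], "Disease A"),
    #   ...         (2, [("fever", 1)], "Disease B")]
    #   >>> termDoc, sym_h, doc_h, name_h = tm.createTermDoc(data)
    #   >>> termDoc.shape
    #   (2, 2)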
    def runTFIDF(self, termDoc):
        """
        Turns a term-document matrix into a Term-Frequency
        Inverse-Document-Frequency (TF-IDF) matrix, using a
        log-transformation on both TF and IDF:
            weight = log(1 + tf) * log(N / df)
        where N is the number of documents and df is the number of
        documents containing the term.
        @param termDoc matrix A term-doc matrix as produced by
                       createTermDoc.
        @returns the tf-idf weighted matrix (modified in place).
        """
        # number of documents each term occurs in (document frequency)
        term_counts = sum(termDoc > 0).tolist()[0]
        print "Running TF-IDF..."
        for row in range(0, termDoc.shape[0]):
            for col in termDoc[row, :].nonzero()[1].tolist()[0]:
                tf = math.log(1 + termDoc[row, col])
                # float division - integer division would truncate the
                # ratio and zero out terms occurring in most documents
                idf = math.log(float(termDoc.shape[0]) / term_counts[col])
                termDoc[row, col] = tf * idf
        return termDoc
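    # Worked example of the weighting (hypothetical numbers): with 3
    # documents, a term counted twice in a document and present in 2 of
    # the 3 documents is weighted log(1+2) * log(3/2.) ~= 0.445.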
    def queryTheMatrix(self, termDoc, query, term_hash, doc_hash, name_hash,
                       stemming=True):
        """
        Given a term-document matrix and the term, document and name
        hashes, returns a list of document scores based on a
        comma-separated query string. The score represents the document's
        relevance to the query: a simple summation of the scores of each
        query term present in the document.
        @param termDoc matrix A term-doc matrix (ideally tf-idf weighted).
        @param query str The query from which scores are produced.
        @param term_hash dic Term hash, where the term itself is the key.
        @param doc_hash dic Doc hash, where the id of the doc is the key.
        @param name_hash dic Name hash, where the id of the doc is the key.
        @param stemming bool Stem the words if True.
        @returns a list of tuples containing the document name and score
                 - sorted by score in descending order.
        """
        # split by comma into symptom-terms
        if isinstance(query, str) or isinstance(query, unicode):
            searchTerms = [s.strip().lower() for s in query.split(',') if s != '']
        else:
            raise TypeError("query must be a string")
        # stem the query terms
        if stemming: searchTerms = [self.stem(x) for x in searchTerms if x]
        print "Search terms: ", searchTerms
        # reverse lookup: matrix row -> original orpha-num
        rev_doc_hash = dict(zip(doc_hash.values(), doc_hash.keys()))
        scores = {}
        for term in searchTerms:
            if term not in term_hash:
                print "Term not found: '" + term + "'"
                continue
            n = term_hash[term]
            # NOTE: Consider using sets...
            docs = termDoc[:, n].nonzero()[0].tolist()[0]
            # Sum score measure:
            for doc in docs:
                doc_id = rev_doc_hash[doc]  # extract the original orpha-nums
                scores[doc_id] = scores.get(doc_id, 0) + termDoc[doc, n]
        # Sort the scores (by value of course)
        scores = sorted(scores.items(), key=lambda (k, v): (v, k), reverse=True)
        # Replace orpha-nums with document titles
        scores = [(name_hash[s[0]], s[1]) for s in scores]
        # EXPERIMENT:
        # Normalized cumulative summation of the scores
        # totalsum = float(sum([s[1] for s in scores]))
        # scores = [(s[0], s[1]/totalsum*100) for s in scores]
        return scores
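    # Usage sketch (continuing the hypothetical data from the
    # createTermDoc example; exact scores depend on the weighting):
    #   >>> tm.queryTheMatrix(termDoc, "fever, rash", sym_h, doc_h, name_h)
    #   [('Disease A', ...), ('Disease B', ...)]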
    #########################
    #  AUXILIARY FUNCTIONS  #
    #########################
    def _getHashesAndScores(self, data, stemming):
        """ (used by the createTermDoc-method)
        Build a symptom hash, a doc hash and a term-doc score hash.
        @param data list A list of tuples containing a document id
                         (a num), the document's symptom distribution
                         (a list of (symptom, score) tuples) and its
                         title.
                         Example: [(id1,symptomlist1,title1),
                                   (id2,symptomlist2,title2),...]
        @param stemming bool Stem the words if True.
        @returns
            * a symptom hash, where the name of the symptom is the key,
            * a doc hash, where the id of the doc is the key,
            * a score hash, keyed by doc id and then by symptom
              (e.g. score_hash[doc][symptom] = score)
            * and a name hash, where the id of the doc is the key.
            (Note that the hashes are zero-indexed!)
        """
        symptom_hash = {}
        doc_hash = {}
        name_hash = {}
        score_hash = {}  # term-doc scores
        for disease in data:
            doc_id = int(disease[0])
            symptom_dist = disease[1]
            name = disease[2]
            # # Sanitize symptoms
            # doc = sanitizer.sub(' ', doc)
            #
            # ### Mark irrelevant abstracts ###
            # qualifier = "term does not characterize a disease"
            # if qualifier in doc: name = "*" + name
            #################################
            if doc_id in doc_hash: continue  # we don't want duplicates
            doc_hash[doc_id] = len(doc_hash)  # assuming documents unique
            name_hash[doc_id] = name  # name-hash for later querying..
            # stem the symptoms
            if stemming: symptom_dist = [(self.stem(x[0]), x[1]) for x in symptom_dist]
            # fdist = self.getWordCount(doc)  # get the term distribution
            # build symptom hashes and term-doc scores
            score_hash[doc_id] = {}
            for symptom, score in symptom_dist:
                if symptom not in symptom_hash:
                    symptom_hash[symptom] = len(symptom_hash)
                score_hash[doc_id][symptom] = score
        return symptom_hash, doc_hash, score_hash, name_hash
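#########################
#    USAGE EXAMPLE      #
#########################
# A minimal end-to-end sketch with made-up symptom data; the real input
# is expected to come from elsewhere as (orpha-num, symptom distribution,
# title) tuples, so the documents below are purely illustrative.
if __name__ == "__main__":
    tm = Textminer()
    data = [(1, [("fever", 2), ("cough", 1), ("rash", 1)], "Disease A"),
            (2, [("fever", 1), ("fatigue", 3)], "Disease B"),
            (3, [("rash", 2), ("fatigue", 1)], "Disease C")]
    termDoc, sym_h, doc_h, name_h = tm.createTermDoc(data)
    termDoc = tm.runTFIDF(termDoc)
    for name, score in tm.queryTheMatrix(termDoc, "fever, rash",
                                         sym_h, doc_h, name_h):
        print name, score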