-
Notifications
You must be signed in to change notification settings - Fork 0
/
knn.py
118 lines (94 loc) · 4.1 KB
/
knn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python
import numpy
import hcluster.distance as hdist
import db as DB
import TextmineThis as TT
import pod as POD
class kNN:
    """k-nearest-neighbours classifier over term-document feature vectors.

    Attributes:
        classes  Collection of the distinct class labels seen in training.
        xs       Training observations (a numpy array once train() has run).
        ys       Class label of each observation in xs (same order).
        k        Number of nearest neighbours that get a vote.
        miner    TextmineThis.Textminer instance used by precalc().
        db       db.db instance queried by trainfromtfidf().
    """

    def __init__(self):
        self.classes = []
        self.xs = []
        self.ys = []
        self.k = None
        self.miner = TT.Textminer()
        self.db = DB.db()

    def precalc(self):
        """Parse the Orpha descriptions and build the term-document data.

        Returns whatever Textminer.createTermDoc() returns for the parsed
        data (per the original note: termdoc, t_hash, d_hash, n_hash).
        """
        data = POD.parseOrphaDesc()
        return self.miner.createTermDoc(data)

    def trainfromtfidf(self, termDoc, t_hash, d_hash, n_hash):
        """Split the documents into a labelled training set and a rest set.

        Documents whose key appears in the ``icd_10`` table (column
        ``patres``) become training examples; all others are returned as
        "to classify".

        Args:
            termDoc: Row-indexable matrix; ``termDoc[idx].tolist()`` is used
                as the feature data of document ``idx``.
            t_hash: Unused here; accepted so the result of precalc() can be
                passed straight through.
            d_hash: Mapping of document key -> row index into termDoc.
            n_hash: Unused here (see t_hash).

        Returns:
            (features, totrainidx, toclassifyidx) where ``features`` is a
            list of ``{"feat": ..., "class": ...}`` dicts suitable for
            train(), and the two index lists hold ``(row index, key)`` pairs.
        """
        icd10 = self.db.c.execute("select patres,code from icd_10").fetchall()
        dicd10 = dict(icd10)  # patres -> code

        patrescodes = d_hash.keys()
        notpatres = set(patrescodes).difference(set(dicd10.keys()))

        toclassifyidx = [(d_hash[code], code) for code in notpatres]
        totrainidx = [(d_hash[code], code)
                      for code in patrescodes if code not in notpatres]

        # The class of a training document is the first element of its code
        # (presumably the leading letter of the ICD-10 code -- confirm
        # against the table contents).
        features = [
            {"feat": termDoc[idx].tolist(), "class": dicd10[patcode][0]}
            for (idx, patcode) in totrainidx
        ]
        return features, totrainidx, toclassifyidx

    def train(self, feats, k, typecode=None):
        """Train the classifier in place on a list of labelled features.

        Args:
            feats: Iterable of dicts ``{"feat": [...], "class": label}``.
                ``feat`` is a list of observations; every observation in it
                is assigned ``label``.
            k: Number of neighbours examined when classifying.
            typecode: Optional dtype forwarded to numpy.asarray for xs.

        Sets self.xs, self.ys, self.classes and self.k; returns None.
        """
        features = []
        featclass = []
        for feat in feats:
            observations = feat['feat']
            features += observations
            # Repeat the label as a list element. Repeating the raw string
            # and extending the list with it would split a multi-character
            # label into single characters and misalign ys with xs.
            featclass += [feat['class']] * len(observations)

        self.classes = set(featclass)
        self.xs = numpy.asarray(features, typecode)
        self.ys = featclass
        self.k = k

    def calculate(self, x, weight_fn=lambda x, y: 1, distance_fn=None):
        """Compute the per-class vote weight for observation x.

        Args:
            x: The observed data (converted with numpy.asarray).
            weight_fn: Optional function taking x and a training example
                and returning that neighbour's vote weight (default: 1).
            distance_fn: Optional function taking two points and returning
                the distance between them. If None (the default),
                hcluster's Euclidean distance is used.

        Returns:
            (weights, order): ``weights`` maps each class to the summed
            weight of its members among the k nearest neighbours; ``order``
            is the full list of ``(distance, index)`` pairs sorted
            ascending.
        """
        if distance_fn is None:
            # Resolved at call time so the documented None default works.
            distance_fn = hdist.euclidean

        x = numpy.asarray(x)
        order = [(distance_fn(x, neighbour), i)
                 for i, neighbour in enumerate(self.xs)]
        order.sort()

        weights = dict.fromkeys(self.classes, 0.0)  # class -> votes
        # Only the first k entries (the nearest neighbours) vote.
        for _dist, i in order[:self.k]:
            klass = self.ys[i]
            weights[klass] = weights[klass] + weight_fn(x, self.xs[i])
        return weights, order

    def classify(self, x, weight_fn=lambda x, y: 1, distance_fn=None):
        """Classify observation x into the class with the largest weight.

        If not specified, weight_fn gives all neighbours equal weight and
        distance_fn is the Euclidean distance. On a tie the class met first
        while iterating the weights wins. Returns None when there are no
        classes.
        """
        weights, _ = self.calculate(x, weight_fn, distance_fn)
        most_class = None
        most_weight = None
        for klass, weight in weights.items():
            if most_class is None or weight > most_weight:
                most_class = klass
                most_weight = weight
        return most_class