2
2
# @Author: LC
3
3
# @Date: 2016-04-08 15:26:49
4
4
# @Last modified by: WuLC
5
- # @Last Modified time: 2016-04-12 20:29:36
5
+ # @Last Modified time: 2016-04-21 14:41:10
6
6
7
7
# @Function:implementation of decision tree described in programming-collective-intelligence in chapter 7
8
8
# @Referer: chapter 7 in book 《programming-collective-intelligence》
@@ -105,7 +105,7 @@ def entropy(rows):
105
105
"""get the entropy of rows
106
106
107
107
Args:
108
- rows (TYPE ): rows to be caculated about their entropy
108
+ rows (list[list] ): rows whose entropy is to be calculated
109
109
110
110
Returns:
111
111
float: entropy of the rows
@@ -119,7 +119,21 @@ def entropy(rows):
119
119
ent -= p * log (p , 2 )
120
120
return ent
121
121
122
-
122
def variance(rows):
    """Return the variance of the numeric values in the last column of rows.

    Used as the impurity metric when the target (last column) is numeric,
    i.e. for regression trees.

    Args:
        rows (list[list]): data rows; the last element of each row must be
            a number (the regression target).

    Returns:
        float: population variance of the last-column values, or 0.0 when
        rows is empty.
    """
    # Guard against empty input: the original would raise ZeroDivisionError.
    if not rows:
        return 0.0
    values = [row[-1] for row in rows]
    mean = sum(values) / len(values)
    # Population variance: mean of squared deviations from the mean.
    return sum((v - mean) ** 2 for v in values) / len(values)
136
+
123
137
def build_tree (rows ):
124
138
"""build the decision of the rows in the metric of entropy
125
139
@@ -240,8 +254,46 @@ def prune(tree, mini_entropy):
240
254
prune (tree .fb , mini_entropy )
241
255
242
256
257
def mdclassfy(tree, observation):
    """Classify an observation that may contain missing (None) fields.

    When the value tested at a node is missing, both branches are followed
    and their results are merged, each weighted by that branch's share of
    the items (the missing-data strategy from chapter 7 of
    "Programming Collective Intelligence").

    Args:
        tree: root DecisionTreeNode of a trained decision tree; leaves carry
            a `results` dict, internal nodes carry `col`, `value`, `tb`, `fb`.
        observation (list): feature values; entries may be None (missing).

    Returns:
        dict: mapping of class label -> (possibly fractional) count.
    """
    # Leaf node: its stored result counts are the answer.
    # (Original bug: returned the undefined name `results`.)
    if tree.results is not None:
        return tree.results
    col = tree.col
    if observation[col] is None:
        # Missing value: evaluate both subtrees and combine them,
        # weighting each by the fraction of items on that branch.
        tb = mdclassfy(tree.tb, observation)
        fb = mdclassfy(tree.fb, observation)
        # Original bug: `sum(tb.values)` was missing the call parentheses.
        tb_count = sum(tb.values())
        fb_count = sum(fb.values())
        tb_fraction = tb_count / (tb_count + fb_count)
        fb_fraction = fb_count / (tb_count + fb_count)
        # Original bugs: dict name typo (`resutl`), `+=` on missing keys,
        # and weighting by the raw counts instead of the fractions.
        result = {}
        for k, v in tb.items():
            result[k] = result.get(k, 0) + v * tb_fraction
        for k, v in fb.items():
            result[k] = result.get(k, 0) + v * fb_fraction
        return result
    # Value present: descend the matching branch (numeric -> threshold
    # comparison, otherwise exact equality).
    value = tree.value
    if isinstance(value, (int, float)):
        branch = tree.tb if observation[col] >= value else tree.fb
    else:
        branch = tree.tb if observation[col] == value else tree.fb
    return mdclassfy(branch, observation)
292
+
243
293
244
294
295
+
296
+
245
297
if __name__ == '__main__' :
246
298
rot = DecisionTreeNode ()
247
299
rot = build_tree (sample_data )
0 commit comments