forked from h2oai/h2o-2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathh2o_pca.py
103 lines (90 loc) · 4.42 KB
/
h2o_pca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import math
import re
import string
import time
from pprint import pprint

import h2o_cmd
import h2o
def simpleCheckPCA(self, pca, **kwargs):
    """Sanity-check a PCA response and the model it points to.

    Raises on any warning that contains "Failed"/"failed", inspects the model
    stored under ``pca['destination_key']`` and verifies that no standard
    deviation, proportion of variance or eigenvector entry is NaN. It then
    prints the smallest and largest (by absolute value) entries of the first
    eigenvector so a human can eyeball them, and finally checks the sandbox
    for errors.

    Args:
        self: the calling test object; not used by this function.
        pca: response dict of the PCA request. Must contain
            'destination_key'; may contain 'warnings'.
        **kwargs: accepted for call-site compatibility; not used.

    Returns:
        The value stored under ``pca['warnings']``, or None if absent.

    Raises:
        Exception: if a warning matches "[Ff]ailed" or a NaN is found in
            the standard deviations, propVars or eigenvectors.
    """
    warnings = pca.get('warnings')
    if warnings:
        # catch the 'Failed to converge' warning for now
        failed = re.compile("[Ff]ailed")
        for w in warnings:
            print("\nwarning: %s" % (w,))
            if failed.search(w):
                raise Exception(w)

    # Check other things in the json response dictionary 'pca' here
    destination_key = pca['destination_key']
    pcaResult = h2o_cmd.runInspect(key=destination_key, view=100)
    h2o.verboseprint('pcaResult Inspect:', h2o.dump_json(pcaResult))
    model = pcaResult["PCAModel"]

    # Check no NaN in sdevs, propVars, or in PCs
    print("Checking sdevs...")
    sdevs = model["stdDev"]
    h2o.verboseprint("pca sdevs:", h2o.dump_json(sdevs))
    # NOTE(review): sdevs are reportedly supposed to be sorted (largest
    # first); that ordering is not verified here -- TODO confirm and check.
    for PC, s in sdevs.items():
        if math.isnan(s):
            raise Exception("sdev %s is NaN: %s" % (PC, s))

    print("Checking propVars...")
    propVars = model["propVar"]
    h2o.verboseprint("pca propVars:", h2o.dump_json(propVars))
    for PC, propvar in propVars.items():
        if math.isnan(propvar):
            raise Exception("propVar %s is NaN: %s" % (PC, propvar))
    print(" Good!")

    print("Checking eigenvectors...")
    pcs = model["eigenvectors"]
    h2o.verboseprint("pca eigenvectors:", h2o.dump_json(pcs))
    for i, eigenvector in enumerate(pcs):
        for e in eigenvector.values():
            if math.isnan(e):
                raise Exception("Component %s has NaN: %s eigenvector %s"
                                % (i, e, eigenvector))
    print(" Good!")

    print("How many components did we get? (after enum col dropping): %s"
          % len(pcs))

    # Now print the top ten, sorting by value and keeping (key, value) tuples
    # so the column is visible. It should match the column numbering, even if
    # columns were skipped due to enums.
    print("Just look at the sort for the first row in pca eigenvectors")
    i = 0
    s = pcs[i]
    # Sort on abs() because the sign of a loading can be + or -.
    sorted_s = sorted(s.items(), key=lambda t: abs(t[1]))
    num = min(10, len(s))
    print("\n%s First (smallest) %d. sorted_pcs[0:9]: %s\n"
          % (i, num, sorted_s[:num]))
    print("The first entry from the eigenvector, should have the largest std dev, because it's sorted")
    print("Rule of thumb is we can then look at the sorted values, and guess it's related to column importance")
    print("The sort should be on the abs(), since the signs can be + or -")
    print("\n%s Last %d (largest) sorted_s[-10:]: %s\n"
          % (i, num, sorted_s[-num:]))

    # shouldn't have any errors
    h2o.check_sandbox_for_errors()
    return warnings
def resultsCheckPCA(self, pca, **kwargs):
    """Check numeric invariants of the PCA model referenced by *pca*.

    Always checks that the proportions of variance sum to 1. The remaining
    checks run only when the model's parameters report ``tolerance == 0.0``
    and ``standardized == True``:

    * the squared standard deviations sum to within .5 of the number of
      standard deviations reported, and
    * the squared entries of every eigenvector sum to 1 (after rounding to
      5 decimal places).

    Args:
        self: test object providing ``assertAlmostEqual``.
        pca: response dict of the PCA request; must contain
            'destination_key'.
        **kwargs: accepted for call-site compatibility; not used.

    Returns:
        None.

    Raises:
        AssertionError: presumably raised by ``self.assertAlmostEqual`` when
            the propVars do not sum to 1 (depends on the test object).
        Exception: if the standard deviation or eigenvector checks fail.
    """
    destination_key = pca['destination_key']
    pcaResult = h2o_cmd.runInspect(key=destination_key, view=100)
    model = pcaResult["PCAModel"]

    print("Checking that propVars sum to 1")
    propVars = model["propVar"]
    residual = 1.0 - math.fsum(propVars.values())
    self.assertAlmostEqual(residual, 0, msg="PropVar does not sum to 1.")
    print(" Good!")

    # If not standardized or tolerance != 0, don't do the remaining checks.
    params = model["PCAParams"]
    # Deliberately an equality test against True (not a truthiness test) so
    # the set of values that pass stays exactly what it was.
    if params["tolerance"] != 0.0 or params["standardized"] != True:
        return

    print("Checking that sdevs^2 sums to number of variables")
    sdevs = model["stdDev"]
    residual = len(sdevs) - math.fsum(sdev ** 2 for sdev in sdevs.values())
    # "not (abs < .5)" rather than "abs >= .5" so a NaN residual also fails.
    if not abs(residual) < .5:
        print("sum(sdevs^2) are not within .5 of 0. sdevs incorrect?")
        # Actually show the offending values before failing.
        print(h2o.dump_json(sdevs))
        raise Exception("Standard Deviations are possibly incorrect!")
    print(" Good!")

    print("Checking that the sum of square component loadings is 1 for each component.")
    print("In symbols, we are checking: sum_j(a_ij)^2 == 1 for all i")
    pcs = model["eigenvectors"]
    sums = [round(sum(a ** 2 for a in eigenvector.values()), 5)
            for eigenvector in pcs]
    print("Sum of the square PC loadings are: %s" % (sums,))
    if any(total != 1 for total in sums):
        raise Exception("Sum of the square loadings do not add up to 1 for at least one eigenvector!")
    print("Good!")