Skip to content

Commit

Permalink
did we lose lambda max in glm2 submodels?
Browse files Browse the repository at this point in the history
  • Loading branch information
Kevin Normoyle committed Mar 6, 2014
1 parent 8e6220e commit 38c044d
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 97 deletions.
31 changes: 16 additions & 15 deletions py/h2o_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,26 +153,27 @@ def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False
if h2o.beta_features:
# number of submodels = number of lambda
# min of 2. lambdaMax is first
if len(GLMModel['submodels']) < 2:
raise Exception("Always should have a minimum of 2 submodels in GLM2 response", len(submodels))

submodels = GLMModel['submodels']
lambdas = GLMModel['lambdas']
if len(lambdas) < 2:
raise Exception("Always should have a minimum of 2 lambdas in GLM2 response", len(submodels))

submodels0 = GLMModel['submodels'][0]
submodels1 = GLMModel['submodels'][1]
lambdaMax = lambdas[0]
if lambdaMax <= lambdas[1]:
raise Exception("lambdaMax %s should always be < the lambda result %s we're checking" % (lambdaMax, lambdas[1]))

# since all our tests?? only use one lambda, the best_lambda_idx should = 1
best_lambda_idx = GLMModel['best_lambda_idx']
if best_lambda_idx != 1:
raise Exception("best_lamda_idx %s should point to the one lamda we specified? %s" % (best_lamda_idx, lamdas[1]))

lambdaMax = lambdas[0]
print "lambdaMax:", lambdaMax

if 1==0:
if len(submodels) < 2:
raise Exception("Always should have a minimum of 2 submodels in GLM2 response", len(submodels))
if len(lambdas) < 2:
raise Exception("Always should have a minimum of 2 lambdas in GLM2 response", len(submodels))
if best_lambda_idx != 1:
raise Exception("best_lamda_idx %s should point to the one lamda we specified? %s" % (best_lamda_idx, lamdas[1]))
if lambdaMax <= lambdas[-1]:
raise Exception("lambdaMax %s should always be < the lambda result %s we're checking" % (lambdaMax, lambdas[1]))

submodels0 = submodels[0]
submodels1 = submodels[-1] # hackery to make it work when there's just one
iterations = submodels1['iteration']

else:
iterations = GLMModel['iterations']

Expand Down
57 changes: 47 additions & 10 deletions py/h2o_summ.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@



# Courtesy of Wai Yip Tung. a pure python percentile function
# similar to Wai Yip Tung. a pure python percentile function
# so we don't have to use the one(s) from numpy or scipy
# and require those package installs
## {{{ http://code.activestate.com/recipes/511478/ (r1)
Expand All @@ -10,6 +7,9 @@
import functools

def percentileOnSortedList(N, percent, key=lambda x:x):
# 5 ways of resolving fractional
# floor, ceil, funky, linear, mean
INTERPOLATE = 'mean'
"""
Find the percentile of a list of values.
Expand All @@ -22,13 +22,50 @@ def percentileOnSortedList(N, percent, key=lambda x:x):
if N is None:
return None
k = (len(N)-1) * percent
f = math.floor(k)
c = math.ceil(k)
f = int(math.floor(k))
c = int(math.ceil(k))
if f == c:
return key(N[int(k)])
d0 = key(N[int(f)]) * (c-k)
d1 = key(N[int(c)]) * (k-f)
return d0+d1
d = key(N[k])
msg = "aligned:"

elif INTERPOLATE=='floor':
d = key(N[f])
msg = "fractional with floor:"

elif INTERPOLATE=='ceil':
d = key(N[c])
msg = "fractional with ceil:"

elif INTERPOLATE=='funky':
d0 = key(N[f]) * (c-k)
d1 = key(N[c]) * (k-f)
d = d0+d1
msg = "fractional with Tung(floor and ceil) :"

elif INTERPOLATE=='linear':
pctDiff = (k-f)/(c-f+0.0)
dDiff = pctDiff * (key(N[c]) - key(N[f]))
d = key(N[c] + dDiff)
msg = "fractional with linear(floor and ceil):"

elif INTERPOLATE=='mean':
d = (key(N[c]) + key(N[f])) / 2.0
msg = "fractional with mean(floor and ceil):"

# print 3 around the floored k, for eyeballing when we're close
flooredK = int(f)
# print the 3 around the median
if flooredK > 0:
print "prior->", key(N[flooredK-1]), " "
else:
print "prior->", "<bof>"
print "floor->", key(N[flooredK]), " ", msg, d
if flooredK+1 < len(N):
print " ceil->", key(N[flooredK+1])
else:
print " ceil-> <eof>"

return d

# median is 50th percentile.
def medianOnSortedList(N, key=lambda x:x):
Expand Down
11 changes: 6 additions & 5 deletions py/h2o_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
import os, zipfile, simplejson as json
import h2o


# a short quick version for relative comparison
# a short quick version for relative comparison. But it's probably better to use approx_equal below
# the subsequent ones might be preferred, especially assertAlmostEqual(
# http://en.wikipedia.org/wiki/Relative_difference
# http://stackoverflow.com/questions/4028889/floating-point-equality-in-python
Expand Down Expand Up @@ -74,12 +73,14 @@ def approx_equal(x, y, *args, **kwargs):
if result is NotImplemented:
continue
return bool(result)

# If we get here without returning, then neither x nor y knows how to do an
# approximate equal comparison (or are both floats). Fall back to a numeric
# comparison.
return _float_approx_equal(x, y, *args, **kwargs)

# note this can take 'tol' and 'rel' parms for the float case
# just wraps approx_equal in an assert with a good print message
def assertApproxEqual(x, y, msg='', **kwargs):
if not approx_equal(x, y, msg=msg, **kwargs):
m = msg + '. h2o_util.assertApproxEqual failed comparing %s and %s. %s.' % (x, y, kwargs)
Expand All @@ -103,9 +104,9 @@ def cleanseInfNan(value):
# arbitrary Python floating point numbers.

# The weights need to cover the whole list? Otherwise you don't get the rest of the choices
# random_data = [6,7,8]
# weights = [2,3,5]
# d = h2o_util.random_data[weighted_choice(weights)]
# random_data = [6,7,8]
# weights = [2,3,5]
# d = random_data[h2o_util.weighted_choice(weights)]
def weighted_choice(weights):
rnd = random.random() * sum(weights)
for i, w in enumerate(weights):
Expand Down
123 changes: 57 additions & 66 deletions py/testdir_single_jvm/b.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
import scipy as sp
import math
OTHER_T = 0.50
BIN_COUNT = 2
BIN_COUNT = 100000

# recursive binning to a single bin with one number
# might have multiple rows with that one number
# possible answers:
# that number if single row in that bin,
Expand Down Expand Up @@ -48,6 +47,9 @@ def findQuantile(d, dmin, dmax, threshold):

# totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere)
totalRows = len(d)
# Used to have
desiredBinCnt = BIN_COUNT
maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues

# initialize
newValStart = dmin
Expand All @@ -59,9 +61,6 @@ def findQuantile(d, dmin, dmax, threshold):
# yes there is no newHighCount. Created during the pass, though.

# state shared by each pass
# Used to have
desiredBinCnt = BIN_COUNT
maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues
assert maxBinCnt > 0

hcnt = [None for b in range(maxBinCnt)]
Expand Down Expand Up @@ -102,12 +101,9 @@ def htot2():

# playing with creating relative NUDGE values to make sure bin range
# is always inclusive of target.
NUDGE = 1e-3
NUDGE = (1000 * (valEnd - valStart)) / 1000000
# ratio it down from binSize.
# It doesn't need to be as big as binSize.
# implicitly, it shouldn't need to be as large as binSize
NUDGE = binSize / desiredBinCnt
NUDGE = 0

# init to zero for each pass
Expand Down Expand Up @@ -136,7 +132,7 @@ def htot2():
else:
# where are we zeroing in? (start)
# print valOffset, binSize
hcntIdx = int(round((valOffset * 1000000.0) / binSize) / 1000000.0)
hcntIdx = int(math.floor((valOffset * 1000000.0) / binSize) / 1000000.0)
assert hcntIdx >=0 and hcntIdx<=maxBinCnt, "val %s %s %s %s hcntIdx: %s maxBinCnt: %s binSize: %s" % \
(val, valStart, valEnd, valOffset, hcntIdx, maxBinCnt, binSize)

Expand All @@ -154,81 +150,74 @@ def htot2():
assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal" % (totalRows, totalBinnedRows)

# now walk thru and find out what bin to look inside
k = 0
currentCnt = newLowCount
currentCnt = hcnt_low
targetCntFull = (threshold * totalRows) + 0.5
targetCntInt = int(math.floor(targetCntFull))
targetCntFract = targetCntFull - targetCntInt

print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract

k = 0
while((currentCnt + hcnt[k]) < targetCntInt):
currentCnt += hcnt[k]
k = k+1
assert k <= maxBinCnt, "k too large, k: %s maxBinCnt %s" % (k, maxBinCnt)
k += 1
assert k<=maxBinCnt, "k too large, k: %s maxBinCnt %s" % (k, maxBinCnt)

if hcnt[k]==1:
assert hcnt_min[k]==hcnt_max[k]

# some possibly interpolating guesses first, in case we have to iterate (best guess)
done = False
guess = (hcnt_max[k] - hcnt_min[k]) / 2

if currentCnt==targetCntInt:
if hcnt[k]>2:
guess = hcnt_min[k]
done = True
print "Guess A", guess

# We should end with a count of 1, otherwise it's still a best guess
# could be approximately equal
# THERE CAN BE MULTIPLE VALUES AT THE TARGET VALUE
# check for min = max in that bin!
# In the right bit with only one value,
if not done and hcnt_min[k]==hcnt_max[k] and currentCnt==targetCntInt:
# no matter what size the fraction it would be on this number
if hcnt[k]>=2 or targetCntFract==0:
if hcnt[k]==2:
# no matter what size the fraction it would be on this number
guess = (hcnt_max[k] + hcnt_min[k]) / 2.0
done = True
print "Guess B", guess

if hcnt[k]==1 and targetCntFract==0:
assert hcnt_min[k]==hcnt_max[k]
guess = hcnt_min[k]
print 'Done:', 'hcnt_min[k]', 'hcnt_max[k]', 'currentCnt', 'targetCntInt', 'targetCntFract'
print 'Done:', hcnt_min[k], hcnt_max[k], currentCnt, targetCntInt, targetCntFract
else:
# may have to extrapolate if hcnt[k]==1
# guess with mean of bin
guess = (hcnt_max[k] - hcnt_min[k])/2

# do we have to compute the mean, using the current k bin and hcnt_high_min?
# if min and max for a bin are different the count must be >1
# need to get to 1 entry to
# if there's a fractional part, and we're not done, it's in the next k
# okay...potentially multiple of same value in the bin
if not done and hcnt[k]>1 and hcnt_min[k]==hcnt_max[k] and targetCntFract!=0:
print "\nInterpolating result into single value of this bin"
print 'Guess E:', 'guess', 'k', 'hcnt[k]', 'hcnt_min[k]', 'hcnt_max[k]', 'currentCnt', 'targetCntInt'
print "Guess E:", guess, k, hcnt[k], hcnt_min[k], hcnt_max[k], currentCnt, targetCntInt
guess = hcnt_min[k]
done = True

if not done and hcnt[k]==1 and hcnt_min[k]==hcnt_max[k] and targetCntFract!=0:
print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero"
# we really should interpolate with the next non-zero bin. But we'd have to search for it.
# just do another iteration. This should only happen on the first pass.
# I guess we could search to make single pass iterations work

if k<maxBinCnt:
nextK = k + 1 # could put it over maxBinCnt
else:
nextK = k
while nextK<maxBinCnt and hcnt[nextK]==0:
nextK += 1
# have the "extra bin" for this
if nextK >= maxBinCnt and hcnt_high_cnt!=0:
nextVal = hcnt_high_min
else:
assert hcnt[nextK]!=0
nextVal = hcnt_min[nextK]
done = True
print "k", k
print "Guess C", guess

if hcnt[k]==1 and targetCntFract!=0:
assert hcnt_min[k]==hcnt_max[k]
print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero"
if k<maxBinCnt:
nextK = k + 1 # could put it over maxBinCnt
else:
nextK = k
while nextK<maxBinCnt and hcnt[nextK]==0:
nextK += 1

# have the "extra bin" for this
if nextK >= maxBinCnt:
assert hcnt_high!=0
print "hello1:", hcnt_high_min
nextVal = hcnt_high_min
else:
print "hello2:", nextK
assert hcnt[nextK]!=0
nextVal = hcnt_min[nextK]

guess = (hcnt_max[k] + nextVal) / 2.0
done = True # has to be one above us when needed. (or we're at end)
print 'k', 'hcnt_max[k]', 'nextVal'
print "hello3:", k, hcnt_max[k], nextVal
print "\nInterpolating result using nextK: %s nextVal: %s" % (nextK, nextVal)
print "Guess G with nextK:", guess

print "Guess D", guess

if not done:
newValStart = hcnt_min[k] - NUDGE# FIX! should we nudge a little?
newValStart = hcnt_min[k] - NUDGE # FIX! should we nudge a little?
newValEnd = hcnt_max[k] + NUDGE # FIX! should we nudge a little?
newValRange = newValEnd - newValStart

Expand All @@ -239,8 +228,10 @@ def htot2():
print "Saying done because newBinSize is 0."
print "newValRange: %s, hcnt[k]: %s hcnt_min[k]: %s hcnt_max[k]: %s" %\
(newValRange, hcnt[k], hcnt_min[k], hcnt_max[k])
guess = newValStart
done = newBinSize==0

if newBinSize==0:
guess = newValStart
done = True

# if we have to interpolate
# if it falls into this bin, interpolate to this bin means one answer?
Expand Down Expand Up @@ -278,9 +269,9 @@ def twoDecimals(l):
return "%.2f" % l

# csvPathname = './syn_binary_1000000x1.csv'
csvPathname = './d.csv'
csvPathname = './syn_binary_1000000x1.csv'
csvPathname = './syn_binary_100x1.csv'
csvPathname = './d.csv'
csvPathname = './syn_binary_100000x1.csv'
col = 0

print "Reading csvPathname"
Expand Down Expand Up @@ -323,7 +314,7 @@ def twoDecimals(l):
a1 = stats.scoreatpercentile(target, per=100*OTHER_T, interpolation_method='fraction')
h2p.red_print("stats.scoreatpercentile:", a1)
a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T])
h2p.red_print("scipy stats.mstats.mquantiles:", ["%.2f" % v for v in a2])
h2p.red_print("scipy stats.mstats.mquantiles:", a2)

# looking at the sorted list here
targetFP.sort()
Expand Down
3 changes: 2 additions & 1 deletion py/testdir_single_jvm/test_GLM2_basic_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,9 @@ def test_A_GLM2_basic_predict_prostate(self):
h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=hexKey, prediction='Predict.hex')

# just get a predict and AUC on the same data. has to be binomial result
h2o.nodes[0].generate_auc(thresholds=None, actual=hexKey, predict='Predict.hex',
resultAUC = h2o.nodes[0].generate_auc(thresholds=None, actual=hexKey, predict='Predict.hex',
vactual=y, vpredict=1)
print "AUC result:", h2o.dump_json(resultAUC)

h2o.nodes[0].log_view()
namelist = h2o.nodes[0].log_download()
Expand Down

0 comments on commit 38c044d

Please sign in to comment.