did we lose lambda max in glm2 submodels?

narenreddy · Mar 6, 2014 · 38c044d · 38c044d
1 parent 8e6220e
commit 38c044d
Show file tree

Hide file tree

Showing 5 changed files with 128 additions and 97 deletions.
diff --git a/py/h2o_glm.py b/py/h2o_glm.py
@@ -153,26 +153,27 @@ def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False
     if h2o.beta_features:
         # number of submodels = number of lambda
         # min of 2. lambdaMax is first
-        if len(GLMModel['submodels']) < 2:
-            raise Exception("Always should have a minimum of 2 submodels in GLM2 response", len(submodels))
-
+        submodels = GLMModel['submodels']
         lambdas = GLMModel['lambdas']
-        if len(lambdas) < 2:
-            raise Exception("Always should have a minimum of 2 lambdas in GLM2 response", len(submodels))
-
-        submodels0 = GLMModel['submodels'][0]
-        submodels1 = GLMModel['submodels'][1]
-        lambdaMax = lambdas[0]
-        if lambdaMax <= lambdas[1]:
-            raise Exception("lambdaMax %s should always be < the lambda result %s we're checking" % (lambdaMax, lambdas[1]))
-
         # since all our tests?? only use one lambda, the best_lamda_idx should = 1
         best_lambda_idx = GLMModel['best_lambda_idx']
-        if best_lambda_idx != 1:
-            raise Exception("best_lamda_idx %s should point to the one lamda we specified? %s" % (best_lamda_idx, lamdas[1]))
-
+        lambdaMax = lambdas[0]
         print "lambdaMax:", lambdaMax
+
+        if 1==0:
+            if len(submodels) < 2:
+                raise Exception("Always should have a minimum of 2 submodels in GLM2 response", len(submodels))
+            if len(lambdas) < 2:
+                raise Exception("Always should have a minimum of 2 lambdas in GLM2 response", len(submodels))
+            if best_lambda_idx != 1:
+                raise Exception("best_lamda_idx %s should point to the one lamda we specified? %s" % (best_lamda_idx, lamdas[1]))
+            if lambdaMax <= lambdas[-1]:
+                raise Exception("lambdaMax %s should always be < the lambda result %s we're checking" % (lambdaMax, lambdas[1]))
+
+        submodels0 = submodels[0]
+        submodels1 = submodels[-1] # hackery to make it work when there's just one
         iterations = submodels1['iteration']
+
     else:
         iterations = GLMModel['iterations']
 

diff --git a/py/h2o_summ.py b/py/h2o_summ.py
@@ -1,7 +1,4 @@
-
-
-
-# Courtesy of Wai Yip Tung. a pure python percentile function
+# Similar to Wai Yip Tung's recipe: a pure-Python percentile function
 # so we don't have to use the one(s) from numpy or scipy
 # and require those package installs
 ## {{{ http://code.activestate.com/recipes/511478/ (r1)
@@ -10,6 +7,9 @@
 import functools
 
 def percentileOnSortedList(N, percent, key=lambda x:x):
+    # 5 ways of resolving a fractional index k (selected by INTERPOLATE):
+    # floor, ceil, funky, linear, mean
+    INTERPOLATE = 'mean'
     """
     Find the percentile of a list of values.
 
@@ -22,13 +22,50 @@ def percentileOnSortedList(N, percent, key=lambda x:x):
     if N is None:
         return None
     k = (len(N)-1) * percent
-    f = math.floor(k)
-    c = math.ceil(k)
+    f = int(math.floor(k))
+    c = int(math.ceil(k))
     if f == c:
-        return key(N[int(k)])
-    d0 = key(N[int(f)]) * (c-k)
-    d1 = key(N[int(c)]) * (k-f)
-    return d0+d1
+        d = key(N[k])
+        msg = "aligned:" 
+
+    elif INTERPOLATE=='floor':
+        d = key(N[f])
+        msg = "fractional with floor:" 
+
+    elif INTERPOLATE=='ceil':
+        d = key(N[c])
+        msg = "fractional with ceil:" 
+
+    elif INTERPOLATE=='funky':
+        d0 = key(N[f]) * (c-k)
+        d1 = key(N[c]) * (k-f)
+        d = d0+d1
+        msg = "fractional with Tung(floor and ceil) :" 
+
+    elif INTERPOLATE=='linear':
+        pctDiff = (k-f)/(c-f+0.0)
+        dDiff = pctDiff * (key(N[c]) - key(N[f]))
+        d = key(N[c] + dDiff)
+        msg = "fractional with linear(floor and ceil):" 
+
+    elif INTERPOLATE=='mean':
+        d = (key(N[c]) + key(N[f])) / 2.0
+        msg = "fractional with mean(floor and ceil):" 
+
+    # print 3 around the floored k, for eyeballing when we're close
+    flooredK = int(f)
+    # print the 3 around the median
+    if flooredK > 0:
+        print "prior->", key(N[flooredK-1]), " "
+    else:
+        print "prior->", "<bof>"
+    print "floor->", key(N[flooredK]), " ", msg, d
+    if flooredK+1 < len(N):
+        print " ceil->", key(N[flooredK+1])
+    else:
+        print " ceil-> <eof>"
+
+    return d
 
 # median is 50th percentile.
 def medianOnSortedList(N, key=lambda x:x):

diff --git a/py/h2o_util.py b/py/h2o_util.py
@@ -3,8 +3,7 @@
 import os, zipfile, simplejson as json
 import h2o
 
-
-# a short quick version for relative comparion
+# a short quick version for relative comparison. But it's probably better to use approx_equal below
 # the subsequent ones might be prefered, especially assertAlmostEqual(
 # http://en.wikipedia.org/wiki/Relative_difference
 # http://stackoverflow.com/questions/4028889/floating-point-equality-in-python
@@ -74,12 +73,14 @@ def approx_equal(x, y, *args, **kwargs):
                 if result is NotImplemented:
                     continue
                 return bool(result)
+
     # If we get here without returning, then neither x nor y knows how to do an
     # approximate equal comparison (or are both floats). Fall back to a numeric
     # comparison.
     return _float_approx_equal(x, y, *args, **kwargs)
 
 # note this can take 'tol' and 'rel' parms for the float case
+# just wraps approx_equal in an assert with a good print message
 def assertApproxEqual(x, y, msg='', **kwargs):
     if not approx_equal(x, y, msg=msg, **kwargs):
         m = msg + '. h2o_util.assertApproxEqual failed comparing %s and %s. %s.' % (x, y, kwargs)
@@ -103,9 +104,9 @@ def cleanseInfNan(value):
 # arbitrary Python floating point numbers.
 
 # The weights need to cover the whole list? otherwise you don't get the rest of the choises
-# random_data = [6,7,8]
-# weights = [2,3,5]
-# d = h2o_util.random_data[weighted_choice(weights)]
+#     random_data = [6,7,8]
+#     weights = [2,3,5]
+#     d = random_data[h2o_util.weighted_choice(weights)]
 def weighted_choice(weights):
     rnd = random.random() * sum(weights)
     for i, w in enumerate(weights):

diff --git a/py/testdir_single_jvm/b.py b/py/testdir_single_jvm/b.py
@@ -5,9 +5,8 @@
 import scipy as sp
 import math
 OTHER_T = 0.50
-BIN_COUNT = 2
+BIN_COUNT = 100000
 
-# recursive binning to a single bin with one number
 # might have multiple rows with that one number
 # possible answers:
 #   that number if single row  in that bin, 
@@ -48,6 +47,9 @@ def findQuantile(d, dmin, dmax, threshold):
 
     # totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere)
     totalRows = len(d)
+    # Used to have 
+    desiredBinCnt = BIN_COUNT
+    maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues
 
     # initialize
     newValStart = dmin
@@ -59,9 +61,6 @@ def findQuantile(d, dmin, dmax, threshold):
     # yes there is no newHighCount. Created during the pass, though.
 
     # state shared by each pass
-    # Used to have 
-    desiredBinCnt = BIN_COUNT
-    maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues
     assert maxBinCnt > 0
 
     hcnt = [None for b in range(maxBinCnt)]
@@ -102,12 +101,9 @@ def htot2():
 
         # playing with creating relative NUDGE values to make sure bin range
         # is always inclusive of target.
-        NUDGE = 1e-3
-        NUDGE = (1000 * (valEnd - valStart)) / 1000000
         # ratio it down from binSize. 
         # It doesn't need to be as big as binSize.
         # implicitly, it shouldn't need to be as large as binSize
-        NUDGE = binSize / desiredBinCnt
         NUDGE = 0
 
         # init to zero for each pass
@@ -136,7 +132,7 @@ def htot2():
             else:
                 # where are we zeroing in? (start)
                 # print valOffset, binSize
-                hcntIdx = int(round((valOffset * 1000000.0) / binSize) / 1000000.0)
+                hcntIdx = int(math.floor((valOffset * 1000000.0) / binSize) / 1000000.0)
                 assert hcntIdx >=0 and hcntIdx<=maxBinCnt, "val %s %s %s %s hcntIdx: %s maxBinCnt: %s binSize: %s" % \
                     (val, valStart, valEnd, valOffset, hcntIdx, maxBinCnt, binSize)
 
@@ -154,81 +150,74 @@ def htot2():
         assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal" % (totalRows, totalBinnedRows) 
 
         # now walk thru and find out what bin to look inside
-        k = 0
-        currentCnt = newLowCount
+        currentCnt = hcnt_low
         targetCntFull = (threshold * totalRows) + 0.5
         targetCntInt = int(math.floor(targetCntFull))
         targetCntFract = targetCntFull - targetCntInt
 
         print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract
 
+        k = 0
         while((currentCnt + hcnt[k]) < targetCntInt): 
             currentCnt += hcnt[k]
-            k = k+1
-            assert k <= maxBinCnt, "k too large, k: %s maxBinCnt %s" % (k, maxBinCnt)
+            k += 1
+            assert k<=maxBinCnt, "k too large, k: %s maxBinCnt %s" % (k, maxBinCnt)
 
         if hcnt[k]==1: 
             assert hcnt_min[k]==hcnt_max[k]
 
         # some possibily interpolating guesses first, in guess we have to iterate (best guess)
         done = False
+        guess = (hcnt_max[k] - hcnt_min[k]) / 2
+
+        if currentCnt==targetCntInt:
+            if hcnt[k]>2:
+                guess = hcnt_min[k]
+                done = True
+                print "Guess A", guess
 
-        # We should end with a count of 1, otherwise it's still a best guess
-        # could be approximately equal
-        # THERE CAN BE MULTIPLE VALUES AT THE TARGET VALUE
-        # check for min = max in that bin!
-        # In the right bit with only one value,
-        if not done and hcnt_min[k]==hcnt_max[k] and currentCnt==targetCntInt: 
-            # no mattter what size the fraction it would be on this number
-            if hcnt[k]>=2 or targetCntFract==0:
+            if hcnt[k]==2:
+                # no matter what size the fraction it would be on this number
+                guess = (hcnt_max[k] + hcnt_min[k]) / 2.0
                 done = True
+                print "Guess B", guess
+
+            if hcnt[k]==1 and targetCntFract==0:
+                assert hcnt_min[k]==hcnt_max[k]
                 guess = hcnt_min[k]
-                print 'Done:', 'hcnt_min[k]', 'hcnt_max[k]', 'currentCnt', 'targetCntInt', 'targetCntFract'
-                print 'Done:', hcnt_min[k], hcnt_max[k], currentCnt, targetCntInt, targetCntFract
-            else:
-                # may have to extrapoate if hcnt[k]==1
-                # guess with mean of bin
-                guess = (hcnt_max[k] - hcnt_min[k])/2
-
-        # do we have to compute the mean, using the current k bin and hcnt_high_min?
-        # if min and max for a bin are different the count must be >1
-        # need to get to 1 entry to 
-        # if there's a fractional part, and we're not done, it's in the next k
-        # okay...potentially multiple of same value in the bin
-        if not done and hcnt[k]>1 and hcnt_min[k]==hcnt_max[k] and targetCntFract!=0:
-            print "\nInterpolating result into single value of this bin"
-            print 'Guess E:', 'guess', 'k', 'hcnt[k]', 'hcnt_min[k]', 'hcnt_max[k]', 'currentCnt', 'targetCntInt'
-            print "Guess E:", guess, k, hcnt[k], hcnt_min[k], hcnt_max[k], currentCnt, targetCntInt
-            guess = hcnt_min[k]
-            done = True
-
-        if not done and hcnt[k]==1 and hcnt_min[k]==hcnt_max[k] and targetCntFract!=0:
-            print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero"
-            # we really should interpolate with the next non-zero bin. But we'd have to search for it.
-            # just do another iteration. This should only happen on the first pass.
-            # I guess we could search to make single pass iterations work
-
-            if k<maxBinCnt:
-                nextK = k + 1 # could put it over maxBinCnt
-            else:
-                nextK = k
-            while nextK<maxBinCnt and hcnt[nextK]==0:
-                nextK += 1
-            # have the "extra bin" for this
-            if nextK >= maxBinCnt and hcnt_high_cnt!=0:
-                nextVal = hcnt_high_min
-            else:
-                assert hcnt[nextK]!=0
-                nextVal = hcnt_min[nextK]
+                done = True
+                print "k", k
+                print "Guess C", guess
+
+            if hcnt[k]==1 and targetCntFract!=0:
+                assert hcnt_min[k]==hcnt_max[k]
+                print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero"
+                if k<maxBinCnt:
+                    nextK = k + 1 # could put it over maxBinCnt
+                else:
+                    nextK = k
+                while nextK<maxBinCnt and hcnt[nextK]==0:
+                    nextK += 1
+
+                # have the "extra bin" for this
+                if nextK >= maxBinCnt:
+                    assert hcnt_high!=0
+                    print "hello1:", hcnt_high_min
+                    nextVal = hcnt_high_min
+                else:
+                    print "hello2:", nextK
+                    assert hcnt[nextK]!=0
+                    nextVal = hcnt_min[nextK]
 
             guess = (hcnt_max[k] + nextVal) / 2.0
             done = True # has to be one above us when needed. (or we're at end)
+            print 'k', 'hcnt_max[k]', 'nextVal'
+            print "hello3:", k, hcnt_max[k], nextVal
             print "\nInterpolating result using nextK: %s nextVal: %s" % (nextK, nextVal)
-            print "Guess G with nextK:", guess
-
+            print "Guess D", guess
 
         if not done:
-            newValStart = hcnt_min[k] - NUDGE# FIX! should we nudge a little?
+            newValStart = hcnt_min[k] - NUDGE # FIX! should we nudge a little?
             newValEnd   = hcnt_max[k] + NUDGE # FIX! should we nudge a little?
             newValRange = newValEnd - newValStart 
 
@@ -239,8 +228,10 @@ def htot2():
             print "Saying done because newBinSize is 0."
             print "newValRange: %s, hcnt[k]: %s hcnt_min[k]: %s hcnt_max[k]: %s" %\
                  (newValRange, hcnt[k], hcnt_min[k], hcnt_max[k])
-            guess = newValStart
-            done = newBinSize==0
+
+            if newBinSize==0:
+                guess = newValStart
+                done = True
 
             # if we have to interpolate
             # if it falls into this bin, interpolate to this bin means one answer?
@@ -278,9 +269,9 @@ def twoDecimals(l):
         return "%.2f" % l
 
 # csvPathname = './syn_binary_1000000x1.csv'
-csvPathname = './d.csv'
-csvPathname = './syn_binary_1000000x1.csv'
 csvPathname = './syn_binary_100x1.csv'
+csvPathname = './d.csv'
+csvPathname = './syn_binary_100000x1.csv'
 col = 0
 
 print "Reading csvPathname"
@@ -323,7 +314,7 @@ def twoDecimals(l):
 a1 = stats.scoreatpercentile(target, per=100*OTHER_T, interpolation_method='fraction')
 h2p.red_print("stats.scoreatpercentile:", a1)
 a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T])
-h2p.red_print("scipy stats.mstats.mquantiles:", ["%.2f" % v for v in a2])
+h2p.red_print("scipy stats.mstats.mquantiles:", a2)
 
 # looking at the sorted list here
 targetFP.sort()

diff --git a/py/testdir_single_jvm/test_GLM2_basic_predict.py b/py/testdir_single_jvm/test_GLM2_basic_predict.py
@@ -87,8 +87,9 @@ def test_A_GLM2_basic_predict_prostate(self):
             h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=hexKey, prediction='Predict.hex')
 
             # just get a predict and AUC on the same data. has to be binomial result
-            h2o.nodes[0].generate_auc(thresholds=None, actual=hexKey, predict='Predict.hex', 
+            resultAUC = h2o.nodes[0].generate_auc(thresholds=None, actual=hexKey, predict='Predict.hex', 
                 vactual=y, vpredict=1)
+            print "AUC result:", h2o.dump_json(resultAUC)
 
         h2o.nodes[0].log_view()
         namelist = h2o.nodes[0].log_download()