Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
anqi committed Dec 5, 2013
2 parents 4c74d17 + cd8be92 commit b0d4b40
Show file tree
Hide file tree
Showing 8 changed files with 186 additions and 99 deletions.
1 change: 1 addition & 0 deletions R/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ifeq ($(PDFLATEX),)
else
R CMD Rd2pdf --force --output="h2o-package/h2o_package.pdf" --title="Package 'h2o'" --no-index --no-preview h2o-package/man 1> /dev/null
endif
mkdir -p h2o-package/inst/java/
cp -f ../target/h2o.jar h2o-package/inst/java/h2o.jar
R CMD build h2o-package
mkdir -p ../target/R
Expand Down
13 changes: 8 additions & 5 deletions py/h2o_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,17 +473,20 @@ def sleep_with_dot(sec, message=None):
dot()
count += 1

def createTestTrain(srcKey, trainDstKey, testDstKey, trainPercent, outputClass, outputCol, changeToBinomial=False):
def createTestTrain(srcKey, trainDstKey, testDstKey, trainPercent,
outputClass=None, outputCol=None, changeToBinomial=False):
# will have to live with random extract. will create variance

print "train: get random", trainPercent
print "test: get remaining", 100 - trainPercent
if changeToBinomial:
print "change class", outputClass, "to 1, everything else to 0. factor() to turn real to int (for rf)"

boundary = (trainPercent + 0.0)/100

execExpr = ""
execExpr += "cct.hex=runif(%s);" % srcKey
execExpr += "%s=%s[cct.hex%s,];" % (trainDstKey, srcKey, '<=0.9')
execExpr += "%s=%s[cct.hex<=%s,];" % (trainDstKey, srcKey, boundary)
if changeToBinomial:
execExpr += "%s[,%s]=%s[,%s]==%s;" % (trainDstKey, outputCol+1, trainDstKey, outputCol+1, outputClass)
execExpr += "factor(%s[, %s]);" % (trainDstKey, outputCol+1)
Expand All @@ -493,10 +496,10 @@ def createTestTrain(srcKey, trainDstKey, testDstKey, trainPercent, outputClass,
inspect = runInspect(key=trainDstKey)
infoFromInspect(inspect, "%s after mungeDataset on %s" % (trainDstKey, srcKey) )

print "test: same, but use the same runif() random result, complement"
print "test: same, but use the same runif() random result, complement comparison"

execExpr = "cct.hex=runif(%s);" % srcKey
execExpr += "%s=%s[cct.hex%s,];" % (testDstKey, srcKey, '>0.9')
execExpr = ""
execExpr += "%s=%s[cct.hex>%s,];" % (testDstKey, srcKey, boundary)
if changeToBinomial:
execExpr += "%s[,%s]=%s[,%s]==%s;" % (testDstKey, outputCol+1, testDstKey, outputCol+1, outputClass)
execExpr += "factor(%s[, %s])" % (testDstKey, outputCol+1)
Expand Down
157 changes: 105 additions & 52 deletions py/h2o_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,13 @@ def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False
# h2o GLM will verboseprint the result and print errors.
# so don't have to do that
# different when cross validation is used? No trainingErrorDetails?
GLMModel = glm['GLMModel']
if h2o.beta_features:
GLMModel = glm['glm_model']
else:
GLMModel = glm['GLMModel']

warnings = None
if 'warnings' in GLMModel:
if 'warnings' in GLMModel and GLMModel['warnings']:
warnings = GLMModel['warnings']
# stop on failed
x = re.compile("failed", re.IGNORECASE)
Expand All @@ -128,82 +132,127 @@ def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False
# not in GLMGrid?

# FIX! don't get GLMParams if it can't solve?
GLMParams = GLMModel["GLMParams"]
if h2o.beta_features:
GLMParams = GLMModel['glm']
else:
GLMParams = GLMModel["GLMParams"]
family = GLMParams["family"]

iterations = GLMModel['iterations']
if h2o.beta_features:
submodels0 = GLMModel['submodels'][0]
iterations = submodels0['iteration']
else:
iterations = GLMModel['iterations']

print "GLMModel/iterations:", iterations

# if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
if maxExpectedIterations is not None and iterations > maxExpectedIterations:
raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations) )

# pop the first validation from the list
if 'validations' not in GLMModel:
raise Exception("Should be a 'validations' key in GLMModel: %s" % h2o.dump_json(GLMModel))

validationsList = GLMModel['validations']
# don't want to modify validationsList in case someone else looks at it
validations = validationsList[0]
if h2o.beta_features:
if 'validation' not in submodels0:
raise Exception("Should be a 'validations' key in submodels0: %s" % h2o.dump_json(submodels0))
validationsList = submodels0['validation']
validations = validationsList

else:
# pop the first validation from the list
if 'validations' not in GLMModel:
raise Exception("Should be a 'validations' key in GLMModel: %s" % h2o.dump_json(GLMModel))
validationsList = GLMModel['validations']
# don't want to modify validationsList in case someone else looks at it
validations = validationsList[0]

# xval. compare what we asked for and what we got.
n_folds = kwargs.setdefault('n_folds', None)
if not 'xval_models' in validations:
if n_folds > 1:
raise Exception("No cross validation models returned. Asked for "+n_folds)
else:
xval_models = validations['xval_models']
if n_folds and n_folds > 1:
if len(xval_models) != n_folds:
raise Exception(len(xval_models)+" cross validation models returned. Asked for "+n_folds)

# not checked in v2?
if not h2o.beta_features:
if not 'xval_models' in validations:
if n_folds > 1:
raise Exception("No cross validation models returned. Asked for "+n_folds)
else:
# should be default 10?
if len(xval_models) != 10:
raise Exception(str(len(xval_models))+" cross validation models returned. Default should be 10")

print "GLMModel/validations"
validations['err'] = h2o_util.cleanseInfNan(validations['err'])
validations['nullDev'] = h2o_util.cleanseInfNan(validations['nullDev'])
validations['resDev'] = h2o_util.cleanseInfNan(validations['resDev'])
print "%15s %s" % ("err:\t", validations['err'])
print "%15s %s" % ("nullDev:\t", validations['nullDev'])
print "%15s %s" % ("resDev:\t", validations['resDev'])
xval_models = validations['xval_models']
if n_folds and n_folds > 1:
if len(xval_models) != n_folds:
raise Exception(len(xval_models)+" cross validation models returned. Asked for "+n_folds)
else:
# should be default 10?
if len(xval_models) != 10:
raise Exception(str(len(xval_models))+" cross validation models returned. Default should be 10")

if h2o.beta_features:
print "GLMModel/validations"
validations['avg_err'] = h2o_util.cleanseInfNan(validations['avg_err'])
validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance'])
validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance'])
print "%15s %s" % ("avg_err:\t", validations['avg_err'])
print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance'])

else:
print "GLMModel/validations"
validations['err'] = h2o_util.cleanseInfNan(validations['err'])
validations['nullDev'] = h2o_util.cleanseInfNan(validations['nullDev'])
validations['resDev'] = h2o_util.cleanseInfNan(validations['resDev'])
print "%15s %s" % ("err:\t", validations['err'])
print "%15s %s" % ("nullDev:\t", validations['nullDev'])
print "%15s %s" % ("resDev:\t", validations['resDev'])

# threshold only there if binomial?
# auc only for binomial
if family=="binomial":
print "%15s %s" % ("auc:\t", validations['auc'])
print "%15s %s" % ("threshold:\t", validations['threshold'])
if h2o.beta_features:
print "%15s %s" % ("best_threshold:\t", validations['best_threshold'])
else:
print "%15s %s" % ("threshold:\t", validations['threshold'])

if family=="poisson" or family=="gaussian":
print "%15s %s" % ("aic:\t", validations['aic'])

if math.isnan(validations['err']):
emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", validations['err'])
raise Exception(emsg)
if not h2o.beta_features:
if math.isnan(validations['err']):
emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", validations['err'])
raise Exception(emsg)

if math.isnan(validations['resDev']):
emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validations['resDev'])
raise Exception(emsg)
if math.isnan(validations['resDev']):
emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validations['resDev'])
raise Exception(emsg)

# legal?
if math.isnan(validations['nullDev']):
## emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", validations['nullDev'])
## raise Exception(emsg)
pass
# legal?
if math.isnan(validations['nullDev']):
pass

# get a copy, so we don't destroy the original when we pop the intercept
if doNormalized:
coefficients = GLMModel['normalized_coefficients'].copy()
if h2o.beta_features:
if doNormalized:
coefficients = submodels0['normalized_beta'].copy()
else:
print "beta:", submodels0['beta']
coefficients = list(submodels0['beta']) # copy

else:
coefficients = GLMModel['coefficients'].copy()
if doNormalized:
coefficients = GLMModel['normalized_coefficients'].copy()
else:
coefficients = GLMModel['coefficients'].copy()

if h2o.beta_features:
column_names = GLMModel['_names']
print "column_names:", column_names
else:
column_names = GLMModel['column_names']
# get the intercept out of there into its own dictionary
intercept = coefficients.pop('Intercept', None)

column_names = GLMModel['column_names']
# get the intercept out of there into its own dictionary
intercept = coefficients.pop('Intercept', None)
# have to skip the output col! get it from kwargs
# better always be there!
y = kwargs['y']
if h2o.beta_features:
y = kwargs['response']
else:
y = kwargs['y']


# the dict keys are column headers if they exist...how to order those? new: use the 'column_names'
Expand Down Expand Up @@ -291,9 +340,13 @@ def add_to_coefficient_list_and_string(c,cList,cString):
"sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26"
))

print "GLMModel model time (milliseconds):", GLMModel['model_time']
print "GLMModel validation time (milliseconds):", validations['val_time']
print "GLMModel lsm time (milliseconds):", GLMModel['lsm_time']
if h2o.beta_features:
print "submodels0, run_time (milliseconds):", submodels0['run_time']
else:

print "GLMModel model time (milliseconds):", GLMModel['model_time']
print "GLMModel validation time (milliseconds):", validations['val_time']
print "GLMModel lsm time (milliseconds):", GLMModel['lsm_time']

# shouldn't have any errors
h2o.check_sandbox_for_errors()
Expand Down
2 changes: 1 addition & 1 deletion py/testdir_multi_jvm/test_rf_covtype_train_oobe_fvec.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
# just do random split for now
dataKeyTrain = 'rTrain.hex'
dataKeyTest = 'rTest.hex'
h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest, trainPercent=0.90, outputClass=4,
h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest, trainPercent=90, outputClass=4,
outputCol=numCols-1, changeToBinomial=not DO_MULTINOMIAL)
sliceResult = {'destination_key': dataKeyTrain}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ def setUpClass(cls):
def tearDownClass(cls):
h2o.tear_down_cloud()

def test_GLM_covtype_train(self):
def test_GLM2_covtype_train(self):
h2o.beta_features = True
importFolderPath = "standard"
csvFilename = 'covtype.shuffled.data'
csvPathname = importFolderPath + "/" + csvFilename
Expand All @@ -32,70 +33,64 @@ def test_GLM_covtype_train(self):

inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
print "\n" + csvPathname, \
" num_rows:", "{:,}".format(inspect['num_rows']), \
" num_cols:", "{:,}".format(inspect['num_cols'])
" numRows:", "{:,}".format(inspect['numRows']), \
" numCols:", "{:,}".format(inspect['numCols'])

# how many rows for each pct?
num_rows = inspect['num_rows']
pct10 = int(num_rows * .1)
numRows = inspect['numRows']
pct10 = int(numRows * .1)
rowsForPct = [i * pct10 for i in range(0,11)]
# this can be slightly less than 10%
last10 = num_rows - rowsForPct[9]
last10 = numRows - rowsForPct[9]
rowsForPct[10] = last10
# use mod below for picking "rows-to-do" in case we do more than 9 trials
# use 10 if 0 just to see (we copied 10 to 0 above)
rowsForPct[0] = rowsForPct[10]

print "Creating the key of the last 10% data, for scoring"
dataKeyTest = "rTest"
trainDataKey = "rTrain"
testDataKey = "rTest"
# start at 90% rows + 1

execExpr = dataKeyTest + " = slice(" + hex_key + "," + str(rowsForPct[9]+1) + ")"
h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)
h2o_cmd.createTestTrain(srcKey=hex_key, trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90)
# will have to live with random extract. will create variance

kwargs = {
'y': 54,
'response': 'C54',
'max_iter': 20,
'n_folds': 0,
'thresholds': 0.5,
'alpha': 0.1,
'lambda': 1e-5,
'family': 'binomial',
'case_mode': '=',
'case': 2
'case_val': 4,
}
timeoutSecs = 60

for trial in range(10):
# always slice from the beginning
rowsToUse = rowsForPct[trial%10]
resultKey = "r" + str(trial)
execExpr = resultKey + " = slice(" + hex_key + ",1," + str(rowsToUse) + ")"
h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
parseResult['destination_key'] = resultKey
# adjust timeoutSecs with the number of trees
# seems ec2 can be really slow

h2o_cmd.createTestTrain(srcKey=hex_key, trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90)
parseResult['destination_key'] = trainDataKey

start = time.time()
glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

GLMModel = glm['GLMModel']
modelKey = GLMModel['model_key']
modelKey = glm['glm_model']['_selfKey']

start = time.time()
glmScore = h2o_cmd.runGLMScore(key=dataKeyTest, model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
print "glmScore end on ", dataKeyTest, 'took', time.time() - start, 'seconds'
### print h2o.dump_json(glmScore)
glmScore = h2o_cmd.runGLMScore(key=testDataKey, model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
print "glmScore end on ", testDataKey, 'took', time.time() - start, 'seconds'
classErr = glmScore['validation']['classErr']
auc = glmScore['validation']['auc']
err = glmScore['validation']['err']
print "classErr:", classErr
print "err:", err
print "auc:", auc

print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"
print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"

if __name__ == '__main__':
h2o.unit_main()
Loading

0 comments on commit b0d4b40

Please sign in to comment.