Merge branch 'master' of https://github.com/0xdata/h2o

eric-code · May 8, 2014 · d9b6037 · d9b6037
2 parents dec284b + c80a65c
commit d9b6037
Show file tree

Hide file tree

Showing 9 changed files with 1,598 additions and 1,432 deletions.
diff --git a/R/h2o-package/R/Classes.R b/R/h2o-package/R/Classes.R
diff --git a/py/testdir_single_jvm_fvec/test_export_import.py b/py/testdir_single_jvm_fvec/test_export_import.py
@@ -0,0 +1,148 @@
+import unittest, random, sys, time
+sys.path.extend(['.','..','py'])
+import h2o, h2o_cmd, h2o_rf as h2o_rf, h2o_hosts, h2o_import as h2i, h2o_exec, h2o_jobs, h2o_gbm
+
+paramDict = {  # base keyword arguments handed to h2o_cmd.runRF by the test below
+    'response': 'C55',
+    'cols': None,
+    # 'ignored_cols_by_name': 'C1,C2,C6,C7,C8',
+    'ignored_cols_by_name': None,
+    'classification': 1, 
+    'validation': None,
+    # fail case (kept for reference; not used by the current sweep)
+    # 'ntrees': 1,
+    # 'max_depth': 30,
+    # 'nbins': 100,
+    'ntrees': 10,
+    'max_depth': 20,
+
+    'min_rows': 1, # normally 1 for classification, 5 for regression
+    'nbins': 200,
+    'mtries': None,
+    'sample_rate': 0.66,
+    'importance': 0,
+    'seed': None,
+    }
+
+DO_OOBE = False  # True leaves 'validation' as None instead of using the parsed test frame
+# TRY = 'max_depth'
+# TRY = 'ntrees'
+TRY = 'nbins'  # name of the paramDict entry the test sweeps over
+
+class Basic(unittest.TestCase):
+    def tearDown(self):
+        h2o.check_sandbox_for_errors()
+
+    @classmethod
+    def setUpClass(cls):
+        global localhost
+        localhost = h2o.decide_if_localhost()
+        if (localhost):
+            h2o.build_cloud(1, java_heap_GB=4)
+        else:
+            h2o_hosts.build_cloud_with_hosts()
+
+
+    @classmethod
+    def tearDownClass(cls):
+        h2o.tear_down_cloud()
+
+    def test_export_import(self):
+        SYNDATASETS_DIR = h2o.make_syn_dir()
+        h2o.beta_features = True # fvec
+        importFolderPath = "standard"
+
+        # Parse Train ******************************************************
+        csvTrainFilename = 'covtype.shuffled.90pct.data'
+        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
+        trainKey = csvTrainFilename + ".hex"
+        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=trainKey,
+            timeoutSecs=180, doSummary=False)
+        inspect = h2o_cmd.runInspect(None, trainKey)
+
+        # Parse Test ******************************************************
+        csvTestFilename = 'covtype.shuffled.10pct.data'
+        csvTestPathname = importFolderPath + "/" + csvTestFilename
+        testKey = csvTestFilename + ".hex"
+        parseTestResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTestPathname, hex_key=testKey,
+            timeoutSecs=180)
+        inspect = h2o_cmd.runInspect(None, testKey)
+
+
+        trial = 0
+        ntreesList = [5, 10, 20, 30]
+        # ntreesList = [2]
+        nbinsList  = [10, 100, 1000]
+
+        if TRY == 'max_depth':
+            tryList = depthList
+        elif TRY == 'ntrees':
+            tryList = ntreesList
+        elif TRY == 'nbins':
+            tryList = nbinsList
+        else:
+            raise Exception("huh? %s" % TRY)
+
+        for d in tryList:
+            if TRY == 'max_depth':
+                paramDict['max_depth'] = d
+            elif TRY == 'ntrees':
+                paramDict['ntrees'] = d
+            elif TRY == 'nbins':
+                paramDict['nbins'] = d
+            else:
+                raise Exception("huh? %s" % TRY)
+
+            # adjust timeoutSecs with the number of trees
+            # seems ec2 can be really slow
+            if DO_OOBE:
+                paramDict['validation'] = None
+            else:
+                paramDict['validation'] = parseTestResult['destination_key']
+
+            timeoutSecs = 30 + paramDict['ntrees'] * 200
+
+
+            # do ten starts, to see the bad id problem?
+            trial += 1
+            kwargs = paramDict.copy()
+            modelKey = 'RFModel_' + str(trial)
+            kwargs['destination_key'] = modelKey
+
+            start = time.time()
+            rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, **kwargs)
+            trainElapsed = time.time() - start
+            print 'rf train end on', csvTrainPathname, 'took', trainElapsed, 'seconds'
+
+            h2o.nodes[0].export_files(src_key=testKey, path=SYNDATASETS_DIR + "/" + testKey, force=1)
+            h2o.nodes[0].export_files(src_key=trainKey, path=SYNDATASETS_DIR + "/" + trainKey, force=1)
+            # h2o.nodes[0].export_files(src_key=modelKey, path=SYNDATASETS_DIR + "/" + modelKey, force=1)
+
+
+            rf_model = rfResult['drf_model']
+            cms = rf_model['cms']
+            ### print "cm:", h2o.dump_json(cm)
+            ntrees = rf_model['N']
+            errs = rf_model['errs']
+            N = rf_model['N']
+            varimp = rf_model['varimp']
+            treeStats = rf_model['treeStats']
+
+            print "maxDepth:", treeStats['maxDepth']
+            print "maxLeaves:", treeStats['maxLeaves']
+            print "minDepth:", treeStats['minDepth']
+            print "minLeaves:", treeStats['minLeaves']
+            print "meanLeaves:", treeStats['meanLeaves']
+            print "meanDepth:", treeStats['meanDepth']
+            print "errs[0]:", errs[0]
+            print "errs[-1]:", errs[-1]
+            print "errs:", errs
+
+            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfResult)
+            print "classErrorPctList:", classErrorPctList
+            self.assertEqual(len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
+            # FIX! should update this expected classification error
+            predict = h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey)
+
+if __name__ == '__main__':
+    h2o.unit_main()
diff --git a/src/main/java/hex/deeplearning/DeepLearning.java b/src/main/java/hex/deeplearning/DeepLearning.java
@@ -34,15 +34,15 @@ public class DeepLearning extends Job.ValidatedJob {
    * model. This option allows users to build a new model as a
    * continuation of a previously generated model (e.g., by a grid search).
    */
-  @API(help = "Model checkpoint to resume training with", filter= Default.class, json = true, gridable = false)
+  @API(help = "Model checkpoint to resume training with", filter= Default.class, json = true)
   public Key checkpoint;
 
   /**
    * If given, store the best model so far under this key.
    * Model performance is measured by MSE for regression and overall
    * error rate for classification (at F1-optimal threshold for binary classification).
    */
-  @API(help = "Key to store the always-best model under", filter= Default.class, json = true, gridable = false)
+  @API(help = "Key to store the always-best model under", filter= Default.class, json = true)
   public Key best_model_key = null;
 
   /**
@@ -51,7 +51,7 @@ public class DeepLearning extends Job.ValidatedJob {
    * values is fine for many problems, but best results on complex datasets are often
    * only attainable via expert mode options.
    */
-  @API(help = "Enable expert mode (to access all options from GUI)", filter = Default.class, json = true, gridable = false)
+  @API(help = "Enable expert mode (to access all options from GUI)", filter = Default.class, json = true)
   public boolean expert_mode = false;
 
   /*Neural Net Topology*/
@@ -343,64 +343,64 @@ public class DeepLearning extends Job.ValidatedJob {
    * training data scoring dataset. When the error is at or below this threshold,
    * training stops.
    */
-  @API(help = "Stopping criterion for classification error fraction on training data (-1 to disable)", filter = Default.class, dmin=-1, dmax=1, json = true, gridable = false, importance = ParamImportance.EXPERT)
+  @API(help = "Stopping criterion for classification error fraction on training data (-1 to disable)", filter = Default.class, dmin=-1, dmax=1, json = true, importance = ParamImportance.EXPERT)
   public double classification_stop = 0;
 
   /**
    * The stopping criteria in terms of regression error (MSE) on the training
    * data scoring dataset. When the error is at or below this threshold, training
    * stops.
    */
-  @API(help = "Stopping criterion for regression error (MSE) on training data (-1 to disable)", filter = Default.class, dmin=-1, json = true, gridable = false, importance = ParamImportance.EXPERT)
+  @API(help = "Stopping criterion for regression error (MSE) on training data (-1 to disable)", filter = Default.class, dmin=-1, json = true, importance = ParamImportance.EXPERT)
   public double regression_stop = 1e-6;
 
   /**
    * Enable quiet mode for less output to standard output.
    */
-  @API(help = "Enable quiet mode for less output to standard output", filter = Default.class, json = true, gridable = false)
+  @API(help = "Enable quiet mode for less output to standard output", filter = Default.class, json = true)
   public boolean quiet_mode = false;
 
   /**
    * For classification models, the maximum size (in terms of classes) of the
    * confusion matrix for it to be printed. This option is meant to avoid printing
    * extremely large confusion matrices.
    */
-  @API(help = "Max. size (number of classes) for confusion matrices to be shown", filter = Default.class, json = true, gridable = false)
+  @API(help = "Max. size (number of classes) for confusion matrices to be shown", filter = Default.class, json = true)
   public int max_confusion_matrix_size = 20;
 
   /**
    * The maximum number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)
    */
-  @API(help = "Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)", filter = Default.class, lmin=0, json = true, gridable = false, importance = ParamImportance.EXPERT)
+  @API(help = "Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)", filter = Default.class, lmin=0, json = true, importance = ParamImportance.EXPERT)
   public int max_hit_ratio_k = 10;
 
   /*Imbalanced Classes*/
   /**
    * For imbalanced data, balance training data class counts via
    * over/under-sampling. This can result in improved predictive accuracy.
    */
-  @API(help = "Balance training data class counts via over/under-sampling (for imbalanced data)", filter = Default.class, json = true, gridable = false, importance = ParamImportance.EXPERT)
+  @API(help = "Balance training data class counts via over/under-sampling (for imbalanced data)", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
   public boolean balance_classes = false;
 
   /**
    * When classes are balanced, limit the resulting dataset size to the
    * specified multiple of the original dataset size.
    */
-  @API(help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0)", filter = Default.class, json = true, dmin=1e-3, gridable = false, importance = ParamImportance.EXPERT)
+  @API(help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0)", filter = Default.class, json = true, dmin=1e-3, importance = ParamImportance.EXPERT)
   public float max_after_balance_size = 5.0f;
 
   /**
    * Method used to sample the validation dataset for scoring, see Score Validation Samples above.
    */
-  @API(help = "Method used to sample validation dataset for scoring", filter = Default.class, json = true, gridable = false, importance = ParamImportance.EXPERT)
+  @API(help = "Method used to sample validation dataset for scoring", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
   public ClassSamplingMethod score_validation_sampling = ClassSamplingMethod.Uniform;
 
   /*Misc*/
   /**
    * Gather diagnostics for hidden layers, such as mean and RMS values of learning
    * rate, momentum, weights and biases.
    */
-  @API(help = "Enable diagnostics for hidden layers", filter = Default.class, json = true, gridable = false)
+  @API(help = "Enable diagnostics for hidden layers", filter = Default.class, json = true)
   public boolean diagnostics = true;
 
   /**

diff --git a/src/main/java/hex/gbm/SharedTreeModelBuilder.java b/src/main/java/hex/gbm/SharedTreeModelBuilder.java
@@ -61,14 +61,14 @@ public abstract class SharedTreeModelBuilder<TM extends DTree.TreeModel> extends
    * For imbalanced data, balance training data class counts via
    * over/under-sampling. This can result in improved predictive accuracy.
    */
-  @API(help = "Balance training data class counts via over/under-sampling (for imbalanced data)", filter = Default.class, json = true, gridable = false, importance = ParamImportance.EXPERT)
+  @API(help = "Balance training data class counts via over/under-sampling (for imbalanced data)", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
   public boolean balance_classes = false;
 
   /**
    * When classes are balanced, limit the resulting dataset size to the
    * specified multiple of the original dataset size.
    */
-  @API(help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0)", filter = Default.class, json = true, dmin=1e-3, gridable = true, importance = ParamImportance.EXPERT)
+  @API(help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0)", filter = Default.class, json = true, dmin=1e-3, importance = ParamImportance.EXPERT)
   public float max_after_balance_size = Float.POSITIVE_INFINITY;
 
 //  @API(help = "Active feature columns")

diff --git a/src/main/java/hex/singlenoderf/SpeeDRF.java b/src/main/java/hex/singlenoderf/SpeeDRF.java
@@ -60,7 +60,7 @@ public class SpeeDRF extends Job.ValidatedJob {
   @API(help = "seed", filter = Default.class, json = true)
   public long seed = -1;
 
-  @API(help = "Build trees in parallel", filter = Default.class, json = true)
+  @API(help = "Build trees in parallel")
   public boolean  parallel  = true;
 
   @API(help = "split limit")

diff --git a/src/main/java/water/AbstractBuildVersion.java b/src/main/java/water/AbstractBuildVersion.java
@@ -7,6 +7,17 @@ abstract public class AbstractBuildVersion {
     abstract public String projectVersion();
     abstract public String compiledOn();
     abstract public String compiledBy();
+
+    /** Fourth dot-separated field of projectVersion(), or "(unknown)" when it cannot be read. */
+    public String buildNumber() {
+      try {
+        String[] versionFields = projectVersion().split("\\.");
+        if (versionFields.length > 3) return versionFields[3];
+      }
+      catch (Exception xe) { /* best effort: fall through to the placeholder below */ }
+      return "(unknown)";
+    }
+
     @Override public String toString() {
     return "H2O v"+projectVersion()+ " ("+branchName()+" - "+lastCommitHash()+")";
     }

diff --git a/src/main/java/water/api/Documentation.java b/src/main/java/water/api/Documentation.java
@@ -1,10 +1,17 @@
 package water.api;
 
+import water.AbstractBuildVersion;
+import water.H2O;
+
 /**
  * Redirect to online documentation page.
  */
 public class Documentation extends HTMLOnlyRequest {
   protected String build(Response response) {
-    return "<meta http-equiv=\"refresh\" content=\"0; url=http://docs.0xdata.com/\">";
+    AbstractBuildVersion abv = H2O.getBuildVersion();
+    String branchName = abv.branchName();
+    String buildNumber = abv.buildNumber();
+    String url = "http://s3.amazonaws.com/h2o-release/h2o/" + branchName + "/" + buildNumber + "/docs-website/index.html";
+    return "<meta http-equiv=\"refresh\" content=\"0; url=" + url + "\">";
   }
 }
diff --git a/src/main/java/water/api/RequestServer.java b/src/main/java/water/api/RequestServer.java
@@ -101,8 +101,9 @@ public enum API_VERSION {
     Request.addToNavbar(registerRequest(new PCA()),         "PCA",                      "Model");
     Request.addToNavbar(registerRequest(new GBM()),         "GBM",                      "Model");
     Request.addToNavbar(registerRequest(new DeepLearning()),"Deep Learning",            "Model");
-    Request.addToNavbar(registerRequest(new DRF()),         "Distributed RF (Beta)",    "Model");
-    Request.addToNavbar(registerRequest(new GLM2()),        "GLM (Beta)",               "Model");
+    Request.addToNavbar(registerRequest(new DRF()),         "Distributed RF",           "Model");
+    Request.addToNavbar(registerRequest(new GLM2()),        "GLM",                      "Model");
+    Request.addToNavbar(registerRequest(new SpeeDRF()),     "SpeeDRF (Beta)",           "Model");
     Request.addToNavbar(registerRequest(new KMeans2()),     "KMeans (Beta)",            "Model");
     Request.addToNavbar(registerRequest(new NaiveBayes()),  "Naive Bayes (Beta)",       "Model");
 
@@ -119,10 +120,11 @@ public enum API_VERSION {
     Request.addToNavbar(registerRequest(new Cloud()),       "Cluster Status",           "Admin");
     Request.addToNavbar(registerRequest(new IOStatus()),    "Cluster I/O",              "Admin");
     Request.addToNavbar(registerRequest(new Timeline()),    "Timeline",                 "Admin");
+    Request.addToNavbar(registerRequest(new JProfile()),    "Profiler",                 "Admin");
     Request.addToNavbar(registerRequest(new JStack()),      "Stack Dump",               "Admin");
-    Request.addToNavbar(registerRequest(new JProfile()),    "Profile Dump",             "Admin");
     Request.addToNavbar(registerRequest(new Debug()),       "Debug Dump",               "Admin");
     Request.addToNavbar(registerRequest(new LogView()),     "Inspect Log",              "Admin");
+    Request.addToNavbar(registerRequest(new UnlockKeys()),  "Unlock Keys",              "Admin");
     Request.addToNavbar(registerRequest(new Shutdown()),    "Shutdown",                 "Admin");
 
     // Help and Tutorials
@@ -141,22 +143,18 @@ public enum API_VERSION {
       registerRequest(new ReBalance());
       registerRequest(new FrameSplitPage());
       registerRequest(new GapStatistic());
-      registerRequest(new SpeeDRF());
-      registerRequest(new UnlockKeys());
     } else {
       Request.addToNavbar(registerRequest(new hex.LR2()),        "Linear Regression2",   "Beta");
       Request.addToNavbar(registerRequest(new ReBalance()),      "ReBalance",            "Beta");
       Request.addToNavbar(registerRequest(new FrameSplitPage()), "Split frame",          "Beta");
       Request.addToNavbar(registerRequest(new Console()),        "Console",              "Beta");
       Request.addToNavbar(registerRequest(new GapStatistic()),   "Gap Statistic",        "Beta");
-      Request.addToNavbar(registerRequest(new SpeeDRF()),        "SpeeDRF",              "Beta");
-      Request.addToNavbar(registerRequest(new UnlockKeys()),     "Unlock Keys",          "Beta");
 //      Request.addToNavbar(registerRequest(new ExportModel()),    "Export Model",         "Beta (FluidVecs!)");
 //      Request.addToNavbar(registerRequest(new ImportModel()),    "Import Model",         "Beta (FluidVecs!)");
     }
 
-    // VA stuff
-    if (false) {
+    // VA stuff is only shown with -beta
+    if(H2O.OPT_ARGS.beta == null) {
       registerRequest(new Inspect());
       registerRequest(new SummaryPage());
       registerRequest(new Parse());

diff --git a/src/main/java/water/api/Tutorials.java b/src/main/java/water/api/Tutorials.java
@@ -27,7 +27,7 @@ public class Tutorials extends HTMLOnlyRequest {
 
     + "<div class='span2 col'>"
     + "  <h2>GBM</h2>"
-    +   "<p>GBM uses gradient boosted trees for regression and classification, and is one of the most powerful machine learning methods.</p>"
+    +   "<p>GBM uses gradient boosted trees for classification and regression, and is one of the most powerful machine learning methods in H<sub>2</sub>O.</p>"
     +   "<a href='/TutorialGBM.html' class='btn btn-primary'>Try it!</a>"
     + "</div>"
 
@@ -39,13 +39,13 @@ public class Tutorials extends HTMLOnlyRequest {
 
     + "<div class='span2 col'>"
     + "<h2>K-Means</h2>"
-    + "<p>Perform cluster analysis with H<sub>2</sub>O. It employs K-means, a highly scalable clustering algorithm for unsupervised learning on big data.</p>"
+    + "<p>Perform cluster analysis with H<sub>2</sub>O. K-means is a scalable clustering algorithm for unsupervised learning on big data.</p>"
     +   "<a href='/TutorialKMeans.html' class='btn btn-primary'>Try it!</a>"
     + "</div>"
 
     + "<div class='span2 col'>"
     + "<h2>Deep Learning</h2>"
-    + "<p>H<sub>2</sub>O's distributed Deep Learning gives you the power of deep neural networks for highest predictive accuracy in classification and regression.</p>"
+    + "<p>H<sub>2</sub>O's distributed Deep Learning gives you the power of deep neural networks for highest accuracy for classification and regression.</p>"
     +   "<a href='/TutorialDeepLearning.html' class='btn btn-primary'>Try it!</a>"
     + "</div>"