Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
dearirenelang committed May 8, 2014
2 parents dec284b + c80a65c commit d9b6037
Show file tree
Hide file tree
Showing 9 changed files with 1,598 additions and 1,432 deletions.
2,810 changes: 1,406 additions & 1,404 deletions R/h2o-package/R/Classes.R

Large diffs are not rendered by default.

148 changes: 148 additions & 0 deletions py/testdir_single_jvm_fvec/test_export_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import unittest, random, sys, time
sys.path.extend(['.','..','py'])
import h2o, h2o_cmd, h2o_rf as h2o_rf, h2o_hosts, h2o_import as h2i, h2o_exec, h2o_jobs, h2o_gbm

# Baseline keyword arguments for the random-forest run. The test copies this
# mapping, adds a 'destination_key', and passes it to h2o_cmd.runRF; the entry
# named by TRY is overridden on every trial.
paramDict = dict(
    response='C55',
    cols=None,
    # alternative setting: ignored_cols_by_name='C1,C2,C6,C7,C8'
    ignored_cols_by_name=None,
    classification=1,
    validation=None,
    # noted as a "fail case": ntrees=1, max_depth=30, nbins=100
    ntrees=10,
    max_depth=20,
    min_rows=1,  # normally 1 for classification, 5 for regression
    nbins=200,
    mtries=None,
    sample_rate=0.66,
    importance=0,
    seed=None,
)

# False: the parsed test frame is supplied as 'validation'.
# True: 'validation' stays None (presumably so out-of-bag error is used -- confirm).
DO_OOBE = False

# Name of the parameter swept by the test: 'max_depth', 'ntrees' or 'nbins'.
TRY = 'nbins'

class Basic(unittest.TestCase):
    """Train DRF on covtype, export the parsed frames, then predict.

    The test needs a running H2O cloud (started in setUpClass) and the
    'home-0xdiag-datasets' bucket, so it cannot run stand-alone.
    """

    def tearDown(self):
        """Call h2o.check_sandbox_for_errors() after every test method."""
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        """Start the cloud shared by every test in this class."""
        # 'localhost' is deliberately kept as a module-level global.
        global localhost
        localhost = h2o.decide_if_localhost()
        if localhost:
            h2o.build_cloud(1, java_heap_GB=4)
        else:
            h2o_hosts.build_cloud_with_hosts()

    @classmethod
    def tearDownClass(cls):
        """Shut the cloud down once all tests in this class have run."""
        h2o.tear_down_cloud()

    def test_export_import(self):
        """Sweep the parameter named by TRY, exporting both frames each trial.

        Each trial trains a forest on the 90% covtype split, exports the
        train and test frames under the synthetic-dataset directory, checks
        that seven per-class error percentages come back, and requests
        predictions on the 10% split.

        Raises:
            Exception: if TRY is not 'max_depth', 'ntrees' or 'nbins'.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        h2o.beta_features = True  # fvec
        importFolderPath = "standard"

        # Parse Train ******************************************************
        csvTrainFilename = 'covtype.shuffled.90pct.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        trainKey = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(
            bucket='home-0xdiag-datasets', path=csvTrainPathname,
            hex_key=trainKey, timeoutSecs=180, doSummary=False)
        h2o_cmd.runInspect(None, trainKey)

        # Parse Test ******************************************************
        csvTestFilename = 'covtype.shuffled.10pct.data'
        csvTestPathname = importFolderPath + "/" + csvTestFilename
        testKey = csvTestFilename + ".hex"
        parseTestResult = h2i.import_parse(
            bucket='home-0xdiag-datasets', path=csvTestPathname,
            hex_key=testKey, timeoutSecs=180)
        h2o_cmd.runInspect(None, testKey)

        # Candidate values per sweepable parameter. depthList was referenced
        # but never defined, so TRY = 'max_depth' died with NameError; the
        # values below are a reasonable default -- TODO confirm the intended
        # depths.
        depthList = [10, 20, 30]
        ntreesList = [5, 10, 20, 30]
        # ntreesList = [2]
        nbinsList = [10, 100, 1000]
        tryLists = {
            'max_depth': depthList,
            'ntrees': ntreesList,
            'nbins': nbinsList,
        }
        if TRY not in tryLists:
            raise Exception("huh? %s" % TRY)
        tryList = tryLists[TRY]

        # The validation choice does not depend on the trial, so decide once.
        if DO_OOBE:
            validation = None
        else:
            validation = parseTestResult['destination_key']

        # NOTE(review): indentation was lost in this copy of the file; the
        # export and model checks are taken to be part of the loop body
        # because they use the per-trial modelKey/rfResult -- confirm.
        for trial, d in enumerate(tryList, 1):
            # Work on a per-trial copy so the module-level defaults are not
            # modified by the sweep.
            kwargs = paramDict.copy()
            kwargs[TRY] = d
            kwargs['validation'] = validation
            modelKey = 'RFModel_' + str(trial)
            kwargs['destination_key'] = modelKey

            # adjust timeoutSecs with the number of trees
            # seems ec2 can be really slow
            timeoutSecs = 30 + kwargs['ntrees'] * 200

            start = time.time()
            rfResult = h2o_cmd.runRF(parseResult=parseTrainResult, timeoutSecs=timeoutSecs, **kwargs)
            trainElapsed = time.time() - start
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print("rf train end on %s took %s seconds" % (csvTrainPathname, trainElapsed))

            h2o.nodes[0].export_files(src_key=testKey, path=SYNDATASETS_DIR + "/" + testKey, force=1)
            h2o.nodes[0].export_files(src_key=trainKey, path=SYNDATASETS_DIR + "/" + trainKey, force=1)
            # h2o.nodes[0].export_files(src_key=modelKey, path=SYNDATASETS_DIR + "/" + modelKey, force=1)

            rf_model = rfResult['drf_model']
            # Every field listed here must be present in the model JSON,
            # including the ones that are not printed below.
            for field in ('cms', 'N', 'errs', 'varimp', 'treeStats'):
                self.assertIn(field, rf_model, "drf_model is missing %r" % field)
            errs = rf_model['errs']
            treeStats = rf_model['treeStats']

            for stat in ('maxDepth', 'maxLeaves', 'minDepth', 'minLeaves', 'meanLeaves', 'meanDepth'):
                print("%s: %s" % (stat, treeStats[stat]))
            print("errs[0]: %s" % (errs[0],))
            print("errs[-1]: %s" % (errs[-1],))
            print("errs: %s" % (errs,))

            (_, classErrorPctList, _) = h2o_rf.simpleCheckRFView(rfv=rfResult)
            print("classErrorPctList: %s" % (classErrorPctList,))
            self.assertEqual(len(classErrorPctList), 7, "Should be 7 output classes, so should have 7 class error percentages from a reasonable predict")
            # FIX! should update this expected classification error
            h2o.nodes[0].generate_predictions(model_key=modelKey, data_key=testKey)

# Executed as a script: hand control to the h2o harness's unit_main().
if __name__ == '__main__':
    h2o.unit_main()
24 changes: 12 additions & 12 deletions src/main/java/hex/deeplearning/DeepLearning.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,15 @@ public class DeepLearning extends Job.ValidatedJob {
* model. This option allows users to build a new model as a
* continuation of a previously generated model (e.g., by a grid search).
*/
@API(help = "Model checkpoint to resume training with", filter= Default.class, json = true, gridable = false)
@API(help = "Model checkpoint to resume training with", filter= Default.class, json = true)
public Key checkpoint;

/**
* If given, store the best model so far under this key.
* Model performance is measured by MSE for regression and overall
* error rate for classification (at F1-optimal threshold for binary classification).
*/
@API(help = "Key to store the always-best model under", filter= Default.class, json = true, gridable = false)
@API(help = "Key to store the always-best model under", filter= Default.class, json = true)
public Key best_model_key = null;

/**
Expand All @@ -51,7 +51,7 @@ public class DeepLearning extends Job.ValidatedJob {
* values is fine for many problems, but best results on complex datasets are often
* only attainable via expert mode options.
*/
@API(help = "Enable expert mode (to access all options from GUI)", filter = Default.class, json = true, gridable = false)
@API(help = "Enable expert mode (to access all options from GUI)", filter = Default.class, json = true)
public boolean expert_mode = false;

/*Neural Net Topology*/
Expand Down Expand Up @@ -343,64 +343,64 @@ public class DeepLearning extends Job.ValidatedJob {
* training data scoring dataset. When the error is at or below this threshold,
* training stops.
*/
@API(help = "Stopping criterion for classification error fraction on training data (-1 to disable)", filter = Default.class, dmin=-1, dmax=1, json = true, gridable = false, importance = ParamImportance.EXPERT)
@API(help = "Stopping criterion for classification error fraction on training data (-1 to disable)", filter = Default.class, dmin=-1, dmax=1, json = true, importance = ParamImportance.EXPERT)
public double classification_stop = 0;

/**
* The stopping criteria in terms of regression error (MSE) on the training
* data scoring dataset. When the error is at or below this threshold, training
* stops.
*/
@API(help = "Stopping criterion for regression error (MSE) on training data (-1 to disable)", filter = Default.class, dmin=-1, json = true, gridable = false, importance = ParamImportance.EXPERT)
@API(help = "Stopping criterion for regression error (MSE) on training data (-1 to disable)", filter = Default.class, dmin=-1, json = true, importance = ParamImportance.EXPERT)
public double regression_stop = 1e-6;

/**
* Enable quiet mode for less output to standard output.
*/
@API(help = "Enable quiet mode for less output to standard output", filter = Default.class, json = true, gridable = false)
@API(help = "Enable quiet mode for less output to standard output", filter = Default.class, json = true)
public boolean quiet_mode = false;

/**
* For classification models, the maximum size (in terms of classes) of the
* confusion matrix for it to be printed. This option is meant to avoid printing
* extremely large confusion matrices.
*/
@API(help = "Max. size (number of classes) for confusion matrices to be shown", filter = Default.class, json = true, gridable = false)
@API(help = "Max. size (number of classes) for confusion matrices to be shown", filter = Default.class, json = true)
public int max_confusion_matrix_size = 20;

/**
* The maximum number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)
*/
@API(help = "Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)", filter = Default.class, lmin=0, json = true, gridable = false, importance = ParamImportance.EXPERT)
@API(help = "Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)", filter = Default.class, lmin=0, json = true, importance = ParamImportance.EXPERT)
public int max_hit_ratio_k = 10;

/*Imbalanced Classes*/
/**
* For imbalanced data, balance training data class counts via
* over/under-sampling. This can result in improved predictive accuracy.
*/
@API(help = "Balance training data class counts via over/under-sampling (for imbalanced data)", filter = Default.class, json = true, gridable = false, importance = ParamImportance.EXPERT)
@API(help = "Balance training data class counts via over/under-sampling (for imbalanced data)", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public boolean balance_classes = false;

/**
* When classes are balanced, limit the resulting dataset size to the
* specified multiple of the original dataset size.
*/
@API(help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0)", filter = Default.class, json = true, dmin=1e-3, gridable = false, importance = ParamImportance.EXPERT)
@API(help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0)", filter = Default.class, json = true, dmin=1e-3, importance = ParamImportance.EXPERT)
public float max_after_balance_size = 5.0f;

/**
* Method used to sample the validation dataset for scoring, see Score Validation Samples above.
*/
@API(help = "Method used to sample validation dataset for scoring", filter = Default.class, json = true, gridable = false, importance = ParamImportance.EXPERT)
@API(help = "Method used to sample validation dataset for scoring", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public ClassSamplingMethod score_validation_sampling = ClassSamplingMethod.Uniform;

/*Misc*/
/**
* Gather diagnostics for hidden layers, such as mean and RMS values of learning
* rate, momentum, weights and biases.
*/
@API(help = "Enable diagnostics for hidden layers", filter = Default.class, json = true, gridable = false)
@API(help = "Enable diagnostics for hidden layers", filter = Default.class, json = true)
public boolean diagnostics = true;

/**
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/hex/gbm/SharedTreeModelBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,14 @@ public abstract class SharedTreeModelBuilder<TM extends DTree.TreeModel> extends
* For imbalanced data, balance training data class counts via
* over/under-sampling. This can result in improved predictive accuracy.
*/
@API(help = "Balance training data class counts via over/under-sampling (for imbalanced data)", filter = Default.class, json = true, gridable = false, importance = ParamImportance.EXPERT)
@API(help = "Balance training data class counts via over/under-sampling (for imbalanced data)", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public boolean balance_classes = false;

/**
* When classes are balanced, limit the resulting dataset size to the
* specified multiple of the original dataset size.
*/
@API(help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0)", filter = Default.class, json = true, dmin=1e-3, gridable = true, importance = ParamImportance.EXPERT)
@API(help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0)", filter = Default.class, json = true, dmin=1e-3, importance = ParamImportance.EXPERT)
public float max_after_balance_size = Float.POSITIVE_INFINITY;

// @API(help = "Active feature columns")
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/hex/singlenoderf/SpeeDRF.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public class SpeeDRF extends Job.ValidatedJob {
@API(help = "seed", filter = Default.class, json = true)
public long seed = -1;

@API(help = "Build trees in parallel", filter = Default.class, json = true)
@API(help = "Build trees in parallel")
public boolean parallel = true;

@API(help = "split limit")
Expand Down
11 changes: 11 additions & 0 deletions src/main/java/water/AbstractBuildVersion.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@ abstract public class AbstractBuildVersion {
abstract public String projectVersion();
abstract public String compiledOn();
abstract public String compiledBy();

/**
 * Returns the build number, taken as the fourth dot-separated field of
 * {@link #projectVersion()}.
 *
 * @return the fourth field of the project version, or {@code "(unknown)"}
 *         when the version is null, has fewer than four fields, or
 *         {@code projectVersion()} throws
 */
public String buildNumber() {
  final String unknown = "(unknown)";
  final String version;
  try {
    version = projectVersion();
  } catch (RuntimeException e) {
    // Best effort: a failing version lookup must not break the caller.
    return unknown;
  }
  if (version == null) return unknown;
  // Check the field count explicitly instead of relying on an
  // ArrayIndexOutOfBoundsException being swallowed.
  final String[] fields = version.split("\\.");
  return fields.length > 3 ? fields[3] : unknown;
}

/** Formats this build as {@code H2O v<projectVersion> (<branchName> - <lastCommitHash>)}. */
@Override public String toString() {
  StringBuilder text = new StringBuilder("H2O v");
  text.append(projectVersion());
  text.append(" (").append(branchName());
  text.append(" - ").append(lastCommitHash());
  text.append(")");
  return text.toString();
}
Expand Down
9 changes: 8 additions & 1 deletion src/main/java/water/api/Documentation.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
package water.api;

import water.AbstractBuildVersion;
import water.H2O;

/**
* Redirect to online documentation page.
*/
public class Documentation extends HTMLOnlyRequest {
protected String build(Response response) {
return "<meta http-equiv=\"refresh\" content=\"0; url=http://docs.0xdata.com/\">";
AbstractBuildVersion abv = H2O.getBuildVersion();
String branchName = abv.branchName();
String buildNumber = abv.buildNumber();
String url = "http://s3.amazonaws.com/h2o-release/h2o/" + branchName + "/" + buildNumber + "/docs-website/index.html";
return "<meta http-equiv=\"refresh\" content=\"0; url=" + url + "\">";
}
}
16 changes: 7 additions & 9 deletions src/main/java/water/api/RequestServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,9 @@ public enum API_VERSION {
Request.addToNavbar(registerRequest(new PCA()), "PCA", "Model");
Request.addToNavbar(registerRequest(new GBM()), "GBM", "Model");
Request.addToNavbar(registerRequest(new DeepLearning()),"Deep Learning", "Model");
Request.addToNavbar(registerRequest(new DRF()), "Distributed RF (Beta)", "Model");
Request.addToNavbar(registerRequest(new GLM2()), "GLM (Beta)", "Model");
Request.addToNavbar(registerRequest(new DRF()), "Distributed RF", "Model");
Request.addToNavbar(registerRequest(new GLM2()), "GLM", "Model");
Request.addToNavbar(registerRequest(new SpeeDRF()), "SpeeDRF (Beta)", "Model");
Request.addToNavbar(registerRequest(new KMeans2()), "KMeans (Beta)", "Model");
Request.addToNavbar(registerRequest(new NaiveBayes()), "Naive Bayes (Beta)", "Model");

Expand All @@ -119,10 +120,11 @@ public enum API_VERSION {
Request.addToNavbar(registerRequest(new Cloud()), "Cluster Status", "Admin");
Request.addToNavbar(registerRequest(new IOStatus()), "Cluster I/O", "Admin");
Request.addToNavbar(registerRequest(new Timeline()), "Timeline", "Admin");
Request.addToNavbar(registerRequest(new JProfile()), "Profiler", "Admin");
Request.addToNavbar(registerRequest(new JStack()), "Stack Dump", "Admin");
Request.addToNavbar(registerRequest(new JProfile()), "Profile Dump", "Admin");
Request.addToNavbar(registerRequest(new Debug()), "Debug Dump", "Admin");
Request.addToNavbar(registerRequest(new LogView()), "Inspect Log", "Admin");
Request.addToNavbar(registerRequest(new UnlockKeys()), "Unlock Keys", "Admin");
Request.addToNavbar(registerRequest(new Shutdown()), "Shutdown", "Admin");

// Help and Tutorials
Expand All @@ -141,22 +143,18 @@ public enum API_VERSION {
registerRequest(new ReBalance());
registerRequest(new FrameSplitPage());
registerRequest(new GapStatistic());
registerRequest(new SpeeDRF());
registerRequest(new UnlockKeys());
} else {
Request.addToNavbar(registerRequest(new hex.LR2()), "Linear Regression2", "Beta");
Request.addToNavbar(registerRequest(new ReBalance()), "ReBalance", "Beta");
Request.addToNavbar(registerRequest(new FrameSplitPage()), "Split frame", "Beta");
Request.addToNavbar(registerRequest(new Console()), "Console", "Beta");
Request.addToNavbar(registerRequest(new GapStatistic()), "Gap Statistic", "Beta");
Request.addToNavbar(registerRequest(new SpeeDRF()), "SpeeDRF", "Beta");
Request.addToNavbar(registerRequest(new UnlockKeys()), "Unlock Keys", "Beta");
// Request.addToNavbar(registerRequest(new ExportModel()), "Export Model", "Beta (FluidVecs!)");
// Request.addToNavbar(registerRequest(new ImportModel()), "Import Model", "Beta (FluidVecs!)");
}

// VA stuff
if (false) {
// VA stuff is only shown with -beta
if(H2O.OPT_ARGS.beta == null) {
registerRequest(new Inspect());
registerRequest(new SummaryPage());
registerRequest(new Parse());
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/water/api/Tutorials.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public class Tutorials extends HTMLOnlyRequest {

+ "<div class='span2 col'>"
+ " <h2>GBM</h2>"
+ "<p>GBM uses gradient boosted trees for regression and classification, and is one of the most powerful machine learning methods.</p>"
+ "<p>GBM uses gradient boosted trees for classification and regression, and is one of the most powerful machine learning methods in H<sub>2</sub>O.</p>"
+ "<a href='/TutorialGBM.html' class='btn btn-primary'>Try it!</a>"
+ "</div>"

Expand All @@ -39,13 +39,13 @@ public class Tutorials extends HTMLOnlyRequest {

+ "<div class='span2 col'>"
+ "<h2>K-Means</h2>"
+ "<p>Perform cluster analysis with H<sub>2</sub>O. It employs K-means, a highly scalable clustering algorithm for unsupervised learning on big data.</p>"
+ "<p>Perform cluster analysis with H<sub>2</sub>O. K-means is a scalable clustering algorithm for unsupervised learning on big data.</p>"
+ "<a href='/TutorialKMeans.html' class='btn btn-primary'>Try it!</a>"
+ "</div>"

+ "<div class='span2 col'>"
+ "<h2>Deep Learning</h2>"
+ "<p>H<sub>2</sub>O's distributed Deep Learning gives you the power of deep neural networks for highest predictive accuracy in classification and regression.</p>"
+ "<p>H<sub>2</sub>O's distributed Deep Learning gives you the power of deep neural networks for highest accuracy for classification and regression.</p>"
+ "<a href='/TutorialDeepLearning.html' class='btn btn-primary'>Try it!</a>"
+ "</div>"

Expand Down

0 comments on commit d9b6037

Please sign in to comment.