diff --git a/.classpath b/.classpath index 4246eaf28c..4df45c27c8 100644 --- a/.classpath +++ b/.classpath @@ -34,6 +34,6 @@ - + diff --git a/R/h2o-DESCRIPTION.template b/R/h2o-DESCRIPTION.template index ebdd024fea..656f032552 100644 --- a/R/h2o-DESCRIPTION.template +++ b/R/h2o-DESCRIPTION.template @@ -10,5 +10,5 @@ License: Apache License (== 2.0) Depends: R (>= 2.13.0), RCurl, rjson, statmod, tools, methods, utils Collate: Wrapper.R Internal.R Classes.R ParseImport.R Algorithms.R NeedsCompilation: no -SystemRequirements: java +SystemRequirements: java 1.6 or higher URL: http://www.0xdata.com diff --git a/R/h2o-package/R/Algorithms.R b/R/h2o-package/R/Algorithms.R index b56fd1900b..e4b0371f36 100644 --- a/R/h2o-package/R/Algorithms.R +++ b/R/h2o-package/R/Algorithms.R @@ -53,7 +53,7 @@ h2o.gbm <- function(x, y, distribution='multinomial', data, n.trees=10, interact if(params$distribution == "multinomial") { # temp = matrix(unlist(res$cm), nrow = length(res$cm)) # mySum$prediction_error = 1-sum(diag(temp))/sum(temp) - mySum$prediction_error = tail(res$cm, 1)[[1]]$'_predErr' + mySum$prediction_error = tail(res$'cms', 1)[[1]]$'_predErr' } return(mySum) } @@ -68,8 +68,8 @@ h2o.gbm <- function(x, y, distribution='multinomial', data, n.trees=10, interact result$params = params if(result$params$distribution == "multinomial") { - class_names = tail(res$'_domains', 1)[[1]] - result$confusion = .build_cm(tail(res$cm, 1)[[1]]$'_arr', class_names) # res$'_domains'[[length(res$'_domains')]]) + class_names = res$'cmDomain' #tail(res$'_domains', 1)[[1]] + result$confusion = .build_cm(tail(res$'cms', 1)[[1]]$'_arr', class_names) # res$'_domains'[[length(res$'_domains')]]) result$classification <- T } else result$classification <- F @@ -375,7 +375,7 @@ h2o.glm.FV <- function(x, y, data, family, nfolds = 10, alpha = 0.5, lambda = 1e result$auc = as.numeric(valid$auc) # Construct confusion matrix - cm_ind = trunc(100*result$best_threshold) + 2 + cm_ind = 
trunc(100*result$best_threshold) + 1 temp = data.frame(t(sapply(valid$'_cms'[[cm_ind]]$'_arr', c))) temp[,3] = c(temp[1,2], temp[2,1])/apply(temp, 1, sum) temp[3,] = c(temp[2,1], temp[1,2], 0)/apply(temp, 2, sum) @@ -781,7 +781,7 @@ h2o.randomForest.VA <- function(x, y, data, ntree=50, depth=50, sample.rate=2/3, } # -------------------------- FluidVecs -------------------------- # -h2o.randomForest.FV <- function(x, y, data, ntree=50, depth=50, sample.rate=2/3, nbins=100, seed=-1, validation, nodesize=1) { +h2o.randomForest.FV <- function(x, y, data, ntree=50, depth=50, nodesize=1, sample.rate=2/3, nbins=100, seed=-1, importance = FALSE, validation) { args <- .verify_dataxy(data, x, y) if(!is.numeric(ntree)) stop('ntree must be a number') if( any(ntree < 1) ) stop('ntree must be >= 1') @@ -792,7 +792,8 @@ h2o.randomForest.FV <- function(x, y, data, ntree=50, depth=50, sample.rate=2/3, if(!is.numeric(nbins)) stop('nbins must be a number') if( any(nbins < 1)) stop('nbins must be an integer >= 1') if(!is.numeric(seed)) stop("seed must be an integer >= 0") - + if(!is.logical(importance)) stop("importance be logical (TRUE or FALSE)')") + if(missing(validation)) validation = data # else if(class(validation) != "H2OParsedData") stop("validation must be an H2O dataset") else if(!class(validation) %in% c("H2OParsedData", "H2OParsedDataVA")) stop("validation must be an H2O parsed dataset") @@ -801,7 +802,7 @@ h2o.randomForest.FV <- function(x, y, data, ntree=50, depth=50, sample.rate=2/3, # NB: externally, 1 based indexing; internally, 0 based cols <- paste(args$x_i - 1, collapse=',') - res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, response=args$y, cols=cols, ntrees=ntree, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, seed=seed) + res = .h2o.__remoteSend(data@h2o, .h2o.__PAGE_DRF, source=data@key, response=args$y, cols=cols, ntrees=ntree, max_depth=depth, min_rows=nodesize, sample_rate=sample.rate, nbins=nbins, seed=seed, 
importance=as.numeric(importance)) params = list(x=args$x, y=args$y, ntree=ntree, depth=depth, sample.rate=sample.rate, nbins=nbins) if(length(ntree) == 1 && length(depth) == 1 && length(nodesize) == 1 && length(sample.rate) == 1 && length(nbins) == 1) { @@ -827,7 +828,7 @@ h2o.randomForest.FV <- function(x, y, data, ntree=50, depth=50, sample.rate=2/3, # temp = matrix(unlist(res$cm), nrow = length(res$cm)) # mySum$prediction_error = 1-sum(diag(temp))/sum(temp) - mySum$prediction_error = tail(res$cm, 1)[[1]]$'_predErr' + mySum$prediction_error = tail(res$'cms', 1)[[1]]$'_predErr' return(mySum) } @@ -845,8 +846,8 @@ h2o.randomForest.FV <- function(x, y, data, ntree=50, depth=50, sample.rate=2/3, rownames(rf_matrix) = c("Depth", "Leaves") result$forest = rf_matrix - class_names = tail(res$'_domains', 1)[[1]] - result$confusion = .build_cm(tail(res$cm, 1)[[1]]$'_arr', class_names) # res$'_domains'[[length(res$'_domains')]]) + class_names = res$'cmDomain' # tail(res$'_domains', 1)[[1]] + result$confusion = .build_cm(tail(res$'cms', 1)[[1]]$'_arr', class_names) #res$'_domains'[[length(res$'_domains')]]) result$mse = as.numeric(res$errs) # result$ntree = res$N return(result) diff --git a/R/h2o-package/R/Classes.R b/R/h2o-package/R/Classes.R index 987f676101..38c50e2fee 100644 --- a/R/h2o-package/R/Classes.R +++ b/R/h2o-package/R/Classes.R @@ -975,12 +975,15 @@ head.H2OParsedDataVA <- function(x, n = 6L, ...) 
{ if(n > .MAX_INSPECT_VIEW) stop(paste("Cannot view more than", .MAX_INSPECT_VIEW, "rows")) res = .h2o.__remoteSend(x@h2o, .h2o.__PAGE_INSPECT, key=x@key, offset=0, view=n) + res2 = .h2o.__remoteSend(x@h2o, .h2o.__HACK_LEVELS, key=x@key) blanks = sapply(res$cols, function(y) { nchar(y$name) == 0 }) # Must stop R from auto-renaming cols with no name - temp = lapply(res$rows, function(y) { y$row = NULL; tmp = as.data.frame(y); names(tmp)[blanks] = ""; return(tmp) }) + nums = sapply(res2$levels, is.null) # Must stop R from coercing all columns with "NA" to factors, confusing rbind if it is actually numeric + + temp = lapply(res$rows, function(y) { y$row = NULL; na_num = (y[nums] == "NA"); y[nums][na_num] = as.numeric(NA); + tmp = as.data.frame(y); names(tmp)[blanks] = ""; return(tmp) }) if(is.null(temp)) return(temp) x.slice = do.call(rbind, temp) - - res2 = .h2o.__remoteSend(x@h2o, .h2o.__HACK_LEVELS, key = x@key) + for(i in 1:ncol(x)) { if(!is.null(res2$levels[[i]])) x.slice[,i] <- factor(x.slice[,i], levels = res2$levels[[i]]) @@ -997,13 +1000,16 @@ tail.H2OParsedDataVA <- function(x, n = 6L, ...) 
{ idx = seq.int(to = nrx, length.out = n) res = .h2o.__remoteSend(x@h2o, .h2o.__PAGE_INSPECT, key=x@key, offset=idx[1], view=length(idx)) + res2 = .h2o.__remoteSend(x@h2o, .h2o.__HACK_LEVELS, key=x@key) blanks = sapply(res$cols, function(y) { nchar(y$name) == 0 }) # Must stop R from auto-renaming cols with no name - temp = lapply(res$rows, function(y) { y$row = NULL; tmp = as.data.frame(y); names(tmp)[blanks] = ""; return(tmp) }) + nums = sapply(res2$levels, is.null) # Must stop R from coercing all columns with "NA" to factors, confusing rbind if it is actually numeric + + temp = lapply(res$rows, function(y) { y$row = NULL; na_num = (y[nums] == "NA"); y[nums][na_num] = as.numeric(NA); + tmp = as.data.frame(y); names(tmp)[blanks] = ""; return(tmp) }) if(is.null(temp)) return(temp) x.slice = do.call(rbind, temp) rownames(x.slice) = idx - res2 = .h2o.__remoteSend(x@h2o, .h2o.__HACK_LEVELS, key = x@key) for(i in 1:ncol(x)) { if(!is.null(res2$levels[[i]])) x.slice[,i] <- factor(x.slice[,i], levels = res2$levels[[i]]) diff --git a/R/h2o-package/R/Wrapper.R b/R/h2o-package/R/Wrapper.R index 1f345493c2..009f4364cc 100644 --- a/R/h2o-package/R/Wrapper.R +++ b/R/h2o-package/R/Wrapper.R @@ -50,6 +50,13 @@ h2o.shutdown <- function(client, prompt = TRUE) { } # ----------------------- Diagnostics ----------------------- # + + +# **** TODO: This isn't really a cluster status... it's a node status check for the node we're connected to. +# This is possibly confusing because this can come back without warning, +# but if a user tries to do any remoteSend, they will get a "cloud sick warning" +# Suggest cribbing the code from Internal.R that checks cloud status (or just call it here?) 
+ h2o.clusterStatus <- function(client) { if(missing(client) || class(client) != "H2OClient") stop("client must be a H2OClient object") myURL = paste("http://", client@ip, ":", client@port, "/", .h2o.__PAGE_CLOUD, sep = "") diff --git a/h2o-docs/source/Ruser/R_studio.rst b/h2o-docs/source/Ruser/R_studio.rst index 92d7da363d..6b71f2ab61 100644 --- a/h2o-docs/source/Ruser/R_studio.rst +++ b/h2o-docs/source/Ruser/R_studio.rst @@ -1,34 +1,38 @@ -H\ :sub:`2`\ O in R Studio ---------------------------- +H\ :sub:`2`\ O installation in R Studio +------------------------------------------ -These instructions assume you are using R Studio 2.14.0 or later. +These instructions assume you are using R 2.14.0 or later. **STEP 1** -To use H\ :sub: `2`\ O in R, users need a copy of H\ :sub: `2`\ O. -The download package can be obtained by clicking on the button Download H\ :sub:`2`\ O at `http://0xdata.com/downloadtable `_. +The download package containing the H\ :sub:`2`\ O jar file can be +obtained by visiting H\ :sub:`2`\ O available downloads at +`http://0xdata.com/downloadtable `_. -Unzip the downloaded H\ :sub:`2`\ O zip file. +Choose the version of H\ :sub:`2`\ O best for you, and unzip the +downloaded H\ :sub:`2`\ O zip file. The most recent promoted build is +recommended. **STEP 2** -Start an instance of H\ :sub:`2`\ O. For help with this see :ref:`GettingStartedFromaZipFile` +Start an instance of H\ :sub:`2`\ O. For help with this see +:ref:`GettingStartedFromaZipFile` +If users do not start an instance of H\ :sub:`2`\ O, one will be +started automatically for them at localhost: 54321 (see **STEP 4** for +more detail). -Users should be aware that in order for H\ :sub:`2`\ O to successfully run through R, an instance of H\ :sub:`2`\ O must also simultaneously be running. If the instance of H\ :sub:`2`\ O is stopped, the R program will no longer run, and work done will be lost. 
+If the instance of H\ :sub:`2`\ O is stopped, the R +program will no longer run, and work done will be lost. +**STEP 3:** -**STEP 3** - -For users who may have already installed a prior version of the H2O -package. New users may skip this step. - -For packages to be successfully removed and updated in R studio - they -must first be detatched from the R environment and then uninstalled. -Simply enter the following: +New users may skip this step, while users who have previously +installed the H\ :sub:`2`\ O R packages should uninstall them by entering the +following commands to the R console: :: @@ -38,58 +42,70 @@ Simply enter the following: Note: users may get warnings of the type "Error in detatch("package:h2o", unload = TRUE): invalid 'name' argument. -This tells users that there is no h2o package to uninstall. These +This tells users that there is no H\ :sub:`2`\ O package to uninstall. These warnings can safely be ignored. -.. image:: Rstudioinstall1.jpg - :width: 90% - - -**STEP 4** - -Install the H\ :sub:`2`\ O package from the H2ORepo, the H2O cran that -functions exactly like the usual R cran, but is managed and maintained -by H2O. -Simply enter the call: +**STEP 4:** -:: +Install the H\ :sub:`2`\ O package via the H\ :sub:`2`\ O +repository. This repository functions exactly like the R repository, +but is maintained by H\ :sub:`2`\ O. - install.packages("h2o", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/rel-jacobi/2/R", getOption("repos")))) - -as shown here: +**DO NOT CUT AND PASTE THIS CALL INTO R** +The call shown below is specifically for the jacobi/2 build, which may +be older than the build you would like to use. Your call should look +similar to this, and you can find an exact command to copy and paste +by going to H\ :sub:`2`\ O available downloads at +`http://0xdata.com/downloadtable +`_ and selecting the correct version +there. -.. 
image:: Rstudioinstall2.jpg - :width: 90% - + `install.packages("h2o", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/rel-jacobi/2/R", getOption("repos"))))` + +**STEP 4:** +Once the H\ :sub:`2`\ O R package has been installed, call the +package, and establish a connection to a running instance of H\ +:sub:`2`\ O. -**STEP 4** +If there is no running instance of H\ :sub:`2`\ O prior to using +the command "h2o.init()", H\ :sub:`2`\ O in R will start an instance +automatically for the user at localhost:54321, and the user will be +notified. If you would like to connect to an instance at an IP and +port other than localhost:54321, these details must be specified as +arguments in the R call. -If you have not started an instance of H2O from your command line -terminal, R will start an instance for you automatically. If you have -already started an instance, H2O R will connect to this instance, and -no other instance will be started. - -Get R Studio talking to your instance of H\ :sub:`2`\ O by typing in the call: :: - >localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE) + library(h2o) + localH2O <- h2o.init() -Your IP and port may be different, depending on whether you are running H\ :sub:`2`\ O from your computer or a server. If you are running on a server, where it says IP enter the IP address of the server, and the appropriate port number. In the picture below the IP number is everything before the colon, and the port number is the 5 digit string after the colon. +Users who wish to specify a connection +with a server (other than localhost at port 54321) must explicitly +state the IP address and port number in the h2o.init call. +An example is given below, but **do not cut and paste**; users should +specify the IP and port number appropriate to their specific +environment. 
+:: -Upgrading the H\ :sub:`2`\ O R Packages -""""""""""""""""""""""""""""""""""""""" + library(h2o) + localH2O = h2o.init(ip = "192.555.1.123", port = 12345, startH2O = FALSE) -Users may wish to manually upgrade their R packages. They can do this -by returning to STEP 3, and following the instructions through -STEP 4. +**STEP 5: Upgrading Packages** +Users may wish to manually upgrade their R packages. For instance, if +you are running the bleeding edge developer build, it’s possible that +the code has changed, but that the revision number has not, in which +case manually upgrading ensures the most current version of not only +the H\ :sub:`2`\ O code, but the corresponding R code as well. +This can be done by returning to STEP 3, and following the commands +through STEP 4. diff --git a/h2o-docs/source/Ruser/Rinstall.rst b/h2o-docs/source/Ruser/Rinstall.rst index 6abbc2e390..557e8d7c0d 100644 --- a/h2o-docs/source/Ruser/Rinstall.rst +++ b/h2o-docs/source/Ruser/Rinstall.rst @@ -21,9 +21,11 @@ reccomended. Start an instance of H\ :sub:`2`\ O. For help with this see :ref:`GettingStartedFromaZipFile` -Users should be aware that in order for H\ :sub:`2`\ O to successfully -run through R, an instance of H\ :sub:`2`\ O must also simultaneously -be running. If the instance of H\ :sub:`2`\ O is stopped, the R +If users do not start an instance of H\ :sub:`2`\ O, one will be +started automatically for them at localhost: 54321 (see **STEP 4** for +more detail). + +If the instance of H\ :sub:`2`\ O is stopped, the R program will no longer run, and work done will be lost. **STEP 3:** @@ -45,14 +47,20 @@ warnings can safely be ignored. **STEP 4:** -Install the H\ :sub:`2`\ O package, and the H\ :sub:`2`\ O client -package via the H\ :sub:`2`\ O cran. This repository functions -exactly like the R repository, but is maintained by H\ :sub:`2`\ O. +Install the H\ :sub:`2`\ O package via the H\ :sub:`2`\ O +repository. 
This repository functions exactly like the R repository, +but is maintained by H\ :sub:`2`\ O. -:: +**DO NOT CUT AND PASTE THIS CALL INTO R** +The call shown below is specifically for the jacobi/2 build, which may +be older than the build you would like to use. Your call should look +similar to this, and you can find an exact command to copy and paste +by going to H\ :sub:`2`\ O available downloads at +`http://0xdata.com/downloadtable +`_ and selecting the correct version +there. - install.packages("h2o", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/rel-jacobi/2/R", getOption("repos")))) - + `install.packages("h2o", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/rel-jacobi/2/R", getOption("repos"))))` **STEP 4:** @@ -63,12 +71,11 @@ package, and establish a connection to a running instance of H\ If there is no running instance of H\ :sub:`2`\ O prior to using the command "h2o.init()", H\ :sub:`2`\ O in R will start an instance -automatically for the user. +automatically for the user at localhost:54321, and the user will be +notified. If you would like to connect to an instance at an IP and +port other than localhost:54321, these details must be specified as +arguments in the R call. -Note that in the call "localH2O <- h2o.init()" the h2o.init object is -being named localH2O in the R environment for use later in model -specification. Entering the call exactly as it is written below assumes the -user wishes to connect to IP localhost and port: 54321. :: @@ -77,7 +84,7 @@ user wishes to connect to IP localhost and port: 54321. Users who wish to specify a connection -with a server (rather than localhost at port 54321) must explicity +with a server (other than localhost at port 54321) must explicitly state the IP address and port number in the h2o.init call. 
An example is given below, but **do not cut and paste**; users should specify the IP and port number appropriate to their specific diff --git a/h2o-docs/source/Ruser/Rstudioinstall1.jpg b/h2o-docs/source/Ruser/Rstudioinstall1.jpg deleted file mode 100644 index a1685bad0e..0000000000 Binary files a/h2o-docs/source/Ruser/Rstudioinstall1.jpg and /dev/null differ diff --git a/h2o-docs/source/Ruser/Rstudioinstall2.jpg b/h2o-docs/source/Ruser/Rstudioinstall2.jpg deleted file mode 100644 index 1d737aaa71..0000000000 Binary files a/h2o-docs/source/Ruser/Rstudioinstall2.jpg and /dev/null differ diff --git a/h2o-docs/source/userguide/general.rst b/h2o-docs/source/userguide/general.rst index 276a5c754e..64320c4c1c 100644 --- a/h2o-docs/source/userguide/general.rst +++ b/h2o-docs/source/userguide/general.rst @@ -72,6 +72,6 @@ Step by step instructions on how to use each of the algorithms and tools can be found in tutorials . Users have a variety of options for accessing and running H\ :sub:`2`\ O. For instructions on how to get started using H\ :sub:`2`\ O (for example through R, using Java, or -via git-hub), please see the Quick Start Guides. New users may also -find the :ref:`glossary` useful for familiarizing themselves with H\ -:sub:`2`\ O's computing and statistics terms. +via git-hub), please see the Quick Start Guides, and Walk Through +Tutorials. New users may also find the :ref:`glossary` useful for +familiarizing themselves with H\ :sub:`2`\ O's computing and statistics terms. 
diff --git a/src/main/java/hex/drf/DRF.java b/src/main/java/hex/drf/DRF.java index 28d42a4975..0347d82948 100644 --- a/src/main/java/hex/drf/DRF.java +++ b/src/main/java/hex/drf/DRF.java @@ -3,31 +3,23 @@ import static water.util.Utils.*; import hex.ConfusionMatrix; import hex.ShuffleTask; -import hex.gbm.DHistogram; -import hex.gbm.DTree; +import hex.gbm.*; import hex.gbm.DTree.DecidedNode; import hex.gbm.DTree.LeafNode; import hex.gbm.DTree.TreeModel.TreeStats; import hex.gbm.DTree.UndecidedNode; -import hex.gbm.SharedTreeModelBuilder; + +import java.util.Arrays; +import java.util.Random; + import jsr166y.ForkJoinTask; import water.*; import water.H2O.H2OCountedCompleter; import water.api.DRFProgressPage; import water.api.DocGen; -import water.fvec.Chunk; -import water.fvec.Frame; -import water.fvec.Vec; -import water.util.Log; +import water.fvec.*; +import water.util.*; import water.util.Log.Tag.Sys; -import water.util.RString; -import water.util.SB; -import water.util.Utils; - -import java.util.Arrays; -import java.util.Random; - -import static water.util.Utils.*; // Random Forest Trees public class DRF extends SharedTreeModelBuilder { @@ -65,11 +57,12 @@ class myClassFilter extends DRFCopyDataBoolean { myClassFilter() { super("source public static class DRFModel extends DTree.TreeModel { static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code. 
- @API(help = "Number of columns picked at each split", json=true) final int mtries; - @API(help = "Sample rate", json=true) final float sample_rate; - @API(help = "Seed", json=true) final long seed; - public DRFModel(Key key, Key dataKey, Key testKey, String names[], String domains[][], int ntrees, int max_depth, int min_rows, int nbins, int mtries, float sample_rate, long seed) { - super(key,dataKey,testKey,names,domains,ntrees, max_depth, min_rows, nbins); + + @API(help = "Number of columns picked at each split") final int mtries; + @API(help = "Sample rate") final float sample_rate; + @API(help = "Seed") final long seed; + public DRFModel(Key key, Key dataKey, Key testKey, String names[], String domains[][], String[] cmDomain, int ntrees, int max_depth, int min_rows, int nbins, int mtries, float sample_rate, long seed) { + super(key,dataKey,testKey,names,domains,cmDomain,ntrees, max_depth, min_rows, nbins); this.mtries = mtries; this.sample_rate = sample_rate; this.seed = seed; @@ -117,6 +110,9 @@ public DRFModel(DRFModel prior, float[] varimp) { public Frame score( Frame fr ) { return ((DRFModel)UKV.get(dest())).score(fr); } @Override protected Log.Tag.Sys logTag() { return Sys.DRF__; } + @Override protected DRFModel makeModel(Key outputKey, Key dataKey, Key testKey, String[] names, String[][] domains, String[] cmDomain) { + return new DRFModel(outputKey,dataKey,validation==null?null:testKey,names,domains,cmDomain,ntrees, max_depth, min_rows, nbins, mtries, sample_rate, _seed); + } @Override protected DRFModel makeModel( DRFModel model, double err, ConfusionMatrix cm) { return new DRFModel(model, err, cm); } @@ -171,12 +167,9 @@ public static String link(Key k, String content) { // Out-of-bag trees counter - only one since it is shared via k-trees protected Chunk chk_oobt(Chunk chks[]) { return chks[_ncols+1+_nclass+_nclass+_nclass]; } - @Override protected void buildModel( final Frame fr, String names[], String domains[][], final Key outputKey, final Key 
dataKey, final Key testKey, final Timer t_build ) { + @Override protected DRFModel buildModel( DRFModel model, final Frame fr, String names[], String domains[][], String[] cmDomain, final Timer t_build ) { fr.add("OUT_BAG_TREES", response.makeZero()); - DRFModel model = new DRFModel(outputKey,dataKey,validation==null?null:testKey,names,domains,ntrees, max_depth, min_rows, nbins, mtries, sample_rate, _seed); - model.delete_and_lock(self()); - // The RNG used to pick split columns Random rand = createRNG(_seed); @@ -189,28 +182,28 @@ public static String link(Key k, String content) { TreeStats tstats = new TreeStats(); // Build trees until we hit the limit for( tid=0; tid extends private transient boolean _gen_enum; // True if we need to cleanup an enum response column at the end + private transient boolean _validAdapted; // Internal flag to signal that validation dataset adaptation was performed + private transient Frame _adaptedValidation; // Validation dataset is already adapted to a produced model + private transient Vec _adaptedValidationResponse; // Validation response adapted to computed CM domain + private transient Frame _toDeleteFrame; // Frame containing only adapted part of validation which needs to be clean-up at the end of computation + private transient int[][] _modelMap; // Transformation for model response to common domain + private transient int[][] _validMap; // Transformation for validation response to common domain + /** Maximal number of supported levels in response. 
*/ public static final int MAX_SUPPORTED_LEVELS = 1000; @@ -133,6 +140,7 @@ public void buildModel( ) { if( validation != null && !source._key.equals(validation._key) ) validation.read_lock(self()); + // Prepare a frame for this tree algorithm run Frame fr = new Frame(_names, _train); fr.add(_responseName,response); final Frame frm = new Frame(fr); // Model-Frame; no extra columns @@ -142,7 +150,7 @@ public void buildModel( ) { // For doing classification on Integer (not Enum) columns, we want some // handy names in the Model. This really should be in the Model code. String[] domain = response.domain(); - if( domain == null && _nclass > 1 ) // No names? Something is wrong since we converted response to enum + if( domain == null && _nclass > 1 ) // No names? Something is wrong since we converted response to enum already ! assert false : "Response domain' names should be always presented in case of classification"; if( domain == null ) domain = new String[] {"r"}; // For regression, give a name to class 0 @@ -167,9 +175,57 @@ public void buildModel( ) { for( int i=0; i<_nclass; i++ ) fr.add("NIDs_"+domain[i], response.makeCon(_distribution==null ? 0 : (_distribution[i]==0?-1:0))); - // Tail-call position: this forks off in the background, and this call - // returns immediately. The actual model build is merely kicked off. - buildModel(fr,names,domains,outputKey, dataKey, testKey, new Timer()); + // Compute output confusion matrix domain for classification: + // - if validation dataset is specified then CM domain is union of train and validation response domains + // else it is only domain of response column. 
+ String[] cmDomain = null; + if (validation!=null && _nclass > 1) { + // Collect domain for validation response + Vec validResponse = validation.vec(names[names.length-1]).toEnum(); // toEnum call require explicit delete of created vector + String[] validationDomain = validResponse.domain(); + cmDomain = Utils.union(domain, validationDomain); + // Remove temporary vector + UKV.remove(validResponse._key); + if (!Arrays.deepEquals(cmDomain, domain)) { // Muhehehe, we have different domain for CM which is superset of model response domain + // Compute transformations: from response columns to CM domain + _modelMap = Model.getDomainMapping(cmDomain, domain, false); // transformation from model produced response ~> cmDomain + _validMap = Model.getDomainMapping(cmDomain, validationDomain, false); // transformation from validation response domain ~> cmDomain + } + } else if (_nclass > 1) { + cmDomain = domain; + } + + // Timer for model building + Timer bm_timer = new Timer(); + // Create an initial model + TM model = makeModel(outputKey, dataKey, testKey, names, domains, cmDomain); + // Save the model ! (delete_and_lock has side-effect of saving model into DKV) + model.delete_and_lock(self()); + // Prepare adapted validation dataset if it is necessary for classification (we do not need to care about regression) + if (validation!=null) { + Frame[] av = model.adapt(validation, false); + _adaptedValidation = av[0]; // adapted validation data for model + _toDeleteFrame = av[1]; // only adapted vectors which need to be deleted + _validAdapted = true; + // Do I need to perform additional adaptation of response? 
+ if (_validMap!=null) { + assert _modelMap != null : "Model response transformation should exist if validation response transformation exists!"; + String vr = model.responseName(); + Vec tmp = validation.vec(vr).toEnum(); + _adaptedValidationResponse = tmp.makeTransf(_validMap); // Add an original response adapted to CM domain + _toDeleteFrame.add("__dummy__validation_response__", _adaptedValidationResponse); // Add the created vector to a clean-up list + _toDeleteFrame.add("__dummy__validation_enum_response__", tmp); + } + } + + try { + // Compute the model + model = buildModel(model, fr, names, domains, cmDomain, bm_timer); + //} catch (Throwable t) { t.printStackTrace(); + } finally { + model.unlock(self()); // Update and unlock model + cleanUp(fr,bm_timer); // Shared cleanup + } } // Shared cleanup @@ -181,6 +237,8 @@ protected void cleanUp(Frame fr, Timer t_build) { UKV.remove(fr.remove(fr.numCols()-1)._key); // If we made a response column with toEnum, nuke it. if( _gen_enum ) UKV.remove(response._key); + // Delete adapted part of validation dataset + if( _toDeleteFrame != null ) _toDeleteFrame.delete(); // Unlock the input datasets against deletes source.unlock(self()); @@ -191,7 +249,7 @@ protected void cleanUp(Frame fr, Timer t_build) { } transient long _timeLastScoreStart, _timeLastScoreEnd, _firstScore; - protected TM doScoring(TM model, Key outputKey, Frame fr, DTree[] ktrees, int tid, DTree.TreeModel.TreeStats tstats, boolean finalScoring, boolean oob, boolean build_tree_per_node ) { + protected TM doScoring(TM model, Frame fr, DTree[] ktrees, int tid, String[] cmDomain, DTree.TreeModel.TreeStats tstats, boolean finalScoring, boolean oob, boolean build_tree_per_node ) { long now = System.currentTimeMillis(); if( _firstScore == 0 ) _firstScore=now; long sinceLastScore = now-_timeLastScoreStart; @@ -209,7 +267,8 @@ protected TM doScoring(TM model, Key outputKey, Frame fr, DTree[] ktrees, int ti 
(double)(_timeLastScoreEnd-_timeLastScoreStart)/sinceLastScore < 0.1) ) { // 10% duty cycle _timeLastScoreStart = now; // Perform scoring - sc = new Score().doIt(model, fr, validation, oob, build_tree_per_node).report(logTag(),tid,ktrees); + Frame validationFrame = _validAdapted ? _adaptedValidation : validation; + sc = new Score().doIt(model, fr, validationFrame, _adaptedValidationResponse, _modelMap, cmDomain, oob, build_tree_per_node).report(logTag(),tid,ktrees); _timeLastScoreEnd = System.currentTimeMillis(); } // Double update - after scoring @@ -571,12 +630,14 @@ private static class ClassDist extends MRTask2 { // Score the *tree* columns, and produce a confusion matrix public class Score extends MRTask2 { - long _cm[/*actual*/][/*predicted*/]; // Confusion matrix - double _sum; // Sum-squared-error - long _snrows; // Count of voted-on rows + /* @OUT */ long _cm[/*actual*/][/*predicted*/]; // Confusion matrix + /* @OUT */ double _sum; // Sum-squared-error + /* @OUT */ long _snrows; // Count of voted-on rows + /* @IN */ boolean _oob; + /* @IN */ boolean _validation; + /* @IN */ int _cmlen; + /* @IN */ boolean _cavr; // true if validation response needs to be adapted to CM domain //double _auc; //Area under the ROC curve for _nclass == 2 - /* @IN */ boolean _oob; - /* @IN */ boolean _validation; public double sum() { return _sum; } public long[][] cm () { return _cm; } @@ -584,65 +645,95 @@ public class Score extends MRTask2 { public double mse() { return sum() / nrows(); } // public double auc() { return _auc; } - // Compute CM & MSE on either the training or testing dataset - public Score doIt(Model model, Frame fr, Frame validation, boolean oob, boolean build_tree_per_node) { - assert !oob || validation==null ; // oob => validation==null + /** + * Compute CM & MSE on either the training or testing dataset. + * + * It expect already adapted validation dataset which is adapted to a model + * and contains a response which is adapted to confusion matrix domain. 
Uff :) + * + * @param model a model which is used to perform computation + * @param fr a model training frame + * @param adaptedValidation a test frame or null, the test frame is already adapted to a model + * @param modelTransf expose a transformation of model results to be consistent with produced confusion matrix + * @param oob perform out-of-bag validation on training frame + * @param build_tree_per_node + * @return this score object + */ + public Score doIt(Model model, Frame fr, Frame adaptedValidation, Vec adaptedValidationResponse, int[][] modelTransf, String[] cmDomain, boolean oob, boolean build_tree_per_node) { + assert !oob || adaptedValidation==null : "Validation frame cannot be specified if oob validation is demanded!"; // oob => validation==null + assert _nclass == 1 || cmDomain != null ; + + _cmlen = _nclass > 1 ? cmDomain.length : 1; _oob = oob; - // No validation, so do on training data - //System.err.println(fr.toStringAll()); - if( validation == null ) return doAll(fr, build_tree_per_node); + // No validation frame is specified, so perform computation on training data + if( adaptedValidation == null ) return doAll(fr, build_tree_per_node); _validation = true; + _cavr = false; // Validation: need to score the set, getting a probability distribution for each class - // Frame has nclass vectors (nclass, or 1 for regression) - Frame res = model.score(validation); - // Adapt the validation set to the model - Frame frs[] = model.adapt(validation,false); - Frame adapValidation = frs[0]; // adapted validation dataset + // Frame has nclass vectors (nclass, or 1 for regression), for classification it + Frame res = model.score(adaptedValidation, false); // For classification: predicted values (~ values in res[0]) are in interval 0..domain().length-1, for regression just single column. 
+ Frame adapValidation = new Frame(adaptedValidation); // adapted validation dataset // All columns including response of validation frame are already adapted to model - if (_nclass>1) { // Classification + if (_nclass>1) { // Only for Classification for( int i=0; i<_nclass; i++ ) // Distribution of response classes adapValidation.add("ClassDist"+i,res.vecs()[i+1]); + if (modelTransf!=null) { + Vec ar = res.vecs()[0].makeTransf(modelTransf); // perform transformation of model results to be consistent with expected confusion matrix domain + adapValidation.add("Prediction", ar); // add as a prediction + adapValidation.add("ActualValidationResponse", adaptedValidationResponse); + _cavr = true; // signal that we have two predictions vectors in the frame. + res.add("__dummyx__", ar); // add the vector to clean up list + } else + adapValidation.add("Prediction",res.vecs()[0]); // Predicted values + } else { // Regression + adapValidation.add("Prediction",res.vecs()[0]); } - adapValidation.add("Prediction",res.vecs()[0]); // Compute a CM & MSE - doAll(adapValidation, build_tree_per_node); - // Remove the extra adapted Vecs - frs[1].delete(); - // Remove temporary result - res.delete(); + try { + doAll(adapValidation, build_tree_per_node); + } finally { + // Perform clean-up: remove temporary result + res.delete(); + } return this; } @Override public void map( Chunk chks[] ) { Chunk ys = chk_resp(chks); // Response - _cm = new long[_nclass][_nclass]; - float fs[] = new float[_nclass+1]; + Chunk ays = _cavr ? chks[_ncols+1+_nclass+1] : ys; // Remember adapted response + _cm = new long[_cmlen][_cmlen]; + float fs[] = new float[_nclass+1]; // Array to hold prediction and distribution given by the model. // Score all Rows for( int row=0; row 1 ) sum = 1.0f; // Sum of a distribution is 1.0 for classification - else sum = fs[1]; // Sum is the same as prediction for regression. 
+ if (_nclass > 1 ) sum = 1.0f; // Sum of a distribution is 1.0 for classification + else sum = fs[1]; // Sum is the same as prediction for regression. } else { // Passed in the model-specific columns sum = score1(chks,fs,row); } - float err; int ycls=0; + float err; int yact=0; // actual response from dataset if (_oob && inBagRow(chks, row)) continue; // score only on out-of-bag rows if( _nclass > 1 ) { // Classification if( sum == 0 ) { // This tree does not predict this row *at all*? err = 1.0f-1.0f/_nclass; // Then take ycls=0, uniform predictive power } else { - ycls = (int)ys.at80(row); // Response class from 0 to nclass-1 - if (ycls >= _nclass) continue; - assert 0 <= ycls && ycls < _nclass : "weird ycls="+ycls+", y="+ys.at0(row); - err = Float.isInfinite(sum) - ? (Float.isInfinite(fs[ycls+1]) ? 0f : 1f) - : 1.0f-fs[ycls+1]/sum; // Error: distance from predicting ycls as 1.0 + if (_cavr && ys.isNA0(row)) { // Handle adapted validation response - actual response was adapted but does not contain NA - it is implicit misprediction, + err = 1f; + } else { // No adaptation of validation response + yact = (int) ys.at80(row); // Pick an actual prediction adapted to model values <0, nclass-1) + assert 0 <= yact && yact < _nclass : "weird ycls="+yact+", y="+ys.at0(row); + err = Float.isInfinite(sum) + ? (Float.isInfinite(fs[yact+1]) ? 0f : 1f) + : 1.0f-fs[yact+1]/sum; // Error: distance from predicting ycls as 1.0 + } } - assert !Double.isNaN(err) : "fs[cls]="+fs[ycls+1] + ", sum=" + sum; + assert !Double.isNaN(err) : "fs[cls]="+fs[yact+1] + ", sum=" + sum; + // Overwrite response by adapted value to provide correct CM + if (_cavr) yact = (int) ays.at80(row); } else { // Regression err = (float)ys.at0(row) - sum; } @@ -650,12 +741,12 @@ public Score doIt(Model model, Frame fr, Frame validation, boolean oob, boolean assert !Double.isNaN(_sum); // Pick highest prob for our prediction. 
if (_nclass > 1) { // fill CM only for classification - if(_nclass == 2) { //binomial classification -> compute AUC, draw ROC + if(_nclass == 2) { // Binomial classification -> compute AUC, draw ROC for(int i = 0; i < _cms.length; ++i) - _cms[i].add(ycls, ( (1 - (fs[ycls+1] / sum) )>= DEFAULT_THRESHOLDS[i])?1:0); + _cms[i].add(yact, ( (1 - (fs[yact+1] / sum) )>= DEFAULT_THRESHOLDS[i])?1:0); } - int best = _validation ? (int) chks[_ncols+1+_nclass].at80(row) : Model.getPrediction(fs, row); - _cm[ycls][best]++; // Bump Confusion Matrix also + int ypred = _validation ? (int) chks[_ncols+1+_nclass].at80(row) : Model.getPrediction(fs, row); + _cm[yact][ypred]++; // actual v. predicted } _snrows++; } @@ -720,8 +811,9 @@ private double trapeziod_area(double x1, double x2, double y1, double y2) { } protected abstract water.util.Log.Tag.Sys logTag(); - protected abstract void buildModel( Frame fr, String names[], String domains[][], Key outputKey, Key dataKey, Key testKey, Timer t_build ); + protected abstract TM buildModel( TM initialModel, Frame fr, String names[], String domains[][], String[] cmDomain, Timer t_build ); + protected abstract TM makeModel( Key outputKey, Key dataKey, Key testKey, String names[], String domains[][], String[] cmDomain); protected abstract TM makeModel( TM model, double err, ConfusionMatrix cm); protected abstract TM makeModel( TM model, DTree ktrees[], DTree.TreeModel.TreeStats tstats); diff --git a/src/main/java/hex/nn/NNModel.java b/src/main/java/hex/nn/NNModel.java index 0a00894dc0..ccd6f579f5 100644 --- a/src/main/java/hex/nn/NNModel.java +++ b/src/main/java/hex/nn/NNModel.java @@ -566,7 +566,7 @@ public boolean generateHTML(String title, StringBuilder sb) { model_info.job().toHTML(sb); sb.append("
Actions: " - + (Job.isRunning(jobKey) ? Cancel.link(jobKey, "Cancel job") + ", " : "") + + (Job.isRunning(jobKey) ? Cancel.link(jobKey, "Stop training") + ", " : "") + is2.link("Inspect training data", _dataKey) + ", " + (model_info().parameters.validation != null ? (is2.link("Inspect validation data", model_info().parameters.validation._key) + ", ") : "") + water.api.Predict.link(_key, "Score on dataset") + ", " + @@ -721,14 +721,18 @@ else if (i < neurons.length-1) { sb.append("

" + "Progress" + "

"); sb.append("

" + "Epochs: " + String.format("%.3f", epoch_counter) + "

"); - final long pts = fulltrain ? model_info().data_info()._adaptedFrame.numRows() : score_train; - String training = "Number of training set samples for scoring: " + (fulltrain ? "all " : "") + pts; - if (pts < 1000 && model_info().data_info()._adaptedFrame.numRows() >= 1000) training += " (low, scoring might be inaccurate -> consider increasing this number in the expert mode)"; - if (pts > 100000) training += " (large, scoring can be slow -> consider reducing this number in the expert mode or scoring manually)"; - DocGen.HTML.section(sb, training); + // training + { + final long pts = fulltrain ? model_info().data_info()._adaptedFrame.numRows() : score_train; + String training = "Number of training set samples for scoring: " + (fulltrain ? "all " : "") + pts; + if (pts < 1000 && model_info().data_info()._adaptedFrame.numRows() >= 1000) training += " (low, scoring might be inaccurate -> consider increasing this number in the expert mode)"; + if (pts > 100000) training += " (large, scoring can be slow -> consider reducing this number in the expert mode or scoring manually)"; + DocGen.HTML.section(sb, training); + } + // validation if (error.validation) { final long ptsv = fullvalid ? model_info().get_params().validation.numRows() : score_valid; - String validation = "Number of validation set samples for scoring: " + (fullvalid ? "all " : "") + pts; + String validation = "Number of validation set samples for scoring: " + (fullvalid ? 
"all " : "") + ptsv; if (ptsv < 1000 && model_info().get_params().validation.numRows() >= 1000) validation += " (low, scoring might be inaccurate -> consider increasing this number in the expert mode)"; if (ptsv > 100000) validation += " (large, scoring can be slow -> consider reducing this number in the expert mode or scoring manually)"; DocGen.HTML.section(sb, validation); diff --git a/src/main/java/water/Job.java b/src/main/java/water/Job.java index d423e71d16..a3fc5e012f 100644 --- a/src/main/java/water/Job.java +++ b/src/main/java/water/Job.java @@ -435,26 +435,42 @@ public void cancel(final String msg) { } /** - * + * Callback which is called after job cancellation (by user, by exception). */ protected void onCancelled() { } + // This querys the *current object* for its status. // Only valid if you have a Job object that is being updated by somebody. - public boolean isCancelled() { return state == JobState.CANCELLED || state == JobState.CRASHED; } + public boolean isCancelled() { + return state == JobState.CANCELLED || state == JobState.CRASHED; + } public boolean isCrashed() { return state == JobState.CRASHED; } public boolean isDone() { return state == JobState.DONE; } - // Check the K/V store to see the Job is still running + + /** Check if given job is running. + * + * @param job_key job key + * @return true if job is still running else returns false. + */ public static boolean isRunning(Key job_key) { Job j = UKV.get(job_key); return j!=null && j.state == JobState.RUNNING; } - /** + * Returns true if job is not running. + * The job can be cancelled, crashed, or already done. * + * @param jobkey job identification key + * @return true if job is done, cancelled, or crashed, else false + */ + public static boolean isEnded(Key jobkey) { return !isRunning(jobkey); } + + /** + * Marks job as finished and records job end time. 
*/ public void remove() { end_time = System.currentTimeMillis(); @@ -463,9 +479,10 @@ public void remove() { replaceByJobHandle(); } - /** Finds a job with given key or returns null - * @param key - * @return + /** Finds a job with given key or returns null. + * + * @param key job key + * @return returns a job with given job key or null if a job is not found. */ public static final Job findJob(final Key jobkey) { Job job = UKV.get(jobkey); @@ -484,7 +501,8 @@ public static final Job findJobByDest(final Key destKey) { return job; } - /** Returns job execution time in milliseconds */ + /** Returns job execution time in milliseconds. + * If job is not running then returns job execution time. */ public final long runTimeMs() { long until = end_time != 0 ? end_time : System.currentTimeMillis(); return until - start_time; @@ -496,8 +514,6 @@ public final long runTimeMs() { /** Value of the described speed criteria: msecs/frob */ public long speedValue() { return 0; } - // If job is a request - @Override protected Response serve() { fork(); return redirect(); @@ -507,8 +523,13 @@ protected Response redirect() { return Progress2.redirect(this, job_key, destination_key); } - // + /** + * Forks computation of this job. + * + *

The call does not block.

+ * @return always returns this job. + */ public Job fork() { init(); H2OCountedCompleter task = new H2OCountedCompleter() { @@ -545,6 +566,9 @@ public void invoke() { /** * Invoked before job runs. This is the place to checks arguments are valid or throw * IllegalArgumentException. It will get invoked both from the Web and Java APIs. + * + * @throws IllegalArgumentException throws the exception if initialization fails to ensure + * correct job runtime environment. */ protected void init() throws IllegalArgumentException { if (destination_key == null) destination_key = defaultDestKey(); @@ -559,40 +583,6 @@ protected JobState exec() { throw new RuntimeException("Should be overridden if job is a request"); } - public static boolean isJobEnded(Key jobkey) { - boolean done = false; - - Job[] jobs = Job.all(); - boolean found = false; - for (int i = jobs.length - 1; i >= 0; i--) { - if (jobs[i].job_key == null) { - continue; - } - - if (! jobs[i].job_key.equals(jobkey)) { - continue; - } - - // This is the job we are looking for. - found = true; - - if (jobs[i].end_time > 0) { - done = true; - } - - if (jobs[i].isCancelled()) { - done = true; - } - - break; - } - - if (! found) { - done = true; - } - - return done; - } /** * Block synchronously waiting for a job to end, success or not. @@ -601,7 +591,7 @@ public static boolean isJobEnded(Key jobkey) { */ public static void waitUntilJobEnded(Key jobkey, int pollingIntervalMillis) { while (true) { - if (isJobEnded(jobkey)) { + if (Job.isEnded(jobkey)) { return; } diff --git a/src/main/java/water/Model.java b/src/main/java/water/Model.java index 695f9a8af0..9d1e29a30f 100644 --- a/src/main/java/water/Model.java +++ b/src/main/java/water/Model.java @@ -90,25 +90,60 @@ public int nclasses() { /** Variable importance of individual variables measured by this model. 
*/ public VariableImportance varimp() { return null; } - /** Bulk score the frame 'fr', producing a Frame result; the 1st Vec is the + /** Bulk score for given fr frame. + * The frame is always adapted to this model. + * + * @param fr frame to be scored + * @return frame holding predicted values + * + * @see #score(Frame, boolean) + */ + public final Frame score(Frame fr) { + return score(fr, true); + } + /** Bulk score the frame fr, producing a Frame result; the 1st Vec is the * predicted class, the remaining Vecs are the probability distributions. * For Regression (single-class) models, the 1st and only Vec is the - * prediction value. Also passed in a flag describing how hard we try to - * adapt the frame. */ - public Frame score( Frame fr) { + * prediction value. + * + * The flat adapt + * @param fr frame which should be scored + * @param adapt a flag enforcing an adaptation of fr to this model. If flag + * is false scoring code expect that fr is already adapted. + * @return a new frame containing a predicted values. For classification it contains a column with + * prediction and distribution for all response classes. For regression it contains only + * one column with predicted values. + */ + public final Frame score(Frame fr, boolean adapt) { int ridx = fr.find(_names[_names.length-1]); - if(ridx != -1){ // drop the response for scoring! + if (ridx != -1) { // drop the response for scoring! fr = new Frame(fr); fr.remove(ridx); } // Adapt the Frame layout - returns adapted frame and frame containing only // newly created vectors - Frame[] adaptFrms = adapt(fr,false); + Frame[] adaptFrms = adapt ? adapt(fr,false) : null; // Adapted frame containing all columns - mix of original vectors from fr // and newly created vectors serving as adaptors - Frame adaptFrm = adaptFrms[0]; + Frame adaptFrm = adapt ? adaptFrms[0] : fr; // Contains only newly created vectors. The frame eases deletion of these vectors. 
- Frame onlyAdaptFrm = adaptFrms[1]; + Frame onlyAdaptFrm = adapt ? adaptFrms[1] : null; + // Invoke scoring + Frame output = scoreImpl(adaptFrm); + // Be nice to DKV and delete vectors which i created :-) + if (adapt) onlyAdaptFrm.delete(); + return output; + } + + /** Score already adapted frame. + * + * @param fr + * @return + */ + private Frame scoreImpl(Frame adaptFrm) { + int ridx = adaptFrm.find(_names[_names.length-1]); + assert ridx == -1 : "Adapted frame should not contain response in scoring method!"; + // Create a new vector for response Vec v = adaptFrm.anyVec().makeZero(); // If the model produces a classification/enum, copy the domain into the // result vector. @@ -136,8 +171,6 @@ public Frame score( Frame fr) { // Return just the output columns int x=_names.length-1, y=adaptFrm.numCols(); Frame output = adaptFrm.extractFrame(x, y); - // Delete manually only vectors which i created :-/ - onlyAdaptFrm.delete(); return output; } @@ -266,24 +299,24 @@ public static int[][] getDomainMapping(String[] modelDom, String[] colDom, boole * * @param colName name of column which is mapped, can be null. 
* @param modelDom - * @param exact + * @param logNonExactMapping * @return */ - public static int[][] getDomainMapping(String colName, String[] modelDom, String[] colDom, boolean exact) { + public static int[][] getDomainMapping(String colName, String[] modelDom, String[] colDom, boolean logNonExactMapping) { int emap[] = new int[modelDom.length]; boolean bmap[] = new boolean[modelDom.length]; HashMap md = new HashMap(); for( int i = 0; i < colDom.length; i++) md.put(colDom[i], i); for( int i = 0; i < modelDom.length; i++) { Integer I = md.get(modelDom[i]); - if (I == null && exact) + if (I == null && logNonExactMapping) Log.warn(Sys.SCORM, "Column "+colName+" was trained with factor '"+modelDom[i]+"' which DOES NOT appear in column data"); if (I!=null) { emap[i] = I; bmap[i] = true; } } - if (exact) { // Inform about additional values in column domain which do not appear in model domain + if (logNonExactMapping) { // Inform about additional values in column domain which do not appear in model domain for (int i=0; i