merge in new master

adamwanggang · Aug 15, 2019 · a8b9cb1
2 parents 7d59549 + ce20f7a
commit a8b9cb1
Show file tree

Hide file tree

Showing 90 changed files with 5,029 additions and 3,066 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,18 +1,16 @@
 language: r
-sudo: required
-dist: trusty
+cache: packages
 warnings_are_errors: true
-
 r:
   - oldrel
   - release
   - devel
 
 r_github_packages:
-  - jimhester/covr
+  - r-lib/covr
 
 after_success:
-  - Rscript -e 'covr::coveralls(quiet = FALSE)'
+  - Rscript -e 'covr::coveralls()'
 
 notifications:
   email:

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,20 +1,21 @@
 Package: ranger
 Type: Package
 Title: A Fast Implementation of Random Forests
-Version: 0.8.2-135
-Date: 2018-10-10
-Author: Marvin N. Wright
+Version: 0.11.4
+Date: 2019-08-15
+Author: Marvin N. Wright [aut, cre], Stefan Wager [ctb], Philipp Probst [ctb]
 Maintainer: Marvin N. Wright <[email protected]>
 Description: A fast implementation of Random Forests, particularly suited for high
           dimensional data. Ensembles of classification, regression, survival and
           probability prediction trees are supported. Data from genome-wide association
           studies can be analyzed efficiently. In addition to data frames, datasets of
-          class 'gwaa.data' (R package 'GenABEL') can be directly analyzed.
+          class 'gwaa.data' (R package 'GenABEL') and 'dgCMatrix' (R package 'Matrix') 
+          can be directly analyzed.
 License: GPL-3
 Imports: Rcpp (>= 0.11.2), Matrix
 LinkingTo: Rcpp, RcppEigen
 Depends: R (>= 3.1)
-Suggests: survival, testthat, GenABEL
-RoxygenNote: 6.1.0
+Suggests: survival, testthat
+RoxygenNote: 6.1.1
 URL: https://github.com/imbs-hl/ranger
 BugReports: https://github.com/imbs-hl/ranger/issues
diff --git a/NAMESPACE b/NAMESPACE
@@ -18,6 +18,7 @@ export(importance_pvalues)
 export(predictions)
 export(ranger)
 export(timepoints)
+export(treeInfo)
 import(stats)
 import(utils)
 importFrom(Matrix,Matrix)

diff --git a/NEWS b/NEWS
@@ -1,8 +1,42 @@
-##### Version 0.8.2
-* Add bias-corrected impurity importance
-* Add impurity importance for survival forests
+##### Version 0.11.4
+* Add "beta" splitrule for bounded outcomes
+
+##### Version 0.11.3
+* Accept user-specified function in quantile prediction
+
+##### Version 0.11.2
+* Bug fixes
+
+##### Version 0.11.1
+* Bug fixes
 
-##### Version 0.8.1
+##### Version 0.11.0
+* Add max.depth parameter to limit tree depth
+* Add inbag argument for manual selection of observations in trees
+* Add support of splitting weights for corrected impurity importance 
+* Internal changes (slightly improved computation speed)
+* Warning: Possible seed differences compared to older versions
+* Bug fixes
+
+##### Version 0.10.0
+* Change license of C++ core to MIT (R package is still GPL3)
+* Better 'order' mode for unordered factors for multiclass and survival
+* Add 'order' mode for unordered factors for GenABEL SNP data (binary classification and regression)
+* Add class-weighted Gini splitting
+* Add fixed proportion sampling
+* Add impurity importance for the maxstat splitting rule
+* Remove GenABEL from suggested packages (removed from CRAN). GenABEL data is still supported
+* Improve memory management (internal changes)
+* Bug fixes
+
+##### Version 0.9.0
+* Add bias-corrected impurity importance (actual impurity reduction, AIR)
+* Add quantile prediction as in quantile regression forests
+* Add treeInfo() function to extract human readable tree structure
+* Add standard error estimation with the infinitesimal jackknife (now the default)
+* Add impurity importance for survival forests
+* Faster aggregation of predictions
+* Fix memory issues on Windows 7
 * Bug fixes
 
 ##### Version 0.8.0

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,95 @@
+##### Version 0.11.4
+* Add "beta" splitrule for bounded outcomes
+
+##### Version 0.11.3
+* Accept user-specified function in quantile prediction
+
+##### Version 0.11.2
+* Bug fixes
+
+##### Version 0.11.1
+* Bug fixes
+
+##### Version 0.11.0
+* New CRAN version
+
+##### Version 0.10.6
+* Internal changes (slightly improved computation speed)
+* Warning: Possible seed differences compared to older versions
+* Bug fixes
+
+##### Version 0.10.5
+* Add support of splitting weights for corrected impurity importance 
+* Bug fixes
+
+##### Version 0.10.4
+* Add inbag argument for manual selection of observations in trees
+
+##### Version 0.10.3
+* Bug fixes
+
+##### Version 0.10.2
+* Add max.depth parameter to limit tree depth
+
+##### Version 0.10.1
+* Bug fixes
+
+##### Version 0.10.0
+* New CRAN version
+
+##### Version 0.9.12
+* Remove GenABEL from suggested packages (removed from CRAN). GenABEL data is still supported
+
+##### Version 0.9.11
+* Improve memory management (internal changes)
+
+##### Version 0.9.10
+* Add impurity importance for the maxstat splitting rule
+* Bug fixes
+
+##### Version 0.9.9
+* Add 'order' mode for unordered factors for GenABEL SNP data (binary classification and regression)
+
+##### Version 0.9.8
+* Bug fixes
+
+##### Version 0.9.7
+* Change license of C++ core to MIT (R package is still GPL3)
+
+##### Version 0.9.6
+* Better 'order' mode for unordered factors for multiclass and survival
+
+##### Version 0.9.5
+* Bug fixes
+
+##### Version 0.9.4
+* Add class-weighted Gini splitting
+
+##### Version 0.9.3
+* Bug fixes
+
+##### Version 0.9.2
+* Add fixed proportion sampling
+
+##### Version 0.9.1
+* Bug fixes
+
+##### Version 0.9.0
+* New CRAN version
+
+##### Version 0.8.5
+* Faster aggregation of predictions
+* Fix memory issues on Windows 7
+* Add treeInfo() function to extract human readable tree structure
+
+##### Version 0.8.4
+* Add quantile prediction as in quantile regression forests
+
+##### Version 0.8.3
+* Add standard error estimation with the infinitesimal jackknife (now the default)
+
 ##### Version 0.8.2
-* Add bias-corrected impurity importance
+* Add bias-corrected impurity importance (actual impurity reduction, AIR)
 * Add impurity importance for survival forests
 
 ##### Version 0.8.1

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,7 +1,11 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-rangerCpp <- function(treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_data, use_sparse_data) {
-    .Call(`_ranger_rangerCpp`, treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_data, use_sparse_data)
+rangerCpp <- function(treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_data, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag) {
+    .Call(`_ranger_rangerCpp`, treetype, dependent_variable_name, input_data, variable_names, mtry, num_trees, verbose, seed, num_threads, write_forest, importance_mode_r, min_node_size, split_select_weights, use_split_select_weights, always_split_variable_names, use_always_split_variable_names, status_variable_name, prediction_mode, loaded_forest, snp_data, sample_with_replacement, probability, unordered_variable_names, use_unordered_variable_names, save_memory, splitrule_r, case_weights, use_case_weights, class_weights, predict_all, keep_inbag, sample_fraction, alpha, minprop, holdout, prediction_type_r, num_random_splits, sparse_data, use_sparse_data, order_snps, oob_error, max_depth, inbag, use_inbag)
+}
+
+numSmaller <- function(values, reference) {
+    .Call(`_ranger_numSmaller`, values, reference)
 }
 
diff --git a/R/csrf.R b/R/csrf.R
@@ -61,7 +61,7 @@
 ##' 
 ##' @author Marvin N. Wright
 ##' @references
-##'   Xu, R., Nettleton, D. & Nordman, D.J. (2014). Case-specific random forests. J Comp Graph Stat 25:49-65. \url{http://dx.doi.org/10.1080/10618600.2014.983641}.
+##'   Xu, R., Nettleton, D. & Nordman, D.J. (2014). Case-specific random forests. J Comp Graph Stat 25:49-65. \url{https://doi.org/10.1080/10618600.2014.983641}.
 ##' @export
 csrf <- function(formula, training_data, test_data, params1 = list(), params2 = list()) {
   ## Grow a random forest on the training data to obtain weights

diff --git a/R/formula.R b/R/formula.R
@@ -6,14 +6,15 @@
 #'
 #' @param formula Object of class \code{formula} or \code{character} describing the model to fit.
 #' @param data Training data of class \code{data.frame}.
+#' @param env The environment in which the left hand side of \code{formula} is evaluated.
 #'
 #' @return Dataset including selected columns and interactions.
-parse.formula <- function(formula, data) {
+parse.formula <- function(formula, data, env = parent.frame()) {
   f <- as.formula(formula)
   t <- terms(f, data = data)
 
   ## Get dependent var(s)
-  response <- data.frame(eval(f[[2]], envir = data))
+  response <- data.frame(eval(f[[2]], envir = data, enclos = env))
   colnames(response) <- deparse(f[[2]])
 
   ## Get independent vars

diff --git a/R/holdoutRF.R b/R/holdoutRF.R
@@ -36,7 +36,7 @@
 ##' @seealso \code{\link{ranger}}
 ##' @author Marvin N. Wright
 ##' @references
-##'   Janitza, S., Celik, E. & Boulesteix, A.-L., (2015). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \url{http://dx.doi.org/10.1007/s11634-016-0276-4}. \cr
+##'   Janitza, S., Celik, E. & Boulesteix, A.-L., (2015). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \url{https://doi.org/10.1007/s11634-016-0276-4}. \cr
 ##' @export 
 holdoutRF <- function(...) {
 
@@ -85,4 +85,4 @@ holdoutRF <- function(...) {
   class(res) <- "holdoutRF"
 
   res
-}
+}
diff --git a/R/importance.R b/R/importance.R
@@ -50,24 +50,54 @@ importance.ranger <- function(x, ...) {
   return(x$variable.importance)
 }
 
-##' Compute variable importance with p-values.
-##'
+##' Compute variable importance with p-values. 
+##' For high dimensional data, the fast method of Janitza et al. (2016) can be used. 
+##' The permutation approach of Altmann et al. (2010) is computationally intensive but can be used with all kinds of data. 
+##' See below for details. 
+##' 
+##' The method of Janitza et al. (2016) uses a clever trick:
+##' With an unbiased variable importance measure, the importance values of non-associated variables vary randomly around zero. 
+##' Thus, all non-positive importance values are assumed to correspond to these non-associated variables and they are used to construct a distribution of the importance under the null hypothesis of no association to the response.
+##' Since only the non-positive values of this distribution can be observed, the positive values are created by mirroring the negative distribution. 
+##' See Janitza et al. (2016) for details.
+##' 
+##' The method of Altmann et al. (2010) uses a simple permutation test: 
+##' The distribution of the importance under the null hypothesis of no association to the response is created by several replications of permuting the response, growing an RF and computing the variable importance.
+##' The authors recommend 50-100 permutations. 
+##' However, much larger numbers have to be used to estimate more precise p-values.
+##' We add 1 to the numerator and denominator to avoid zero p-values.
 ##'
 ##' @title ranger variable importance p-values
-##' @param x ranger or holdoutRF object.
-##' @param method Method to compute p-values. Use "janitza" for the method by Janitza et al. (2015) or "altmann" for the non-parametric method by Altmann et al. (2010).
+##' @param x \code{ranger} or \code{holdoutRF} object.
+##' @param method Method to compute p-values. Use "janitza" for the method by Janitza et al. (2016) or "altmann" for the non-parametric method by Altmann et al. (2010).
 ##' @param num.permutations Number of permutations. Used in the "altmann" method only.
 ##' @param formula Object of class formula or character describing the model to fit. Used in the "altmann" method only.
 ##' @param data Training data of class data.frame or matrix. Used in the "altmann" method only.
-##' @param ... Further arguments passed to ranger(). Used in the "altmann" method only.
-##' @return Variable importance and p-values.
+##' @param ... Further arguments passed to \code{ranger()}. Used in the "altmann" method only.
+##' @return Variable importance and p-value for each variable.
+##' @examples
+##' require(ranger)
+##' 
+##' ## Janitza's p-values with corrected Gini importance
+##' n <- 50
+##' p <- 400
+##' dat <- data.frame(y = factor(rbinom(n, 1, .5)), replicate(p, runif(n)))
+##' rf.sim <- ranger(y ~ ., dat, importance = "impurity_corrected")
+##' importance_pvalues(rf.sim, method = "janitza")
+##' 
+##' ## Permutation p-values 
+##' \dontrun{
+##' rf.iris <- ranger(Species ~ ., data = iris, importance = 'permutation')
+##' importance_pvalues(rf.iris, method = "altmann", formula = Species ~ ., data = iris)
+##' }
 ##' @seealso \code{\link{ranger}}
 ##' @author Marvin N. Wright
 ##' @references
-##'   Janitza, S., Celik, E. & Boulesteix, A.-L., (2015). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \url{http://dx.doi.org/10.1007/s11634-016-0276-4}. \cr
-##'   Altmann, A., Tolosi, L., Sander, O. & Lengauer, T. (2010). Permutation importance: a corrected feature importance measure, Bioinformatics 26(10):1340-1347.
+##'   Janitza, S., Celik, E. & Boulesteix, A.-L., (2016). A computationally fast variable importance test for random forests for high-dimensional data. Adv Data Anal Classif \url{https://doi.org/10.1007/s11634-016-0276-4}. \cr
+##'   Altmann, A., Tolosi, L., Sander, O. & Lengauer, T. (2010). Permutation importance: a corrected feature importance measure, Bioinformatics 26:1340-1347.
 ##' @export 
 importance_pvalues <- function(x, method = c("janitza", "altmann"), num.permutations = 100, formula = NULL, data = NULL, ...) {
+  method <- match.arg(method)
   if (class(x) != "ranger" & class(x) != "holdoutRF") {
     stop("Object is no ranger or holdoutRF object.")
   }
@@ -77,10 +107,10 @@ importance_pvalues <- function(x, method = c("janitza", "altmann"), num.permutat
 
   if (method == "janitza") {
     if (x$importance.mode == "impurity") {
-      stop("Impurity variable importance found. Please use (hold-out) permutation importance to use this method.")
+      stop("Impurity variable importance found. Please use (hold-out) permutation importance or corrected impurity importance to use this method.")
     }
     if (class(x) != "holdoutRF" && x$importance.mode == "permutation") {
-      warning("Permutation variable importance found, inaccurate p-values. Please use hold-out permutation importance to use this method.")
+      warning("Permutation variable importance found, inaccurate p-values. Please use hold-out permutation importance or corrected impurity importance to use this method.")
     }
     if (x$treetype != "Classification") {
       warning("This method is tested for classification only, use with care.")
@@ -92,7 +122,8 @@ importance_pvalues <- function(x, method = c("janitza", "altmann"), num.permutat
     vimp <- c(m1, -m1, m2)
 
     ## Compute p-value
-    pval <- 1 - ecdf(vimp)(x$variable.importance)
+    ## Note: ecdf() counts values smaller than or equal to x, which causes problems with 0 importance values
+    pval <- 1 - numSmaller(x$variable.importance, vimp) / length(vimp)
 
     ## TODO: 100 ok? increase? 
     if (length(m1) == 0) {
@@ -108,14 +139,17 @@ importance_pvalues <- function(x, method = c("janitza", "altmann"), num.permutat
     if (is.null(formula) || is.null(data)) {
       stop("Formula and data required for the 'altmann' method.")
     }
+    if (is.character(formula)) {
+      formula <- formula(formula)
+    }
 
     ## Permute and compute importance again
     if (x$treetype == "Survival") {
       dependent.variable.name <- all.vars(formula)[1:2]
     } else {
       dependent.variable.name <- all.vars(formula)[1]
     }
-    vimp <- replicate(num.permutations, {
+    vimp <- sapply(1:num.permutations, function(i) {
       dat <- data
       dat[, dependent.variable.name] <- dat[sample(nrow(dat)), dependent.variable.name]
       ranger(formula, dat, num.trees = x$num.trees, mtry = x$mtry, min.node.size = x$min.node.size,