Prepare 0.10.2 release. (grf-labs#341)

* Prepare 0.10.2 release.
* Add a smoke test that runs on CRAN.
* Alphabetize the authors list, and add Vitor Hadad as a contributor.
vallader · Nov 24, 2018 · 54b5e4f · 54b5e4f
1 parent 82a75e1
commit 54b5e4f
Show file tree

Hide file tree

Showing 12 changed files with 201 additions and 91 deletions.
diff --git a/DEVELOPING.md b/DEVELOPING.md
@@ -8,7 +8,7 @@ The core forest implementation is written in C++, with an R interface powered by
 
 ### Code structure
 
-![GRF Architecture Diagram](https://github.com/grf-labs/grf/blob/master/documentation/arch_diagram.png)
+![GRF Architecture Diagram](https://github.com/grf-labs/grf/blob/master/diagrams/arch_diagram.png)
 
 The forest implementation is composed of two top-level components, [ForestTrainer](https://github.com/grf-labs/grf/blob/master/core/src/forest/ForestTrainer.h) and [ForestPredictor](https://github.com/grf-labs/grf/blob/master/core/src/forest/ForestPredictor.h).
 

diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ install.packages("grf")
 Any published release can also be installed from source:
 
 ```R
-install.packages("https://raw.github.com/grf-labs/grf/master/releases/grf_0.10.1.tar.gz", repos = NULL, type = "source")
+install.packages("https://raw.github.com/grf-labs/grf/master/releases/grf_0.10.2.tar.gz", repos = NULL, type = "source")
 ```
 
 Note that to install from source, a compiler that implements C++11 is required (clang 3.3 or higher, or g++ 4.8 or higher). If installing on Windows, the RTools toolchain is also required.

diff --git a/r-package/grf/.Rbuildignore b/r-package/grf/.Rbuildignore
@@ -1,4 +1,6 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^bindings$
-^tests/*
+
+^tests/testthat/test_((?!cran).).*
+^tests/benchmarks
diff --git a/r-package/grf/DESCRIPTION b/r-package/grf/DESCRIPTION
@@ -1,11 +1,12 @@
 Package: grf
 Title: Generalized Random Forests (Beta)
-Version: 0.10.1
+Version: 0.10.2
 Author: Julie Tibshirani [aut, cre],
     Susan Athey [aut],
-    Stefan Wager [aut],
     Rina Friedberg [ctb],
+    Vitor Hadad [ctb],
     Luke Miner [ctb],
+    Stefan Wager [aut],
     Marvin Wright [ctb]
 BugReports: https://github.com/grf-labs/grf/issues
 Maintainer: Julie Tibshirani <[email protected]>
@@ -26,7 +27,7 @@ Imports:
     methods,
     Rcpp (>= 0.12.15),
     sandwich (>= 2.4-0)
-RoxygenNote: 6.1.0
+RoxygenNote: 6.1.1
 Suggests:
     testthat
 SystemRequirements: GNU make

diff --git a/r-package/grf/NAMESPACE b/r-package/grf/NAMESPACE
@@ -2,7 +2,7 @@ useDynLib(grf)
 
 importFrom("Matrix", "Matrix", "cBind")
 importFrom("methods", "new")
-importFrom("stats", "coef", "lm", "predict", "runif", "var")
+importFrom("stats", "coef", "lm", "predict", "runif", "var", "weighted.mean")
 importFrom("utils", "capture.output")
 import(Rcpp)
 

diff --git a/r-package/grf/tests/benchmarks/#causal_benchmark.R# b/r-package/grf/tests/benchmarks/#causal_benchmark.R#
@@ -0,0 +1,132 @@
+rm(list = ls())
+library(grf)
+
+# Simulate one of five synthetic designs used to benchmark causal forests.
+#
+# setup: a single number in 1:5 selecting the design. Sample size (n), number
+#   of features (p), the treatment assignment and the outcome model all
+#   depend on the chosen design.
+#
+# Returns a list with the feature matrix X (n x p), the binary treatment
+# vector W, the outcome vector Y, the true treatment effect tau, and the
+# dimensions n and p.
+#
+# Stops with an explicit error when setup is not one of 1:5 (previously this
+# surfaced as an "object 'X' not found" error from the final list() call).
+generate_data = function(setup) {
+  if (setup == 1) {
+    n = 600
+    p = 8
+    X = matrix(runif(n * p), n, p)
+    # NOTE(review): propensity is computed but never used in this setup; W is
+    # drawn with a fixed probability of 0.9 below. Confirm this is intended.
+    propensity = (1 + dbeta(X[,3], 2, 4)) / 4
+    tau = (1 + 1/(1 + exp(-20 * (X[,1] - 0.3)))) * (1 + 1/(1 + exp(-20 * (X[,2] - 0.3))))
+    W = rbinom(n, 1, 0.9)
+    Y = 2 * X[,3] - 1 + (W - 0.5) * tau + rnorm(n)
+  } else if (setup == 2) {
+    n = 1600
+    p = 6
+    k = 3
+    X = matrix(rnorm(n*p), n, p)
+    a = rowMeans(X[,1:k]) * sqrt(k)
+    x = sign(a) * a^2
+    tau = 1/(1 + exp(-X[,p]))
+    # Treatment probability is a logistic function of x, clipped to [0.05, 0.95].
+    propensity = pmax(0.05, pmin(1/(1 + exp(-x)), 0.95))
+    mu = x - tau * (propensity - 0.5)
+    W = rbinom(n, 1, propensity)
+    Y = mu + W * tau + rnorm(n)
+  } else if (setup == 3) {
+    n = 1000
+    p = 10
+    X = matrix(runif(n * p), n, p)
+    tau = (1 + 1/(1 + exp(-20 * (X[,1] - 0.3)))) * (1 + 1/(1 + exp(-20 * (X[,2] - 0.3))))
+    W = rbinom(n, 1, 0.9)
+    Y = (W - 0.9) * tau + rnorm(n)
+  } else if (setup == 4) {
+    n = 1600
+    p = 20
+    X = matrix(runif(n * p), n, p)
+    # NOTE(review): as in setup 1, propensity is unused here and W is drawn
+    # with a fixed probability of 0.9. Confirm this is intended.
+    propensity = (1 + dbeta(X[,3], 2, 4)) / 4
+    # No treatment effect in this design.
+    tau = rep(0, n)
+    W = rbinom(n, 1, 0.9)
+    Y = 2 * X[,3] - 1 + (W - 0.5) * tau + rnorm(n)
+  } else if (setup == 5) {
+    n = 4000
+    p = 10
+    X = matrix(rnorm(n * p), n, p)
+    W = rbinom(n, 1, 0.1)
+    tau = 0.2 * (X[,3] > 0)
+    Y = X[,1] + X[,2] + tau * W + rnorm(n)
+  } else {
+    stop("Unknown setup: ", setup, ". Expected a value between 1 and 5.",
+         call. = FALSE)
+  }
+  list(X=X, W=W, Y=Y, tau=tau, n=n, p=p)
+}
+
+# Score a treatment-effect estimator on one simulated design.
+#
+# estimate_tau: function called as estimate_tau(X, Y, W) that returns one
+#   estimated effect per row of X.
+# setup: design index forwarded to generate_data().
+#
+# Draws a scatter plot of the true effects against the estimates with a
+# 45-degree reference line, and returns the root-mean-squared error of the
+# estimates.
+evaluate_method = function(estimate_tau, setup = 1) {
+  data = generate_data(setup)
+  tau.hat = estimate_tau(data$X, data$Y, data$W)
+
+  # The plotted expressions are kept as-is: they become the axis labels.
+  plot(data$tau, tau.hat)
+  abline(0, 1)
+
+  squared.error = (tau.hat - data$tau)^2
+  sqrt(mean(squared.error))
+}
+
+# Build an estimator closure with fixed causal forest settings.
+#
+# The four arguments are passed by name to causal_forest(). The returned
+# function takes (X, Y, W), fits the forest, prints it, and returns the
+# `predictions` column of predict() called without new data.
+make_causal_forest = function(stabilize.splits, min.node.size,
+                              alpha, imbalance.penalty) {
+  function(X, Y, W) {
+    forest = causal_forest(X, Y, W,
+                           min.node.size = min.node.size,
+                           alpha = alpha,
+                           imbalance.penalty = imbalance.penalty,
+                           stabilize.splits = stabilize.splits)
+    fitted = predict(forest)
+    print(forest)
+    fitted$predictions
+  }
+}
+
+# Each run below evaluates one causal_forest configuration on setups 1-5 and
+# stores the value returned by evaluate_method() for each setup.
+
+# No parameter tuning, split stabilization off.
+res.untuned.unstab = sapply(1:5, function(setup) {
+  estimator = function(X, Y, W) {
+    forest = causal_forest(X, Y, W, tune.parameters = FALSE, stabilize.splits = FALSE)
+    predict(forest)$predictions
+  }
+  evaluate_method(estimator, setup)
+})
+res.untuned.unstab
+
+# Parameter tuning on, split stabilization off.
+res.tuned.unstab = sapply(1:5, function(setup) {
+  estimator = function(X, Y, W) {
+    forest = causal_forest(X, Y, W, tune.parameters = TRUE, stabilize.splits = FALSE)
+    predict(forest)$predictions
+  }
+  evaluate_method(estimator, setup)
+})
+res.tuned.unstab
+
+# No parameter tuning, split stabilization on, min.node.size fixed at 5.
+res.untuned.stab = sapply(1:5, function(setup) {
+  estimator = function(X, Y, W) {
+    forest = causal_forest(X, Y, W, min.node.size = 5, tune.parameters = FALSE, stabilize.splits = TRUE)
+    predict(forest)$predictions
+  }
+  evaluate_method(estimator, setup)
+})
+res.untuned.stab
+
+# Parameter tuning on, split stabilization on.
+res.tuned.stab = sapply(1:5, function(setup) {
+  estimator = function(X, Y, W) {
+    forest = causal_forest(X, Y, W, tune.parameters = TRUE, stabilize.splits = TRUE)
+    predict(forest)$predictions
+  }
+  evaluate_method(estimator, setup)
+})
+res.tuned.stab
+
+# Print all four result vectors together for comparison.
+res.untuned.unstab
+res.tuned.unstab
+res.untuned.stab
+res.tuned.stab
+
+
+# res.ip = outer(c(0, 0.25, 0.5, 2, 4), 1:5,
+#             FUN = Vectorize(function(imbalance.penalty, setup) {
+#               evaluate_method(make_causal_forest(
+#                 stabilize.splits = TRUE,
+#                 min.node.size = 5,
+#                 alpha = 0.05,
+#                 imbalance.penalty = imbalance.penalty),
+#                 setup)
+#             }))
+# 
+# res.mns = outer(c(0, 1, 2, 4, 8, 16, 32), 1:5,
+#                 FUN = Vectorize(function(mns, setup) {
+#                   evaluate_method(make_causal_forest(
+#                     stabilize.splits = TRUE,
+#                     min.node.size = mns,
+#                     alpha = 0.05,
+#                     imbalance.penalty = 0),
+#                     setup)
+#                 }))
+# colnames(res.mns) = sapply(1:5, function(ii) paste("setup", ii))
+# rownames(res.mns) = sapply(c(0, 1, 2, 4, 8, 16, 32), function(ii) paste("min. node size", ii))
diff --git a/r-package/grf/tests/testthat/test_causal_forest.R b/r-package/grf/tests/testthat/test_causal_forest.R
@@ -2,40 +2,6 @@ library(grf)
 
 set.seed(3141)
 
-test_that("causal forests give reasonable estimates", {
-    p = 6
-    n = 1000
-
-    ticks = 101
-    X.test = matrix(0, ticks, p)
-    xvals = seq(-1, 1, length.out = ticks)
-    X.test[,1] = xvals
-    truth = 2 * (xvals > 0)
-
-    X = matrix(2 * runif(n * p) - 1, n, p)
-    W = rbinom(n, 1, 0.5)
-    Y = (X[,1] > 0) * (2 * W  - 1) + 2 * rnorm(n)
-
-    forest.causal = causal_forest(X, Y, W, num.trees = 2000,
-                                  ci.group.size = 4, W.hat = 0.5,
-                                  compute.oob.predictions = FALSE)
-    preds.causal.oob = predict(forest.causal, estimate.variance=TRUE)
-    preds.causal = predict(forest.causal, X.test, estimate.variance=TRUE)
-
-    expect_true(all(preds.causal$variance.estimate > 0))
-    expect_true(all(preds.causal.oob$variance.estimate > 0))
-
-    error = preds.causal$predictions - truth
-    expect_true(mean(error^2) < 0.5)
-
-    truth.oob = 2 * (X[,1] > 0)
-    error.oob = preds.causal.oob$predictions - truth.oob
-    expect_true(mean(error.oob^2) < 0.5)
-
-    Z.oob = error.oob / sqrt(preds.causal.oob$variance.estimate)
-    expect_true(mean(abs(Z.oob) > 1) < 0.5)
-})
-
 test_that("causal forests can split on the last parameter", {
     n = 1000
     p = 6

diff --git a/r-package/grf/tests/testthat/test_cran_smoke_test.R b/r-package/grf/tests/testthat/test_cran_smoke_test.R
@@ -0,0 +1,46 @@
+library(grf)
+set.seed(1234)
+
+test_that("regression forest split frequencies are reasonable", {
+	# The outcome is driven almost entirely by the first feature.
+	n = 100
+	p = 6
+	X = matrix(rnorm(n * p), nrow = n, ncol = p)
+	noise = rnorm(n)
+	Y = 1000 * (X[,1]) + noise
+
+	forest = regression_forest(X, Y, mtry = p)
+	freq = split_frequencies(forest, 4)
+
+	# More than half of the splits counted in the first row of the frequency
+	# matrix should be on feature 1.
+	expect_true(freq[1,1] / sum(freq[1,]) > 1/2)
+})
+
+test_that("causal forests give reasonable estimates", {
+    n = 1000
+    p = 6
+
+    # Training data: the effect of W on Y is 2 when X[,1] > 0 and 0 otherwise.
+    X = matrix(2 * runif(n * p) - 1, nrow = n, ncol = p)
+    W = rbinom(n, 1, 0.5)
+    Y = (X[,1] > 0) * (2 * W  - 1) + 2 * rnorm(n)
+
+    # Test grid: sweep the first feature over [-1, 1], all others held at 0.
+    grid = seq(-1, 1, length.out = 101)
+    X.test = matrix(0, length(grid), p)
+    X.test[,1] = grid
+    truth = 2 * (grid > 0)
+
+    forest.causal = causal_forest(X, Y, W, num.trees = 2000,
+                                  ci.group.size = 4, W.hat = 0.5,
+                                  compute.oob.predictions = FALSE)
+    preds.causal.oob = predict(forest.causal, estimate.variance=TRUE)
+    preds.causal = predict(forest.causal, X.test, estimate.variance=TRUE)
+
+    # Variance estimates must be strictly positive everywhere.
+    expect_true(all(preds.causal$variance.estimate > 0))
+    expect_true(all(preds.causal.oob$variance.estimate > 0))
+
+    # Mean squared error on the test grid.
+    error = preds.causal$predictions - truth
+    expect_true(mean(error^2) < 0.5)
+
+    # Mean squared error of the predictions made without new data.
+    truth.oob = 2 * (X[,1] > 0)
+    error.oob = preds.causal.oob$predictions - truth.oob
+    expect_true(mean(error.oob^2) < 0.5)
+
+    # Fewer than half of the standardized errors may exceed 1 in magnitude.
+    Z.oob = error.oob / sqrt(preds.causal.oob$variance.estimate)
+    expect_true(mean(abs(Z.oob) > 1) < 0.5)
+})
diff --git a/r-package/grf/tests/testthat/test_error_estimation.R b/r-package/grf/tests/testthat/test_error_estimation.R
@@ -80,7 +80,7 @@ test_that("causal error estimates are reasonable", {
   # c(raw.10, raw.20, raw.400) / sigma^2
 
   expect_equal(err.400, raw.400, tolerance = 0.1 * sigma^2)
-  expect_equal(err.10, err.400, tolerance = sigma^2)
+  expect_equal(err.10, err.400, tolerance = 1.5 * sigma^2)
   expect_equal(err.20, err.400, tolerance = 0.5 * sigma^2)
   expect_true(raw.10 - err.400 > sigma^2)
   expect_true(err.10 - err.400 < sigma^2)

diff --git a/r-package/grf/tests/testthat/test_performance.R b/r-package/grf/tests/testthat/test_performance.R
diff --git a/r-package/grf/tests/testthat/test_regression_forest.R b/r-package/grf/tests/testthat/test_regression_forest.R
@@ -76,16 +76,6 @@ test_that("regression variance estimates are positive", {
 	expect_true(mean(abs(Z.oob) > 1) < 0.5)
 })
 
-test_that("regression forest split frequencies are reasonable", {
-	n = 100
-	p = 6
-	X = matrix(rnorm(n*p), n, p)
-	Y = 1000 * (X[,1]) + rnorm(n)
-	rrr = regression_forest(X, Y, mtry = p)
-	freq = split_frequencies(rrr, 4)
-	expect_true(freq[1,1] / sum(freq[1,]) > 1/2)
-})
-
 test_that("using a sparse data representation produces the same predictions", {
 	dim = 20
 	X = diag(rnorm(dim), dim)

diff --git a/releases/CHANGELOG.md b/releases/CHANGELOG.md
@@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## [0.10.2] - 2018-11-23
+### Added
+- Add support for confidence intervals in local linear regression forests. 
+
+### Changed
+- Allow `samples_per_cluster` to be larger than the smallest cluster size.
+
+### Fixed
+- Make sure average effect estimation doesn't error on data with a single feature.
+- Fix a bug in local linear prediction where the penalty wasn't properly calculated.
+- Fix two issues in causal forest tuning that could lead to unstable results.
+- Ensure that the ATE and APE functions correctly account for cluster membership.
 
 ## [0.10.1] - 2018-09-23
 ### Added