Skip to content

Commit

Permalink
Prepare 0.10.2 release. (grf-labs#341)
Browse files Browse the repository at this point in the history
* Prepare 0.10.2 release.
* Add a smoke test that runs on CRAN
* Alphabetize the authors list, and add Vitor Hadad as a contributor.
  • Loading branch information
jtibshirani authored Nov 24, 2018
1 parent 82a75e1 commit 54b5e4f
Show file tree
Hide file tree
Showing 12 changed files with 201 additions and 91 deletions.
2 changes: 1 addition & 1 deletion DEVELOPING.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ The core forest implementation is written in C++, with an R interface powered by

### Code structure

![GRF Architecture Diagram](https://github.com/grf-labs/grf/blob/master/documentation/arch_diagram.png)
![GRF Architecture Diagram](https://github.com/grf-labs/grf/blob/master/diagrams/arch_diagram.png)

The forest implementation is composed of two top-level components, [ForestTrainer](https://github.com/grf-labs/grf/blob/master/core/src/forest/ForestTrainer.h) and [ForestPredictor](https://github.com/grf-labs/grf/blob/master/core/src/forest/ForestPredictor.h).

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ install.packages("grf")
Any published release can also be installed from source:

```R
install.packages("https://raw.github.com/grf-labs/grf/master/releases/grf_0.10.1.tar.gz", repos = NULL, type = "source")
install.packages("https://raw.github.com/grf-labs/grf/master/releases/grf_0.10.2.tar.gz", repos = NULL, type = "source")
```

Note that to install from source, a compiler that implements C++11 is required (clang 3.3 or higher, or g++ 4.8 or higher). If installing on Windows, the RTools toolchain is also required.
Expand Down
4 changes: 3 additions & 1 deletion r-package/grf/.Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
^.*\.Rproj$
^\.Rproj\.user$
^bindings$
^tests/*

^tests/testthat/test_((?!cran).).*
^tests/benchmarks
7 changes: 4 additions & 3 deletions r-package/grf/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
Package: grf
Title: Generalized Random Forests (Beta)
Version: 0.10.1
Version: 0.10.2
Author: Julie Tibshirani [aut, cre],
Susan Athey [aut],
Stefan Wager [aut],
Rina Friedberg [ctb],
Vitor Hadad [ctb],
Luke Miner [ctb],
Stefan Wager [aut],
Marvin Wright [ctb]
BugReports: https://github.com/grf-labs/grf/issues
Maintainer: Julie Tibshirani <[email protected]>
Expand All @@ -26,7 +27,7 @@ Imports:
methods,
Rcpp (>= 0.12.15),
sandwich (>= 2.4-0)
RoxygenNote: 6.1.0
RoxygenNote: 6.1.1
Suggests:
testthat
SystemRequirements: GNU make
Expand Down
2 changes: 1 addition & 1 deletion r-package/grf/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ useDynLib(grf)

importFrom("Matrix", "Matrix", "cBind")
importFrom("methods", "new")
importFrom("stats", "coef", "lm", "predict", "runif", "var")
importFrom("stats", "coef", "lm", "predict", "runif", "var", "weighted.mean")
importFrom("utils", "capture.output")
import(Rcpp)

Expand Down
132 changes: 132 additions & 0 deletions r-package/grf/tests/benchmarks/#causal_benchmark.R#
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# NOTE(review): this file is committed under the Emacs auto-save name
# "#causal_benchmark.R#" -- it was likely added by accident; consider renaming
# it to causal_benchmark.R (or removing it) in a follow-up.
#
# The original `rm(list = ls())` was removed: wiping the caller's global
# environment is a destructive side effect when the script is source()'d.
library(grf)

# Simulate one of five causal-inference benchmark designs.
#
# Args:
#   setup: integer in 1:5 selecting the simulation design. Designs vary in
#     sample size, dimension, treatment-effect heterogeneity, and confounding.
#
# Returns:
#   A list with components:
#     X   - n x p covariate matrix
#     W   - length-n 0/1 treatment assignment
#     Y   - length-n observed outcome
#     tau - length-n true treatment effect (the estimation target)
#     n, p - sample size and covariate dimension
#
# Raises an error for any `setup` outside 1:5 (previously this fell through
# and failed with a confusing "object 'X' not found" message).
generate_data <- function(setup) {
  if (setup == 1) {
    n <- 600
    p <- 8
    X <- matrix(runif(n * p), n, p)
    # NOTE(review): `propensity` is computed here but W is drawn with a fixed
    # probability of 0.9 below -- confirm whether W should use `propensity`.
    propensity <- (1 + dbeta(X[, 3], 2, 4)) / 4
    tau <- (1 + 1 / (1 + exp(-20 * (X[, 1] - 0.3)))) * (1 + 1 / (1 + exp(-20 * (X[, 2] - 0.3))))
    W <- rbinom(n, 1, 0.9)
    Y <- 2 * X[, 3] - 1 + (W - 0.5) * tau + rnorm(n)
  } else if (setup == 2) {
    # Confounded design: propensity depends on the same signal as the outcome.
    n <- 1600
    p <- 6
    k <- 3
    X <- matrix(rnorm(n * p), n, p)
    a <- rowMeans(X[, 1:k]) * sqrt(k)
    x <- sign(a) * a^2
    tau <- 1 / (1 + exp(-X[, p]))
    # Clamp propensities away from 0/1 to keep overlap.
    propensity <- pmax(0.05, pmin(1 / (1 + exp(-x)), 0.95))
    mu <- x - tau * (propensity - 0.5)
    W <- rbinom(n, 1, propensity)
    Y <- mu + W * tau + rnorm(n)
  } else if (setup == 3) {
    # Heterogeneous effect, no main effect, unbalanced treatment (90% treated).
    n <- 1000
    p <- 10
    X <- matrix(runif(n * p), n, p)
    tau <- (1 + 1 / (1 + exp(-20 * (X[, 1] - 0.3)))) * (1 + 1 / (1 + exp(-20 * (X[, 2] - 0.3))))
    W <- rbinom(n, 1, 0.9)
    Y <- (W - 0.9) * tau + rnorm(n)
  } else if (setup == 4) {
    # Null effect (tau = 0 everywhere): measures false discovery of heterogeneity.
    n <- 1600
    p <- 20
    X <- matrix(runif(n * p), n, p)
    # NOTE(review): as in setup 1, `propensity` is unused; W uses a fixed 0.9.
    propensity <- (1 + dbeta(X[, 3], 2, 4)) / 4
    tau <- rep(0, n)
    W <- rbinom(n, 1, 0.9)
    Y <- 2 * X[, 3] - 1 + (W - 0.5) * tau + rnorm(n)
  } else if (setup == 5) {
    # Small step-function effect with rare treatment (10% treated).
    n <- 4000
    p <- 10
    X <- matrix(rnorm(n * p), n, p)
    W <- rbinom(n, 1, 0.1)
    tau <- 0.2 * (X[, 3] > 0)
    Y <- X[, 1] + X[, 2] + tau * W + rnorm(n)
  } else {
    stop("Unknown setup: ", setup, " (expected an integer in 1:5)", call. = FALSE)
  }
  list(X = X, W = W, Y = Y, tau = tau, n = n, p = p)
}

# Run one benchmark round: simulate data for the given design, estimate
# per-observation treatment effects with `estimate_tau`, plot the estimates
# against the truth (with a 45-degree reference line), and return the RMSE.
evaluate_method <- function(estimate_tau, setup = 1) {
  sim <- generate_data(setup)
  estimates <- estimate_tau(sim$X, sim$Y, sim$W)
  plot(sim$tau, estimates)
  abline(0, 1)
  errors <- estimates - sim$tau
  sqrt(mean(errors^2))
}

# Build an estimator closure (X, Y, W) -> predicted CATEs: trains a causal
# forest with the captured tuning parameters, prints the fitted forest, and
# returns its out-of-bag predictions.
make_causal_forest <- function(stabilize.splits, min.node.size,
                               alpha, imbalance.penalty) {
  function(X, Y, W) {
    forest <- causal_forest(
      X, Y, W,
      stabilize.splits = stabilize.splits,
      alpha = alpha,
      min.node.size = min.node.size,
      imbalance.penalty = imbalance.penalty
    )
    oob <- predict(forest)
    print(forest)
    oob$predictions
  }
}

# --- Benchmark grid: (tuned vs. untuned) x (stabilized vs. unstabilized) ----
# Each run reports one RMSE per simulation design (setups 1:5). Bare variable
# names below echo results when the script is run interactively or with
# source(echo = TRUE); they print nothing under plain source().

# Untuned parameters, no split stabilization (baseline).
res.untuned.unstab = sapply(1:5, function(setup) {
evaluate_method(function(X, Y, W) {
cf = causal_forest(X, Y, W, tune.parameters = FALSE, stabilize.splits = FALSE)
cf.pred = predict(cf)
cf.pred$predictions
}, setup)
})
res.untuned.unstab

# Tuned parameters, no split stabilization.
res.tuned.unstab = sapply(1:5, function(setup) {
evaluate_method(function(X, Y, W) {
cf = causal_forest(X, Y, W, tune.parameters = TRUE, stabilize.splits = FALSE)
cf.pred = predict(cf)
cf.pred$predictions
}, setup)
})
res.tuned.unstab

# Untuned (fixed min.node.size = 5), with split stabilization.
res.untuned.stab = sapply(1:5, function(setup) {
evaluate_method(function(X, Y, W) {
cf = causal_forest(X, Y, W, min.node.size = 5, tune.parameters = FALSE, stabilize.splits = TRUE)
cf.pred = predict(cf)
cf.pred$predictions
}, setup)
})
res.untuned.stab

# Tuned parameters, with split stabilization.
res.tuned.stab = sapply(1:5, function(setup) {
evaluate_method(function(X, Y, W) {
cf = causal_forest(X, Y, W, tune.parameters = TRUE, stabilize.splits = TRUE)
cf.pred = predict(cf)
cf.pred$predictions
}, setup)
})
res.tuned.stab

# Recap of all four result vectors for side-by-side comparison.
res.untuned.unstab
res.tuned.unstab
res.untuned.stab
res.tuned.stab


# Optional manual sweeps over imbalance.penalty and min.node.size, kept
# commented out because they are slow; uncomment to reproduce.
# res.ip = outer(c(0, 0.25, 0.5, 2, 4), 1:5,
#                FUN = Vectorize(function(imbalance.penalty, setup) {
#                  evaluate_method(make_causal_forest(
#                    stabilize.splits = TRUE,
#                    min.node.size = 5,
#                    alpha = 0.05,
#                    imbalance.penalty = imbalance.penalty),
#                    setup)
#                }))
#
# res.mns = outer(c(0, 1, 2, 4, 8, 16, 32), 1:5,
#                FUN = Vectorize(function(mns, setup) {
#                  evaluate_method(make_causal_forest(
#                    stabilize.splits = TRUE,
#                    min.node.size = mns,
#                    alpha = 0.05,
#                    imbalance.penalty = 0),
#                    setup)
#                }))
# colnames(res.mns) = sapply(1:5, function(ii) paste("setup", ii))
# rownames(res.mns) = sapply(c(0, 1, 2, 4, 8, 16, 32), function(ii) paste("min. node size", ii))
34 changes: 0 additions & 34 deletions r-package/grf/tests/testthat/test_causal_forest.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,6 @@ library(grf)

set.seed(3141)

test_that("causal forests give reasonable estimates", {
p = 6
n = 1000

ticks = 101
X.test = matrix(0, ticks, p)
xvals = seq(-1, 1, length.out = ticks)
X.test[,1] = xvals
truth = 2 * (xvals > 0)

X = matrix(2 * runif(n * p) - 1, n, p)
W = rbinom(n, 1, 0.5)
Y = (X[,1] > 0) * (2 * W - 1) + 2 * rnorm(n)

forest.causal = causal_forest(X, Y, W, num.trees = 2000,
ci.group.size = 4, W.hat = 0.5,
compute.oob.predictions = FALSE)
preds.causal.oob = predict(forest.causal, estimate.variance=TRUE)
preds.causal = predict(forest.causal, X.test, estimate.variance=TRUE)

expect_true(all(preds.causal$variance.estimate > 0))
expect_true(all(preds.causal.oob$variance.estimate > 0))

error = preds.causal$predictions - truth
expect_true(mean(error^2) < 0.5)

truth.oob = 2 * (X[,1] > 0)
error.oob = preds.causal.oob$predictions - truth.oob
expect_true(mean(error.oob^2) < 0.5)

Z.oob = error.oob / sqrt(preds.causal.oob$variance.estimate)
expect_true(mean(abs(Z.oob) > 1) < 0.5)
})

test_that("causal forests can split on the last parameter", {
n = 1000
p = 6
Expand Down
46 changes: 46 additions & 0 deletions r-package/grf/tests/testthat/test_cran_smoke_test.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# CRAN smoke tests: a small, fast subset of the test suite that still runs on
# CRAN (per the commit, other tests are excluded via .Rbuildignore).
library(grf)
set.seed(1234)

test_that("regression forest split frequencies are reasonable", {
# Y depends (strongly) only on X[,1]; with mtry = p every split considers all
# features, so the forest should split on feature 1 at the root level more
# than half the time.
n = 100
p = 6
X = matrix(rnorm(n*p), n, p)
Y = 1000 * (X[,1]) + rnorm(n)
rrr = regression_forest(X, Y, mtry = p)
freq = split_frequencies(rrr, 4)
expect_true(freq[1,1] / sum(freq[1,]) > 1/2)
})

test_that("causal forests give reasonable estimates", {
# Simulated design: a binary step treatment effect tau = 2 * 1{X1 > 0} with a
# known propensity of 0.5 (passed as W.hat so no propensity estimation runs).
p = 6
n = 1000

# Test grid: sweep X1 over [-1, 1] with the other covariates at 0.
ticks = 101
X.test = matrix(0, ticks, p)
xvals = seq(-1, 1, length.out = ticks)
X.test[,1] = xvals
truth = 2 * (xvals > 0)

X = matrix(2 * runif(n * p) - 1, n, p)
W = rbinom(n, 1, 0.5)
Y = (X[,1] > 0) * (2 * W - 1) + 2 * rnorm(n)

forest.causal = causal_forest(X, Y, W, num.trees = 2000,
ci.group.size = 4, W.hat = 0.5,
compute.oob.predictions = FALSE)
# Both out-of-bag predictions (no newdata) and test-grid predictions, each
# with variance estimates for the CI checks below.
preds.causal.oob = predict(forest.causal, estimate.variance=TRUE)
preds.causal = predict(forest.causal, X.test, estimate.variance=TRUE)

expect_true(all(preds.causal$variance.estimate > 0))
expect_true(all(preds.causal.oob$variance.estimate > 0))

# MSE thresholds (0.5) are loose by design: this is a smoke test, not a
# precision benchmark.
error = preds.causal$predictions - truth
expect_true(mean(error^2) < 0.5)

truth.oob = 2 * (X[,1] > 0)
error.oob = preds.causal.oob$predictions - truth.oob
expect_true(mean(error.oob^2) < 0.5)

# Standardized errors: most |Z| scores should fall within one standard error
# if the variance estimates are roughly calibrated.
Z.oob = error.oob / sqrt(preds.causal.oob$variance.estimate)
expect_true(mean(abs(Z.oob) > 1) < 0.5)
})
2 changes: 1 addition & 1 deletion r-package/grf/tests/testthat/test_error_estimation.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ test_that("causal error estimates are reasonable", {
# c(raw.10, raw.20, raw.400) / sigma^2

expect_equal(err.400, raw.400, tolerance = 0.1 * sigma^2)
expect_equal(err.10, err.400, tolerance = sigma^2)
expect_equal(err.10, err.400, tolerance = 1.5 * sigma^2)
expect_equal(err.20, err.400, tolerance = 0.5 * sigma^2)
expect_true(raw.10 - err.400 > sigma^2)
expect_true(err.10 - err.400 < sigma^2)
Expand Down
39 changes: 0 additions & 39 deletions r-package/grf/tests/testthat/test_performance.R

This file was deleted.

10 changes: 0 additions & 10 deletions r-package/grf/tests/testthat/test_regression_forest.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,6 @@ test_that("regression variance estimates are positive", {
expect_true(mean(abs(Z.oob) > 1) < 0.5)
})

test_that("regression forest split frequencies are reasonable", {
n = 100
p = 6
X = matrix(rnorm(n*p), n, p)
Y = 1000 * (X[,1]) + rnorm(n)
rrr = regression_forest(X, Y, mtry = p)
freq = split_frequencies(rrr, 4)
expect_true(freq[1,1] / sum(freq[1,]) > 1/2)
})

test_that("using a sparse data representation produces the same predictions", {
dim = 20
X = diag(rnorm(dim), dim)
Expand Down
12 changes: 12 additions & 0 deletions releases/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [0.10.2] - 2018-11-23
### Added
- Add support for confidence intervals in local linear regression forests.

### Changed
- Allow samples_per_cluster to be larger than smallest cluster size.

### Fixed
- Make sure average effect estimation doesn't error on data with a single feature.
- Fix a bug in local linear prediction where the penalty wasn't properly calculated.
- Fix two issues in causal forest tuning that could lead to unstable results.
- Ensure that the ATE and APE functions correctly account for cluster membership.

## [0.10.1] - 2018-09-23
### Added
Expand Down

0 comments on commit 54b5e4f

Please sign in to comment.