---
title: Iterative partitioning
author:
- name: Bodo Burger
  #affiliation: LMU Munich
  #address: >
  #  First line
  #  Second line
  #email: \email{[email protected]}
  #url: http://example.com
#- name: Second Author
#  affiliation: Affiliation
date: 2018-00-00
output:
  html_document:
    toc: yes
  github_document:
    toc: yes
  pdf_document:
    toc: yes
urlcolor: blue
#bibliography: "literature.bib"
#link-citations: true
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = TRUE,
                      cache = FALSE, cache.path = "cache/iter-algo")
library("intame")
library("ggplot2")
library("mlr")
library("ame")
library("patchwork")
theme_set(theme_light())
```
```{r}
y1 = function(x) 4 * x
y2 = function(x) -4 * cos(4 * pi * x) * x + 4
# Data-generating process: y depends on x1 (linear effect y1) and
# x2 (cyclical effect y2); x_o_cor is correlated with x1 and x2 through
# the common factor x, x_o_indep is independent of them.
dgp = function(n, y1, y2) {
  x = runif(n, min = 0, max = 1)
  x1 = runif(n, min = 0, max = 1) + .5 * x
  x2 = runif(n, min = 0, max = 1) + .5 * x
  x_o_cor = runif(n, min = 0, max = 1) + .5 * x
  x_o_indep = runif(n, min = 0, max = 1) + .5 * runif(n, 0, 1)
  y = y1(x1) + y2(x2) + rnorm(n, 0, .5) + 20 * x_o_cor^2 + 20 * x_o_indep^2
  data.frame(y, x1, x2, x_o_cor, x_o_indep)
}
set.seed(4219)
df = dgp(n = 500, y1, y2)
knitr::kable(head(df))
knitr::kable(cor(df)[, -1])
qplot(df$x1, y1(df$x1), geom = "line") + ggtitle("\"True effect\" for x1") +
  qplot(df$x2, y2(df$x2), geom = "line") + ggtitle("\"True effect\" for x2")
```
```{r fit-neural-network}
# lm.mod.omitted = lm(y ~ x1 + x2, df)
# lm.mod.omitted
lm.mod = lm(y ~ x1 + x2 + I(x_o_cor^2) + I(x_o_indep^2), df)
# lm.mod
df_test = dgp(1000, y1, y2)
mean((df_test$y - predict(lm.mod, df_test))^2) # test set MSE, linear model
tsk = makeRegrTask(data = df, target = "y")
nnet.lrn = makeLearner("regr.nnet", skip = FALSE, size = 20, decay = 0.0001,
                       maxit = 1000, trace = FALSE)
nnet.mod = train(nnet.lrn, tsk)
performance(predict(nnet.mod, newdata = df_test), list(mse)) # test set MSE, neural network
plotPrediction(nnet.mod, tsk, "x1")$plot
```
```{r prediction-variation}
sd(getPredictionResponse(predict(nnet.mod, tsk))) # spread of the predictions
sd(df$y) # spread of the target, for comparison
```
## ALE predictions to fit
```{r ALE}
grid_size = 30
nnet.ale1 = computeALE(nnet.mod$learner.model, df, "x1", grid_size = grid_size)
plot(nnet.ale1)
px1 = nnet.ale1$fp_x
pf1 = nnet.ale1$fp_f
nnet.ale2 = computeALE(nnet.mod$learner.model, df, "x2", grid_size = grid_size)
plot(nnet.ale2)
px2 = nnet.ale2$fp_x
pf2 = nnet.ale2$fp_f
nnet.pd2 = computePD(nnet.mod$learner.model, df, "x2", grid_size = grid_size,
                     grid_method = "quantile")
plot(nnet.pd2)
```
## Partition ALE predictions
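Before calling `iterative_partition`, here is a minimal sketch of the greedy idea, assuming the metric is the weighted mean of per-segment R2 values of linear fits; this is an illustration under that assumption, not intame's actual implementation:
```{r greedy-sketch}
# Greedy partitioning sketch: repeatedly place the single split point that
# maximizes the weighted mean of per-segment R2 values, until the threshold
# is reached or max_splits splits are placed.
greedy_partition = function(x, f, threshold = .95, max_splits = 10) {
  seg_r2 = function(idx) {
    if (length(idx) < 3) return(1) # <= 2 points are always fit perfectly
    r2 = summary(lm(f[idx] ~ x[idx]))$r.squared
    if (is.nan(r2)) 1 else r2 # a constant segment counts as a perfect fit
  }
  score = function(splits) { # weighted mean of per-segment R2
    bounds = c(0, sort(splits), length(x))
    segs = lapply(seq_len(length(bounds) - 1),
                  function(i) (bounds[i] + 1):bounds[i + 1])
    weighted.mean(vapply(segs, seg_r2, numeric(1)), lengths(segs))
  }
  splits = integer(0) # each split is the last index of a segment
  while (score(splits) < threshold && length(splits) < max_splits) {
    candidates = setdiff(seq(2, length(x) - 2), splits)
    gains = vapply(candidates, function(s) score(c(splits, s)), numeric(1))
    splits = c(splits, candidates[which.max(gains)])
  }
  sort(splits)
}
```
It takes the same inputs as `iterative_partition`, e.g. `greedy_partition(px1, pf1)`.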
```{r}
saf_x1 = iterative_partition(px1, pf1, metric_name = "R2", threshold = .95)
plot(saf_x1)
saf_x1
saf_x2 = iterative_partition(px2, pf2, "R2int", threshold = .95, max_splits = 10,
                             greedy = TRUE)
plot(saf_x2)
saf_x2
saf_x2 = iterative_partition(px2, pf2, "R2", threshold = .95, max_splits = 10,
                             greedy = FALSE)
plot(saf_x2)
saf_x2
intame_x2 = intame(nnet.mod$learner.model, df, "x2", fe_grid_size = grid_size,
                   greedy = TRUE)
plot(intame_x2)
intame_x2
# candidate thresholds for the L2 metric, superseded by suggest_threshold() below:
# th = .3
# th = sd(df$y) / 5
# th = max(sd(pf1), sd(pf2)) / 5
th = suggest_threshold(nnet.mod$learner.model, df, c("x1", "x2"), "L2",
                       explained_fraction = .95)
th
saf_x2.L2 = iterative_partition(px2, pf2, "L2", threshold = th, max_splits = 10,
                                greedy = FALSE)
plot(saf_x2.L2)
saf_x2.L2
saf_x2.L1 = iterative_partition(px2, pf2, "L1", threshold = th, max_splits = 10,
                                greedy = FALSE)
plot(saf_x2.L1)
saf_x2.L1
saf_x2_non_greedy = iterative_partition(px2, pf2, "R2", threshold = .98,
                                        max_splits = 10, greedy = FALSE)
plot(saf_x2_non_greedy)
saf_x2_non_greedy
plot(saf_x2_non_greedy) + plot(saf_x2)
# saf_x2_frechet = iterative_partition(px2, pf2, "Frechet", threshold = .5,
#                                      max_splits = 6, greedy = TRUE)
# plot(saf_x2_frechet)
# saf_x2_frechet
# library(profvis)
# profvis(iterative_partition(px2, pf2, "Frechet", threshold = .5, max_splits = 6,
#                             greedy = TRUE))
# # the Frechet metric is very slow
```
## Ideas
- After splitting, try all combinations of the found splits and reward solutions
  with fewer splits (see the sketch below).
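A minimal sketch of that idea, where `score_fun` is a hypothetical scoring function such as the weighted segment R2 from the sketch above: every subset of the found splits is re-scored with a small penalty per split.
```{r prune-sketch, eval=FALSE}
# Exhaustive pruning sketch (score_fun is a hypothetical scoring function):
# re-score every subset of the found splits and reward fewer splits via a
# penalty per split.
prune_splits = function(x, f, splits, score_fun, penalty = .005) {
  best_splits = splits
  best_value = score_fun(x, f, splits) - penalty * length(splits)
  for (m in seq_len(length(splits)) - 1) { # subset sizes 0 .. k-1
    for (idx in combn(length(splits), m, simplify = FALSE)) {
      value = score_fun(x, f, splits[idx]) - penalty * m
      if (value > best_value) {
        best_splits = splits[idx]
        best_value = value
      }
    }
  }
  best_splits
}
```
This enumerates all $2^k$ subsets of $k$ splits, so it is only feasible for the small split counts the algorithm typically returns.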
## Remarks
### Preference for adjacent, left-side points
Comparing *weighted* means of the segments' R2 is important:
without the weighting there was a strong preference for splitting off
adjacent points, since a segment of two adjacent points always yields an R2 of 1.
Further, because the search starts at the left side, left split points are
preferred. Is this a problem?
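A tiny self-contained illustration of the weighting argument (not using intame): the two-point segment has an R2 of 1, which inflates an unweighted mean of segment R2s but barely moves the weighted one.
```{r adjacent-points-r2}
r2_fun = function(x, f) summary(lm(f ~ x))$r.squared
xx = seq(0, 1, length.out = 10)
ff = sin(2 * pi * xx)
left = 1:2; right = 3:10 # split off two adjacent points on the left
r2s = c(r2_fun(xx[left], ff[left]), r2_fun(xx[right], ff[right]))
c(unweighted = mean(r2s),
  weighted = weighted.mean(r2s, w = c(length(left), length(right))))
```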
### A trend influences splits when using R2
A trend might influence when the algorithm stops when R2 is used as the metric:
the higher the slope in a segment, the more likely it is to be split. Is that true?
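One way to probe this in isolation, holding a cyclical residual fixed while varying the slope: the R2 of a single linear fit over the segment grows with the slope, even though the unexplained cyclical component is unchanged.
```{r slope-vs-r2}
# R2 of one linear fit over the whole segment for increasing slope a,
# with a fixed cyclical component: R2 grows with the trend.
x_dem = seq(0, 1, length.out = 100)
cyc = sin(4 * pi * x_dem)
sapply(c(.5, 2, 8), function(a) summary(lm(a * x_dem + cyc ~ x_dem))$r.squared)
```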
```{r slope-data}
set.seed(4219)
x_min = 0; x_max = 10
n = 1000
x1 = runif(n-2, x_min, x_max)
x2 = runif(n-2, x_min, x_max)
```
The first example has a trend that dominates the cyclical component:
```{r slope-small-cycles}
y_flat = function(x) sin(.5 * pi * x)
y_steep = function(x) sin(.5 * pi * x) + x
ggplot() +
  geom_line(aes(x = x1, y = y_flat(x1), col = "y_flat")) +
  geom_line(aes(x = x2, y = y_steep(x2), col = "y_steep")) +
  ggtitle("\"true\" effects")
y = y_flat(x1) + y_steep(x2)
df = data.frame(x1, x2, y)
tsk = makeRegrTask(data = df, target = "y")
nnet.lrn = makeLearner("regr.nnet", skip = FALSE, size = 20, decay = 0.0001,
                       maxit = 1000, trace = FALSE)
nnet.mod = train(nnet.lrn, tsk)
performance(predict(nnet.mod, tsk), list(mse)) # training set MSE
ale.x1 = computeALE(nnet.mod$learner.model, df, "x1")
ale.x2 = computeALE(nnet.mod$learner.model, df, "x2")
ggplot() +
  geom_line(data = ale.x1$plot_data, aes(x = fp_x, y = fp_f, col = "y_flat")) +
  geom_line(data = ale.x2$plot_data, aes(x = fp_x, y = fp_f + 5, col = "y_steep"))
# th = sd(df$y) / 5 # alternative threshold for L2
th = max(sd(ale.x1$fp_f), sd(ale.x2$fp_f)) / 5 # threshold for L2
saf_flat = iterative_partition(ale.x1$fp_x, ale.x1$fp_f, "R2int")
saf_steep = iterative_partition(ale.x2$fp_x, ale.x2$fp_f, "R2int")
plot(saf_flat) + ylim(-5, 5) +
  plot(saf_steep)
saf_flat.L2 = iterative_partition(ale.x1$fp_x, ale.x1$fp_f, "L2", th)
saf_steep.L2 = iterative_partition(ale.x2$fp_x, ale.x2$fp_f, "L2", th)
plot(saf_flat.L2) + ylim(-5, 5) +
  plot(saf_steep.L2)
```
"L2" metric does not depend on the size of the trend.
"R2" metric depends on the size of the trend.
Threshold for "L2": we diveded the standard deviation of the target variable by 5.
Works here, is this generally sensible?
It's arbitrary, but so is 95% for R squared.
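One way to see the difference, assuming both metrics compare the ALE values $f_i$ against per-segment linear fits $\hat{f}_i$:

$$R^2 = 1 - \frac{\sum_i (f_i - \hat{f}_i)^2}{\sum_i (f_i - \bar{f})^2}$$

An added linear trend is absorbed by the linear fit, so the residual sum in the numerator (which is all an L2-type distance uses) stays the same, while the total sum of squares in the denominator grows with the trend, pushing $R^2$ towards 1.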
The second example has a larger cyclical component.
Now we would want the algorithm to catch the cycles.
```{r slope-big-cycles}
y_flat_bc = function(x) 4 * sin(.5 * pi * x)
y_steep_bc = function(x) 4 * sin(.5 * pi * x) + x
ggplot() +
  geom_line(aes(x = x1, y = y_flat_bc(x1), col = "y_flat")) +
  geom_line(aes(x = x2, y = y_steep_bc(x2), col = "y_steep")) +
  ggtitle("\"true\" effects")
y_bc = y_flat_bc(x1) + y_steep_bc(x2)
df_bc = data.frame(x1, x2, y_bc)
tsk_bc = makeRegrTask(data = df_bc, target = "y_bc")
nnet.lrn = makeLearner("regr.nnet", skip = FALSE, size = 20, decay = 0.0001,
                       maxit = 1000, trace = FALSE)
nnet.mod_bc = train(nnet.lrn, tsk_bc)
performance(predict(nnet.mod_bc, tsk_bc), list(mse)) # training set MSE
ale_x1_bc = computeALE(nnet.mod_bc$learner.model, df_bc, "x1")
ale_x2_bc = computeALE(nnet.mod_bc$learner.model, df_bc, "x2")
ggplot() +
  geom_line(data = ale_x1_bc$plot_data, aes(x = fp_x, y = fp_f, col = "y_flat")) +
  geom_line(data = ale_x2_bc$plot_data, aes(x = fp_x, y = fp_f + 5, col = "y_steep"))
# th_bc = sd(df_bc$y_bc) / 5 # alternative threshold for L2
th_bc = max(sd(ale_x1_bc$fp_f), sd(ale_x2_bc$fp_f)) / 5 # threshold for L2
saf_flat_bc = iterative_partition(ale_x1_bc$fp_x, ale_x1_bc$fp_f, "R2int", .95)
saf_steep_bc = iterative_partition(ale_x2_bc$fp_x, ale_x2_bc$fp_f, "R2int", .95)
plot(saf_flat_bc) + ylim(-7, 9) +
  plot(saf_steep_bc)
saf_flat_L2_bc = iterative_partition(ale_x1_bc$fp_x, ale_x1_bc$fp_f, "L2", th_bc)
saf_steep_L2_bc = iterative_partition(ale_x2_bc$fp_x, ale_x2_bc$fp_f, "L2", th_bc)
plot(saf_flat_L2_bc) + ylim(-7.5, 8.5) +
  plot(saf_steep_L2_bc) + ylim(-7.5, 8.5)
```
The result for R2 with threshold = .95 is good.
$sd(y)/5$ works well for L2 again.
### Examples for saturated effect
TODO !!!
### Other models
Here we try a tree-based method. As long as its ALE plot is similar to
the one above, the underlying model does not play a role in the results of
the algorithm.
```{r}
gbm.predict_fun = function(object, newdata) predict(object, newdata, n.trees = 1000)
gbm.lrn = makeLearner("regr.gbm", n.trees = 1000)
# rebuild the first data set and task (df and tsk were overwritten by the
# slope examples above; the same seed reproduces the original data):
set.seed(4219)
df = dgp(n = 500, y1, y2)
tsk = makeRegrTask(data = df, target = "y")
gbm.mod = train(gbm.lrn, tsk)
performance(predict(gbm.mod, newdata = df_test), list(mse)) # test set MSE
gbm.ale1 = computeALE(gbm.mod$learner.model, df, "x1", grid_size = 30,
                      predict_fun = gbm.predict_fun)
plot(gbm.ale1)
gbm.ale2 = computeALE(gbm.mod$learner.model, df, "x2", grid_size = 30,
                      predict_fun = gbm.predict_fun)
plot(gbm.ale2)
saf_x1 = iterative_partition(gbm.ale1$fp_x, gbm.ale1$fp_f, threshold = .9)
plot(saf_x1)
saf_x1
saf_x2 = iterative_partition(gbm.ale2$fp_x, gbm.ale2$fp_f, "R2int",
                             threshold = .9, max_splits = 20)
plot(saf_x2)
saf_x2
```
## Profiling / Benchmarks
```{r profiling-benchmarks}
library(profvis)
profvis(iterative_partition(x1, y_flat(x1), "R2int"))
profvis(iterative_partition(x1, y_flat(x1), "L2", threshold = .1, max_splits = 3,
                            greedy = FALSE))
profvis(iterative_partition(px2, pf2, "R2int", threshold = .95, max_splits = 20,
                            greedy = FALSE))
library(microbenchmark)
microbenchmark(
  iterative_partition(x1, y_flat(x1), "R2int", max_splits = 2),
  iterative_partition(x1, y_flat(x1), "R2int", max_splits = 2, greedy = FALSE),
  times = 5)
saf_steep = iterative_partition(x1, y_steep(x1), "R2int")
saf_flat.L2 = iterative_partition(x1, y_flat(x1), "L2", .1)
saf_steep.L2 = iterative_partition(x1, y_steep(x1), "L2", .1)
```