
Commit

add tests for oblivious & docs
jeremiedb committed Jul 25, 2023
1 parent 82672ee commit 46e4012
Showing 12 changed files with 172 additions and 4 deletions.
File renamed without changes
Binary file removed docs/src/assets/gaussian_sinus.png
Binary file added docs/src/assets/quantiles-sinus-binary.png
Binary file removed docs/src/assets/quantiles_sinus.png
Binary file added docs/src/assets/regression-sinus-binary.png
Binary file removed docs/src/assets/regression_sinus.png
Binary file removed docs/src/assets/regression_sinus2.png
6 changes: 3 additions & 3 deletions docs/src/tutorials/examples-API.md
@@ -6,7 +6,7 @@ The following provides minimal examples of usage of the various loss functions a

Minimal example to fit a noisy sinus wave.

![](../assets/regression_sinus.png)
![](../assets/regression-sinus-binary.png)

```julia
using EvoTrees
@@ -80,7 +80,7 @@ pred_eval_poisson = model(x_eval)

## Quantile Regression

![](../assets/quantiles_sinus.png)
![](../assets/quantiles-sinus-binary.png)

```julia
# q50
@@ -119,7 +119,7 @@ pred_train_q80 = model(x_train)

## Gaussian Max Likelihood

![](../assets/gaussian_sinus.png)
![](../assets/gaussian-sinus-binary.png)

```julia
config = EvoTreeMLE(
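The tutorial diff above only swaps image paths, but the feature those new plots document is the `tree_type` option. A minimal hedged sketch of switching tree types, reusing only constructor and `fit_evotree` keywords that appear verbatim in this commit's `test/oblivious.jl` (the synthetic sinus data here is illustrative, not a recommended setup):

```julia
# Sketch, assuming the EvoTrees.jl API exercised in test/oblivious.jl below.
using EvoTrees
using Random: seed!

seed!(123)
x = rand(2_000) .* 5
x_train = reshape(x, :, 1)
y_train = sin.(x) .* 0.5 .+ 0.5 .+ randn(2_000) .* 0.1

# Only the `tree_type` keyword changes between the two tree structures.
config = EvoTreeRegressor(
    loss=:mse,
    tree_type="oblivious",  # or "binary" (the default)
    nrounds=200,
    nbins=32,
    rng=123,
)
model = fit_evotree(config; x_train, y_train)
pred = model(x_train)
```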
2 changes: 1 addition & 1 deletion experiments/readme_plots_gpu.jl
@@ -10,7 +10,7 @@ using EvoTrees: predict, sigmoid, logit
# using ProfileView

# prepare a dataset
tree_type = "oblivious"
tree_type = "binary"
device = "gpu"

Random.seed!(123)
15 changes: 15 additions & 0 deletions src/MLJ.jl
@@ -194,6 +194,9 @@ A model type for constructing an EvoTreeRegressor, based on [EvoTrees.jl](https:/
- `nbins=32`: Number of bins into which each feature is quantized. Buckets are defined based on quantiles, hence resulting in equal weight bins. Should be between 2 and 255.
- `monotone_constraints=Dict{Int, Int}()`: Specify monotonic constraints using a dict where the key is the feature index and the value the applicable constraint (-1=decreasing, 0=none, 1=increasing).
Only `:linear`, `:logistic`, `:gamma` and `:tweedie` losses are supported at the moment.
- `tree_type="binary"`: Tree structure to be used. One of:
    - `binary`: Each node of a tree is grown independently. Trees are built depth-wise until the maximum depth is reached, or until the minimum weight or gain (see `gamma`) stops further node splits.
    - `oblivious`: A common splitting condition is imposed on all nodes of a given depth.
- `rng=123`: Either an integer used as a seed to the random number generator or an actual random number generator (`::Random.AbstractRNG`).
# Internal API
@@ -308,6 +311,9 @@ EvoTreeClassifier is used to perform multi-class classification, using cross-ent
- `rowsample=1.0`: Proportion of rows that are sampled at each iteration to build the tree. Should be in `]0, 1]`.
- `colsample=1.0`: Proportion of columns / features that are sampled at each iteration to build the tree. Should be in `]0, 1]`.
- `nbins=32`: Number of bins into which each feature is quantized. Buckets are defined based on quantiles, hence resulting in equal weight bins. Should be between 2 and 255.
- `tree_type="binary"`: Tree structure to be used. One of:
    - `binary`: Each node of a tree is grown independently. Trees are built depth-wise until the maximum depth is reached, or until the minimum weight or gain (see `gamma`) stops further node splits.
    - `oblivious`: A common splitting condition is imposed on all nodes of a given depth.
- `rng=123`: Either an integer used as a seed to the random number generator or an actual random number generator (`::Random.AbstractRNG`).
# Internal API
@@ -430,6 +436,9 @@ EvoTreeCount is used to perform Poisson probabilistic regression on count target
- `colsample=1.0`: Proportion of columns / features that are sampled at each iteration to build the tree. Should be `]0, 1]`.
- `nbins=32`: Number of bins into which each feature is quantized. Buckets are defined based on quantiles, hence resulting in equal weight bins. Should be between 2 and 255.
- `monotone_constraints=Dict{Int, Int}()`: Specify monotonic constraints using a dict where the key is the feature index and the value the applicable constraint (-1=decreasing, 0=none, 1=increasing).
- `tree_type="binary"`: Tree structure to be used. One of:
    - `binary`: Each node of a tree is grown independently. Trees are built depth-wise until the maximum depth is reached, or until the minimum weight or gain (see `gamma`) stops further node splits.
    - `oblivious`: A common splitting condition is imposed on all nodes of a given depth.
- `rng=123`: Either an integer used as a seed to the random number generator or an actual random number generator (`::Random.AbstractRNG`).
# Internal API
@@ -557,6 +566,9 @@ EvoTreeGaussian is used to perform Gaussian probabilistic regression, fitting μ
- `nbins=32`: Number of bins into which each feature is quantized. Buckets are defined based on quantiles, hence resulting in equal weight bins. Should be between 2 and 255.
- `monotone_constraints=Dict{Int, Int}()`: Specify monotonic constraints using a dict where the key is the feature index and the value the applicable constraint (-1=decreasing, 0=none, 1=increasing).
!Experimental feature: note that for Gaussian regression, constraints may not be enforced systematically.
- `tree_type="binary"`: Tree structure to be used. One of:
    - `binary`: Each node of a tree is grown independently. Trees are built depth-wise until the maximum depth is reached, or until the minimum weight or gain (see `gamma`) stops further node splits.
    - `oblivious`: A common splitting condition is imposed on all nodes of a given depth.
- `rng=123`: Either an integer used as a seed to the random number generator or an actual random number generator (`::Random.AbstractRNG`).
# Internal API
@@ -691,6 +703,9 @@ EvoTreeMLE performs maximum likelihood estimation. Assumed distribution is speci
- `nbins=32`: Number of bins into which each feature is quantized. Buckets are defined based on quantiles, hence resulting in equal weight bins. Should be between 2 and 255.
- `monotone_constraints=Dict{Int, Int}()`: Specify monotonic constraints using a dict where the key is the feature index and the value the applicable constraint (-1=decreasing, 0=none, 1=increasing).
!Experimental feature: note that for MLE regression, constraints may not be enforced systematically.
- `tree_type="binary"`: Tree structure to be used. One of:
    - `binary`: Each node of a tree is grown independently. Trees are built depth-wise until the maximum depth is reached, or until the minimum weight or gain (see `gamma`) stops further node splits.
    - `oblivious`: A common splitting condition is imposed on all nodes of a given depth.
- `rng=123`: Either an integer used as a seed to the random number generator or an actual random number generator (`::Random.AbstractRNG`).
# Internal API
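The `tree_type` docstrings added above describe the two growth policies in one line each. A self-contained Julia sketch of the core idea (this is not EvoTrees internals: the variance-reduction gain, the toy data, and the candidate thresholds below are made up for illustration):

```julia
using Statistics

# Variance-reduction gain of splitting rows `is` of (x, y) at threshold `thr`.
function gain(x, y, is, thr)
    l = [i for i in is if x[i] < thr]
    r = [i for i in is if x[i] >= thr]
    (isempty(l) || isempty(r)) && return 0.0
    v(ids) = sum(abs2, y[ids] .- mean(y[ids]))
    return v(is) - v(l) - v(r)
end

x = [0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 0.95, 0.96]
y = [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]
nodes = [[1, 2, 3, 4], [5, 6, 7, 8, 9, 10]]    # the two nodes at the current depth
cands = [0.15, 0.25, 0.35, 0.65, 0.75, 0.85]   # candidate thresholds (bin edges)

# "binary": every node picks its own best threshold independently
best_binary = [cands[argmax([gain(x, y, is, t) for t in cands])] for is in nodes]

# "oblivious": one threshold maximizing the summed gain over all nodes of the depth
best_oblivious = cands[argmax([sum(gain(x, y, is, t) for is in nodes) for t in cands])]
```

Here the two nodes prefer different thresholds (`best_binary == [0.25, 0.75]`), but the oblivious rule must commit to a single condition for the whole level and picks `0.75` because the second node carries more gain; this per-depth sharing is what makes oblivious trees symmetric and cheap to evaluate.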
152 changes: 152 additions & 0 deletions test/oblivious.jl
@@ -0,0 +1,152 @@
using Statistics
using StatsBase: sample
using EvoTrees: sigmoid, logit
using EvoTrees: check_args, check_parameter
using Random: seed!

# prepare a dataset
seed!(123)
nobs = 2_000
features = rand(nobs) .* 5
X = reshape(features, (size(features)[1], 1))
Y = sin.(features) .* 0.5 .+ 0.5
Y = logit(Y) + randn(size(Y)) .* 0.1
Y = sigmoid(Y)
is = collect(1:size(X, 1))

# train-eval split
i_sample = sample(is, size(is, 1), replace=false)
train_size = 0.8
i_train = i_sample[1:floor(Int, train_size * size(is, 1))]
i_eval = i_sample[floor(Int, train_size * size(is, 1))+1:end]

x_train, x_eval = X[i_train, :], X[i_eval, :]
y_train, y_eval = Y[i_train], Y[i_eval]

Yc = (Y .> 0.8) .+ 1
y_train_c, y_eval_c = Yc[i_train], Yc[i_eval]

@testset "oblivious regressor" begin
@testset for loss in [:mse, :logloss, :quantile, :l1, :gamma, :tweedie]

metric = loss == :l1 ? :mae : loss

config = EvoTreeRegressor(
loss=loss,
tree_type="oblivious",
nrounds=200,
nbins=32,
rng=123,
)

model, cache = EvoTrees.init(config, x_train, y_train)
preds_ini = model(x_eval)
mse_error_ini = mean(abs.(preds_ini .- y_eval) .^ 2)
model = fit_evotree(
config;
x_train,
y_train,
x_eval,
y_eval,
metric=metric,
print_every_n=25
)

preds = model(x_eval)
mse_error = mean(abs.(preds .- y_eval) .^ 2)
mse_gain_pct = mse_error / mse_error_ini - 1
@test mse_gain_pct < -0.75

end
end

@testset "oblivious count" begin

config = EvoTreeCount(
tree_type="oblivious",
nrounds=200,
nbins=32,
rng=123,
)

model, cache = EvoTrees.init(config, x_train, y_train)
preds_ini = model(x_eval)
mse_error_ini = mean(abs.(preds_ini .- y_eval) .^ 2)
model = fit_evotree(
config;
x_train,
y_train,
x_eval,
y_eval,
metric=:poisson,
print_every_n=25
)

preds = model(x_eval)
mse_error = mean(abs.(preds .- y_eval) .^ 2)
mse_gain_pct = mse_error / mse_error_ini - 1
@test mse_gain_pct < -0.75

end

@testset "oblivious MLE" begin
@testset for loss in [:gaussian_mle, :logistic_mle]

config = EvoTreeMLE(
loss=loss,
tree_type="oblivious",
nrounds=200,
nbins=32,
rng=123,
)

model, cache = EvoTrees.init(config, x_train, y_train)
preds_ini = model(x_eval)[:, 1]
mse_error_ini = mean(abs.(preds_ini .- y_eval) .^ 2)
model = fit_evotree(
config;
x_train,
y_train,
x_eval,
y_eval,
metric=loss,
print_every_n=25
)

preds = model(x_eval)[:, 1]
mse_error = mean(abs.(preds .- y_eval) .^ 2)
mse_gain_pct = mse_error / mse_error_ini - 1
@test mse_gain_pct < -0.75

end
end

@testset "oblivious classifier" begin

config = EvoTreeClassifier(
tree_type="oblivious",
nrounds=200,
nbins=32,
rng=123,
)

model, cache = EvoTrees.init(config, x_train, y_train_c)
preds_ini = model(x_eval)
acc_ini = mean(map(argmax, eachrow(preds_ini)) .== y_eval_c)

model = fit_evotree(
config;
x_train,
y_train=y_train_c,
x_eval,
y_eval=y_eval_c,
metric=:mlogloss,
print_every_n=25
)

preds = model(x_eval)
acc = mean(map(argmax, eachrow(preds)) .== y_eval_c)

@test acc > 0.9

end
1 change: 1 addition & 0 deletions test/runtests.jl
@@ -9,6 +9,7 @@ using Test

@testset "Internal API" begin
include("core.jl")
include("oblivious.jl")
include("tables.jl")
include("monotonic.jl")
end
