Skip to content

Commit

Permalink
MLJ table (#262)
Browse files Browse the repository at this point in the history
* integrate MLJ wrapper with Tables API

* integrate MLJ wrapper with Tables API

* adapt MLJ wrapper to Tables API

* fix support for rowtables

* fix for rowtables input

* replace @spawn with @threads

* MLJ test

GPU launcher fixes

* up benchmarks

* add back Matrix support in MLJ
  • Loading branch information
jeremiedb authored Oct 27, 2023
1 parent 535b0c4 commit dbee39d
Show file tree
Hide file tree
Showing 41 changed files with 361 additions and 3,572 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <[email protected]>"]
version = "0.16.4"
version = "0.16.5"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
Expand Down
16 changes: 8 additions & 8 deletions benchmarks/regressor-df.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ using EvoTrees
using DataFrames
using BenchmarkTools
using Random: seed!
import CUDA
# import CUDA

nobs = Int(1e6)
num_feat = Int(100)
Expand Down Expand Up @@ -45,13 +45,13 @@ params_xgb = Dict(
:max_bin => 64,
)

# dtrain = DMatrix(x_train, y_train)
# watchlist = Dict("train" => DMatrix(x_train, y_train))
# @time m_xgb = xgboost(dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...);
# # @btime m_xgb = xgboost($dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...);
# @info "xgboost predict:"
# @time pred_xgb = XGBoost.predict(m_xgb, x_train);
# # @btime XGBoost.predict($m_xgb, $x_train);
# XGBoost baseline: wrap the training matrix and target in a DMatrix, then time training.
dtrain = DMatrix(x_train, y_train)
# The watchlist is built from the same x_train / y_train, so the tracked metric is in-sample.
watchlist = Dict("train" => DMatrix(x_train, y_train))
@time m_xgb = xgboost(dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...);
# @btime m_xgb = xgboost($dtrain; watchlist, nthread=nthread, verbosity=0, eval_metric=metric_xgb, params_xgb...);
@info "xgboost predict:"
# Time prediction on the training matrix with the model fitted above.
@time pred_xgb = XGBoost.predict(m_xgb, x_train);
# @btime XGBoost.predict($m_xgb, $x_train);

@info "EvoTrees"
dtrain = DataFrame(x_train, :auto)
Expand Down
116 changes: 116 additions & 0 deletions benchmarks/regressor-mlj.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
using Revise
using Statistics
using StatsBase: sample
using EvoTrees
using DataFrames
using BenchmarkTools
using Random: seed!
import CUDA
using MLJ

# Benchmark dimensions, boosting budget and element type of the synthetic data.
nobs, num_feat = 2_000_000, 100
nrounds = 200
T = Float64
nthread = Threads.nthreads()
@info "testing with: $nobs observations | $num_feat features. nthread: $nthread"

# Seed the global RNG, then draw the features before the target: the draw order
# determines the generated values, so it must not be swapped.
seed!(123)
x_train = rand(T, nobs, num_feat)
y_train = rand(T, nobs)

@info nthread
# Map the benchmark's loss name onto the loss symbol given to EvoTreeRegressor (`loss_evo`)
# and the metric symbol given to `fit_evotree` (`metric_evo`).
loss = "mse"
if loss == "mse"
    loss_evo = :mse
    metric_evo = :mae
elseif loss == "logloss"
    loss_evo = :logloss
    metric_evo = :logloss
else
    # Fail here with a clear message instead of leaving `loss_evo` / `metric_evo`
    # undefined and hitting an UndefVarError at their first use further down.
    throw(ArgumentError("unsupported loss \"$loss\": expected \"mse\" or \"logloss\""))
end

@info "EvoTrees"
# Verbosity level forwarded to the direct `fit_evotree` calls further down.
verbosity = 0
# Feature-only table built from the raw matrix with auto-generated column names.
# The target column is deliberately not added here (`dtrain.y .= y_train` and
# `target_name = "y"` stay disabled): `machine` receives `y_train` as a separate argument.
dtrain = DataFrame(x_train, :auto)

# Hyper-parameters shared by the MLJ run and the direct EvoTrees runs below.
params_evo = EvoTreeRegressor(;
    loss=loss_evo, nrounds,                    # objective and number of boosting rounds
    alpha=0.5, lambda=0.0, gamma=0.0,
    eta=0.05,
    max_depth=6, min_weight=1.0,
    rowsample=0.5, colsample=0.5,
    nbins=64,
    rng=123,
)

@info "EvoTrees CPU"
device = "cpu"

# Wrap the regressor in MLJ's IteratedModel, evaluated with rmse on a 50% holdout.
# NOTE(review): Patience(200) looks unreachable alongside NumberLimit(40) — confirm
# the intended stopping rule against the MLJIteration control semantics.
iterated_model = IteratedModel(;
    model=params_evo,
    resampling=Holdout(; fraction_train=0.5),
    measures=rmse,
    controls=[Step(5), Patience(200), NumberLimit(40)],
    retrain=false,
)

# Bind the wrapped model to the feature table and target vector, then time the fit.
mach = machine(iterated_model, dtrain, y_train)
@time fit!(mach);

# Direct EvoTrees API benchmark (no MLJ wrapper).
# The calls below identify the target through `target_name`, so the table they receive must
# contain that column. `dtrain` is the feature-only table given to the MLJ machine and is
# left untouched; `copycols=false` makes `dtrain_evo` share the feature columns instead of
# duplicating them.
target_name = "y"
dtrain_evo = DataFrame(dtrain; copycols=false)
dtrain_evo[!, target_name] = y_train

@info "init"
@time m_df, cache_df = EvoTrees.init(params_evo, dtrain_evo; target_name);

# @info "train - no eval"
# @time m_evo_df = fit_evotree(params_evo, dtrain_evo; target_name, device, verbosity, print_every_n=100);
# @time m_evo_df = fit_evotree(params_evo, dtrain_evo; target_name, device, verbosity, print_every_n=100);

# Each timed call is issued twice: the first run includes compilation.
@info "train - eval"
@time m_evo = fit_evotree(params_evo, dtrain_evo; target_name, deval=dtrain_evo, metric=metric_evo, device, verbosity, print_every_n=100);
@time m_evo = fit_evotree(params_evo, dtrain_evo; target_name, deval=dtrain_evo, metric=metric_evo, device, verbosity, print_every_n=100);
# @time m_evo = fit_evotree(params_evo, dtrain_evo; target_name, device);
# @btime fit_evotree($params_evo, $dtrain_evo; target_name, deval=dtrain_evo, metric=metric_evo, device, verbosity, print_every_n=100);
@info "predict"
@time pred_evo = m_evo(dtrain);
@btime m_evo($dtrain);

@info "EvoTrees GPU"
device = "gpu"
@info "train"
@time m_evo = fit_evotree(params_evo, dtrain_evo; target_name, deval=dtrain_evo, metric=metric_evo, device, verbosity, print_every_n=100);
@time m_evo = fit_evotree(params_evo, dtrain_evo; target_name, deval=dtrain_evo, metric=metric_evo, device, verbosity, print_every_n=100);
# @btime m_evo = fit_evotree($params_evo, $dtrain_evo; target_name, device);
# @btime fit_evotree($params_evo, $dtrain_evo; target_name, deval=dtrain_evo, metric=metric_evo, device, verbosity, print_every_n=100);
@info "predict"
@time pred_evo = m_evo(dtrain; device);
@btime m_evo($dtrain; device);


using MLJBase
using MLJModels
using Tables

# Load the EvoTreeRegressor model type through MLJ and build an instance with default settings.
EvoTreeBooster = @load EvoTreeRegressor
booster = EvoTreeBooster()

# Small synthetic problem — noted by the author as working.
X, y = make_regression(1000, 5)
mach = fit!(machine(booster, X, y))

# Larger problem — noted by the author as failing.
# NOTE(review): this may predate the rowtable support fix; confirm it still fails.
# X is overwritten at each step, so the final fit receives the column table.
X, y = make_regression(1_000_000, 100);
@time X = DataFrame(X);
@time X = Tables.rowtable(X);
@time X = Tables.columntable(X);

mach = fit!(machine(booster, X, y))

# Inspect the column names of the benchmark feature table.
schema = Tables.schema(dtrain)
schema.names
8 changes: 4 additions & 4 deletions benchmarks/regressor.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ using XGBoost
using EvoTrees
using BenchmarkTools
using Random: seed!
import CUDA
# import CUDA

### v.0.15.1
# desktop | 1e6 | depth 11 | cpu: 37.2s
# desktop | 10e6 | depth 11 | cpu

### perf depth
# desktop | 1e6 | depth 11 | cpu: 28s gpu: 73 sec | xgboost: 26s
# desktop | 10e6 | depth 11 | cpu 205s gpu: 109 sec | xgboost 260s
### v0.16.5
# desktop | 1e6 | depth 11 | cpu: 31s gpu: 50 sec | xgboost cpu: 26s
# desktop | 10e6 | depth 11 | cpu 200s gpu: 80 sec | xgboost cpu: 267s

#threads
# laptop depth 6: 12.717845 seconds (2.08 M allocations: 466.228 MiB)
Expand Down
1 change: 0 additions & 1 deletion experiments/MLJ.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ config = EvoTreeClassifier(
gamma = 0.0,
nbins = 32,
nrounds = 200,
device = "cpu"
)
model = fit_evotree(config; x_train, y_train);
model = fit_evotree(config; x_train, y_train, x_eval = x_train, y_eval = y_train, metric=:mlogloss, print_every_n=10, early_stopping_rounds=25);
Expand Down
129 changes: 0 additions & 129 deletions experiments/depth-debug.jl

This file was deleted.

Loading

0 comments on commit dbee39d

Please sign in to comment.