diff --git a/Project.toml b/Project.toml
index b97067d..aaea5f1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,12 +1,13 @@
 name = "JLBoost"
 uuid = "13d6d4a1-5e7f-472c-9ebc-8123a4fbb95f"
 authors = ["Dai ZJ "]
-version = "0.1.2"
+version = "0.1.3"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
+MLJJLBoost = "cb937e20-20f2-4cea-8a28-54eef8bab285"
 MappedArrays = "dbb5928d-eab1-5f90-85c2-b9b0edb7c900"
 Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
@@ -27,6 +28,7 @@ Tables = "0.2"
 julia = "1"
 
 [extras]
+MLJJLBoost = "cb937e20-20f2-4cea-8a28-54eef8bab285"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Weave = "44d3d7a6-8a23-5bf8-98c5-b353f8df5ec9"
diff --git a/README.jmd b/README.jmd
index 08667fb..151aedd 100644
--- a/README.jmd
+++ b/README.jmd
@@ -79,7 +79,7 @@ new_tree = 0.3 * trees(xgtreemodel)[1] # weight the first tree by 30%
 unique(predict(new_tree, iris) ./ predict(trees(xgtreemodel)[1], iris)) # 0.3
 ```
 
-#### MLJ.jl &
+#### MLJ.jl
 
 There is integration with the MLJ.jl modelling framework
 
@@ -105,7 +105,7 @@ One can obtain the feature importance using the `feature_importance` function
 feature_importance(xgtreemodel, iris)
 ```
 
-#### Tables.jl integrations
+#### Tables.jl integration
 
 Any Tables.jl compatible tabular data structure. So you can use any column accessible table with JLBoost. However, you are advised to define the following methods for `df` as the generic implementation in this package may not be efficient
 
diff --git a/README.md b/README.md
index 574a070..1209d00 100644
--- a/README.md
+++ b/README.md
@@ -1,35 +1,222 @@
-# MLJJLBoost
+# JLBoost.jl
+
+This is a 100%-Julia implementation of Gradient Boosting Regression Trees (GBRT), based heavily on the algorithms published in the XGBoost, LightGBM and CatBoost papers. GBRT is also referred to as Gradient Boosting Decision Tree (GBDT).
+
+## Limitations for now
+* Currently, the `Union{T, Missing}` feature type is not supported; support is *planned*.
+* Currently, only single-valued models are supported; multivariate-target model support is *planned*.
+* Currently, only numeric and boolean features are supported; categorical feature support is *planned*.
+
+## Objectives
+* A full-featured & batteries-included Gradient Boosting Regression Tree library
+* Play nicely with the Julia ecosystem, e.g. Tables.jl, DataFrames.jl and CategoricalArrays.jl
+* 100%-Julia
+* Fit models on large data
+* Easy to manipulate the tree after fitting; play with tree pruning and adjustments
+* "Easy" to deploy
+
+## Quick-start
+
+### Fit model on `DataFrame`
+
+#### Binary Classification
+We fit a model to predict whether an iris belongs to the species setosa. To fit a model on a `DataFrame`, you only need to specify the target column; the features default to all columns other than the target.
 ````julia
-using RDatasets
+using JLBoost, RDatasets
 iris = dataset("datasets", "iris")
 
-iris[!, :is_setosa] .= iris.Species .== "setosa"
+iris[!, :is_setosa] = iris[!, :Species] .== "setosa"
+target = :is_setosa
+
+features = setdiff(names(iris), [:Species, :is_setosa])
+
+# fit one tree
+# ?jlboost for more details
+xgtreemodel = jlboost(iris, target)
+````
+
+
+````
+JLBoost.JLBoostTrees.JLBoostTreeModel(JLBoost.JLBoostTrees.JLBoostTree[
+  -- PetalLength <= 1.9
+    ---- weight = 2.0
+
+  -- PetalLength > 1.9
+    ---- weight = -2.0
+], JLBoost.LogitLogLoss(), :is_setosa)
+````
+
+
+
+
+The returned model contains a vector of trees, the loss function, and the target.
+
+````julia
+typeof(trees(xgtreemodel))
+````
+
+
+````
+Array{JLBoost.JLBoostTrees.JLBoostTree,1}
+````
+
+
+
+````julia
+typeof(xgtreemodel.loss)
+````
+
+
+````
+JLBoost.LogitLogLoss
+````
+
+
+
+````julia
+typeof(xgtreemodel.target)
+````
+
+
+````
+Symbol
+````
+
+
+
+
+You can control parameters like `max_depth` and `nrounds`.
+
+````julia
+xgtreemodel2 = jlboost(iris, target; nrounds = 2, max_depth = 2)
+````
+
+
+````
+JLBoost.JLBoostTrees.JLBoostTreeModel(JLBoost.JLBoostTrees.JLBoostTree[
+  -- PetalLength <= 1.9
+    ---- weight = 2.0
+
+  -- PetalLength > 1.9
+    ---- weight = -2.0
+,
+  -- PetalLength <= 1.9
+    -- SepalLength <= 4.8
+      ---- weight = 1.1353352832366135
+
+    -- SepalLength > 4.8
+      ---- weight = 1.1353352832366155
+
+  -- PetalLength > 1.9
+    -- SepalLength <= 7.9
+      ---- weight = -1.1353352832366106
+
+    -- SepalLength > 7.9
+      ---- weight = -1.1353352832366106
+], JLBoost.LogitLogLoss(), :is_setosa)
+````
+
+
+
+
+A convenience `predict` function is provided. It can be used to score a single tree or a vector of trees.
+
+````julia
+iris.pred1 = predict(xgtreemodel, iris)
+iris.pred2 = predict(xgtreemodel2, iris)
+iris.pred1_plus_2 = predict(vcat(xgtreemodel, xgtreemodel2), iris)
+````
+
+
+````
+150-element Array{Float64,1}:
+  5.135335283236616
+  5.135335283236616
+  5.135335283236613
+  5.135335283236613
+  5.135335283236616
+  5.135335283236616
+  5.135335283236613
+  5.135335283236616
+  5.135335283236613
+  5.135335283236616
+  ⋮
+ -5.135335283236611
+ -5.135335283236611
+ -5.135335283236611
+ -5.135335283236611
+ -5.135335283236611
+ -5.135335283236611
+ -5.135335283236611
+ -5.135335283236611
+ -5.135335283236611
+````
+
+
+
+
+There are also convenience functions for computing the AUC and gini.
+
+````julia
+AUC(-iris.pred1, iris.is_setosa)
+````
+
+
+````
+0.6666666666666667
+````
+
+
+
+````julia
+gini(-iris.pred1, iris.is_setosa)
+````
+
+
+````
+0.3333333333333335
+````
+
+
+
+
+As a convenience feature, you can adjust the `eta` weight of each tree by multiplying it by a factor, e.g.
+
+```julia
+new_tree = 0.3 * trees(xgtreemodel)[1] # weight the first tree by 30%
+unique(predict(new_tree, iris) ./ predict(trees(xgtreemodel)[1], iris)) # 0.3
+```
+
+#### MLJ.jl
+
+There is integration with the MLJ.jl modelling framework.
+
 ````julia
 using MLJ, MLJBase, MLJJLBoost
 X, y = unpack(iris, x->!(x in [:is_setosa, :Species]), ==(:is_setosa))
-using MLJJLBoost:JLBoostClassifier
 model = JLBoostClassifier()
 ````
 
 
 ````
-JLBoostClassifier(loss = JLBoost.LogitLogLoss(),
-                  nrounds = 1,
-                  subsample = 1,
-                  eta = 1,
-                  max_depth = 6,
-                  min_child_weight = 1,
-                  lambda = 0,
-                  gamma = 0,
-                  colsample_bytree = 1,) @ 2…77
+MLJJLBoost.JLBoostClassifier(loss = JLBoost.LogitLogLoss(),
+                             nrounds = 1,
+                             subsample = 1,
+                             eta = 1,
+                             max_depth = 6,
+                             min_child_weight = 1,
+                             lambda = 0,
+                             gamma = 0,
+                             colsample_bytree = 1,) @ 1…40
 ````
-
-
-Fit the model
 ````julia
 mljmodel = fit(model, 1, X, y)
 ````
@@ -40,18 +227,25 @@ Choosing a split on SepalLength
 Choosing a split on SepalWidth
 Choosing a split on PetalLength
 Choosing a split on PetalWidth
+Choosing a split on pred1
+Choosing a split on pred2
+Choosing a split on pred1_plus_2
 (feature = :PetalLength, split_at = 1.9, cutpt = 50, gain = 133.33333333333
 334, lweight = 2.0, rweight = -2.0)
 Choosing a split on SepalLength
 Choosing a split on SepalWidth
 Choosing a split on PetalLength
 Choosing a split on PetalWidth
+Choosing a split on pred1
+Choosing a split on pred2
+Choosing a split on pred1_plus_2
 Choosing a split on SepalLength
 Choosing a split on SepalWidth
 Choosing a split on PetalLength
 Choosing a split on PetalWidth
-meh
-got here
+Choosing a split on pred1
+Choosing a split on pred2
+Choosing a split on pred1_plus_2
 (fitresult = JLBoost.JLBoostTrees.JLBoostTreeModel(JLBoost.JLBoostTrees.JLB
 oostTree[
  -- PetalLength <= 1.9
@@ -61,15 +255,16 @@ oostTree[
 ---- weight = 2.0
 
  -- PetalLength > 1.9
 ---- weight = -2.0
 ], JLBoost.LogitLogLoss(), :__y__),
 cache = nothing,
- report = 0.16666666666666669,)
+ report = (AUC = 0.16666666666666669,
+ feature_importance = 1×4 DataFrames.DataFrame
+│ Row │ feature     │ Quality_Gain │ Coverage │ Frequency │
+│     │ Symbol      │ Float64      │ Float64  │ Float64   │
+├─────┼─────────────┼──────────────┼──────────┼───────────┤
+│ 1   │ PetalLength │ 1.0          │ 1.0      │ 1.0       │,),)
 ````
-
-
-Predicting using the model
-
 ````julia
 predict(model, mljmodel.fitresult, X)
 ````
@@ -107,12 +302,12 @@ One can obtain the feature importance using the `feature_importance` function
 
 ````julia
-feature_importance(mljmodel.fitresult, X, y)
+feature_importance(xgtreemodel, iris)
 ````
 
 
 ````
-1×4 DataFrame
+1×4 DataFrames.DataFrame
 │ Row │ feature     │ Quality_Gain │ Coverage │ Frequency │
 │     │ Symbol      │ Float64      │ Float64  │ Float64   │
 ├─────┼─────────────┼──────────────┼──────────┼───────────┤
 │ 1   │ PetalLength │ 1.0          │ 1.0      │ 1.0       │
 ````
@@ -120,3 +315,137 @@
 
 
 
+#### Tables.jl integration
+
+JLBoost works with any Tables.jl-compatible tabular data structure, so you can use any column-accessible table. However, you are advised to define the following methods for `df`, as the generic implementations in this package may not be efficient.
+
+````julia
+
+nrow(df) # returns the number of rows
+ncol(df)
+view(df, rows, cols)
+````
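+
+For illustration only, here is a minimal sketch of what such definitions might look like for a hypothetical column-table type (the `MyColumnTable` wrapper and its field names are assumptions made for this example; they are not part of JLBoost):
+
+````julia
+using Tables
+import DataFrames: nrow, ncol  # extend the nrow/ncol generic functions
+import Base: view
+
+# hypothetical wrapper around a NamedTuple of equal-length column vectors
+struct MyColumnTable{T<:NamedTuple}
+    cols::T
+end
+
+# make the wrapper Tables.jl-compatible via column access
+Tables.istable(::Type{<:MyColumnTable}) = true
+Tables.columnaccess(::Type{<:MyColumnTable}) = true
+Tables.columns(t::MyColumnTable) = t.cols
+
+nrow(t::MyColumnTable) = length(first(t.cols))  # number of rows
+ncol(t::MyColumnTable) = length(t.cols)         # number of columns
+
+# a row/column view that avoids copying the underlying data
+view(t::MyColumnTable, rows, cols) =
+    MyColumnTable(NamedTuple{Tuple(cols)}(map(c -> view(t.cols[c], rows), Tuple(cols))))
+````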
+
+
+#### Regression Model
+By default `JLBoost.jl` defines its own `LogitLogLoss` type for binary classification problems. You may replace the `loss` function with any `SupervisedLoss` subtype from `LossFunctions.jl`, e.g. for regression models you can choose the least-squares loss, `L2DistLoss()`.
+
+````julia
+using DataFrames
+using JLBoost
+df = DataFrame(x = rand(100) * 100)
+
+df[!, :y] = 2*df.x .+ rand(100)
+
+target = :y
+features = [:x]
+warm_start = fill(0.0, nrow(df))
+
+
+using LossFunctions: L2DistLoss
+loss = L2DistLoss()
+jlboost(df, target, features, warm_start, loss; max_depth=2) # default max_depth = 6
+````
+
+
+````
+JLBoost.JLBoostTrees.JLBoostTreeModel(JLBoost.JLBoostTrees.JLBoostTree[
+  -- x <= 48.46506111985791
+    -- x <= 22.538674324075213
+      ---- weight = 24.909767874932513
+
+    -- x > 22.538674324075213
+      ---- weight = 70.22774285333539
+
+  -- x > 48.46506111985791
+    -- x <= 74.58743339436928
+      ---- weight = 126.45279819799187
+
+    -- x > 74.58743339436928
+      ---- weight = 177.0348452615354
+], LossFunctions.LPDistLoss{2}(), :y)
+````
+
+
+
+
+### Save & Load models
+You can save models using `JLBoost.save` and load them with the `load` function.
+
+````julia
+JLBoost.save(xgtreemodel, "model.jlb")
+JLBoost.save(trees(xgtreemodel), "model_tree.jlb")
+````
+
+
+````julia
+JLBoost.load("model.jlb")
+JLBoost.load("model_tree.jlb")
+````
+
+
+````
+Tree 1
+
+  -- PetalLength <= 1.9
+    ---- weight = 2.0
+
+  -- PetalLength > 1.9
+    ---- weight = -2.0
+````
+
+
+
+
+### Fit model on `JDF.JDFFile` - enabling larger-than-RAM model fit
+Sometimes, you may want to fit a model on a dataset that is too large to fit into RAM. You can convert the dataset to [JDF format](https://github.com/xiaodaigh/JDF.jl) and then use the `JDF.JDFFile` functionality to fit the model. The `jlboost` interface for `DataFrame` and `JDFFile` is the same.
+
+The key advantage of fitting a model using `JDF.JDFFile` is that not all of the data needs to be loaded into memory, because `JDF` can load the columns one at a time. Hence this enables larger models to be trained on a single computer.
+
+````julia
+using JLBoost, RDatasets, JDF
+iris = dataset("datasets", "iris")
+
+iris[!, :is_setosa] = iris[!, :Species] .== "setosa"
+target = :is_setosa
+
+features = setdiff(names(iris), [:Species, :is_setosa])
+
+savejdf("iris.jdf", iris)
+irisdisk = JDFFile("iris.jdf")
+
+# fit using the on-disk JDF format
+xgtree1 = jlboost(irisdisk, target, features)
+xgtree2 = jlboost(iris, target, features; nrounds = 2, max_depth = 2)
+
+# predict using the on-disk JDF format
+iris.pred1 = predict(xgtree1, irisdisk)
+iris.pred2 = predict(xgtree2, irisdisk)
+
+# AUC
+AUC(-predict(xgtree1, irisdisk), irisdisk[!, :is_setosa])
+
+# gini
+gini(-predict(xgtree1, irisdisk), irisdisk[!, :is_setosa])
+
+# clean up
+rm("iris.jdf", force=true, recursive=true)
+````
+
+
+
+
+## Notes
+
+JLBoost currently has a CPU implementation of the `xgboost` binary boosting algorithm as described in the original paper, and I am working towards covering the full set of algorithms in that paper. I also want to implement the algorithms mentioned in the LightGBM and CatBoost papers and to port them to GPUs.
+
+There is a similar project called [JuML.jl](https://github.com/Statfactory/JuML.jl).
diff --git a/src/best_split.jl b/src/best_split.jl
index 2cebfe2..9929ae9 100644
--- a/src/best_split.jl
+++ b/src/best_split.jl
@@ -6,6 +6,14 @@ using MappedArrays: mappedarray
 
 export best_split
 
+"""
+    best_split(loss, df::DataFrameLike, feature, target, warmstart, lambda, gamma)
+
+Find the best (binary) split point by optimizing ∑ loss(warmstart + δx, target) using an order-2 Taylor series expansion.
+
+Does not assume that the feature, target, and warmstart columns are sorted; they will be sorted for you.
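+
+An illustrative call (a sketch only: it assumes `df` is a Tables.jl-compatible table with a numeric feature `:x` and a target `:y`, a zero warm start, and `lambda = gamma = 0`):
+
+    best_split(LogitLogLoss(), df, :x, :y, fill(0.0, nrow(df)), 0, 0)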
+""" + function best_split(loss, df, feature::Symbol, target::Symbol, warmstart::AbstractVector, lambda, gamma; verbose = false, kwargs...) @assert Tables.istable(df) diff --git a/src/fit_tree.jl b/src/fit_tree.jl index 2c86fab..9d818ac 100644 --- a/src/fit_tree.jl +++ b/src/fit_tree.jl @@ -16,6 +16,9 @@ function _fit_tree!(loss, df, target, features, warm_start, feature_choice_strat @assert colsample_bylevel <= 1 && colsample_bylevel > 0 @assert Tables.istable(df) + # make absolutely sure that target is not part of it + features = setdiff(features, [target]) + # compute the gain for all splits for all features split_with_best_gain = best_split(loss, df, features[1], target, warm_start, lambda, gamma; verbose=verbose, kwargs...)