Skip to content

Commit

Permalink
Feature importance
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremiedb committed Feb 5, 2020
1 parent 722ba59 commit 0db9ba7
Show file tree
Hide file tree
Showing 11 changed files with 51 additions and 6 deletions.
6 changes: 6 additions & 0 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -340,3 +340,9 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[[Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[[XGBoost]]
deps = ["BinaryProvider", "Libdl", "Printf", "Random", "SparseArrays", "Statistics", "Test"]
git-tree-sha1 = "b1299fe4d279e2a5ca67406a5eb640e5f71d19ce"
uuid = "009559a3-9522-5dbb-924b-0b6ed2b22bb9"
version = "0.4.2"
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
XGBoost = "009559a3-9522-5dbb-924b-0b6ed2b22bb9"

[compat]
CategoricalArrays = "0.7"
Expand Down
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ Currently supports:
- multiclassification (softmax)
- Gaussian (max likelihood)

Input features are expected to be a `Matrix{Float64}`. User-friendly format conversion is yet to be done.
Next priorities: histogram subtraction for improved performance, and GPU support.

## Installation

Latest:
Expand Down Expand Up @@ -162,3 +165,11 @@ params1 = EvoTreeGaussian(
max_depth = 6, min_weight = 1.0,
rowsample=0.5, colsample=1.0, seed=123)
```

## Feature importance

Returns the normalized gain contributed by each feature, as a vector of `name => gain` pairs sorted in decreasing order of gain.

```julia
features_gain = importance(model, var_names)
```
5 changes: 4 additions & 1 deletion experiments/benchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ num_round = 100
# by calling xgboost(data, num_round, label=label, training-parameters)
metrics = ["logloss"]
@time bst = xgboost(train_X, num_round, label = train_Y, eta = 0.1, max_depth = 3, metrics = metrics, silent=0, objective = "binary:logistic")
features_xgb = XGBoost.importance(bst)

X_train = Float64.(train_X)
Y_train = Float64.(train_Y)
Expand All @@ -113,7 +114,9 @@ params1 = EvoTreeRegressor(
nrounds=100,
λ = 0.0, γ=0.0, η=0.1,
max_depth = 4, min_weight = 1.0,
rowsample=0.5, colsample=0.5, nbins=250)
rowsample=1.0, colsample=1.0, nbins=250)

@time model = fit_evotree(params1, X_train, Y_train, print_every_n=50);
@time pred_train = EvoTrees.predict(model, X_train)
features_evo = importance(model, 1:size(X_train,2))
sort(collect(values(features_evo)))
1 change: 1 addition & 0 deletions experiments/random.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ params1.nrounds
@time model = fit_evotree(params1, X_train, Y_train);
@btime model = fit_evotree(params1, X_train, Y_train);
@time pred_train = predict(model, X_train)
@time gain = importance(model, 1:100)

# xgboost benchmark
using XGBoost
Expand Down
3 changes: 2 additions & 1 deletion src/EvoTrees.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module EvoTrees

export init_evotree, grow_evotree!, grow_tree, predict, fit_evotree,
EvoTreeRegressor, EvoTreeCount, EvoTreeClassifier, EvoTreeGaussian,
EvoTreeRModels
EvoTreeRModels, importance

using Statistics
using Base.Threads: @threads
Expand All @@ -22,6 +22,7 @@ include("eval.jl")
include("predict.jl")
include("find_split.jl")
include("fit.jl")
include("importance.jl")
include("MLJ.jl")

end # module
2 changes: 1 addition & 1 deletion src/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ function grow_tree(δ, δ², 𝑤,
train_nodes[leaf_count + 1] = TrainNode(node.depth + 1, best.∑δL, best.∑δ²L, best.∑𝑤L, best.gainL, left, node.𝑗)
train_nodes[leaf_count + 2] = TrainNode(node.depth + 1, best.∑δR, best.∑δ²R, best.∑𝑤R, best.gainR, right, node.𝑗)
# push split Node
push!(tree.nodes, TreeNode(leaf_count + 1, leaf_count + 2, best.feat, best.cond, L))
push!(tree.nodes, TreeNode(leaf_count + 1, leaf_count + 2, best.feat, best.cond, best.gain-node.gain, L))
push!(next_active_id, leaf_count + 1)
push!(next_active_id, leaf_count + 2)
leaf_count += 2
Expand Down
20 changes: 20 additions & 0 deletions src/importance.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# accumulate the split gain of each tree node into `gain`, indexed by feature id
function importance!(gain::AbstractVector, tree::Tree)
    @inbounds for node in tree.nodes
        # leaves carry no split gain — only internal (split) nodes contribute
        node.split && (gain[node.feat] += node.gain)
    end
end

"""
    importance(model::GBTree, vars::AbstractVector)

Return the importance of each feature in `model` as a vector of
`string(var) => gain` pairs sorted by decreasing gain, where `gain` is the
total split gain accumulated over all trees, normalized to sum to 1.
`vars` supplies one name per feature column.
"""
function importance(model::GBTree, vars::AbstractVector)
    gain = zeros(length(vars))
    for tree in model.trees
        importance!(gain, tree)
    end
    # normalize only when at least one split occurred, otherwise the
    # division by zero would fill the result with NaNs
    total = sum(gain)
    total > 0 && (gain ./= total)
    # build pairs directly: a Dict detour would silently collapse
    # duplicate feature names
    pairs = [string(var) => g for (var, g) in zip(vars, gain)]
    sort!(pairs, by = p -> p[2], rev = true)
    return pairs
end
1 change: 0 additions & 1 deletion src/predict.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ function predict!(pred, tree::Tree, X::AbstractMatrix{T}) where {T<:Real}
end
pred[i] += tree.nodes[id].pred
end
return pred
end

# prediction from single tree - assign each observation to its final leaf
Expand Down
5 changes: 3 additions & 2 deletions src/structs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ struct TreeNode{L, T<:AbstractFloat, S<:Int, B<:Bool}
right::S
feat::S
cond::T
gain::T
pred::SVector{L,T}
split::B
end

TreeNode(left::S, right::S, feat::S, cond::T, L::S) where {T<:AbstractFloat, S<:Int} = TreeNode{L,T,S,Bool}(left, right, feat, cond, zeros(SVector{L,T}), true)
TreeNode(pred::SVector{L,T}) where {L,T} = TreeNode(0, 0, 0, 0.0, pred, false)
# split-node constructor: records children ids, split feature/condition and gain; pred is unused (zeros)
TreeNode(left::S, right::S, feat::S, cond::T, gain::T, L::S) where {T<:AbstractFloat, S<:Int} = TreeNode{L,T,S,Bool}(left, right, feat, cond, gain, zeros(SVector{L,T}), true)
# leaf constructor: only the prediction vector matters; split fields are zeroed and `split` is false
TreeNode(pred::SVector{L,T}) where {L,T} = TreeNode(0, 0, 0, zero(T), zero(T), pred, false)

# single tree is made of a root node that containes nested nodes and leafs
struct TrainNode{L, T<:AbstractFloat, S<:Int}
Expand Down
2 changes: 2 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,5 @@ params1 = EvoTreeGaussian(
rowsample=0.5, colsample=1.0, seed = seed)
@time model = fit_evotree(params1, X_train, Y_train, X_eval = X_eval, Y_eval = Y_eval, print_every_n = 25)
@time pred_train_gaussian = predict(model, X_train)

features_gain = importance(model, 1:1)

0 comments on commit 0db9ba7

Please sign in to comment.