Skip to content

Commit

Permalink
reproduce fig 18.6 szcf-weiya#207
Browse files Browse the repository at this point in the history
  • Loading branch information
szcf-weiya committed Sep 27, 2019
1 parent f83b921 commit e2d951d
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 1 deletion.
Binary file added code/fig18.5/err_and_dev_vs_log_lambda.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added code/fig18.5/err_vs_log_lambda.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
104 changes: 103 additions & 1 deletion code/fig18.5/script.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
using DelimitedFiles
using GLMNet

using StatsBase
using Plots
using LaTeXStrings
# import data
folder = "data/Leukemia/"
raw_X = readdlm(folder*"data_set_ALL_AML_train.txt", '\t')
Expand All @@ -18,3 +20,103 @@ train_y = hcat(vcat(fill(1, 27, 1), fill(0, 38-27, 1)),
grid = collect(range(exp(-8), exp(-1), length=100))
lasso_path = glmnet(train_X, train_y, Binomial(), lambda=grid)
elnet_path = glmnet(train_X, train_y, Binomial(), lambda=grid, alpha=0.8)

# calculate training error
lasso_train_val = predict(lasso_path, train_X)
elnet_train_val = predict(elnet_path, train_X)
lasso_train_yhat = predict(lasso_path, train_X) .> 0
elnet_train_yhat = predict(elnet_path, train_X) .> 0
lasso_train_err = [sum(lasso_train_yhat[:, i] .!= train_y[:, 2]) for i=1:size(lasso_train_yhat, 2)] / length(train_y[:, 2])
elnet_train_err = [sum(elnet_train_yhat[:, i] .!= train_y[:, 2]) for i=1:size(elnet_train_yhat, 2)] / length(train_y[:, 2])

# calculate deviance
function calc_dev(pred_val, grid)
dev = zeros(length(grid))
for i = 1:length(grid)
phat = 1 ./ (1 .+ exp.(pred_val[:, i]))
dev[i] = -sum( phat .* log.(phat) + (1 .- phat) .* log.(1 .- phat) )
end
return dev
end
lasso_train_dev = calc_dev(lasso_train_val, grid)

# #############################
# test
# #############################

# import test data
raw_test_X = readdlm(folder*"data_set_ALL_AML_independent.txt", '\t')
test_X = raw_test_X[:, 3:2:end]
test_id = Int.(test_X[1, :])
test_X = Array{Float64, 2}(test_X[2:end, :]')
lasso_test_val = predict(lasso_path, test_X)
elnet_test_val = predict(elnet_path, test_X)
lasso_yhat = predict(lasso_path, test_X) .> 0
elnet_yhat = predict(elnet_path, test_X) .> 0
# get from `Leuk_ALL_AML.test.cls` !!! strange thing!! it has 35 test samples, but actually it is only 34 in any other places.
# raw_test_y = readdlm(folder*"Leuk_ALL_AML.test.cls")
# test_y = raw_test_y[2,:]

# get from `table_ALL_AML_samples.txt`
test_y = vcat(fill(0, 49-39+1, 1), fill(1, 54-50+1, 1), fill(0, 56-55+1), fill(1, 58-57+1, 1), 0, fill(1, 66-60+1, 1), fill(0, 72-67+1, 1))
# sort according to the test_id
# test_y = test_y[sortperm(test_id)] !!
test_y = test_y[Int.(tiedrank(test_id))]
lasso_test_err = [sum(lasso_yhat[:, i] .!= test_y) for i=1:size(lasso_yhat, 2)] / length(test_y)
elnet_test_err = [sum(elnet_yhat[:, i] .!= test_y) for i=1:size(elnet_yhat, 2)] / length(test_y)

lasso_test_dev = calc_dev(lasso_test_val, grid)
# 10-fold on the train set
# cross validation (c.f. to https://github.com/szcf-weiya/ESL-CN/blob/99f1e9fd4b8c8c80fa0ff281f0e082cb810a54e0/code/LDA/diagonalLDA.jl#L99-L128)
# X is Nxp
function cv_err(X::Array{Float64, 2}, y::Array{Int, 2}, lambda::Array{Float64, 1}; nfold = 10)
N = size(X, 1)
folds = div_into_folds(N, K = nfold)
err = zeros(nfold, length(lambda))
dev = zeros(nfold, length(lambda))
for k = 1:nfold
test_idx = folds[k]
train_idx = setdiff(1:N, test_idx)
cvlasso = glmnet(X[train_idx, :], y[train_idx, :], Binomial(), lambda = lambda)
# calculate err
pred_val = predict(cvlasso, X[test_idx, :])
yhat = pred_val .> 0
err[k, :] = [sum(yhat[:, i] .!= y[test_idx, 2]) for i=1:length(lambda)]
dev[k, :] = calc_dev(pred_val, grid)
end
return [sum(err, dims=1) / N, sum(dev, dims=1)]
end

# 1:N divide into K-fold
function div_into_folds(N::Int; K::Int = 10)
# maximum quota per fold
n = Int(ceil(N/K))
# number folds for the maximum quota
k = N - (n-1)*K
# number fols for n-1 quota: K-k
folds = Array{Array{Int, 1}, 1}(undef, K)
for i = 1:k
folds[i] = collect(n*(i-1)+1:n*i)
end
for i = 1:K-k
folds[k+i] = collect((n-1)*(i-1)+1:(n-1)*i) .+ n*k
end
return folds
end

# 10-fold cv error
lasso_cv10_err, lasso_cv10_dev = cv_err(train_X, train_y, grid)
p1 = plot(log.(grid), lasso_cv10_err[:], color = "purple", label="10-fold CV", linewidth=3, legend=:topleft)
plot!(p1, log.(grid), lasso_train_err, color = "orange", label="Training", linewidth=3)
plot!(p1, log.(grid), lasso_test_err, color = "skyblue", label="Test", linewidth=3)
xlabel!(p1, L"\log \lambda")
ylabel!(p1, "Misclassification Error")
# savefig("err_vs_log_lambda.png")

p2 = plot(log.(grid), lasso_cv10_dev[:], color = "purple",linewidth=3, legend = :false, xlab = L"\log\lambda", ylab = "Deviance", fontsize=20)
plot!(p2, log.(grid), lasso_train_dev, color = "orange", linewidth=3)
plot!(p2, log.(grid), lasso_test_dev, color = "skyblue", linewidth=3)

plot(p1, p2, dpi=300)
# savefig("err_and_dev_vs_log_lambda.pdf")
savefig("err_and_dev_vs_log_lambda.png")

0 comments on commit e2d951d

Please sign in to comment.