From 75c10ffe49812f521f07dfcf2b710c91e9dca218 Mon Sep 17 00:00:00 2001 From: Tom Haber Date: Mon, 22 Mar 2021 12:34:28 +0100 Subject: [PATCH 1/5] add types to optimize_embedding --- src/embeddings.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/embeddings.jl b/src/embeddings.jl index ea86fed..c085268 100644 --- a/src/embeddings.jl +++ b/src/embeddings.jl @@ -80,17 +80,17 @@ Optimize "query" samples with respect to "reference" samples. # Keyword Arguments - `move_ref::Bool = false`: if true, also improve the embeddings in `ref_embedding`, else fix them and only improve embeddings in `query_embedding`. """ -function optimize_embedding(graph, +function optimize_embedding(graph::SparseMatrixCSC{<:Real}, query_embedding, ref_embedding, - n_epochs, - initial_alpha, - min_dist, - spread, - gamma, - neg_sample_rate, - _a=nothing, - _b=nothing; + n_epochs::Integer, + initial_alpha::Real, + min_dist::Real, + spread::Real, + gamma::Real, + neg_sample_rate::Integer, + _a::Union{Real, Nothing} = nothing, + _b::Union{Real, Nothing} = nothing; move_ref::Bool=false) a, b = fit_ab(min_dist, spread, _a, _b) self_reference = query_embedding === ref_embedding From bb23c3ad8be55051f27b8cb15fc249db5fed7af6 Mon Sep 17 00:00:00 2001 From: Tom Haber Date: Mon, 22 Mar 2021 12:57:43 +0100 Subject: [PATCH 2/5] optimize_embedding takes matrices --- src/embeddings.jl | 38 ++++++++++++++++++++++---------------- src/umap_.jl | 8 +++++--- test/umap_tests.jl | 14 +++++--------- 3 files changed, 32 insertions(+), 28 deletions(-) diff --git a/src/embeddings.jl b/src/embeddings.jl index c085268..484c279 100644 --- a/src/embeddings.jl +++ b/src/embeddings.jl @@ -68,8 +68,8 @@ Optimize "query" samples with respect to "reference" samples. # Arguments - `graph`: a sparse matrix of shape (n_samples, n_samples) -- `query_embedding`: a vector of length (n_samples,) of vectors representing the embedded data points to be optimized ("query" samples) -- `ref_embedding`: a vector of length (n_samples,) of vectors representing the embedded data points to optimize against ("reference" samples) +- `query_embedding`: a matrix (n_components, n_samples) the embedded data points to be optimized ("query" samples) +- `ref_embedding`: a matrix (n_components, n_samples) the embedded data points to be optimized against ("reference" samples) - `n_epochs`: the number of training epochs for optimization - `initial_alpha`: the initial learning rate - `gamma`: the repulsive strength of negative samples @@ -80,9 +80,9 @@ Optimize "query" samples with respect to "reference" samples. # Keyword Arguments - `move_ref::Bool = false`: if true, also improve the embeddings in `ref_embedding`, else fix them and only improve embeddings in `query_embedding`. """ -function optimize_embedding(graph::SparseMatrixCSC{<:Real}, - query_embedding, - ref_embedding, +function optimize_embedding(graph::SparseMatrixCSC{T}, + query_embedding::AbstractMatrix{S}, + ref_embedding::AbstractMatrix{S}, n_epochs::Integer, initial_alpha::Real, min_dist::Real, @@ -91,7 +91,7 @@ function optimize_embedding(graph::SparseMatrixCSC{<:Real}, neg_sample_rate::Integer, _a::Union{Real, Nothing} = nothing, _b::Union{Real, Nothing} = nothing; - move_ref::Bool=false) + move_ref::Bool=false) where {T <: Real, S <: Real} a, b = fit_ab(min_dist, spread, _a, _b) self_reference = query_embedding === ref_embedding @@ -101,39 +101,45 @@ function optimize_embedding(graph::SparseMatrixCSC{<:Real}, for ind in nzrange(graph, i) j = rowvals(graph)[ind] p = nonzeros(graph)[ind] + if rand() <= p - sdist = evaluate(SqEuclidean(), query_embedding[i], ref_embedding[j]) + xi = view(query_embedding, :, i) + xj = view(ref_embedding, :, j) + + sdist = evaluate(SqEuclidean(), xi, xj) if sdist > 0 delta = (-2 * a * b * sdist^(b-1))/(1 + a*sdist^b) else delta = 0 end - @simd for d in eachindex(query_embedding[i]) - grad = clamp(delta * (query_embedding[i][d] - ref_embedding[j][d]), -4, 4) - query_embedding[i][d] += alpha * grad + @simd for d in eachindex(xi) + grad = clamp(delta * (xi[d] - xj[d]), -4, 4) + xi[d] += alpha * grad if move_ref - ref_embedding[j][d] -= alpha * grad + xj[d] -= alpha * grad end end for _ in 1:neg_sample_rate - k = rand(eachindex(ref_embedding)) + k = rand(1:size(ref_embedding, 2)) if i == k && self_reference continue end - sdist = evaluate(SqEuclidean(), query_embedding[i], ref_embedding[k]) + xk = view(ref_embedding, :, k) + + sdist = evaluate(SqEuclidean(), xi, xk) if sdist > 0 delta = (2 * gamma * b) / ((1//1000 + sdist)*(1 + a*sdist^b)) else delta = 0 end - @simd for d in eachindex(query_embedding[i]) + @simd for d in eachindex(xi) if delta > 0 - grad = clamp(delta * (query_embedding[i][d] - ref_embedding[k][d]), -4, 4) + grad = clamp(delta * (xi[d] - xk[d]), -4, 4) else grad = 4 end - query_embedding[i][d] += alpha * grad + xi[d] += alpha * grad end end diff --git a/src/umap_.jl b/src/umap_.jl index be745da..9baab3d 100644 --- a/src/umap_.jl +++ b/src/umap_.jl @@ -105,12 +105,13 @@ function UMAP_(X::AbstractMatrix{S}, graph = fuzzy_simplicial_set(knns, dists, n_neighbors, size(X, 2), local_connectivity, set_operation_ratio) embedding = initialize_embedding(graph, n_components, Val(init)) + embedding = hcat(embedding...) embedding = optimize_embedding(graph, embedding, embedding, n_epochs, learning_rate, min_dist, spread, repulsion_strength, neg_sample_rate, move_ref=true) # TODO: if target variable y is passed, then construct target graph # in the same manner and do a fuzzy simpl set intersection - return UMAP_(graph, hcat(embedding...), X, knns, dists) + return UMAP_(graph, embedding, X, knns, dists) end """ @@ -167,10 +168,11 @@ function transform(model::UMAP_, graph = fuzzy_simplicial_set(knns, dists, n_neighbors, size(model.data, 2), local_connectivity, set_operation_ratio, false) embedding = initialize_embedding(graph, model.embedding) - ref_embedding = collect(eachcol(model.embedding)) + embedding = hcat(embedding...) + ref_embedding = model.embedding embedding = optimize_embedding(graph, embedding, ref_embedding, n_epochs, learning_rate, min_dist, spread, repulsion_strength, neg_sample_rate, a, b, move_ref=false) - return reduce(hcat, embedding) + return embedding end diff --git a/test/umap_tests.jl b/test/umap_tests.jl index bbd647a..5d98d8e 100644 --- a/test/umap_tests.jl +++ b/test/umap_tests.jl @@ -104,17 +104,13 @@ gamma = 1. neg_sample_rate = 5 for graph in [graph1, graph2, graph3] - ref_embedding = collect(eachcol(rand(2, size(graph, 1)))) + ref_embedding = rand(2, size(graph, 1)) old_ref_embedding = deepcopy(ref_embedding) query_embedding = rand(2, size(graph, 2)) - query_embedding = [query_embedding[:, i] for i in 1:size(query_embedding, 2)] - res_embedding = optimize_embedding(graph, query_embedding, ref_embedding, n_epochs, initial_alpha, + res_embedding = optimize_embedding(graph, query_embedding, ref_embedding, n_epochs, initial_alpha, min_dist, spread, gamma, neg_sample_rate, move_ref=false) - @test res_embedding isa Array{Array{Float64, 1}, 1} - @test length(res_embedding) == length(query_embedding) - for i in 1:length(res_embedding) - @test length(res_embedding[i]) == length(query_embedding[i]) - end + @test res_embedding isa Array{Float64, 2} + @test size(res_embedding) == size(query_embedding) @test isapprox(old_ref_embedding, ref_embedding, atol=1e-4) end end @@ -176,7 +172,7 @@ model = UMAP_(model.graph, model.embedding, rand(5, 9), model.knns, model.dists) @test_throws ArgumentError transform(model, query; n_neighbors=3) # data size error end - + @testset "transform test" begin data = rand(5, 30) model = UMAP_(data, 2, n_neighbors=2, n_epochs=1) From 86d899aca59cb9798817a075d18b756d6db9956a Mon Sep 17 00:00:00 2001 From: Tom Haber Date: Mon, 22 Mar 2021 13:44:24 +0100 Subject: [PATCH 3/5] improve performance by removing a pow and branches --- src/embeddings.jl | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/embeddings.jl b/src/embeddings.jl index 484c279..2d041df 100644 --- a/src/embeddings.jl +++ b/src/embeddings.jl @@ -107,11 +107,9 @@ function optimize_embedding(graph::SparseMatrixCSC{T}, xj = view(ref_embedding, :, j) sdist = evaluate(SqEuclidean(), xi, xj) - if sdist > 0 - delta = (-2 * a * b * sdist^(b-1))/(1 + a*sdist^b) - else - delta = 0 - end + sdist = max(sdist, eps()) + delta = (-2 * a * b * sdist^b)/(sdist*(1 + a*sdist^b)) + @simd for d in eachindex(xi) grad = clamp(delta * (xi[d] - xj[d]), -4, 4) xi[d] += alpha * grad @@ -128,11 +126,9 @@ function optimize_embedding(graph::SparseMatrixCSC{T}, xk = view(ref_embedding, :, k) sdist = evaluate(SqEuclidean(), xi, xk) - if sdist > 0 - delta = (2 * gamma * b) / ((1//1000 + sdist)*(1 + a*sdist^b)) - else - delta = 0 - end + sdist = max(sdist, eps()) + delta = (2 * gamma * b) / ((1//1000 + sdist)*(1 + a*sdist^b)) + @simd for d in eachindex(xi) if delta > 0 grad = clamp(delta * (xi[d] - xk[d]), -4, 4) From 991504d4e1f825d2f31f23e038cb998ec076d0f9 Mon Sep 17 00:00:00 2001 From: Tom Haber Date: Mon, 22 Mar 2021 14:01:30 +0100 Subject: [PATCH 4/5] change returntype of initialize_embedding to matrix --- src/embeddings.jl | 8 +++----- src/umap_.jl | 2 -- test/umap_tests.jl | 18 ++++++------------ 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/src/embeddings.jl b/src/embeddings.jl index 2d041df..3c814b1 100644 --- a/src/embeddings.jl +++ b/src/embeddings.jl @@ -7,7 +7,6 @@ function initialize_embedding(graph::AbstractMatrix{T}, n_components, ::Val{:spe # expand expansion = 10 / maximum(embed) embed .= (embed .* expansion) .+ (1//10000) .* randn.(T) - embed = collect(eachcol(embed)) catch e @info "$e\nError encountered in spectral_layout; defaulting to random layout" embed = initialize_embedding(graph, n_components, Val(:random)) @@ -16,7 +15,7 @@ function initialize_embedding(graph::AbstractMatrix{T}, n_components, ::Val{:spe end function initialize_embedding(graph::AbstractMatrix{T}, n_components, ::Val{:random}) where {T} - return [20 .* rand(T, n_components) .- 10 for _ in 1:size(graph, 1)] + 20 .* rand(T, (n_components, size(graph, 1))) .- 10 end """ @@ -29,9 +28,8 @@ The resulting embedding will have shape `(size(ref_embedding, 1), size(graph, 2) is the number of components (dimensions) of the `reference embedding`, and `size(graph, 2)` is the number of samples in the resulting embedding. Its elements will have type T. """ -function initialize_embedding(graph::AbstractMatrix{<:Real}, ref_embedding::AbstractMatrix{T})::Vector{Vector{T}} where {T<:AbstractFloat} - embed = (ref_embedding * graph) ./ (sum(graph, dims=1) .+ eps(T)) - return Vector{T}[eachcol(embed)...] +function initialize_embedding(graph::AbstractMatrix{<:Real}, ref_embedding::AbstractMatrix{T})::Matrix{T} where {T<:AbstractFloat} + (ref_embedding * graph) ./ (sum(graph, dims=1) .+ eps(T)) end """ diff --git a/src/umap_.jl b/src/umap_.jl index 9baab3d..766daae 100644 --- a/src/umap_.jl +++ b/src/umap_.jl @@ -105,7 +105,6 @@ function UMAP_(X::AbstractMatrix{S}, graph = fuzzy_simplicial_set(knns, dists, n_neighbors, size(X, 2), local_connectivity, set_operation_ratio) embedding = initialize_embedding(graph, n_components, Val(init)) - embedding = hcat(embedding...) embedding = optimize_embedding(graph, embedding, embedding, n_epochs, learning_rate, min_dist, spread, repulsion_strength, neg_sample_rate, move_ref=true) # TODO: if target variable y is passed, then construct target graph @@ -168,7 +167,6 @@ function transform(model::UMAP_, graph = fuzzy_simplicial_set(knns, dists, n_neighbors, size(model.data, 2), local_connectivity, set_operation_ratio, false) embedding = initialize_embedding(graph, model.embedding) - embedding = hcat(embedding...) ref_embedding = model.embedding embedding = optimize_embedding(graph, embedding, ref_embedding, n_epochs, learning_rate, min_dist, spread, repulsion_strength, neg_sample_rate, a, b, move_ref=false) diff --git a/test/umap_tests.jl b/test/umap_tests.jl index 5d98d8e..5c68bc1 100644 --- a/test/umap_tests.jl +++ b/test/umap_tests.jl @@ -132,28 +132,22 @@ 3 6 8 8] ./10 ref_embedding = Float64[1 2 0; 0 2 -1] - actual = [[9, 1], [8, 2], [3, -6], [3, -6]] ./10 + actual = Float64[9 1; 8 2; 3 -6; 3 -6]' ./10 embedding = initialize_embedding(graph, ref_embedding) - @test embedding isa AbstractVector{<:AbstractVector{Float64}} - @test length(embedding) == length(actual) - for i in 1:length(embedding) - @test length(embedding[i]) == length(actual[i]) - end + @test embedding isa AbstractMatrix{Float64} + @test size(embedding) == size(actual) @test isapprox(embedding, actual, atol=1e-8) graph = Float16.(graph[:, [1,2]]) graph[:, end] .= 0 ref_embedding = Float16[1 2 0; 0 2 -1] - actual = Vector{Float16}[[9, 1], [0, 0]] ./10 + actual = Float16[9 1; 0 0]' ./10 embedding = initialize_embedding(graph, ref_embedding) - @test embedding isa AbstractVector{<:AbstractVector{Float16}} - @test length(embedding) == length(actual) - for i in 1:length(embedding) - @test length(embedding[i]) == length(actual[i]) - end + @test embedding isa AbstractMatrix{Float16} + @test size(embedding) == size(actual) @test isapprox(embedding, actual, atol=1e-2) end From 357319cbb601d9c03a9ff69c8d6267cfa6984e2f Mon Sep 17 00:00:00 2001 From: Tom Haber Date: Tue, 23 Mar 2021 12:40:55 +0100 Subject: [PATCH 5/5] optimize embedding in correct precision --- src/embeddings.jl | 8 ++++---- src/utils.jl | 20 ++++++++++++-------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/embeddings.jl b/src/embeddings.jl index 3c814b1..4e9110c 100644 --- a/src/embeddings.jl +++ b/src/embeddings.jl @@ -90,7 +90,7 @@ function optimize_embedding(graph::SparseMatrixCSC{T}, _a::Union{Real, Nothing} = nothing, _b::Union{Real, Nothing} = nothing; move_ref::Bool=false) where {T <: Real, S <: Real} - a, b = fit_ab(min_dist, spread, _a, _b) + a, b = fit_ab(convert(S,min_dist), convert(S, spread), _a, _b) self_reference = query_embedding === ref_embedding alpha = initial_alpha @@ -105,7 +105,7 @@ function optimize_embedding(graph::SparseMatrixCSC{T}, xj = view(ref_embedding, :, j) sdist = evaluate(SqEuclidean(), xi, xj) - sdist = max(sdist, eps()) + sdist = max(sdist, eps(S)) delta = (-2 * a * b * sdist^b)/(sdist*(1 + a*sdist^b)) @simd for d in eachindex(xi) @@ -124,8 +124,8 @@ function optimize_embedding(graph::SparseMatrixCSC{T}, xk = view(ref_embedding, :, k) sdist = evaluate(SqEuclidean(), xi, xk) - sdist = max(sdist, eps()) - delta = (2 * gamma * b) / ((1//1000 + sdist)*(1 + a*sdist^b)) + sdist = max(sdist, eps(S)) + delta = (2 * gamma * b) / ((S(1//1000) + sdist)*(1 + a*sdist^b)) @simd for d in eachindex(xi) if delta > 0 diff --git a/src/utils.jl b/src/utils.jl index d706faa..4b7d9b0 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -12,12 +12,16 @@ Find a smooth approximation to the membership function of points embedded in ℜ This fits a smooth curve that approximates an exponential decay offset by `min_dist`, returning the parameters `(a, b)`. """ -function fit_ab(min_dist, spread, ::Nothing, ::Nothing) - ψ(d) = d >= min_dist ? exp(-(d - min_dist)/spread) : 1. - xs = LinRange(0., spread*3, 300) +function fit_ab(min_dist::Real, spread::Real, ::Nothing, ::Nothing) + fit_ab(promote(min_dist, spread)..., nothing, nothing) +end + +function fit_ab(min_dist::S, spread::S, ::Nothing, ::Nothing) where {S <: Real} + ψ(d) = d >= min_dist ? exp(-(d - min_dist)/spread) : one(S) + xs = LinRange{S}(0., spread*3, 300) ys = map(ψ, xs) - @. curve(x, p) = (1. + p[1]*x^(2*p[2]))^(-1) - result = curve_fit(curve, xs, ys, [1., 1.], lower=[0., -Inf]) + @. curve(x, p) = (one(S) + p[1]*x^(2*p[2]))^(-1) + result = curve_fit(curve, xs, ys, S[1., 1.], lower=S[0., -Inf]) a, b = result.param return a, b end @@ -78,9 +82,9 @@ end knn_search(X, Q, k, metric, knns, dists) -> knns, dists Given a matrix `X` and a matrix `Q`, use the given metric to compute the `k` nearest neighbors out of the -columns of `X` from the queries (columns in `Q`). +columns of `X` from the queries (columns in `Q`). If the matrices are large, reconstruct the approximate nearest neighbors graph of `X` using the given `knns` and `dists`, -representing indices and distances of pairwise neighbors of `X`, and use this to search for approximate nearest +representing indices and distances of pairwise neighbors of `X`, and use this to search for approximate nearest neighbors of `Q`. If the matrices are small, search for exact nearest neighbors of `Q` by computing all pairwise distances with `X`. @@ -93,7 +97,7 @@ If the matrices are small, search for exact nearest neighbors of `Q` by computin - `knns`: `knns[j, i]` is the index of node i's jth nearest neighbor. - `dists`: `dists[j, i]` is the distance of node i's jth nearest neighbor. """ -function knn_search(X::AbstractMatrix, +function knn_search(X::AbstractMatrix, Q::AbstractMatrix, k::Integer, metric::SemiMetric,