Skip to content

Commit

Permalink
t-SNE: Add multiscale option
Browse files Browse the repository at this point in the history
  • Loading branch information
pavlin-policar authored and VesnaT committed Dec 12, 2018
1 parent 54b9ff4 commit 5bf1fb9
Showing 1 changed file with 124 additions and 33 deletions.
157 changes: 124 additions & 33 deletions Orange/projection/manifold.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import logging
import warnings
from collections import Iterable

import numpy as np
import fastTSNE
import scipy.sparse as sp
from scipy.linalg import eigh as lapack_eigh
from scipy.sparse.linalg import eigsh as arpack_eigh
import sklearn.manifold as skl_manifold

import fastTSNE
import fastTSNE.affinity
import fastTSNE.initialization

import Orange
from Orange.data import Table, Domain, ContinuousVariable
from Orange.distance import Distance, DistanceModel, Euclidean
Expand Down Expand Up @@ -187,7 +191,7 @@ def __init__(self, n_components=2, affinity='nearest_neighbors', gamma=None,


class TSNEModel(Projection):
"""A tSNE embedding object. Supports further optimization as well as
"""A t-SNE embedding object. Supports further optimization as well as
adding new data into the existing embedding.
Attributes
Expand Down Expand Up @@ -219,8 +223,8 @@ def proj_variable(i):
def transform(self, X: np.ndarray, **kwargs) -> "fastTSNE.PartialTSNEEmbedding":
    """Embed new samples into the existing t-SNE embedding.

    Parameters
    ----------
    X : np.ndarray
        Dense data matrix holding the new samples.
    **kwargs
        Forwarded unchanged to ``self.embedding_.transform``.

    Returns
    -------
    fastTSNE.PartialTSNEEmbedding
        The result of ``self.embedding_.transform(X, **kwargs)``.

    Raises
    ------
    TypeError
        If ``X`` is a scipy sparse matrix.
    """
    # Sparse data are not supported; fail early with an actionable message.
    if sp.issparse(X):
        raise TypeError(
            "A sparse matrix was passed, but dense data is required. Use "
            "X.toarray() to convert to a dense numpy array."
        )
    return self.embedding_.transform(X, **kwargs)

Expand All @@ -234,8 +238,8 @@ def __call__(self, data: Table, **kwargs) -> Table:

def optimize(self, n_iter, inplace=False, propagate_exception=False, **kwargs):
"""Resume optimization for the current embedding."""
kwargs = {'n_iter': n_iter, 'inplace': inplace,
'propagate_exception': propagate_exception, **kwargs}
kwargs = {"n_iter": n_iter, "inplace": inplace,
"propagate_exception": propagate_exception, **kwargs}
if inplace:
self.embedding_.optimize(**kwargs)
return self
Expand All @@ -256,8 +260,11 @@ class TSNE(Projector):
The number of dimensions that the embedding should contain. Note that
only up to two dimensions are supported as otherwise the process can
become prohibitively expensive.
perplexity : float
The desired perplexity of the probability distribution.
perplexity : Union[float, List[float]]
The desired perplexity of the probability distribution. If using
`multiscale` option, this must be a list of perplexities. The most
typical multiscale case consists of a small perplexity ~50 and a higher
perplexity on the order ~N/50.
learning_rate : float
The learning rate for t-SNE. Typical values range from 1 to 1000.
Setting the learning rate too high will result in the crowding problem
Expand Down Expand Up @@ -316,6 +323,12 @@ class TSNE(Projector):
BH tends to be faster for smaller data sets but scales as O(n log n)
while FItSNE is faster for larger data sets and scales linearly in the
number of points.
multiscale : bool
If this option is set to true, the multiscale version of t-SNE will be
run. Please note that this can take a substantially longer time than
the default t-SNE. Also note that if this option is set, `perplexity`
must be a list of perplexities to use, as opposed to the single
perplexity value otherwise.
callbacks : Callable[[int, float, np.ndarray] -> bool]
The callback should accept three parameters, the first is the current
iteration, the second is the current KL divergence error and the last
Expand All @@ -335,39 +348,117 @@ class TSNE(Projector):
preprocessors
"""
name = 't-SNE'
name = "t-SNE"
preprocessors = [
Orange.preprocess.Continuize(),
Orange.preprocess.SklImpute(),
]

def __init__(
    self,
    n_components=2,
    perplexity=30,
    learning_rate=200,
    early_exaggeration_iter=250,
    early_exaggeration=12,
    n_iter=750,
    exaggeration=None,
    theta=0.5,
    min_num_intervals=10,
    ints_in_interval=1,
    initialization="random",
    metric="euclidean",
    n_jobs=1,
    neighbors="exact",
    negative_gradient_method="bh",
    multiscale=False,
    callbacks=None,
    callbacks_every_iters=50,
    random_state=None,
    preprocessors=None,
):
    """Store the t-SNE hyper-parameters on the projector.

    Every argument except ``preprocessors`` is kept verbatim as an
    attribute of the same name; no embedding object is created here.
    ``preprocessors`` is handed to the base-class initializer. The meaning
    of the individual parameters is described in the class docstring.
    """
    super().__init__(preprocessors=preprocessors)

    # Embedding shape and affinity configuration
    self.n_components = n_components
    self.perplexity = perplexity
    self.metric = metric
    self.neighbors = neighbors
    self.multiscale = multiscale
    self.initialization = initialization

    # Optimization schedule
    self.learning_rate = learning_rate
    self.early_exaggeration = early_exaggeration
    self.early_exaggeration_iter = early_exaggeration_iter
    self.n_iter = n_iter
    self.exaggeration = exaggeration

    # Gradient approximation settings
    self.theta = theta
    self.min_num_intervals = min_num_intervals
    self.ints_in_interval = ints_in_interval
    self.negative_gradient_method = negative_gradient_method

    # Execution / bookkeeping
    self.n_jobs = n_jobs
    self.callbacks = callbacks
    self.callbacks_every_iters = callbacks_every_iters
    self.random_state = random_state

def fit(self, X: np.ndarray, Y: np.ndarray = None) -> "fastTSNE.TSNEEmbedding":
    """Compute a t-SNE embedding of ``X``.

    The method builds the affinities (multiscale or single-perplexity),
    creates the initial embedding, and then runs two optimization phases:
    an early-exaggeration phase followed by the regular phase.

    Parameters
    ----------
    X : np.ndarray
        Dense data matrix.
    Y : np.ndarray, optional
        Accepted for interface compatibility; not used by this method.

    Returns
    -------
    fastTSNE.TSNEEmbedding
        The optimized embedding.

    Raises
    ------
    TypeError
        If ``X`` is a scipy sparse matrix.
    ValueError
        If ``perplexity`` is not iterable while ``multiscale`` is set, if
        it is iterable while ``multiscale`` is not set, or if
        ``initialization`` is neither ``"pca"``, ``"random"`` nor a numpy
        array.
    """
    # The `collections.Iterable` alias was removed in Python 3.10; the ABC
    # lives in `collections.abc`. Imported locally so this method does not
    # depend on the module-level alias.
    from collections.abc import Iterable

    # Sparse data are not supported
    if sp.issparse(X):
        raise TypeError(
            "A sparse matrix was passed, but dense data is required. Use "
            "X.toarray() to convert to a dense numpy array."
        )

    # Build up the affinity matrix, using multiscale if needed
    if self.multiscale:
        # The local perplexity should be on the order ~50 while the higher
        # perplexity should be on the order ~N/50
        if not isinstance(self.perplexity, Iterable):
            raise ValueError(
                "Perplexity should be an instance of `Iterable`, `%s` "
                "given." % type(self.perplexity).__name__)
        affinities = fastTSNE.affinity.Multiscale(
            X, perplexities=self.perplexity, metric=self.metric,
            method=self.neighbors, random_state=self.random_state,
            n_jobs=self.n_jobs)
    else:
        if isinstance(self.perplexity, Iterable):
            raise ValueError(
                "Perplexity should be an instance of `float`, `%s` "
                "given." % type(self.perplexity).__name__)
        affinities = fastTSNE.affinity.PerplexityBasedNN(
            X, perplexity=self.perplexity, metric=self.metric,
            method=self.neighbors, random_state=self.random_state,
            n_jobs=self.n_jobs)

    # Create an initial embedding. The ndarray check must come first:
    # comparing an array with a string would not yield a plain bool.
    if isinstance(self.initialization, np.ndarray):
        initialization = self.initialization
    elif self.initialization == "pca":
        initialization = fastTSNE.initialization.pca(
            X, self.n_components, random_state=self.random_state)
    elif self.initialization == "random":
        initialization = fastTSNE.initialization.random(
            X.shape[0], self.n_components, random_state=self.random_state)
    else:
        raise ValueError(
            "Invalid initialization `%s`. Please use either `pca` or "
            "`random` or provide a numpy array." % self.initialization)

    embedding = fastTSNE.TSNEEmbedding(
        initialization, affinities, learning_rate=self.learning_rate,
        theta=self.theta, min_num_intervals=self.min_num_intervals,
        ints_in_interval=self.ints_in_interval, n_jobs=self.n_jobs,
        negative_gradient_method=self.negative_gradient_method,
        callbacks=self.callbacks,
        callbacks_every_iters=self.callbacks_every_iters,
    )

    # Run standard t-SNE optimization: early exaggeration phase first
    # (momentum 0.5), then the regular phase (momentum 0.8). Exceptions are
    # propagated to the caller rather than swallowed by the optimizer.
    embedding.optimize(
        n_iter=self.early_exaggeration_iter,
        exaggeration=self.early_exaggeration,
        inplace=True, momentum=0.5, propagate_exception=True,
    )
    embedding.optimize(
        n_iter=self.n_iter, exaggeration=self.exaggeration,
        inplace=True, momentum=0.8, propagate_exception=True,
    )

    return embedding

def __call__(self, data: Table) -> TSNEModel:
# Preprocess the data - convert discrete to continuous
Expand All @@ -378,9 +469,9 @@ def __call__(self, data: Table) -> TSNEModel:

# The results should be accessible in an Orange table, which doesn't
# need the full embedding attributes and is cast into a regular array
n = self.tsne.n_components
postfixes = ['x', 'y'] if n == 2 else list(range(1, n + 1))
tsne_cols = [ContinuousVariable(f't-SNE-{p}') for p in postfixes]
n = self.n_components
postfixes = ["x", "y"] if n == 2 else list(range(1, n + 1))
tsne_cols = [ContinuousVariable(f"t-SNE-{p}") for p in postfixes]
embedding_domain = Domain(tsne_cols, data.domain.class_vars, data.domain.metas)
embedding_table = Table(embedding_domain, embedding.view(np.ndarray), data.Y, data.metas)

Expand Down

0 comments on commit 5bf1fb9

Please sign in to comment.