Commit bd61c82

accelerated PCA, kNN. readded cpu code

LouisFaure committed Feb 28, 2021
1 parent 0dc22ea commit bd61c82

Showing 3 changed files with 137 additions and 84 deletions.
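In short: PCA, kNN search, MNN construction, and the force-directed layout now dispatch on a `device` argument ("cpu" or "gpu") instead of being hard-wired to RAPIDS, restoring the CPU code path. A minimal usage sketch, not part of the commit — the toy inputs and sizes are assumptions:

import numpy as np
import pandas as pd
import harmony

# Toy cells-x-genes matrix spanning two timepoints
data_df = pd.DataFrame(np.random.rand(100, 20),
                       index=[f"cell{i}" for i in range(100)])
timepoints = pd.Series(["t1"] * 50 + ["t2"] * 50, index=data_df.index)
timepoint_connections = pd.DataFrame([["t1", "t2"]])  # one t1 -> t2 link

# CPU path: scanpy kNN, sklearn PCA/regression
aff, aff_nn = harmony.core.augmented_affinity_matrix(
    data_df, timepoints, timepoint_connections,
    n_neighbors=30, pc_components=None, device="cpu")

# GPU path: the same call, dispatched to cuML/CuPy (requires RAPIDS)
# aff, aff_nn = harmony.core.augmented_affinity_matrix(
#     data_df, timepoints, timepoint_connections, device="gpu")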
120 changes: 72 additions & 48 deletions src/harmony/core.py
@@ -1,13 +1,8 @@
 import pandas as pd
 import numpy as np
-import cupy as cp
 import scanpy as sc

-from cupyx.scipy.sparse import find, csr_matrix
-from scipy.sparse import find as find_cpu
-from scipy.sparse import csr_matrix as csr_matrix_cpu
-from cuml import NearestNeighbors
-from cuml import LinearRegression
+from scipy.sparse import find, csr_matrix

 from harmony import utils

@@ -17,7 +12,9 @@ def augmented_affinity_matrix(
     timepoints,
     timepoint_connections,
     n_neighbors=30,
-    pc_components=1000
+    n_jobs=-2,
+    pc_components=1000,
+    device="cpu",
 ):
     """Function to construct the augmented affinity matrix
@@ -47,7 +44,7 @@ def augmented_affinity_matrix(
     if pc_components is None:
         pca_projections = data_df
     else:
-        pca_projections, _ = utils.run_pca(data_df, n_components=pc_components)
+        pca_projections, _ = utils.run_pca(data_df, device, n_components=pc_components)

     # Nearest neighbor graph construction and affinity matrix
     print('Nearest neighbor computation...')
@@ -64,18 +61,25 @@ def augmented_affinity_matrix(
     # # Affinity matrix
     # nn_aff = _convert_to_affinity(adj, scaling_factors, True)
     # --------------------------------------------------------------------------

-    temp = sc.AnnData(data_df.values)
-    # the following call is now the bottleneck:
-    sc.pp.neighbors(temp, n_pcs=0, n_neighbors=n_neighbors, method='rapids')
-    # maintaining backwards compatibility to Scanpy `sc.pp.neighbors`
-    try:
-        kNN = temp.uns['neighbors']['distances']
-    except KeyError:
-        kNN = temp.obsp['distances']
+    if device == "cpu":
+        temp = sc.AnnData(data_df.values)
+        sc.pp.neighbors(temp, n_pcs=0, n_neighbors=n_neighbors)
+        # maintaining backwards compatibility to Scanpy `sc.pp.neighbors`
+        try:
+            kNN = temp.uns['neighbors']['distances']
+        except KeyError:
+            kNN = temp.obsp['distances']
+    elif device == "gpu":
+        from cuml.neighbors import NearestNeighbors
+        nn = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
+        X_contiguous = np.ascontiguousarray(data_df.values)  # cuML expects C-contiguous input
+        nn.fit(X_contiguous)
+
+        kNN = nn.kneighbors_graph(X_contiguous, mode="distance")
+        kNN.setdiag(0)  # drop self-distances to match the scanpy graph
+        kNN.eliminate_zeros()

     # Adaptive k
     adaptive_k = int(np.floor(n_neighbors / 3))
     scaling_factors = np.zeros(data_df.shape[0])
@@ -86,12 +90,12 @@ def augmented_affinity_matrix(
     scaling_factors = pd.Series(scaling_factors, index=cell_order)

     # Affinity matrix
-    nn_aff = _convert_to_affinity(csr_matrix(kNN), scaling_factors, True)
+    nn_aff = _convert_to_affinity(kNN, scaling_factors, device, True)

     # Mutually nearest neighbor affinity matrix
     # Initialize mnn affinity matrix
     N = len(cell_order)
-    full_mnn_aff = csr_matrix_cpu(([0], ([0], [0])), [N, N])
+    full_mnn_aff = csr_matrix(([0], ([0], [0])), [N, N])
     for i in timepoint_connections.index:
         t1, t2 = timepoint_connections.loc[i, :].values
         print(f'Constructing affinities between {t1} and {t2}...')
@@ -100,7 +104,7 @@ def augmented_affinity_matrix(
         t1_cells = tp_cells[t1]
         t2_cells = tp_cells[t2]
         mnn = _construct_mnn(t1_cells, t2_cells, pca_projections,
-                             n_neighbors)
+                             n_neighbors, device, n_jobs)

         # MNN Scaling factors
         # Distance to the adaptive neighbor
@@ -113,71 +117,91 @@ def augmented_affinity_matrix(
         # Scaling factors
         mnn_scaling_factors = pd.Series(0.0, index=cell_order)
         mnn_scaling_factors[t1_cells] = _mnn_scaling_factors(
-            ka_dists[t1_cells], scaling_factors)
+            ka_dists[t1_cells], scaling_factors, device)
         mnn_scaling_factors[t2_cells] = _mnn_scaling_factors(
-            ka_dists[t2_cells], scaling_factors)
+            ka_dists[t2_cells], scaling_factors, device)

         # MNN affinity matrix
         full_mnn_aff = full_mnn_aff + \
             _mnn_affinity(mnn, mnn_scaling_factors,
-                          tp_offset[t1], tp_offset[t2]).get()
+                          tp_offset[t1], tp_offset[t2], device)

     # Symmetrize the affinity matrix and return
-    aff = nn_aff.get() + nn_aff.get().T + full_mnn_aff + full_mnn_aff.T
+    aff = nn_aff + nn_aff.T + full_mnn_aff + full_mnn_aff.T
     return aff, nn_aff + nn_aff.T


-def _convert_to_affinity(adj, scaling_factors, with_self_loops=False):
+def _convert_to_affinity(adj, scaling_factors, device, with_self_loops=False):
     """Convert adjacency matrix to affinity matrix
     """
     N = adj.shape[0]
     rows, cols, dists = find(adj)
-    dists = dists ** 2 / (cp.array(scaling_factors.values[rows.get()]) ** 2)
-
-    # Self loops
-    if with_self_loops:
-        dists = cp.append(dists, cp.zeros(N))
-        rows = cp.append(rows, range(N))
-        cols = cp.append(cols, range(N))
-    aff = csr_matrix((cp.exp(-dists), (rows, cols)), shape=(N, N))
+    if device == "gpu":
+        import cupy as cp
+        from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu
+        dists = cp.array(dists) ** 2 / (cp.array(scaling_factors.values[rows]) ** 2)  # d^2 / sigma_i^2
+        rows, cols = cp.array(rows), cp.array(cols)
+        # Self loops
+        if with_self_loops:
+            dists = cp.append(dists, cp.zeros(N))
+            rows = cp.append(rows, range(N))
+            cols = cp.append(cols, range(N))
+        aff = csr_matrix_gpu((cp.exp(-dists), (rows, cols)), shape=(N, N)).get()
+    elif device == "cpu":
+        dists = dists ** 2 / (scaling_factors.values[rows] ** 2)  # d^2 / sigma_i^2
+
+        # Self loops
+        if with_self_loops:
+            dists = np.append(dists, np.zeros(N))
+            rows = np.append(rows, range(N))
+            cols = np.append(cols, range(N))
+        aff = csr_matrix((np.exp(-dists), (rows, cols)), shape=[N, N])
     return aff


-def _construct_mnn(t1_cells, t2_cells, data_df, n_neighbors):
-    # FUnction to construct mutually nearest neighbors bewteen two points
+def _construct_mnn(t1_cells, t2_cells, data_df, n_neighbors, device, n_jobs=-2):
+    # Function to construct mutual nearest neighbors between two timepoints
+
+    if device == "gpu":
+        from cuml import NearestNeighbors
+        nbrs = NearestNeighbors(n_neighbors=n_neighbors,
+                                metric='euclidean')
+    elif device == "cpu":
+        from sklearn.neighbors import NearestNeighbors
+        nbrs = NearestNeighbors(n_neighbors=n_neighbors,
+                                metric='euclidean', n_jobs=n_jobs)

     print('t+1 neighbors of t...')
-    nbrs = NearestNeighbors(n_neighbors=n_neighbors,
-                            metric='euclidean')
     nbrs.fit(data_df.loc[t1_cells, :].values)
     t1_nbrs = nbrs.kneighbors_graph(
         data_df.loc[t2_cells, :].values, mode='distance')

     print('t neighbors of t+1...')
-    nbrs = NearestNeighbors(n_neighbors=n_neighbors,
-                            metric='euclidean')
     nbrs.fit(data_df.loc[t2_cells, :].values)
     t2_nbrs = nbrs.kneighbors_graph(
         data_df.loc[t1_cells, :].values, mode='distance')

     # Mutually nearest neighbors
     mnn = t2_nbrs.multiply(t1_nbrs.T)  # nonzero only where the neighbor relation is mutual
     mnn = mnn.sqrt()  # product of the two equal distances is d^2; recover d
-    return csr_matrix(mnn)
+    return mnn

 # Rewritten for speed improvements
 def _mnn_ka_distances(mnn, n_neighbors):
     # Function to find the distance to the ka-th neighbor in the mutual nearest neighbor matrix
     ka = np.int(n_neighbors / 3)
     ka_dists = np.repeat(None, mnn.shape[0])
     x, y, z = find(mnn)
-    rows = pd.Series(x.get()).value_counts()
-    x, z = x.get(), z.get()
+    rows = pd.Series(x).value_counts()
     for r in rows.index[rows >= ka]:
         ka_dists[r] = np.sort(z[x == r])[ka - 1]
     return ka_dists

def _mnn_scaling_factors(mnn_ka_dists, scaling_factors):
def _mnn_scaling_factors(mnn_ka_dists, scaling_factors,device):
if device == "gpu":
from cuml import LinearRegression
else:
from sklearn.linear_model import LinearRegression
cells = mnn_ka_dists.index[~mnn_ka_dists.isnull()]

# Linear model fit
@@ -194,15 +218,15 @@ def _mnn_scaling_factors(mnn_ka_dists, scaling_factors):
     return mnn_scaling_factors


-def _mnn_affinity(mnn, mnn_scaling_factors, offset1, offset2):
+def _mnn_affinity(mnn, mnn_scaling_factors, offset1, offset2, device):
     # Function to convert the mnn matrix to an affinity matrix

     # Construct adjacency matrix
     N = len(mnn_scaling_factors)
     x, y, z = find(mnn)
     x = x + offset1
     y = y + offset2
-    adj = csr_matrix((z, (x, y)), shape=(N, N))
+    adj = csr_matrix((z, (x, y)), shape=[N, N])

     # Affinity matrix
-    return _convert_to_affinity(adj, mnn_scaling_factors, False)
+    return _convert_to_affinity(adj, mnn_scaling_factors, device, False)
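For reference, both device branches of `_convert_to_affinity` implement the same adaptive Gaussian kernel, aff_ij = exp(-d_ij^2 / sigma_i^2), where sigma_i is cell i's distance to its adaptive (k/3)-th neighbor; and `_construct_mnn` keeps a pair (i, j) only when the neighbor relation holds in both directions, since `t2_nbrs.multiply(t1_nbrs.T)` zeroes one-sided edges and `.sqrt()` recovers d from the product of the two equal distances. A standalone CPU sketch of the kernel — illustrative only; the helper name and toy values are assumptions:

import numpy as np
from scipy.sparse import csr_matrix, find

def knn_to_affinity(knn, sigma):
    # affinity_ij = exp(-d_ij^2 / sigma_i^2), scaled per row by sigma_i
    n = knn.shape[0]
    rows, cols, dists = find(knn)
    vals = np.exp(-(dists ** 2) / (sigma[rows] ** 2))
    return csr_matrix((vals, (rows, cols)), shape=(n, n))

# Three cells; sigma_i taken as each cell's nearest-neighbor distance
knn = csr_matrix(np.array([[0., 1., 2.],
                           [1., 0., 3.],
                           [2., 3., 0.]]))
sigma = np.array([1., 1., 2.])
print(knn_to_affinity(knn, sigma).toarray())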
90 changes: 58 additions & 32 deletions src/harmony/plot.py
@@ -34,11 +34,12 @@

 warnings.filterwarnings(action="ignore", message="remove_na is deprecated")

-import cugraph
-import cudf
-import numpy as np
+from fa2 import ForceAtlas2
 import pandas as pd
-def force_directed_layout(affinity_matrix, cell_names=None, verbose=True, iterations=500):
+import numpy as np
+import random
+
+def force_directed_layout(affinity_matrix, cell_names=None, verbose=True, iterations=500, device='cpu'):
"""" Function to compute force directed layout from the affinity_matrix
:param affinity_matrix: Sparse matrix representing affinities between cells
:param cell_names: pandas Series object with cell names
@@ -48,35 +49,60 @@ def force_directed_layout(affinity_matrix, cell_names=None, verbose=True, iterations=500, device='cpu'):
"""

init_coords = np.random.random((affinity_matrix.shape[0], 2))


offsets = cudf.Series(affinity_matrix.indptr)
indices = cudf.Series(affinity_matrix.indices)
G = cugraph.Graph()
G.from_cudf_adjlist(offsets, indices, None)

forceatlas2 = cugraph.layout.force_atlas2(
G,
max_iter=iterations,
pos_list=cudf.DataFrame(
{
"vertex": np.arange(init_coords.shape[0]),
"x": init_coords[:, 0],
"y": init_coords[:, 1],
}
),
outbound_attraction_distribution=False,
lin_log_mode=False,
edge_weight_influence=1.0,
jitter_tolerance=1.0,
barnes_hut_optimize=True,
barnes_hut_theta=1.2,
scaling_ratio=2.0,
strong_gravity_mode=False,
gravity=1.0,
verbose=True,
)
positions = forceatlas2.to_pandas().loc[:, ["x", "y"]].values
if device == 'cpu':
forceatlas2 = ForceAtlas2(
# Behavior alternatives
outboundAttractionDistribution=False,
linLogMode=False,
adjustSizes=False,
edgeWeightInfluence=1.0,
# Performance
jitterTolerance=1.0,
barnesHutOptimize=True,
barnesHutTheta=1.2,
multiThreaded=False,
# Tuning
scalingRatio=2.0,
strongGravityMode=False,
gravity=1.0,
# Log
verbose=verbose)

positions = forceatlas2.forceatlas2(
affinity_matrix, pos=init_coords, iterations=iterations)
positions = np.array(positions)

elif device == 'gpu':
import cugraph
import cudf
offsets = cudf.Series(affinity_matrix.indptr)
indices = cudf.Series(affinity_matrix.indices)
G = cugraph.Graph()
G.from_cudf_adjlist(offsets, indices, None)

forceatlas2 = cugraph.layout.force_atlas2(
G,
max_iter=iterations,
pos_list=cudf.DataFrame(
{
"vertex": np.arange(init_coords.shape[0]),
"x": init_coords[:, 0],
"y": init_coords[:, 1],
}
),
outbound_attraction_distribution=False,
lin_log_mode=False,
edge_weight_influence=1.0,
jitter_tolerance=1.0,
barnes_hut_optimize=True,
barnes_hut_theta=1.2,
scaling_ratio=2.0,
strong_gravity_mode=False,
gravity=1.0,
verbose=True,
)
positions = forceatlas2.to_pandas().loc[:, ["x", "y"]].values

# Convert to dataframe
if cell_names is None:
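On the CPU the layout comes from the fa2 package, whose `forceatlas2` call takes the sparse affinity matrix and initial positions directly and returns a list of (x, y) coordinates; on the GPU the affinity matrix is first converted to a cuGraph adjacency list. A hedged usage sketch, with `aff` and `data_df` as in the sketch above:

from harmony.plot import force_directed_layout

positions = force_directed_layout(aff, cell_names=data_df.index,
                                  iterations=500, device='cpu')
# GPU path, same call (requires cugraph and cudf):
# positions = force_directed_layout(aff, cell_names=data_df.index, device='gpu')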
11 changes: 7 additions & 4 deletions src/harmony/utils.py
Expand Up @@ -5,7 +5,6 @@
from itertools import chain
from collections import OrderedDict
import warnings
from sklearn.decomposition import PCA


def load_from_csvs(csv_files, sample_names=None, min_cell_count=10):
@@ -93,7 +92,7 @@ def hvg_genes(norm_df, no_genes=1000):
     return use_genes


-def run_pca(data, n_components=300, var_explained=0.85):
+def run_pca(data, device, n_components=300, var_explained=0.85):
     """Run PCA
     :param data: Dataframe of cells X genes. Typically multiscale space diffusion components
@@ -103,13 +102,17 @@ def run_pca(data, device, n_components=300, var_explained=0.85):
     :return: PCA projections of the data and the explained variance
     """
     init_components = min([n_components, data.shape[0]])
-    pca = PCA(n_components=init_components, svd_solver='randomized')
+    if device == "gpu":
+        from cuml import PCA
+        pca = PCA(n_components=init_components)
+    elif device == "cpu":
+        from sklearn.decomposition import PCA
+        pca = PCA(n_components=init_components, svd_solver='randomized')
     pca.fit(data)
     if pca.explained_variance_ratio_.sum() >= 0.85:
         n_components = np.where(np.cumsum(pca.explained_variance_ratio_) >= var_explained)[0][0]

     print(f'Running PCA with {n_components} components')
     pca = PCA(n_components=n_components, svd_solver='randomized')
     pca_projections = pca.fit_transform(data)
     pca_projections = pd.DataFrame(pca_projections, index=data.index)
     return pca_projections, pca.explained_variance_ratio_
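`run_pca` now takes `device` as its second positional argument and imports the matching PCA implementation lazily. A quick sketch of the assumed call, mirroring the core.py call site:

from harmony import utils

pca_projections, var_ratio = utils.run_pca(data_df, "cpu", n_components=300)

One caveat worth noting: the unchanged second fit, `pca = PCA(n_components=n_components, svd_solver='randomized')`, binds to whichever PCA class the device branch imported, and cuML's PCA may not accept `svd_solver='randomized'`; the GPU path may need the same branching there.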
