Skip to content

Commit

Permalink
Merge pull request scverse#425 from fidelram/tl.dendrogram
Browse files Browse the repository at this point in the history
dendrograms, correlation and marker genes filtering
  • Loading branch information
falexwolf authored Mar 4, 2019
2 parents 0f58f56 + 1e8042c commit 77e34d7
Show file tree
Hide file tree
Showing 28 changed files with 671 additions and 127 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
anndata>=0.6.15
matplotlib>=2.2
matplotlib>=3.0.0
pandas>=0.21
scipy
seaborn
Expand Down
2 changes: 2 additions & 0 deletions scanpy/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@
tl.louvain
tl.dpt
tl.paga
tl.dendrogram
Marker genes
~~~~~~~~~~~~
Expand All @@ -151,6 +152,7 @@
:toctree: .
tl.rank_genes_groups
tl.filter_rank_genes_groups
Gene scores, Cell cycle
~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
4 changes: 2 additions & 2 deletions scanpy/plotting/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ._anndata import scatter, violin, ranking, clustermap, stacked_violin, heatmap, dotplot, matrixplot, tracksplot
from ._anndata import scatter, violin, ranking, clustermap, stacked_violin, heatmap, dotplot, matrixplot, tracksplot, dendrogram, correlation_matrix

from ._preprocessing import filter_genes_dispersion, highly_variable_genes

Expand All @@ -11,7 +11,7 @@
from ._tools import sim

from ._rcmod import set_rcParams_scanpy, set_rcParams_defaults
from . import palettes
from . import palettes

from ._utils import matrix
from ._utils import timeseries, timeseries_subplot, timeseries_as_heatmap
Expand Down
436 changes: 340 additions & 96 deletions scanpy/plotting/_anndata.py

Large diffs are not rendered by default.

21 changes: 12 additions & 9 deletions scanpy/plotting/_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
Keys for annotations of observations/cells or variables/genes, e.g.,
`'ann1'` or `['ann1', 'ann2']`.
gene_symbols : string, optional (default: `None`)
Key for field in .var that stores gene symbols if you do not want to use
.var_names.
Column name in `.var` DataFrame that stores gene symbols. By default `var_names`
refer to the index column of the `.var` DataFrame. Setting this option allows
alternative names to be used.
use_raw : `bool`, optional (default: `None`)
Use `.raw` attribute of `adata` for coloring with gene expression. If
`None`, uses `.raw` if present.\
Expand Down Expand Up @@ -108,13 +109,15 @@
figsize : (`float`, `float`), optional (default: `None`)
Figure size when multi_panel = True. Otherwise the `rcParams['figure.figsize']` value is used.
Format is (width, height)
dendrogram: `bool` If True, hierarchical clustering between the `groupby` categories is
computed and a dendrogram is plotted. `groupby` categories are reordered according to
the dendrogram order. If groups of `var_names` (see next arguments) are set and those groups correspond
to the `groupby` categories, those groups are also reordered. The 'pearson' method
is used to compute the pairwise correlation between categories using all var_names in
`raw` if `use_raw` is None, otherwise all adata.var_names are used. The linkage method
used is `complete`.
dendrogram : `bool` or `str`, optional (default: `False`)
If True or a valid dendrogram key, a dendrogram based on the hierarchical clustering
between the `groupby` categories is added. The dendrogram information is computed
using :ref:`scanpy.tl.dendrogram`. If `tl.dendrogram` has not been called previously
the function is called with default parameters.
gene_symbols : string, optional (default: `None`)
Column name in `.var` DataFrame that stores gene symbols. By default `var_names`
refer to the index column of the `.var` DataFrame. Setting this option allows
alternative names to be used.
var_group_positions : list of `tuples`.
Use this parameter to highlight groups of `var_names`.
This will draw a 'bracket' or a color block between the given start and end positions. If the
Expand Down
24 changes: 17 additions & 7 deletions scanpy/plotting/_tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,13 +285,23 @@ def _rank_genes_groups_plot(adata, plot_type='heatmap', groups=None,
group_names = (adata.uns[key]['names'].dtype.names
if groups is None else groups)

# make a list of tuples containing the index for the start gene and the
# end gene that should be labelled
group_positions = [(x, x + n_genes - 1) for x in range(0, n_genes * len(group_names), n_genes)]

# sum(list, []) is used to flatten the gene list
gene_names = sum([list(adata.uns[key]['names'][x][:n_genes]) for x in group_names], [])

gene_names = []
start = 0
group_positions = []
group_names_valid = []
for group in group_names:
# get all genes that are 'not-nan'
genes_list = [gene for gene in adata.uns[key]['names'][group] if not pd.isnull(gene)][:n_genes]
if len(genes_list) == 0:
logg.warn("No genes found for group {}".format(group))
continue
gene_names.extend(genes_list)
end = start + len(genes_list)
group_positions.append((start, end -1))
group_names_valid.append(group)
start = end

group_names = group_names_valid
if plot_type == 'dotplot':
from .._anndata import dotplot
dotplot(adata, gene_names, groupby, var_group_labels=group_names,
Expand Down
4 changes: 4 additions & 0 deletions scanpy/plotting/_tools/scatterplots.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,10 @@ def _get_color_values(adata, value_to_plot, groups=None, palette=None, use_raw=F
else:
color_vector = adata.obs[value_to_plot]
elif gene_symbols in adata.var.columns:
if value_to_plot not in adata.var[gene_symbols].values:
logg.error("Gene symbol {!r} not found in given gene_symbols "
"column: {!r}".format(value_to_plot, gene_symbols))
return
gene_id = adata.var[adata.var[gene_symbols] == value_to_plot].index[0]
if use_raw:
color_vector = adata.raw[:, gene_id].X
Expand Down
Binary file added scanpy/tests/_images/correlation.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added scanpy/tests/_images/dendrogram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_dotplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_dotplot3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_dotplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_heatmap.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_heatmap_swap_axes.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_matrixplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_matrixplot_swap_axes.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_stacked_violin.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_ranked_genes_tracksplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified scanpy/tests/_images/master_umap_with_edges.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
45 changes: 43 additions & 2 deletions scanpy/tests/test_plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,18 @@ def test_violin():
save_and_compare_images('master_violin_multi_panel', tolerance=40)


def test_dendrogram():
    """Plot the `bulk_labels` dendrogram of the reduced PBMC dataset and
    pass the result to `save_and_compare_images` under the name 'dendrogram'.
    """
    adata = sc.datasets.pbmc68k_reduced()
    sc.pl.dendrogram(adata, 'bulk_labels')
    save_and_compare_images('dendrogram', tolerance=10)


def test_correlation():
    """Plot the `bulk_labels` correlation matrix of the reduced PBMC dataset
    and pass the result to `save_and_compare_images` under the name
    'correlation'.
    """
    adata = sc.datasets.pbmc68k_reduced()
    sc.pl.correlation_matrix(adata, 'bulk_labels')
    save_and_compare_images('correlation', tolerance=15)


def test_rank_genes_groups():
pbmc = sc.datasets.pbmc68k_reduced()
tolerance = 15
Expand All @@ -143,7 +155,7 @@ def test_rank_genes_groups():

# test ranked genes using stacked violin plots
sc.pl.rank_genes_groups_stacked_violin(pbmc, n_genes=3, show=False)
save_and_compare_images('master_ranked_genes_stacked_violin', tolerance=tolerance)
save_and_compare_images('master_ranked_genes_stacked_violin', tolerance=20)

# test ranked genes using dotplot
sc.pl.rank_genes_groups_dotplot(pbmc, n_genes=4, show=False)
Expand Down Expand Up @@ -171,6 +183,35 @@ def test_rank_genes_groups():
# save_and_compare_images('master_ranked_genes_stacked_violin', tolerance=tolerance)


def test_rank_genes_symbols():
    """Exercise the `gene_symbols` option of the grouped plotting functions.

    A `symbols` column is added to `.var` whose values are the var names
    prefixed with ``symbol_``; every plot is then requested through those
    symbols instead of through `.var_names`.
    """
    adata = sc.datasets.krumsiek11()

    # add a 'symbols' column
    adata.var['symbols'] = adata.var.index.map("symbol_{}".format)
    symbols = ["symbol_{}".format(x) for x in adata.var_names]

    sc.pl.heatmap(adata, symbols, 'cell_type', use_raw=False, show=False,
                  dendrogram=True, gene_symbols='symbols')
    save_and_compare_images('master_heatmap_gene_symbols')

    sc.pl.dotplot(adata, symbols, 'cell_type', use_raw=False, dendrogram=True,
                  show=False, gene_symbols='symbols')
    save_and_compare_images('master_dotplot_gene_symbols', tolerance=15)

    sc.pl.matrixplot(adata, symbols, 'cell_type', use_raw=False,
                     dendrogram=True, show=False, gene_symbols='symbols')
    save_and_compare_images('master_matrixplot_gene_symbols', tolerance=15)

    sc.pl.stacked_violin(adata, symbols, 'cell_type', use_raw=False,
                         color='blue', show=False, gene_symbols='symbols')
    save_and_compare_images('master_stacked_violin_gene_symbols', tolerance=20)

    # show=False added for consistency with every other plot call in this
    # test, so the figure is not displayed under an interactive backend.
    sc.pl.tracksplot(adata, symbols, 'cell_type', dendrogram=True,
                     use_raw=False, show=False, gene_symbols='symbols')
    save_and_compare_images('master_tracksplot_gene_symbols')


def test_scatterplots():

pbmc = sc.datasets.pbmc68k_reduced()
Expand Down Expand Up @@ -209,7 +250,7 @@ def test_scatterplots():
# test edges = True
sc.pp.neighbors(pbmc)
sc.pl.umap(pbmc, color='louvain', edges=True, edges_width=0.1, s=50, show=False)
save_and_compare_images('master_umap_with_edges', tolerance=20)
save_and_compare_images('master_umap_with_edges', tolerance=35)

# test diffmap
# sc.tl.diffmap(pbmc)
Expand Down
3 changes: 2 additions & 1 deletion scanpy/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from ._draw_graph import draw_graph

from ._paga import paga, paga_degrees, paga_expression_entropies, paga_compare_paths
from ._rank_genes_groups import rank_genes_groups
from ._rank_genes_groups import rank_genes_groups, filter_rank_genes_groups
from ._dpt import dpt
from ._leiden import leiden
from ._louvain import louvain
from ._sim import sim
from ._score_genes import score_genes, score_genes_cell_cycle
from ._dendrogram import dendrogram
117 changes: 117 additions & 0 deletions scanpy/tools/_dendrogram.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""
Computes a dendrogram based on a given categorical observation.
"""

from typing import Optional, List
import pandas as pd
from anndata import AnnData
from pandas.api.types import is_categorical_dtype

from .. utils import doc_params
from .. import logging as logg
from ..tools._utils import choose_representation, doc_use_rep, doc_n_pcs


@doc_params(n_pcs=doc_n_pcs, use_rep=doc_use_rep)
def dendrogram(adata: AnnData, groupby: str,
               n_pcs: Optional[int]=None,
               use_rep: Optional[str]=None,
               var_names: Optional[List[str]]=None,
               use_raw: Optional[bool]=None,
               cor_method: Optional[str]='pearson',
               linkage_method: Optional[str]='complete',
               key_added: Optional[str]=None) -> None:
    """\
    Computes a hierarchical clustering for the given `groupby` categories.

    By default the PCA components are used unless .X has less than 50 variables.
    Alternatively, a list of `var_names` (e.g. genes) can be given.

    Average values of either `var_names` or components are used to compute a
    correlation matrix.

    The hierarchical clustering can be visualized using `sc.pl.dendrogram` or
    multiple other visualizations that can include a dendrogram: `matrixplot`,
    `heatmap`, `dotplot` and `stacked_violin`.

    .. note::
        The computation of the hierarchical clustering is based on predefined
        groups and not per cell. The correlation matrix is computed using by
        default pearson but other methods are available.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix
    groupby : `str`
        Key of the categorical observation in `adata.obs` whose categories
        are clustered.
    {n_pcs}
    {use_rep}
    var_names : `list of str` (default: None)
        List of var_names to use for computing the hierarchical clustering.
        If `var_names` is given, then `use_rep` and `n_pcs` are ignored.
    use_raw : `bool`, optional (default: None)
        Only when `var_names` is not None. Use `raw` attribute of `adata` if
        present.
    cor_method : `str`, optional (default: `"pearson"`)
        correlation method to use. Options are 'pearson', 'kendall', and
        'spearman'
    linkage_method : `str`, optional (default: `"complete"`)
        linkage method to use. See
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
        for more information.
    key_added : `str`, optional (default: `None`)
        By default, the dendrogram information is added to
        `.uns['dendrogram_' + groupby]`. Notice that the `groupby` information
        is added to the dendrogram.

    Returns
    -------
    Nothing is returned. `adata.uns['dendrogram_' + groupby]` (or
    `adata.uns[key_added]` if `key_added` is given) is set to a dict with the
    keys `linkage`, `groupby`, `use_rep`, `cor_method`, `linkage_method`,
    `categories_idx_ordered`, `dendrogram_info` and `correlation_matrix`.

    Raises
    ------
    ValueError
        If `groupby` is not a key of `adata.obs` or if that column is not
        categorical.

    Examples
    --------
    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.dendrogram(adata, groupby='bulk_labels')
    >>> sc.pl.dendrogram(adata, 'bulk_labels')
    >>> sc.pl.dotplot(adata, ['C1QA', 'PSAP', 'CD79A', 'CD79B', 'CST3', 'LYZ'],
    ...               groupby='bulk_labels', dendrogram=True)
    """
    if groupby not in adata.obs_keys():
        raise ValueError('groupby has to be a valid observation. Given value: {}, '
                         'valid observations: {}'.format(groupby, adata.obs_keys()))
    if not is_categorical_dtype(adata.obs[groupby]):
        # non-categorical columns are rejected rather than binned
        raise ValueError('groupby has to be a categorical observation. Given value: {}, '
                         'Column type: {}'.format(groupby, adata.obs[groupby].dtype))

    if var_names is None:
        # one row per observation, indexed by its `groupby` category
        rep_df = pd.DataFrame(choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs))
        rep_df.set_index(adata.obs[groupby], inplace=True)
        categories = rep_df.index.categories
    else:
        if use_raw is None and adata.raw is not None:
            use_raw = True
        # local import, as in the original code
        from ..plotting._anndata import _prepare_dataframe
        # Use the var_names requested by the caller. Previously every var
        # name of adata (or adata.raw) was passed here, so the `var_names`
        # argument had no effect beyond switching to this branch.
        categories, rep_df = _prepare_dataframe(adata, var_names, groupby, use_raw)

    if key_added is None:
        key_added = 'dendrogram_' + groupby

    logg.info('Storing dendrogram info using `.uns[{!r}]`'.format(key_added))
    # aggregate values within categories using 'mean'
    mean_df = rep_df.groupby(level=0).mean()

    import scipy.cluster.hierarchy as sch

    # categories become columns after the transpose, so `corr` yields the
    # pairwise correlation between categories
    corr_matrix = mean_df.T.corr(method=cor_method)
    z_var = sch.linkage(corr_matrix, method=linkage_method)
    dendro_info = sch.dendrogram(z_var, labels=categories, no_plot=True)

    # order of groupby categories
    categories_idx_ordered = dendro_info['leaves']

    adata.uns[key_added] = {'linkage': z_var,
                            'groupby': groupby,
                            'use_rep': use_rep,
                            'cor_method': cor_method,
                            'linkage_method': linkage_method,
                            'categories_idx_ordered': categories_idx_ordered,
                            'dendrogram_info': dendro_info,
                            'correlation_matrix': corr_matrix.values}
Loading

0 comments on commit 77e34d7

Please sign in to comment.