Skip to content

Commit

Permalink
Add key_added to umap, tsne, and pca (scverse#3184)
Browse files Browse the repository at this point in the history
  • Loading branch information
flying-sheep authored Aug 2, 2024
1 parent 0e150e0 commit b7e599a
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 33 deletions.
3 changes: 2 additions & 1 deletion docs/release-notes/1.11.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@

#### Features

* Add layer argument to {func}`scanpy.tl.score_genes` and {func}`scanpy.tl.score_genes_cell_cycle` {pr}`2921` {smaller}`L Zappia`
* Add `layer` argument to {func}`scanpy.tl.score_genes` and {func}`scanpy.tl.score_genes_cell_cycle` {pr}`2921` {smaller}`L Zappia`
* Prevent `raw` conflict with `layer` in {func}`~scanpy.tl.score_genes` {pr}`3155` {smaller}`S Dicks`
* Add `key_added` argument to {func}`~scanpy.pp.pca`, {func}`~scanpy.tl.tsne` and {func}`~scanpy.tl.umap` {pr}`3184` {smaller}`P Angerer`

#### Docs

Expand Down
22 changes: 17 additions & 5 deletions src/scanpy/preprocessing/_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def pca(
dtype: DTypeLike = "float32",
chunked: bool = False,
chunk_size: int | None = None,
key_added: str | None = None,
copy: bool = False,
) -> AnnData | np.ndarray | spmatrix | None:
"""\
Expand Down Expand Up @@ -137,6 +138,15 @@ def pca(
chunk_size
Number of observations to include in each chunk.
Required if `chunked=True` was passed.
key_added
If not specified, the embedding is stored as
:attr:`~anndata.AnnData.obsm`\\ `['X_pca']`, the loadings as
:attr:`~anndata.AnnData.varm`\\ `['PCs']`, and the parameters in
:attr:`~anndata.AnnData.uns`\\ `['pca']`.
If specified, the embedding is stored as
:attr:`~anndata.AnnData.obsm`\\ ``[key_added]``, the loadings as
:attr:`~anndata.AnnData.varm`\\ ``[key_added]``, and the parameters in
:attr:`~anndata.AnnData.uns`\\ ``[key_added]``.
copy
If an :class:`~anndata.AnnData` is passed, determines whether a copy
is returned. Is ignored otherwise.
Expand All @@ -150,13 +160,13 @@ def pca(
Otherwise, it returns `None` if `copy=False`, else an updated `AnnData` object.
Sets the following fields:
`.obsm['X_pca']` : :class:`~scipy.sparse.spmatrix` | :class:`~numpy.ndarray` (shape `(adata.n_obs, n_comps)`)
`.obsm['X_pca' | key_added]` : :class:`~scipy.sparse.spmatrix` | :class:`~numpy.ndarray` (shape `(adata.n_obs, n_comps)`)
PCA representation of data.
`.varm['PCs']` : :class:`~numpy.ndarray` (shape `(adata.n_vars, n_comps)`)
`.varm['PCs' | key_added]` : :class:`~numpy.ndarray` (shape `(adata.n_vars, n_comps)`)
The principal components containing the loadings.
`.uns['pca']['variance_ratio']` : :class:`~numpy.ndarray` (shape `(n_comps,)`)
`.uns['pca' | key_added]['variance_ratio']` : :class:`~numpy.ndarray` (shape `(n_comps,)`)
Ratio of explained variance.
`.uns['pca']['variance']` : :class:`~numpy.ndarray` (shape `(n_comps,)`)
`.uns['pca' | key_added]['variance']` : :class:`~numpy.ndarray` (shape `(n_comps,)`)
Explained variance, equivalent to the eigenvalues of the
covariance matrix.
"""
Expand Down Expand Up @@ -313,7 +323,9 @@ def pca(
X_pca = X_pca.astype(dtype)

if data_is_AnnData:
key_obsm, key_varm, key_uns = ("X_pca", "PCs", "pca")
key_obsm, key_varm, key_uns = (
("X_pca", "PCs", "pca") if key_added is None else [key_added] * 3
)
adata.obsm[key_obsm] = X_pca

if mask_var is not None:
Expand Down
14 changes: 11 additions & 3 deletions src/scanpy/tools/_tsne.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def tsne(
random_state: AnyRandom = 0,
use_fast_tsne: bool = False,
n_jobs: int | None = None,
key_added: str | None = None,
copy: bool = False,
) -> AnnData | None:
"""\
Expand Down Expand Up @@ -88,16 +89,23 @@ def tsne(
n_jobs
Number of jobs for parallel computation.
`None` means using :attr:`scanpy._settings.ScanpyConfig.n_jobs`.
key_added
If not specified, the embedding is stored as
:attr:`~anndata.AnnData.obsm`\\ `['X_tsne']` and the parameters in
:attr:`~anndata.AnnData.uns`\\ `['tsne']`.
If specified, the embedding is stored as
:attr:`~anndata.AnnData.obsm`\\ ``[key_added]`` and the parameters in
:attr:`~anndata.AnnData.uns`\\ ``[key_added]``.
copy
Return a copy instead of writing to `adata`.
Returns
-------
Returns `None` if `copy=False`, else returns an `AnnData` object. Sets the following fields:
`adata.obsm['X_tsne']` : :class:`numpy.ndarray` (dtype `float`)
`adata.obsm['X_tsne' | key_added]` : :class:`numpy.ndarray` (dtype `float`)
tSNE coordinates of data.
`adata.uns['tsne']` : :class:`dict`
`adata.uns['tsne' | key_added]` : :class:`dict`
tSNE parameters.
"""
Expand Down Expand Up @@ -173,7 +181,7 @@ def tsne(
metric=metric,
use_rep=use_rep,
)
key_uns, key_obsm = ("tsne", "X_tsne")
key_uns, key_obsm = ("tsne", "X_tsne") if key_added is None else [key_added] * 2
adata.obsm[key_obsm] = X_tsne # annotate samples with tSNE coordinates
adata.uns[key_uns] = dict(params={k: v for k, v in params.items() if v is not None})

Expand Down
14 changes: 11 additions & 3 deletions src/scanpy/tools/_umap.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def umap(
a: float | None = None,
b: float | None = None,
method: Literal["umap", "rapids"] = "umap",
key_added: str | None = None,
neighbors_key: str = "neighbors",
copy: bool = False,
) -> AnnData | None:
Expand Down Expand Up @@ -132,6 +133,13 @@ def umap(
.. deprecated:: 1.10.0
Use :func:`rapids_singlecell.tl.umap` instead.
key_added
If not specified, the embedding is stored as
:attr:`~anndata.AnnData.obsm`\\ `['X_umap']` and the parameters in
:attr:`~anndata.AnnData.uns`\\ `['umap']`.
If specified, the embedding is stored as
:attr:`~anndata.AnnData.obsm`\\ ``[key_added]`` and the parameters in
:attr:`~anndata.AnnData.uns`\\ ``[key_added]``.
neighbors_key
Umap looks in
:attr:`~anndata.AnnData.uns`\\ ``[neighbors_key]`` for neighbors settings and
Expand All @@ -143,15 +151,15 @@ def umap(
-------
Returns `None` if `copy=False`, else returns an `AnnData` object. Sets the following fields:
`adata.obsm['X_umap']` : :class:`numpy.ndarray` (dtype `float`)
`adata.obsm['X_umap' | key_added]` : :class:`numpy.ndarray` (dtype `float`)
UMAP coordinates of data.
`adata.uns['umap']` : :class:`dict`
`adata.uns['umap' | key_added]` : :class:`dict`
UMAP parameters.
"""
adata = adata.copy() if copy else adata

key_obsm, key_uns = ("X_umap", "umap")
key_obsm, key_uns = ("X_umap", "umap") if key_added is None else [key_added] * 2

if neighbors_key is None: # backwards compat
neighbors_key = "neighbors"
Expand Down
48 changes: 34 additions & 14 deletions tests/test_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,36 +9,56 @@
from testing.scanpy._pytest.marks import needs


def test_tsne():
pbmc = pbmc68k_reduced()
@pytest.mark.parametrize(
("key_added", "key_obsm", "key_uns"),
[
pytest.param(None, "X_tsne", "tsne", id="None"),
pytest.param("custom_key", "custom_key", "custom_key", id="custom_key"),
],
)
def test_tsne(key_added: str | None, key_obsm: str, key_uns: str):
pbmc = pbmc68k_reduced()[:200].copy()

euclidean1 = sc.tl.tsne(pbmc, metric="euclidean", copy=True)
with pytest.warns(UserWarning, match="In previous versions of scanpy"):
euclidean2 = sc.tl.tsne(pbmc, metric="euclidean", n_jobs=2, copy=True)
euclidean2 = sc.tl.tsne(
pbmc, metric="euclidean", n_jobs=2, key_added=key_added, copy=True
)
cosine = sc.tl.tsne(pbmc, metric="cosine", copy=True)

# Reproducibility
np.testing.assert_equal(euclidean1.obsm["X_tsne"], euclidean2.obsm["X_tsne"])
np.testing.assert_equal(euclidean1.obsm["X_tsne"], euclidean2.obsm[key_obsm])
# Metric has some effect
assert not np.array_equal(euclidean1.obsm["X_tsne"], cosine.obsm["X_tsne"])

# Params are recorded
assert euclidean1.uns["tsne"]["params"]["n_jobs"] == 1
assert euclidean2.uns["tsne"]["params"]["n_jobs"] == 2
assert euclidean2.uns[key_uns]["params"]["n_jobs"] == 2
assert cosine.uns["tsne"]["params"]["n_jobs"] == 1
assert euclidean1.uns["tsne"]["params"]["metric"] == "euclidean"
assert euclidean2.uns["tsne"]["params"]["metric"] == "euclidean"
assert euclidean2.uns[key_uns]["params"]["metric"] == "euclidean"
assert cosine.uns["tsne"]["params"]["metric"] == "cosine"


def test_umap_init_dtype():
pbmc = pbmc68k_reduced()[:100, :].copy()
sc.tl.umap(pbmc, init_pos=pbmc.obsm["X_pca"][:, :2].astype(np.float32))
embed1 = pbmc.obsm["X_umap"].copy()
sc.tl.umap(pbmc, init_pos=pbmc.obsm["X_pca"][:, :2].astype(np.float64))
embed2 = pbmc.obsm["X_umap"].copy()
assert_array_almost_equal(embed1, embed2)
assert_array_almost_equal(embed1, embed2)
@pytest.mark.parametrize(
("key_added", "key_obsm", "key_uns"),
[
pytest.param(None, "X_umap", "umap", id="None"),
pytest.param("custom_key", "custom_key", "custom_key", id="custom_key"),
],
)
def test_umap_init_dtype(key_added: str | None, key_obsm: str, key_uns: str):
pbmc1 = pbmc68k_reduced()[:100, :].copy()
pbmc2 = pbmc1.copy()
for pbmc, dtype, k in [(pbmc1, np.float32, None), (pbmc2, np.float64, key_added)]:
sc.tl.umap(pbmc, init_pos=pbmc.obsm["X_pca"][:, :2].astype(dtype), key_added=k)

# check that embeddings are close for different dtypes
assert_array_almost_equal(pbmc1.obsm["X_umap"], pbmc2.obsm[key_obsm])

# check that params are recorded
assert pbmc1.uns["umap"]["params"]["a"] == pbmc2.uns[key_uns]["params"]["a"]
assert pbmc1.uns["umap"]["params"]["b"] == pbmc2.uns[key_uns]["params"]["b"]


@pytest.mark.parametrize(
Expand Down
23 changes: 16 additions & 7 deletions tests/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,27 +250,36 @@ def test_pca_shapes():
sc.pp.pca(adata, n_comps=100)


def test_pca_sparse():
@pytest.mark.parametrize(
("key_added", "keys_expected"),
[
pytest.param(None, ("X_pca", "PCs", "pca"), id="None"),
pytest.param("custom_key", ("custom_key",) * 3, id="custom_key"),
],
)
def test_pca_sparse(key_added: str | None, keys_expected: tuple[str, str, str]):
"""
Tests that implicitly centered pca on sparse arrays returns equivalent results to
explicit centering on dense arrays.
"""
pbmc = pbmc3k_normalized()
pbmc = pbmc3k_normalized()[:200].copy()

pbmc_dense = pbmc.copy()
pbmc_dense.X = pbmc_dense.X.toarray()

implicit = sc.pp.pca(pbmc, dtype=np.float64, copy=True)
explicit = sc.pp.pca(pbmc_dense, dtype=np.float64, copy=True)
explicit = sc.pp.pca(pbmc_dense, dtype=np.float64, key_added=key_added, copy=True)

key_obsm, key_varm, key_uns = keys_expected

np.testing.assert_allclose(
implicit.uns["pca"]["variance"], explicit.uns["pca"]["variance"]
implicit.uns["pca"]["variance"], explicit.uns[key_uns]["variance"]
)
np.testing.assert_allclose(
implicit.uns["pca"]["variance_ratio"], explicit.uns["pca"]["variance_ratio"]
implicit.uns["pca"]["variance_ratio"], explicit.uns[key_uns]["variance_ratio"]
)
np.testing.assert_allclose(implicit.obsm["X_pca"], explicit.obsm["X_pca"])
np.testing.assert_allclose(implicit.varm["PCs"], explicit.varm["PCs"])
np.testing.assert_allclose(implicit.obsm["X_pca"], explicit.obsm[key_obsm])
np.testing.assert_allclose(implicit.varm["PCs"], explicit.varm[key_varm])


def test_pca_reproducible(array_type):
Expand Down

0 comments on commit b7e599a

Please sign in to comment.