Remove pytables dependency (scverse#2064)
* Remove pytables dependency
* Bump h5py version, re-remove tables after merge
* Add release note
ivirshup authored Feb 28, 2022
1 parent 92886db commit 4373348
Showing 3 changed files with 20 additions and 11 deletions.
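
The gist of the change: pytables' `open_file`/`walk_nodes` traversal of the 10x HDF5 file is replaced by plain `h5py`, whose groups behave like dictionaries. A minimal sketch of the h5py-based approach, assuming a Cell Ranger v3-style file with a top-level `/matrix` group (the file name and printed keys below are illustrative, not taken from this commit):

import h5py

def collect_datasets(dsets, group):
    # Recursively read every dataset under `group` into the dict,
    # mirroring the _collect_datasets helper added in this commit.
    for name, item in group.items():
        if isinstance(item, h5py.Dataset):
            dsets[name] = item[:]          # load the full dataset into memory
        else:
            collect_datasets(dsets, item)  # descend into sub-groups

# Hypothetical file name, for illustration only.
with h5py.File("filtered_feature_bc_matrix.h5", "r") as f:
    if "matrix" in f:                      # Cell Ranger v3+ layout
        dsets = {}
        collect_datasets(dsets, f["matrix"])
        print(sorted(dsets))               # expect keys like 'data', 'indices', 'indptr', 'shape'
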
1 change: 1 addition & 0 deletions docs/release-notes/1.8.3.rst
@@ -9,6 +9,7 @@
- Fixed :func:`scanpy.external.pp.scrublet` to address :issue:`1957` :smaller:`FlMai` and ensure raw counts are used for simulation
- Functions in :mod:`scanpy.datasets` no longer throw `OldFormatWarnings` when using `anndata` `0.8` :pr:`2096` :small:`I Virshup`
- Fixed use of :func:`scanpy.pp.neighbors` with ``method='rapids'``: RAPIDS cuML no longer returns a squared Euclidean distance matrix, so we should not square-root the kNN distance matrix. :pr:`1828` :small:`M Zaslavsky`
- Removed `pytables` dependency by implementing `read_10x_h5` with `h5py` due to installation errors on Windows :pr:`2064`

.. rubric:: Performance

3 changes: 1 addition & 2 deletions pyproject.toml
@@ -62,8 +62,7 @@ dependencies = [
"pandas>=0.21",
"scipy>=1.4",
"seaborn",
"h5py>=2.10.0",
"tables",
"h5py>=3",
"tqdm",
"scikit-learn>=0.22",
"statsmodels>=0.10.0rc2",
27 changes: 18 additions & 9 deletions scanpy/readwrite.py
@@ -3,11 +3,11 @@
from pathlib import Path, PurePath
from typing import Union, Dict, Optional, Tuple, BinaryIO

import h5py
import json
import numpy as np
import pandas as pd
from matplotlib.image import imread
import tables
import anndata
from anndata import (
AnnData,
@@ -177,7 +177,7 @@ def read_10x_h5(
is_present = _check_datafile_present_and_download(filename, backup_url=backup_url)
if not is_present:
logg.debug(f'... did not find original file {filename}')
with tables.open_file(str(filename), 'r') as f:
with h5py.File(str(filename), 'r') as f:
v3 = '/matrix' in f
if v3:
adata = _read_v3_10x_h5(filename, start=start)
Expand All @@ -201,9 +201,9 @@ def _read_legacy_10x_h5(filename, *, genome=None, start=None):
"""
Read hdf5 file from Cell Ranger v2 or earlier versions.
"""
with tables.open_file(str(filename), 'r') as f:
with h5py.File(str(filename), 'r') as f:
try:
children = [x._v_name for x in f.list_nodes(f.root)]
children = list(f.keys())
if not genome:
if len(children) > 1:
raise ValueError(
@@ -217,9 +217,10 @@ def _read_legacy_10x_h5(filename, *, genome=None, start=None):
f"Could not find genome '{genome}' in '{filename}'. "
f'Available genomes are: {children}'
)

dsets = {}
for node in f.walk_nodes('/' + genome, 'Array'):
dsets[node.name] = node.read()
_collect_datasets(dsets, f)

# AnnData works with csr matrices
# 10x stores the transposed data, so we do the transposition right away
from scipy.sparse import csr_matrix
@@ -249,15 +250,23 @@ def _read_legacy_10x_h5(filename, *, genome=None, start=None):
raise Exception('File is missing one or more required datasets.')


def _collect_datasets(dsets: dict, group: h5py.Group):
for k, v in group.items():
if isinstance(v, h5py.Dataset):
dsets[k] = v[:]
else:
_collect_datasets(dsets, v)


def _read_v3_10x_h5(filename, *, start=None):
"""
Read hdf5 file from Cell Ranger v3 or later versions.
"""
with tables.open_file(str(filename), 'r') as f:
with h5py.File(str(filename), 'r') as f:
try:
dsets = {}
for node in f.walk_nodes('/matrix', 'Array'):
dsets[node.name] = node.read()
_collect_datasets(dsets, f["matrix"])

from scipy.sparse import csr_matrix

M, N = dsets['shape']
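For context on how the collected datasets are used downstream (the hunk is truncated above): once `_collect_datasets` has pulled `data`, `indices`, `indptr`, and `shape` out of the matrix group, the reader builds a CSR matrix in the transposed orientation, since 10x stores genes × cells while AnnData expects cells × genes. A rough sketch, assuming `dsets` holds those four arrays:

from scipy.sparse import csr_matrix

M, N = dsets['shape']  # 10x stores the matrix as genes x cells
matrix = csr_matrix(
    (dsets['data'], dsets['indices'], dsets['indptr']),
    shape=(N, M),      # transpose right away: cells x genes for AnnData
)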
