Skip to content

Commit

Permalink
bamloader rebased
Browse files Browse the repository at this point in the history
  • Loading branch information
KitVB committed Feb 9, 2024
1 parent c2a368e commit 938fba5
Show file tree
Hide file tree
Showing 9 changed files with 259 additions and 26 deletions.
4 changes: 4 additions & 0 deletions deepchem/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,7 @@
from deepchem.data.data_loader import SAMLoader
except ImportError:
print("Error: Unable to import pysam. Please make sure it is installed.")
try:
from deepchem.data.data_loader import BAMLoader
except ImportError:
print("Error: Unable to import pysam. Please make sure it is installed.")
87 changes: 87 additions & 0 deletions deepchem/data/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@
except ImportError:
print("Error: Unable to import pysam. Please make sure it is installed.")

try:
import pysam
from deepchem.feat import BAMFeaturizer
except ImportError:
print("Error: Unable to import pysam. Please make sure it is installed.")
logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -1933,6 +1938,7 @@ def __init__(self, featurizer: Optional[Featurizer] = None):
----------
featurizer: Featurizer (default: None)
The Featurizer to be used for the loaded SAM data.
"""

# Set attributes
Expand Down Expand Up @@ -1969,6 +1975,7 @@ def create_dataset(self,
DiskDataset
A `DiskDataset` object containing a featurized representation of
data from `input_files`.
"""
if isinstance(input_files, str):
input_files = [input_files]
Expand All @@ -1982,3 +1989,83 @@ def shard_generator(): # TODO Enable sharding with shard size parameter
yield X, None, None, ids

return DiskDataset.create_dataset(shard_generator(), data_dir)

class BAMLoader(DataLoader):
    """Handles loading of BAM files.

    Binary Alignment Map (BAM) is the comprehensive raw data of genome
    sequencing. It consists of the lossless, compressed binary representation
    of the Sequence Alignment Map files. BAM files are smaller and more
    efficient to work with than SAM files, saving time and reducing costs of
    computation and storage. BAM files store alignment data and often have
    corresponding BAM index files. The structure of BAM files includes a
    header section and an alignment section.

    Here, we extract Query Name, Query Sequence, Query Length, Reference Name,
    Reference Start, CIGAR and Mapping Quality of each read in the BAM file.
    This class provides methods to load and featurize data from BAM files.

    Examples
    --------
    >>> from deepchem.data.data_loader import BAMLoader
    >>> import deepchem as dc
    >>> inputs = 'deepchem/data/tests/example.bam'
    >>> data = BAMLoader()
    >>> output = data.create_dataset(inputs)

    Note
    ----
    This class requires pysam to be installed. Pysam can be used with Linux
    or MacOS X. To use Pysam on Windows, use Windows Subsystem for Linux(WSL).
    """

    def __init__(self, featurizer: Optional[Featurizer] = None):
        """Initialize BAMLoader.

        Parameters
        ----------
        featurizer: Featurizer (default: None)
            The Featurizer to be used for the loaded BAM data. When None, a
            `BAMFeaturizer(max_records=None)` is created and used.
        """

        # Only populated when a UserDefinedFeaturizer is supplied.
        self.user_specified_features = None

        # Handle special featurizer cases
        if isinstance(featurizer, UserDefinedFeaturizer):
            # User defined featurizer: remember which fields it asks for.
            self.user_specified_features = featurizer.feature_fields
        elif featurizer is None:
            # Default featurizer
            featurizer = BAMFeaturizer(max_records=None)

        # Set self.featurizer
        self.featurizer = featurizer

    def create_dataset(self,
                       input_files: OneOrMany[str],
                       data_dir: Optional[str] = None,
                       shard_size: Optional[int] = None) -> DiskDataset:
        """Creates a `Dataset` from input BAM files.

        Parameters
        ----------
        input_files: List[str]
            List of BAM files, with their corresponding index files. A single
            path given as a `str` is treated as a one-element list.
        data_dir: str, optional (default None)
            Name of directory where featurized data is stored.
        shard_size: int, optional (default None)
            For now, this argument is ignored and each BAM file gets its
            own shard.

        Returns
        -------
        DiskDataset
            A `DiskDataset` object containing a featurized representation of
            data from `input_files`.
        """
        if isinstance(input_files, str):
            input_files = [input_files]

        def shard_generator():  # TODO Enable sharding with shard size parameter
            for input_file in input_files:
                # Open through a context manager so the handle is released
                # even if featurization raises or the featurizer never closes
                # it. NOTE(review): the featurizer may already have closed the
                # handle itself (SAMFeaturizer._featurize does); the second
                # close on exit is expected to be a no-op in pysam - confirm.
                with pysam.AlignmentFile(input_file, "rb") as bamfile:
                    X = self.featurizer._featurize(bamfile)
                # Placeholder ids: one entry per featurized record, all 1.0.
                ids = np.ones(len(X))
                # (X, y, w, ids) - no labels or weights are produced here.
                yield X, None, None, ids

        return DiskDataset.create_dataset(shard_generator(), data_dir)
Binary file added deepchem/data/tests/example.bam
Binary file not shown.
Binary file added deepchem/data/tests/example.bam.bai
Binary file not shown.
56 changes: 56 additions & 0 deletions deepchem/data/tests/test_bam_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import os
import unittest
import deepchem as dc
try:
import pysam
except ImportError:
print("Error: Unable to import pysam. Please make sure it is installed.")
import numpy as np


class TestBAMLoader(unittest.TestCase):
    """
    Checks BAMLoader and BAMFeaturizer against the example.bam file that
    sits next to this test module.
    """

    def setUp(self):
        super().setUp()
        # Directory of this test module; the example BAM file is resolved
        # relative to it.
        module_path = os.path.abspath(__file__)
        self.current_dir = os.path.dirname(module_path)

    def _example_bam(self):
        """Return the path to example.bam inside `self.current_dir`."""
        return os.path.join(self.current_dir, "example.bam")

    def test_bam_loader_with_single_file(self):
        """
        Loading one BAM file passed as a plain string gives X of
        shape (396, 7).
        """
        single_path = self._example_bam()
        bam_loader = dc.data.BAMLoader()
        loaded = bam_loader.create_dataset(single_path)

        assert loaded.X.shape == (396, 7)

    def test_bam_loader_with_multiple_files(self):
        """
        Loading the same BAM file twice via a list gives X of
        shape (792, 7).
        """
        path_list = [self._example_bam() for _ in range(2)]
        bam_loader = dc.data.BAMLoader()
        loaded = bam_loader.create_dataset(path_list)

        assert loaded.X.shape == (792, 7)

    def test_bam_featurizer(self):
        """
        Calling BAMFeaturizer._featurize with max_records=5 gives an array
        of shape (5, 7).
        """
        handle = pysam.AlignmentFile(self._example_bam(), "rb")
        featurizer = dc.feat.BAMFeaturizer(max_records=5)
        features = featurizer._featurize(handle)

        assert features.shape == (5, 7)


if __name__ == "__main__":
unittest.main()
5 changes: 5 additions & 0 deletions deepchem/feat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@
except ImportError:
print("Error: Unable to import pysam. Please make sure it is installed.")

try:
from deepchem.feat.bio_seq_featurizer import BAMFeaturizer
except ImportError:
print("Error: Unable to import pysam. Please make sure it is installed.")

# tokenizers
try:
from deepchem.feat.smiles_tokenizer import SmilesTokenizer
Expand Down
121 changes: 95 additions & 26 deletions deepchem/feat/bio_seq_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,6 @@ class SAMFeaturizer(Featurizer):
Reference Name,Reference Start, CIGAR and Mapping Quality of each read in
a SAM file.
Examples
--------
>>> from deepchem.data.data_loader import SAMLoader
>>> import deepchem as dc
>>> inputs = 'deepchem/data/tests/example.sam'
>>> featurizer = dc.feat.SAMFeaturizer()
>>> features = featurizer.featurize(inputs)
Information for each read is stored in a 'numpy.ndarray'.
>>> type(features[0])
<class 'numpy.ndarray'>
This is the default featurizer used by SAMLoader, and it extracts the following
fields from each read in each SAM file in the given order:-
- Column 0: Query Name
Expand All @@ -34,26 +23,22 @@ class SAMFeaturizer(Featurizer):
- Column 4: Reference Start
- Column 5: CIGAR
- Column 6: Mapping Quality
For the given example, to extract specific features, we do the following.
>>> features[0][0] # Query Name
r001
>>> features[0][1] # Query Sequence
TTAGATAAAGAGGATACTG
>>> features[0][2] # Query Length
19
>>> features[0][3] # Reference Name
ref
>>> features[0][4] # Reference Start
6
>>> features[0][5] # CIGAR
[(0, 8), (1, 4), (0, 4), (2, 1), (0, 3)]
>>> features[0][6] # Mapping Quality
30
Examples
--------
>>> from deepchem.data.data_loader import SAMLoader
>>> import deepchem as dc
>>> inputs = 'deepchem/data/tests/example.sam'
>>> featurizer = dc.feat.SAMFeaturizer()
>>> features = featurizer.featurize(inputs)
>>> type(features[0])
<class 'numpy.ndarray'>
Note
----
This class requires pysam to be installed. Pysam can be used with Linux or MacOS X.
To use Pysam on Windows, use Windows Subsystem for Linux(WSL).
"""

def __init__(self, max_records=None):
Expand All @@ -64,6 +49,7 @@ def __init__(self, max_records=None):
----------
max_records : int or None, optional
The maximum number of records to extract from the SAM file. If None, all records will be extracted.
"""
self.max_records = max_records

Expand All @@ -88,6 +74,89 @@ def _featurize(self, datapoint):
- Column 4: Reference Start
- Column 5: CIGAR
- Column 6: Mapping Quality
"""

features = []
record_count = 0

for record in datapoint:
feature_vector = [
record.query_name,
record.query_sequence,
record.query_length,
record.reference_name,
record.reference_start,
record.cigar,
record.mapping_quality,
]

features.append(feature_vector)
record_count += 1

# Break the loop if max_records is set
if self.max_records is not None and record_count >= self.max_records:
break

datapoint.close()

return np.array(features, dtype="object")

class BAMFeaturizer(Featurizer):
"""
Featurizes BAM files, that are compressed binary representations of SAM
(Sequence Alignment Map) files. This class extracts Query Name, Query
Sequence, Query Length, Reference Name, Reference Start, CIGAR and Mapping
Quality of the alignment in the BAM file.
This is the default featurizer used by BAMLoader, and it extracts the following
fields from each read in each BAM file in the given order:-
- Column 0: Query Name
- Column 1: Query Sequence
- Column 2: Query Length
- Column 3: Reference Name
- Column 4: Reference Start
- Column 5: CIGAR
- Column 6: Mapping Quality
Examples
--------
>>> from deepchem.data.data_loader import BAMLoader
>>> import deepchem as dc
>>> inputs = 'deepchem/data/tests/example.bam'
>>> featurizer = dc.feat.BAMFeaturizer()
>>> features = featurizer.featurize(inputs)
>>> type(features[0])
<class 'numpy.ndarray'>
Note
----
This class requires pysam to be installed. Pysam can be used with Linux or MacOS X.
To use Pysam on Windows, use Windows Subsystem for Linux(WSL).
"""
def __init__(self, max_records=None):
"""
Initialize BAMFeaturizer.
Parameters
----------
max_records : int or None, optional
The maximum number of records to extract from the BAM file. If None, all
records will be extracted.
"""
self.max_records = max_records

def _featurize(self, datapoint):
"""
Extract features from a BAM file.
Parameters
----------
bamfile : str
BAM file.
The corresponding index file must be in the same directory.
Returns
-------
features : numpy.ndarray
A 2D NumPy array representing the extracted features.
"""

features = []
Expand Down
5 changes: 5 additions & 0 deletions docs/source/api_reference/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,11 @@ SAM Loader
.. autoclass:: deepchem.data.data_loader.SAMLoader
:members: __init__, create_dataset

BAM Loader
^^^^^^^^^^
.. autoclass:: deepchem.data.data_loader.BAMLoader
:members: __init__, create_dataset


Data Classes
------------
Expand Down
7 changes: 7 additions & 0 deletions docs/source/api_reference/featurizers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,13 @@ SAMFeaturizer
:members:
:inherited-members:

BAMFeaturizer
^^^^^^^^^^^^^

.. autoclass:: deepchem.feat.BAMFeaturizer
:members:
:inherited-members:


Molecule Tokenizers
-------------------
Expand Down

0 comments on commit 938fba5

Please sign in to comment.