Skip to content

Commit

Permalink
Exclude non-DNA regions larger than 300,000 bases.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 520783371
  • Loading branch information
akolesnikov authored and copybara-github committed Mar 31, 2023
1 parent b52cd1c commit ed415a5
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 4 deletions.
59 changes: 57 additions & 2 deletions deepvariant/make_examples_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
from deepvariant.vendor import timer
from google.protobuf import text_format
from third_party.nucleus.io import fasta
from third_party.nucleus.io import genomics_reader
from third_party.nucleus.io import sam
from third_party.nucleus.io import sharded_file_utils
from third_party.nucleus.io import tfrecord
Expand Down Expand Up @@ -103,6 +104,9 @@
# flag.
MAX_PARTITION_LEN = 1000000

# Non DNA regions larger than this value are excluded from processing.
MIN_NON_DNA_REGION = 300000

# ---------------------------------------------------------------------------
# Selecting variants of specific types (e.g., SNPs)
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -503,7 +507,45 @@ def format_contig_matches():
)


def build_calling_regions(contigs, regions_to_include, regions_to_exclude):
def find_ref_n_regions(
ref_reader: genomics_reader.GenomicsReader, min_region_len: int
) -> List[range_pb2.Range]:
"""Returns List[nucleus.genomics.v1.Range] regions containing Ns.
Args:
ref_reader: genomics_reader.GenomicsReader. Nucleus Fasta reader.
min_region_len: int. Only regions larger than min_region_len are returned.
Returns:
A List of nucleus.genomics.v1.Range containing regions of Ns in the
reference.
"""
ref_n_regions = []
# ref_reader returns tupes of contig_name and vector of bases.
for ref in ref_reader.iterate():
length = 0
start = 0
i = 0
for i, b in enumerate(ref[1]):
if b not in vc_base.CANONICAL_DNA_BASES:
if length == 0:
start = i
length += 1
else:
if length >= min_region_len:
ref_n_regions.append(ranges.make_range(ref[0], start, i))
logging.info('Excluding %s:%d-%d', ref[0], start, i)
start = 0
length = 0
if length >= min_region_len:
ref_n_regions.append(ranges.make_range(ref[0], start, i + 1))
logging.info('Excluding %s:%d-%d', ref[0], start, i + 1)
return ref_n_regions


def build_calling_regions(
contigs, regions_to_include, regions_to_exclude, ref_n_regions
):
"""Builds a RangeSet containing the regions we should call variants in.
This function intersects the Ranges spanning all of the contigs with those
Expand All @@ -517,6 +559,7 @@ def build_calling_regions(contigs, regions_to_include, regions_to_exclude):
RangeSet.
regions_to_exclude: RangeSet or iterable that can be converted to a
RangeSet.
ref_n_regions: List of Range containing non DNA bases to exclude.
Returns:
A RangeSet.
Expand All @@ -533,6 +576,9 @@ def build_calling_regions(contigs, regions_to_include, regions_to_exclude):
ranges.RangeSet.from_regions(regions_to_include, contig_dict)
)

if ref_n_regions:
regions.exclude_regions(ranges.RangeSet(ref_n_regions))

# If we provided regions to exclude, intersect those with the existing calling
# regions to further refine our set of contigs to process.
if regions_to_exclude:
Expand Down Expand Up @@ -2269,6 +2315,12 @@ def processing_regions_from_options(options):
options.reference_filename
).header.contigs

ref_n_regions = None
if options.discard_non_dna_regions and not options.calling_regions:
ref_n_regions = find_ref_n_regions(
fasta.IndexedFastaReader(options.reference_filename), MIN_NON_DNA_REGION
)

# Add in confident regions and vcf_contigs if in training mode.
vcf_contigs = None
if in_training_mode(options):
Expand Down Expand Up @@ -2301,7 +2353,10 @@ def processing_regions_from_options(options):
options, 'Common contigs are %s' % [c.name for c in contigs]
)
calling_regions = build_calling_regions(
ref_contigs, options.calling_regions, options.exclude_calling_regions
ref_contigs,
options.calling_regions,
options.exclude_calling_regions,
ref_n_regions,
)
if not calling_regions:
raise ValueError(
Expand Down
59 changes: 58 additions & 1 deletion deepvariant/make_examples_core_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,44 @@ def test_regions_to_process_partition(
),
)

@parameterized.parameters(
dict(
ref_reader=fasta.InMemoryFastaReader([('chr1', 0, 'GATACA')]),
expected=[],
min_region_len=3,
),
dict(
ref_reader=fasta.InMemoryFastaReader([('chr1', 0, 'NNNGATACA')]),
expected=[ranges.make_range('chr1', 0, 3)],
min_region_len=3,
),
dict(
ref_reader=fasta.InMemoryFastaReader([('chr1', 0, 'GATACANNN')]),
expected=[ranges.make_range('chr1', 6, 9)],
min_region_len=3,
),
dict(
ref_reader=fasta.InMemoryFastaReader([('chr1', 0, 'GATACANNNTTT')]),
expected=[ranges.make_range('chr1', 6, 9)],
min_region_len=3,
),
dict(
ref_reader=fasta.InMemoryFastaReader(
[('chr1', 0, 'GATACANNNAAAAANNN')]
),
expected=[
ranges.make_range('chr1', 6, 9),
ranges.make_range('chr1', 14, 17),
],
min_region_len=3,
),
)
def test_find_ref_n_regions(self, ref_reader, expected, min_region_len):
self.assertCountEqual(
expected,
make_examples_core.find_ref_n_regions(ref_reader, min_region_len),
)

@parameterized.parameters(
dict(includes=[], excludes=[], expected=['1:1-100', '2:1-200']),
dict(includes=['1'], excludes=[], expected=['1:1-100']),
Expand Down Expand Up @@ -592,7 +630,7 @@ def test_regions_to_process_partition(
def test_build_calling_regions(self, includes, excludes, expected):
contigs = _make_contigs([('1', 100), ('2', 200)])
actual = make_examples_core.build_calling_regions(
contigs, includes, excludes
contigs, includes, excludes, None
)
self.assertCountEqual(actual, _from_literals_list(expected))

Expand Down Expand Up @@ -834,6 +872,25 @@ def test_regions_and_exclude_regions_flags(self):
),
)

@flagsaver.flagsaver
def test_regions_exclude_n_reference(self):
FLAGS.mode = 'calling'
FLAGS.ref = testdata.CHR20_FASTA
FLAGS.reads = testdata.CHR20_BAM
FLAGS.examples = 'examples.tfrecord'
FLAGS.discard_non_dna_regions = True

options = make_examples.default_options(add_flags=True)
_, regions_from_options = (
make_examples_core.processing_regions_from_options(options)
)
self.assertCountEqual(
list(ranges.RangeSet(regions_from_options)),
_from_literals_list(
['chr20:9,995,001-11,095,000', 'chr20:59,776,001-60,001,000']
),
)

@flagsaver.flagsaver
def test_incorrect_empty_regions(self):
FLAGS.mode = 'calling'
Expand Down
10 changes: 10 additions & 0 deletions deepvariant/make_examples_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,14 @@
'sharded into the same number of shards as the examples.'
),
)
_DISCARD_NON_DNA_REGIONS = flags.DEFINE_bool(
'discard_non_dna_regions',
False,
(
'Default is False. If set regions of Ns larger than 300,000bp are'
'discarded.'
),
)


def shared_flags_to_options(
Expand Down Expand Up @@ -778,6 +786,8 @@ def shared_flags_to_options(
flags_obj.customized_classes_labeler_info_field_name
)

options.discard_non_dna_regions = _DISCARD_NON_DNA_REGIONS.value

return options


Expand Down
4 changes: 3 additions & 1 deletion deepvariant/protos/deepvariant.proto
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ message SampleOptions {

// High-level options that encapsulates all of the parameters needed to run
// DeepVariant end-to-end.
// Next ID: 56.
// Next ID: 57.
message MakeExamplesOptions {
// A list of contig names we never want to call variants on. For example,
// chrM in humans is the mitocondrial genome and the caller isn't trained to
Expand Down Expand Up @@ -792,6 +792,8 @@ message MakeExamplesOptions {
int32 phase_max_candidates = 53;

string read_phases_output = 55;

bool discard_non_dna_regions = 56;
}

// Config describe information needed for a dataset that can be used for
Expand Down

0 comments on commit ed415a5

Please sign in to comment.