Exclude non-DNA regions larger than 300,000 bases.

PiperOrigin-RevId: 520783371
vegeta03 · Mar 31, 2023 · ed415a5 · ed415a5
1 parent b52cd1c
commit ed415a5
Show file tree

Hide file tree

Showing 4 changed files with 128 additions and 4 deletions.
diff --git a/deepvariant/make_examples_core.py b/deepvariant/make_examples_core.py
@@ -61,6 +61,7 @@
 from deepvariant.vendor import timer
 from google.protobuf import text_format
 from third_party.nucleus.io import fasta
+from third_party.nucleus.io import genomics_reader
 from third_party.nucleus.io import sam
 from third_party.nucleus.io import sharded_file_utils
 from third_party.nucleus.io import tfrecord
@@ -103,6 +104,9 @@
 # flag.
 MAX_PARTITION_LEN = 1000000
 
+# Non DNA regions larger than this value are excluded from processing.
+MIN_NON_DNA_REGION = 300000
+
 # ---------------------------------------------------------------------------
 # Selecting variants of specific types (e.g., SNPs)
 # ---------------------------------------------------------------------------
@@ -503,7 +507,45 @@ def format_contig_matches():
     )
 
 
-def build_calling_regions(contigs, regions_to_include, regions_to_exclude):
+def find_ref_n_regions(
+    ref_reader: genomics_reader.GenomicsReader, min_region_len: int
+) -> List[range_pb2.Range]:
+  """Returns List[nucleus.genomics.v1.Range] regions containing Ns.
+
+  Args:
+    ref_reader: genomics_reader.GenomicsReader. Nucleus Fasta reader.
+    min_region_len: int. Only regions larger than min_region_len are returned.
+
+  Returns:
+    A List of nucleus.genomics.v1.Range containing regions of Ns in the
+    reference.
+  """
+  ref_n_regions = []
+  # ref_reader returns tupes of contig_name and vector of bases.
+  for ref in ref_reader.iterate():
+    length = 0
+    start = 0
+    i = 0
+    for i, b in enumerate(ref[1]):
+      if b not in vc_base.CANONICAL_DNA_BASES:
+        if length == 0:
+          start = i
+        length += 1
+      else:
+        if length >= min_region_len:
+          ref_n_regions.append(ranges.make_range(ref[0], start, i))
+          logging.info('Excluding %s:%d-%d', ref[0], start, i)
+        start = 0
+        length = 0
+    if length >= min_region_len:
+      ref_n_regions.append(ranges.make_range(ref[0], start, i + 1))
+      logging.info('Excluding %s:%d-%d', ref[0], start, i + 1)
+  return ref_n_regions
+
+
+def build_calling_regions(
+    contigs, regions_to_include, regions_to_exclude, ref_n_regions
+):
   """Builds a RangeSet containing the regions we should call variants in.
 
   This function intersects the Ranges spanning all of the contigs with those
@@ -517,6 +559,7 @@ def build_calling_regions(contigs, regions_to_include, regions_to_exclude):
       RangeSet.
     regions_to_exclude: RangeSet or iterable that can be converted to a
       RangeSet.
+    ref_n_regions: List of Range containing non DNA bases to exclude.
 
   Returns:
     A RangeSet.
@@ -533,6 +576,9 @@ def build_calling_regions(contigs, regions_to_include, regions_to_exclude):
         ranges.RangeSet.from_regions(regions_to_include, contig_dict)
     )
 
+  if ref_n_regions:
+    regions.exclude_regions(ranges.RangeSet(ref_n_regions))
+
   # If we provided regions to exclude, intersect those with the existing calling
   # regions to further refine our set of contigs to process.
   if regions_to_exclude:
@@ -2269,6 +2315,12 @@ def processing_regions_from_options(options):
       options.reference_filename
   ).header.contigs
 
+  ref_n_regions = None
+  if options.discard_non_dna_regions and not options.calling_regions:
+    ref_n_regions = find_ref_n_regions(
+        fasta.IndexedFastaReader(options.reference_filename), MIN_NON_DNA_REGION
+    )
+
   # Add in confident regions and vcf_contigs if in training mode.
   vcf_contigs = None
   if in_training_mode(options):
@@ -2301,7 +2353,10 @@ def processing_regions_from_options(options):
       options, 'Common contigs are %s' % [c.name for c in contigs]
   )
   calling_regions = build_calling_regions(
-      ref_contigs, options.calling_regions, options.exclude_calling_regions
+      ref_contigs,
+      options.calling_regions,
+      options.exclude_calling_regions,
+      ref_n_regions,
   )
   if not calling_regions:
     raise ValueError(

diff --git a/deepvariant/make_examples_core_test.py b/deepvariant/make_examples_core_test.py
@@ -555,6 +555,44 @@ def test_regions_to_process_partition(
         ),
     )
 
+  @parameterized.parameters(
+      dict(
+          ref_reader=fasta.InMemoryFastaReader([('chr1', 0, 'GATACA')]),
+          expected=[],
+          min_region_len=3,
+      ),
+      dict(
+          ref_reader=fasta.InMemoryFastaReader([('chr1', 0, 'NNNGATACA')]),
+          expected=[ranges.make_range('chr1', 0, 3)],
+          min_region_len=3,
+      ),
+      dict(
+          ref_reader=fasta.InMemoryFastaReader([('chr1', 0, 'GATACANNN')]),
+          expected=[ranges.make_range('chr1', 6, 9)],
+          min_region_len=3,
+      ),
+      dict(
+          ref_reader=fasta.InMemoryFastaReader([('chr1', 0, 'GATACANNNTTT')]),
+          expected=[ranges.make_range('chr1', 6, 9)],
+          min_region_len=3,
+      ),
+      dict(
+          ref_reader=fasta.InMemoryFastaReader(
+              [('chr1', 0, 'GATACANNNAAAAANNN')]
+          ),
+          expected=[
+              ranges.make_range('chr1', 6, 9),
+              ranges.make_range('chr1', 14, 17),
+          ],
+          min_region_len=3,
+      ),
+  )
+  def test_find_ref_n_regions(self, ref_reader, expected, min_region_len):
+    self.assertCountEqual(
+        expected,
+        make_examples_core.find_ref_n_regions(ref_reader, min_region_len),
+    )
+
   @parameterized.parameters(
       dict(includes=[], excludes=[], expected=['1:1-100', '2:1-200']),
       dict(includes=['1'], excludes=[], expected=['1:1-100']),
@@ -592,7 +630,7 @@ def test_regions_to_process_partition(
   def test_build_calling_regions(self, includes, excludes, expected):
     contigs = _make_contigs([('1', 100), ('2', 200)])
     actual = make_examples_core.build_calling_regions(
-        contigs, includes, excludes
+        contigs, includes, excludes, None
     )
     self.assertCountEqual(actual, _from_literals_list(expected))
 
@@ -834,6 +872,25 @@ def test_regions_and_exclude_regions_flags(self):
         ),
     )
 
+  @flagsaver.flagsaver
+  def test_regions_exclude_n_reference(self):
+    FLAGS.mode = 'calling'
+    FLAGS.ref = testdata.CHR20_FASTA
+    FLAGS.reads = testdata.CHR20_BAM
+    FLAGS.examples = 'examples.tfrecord'
+    FLAGS.discard_non_dna_regions = True
+
+    options = make_examples.default_options(add_flags=True)
+    _, regions_from_options = (
+        make_examples_core.processing_regions_from_options(options)
+    )
+    self.assertCountEqual(
+        list(ranges.RangeSet(regions_from_options)),
+        _from_literals_list(
+            ['chr20:9,995,001-11,095,000', 'chr20:59,776,001-60,001,000']
+        ),
+    )
+
   @flagsaver.flagsaver
   def test_incorrect_empty_regions(self):
     FLAGS.mode = 'calling'

diff --git a/deepvariant/make_examples_options.py b/deepvariant/make_examples_options.py
@@ -537,6 +537,14 @@
         'sharded into the same number of shards as the examples.'
     ),
 )
+_DISCARD_NON_DNA_REGIONS = flags.DEFINE_bool(
+    'discard_non_dna_regions',
+    False,
+    (
+        'Default is False. If set regions of Ns larger than 300,000bp are'
+        'discarded.'
+    ),
+)
 
 
 def shared_flags_to_options(
@@ -778,6 +786,8 @@ def shared_flags_to_options(
         flags_obj.customized_classes_labeler_info_field_name
     )
 
+  options.discard_non_dna_regions = _DISCARD_NON_DNA_REGIONS.value
+
   return options
 
 

diff --git a/deepvariant/protos/deepvariant.proto b/deepvariant/protos/deepvariant.proto
@@ -594,7 +594,7 @@ message SampleOptions {
 
 // High-level options that encapsulates all of the parameters needed to run
 // DeepVariant end-to-end.
-// Next ID: 56.
+// Next ID: 57.
 message MakeExamplesOptions {
   // A list of contig names we never want to call variants on. For example,
   // chrM in humans is the mitocondrial genome and the caller isn't trained to
@@ -792,6 +792,8 @@ message MakeExamplesOptions {
   int32 phase_max_candidates = 53;
 
   string read_phases_output = 55;
+
+  bool discard_non_dna_regions = 56;
 }
 
 // Config describe information needed for a dataset that can be used for