Skip to content

Commit

Permalink
Fixes for Nanopore-only assembly
Browse files Browse the repository at this point in the history
  • Loading branch information
rrwick committed Apr 15, 2017
1 parent 8817fcd commit 6945abe
Show file tree
Hide file tree
Showing 7 changed files with 124 additions and 68 deletions.
4 changes: 2 additions & 2 deletions test/test_blast_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_random_seq_with_exact_gene_forward_strand(self):

length_before_rotate = len(seq)
seg = unicycler.assembly_graph_segment.Segment(1, 1.0, seq, True)
seg.rotate_sequence(hit.start_pos, hit.flip, 0)
seg.rotate_sequence(hit.start_pos, hit.flip)

self.assertEqual(len(seg.forward_sequence), length_before_rotate)
self.assertTrue(seg.forward_sequence.startswith('ATGCAGGAACGCATTAAAGCGTGCTTTACCGAAAG'))
Expand All @@ -79,7 +79,7 @@ def test_random_seq_with_exact_gene_reverse_strand(self):

length_before_rotate = len(seq)
seg = unicycler.assembly_graph_segment.Segment(1, 1.0, seq, True)
seg.rotate_sequence(hit.start_pos, hit.flip, 0)
seg.rotate_sequence(hit.start_pos, hit.flip)

self.assertEqual(len(seg.forward_sequence), length_before_rotate)
self.assertTrue(seg.forward_sequence.startswith('ATGCAGGAACGCATTAAAGCGTGCTTTACCGAAAG'))
8 changes: 5 additions & 3 deletions unicycler/assembly_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,10 @@ def normalise_read_depths(self):
than 1.
"""
median_depth = self.get_median_read_depth()
if median_depth == 0.0:
return
for segment in self.segments.values():
segment.divide_depth(median_depth)
segment.depth /= median_depth

def get_total_length(self):
"""
Expand Down Expand Up @@ -348,7 +350,7 @@ def save_to_fasta(self, filename, newline=False, min_length=1, verbosity=1, sile
with open(filename, 'w') as fasta:
sorted_segments = sorted(self.segments.values(), key=lambda x: x.number)
for segment in sorted_segments:
if len(segment.forward_sequence) >= min_length:
if segment.get_length() >= min_length:
fasta.write(segment.get_fasta_name_and_description_line(circular_seg_nums))
fasta.write(add_line_breaks_to_sequence(segment.forward_sequence))

Expand All @@ -366,7 +368,7 @@ def save_specific_segments_to_fasta(filename, segments, silent=False):
fasta.write(add_line_breaks_to_sequence(segment.forward_sequence))

def save_to_gfa(self, filename, verbosity=1, save_copy_depth_info=False,
save_seg_type_info=False, newline=False, include_insert_size=True):
save_seg_type_info=False, newline=False, include_insert_size=False):
"""
Saves whole graph to a GFA file.
"""
Expand Down
11 changes: 2 additions & 9 deletions unicycler/assembly_graph_segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,6 @@ def build_other_sequence_if_necessary(self):
if not self.reverse_sequence:
self.reverse_sequence = reverse_complement(self.forward_sequence)

def divide_depth(self, divisor):
self.depth /= divisor

def get_length(self):
return len(self.forward_sequence)

Expand Down Expand Up @@ -184,18 +181,14 @@ def remove_sequence(self):
self.forward_sequence = ''
self.reverse_sequence = ''

def rotate_sequence(self, start_pos, flip, overlap):
def rotate_sequence(self, start_pos, flip):
"""
Rotates the sequence so it begins at start_pos. If flip is True, it also switches the
forward and reverse strands. This function assumes that the segment is a circular
completed replicon.
completed replicon with no overlap.
"""
unrotated_seq = self.forward_sequence
if overlap > 0:
unrotated_seq = unrotated_seq[:-overlap]
rotated_seq = unrotated_seq[start_pos:] + unrotated_seq[:start_pos]
if overlap > 0:
rotated_seq += rotated_seq[:overlap]
rev_comp_rotated_seq = reverse_complement(rotated_seq)

if flip:
Expand Down
58 changes: 28 additions & 30 deletions unicycler/miniasm_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
import subprocess
import sys
import itertools
from .misc import green, red, line_iterator, print_table, int_to_str, reverse_complement
import collections
from .misc import green, red, line_iterator, print_table, int_to_str, reverse_complement, gfa_path
from .minimap_alignment import align_long_reads_to_assembly_graph, range_overlap_size, \
load_minimap_alignments
from .string_graph import StringGraph, StringGraphSegment, \
Expand All @@ -46,7 +47,7 @@ def __str__(self):


def make_miniasm_string_graph(graph, out_dir, keep, threads, read_dict, long_read_filename,
scoring_scheme, racon_path, read_nicknames):
scoring_scheme, racon_path, read_nicknames, counter):
log.log_section_header('Assembling contigs and long reads with miniasm')
if graph is not None:
log.log_explanation('Unicycler uses miniasm to construct a string graph '
Expand All @@ -67,6 +68,8 @@ def make_miniasm_string_graph(graph, out_dir, keep, threads, read_dict, long_rea
if not os.path.exists(miniasm_dir):
os.makedirs(miniasm_dir)

short_reads_available = graph is not None

assembly_read_names = get_miniasm_assembly_reads(graph, read_dict, long_read_filename,
miniasm_dir, threads)

Expand Down Expand Up @@ -128,44 +131,34 @@ def make_miniasm_string_graph(graph, out_dir, keep, threads, read_dict, long_rea
log.log(' ' + str(len(string_graph.segments)) + ' segments, ' +
str(len(string_graph.links) // 2) + ' links', verbosity=2)

if not short_reads_available and keep > 0:
shutil.copyfile(string_graph_filename, gfa_path(out_dir, next(counter), 'string_graph'))

string_graph.remove_branching_paths()
if keep >= 3:
string_graph.save_to_gfa(branching_paths_removed_filename)
string_graph.save_to_gfa(branching_paths_removed_filename, include_depth=False)

log.log('')
unitig_graph = merge_string_graph_segments_into_unitig_graph(string_graph, read_nicknames)
if keep >= 3:
unitig_graph.save_to_gfa(unitig_graph_filename)
unitig_graph.save_to_gfa(unitig_graph_filename, include_depth=False)

if not short_reads_available and keep > 0:
shutil.copyfile(unitig_graph_filename, gfa_path(out_dir, next(counter), 'unitig_graph'))

polish_unitigs_with_racon(unitig_graph, miniasm_dir, read_dict, graph, racon_path, threads,
scoring_scheme)
if keep >= 3:
unitig_graph.save_to_gfa(racon_polished_filename)

if graph is not None:
if not short_reads_available and keep > 0:
shutil.copyfile(racon_polished_filename,
gfa_path(out_dir, next(counter), 'racon_polished'))

if short_reads_available:
unitig_graph = place_contigs(miniasm_dir, graph, unitig_graph, threads, scoring_scheme)
if keep >= 3:
unitig_graph.save_to_gfa(contigs_placed_filename)



# Build MiniasmBridge objects from the graph.
# TO DO
# TO DO
# TO DO
# TO DO
# TO DO
# TO DO
# TO DO
# TO DO
# TO DO
# TO DO
# TO DO
# TO DO
# TO DO



unitig_graph.save_to_gfa(contigs_placed_filename, include_depth=False)

if keep < 3:
shutil.rmtree(miniasm_dir)
Expand Down Expand Up @@ -300,9 +293,9 @@ def polish_unitigs_with_racon(unitig_graph, miniasm_dir, read_dict, graph, racon
fixed_fasta = os.path.join(polish_dir, ('%03d' % next(counter)) + '_fixed.fasta')
rotated_fasta = os.path.join(polish_dir, ('%03d' % next(counter)) + '_rotated.fasta')

mapping_quality = make_racon_polish_alignments(current_fasta, mappings_filename,
polish_reads, threads, scoring_scheme,
aligner=racon_aligner)
mapping_quality, unitig_depths = \
make_racon_polish_alignments(current_fasta, mappings_filename, polish_reads, threads,
scoring_scheme, aligner=racon_aligner)

racon_table_row = ['begin' if polish_round_count == 0 else str(polish_round_count),
int_to_str(unitig_graph.get_total_segment_length()),
Expand Down Expand Up @@ -359,6 +352,9 @@ def polish_unitigs_with_racon(unitig_graph, miniasm_dir, read_dict, graph, racon
segment = unitig_graph.segments[unitig_name]
segment.forward_sequence = unitig_seq
segment.reverse_sequence = reverse_complement(unitig_seq)
if unitig_name in unitig_depths:
segment.depth = unitig_depths[unitig_name]
unitig_graph.normalise_read_depths()
else:
log.log(red('Polishing failed'))

Expand Down Expand Up @@ -680,6 +676,7 @@ def find_contig_starts_and_ends(miniasm_dir, assembly_graph, unitig_graph, threa
def make_racon_polish_alignments(current_fasta, mappings_filename, polish_reads, threads,
scoring_scheme, aligner='minimap'):
mapping_quality = 0
unitig_depths = collections.defaultdict(float)

if aligner == 'minimap':
minimap_alignments_str = minimap_align_reads(current_fasta, polish_reads, threads, 3,
Expand All @@ -693,6 +690,7 @@ def make_racon_polish_alignments(current_fasta, mappings_filename, polish_reads,
mappings.write(a.paf_line)
mappings.write('\n')
mapping_quality += a.matching_bases
unitig_depths[a.ref_name] += a.fraction_ref_aligned()

elif aligner == 'GraphMap':
command = ['graphmap', 'align', '-a', 'anchor', '--rebuild-index',
Expand Down Expand Up @@ -721,4 +719,4 @@ def make_racon_polish_alignments(current_fasta, mappings_filename, polish_reads,
else:
assert False

return mapping_quality
return mapping_quality, unitig_depths
4 changes: 4 additions & 0 deletions unicycler/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1056,3 +1056,7 @@ def simplify_ranges(ranges):
def remove_dupes_preserve_order(lst):
seen = set()
return [x for x in lst if not (x in seen or seen.add(x))]


def gfa_path(out_dir, file_num, name):
return os.path.join(out_dir, str(file_num).zfill(3) + '_' + name + '.gfa')
76 changes: 66 additions & 10 deletions unicycler/string_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,24 @@ def __init__(self, filename):
self.links[rev_link_tuple].seg_2_overlap = seg_1_to_seg_2_overlap
self.reverse_links = build_reverse_links(self.forward_links)

def save_to_gfa(self, filename, verbosity=1, newline=False):
def save_to_gfa(self, filename, verbosity=1, newline=False, include_depth=True):
"""
Saves whole graph to a GFA file.
"""
log.log(('\n' if newline else '') + 'Saving ' + filename, verbosity)
with open(filename, 'w') as gfa:
for segment in sorted(self.segments.values(), key=lambda x: x.full_name):
gfa.write(segment.gfa_segment_line())
gfa.write(segment.gfa_segment_line(include_depth))
for link in sorted(self.links.keys()):
gfa.write(self.links[link].gfa_link_line())

def save_to_fasta(self, filename, min_length=1):
with open(filename, 'w') as fasta:
for segment in sorted(self.segments.values(), reverse=True,
key=lambda x: x.get_length()):
if segment.get_length() >= min_length:
fasta.write(segment.fasta_record())

def get_preceding_segments(self, seg_name):
if seg_name not in self.reverse_links:
return []
Expand Down Expand Up @@ -259,12 +266,6 @@ def seq_from_signed_seg_name(self, signed_name):
else:
return self.segments[unsigned_seg_name].reverse_sequence

def save_to_fasta(self, filename):
with open(filename, 'w') as fasta:
for segment in sorted(self.segments.values(), reverse=True,
key=lambda x: x.get_length()):
fasta.write(segment.fasta_record())

def save_non_contigs_to_file(self, filename, min_length):
"""
Saves all graph segments which are not short read contigs to a FASTA file.
Expand Down Expand Up @@ -352,6 +353,17 @@ def segment_is_circular(self, seg_name):
following_seg_name = following_segments[0]
return preceding_seg_name == pos_seg_name and following_seg_name == pos_seg_name


def completed_circular_replicons(self):
completed_components = []
single_segment_components = [x for x in self.get_connected_components() if len(x) == 1]
for component in single_segment_components:
seg = component[0]
if self.segment_is_circular(seg):
completed_components.append(seg)
return completed_components


def get_connected_components(self):
"""
Returns a list of lists, where each inner list is the segment names of one connected
Expand Down Expand Up @@ -466,13 +478,37 @@ def rotate_circular_sequences(self, shift_fraction=0.70710678118655):
def get_total_segment_length(self):\
return sum(s.get_length() for s in self.segments.values())

def get_median_read_depth(self):
"""
Returns the assembly graph's median read depth (by base).
"""
sorted_segments = sorted(self.segments.values(), key=lambda x: x.depth)
total_length = 0
for segment in sorted_segments:
total_length += segment.get_length()
halfway_length = total_length // 2
length_so_far = 0
for segment in sorted_segments:
length_so_far += segment.get_length()
if length_so_far >= halfway_length:
return segment.depth
return 0.0

def normalise_read_depths(self):
median_depth = self.get_median_read_depth()
if median_depth == 0.0:
return
for segment in self.segments.values():
segment.depth /= median_depth


class StringGraphSegment(object):

def __init__(self, full_name, sequence, qual=None):
self.full_name = full_name
self.forward_sequence = sequence
self.reverse_sequence = reverse_complement(sequence)
self.depth = 1.0

# Miniasm trims reads and puts the start/end positions in the name...
try:
Expand Down Expand Up @@ -507,13 +543,33 @@ def __repr__(self):
def get_length(self):
return len(self.forward_sequence)

def gfa_segment_line(self):
return '\t'.join(['S', self.full_name, self.forward_sequence]) + '\n'
def gfa_segment_line(self, include_depth=True):
s_line_parts = ['S', self.full_name, self.forward_sequence, 'LN:i:', str(self.get_length())]
if include_depth:
s_line_parts += ['dp:f:', str(self.depth)]
return '\t'.join(s_line_parts) + '\n'

def fasta_record(self):
return ''.join(['>', self.full_name, '\n',
add_line_breaks_to_sequence(self.forward_sequence, 70)])

def rotate_sequence(self, start_pos, flip):
"""
Rotates the sequence so it begins at start_pos. If flip is True, it also switches the
forward and reverse strands. This function assumes that the segment is a circular
completed replicon with no overlap.
"""
unrotated_seq = self.forward_sequence
rotated_seq = unrotated_seq[start_pos:] + unrotated_seq[:start_pos]
rev_comp_rotated_seq = reverse_complement(rotated_seq)

if flip:
self.forward_sequence = rev_comp_rotated_seq
self.reverse_sequence = rotated_seq
else:
self.forward_sequence = rotated_seq
self.reverse_sequence = rev_comp_rotated_seq


class StringGraphLink(object):

Expand Down
Loading

0 comments on commit 6945abe

Please sign in to comment.