From 6945abe1bb735377e7701311527f10a4149a517e Mon Sep 17 00:00:00 2001 From: Ryan Wick Date: Sat, 15 Apr 2017 17:59:12 +1000 Subject: [PATCH] Fixes for Nanopore-only assembly --- test/test_blast_func.py | 4 +- unicycler/assembly_graph.py | 8 +-- unicycler/assembly_graph_segment.py | 11 +---- unicycler/miniasm_assembly.py | 58 +++++++++++----------- unicycler/misc.py | 4 ++ unicycler/string_graph.py | 76 +++++++++++++++++++++++++---- unicycler/unicycler.py | 31 ++++++------ 7 files changed, 124 insertions(+), 68 deletions(-) diff --git a/test/test_blast_func.py b/test/test_blast_func.py index b432c686..3c3ec675 100644 --- a/test/test_blast_func.py +++ b/test/test_blast_func.py @@ -61,7 +61,7 @@ def test_random_seq_with_exact_gene_forward_strand(self): length_before_rotate = len(seq) seg = unicycler.assembly_graph_segment.Segment(1, 1.0, seq, True) - seg.rotate_sequence(hit.start_pos, hit.flip, 0) + seg.rotate_sequence(hit.start_pos, hit.flip) self.assertEqual(len(seg.forward_sequence), length_before_rotate) self.assertTrue(seg.forward_sequence.startswith('ATGCAGGAACGCATTAAAGCGTGCTTTACCGAAAG')) @@ -79,7 +79,7 @@ def test_random_seq_with_exact_gene_reverse_strand(self): length_before_rotate = len(seq) seg = unicycler.assembly_graph_segment.Segment(1, 1.0, seq, True) - seg.rotate_sequence(hit.start_pos, hit.flip, 0) + seg.rotate_sequence(hit.start_pos, hit.flip) self.assertEqual(len(seg.forward_sequence), length_before_rotate) self.assertTrue(seg.forward_sequence.startswith('ATGCAGGAACGCATTAAAGCGTGCTTTACCGAAAG')) diff --git a/unicycler/assembly_graph.py b/unicycler/assembly_graph.py index 90ea2897..77225653 100644 --- a/unicycler/assembly_graph.py +++ b/unicycler/assembly_graph.py @@ -302,8 +302,10 @@ def normalise_read_depths(self): than 1. """ median_depth = self.get_median_read_depth() + if median_depth == 0.0: + return for segment in self.segments.values(): - segment.divide_depth(median_depth) + segment.depth /= median_depth def get_total_length(self): """ @@ -348,7 +350,7 @@ def save_to_fasta(self, filename, newline=False, min_length=1, verbosity=1, sile with open(filename, 'w') as fasta: sorted_segments = sorted(self.segments.values(), key=lambda x: x.number) for segment in sorted_segments: - if len(segment.forward_sequence) >= min_length: + if segment.get_length() >= min_length: fasta.write(segment.get_fasta_name_and_description_line(circular_seg_nums)) fasta.write(add_line_breaks_to_sequence(segment.forward_sequence)) @@ -366,7 +368,7 @@ def save_specific_segments_to_fasta(filename, segments, silent=False): fasta.write(add_line_breaks_to_sequence(segment.forward_sequence)) def save_to_gfa(self, filename, verbosity=1, save_copy_depth_info=False, - save_seg_type_info=False, newline=False, include_insert_size=True): + save_seg_type_info=False, newline=False, include_insert_size=False): """ Saves whole graph to a GFA file. """ diff --git a/unicycler/assembly_graph_segment.py b/unicycler/assembly_graph_segment.py index d6afcd2a..acdc2f2f 100644 --- a/unicycler/assembly_graph_segment.py +++ b/unicycler/assembly_graph_segment.py @@ -60,9 +60,6 @@ def build_other_sequence_if_necessary(self): if not self.reverse_sequence: self.reverse_sequence = reverse_complement(self.forward_sequence) - def divide_depth(self, divisor): - self.depth /= divisor - def get_length(self): return len(self.forward_sequence) @@ -184,18 +181,14 @@ def remove_sequence(self): self.forward_sequence = '' self.reverse_sequence = '' - def rotate_sequence(self, start_pos, flip, overlap): + def rotate_sequence(self, start_pos, flip): """ Rotates the sequence so it begins at start_pos. If flip is True, it also switches the forward and reverse strands. This function assumes that the segment is a circular - completed replicon. + completed replicon with no overlap. """ unrotated_seq = self.forward_sequence - if overlap > 0: - unrotated_seq = unrotated_seq[:-overlap] rotated_seq = unrotated_seq[start_pos:] + unrotated_seq[:start_pos] - if overlap > 0: - rotated_seq += rotated_seq[:overlap] rev_comp_rotated_seq = reverse_complement(rotated_seq) if flip: diff --git a/unicycler/miniasm_assembly.py b/unicycler/miniasm_assembly.py index be813ae9..3650fedf 100644 --- a/unicycler/miniasm_assembly.py +++ b/unicycler/miniasm_assembly.py @@ -19,7 +19,8 @@ import subprocess import sys import itertools -from .misc import green, red, line_iterator, print_table, int_to_str, reverse_complement +import collections +from .misc import green, red, line_iterator, print_table, int_to_str, reverse_complement, gfa_path from .minimap_alignment import align_long_reads_to_assembly_graph, range_overlap_size, \ load_minimap_alignments from .string_graph import StringGraph, StringGraphSegment, \ @@ -46,7 +47,7 @@ def __str__(self): def make_miniasm_string_graph(graph, out_dir, keep, threads, read_dict, long_read_filename, - scoring_scheme, racon_path, read_nicknames): + scoring_scheme, racon_path, read_nicknames, counter): log.log_section_header('Assembling contigs and long reads with miniasm') if graph is not None: log.log_explanation('Unicycler uses miniasm to construct a string graph ' @@ -67,6 +68,8 @@ def make_miniasm_string_graph(graph, out_dir, keep, threads, read_dict, long_rea if not os.path.exists(miniasm_dir): os.makedirs(miniasm_dir) + short_reads_available = graph is not None + assembly_read_names = get_miniasm_assembly_reads(graph, read_dict, long_read_filename, miniasm_dir, threads) @@ -128,44 +131,34 @@ def make_miniasm_string_graph(graph, out_dir, keep, threads, read_dict, long_rea log.log(' ' + str(len(string_graph.segments)) + ' segments, ' + str(len(string_graph.links) // 2) + ' links', verbosity=2) + if not short_reads_available and keep > 0: + shutil.copyfile(string_graph_filename, gfa_path(out_dir, next(counter), 'string_graph')) + string_graph.remove_branching_paths() if keep >= 3: - string_graph.save_to_gfa(branching_paths_removed_filename) + string_graph.save_to_gfa(branching_paths_removed_filename, include_depth=False) log.log('') unitig_graph = merge_string_graph_segments_into_unitig_graph(string_graph, read_nicknames) if keep >= 3: - unitig_graph.save_to_gfa(unitig_graph_filename) + unitig_graph.save_to_gfa(unitig_graph_filename, include_depth=False) + + if not short_reads_available and keep > 0: + shutil.copyfile(unitig_graph_filename, gfa_path(out_dir, next(counter), 'unitig_graph')) polish_unitigs_with_racon(unitig_graph, miniasm_dir, read_dict, graph, racon_path, threads, scoring_scheme) if keep >= 3: unitig_graph.save_to_gfa(racon_polished_filename) - if graph is not None: + if not short_reads_available and keep > 0: + shutil.copyfile(racon_polished_filename, + gfa_path(out_dir, next(counter), 'racon_polished')) + + if short_reads_available: unitig_graph = place_contigs(miniasm_dir, graph, unitig_graph, threads, scoring_scheme) if keep >= 3: - unitig_graph.save_to_gfa(contigs_placed_filename) - - - - # Build MiniasmBridge objects from the graph. - # TO DO - # TO DO - # TO DO - # TO DO - # TO DO - # TO DO - # TO DO - # TO DO - # TO DO - # TO DO - # TO DO - # TO DO - # TO DO - - - + unitig_graph.save_to_gfa(contigs_placed_filename, include_depth=False) if keep < 3: shutil.rmtree(miniasm_dir) @@ -300,9 +293,9 @@ def polish_unitigs_with_racon(unitig_graph, miniasm_dir, read_dict, graph, racon fixed_fasta = os.path.join(polish_dir, ('%03d' % next(counter)) + '_fixed.fasta') rotated_fasta = os.path.join(polish_dir, ('%03d' % next(counter)) + '_rotated.fasta') - mapping_quality = make_racon_polish_alignments(current_fasta, mappings_filename, - polish_reads, threads, scoring_scheme, - aligner=racon_aligner) + mapping_quality, unitig_depths = \ + make_racon_polish_alignments(current_fasta, mappings_filename, polish_reads, threads, + scoring_scheme, aligner=racon_aligner) racon_table_row = ['begin' if polish_round_count == 0 else str(polish_round_count), int_to_str(unitig_graph.get_total_segment_length()), @@ -359,6 +352,9 @@ def polish_unitigs_with_racon(unitig_graph, miniasm_dir, read_dict, graph, racon segment = unitig_graph.segments[unitig_name] segment.forward_sequence = unitig_seq segment.reverse_sequence = reverse_complement(unitig_seq) + if unitig_name in unitig_depths: + segment.depth = unitig_depths[unitig_name] + unitig_graph.normalise_read_depths() else: log.log(red('Polishing failed')) @@ -680,6 +676,7 @@ def find_contig_starts_and_ends(miniasm_dir, assembly_graph, unitig_graph, threa def make_racon_polish_alignments(current_fasta, mappings_filename, polish_reads, threads, scoring_scheme, aligner='minimap'): mapping_quality = 0 + unitig_depths = collections.defaultdict(float) if aligner == 'minimap': minimap_alignments_str = minimap_align_reads(current_fasta, polish_reads, threads, 3, @@ -693,6 +690,7 @@ def make_racon_polish_alignments(current_fasta, mappings_filename, polish_reads, mappings.write(a.paf_line) mappings.write('\n') mapping_quality += a.matching_bases + unitig_depths[a.ref_name] += a.fraction_ref_aligned() elif aligner == 'GraphMap': command = ['graphmap', 'align', '-a', 'anchor', '--rebuild-index', @@ -721,4 +719,4 @@ def make_racon_polish_alignments(current_fasta, mappings_filename, polish_reads, else: assert False - return mapping_quality \ No newline at end of file + return mapping_quality, unitig_depths \ No newline at end of file diff --git a/unicycler/misc.py b/unicycler/misc.py index cb65a0f3..1fba9c99 100644 --- a/unicycler/misc.py +++ b/unicycler/misc.py @@ -1056,3 +1056,7 @@ def simplify_ranges(ranges): def remove_dupes_preserve_order(lst): seen = set() return [x for x in lst if not (x in seen or seen.add(x))] + + +def gfa_path(out_dir, file_num, name): + return os.path.join(out_dir, str(file_num).zfill(3) + '_' + name + '.gfa') diff --git a/unicycler/string_graph.py b/unicycler/string_graph.py index ea91dc3b..225fedf7 100644 --- a/unicycler/string_graph.py +++ b/unicycler/string_graph.py @@ -73,17 +73,24 @@ def __init__(self, filename): self.links[rev_link_tuple].seg_2_overlap = seg_1_to_seg_2_overlap self.reverse_links = build_reverse_links(self.forward_links) - def save_to_gfa(self, filename, verbosity=1, newline=False): + def save_to_gfa(self, filename, verbosity=1, newline=False, include_depth=True): """ Saves whole graph to a GFA file. """ log.log(('\n' if newline else '') + 'Saving ' + filename, verbosity) with open(filename, 'w') as gfa: for segment in sorted(self.segments.values(), key=lambda x: x.full_name): - gfa.write(segment.gfa_segment_line()) + gfa.write(segment.gfa_segment_line(include_depth)) for link in sorted(self.links.keys()): gfa.write(self.links[link].gfa_link_line()) + def save_to_fasta(self, filename, min_length=1): + with open(filename, 'w') as fasta: + for segment in sorted(self.segments.values(), reverse=True, + key=lambda x: x.get_length()): + if segment.get_length() >= min_length: + fasta.write(segment.fasta_record()) + def get_preceding_segments(self, seg_name): if seg_name not in self.reverse_links: return [] @@ -259,12 +266,6 @@ def seq_from_signed_seg_name(self, signed_name): else: return self.segments[unsigned_seg_name].reverse_sequence - def save_to_fasta(self, filename): - with open(filename, 'w') as fasta: - for segment in sorted(self.segments.values(), reverse=True, - key=lambda x: x.get_length()): - fasta.write(segment.fasta_record()) - def save_non_contigs_to_file(self, filename, min_length): """ Saves all graph segments which are not short read contigs to a FASTA file. @@ -352,6 +353,17 @@ def segment_is_circular(self, seg_name): following_seg_name = following_segments[0] return preceding_seg_name == pos_seg_name and following_seg_name == pos_seg_name + + def completed_circular_replicons(self): + completed_components = [] + single_segment_components = [x for x in self.get_connected_components() if len(x) == 1] + for component in single_segment_components: + seg = component[0] + if self.segment_is_circular(seg): + completed_components.append(seg) + return completed_components + + def get_connected_components(self): """ Returns a list of lists, where each inner list is the segment names of one connected @@ -466,6 +478,29 @@ def rotate_circular_sequences(self, shift_fraction=0.70710678118655): def get_total_segment_length(self):\ return sum(s.get_length() for s in self.segments.values()) + def get_median_read_depth(self): + """ + Returns the assembly graph's median read depth (by base). + """ + sorted_segments = sorted(self.segments.values(), key=lambda x: x.depth) + total_length = 0 + for segment in sorted_segments: + total_length += segment.get_length() + halfway_length = total_length // 2 + length_so_far = 0 + for segment in sorted_segments: + length_so_far += segment.get_length() + if length_so_far >= halfway_length: + return segment.depth + return 0.0 + + def normalise_read_depths(self): + median_depth = self.get_median_read_depth() + if median_depth == 0.0: + return + for segment in self.segments.values(): + segment.depth /= median_depth + class StringGraphSegment(object): @@ -473,6 +508,7 @@ def __init__(self, full_name, sequence, qual=None): self.full_name = full_name self.forward_sequence = sequence self.reverse_sequence = reverse_complement(sequence) + self.depth = 1.0 # Miniasm trims reads and puts the start/end positions in the name... try: @@ -507,13 +543,33 @@ def __repr__(self): def get_length(self): return len(self.forward_sequence) - def gfa_segment_line(self): - return '\t'.join(['S', self.full_name, self.forward_sequence]) + '\n' + def gfa_segment_line(self, include_depth=True): + s_line_parts = ['S', self.full_name, self.forward_sequence, 'LN:i:', str(self.get_length())] + if include_depth: + s_line_parts += ['dp:f:', str(self.depth)] + return '\t'.join(s_line_parts) + '\n' def fasta_record(self): return ''.join(['>', self.full_name, '\n', add_line_breaks_to_sequence(self.forward_sequence, 70)]) + def rotate_sequence(self, start_pos, flip): + """ + Rotates the sequence so it begins at start_pos. If flip is True, it also switches the + forward and reverse strands. This function assumes that the segment is a circular + completed replicon with no overlap. + """ + unrotated_seq = self.forward_sequence + rotated_seq = unrotated_seq[start_pos:] + unrotated_seq[:start_pos] + rev_comp_rotated_seq = reverse_complement(rotated_seq) + + if flip: + self.forward_sequence = rev_comp_rotated_seq + self.reverse_sequence = rotated_seq + else: + self.forward_sequence = rotated_seq + self.reverse_sequence = rev_comp_rotated_seq + class StringGraphLink(object): diff --git a/unicycler/unicycler.py b/unicycler/unicycler.py index bc5c9174..75ff4942 100755 --- a/unicycler/unicycler.py +++ b/unicycler/unicycler.py @@ -34,7 +34,7 @@ get_default_thread_count, spades_path_and_version, makeblastdb_path_and_version, \ tblastn_path_and_version, bowtie2_build_path_and_version, bowtie2_path_and_version, \ samtools_path_and_version, java_path_and_version, pilon_path_and_version, \ - racon_path_and_version + racon_path_and_version, gfa_path from .spades_func import get_best_spades_graph from .blast_func import find_start_gene, CannotFindStart from .unicycler_align import add_aligning_arguments, fix_up_arguments, AlignmentScoringScheme, \ @@ -68,6 +68,7 @@ def main(): counter = itertools.count(start=1) bridges = [] + # If we have short reads, do all the SPAdes stuff. if short_reads_available: # Produce a SPAdes assembly graph with a k-mer that balances contig length and connectivity. @@ -84,13 +85,14 @@ def main(): args.no_correct, args.linear_seqs) single_copy_segments = get_single_copy_segments(graph, 0) if args.keep > 0: - graph.save_to_gfa(best_spades_graph, save_copy_depth_info=True, newline=True) + graph.save_to_gfa(best_spades_graph, save_copy_depth_info=True, newline=True, + include_insert_size=True) clean_up_spades_graph(graph) if args.keep > 2: overlap_removed_graph_filename = gfa_path(args.out, next(counter), 'overlaps_removed') graph.save_to_gfa(overlap_removed_graph_filename, save_copy_depth_info=True, - newline=True) + newline=True, include_insert_size=True) # TO DO: SHORT READ ALIGNMENT TO GRAPH # * This would be very useful for a number of reasons: @@ -137,7 +139,7 @@ def main(): if long_reads_available: string_graph = make_miniasm_string_graph(graph, args.out, args.keep, args.threads, read_dict, long_read_filename, scoring_scheme, - args.racon_path, read_nicknames) + args.racon_path, read_nicknames, counter) else: string_graph = None @@ -197,7 +199,7 @@ def main(): # Save the final state as both a GFA and FASTA file. log.log_section_header('Complete') - graph.save_to_gfa(os.path.join(args.out, 'assembly.gfa'), include_insert_size=False) + graph.save_to_gfa(os.path.join(args.out, 'assembly.gfa')) graph.save_to_fasta(os.path.join(args.out, 'assembly.fasta'), min_length=args.min_fasta_length) log.log('') @@ -715,8 +717,6 @@ def quit_if_dependency_problem(spades_status, racon_status, makeblastdb_status, quit_with_error('Unspecified error with Unicycler dependencies') -def gfa_path(out_dir, file_num, name): - return os.path.join(out_dir, str(file_num).zfill(3) + '_' + name + '.gfa') def rotate_completed_replicons(graph, args, counter): @@ -735,12 +735,15 @@ def rotate_completed_replicons(graph, args, counter): for completed_replicon in completed_replicons: segment = graph.segments[completed_replicon] sequence = segment.forward_sequence - if graph.overlap > 0: - sequence = sequence[:-graph.overlap] - depth = segment.depth - log.log('Segment ' + str(segment.number) + ':', 2) - rotation_result_row = [str(segment.number), int_to_str(len(sequence)), - float_to_str(depth, 2) + 'x'] + + try: + seg_name = str(segment.number) + except AttributeError: + seg_name = segment.full_name + + log.log('Segment ' + seg_name + ':', 2) + rotation_result_row = [seg_name, int_to_str(len(sequence)), + float_to_str(segment.depth, 2) + 'x'] try: blast_hit = find_start_gene(sequence, args.start_genes, args.start_gene_id, args.start_gene_cov, blast_dir, args.makeblastdb_path, @@ -752,7 +755,7 @@ def rotate_completed_replicons(graph, args, counter): 'reverse' if blast_hit.flip else 'forward', '%.1f' % blast_hit.pident + '%', '%.1f' % blast_hit.query_cov + '%'] - segment.rotate_sequence(blast_hit.start_pos, blast_hit.flip, graph.overlap) + segment.rotate_sequence(blast_hit.start_pos, blast_hit.flip) rotation_count += 1 rotation_result_table.append(rotation_result_row)