PerlLib/CDNA/Genome_based_cDNA_assembler.pm

#!/usr/local/bin/perl

package main;
our $SEE;


package CDNA::Genome_based_cDNA_assembler;

=head1 NAME

CDNA::Genome_based_cdna_assembler

=cut

=head1 DESCRIPTION

This module is used to assemble compatible cDNA alignments.  The algorithm is as follows:
must describe this here.

=cut

use strict;
use CDNA::CDNA_alignment;
use Data::Dumper;

my $DELIMETER = "$;,";
my $FUZZLENGTH = 20;


=item new()

=over 4

B<Description:> instantiates a new cDNA assembler obj.

B<Parameters:> $sequence_sref

$sequence_sref is a reference to a scalar containing the genomic sequence string.

B<Returns:> $obj_href

$obj_href is the object reference newly instantiated by this new method.

=back

=cut

sub new () {
    my $package_name = shift;
    my $self = {};
    bless ($self, $package_name);
    $self->_init(@_);
    return ($self);
}


sub _init {
    my $self = shift;
    my ($sequence_ref) = @_;
    $self->{incoming_alignments} = []; #these are the alignments to be assembled.
    $self->{assemblies} = []; #contains list of all singletons and assemblies.
    $self->{sequence_ref} = $sequence_ref;
    $self->{fuzzlength} = $FUZZLENGTH;  #default setting.
}


=item assemble_alignments()

=over 4

B<DESCRIPTION:> assembles a series of cDNA aligmnments into one or more cDNA assemblies

B<Parameters:> @alignments

@alignments is an array of CDNA::CDNA_alignment objects

B<Returns:> none.

=back

=cut

sub assemble_alignments {
    my $self = shift;
    my @alignments = @_;
    @alignments = reverse sort {$a->{length}<=>$b->{length}} @alignments; #keep in order of decreasing alignment length.
    $self->{incoming_alignments} = [@alignments];
    #return;
 
    ## Algorithm: given cdna, merge with rest of cDNAs iteratively until merging complete.
    ##            find unmerged cDNA, merge it if possible with all individual cdna entries.
    ##            continue until all cDNAs have merged products, if possible.
    
    my $num_alignments = $#alignments + 1;
    for (my $i = 0; $i < $num_alignments; $i++) {
	my $seed_alignment = $alignments[$i];
	if ($seed_alignment->{merged}) {next;}
	print "Checking seed $i\n" if $SEE;
	my $merged_alignment = $seed_alignment; #initialize to seed alignment.
	my $merged_flag = 1;
	my $round = 0;
	while ($merged_flag) {
	    $merged_flag = 0;
	    $round++;
	    print "Merging round $round\n" if $SEE;
	    for (my $j=0; $j < $num_alignments; $j++) {
		if ($i == $j) {next;} #no self comparisons.
		my $other_alignment = $alignments[$j];
		if ($self->already_contains($merged_alignment, $other_alignment)) { next;}
		if ($self->can_merge($merged_alignment, $other_alignment)) {
		    my $initial_merged_fli_status = $merged_alignment->is_fli();
		    my $initial_merged_orient = $merged_alignment->get_orientation();
		    $merged_alignment = $self->merge_alignments($merged_alignment, $other_alignment);
		    $alignments[$i]->{merged} = 1; #set seed alignment merge flag.
		    $alignments[$j]->{merged} = 1; #set other alignment merge flag.
		    $merged_flag = 1; #indicates something actually merged this round.
		    
		    $merged_alignment->remap_cdna_segment_coords();
		    print "merged: " . $merged_alignment->toToken() . "\n" if $SEE;
		    
		}
	    }
	}
	push (@{$self->{assemblies}}, $merged_alignment); #either an assembled product, or something that won't ever merge.
    }
}

=item get_assemblies()

=over 4

B<Description:> returns all the alignment assemblies resulting from the assembly procedure.

B<Parameters:> none.

B<Returns:> @assemblies

@assemblies is an array of CDNA::CDNA_alignment objects.

use the get_acc() method of the alignment object to retrieve all the accessions of the cDNAs that were merged into the assembly.

=back

=cut


sub get_assemblies {
    my $self = shift;
    return (@{$self->{assemblies}});
}


# private method.  Determines if two alignments are compatible with one another.
sub can_merge () {
    my $self = shift;
    my ($a1, $a2) = @_;
    print "Checking to see if can merge: " . $a1->get_acc() . ", " . $a2->get_acc() . "\n" if $::SEE;
    ## See if the coord spans overlap
    my ($a1_lend, $a1_rend) = $a1->get_coords();
    my ($a2_lend, $a2_rend) = $a2->get_coords();
    unless (&overlap($a1_lend, $a1_rend, $a2_lend, $a2_rend)) {
	print "failed merge: No overlap between alignment spans. ($a1_lend, $a1_rend) vs. ($a2_lend, $a2_rend)\n" if $::SEE;
	return(0);
    }
    
    ## Make sure the spliced orientation is equivalent if appropriate
    my $a1_num_segs = $a1->get_num_segments();
    my $a2_num_segs = $a2->get_num_segments();
    my $a1_spliced_orientation = $a1->get_spliced_orientation();
    my $a2_spliced_orientation = $a2->get_spliced_orientation();
    my $a1_is_fli = $a1->is_fli();
    my $a2_is_fli = $a2->is_fli();

    my $fuzzlength = $self->{fuzzlength};

    if ($a1_num_segs > 1 && $a2_num_segs > 1) { #if more than one segment, then spliced orientation is relevant.
	if ($a1_spliced_orientation ne $a2_spliced_orientation) {
	    print "failed merge: $a1_num_segs segments vs. $a2_num_segs and opposite spliced orientations.\n" if $::SEE;
	    return (0);
	}
    }
    
    if ($a1_is_fli && $a2_is_fli && ($a1_spliced_orientation ne $a2_spliced_orientation)) { #fli's must have same orient.
	print "failed merge: (a1-fli: $a1_is_fli, a2-fli: $a2_is_fli) and opposite orientations.\n" if $SEE;
	return (0);
    }

    ## Check all overlapping segments to ensure non-conflicting segments.
    my @a1_segments = $a1->get_alignment_segments();
    my @a2_segments = $a2->get_alignment_segments();
    
    
    ## align segment orders between a1 and a2
    my ($starting_a1, $starting_a2);
    for (my $i = 0; $i <= $#a1_segments; $i++) {
	my $a1_seg = $a1_segments[$i];
	my ($a1_lend, $a1_rend) = $a1_seg->get_coords();
	for (my $j = 0; $j <= $#a2_segments; $j++) {
	    my $a2_seg = $a2_segments[$j];
	    my ($a2_lend, $a2_rend) = $a2_seg->get_coords();
	    if (&overlap($a1_lend, $a1_rend, $a2_lend, $a2_rend)) {
		$starting_a1 = $i;
		$starting_a2 = $j;
		last;
	    }
	} 
	if (defined ($starting_a1) && defined ($starting_a2)) {
	    last;
	}
    }
    
    unless (defined ($starting_a1) && defined ($starting_a2)) {
	print "failed merge:  can't align two segments between overlapping alignments.\n" if $::SEE;
	return (0);
    }
    
    unless ($starting_a1 == 0 || $starting_a2 == 0) {
	print "failed merge: segment alignment doesn't begin at either cDNA terminus.\n" if $::SEE;
	return (0);
    }
    
    while ($starting_a1 <= $#a1_segments && $starting_a2 <= $#a2_segments) {
	my $a1_segment = $a1_segments[$starting_a1];
	my $a2_segment = $a2_segments[$starting_a2];
	
	my ($a1_lend, $a1_rend) = $a1_segment->get_coords();
	my ($a2_lend, $a2_rend) = $a2_segment->get_coords();

	if (&overlap($a1_lend, $a1_rend, $a2_lend, $a2_rend)) {
	    ## See if have splice sites, do they exist and are they identical
	    if ($a1_segment->has_left_splice_junction() || $a2_segment->has_left_splice_junction()) {
		if ($a1_segment->has_left_splice_junction() && $a2_segment->has_left_splice_junction() && $a1_lend != $a2_lend) {
		    print "failed merge:\tboth left splice, but unequal coords: L1 ($a1_lend), L2 ($a2_lend)\n" if $::SEE;
		    return (0);
		} elsif ($a1_segment->has_left_splice_junction() && ($a2_lend + $fuzzlength < $a1_lend)) { #alignment extends beyond a splice junction.
		    print "failed merge:\tL1 left splice, L2 ($a2_lend) < L1 ($a1_lend)\n" if $::SEE;
		    return (0);
		} elsif ($a2_segment->has_left_splice_junction() && ($a1_lend + $fuzzlength < $a2_lend)) {
		    print "failed merge:\tL2 left splice, L1 ($a1_lend) < L2 ($a2_lend)\n" if $::SEE; 
		    return (0);
		}
		
	    }
	    if ($a1_segment->has_right_splice_junction() || $a2_segment->has_right_splice_junction()) {
		if ($a1_segment->has_right_splice_junction() && $a2_segment->has_right_splice_junction() && $a1_rend != $a2_rend) {
		    print "failed merge:\tboth right splice, but unequal coords: R1($a1_rend), R2($a2_rend)\n" if $::SEE;
		    return (0);
		} elsif ($a1_segment->has_right_splice_junction() && ($a2_rend - $fuzzlength > $a1_rend)) {
		    print "failed merge:\tR1 right splice, R2($a2_rend) > R1 ($a1_rend)\n" if $::SEE;
		    return (0);
		} elsif ($a2_segment->has_right_splice_junction() && ($a1_rend - $fuzzlength > $a2_rend)) {
		    print "failed merge:\tR2 right splice, R1 ($a1_rend) > R2 ($a2_rend)\n" if $::SEE;
		    return (0);
		}
	    }
	} else {
	    print "failed merge: Two ordered segments don't overlap. ($a1_lend, $a1_rend) , ($a2_lend, $a2_rend)\n" if $::SEE;
	    return (0);
	}
	$starting_a1++;
	$starting_a2++;
	
    }

    ## Passed all tests
    print "Merge tests PASSED.\n" if $::SEE;
    return (1);

}


# private method
# merges two alignment objects together into an assembly.
sub merge_alignments () {
    my ($self, $a1, $a2) = @_;
    print "Merging <" . $a1->get_acc() . ">, <" . $a2->get_acc() . ">\n" if $::SEE;
    my $a1_fli = $a1->is_fli();
    my $a2_fli = $a2->is_fli();
    print "a1_fli: $a1_fli, a2_fli: $a2_fli\n" if $SEE;
    ## Determine fli status for merged product.
    my $merged_fli_status = ($a1_fli || $a2_fli);
    
    my $merged_orientation = $self->determine_merged_orientation($a1, $a2);
    
    #get a1 segment cooridnates;
    my %leftsplicecoords; #preferrentially use splice coords over Fuzzlength extensions.
    my %rightsplicecoords;
    my %a1_coords;
    my @a1_segments = $a1->get_alignment_segments();
    foreach my $seg (@a1_segments) {
    	my ($lend, $rend) = $seg->get_coords();
	$a1_coords{$lend} = $rend;
	if ($seg->has_left_splice_junction()) {
	    $leftsplicecoords{$lend} = 1;
	}
	if ($seg->has_right_splice_junction()) {
	    $rightsplicecoords{$rend} = 1;
	}
    }
    
    # get a2 segment coordinates:
    my %a2_coords;
    my @a2_segments = $a2->get_alignment_segments();
    foreach my $seg (@a2_segments) {
	my ($lend, $rend) = $seg->get_coords();
	$a2_coords{$lend} = $rend;
	if ($seg->has_left_splice_junction()) {
	    $leftsplicecoords{$lend} = 1;
	}
	if ($seg->has_right_splice_junction()) {
	    $rightsplicecoords{$rend} = 1;
	}
    }
    
    my %merged_coords;
    
    #print "Coord dumps:\n" . Dumper (\%a1_coords) . Dumper (\%a2_coords) . "\n";
    
    #print "\n\nBEGIN\n";
    ## merge the overlapping coordinate sets between a1 and a2 alignments.
    foreach my $a1_lend (keys %a1_coords) {
	my $a1_rend = $a1_coords{$a1_lend};
	my ($merged_lend, $merged_rend);
	foreach my $a2_lend (keys %a2_coords) {
	    my $a2_rend = $a2_coords{$a2_lend};
	    if (&overlap($a1_lend, $a1_rend, $a2_lend, $a2_rend)) { #overlap
		## Determine merged lend;
		if ($leftsplicecoords{$a1_lend}) {
		    $merged_lend = $a1_lend;
		} elsif ($leftsplicecoords{$a2_lend}) {
		    $merged_lend = $a2_lend;
		} else {
		    $merged_lend = min ($a1_lend, $a2_lend);
		}
		# Determine merged rend
		if ($rightsplicecoords{$a1_rend}) {
		    $merged_rend = $a1_rend;
		} elsif ($rightsplicecoords{$a2_rend}) {
		    $merged_rend = $a2_rend;
		} else {
		    $merged_rend = max ($a1_rend, $a2_rend);
		}
		last;
	    }
	}
	if ($merged_lend && $merged_rend) {
	    #print "Adding  overlapped \$merged_coords{$merged_lend} = $merged_rend\n";
	    $merged_coords{$merged_lend} = $merged_rend;
	} else { #must not have been any overlap; keep a1 coordset
	    #print "Keeping a1 coords:  \$merged_coords{$a1_lend} = $a1_rend\n";
	    $merged_coords{$a1_lend} = $a1_rend;
	}
    }
    #print "adding unconsumed a2 coords:\n";
    ## add non-overlapping a2-segments
    foreach my $a2_lend (keys %a2_coords) {
	my $overlap = 0;
	my $a2_rend = $a2_coords{$a2_lend};
	foreach my $m_lend (keys %merged_coords) {
	    my $m_rend = $merged_coords{$m_lend};
	    if (&overlap($a2_lend, $a2_rend, $m_lend, $m_rend)) {
		$overlap = 1;
		last;
	    }
	}
	if (!$overlap) {
	    #print "Consuming a2 non-overlapping coords.\n";
	    $merged_coords{$a2_lend} = $a2_rend; #added a2 non-overlapping coordset
	} else {
	    #print "coords overlapped, not consuming.\n";
	}
    }
    print Dumper (\%merged_coords) if $::SEE;
    ## Create a new alignment based on a1 and a2
    my @alignment_segments;
    my $merged_length = 0;
    foreach my $end5 (keys %merged_coords) {
	my $end3 = $merged_coords{$end5};
	my $alignment_seg = new CDNA::Alignment_segment($end5, $end3);
	push (@alignment_segments, $alignment_seg);
	$merged_length += abs ($end3 - $end5) + 1;
    }
    my $new_alignment = new CDNA::CDNA_alignment($merged_length, \@alignment_segments, $self->{sequence_ref});
    my $new_acc = $self->merge_accs($a1->get_acc(), $a2->get_acc());
    $new_alignment->set_acc($new_acc);
    $new_alignment->set_fli_status($merged_fli_status);
    $new_alignment->force_spliced_validation($merged_orientation);
    #print "END.\n\n";
    return ($new_alignment);
}


#private method
# returns the minimum of an array of numerical values.
sub min {
    my @x = @_;
    @x = sort {$a<=>$b} @x;
    my $y = shift @x;
    return ($y);
}


#private method
# returns the maximum of an array of numerical values.
sub max {
    my @x = @_;
    @x = sort {$a<=>$b} @x;
    my $y = pop @x;
    return ($y);
}


#private method
# returns true/false, determines whether two coordinate sets overlap each other.
sub overlap {
    my ($a1_lend, $a1_rend, $a2_lend, $a2_rend) = @_;
    #print "Checking overlap @_\t";
    if ($a2_rend >= $a1_lend && $a2_lend <= $a1_rend) { #overlap
	#print "YES\n";
	return (1);
    } else {
	#print "NO\n";
	return (0);
    }
}


=item toAlignIllustration()

=over 4

B<Description:> illustrates the individual cDNAs to be assembled along with the final products.

B<Parameters:> $max_line_chars(optional)

$max_line_chars is an integer representing the maximum number of characters in a single line of output to the terminal.  The default is  100.

B<Returns:> $alignment_illustration_text

$alignment_illustration_text is a string containing a paragraph of text which illustrates the alignments and assemblies. An example is below:

 --->    <-->  <----->     <--->    <----------------	(+)gi|1199466

 --->    <-->  <----->     <--->    <------------   (+)gi|1209702
                        
---->    <-->  <----	(+)AV827070

---->    <-->  <---	(+)AV828861

---->    <-->  <---      (+)AV830936

 --->    <-->  <-	(+)H36350

ASSEMBLIES: (1)

---->    <-->  <----->     <--->    <---------------- (+) gi|1199466, gi|1209702, AV827070, AV828861, AV830936, H36350


=back

=cut


sub toAlignIllustration () {
    my $self = shift;
    my $max_line_chars = shift;
    $max_line_chars = ($max_line_chars) ? $max_line_chars : 100; #if not specified, 100  chars / line is default.
    
    ## Get minimum coord for relative positioning.
    my @coords;
    my @alignments = @{$self->{incoming_alignments}};
    foreach my $alignment (@alignments) {
	my @c = $alignment->get_coords();
	push (@coords, @c);
    }
    @coords = sort {$a<=>$b} @coords;
    print "coords: @coords\n" if $::SEE;
    my $min_coord = shift @coords;
    my $max_coord = pop @coords;
    my $rel_max = $max_coord - $min_coord;
    my $alignment_text = "";
    ## print each alignment followed by assemblies:
    my $num_alignments = $#alignments + 1;
    $alignment_text .= "Individual Alignments: ($num_alignments)\n";
    my $i = 0;
    foreach my $alignment (@alignments) {
	$alignment_text .= (sprintf ("%3d ", $i)) . $alignment->toAlignIllustration($min_coord, $rel_max, $max_line_chars) . "\n";
	$i++;
    }
    
    my @assemblies = @{$self->{assemblies}};
    my $num_assemblies = $#assemblies + 1;
    $alignment_text .= "\n\nASSEMBLIES: ($num_assemblies)\n";
    foreach my $assembly (@assemblies) {
	$alignment_text .= "    " . $assembly->toAlignIllustration($min_coord, $rel_max, $max_line_chars) . "\n";
    }

    return ($alignment_text);
}


=over 4

=item set_fuzzlength()

B<Description:> Sets the fuzzlength parameter.

B<Parameters:> int 

B<Returns:> none.

The fuzzlength is the length allowed to be fuzzy at the terminus of all alignments when compared to overlapping exons containining nearby splice sites.

=back

=cut

sub set_fuzzlength {
    my $self = shift;
    my $fuzzlength = shift;

    $self->{fuzzlength} = $fuzzlength;
}


#private method
# determines if two assemblies have the same accessions.
sub composition_same () {
    my ($name1, $name2) = @_;
    if ($name1 eq $name2) {
	return (1);
    } else {
	return (0);
    }
}

#private method
# creates a new name based on the accessions composing two assemblies.
sub merge_accs () {
    my $self = shift;
    my @names = @_;
    my @nameaccs;
    foreach my $name (@names) {
	my @nameacclist = split (/$DELIMETER/, $name);
	push (@nameaccs, @nameacclist);
    }
    my %unique;
    foreach my $acc (@nameaccs) {
	$unique{$acc} = 1;
    }
    my @accs = sort {$a cmp $b} keys %unique;
    my $combo = join ($DELIMETER, @accs);
    return ($combo);
}

#private method
# checks to see if an assembly already contains all the accessions built into a second assembly.
sub already_contains () {
    ## Checks to see if a1 contains all accessions of a2
    my $self = shift;
    my ($a1, $a2) = @_;
    my $a1_acc = $a1->get_acc();
    my $a2_acc = $a2->get_acc();
    my @a1_accs = split (/$DELIMETER/, $a1_acc);
    my @a2_accs = split (/$DELIMETER/, $a2_acc);
    my %a1_acc_hash;
    foreach my $acc (@a1_accs) {
	$a1_acc_hash{$acc} = 1;
    }
    my %a2_acc_hash;
    foreach my $acc (@a2_accs) {
	$a2_acc_hash{$acc} = 1;
    }
    
    foreach my $acc (keys %a2_acc_hash) {
	unless ($a1_acc_hash{$acc}) {
	    return (0);
	}
    }
    ## if still here, then must have all a2 accs in a1.
    return (1);
}


#private
sub determine_merged_orientation {
    my ($self, $a1, $a2) = @_;
    ## Determine the orientation of the merged product.
    my $num_a1_segments = $a1->get_num_segments();
    my $num_a2_segments = $a2->get_num_segments();
    my $a1_spliced_orientation = $a1->get_spliced_orientation();
    my $a2_spliced_orientation = $a2->get_spliced_orientation();
    print "a1_spliced_orient: $a1_spliced_orientation, a2_spliced_orient: $a2_spliced_orientation\n" if $SEE;
    my $merged_orientation = '+'; #initialize to a default.
    if ($a1_spliced_orientation eq $a2_spliced_orientation) {
	$merged_orientation = $a1_spliced_orientation; 
	print "Same spliced orientation: $merged_orientation\n" if $SEE;
    } elsif ($a1->is_fli() || $a2->is_fli()) {
	$merged_orientation = ($a1->is_fli()) ? $a1_spliced_orientation : $a2_spliced_orientation;
	print "a1 is fli, using a1 orient: $merged_orientation\n" if ($a1->is_fli() && $SEE);
	print "a2 is fli, using a2 orient: $merged_orientation\n" if ($a2->is_fli() && $SEE);
   
    } elsif ($num_a1_segments > 1 || $num_a2_segments > 1) {
	$merged_orientation = ($num_a1_segments > 1) ? $a1_spliced_orientation : $a2_spliced_orientation;
	print "a1 has multiple segments, using a1 orient: $merged_orientation\n" if ($num_a1_segments > 1 && $SEE);
	print "a2 has multiple segments, using a2 orient: $merged_orientation\n" if ($num_a2_segments > 1 && $SEE);
	
    } else {
	print "Can't predict the orientation of the merged alignments ($a1_spliced_orientation vs. $a2_spliced_orientation).  Using default '+'\n" if $SEE;
    }
    return ($merged_orientation);
}


1; #EOM