forked from blaxterlab/blobology
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgc_cov_annotate.pl
executable file
·236 lines (206 loc) · 9.08 KB
/
gc_cov_annotate.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#!/usr/bin/perl
=head1 NAME
gc_cov_annotate.pl
=head1 SYNOPSIS
gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...]
=head1 AUTHORS
[email protected] 2013.09.15
=cut
use strict;
use warnings;
use Getopt::Long qw(:config pass_through no_ignore_case);
my ($blasttaxid_file, $taxdump_dir, $assembly_file, $output_file) = ("",".","","");
my @tax_list;
my @bam_files;
my @cov_files;
GetOptions (
"blasttaxid=s" => \$blasttaxid_file,
"assembly=s" => \$assembly_file,
"out:s" => \$output_file,
"taxdump:s" => \$taxdump_dir,
"bam:s{,}" => \@bam_files,
"cov:s{,}" => \@cov_files,
"taxlist:s{,}" => \@tax_list,
);
if (not @tax_list) {@tax_list = ("species","order","phylum","superkingdom")};
my %tax_levels;
foreach (@tax_list) {$tax_levels{$_}=1}
if (not $output_file) {$output_file = $assembly_file . ".txt"};
print "Usage: gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...] [--cov COVFILES...] [--taxlist species...]\n" .
"--taxdump is '.' by default, i.e. the files nodes.dmp and names.dmp from the NCBI taxonomy database ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz are expected to be in the current directory\n" .
"--taxlist: default is: species order phylum superkingdom, but you can add any other NCBI taxlevel such as class family suborder etc\n" unless
(-r $blasttaxid_file or $blasttaxid_file eq "-") and -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp" and
(-r $assembly_file or $assembly_file eq "-");
die "Please specify --blasttaxid\n" unless (-r $blasttaxid_file or $blasttaxid_file eq "-");
die "Please specify --taxdump\n" unless -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp";
die "Please specify --assembly\n" unless (-r $assembly_file or $assembly_file eq "-");
#-----------------------------------------------------------------
# Get taxon annotation info from contig-taxid file
#-----------------------------------------------------------------
my (%taxid_has_parent, %taxid_has_taxlevel, %taxid_has_name);
print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ...\n";
&load_nodes_names ("$taxdump_dir/nodes.dmp","$taxdump_dir/names.dmp");
print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ... DONE\n";
my $blasttaxid_fh = &read_fh($blasttaxid_file);
my %contig_taxinfo;
while (<$blasttaxid_fh>) {
die "Contig-taxid file $blasttaxid_file does not seem to have two cols with the seqid in the first col and taxid in the second col" unless
/^(\S+)\t(\d+)/;
$contig_taxinfo{$1} = &taxonomy_report($2);
}
close $blasttaxid_fh;
#-----------------------------------------------------------------
# Calculate GC, len, for assembly file, get coverage from bamfiles
#-----------------------------------------------------------------
# load assembly file into memory:
print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ...\n";
my $fastahash = &fastafile2hash ($assembly_file);
print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ... DONE\n";
# calculate coverage for each bam file
for my $bam_file (@bam_files) {
my $reads_processed = 0;
my ($refstart, $refend, $readid, @F); # declaring here to avoid having to declare in each instance of loop
open SAM, "samtools view $bam_file |" or die $!;
print STDERR scalar localtime() . " - Reading $bam_file ...\n";
while (<SAM>) {
# ignore comment lines/ SAM headers
next if /^\s*$/ or /^@/ or /^#/;
chomp;
@F=split/\t/;
if (($F[1] & 4) != 4) {
$refstart = $F[3];
$refend = $F[3] - 1;
while ($F[5] =~ /(\d+)[MIDNP]/g) { $refend += $1 } # calculate coverage on reference sequence from CIGAR
die "ContigID $F[2] in bamfile $bam_file, but not in assembly file $assembly_file\n" if not exists $$fastahash{$F[2]};
$$fastahash{$F[2]}{$bam_file} += ($refend - $refstart + 1)/$$fastahash{$F[2]}{len};
}
$reads_processed++;
print STDERR "Processed $reads_processed reads by " . localtime() . "...\n" if $reads_processed % 1000000 == 0;
}
print STDERR scalar localtime() . " - Reading $bam_file ... DONE\n";
}
# calculate coverage for each cov file (two col file with seqid in first col, average read coverage depth in second col)
for my $cov_file (@cov_files) {
open COV, $cov_file or die $!;
print STDERR scalar localtime() . " - Reading $cov_file ...\n";
while (<COV>) {
/^(\S+)\s+(\S+)/ and $$fastahash{$1}{$cov_file} = $2;
}
print STDERR scalar localtime() . " - Reading $cov_file ... DONE\n";
}
# print len gc cov (one for each bam, and each cov) and taxon annotation info in one large file (that can be used for plotting by R)
print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ...\n";
open LENCOVGC, ">$output_file" or die $!;
# header row:
print LENCOVGC "seqid\tlen\tgc";
foreach (@bam_files) {print LENCOVGC "\tcov_$_"};
foreach (@cov_files) {print LENCOVGC "\tcov_$_"};
foreach (@tax_list) {print LENCOVGC "\ttaxlevel_$_"};
print LENCOVGC "\n";
# for each contig/seqid:
my ($seqid, $length, $gccount, $nonatgc, $totalcov, $cov); # declared outside loop for efficiency
for $seqid (keys %{$fastahash}) {
$length = $$fastahash{$seqid}{len};
$gccount = $$fastahash{$seqid}{gc};
$nonatgc = $$fastahash{$seqid}{nonatgc};
print LENCOVGC "$seqid\t$length\t". ($gccount/($length-$nonatgc));
for my $bam_file (@bam_files) {
$cov = (exists($$fastahash{$seqid}{$bam_file}) ? $$fastahash{$seqid}{$bam_file} : 0);
print LENCOVGC "\t" . $cov;
}
for my $cov_file (@cov_files) {
$cov = (exists($$fastahash{$seqid}{$cov_file}) ? $$fastahash{$seqid}{$cov_file} : 0);
print LENCOVGC "\t" . $cov;
}
for my $tax_level (@tax_list) {
print LENCOVGC "\t" . (exists(${$contig_taxinfo{$seqid}}{$tax_level}) ? ${$contig_taxinfo{$seqid}}{$tax_level} : "Not annotated");
}
print LENCOVGC "\n";
}
print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ... DONE\n";
############################################################
sub taxonomy_report {
my $hit_taxid = shift @_;
my @parents = &get_parents($hit_taxid);
# convert @parents to tax names:
my %taxinfo;
# my $taxonomy_report_string = "";
for my $parent (@parents) {
if (exists $taxid_has_taxlevel{$parent} and exists $tax_levels{$taxid_has_taxlevel{$parent}}) {
$taxinfo{$taxid_has_taxlevel{$parent}} = $taxid_has_name{$parent};
}
}
return \%taxinfo;
# for my $tax_level (keys %hit_counts) {
# for my $tax_name (keys %{$hit_counts{$tax_level}}) {
# $taxonomy_report_string .= "$tax_level\t$tax_name\t";
# }
# }
# return $taxonomy_report_string . "\n";
}
############################################################
sub get_parents {
my @all = @_;
my $current_id = $all[0];
if (exists $taxid_has_parent{$current_id} and $current_id ne $taxid_has_parent{$current_id}) {
unshift @all, $taxid_has_parent{$current_id};
@all = &get_parents(@all);
}
return @all;
}
############################################################
sub load_nodes_names {
my $fh;
my $nodesfile = shift @_;
my $namesfile = shift @_;
$fh = &read_fh($nodesfile);
while (my $line = <$fh>) {
# line in nodes.dmp should match the regexp below.
# Change the regexp if NCBI changes their file format
next if $line !~ /^(\d+)\s*\|\s*(\d+)\s*\|\s*(.+?)\s*\|/;
$taxid_has_parent{$1} = $2;
$taxid_has_taxlevel{$1} = $3;
}
close $fh;
$fh = &read_fh($namesfile);
while (my $line = <$fh>) {
next unless $line =~ /^(\d+)\s*\|\s*(.+?)\s*\|.+scientific name/;
$taxid_has_name{$1} = $2;
}
}
############################################################
sub fastafile2hash {
my $fastafile = shift @_;
my %sequences;
my $fh = &read_fh($fastafile);
my $seqid;
while (my $line = <$fh>) {
if ($line =~ /^>(\S+)(.*)/) {
$seqid = $1;
$sequences{$seqid}{desc} = $2;
}
else {
chomp $line;
$sequences{$seqid}{seq} .= $line;
$sequences{$seqid}{len} += length $line;
$sequences{$seqid}{gc} += ($line =~ tr/gcGC/gcGC/);
$line =~ s/[^atgc]/N/ig;
$sequences{$seqid}{nonatgc} += ($line =~ tr/N/N/);
}
}
close $fh;
return \%sequences;
}
############################################################
sub read_fh {
my $filename = shift @_;
my $filehandle;
if ($filename =~ /gz$/) {
open $filehandle, "gunzip -dc $filename |" or die $!;
}
else {
open $filehandle, "<$filename" or die $!;
}
return $filehandle;
}
############################################################