-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathDownloadViruses.pl
45 lines (36 loc) · 1.59 KB
/
DownloadViruses.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# This script downloads all viral genomes in RefSeq and puts them in viruses.fa
# Script is taken from: http://www.ncbi.nlm.nih.gov/books/NBK25498/#chapter3.Application_3_Retrieving_large
# BY INTERFACE NCBI: http://www.ncbi.nlm.nih.gov/nuccore?term=%22Viruses%22[PORG]+AND+srcdb_refseq[PROP] , then sent to file: fasta.
use LWP::Simple;
$organism = 'viruses';
$query = $organism.'[orgn]+AND+srcdb_refseq[prop]';
print STDERR "Searching RefSeq for $organism: $query\n";
#assemble the esearch URL
$base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
$url = $base . "esearch.fcgi?db=nucleotide&term=$query&usehistory=y";
#post the esearch URL
$output = get($url);
#parse WebEnv, QueryKey and Count (# records retrieved)
$web = $1 if ($output =~ /<WebEnv>(\S+)<\/WebEnv>/);
$key = $1 if ($output =~ /<QueryKey>(\d+)<\/QueryKey>/);
$count = $1 if ($output =~ /<Count>(\d+)<\/Count>/);
print STDERR "Found: $count records for $organism\n";
if($count == 0) {
exit(0);
}
#open output file for writing
open(OUT, ">tmp.$organism.fa") || die "Can't open file!\n";
#retrieve data in batches of 500
$retmax = 500;
for ($ret = 0; $ret < $count; ) {
$efetch_url = $base ."efetch.fcgi?db=nucleotide&WebEnv=$web";
$efetch_url .= "&query_key=$key&retstart=$ret";
$efetch_url .= "&retmax=$retmax&rettype=fasta&retmode=text";
$efetch_out = get($efetch_url);
$actual_sequences_returned = $efetch_out =~ s/>/\n>/g; # count number of sequences returned
$ret += $actual_sequences_returned;
print OUT "$efetch_out";
print STDERR "Fetched $ret\n";
}
close OUT;
rename("tmp.$organism.fa", "$organism.fa");