forked from DerrickWood/kraken
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request DerrickWood#78 from DerrickWood/revert-76-revert-74-gifix
Second shot at GI fix
- Loading branch information
Showing
12 changed files
with
392 additions
and
188 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
#!/bin/bash | ||
|
||
# Copyright 2013-2015, Derrick Wood <[email protected]> | ||
# Copyright 2013-2017, Derrick Wood <[email protected]> | ||
# | ||
# This file is part of the Kraken taxonomic sequence classification system. | ||
# | ||
|
@@ -35,15 +35,13 @@ then | |
exit 1 | ||
fi | ||
|
||
if ! verify_gi_numbers.pl "$1" | ||
then | ||
echo "Can't add \"$1\": sequence is missing GI number" | ||
exit 1 | ||
fi | ||
|
||
add_dir="$LIBRARY_DIR/added" | ||
mkdir -p "$add_dir" | ||
scan_fasta_file.pl "$1" > "$add_dir/temp_map.txt" | ||
|
||
filename=$(cp_into_tempfile.pl -t "XXXXXXXXXX" -d "$add_dir" -s fna "$1") | ||
|
||
cat "$add_dir/temp_map.txt" >> "$add_dir/prelim_map.txt" | ||
rm "$add_dir/temp_map.txt" | ||
|
||
echo "Added \"$1\" to library ($KRAKEN_DB_NAME)" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
#!/bin/bash | ||
|
||
# Copyright 2013-2015, Derrick Wood <[email protected]> | ||
# Copyright 2013-2017, Derrick Wood <[email protected]> | ||
# | ||
# This file is part of the Kraken taxonomic sequence classification system. | ||
# | ||
|
@@ -154,29 +154,36 @@ else | |
echo "K-mer set sorted. [$(report_time_elapsed $start_time1)]" | ||
fi | ||
|
||
if [ -e "gi2seqid.map" ] | ||
then | ||
echo "Skipping step 4, GI number to seqID map already complete." | ||
else | ||
echo "Creating GI number to seqID map (step 4 of 6)..." | ||
start_time1=$(date "+%s.%N") | ||
find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \ | ||
xargs -0 cat | report_gi_numbers.pl > gi2seqid.map.tmp | ||
mv gi2seqid.map.tmp gi2seqid.map | ||
|
||
echo "GI number to seqID map created. [$(report_time_elapsed $start_time1)]" | ||
fi | ||
echo "Skipping step 4, GI number to seqID map now obsolete." | ||
|
||
if [ -e "seqid2taxid.map" ] | ||
seqid2taxid_map_file="seqid2taxid.map" | ||
if [ -e "$seqid2taxid_map_file" ] | ||
then | ||
echo "Skipping step 5, seqID to taxID map already complete." | ||
else | ||
echo "Creating seqID to taxID map (step 5 of 6)..." | ||
start_time1=$(date "+%s.%N") | ||
make_seqid_to_taxid_map taxonomy/gi_taxid_nucl.dmp gi2seqid.map \ | ||
> seqid2taxid.map.tmp | ||
mv seqid2taxid.map.tmp seqid2taxid.map | ||
line_ct=$(wc -l seqid2taxid.map | awk '{print $1}') | ||
|
||
find library/ -maxdepth 2 -name prelim_map.txt | xargs cat > taxonomy/prelim_map.txt | ||
if [ ! -s "taxonomy/prelim_map.txt" ]; then | ||
echo "No preliminary seqid/taxid mapping files found, aborting." | ||
exit 1 | ||
fi | ||
|
||
grep "^TAXID" taxonomy/prelim_map.txt | cut -f 2- > $seqid2taxid_map_file.tmp || true | ||
if grep "^ACCNUM" taxonomy/prelim_map.txt | cut -f 2- > accmap_file.tmp; then | ||
if compgen -G "taxonomy/*.accession2taxid" > /dev/null; then | ||
lookup_accession_numbers.pl accmap_file.tmp taxonomy/*.accession2taxid > seqid2taxid_acc.tmp | ||
cat seqid2taxid_acc.tmp >> $seqid2taxid_map_file.tmp | ||
rm seqid2taxid_acc.tmp | ||
else | ||
echo "Accession to taxid map files are required to build this DB." | ||
echo "Run 'kraken-build --db $KRAKEN_DB_NAME --download-taxonomy' again?" | ||
exit 1 | ||
fi | ||
fi | ||
mv $seqid2taxid_map_file.tmp $seqid2taxid_map_file | ||
line_ct=$(wc -l $seqid2taxid_map_file | awk '{print $1}') | ||
|
||
echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]" | ||
fi | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
#!/bin/bash | ||
|
||
# Copyright 2013-2015, Derrick Wood <[email protected]> | ||
# Copyright 2013-2017, Derrick Wood <[email protected]> | ||
# | ||
# This file is part of the Kraken taxonomic sequence classification system. | ||
# | ||
|
@@ -19,9 +19,10 @@ | |
|
||
# Download specific genomic libraries for use with Kraken. | ||
# Supported choices are: | ||
# bacteria - NCBI RefSeq complete bacterial/archaeal genomes | ||
# archaea - NCBI RefSeq complete archaeal genomes | ||
# bacteria - NCBI RefSeq complete bacterial genomes | ||
# plasmids - NCBI RefSeq plasmid sequences | ||
# viruses - NCBI RefSeq complete viral DNA and RNA genomes | ||
# viral - NCBI RefSeq complete viral DNA and RNA genomes | ||
# human - NCBI RefSeq GRCh38 human reference genome | ||
|
||
set -u # Protect against uninitialized vars. | ||
|
@@ -33,87 +34,44 @@ FTP_SERVER="ftp://$NCBI_SERVER" | |
RSYNC_SERVER="rsync://$NCBI_SERVER" | ||
THIS_DIR=$PWD | ||
|
||
library_name="$1" | ||
library_file="library.fna" | ||
case "$1" in | ||
"bacteria") | ||
mkdir -p $LIBRARY_DIR/Bacteria | ||
cd $LIBRARY_DIR/Bacteria | ||
if [ ! -e "lib.complete" ] | ||
then | ||
rm -f all.fna.tar.gz | ||
wget $FTP_SERVER/genomes/archive/old_refseq/Bacteria/all.fna.tar.gz | ||
echo -n "Unpacking..." | ||
tar zxf all.fna.tar.gz | ||
rm all.fna.tar.gz | ||
echo " complete." | ||
touch "lib.complete" | ||
else | ||
echo "Skipping download of bacterial genomes, already downloaded here." | ||
"archaea" | "bacteria" | "viral" | "human" ) | ||
mkdir -p $LIBRARY_DIR/$library_name | ||
cd $LIBRARY_DIR/$library_name | ||
rm -f assembly_summary.txt | ||
remote_dir_name=$library_name | ||
if [ "$library_name" = "human" ]; then | ||
remote_dir_name="vertebrate_mammalian/Homo_sapiens" | ||
fi | ||
;; | ||
"plasmids") | ||
mkdir -p $LIBRARY_DIR/Plasmids | ||
cd $LIBRARY_DIR/Plasmids | ||
if [ ! -e "lib.complete" ] | ||
then | ||
rm -f plasmids.all.fna.tar.gz | ||
wget $FTP_SERVER/genomes/Plasmids/plasmids.all.fna.tar.gz | ||
echo -n "Unpacking..." | ||
tar zxf plasmids.all.fna.tar.gz | ||
rm plasmids.all.fna.tar.gz | ||
echo " complete." | ||
touch "lib.complete" | ||
else | ||
echo "Skipping download of plasmids, already downloaded here." | ||
if ! wget -q $FTP_SERVER/genomes/refseq/$remote_dir_name/assembly_summary.txt; then | ||
echo "Error downloading assembly summary file for $library_name, exiting." >/dev/fd/2 | ||
exit 1 | ||
fi | ||
;; | ||
"viruses") | ||
mkdir -p $LIBRARY_DIR/Viruses | ||
cd $LIBRARY_DIR/Viruses | ||
if [ ! -e "lib.complete" ] | ||
then | ||
rm -f all.fna.tar.gz | ||
rm -f all.ffn.tar.gz | ||
wget $FTP_SERVER/genomes/Viruses/all.fna.tar.gz | ||
wget $FTP_SERVER/genomes/Viruses/all.ffn.tar.gz | ||
echo -n "Unpacking..." | ||
tar zxf all.fna.tar.gz | ||
tar zxf all.ffn.tar.gz | ||
rm all.fna.tar.gz | ||
rm all.ffn.tar.gz | ||
echo " complete." | ||
touch "lib.complete" | ||
else | ||
echo "Skipping download of viral genomes, already downloaded here." | ||
if [ "$library_name" = "human" ]; then | ||
grep "Genome Reference Consortium" assembly_summary.txt > x | ||
mv x assembly_summary.txt | ||
fi | ||
rm -rf all/ library.f* manifest.txt rsync.err | ||
rsync_from_ncbi.pl assembly_summary.txt | ||
scan_fasta_file.pl $library_file >> prelim_map.txt | ||
;; | ||
"human") | ||
mkdir -p $LIBRARY_DIR/Human | ||
cd $LIBRARY_DIR/Human | ||
if [ ! -e "lib.complete" ] | ||
then | ||
# get list of CHR_* directories | ||
wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/ | ||
directories=$(perl -nle '/^d/ and /(CHR_\w+)\s*$/ and print $1' .listing) | ||
rm .listing | ||
|
||
# For each CHR_* directory, get GRCh* fasta gzip file name, d/l, unzip, and add | ||
for directory in $directories | ||
do | ||
wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/$directory/ | ||
file=$(perl -nle '/^-/ and /\b(hs_ref_GRCh\S+\.fa\.gz)\s*$/ and print $1' .listing) | ||
[ -z "$file" ] && exit 1 | ||
rm .listing | ||
wget $FTP_SERVER/genomes/H_sapiens/$directory/$file | ||
gunzip "$file" | ||
done | ||
|
||
touch "lib.complete" | ||
else | ||
echo "Skipping download of human genome, already downloaded here." | ||
fi | ||
"plasmid") | ||
mkdir -p $LIBRARY_DIR/plasmid | ||
cd $LIBRARY_DIR/plasmid | ||
rm -f library.f* plasmid.* | ||
echo -n "Downloading plasmid files from FTP..." | ||
wget -q --no-remove-listing --spider $FTP_SERVER/genomes/refseq/plasmid/ | ||
awk '{ print $NF }' .listing | perl -ple 'tr/\r//d' | grep '\.fna\.gz' > manifest.txt | ||
cat manifest.txt | xargs -n1 -I{} wget -q $FTP_SERVER/genomes/refseq/plasmid/{} | ||
cat manifest.txt | xargs -n1 -I{} gunzip -c {} > $library_file | ||
rm -f plasmid.* .listing | ||
scan_fasta_file.pl $library_file > prelim_map.txt | ||
echo " done." | ||
;; | ||
*) | ||
echo "Unsupported library. Valid options are: " | ||
echo " bacteria plasmids virus human" | ||
echo " archaea bacteria plasmid viral human" | ||
;; | ||
esac |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,30 @@ | ||
#!/bin/bash | ||
|
||
# Copyright 2013-2015, Derrick Wood <[email protected]> | ||
# Copyright 2013-2017, Derrick Wood <[email protected]> | ||
# | ||
# This file is part of the Kraken taxonomic sequence classification system. | ||
# | ||
# Kraken is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# Kraken is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with Kraken. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
# Download NCBI taxonomy information for Kraken. | ||
# Designed to be called by kraken_build | ||
# Designed to be called by kraken-build | ||
|
||
set -u # Protect against uninitialized vars. | ||
set -e # Stop on error | ||
|
||
TAXONOMY_DIR="$KRAKEN_DB_NAME/taxonomy" | ||
NCBI_SERVER="ftp.ncbi.nih.gov" | ||
NCBI_SERVER="ftp.ncbi.nlm.nih.gov" | ||
FTP_SERVER="ftp://$NCBI_SERVER" | ||
THIS_DIR=$PWD | ||
|
||
mkdir -p "$TAXONOMY_DIR" | ||
cd "$TAXONOMY_DIR" | ||
|
||
if [ ! -e "gimap.dlflag" ] | ||
if [ ! -e "accmap.dlflag" ] | ||
then | ||
wget $FTP_SERVER/pub/taxonomy/gi_taxid_nucl.dmp.gz | ||
touch gimap.dlflag | ||
echo "Downloaded GI to taxon map" | ||
wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_est.accession2taxid.gz | ||
wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz | ||
wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_gss.accession2taxid.gz | ||
wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_wgs.accession2taxid.gz | ||
touch accmap.dlflag | ||
echo "Downloaded accession to taxon map(s)" | ||
fi | ||
|
||
if [ ! -e "taxdump.dlflag" ] | ||
|
@@ -45,16 +34,17 @@ then | |
echo "Downloaded taxonomy tree data" | ||
fi | ||
|
||
if [ ! -e "gimap.flag" ] | ||
if ls | grep -q 'accession2taxid\.gz$' | ||
then | ||
gunzip gi_taxid_nucl.dmp.gz | ||
touch gimap.flag | ||
echo "Uncompressed GI to taxon map" | ||
echo -n "Uncompressing taxonomy data... " | ||
gunzip *accession2taxid.gz | ||
echo "done." | ||
fi | ||
|
||
if [ ! -e "taxdump.flag" ] | ||
if [ ! -e "taxdump.untarflag" ] | ||
then | ||
echo -n "Untarring taxonomy tree data... " | ||
tar zxf taxdump.tar.gz | ||
touch taxdump.flag | ||
echo "Uncompressed taxonomy tree data" | ||
touch taxdump.untarflag | ||
echo "done." | ||
fi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
#!/usr/bin/perl | ||
|
||
# Copyright 2013-2015, Derrick Wood <[email protected]> | ||
# Copyright 2013-2017, Derrick Wood <[email protected]> | ||
# | ||
# This file is part of the Kraken taxonomic sequence classification system. | ||
# | ||
|
@@ -40,7 +40,7 @@ my $DEF_MINIMIZER_LEN = 15; | |
my $DEF_KMER_LEN = 31; | ||
my $DEF_THREAD_CT = 1; | ||
|
||
my @VALID_LIBRARY_TYPES = qw/bacteria plasmids viruses human/; | ||
my @VALID_LIBRARY_TYPES = qw/archaea bacteria plasmid viral human/; | ||
|
||
# Option/task option variables | ||
my ( | ||
|
@@ -199,8 +199,8 @@ Usage: $PROG [task option] [options] | |
Task options (exactly one must be selected): | ||
--download-taxonomy Download NCBI taxonomic information | ||
--download-library TYPE Download partial library | ||
(TYPE = one of "bacteria", "plasmids", | ||
"viruses", "human") | ||
(TYPE = one of "archaea", "bacteria", "plasmid", | ||
"viral", "human") | ||
--add-to-library FILE Add FILE to library | ||
--build Create DB from library | ||
(requires taxonomy d/l'ed and at least one file | ||
|
Oops, something went wrong.