Merge pull request DerrickWood#78 from DerrickWood/revert-76-revert-7…

…4-gifix Second shot at GI fix
haoziyeung · Aug 28, 2017 · d1b1edd · d1b1edd
2 parents 75327c4 + 9a4efe4
commit d1b1edd
Show file tree

Hide file tree

Showing 12 changed files with 392 additions and 188 deletions.
diff --git a/docs/MANUAL.html b/docs/MANUAL.html
diff --git a/docs/MANUAL.markdown b/docs/MANUAL.markdown
@@ -171,7 +171,7 @@ have some multi-threading enabled) in February 2015:
      7m48s  *Step 1 (create set)
        n/a   Step 2 (reduce database, optional and skipped)
     53m16s  *Step 3 (sort set)
-     1m04s   Step 4 (GI number to sequence ID map)
+     1m04s   Step 4 (GI number to sequence ID map - now obsolete)
      0m27s   Step 5 (Sequence ID to taxon map)
     29m20s  *Step 6 (set LCA values)
     ------
@@ -352,7 +352,7 @@ To build a custom database:
 
         kraken-build --download-taxonomy --db $DBNAME
 
-    This will download the GI number to taxon map, as well as the
+    This will download the accession number to taxon map, as well as the
     taxonomic name and tree information from NCBI.  These files can
     be found in `$DBNAME/taxonomy/` .  If you need to modify the taxonomy,
     edits can be made to the `names.dmp` and `nodes.dmp` files in this directory;
@@ -361,9 +361,10 @@ To build a custom database:
 2) Install a genomic library.  Four sets of standard genomes are
     made easily available through `kraken-build`:
 
-    - bacteria: RefSeq complete bacterial/archaeal genomes
-    - plasmids: RefSeq plasmid sequences
-    - viruses: RefSeq complete viral genomes
+    - archaea: RefSeq complete archaeal genomes
+    - bacteria: RefSeq complete bacterial genomes
+    - plasmid: RefSeq plasmid sequences
+    - viral: RefSeq complete viral genomes
     - human: GRCh38 human genome
 
     To download and install any one of these, use the `--download-library`
@@ -376,7 +377,7 @@ To build a custom database:
     - Sequences must be in a FASTA file (multi-FASTA is allowed)
     - Each sequence's ID (the string between the `>` and the first
       whitespace character on the header line) must contain either
-      a GI number to allow Kraken to lookup the correct taxa, or an
+      an NCBI accession number to allow Kraken to lookup the correct taxa, or an
       explicit assignment of the taxonomy ID using `kraken:taxid` (see below).
 
     Replicons not downloaded from NCBI may need their taxonomy information
@@ -390,7 +391,7 @@ To build a custom database:
 
     The `kraken:taxid` string must begin the sequence ID or be immediately
     preceded by a pipe character (`|`).  Explicit assignment of taxonomy IDs
-    in this manner will override the GI number mapping provided by NCBI.
+    in this manner will override the accession number mapping provided by NCBI.
 
     If your genomes meet the requirements above, then you can add each
     replicon to your database's genomic library using the `--add-to-library`

diff --git a/scripts/add_to_library.sh b/scripts/add_to_library.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright 2013-2015, Derrick Wood <[email protected]>
+# Copyright 2013-2017, Derrick Wood <[email protected]>
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
@@ -35,15 +35,13 @@ then
   exit 1
 fi
 
-if ! verify_gi_numbers.pl "$1"
-then
-  echo "Can't add \"$1\": sequence is missing GI number"
-  exit 1
-fi
-
 add_dir="$LIBRARY_DIR/added"
 mkdir -p "$add_dir"
+scan_fasta_file.pl "$1" > "$add_dir/temp_map.txt"
 
 filename=$(cp_into_tempfile.pl -t "XXXXXXXXXX" -d "$add_dir" -s fna "$1")
 
+cat "$add_dir/temp_map.txt" >> "$add_dir/prelim_map.txt"
+rm "$add_dir/temp_map.txt"
+
 echo "Added \"$1\" to library ($KRAKEN_DB_NAME)"
diff --git a/scripts/build_kraken_db.sh b/scripts/build_kraken_db.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright 2013-2015, Derrick Wood <[email protected]>
+# Copyright 2013-2017, Derrick Wood <[email protected]>
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
@@ -154,29 +154,36 @@ else
   echo "K-mer set sorted. [$(report_time_elapsed $start_time1)]"
 fi
 
-if [ -e "gi2seqid.map" ]
-then
-  echo "Skipping step 4, GI number to seqID map already complete."
-else
-  echo "Creating GI number to seqID map (step 4 of 6)..."
-  start_time1=$(date "+%s.%N")
-  find library/ '(' -name '*.fna' -o -name '*.fa' -o -name '*.ffn' ')' -print0 | \
-    xargs -0 cat | report_gi_numbers.pl > gi2seqid.map.tmp
-  mv gi2seqid.map.tmp gi2seqid.map
-
-  echo "GI number to seqID map created. [$(report_time_elapsed $start_time1)]"
-fi
+echo "Skipping step 4, GI number to seqID map now obsolete."
 
-if [ -e "seqid2taxid.map" ]
+seqid2taxid_map_file="seqid2taxid.map"
+if [ -e "$seqid2taxid_map_file" ]
 then
   echo "Skipping step 5, seqID to taxID map already complete."
 else
   echo "Creating seqID to taxID map (step 5 of 6)..."
   start_time1=$(date "+%s.%N")
-  make_seqid_to_taxid_map taxonomy/gi_taxid_nucl.dmp gi2seqid.map \
-    > seqid2taxid.map.tmp
-  mv seqid2taxid.map.tmp seqid2taxid.map
-  line_ct=$(wc -l seqid2taxid.map | awk '{print $1}')
+
+  find library/ -maxdepth 2 -name prelim_map.txt | xargs cat > taxonomy/prelim_map.txt
+  if [ ! -s "taxonomy/prelim_map.txt" ]; then
+    echo "No preliminary seqid/taxid mapping files found, aborting."
+    exit 1
+  fi
+
+  grep "^TAXID" taxonomy/prelim_map.txt | cut -f 2- > $seqid2taxid_map_file.tmp || true
+  if grep "^ACCNUM" taxonomy/prelim_map.txt | cut -f 2- > accmap_file.tmp; then
+    if compgen -G "taxonomy/*.accession2taxid" > /dev/null; then
+      lookup_accession_numbers.pl accmap_file.tmp taxonomy/*.accession2taxid > seqid2taxid_acc.tmp
+      cat seqid2taxid_acc.tmp >> $seqid2taxid_map_file.tmp
+      rm seqid2taxid_acc.tmp
+    else
+      echo "Accession to taxid map files are required to build this DB."
+      echo "Run 'kraken-build --db $KRAKEN_DB_NAME --download-taxonomy' again?"
+      exit 1
+    fi
+  fi
+  mv $seqid2taxid_map_file.tmp $seqid2taxid_map_file
+  line_ct=$(wc -l $seqid2taxid_map_file | awk '{print $1}')
 
   echo "$line_ct sequences mapped to taxa. [$(report_time_elapsed $start_time1)]"
 fi

diff --git a/scripts/download_genomic_library.sh b/scripts/download_genomic_library.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright 2013-2015, Derrick Wood <[email protected]>
+# Copyright 2013-2017, Derrick Wood <[email protected]>
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
@@ -19,9 +19,10 @@
 
 # Download specific genomic libraries for use with Kraken.
 # Supported choices are:
-#   bacteria - NCBI RefSeq complete bacterial/archaeal genomes
+#   archaea - NCBI RefSeq complete archaeal genomes
+#   bacteria - NCBI RefSeq complete bacterial genomes
 #   plasmids - NCBI RefSeq plasmid sequences
-#   viruses - NCBI RefSeq complete viral DNA and RNA genomes
+#   viral - NCBI RefSeq complete viral DNA and RNA genomes
 #   human - NCBI RefSeq GRCh38 human reference genome
 
 set -u  # Protect against uninitialized vars.
@@ -33,87 +34,44 @@ FTP_SERVER="ftp://$NCBI_SERVER"
 RSYNC_SERVER="rsync://$NCBI_SERVER"
 THIS_DIR=$PWD
 
+library_name="$1"
+library_file="library.fna"
 case "$1" in
-  "bacteria")
-    mkdir -p $LIBRARY_DIR/Bacteria
-    cd $LIBRARY_DIR/Bacteria
-    if [ ! -e "lib.complete" ]
-    then
-      rm -f all.fna.tar.gz
-      wget $FTP_SERVER/genomes/archive/old_refseq/Bacteria/all.fna.tar.gz
-      echo -n "Unpacking..."
-      tar zxf all.fna.tar.gz
-      rm all.fna.tar.gz
-      echo " complete."
-      touch "lib.complete"
-    else
-      echo "Skipping download of bacterial genomes, already downloaded here."
+  "archaea" | "bacteria" | "viral" | "human" )
+    mkdir -p $LIBRARY_DIR/$library_name
+    cd $LIBRARY_DIR/$library_name
+    rm -f assembly_summary.txt
+    remote_dir_name=$library_name
+    if [ "$library_name" = "human" ]; then
+      remote_dir_name="vertebrate_mammalian/Homo_sapiens"
     fi
-    ;;
-  "plasmids")
-    mkdir -p $LIBRARY_DIR/Plasmids
-    cd $LIBRARY_DIR/Plasmids
-    if [ ! -e "lib.complete" ]
-    then
-      rm -f plasmids.all.fna.tar.gz
-      wget $FTP_SERVER/genomes/Plasmids/plasmids.all.fna.tar.gz
-      echo -n "Unpacking..."
-      tar zxf plasmids.all.fna.tar.gz
-      rm plasmids.all.fna.tar.gz
-      echo " complete."
-      touch "lib.complete"
-    else
-      echo "Skipping download of plasmids, already downloaded here."
+    if ! wget -q $FTP_SERVER/genomes/refseq/$remote_dir_name/assembly_summary.txt; then
+      echo "Error downloading assembly summary file for $library_name, exiting." >/dev/fd/2
+      exit 1
     fi
-    ;;
-  "viruses")
-    mkdir -p $LIBRARY_DIR/Viruses
-    cd $LIBRARY_DIR/Viruses
-    if [ ! -e "lib.complete" ]
-    then
-      rm -f all.fna.tar.gz
-      rm -f all.ffn.tar.gz
-      wget $FTP_SERVER/genomes/Viruses/all.fna.tar.gz
-      wget $FTP_SERVER/genomes/Viruses/all.ffn.tar.gz
-      echo -n "Unpacking..."
-      tar zxf all.fna.tar.gz
-      tar zxf all.ffn.tar.gz
-      rm all.fna.tar.gz
-      rm all.ffn.tar.gz
-      echo " complete."
-      touch "lib.complete"
-    else
-      echo "Skipping download of viral genomes, already downloaded here."
+    if [ "$library_name" = "human" ]; then
+      grep "Genome Reference Consortium" assembly_summary.txt > x
+      mv x assembly_summary.txt
     fi
+    rm -rf all/ library.f* manifest.txt rsync.err
+    rsync_from_ncbi.pl assembly_summary.txt
+    scan_fasta_file.pl $library_file >> prelim_map.txt
     ;;
-  "human")
-    mkdir -p $LIBRARY_DIR/Human
-    cd $LIBRARY_DIR/Human
-    if [ ! -e "lib.complete" ]
-    then
-      # get list of CHR_* directories
-      wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/
-      directories=$(perl -nle '/^d/ and /(CHR_\w+)\s*$/ and print $1' .listing)
-      rm .listing
-
-      # For each CHR_* directory, get GRCh* fasta gzip file name, d/l, unzip, and add
-      for directory in $directories
-      do
-        wget --spider --no-remove-listing $FTP_SERVER/genomes/H_sapiens/$directory/
-        file=$(perl -nle '/^-/ and /\b(hs_ref_GRCh\S+\.fa\.gz)\s*$/ and print $1' .listing)
-        [ -z "$file" ] && exit 1
-        rm .listing
-        wget $FTP_SERVER/genomes/H_sapiens/$directory/$file
-        gunzip "$file"
-      done
-
-      touch "lib.complete"
-    else
-      echo "Skipping download of human genome, already downloaded here."
-    fi
+  "plasmid")
+    mkdir -p $LIBRARY_DIR/plasmid
+    cd $LIBRARY_DIR/plasmid
+    rm -f library.f* plasmid.*
+    echo -n "Downloading plasmid files from FTP..."
+    wget -q --no-remove-listing --spider $FTP_SERVER/genomes/refseq/plasmid/
+    awk '{ print $NF }' .listing | perl -ple 'tr/\r//d' | grep '\.fna\.gz' > manifest.txt
+    cat manifest.txt | xargs -n1 -I{} wget -q $FTP_SERVER/genomes/refseq/plasmid/{}
+    cat manifest.txt | xargs -n1 -I{} gunzip -c {} > $library_file
+    rm -f plasmid.* .listing
+    scan_fasta_file.pl $library_file > prelim_map.txt
+    echo " done."
     ;;
   *)
     echo "Unsupported library.  Valid options are: "
-    echo "  bacteria plasmids virus human"
+    echo "  archaea bacteria plasmid viral human"
     ;;
 esac
diff --git a/scripts/download_taxonomy.sh b/scripts/download_taxonomy.sh
@@ -1,41 +1,30 @@
 #!/bin/bash
 
-# Copyright 2013-2015, Derrick Wood <[email protected]>
+# Copyright 2013-2017, Derrick Wood <[email protected]>
 #
 # This file is part of the Kraken taxonomic sequence classification system.
-#
-# Kraken is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Kraken is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Kraken.  If not, see <http://www.gnu.org/licenses/>.
 
 # Download NCBI taxonomy information for Kraken.
-# Designed to be called by kraken_build
+# Designed to be called by kraken-build
 
 set -u  # Protect against uninitialized vars.
 set -e  # Stop on error
 
 TAXONOMY_DIR="$KRAKEN_DB_NAME/taxonomy"
-NCBI_SERVER="ftp.ncbi.nih.gov"
+NCBI_SERVER="ftp.ncbi.nlm.nih.gov"
 FTP_SERVER="ftp://$NCBI_SERVER"
-THIS_DIR=$PWD
 
 mkdir -p "$TAXONOMY_DIR"
 cd "$TAXONOMY_DIR"
 
-if [ ! -e "gimap.dlflag" ]
+if [ ! -e "accmap.dlflag" ]
 then
-  wget $FTP_SERVER/pub/taxonomy/gi_taxid_nucl.dmp.gz
-  touch gimap.dlflag
-  echo "Downloaded GI to taxon map"
+  wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_est.accession2taxid.gz
+  wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz
+  wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_gss.accession2taxid.gz
+  wget $FTP_SERVER/pub/taxonomy/accession2taxid/nucl_wgs.accession2taxid.gz
+  touch accmap.dlflag
+  echo "Downloaded accession to taxon map(s)"
 fi
 
 if [ ! -e "taxdump.dlflag" ]
@@ -45,16 +34,17 @@ then
   echo "Downloaded taxonomy tree data"
 fi
 
-if [ ! -e "gimap.flag" ]
+if ls | grep -q 'accession2taxid\.gz$'
 then
-  gunzip gi_taxid_nucl.dmp.gz
-  touch gimap.flag
-  echo "Uncompressed GI to taxon map"
+  echo -n "Uncompressing taxonomy data... "
+  gunzip *accession2taxid.gz
+  echo "done."
 fi
 
-if [ ! -e "taxdump.flag" ]
+if [ ! -e "taxdump.untarflag" ]
 then
+  echo -n "Untarring taxonomy tree data... "
   tar zxf taxdump.tar.gz
-  touch taxdump.flag
-  echo "Uncompressed taxonomy tree data"
+  touch taxdump.untarflag
+  echo "done."
 fi
diff --git a/scripts/kraken-build b/scripts/kraken-build
@@ -1,6 +1,6 @@
 #!/usr/bin/perl
 
-# Copyright 2013-2015, Derrick Wood <[email protected]>
+# Copyright 2013-2017, Derrick Wood <[email protected]>
 #
 # This file is part of the Kraken taxonomic sequence classification system.
 #
@@ -40,7 +40,7 @@ my $DEF_MINIMIZER_LEN = 15;
 my $DEF_KMER_LEN = 31;
 my $DEF_THREAD_CT = 1;
 
-my @VALID_LIBRARY_TYPES = qw/bacteria plasmids viruses human/;
+my @VALID_LIBRARY_TYPES = qw/archaea bacteria plasmid viral human/;
 
 # Option/task option variables
 my (
@@ -199,8 +199,8 @@ Usage: $PROG [task option] [options]
 Task options (exactly one must be selected):
   --download-taxonomy        Download NCBI taxonomic information
   --download-library TYPE    Download partial library
-                             (TYPE = one of "bacteria", "plasmids", 
-                             "viruses", "human")
+                             (TYPE = one of "archaea", "bacteria", "plasmid", 
+                             "viral", "human")
   --add-to-library FILE      Add FILE to library
   --build                    Create DB from library
                              (requires taxonomy d/l'ed and at least one file