Update

yge15 · Oct 23, 2017 · 7d1ab24 · 7d1ab24
1 parent cdc9fb6
commit 7d1ab24
Show file tree

Hide file tree

Showing 6 changed files with 69 additions and 33 deletions.
diff --git a/scripts/krakenu-build_db.sh b/scripts/krakenu-build_db.sh
@@ -255,7 +255,7 @@ if [ "$KRAKEN_LCA_DATABASE" != "0" ]; then
     set -x
       set_lcas $MEMFLAG -x -d $SORTED_DB_NAME -o database.kdb -i database.idx -v \
       -b taxDB $PARAM -t $KRAKEN_THREAD_CT -m seqid2taxid.map -c database.kdb.counts \
-      -F <( cat_library ) > seqid2taxid-plus.map
+      -F <( cat_library ) -T > seqid2taxid-plus.map
     set +x
     if [ "$KRAKEN_ADD_TAXIDS_FOR_SEQ" == "1" ] || [ "$KRAKEN_ADD_TAXIDS_FOR_GENOME" == "1" ]; then
       mv seqid2taxid.map seqid2taxid.map.orig
@@ -300,8 +300,8 @@ if [ "$KRAKEN_UID_DATABASE" != "0" ]; then
   ## Make a classification report
   REPNAME=uid_database
   if [[ ! -s $REPNAME.report.tsv ]]; then
-    #echo "Creating UID database summary report $REPNAME.report.tsv ..."
-    #krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv
+    echo "Creating UID database summary report $REPNAME.report.tsv ..."
+    krakenu --db . --report-file $REPNAME.report.tsv --threads $KRAKEN_THREAD_CT --uid-mapping --fasta-input <(cat_library) > $REPNAME.kraken.tsv
   fi
 fi
 

diff --git a/src/Makefile b/src/Makefile
@@ -1,8 +1,8 @@
 CXX = g++
 FOPENMP?=-fopenmp
-CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O2 -g -Wfatal-errors
+CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -g -Wfatal-errors
 #CXXFLAGS = -Wall -std=c++11 $(FOPENMP) -O3 -Wfatal-errors
-PROGS = db_sort set_lcas classify make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping
+PROGS = classify db_sort set_lcas make_seqid_to_taxid_map db_shrink build_taxdb grade_classification dump_taxdb read_uid_mapping
 LIBFLAGS = -L. -I./gzstream -L./gzstream -lz -lgzstream
 
 .PHONY: all install clean
@@ -28,7 +28,7 @@ read_uid_mapping: quickfile.o
 classify: classify.cpp krakendb.o quickfile.o krakenutil.o seqreader.o uid_mapping.o hyperloglogplus.h taxdb.h report-cols.h
 	$(CXX) $(CXXFLAGS) -o classify $^ $(LIBFLAGS)
 
-build_taxdb: taxdb.h report-cols.h
+build_taxdb: taxdb.h report-cols.h quickfile.o
 
 make_seqid_to_taxid_map: quickfile.o
 

diff --git a/src/krakenutil.cpp b/src/krakenutil.cpp
@@ -21,6 +21,7 @@
 #include "kraken_headers.hpp"
 #include "krakenutil.hpp"
 #include <unordered_set>
+#include<algorithm>
 
 using namespace std;
 
@@ -50,7 +51,26 @@ namespace kraken {
   // Return lowest common ancestor of a and b
   // LCA(0,x) = LCA(x,0) = x
   // Default ancestor is 1 (root of tree)
-  uint32_t lca(const unordered_map<uint32_t, uint32_t> &parent_map, uint32_t a, uint32_t b) {
+  uint32_t lca(const unordered_map<uint32_t, uint32_t> &parent_map,
+    uint32_t a, uint32_t b)
+  {
+    if (a == 0 || b == 0)
+      return a ? a : b;
+
+    unordered_set<uint32_t> a_path;
+    while (a > 0) {
+      a_path.insert(a);
+      a = parent_map.at(a);
+    }
+    while (b > 0) {
+      if (a_path.count(b) > 0)
+        return b;
+      b = parent_map.at(b);
+    }
+    return 1;
+  }
+
+  uint32_t lca_vec(const unordered_map<uint32_t, uint32_t> &parent_map, uint32_t a, uint32_t b) {
     if (a == 0 || b == 0)
       return a ? a : b;
 
@@ -59,19 +79,19 @@ namespace kraken {
     do {
       if (a == b) 
         return a; 
-      a_path.insert(a);
+      a_path.push_back(a);
       a = parent_map.at(a);
-    } while (a != a_path.back())
+    } while (a != a_path.back());
 
     // search for b in the path from a to the root
     uint32_t last_b = 0;
     do {
-      if (a_path.find(b) != a_path.end())
+      if (std::find(a_path.begin(), a_path.end(), b) != a_path.end())
         return b;
 
       last_b = b;
       b = parent_map.at(b);
-    } while (last_b != b)
+    } while (last_b != b);
     return 1;
   }
 

diff --git a/src/set_lcas.cpp b/src/set_lcas.cpp
@@ -39,16 +39,16 @@ void usage(int exit_code=EX_USAGE);
 void process_files();
 void process_single_file();
 void process_file(string filename, uint32_t taxid);
-void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish);
+void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish, bool is_contaminant_taxid = false);
 
 int Num_threads = 1;
 string DB_filename, Index_filename,
   Output_DB_filename, TaxDB_filename,
   Kmer_count_filename,
   File_to_taxon_map_filename,
   ID_to_taxon_map_filename, Multi_fasta_filename;
-bool force_taxid = false;
-int New_taxid_start = 1000000000;
+bool force_contaminant_taxid = false;
+uint32_t New_taxid_start = 1000000000;
 
 bool Allow_extra_kmers = false;
 bool verbose = false;
@@ -88,7 +88,7 @@ int main(int argc, char **argv) {
 
   parse_command_line(argc, argv);
 
-  if (!TaxDB_filename.empty() && !force_taxid) {
+  if (!TaxDB_filename.empty()) {
     taxdb = TaxonomyDB<uint32_t, ReadCounts>(TaxDB_filename);
     for (const auto & tax : taxdb.taxIDsAndEntries) {
       if (tax.first != 0)
@@ -211,6 +211,15 @@ unordered_map<string,uint32_t> read_seqid_to_taxid_map(string ID_to_taxon_map_fi
   string line, seq_id, name;
   uint32_t taxid;
 
+  if (Add_taxIds_for_Assembly && Add_taxIds_for_Sequences) {
+    for (const auto& k : taxdb.taxIDsAndEntries) {
+      if (k.first >= New_taxid_start) {
+        New_taxid_start = k.first;
+      } 
+    }
+    cerr << "Starting new taxonomy IDs with " << (New_taxid_start+1) << endl;
+  }
+
   // Used when adding new taxids for assembly or sequence
   unordered_map<string, uint32_t> name_to_taxid_map;
 
@@ -298,6 +307,8 @@ void process_single_file() {
       continue;
     }
 
+    bool is_contaminant_taxid = taxid == 32630 || taxid == 81077;
+
     if (Add_taxIds_for_Sequences && taxid != 9606) {
       // Update entry based on header line
       auto entryIt = taxdb.taxIDsAndEntries.find(taxid);
@@ -319,7 +330,7 @@ void process_single_file() {
       } else {
         #pragma omp parallel for schedule(dynamic)
         for (size_t i = 0; i < dna.seq.size(); i += SKIP_LEN)
-          set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1);
+          set_lcas(taxid, dna.seq, i, i + SKIP_LEN + Database.get_k() - 1, is_contaminant_taxid);
          ++seqs_processed;
       }
     } else {
@@ -378,7 +389,7 @@ void process_sequence(DNASequence dna) {
   // Or maybe asembly_summary file?
 }
 
-void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) {
+void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish, bool is_contaminant_taxid) {
   KmerScanner scanner(seq, start, finish);
   uint64_t *kmer_ptr;
   uint32_t *val_ptr;
@@ -403,14 +414,12 @@ void set_lcas(uint32_t taxid, string &seq, size_t start, size_t finish) {
     if (Use_uids_instead_of_taxids) {
       #pragma omp critical(new_uid)
       *val_ptr = uid_mapping(Taxids_to_UID_map, UID_to_taxids_vec, taxid, *val_ptr, current_uid, UID_map_file);
-    } else if (!force_taxid && taxid != contaminant_taxids) {
-      if (Parent_map.find(taxid) != Parent_map.end()) {
-        *val_ptr = lca(Parent_map, taxid, *val_ptr);
-      }
-    } else {
-      // When force_taxid is set, do not compute lca, but assign the taxid
+    } else if (force_contaminant_taxid && is_contaminant_taxid) {
+      // When force_contaminant_taxid is set, do not compute lca, but assign the taxid
       // of the (last) sequence to k-mers
       *val_ptr = taxid;
+    } else {
+      *val_ptr = lca(Parent_map, taxid, *val_ptr);
     }
   }
 }
@@ -454,7 +463,7 @@ void parse_command_line(int argc, char **argv) {
         #endif
         break;
       case 'T' :
-        force_taxid = true;
+        force_contaminant_taxid = true;
         break;
       case 'v' :
         verbose = true;
@@ -516,8 +525,8 @@ void usage(int exit_code) {
        << "  -f filename      File to taxon map" << endl
        << "  -F filename      Multi-FASTA file with sequence data" << endl
        << "  -m filename      Sequence ID to taxon map" << endl
-       << "  -a               Add taxonomy IDs (starting with "<<New_taxid_start<<") for assemblies (third column in seqid2taxid.map) to Taxonomy DB" << endl
-       << "  -A               Add taxonomy IDs (starting with "<<New_taxid_start<<") for sequences to Taxonomy DB" << endl
+       << "  -a               Add taxonomy IDs (starting with "<<(New_taxid_start+1)<<") for assemblies (third column in seqid2taxid.map) to Taxonomy DB" << endl
+       << "  -A               Add taxonomy IDs (starting with "<<(New_taxid_start+1)<<") for sequences to Taxonomy DB" << endl
        << "  -T               Do not set LCA as taxid for kmers, but the taxid of the sequence" << endl
        << "  -I filename      Write UIDs into database, and output (binary) UID-to-taxid map to filename" << endl
        << "  -p               Pretend - do not write database back to disk (when working in RAM)" << endl

diff --git a/tests/build-dbs.sh b/tests/build-dbs.sh
@@ -7,14 +7,15 @@ set -eu
 
 
 build_db() {
-  K=$1; shift
-  NAM=$1; shift
+  local K=$1; shift
+  local NAM=$1; shift
+  local MIN=15
 
-  DB_NAM=refseq-$NAM-k$K
+  local DB_NAM=refseq-$NAM-k$K
   DB_DIR=$DIR/dbs/$DB_NAM
 
   mkdir -p $DB_DIR
-  CMD="krakenu-build --kmer-len $K --minimizer-len 12 --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database"
+  CMD="krakenu-build --kmer-len $K --minimizer-len $MIN --threads $THREADS --db $DB_DIR --build --taxids-for-genomes --taxids-for-sequences --taxonomy-dir=$DIR/data/taxonomy --uid-database"
   for L in $@; do
     CMD="$CMD  --library-dir=$DIR/data/library/$L"
   done
@@ -34,13 +35,18 @@ build_db() {
 }
 
 #export PATH="$DIR/install:$PATH"
-for K in 31 21; do
+for K in 31; do
   if [[ `uname` == "Darwin" ]]; then
     build_db $K viral viral
     build_db $K all-viral viral viral-neighbors
   else
     build_db $K oct2017 archaea-dusted bacteria-dusted viral-dusted viral-neighbors-dusted \
                          vertebrate_mammalian contaminants
+
+    EUKD=$DIR/dbs/refseq-euk-oct2017-k31
+    [[ -d $EUKD ]] || mkdir -p $EUKD
+    [[ -f $EUKD/taxDB ]] || cp -v $DB_DIR/taxDB $EUKD
+    build_db $K euk-oct2017 fungi protozoa
     #build_db $K bacteria bacteria archaea
   fi
 done

diff --git a/tests/init.sh b/tests/init.sh
@@ -9,10 +9,11 @@ set -xeu
 
 ## Download taxonomy and genomic data into data/
 #$DIR/install/krakenu-download --db $DIR/data -R --include-viral-neighbors taxonomy refseq/archaea refseq/bacteria refseq/viral/Any
+$DIR/install/krakenu-download --db $DIR/data -R refseq/fungi refseq/fungi/Chromosome refseq/protozoa refseq/protozoa/Chromosome
 #$DIR/install/krakenu-download --db $DIR/data --fna rna,genomic -R refseq/vertebrate_mammalian/Chromosome/taxid9606 
-$DIR/install/krakenu-download --db $DIR/data -R contaminants
+#$DIR/install/krakenu-download --db $DIR/data -R contaminants
 
-for i in viral viral-neighbors archaea bacteria; do 
+for i in fungi protozoa viral viral-neighbors archaea bacteria; do 
   [[ -s "$DIR/data/all-$i.fna" ]] || find $DIR/data/library/$i -name '*.fna' -exec cat {} \; > $DIR/data/all-$i.fna
   [[ -s "$DIR/data/all-$i.map" ]] || find $DIR/data/library/$i -name '*.map' -exec cat {} \; > $DIR/data/all-$i.map
   DUSTED_F="$DIR/data/all-$i-dusted.fna"