From ef84b132bafcf3881e77f25d8bcde2cca51d00a2 Mon Sep 17 00:00:00 2001 From: Derrick Wood Date: Wed, 11 Feb 2015 00:24:08 -0500 Subject: [PATCH] Revert "Switch to sliding window minimum" This reverts commit d5daae90e043432bcc75949d78385837514cd0e7. --- CHANGELOG | 12 ++++------ src/classify.cpp | 45 ++++------------------------------- src/kraken_headers.hpp | 1 - src/krakendb.cpp | 54 +++++++++++++++++++++++++++++++++++------- src/krakendb.hpp | 9 +++---- 5 files changed, 60 insertions(+), 61 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index ff40241..cfe329f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,14 +4,12 @@ v0.10.5-beta: * improved support for adding multi-FASTA files with --add-to-library * allow assigning taxon IDs in reference sequences w/o GI numbers using "kraken:taxid" code -* included full sequence descriptions when using "--[un]classified-out" -* reduced memory usage of db_shrink (Build step 2 / kraken-build --shrink) -* reduced memory usage of db_sort (Build step 3) -* support added for KRAKEN_NUM_THREADS, KRAKEN_DB_PATH, and KRAKEN_DEFAULT_DB +* Included full sequence descriptions when using "--[un]classified-out" +* Reduced memory usage of db_shrink (Build step 2 / kraken-build --shrink) +* Reduced memory usage of db_sort (Build step 3) +* Support added for KRAKEN_NUM_THREADS, KRAKEN_DB_PATH, and KRAKEN_DEFAULT_DB env. variables -* added kraken-translate for getting taxonomic names for each sequence -* removed presumptive search, now using sliding window minimum calculation - of minimizers to increase classification speed +* Added kraken-translate for getting taxonomic names for each sequence v0.10.4-beta: * use GRCh38 for human genome library diff --git a/src/classify.cpp b/src/classify.cpp index 92d3911..87edc95 100644 --- a/src/classify.cpp +++ b/src/classify.cpp @@ -201,11 +201,6 @@ void process_file(char *filename) { delete reader; } -typedef struct { - uint64_t minimizer; - uint64_t pos; -} minimizer_pos_pair_t; - void classify_sequence(DNASequence &dna, ostringstream &koss, ostringstream &coss, ostringstream &uoss) { vector taxa; @@ -215,13 +210,9 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, uint32_t taxon = 0; uint32_t hits = 0; // only maintained if in quick mode - // Fields in support of sliding window min. calculation of minimizers (bin keys) - uint8_t minimizer_len = Database.get_index()->indexed_nt(); - uint64_t minimizer_mask = (1u << (minimizer_len * 2)) - 1; - deque minimizer_queue; - uint64_t candidate_pos = 0; // position in sequence since start/last ambig. nucl. - uint8_t k = KmerScanner::get_k(); - uint64_t xor_mask = Database.get_xor_mask() & minimizer_mask; + uint64_t current_bin_key; + int64_t current_min_pos = 1; + int64_t current_max_pos = 0; if (dna.seq.size() >= Database.get_k()) { KmerScanner scanner(dna.seq); @@ -229,39 +220,13 @@ void classify_sequence(DNASequence &dna, ostringstream &koss, taxon = 0; if (scanner.ambig_kmer()) { ambig_list.push_back(1); - minimizer_queue.clear(); } else { ambig_list.push_back(0); - if (minimizer_queue.empty()) { - candidate_pos = 0; - while (candidate_pos < k - minimizer_len + 1u) { - uint64_t candidate = Database.canonical_representation( - ((*kmer_ptr) >> (2 * (k - minimizer_len - candidate_pos))) & minimizer_mask, minimizer_len - ) ^ xor_mask; - while (! minimizer_queue.empty() && minimizer_queue.back().minimizer > candidate) { - minimizer_queue.pop_back(); - } - minimizer_pos_pair_t data = { candidate, candidate_pos++ }; - minimizer_queue.push_back(data); - } - } - else { - uint64_t candidate = Database.canonical_representation( - *kmer_ptr & minimizer_mask, minimizer_len - ) ^ xor_mask; - while (! minimizer_queue.empty() && minimizer_queue.back().minimizer > candidate) { - minimizer_queue.pop_back(); - } - minimizer_pos_pair_t data = { candidate, candidate_pos++ }; - minimizer_queue.push_back(data); - if (minimizer_queue.front().pos < candidate_pos - k + minimizer_len - 1u) { - minimizer_queue.pop_front(); - } - } uint32_t *val_ptr = Database.kmer_query( Database.canonical_representation(*kmer_ptr), - minimizer_queue.front().minimizer + ¤t_bin_key, + ¤t_min_pos, ¤t_max_pos ); taxon = val_ptr ? *val_ptr : 0; if (taxon) { diff --git a/src/kraken_headers.hpp b/src/kraken_headers.hpp index c74cc67..bca9858 100644 --- a/src/kraken_headers.hpp +++ b/src/kraken_headers.hpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/src/krakendb.cpp b/src/krakendb.cpp index 7fa5c4e..83e71b3 100644 --- a/src/krakendb.cpp +++ b/src/krakendb.cpp @@ -164,10 +164,6 @@ uint64_t KrakenDB::bin_key(uint64_t kmer) { return min_bin_key; } -uint64_t KrakenDB::get_xor_mask() { - return index_ptr->index_type() == 1 ? 0 : INDEX2_XOR_MASK; -} - // Code mostly from Jellyfish 1.6 source uint64_t KrakenDB::reverse_complement(uint64_t kmer, uint8_t n) { kmer = ((kmer >> 2) & 0x3333333333333333UL) | ((kmer & 0x3333333333333333UL) << 2); @@ -199,15 +195,36 @@ uint64_t KrakenDB::canonical_representation(uint64_t kmer) { return kmer < revcom ? kmer : revcom; } -uint32_t *KrakenDB::kmer_query(uint64_t kmer, uint64_t b_key) +// perform search over last range to speed up queries +// NOTE: retry_on_failure implies all pointer params are non-NULL +uint32_t *KrakenDB::kmer_query(uint64_t kmer, uint64_t *last_bin_key, + int64_t *min_pos, int64_t *max_pos, + bool retry_on_failure) { int64_t min, max, mid; uint64_t comp_kmer; + uint64_t b_key; char *ptr = get_pair_ptr(); size_t pair_sz = pair_size(); - min = index_ptr->at(b_key); - max = index_ptr->at(b_key + 1) - 1; + // Use provided values if they exist and are valid + if (retry_on_failure && *min_pos <= *max_pos) { + b_key = *last_bin_key; + min = *min_pos; + max = *max_pos; + } + else { + b_key = bin_key(kmer); + min = index_ptr->at(b_key); + max = index_ptr->at(b_key + 1) - 1; + // Invalid min/max values + retry_on_failure means min/max need to be + // initialized and set in caller + if (retry_on_failure) { + *last_bin_key = b_key; + *min_pos = min; + *max_pos = max; + } + } // Binary search with large window while (min + 15 <= max) { @@ -231,12 +248,31 @@ uint32_t *KrakenDB::kmer_query(uint64_t kmer, uint64_t b_key) return (uint32_t *) (ptr + pair_sz * mid + key_len); } - return NULL; + uint32_t *answer = NULL; + // ROF implies the provided values might be out of date + // If they are, we'll update them and search again + if (retry_on_failure) { + b_key = bin_key(kmer); + // If bin key hasn't changed, search fails + if (b_key == *last_bin_key) + return NULL; + min = index_ptr->at(b_key); + max = index_ptr->at(b_key + 1) - 1; + // Recursive call w/ adjusted search params and w/o retry + answer = kmer_query(kmer, &b_key, &min, &max, false); + // Update caller's search params due to bin key change + if (last_bin_key != NULL) { + *last_bin_key = b_key; + *min_pos = min; + *max_pos = max; + } + } + return answer; } // Binary search w/in the k-mer's bin uint32_t *KrakenDB::kmer_query(uint64_t kmer) { - return kmer_query(kmer, bin_key(kmer)); + return kmer_query(kmer, NULL, NULL, NULL, false); } KrakenDBIndex::KrakenDBIndex() { diff --git a/src/krakendb.hpp b/src/krakendb.hpp index e99700f..c30eeb5 100644 --- a/src/krakendb.hpp +++ b/src/krakendb.hpp @@ -56,14 +56,15 @@ namespace kraken { size_t header_size(); // Jellyfish uses variable header sizes uint32_t *kmer_query(uint64_t kmer); // return ptr to pair w/ kmer - // Assume correct bin key has been calculated and provided - uint32_t *kmer_query(uint64_t kmer, uint64_t bin_key); - + // perform search over last range to speed up queries + uint32_t *kmer_query(uint64_t kmer, uint64_t *last_bin_key, + int64_t *min_pos, int64_t *max_pos, + bool retry_on_failure=true); + // return "bin key" for kmer, based on index // If idx_nt not specified, use index's value uint64_t bin_key(uint64_t kmer, uint64_t idx_nt); uint64_t bin_key(uint64_t kmer); - uint64_t get_xor_mask(); // Code from Jellyfish, rev. comp. of a k-mer with n nt. // If n is not specified, use k in DB, otherwise use first n nt in kmer