Skip to content

Commit

Permalink
Revert "Switch to sliding window minimum"
Browse files Browse the repository at this point in the history
This reverts commit d5daae9.
  • Loading branch information
DerrickWood committed Feb 11, 2015
1 parent d5daae9 commit ef84b13
Showing 5 changed files with 60 additions and 61 deletions.
12 changes: 5 additions & 7 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -4,14 +4,12 @@ v0.10.5-beta:
* improved support for adding multi-FASTA files with --add-to-library
* allow assigning taxon IDs in reference sequences w/o GI numbers
using "kraken:taxid" code
* included full sequence descriptions when using "--[un]classified-out"
* reduced memory usage of db_shrink (Build step 2 / kraken-build --shrink)
* reduced memory usage of db_sort (Build step 3)
* support added for KRAKEN_NUM_THREADS, KRAKEN_DB_PATH, and KRAKEN_DEFAULT_DB
* Included full sequence descriptions when using "--[un]classified-out"
* Reduced memory usage of db_shrink (Build step 2 / kraken-build --shrink)
* Reduced memory usage of db_sort (Build step 3)
* Support added for KRAKEN_NUM_THREADS, KRAKEN_DB_PATH, and KRAKEN_DEFAULT_DB
env. variables
* added kraken-translate for getting taxonomic names for each sequence
* removed presumptive search, now using sliding window minimum calculation
of minimizers to increase classification speed
* Added kraken-translate for getting taxonomic names for each sequence

v0.10.4-beta:
* use GRCh38 for human genome library
45 changes: 5 additions & 40 deletions src/classify.cpp
Original file line number Diff line number Diff line change
@@ -201,11 +201,6 @@ void process_file(char *filename) {
delete reader;
}

typedef struct {
uint64_t minimizer;
uint64_t pos;
} minimizer_pos_pair_t;

void classify_sequence(DNASequence &dna, ostringstream &koss,
ostringstream &coss, ostringstream &uoss) {
vector<uint32_t> taxa;
@@ -215,53 +210,23 @@ void classify_sequence(DNASequence &dna, ostringstream &koss,
uint32_t taxon = 0;
uint32_t hits = 0; // only maintained if in quick mode

// Fields in support of sliding window min. calculation of minimizers (bin keys)
uint8_t minimizer_len = Database.get_index()->indexed_nt();
uint64_t minimizer_mask = (1u << (minimizer_len * 2)) - 1;
deque<minimizer_pos_pair_t> minimizer_queue;
uint64_t candidate_pos = 0; // position in sequence since start/last ambig. nucl.
uint8_t k = KmerScanner::get_k();
uint64_t xor_mask = Database.get_xor_mask() & minimizer_mask;
uint64_t current_bin_key;
int64_t current_min_pos = 1;
int64_t current_max_pos = 0;

if (dna.seq.size() >= Database.get_k()) {
KmerScanner scanner(dna.seq);
while ((kmer_ptr = scanner.next_kmer()) != NULL) {
taxon = 0;
if (scanner.ambig_kmer()) {
ambig_list.push_back(1);
minimizer_queue.clear();
}
else {
ambig_list.push_back(0);
if (minimizer_queue.empty()) {
candidate_pos = 0;
while (candidate_pos < k - minimizer_len + 1u) {
uint64_t candidate = Database.canonical_representation(
((*kmer_ptr) >> (2 * (k - minimizer_len - candidate_pos))) & minimizer_mask, minimizer_len
) ^ xor_mask;
while (! minimizer_queue.empty() && minimizer_queue.back().minimizer > candidate) {
minimizer_queue.pop_back();
}
minimizer_pos_pair_t data = { candidate, candidate_pos++ };
minimizer_queue.push_back(data);
}
}
else {
uint64_t candidate = Database.canonical_representation(
*kmer_ptr & minimizer_mask, minimizer_len
) ^ xor_mask;
while (! minimizer_queue.empty() && minimizer_queue.back().minimizer > candidate) {
minimizer_queue.pop_back();
}
minimizer_pos_pair_t data = { candidate, candidate_pos++ };
minimizer_queue.push_back(data);
if (minimizer_queue.front().pos < candidate_pos - k + minimizer_len - 1u) {
minimizer_queue.pop_front();
}
}
uint32_t *val_ptr = Database.kmer_query(
Database.canonical_representation(*kmer_ptr),
minimizer_queue.front().minimizer
&current_bin_key,
&current_min_pos, &current_max_pos
);
taxon = val_ptr ? *val_ptr : 0;
if (taxon) {
1 change: 0 additions & 1 deletion src/kraken_headers.hpp
Original file line number Diff line number Diff line change
@@ -27,7 +27,6 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <deque>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
54 changes: 45 additions & 9 deletions src/krakendb.cpp
Original file line number Diff line number Diff line change
@@ -164,10 +164,6 @@ uint64_t KrakenDB::bin_key(uint64_t kmer) {
return min_bin_key;
}

uint64_t KrakenDB::get_xor_mask() {
return index_ptr->index_type() == 1 ? 0 : INDEX2_XOR_MASK;
}

// Code mostly from Jellyfish 1.6 source
uint64_t KrakenDB::reverse_complement(uint64_t kmer, uint8_t n) {
kmer = ((kmer >> 2) & 0x3333333333333333UL) | ((kmer & 0x3333333333333333UL) << 2);
@@ -199,15 +195,36 @@ uint64_t KrakenDB::canonical_representation(uint64_t kmer) {
return kmer < revcom ? kmer : revcom;
}

uint32_t *KrakenDB::kmer_query(uint64_t kmer, uint64_t b_key)
// perform search over last range to speed up queries
// NOTE: retry_on_failure implies all pointer params are non-NULL
uint32_t *KrakenDB::kmer_query(uint64_t kmer, uint64_t *last_bin_key,
int64_t *min_pos, int64_t *max_pos,
bool retry_on_failure)
{
int64_t min, max, mid;
uint64_t comp_kmer;
uint64_t b_key;
char *ptr = get_pair_ptr();
size_t pair_sz = pair_size();

min = index_ptr->at(b_key);
max = index_ptr->at(b_key + 1) - 1;
// Use provided values if they exist and are valid
if (retry_on_failure && *min_pos <= *max_pos) {
b_key = *last_bin_key;
min = *min_pos;
max = *max_pos;
}
else {
b_key = bin_key(kmer);
min = index_ptr->at(b_key);
max = index_ptr->at(b_key + 1) - 1;
// Invalid min/max values + retry_on_failure means min/max need to be
// initialized and set in caller
if (retry_on_failure) {
*last_bin_key = b_key;
*min_pos = min;
*max_pos = max;
}
}

// Binary search with large window
while (min + 15 <= max) {
@@ -231,12 +248,31 @@ uint32_t *KrakenDB::kmer_query(uint64_t kmer, uint64_t b_key)
return (uint32_t *) (ptr + pair_sz * mid + key_len);
}

return NULL;
uint32_t *answer = NULL;
// ROF implies the provided values might be out of date
// If they are, we'll update them and search again
if (retry_on_failure) {
b_key = bin_key(kmer);
// If bin key hasn't changed, search fails
if (b_key == *last_bin_key)
return NULL;
min = index_ptr->at(b_key);
max = index_ptr->at(b_key + 1) - 1;
// Recursive call w/ adjusted search params and w/o retry
answer = kmer_query(kmer, &b_key, &min, &max, false);
// Update caller's search params due to bin key change
if (last_bin_key != NULL) {
*last_bin_key = b_key;
*min_pos = min;
*max_pos = max;
}
}
return answer;
}

// Binary search w/in the k-mer's bin
uint32_t *KrakenDB::kmer_query(uint64_t kmer) {
return kmer_query(kmer, bin_key(kmer));
return kmer_query(kmer, NULL, NULL, NULL, false);
}

KrakenDBIndex::KrakenDBIndex() {
9 changes: 5 additions & 4 deletions src/krakendb.hpp
Original file line number Diff line number Diff line change
@@ -56,14 +56,15 @@ namespace kraken {
size_t header_size(); // Jellyfish uses variable header sizes
uint32_t *kmer_query(uint64_t kmer); // return ptr to pair w/ kmer

// Assume correct bin key has been calculated and provided
uint32_t *kmer_query(uint64_t kmer, uint64_t bin_key);

// perform search over last range to speed up queries
uint32_t *kmer_query(uint64_t kmer, uint64_t *last_bin_key,
int64_t *min_pos, int64_t *max_pos,
bool retry_on_failure=true);

// return "bin key" for kmer, based on index
// If idx_nt not specified, use index's value
uint64_t bin_key(uint64_t kmer, uint64_t idx_nt);
uint64_t bin_key(uint64_t kmer);
uint64_t get_xor_mask();

// Code from Jellyfish, rev. comp. of a k-mer with n nt.
// If n is not specified, use k in DB, otherwise use first n nt in kmer

0 comments on commit ef84b13

Please sign in to comment.