Skip to content

Commit

Permalink
Handle sub-8 byte k-mers properly
Browse files Browse the repository at this point in the history
  • Loading branch information
DerrickWood committed Dec 14, 2014
1 parent fe83728 commit 8812161
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 16 deletions.
22 changes: 20 additions & 2 deletions src/db_sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,13 @@ uint8_t Bin_key_nt = 15;
int Num_threads = 1;
bool Zero_vals = false;
bool Operate_in_RAM = false;
// Byte length of each k-mer key. Defaults to 8; main() overwrites it with the
// input database's get_key_len(). It is a global because the qsort()
// comparator (pair_cmp) is handed only the two element pointers, so there is
// no argument through which the key length could be passed.
size_t Key_len = 8;
#ifdef _OPENMP
// Array of OpenMP locks indexed by bin key; bin_and_sort_data() acquires
// Locks[bin_key] for the bin a pair belongs to.
omp_lock_t *Locks;
#endif

// qsort() comparator: orders two pairs by their leading Key_len bytes.
static int pair_cmp(const void *a, const void *b);
static void parse_command_line(int argc, char **argv);
static void bin_and_sort_data(KrakenDB &in, KrakenDB &out);
static void usage(int exit_code=EX_USAGE);
Expand All @@ -46,6 +49,7 @@ int main(int argc, char **argv) {

QuickFile input_db_file(Input_DB_filename);
KrakenDB input_db(input_db_file.ptr());
Key_len = input_db.get_key_len();

input_db.make_index(Index_filename, Bin_key_nt);
QuickFile index_file(Index_filename);
Expand Down Expand Up @@ -100,7 +104,9 @@ static void bin_and_sort_data(KrakenDB &in, KrakenDB &out) {
vector<uint64_t> pos(offsets, offsets + entries);
#pragma omp parallel for schedule(dynamic,400)
for (uint64_t i = 0; i < in.get_key_ct(); i++) {
uint64_t bin_key = in.bin_key(* (uint64_t *)(in_ptr + i * in.pair_size()));
uint64_t kmer = 0;
memcpy(&kmer, in_ptr + i * in.pair_size(), Key_len);
uint64_t bin_key = in.bin_key(kmer);
#ifdef _OPENMP
omp_set_lock(&Locks[bin_key]);
#endif
Expand Down Expand Up @@ -128,10 +134,22 @@ static void bin_and_sort_data(KrakenDB &in, KrakenDB &out) {
for (uint64_t i = 0; i < entries; i++) {
qsort(out_ptr + offsets[i] * out.pair_size(),
offsets[i+1] - offsets[i], out.pair_size(),
KrakenDB::pair_cmp);
pair_cmp);
}
}

// qsort() comparator for key/value pairs: orders two records by the k-mer
// held in their leading Key_len bytes.
// Only Key_len bytes are copied into zero-initialised 64-bit integers, so
// any bytes stored after the key never take part in the comparison.
// Returns -1 if a's key is smaller, 0 if the keys are equal, 1 otherwise.
static int pair_cmp(const void *a, const void *b) {
  uint64_t lhs_key = 0;
  uint64_t rhs_key = 0;
  memcpy(&lhs_key, a, Key_len);
  memcpy(&rhs_key, b, Key_len);
  if (lhs_key == rhs_key)
    return 0;
  return lhs_key < rhs_key ? -1 : 1;
}

void parse_command_line(int argc, char **argv) {
int opt;
int sig;
Expand Down
19 changes: 7 additions & 12 deletions src/krakendb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ void KrakenDB::make_index(string index_filename, uint8_t nt) {
char *ptr = get_pair_ptr();
#pragma omp parallel for schedule(dynamic,400)
for (uint64_t i = 0; i < key_ct; i++) {
uint64_t b_key = bin_key(* (uint64_t *) (ptr + i * pair_size()), nt);
uint64_t kmer = 0;
memcpy(&kmer, ptr + i * pair_size(), key_len);
uint64_t b_key = bin_key(kmer, nt);
#pragma omp atomic
bin_counts[b_key]++;
}
Expand Down Expand Up @@ -115,15 +117,6 @@ void KrakenDB::set_index(KrakenDBIndex *i_ptr) {
index_ptr = i_ptr;
}

// Comparator used when sorting the pairs inside a bin.
// Reads one 64-bit word from the start of each pair and compares the two
// words numerically (noted by the author as also matching lexicographic
// k-mer order).
// Returns -1 when a < b, 1 when a > b, and 0 when they are equal.
int KrakenDB::pair_cmp(const void *a, const void *b) {
  const uint64_t lhs = * ((uint64_t *) a);
  const uint64_t rhs = * ((uint64_t *) b);
  if (lhs < rhs)
    return -1;
  if (lhs > rhs)
    return 1;
  return 0;
}

// Simple accessors/convenience methods
uint8_t KrakenDB::get_k() { return k; }
uint64_t KrakenDB::get_key_bits() { return key_bits; }
Expand Down Expand Up @@ -236,7 +229,8 @@ uint32_t *KrakenDB::kmer_query(uint64_t kmer, uint64_t *last_bin_key,
// Binary search with large window
while (min + 15 <= max) {
mid = min + (max - min) / 2;
comp_kmer = * (uint64_t *) (ptr + pair_sz * mid);
comp_kmer = 0;
memcpy(&comp_kmer, ptr + pair_sz * mid, key_len);
comp_kmer &= (1ull << key_bits) - 1; // trim any excess
if (kmer > comp_kmer)
min = mid + 1;
Expand All @@ -247,7 +241,8 @@ uint32_t *KrakenDB::kmer_query(uint64_t kmer, uint64_t *last_bin_key,
}
// Linear search once window shrinks
for (mid = min; mid <= max; mid++) {
comp_kmer = * (uint64_t *) (ptr + pair_sz * mid);
comp_kmer = 0;
memcpy(&comp_kmer, ptr + pair_sz * mid, key_len);
comp_kmer &= (1ull << key_bits) - 1; // trim any excess
if (kmer == comp_kmer)
return (uint32_t *) (ptr + pair_sz * mid + key_len);
Expand Down
2 changes: 0 additions & 2 deletions src/krakendb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ namespace kraken {

class KrakenDB {
public:
// Compares key/value pairs in DB, - => a<b, 0 => a=b, + => a>b
static int pair_cmp(const void *a, const void *b);

char *get_ptr(); // Return the file pointer
char *get_pair_ptr(); // Return pointer to start of pairs
Expand Down

0 comments on commit 8812161

Please sign in to comment.