Skip to content

Commit

Permalink
Trying to make HLL more efficient with rval refs
Browse files Browse the repository at this point in the history
When merging, use r-value references to move the other sketch's data when this sketch has no observations (n_observed == 0).
  • Loading branch information
fbreitwieser committed Dec 14, 2017
1 parent c85bb96 commit 55d0078
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 43 deletions.
2 changes: 1 addition & 1 deletion src/classify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ void process_file(char *filename) {
//if (!res.second) {
// res.first->second += std::move(it->second);
//}
taxon_counts[it->first] += it->second;
taxon_counts[it->first] += std::move(it->second);
}

if (Print_kraken)
Expand Down
111 changes: 73 additions & 38 deletions src/hyperloglogplus.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,9 @@ void HyperLogLogPlusMinus<uint64_t>::add(uint64_t item) {
cerr << bitset<64>(hash_value) << endl;
#endif

if (sparse && this->sparseList.size() + 1 > this->m/4) {
switchToNormalRepresentation();
}
if (sparse) {
// sparse mode: put the encoded hash into sparse list
uint32_t encoded_hash_value = encodeHashIn32Bit(hash_value, pPrime, p);
Expand All @@ -443,9 +446,6 @@ void HyperLogLogPlusMinus<uint64_t>::add(uint64_t item) {
#endif

// if the sparseList is too large, switch to normal (register) representation
if (this->sparseList.size() > this->m/4) {
switchToNormalRepresentation();
}
} else {
// normal mode
// take first p bits as index {x63,...,x64-p}
Expand Down Expand Up @@ -514,65 +514,100 @@ uint64_t HyperLogLogPlusMinus<T>::nObserved() const {
return n_observed;
}

// Merge other HyperLogLogPlusMinus into this one. May convert to normal representation

template<typename T>
void HyperLogLogPlusMinus<T>::add(const HyperLogLogPlusMinus<T>* other) {
if (this->p != other->p) {
void HyperLogLogPlusMinus<T>::merge(HyperLogLogPlusMinus<T>&& other) {
if (this->p != other.p) {
throw std::invalid_argument("precisions must be equal");
}
if (other->n_observed == 0)
if (other.n_observed == 0)
return;

if (this->n_observed == 0) {
// TODO: Make this more efficient when other is disowned
n_observed = other->n_observed;
sparse = other->sparse;
sparseList = other->sparseList;
M = other->M;
return;
}

if (this->sparse && other->sparse) {
if (this->sparseList.size()+other->sparseList.size() > this->m) {
// TODO: this switches to normal representation too soon if there is duplication
switchToNormalRepresentation();
addToRegisters(other->sparseList);
n_observed = other.n_observed;
sparse = other.sparse;
sparseList = std::move(other.sparseList);
M = std::move(other.M);
} else {
n_observed += other.n_observed;
if (this->sparse && other.sparse) {
// this->merge(static_cast<const HyperLogLogPlusMinus<T>&>(other));
// consider using addHashToSparseList(this->sparseList, val, pPrime) and checking for sizes
this->sparseList.insert(other.sparseList.begin(), other.sparseList.end());
} else if (other.sparse) {
// other is sparse, but this is not
addToRegisters(other.sparseList);
} else {

for (const auto val : other->sparseList) {
addHashToSparseList(this->sparseList, val, pPrime);
if (this->sparse) {
this->sparse = false;
M = std::move(other.M);
addToRegisters(this->sparseList);
this->sparseList.clear();
} else {
// merge registers
for (size_t i = 0; i < other.M.size(); ++i) {
if (other.M[i] > this->M[i]) {
this->M[i] = other.M[i];
}
}
}
}
} else if (other->sparse) {
// other is sparse, but this is not
addToRegisters(other->sparseList);
}
}

// Merge other HyperLogLogPlusMinus into this one. May convert to normal representation
template<typename T>
void HyperLogLogPlusMinus<T>::merge(const HyperLogLogPlusMinus<T>& other) {
if (this->p != other.p) {
throw std::invalid_argument("precisions must be equal");
}
if (other.n_observed == 0)
return;

if (this->n_observed == 0) {
// TODO: Make this more efficient when other is disowned
n_observed = other.n_observed;
sparse = other.sparse;
sparseList = other.sparseList;
M = other.M;
} else {
if (this->sparse) {
switchToNormalRepresentation();
}
// merge registers
for (size_t i = 0; i < other->M.size(); ++i) {
if (other->M[i] > this->M[i]) {
this->M[i] = other->M[i];
n_observed += other.n_observed;
if (this->sparse && other.sparse) {
// consider using addHashToSparseList(this->sparseList, val, pPrime) and checking for sizes
this->sparseList.insert(other.sparseList.begin(), other.sparseList.end());
} else if (other.sparse) {
// other is sparse, but this is not
addToRegisters(other.sparseList);
} else {
if (this->sparse) {
this->sparse = false;
M = other.M;
addToRegisters(this->sparseList);
this->sparseList.clear();
} else {
// merge registers
for (size_t i = 0; i < other.M.size(); ++i) {
if (other.M[i] > this->M[i]) {
this->M[i] = other.M[i];
}
}
}
}
}
n_observed += other->n_observed;
}

template<typename T>
HyperLogLogPlusMinus<T>& HyperLogLogPlusMinus<T>::operator+=(const HyperLogLogPlusMinus<T>* other) {
add(other);
HyperLogLogPlusMinus<T>& HyperLogLogPlusMinus<T>::operator+=(HyperLogLogPlusMinus<T>&& other) {
merge(std::move(other));
return *this;
}

template<typename T>
HyperLogLogPlusMinus<T>& HyperLogLogPlusMinus<T>::operator+=(const HyperLogLogPlusMinus<T>& other) {
add(&other);
merge(other);
return *this;
}


template<>
uint64_t HyperLogLogPlusMinus<uint64_t>::flajoletCardinality(bool use_sparse_precision) const {
vector<uint8_t> M = this->M;
Expand Down
8 changes: 6 additions & 2 deletions src/hyperloglogplus.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,12 @@ class HyperLogLogPlusMinus {
// Add items or other HLL to this sketch
void add(uint64_t item);
void add(vector<uint64_t> items);
void add(const HyperLogLogPlusMinus<HASH>* other);
HyperLogLogPlusMinus<HASH>& operator+=(const HyperLogLogPlusMinus<HASH>* other);

// Merge another sketch into this one
// TODO: assumes equal bit_mixers! but does not check that
void merge(HyperLogLogPlusMinus<HASH>&& other);
void merge(const HyperLogLogPlusMinus<HASH>& other);
HyperLogLogPlusMinus<HASH>& operator+=(HyperLogLogPlusMinus<HASH>&& other);
HyperLogLogPlusMinus<HASH>& operator+=(const HyperLogLogPlusMinus<HASH>& other);

// Calculate cardinality estimates
Expand Down
9 changes: 8 additions & 1 deletion src/readcounts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ namespace kraken {
struct ReadCounts {
uint64_t n_reads;
// uint64_t n_kmers; // now in kmers.nObserved()
bool count_kmers = HLL_PRECISION == 0;
bool count_kmers = HLL_PRECISION == 0; // TODO: redo
HyperLogLogPlusMinus<uint64_t> kmers; // unique k-mer count per taxon

ReadCounts() : n_reads(0), count_kmers(HLL_PRECISION > 0) {
Expand All @@ -49,6 +49,13 @@ namespace kraken {
return *this;
}

// Move-merge: fold the counts of an expiring ReadCounts into this one.
// The read tally is always accumulated; the k-mer sketch of `b` is handed
// to the r-value overload of kmers' operator+= only when count_kmers is set
// on this object. Treat b.kmers as moved-from afterwards.
ReadCounts& operator+=(ReadCounts&& b) {
  n_reads += b.n_reads;
  if (!count_kmers) {
    // k-mer counting disabled: nothing more to merge.
    return *this;
  }
  kmers += std::move(b.kmers);
  return *this;
}

bool operator<(const ReadCounts& rc) {
if (n_reads < rc.n_reads) {
return true;
Expand Down
5 changes: 4 additions & 1 deletion src/taxdb.h
Original file line number Diff line number Diff line change
Expand Up @@ -947,7 +947,7 @@ TaxReport<TAXID,READCOUNTS>::TaxReport(std::ostream& reportOfb, const TaxonomyDB
const std::unordered_map<TAXID, READCOUNTS>& readCounts,
bool show_zeros) : _reportOfb(reportOfb), _taxdb(taxdb), _taxCounts(readCounts), _show_zeros(show_zeros) {

cerr << "Setting all the values in the TaxTree ...";
cerr << "Setting values in the taxonomy tree ...";
for (auto it = _taxCounts.begin(); it != _taxCounts.end(); ++it) {
auto tax_it = taxdb.entries.find(it->first);
if (tax_it == taxdb.entries.end()) {
Expand Down Expand Up @@ -986,6 +986,9 @@ void TaxReport<TAXID,READCOUNTS>::setReportCols(const std::vector<std::string>&

template<typename TAXID, typename READCOUNTS>
void TaxReport<TAXID,READCOUNTS>::printReport(const std::string& format) {

cerr << "Printing classification report ... " << endl;

const auto it_unclassified = _cladeCounts.find(0);
const auto it_classified = _cladeCounts.find(1);
const auto it_other = _cladeCounts.find(-1);
Expand Down

0 comments on commit 55d0078

Please sign in to comment.