Skip to content

Commit

Permalink
Support Bloom filter format used in SSTables 3.0.
Browse files Browse the repository at this point in the history
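The two hash values used to produce the indices of the bits set in the
filter, base and increment, are swapped in SSTables 3.0 (the 'mc' format):
older formats take the base from the first hash value and the increment from
the second, while 3.0 takes the base from the second and the increment from
the first. Filters are therefore told which order to use when they are
created, both when reading an existing filter and when writing a new one.
See CASSANDRA-8413 for details.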

Signed-off-by: Vladimir Krivopalov <[email protected]>
  • Loading branch information
vkrivopalov committed May 8, 2018
1 parent fe2358e commit 0f37c0e
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 21 deletions.
6 changes: 3 additions & 3 deletions sstables/sstables.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1454,7 +1454,7 @@ future<> sstable::read_filter(const io_priority_class& pc) {
read_simple<component_type::Filter>(filter, pc).get();
auto nr_bits = filter.buckets.elements.size() * std::numeric_limits<typename decltype(filter.buckets.elements)::value_type>::digits;
large_bitset bs(nr_bits, std::move(filter.buckets.elements));
_components->filter = utils::filter::create_filter(filter.hashes, std::move(bs));
_components->filter = utils::filter::create_filter(filter.hashes, std::move(bs), (_version != sstable_version_types::mc));
});
}

Expand Down Expand Up @@ -2218,7 +2218,7 @@ components_writer::components_writer(sstable& sst, const schema& s, file_writer&
, _range_tombstones(s)
, _large_partition_handler(cfg.large_partition_handler)
{
_sst._components->filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance());
_sst._components->filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance(), true);
_sst._pi_write.desired_block_size = cfg.promoted_index_block_size.value_or(get_config().column_index_size_in_kb() * 1024);
_sst._correctly_serialize_non_compound_range_tombstones = cfg.correctly_serialize_non_compound_range_tombstones;
_index_sampling_state.summary_byte_cost = summary_byte_cost();
Expand Down Expand Up @@ -2721,7 +2721,7 @@ class sstable_writer_m : public sstable_writer::writer_impl {
_sst._shards = { shard };

_cfg.monitor->on_write_started(_data_writer->offset_tracker());
_sst._components->filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance());
_sst._components->filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance(), false);
_pi_write_m.desired_block_size = cfg.promoted_index_block_size.value_or(get_config().column_index_size_in_kb() * 1024);
_sst._correctly_serialize_non_compound_range_tombstones = _cfg.correctly_serialize_non_compound_range_tombstones;
_index_sampling_state.summary_byte_cost = summary_byte_cost();
Expand Down
18 changes: 9 additions & 9 deletions utils/bloom_filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,10 @@ namespace utils {
namespace filter {

template<typename Func>
void for_each_index(hashed_key hk, int count, int64_t max, Func&& func) {
void for_each_index(hashed_key hk, int count, int64_t max, bool old_bf_hash_order, Func&& func) {
auto h = hk.hash();
int64_t base = h[0];
int64_t inc = h[1];
int64_t base = old_bf_hash_order ? h[0] : h[1];
int64_t inc = old_bf_hash_order ? h[1] : h[0];
for (int i = 0; i < count; i++) {
if (func(std::abs(base % max)) == stop_iteration::yes) {
break;
Expand All @@ -67,7 +67,7 @@ void for_each_index(hashed_key hk, int count, int64_t max, Func&& func) {

bool bloom_filter::is_present(hashed_key key) {
bool result = true;
for_each_index(key, _hash_count, _bitset.size(), [this, &result] (auto i) {
for_each_index(key, _hash_count, _bitset.size(), _old_bf_hash_order, [this, &result] (auto i) {
if (!_bitset.test(i)) {
result = false;
return stop_iteration::yes;
Expand All @@ -78,7 +78,7 @@ bool bloom_filter::is_present(hashed_key key) {
}

void bloom_filter::add(const bytes_view& key) {
for_each_index(make_hashed_key(key), _hash_count, _bitset.size(), [this] (auto i) {
for_each_index(make_hashed_key(key), _hash_count, _bitset.size(), _old_bf_hash_order, [this] (auto i) {
_bitset.set(i);
return stop_iteration::no;
});
Expand All @@ -88,15 +88,15 @@ bool bloom_filter::is_present(const bytes_view& key) {
return is_present(make_hashed_key(key));
}

filter_ptr create_filter(int hash, large_bitset&& bitset) {
return std::make_unique<murmur3_bloom_filter>(hash, std::move(bitset));
filter_ptr create_filter(int hash, large_bitset&& bitset, bool old_bf_hash_order) {
return std::make_unique<murmur3_bloom_filter>(hash, std::move(bitset), old_bf_hash_order);
}

filter_ptr create_filter(int hash, int64_t num_elements, int buckets_per) {
filter_ptr create_filter(int hash, int64_t num_elements, int buckets_per, bool old_bf_hash_order) {
int64_t num_bits = (num_elements * buckets_per) + bloom_calculations::EXCESS;
num_bits = align_up<int64_t>(num_bits, 64); // Seems to be implied in origin
large_bitset bitset(num_bits);
return std::make_unique<murmur3_bloom_filter>(hash, std::move(bitset));
return std::make_unique<murmur3_bloom_filter>(hash, std::move(bitset), old_bf_hash_order);
}
}
}
17 changes: 11 additions & 6 deletions utils/bloom_filter.hh
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,16 @@ public:
private:
bitmap _bitset;
int _hash_count;
bool _old_bf_hash_order;
public:
int num_hashes() { return _hash_count; }
bitmap& bits() { return _bitset; }

bloom_filter(int hashes, bitmap&& bs) : _bitset(std::move(bs)), _hash_count(hashes) {
}
bloom_filter(int hashes, bitmap&& bs, bool old_bf_hash_order)
: _bitset(std::move(bs))
, _hash_count(hashes)
, _old_bf_hash_order(old_bf_hash_order)
{}

virtual void add(const bytes_view& key) override;

Expand All @@ -84,8 +88,9 @@ public:

struct murmur3_bloom_filter: public bloom_filter {

murmur3_bloom_filter(int hashes, bitmap&& bs) : bloom_filter(hashes, std::move(bs)) {}

murmur3_bloom_filter(int hashes, bitmap&& bs, bool old_bf_hash_order)
: bloom_filter(hashes, std::move(bs), old_bf_hash_order)
{}
};

struct always_present_filter: public i_filter {
Expand All @@ -109,7 +114,7 @@ struct always_present_filter: public i_filter {
}
};

filter_ptr create_filter(int hash, large_bitset&& bitset);
filter_ptr create_filter(int hash, int64_t num_elements, int buckets_per);
filter_ptr create_filter(int hash, large_bitset&& bitset, bool old_bf_hash_order);
filter_ptr create_filter(int hash, int64_t num_elements, int buckets_per, bool old_bf_hash_order);
}
}
4 changes: 2 additions & 2 deletions utils/i_filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
namespace utils {
static logging::logger filterlog("bloom_filter");

filter_ptr i_filter::get_filter(int64_t num_elements, double max_false_pos_probability) {
filter_ptr i_filter::get_filter(int64_t num_elements, double max_false_pos_probability, bool old_bf_hash_order) {
assert(seastar::thread::running_in_thread());

if (max_false_pos_probability > 1.0) {
Expand All @@ -41,7 +41,7 @@ filter_ptr i_filter::get_filter(int64_t num_elements, double max_false_pos_proba

int buckets_per_element = bloom_calculations::max_buckets_per_element(num_elements);
auto spec = bloom_calculations::compute_bloom_spec(buckets_per_element, max_false_pos_probability);
return filter::create_filter(spec.K, num_elements, spec.buckets_per_element);
return filter::create_filter(spec.K, num_elements, spec.buckets_per_element, old_bf_hash_order);
}

hashed_key make_hashed_key(bytes_view b) {
Expand Down
2 changes: 1 addition & 1 deletion utils/i_filter.hh
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,6 @@ struct i_filter {
* Asserts that the given probability can be satisfied using this
* filter.
*/
static filter_ptr get_filter(int64_t num_elements, double max_false_pos_prob);
static filter_ptr get_filter(int64_t num_elements, double max_false_pos_prob, bool old_bf_hash_order);
};
}

0 comments on commit 0f37c0e

Please sign in to comment.