Skip to content

Commit

Permalink
Multi-bit RLE
Browse files Browse the repository at this point in the history
* Import Impala's multi-bit RLE:
- Updated the fixed-size buffer code to use faststring
- Re-implemented single-bit RLE APIs introduced in Kudu
- Templatize RleEncoder/Decoder (not all APIs are passed the type; templating
also allows the compiler to generate type-specific code. NB: buffered values
remain uint64_t for performance)
- Remove BitWriter#Finish as we no longer write partially-populated bytes
- Import bit-util.h & test from Impala, should merge with bitmap.h later
- Fix some lint warnings
* Add a bit stream read/write and RLE microbenchmark; performance on
release builds is preserved.

Benchmark results before:

BooleanBitStream(Prealloc)

         35.791540 task-clock                #    0.986 CPUs utilized
                 3 context-switches          #    0.000 M/sec
                 0 CPU-migrations            #    0.000 M/sec
               983 page-faults               #    0.027 M/sec
        64,126,619 cycles                    #    1.792 GHz                     [77.75%]
        30,326,772 stalled-cycles-frontend   #   47.29% frontend cycles idle    [77.64%]
         6,513,738 stalled-cycles-backend    #   10.16% backend  cycles idle    [75.74%]
       112,617,774 instructions              #    1.76  insns per cycle
                                             #    0.27  stalled cycles per insn [88.85%]
        30,659,968 branches                  #  856.626 M/sec                   [88.84%]
            30,433 branch-misses             #    0.10% of all branches         [86.20%]

       0.036281802 seconds time elapsed

BooleanRLE(Prealloc)

       1034.815160 task-clock                #    0.998 CPUs utilized
                87 context-switches          #    0.000 M/sec
                 3 CPU-migrations            #    0.000 M/sec
               743 page-faults               #    0.001 M/sec
     3,016,920,461 cycles                    #    2.915 GHz                     [83.37%]
        61,362,300 stalled-cycles-frontend   #    2.03% frontend cycles idle    [83.39%]
       234,655,764 stalled-cycles-backend    #    7.78% backend  cycles idle    [66.78%]
     7,886,418,908 instructions              #    2.61  insns per cycle
                                             #    0.03  stalled cycles per insn [83.36%]
     2,960,156,594 branches                  # 2860.566 M/sec                   [83.43%]
            68,001 branch-misses             #    0.00% of all branches         [83.31%]

       1.036906394 seconds time elapsed

Benchmark results after:

BooleanBitStream(Prealloc)

         30.804692 task-clock                #    0.984 CPUs utilized
                 3 context-switches          #    0.000 M/sec
                 0 CPU-migrations            #    0.000 M/sec
               983 page-faults               #    0.032 M/sec
        70,225,460 cycles                    #    2.280 GHz                     [74.17%]
         8,950,196 stalled-cycles-frontend   #   12.74% frontend cycles idle    [86.58%]
         5,749,925 stalled-cycles-backend    #    8.19% backend  cycles idle    [74.19%]
       169,178,749 instructions              #    2.41  insns per cycle
                                             #    0.05  stalled cycles per insn [87.09%]
        37,462,695 branches                  # 1216.136 M/sec                   [86.92%]
            25,178 branch-misses             #    0.07% of all branches         [81.07%]

       0.031313790 seconds time elapsed

BooleanRLE(Prealloc)

       1019.328238 task-clock                #    0.998 CPUs utilized
                86 context-switches          #    0.000 M/sec
                 1 CPU-migrations            #    0.000 M/sec
               743 page-faults               #    0.001 M/sec
     2,988,429,141 cycles                    #    2.932 GHz                     [83.16%]
        22,229,870 stalled-cycles-frontend   #    0.74% frontend cycles idle    [83.10%]
       154,395,031 stalled-cycles-backend    #    5.17% backend  cycles idle    [67.00%]
     7,902,188,553 instructions              #    2.64  insns per cycle
                                             #    0.02  stalled cycles per insn [83.54%]
     2,951,981,394 branches                  # 2896.007 M/sec                   [83.57%]
            51,824 branch-misses             #    0.00% of all branches         [83.35%]

       1.021079053 seconds time elapsed

Change-Id: Id442121b6bc169ac51f73d626be4139de44d29a0
Reviewed-on: http://gerrit.ent.cloudera.com:8080/151
Tested-by: Todd Lipcon <[email protected]>
Reviewed-by: Todd Lipcon <[email protected]>
  • Loading branch information
elicollins authored and toddlipcon committed Aug 20, 2013
1 parent 044360f commit a268c03
Show file tree
Hide file tree
Showing 12 changed files with 586 additions and 203 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ oprofile_data
*.proxy.h
protoc-gen-krpc
tpch1
rle
cfile-dump
rwlock-perf
rpc-bench
4 changes: 4 additions & 0 deletions src/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@
# tpch1
ADD_EXECUTABLE(tpch1 tpch/tpch1.cc)
TARGET_LINK_LIBRARIES(tpch1 ${KUDU_TEST_LINK_LIBS})

# rle
ADD_EXECUTABLE(rle rle.cc)
TARGET_LINK_LIBRARIES(rle ${KUDU_TEST_LINK_LIBS})
102 changes: 102 additions & 0 deletions src/benchmarks/rle.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// Copyright (c) 2013, Cloudera, inc.
//
// Micro-benchmark for writing/reading bit streams and the Kudu-specific
// run-length encoding (RLE) APIs. Currently it only covers booleans and
// the most performance-sensitive APIs. NB: Impala contains an RLE
// micro-benchmark (rle-benchmark.cc).
//

#include <glog/logging.h>

#include "util/bit-stream-utils.h"
#include "util/rle-encoding.h"
#include "util/stopwatch.h"

namespace kudu {

// Measure writing and reading single-bit streams
void BooleanBitStream() {
const int num_iters = 1024 * 1024;

faststring buffer(1024 * 1024);
BitWriter writer(&buffer);

// Write alternating strings of repeating 0's and 1's
for (int i = 0; i < num_iters; ++i) {
writer.PutValue(i % 2, 1);
writer.PutValue(i % 2, 1);
writer.PutValue(i % 2, 1);
writer.PutValue(i % 2, 1);
writer.PutValue(i % 2, 1);
writer.PutValue(i % 2, 1);
writer.PutValue(i % 2, 1);
writer.PutValue(i % 2, 1);
}
writer.Flush();

LOG(INFO) << "Wrote " << writer.bytes_written() << " bytes";

BitReader reader(buffer.data(), writer.bytes_written());
for (int i = 0; i < num_iters; ++i) {
bool val;
reader.GetValue(1, &val);
reader.GetValue(1, &val);
reader.GetValue(1, &val);
reader.GetValue(1, &val);
reader.GetValue(1, &val);
reader.GetValue(1, &val);
reader.GetValue(1, &val);
reader.GetValue(1, &val);
}
}

// Measure bulk puts and decoding runs of RLE bools
void BooleanRLE() {
const int num_iters = 3 * 1024;

faststring buffer(45 * 1024);
RleEncoder<bool> encoder(&buffer, 1);

for (int i = 0; i < num_iters; i++) {
encoder.Put(false, 100 * 1024);
encoder.Put(true, 3);
encoder.Put(false, 3);
encoder.Put(true, 213 * 1024);
encoder.Put(false, 300);
encoder.Put(true, 8);
encoder.Put(false, 4);
}

LOG(INFO) << "Wrote " << encoder.len() << " bytes";

RleDecoder<bool> decoder(buffer.data(), encoder.len(), 1);
bool val = false;
size_t run_length;
for (int i = 0; i < num_iters; i++) {
decoder.GetNextRun(&val, &run_length);
decoder.GetNextRun(&val, &run_length);
decoder.GetNextRun(&val, &run_length);
decoder.GetNextRun(&val, &run_length);
decoder.GetNextRun(&val, &run_length);
decoder.GetNextRun(&val, &run_length);
decoder.GetNextRun(&val, &run_length);
}
}

} // namespace kudu

int main(int argc, char **argv) {
FLAGS_logtostderr = 1;
google::InitGoogleLogging(argv[0]);
google::ParseCommandLineFlags(&argc, &argv, true);

LOG_TIMING(INFO, "BooleanBitStream") {
kudu::BooleanBitStream();
}

LOG_TIMING(INFO, "BooleanRLE") {
kudu::BooleanRLE();
}

return 0;
}
4 changes: 2 additions & 2 deletions src/cfile/cfile.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class NullBitmapBuilder {
explicit NullBitmapBuilder(size_t initial_row_capacity)
: nitems_(0),
bitmap_(BitmapSize(initial_row_capacity)),
rle_encoder_(&bitmap_) {
rle_encoder_(&bitmap_, 1) {
}

size_t nitems() const {
Expand All @@ -78,7 +78,7 @@ class NullBitmapBuilder {
private:
size_t nitems_;
faststring bitmap_;
RleEncoder rle_encoder_;
RleEncoder<bool> rle_encoder_;
};

// Main class used to write a CFile.
Expand Down
4 changes: 2 additions & 2 deletions src/cfile/cfile_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ void CFileIterator::SeekToPositionInBlock(PreparedBlock *pb, rowid_t ord_idx) {
// to the index within the non-null entries.
uint32_t index_within_nonnulls;
if (reader_->is_nullable()) {
RleDecoder rle_decoder(pb->rle_bitmap.data(), pb->rle_bitmap.size());
RleDecoder<bool> rle_decoder(pb->rle_bitmap.data(), pb->rle_bitmap.size(), 1);
index_within_nonnulls = rle_decoder.Skip(ord_idx);
} else {
index_within_nonnulls = ord_idx;
Expand Down Expand Up @@ -607,7 +607,7 @@ Status CFileIterator::Scan(ColumnBlock *dst) {
if (reader_->is_nullable()) {
DCHECK(dst->is_nullable());

RleDecoder rle_decoder(pb->rle_bitmap.data(), pb->rle_bitmap.size());
RleDecoder<bool> rle_decoder(pb->rle_bitmap.data(), pb->rle_bitmap.size(), 1);

size_t index = pb->row_index_ - pb->first_row_idx_;
rle_decoder.Skip(index);
Expand Down
2 changes: 1 addition & 1 deletion src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,4 @@ ADD_KUDU_TEST(pb_util-test)
ADD_KUDU_TEST(threadpool-test)
ADD_KUDU_TEST(task_executor-test)
ADD_KUDU_TEST(safe_math-test)

ADD_KUDU_TEST(bit-util-test)
93 changes: 54 additions & 39 deletions src/util/bit-stream-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#define IMPALA_UTIL_BIT_STREAM_UTILS_H

#include "gutil/port.h"
#include "util/bit-util.h"
#include "util/faststring.h"

namespace kudu {
Expand All @@ -27,93 +28,107 @@ class BitWriter {
public:
// buffer: buffer to write bits to.
explicit BitWriter(faststring *buffer)
: buffer_(buffer), byte_offset_(0), bit_offset_(0) {
: buffer_(buffer) {
Clear();
}

void Clear() {
buffered_values_ = 0;
byte_offset_ = 0;
bit_offset_ = 0;
buffer_->clear();
}

// Returns a pointer to the underlying buffer
faststring *buffer() const { return buffer_; }
int bytes_written() const { return byte_offset_ + (bit_offset_ != 0); }

int Finish() {
if (bit_offset_ > 0) {
buffer_->data()[byte_offset_] &= ((1 << bit_offset_) - 1);
}
return bytes_written();
}
// The number of current bytes written, including the current byte (i.e. may include a
// fraction of a byte). Includes buffered values.
int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); }

// Writes a bool to the buffer.
void PutBool(bool b);
// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit
// packed. num_bits must be <= 32.
void PutValue(uint64_t v, int num_bits);

// Writes v to the next aligned byte.
// Writes v to the next aligned byte using num_bits. If T is larger than num_bits, the
// extra high-order bits will be ignored.
template<typename T>
void PutAligned(T v);
void PutAligned(T v, int num_bits);

// Write a Vlq encoded int to the buffer. The value is written byte aligned.
// For more details on vlq: en.wikipedia.org/wiki/Variable-length_quantity
void PutVlqInt(int32_t v);

// Get the index to the next aligned byte and advance the underlying buffer by num_bytes.
size_t GetByteIndexAndAdvance(int num_bytes = 1) {
uint8_t *ptr = GetNextBytePtr(num_bytes);
size_t GetByteIndexAndAdvance(int num_bytes) {
uint8_t* ptr = GetNextBytePtr(num_bytes);
return ptr - buffer_->data();
}

private:
// Get a pointer to the next aligned byte and advance the underlying buffer by num_bytes.
uint8_t *GetNextBytePtr(int num_bytes);
uint8_t* GetNextBytePtr(int num_bytes);

// Flushes all buffered values to the buffer. Call this when done writing to the buffer.
// If 'align' is true, buffered_values_ is reset and any future writes will be written
// to the next byte boundary.
void Flush(bool align = false);

private:
// Bit-packed values are initially written to this variable before being memcpy'd to
// buffer_. This is faster than writing values byte by byte directly to buffer_.
uint64_t buffered_values_;

faststring *buffer_;
int byte_offset_;
int bit_offset_; // Offset in current byte
int byte_offset_; // Offset in buffer_
int bit_offset_; // Offset in buffered_values_
};

// Utility class to read bit/byte stream. This class can read bits or bytes
// that are either byte aligned or not. It also has utilities to read multiple
// bytes in one read (e.g. encoded int).
class BitReader {
public:
// buffer: buffer to read from. the length is 'num_bytes'
BitReader(const uint8_t* buffer, int num_bytes) :
buffer_(buffer),
num_bytes_(num_bytes),
byte_offset_(0),
bit_offset_(0) {
}
// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
BitReader(const uint8_t* buffer, int buffer_len);

BitReader() : buffer_(NULL), num_bytes_(0) {}
BitReader() : buffer_(NULL), max_bytes_(0) {}

// Gets the next bool from the buffers.
// Returns true if 'v' could be read or false if there are not enough bytes left.
bool GetBool(bool* b);
// Gets the next value from the buffer. Returns true if 'v' could be read or false if
// there are not enough bytes left. num_bits must be <= 32.
template<typename T>
bool GetValue(int num_bits, T* v);

// Reads a T sized value from the buffer. T needs to be a native type and little
// endian. The value is assumed to be byte aligned so the stream will be advance
// to the start of the next byte before v is read.
// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a
// little-endian native type and big enough to store 'num_bytes'. The value is assumed
// to be byte-aligned so the stream will be advanced to the start of the next byte
// before 'v' is read. Returns false if there are not enough bytes left.
template<typename T>
bool GetAligned(T* v);
bool GetAligned(int num_bytes, T* v);

// Reads a vlq encoded int from the stream. The encoded int must start at the
// beginning of a byte. Return false if there were not enough bytes in the buffer.
bool GetVlqInt(int32_t* v);

// Returns the number of bytes left in the stream, including the current byte.
int bytes_left() { return num_bytes_ - byte_offset_; }
// Returns the number of bytes left in the stream, not including the current byte (i.e.,
// there may be an additional fraction of a byte).
int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); }

void RewindBool();
// Rewind the stream by 'num_bits' bits
void Rewind(int num_bits);

// Maximum byte length of a vlq encoded int
static const int MAX_VLQ_BYTE_LEN = 5;

private:
const uint8_t* buffer_;
int num_bytes_;
int byte_offset_;
int bit_offset_; // Offset in current byte
int max_bytes_;

// Bytes are memcpy'd from buffer_ and values are read from this variable. This is
// faster than reading values byte by byte directly from buffer_.
uint64_t buffered_values_;

int byte_offset_; // Offset in buffer_
int bit_offset_; // Offset in buffered_values_
};

}
Expand Down
Loading

0 comments on commit a268c03

Please sign in to comment.