Skip to content

Commit

Permalink
Add new integer types of all sizes and templates for constructing encoder/decoders
Browse files Browse the repository at this point in the history

- Added new integer types UINT8,INT8,UINT16,INT16,UINT64,INT64
- Added a factory method for Block Builders and Decoders
- Added a factory method for KeyEncoders
- Templatized IntTestSetup in table-test base to allow running all int types
- Refactored away the switch/case statements for encoding/block building

Change-Id: I91da37dbe6aafb7303c14c70a387ddb0fadd0c9
Change-Id: I91da37dbe6aafb7303c14c70a387ddb0fadd0c99
Reviewed-on: http://gerrit.ent.cloudera.com:8080/33
Reviewed-by: Todd Lipcon <[email protected]>
Tested-by: Todd Lipcon <[email protected]>
  • Loading branch information
David Alves authored and toddlipcon committed Jul 10, 2013
1 parent d53cb36 commit 2b11231
Show file tree
Hide file tree
Showing 25 changed files with 1,074 additions and 390 deletions.
3 changes: 2 additions & 1 deletion src/cfile/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ SET(CFILE_SRCS cfile.cc index_block.cc index_btree.cc cfile_reader.cc

block_cache.cc
bloomfile.cc
type_encodings.cc

compression_codec.cc
block_compression.cc
Expand All @@ -35,4 +36,4 @@ ADD_KUDU_TEST(compression-test)

# Tools
ADD_EXECUTABLE(cfile-dump cfile-dump.cc)
TARGET_LINK_LIBRARIES(cfile-dump ${KUDU_LINK_LIBS})
TARGET_LINK_LIBRARIES(cfile-dump ${KUDU_LINK_LIBS})
17 changes: 0 additions & 17 deletions src/cfile/block_encodings.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,6 @@ class ColumnDataView;

namespace cfile {

// Returns the default block encoding to use for the given data type.
//
// The mapping is fixed:
//   STRING -> PREFIX
//   INT32  -> PLAIN
//   UINT32 -> GROUP_VARINT
// Any other type trips the CHECK in the default branch.
//
// TODO: this probably won't stay around too long - in a real Flush
// situation, we can look at the content in the memstore and pick the
// most effective coding.
inline EncodingType GetDefaultEncoding(DataType type) {
  switch (type) {
    case STRING:
      return PREFIX;
    case INT32:
      return PLAIN;
    case UINT32:
      return GROUP_VARINT;
    default:
      CHECK(0) << "unknown type: " << type;
  }
  // Only reached if the CHECK above did not terminate the process. The
  // explicit return guarantees that every path through this non-void
  // function yields a value, instead of falling off the end.
  return PLAIN;
}


class BlockBuilder {
public:
Expand Down
89 changes: 21 additions & 68 deletions src/cfile/cfile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,16 @@
#include <utility>

#include "cfile/cfile.h"
#include "cfile/cfile.pb.h"
#include "cfile/block_pointer.h"
#include "cfile/gvint_block.h"
#include "cfile/string_prefix_block.h"
#include "cfile/string_plain_block.h"
#include "cfile/index_block.h"
#include "cfile/index_btree.h"
#include "cfile/plain_block.h"
#include "common/key_encoder.h"
#include "cfile/type_encodings.h"
#include "util/env.h"
#include "util/coding.h"
#include "util/logging.h"
#include "util/pb_util.h"
#include "util/hexdump.h"
#include "common/key_encoder.h"

using std::string;

Expand Down Expand Up @@ -72,16 +68,19 @@ Writer::Writer(const WriterOptions &options,
DataType type,
bool is_nullable,
EncodingType encoding,
shared_ptr<WritableFile> file)
: file_(file),
off_(0),
value_count_(0),
options_(options),
is_nullable_(is_nullable),
datatype_(type),
typeinfo_(GetTypeInfo(type)),
encoding_type_(encoding),
state_(kWriterInitialized) {
shared_ptr<WritableFile> file) :
file_(file),
off_(0),
value_count_(0),
options_(options),
is_nullable_(is_nullable),
datatype_(type),
encoding_type_(encoding),
typeinfo_(GetTypeInfo(type)),
type_encoding_info_(TypeEncodingInfo::Get(type, encoding)),
key_encoder_(GetKeyEncoder(type)),
state_(kWriterInitialized)
{
if (options.write_posidx) {
posidx_builder_.reset(new IndexTreeBuilder(&options_,
this));
Expand Down Expand Up @@ -123,7 +122,7 @@ Status Writer::Start() {
off_ += buf.size();

BlockBuilder *bb;
RETURN_NOT_OK(CreateBlockBuilder(&bb));
RETURN_NOT_OK(type_encoding_info_.CreateBlockBuilder(&bb, &options_) );
data_block_.reset(bb);

if (is_nullable_) {
Expand All @@ -136,52 +135,6 @@ Status Writer::Start() {
return Status::OK();
}

// Allocates the block builder that matches this writer's
// (datatype_, encoding_type_) combination.
//
// On success '*bb' points at a freshly new'd builder and OK is returned; the
// visible call site in Start() hands that pointer to data_block_.reset().
// For an unsupported combination '*bb' is left NULL and a NotFound status
// describing the problem is returned.
//
// TODO: refactor this into some kind of block factory
// module, with its equivalent in CFileReader
Status Writer::CreateBlockBuilder(BlockBuilder **bb) const {
  *bb = NULL;

  if (datatype_ == UINT32) {
    // Unsigned 32-bit ints: plain or group-varint.
    if (encoding_type_ == PLAIN) {
      *bb = new PlainBlockBuilder<UINT32>(&options_);
    } else if (encoding_type_ == GROUP_VARINT) {
      *bb = new GVIntBlockBuilder(&options_);
    } else {
      return Status::NotFound("bad uint encoding: " + EncodingType_Name(encoding_type_));
    }
  } else if (datatype_ == INT32) {
    // Signed 32-bit ints: plain only.
    if (encoding_type_ != PLAIN) {
      return Status::NotFound("bad int encoding: " + EncodingType_Name(encoding_type_));
    }
    *bb = new PlainBlockBuilder<INT32>(&options_);
  } else if (datatype_ == STRING) {
    // Strings: prefix-coded or plain.
    if (encoding_type_ == PREFIX) {
      *bb = new StringPrefixBlockBuilder(&options_);
    } else if (encoding_type_ == PLAIN) {
      *bb = new StringPlainBlockBuilder(&options_);
    } else {
      return Status::NotFound("bad string encoding: " + EncodingType_Name(encoding_type_));
    }
  } else {
    return Status::NotFound("bad datatype");
  }

  CHECK(*bb != NULL); // sanity check postcondition
  return Status::OK();
}

Status Writer::Finish() {
CHECK(state_ == kWriterWriting) <<
"Bad state for Finish(): " << state_;
Expand Down Expand Up @@ -365,16 +318,16 @@ Status Writer::AppendRawBlock(const vector<Slice> &data_slices,

BlockPointer ptr;
Status s = AddBlock(data_slices, &ptr, "data");
KeyEncoder encoder(&tmp_buf_);
if (!s.ok()) {
LOG(WARNING) << "Unable to append block to file: " << s.ToString();
return s;
}

// Now add to the index blocks
if (posidx_builder_ != NULL) {
Slice slice = encoder.ResetBufferAndEncodeToSlice(UINT32, ordinal_pos);
RETURN_NOT_OK(posidx_builder_->Append(slice, ptr));
tmp_buf_.clear();
KeyEncoderTraits<UINT32>::Encode(ordinal_pos, &tmp_buf_);
RETURN_NOT_OK(posidx_builder_->Append(Slice(tmp_buf_), ptr));
}

if (validx_builder_ != NULL) {
Expand All @@ -383,8 +336,8 @@ Status Writer::AppendRawBlock(const vector<Slice> &data_slices,
VLOG(1) << "Appending validx entry\n" <<
kudu::HexDump(Slice(reinterpret_cast<const uint8_t *>(validx_key),
typeinfo_.size()));
Slice slice = encoder.ResetBufferAndEncodeToSlice(datatype_, validx_key);
s = validx_builder_->Append(slice, ptr);
key_encoder_.ResetAndEncode(validx_key, &tmp_buf_);
s = validx_builder_->Append(Slice(tmp_buf_), ptr);
if (!s.ok()) {
LOG(WARNING) << "Unable to append to value index: " << s.ToString();
return s;
Expand Down
48 changes: 6 additions & 42 deletions src/cfile/cfile.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,14 @@
#include "cfile/block_encodings.h"
#include "cfile/block_compression.h"
#include "cfile/cfile.pb.h"
#include "cfile/cfile_util.h"
#include "cfile/type_encodings.h"
#include "common/key_encoder.h"
#include "common/types.h"
#include "gutil/gscoped_ptr.h"
#include "gutil/macros.h"
#include "util/rle-encoding.h"
#include "util/status.h"
#include "common/key_encoder.h"

namespace kudu {

Expand All @@ -45,44 +47,6 @@ extern const char kMagicString[];
const int kCFileMajorVersion = 1;
const int kCFileMinorVersion = 0;


// Options that control how a cfile Writer lays out its data and index
// blocks. A Writer takes these by const reference at construction, and a
// pointer to them is handed to the block builders and index tree builders
// it creates.
struct WriterOptions {
// Approximate size of user data packed per block. Note that the
// block size specified here corresponds to uncompressed data. The
// actual size of the unit read from disk may be smaller if
// compression is enabled. This parameter can be changed dynamically.
//
// Default: 256KB
size_t block_size;

// Approximate size of index blocks.
//
// Default: 32KB
size_t index_block_size;

// Number of keys between restart points for delta encoding of keys.
// This parameter can be changed dynamically. Most clients should
// leave this parameter alone.
//
// This is currently only used by StringPrefixBlockBuilder.
//
// Default: 16
int block_restart_interval;

// Whether the file needs a positional index. When set, the Writer
// constructor creates an IndexTreeBuilder for the positional index.
bool write_posidx;

// Whether the file needs a value index.
// NOTE(review): presumably handled like write_posidx, with a separate
// value-index builder - confirm against the Writer constructor.
bool write_validx;

// Block compression codec type.
//
// Default: specified by --cfile_default_compression_codec
CompressionType compression;

// Defined out of line.
// NOTE(review): presumably installs the defaults documented on the
// fields above - confirm against the definition.
WriterOptions();
};

class NullBitmapBuilder {
public:
explicit NullBitmapBuilder(size_t initial_row_capacity)
Expand Down Expand Up @@ -180,8 +144,6 @@ class Writer {
// field, clearing the buffer.
void FlushMetadataToPB(RepeatedPtrField<FileMetadataPairPB> *field);

Status CreateBlockBuilder(BlockBuilder **builder) const;

// File being written.
shared_ptr<WritableFile> file_;

Expand All @@ -196,8 +158,10 @@ class Writer {
// Type of data being written
bool is_nullable_;
DataType datatype_;
const TypeInfo &typeinfo_;
EncodingType encoding_type_;
const TypeInfo &typeinfo_;
const TypeEncodingInfo &type_encoding_info_;
const KeyEncoder& key_encoder_;

// a temporary buffer for encoding
faststring tmp_buf_;
Expand Down
62 changes: 7 additions & 55 deletions src/cfile/cfile_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@
#include "cfile/gvint_block.h"
#include "cfile/index_block.h"
#include "cfile/index_btree.h"
#include "cfile/string_plain_block.h"
#include "cfile/string_prefix_block.h"
#include "cfile/plain_block.h"
#include "gutil/gscoped_ptr.h"
#include "util/coding.h"
#include "util/env.h"
Expand Down Expand Up @@ -99,6 +96,9 @@ Status CFileReader::Init() {
RETURN_NOT_OK(ReadAndParseFooter());

type_info_ = &GetTypeInfo(footer_->data_type());
type_encoding_info_ = &TypeEncodingInfo::Get(footer_->data_type(),
footer_->encoding());
key_encoder_ = &GetKeyEncoder(footer_->data_type());
VLOG(1) << "Initialized CFile reader. "
<< "Header: " << header_->DebugString()
<< " Footer: " << footer_->DebugString()
Expand Down Expand Up @@ -255,54 +255,6 @@ bool CFileReader::GetMetadataEntry(const string &key, string *val) {
return false;
}


// Allocates the block decoder matching the data type and encoding recorded
// in this file's footer, positioned over 'slice'.
//
// On success '*bd' points at a freshly new'd decoder and OK is returned; the
// visible call site in CFileIterator::ReadCurrentDataBlock hands that pointer
// to dblk_.reset(). For an unsupported type/encoding combination '*bd' is
// left NULL and a NotFound status is returned.
//
// TODO: perhaps decoders should be able to be Reset
// to point to a different slice? any benefit to that?
Status CFileReader::CreateBlockDecoder(
    BlockDecoder **bd, const Slice &slice) const {
  *bd = NULL;

  switch (footer_->data_type()) {
    case UINT32:
      // Unsigned 32-bit ints: plain or group-varint.
      if (footer_->encoding() == PLAIN) {
        *bd = new PlainBlockDecoder<UINT32>(slice);
      } else if (footer_->encoding() == GROUP_VARINT) {
        *bd = new GVIntBlockDecoder(slice);
      } else {
        return Status::NotFound("bad uint encoding");
      }
      break;

    case INT32:
      // Signed 32-bit ints: plain only.
      if (footer_->encoding() != PLAIN) {
        return Status::NotFound("bad int encoding");
      }
      *bd = new PlainBlockDecoder<INT32>(slice);
      break;

    case STRING:
      // Strings: prefix-coded or plain.
      if (footer_->encoding() == PREFIX) {
        *bd = new StringPrefixBlockDecoder(slice);
      } else if (footer_->encoding() == PLAIN) {
        *bd = new StringPlainBlockDecoder(slice);
      } else {
        return Status::NotFound("bad string encoding");
      }
      break;

    default:
      return Status::NotFound("bad datatype");
  }

  CHECK(*bd != NULL); // sanity check postcondition
  return Status::OK();
}

Status CFileReader::NewIterator(CFileIterator **iter) const {
gscoped_ptr<BlockPointer> posidx_root;
if (footer_->has_posidx_info()) {
Expand Down Expand Up @@ -357,9 +309,9 @@ Status CFileIterator::SeekToOrdinal(rowid_t ord_idx) {
return Status::NotSupported("no positional index in file");
}

KeyEncoder encoder(&tmp_buf_);
Slice slice = encoder.ResetBufferAndEncodeToSlice(UINT32, ord_idx);
RETURN_NOT_OK(posidx_iter_->SeekAtOrBefore(slice));
tmp_buf_.clear();
KeyEncoderTraits<UINT32>::Encode(ord_idx, &tmp_buf_);
RETURN_NOT_OK(posidx_iter_->SeekAtOrBefore(Slice(tmp_buf_)));

// TODO: fast seek within block (without reseeking index)
pblock_pool_scoped_ptr b = prepared_block_pool_.make_scoped_ptr(
Expand Down Expand Up @@ -498,7 +450,7 @@ Status CFileIterator::ReadCurrentDataBlock(const IndexTreeIterator &idx_iter,
}

BlockDecoder *bd;
RETURN_NOT_OK(reader_->CreateBlockDecoder(&bd, data_block));
RETURN_NOT_OK(reader_->type_encoding_info()->CreateBlockDecoder(&bd, data_block));
prep_block->dblk_.reset(bd);
RETURN_NOT_OK(prep_block->dblk_->ParseHeader());

Expand Down
Loading

0 comments on commit 2b11231

Please sign in to comment.