Add new integer types of all sizes and templates for constructing encoder/decoders

- Added new integer types UINT8, INT8, UINT16, INT16, UINT64, INT64
- Added a factory method for Block Builders and Decoders
- Added a factory method for KeyEncoders
- Templatized IntTestSetup in table-test base to allow running all int types
- Refactored away switch/case statements for encoding/block building

Change-Id: I91da37dbe6aafb7303c14c70a387ddb0fadd0c9
Change-Id: I91da37dbe6aafb7303c14c70a387ddb0fadd0c99
Reviewed-on: http://gerrit.ent.cloudera.com:8080/33
Reviewed-by: Todd Lipcon <[email protected]>
Tested-by: Todd Lipcon <[email protected]>
jhx1008 · Jul 10, 2013 · 2b11231 · 2b11231
1 parent d53cb36
commit 2b11231
Show file tree

Hide file tree

Showing 25 changed files with 1,074 additions and 390 deletions.
diff --git a/src/cfile/CMakeLists.txt b/src/cfile/CMakeLists.txt
@@ -12,6 +12,7 @@ SET(CFILE_SRCS cfile.cc index_block.cc index_btree.cc cfile_reader.cc
 
   block_cache.cc
   bloomfile.cc
+  type_encodings.cc
 
   compression_codec.cc
   block_compression.cc
@@ -35,4 +36,4 @@ ADD_KUDU_TEST(compression-test)
 
 # Tools
 ADD_EXECUTABLE(cfile-dump cfile-dump.cc)
-TARGET_LINK_LIBRARIES(cfile-dump ${KUDU_LINK_LIBS})
+TARGET_LINK_LIBRARIES(cfile-dump ${KUDU_LINK_LIBS})
diff --git a/src/cfile/block_encodings.h b/src/cfile/block_encodings.h
@@ -20,23 +20,6 @@ class ColumnDataView;
 
 namespace cfile {
 
-// Return the default encoding to use for the given data type.
-// TODO: this probably won't stay around too long - in a real Flush
-// situation, we can look at the content in the memstore and pick the
-// most effective coding.
-inline EncodingType GetDefaultEncoding(DataType type) {
-  switch (type) {
-    case STRING:
-      return PREFIX;
-    case INT32:
-      return PLAIN;
-    case UINT32:
-      return GROUP_VARINT;
-    default:
-      CHECK(0) << "unknown type: " << type;
-  }
-}
-
 
 class BlockBuilder {
  public:

diff --git a/src/cfile/cfile.cc b/src/cfile/cfile.cc
@@ -7,20 +7,16 @@
 #include <utility>
 
 #include "cfile/cfile.h"
-#include "cfile/cfile.pb.h"
 #include "cfile/block_pointer.h"
-#include "cfile/gvint_block.h"
-#include "cfile/string_prefix_block.h"
-#include "cfile/string_plain_block.h"
 #include "cfile/index_block.h"
 #include "cfile/index_btree.h"
-#include "cfile/plain_block.h"
+#include "common/key_encoder.h"
+#include "cfile/type_encodings.h"
 #include "util/env.h"
 #include "util/coding.h"
 #include "util/logging.h"
 #include "util/pb_util.h"
 #include "util/hexdump.h"
-#include "common/key_encoder.h"
 
 using std::string;
 
@@ -72,16 +68,19 @@ Writer::Writer(const WriterOptions &options,
                DataType type,
                bool is_nullable,
                EncodingType encoding,
-               shared_ptr<WritableFile> file)
-  : file_(file),
-    off_(0),
-    value_count_(0),
-    options_(options),
-    is_nullable_(is_nullable),
-    datatype_(type),
-    typeinfo_(GetTypeInfo(type)),
-    encoding_type_(encoding),
-    state_(kWriterInitialized) {
+               shared_ptr<WritableFile> file) :
+  file_(file),
+  off_(0),
+  value_count_(0),
+  options_(options),
+  is_nullable_(is_nullable),
+  datatype_(type),
+  encoding_type_(encoding),
+  typeinfo_(GetTypeInfo(type)),
+  type_encoding_info_(TypeEncodingInfo::Get(type, encoding)),
+  key_encoder_(GetKeyEncoder(type)),
+  state_(kWriterInitialized)
+{
   if (options.write_posidx) {
     posidx_builder_.reset(new IndexTreeBuilder(&options_,
                                                this));
@@ -123,7 +122,7 @@ Status Writer::Start() {
   off_ += buf.size();
 
   BlockBuilder *bb;
-  RETURN_NOT_OK(CreateBlockBuilder(&bb));
+  RETURN_NOT_OK(type_encoding_info_.CreateBlockBuilder(&bb, &options_) );
   data_block_.reset(bb);
 
   if (is_nullable_) {
@@ -136,52 +135,6 @@ Status Writer::Start() {
   return Status::OK();
 }
 
-// TODO: refactor this into some kind of block factory
-// module, with its equivalent in CFileReader
-Status Writer::CreateBlockBuilder(BlockBuilder **bb) const {
-  *bb = NULL;
-  switch (datatype_) {
-    case UINT32:
-      switch (encoding_type_) {
-        case PLAIN:
-          *bb = new PlainBlockBuilder<UINT32>(&options_);
-          break;
-        case GROUP_VARINT:
-          *bb = new GVIntBlockBuilder(&options_);
-          break;
-        default:
-          return Status::NotFound("bad uint encoding: " + EncodingType_Name(encoding_type_));
-      }
-      break;
-    case INT32:
-      switch (encoding_type_) {
-        case PLAIN:
-          *bb = new PlainBlockBuilder<INT32>(&options_);
-          break;
-        default:
-          return Status::NotFound("bad int encoding: " + EncodingType_Name(encoding_type_));
-      }
-      break;
-    case STRING:
-      switch (encoding_type_) {
-        case PREFIX:
-          *bb = new StringPrefixBlockBuilder(&options_);
-          break;
-        case PLAIN:
-          *bb = new StringPlainBlockBuilder(&options_);
-          break;
-        default:
-          return Status::NotFound("bad string encoding: " + EncodingType_Name(encoding_type_));
-      }
-      break;
-    default:
-      return Status::NotFound("bad datatype");
-  }
-
-  CHECK(*bb != NULL); // sanity check postcondition
-  return Status::OK();
-}
-
 Status Writer::Finish() {
   CHECK(state_ == kWriterWriting) <<
     "Bad state for Finish(): " << state_;
@@ -365,16 +318,16 @@ Status Writer::AppendRawBlock(const vector<Slice> &data_slices,
 
   BlockPointer ptr;
   Status s = AddBlock(data_slices, &ptr, "data");
-  KeyEncoder encoder(&tmp_buf_);
   if (!s.ok()) {
     LOG(WARNING) << "Unable to append block to file: " << s.ToString();
     return s;
   }
 
   // Now add to the index blocks
   if (posidx_builder_ != NULL) {
-    Slice slice = encoder.ResetBufferAndEncodeToSlice(UINT32, ordinal_pos);
-    RETURN_NOT_OK(posidx_builder_->Append(slice, ptr));
+    tmp_buf_.clear();
+    KeyEncoderTraits<UINT32>::Encode(ordinal_pos, &tmp_buf_);
+    RETURN_NOT_OK(posidx_builder_->Append(Slice(tmp_buf_), ptr));
   }
 
   if (validx_builder_ != NULL) {
@@ -383,8 +336,8 @@ Status Writer::AppendRawBlock(const vector<Slice> &data_slices,
     VLOG(1) << "Appending validx entry\n" <<
       kudu::HexDump(Slice(reinterpret_cast<const uint8_t *>(validx_key),
                           typeinfo_.size()));
-    Slice slice = encoder.ResetBufferAndEncodeToSlice(datatype_, validx_key);
-    s = validx_builder_->Append(slice, ptr);
+    key_encoder_.ResetAndEncode(validx_key, &tmp_buf_);
+    s = validx_builder_->Append(Slice(tmp_buf_), ptr);
     if (!s.ok()) {
       LOG(WARNING) << "Unable to append to value index: " << s.ToString();
       return s;

diff --git a/src/cfile/cfile.h b/src/cfile/cfile.h
@@ -15,12 +15,14 @@
 #include "cfile/block_encodings.h"
 #include "cfile/block_compression.h"
 #include "cfile/cfile.pb.h"
+#include "cfile/cfile_util.h"
+#include "cfile/type_encodings.h"
+#include "common/key_encoder.h"
 #include "common/types.h"
 #include "gutil/gscoped_ptr.h"
 #include "gutil/macros.h"
 #include "util/rle-encoding.h"
 #include "util/status.h"
-#include "common/key_encoder.h"
 
 namespace kudu {
 
@@ -45,44 +47,6 @@ extern const char kMagicString[];
 const int kCFileMajorVersion = 1;
 const int kCFileMinorVersion = 0;
 
-
-struct WriterOptions {
-  // Approximate size of user data packed per block.  Note that the
-  // block size specified here corresponds to uncompressed data.  The
-  // actual size of the unit read from disk may be smaller if
-  // compression is enabled.  This parameter can be changed dynamically.
-  //
-  // Default: 256K
-  size_t block_size;
-
-  // Approximate size of index blocks.
-  //
-  // Default: 32KB.
-  size_t index_block_size;
-
-  // Number of keys between restart points for delta encoding of keys.
-  // This parameter can be changed dynamically.  Most clients should
-  // leave this parameter alone.
-  //
-  // This is currently only used by StringPrefixBlockBuilder
-  //
-  // Default: 16
-  int block_restart_interval;
-
-  // Whether the file needs a positional index.
-  bool write_posidx;
-
-  // Whether the file needs a value index
-  bool write_validx;
-
-  // Block compression codec type
-  //
-  // Default: specified by --cfile_default_compression_codec
-  CompressionType compression;
-
-  WriterOptions();
-};
-
 class NullBitmapBuilder {
  public:
   explicit NullBitmapBuilder(size_t initial_row_capacity)
@@ -180,8 +144,6 @@ class Writer {
   // field, clearing the buffer.
   void FlushMetadataToPB(RepeatedPtrField<FileMetadataPairPB> *field);
 
-  Status CreateBlockBuilder(BlockBuilder **builder) const;
-
   // File being written.
   shared_ptr<WritableFile> file_;
 
@@ -196,8 +158,10 @@ class Writer {
   // Type of data being written
   bool is_nullable_;
   DataType datatype_;
-  const TypeInfo &typeinfo_;
   EncodingType encoding_type_;
+  const TypeInfo &typeinfo_;
+  const TypeEncodingInfo &type_encoding_info_;
+  const KeyEncoder& key_encoder_;
 
   // a temporary buffer for encoding
   faststring tmp_buf_;

diff --git a/src/cfile/cfile_reader.cc b/src/cfile/cfile_reader.cc
@@ -14,9 +14,6 @@
 #include "cfile/gvint_block.h"
 #include "cfile/index_block.h"
 #include "cfile/index_btree.h"
-#include "cfile/string_plain_block.h"
-#include "cfile/string_prefix_block.h"
-#include "cfile/plain_block.h"
 #include "gutil/gscoped_ptr.h"
 #include "util/coding.h"
 #include "util/env.h"
@@ -99,6 +96,9 @@ Status CFileReader::Init() {
   RETURN_NOT_OK(ReadAndParseFooter());
 
   type_info_ = &GetTypeInfo(footer_->data_type());
+  type_encoding_info_ = &TypeEncodingInfo::Get(footer_->data_type(),
+                                               footer_->encoding());
+  key_encoder_ = &GetKeyEncoder(footer_->data_type());
   VLOG(1) << "Initialized CFile reader. "
           << "Header: " << header_->DebugString()
           << " Footer: " << footer_->DebugString()
@@ -255,54 +255,6 @@ bool CFileReader::GetMetadataEntry(const string &key, string *val) {
   return false;
 }
 
-
-// TODO: perhaps decoders should be able to be Reset
-// to point to a different slice? any benefit to that?
-Status CFileReader::CreateBlockDecoder(
-  BlockDecoder **bd, const Slice &slice) const {
-  *bd = NULL;
-  switch (footer_->data_type()) {
-    case UINT32:
-      switch (footer_->encoding()) {
-        case PLAIN:
-          *bd = new PlainBlockDecoder<UINT32>(slice);
-          break;
-        case GROUP_VARINT:
-          *bd = new GVIntBlockDecoder(slice);
-          break;
-        default:
-          return Status::NotFound("bad uint encoding");
-      }
-      break;
-    case INT32:
-      switch (footer_->encoding()) {
-        case PLAIN:
-          *bd = new PlainBlockDecoder<INT32>(slice);
-          break;
-        default:
-          return Status::NotFound("bad int encoding");
-      }
-      break;
-    case STRING:
-      switch (footer_->encoding()) {
-        case PREFIX:
-          *bd = new StringPrefixBlockDecoder(slice);
-          break;
-        case PLAIN:
-          *bd = new StringPlainBlockDecoder(slice);
-          break;
-        default:
-          return Status::NotFound("bad string encoding");
-      }
-      break;
-    default:
-      return Status::NotFound("bad datatype");
-  }
-
-  CHECK(*bd != NULL); // sanity check postcondition
-  return Status::OK();
-}
-
 Status CFileReader::NewIterator(CFileIterator **iter) const {
   gscoped_ptr<BlockPointer> posidx_root;
   if (footer_->has_posidx_info()) {
@@ -357,9 +309,9 @@ Status CFileIterator::SeekToOrdinal(rowid_t ord_idx) {
     return Status::NotSupported("no positional index in file");
   }
 
-  KeyEncoder encoder(&tmp_buf_);
-  Slice slice = encoder.ResetBufferAndEncodeToSlice(UINT32, ord_idx);
-  RETURN_NOT_OK(posidx_iter_->SeekAtOrBefore(slice));
+  tmp_buf_.clear();
+  KeyEncoderTraits<UINT32>::Encode(ord_idx, &tmp_buf_);
+  RETURN_NOT_OK(posidx_iter_->SeekAtOrBefore(Slice(tmp_buf_)));
 
   // TODO: fast seek within block (without reseeking index)
   pblock_pool_scoped_ptr b = prepared_block_pool_.make_scoped_ptr(
@@ -498,7 +450,7 @@ Status CFileIterator::ReadCurrentDataBlock(const IndexTreeIterator &idx_iter,
   }
 
   BlockDecoder *bd;
-  RETURN_NOT_OK(reader_->CreateBlockDecoder(&bd, data_block));
+  RETURN_NOT_OK(reader_->type_encoding_info()->CreateBlockDecoder(&bd, data_block));
   prep_block->dblk_.reset(bd);
   RETURN_NOT_OK(prep_block->dblk_->ParseHeader());