Skip to content

Commit

Permalink
WriteUnPrepared: Add support for recovering WriteUnprepared transacti…
Browse files Browse the repository at this point in the history
…ons (facebook#4078)

Summary:
This adds support for recovering WriteUnprepared transactions through the following changes:
- The information in `RecoveredTransaction` is extended so that it can reference multiple batches.
- `MarkBeginPrepare` is extended with a bool indicating whether it is an unprepared begin, and this is passed down to `InsertRecoveredTransaction` to indicate whether the current transaction is prepared or not.
- `WriteUnpreparedTxnDB::Initialize` is overridden so that it will roll back unprepared transactions from the recovered transactions. This can be done without updating the prepare heap/commit map, because this is before the DB has finished initializing, and after writing the rollback batch, those data structures should not contain information about the rolled back transaction anyway.

Commit/Rollback of live transactions is still unimplemented and will come later.
Pull Request resolved: facebook#4078

Differential Revision: D8703382

Pulled By: lth

fbshipit-source-id: 7e0aada6c23bd39299f1f20d6c060492e0e6b60a
  • Loading branch information
lth authored and facebook-github-bot committed Jul 7, 2018
1 parent db7ae0a commit b984637
Show file tree
Hide file tree
Showing 21 changed files with 524 additions and 53 deletions.
80 changes: 60 additions & 20 deletions db/db_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -561,25 +561,50 @@ class DBImpl : public DB {
// these will then be passed to TransactionDB so that
// locks can be reacquired before writing can resume.
struct RecoveredTransaction {
uint64_t log_number_;
std::string name_;
WriteBatch* batch_;
// The seq number of the first key in the batch
SequenceNumber seq_;
// Number of sub-batched. A new sub-batch is created if we txn attempts to
// inserts a duplicate key,seq to memtable. This is currently used in
// WritePrparedTxn
size_t batch_cnt_;
bool unprepared_;

// Bookkeeping for a single WriteBatch recovered for this transaction.
struct BatchInfo {
// Log number supplied for this batch when it was inserted during recovery.
uint64_t log_number_;
// TODO(lth): The memory usage here can be big for unprepared
// transactions. This is only useful for rollbacks, and we can in theory
// just keep the keyset for that.
//
// Deleted by ~RecoveredTransaction.
WriteBatch* batch_;
// Number of sub-batches. A new sub-batch is created if the txn attempts to
// insert a duplicate key,seq to the memtable. This is currently used in
// WritePreparedTxn/WriteUnpreparedTxn.
size_t batch_cnt_;
};

// This maps the seq of the first key in the batch to BatchInfo, which
// contains WriteBatch and other information relevant to the batch.
//
// For WriteUnprepared, batches_ can have size greater than 1, but for
// other write policies, it must be of size 1.
std::map<SequenceNumber, BatchInfo> batches_;

explicit RecoveredTransaction(const uint64_t log, const std::string& name,
WriteBatch* batch, SequenceNumber seq,
size_t batch_cnt)
: log_number_(log),
name_(name),
batch_(batch),
seq_(seq),
batch_cnt_(batch_cnt) {}

~RecoveredTransaction() { delete batch_; }
size_t batch_cnt, bool unprepared)
: name_(name), unprepared_(unprepared) {
batches_[seq] = {log, batch, batch_cnt};
}

~RecoveredTransaction() {
for (auto& it : batches_) {
delete it.second.batch_;
}
}

// Records one more recovered batch for this transaction, keyed by the
// sequence number of the first key in the batch, and updates the
// transaction's unprepared state to that of the newly added batch.
void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
              size_t batch_cnt, bool unprepared) {
  // Every batch of a transaction must start at a distinct sequence number.
  assert(batches_.find(seq) == batches_.end());
  auto& slot = batches_[seq];
  slot.log_number_ = log_number;
  slot.batch_ = batch;
  slot.batch_cnt_ = batch_cnt;
  // The prepare batch is always the last batch of a transaction, so the
  // transaction must still have been unprepared before this call.
  assert(unprepared_);
  unprepared_ = unprepared;
}
};

bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
Expand All @@ -600,9 +625,19 @@ class DBImpl : public DB {

void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
WriteBatch* batch, SequenceNumber seq,
size_t batch_cnt) {
recovered_transactions_[name] =
new RecoveredTransaction(log, name, batch, seq, batch_cnt);
size_t batch_cnt, bool unprepared_batch) {
// For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple
// times, once for every unprepared batch encountered during recovery.
//
// If the transaction is prepared, then the last call to
// InsertRecoveredTransaction will have unprepared_batch = false.
auto rtxn = recovered_transactions_.find(name);
if (rtxn == recovered_transactions_.end()) {
recovered_transactions_[name] = new RecoveredTransaction(
log, name, batch, seq, batch_cnt, unprepared_batch);
} else {
rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
}
logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
}

Expand All @@ -611,7 +646,10 @@ class DBImpl : public DB {
assert(it != recovered_transactions_.end());
auto* trx = it->second;
recovered_transactions_.erase(it);
logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(trx->log_number_);
for (const auto& info : trx->batches_) {
logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(
info.second.log_number_);
}
delete trx;
}

Expand Down Expand Up @@ -751,6 +789,7 @@ class DBImpl : public DB {
friend class WritePreparedTxn;
friend class WritePreparedTxnDB;
friend class WriteBatchWithIndex;
friend class WriteUnpreparedTxnDB;
#ifndef ROCKSDB_LITE
friend class ForwardIterator;
#endif
Expand All @@ -762,6 +801,7 @@ class DBImpl : public DB {
friend class WriteCallbackTest_WriteWithCallbackTest_Test;
friend class XFTransactionWriteHandler;
friend class DBBlobIndexTest;
friend class WriteUnpreparedTransactionTest_RecoveryRollbackUnprepared_Test;
#endif
struct CompactionState;

Expand Down
4 changes: 2 additions & 2 deletions db/dbformat.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ enum ValueType : unsigned char {
// Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL
// generated by WriteUnprepared write policy is not mistakenly read by
// another.
kTypeBeginUnprepareXID = 0x13, // WAL only.
kMaxValue = 0x7F // Not used for storing records.
kTypeBeginUnprepareXID = 0x13, // WAL only.
kMaxValue = 0x7F // Not used for storing records.
};

// Defined in dbformat.cc
Expand Down
2 changes: 1 addition & 1 deletion db/transaction_log_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
const Slice& /*val*/) override {
return Status::OK();
}
Status MarkBeginPrepare() override { return Status::OK(); }
Status MarkBeginPrepare(bool) override { return Status::OK(); }
Status MarkRollback(const Slice&) override { return Status::OK(); }
};

Expand Down
31 changes: 24 additions & 7 deletions db/write_batch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ enum ContentFlags : uint32_t {
HAS_ROLLBACK = 1 << 8,
HAS_DELETE_RANGE = 1 << 9,
HAS_BLOB_INDEX = 1 << 10,
HAS_BEGIN_UNPREPARE = 1 << 11,
};

struct BatchContentClassifier : public WriteBatch::Handler {
Expand Down Expand Up @@ -108,8 +109,11 @@ struct BatchContentClassifier : public WriteBatch::Handler {
return Status::OK();
}

Status MarkBeginPrepare() override {
Status MarkBeginPrepare(bool unprepare) override {
content_flags |= ContentFlags::HAS_BEGIN_PREPARE;
if (unprepare) {
content_flags |= ContentFlags::HAS_BEGIN_UNPREPARE;
}
return Status::OK();
}

Expand Down Expand Up @@ -532,8 +536,8 @@ Status WriteBatch::Iterate(Handler* handler) const {
break;
case kTypeBeginUnprepareXID:
assert(content_flags_.load(std::memory_order_relaxed) &
(ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
handler->MarkBeginPrepare();
(ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE));
handler->MarkBeginPrepare(true /* unprepared */);
empty_batch = false;
if (handler->WriteAfterCommit()) {
s = Status::NotSupported(
Expand Down Expand Up @@ -1052,6 +1056,8 @@ class MemTableInserter : public WriteBatch::Handler {
bool write_after_commit_;
// Whether memtable write can be done before prepare
bool write_before_prepare_;
// Whether this batch was unprepared or not
bool unprepared_batch_;
using DupDetector = std::aligned_storage<sizeof(DuplicateDetector)>::type;
DupDetector duplicate_detector_;
bool dup_dectector_on_;
Expand Down Expand Up @@ -1111,6 +1117,7 @@ class MemTableInserter : public WriteBatch::Handler {
// WriteUnprepared can write multiple WriteBatches per transaction, so
// batch_per_txn being false indicates write_before_prepare.
write_before_prepare_(!batch_per_txn),
unprepared_batch_(false),
duplicate_detector_(),
dup_dectector_on_(false) {
assert(cf_mems_);
Expand Down Expand Up @@ -1586,7 +1593,9 @@ class MemTableInserter : public WriteBatch::Handler {
}
}

Status MarkBeginPrepare() override {
// The write batch handler calls MarkBeginPrepare with unprepare set to true
// if it encounters the kTypeBeginUnprepareXID marker.
Status MarkBeginPrepare(bool unprepare) override {
assert(rebuilding_trx_ == nullptr);
assert(db_);

Expand All @@ -1602,6 +1611,11 @@ class MemTableInserter : public WriteBatch::Handler {
// we are now iterating through a prepared section
rebuilding_trx_ = new WriteBatch();
rebuilding_trx_seq_ = sequence_;
// We only call MarkBeginPrepare once per batch, and unprepared_batch_
// is initialized to false by default.
assert(!unprepared_batch_);
unprepared_batch_ = unprepare;

if (has_valid_writes_ != nullptr) {
*has_valid_writes_ = true;
}
Expand All @@ -1622,7 +1636,7 @@ class MemTableInserter : public WriteBatch::Handler {
: static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1);
db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(),
rebuilding_trx_, rebuilding_trx_seq_,
batch_cnt);
batch_cnt, unprepared_batch_);
rebuilding_trx_ = nullptr;
} else {
assert(rebuilding_trx_ == nullptr);
Expand Down Expand Up @@ -1665,9 +1679,12 @@ class MemTableInserter : public WriteBatch::Handler {
// duplicate re-insertion of values.
assert(log_number_ref_ == 0);
if (write_after_commit_) {
// write_after_commit_ can only have one batch in trx.
assert(trx->batches_.size() == 1);
const auto& batch_info = trx->batches_.begin()->second;
// all inserts must reference this trx log number
log_number_ref_ = trx->log_number_;
s = trx->batch_->Iterate(this);
log_number_ref_ = batch_info.log_number_;
s = batch_info.batch_->Iterate(this);
log_number_ref_ = 0;
}
// else the values are already inserted before the commit
Expand Down
7 changes: 4 additions & 3 deletions db/write_batch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,9 @@ namespace {
virtual void LogData(const Slice& blob) override {
seen += "LogData(" + blob.ToString() + ")";
}
virtual Status MarkBeginPrepare() override {
seen += "MarkBeginPrepare()";
virtual Status MarkBeginPrepare(bool unprepare) override {
seen +=
"MarkBeginPrepare(" + std::string(unprepare ? "true" : "false") + ")";
return Status::OK();
}
virtual Status MarkEndPrepare(const Slice& xid) override {
Expand Down Expand Up @@ -403,7 +404,7 @@ TEST_F(WriteBatchTest, PrepareCommit) {
TestHandler handler;
batch.Iterate(&handler);
ASSERT_EQ(
"MarkBeginPrepare()"
"MarkBeginPrepare(false)"
"Put(k1, v1)"
"Put(k2, v2)"
"MarkEndPrepare(xid1)"
Expand Down
1 change: 1 addition & 0 deletions include/rocksdb/utilities/transaction.h
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,7 @@ class Transaction {

private:
friend class PessimisticTransactionDB;
friend class WriteUnpreparedTxnDB;
// No copying allowed
Transaction(const Transaction&);
void operator=(const Transaction&);
Expand Down
2 changes: 1 addition & 1 deletion include/rocksdb/write_batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ class WriteBatch : public WriteBatchBase {
// The default implementation of LogData does nothing.
virtual void LogData(const Slice& blob);

virtual Status MarkBeginPrepare() {
virtual Status MarkBeginPrepare(bool = false) {
return Status::InvalidArgument("MarkBeginPrepare() handler not defined.");
}

Expand Down
3 changes: 2 additions & 1 deletion java/rocksjni/writebatchhandlerjnicallback.cc
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,8 @@ rocksdb::Status WriteBatchHandlerJniCallback::PutBlobIndexCF(uint32_t column_fam
}
}

rocksdb::Status WriteBatchHandlerJniCallback::MarkBeginPrepare() {
rocksdb::Status WriteBatchHandlerJniCallback::MarkBeginPrepare(bool unprepare) {
assert(!unprepare);
m_env->CallVoidMethod(m_jcallback_obj, m_jMarkBeginPrepareMethodId);

// check for Exception, in-particular RocksDBException
Expand Down
4 changes: 2 additions & 2 deletions java/rocksjni/writebatchhandlerjnicallback.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ class WriteBatchHandlerJniCallback : public JniCallback, public WriteBatch::Hand
void DeleteRange(const Slice& beginKey, const Slice& endKey);
void LogData(const Slice& blob);
Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
const Slice& value);
Status MarkBeginPrepare();
const Slice& value);
Status MarkBeginPrepare(bool);
Status MarkEndPrepare(const Slice& xid);
Status MarkNoop(bool empty_batch);
Status MarkRollback(const Slice& xid);
Expand Down
5 changes: 3 additions & 2 deletions tools/ldb_cmd.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1917,8 +1917,9 @@ class InMemoryHandler : public WriteBatch::Handler {
return Status::OK();
}

virtual Status MarkBeginPrepare() override {
row_ << "BEGIN_PREARE ";
virtual Status MarkBeginPrepare(bool unprepare) override {
row_ << "BEGIN_PREPARE(";
row_ << (unprepare ? "true" : "false") << ") ";
return Status::OK();
}

Expand Down
18 changes: 10 additions & 8 deletions utilities/transactions/pessimistic_transaction_db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,10 @@ Status PessimisticTransactionDB::Initialize(
for (auto it = rtrxs.begin(); it != rtrxs.end(); it++) {
auto recovered_trx = it->second;
assert(recovered_trx);
assert(recovered_trx->log_number_);
assert(recovered_trx->batches_.size() == 1);
const auto& seq = recovered_trx->batches_.begin()->first;
const auto& batch_info = recovered_trx->batches_.begin()->second;
assert(batch_info.log_number_);
assert(recovered_trx->name_.length());

WriteOptions w_options;
Expand All @@ -133,21 +136,20 @@ Status PessimisticTransactionDB::Initialize(

Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
assert(real_trx);
real_trx->SetLogNumber(recovered_trx->log_number_);
assert(recovered_trx->seq_ != kMaxSequenceNumber);
real_trx->SetId(recovered_trx->seq_);
real_trx->SetLogNumber(batch_info.log_number_);
assert(seq != kMaxSequenceNumber);
real_trx->SetId(seq);

s = real_trx->SetName(recovered_trx->name_);
if (!s.ok()) {
break;
}

s = real_trx->RebuildFromWriteBatch(recovered_trx->batch_);
s = real_trx->RebuildFromWriteBatch(batch_info.batch_);
// WriteCommitted sets this to 0 to disable this check, which is specific
// to WritePrepared txns
assert(recovered_trx->batch_cnt_ == 0 ||
real_trx->GetWriteBatch()->SubBatchCnt() ==
recovered_trx->batch_cnt_);
assert(batch_info.batch_cnt_ == 0 ||
real_trx->GetWriteBatch()->SubBatchCnt() == batch_info.batch_cnt_);
real_trx->SetState(Transaction::PREPARED);
if (!s.ok()) {
break;
Expand Down
1 change: 1 addition & 0 deletions utilities/transactions/pessimistic_transaction_db.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ class PessimisticTransactionDB : public TransactionDB {
friend class TransactionTest_TwoPhaseLongPrepareTest_Test;
friend class TransactionTest_TwoPhaseDoubleRecoveryTest_Test;
friend class TransactionTest_TwoPhaseOutOfOrderDelete_Test;
friend class WriteUnpreparedTransactionTest_RecoveryRollbackUnprepared_Test;
TransactionLockMgr lock_mgr_;

// Must be held when adding/dropping column families.
Expand Down
2 changes: 1 addition & 1 deletion utilities/transactions/transaction_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ Status TransactionBaseImpl::RebuildFromWriteBatch(WriteBatch* src_batch) {
// this is used for reconstructing prepared transactions upon
// recovery. there should not be any meta markers in the batches
// we are processing.
Status MarkBeginPrepare() override { return Status::InvalidArgument(); }
Status MarkBeginPrepare(bool) override { return Status::InvalidArgument(); }

Status MarkEndPrepare(const Slice&) override {
return Status::InvalidArgument();
Expand Down
2 changes: 1 addition & 1 deletion utilities/transactions/write_prepared_txn.cc
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ Status WritePreparedTxn::RollbackInternal() {
}

Status MarkNoop(bool) override { return Status::OK(); }
Status MarkBeginPrepare() override { return Status::OK(); }
Status MarkBeginPrepare(bool) override { return Status::OK(); }
Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
Status MarkCommit(const Slice&) override { return Status::OK(); }
Status MarkRollback(const Slice&) override {
Expand Down
1 change: 1 addition & 0 deletions utilities/transactions/write_prepared_txn.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class WritePreparedTxn : public PessimisticTransaction {
private:
friend class WritePreparedTransactionTest_BasicRecoveryTest_Test;
friend class WritePreparedTxnDB;
friend class WriteUnpreparedTxnDB;

Status PrepareInternal() override;

Expand Down
8 changes: 6 additions & 2 deletions utilities/transactions/write_prepared_txn_db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,13 @@ Status WritePreparedTxnDB::Initialize(
assert(dbimpl != nullptr);
auto rtxns = dbimpl->recovered_transactions();
for (auto rtxn : rtxns) {
auto cnt = rtxn.second->batch_cnt_ ? rtxn.second->batch_cnt_ : 1;
// There should only be one batch for WritePrepared policy.
assert(rtxn.second->batches_.size() == 1);
const auto& seq = rtxn.second->batches_.begin()->first;
const auto& batch_info = rtxn.second->batches_.begin()->second;
auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
for (size_t i = 0; i < cnt; i++) {
AddPrepared(rtxn.second->seq_ + i);
AddPrepared(seq + i);
}
}
SequenceNumber prev_max = max_evicted_seq_;
Expand Down
Loading

0 comments on commit b984637

Please sign in to comment.