WritePrepared Txn: Advance seq one per batch
Summary:
By default the seq number in the DB is increased once per written key. WritePrepared txns require the seq to be increased once per the entire batch so that the seq can serve as the prepare timestamp by which the transaction is identified. We also need to increase the seq for the commit marker, since it gives a unique id to the commit timestamp of the transaction.

Two unit tests are added to verify our understanding of how the seq should be increased. The recovery path requires much more work and is left to another patch.
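
To make the counting concrete, here is a minimal illustrative sketch, separate from the diffs below; FakeBatch and SeqInc are made-up names, and the real logic is the one-line seq_inc computation in DBImpl::WriteImpl:

// Illustrative sketch only -- not RocksDB code. FakeBatch and SeqInc are
// hypothetical names; the real computation is the seq_inc expression in
// DBImpl::WriteImpl in the diff below.
#include <cstddef>
#include <vector>

struct FakeBatch {
  size_t num_keys;  // keys written by this batch
};

// Number of sequence numbers a write group of batches consumes.
size_t SeqInc(const std::vector<FakeBatch>& write_group, bool seq_per_batch) {
  if (seq_per_batch) {
    // One seq for the whole batch; that seq doubles as the prepare
    // timestamp that identifies the transaction.
    return write_group.size();
  }
  size_t total_count = 0;
  for (const auto& batch : write_group) {
    total_count += batch.num_keys;  // default: one seq per written key
  }
  return total_count;
}

For example, a group of three batches holding 2, 1, and 4 keys consumes 3 sequence numbers with seq_per_batch and 7 without it. A commit marker is written as its own batch, so it too consumes one seq, which gives the commit timestamp a unique id.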
Closes facebook#2885

Differential Revision: D5837843

Pulled By: maysamyabandeh

fbshipit-source-id: a08960b93d727e1cf438c254d0c2636fb133cc1c
Maysam Yabandeh authored and facebook-github-bot committed Sep 18, 2017
1 parent c57050b commit 60beefd
Showing 18 changed files with 329 additions and 72 deletions.
3 changes: 2 additions & 1 deletion db/db_impl.cc
@@ -191,7 +191,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
       refitting_level_(false),
       opened_successfully_(false),
       concurrent_prepare_(options.concurrent_prepare),
-      manual_wal_flush_(options.manual_wal_flush) {
+      manual_wal_flush_(options.manual_wal_flush),
+      seq_per_batch_(options.seq_per_batch) {
   env_->GetAbsolutePath(dbname, &db_absolute_path_);

   // Reserve ten files or so for other uses and give the rest to TableCache.
3 changes: 2 additions & 1 deletion db/db_impl.h
@@ -772,7 +772,7 @@ class DBImpl : public DB {

   Status ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
                               uint64_t* log_used, SequenceNumber* last_sequence,
-                              int total_count);
+                              size_t seq_inc);

   // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
   void WriteCallbackStatusCheck(const Status& status);
@@ -1267,6 +1267,7 @@ class DBImpl : public DB {
   // 2PC these are the writes at Prepare phase.
   const bool concurrent_prepare_;
   const bool manual_wal_flush_;
+  const bool seq_per_batch_;
 };

 extern Options SanitizeOptions(const std::string& db,
2 changes: 1 addition & 1 deletion db/db_impl_open.cc
@@ -674,7 +674,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
       status = WriteBatchInternal::InsertInto(
           &batch, column_family_memtables_.get(), &flush_scheduler_, true,
           log_number, this, false /* concurrent_memtable_writes */,
-          next_sequence, &has_valid_writes);
+          next_sequence, &has_valid_writes, seq_per_batch_);
       MaybeIgnoreError(&status);
       if (!status.ok()) {
         // We are treating this as a failure while reading since we read valid
39 changes: 26 additions & 13 deletions db/db_impl_write.cc
@@ -68,6 +68,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     return Status::NotSupported(
         "pipelined_writes is not compatible with concurrent prepares");
   }
+  if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) {
+    return Status::NotSupported(
+        "pipelined_writes is not compatible with seq_per_batch");
+  }

   Status status;
   if (write_options.low_pri) {
@@ -184,7 +188,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   // more than once to a particular key.
   bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
                   write_group.size > 1;
-  int total_count = 0;
+  size_t total_count = 0;
   uint64_t total_byte_size = 0;
   for (auto* writer : write_group) {
     if (writer->CheckCallback(this)) {
@@ -197,6 +201,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
           total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
     }
   }
+  size_t seq_inc = seq_per_batch_ ? write_group.size : total_count;

   const bool concurrent_update = concurrent_prepare_;
   // Update stats while we are an exclusive group leader, so we know
@@ -238,15 +243,15 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
       // LastToBeWrittenSequence is increased inside WriteToWAL under
       // wal_write_mutex_ to ensure ordered events in WAL
       status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
-                                    total_count);
+                                    seq_inc);
     } else {
       // Otherwise we inc seq number for memtable writes
-      last_sequence = versions_->FetchAddLastToBeWrittenSequence(total_count);
+      last_sequence = versions_->FetchAddLastToBeWrittenSequence(seq_inc);
     }
   }
   assert(last_sequence != kMaxSequenceNumber);
   const SequenceNumber current_sequence = last_sequence + 1;
-  last_sequence += total_count;
+  last_sequence += seq_inc;

   if (status.ok()) {
     PERF_TIMER_GUARD(write_memtable_time);
@@ -255,12 +260,16 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
         w.status = WriteBatchInternal::InsertInto(
             write_group, current_sequence, column_family_memtables_.get(),
             &flush_scheduler_, write_options.ignore_missing_column_families,
-            0 /*recovery_log_number*/, this);
+            0 /*recovery_log_number*/, this, parallel, seq_per_batch_);
       } else {
         SequenceNumber next_sequence = current_sequence;
         for (auto* writer : write_group) {
           if (writer->ShouldWriteToMemtable()) {
             writer->sequence = next_sequence;
+          }
+          if (seq_per_batch_) {
+            next_sequence++;
+          } else if (writer->ShouldWriteToMemtable()) {
             next_sequence += WriteBatchInternal::Count(writer->batch);
           }
         }
@@ -281,9 +290,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
             write_options.ignore_missing_column_families, 0 /*log_number*/,
             this, true /*concurrent_memtable_writes*/);
       }
-      if (seq_used != nullptr) {
-        *seq_used = w.sequence;
-      }
     }
+    if (seq_used != nullptr) {
+      *seq_used = w.sequence;
+    }
   }
 }
@@ -427,7 +436,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
     memtable_write_group.status = WriteBatchInternal::InsertInto(
         memtable_write_group, w.sequence, column_family_memtables_.get(),
         &flush_scheduler_, write_options.ignore_missing_column_families,
-        0 /*log_number*/, this);
+        0 /*log_number*/, this, seq_per_batch_);
     versions_->SetLastSequence(memtable_write_group.last_sequence);
     write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
   }
@@ -521,12 +530,16 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
   PERF_TIMER_GUARD(write_wal_time);
   // LastToBeWrittenSequence is increased inside WriteToWAL under
   // wal_write_mutex_ to ensure ordered events in WAL
-  status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
-                                0 /*total_count*/);
+  size_t seq_inc = seq_per_batch_ ? write_group.size : 0 /*total_count*/;
+  status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
   auto curr_seq = last_sequence + 1;
   for (auto* writer : write_group) {
     if (writer->CheckCallback(this)) {
       writer->sequence = curr_seq;
+    }
+    if (seq_per_batch_) {
+      curr_seq++;
+    } else if (writer->CheckCallback(this)) {
       curr_seq += WriteBatchInternal::Count(writer->batch);
     }
   }
@@ -778,7 +791,7 @@ Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
 Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
                                     uint64_t* log_used,
                                     SequenceNumber* last_sequence,
-                                    int total_count) {
+                                    size_t seq_inc) {
   Status status;

   WriteBatch tmp_batch;
@@ -796,7 +809,7 @@ Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
       writer->log_used = logfile_number_;
     }
   }
-  *last_sequence = versions_->FetchAddLastToBeWrittenSequence(total_count);
+  *last_sequence = versions_->FetchAddLastToBeWrittenSequence(seq_inc);
   auto sequence = *last_sequence + 1;
   WriteBatchInternal::SetSequence(merged_batch, sequence);
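
Taken together, the loops above pick each writer's starting sequence. The following condensed sketch shows the assignment rule in both modes; it is simplified and hypothetical (Writer here is a stand-in for WriteThread::Writer, and the CheckCallback/ShouldWriteToMemtable guards of the real code are omitted):

// Condensed sketch of the sequence-assignment loops above; simplified and
// hypothetical, not the RocksDB implementation.
#include <cstddef>
#include <cstdint>
#include <vector>

using SequenceNumber = uint64_t;

struct Writer {
  size_t batch_count;       // keys in this writer's batch
  SequenceNumber sequence;  // starting seq assigned to the batch
};

void AssignSequences(std::vector<Writer>& write_group,
                     SequenceNumber current_sequence, bool seq_per_batch) {
  SequenceNumber next_sequence = current_sequence;
  for (auto& writer : write_group) {
    writer.sequence = next_sequence;
    if (seq_per_batch) {
      next_sequence++;  // the whole batch shares a single seq
    } else {
      next_sequence += writer.batch_count;  // one seq per key
    }
  }
}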
(Diffs for the remaining 14 changed files did not load.)