Skip to content

Commit

Permalink
more times in perf_context and iostats_context
Browse files Browse the repository at this point in the history
Summary:
We occasionally get write stalls (>1s Write() calls) on HDD under read load. The following timers explain almost all of the stalls:
 - perf_context.db_mutex_lock_nanos
 - perf_context.db_condition_wait_nanos
 - iostats_context.open_time
 - iostats_context.allocate_time
 - iostats_context.write_time
 - iostats_context.range_sync_time
 - iostats_context.logger_time

In my experiments each of these occasionally takes >1s on write path under some workload. There are rare cases when Write() takes long but none of these takes long.

Test Plan: Added code to our application to write the listed timings to log for slow writes. They usually add up to almost exactly the time Write() call took.

Reviewers: rven, yhchiang, sdong

Reviewed By: sdong

Subscribers: march, dhruba, tnovak

Differential Revision: https://reviews.facebook.net/D39177
  • Loading branch information
al13n321 committed Jun 2, 2015
1 parent 4266d4f commit ec7a944
Show file tree
Hide file tree
Showing 14 changed files with 257 additions and 89 deletions.
3 changes: 3 additions & 0 deletions db/db_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3322,7 +3322,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
if (context.schedule_bg_work_) {
MaybeScheduleFlushOrCompaction();
}
PERF_TIMER_STOP(write_pre_and_post_process_time);
PERF_TIMER_GUARD(write_delay_time);
status = DelayWrite(expiration_time);
PERF_TIMER_START(write_pre_and_post_process_time);
}

if (UNLIKELY(status.ok() && has_timeout &&
Expand Down
16 changes: 16 additions & 0 deletions include/rocksdb/iostats_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
#include <stdint.h>
#include <string>

#include "rocksdb/perf_level.h"

// A thread local context for gathering io-stats efficiently and transparently.
// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.

namespace rocksdb {

struct IOStatsContext {
Expand All @@ -25,6 +29,18 @@ struct IOStatsContext {
uint64_t bytes_written;
// number of bytes that has been read.
uint64_t bytes_read;

// time spent in open() and fopen().
uint64_t open_nanos;
// time spent in fallocate().
uint64_t allocate_nanos;
// time spent in write() and pwrite().
uint64_t write_nanos;
// time spent in sync_file_range().
uint64_t range_sync_nanos;

// time spent in Logger::Logv().
uint64_t logger_nanos;
};

#ifndef IOS_CROSS_COMPILE
Expand Down
26 changes: 11 additions & 15 deletions include/rocksdb/perf_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,13 @@
#include <stdint.h>
#include <string>

namespace rocksdb {

enum PerfLevel {
kDisable = 0, // disable perf stats
kEnableCount = 1, // enable only count stats
kEnableTime = 2 // enable time stats too
};
#include "rocksdb/perf_level.h"

// set the perf stats level
void SetPerfLevel(PerfLevel level);

// get current perf stats level
PerfLevel GetPerfLevel();
namespace rocksdb {

// A thread local context for gathering performance counter efficiently
// and transparently.
// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.

struct PerfContext {

Expand Down Expand Up @@ -64,11 +55,16 @@ struct PerfContext {
uint64_t seek_internal_seek_time;
// total time spent on iterating internal entries to find the next user entry
uint64_t find_next_user_entry_time;
// total time spent on pre or post processing when writing a record
uint64_t write_pre_and_post_process_time;
uint64_t write_wal_time; // total time spent on writing to WAL

// total time spent on writing to WAL
uint64_t write_wal_time;
// total time spent on writing to mem tables
uint64_t write_memtable_time;
// total time spent on delaying write
uint64_t write_delay_time;
// total time spent on writing a record, excluding the above three times
uint64_t write_pre_and_post_process_time;

uint64_t db_mutex_lock_nanos; // time spent on acquiring DB mutex.
// Time spent on waiting with a condition variable created with DB mutex.
uint64_t db_condition_wait_nanos;
Expand Down
30 changes: 30 additions & 0 deletions include/rocksdb/perf_level.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#ifndef INCLUDE_ROCKSDB_PERF_LEVEL_H_
#define INCLUDE_ROCKSDB_PERF_LEVEL_H_

#include <stdint.h>
#include <string>

namespace rocksdb {

// How much perf stats to collect. Affects perf_context and iostats_context.

enum PerfLevel {
kDisable = 0, // disable perf stats
kEnableCount = 1, // enable only count stats
kEnableTime = 2 // enable time stats too
};

// set the perf stats level for current thread
void SetPerfLevel(PerfLevel level);

// get current perf stats level for current thread
PerfLevel GetPerfLevel();

} // namespace rocksdb

#endif // INCLUDE_ROCKSDB_PERF_LEVEL_H_
1 change: 1 addition & 0 deletions src.mk
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ LIB_SOURCES = \
util/options.cc \
util/options_helper.cc \
util/perf_context.cc \
util/perf_level.cc \
util/rate_limiter.cc \
util/skiplistrep.cc \
util/slice.cc \
Expand Down
79 changes: 62 additions & 17 deletions util/env_posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -439,14 +439,17 @@ class PosixMmapFile : public WritableFile {

TEST_KILL_RANDOM(rocksdb_kill_odds);
// we can't fallocate with FALLOC_FL_KEEP_SIZE here
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
if (alloc_status != 0) {
// fallback to posix_fallocate
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
}
if (alloc_status != 0) {
return Status::IOError("Error allocating space to file : " + filename_ +
"Error : " + strerror(alloc_status));
{
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
if (alloc_status != 0) {
// fallback to posix_fallocate
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
}
if (alloc_status != 0) {
return Status::IOError("Error allocating space to file : " + filename_ +
"Error : " + strerror(alloc_status));
}
}

TEST_KILL_RANDOM(rocksdb_kill_odds);
Expand Down Expand Up @@ -635,6 +638,7 @@ class PosixMmapFile : public WritableFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) override {
TEST_KILL_RANDOM(rocksdb_kill_odds);
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
if (alloc_status == 0) {
Expand Down Expand Up @@ -721,7 +725,12 @@ class PosixWritableFile : public WritableFile {
cursize_ += left;
} else {
while (left != 0) {
ssize_t done = write(fd_, src, RequestToken(left));
ssize_t done;
size_t size = RequestToken(left);
{
IOSTATS_TIMER_GUARD(write_nanos);
done = write(fd_, src, size);
}
if (done < 0) {
if (errno == EINTR) {
continue;
Expand Down Expand Up @@ -769,6 +778,7 @@ class PosixWritableFile : public WritableFile {
// tmpfs (since Linux 3.5)
// We ignore error since failure of this operation does not affect
// correctness.
IOSTATS_TIMER_GUARD(allocate_nanos);
fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
filesize_, block_size * last_allocated_block - filesize_);
#endif
Expand All @@ -787,7 +797,12 @@ class PosixWritableFile : public WritableFile {
size_t left = cursize_;
char* src = buf_.get();
while (left != 0) {
ssize_t done = write(fd_, src, RequestToken(left));
ssize_t done;
size_t size = RequestToken(left);
{
IOSTATS_TIMER_GUARD(write_nanos);
done = write(fd_, src, size);
}
if (done < 0) {
if (errno == EINTR) {
continue;
Expand Down Expand Up @@ -861,7 +876,9 @@ class PosixWritableFile : public WritableFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) override {
TEST_KILL_RANDOM(rocksdb_kill_odds);
int alloc_status = fallocate(
int alloc_status;
IOSTATS_TIMER_GUARD(allocate_nanos);
alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
if (alloc_status == 0) {
return Status::OK();
Expand All @@ -871,6 +888,7 @@ class PosixWritableFile : public WritableFile {
}

virtual Status RangeSync(off_t offset, off_t nbytes) override {
IOSTATS_TIMER_GUARD(range_sync_nanos);
if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
return Status::OK();
} else {
Expand Down Expand Up @@ -929,7 +947,11 @@ class PosixRandomRWFile : public RandomRWFile {
pending_fsync_ = true;

while (left != 0) {
ssize_t done = pwrite(fd_, src, left, offset);
ssize_t done;
{
IOSTATS_TIMER_GUARD(write_nanos);
done = pwrite(fd_, src, left, offset);
}
if (done < 0) {
if (errno == EINTR) {
continue;
Expand Down Expand Up @@ -1001,6 +1023,7 @@ class PosixRandomRWFile : public RandomRWFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) override {
TEST_KILL_RANDOM(rocksdb_kill_odds);
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
if (alloc_status == 0) {
Expand Down Expand Up @@ -1109,6 +1132,7 @@ class PosixEnv : public Env {
result->reset();
FILE* f = nullptr;
do {
IOSTATS_TIMER_GUARD(open_nanos);
f = fopen(fname.c_str(), "r");
} while (f == nullptr && errno == EINTR);
if (f == nullptr) {
Expand All @@ -1127,7 +1151,11 @@ class PosixEnv : public Env {
const EnvOptions& options) override {
result->reset();
Status s;
int fd = open(fname.c_str(), O_RDONLY);
int fd;
{
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_RDONLY);
}
SetFD_CLOEXEC(fd, &options);
if (fd < 0) {
s = IOError(fname, errno);
Expand Down Expand Up @@ -1160,6 +1188,7 @@ class PosixEnv : public Env {
Status s;
int fd = -1;
do {
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
} while (fd < 0 && errno == EINTR);
if (fd < 0) {
Expand Down Expand Up @@ -1200,7 +1229,11 @@ class PosixEnv : public Env {
return Status::NotSupported("No support for mmap read/write yet");
}
Status s;
const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644);
int fd;
{
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644);
}
if (fd < 0) {
s = IOError(fname, errno);
} else {
Expand All @@ -1213,7 +1246,11 @@ class PosixEnv : public Env {
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) override {
result->reset();
const int fd = open(name.c_str(), 0);
int fd;
{
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(name.c_str(), 0);
}
if (fd < 0) {
return IOError(name, errno);
} else {
Expand Down Expand Up @@ -1325,7 +1362,11 @@ class PosixEnv : public Env {
virtual Status LockFile(const std::string& fname, FileLock** lock) override {
*lock = nullptr;
Status result;
int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
int fd;
{
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
}
if (fd < 0) {
result = IOError(fname, errno);
} else if (LockOrUnlock(fname, fd, true) == -1) {
Expand Down Expand Up @@ -1396,7 +1437,11 @@ class PosixEnv : public Env {

virtual Status NewLogger(const std::string& fname,
shared_ptr<Logger>* result) override {
FILE* f = fopen(fname.c_str(), "w");
FILE* f;
{
IOSTATS_TIMER_GUARD(open_nanos);
f = fopen(fname.c_str(), "w");
}
if (f == nullptr) {
result->reset();
return IOError(fname, errno);
Expand Down
13 changes: 12 additions & 1 deletion util/iostats_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ void IOStatsContext::Reset() {
thread_pool_id = Env::Priority::TOTAL;
bytes_read = 0;
bytes_written = 0;
open_nanos = 0;
allocate_nanos = 0;
write_nanos = 0;
range_sync_nanos = 0;
logger_nanos = 0;
}

#define OUTPUT(counter) #counter << " = " << counter << ", "
Expand All @@ -25,7 +30,13 @@ std::string IOStatsContext::ToString() const {
std::ostringstream ss;
ss << OUTPUT(thread_pool_id)
<< OUTPUT(bytes_read)
<< OUTPUT(bytes_written);
<< OUTPUT(bytes_written)
<< OUTPUT(open_nanos)
<< OUTPUT(allocate_nanos)
<< OUTPUT(write_nanos)
<< OUTPUT(range_sync_nanos)
<< OUTPUT(logger_nanos);

return ss.str();
}

Expand Down
17 changes: 17 additions & 0 deletions util/iostats_context_imp.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
//
#pragma once
#include "rocksdb/iostats_context.h"
#include "util/perf_step_timer.h"

#ifndef IOS_CROSS_COMPILE

Expand Down Expand Up @@ -33,6 +34,18 @@
#define IOSTATS(metric) \
(iostats_context.metric)

// Stop the timer and update the metric
#define IOSTATS_TIMER_STOP(metric) \
iostats_step_timer_ ## metric.Stop();

#define IOSTATS_TIMER_START(metric) \
iostats_step_timer_ ## metric.Start();

// Declare and set start time of the timer
#define IOSTATS_TIMER_GUARD(metric) \
PerfStepTimer iostats_step_timer_ ## metric(&(iostats_context.metric)); \
iostats_step_timer_ ## metric.Start();

#else // IOS_CROSS_COMPILE

#define IOSTATS_ADD(metric, value)
Expand All @@ -43,4 +56,8 @@
#define IOSTATS_THREAD_POOL_ID()
#define IOSTATS(metric) 0

#define IOSTATS_TIMER_GUARD(metric)
#define IOSTATS_TIMER_STOP(metric)
#define IOSTATS_TIMER_START(metric)

#endif // IOS_CROSS_COMPILE
Loading

0 comments on commit ec7a944

Please sign in to comment.