Skip to content

Commit

Permalink
KUDU-2705: Add timing information to /scans
Browse files Browse the repository at this point in the history
This patch adds timing information to the /scans page. Specifically, it
adds time(1)-like timing information for the time spent by a service
thread working with a scanner, including both initialization and
continuation of scans.

Screenshot: https://github.com/wdberkeley/kudu/blob/scanspage_screenshot/ss.png

Change-Id: Id30251557ae7f0d97e0d1328bd53511dae7a820b
Reviewed-on: http://gerrit.cloudera.org:8080/12604
Reviewed-by: Adar Dembo <[email protected]>
Tested-by: Adar Dembo <[email protected]>
  • Loading branch information
wdberkeley committed Mar 7, 2019
1 parent e0557a2 commit 78d2c2f
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 6 deletions.
7 changes: 7 additions & 0 deletions src/kudu/tserver/scanners.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include "kudu/util/logging.h"
#include "kudu/util/metrics.h"
#include "kudu/util/status.h"
#include "kudu/util/stopwatch.h"
#include "kudu/util/thread.h"

DEFINE_int32(scanner_ttl_ms, 60000,
Expand Down Expand Up @@ -351,6 +352,11 @@ void Scanner::UpdateAccessTime() {
last_access_time_ = MonoTime::Now();
}

void Scanner::AddTimings(const CpuTimes& elapsed) {
std::lock_guard<simple_spinlock> l(lock_);
cpu_times_.Add(elapsed);
}

void Scanner::Init(unique_ptr<RowwiseIterator> iter,
gscoped_ptr<ScanSpec> spec) {
std::lock_guard<simple_spinlock> l(lock_);
Expand Down Expand Up @@ -414,6 +420,7 @@ ScanDescriptor Scanner::descriptor() const {
std::lock_guard<simple_spinlock> l(lock_);
descriptor.last_call_seq_id = call_seq_id_;
descriptor.last_access_time = last_access_time_;
descriptor.cpu_times = cpu_times_;
}

return descriptor;
Expand Down
43 changes: 43 additions & 0 deletions src/kudu/tserver/scanners.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "kudu/util/mutex.h"
#include "kudu/util/oid_generator.h"
#include "kudu/util/rw_mutex.h"
#include "kudu/util/stopwatch.h"

namespace kudu {

Expand Down Expand Up @@ -225,6 +226,9 @@ class Scanner {
// period.
void UpdateAccessTime();

// Add the timings in 'elapsed' to the total timings for this scanner.
void AddTimings(const CpuTimes& elapsed);

// Return the auto-release pool which will be freed when this scanner
// closes. This can be used as a storage area for the ScanSpec and any
// associated data (eg storage for its predicates).
Expand Down Expand Up @@ -380,6 +384,10 @@ class Scanner {
// this scanner.
int64_t num_rows_returned_;

// The cumulative amounts of wall, user cpu, and system cpu time spent on
// this scanner, in seconds.
CpuTimes cpu_times_;

DISALLOW_COPY_AND_ASSIGN(Scanner);
};

Expand Down Expand Up @@ -420,6 +428,41 @@ struct ScanDescriptor {
MonoTime start_time;
MonoTime last_access_time;
uint32_t last_call_seq_id;

// The cumulative amounts of wall, user cpu, and system cpu time spent on
// this scanner, in seconds.
CpuTimes cpu_times;
};

// RAII wrapper to update a scanner with timing information upon scope exit.
class ScopedAddScannerTiming {
public:
// 'scanner' must outlive the scoped object.
explicit ScopedAddScannerTiming(Scanner* scanner)
: stopped_(false),
scanner_(scanner) {
sw_.start();
}

~ScopedAddScannerTiming() {
if (!stopped_) {
Stop();
}
}

// Stop the timing and update the scanner.
void Stop() {
stopped_ = true;
sw_.stop();
scanner_->AddTimings(sw_.elapsed());
scanner_->UpdateAccessTime();
}

bool stopped_;
Scanner* scanner_;
Stopwatch sw_;

DISALLOW_COPY_AND_ASSIGN(ScopedAddScannerTiming);
};

} // namespace tserver
Expand Down
16 changes: 11 additions & 5 deletions src/kudu/tserver/tablet_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1984,9 +1984,10 @@ Status TabletServiceImpl::HandleNewScanRequest(TabletReplica* replica,
// If we early-exit out of this function, automatically unregister
// the scanner.
ScopedUnregisterScanner unreg_scanner(server_->scanner_manager(), scanner->id());
ScopedAddScannerTiming scanner_timer(scanner.get());

// Create the user's requested projection.
// TODO: add test cases for bad projections including 0 columns
// TODO(todd): Add test cases for bad projections including 0 columns.
Schema projection;
Status s = ColumnPBsToSchema(scan_pb.projected_columns(), &projection);
if (PREDICT_FALSE(!s.ok())) {
Expand Down Expand Up @@ -2167,6 +2168,9 @@ Status TabletServiceImpl::HandleNewScanRequest(TabletReplica* replica,
}

scanner->Init(std::move(iter), std::move(orig_spec));

// Stop the scanner timer because ContinueScanRequest starts its own timer.
scanner_timer.Stop();
unreg_scanner.Cancel();
*scanner_id = scanner->id();

Expand All @@ -2175,8 +2179,11 @@ Status TabletServiceImpl::HandleNewScanRequest(TabletReplica* replica,
size_t batch_size_bytes = GetMaxBatchSizeBytesHint(req);
if (batch_size_bytes > 0) {
TRACE("Continuing scan request");
// TODO: instead of copying the pb, instead split HandleContinueScanRequest
// and call the second half directly
// TODO(wdberkeley): Instead of copying the pb, instead split
// HandleContinueScanRequest and call the second half directly. Once that's
// done, remove the call to ScopedAddScannerTiming::Stop() above (and the
// method as it won't be used) and start the timing for continue requests
// from the first half that is no longer executed in this codepath.
ScanRequestPB continue_req(*req);
continue_req.set_scanner_id(scanner->id());
RETURN_NOT_OK(HandleContinueScanRequest(&continue_req, rpc_context, result_collector,
Expand Down Expand Up @@ -2231,6 +2238,7 @@ Status TabletServiceImpl::HandleContinueScanRequest(const ScanRequestPB* req,

// If we early-exit out of this function, automatically unregister the scanner.
ScopedUnregisterScanner unreg_scanner(server_->scanner_manager(), scanner->id());
ScopedAddScannerTiming scanner_timer(scanner.get());

VLOG(2) << "Found existing scanner " << scanner->id() << " for request: "
<< SecureShortDebugString(*req);
Expand All @@ -2246,7 +2254,6 @@ Status TabletServiceImpl::HandleContinueScanRequest(const ScanRequestPB* req,
return Status::InvalidArgument("Invalid call sequence ID in scan request");
}
scanner->IncrementCallSeqId();
scanner->UpdateAccessTime();

RowwiseIterator* iter = scanner->iter();

Expand Down Expand Up @@ -2350,7 +2357,6 @@ Status TabletServiceImpl::HandleContinueScanRequest(const ScanRequestPB* req,
tablet->metrics()->scanner_bytes_scanned_from_disk->IncrementBy(delta_stats.bytes_read);
}

scanner->UpdateAccessTime();
*has_more_results = !req->close_scanner() && iter->HasNext() &&
!scanner->has_fulfilled_limit();
if (*has_more_results) {
Expand Down
17 changes: 17 additions & 0 deletions src/kudu/tserver/tserver_path_handlers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
#include "kudu/util/maintenance_manager.pb.h"
#include "kudu/util/monotime.h"
#include "kudu/util/pb_util.h"
#include "kudu/util/stopwatch.h"
#include "kudu/util/url-coding.h"
#include "kudu/util/web_callback_registry.h"

Expand Down Expand Up @@ -543,6 +544,14 @@ void ScanToJson(const ScanDescriptor& scan, EasyJson* json) {
json->Set("time_since_start",
HumanReadableElapsedTime::ToShortString(time_since_start.ToSeconds()));

const auto& cpu_times = scan.cpu_times;
json->Set("wall_secs",
HumanReadableElapsedTime::ToShortString(cpu_times.wall_seconds()));
json->Set("user_secs",
HumanReadableElapsedTime::ToShortString(cpu_times.user_cpu_seconds()));
json->Set("sys_secs",
HumanReadableElapsedTime::ToShortString(cpu_times.system_cpu_seconds()));

json->Set("duration_title", duration.ToSeconds());
json->Set("time_since_start_title", time_since_start.ToSeconds());

Expand All @@ -551,8 +560,16 @@ void ScanToJson(const ScanDescriptor& scan, EasyJson* json) {
}
} // anonymous namespace

const char* kLongTimingTitle = "wall time, user cpu time, and system cpu time "
"spent processing the scan: the wall time differs from the duration in that "
"it counts the amount of time the server spent (or has spent so far) "
"processing the scan, whereas the duration measures the amount of time "
"the scanner was (or is so far) open, but possibly dormant waiting for the "
"client to request to continue the scan.";

void TabletServerPathHandlers::HandleScansPage(const Webserver::WebRequest& /*req*/,
Webserver::WebResponse* resp) {
resp->output->Set("timing_title", kLongTimingTitle);
EasyJson scans = resp->output->Set("scans", EasyJson::kArray);
vector<ScanDescriptor> descriptors = tserver_->scanner_manager()->ListScans();

Expand Down
12 changes: 12 additions & 0 deletions src/kudu/util/stopwatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,20 @@ struct CpuTimes {
nanosecond_type system;
int64_t context_switches;

CpuTimes() {
clear();
}

void clear() { wall = user = system = context_switches = 0LL; }

// Add 'other' to this instance in the obvious way.
void Add(const CpuTimes& other) {
wall += other.wall;
user += other.user;
system += other.system;
context_switches += other.context_switches;
}

// Return a string formatted similar to the output of the "time" shell command.
std::string ToString() const {
return StringPrintf(
Expand Down
4 changes: 3 additions & 1 deletion www/scans.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ under the License.
<th>State</th>
<th title="pseudo-SQL query description">Query</th>
<th>Requestor</th>
<th title="running time of the scan">Duration</th>
<th title="amount of time that the scanner was/has been open on the server">Duration</th>
<th title="elapsed time since the scan started">Time since start</th>
<th title="{{timing_title}}">Timing</th>
<th>Column Stats</th>
</tr>
</thead>
Expand All @@ -42,6 +43,7 @@ under the License.
<td><samp>{{requestor}}</samp></td>
<td title="{{duration_title}}">{{duration}}</td>
<td title="{{time_since_start_title}}">{{time_since_start}}</td>
<td>real: {{wall_secs}} user: {{user_secs}} sys: {{sys_secs}}</td>
<td>
<table class="table table-striped">
Expand Down

0 comments on commit 78d2c2f

Please sign in to comment.