Skip to content

Commit

Permalink
[ksck] KUDU-3258: allow ksck and rebalancer to work on txn status table
Browse files Browse the repository at this point in the history
This patch adds the transaction system table to the ksck output in its
own section for system tables. Here's a sample snippet of output that
includes the system table:

Summary by system table
             Name              | RF |      Status      | Total Tablets | Healthy | Recovering | Under-replicated | Unavailable
-------------------------------+----+------------------+---------------+---------+------------+------------------+-------------
 kudu_system.kudu_transactions | 3  | UNDER_REPLICATED | 1             | 0       | 0          | 1                | 0

Summary by table
                         Name                          | RF |   Status    | Total Tablets | Healthy | Recovering | Under-replicated | Unavailable
-------------------------------------------------------+----+-------------+---------------+---------+------------+------------------+-------------
 default.loadgen_auto_05cf5be513ea4a84a052e8044f641c1a | 1  | UNAVAILABLE | 8             | 6       | 0          | 0                | 2
 default.loadgen_auto_0c7ea48d5f6948408694b176f70e69ec | 1  | UNAVAILABLE | 8             | 5       | 0          | 0                | 3
 default.loadgen_auto_241be343981c46d081ab2b3d2e3b6e6a | 1  | UNAVAILABLE | 8             | 5       | 0          | 0                | 3
 default.loadgen_auto_385476d5d3b6493f8cbf659c8a4cf7cc | 1  | UNAVAILABLE | 8             | 6       | 0          | 0                | 2
 default.loadgen_auto_430e280e8aa7450591da67ae15ff0f37 | 1  | UNAVAILABLE | 8             | 6       | 0          | 0                | 2

The section can be included/excluded via the --sections flag of ksck.

Since ksck and the rebalancer use the same cluster-examining code, this
patch also updates the rebalancer cluster_status class to account for
system tables. Without this change, the rebalancer would have crashed when
trying to look up the replication factor of the system table.

Change-Id: I8162f6eb046d98791c6bdeb5c15a0af72487300d
Reviewed-on: http://gerrit.cloudera.org:8080/17315
Tested-by: Andrew Wong <[email protected]>
Reviewed-by: Alexey Serbin <[email protected]>
  • Loading branch information
andrwng committed Apr 19, 2021
1 parent a354ce3 commit bdb6d06
Show file tree
Hide file tree
Showing 11 changed files with 247 additions and 94 deletions.
52 changes: 52 additions & 0 deletions src/kudu/integration-tests/txn_status_table-itest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "kudu/gutil/port.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/gutil/stl_util.h"
#include "kudu/gutil/strings/join.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/integration-tests/cluster_itest_util.h"
#include "kudu/integration-tests/test_workload.h"
Expand All @@ -51,13 +52,15 @@
#include "kudu/tablet/metadata.pb.h"
#include "kudu/tablet/tablet_metadata.h"
#include "kudu/tablet/tablet_replica.h"
#include "kudu/tools/tool_test_util.h"
#include "kudu/transactions/transactions.pb.h"
#include "kudu/transactions/txn_status_tablet.h"
#include "kudu/transactions/txn_system_client.h"
#include "kudu/tserver/mini_tablet_server.h"
#include "kudu/tserver/tablet_server.h"
#include "kudu/tserver/ts_tablet_manager.h"
#include "kudu/util/monotime.h"
#include "kudu/util/net/net_util.h"
#include "kudu/util/scoped_cleanup.h"
#include "kudu/util/status.h"
#include "kudu/util/test_macros.h"
Expand Down Expand Up @@ -85,6 +88,7 @@ using kudu::cluster::InternalMiniClusterOptions;
using kudu::itest::TServerDetails;
using kudu::itest::TabletServerMap;
using kudu::tablet::TabletReplica;
using kudu::tools::RunKuduTool;
using kudu::transactions::TxnStatePB;
using kudu::transactions::TxnStatusEntryPB;
using kudu::transactions::TxnStatusTablet;
Expand Down Expand Up @@ -207,6 +211,54 @@ TEST_F(TxnStatusTableITest, TestTxnStatusTableNotListed) {
ASSERT_NE(nullptr, table);
}

// Test that the transaction status table is visible in a separate section in
// ksck, and that its health is reported as other tables are.
//
// The test runs 'cluster ksck <masters>' three times: before the table exists,
// after it has been created, and after tablet server 0 has been shut down.
TEST_F(TxnStatusTableITest, TestTxnStatusTableInKsck) {
// Collect the string form of every master RPC address; they are joined with
// commas below to form the tool's master address argument.
vector<string> master_addrs;
for (const auto& hp : cluster_->master_rpc_addrs()) {
master_addrs.emplace_back(hp.ToString());
}
string out;
string err;
vector<string> ksck_args = { "cluster", "ksck", JoinStrings(master_addrs, ",") };
// First run: the table has not been created yet. The tool is still expected
// to succeed and to say that there are no matching system tables.
ASSERT_OK(RunKuduTool(ksck_args, &out, &err));
ASSERT_STR_CONTAINS(out, "The cluster doesn't have any matching system tables");

// Nothing should be logged on error for finding the txn status table.
// The captured 'err' text is appended to the failure message for diagnosis.
ASSERT_STR_NOT_CONTAINS(err, TxnStatusTablet::kTxnStatusTableName) << err;
// NOTE(review): the meaning of the argument 100 is not visible here --
// presumably it bounds the initial transaction ID range; confirm against
// CreateTxnStatusTable().
ASSERT_OK(txn_sys_client_->CreateTxnStatusTable(100));

// Second run: the system table section should now list the table with RF 1,
// status HEALTHY, one tablet in total, and one healthy tablet.
// NOTE(review): the column padding inside the pattern literals below looks
// collapsed in this rendering; verify the exact spacing against the real file.
ASSERT_OK(RunKuduTool(ksck_args, &out));
ASSERT_STR_MATCHES(out,
"^Summary by system table.*\n"
"^ Name | RF | Status | Total Tablets | Healthy .*\n"
"^-------------------------------+----+---------+---------------+---------.*\n"
"^ kudu_system.kudu_transactions | 1 | HEALTHY | 1 | 1 .*\n");

// Now bring down a tablet server and we should see the health update.
cluster_->mini_tablet_server(0)->Shutdown();
// Third run: the tool is expected to return a RuntimeError status, and the
// table's row should report UNAVAILABLE with zero healthy tablets.
Status s = RunKuduTool(ksck_args, &out);
ASSERT_TRUE(s.IsRuntimeError()) << s.ToString();
ASSERT_STR_MATCHES(out,
"^Summary by system table.*\n"
"^ Name | RF | Status | Total Tablets | Healthy .*\n"
"^-------------------------------+----+-------------+---------------+---------.*\n"
"^ kudu_system.kudu_transactions | 1 | UNAVAILABLE | 1 | 0 .*\n");
}

// Verify that the rebalancer tool runs successfully on a cluster that hosts
// the transaction status table, even though that table cannot be listed.
// Earlier iterations of the rebalancer may have crashed while looking up the
// replication factor of a table they did not know about.
TEST_F(TxnStatusTableITest, TestTxnStatusTableInRebalancer) {
  // Collect the string form of every master RPC address.
  vector<string> addrs;
  for (const auto& master_hp : cluster_->master_rpc_addrs()) {
    addrs.emplace_back(master_hp.ToString());
  }
  ASSERT_OK(txn_sys_client_->CreateTxnStatusTable(100));

  // Invoke 'cluster rebalance <comma-separated master addresses>'.
  const string master_list = JoinStrings(addrs, ",");
  vector<string> tool_args = { "cluster", "rebalance", master_list };
  ASSERT_OK(RunKuduTool(tool_args));
}

// Test that only the service- or super-user can create or alter the
// transaction status table.
TEST_F(TxnStatusTableITest, TestProtectCreateAndAlter) {
Expand Down
1 change: 1 addition & 0 deletions src/kudu/rebalance/cluster_status.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ struct ClusterStatus {
// Tablet information includes consensus state.
std::vector<TabletSummary> tablet_summaries;
std::vector<TableSummary> table_summaries;
std::vector<TableSummary> system_table_summaries;
};

} // namespace cluster_summary
Expand Down
133 changes: 79 additions & 54 deletions src/kudu/tools/ksck-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
#include <set>
#include <sstream>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
Expand All @@ -50,6 +49,7 @@
#include "kudu/tablet/tablet.pb.h"
#include "kudu/tools/ksck_checksum.h"
#include "kudu/tools/ksck_results.h"
#include "kudu/transactions/txn_status_tablet.h"
#include "kudu/util/jsonreader.h"
#include "kudu/util/scoped_cleanup.h"
#include "kudu/util/status.h"
Expand All @@ -74,6 +74,7 @@ using kudu::cluster_summary::TableSummary;
using kudu::cluster_summary::TabletSummary;
using kudu::server::GetFlagsResponsePB;
using kudu::tablet::TabletDataState;
using kudu::transactions::TxnStatusTablet;

using std::make_shared;
using std::ostringstream;
Expand Down Expand Up @@ -256,6 +257,7 @@ class MockKsckCluster : public KsckCluster {
using KsckCluster::masters_;
using KsckCluster::tables_;
using KsckCluster::tablet_servers_;
using KsckCluster::txn_sys_table_;
};

class KsckTest : public KuduTest {
Expand Down Expand Up @@ -312,7 +314,7 @@ class KsckTest : public KuduTest {
table_summary.consensus_mismatch_tablets = consensus_mismatch_tablets;
table_summary.unavailable_tablets = unavailable_tablets;
std::ostringstream oss;
PrintTableSummaries({ table_summary }, oss);
PrintTableSummaries({ table_summary }, "table", oss);
return oss.str();
}

Expand All @@ -331,13 +333,20 @@ class KsckTest : public KuduTest {
}
}

void CreateOneTableOneTablet() {
CreateDefaultAssignmentPlan(1);
// Sets up a user table "test" holding one tablet ("tablet-id-1"). When
// 'create_txn_status_table' is true, it also sets up the transaction status
// table holding one tablet ("sys-tablet-id-1").
void CreateOneTableOneTablet(bool create_txn_status_table = false) {
// The default assignment plan is created with 2 when the system table is
// requested as well, and with 1 otherwise.
NO_FATALS(CreateDefaultAssignmentPlan(create_txn_status_table ? 2 : 1));

auto table = CreateAndAddTable("test", 1);
auto tablet(make_shared<KsckTablet>(table.get(), "tablet-id-1"));
// NOTE(review): the tablet is filled by both the bare call and the
// NO_FATALS-wrapped call below. This looks like the removed and added sides
// of a diff hunk rendered together -- confirm that only the wrapped call
// exists in the real file.
CreateAndFillTablet(tablet, 1, true, true);
NO_FATALS(CreateAndFillTablet(tablet, 1, true, true));
table->set_tablets({ tablet });

if (create_txn_status_table) {
// Mirror the user-table setup above for the transaction status table.
auto sys_table = CreateAndAddTxnStatusTable(1);
auto sys_tablet(make_shared<KsckTablet>(sys_table.get(), "sys-tablet-id-1"));
NO_FATALS(CreateAndFillTablet(sys_tablet, 1, true, true));
sys_table->set_tablets({ sys_tablet });
}
}

void CreateOneSmallReplicatedTable(const string& table_name = "test",
Expand Down Expand Up @@ -384,6 +393,14 @@ class KsckTest : public KuduTest {
table->set_tablets({ tablet });
}

// Builds a KsckTable for the transaction status table with 'num_replicas'
// replicas, installs it as the mock cluster's txn system table, and returns
// it. The well-known table name is passed as both of the leading string
// arguments, and the schema comes from TxnStatusTablet::GetSchema().
shared_ptr<KsckTable> CreateAndAddTxnStatusTable(int num_replicas) {
  shared_ptr<KsckTable> txn_table = make_shared<KsckTable>(
      TxnStatusTablet::kTxnStatusTableName,
      TxnStatusTablet::kTxnStatusTableName,
      TxnStatusTablet::GetSchema(),
      num_replicas);
  cluster_->txn_sys_table_ = txn_table;
  return txn_table;
}

shared_ptr<KsckTable> CreateAndAddTable(const string& id_and_name, int num_replicas) {
auto table(make_shared<KsckTable>(
id_and_name, id_and_name, Schema(), num_replicas));
Expand All @@ -396,11 +413,11 @@ class KsckTest : public KuduTest {
{
vector<shared_ptr<KsckTabletReplica>> replicas;
if (has_leader) {
CreateReplicaAndAdd(&replicas, tablet->id(), true, is_running);
NO_FATALS(CreateReplicaAndAdd(&replicas, tablet->id(), true, is_running));
num_replicas--;
}
for (int i = 0; i < num_replicas; i++) {
CreateReplicaAndAdd(&replicas, tablet->id(), false, is_running);
NO_FATALS(CreateReplicaAndAdd(&replicas, tablet->id(), false, is_running));
}
tablet->set_replicas(std::move(replicas));
}
Expand All @@ -419,7 +436,7 @@ class KsckTest : public KuduTest {
for (const auto& replica : tablet->replicas()) {
shared_ptr<MockKsckTabletServer> ts =
static_pointer_cast<MockKsckTabletServer>(cluster_->tablet_servers_.at(replica->ts_uuid()));
InsertOrDieNoPrint(&ts->tablet_consensus_state_map_,
InsertIfNotPresent(&ts->tablet_consensus_state_map_,
std::make_pair(replica->ts_uuid(), tablet->id()),
cstate);
}
Expand Down Expand Up @@ -856,27 +873,29 @@ void CheckPlainStringSection(const string& plain, const string& header, bool pre
}

// For each ksck output section, calls CheckPlainStringSection() with the
// plain-text output 'plain', that section's header line, and whether the
// section's bit is set in the 'sections' bitmask.
// NOTE(review): the third argument presumably tells the helper whether the
// header is expected to be present -- confirm against CheckPlainStringSection().
//
// NOTE(review): every section except the system table one is checked twice
// below, once with a bare call and once wrapped in NO_FATALS. This looks like
// the removed and added sides of a diff hunk rendered together -- confirm that
// only the NO_FATALS-wrapped calls exist in the real file.
void CheckPlainStringSections(const string& plain, int sections) {
CheckPlainStringSection(plain,
"Master Summary\n",
sections & PrintSections::Values::MASTER_SUMMARIES);
CheckPlainStringSection(plain,
"Tablet Server Summary\n",
sections & PrintSections::Values::TSERVER_SUMMARIES);
CheckPlainStringSection(plain,
"Version Summary\n",
sections & PrintSections::Values::VERSION_SUMMARIES);
CheckPlainStringSection(plain,
"Tablet Summary\n",
sections & PrintSections::Values::TABLET_SUMMARIES);
CheckPlainStringSection(plain,
"Summary by table\n",
sections & PrintSections::Values::TABLE_SUMMARIES);
CheckPlainStringSection(plain,
"Checksum Summary\n",
sections & PrintSections::Values::CHECKSUM_RESULTS);
CheckPlainStringSection(plain,
"Total Count Summary\n",
sections & PrintSections::Values::TOTAL_COUNT);
NO_FATALS(CheckPlainStringSection(plain,
"Master Summary\n",
sections & PrintSections::Values::MASTER_SUMMARIES));
NO_FATALS(CheckPlainStringSection(plain,
"Tablet Server Summary\n",
sections & PrintSections::Values::TSERVER_SUMMARIES));
NO_FATALS(CheckPlainStringSection(plain,
"Version Summary\n",
sections & PrintSections::Values::VERSION_SUMMARIES));
NO_FATALS(CheckPlainStringSection(plain,
"Tablet Summary\n",
sections & PrintSections::Values::TABLET_SUMMARIES));
NO_FATALS(CheckPlainStringSection(plain,
"Summary by table\n",
sections & PrintSections::Values::TABLE_SUMMARIES));
// Section for system tables such as the transaction status table.
NO_FATALS(CheckPlainStringSection(plain, "Summary by system table\n",
sections & PrintSections::Values::SYSTEM_TABLE_SUMMARIES));
NO_FATALS(CheckPlainStringSection(plain,
"Checksum Summary\n",
sections & PrintSections::Values::CHECKSUM_RESULTS));
NO_FATALS(CheckPlainStringSection(plain,
"Total Count Summary\n",
sections & PrintSections::Values::TOTAL_COUNT));
}

void CheckJsonStringVsKsckResults(const string& json,
Expand All @@ -885,53 +904,59 @@ void CheckJsonStringVsKsckResults(const string& json,
JsonReader r(json);
ASSERT_OK(r.Init());

CheckJsonVsServerHealthSummaries(
NO_FATALS(CheckJsonVsServerHealthSummaries(
r,
"master_summaries",
sections & PrintSections::Values::MASTER_SUMMARIES ?
boost::optional<vector<ServerHealthSummary>>
(results.cluster_status.master_summaries) : boost::none);
CheckJsonVsMasterConsensus(
(results.cluster_status.master_summaries) : boost::none));
NO_FATALS(CheckJsonVsMasterConsensus(
r,
results.cluster_status.master_consensus_conflict,
sections & PrintSections::Values::MASTER_SUMMARIES ?
boost::optional<ConsensusStateMap>
(results.cluster_status.master_consensus_state_map) : boost::none);
CheckJsonVsServerHealthSummaries(
(results.cluster_status.master_consensus_state_map) : boost::none));
NO_FATALS(CheckJsonVsServerHealthSummaries(
r,
"tserver_summaries",
sections & PrintSections::Values::TSERVER_SUMMARIES ?
boost::optional<vector<ServerHealthSummary>>
(results.cluster_status.tserver_summaries) : boost::none);
CheckJsonVsVersionSummaries(
(results.cluster_status.tserver_summaries) : boost::none));
NO_FATALS(CheckJsonVsVersionSummaries(
r,
"version_summaries",
sections & PrintSections::Values::VERSION_SUMMARIES ?
boost::optional<KsckVersionToServersMap>
(results.version_summaries) : boost::none);
CheckJsonVsTabletSummaries(
(results.version_summaries) : boost::none));
NO_FATALS(CheckJsonVsTabletSummaries(
r,
"tablet_summaries",
sections & PrintSections::Values::TABLET_SUMMARIES ?
boost::optional<vector<TabletSummary>>
(results.cluster_status.tablet_summaries) : boost::none);
CheckJsonVsTableSummaries(
(results.cluster_status.tablet_summaries) : boost::none));
NO_FATALS(CheckJsonVsTableSummaries(
r,
"table_summaries",
sections & PrintSections::Values::TABLE_SUMMARIES ?
boost::optional<vector<TableSummary>>
(results.cluster_status.table_summaries) : boost::none);
CheckJsonVsChecksumResults(
(results.cluster_status.table_summaries) : boost::none));
NO_FATALS(CheckJsonVsTableSummaries(
r,
"system_table_summaries",
sections & PrintSections::Values::SYSTEM_TABLE_SUMMARIES ?
boost::optional<vector<TableSummary>>
(results.cluster_status.system_table_summaries) : boost::none));
NO_FATALS(CheckJsonVsChecksumResults(
r,
"checksum_results",
sections & PrintSections::Values::CHECKSUM_RESULTS ?
boost::optional<KsckChecksumResults>(results.checksum_results) : boost::none);
CheckJsonVsCountSummaries(
boost::optional<KsckChecksumResults>(results.checksum_results) : boost::none));
NO_FATALS(CheckJsonVsCountSummaries(
r,
"count_summaries",
sections & PrintSections::Values::TOTAL_COUNT ?
boost::optional<KsckResults>(results) : boost::none);
CheckJsonVsErrors(r, "errors", results.error_messages);
boost::optional<KsckResults>(results) : boost::none));
NO_FATALS(CheckJsonVsErrors(r, "errors", results.error_messages));
}

void CheckMessageNotPresent(const vector<Status>& messages, const string& msg) {
Expand Down Expand Up @@ -1872,25 +1897,25 @@ TEST_F(KsckTest, TestSectionFilter) {
{PrintSections::Values::VERSION_SUMMARIES, "VERSION_SUMMARIES"},
{PrintSections::Values::TABLET_SUMMARIES, "TABLET_SUMMARIES"},
{PrintSections::Values::TABLE_SUMMARIES, "TABLE_SUMMARIES"},
{PrintSections::Values::SYSTEM_TABLE_SUMMARIES, "SYSTEM_TABLE_SUMMARIES"},
{PrintSections::Values::CHECKSUM_RESULTS, "CHECKSUM_RESULTS"},
{PrintSections::Values::TOTAL_COUNT, "TOTAL_COUNT"}};
CreateOneTableOneTablet();
for (const auto& section : sections) {
if (section.first == PrintSections::Values::CHECKSUM_RESULTS) {
NO_FATALS(CreateOneTableOneTablet(/*create_txn_status_table=*/true));
for (const auto& [s_enum, s_str] : sections) {
if (s_enum == PrintSections::Values::CHECKSUM_RESULTS) {
FLAGS_checksum_scan = true;
}
int selected_sections = section.first;
ksck_->set_print_sections({section.second});
ksck_->set_print_sections({s_str});
err_stream_.str("");
err_stream_.clear();
ASSERT_OK(RunKsck());

// Check plain string output.
CheckPlainStringSections(err_stream_.str(), selected_sections);
NO_FATALS(CheckPlainStringSections(err_stream_.str(), s_enum));

// Check json string output.
const string& json_output = KsckResultsToJsonString(selected_sections);
CheckJsonStringVsKsckResults(json_output, ksck_->results(), selected_sections);
const string& json_output = KsckResultsToJsonString(s_enum);
NO_FATALS(CheckJsonStringVsKsckResults(json_output, ksck_->results(), s_enum));
}
}

Expand Down
Loading

0 comments on commit bdb6d06

Please sign in to comment.