Skip to content

Commit

Permalink
[master] KUDU-3322 / KUDU-3319 HMS event logging
Browse files Browse the repository at this point in the history
It would be useful to have additional logging in cases where HMS
notification events have to be analyzed from Kudu side. The Jiras
have more context on the reasoning behind this but this patch adds
* an INFO level log which shows the last processed HMS event id during
startup and
* an ERROR log if the latest event id from HMS is less than
the last seen event id from the master. We were hesitant to crash the
master and chose to log an ERROR instead, since an out-of-order event
id received from HMS might otherwise lead to an unnecessary crash of
the Kudu master.

Change-Id: I242c9cedf170ed621867b905f1cb1875347d5887
Reviewed-on: http://gerrit.cloudera.org:8080/18866
Tested-by: Alexey Serbin <[email protected]>
Reviewed-by: Alexey Serbin <[email protected]>
  • Loading branch information
achennaka authored and alexeyserbin committed Sep 25, 2022
1 parent 0c71a07 commit e39411e
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 3 deletions.
6 changes: 6 additions & 0 deletions src/kudu/hms/hms_catalog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,12 @@ Status HmsCatalog::GetNotificationEvents(int64_t last_event_id, int max_events,
});
}

// Fetches the most recent notification event ID known to the HMS,
// routing the call through the HA-aware client wrapper so it survives
// fail-over between HMS instances.
Status HmsCatalog::GetCurrentNotificationEventId(int64_t* event_id) {
  const auto task = [event_id] (HmsClient* client) {
    return client->GetCurrentNotificationEventId(event_id);
  };
  return ha_client_.Execute(task);
}

Status HmsCatalog::GetUuid(string* uuid) {
std::lock_guard<simple_spinlock> l(uuid_lock_);
if (!uuid_) {
Expand Down
3 changes: 3 additions & 0 deletions src/kudu/hms/hms_catalog.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ class HmsCatalog {
Status GetNotificationEvents(int64_t last_event_id, int max_events,
std::vector<hive::NotificationEvent>* events) WARN_UNUSED_RESULT;

// Retrieves the latest notification event id from the HMS.
Status GetCurrentNotificationEventId(int64_t* event_id) WARN_UNUSED_RESULT;

// Get the UUID associated with the remote HMS instance. This is an identifier
// stored in the HMS's backing database which does not change even if the
// HMS itself changes hostnames, etc.
Expand Down
4 changes: 4 additions & 0 deletions src/kudu/hms/mini_hms.cc
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ Status MiniHms::Start() {
return wait;
}

// Recursively removes the mini-HMS's backing database directory.
// Used by tests to simulate accidental loss of the HMS metastore DB.
Status MiniHms::DeleteDatabaseDir() {
  const auto db_dir = JoinPathSegments(data_root_, metadb_subdir_);
  return Env::Default()->DeleteRecursively(db_dir);
}

Status MiniHms::Stop() {
if (hms_process_) {
VLOG(1) << "Stopping HMS";
Expand Down
3 changes: 3 additions & 0 deletions src/kudu/hms/mini_hms.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ class MiniHms {
// Unpause the Hive metastore process.
Status Resume() WARN_UNUSED_RESULT;

// Delete the HMS database directory.
Status DeleteDatabaseDir() WARN_UNUSED_RESULT;

// Returns the address of the Hive metastore. Should only be called after the
// metastore is started.
HostPort address() const {
Expand Down
47 changes: 46 additions & 1 deletion src/kudu/integration-tests/master_hms-itest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// under the License.

#include <algorithm>
#include <fstream>
#include <functional>
#include <initializer_list>
#include <map>
Expand Down Expand Up @@ -84,8 +85,8 @@ class MasterHmsTest : public ExternalMiniClusterITestBase {
opts.principal = Principal();
// Tune down the notification log poll period in order to speed up catalog convergence.
opts.extra_master_flags.emplace_back("--hive_metastore_notification_log_poll_period_seconds=1");
SetFlags(&opts);
StartClusterWithOpts(std::move(opts));

thrift::ClientOptions hms_opts;
hms_opts.enable_kerberos = EnableKerberos();
hms_opts.service_principal = "hive";
Expand Down Expand Up @@ -181,6 +182,8 @@ class MasterHmsTest : public ExternalMiniClusterITestBase {
// Principal name the cluster is configured with (see the fixture setup,
// which assigns this to opts.principal); subclasses may override.
virtual string Principal() {
return "kudu";
}

virtual void SetFlags(ExternalMiniClusterOptions* opts) const {}
};

TEST_F(MasterHmsTest, TestCreateTable) {
Expand Down Expand Up @@ -878,4 +881,46 @@ TEST_F(MasterHmsKerberizedTest, TestTableOwnership) {
ASSERT_OK(harness_.hms_client()->GetTable("default", "my_table", &table));
ASSERT_EQ("test-user", table.owner);
}

// Fixture that directs master logging to log files rather than stderr,
// so a test can scan the master's on-disk ERROR log for messages.
class MasterHmsDBTest : public MasterHmsTest {
 public:
  void SetFlags(ExternalMiniClusterOptions* opts) const override {
    opts->extra_master_flags.push_back("--logtostderr=false");
  }
};

// TODO(achennaka): Skip test until server timeouts seen in ASAN builds are resolved.
// TODO(achennaka): Skip test until server timeouts seen in ASAN builds are resolved.
TEST_F(MasterHmsDBTest, DISABLED_TestHMSDBErase) {
  // Generate 2 HMS events. With just one event created the published
  // event id is not 0 after the database drop.
  ASSERT_OK(CreateKuduTable("default", "a"));
  unique_ptr<KuduTableAlterer> table_alterer(
      client_->NewTableAlterer("default.a")->RenameTo("default.b"));
  ASSERT_OK(table_alterer->Alter());

  // The renamed table should be visible in the HMS database.
  NO_FATALS(CheckTable("default", "b", /*user=*/ nullopt));

  // Simulate accidental loss of the HMS backing database.
  ASSERT_OK(StopHms());
  ASSERT_OK(cluster_->hms()->DeleteDatabaseDir());
  ASSERT_OK(StartHms());

  // From now on, every poll of the HMS notification log should emit an
  // error message into the master's ERROR log file.
  constexpr const char* const kExpectedError =
      "The event ID 2 last seen by Kudu master is greater "
      "than 0 currently reported by HMS. Has the HMS database "
      "been reset (backup&restore, etc.)?";
  ASSERT_EVENTUALLY([&] {
    const string log_path = strings::Substitute(
        "$0/kudu-master.ERROR", cluster_->master()->log_dir());
    std::ifstream in(log_path);
    bool found = false;
    for (string line; std::getline(in, line);) {
      if (line.find(kExpectedError) != std::string::npos) {
        found = true;
        break;
      }
    }
    ASSERT_TRUE(found);
  });
}
} // namespace kudu
6 changes: 4 additions & 2 deletions src/kudu/master/catalog_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1475,8 +1475,8 @@ void CatalogManager::PrepareForLeadershipTask() {
"Loading latest processed Hive Metastore notification log event ID";
LOG(INFO) << kNotificationLogEventIdDescription << "...";
LOG_SLOW_EXECUTION(WARNING, 1000, LogPrefix() + kNotificationLogEventIdDescription) {
if (!check([this]() { return this->InitLatestNotificationLogEventId(); },
*consensus, term, kNotificationLogEventIdDescription).ok()) {
if (!check([this]() { return this->InitLatestNotificationLogEventId(); },
*consensus, term, kNotificationLogEventIdDescription).ok()) {
return;
}
}
Expand Down Expand Up @@ -5508,6 +5508,8 @@ Status CatalogManager::InitLatestNotificationLogEventId() {
int64_t hms_notification_log_event_id;
RETURN_NOT_OK(sys_catalog_->GetLatestNotificationLogEventId(&hms_notification_log_event_id));
hms_notification_log_event_id_ = hms_notification_log_event_id;
LOG(INFO) << "Last processed Hive Metastore notification event ID: "
<< hms_notification_log_event_id_;
return Status::OK();
}

Expand Down
16 changes: 16 additions & 0 deletions src/kudu/master/hms_notification_log_listener.cc
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,22 @@ Status HmsNotificationLogListenerTask::Poll() {
processed_event_id, batch_size, &events),
"failed to retrieve notification log events");

// If we do not receive any new events it could be because the HMS event ID in the Kudu
// master is higher than what is in the HMS database which causes Drop/Alter table
// commands to fail on Kudu side.
if (events.empty()) {
int64_t event_id;
RETURN_NOT_OK_PREPEND(catalog_manager_->hms_catalog()->
GetCurrentNotificationEventId(&event_id),
"failed to retrieve latest notification log event");
if (event_id < processed_event_id) {
LOG(ERROR) << Substitute("The event ID $0 last seen by Kudu master is greater "
"than $1 currently reported by HMS. Has the HMS database "
"been reset (backup&restore, etc.)?",
processed_event_id, event_id);
}
}

for (const auto& event : events) {
VLOG(1) << "Processing notification log event: " << EventDebugString(event);

Expand Down

0 comments on commit e39411e

Please sign in to comment.