Skip to content

Commit

Permalink
[Event] Add some error level events (ray-project#18118)
Browse files Browse the repository at this point in the history
* add event 'RAY_WORKER_FAILURE' and 'RAY_DRIVER_FAILURE'

* add some events

* move event 'EL_RAY_NODE_REMOVED' to 'RemoveNode()'
  • Loading branch information
SongGuyang authored Aug 31, 2021
1 parent 82465f9 commit be772df
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 4 deletions.
6 changes: 6 additions & 0 deletions src/ray/gcs/gcs_server/gcs_node_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include "ray/common/ray_config.h"
#include "ray/gcs/pb_util.h"
#include "ray/stats/stats.h"
#include "ray/util/event.h"
#include "ray/util/event_label.h"
#include "src/ray/protobuf/gcs.pb.h"

namespace ray {
Expand Down Expand Up @@ -156,6 +158,10 @@ std::shared_ptr<rpc::GcsNodeInfo> GcsNodeManager::RemoveNode(
RAY_LOG(INFO) << "Publish RemoveNode, msg=" << error_message.str();
auto error_data_ptr =
gcs::CreateErrorTableData(type, error_message.str(), current_time_ms());
RAY_EVENT(ERROR, EL_RAY_NODE_REMOVED)
.WithField("node_id", node_id.Hex())
.WithField("ip", removed_node->node_manager_address())
<< error_message.str();
RAY_CHECK_OK(gcs_pub_sub_->Publish(ERROR_INFO_CHANNEL, node_id.Hex(),
error_data_ptr->SerializeAsString(), nullptr));
}
Expand Down
22 changes: 18 additions & 4 deletions src/ray/raylet/agent_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include <thread>

#include "ray/common/ray_config.h"
#include "ray/util/event.h"
#include "ray/util/event_label.h"
#include "ray/util/logging.h"
#include "ray/util/process.h"

Expand Down Expand Up @@ -81,8 +83,14 @@ void AgentManager::StartAgent() {
auto timer = delay_executor_(
[this, child]() mutable {
if (agent_pid_ != child.GetId()) {
RAY_LOG(WARNING) << "Agent process with pid " << child.GetId()
<< " has not registered, restart it.";
std::ostringstream error_message;
error_message << "Agent process with pid " << child.GetId()
<< " has not registered, restart it.";
RAY_LOG(WARNING) << error_message.str();
RAY_EVENT(ERROR, EL_RAY_AGENT_NOT_REGISTERED)
.WithField("ip", agent_ip_address_)
.WithField("pid", agent_pid_)
<< error_message.str();
child.Kill();
}
},
Expand All @@ -91,8 +99,14 @@ void AgentManager::StartAgent() {
int exit_code = child.Wait();
timer->cancel();

RAY_LOG(WARNING) << "Agent process with pid " << child.GetId()
<< " exit, return value " << exit_code;
std::ostringstream error_message;
error_message << "Agent process with pid " << child.GetId() << " exit, return value "
<< exit_code;
RAY_LOG(WARNING) << error_message.str();
RAY_EVENT(ERROR, EL_RAY_AGENT_EXIT)
.WithField("ip", agent_ip_address_)
.WithField("pid", agent_pid_)
<< error_message.str();
RAY_UNUSED(delay_executor_([this] { StartAgent(); },
RayConfig::instance().agent_restart_interval_ms()));
});
Expand Down
15 changes: 15 additions & 0 deletions src/ray/raylet/node_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include "ray/gcs/pb_util.h"
#include "ray/raylet/format/node_manager_generated.h"
#include "ray/stats/stats.h"
#include "ray/util/event.h"
#include "ray/util/event_label.h"
#include "ray/util/sample.h"
#include "ray/util/util.h"

Expand Down Expand Up @@ -1236,6 +1238,11 @@ void NodeManager::DisconnectClient(
<< " Worker PID: " << worker->GetProcess().GetId();
std::string error_message_str = error_message.str();
RAY_LOG(INFO) << error_message_str;
RAY_EVENT(ERROR, EL_RAY_WORKER_FAILURE)
.WithField("worker_id", worker->WorkerId().Hex())
.WithField("node_id", self_node_id_.Hex())
.WithField("job_id", worker->GetAssignedJobId().Hex())
<< error_message_str;
auto error_data_ptr =
gcs::CreateErrorTableData(type, error_message_str, current_time_ms(), job_id);
RAY_CHECK_OK(gcs_client_->Errors().AsyncReportJobError(error_data_ptr, nullptr));
Expand All @@ -1260,6 +1267,14 @@ void NodeManager::DisconnectClient(
RAY_LOG(INFO) << "Driver (pid=" << worker->GetProcess().GetId()
<< ") is disconnected. "
<< "job_id: " << worker->GetAssignedJobId();
if (disconnect_type == rpc::WorkerExitType::SYSTEM_ERROR_EXIT) {
RAY_EVENT(ERROR, EL_RAY_DRIVER_FAILURE)
.WithField("node_id", self_node_id_.Hex())
.WithField("job_id", worker->GetAssignedJobId().Hex())
<< "Driver " << worker->WorkerId() << " died. Address: " << worker->IpAddress()
<< ":" << worker->Port() << ", Pid: " << worker->GetProcess().GetId()
<< ", JobId: " << worker->GetAssignedJobId();
}
}

client->Close();
Expand Down
10 changes: 10 additions & 0 deletions src/ray/util/event_label.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,14 @@ namespace ray {

#define EL_RAY_FATAL_CHECK_FAILED "RAY_FATAL_CHECK_FAILED"

#define EL_RAY_WORKER_FAILURE "RAY_WORKER_FAILURE"

#define EL_RAY_DRIVER_FAILURE "RAY_DRIVER_FAILURE"

#define EL_RAY_AGENT_EXIT "RAY_AGENT_EXIT"

#define EL_RAY_AGENT_NOT_REGISTERED "RAY_AGENT_NOT_REGISTERED"

#define EL_RAY_NODE_REMOVED "RAY_NODE_REMOVED"

} // namespace ray

0 comments on commit be772df

Please sign in to comment.