forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[core] add retriable fifo policy (ray-project#33514)
For long-living, memory-leaking actors, it is more desirable to kill the oldest task, which is leaking the most. This avoids the situation where we constantly kill actors, which may lead to side effects such as generating a lot of log files or triggering increased memory consumption in the GCS / dashboard. This fixes the test failure introduced in ray-project#33430 by adding a sleep to give the memory monitor time to kick in, and by increasing the memory limit, since the node may be using a lot of memory in the first place.
- Loading branch information
Showing
8 changed files
with
285 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
// Copyright 2022 The Ray Authors. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "ray/raylet/worker_killing_policy_retriable_fifo.h" | ||
|
||
#include <gtest/gtest_prod.h> | ||
|
||
#include <boost/container_hash/hash.hpp> | ||
#include <unordered_map> | ||
|
||
#include "absl/container/flat_hash_map.h" | ||
#include "absl/time/time.h" | ||
#include "ray/common/asio/instrumented_io_context.h" | ||
#include "ray/common/asio/periodical_runner.h" | ||
#include "ray/raylet/worker.h" | ||
#include "ray/raylet/worker_killing_policy.h" | ||
#include "ray/raylet/worker_pool.h" | ||
|
||
namespace ray { | ||
|
||
namespace raylet { | ||
|
||
/// Constructs the policy. The constructor performs no initialization of its own.
RetriableFIFOWorkerKillingPolicy::RetriableFIFOWorkerKillingPolicy() = default;
|
||
const std::pair<std::shared_ptr<WorkerInterface>, bool> | ||
RetriableFIFOWorkerKillingPolicy::SelectWorkerToKill( | ||
const std::vector<std::shared_ptr<WorkerInterface>> &workers, | ||
const MemorySnapshot &system_memory) const { | ||
if (workers.empty()) { | ||
RAY_LOG_EVERY_MS(INFO, 5000) << "Worker list is empty. Nothing can be killed"; | ||
return std::make_pair(nullptr, /*should retry*/ false); | ||
} | ||
|
||
std::vector<std::shared_ptr<WorkerInterface>> sorted = workers; | ||
|
||
std::sort(sorted.begin(), | ||
sorted.end(), | ||
[](std::shared_ptr<WorkerInterface> const &left, | ||
std::shared_ptr<WorkerInterface> const &right) -> bool { | ||
// First sort by retriable tasks and then by task time in ascending order. | ||
int left_retriable = | ||
left->GetAssignedTask().GetTaskSpecification().IsRetriable() ? 0 : 1; | ||
int right_retriable = | ||
right->GetAssignedTask().GetTaskSpecification().IsRetriable() ? 0 : 1; | ||
if (left_retriable == right_retriable) { | ||
return left->GetAssignedTaskTime() < right->GetAssignedTaskTime(); | ||
} | ||
return left_retriable < right_retriable; | ||
}); | ||
|
||
const static int32_t max_to_print = 10; | ||
RAY_LOG(INFO) << "The top 10 workers to be killed based on the worker killing policy:\n" | ||
<< WorkerKillingPolicy::WorkersDebugString( | ||
sorted, max_to_print, system_memory); | ||
|
||
return std::make_pair(sorted.front(), /*should retry*/ true); | ||
} | ||
|
||
} // namespace raylet | ||
|
||
} // namespace ray |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
// Copyright 2022 The Ray Authors. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#pragma once | ||
|
||
#include <gtest/gtest_prod.h> | ||
|
||
#include "absl/container/flat_hash_set.h" | ||
#include "absl/time/clock.h" | ||
#include "absl/time/time.h" | ||
#include "ray/common/memory_monitor.h" | ||
#include "ray/raylet/worker.h" | ||
#include "ray/raylet/worker_killing_policy.h" | ||
|
||
namespace ray { | ||
|
||
namespace raylet { | ||
|
||
/// Prefers killing retriable workers over non-retriable ones, then in FIFO order. | ||
class RetriableFIFOWorkerKillingPolicy : public WorkerKillingPolicy { | ||
public: | ||
RetriableFIFOWorkerKillingPolicy(); | ||
const std::pair<std::shared_ptr<WorkerInterface>, bool> SelectWorkerToKill( | ||
const std::vector<std::shared_ptr<WorkerInterface>> &workers, | ||
const MemorySnapshot &system_memory) const; | ||
}; | ||
|
||
} // namespace raylet | ||
|
||
} // namespace ray |
108 changes: 108 additions & 0 deletions
108
src/ray/raylet/worker_killing_policy_retriable_fifo_test.cc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
// Copyright 2022 The Ray Authors. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "ray/raylet/worker_killing_policy_retriable_fifo.h" | ||
|
||
#include "gtest/gtest.h" | ||
#include "ray/common/task/task_spec.h" | ||
#include "ray/raylet/test/util.h" | ||
#include "ray/raylet/worker_killing_policy.h" | ||
|
||
namespace ray { | ||
|
||
namespace raylet { | ||
|
||
class WorkerKillerTest : public ::testing::Test { | ||
protected: | ||
int32_t port_ = 2389; | ||
RetriableFIFOWorkerKillingPolicy worker_killing_policy_; | ||
|
||
std::shared_ptr<WorkerInterface> CreateActorCreationWorker(int32_t max_restarts) { | ||
rpc::TaskSpec message; | ||
message.mutable_actor_creation_task_spec()->set_max_actor_restarts(max_restarts); | ||
message.set_type(ray::rpc::TaskType::ACTOR_CREATION_TASK); | ||
TaskSpecification task_spec(message); | ||
RayTask task(task_spec); | ||
auto worker = std::make_shared<MockWorker>(ray::WorkerID::FromRandom(), port_); | ||
worker->SetAssignedTask(task); | ||
return worker; | ||
} | ||
|
||
std::shared_ptr<WorkerInterface> CreateTaskWorker(int32_t max_retries) { | ||
rpc::TaskSpec message; | ||
message.set_max_retries(max_retries); | ||
message.set_type(ray::rpc::TaskType::NORMAL_TASK); | ||
TaskSpecification task_spec(message); | ||
RayTask task(task_spec); | ||
auto worker = std::make_shared<MockWorker>(ray::WorkerID::FromRandom(), port_); | ||
worker->SetAssignedTask(task); | ||
return worker; | ||
} | ||
}; | ||
|
||
// With no candidate workers the policy must not select anything.
TEST_F(WorkerKillerTest, TestEmptyWorkerPoolSelectsNullWorker) {
  const std::vector<std::shared_ptr<WorkerInterface>> no_workers;
  const auto selection =
      worker_killing_policy_.SelectWorkerToKill(no_workers, MemorySnapshot());
  ASSERT_TRUE(selection.first == nullptr);
}
|
||
// Repeatedly asks the policy for a victim, removing each selected worker from the
// pool, and checks the resulting kill order.
TEST_F(WorkerKillerTest,
       TestPreferRetriableOverNonRetriableAndOrderByTimestampAscending) {
  // NOTE(review): the expected order depends on the workers being created in this
  // sequence; presumably the mock worker records the assignment time when the task
  // is assigned -- confirm against ray/raylet/test/util.h.
  auto first_submitted =
      WorkerKillerTest::CreateActorCreationWorker(0 /* max_restarts */);
  auto second_submitted =
      WorkerKillerTest::CreateActorCreationWorker(5 /* max_restarts */);
  auto third_submitted = WorkerKillerTest::CreateTaskWorker(0 /* max_retries */);
  auto fourth_submitted = WorkerKillerTest::CreateTaskWorker(11 /* max_retries */);

  std::vector<std::shared_ptr<WorkerInterface>> workers = {
      first_submitted, second_submitted, third_submitted, fourth_submitted};

  // Workers created with a non-zero restart/retry budget are expected first, in
  // creation order, followed by the ones created with a zero budget, in creation order.
  const std::vector<std::shared_ptr<WorkerInterface>> expected_kill_order = {
      second_submitted, fourth_submitted, first_submitted, third_submitted};

  MemorySnapshot memory_snapshot;
  for (const auto &expected : expected_kill_order) {
    const auto worker_to_kill =
        worker_killing_policy_.SelectWorkerToKill(workers, memory_snapshot).first;
    // Fail the test cleanly instead of dereferencing a null pointer below.
    ASSERT_TRUE(worker_to_kill != nullptr);
    ASSERT_EQ(worker_to_kill->WorkerId(), expected->WorkerId());
    workers.erase(std::remove(workers.begin(), workers.end(), worker_to_kill),
                  workers.end());
  }
}
|
||
} // namespace raylet | ||
|
||
} // namespace ray | ||
|
||
int main(int argc, char **argv) { | ||
::testing::InitGoogleTest(&argc, argv); | ||
return RUN_ALL_TESTS(); | ||
} |