diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index 39767f7563d3..64b3cb042dd4 100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -442,6 +442,14 @@ message Cluster { // to an upstream host that becomes unhealthy, Envoy may spend a substantial amount of // time exclusively closing these connections, and not processing any other traffic. bool close_connections_on_host_health_failure = 31; + + // If this cluster uses EDS or STRICT_DNS to configure its hosts, immediately drain + // connections from any hosts that are removed from service discovery. + // + // This only affects behavior for hosts that are being actively health checked. + // If this flag is not set to true, Envoy will wait until the hosts fail active health + // checking before removing it from the cluster. + bool drain_connections_on_host_removal = 32; } // An extensible structure containing the address Envoy should bind to when diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index cafbc593e2af..9c254b4a92a3 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -24,7 +24,8 @@ Version history * cli: added --config-yaml flag to the Envoy binary. When set its value is interpreted as a yaml representation of the bootstrap config and overrides --config-path. * cluster: Add :ref:`option ` - to close tcp_proxy upstream connections when health checks fail. +* cluster: Add :ref:`option ` to drain + connections from hosts after they are removed from service discovery, regardless of health status. * health check: added ability to set :ref:`additional HTTP headers ` for HTTP health check. * health check: added support for EDS delivered :ref:`endpoint health status diff --git a/include/envoy/upstream/upstream.h b/include/envoy/upstream/upstream.h index 59b78a29bc87..c9e30ea01986 100644 --- a/include/envoy/upstream/upstream.h +++ b/include/envoy/upstream/upstream.h @@ -528,6 +528,12 @@ class ClusterInfo { * connections for this cluster. */ virtual const Network::ConnectionSocket::OptionsSharedPtr& clusterSocketOptions() const PURE; + + /** + * @return whether to skip waiting for health checking before draining connections + * after a host is removed from service discovery. + */ + virtual bool drainConnectionsOnHostRemoval() const PURE; }; typedef std::shared_ptr ClusterInfoConstSharedPtr; diff --git a/source/common/upstream/eds.cc b/source/common/upstream/eds.cc index 4a66dfd2bcda..5a0c4592fcc2 100644 --- a/source/common/upstream/eds.cc +++ b/source/common/upstream/eds.cc @@ -129,8 +129,7 @@ bool EdsClusterImpl::updateHostsPerLocality(HostSet& host_set, const HostVector& // out of the locality scheduler, we discover their new weights. We don't currently have a shared // object for locality weights that we can update here, we should add something like this to // improve performance and scalability of locality weight updates. - if (updateDynamicHostList(new_hosts, *current_hosts_copy, hosts_added, hosts_removed, - health_checker_ != nullptr) || + if (updateDynamicHostList(new_hosts, *current_hosts_copy, hosts_added, hosts_removed) || locality_weights_map != new_locality_weights_map) { locality_weights_map = new_locality_weights_map; LocalityWeightsSharedPtr locality_weights; diff --git a/source/common/upstream/upstream_impl.cc b/source/common/upstream/upstream_impl.cc index 84dacdecf579..6922736f29f2 100644 --- a/source/common/upstream/upstream_impl.cc +++ b/source/common/upstream/upstream_impl.cc @@ -269,7 +269,8 @@ ClusterInfoImpl::ClusterInfoImpl(const envoy::api::v2::Cluster& config, ssl_context_manager_(ssl_context_manager), added_via_api_(added_via_api), lb_subset_(LoadBalancerSubsetInfoImpl(config.lb_subset_config())), metadata_(config.metadata()), common_lb_config_(config.common_lb_config()), - cluster_socket_options_(parseClusterSocketOptions(config, bind_config)) { + cluster_socket_options_(parseClusterSocketOptions(config, bind_config)), + drain_connections_on_host_removal_(config.drain_connections_on_host_removal()) { // If the cluster doesn't have a transport socket configured, override with the default transport // socket implementation based on the tls_context. We copy by value first then override if @@ -659,7 +660,7 @@ void StaticClusterImpl::startPreInit() { bool BaseDynamicClusterImpl::updateDynamicHostList(const HostVector& new_hosts, HostVector& current_hosts, HostVector& hosts_added, - HostVector& hosts_removed, bool depend_on_hc) { + HostVector& hosts_removed) { uint64_t max_host_weight = 1; // Has the EDS health status changed the health of any endpoint? If so, we // rebuild the hosts vectors. We only do this if the health status of an @@ -729,14 +730,16 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const HostVector& new_hosts, hosts_added.push_back(host); // If we are depending on a health checker, we initialize to unhealthy. - if (depend_on_hc) { + if (health_checker_ != nullptr) { hosts_added.back()->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC); } } } + const bool dont_remove_healthy_hosts = + health_checker_ != nullptr && !info()->drainConnectionsOnHostRemoval(); // If there are removed hosts, check to see if we should only delete if unhealthy. - if (!current_hosts.empty() && depend_on_hc) { + if (!current_hosts.empty() && dont_remove_healthy_hosts) { for (auto i = current_hosts.begin(); i != current_hosts.end();) { if (!(*i)->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC)) { if ((*i)->weight() > max_host_weight) { @@ -865,7 +868,7 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() { HostVector hosts_added; HostVector hosts_removed; - if (parent_.updateDynamicHostList(new_hosts, hosts_, hosts_added, hosts_removed, false)) { + if (parent_.updateDynamicHostList(new_hosts, hosts_, hosts_added, hosts_removed)) { ENVOY_LOG(debug, "DNS hosts have changed for {}", dns_address_); parent_.updateAllHosts(hosts_added, hosts_removed); } diff --git a/source/common/upstream/upstream_impl.h b/source/common/upstream/upstream_impl.h index 0fa6fdff2ede..df4088fd462a 100644 --- a/source/common/upstream/upstream_impl.h +++ b/source/common/upstream/upstream_impl.h @@ -360,6 +360,8 @@ class ClusterInfoImpl : public ClusterInfo, return cluster_socket_options_; }; + bool drainConnectionsOnHostRemoval() const override { return drain_connections_on_host_removal_; } + private: struct ResourceManagers { ResourceManagers(const envoy::api::v2::Cluster& config, Runtime::Loader& runtime, @@ -398,6 +400,7 @@ class ClusterInfoImpl : public ClusterInfo, const envoy::api::v2::core::Metadata metadata_; const envoy::api::v2::Cluster::CommonLbConfig common_lb_config_; const Network::ConnectionSocket::OptionsSharedPtr cluster_socket_options_; + const bool drain_connections_on_host_removal_; }; /** @@ -515,7 +518,7 @@ class BaseDynamicClusterImpl : public ClusterImplBase { using ClusterImplBase::ClusterImplBase; bool updateDynamicHostList(const HostVector& new_hosts, HostVector& current_hosts, - HostVector& hosts_added, HostVector& hosts_removed, bool depend_on_hc); + HostVector& hosts_added, HostVector& hosts_removed); }; /** diff --git a/test/common/upstream/eds_test.cc b/test/common/upstream/eds_test.cc index 2818ad23b43e..1af86bb5669c 100644 --- a/test/common/upstream/eds_test.cc +++ b/test/common/upstream/eds_test.cc @@ -1,3 +1,5 @@ +#include + #include "envoy/api/v2/eds.pb.h" #include "common/config/utility.h" @@ -15,6 +17,7 @@ using testing::Return; using testing::ReturnRef; +using testing::_; namespace Envoy { namespace Upstream { @@ -302,6 +305,72 @@ TEST_F(EdsTest, EndpointHealthStatus) { } } +// Validate that onConfigUpdate() removes endpoints that are marked as healthy +// when configured to do so. +TEST_F(EdsTest, EndpointRemoval) { + resetCluster(R"EOF( + name: name + connect_timeout: 0.25s + type: EDS + lb_policy: ROUND_ROBIN + drain_connections_on_host_removal: true + eds_cluster_config: + service_name: fare + eds_config: + api_config_source: + cluster_names: + - eds + refresh_delay: 1s + )EOF"); + + auto health_checker = std::make_shared(); + EXPECT_CALL(*health_checker, start()); + EXPECT_CALL(*health_checker, addHostCheckCompleteCb(_)).Times(2); + cluster_->setHealthChecker(health_checker); + + Protobuf::RepeatedPtrField resources; + auto* cluster_load_assignment = resources.Add(); + cluster_load_assignment->set_cluster_name("fare"); + + auto add_endpoint = [cluster_load_assignment](int port) { + auto* endpoints = cluster_load_assignment->add_endpoints(); + + auto* socket_address = endpoints->add_lb_endpoints() + ->mutable_endpoint() + ->mutable_address() + ->mutable_socket_address(); + socket_address->set_address("1.2.3.4"); + socket_address->set_port_value(port); + }; + + add_endpoint(80); + add_endpoint(81); + + VERBOSE_EXPECT_NO_THROW(cluster_->onConfigUpdate(resources)); + + { + auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts(); + EXPECT_EQ(hosts.size(), 2); + + EXPECT_TRUE(hosts[0]->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC)); + EXPECT_TRUE(hosts[1]->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC)); + // Mark the hosts as healthy + hosts[0]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC); + hosts[1]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC); + } + + // Remove endpoints and add back the port 80 one + cluster_load_assignment->clear_endpoints(); + add_endpoint(80); + + VERBOSE_EXPECT_NO_THROW(cluster_->onConfigUpdate(resources)); + + { + auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts(); + EXPECT_EQ(hosts.size(), 1); + } +} + // Validate that onConfigUpdate() updates the endpoint locality. TEST_F(EdsTest, EndpointLocality) { Protobuf::RepeatedPtrField resources; diff --git a/test/common/upstream/upstream_impl_test.cc b/test/common/upstream/upstream_impl_test.cc index 5e461cd84bb0..bde6efe7b5c1 100644 --- a/test/common/upstream/upstream_impl_test.cc +++ b/test/common/upstream/upstream_impl_test.cc @@ -307,6 +307,58 @@ TEST(StrictDnsClusterImplTest, Basic) { EXPECT_CALL(resolver2.active_dns_query_, cancel()); } +// Verifies that host removal works correctly when hosts are being health checked +// but the cluster is configured to always remove hosts +TEST(StrictDnsClusterImplTest, HostRemovalActiveHealthSkipped) { + Stats::IsolatedStoreImpl stats; + Ssl::MockContextManager ssl_context_manager; + auto dns_resolver = std::make_shared(); + NiceMock dispatcher; + NiceMock runtime; + NiceMock cm; + + const std::string yaml = R"EOF( + name: name + connect_timeout: 0.25s + type: STRICT_DNS + lb_policy: ROUND_ROBIN + drain_connections_on_host_removal: true + hosts: [{ socket_address: { address: foo.bar.com, port_value: 443 }}] + )EOF"; + + ResolverData resolver(*dns_resolver, dispatcher); + StrictDnsClusterImpl cluster(parseClusterFromV2Yaml(yaml), runtime, stats, ssl_context_manager, + dns_resolver, cm, dispatcher, false); + std::shared_ptr health_checker(new MockHealthChecker()); + EXPECT_CALL(*health_checker, start()); + EXPECT_CALL(*health_checker, addHostCheckCompleteCb(_)); + cluster.setHealthChecker(health_checker); + cluster.initialize([&]() -> void {}); + + EXPECT_CALL(*health_checker, addHostCheckCompleteCb(_)); + EXPECT_CALL(*resolver.timer_, enableTimer(_)).Times(2); + resolver.dns_callback_(TestUtility::makeDnsResponse({"127.0.0.1", "127.0.0.2"})); + + // Verify that both endpoints are initially marked with FAILED_ACTIVE_HC, then + // clear the flag to simulate that these endpoints have been sucessfully health + // checked. + { + const auto& hosts = cluster.prioritySet().hostSetsPerPriority()[0]->hosts(); + EXPECT_EQ(2UL, hosts.size()); + + for (size_t i = 0; i < hosts.size(); ++i) { + EXPECT_TRUE(hosts[i]->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC)); + hosts[i]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC); + } + } + + // Re-resolve the DNS name with only one record + resolver.dns_callback_(TestUtility::makeDnsResponse({"127.0.0.1"})); + + const auto& hosts = cluster.prioritySet().hostSetsPerPriority()[0]->hosts(); + EXPECT_EQ(1UL, hosts.size()); +} + TEST(HostImplTest, HostCluster) { MockCluster cluster; HostSharedPtr host = makeTestHost(cluster.info_, "tcp://10.0.0.1:1234", 1); diff --git a/test/mocks/upstream/cluster_info.h b/test/mocks/upstream/cluster_info.h index 5a6017ce5852..8c34157daf4d 100644 --- a/test/mocks/upstream/cluster_info.h +++ b/test/mocks/upstream/cluster_info.h @@ -63,6 +63,7 @@ class MockClusterInfo : public ClusterInfo { MOCK_CONST_METHOD0(lbSubsetInfo, const LoadBalancerSubsetInfo&()); MOCK_CONST_METHOD0(metadata, const envoy::api::v2::core::Metadata&()); MOCK_CONST_METHOD0(clusterSocketOptions, const Network::ConnectionSocket::OptionsSharedPtr&()); + MOCK_CONST_METHOD0(drainConnectionsOnHostRemoval, bool()); std::string name_{"fake_cluster"}; Http::Http2Settings http2_settings_{};