Skip to content

Commit

Permalink
Adds performance tuning capability for Raft, detuned defaults, and su…
Browse files Browse the repository at this point in the history
…pplemental docs.
  • Loading branch information
slackpad committed Aug 25, 2016
1 parent 36dc920 commit 57db4bc
Show file tree
Hide file tree
Showing 13 changed files with 263 additions and 27 deletions.
5 changes: 5 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,11 @@ func (a *Agent) consulConfig() *consul.Config {
// Apply dev mode
base.DevMode = a.config.DevMode

// Apply performance factors
if a.config.Performance.RaftMultiplier > 0 {
base.ScaleRaft(a.config.Performance.RaftMultiplier)
}

// Override with our config
if a.config.Datacenter != "" {
base.Datacenter = a.config.Datacenter
Expand Down
41 changes: 41 additions & 0 deletions command/agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/hashicorp/consul/consul"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"github.com/hashicorp/raft"
)

const (
Expand Down Expand Up @@ -191,6 +192,46 @@ func TestAgent_CheckAdvertiseAddrsSettings(t *testing.T) {
}
}

func TestAgent_CheckPerformanceSettings(t *testing.T) {
// Try a default config.
{
c := nextConfig()
c.ConsulConfig = nil
dir, agent := makeAgent(t, c)
defer os.RemoveAll(dir)
defer agent.Shutdown()

raftMult := time.Duration(consul.DefaultRaftMultiplier)
r := agent.consulConfig().RaftConfig
def := raft.DefaultConfig()
if r.HeartbeatTimeout != raftMult*def.HeartbeatTimeout ||
r.ElectionTimeout != raftMult*def.ElectionTimeout ||
r.CommitTimeout != raftMult*def.CommitTimeout ||
r.LeaderLeaseTimeout != raftMult*def.LeaderLeaseTimeout {
t.Fatalf("bad: %#v", *r)
}
}

// Try a multiplier.
{
c := nextConfig()
c.Performance.RaftMultiplier = 99
dir, agent := makeAgent(t, c)
defer os.RemoveAll(dir)
defer agent.Shutdown()

const raftMult time.Duration = 99
r := agent.consulConfig().RaftConfig
def := raft.DefaultConfig()
if r.HeartbeatTimeout != raftMult*def.HeartbeatTimeout ||
r.ElectionTimeout != raftMult*def.ElectionTimeout ||
r.CommitTimeout != raftMult*def.CommitTimeout ||
r.LeaderLeaseTimeout != raftMult*def.LeaderLeaseTimeout {
t.Fatalf("bad: %#v", *r)
}
}
}

func TestAgent_ReconnectConfigSettings(t *testing.T) {
c := nextConfig()
func() {
Expand Down
18 changes: 17 additions & 1 deletion command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,14 @@ type DNSConfig struct {
DisableCompression bool `mapstructure:"disable_compression"`
}

// Performance is used to tune the performance of Consul's subsystems. It is
// decoded from the nested "performance" object of the agent configuration.
type Performance struct {
// RaftMultiplier is an integer multiplier used to scale Raft timing
// parameters: HeartbeatTimeout, ElectionTimeout, CommitTimeout, and
// LeaderLeaseTimeout.
//
// The zero value means "not set": the agent only applies the multiplier
// (and MergeConfig only propagates it) when it is greater than zero.
RaftMultiplier uint `mapstructure:"raft_multiplier"`
}

// Telemetry is the telemetry configuration for the server
type Telemetry struct {
// StatsiteAddr is the address of a statsite instance. If provided,
Expand Down Expand Up @@ -205,10 +213,13 @@ type Telemetry struct {
// Some of this is configurable as CLI flags, but most must
// be set using a configuration file.
type Config struct {
// DevMode enables a fast-path mode of opertaion to bring up an in-memory
// DevMode enables a fast-path mode of operation to bring up an in-memory
// server with minimal configuration. Useful for developing Consul.
DevMode bool `mapstructure:"-"`

// Performance is used to tune the performance of Consul's subsystems.
Performance Performance `mapstructure:"performance"`

// Bootstrap is used to bring up the first Consul server, and
// permits that node to elect itself leader
Bootstrap bool `mapstructure:"bootstrap"`
Expand Down Expand Up @@ -1085,6 +1096,11 @@ func DecodeCheckDefinition(raw interface{}) (*CheckDefinition, error) {
func MergeConfig(a, b *Config) *Config {
var result Config = *a

// Propagate non-default performance settings
if b.Performance.RaftMultiplier > 0 {
result.Performance.RaftMultiplier = b.Performance.RaftMultiplier
}

// Copy the strings if they're set
if b.Bootstrap {
result.Bootstrap = true
Expand Down
14 changes: 14 additions & 0 deletions command/agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -957,6 +957,17 @@ func TestDecodeConfig_invalidKeys(t *testing.T) {
}
}

// TestDecodeConfig_Performance checks that a raft_multiplier nested under the
// "performance" key is decoded into Config.Performance.RaftMultiplier.
func TestDecodeConfig_Performance(t *testing.T) {
	const raw = `{"performance": { "raft_multiplier": 3 }}`

	cfg, err := DecodeConfig(bytes.NewReader([]byte(raw)))
	if err != nil {
		t.Fatalf("err: %s", err)
	}

	if got := cfg.Performance.RaftMultiplier; got != 3 {
		t.Fatalf("bad: multiplier isn't set: %#v", cfg)
	}
}

func TestDecodeConfig_Services(t *testing.T) {
input := `{
"services": [
Expand Down Expand Up @@ -1382,6 +1393,9 @@ func TestMergeConfig(t *testing.T) {
}

b := &Config{
Performance: Performance{
RaftMultiplier: 99,
},
Bootstrap: true,
BootstrapExpect: 3,
Datacenter: "dc2",
Expand Down
19 changes: 19 additions & 0 deletions consul/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ const (
DefaultDC = "dc1"
DefaultLANSerfPort = 8301
DefaultWANSerfPort = 8302

// See docs/guides/performance.html for information on how this value
// was obtained.
DefaultRaftMultiplier uint = 5
)

var (
Expand Down Expand Up @@ -333,13 +337,28 @@ func DefaultConfig() *Config {
// Enable interoperability with unversioned Raft library, and don't
// start using new ID-based features yet.
conf.RaftConfig.ProtocolVersion = 1
conf.ScaleRaft(DefaultRaftMultiplier)

// Disable shutdown on removal
conf.RaftConfig.ShutdownOnRemove = false

return conf
}

// ScaleRaft sets the config to have Raft timing parameters scaled by the given
// performance multiplier. Each timeout is computed from the Raft library's
// default configuration rather than from the value currently stored in c, so
// the call is idempotent and can safely be repeated when composing
// configurations.
//
// A multiplier of zero is treated as "not set" and leaves the current timing
// untouched; multiplying by zero would otherwise collapse HeartbeatTimeout,
// ElectionTimeout, CommitTimeout, and LeaderLeaseTimeout to zero.
func (c *Config) ScaleRaft(raftMultRaw uint) {
	if raftMultRaw == 0 {
		return
	}
	raftMult := time.Duration(raftMultRaw)

	def := raft.DefaultConfig()
	c.RaftConfig.HeartbeatTimeout = raftMult * def.HeartbeatTimeout
	c.RaftConfig.ElectionTimeout = raftMult * def.ElectionTimeout
	c.RaftConfig.CommitTimeout = raftMult * def.CommitTimeout
	c.RaftConfig.LeaderLeaseTimeout = raftMult * def.LeaderLeaseTimeout
}

func (c *Config) tlsConfig() *tlsutil.Config {
tlsConf := &tlsutil.Config{
VerifyIncoming: c.VerifyIncoming,
Expand Down
11 changes: 6 additions & 5 deletions consul/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -820,11 +820,12 @@ func (s *Server) Stats() map[string]map[string]string {
s.remoteLock.RUnlock()
stats := map[string]map[string]string{
"consul": map[string]string{
"server": "true",
"leader": fmt.Sprintf("%v", s.IsLeader()),
"leader_addr": string(s.raft.Leader()),
"bootstrap": fmt.Sprintf("%v", s.config.Bootstrap),
"known_datacenters": toString(uint64(numKnownDCs)),
"server": "true",
"leader": fmt.Sprintf("%v", s.IsLeader()),
"leader_addr": string(s.raft.Leader()),
"bootstrap": fmt.Sprintf("%v", s.config.Bootstrap),
"known_datacenters": toString(uint64(numKnownDCs)),
"leader_lease_timeout": fmt.Sprintf("%v", s.config.RaftConfig.LeaderLeaseTimeout),
},
"raft": s.raft.Stats(),
"serf_lan": s.serfLAN.Stats(),
Expand Down
47 changes: 28 additions & 19 deletions testutil/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ import (
// offset is used to atomically increment the port numbers.
var offset uint64

// TestPerformanceConfig configures the performance parameters. It is
// serialized as the nested "performance" object of TestServerConfig.
type TestPerformanceConfig struct {
// RaftMultiplier is written under the "raft_multiplier" key and is
// omitted from the JSON output when it is zero.
RaftMultiplier uint `json:"raft_multiplier,omitempty"`
}

// TestPortConfig configures the various ports used for services
// provided by the Consul server.
type TestPortConfig struct {
Expand All @@ -51,20 +56,21 @@ type TestAddressConfig struct {

// TestServerConfig is the main server configuration struct.
type TestServerConfig struct {
NodeName string `json:"node_name"`
Bootstrap bool `json:"bootstrap,omitempty"`
Server bool `json:"server,omitempty"`
DataDir string `json:"data_dir,omitempty"`
Datacenter string `json:"datacenter,omitempty"`
DisableCheckpoint bool `json:"disable_update_check"`
LogLevel string `json:"log_level,omitempty"`
Bind string `json:"bind_addr,omitempty"`
Addresses *TestAddressConfig `json:"addresses,omitempty"`
Ports *TestPortConfig `json:"ports,omitempty"`
ACLMasterToken string `json:"acl_master_token,omitempty"`
ACLDatacenter string `json:"acl_datacenter,omitempty"`
ACLDefaultPolicy string `json:"acl_default_policy,omitempty"`
Stdout, Stderr io.Writer `json:"-"`
NodeName string `json:"node_name"`
Performance *TestPerformanceConfig `json:"performance,omitempty"`
Bootstrap bool `json:"bootstrap,omitempty"`
Server bool `json:"server,omitempty"`
DataDir string `json:"data_dir,omitempty"`
Datacenter string `json:"datacenter,omitempty"`
DisableCheckpoint bool `json:"disable_update_check"`
LogLevel string `json:"log_level,omitempty"`
Bind string `json:"bind_addr,omitempty"`
Addresses *TestAddressConfig `json:"addresses,omitempty"`
Ports *TestPortConfig `json:"ports,omitempty"`
ACLMasterToken string `json:"acl_master_token,omitempty"`
ACLDatacenter string `json:"acl_datacenter,omitempty"`
ACLDefaultPolicy string `json:"acl_default_policy,omitempty"`
Stdout, Stderr io.Writer `json:"-"`
}

// ServerConfigCallback is a function interface which can be
Expand All @@ -79,11 +85,14 @@ func defaultServerConfig() *TestServerConfig {
return &TestServerConfig{
NodeName: fmt.Sprintf("node%d", idx),
DisableCheckpoint: true,
Bootstrap: true,
Server: true,
LogLevel: "debug",
Bind: "127.0.0.1",
Addresses: &TestAddressConfig{},
Performance: &TestPerformanceConfig{
RaftMultiplier: 1,
},
Bootstrap: true,
Server: true,
LogLevel: "debug",
Bind: "127.0.0.1",
Addresses: &TestAddressConfig{},
Ports: &TestPortConfig{
DNS: 20000 + idx,
HTTP: 21000 + idx,
Expand Down
18 changes: 18 additions & 0 deletions website/source/docs/agent/options.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -576,6 +576,24 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass
* <a name="node_name"></a><a href="#node_name">`node_name`</a> Equivalent to the
[`-node` command-line flag](#_node).

* <a name="performance"></a><a href="#performance">`performance`</a> Available in Consul 0.7 and
later, this is a nested object that allows tuning the performance of different subsystems in
Consul. See the [Server Performance](/docs/guides/performance.html) guide for more details. The
following parameters are available:
* <a name="raft_multiplier"></a><a href="#raft_multiplier">`raft_multiplier`</a> - An integer
multiplier used by Consul servers to scale key Raft timing parameters. Tuning this affects
the time it takes Consul to detect leader failures and to perform leader elections, at the
expense of requiring more network and CPU resources for better performance.<br><br>A value
of 0, the default, means that Consul will use a lower-performance timing that's suitable for
[minimal Consul servers](/docs/guides/performance.html#minumum), currently equivalent to
setting this to a value of 5 (this default may be changed in future versions of Consul,
depending on whether the target minimum server profile changes). Above 0, higher values imply lower
levels of performance. Setting this to a value of 1 will configure Raft to its
highest-performance mode, equivalent to the default timing of Consul prior to 0.7, and is
recommended for [production Consul servers](/docs/guides/performance.html#production). See
the note on [last contact](/docs/guides/performance.html#last-contact) timing for more
details on tuning this parameter.
* <a name="ports"></a><a href="#ports">`ports`</a> This is a nested object that allows setting
the bind ports for the following keys:
* <a name="dns_port"></a><a href="#dns_port">`dns`</a> - The DNS server, -1 to disable. Default 8600.
Expand Down
4 changes: 2 additions & 2 deletions website/source/docs/agent/telemetry.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,8 @@ These metrics are used to monitor the health of the Consul servers.
<td>timer</td>
</tr>
<tr>
<td>`consul.raft.leader.lastContact`</td>
<td>This measures the time that a Consul server was last contacted by the leader (will be zero on the leader itself). This is a general indicator of latency in the Raft subsystem, and gives a general indicator of how far behind [stale](/docs/agent/http.html#consistency) queries will be.</td>
<td><a name="last-contact"></a>`consul.raft.leader.lastContact`</td>
<td>This is only emitted by the Raft leader and measures the time since the leader was last able to contact the follower nodes when checking its leader lease. It can be used as a measure of how stable the Raft timing is and how close the leader is to timing out its lease.<br><br>The lease timeout is 500 ms times the [`raft_multiplier` configuration](/docs/agent/options.html#raft_multiplier), so this telemetry value should not get close to that configured value; otherwise, the Raft timing is marginal and might need to be tuned, or more powerful servers might be needed. See the [Server Performance](/docs/guides/performance.html) guide for more details.</td>
<td>ms</td>
<td>timer</td>
</tr>
Expand Down
2 changes: 2 additions & 0 deletions website/source/docs/guides/dns-cache.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ for each lookup and can potentially exhaust the query throughput of a cluster.
For this reason, Consul provides a number of tuning parameters that can
customize how DNS queries are handled.

<a name="stale"></a>
## Stale Reads

Stale reads can be used to reduce latency and increase the throughput
Expand Down Expand Up @@ -60,6 +61,7 @@ client and Consul and set the cache values appropriately. In many cases
"appropriately" simply is turning negative response caching off to get the best
recovery time when a service becomes available again.

<a name="ttl"></a>
## TTL Values

TTL values can be set to allow DNS results to be cached downstream of Consul. Higher
Expand Down
Loading

0 comments on commit 57db4bc

Please sign in to comment.