#!/usr/bin/env bats
load helpers
# Addresses on which the store cluster nodes will listen
STORE_HOST_1=127.0.0.1:8500
STORE_HOST_2=127.0.0.1:8501
STORE_HOST_3=127.0.0.1:8502
# Container name for integration test
CONTAINER_NAME=swarm_leader
# Names for store cluster nodes
NODE_1="node1"
NODE_2="node2"
NODE_3="node3"
# URLs of the store cluster nodes
NODE_1_URL="consul://${STORE_HOST_1}/test"
NODE_2_URL="consul://${STORE_HOST_2}/test"
NODE_3_URL="consul://${STORE_HOST_3}/test"
function start_store_cluster() {
docker_host run -v $(pwd)/discovery/consul/config:/config --name=$NODE_1 -h $NODE_1 -p $STORE_HOST_1:8500 -d progrium/consul -server -bootstrap-expect 3 -config-file=/config/consul.json
# Grab node_1's address, which the other nodes need in order to join the cluster
JOIN_ENDPOINT=$(docker_host inspect -f '{{.NetworkSettings.IPAddress}}' $NODE_1)
docker_host run -v $(pwd)/discovery/consul/config:/config --name=$NODE_2 -h $NODE_2 -p $STORE_HOST_2:8500 -d progrium/consul -server -join $JOIN_ENDPOINT -config-file=/config/consul.json
docker_host run -v $(pwd)/discovery/consul/config:/config --name=$NODE_3 -h $NODE_3 -p $STORE_HOST_3:8500 -d progrium/consul -server -join $JOIN_ENDPOINT -config-file=/config/consul.json
# Wait for the cluster to be available (see the readiness sketch after this function for a polling-based alternative to the fixed sleep).
sleep 2
}
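# The fixed sleep above is only a best-effort wait. Below is a minimal sketch of
# a polling-based readiness check, assuming curl is available on the test host
# and that node_1's HTTP API is published at STORE_HOST_1; wait_for_store_leader
# is a hypothetical helper, not part of the existing suite.
function wait_for_store_leader() {
    local retries=30
    while [ $retries -gt 0 ]; do
        # /v1/status/leader returns a quoted "ip:port" once a leader is
        # elected, and an empty string while the cluster has none.
        local leader
        leader=$(curl -s "http://${STORE_HOST_1}/v1/status/leader")
        if [ -n "$leader" ] && [ "$leader" != '""' ]; then
            return 0
        fi
        sleep 1
        retries=$((retries - 1))
    done
    return 1
}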
function restart_leader() {
# TODO: at this point we know the leader is node_1, but this function would be
# more useful if it discovered the current leader by polling the Consul leader
# endpoint (see the sketch after this function).
docker_host restart -t 5 $NODE_1
}
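# A minimal sketch of the leader discovery suggested by the TODO above, assuming
# curl is available and node_1's HTTP API is published at STORE_HOST_1.
# find_store_leader is a hypothetical helper, not part of the existing suite: it
# maps the address reported by /v1/status/leader back to the container that
# advertises it. restart_leader could then run
# `docker_host restart -t 5 $(find_store_leader)` instead of hardcoding $NODE_1.
function find_store_leader() {
    # Strip the JSON quotes and the port from the reported leader address.
    local leader_ip
    leader_ip=$(curl -s "http://${STORE_HOST_1}/v1/status/leader" | tr -d '"' | cut -d: -f1)
    for node in $NODE_1 $NODE_2 $NODE_3; do
        if [[ "$(docker_host inspect -f '{{.NetworkSettings.IPAddress}}' $node)" == "$leader_ip" ]]; then
            echo "$node"
            return 0
        fi
    done
    return 1
}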
function stop_store() {
docker_host rm -f -v $CONTAINER_NAME
}
function stop_store_cluster() {
docker_host rm -f -v $NODE_1 $NODE_2 $NODE_3
}
function setup() {
start_store_cluster
}
function teardown() {
swarm_manage_cleanup
swarm_join_cleanup
stop_docker
stop_store_cluster
}
@test "replication options" {
# Try to bring up a manager with invalid replication options.
# --advertise (empty address)
run swarm manage --replication --replication-ttl "4s" --advertise "" "$NODE_1_URL"
[ "$status" -ne 0 ]
[[ "${output}" == *"--advertise address must be provided when using --leader-election"* ]]
# --advertise (malformed address)
run swarm manage --replication --replication-ttl "4s" --advertise 127.0.0.1ab:1bcde "$NODE_1_URL"
[ "$status" -ne 0 ]
[[ "${output}" == *"--advertise should be of the form ip:port or hostname:port"* ]]
# --replication-ttl
run swarm manage --replication --replication-ttl "-20s" --advertise 127.0.0.1:$SWARM_BASE_PORT "$NODE_1_URL"
[ "$status" -ne 0 ]
[[ "${output}" == *"--replication-ttl should be a positive number"* ]]
}
@test "leader election" {
local i=${#SWARM_MANAGE_PID[@]}
local port=$(($SWARM_BASE_PORT + $i))
local host=127.0.0.1:$port
# Bring up one manager, make sure it becomes primary.
swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$SWARM_BASE_PORT "$NODE_1_URL"
run docker -H ${SWARM_HOSTS[0]} info
[[ "${output}" == *"Role: primary"* ]]
# Fire up a second manager. Ensure it's a replica forwarding to the right primary.
swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$(($SWARM_BASE_PORT + 1)) "$NODE_1_URL"
retry 20 1 eval "docker -H ${SWARM_HOSTS[1]} info | grep -q 'Role: replica'"
retry 20 1 eval "docker -H ${SWARM_HOSTS[1]} info | grep -q 'Primary: ${SWARM_HOSTS[0]}'"
# Kill the leader and ensure the replica takes over.
kill "${SWARM_MANAGE_PID[0]}"
retry 20 1 eval "docker -H ${SWARM_HOSTS[1]} info | grep -q 'Role: primary'"
# Add a new replica and make sure it sees the new leader as primary.
swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$(($SWARM_BASE_PORT + 2)) "$NODE_1_URL"
run docker -H ${SWARM_HOSTS[2]} info
[[ "${output}" == *"Role: replica"* ]]
[[ "${output}" == *"Primary: ${SWARM_HOSTS[1]}"* ]]
}
function containerRunning() {
local container="$1"
local node="$2"
run docker_swarm inspect "$container"
[ "$status" -eq 0 ]
[[ "${output}" == *"\"Name\": \"$node\""* ]]
[[ "${output}" == *"\"Status\": \"running\""* ]]
}
@test "leader election - rescheduling" {
local i=${#SWARM_MANAGE_PID[@]}
local port=$(($SWARM_BASE_PORT + $i))
local host=127.0.0.1:$port
start_docker_with_busybox 2
swarm_join "$NODE_1_URL"
# Bring up one manager, make sure it becomes primary.
swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$SWARM_BASE_PORT --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 "$NODE_1_URL"
run docker -H ${SWARM_HOSTS[0]} info
[[ "${output}" == *"Role: primary"* ]]
# Fire up a second manager. Ensure it's a replica forwarding to the right primary.
swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$(($SWARM_BASE_PORT + 1)) --engine-refresh-min-interval=1s --engine-refresh-max-interval=1s --engine-failure-retry=1 "$NODE_1_URL"
run docker -H ${SWARM_HOSTS[1]} info
[[ "${output}" == *"Role: replica"* ]]
[[ "${output}" == *"Primary: ${SWARM_HOSTS[0]}"* ]]
# c1 on node-0 with reschedule=on-node-failure
run docker_swarm run -dit --name c1 -e constraint:node==~node-0 --label 'com.docker.swarm.reschedule-policies=["on-node-failure"]' busybox sh
[ "$status" -eq 0 ]
# c2 on node-0 with reschedule=off
run docker_swarm run -dit --name c2 -e constraint:node==~node-0 --label 'com.docker.swarm.reschedule-policies=["off"]' busybox sh
[ "$status" -eq 0 ]
# c3 on node-1
run docker_swarm run -dit --name c3 -e constraint:node==~node-1 --label 'com.docker.swarm.reschedule-policies=["on-node-failure"]' busybox sh
[ "$status" -eq 0 ]
run docker_swarm ps -q
[ "${#lines[@]}" -eq 3 ]
# Make sure containers are running where they should.
containerRunning "c1" "node-0"
containerRunning "c2" "node-0"
containerRunning "c3" "node-1"
# Get c1 swarm id
swarm_id=$(docker_swarm inspect -f '{{ index .Config.Labels "com.docker.swarm.id" }}' c1)
# Stop node-0
docker_host stop ${DOCKER_CONTAINERS[0]}
# Wait for Swarm to detect the node failure.
retry 5 1 eval "docker_swarm info | grep -q 'Unhealthy'"
# Wait for the container to be rescheduled
# c1 should have been rescheduled from node-0 to node-1
retry 15 1 containerRunning "c1" "node-1"
# Check swarm id didn't change for c1
[[ "$swarm_id" == $(docker_swarm inspect -f '{{ index .Config.Labels "com.docker.swarm.id" }}' c1) ]]
run docker_swarm inspect "$swarm_id"
[ "$status" -eq 0 ]
[[ "${output}" == *'"Name": "node-1"'* ]]
# c2 was not rescheduled since its policy was off; it stays on the failed node-0, so inspecting it through Swarm fails.
run docker_swarm inspect c2
[ "$status" -eq 1 ]
# c3 should still be on node-1 since it wasn't affected
containerRunning "c3" "node-1"
run docker_swarm ps -q
[ "${#lines[@]}" -eq 2 ]
}
@test "leader election - store failure" {
# Bring up one manager, make sure it becomes primary.
swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$SWARM_BASE_PORT "$NODE_1_URL"
run docker -H ${SWARM_HOSTS[0]} info
echo "$output"
[[ "${output}" == *"Role: primary"* ]]
# Fire up a second manager. Ensure it's a replica forwarding to the right primary.
swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$(($SWARM_BASE_PORT + 1)) "$NODE_1_URL"
run docker -H ${SWARM_HOSTS[1]} info
[[ "${output}" == *"Role: replica"* ]]
[[ "${output}" == *"Primary: ${SWARM_HOSTS[0]}"* ]]
# Fire up a third manager. Ensure it's a replica forwarding to the right primary.
swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$(($SWARM_BASE_PORT + 2)) "$NODE_1_URL"
run docker -H ${SWARM_HOSTS[2]} info
[[ "${output}" == *"Role: replica"* ]]
[[ "${output}" == *"Primary: ${SWARM_HOSTS[0]}"* ]]
# Stop and start the store holding the leader metadata
stop_store_cluster
sleep 3
start_store_cluster
# Wait a little while for the re-election to occur.
# This delay is specific to Consul, which favors liveness over safety.
sleep 20
# Make sure the managers are either in the 'primary' or the 'replica' state.
for host in "${SWARM_HOSTS[@]}"; do
retry 120 1 eval "docker -H ${host} info | grep -Eq 'Role: primary|Role: replica'"
done
# Find out which node is the primary and which ones are replicas after the store failure
primary=${SWARM_HOSTS[0]}
declare -a replicas
i=0
for host in "${SWARM_HOSTS[@]}"; do
run docker -H $host info
if [[ "${output}" == *"Role: primary"* ]]; then
primary=$host
else
replicas[$((i=i+1))]=$host
fi
done
# Check that we indeed have 2 replicas
[[ "${#replicas[@]}" -eq 2 ]]
# Check if the replicas are pointing to the right Primary
for host in "${replicas[@]}"; do
run docker -H $host info
[[ "${output}" == *"Primary: ${primary}"* ]]
done
}
@test "leader election - dispatched discovery urls - leader failure" {
# Bring up one manager, make sure it becomes primary.
swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$SWARM_BASE_PORT "$NODE_1_URL"
run docker -H ${SWARM_HOSTS[0]} info
[[ "${output}" == *"Role: primary"* ]]
# Fire up a second manager. Ensure it's a replica forwarding to the right primary.
swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$(($SWARM_BASE_PORT + 1)) "$NODE_2_URL"
run docker -H ${SWARM_HOSTS[1]} info
[[ "${output}" == *"Role: replica"* ]]
[[ "${output}" == *"Primary: ${SWARM_HOSTS[0]}"* ]]
# Fire up a third manager. Ensure it's a replica forwarding to the right primary.
swarm_manage --replication --replication-ttl "4s" --advertise 127.0.0.1:$(($SWARM_BASE_PORT + 2)) "$NODE_3_URL"
run docker -H ${SWARM_HOSTS[2]} info
[[ "${output}" == *"Role: replica"* ]]
[[ "${output}" == *"Primary: ${SWARM_HOSTS[0]}"* ]]
# Stop and start the store leader
restart_leader
# Wait a little while for the re-election to occur.
# This delay is specific to Consul, which favors liveness over safety.
sleep 15
# Make sure the managers are either in the 'primary' or the 'replica' state.
for host in "${SWARM_HOSTS[@]}"; do
retry 120 1 eval "docker -H ${host} info | grep -Eq 'Role: primary|Role: replica'"
done
# Find out which node is the primary and which ones are replicas after the store failure
primary=${SWARM_HOSTS[0]}
declare -a replicas
i=0
for host in "${SWARM_HOSTS[@]}"; do
run docker -H $host info
if [[ "${output}" == *"Role: primary"* ]]; then
primary=$host
else
replicas[$((i=i+1))]=$host
fi
done
# Check that we indeed have 2 replicas
[[ "${#replicas[@]}" -eq 2 ]]
# Check if the replicas are pointing to the right Primary
for host in "${replicas[@]}"; do
run docker -H $host info
[[ "${output}" == *"Primary: ${primary}"* ]]
done
}