raft: Fix the problem of being stuck in candidate role forever.
Sometimes a server can stay in the candidate role forever, even if it
already sees the new leader and handles append requests normally.
Because of the wrong role, however, it appears disconnected from the
cluster, and so its clients are disconnected.

This problem happens when two servers become candidates in the same
term and one of them is elected leader in that term. It can be
reproduced by the test case added in this patch.

The root cause is that the current implementation changes the role to
follower only when a bigger term is observed (in raft_receive_term__()).
According to the Raft paper, if another candidate becomes leader in the
same term, the candidate should change to follower.

This patch fixes it by changing the role to follower when the leader
is updated in raft_update_leader().
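
To make the failure mode concrete, here is a minimal, self-contained
sketch (illustrative only, not the actual OVS code; all type and
function names are hypothetical) contrasting the old role handling,
which reacts only to a strictly larger term, with the fixed handling,
which also demotes a candidate that learns of a leader in its own term:

#include <stdio.h>

enum role { FOLLOWER, CANDIDATE, LEADER };

struct node {
    enum role role;
    int term;
};

/* Pre-fix logic: only a strictly larger term demotes the server. */
static void
receive_append_request_old(struct node *n, int leader_term)
{
    if (leader_term > n->term) {
        n->term = leader_term;
        n->role = FOLLOWER;
    }
    /* When leader_term == n->term, entries are still applied, but the
     * role stays CANDIDATE forever: the bug described above. */
}

/* Post-fix logic: a leader whose term is at least as large as the
 * candidate's also demotes it to follower. */
static void
receive_append_request_new(struct node *n, int leader_term)
{
    if (leader_term > n->term) {
        n->term = leader_term;
        n->role = FOLLOWER;
    } else if (leader_term == n->term && n->role == CANDIDATE) {
        n->role = FOLLOWER;
    }
}

int
main(void)
{
    /* s2 and s3 both became candidates in term 2; s2 won the election
     * and now sends append requests for term 2 to s3. */
    struct node s3 = { CANDIDATE, 2 };
    receive_append_request_old(&s3, 2);
    printf("old: %s\n",
           s3.role == CANDIDATE ? "stuck as candidate" : "follower");

    s3 = (struct node) { CANDIDATE, 2 };
    receive_append_request_new(&s3, 2);
    printf("new: %s\n",
           s3.role == CANDIDATE ? "stuck as candidate" : "follower");
    return 0;
}

Built with any C99 compiler, the first call leaves s3 stuck as a
candidate while the second demotes it to follower, mirroring the
behavior before and after this patch.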

Signed-off-by: Han Zhou <[email protected]>
Signed-off-by: Ben Pfaff <[email protected]>
hzhou8 authored and blp committed Mar 6, 2020
1 parent 315e88c commit 93ee420
Showing 2 changed files with 72 additions and 2 deletions.
ovsdb/raft.c: 19 changes (17 additions, 2 deletions)
@@ -73,7 +73,8 @@ enum raft_failure_test {
     FT_CRASH_BEFORE_SEND_EXEC_REQ,
     FT_CRASH_AFTER_SEND_EXEC_REQ,
     FT_CRASH_AFTER_RECV_APPEND_REQ_UPDATE,
-    FT_DELAY_ELECTION
+    FT_DELAY_ELECTION,
+    FT_DONT_SEND_VOTE_REQUEST
 };
 static enum raft_failure_test failure_test;

@@ -1647,6 +1648,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
     }
 
     ovs_assert(raft->role != RAFT_LEADER);
+
     raft->role = RAFT_CANDIDATE;
     /* If there was no leader elected since last election, we know we are
      * retrying now. */
@@ -1690,7 +1692,9 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
                 .leadership_transfer = leadership_transfer,
             },
         };
-        raft_send(raft, &rq);
+        if (failure_test != FT_DONT_SEND_VOTE_REQUEST) {
+            raft_send(raft, &rq);
+        }
     }
 
     /* Vote for ourselves. */
@@ -2965,6 +2969,15 @@ raft_update_leader(struct raft *raft, const struct uuid *sid)
         };
         ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r)));
     }
+    if (raft->role == RAFT_CANDIDATE) {
+        /* Section 3.4: While waiting for votes, a candidate may
+         * receive an AppendEntries RPC from another server claiming to
+         * be leader. If the leader’s term (included in its RPC) is at
+         * least as large as the candidate’s current term, then the
+         * candidate recognizes the leader as legitimate and returns to
+         * follower state. */
+        raft->role = RAFT_FOLLOWER;
+    }
     return true;
 }

@@ -4673,6 +4686,8 @@ raft_unixctl_failure_test(struct unixctl_conn *conn OVS_UNUSED,
                 raft_reset_election_timer(raft);
             }
         }
+    } else if (!strcmp(test, "dont-send-vote-request")) {
+        failure_test = FT_DONT_SEND_VOTE_REQUEST;
     } else if (!strcmp(test, "clear")) {
         failure_test = FT_NO_TEST;
         unixctl_command_reply(conn, "test dismissed");
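
The new "dont-send-vote-request" failure test is enabled through the
same cluster/failure-test unixctl command as the existing tests, and it
is exercised by the cluster test case added below.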
tests/ovsdb-cluster.at: 55 changes (55 additions, 0 deletions)
@@ -527,6 +527,61 @@ AT_KEYWORDS([ovsdb server negative unix cluster pending-txn])
 ovsdb_cluster_failure_test 2 2 3 crash-after-receiving-append-request-update
 AT_CLEANUP
 
+
+AT_SETUP([OVSDB cluster - competing candidates])
+AT_KEYWORDS([ovsdb server negative unix cluster competing-candidates])
+
+n=3
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+ordinal_schema > schema
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
+cid=`ovsdb-tool db-cid s1.db`
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+for i in `seq 2 $n`; do
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+done
+
+on_exit 'kill `cat *.pid`'
+for i in `seq $n`; do
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
+done
+for i in `seq $n`; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+# We need to simulate the situation where two candidates start an election
+# in the same term.
+#
+# Before triggering leader election, tell follower s2 not to send vote
+# requests (simulating a vote request that is lost or not handled in time),
+# and tell follower s3 to delay its election timer to make sure s3 doesn't
+# send a vote request before s2 enters term 2.
+AT_CHECK([ovs-appctl -t "`pwd`"/s2 cluster/failure-test dont-send-vote-request], [0], [ignore])
+AT_CHECK([ovs-appctl -t "`pwd`"/s3 cluster/failure-test delay-election], [0], [ignore])
+
+# Restart the leader, which will become a follower, and both old followers
+# will start an election as candidates. The new follower (the old leader)
+# will vote for one of them, and the other candidate should step back to
+# follower again.
+kill -9 `cat s1.pid`
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s1.log --pidfile=s1.pid --unixctl=s1 --remote=punix:s1.ovsdb s1.db])
+
+# Tell s1 to delay its election timer so that it won't start an election
+# before s3 becomes a candidate.
+AT_CHECK([ovs-appctl -t "`pwd`"/s1 cluster/failure-test delay-election], [0], [ignore])
+
+OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s1 cluster/status $schema_name | grep "Term: 2"])
+
+for i in `seq $n`; do
+    OVS_WAIT_WHILE([ovs-appctl -t "`pwd`"/s$i cluster/status $schema_name | grep "candidate"])
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+for i in `seq $n`; do
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
+done
+
+AT_CLEANUP
+

 AT_BANNER([OVSDB - cluster tests])
