From 887fccb07afdcfc7bd62c485e00adef69e2bad03 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 26 May 2026 17:49:10 +0800 Subject: [PATCH 01/11] Speed up split-vote elections with the new FAILOVER_AUTH_NACK message In the current failover protocol a replica sends AUTH_REQUEST exactly once per epoch and each voter casts at most one vote per epoch. Despite the various delay heuristics in clusterHandleReplicaFailover that try to stagger replicas, concurrent elections can still collide on the same epoch. When the votes split and nobody reaches the quorum, the losing replica has no way to learn this in time and must first wait for the election to be declared expired after auth_timeout (2*cluster-node-timeout) and then wait another auth_retry_time (2*auth_timeout) before it is even allowed to start the next election with a higher epoch. Introduce a new message type, FAILOVER_AUTH_NACK, that voters reply with from every refusal branch. The replica counts incoming NACKs; since a voter never re-answers within the same epoch, once size - nack_count drops below the quorum the election is declared unwinnable and a new one is started with a higher epoch right away, shrinking recovery from the auth_timeout + auth_retry_time window to a few cron ticks. Wire compatibility is preserved by gating NACK emission on a new CLUSTER_NODE_FAILOVER_AUTH_NACK_SUPPORTED capability flag advertised in PING/PONG flags via clusterUpdateMyselfFlags. Peers that do not advertise the capability never see the new message type and fall back to the legacy auth_timeout path. Adding a new DEBUG CLUSTER-FAILOVER-DELAY hook overrides the delay computed in clusterHandleReplicaFailover for testing. Signed-off-by: Binbin --- src/cluster_legacy.c | 93 +++++++++++++++++++++++++++++++- src/cluster_legacy.h | 23 +++++++- src/debug.c | 12 +++++ src/server.c | 1 + src/server.h | 2 + tests/unit/cluster/failover2.tcl | 21 ++++++-- 6 files changed, 145 insertions(+), 7 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index e648332ded5..6d36be5ca26 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -67,6 +67,9 @@ void clusterReadHandler(connection *conn); void clusterSendPing(clusterLink *link, int type); void clusterSendFail(char *nodename); void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request); +void clusterProcessFailoverAuthNack(clusterNode *sender, clusterMsg *request); +void clusterSendFailoverNack(clusterNode *node, uint8_t reason); +static const char *clusterNackReasonString(uint8_t reason); void clusterUpdateState(void); list *clusterGetNodesInMyShard(clusterNode *node); int clusterNodeAddReplica(clusterNode *primary, clusterNode *replica); @@ -1253,7 +1256,8 @@ void clusterUpdateMyselfFlags(void) { myself->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED | CLUSTER_NODE_LIGHT_HDR_PUBLISH_SUPPORTED | CLUSTER_NODE_LIGHT_HDR_MODULE_SUPPORTED | - CLUSTER_NODE_MULTI_MEET_SUPPORTED; + CLUSTER_NODE_MULTI_MEET_SUPPORTED | + CLUSTER_NODE_FAILOVER_AUTH_NACK_SUPPORTED; if (myself->flags != oldflags) { clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE); @@ -1455,6 +1459,7 @@ void clusterInit(void) { server.cluster->importing_slots_from = dictCreate(&clusterSlotDictType); server.cluster->failover_auth_time = 0; server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_nack_count = 0; server.cluster->failover_auth_rank = 0; server.cluster->failover_auth_sent = 0; server.cluster->failover_failed_primary_rank = 0; @@ -3772,6 +3777,9 @@ int clusterIsValidPacket(clusterLink *link) { } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST || type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK || type == CLUSTERMSG_TYPE_MFSTART) { explen = sizeof(clusterMsg) - sizeof(union clusterMsgData); + } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_NACK) { + explen = sizeof(clusterMsg) - sizeof(union clusterMsgData); + explen += sizeof(clusterMsgDataFailoverNack); } else if (type == CLUSTERMSG_TYPE_UPDATE) { explen = sizeof(clusterMsg) - sizeof(union clusterMsgData); explen += sizeof(clusterMsgDataUpdate); @@ -3902,6 +3910,13 @@ int clusterProcessPacket(clusterLink *link) { } else { sender->flags &= ~CLUSTER_NODE_MY_PRIMARY_FAIL; } + + /* Check if the node understands FAILOVER_AUTH_NACK packets. */ + if (flags & CLUSTER_NODE_FAILOVER_AUTH_NACK_SUPPORTED) { + sender->flags |= CLUSTER_NODE_FAILOVER_AUTH_NACK_SUPPORTED; + } else { + sender->flags &= ~CLUSTER_NODE_FAILOVER_AUTH_NACK_SUPPORTED; + } } /* Update the last time we saw any data from this node. We @@ -4419,6 +4434,16 @@ int clusterProcessPacket(clusterLink *link) { * we check ASAP. */ clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); } + } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_NACK) { + if (!sender) return 1; /* We don't know that node. */ + + /* We consider this nack only if the sender is a primary serving + * a non zero number of slots, and its currentEpoch is greater or + * equal to epoch where this node started the election. */ + if (server.cluster->failover_auth_time && clusterNodeIsVotingPrimary(sender) && + sender_claimed_current_epoch >= server.cluster->failover_auth_epoch) { + clusterProcessFailoverAuthNack(sender, msg); + } } else if (type == CLUSTERMSG_TYPE_MFSTART) { /* This message is acceptable only if I'm a primary and the sender * is one of my replicas. */ @@ -5272,6 +5297,32 @@ void clusterSendFailoverAuth(clusterNode *node) { clusterMsgSendBlockDecrRefCount(msgblock); } +static const char *clusterNackReasonString(uint8_t reason) { + switch (reason) { + case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_NOT_SAFE: return "not-safe"; + case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_REQ_EPOCH_OLD: return "req-epoch-old"; + case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_ALREADY_VOTED: return "already-voted"; + case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_PRIMARY_UP: return "primary-up"; + case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_STALE_CONFIG: return "stale-config"; + default: return "unknown"; + } +} + +/* Send a FAILOVER_AUTH_NACK message to the specified node. */ +void clusterSendFailoverNack(clusterNode *node, uint8_t reason) { + if (!node->link) return; + if (!nodeSupportsFailoverAuthNack(node)) return; + + uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgDataFailoverNack); + clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_NACK, msglen); + + clusterMsg *hdr = getMessageFromSendBlock(msgblock); + hdr->data.failover_nack.nack.reason = reason; + + clusterSendMessage(node->link, msgblock); + clusterMsgSendBlockDecrRefCount(msgblock); +} + /* Send a MFSTART message to the specified node. */ void clusterSendMFStart(clusterNode *node) { if (!node->link) return; @@ -5302,6 +5353,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { if (!server.cluster->safe_to_join) { serverLog(LL_WARNING, "Failover auth denied to %.40s (%s): it is not safe to vote in this moment)", node->name, humanNodename(node)); + clusterSendFailoverNack(node, CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_NOT_SAFE); return; } @@ -5313,6 +5365,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { serverLog(LL_WARNING, "Failover auth denied to %.40s (%s): reqEpoch (%llu) < curEpoch(%llu)", node->name, humanNodename(node), (unsigned long long)requestCurrentEpoch, (unsigned long long)server.cluster->currentEpoch); + clusterSendFailoverNack(node, CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_REQ_EPOCH_OLD); return; } @@ -5320,6 +5373,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { if (server.cluster->lastVoteEpoch == server.cluster->currentEpoch) { serverLog(LL_WARNING, "Failover auth denied to %.40s (%s): already voted for epoch %llu", node->name, humanNodename(node), (unsigned long long)server.cluster->currentEpoch); + clusterSendFailoverNack(node, CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_ALREADY_VOTED); return; } @@ -5337,6 +5391,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { serverLog(LL_WARNING, "Failover auth denied to %.40s (%s) for epoch %llu: its primary is up", node->name, humanNodename(node), (unsigned long long)requestCurrentEpoch); } + clusterSendFailoverNack(node, CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_PRIMARY_UP); return; } @@ -5371,6 +5426,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { "an UPDATE message about %.40s (%s)", node->name, humanNodename(node), slot_owner->name, humanNodename(slot_owner)); clusterSendUpdate(node->link, slot_owner); + clusterSendFailoverNack(node, CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_STALE_CONFIG); return; } } @@ -5383,6 +5439,37 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { (unsigned long long)server.cluster->currentEpoch); } +/* Handle a FAILOVER_AUTH_NACK from a voter. */ +void clusterProcessFailoverAuthNack(clusterNode *sender, clusterMsg *request) { + /* The reason is bracketed up front so operators can quickly grep + * for a specific cause; the trailing NACKs k/N gives at-a-glance + * progress towards the fast-fail threshold. */ + server.cluster->failover_auth_nack_count++; + serverLog(LL_WARNING, + "Failover auth NACK [%s] from %.40s (%s) for epoch %llu (NACKs %d/%d)", + clusterNackReasonString(request->data.failover_nack.nack.reason), sender->name, + humanNodename(sender), (unsigned long long)server.cluster->failover_auth_epoch, + server.cluster->failover_auth_nack_count, server.cluster->size); + + /* A voter that NACKed us in this epoch will not change its mind, so + * the upper bound on the votes we can ever collect is the voters that + * have not (yet) NACKed. Fast-fail once that bound drops below the + * quorum we need to win. */ + int needed_quorum = (server.cluster->size / 2) + 1; + int max_possible_acks = server.cluster->size - server.cluster->failover_auth_nack_count; + if (max_possible_acks < needed_quorum) { + serverLog(LL_WARNING, + "Failover election for epoch %llu cannot reach quorum %d (NACKs %d/%d). " + "Resetting the election since we cannot win an election without quorum.", + (unsigned long long)server.cluster->failover_auth_epoch, needed_quorum, + server.cluster->failover_auth_nack_count, server.cluster->size); + server.cluster->failover_auth_time = 0; + /* Maybe we could start a new election, set a flag here to make sure + * we check as soon as possible, instead of waiting for a cron. */ + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + } +} + /* This function returns the "rank" of this instance, a replica, in the context * of its primary-replicas ring. The rank of the replica is given by the number of * other replicas for the same primary that have a better replication offset @@ -5626,6 +5713,7 @@ void clusterHandleReplicaFailover(void) { /* Use a failover delay relative to node timeout: 500 for the default node * timeout of 15000, less for lower node timeout, but not more. */ long long delay = min(server.cluster_node_timeout / 30, 500); + if (server.debug_cluster_failover_delay >= 0) delay = server.debug_cluster_failover_delay; /* Pre conditions to run the function, that must be met both in case * of an automatic or manual failover: @@ -5674,7 +5762,9 @@ void clusterHandleReplicaFailover(void) { server.cluster->failover_auth_time = now + delay + /* Fixed delay to let FAIL msg propagate. */ random() % delay; /* Random delay between 0 and the fixed delay. */ + if (server.debug_cluster_failover_delay >= 0) server.cluster->failover_auth_time = now + delay; server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_nack_count = 0; server.cluster->failover_auth_sent = 0; server.cluster->failover_auth_rank = clusterGetReplicaRank(); /* We add another delay that is proportional to the replica rank. @@ -7139,6 +7229,7 @@ const char *clusterGetMessageTypeString(int type) { case CLUSTERMSG_TYPE_PUBLISHSHARD: return "publishshard"; case CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST: return "auth-req"; case CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK: return "auth-ack"; + case CLUSTERMSG_TYPE_FAILOVER_AUTH_NACK: return "auth-nack"; case CLUSTERMSG_TYPE_UPDATE: return "update"; case CLUSTERMSG_TYPE_MFSTART: return "mfstart"; case CLUSTERMSG_TYPE_MODULE: return "module"; diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 7703f071de0..0461979987e 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -69,6 +69,7 @@ typedef struct clusterLink { * myself will gossip this flag to other replica in the \ * shard so that the replicas can make a better ranking \ * decisions to help with the failover. */ +#define CLUSTER_NODE_FAILOVER_AUTH_NACK_SUPPORTED (1 << 14) /* This node understands FAILOVER_AUTH_NACK messages. */ #define CLUSTER_NODE_NULL_NAME \ "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ @@ -86,6 +87,7 @@ typedef struct clusterLink { #define nodeSupportsMultiMeet(n) ((n)->flags & CLUSTER_NODE_MULTI_MEET_SUPPORTED) #define nodeInNormalState(n) (!((n)->flags & (CLUSTER_NODE_HANDSHAKE | CLUSTER_NODE_MEET | CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) #define nodePrimaryIsFail(n) ((n)->flags & CLUSTER_NODE_MY_PRIMARY_FAIL) +#define nodeSupportsFailoverAuthNack(n) ((n)->flags & CLUSTER_NODE_FAILOVER_AUTH_NACK_SUPPORTED) /* Cluster messages header */ @@ -106,7 +108,8 @@ typedef struct clusterLink { #define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ #define CLUSTERMSG_TYPE_MODULE 9 /* Module cluster API message. */ #define CLUSTERMSG_TYPE_PUBLISHSHARD 10 /* Pub/Sub Publish shard propagation */ -#define CLUSTERMSG_TYPE_COUNT 11 /* Total number of message types. */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_NACK 11 /* No, you don't have my vote. */ +#define CLUSTERMSG_TYPE_COUNT 12 /* Total number of message types. */ #define CLUSTERMSG_LIGHT 0x8000 /* Modifier bit for message types that support light header */ @@ -141,6 +144,11 @@ typedef struct { char nodename[CLUSTER_NAMELEN]; } clusterMsgDataFail; +typedef struct { + uint8_t reason; + char notused1[24]; +} clusterMsgDataFailoverNack; + typedef struct { uint32_t channel_len; uint32_t message_len; @@ -264,6 +272,11 @@ union clusterMsgData { struct { clusterMsgModule msg; } module; + + /* FAILOVER_AUTH_NACK */ + struct { + clusterMsgDataFailoverNack nack; + } failover_nack; }; #define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */ @@ -336,6 +349,13 @@ static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); primary is up. */ #define CLUSTERMSG_FLAG0_EXT_DATA (1 << 2) /* Message contains extension data */ +/* Reason values carried in clusterMsgDataFailoverNack.reason. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_NOT_SAFE 1 /* Voter is not safe to vote yet. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_REQ_EPOCH_OLD 2 /* Request epoch < voter's currentEpoch. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_ALREADY_VOTED 3 /* Voter already voted in this epoch. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_PRIMARY_UP 4 /* Replica's primary is not failed. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_STALE_CONFIG 5 /* Replica's slot config is stale. */ + typedef struct { char sig[4]; /* Signature "RCmb" (Cluster message bus). */ uint32_t totlen; /* Total length of this message */ @@ -448,6 +468,7 @@ struct clusterState { /* The following fields are used to take the replica state on elections. */ mstime_t failover_auth_time; /* Time of previous or next election. */ int failover_auth_count; /* Number of votes received so far. */ + int failover_auth_nack_count; /* Number of rejected votes received so far. */ int failover_auth_sent; /* True if we already asked for votes. */ int failover_auth_rank; /* This replica rank for current auth request. */ int failover_failed_primary_rank; /* The rank of this instance in the context of all failed primary list. */ diff --git a/src/debug.c b/src/debug.c index 9ba8f36fb49..32a9c74402a 100644 --- a/src/debug.c +++ b/src/debug.c @@ -448,6 +448,9 @@ void debugCommand(client *c) { " Disable sending cluster ping to a random node every second.", "DISABLE-CLUSTER-RECONNECTION <0|1>", " Disable cluster reconnection of cluster nodes.", + "CLUSTER-FAILOVER-DELAY ", + " Override the failover delay. -1 is the default value, meaning disabled,", + " values >= 0 will be used for the failover delay.", "OOM", " Crash the server simulating an out-of-memory error.", "PANIC", @@ -638,6 +641,15 @@ void debugCommand(client *c) { } else if (!strcasecmp(objectGetVal(c->argv[1]), "disable-cluster-reconnection") && c->argc == 3) { server.debug_cluster_disable_reconnection = atoi(objectGetVal(c->argv[2])); addReply(c, shared.ok); + } else if (!strcasecmp(objectGetVal(c->argv[1]), "cluster-failover-delay") && c->argc == 3) { + int delay_ms; + if (getIntFromObjectOrReply(c, c->argv[2], &delay_ms, NULL) != C_OK) return; + if (delay_ms < -1) { + addReplyError(c, "delay-ms must be -1 (default) or a non-negative value in ms"); + return; + } + server.debug_cluster_failover_delay = delay_ms; + addReply(c, shared.ok); } else if (!strcasecmp(objectGetVal(c->argv[1]), "slotmigration")) { if (!strcasecmp(objectGetVal(c->argv[2]), "prevent-pause")) { server.debug_slot_migration_prevent_pause = atoi(objectGetVal(c->argv[3])); diff --git a/src/server.c b/src/server.c index 8961d8f20a1..e55f3dd837c 100644 --- a/src/server.c +++ b/src/server.c @@ -2963,6 +2963,7 @@ void initServer(void) { server.cluster_drop_packet_filter = -1; server.debug_cluster_disable_random_ping = 0; server.debug_cluster_disable_reconnection = 0; + server.debug_cluster_failover_delay = -1; server.reply_buffer_peak_reset_time = REPLY_BUFFER_DEFAULT_PEAK_RESET_TIME; server.reply_buffer_resizing_enabled = 1; server.client_mem_usage_buckets = NULL; diff --git a/src/server.h b/src/server.h index 51667c2017f..eb54700780a 100644 --- a/src/server.h +++ b/src/server.h @@ -2325,6 +2325,8 @@ struct valkeyServer { /* Debug config to expose intermediary slot migration states. */ uint32_t debug_slot_migration_prevent_pause : 1; uint32_t debug_slot_migration_prevent_failover : 1; + /* Debug config to override the failover delay (in ms). */ + int debug_cluster_failover_delay; sds cached_cluster_slot_info[CACHE_CONN_TYPE_MAX]; /* Index in array is a bitwise or of CACHE_CONN_TYPE_* */ /* Scripting */ mstime_t busy_reply_threshold; /* Script / module timeout in milliseconds */ diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 5eb8c628552..476d6b7fd40 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -64,16 +64,19 @@ start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval } } ;# start_cluster -start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000}} { - test "Primaries will not time out then they are elected in the same epoch" { +# Needs to run in the body of +# start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000}} +proc test_same_epoch {delay} { + test "Primaries will not time out then they are elected in the same epoch - delay $delay" { # Since we have the delay time, so these node may not initiate the # election at the same time (same epoch). But if they do, we make # sure there is no failover timeout. + R 7 DEBUG CLUSTER-FAILOVER-DELAY $delay + R 8 DEBUG CLUSTER-FAILOVER-DELAY $delay + R 9 DEBUG CLUSTER-FAILOVER-DELAY $delay # Killing there primary nodes. - pause_process [srv 0 pid] - pause_process [srv -1 pid] - pause_process [srv -2 pid] + exec kill -SIGSTOP [srv 0 pid] [srv -1 pid] [srv -2 pid] # Wait for the failover wait_for_condition 1000 50 { @@ -99,6 +102,14 @@ start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval resume_process [srv -1 pid] resume_process [srv -2 pid] } +} + +start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000}} { + test_same_epoch 500 +} ;# start_cluster + +start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000}} { + test_same_epoch 0 } ;# start_cluster run_solo {cluster} { From 89d45a7788e45758693091f98cb65971d5026ff6 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 9 Jun 2026 10:33:13 +0800 Subject: [PATCH 02/11] Update src/debug.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Viktor Söderqvist Signed-off-by: Binbin --- src/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/debug.c b/src/debug.c index 32a9c74402a..97d91387abc 100644 --- a/src/debug.c +++ b/src/debug.c @@ -449,8 +449,8 @@ void debugCommand(client *c) { "DISABLE-CLUSTER-RECONNECTION <0|1>", " Disable cluster reconnection of cluster nodes.", "CLUSTER-FAILOVER-DELAY ", - " Override the failover delay. -1 is the default value, meaning disabled,", - " values >= 0 will be used for the failover delay.", + " Override the failover delay. -1 is the default value, meaning don't", + " override, values >= 0 will be used for the failover delay.", "OOM", " Crash the server simulating an out-of-memory error.", "PANIC", From 6a9ddad39d4bdfa88c3c63235c3086b6bde31339 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 9 Jun 2026 10:49:35 +0800 Subject: [PATCH 03/11] More update from code review Signed-off-by: Binbin --- src/cluster_legacy.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 6d36be5ca26..1994e40b754 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4440,8 +4440,8 @@ int clusterProcessPacket(clusterLink *link) { /* We consider this nack only if the sender is a primary serving * a non zero number of slots, and its currentEpoch is greater or * equal to epoch where this node started the election. */ - if (server.cluster->failover_auth_time && clusterNodeIsVotingPrimary(sender) && - sender_claimed_current_epoch >= server.cluster->failover_auth_epoch) { + if (server.cluster->failover_auth_time && server.cluster->failover_auth_sent && + clusterNodeIsVotingPrimary(sender) && sender_claimed_current_epoch >= server.cluster->failover_auth_epoch) { clusterProcessFailoverAuthNack(sender, msg); } } else if (type == CLUSTERMSG_TYPE_MFSTART) { @@ -5760,8 +5760,8 @@ void clusterHandleReplicaFailover(void) { * elapsed, we can setup a new one. */ if (auth_age > auth_retry_time) { server.cluster->failover_auth_time = now + - delay + /* Fixed delay to let FAIL msg propagate. */ - random() % delay; /* Random delay between 0 and the fixed delay. */ + delay + /* Fixed delay to let FAIL msg propagate. */ + (delay ? random() % delay : 0); /* Random delay between 0 and the fixed delay. */ if (server.debug_cluster_failover_delay >= 0) server.cluster->failover_auth_time = now + delay; server.cluster->failover_auth_count = 0; server.cluster->failover_auth_nack_count = 0; From 60b0ef2be8a12ca90ea198312d8152d92c99dc9a Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 9 Jun 2026 12:28:48 +0800 Subject: [PATCH 04/11] Handle and improve "Replica can update the config epoch when trigger the failover" test Signed-off-by: Binbin --- src/cluster_legacy.c | 5 ++ tests/unit/cluster/failover2.tcl | 91 ++++++++++++++++++++++++++------ 2 files changed, 80 insertions(+), 16 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 1994e40b754..5730801606d 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -5425,6 +5425,11 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { "Node %.40s (%s) has old slots configuration, sending " "an UPDATE message about %.40s (%s)", node->name, humanNodename(node), slot_owner->name, humanNodename(slot_owner)); + /* Send UPDATE first so the replica can fix its slot config; the NACK + * that follows then triggers fast-fail, letting the replica retry + * with the freshly-updated configEpoch right away instead of waiting + * for auth_timeout. TCP ordering on the same link guarantees the + * UPDATE arrives before the NACK. */ clusterSendUpdate(node->link, slot_owner); clusterSendFailoverNack(node, CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_STALE_CONFIG); return; diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 476d6b7fd40..3605511b538 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -145,12 +145,26 @@ run_solo {cluster} { } ;# start_cluster } ;# run_solo +# Setup: R3 is the only replica of R0. While R3 is network-isolated, +# we bump R0's configEpoch and let R1/R2 learn the new value through +# gossip. R3's view is therefore stale. We then pause R0 and let R3 +# attempt failover: the voters reject it with STALE_CONFIG, send back +# both an UPDATE (with R0's fresh slot config) and a NACK, and R3 must +# eventually win a second election with the refreshed configEpoch. +# +# `type` : "automatic" or "manual" failover. +# `drop_nack` : 1 -> drop FAILOVER_AUTH_NACK on R3 to exercise the +# legacy timeout-based retry. +# 0 -> let NACKs through and exercise the fast-fail +# path that retries without waiting for the timeout. +# # Needs to run in the body of # start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} -proc test_replica_config_epoch_failover {type} { - test "Replica can update the config epoch when trigger the failover - $type" { +proc test_replica_config_epoch_failover {type drop_nack} { + test "Replica can update the config epoch when trigger the failover - $type - drop_nack $drop_nack" { set CLUSTER_PACKET_TYPE_NONE -1 set CLUSTER_PACKET_TYPE_ALL -2 + set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_NACK 11 if {$type == "automatic"} { R 3 CONFIG SET cluster-replica-no-failover no @@ -178,24 +192,61 @@ proc test_replica_config_epoch_failover {type} { # Make sure that replica do not update config epoch. assert_not_equal $R0_config_epoch [dict get [cluster_get_node_by_id 3 $R0_nodeid] config_epoch] - # Pause the R 0 and wait for the cluster to be down. + # Pause R0 and resume R3's debug. + # drop_nack=1 keeps NACKs filtered so the legacy timeout path is exercised. + # drop_nack=0 lets NACKs through for the fast-fail path. pause_process [srv 0 pid] - R 3 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE + if {$drop_nack} { + R 3 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_NACK + } else { + R 3 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE + } R 3 DEBUG CLOSE-CLUSTER-LINK-ON-PACKET-DROP 0 + + # Wait for R3 to reconnect to both voters before triggering anything + # that depends on bidirectional traffic, otherwise an immediate failover + # request can race the link rebuild and the NACK reply may be lost. + set R1_nodeid [R 1 cluster myid] + set R2_nodeid [R 2 cluster myid] + set R3_nodeid [R 3 cluster myid] wait_for_condition 1000 50 { - [CI 1 cluster_state] == "fail" && - [CI 2 cluster_state] == "fail" && - [CI 3 cluster_state] == "fail" + [dict get [cluster_get_node_by_id 1 $R3_nodeid] linkstate] eq "connected" && + [dict get [cluster_get_node_by_id 3 $R1_nodeid] linkstate] eq "connected" && + [dict get [cluster_get_node_by_id 2 $R3_nodeid] linkstate] eq "connected" && + [dict get [cluster_get_node_by_id 3 $R2_nodeid] linkstate] eq "connected" } else { - fail "Cluster does not fail" + fail "R3 did not reconnect its bus links to the voters" } - # Make sure both the automatic and the manual failover will fail in the first time. - if {$type == "automatic"} { - wait_for_log_messages -3 {"*Failover attempt expired*"} 0 1200 50 - } elseif {$type == "manual"} { + if {$drop_nack} { + wait_for_condition 1000 50 { + [CI 1 cluster_state] == "fail" && + [CI 2 cluster_state] == "fail" && + [CI 3 cluster_state] == "fail" + } else { + fail "Cluster does not fail" + } + } + + if {$type == "manual"} { R 3 cluster failover force - wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1200 50 + } + + if {$drop_nack} { + # Make sure both the automatic and the manual failover will fail in the first time. + if {$type == "automatic"} { + wait_for_log_messages -3 {"*Failover attempt expired*"} 0 1200 50 + } elseif {$type == "manual"} { + wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1200 50 + } + } else { + # Fast-fail path: NACK accounting trips the quorum check + # and "attempt expired" / "timed out" must never appear. + wait_for_log_messages -3 {"*cannot reach quorum*"} 0 1200 50 + verify_no_log_message -3 "*Failover attempt expired*" 0 + if {$type == "manual"} { + verify_no_log_message -3 "*Manual failover timed out*" 0 + } } # Make sure the primaries prints the relevant logs. @@ -211,7 +262,7 @@ proc test_replica_config_epoch_failover {type} { fail "The replica does not update the config epoch" } - if {$type == "manual"} { + if {$drop_nack && $type == "manual"} { # The second manual failure will succeed because the config epoch # has already propagated. R 3 cluster failover force @@ -239,9 +290,17 @@ proc test_replica_config_epoch_failover {type} { } start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} { - test_replica_config_epoch_failover "automatic" + test_replica_config_epoch_failover "automatic" 1 +} + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} { + test_replica_config_epoch_failover "manual" 1 +} + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} { + test_replica_config_epoch_failover "automatic" 0 } start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} { - test_replica_config_epoch_failover "manual" + test_replica_config_epoch_failover "manual" 0 } From 64fac401c559cf7a2c431bf41e5207f4a5806b7b Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 9 Jun 2026 13:51:27 +0800 Subject: [PATCH 05/11] Fix format Signed-off-by: Binbin --- src/cluster_legacy.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 5730801606d..f07188a6ebe 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -5299,12 +5299,12 @@ void clusterSendFailoverAuth(clusterNode *node) { static const char *clusterNackReasonString(uint8_t reason) { switch (reason) { - case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_NOT_SAFE: return "not-safe"; + case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_NOT_SAFE: return "not-safe"; case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_REQ_EPOCH_OLD: return "req-epoch-old"; case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_ALREADY_VOTED: return "already-voted"; - case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_PRIMARY_UP: return "primary-up"; - case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_STALE_CONFIG: return "stale-config"; - default: return "unknown"; + case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_PRIMARY_UP: return "primary-up"; + case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_STALE_CONFIG: return "stale-config"; + default: return "unknown"; } } From 2c0126535e876b1d8d73ef626eab53aa4d83e25f Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 9 Jun 2026 15:01:03 +0800 Subject: [PATCH 06/11] Fix warning and format Signed-off-by: Binbin --- src/cluster_legacy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index f07188a6ebe..825cf190b27 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -5317,7 +5317,7 @@ void clusterSendFailoverNack(clusterNode *node, uint8_t reason) { clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_NACK, msglen); clusterMsg *hdr = getMessageFromSendBlock(msgblock); - hdr->data.failover_nack.nack.reason = reason; + memcpy(&hdr->data.failover_nack.nack.reason, &reason, sizeof(reason)); clusterSendMessage(node->link, msgblock); clusterMsgSendBlockDecrRefCount(msgblock); @@ -5452,7 +5452,7 @@ void clusterProcessFailoverAuthNack(clusterNode *sender, clusterMsg *request) { server.cluster->failover_auth_nack_count++; serverLog(LL_WARNING, "Failover auth NACK [%s] from %.40s (%s) for epoch %llu (NACKs %d/%d)", - clusterNackReasonString(request->data.failover_nack.nack.reason), sender->name, + clusterNackReasonString(request->data.failover_nack.nack.reason), sender->name, humanNodename(sender), (unsigned long long)server.cluster->failover_auth_epoch, server.cluster->failover_auth_nack_count, server.cluster->size); From dc1bb788e9c5d7f9a17ea19d18f56954f7b3f6f9 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 9 Jun 2026 22:57:01 +0800 Subject: [PATCH 07/11] code review from viktor Signed-off-by: Binbin --- src/cluster_legacy.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 825cf190b27..5f39a2976ab 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4438,10 +4438,12 @@ int clusterProcessPacket(clusterLink *link) { if (!sender) return 1; /* We don't know that node. */ /* We consider this nack only if the sender is a primary serving - * a non zero number of slots, and its currentEpoch is greater or + * a non-zero number of slots, and its currentEpoch is greater or * equal to epoch where this node started the election. */ - if (server.cluster->failover_auth_time && server.cluster->failover_auth_sent && - clusterNodeIsVotingPrimary(sender) && sender_claimed_current_epoch >= server.cluster->failover_auth_epoch) { + if (server.cluster->failover_auth_time && + server.cluster->failover_auth_sent && + clusterNodeIsVotingPrimary(sender) && + sender_claimed_current_epoch >= server.cluster->failover_auth_epoch) { clusterProcessFailoverAuthNack(sender, msg); } } else if (type == CLUSTERMSG_TYPE_MFSTART) { @@ -5450,7 +5452,7 @@ void clusterProcessFailoverAuthNack(clusterNode *sender, clusterMsg *request) { * for a specific cause; the trailing NACKs k/N gives at-a-glance * progress towards the fast-fail threshold. */ server.cluster->failover_auth_nack_count++; - serverLog(LL_WARNING, + serverLog(LL_NOTICE, "Failover auth NACK [%s] from %.40s (%s) for epoch %llu (NACKs %d/%d)", clusterNackReasonString(request->data.failover_nack.nack.reason), sender->name, humanNodename(sender), (unsigned long long)server.cluster->failover_auth_epoch, @@ -5463,7 +5465,7 @@ void clusterProcessFailoverAuthNack(clusterNode *sender, clusterMsg *request) { int needed_quorum = (server.cluster->size / 2) + 1; int max_possible_acks = server.cluster->size - server.cluster->failover_auth_nack_count; if (max_possible_acks < needed_quorum) { - serverLog(LL_WARNING, + serverLog(LL_NOTICE, "Failover election for epoch %llu cannot reach quorum %d (NACKs %d/%d). " "Resetting the election since we cannot win an election without quorum.", (unsigned long long)server.cluster->failover_auth_epoch, needed_quorum, From 315e1416e0015f984fdcde8bbac65b8746bcfc09 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 10 Jun 2026 12:20:48 +0800 Subject: [PATCH 08/11] More reasonse and fix the test Signed-off-by: Binbin --- src/cluster_legacy.c | 6 +++++- src/cluster_legacy.h | 12 +++++++----- tests/support/cluster_util.tcl | 14 ++++++++++++++ tests/unit/cluster/failover2.tcl | 19 +++++++++++++++++-- 4 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 04c920a37f1..49fc3e7e82c 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -5306,6 +5306,8 @@ static const char *clusterNackReasonString(uint8_t reason) { case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_NOT_SAFE: return "not-safe"; case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_REQ_EPOCH_OLD: return "req-epoch-old"; case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_ALREADY_VOTED: return "already-voted"; + case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_REQ_IS_PRIMARY: return "req-is-primary"; + case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_NO_PRIMARY: return "no-primary"; case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_PRIMARY_UP: return "primary-up"; case CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_STALE_CONFIG: return "stale-config"; default: return "unknown"; @@ -5388,14 +5390,16 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { if (clusterNodeIsPrimary(node)) { serverLog(LL_WARNING, "Failover auth denied to %.40s (%s) for epoch %llu: it is a primary node", node->name, humanNodename(node), (unsigned long long)requestCurrentEpoch); + clusterSendFailoverNack(node, CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_REQ_IS_PRIMARY); } else if (primary == NULL) { serverLog(LL_WARNING, "Failover auth denied to %.40s (%s) for epoch %llu: I don't know its primary", node->name, humanNodename(node), (unsigned long long)requestCurrentEpoch); + clusterSendFailoverNack(node, CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_NO_PRIMARY); } else if (!nodeFailed(primary)) { serverLog(LL_WARNING, "Failover auth denied to %.40s (%s) for epoch %llu: its primary is up", node->name, humanNodename(node), (unsigned long long)requestCurrentEpoch); + clusterSendFailoverNack(node, CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_PRIMARY_UP); } - clusterSendFailoverNack(node, CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_PRIMARY_UP); return; } diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 0461979987e..90195ad8fcf 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -350,11 +350,13 @@ static_assert(offsetof(clusterMsg, data) == 2256, "unexpected field offset"); #define CLUSTERMSG_FLAG0_EXT_DATA (1 << 2) /* Message contains extension data */ /* Reason values carried in clusterMsgDataFailoverNack.reason. */ -#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_NOT_SAFE 1 /* Voter is not safe to vote yet. */ -#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_REQ_EPOCH_OLD 2 /* Request epoch < voter's currentEpoch. */ -#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_ALREADY_VOTED 3 /* Voter already voted in this epoch. */ -#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_PRIMARY_UP 4 /* Replica's primary is not failed. */ -#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_STALE_CONFIG 5 /* Replica's slot config is stale. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_NOT_SAFE 1 /* Voter is not safe to vote yet. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_REQ_EPOCH_OLD 2 /* Request epoch < voter's currentEpoch. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_ALREADY_VOTED 3 /* Voter already voted in this epoch. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_REQ_IS_PRIMARY 4 /* Requester is a primary itself. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_NO_PRIMARY 5 /* Voter doesn't know it's primary. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_PRIMARY_UP 6 /* Voter still sees it's primary up. */ +#define CLUSTERMSG_FAILOVER_AUTH_NACK_REASON_STALE_CONFIG 7 /* Replica's slot config is stale. */ typedef struct { char sig[4]; /* Signature "RCmb" (Cluster message bus). */ diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index ec1f91fdf0f..f4beb406549 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -292,6 +292,20 @@ proc cluster_has_flag {node flag} { expr {[lsearch -exact [dict get $node flags] $flag] != -1} } +# Returns 1 only when every server instance in `srv_idxs` sees every +# node id in `node_ids` carrying `flag` in its CLUSTER NODES output. +proc cluster_all_see_flag {srv_idxs node_ids flag} { + foreach idx $srv_idxs { + foreach id $node_ids { + set node [cluster_get_node_by_id $idx $id] + if {![cluster_has_flag $node $flag]} { + return 0 + } + } + } + return 1 +} + # Returns the parsed "myself" node entry as a dictionary. proc cluster_get_myself id { set nodes [get_cluster_nodes $id] diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 3605511b538..b53daea632e 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -76,8 +76,23 @@ proc test_same_epoch {delay} { R 9 DEBUG CLUSTER-FAILOVER-DELAY $delay # Killing there primary nodes. + set primary_ids [list [R 0 cluster myid] [R 1 cluster myid] [R 2 cluster myid]] exec kill -SIGSTOP [srv 0 pid] [srv -1 pid] [srv -2 pid] + # Wait until every voter (idx 3..6) sees all three paused primaries + # as fail, so the upcoming election is granted on the first round. + # Otherwise voter might reply with NACK primary-up. + wait_for_condition 1000 50 { + [cluster_all_see_flag {3 4 5 6} $primary_ids fail] + } else { + fail "Voters did not mark all paused primaries as fail" + } + + # Now let the replicas proceed with the election. + R 7 CONFIG SET cluster-replica-no-failover no + R 8 CONFIG SET cluster-replica-no-failover no + R 9 CONFIG SET cluster-replica-no-failover no + # Wait for the failover wait_for_condition 1000 50 { [s -7 role] == "master" && @@ -104,11 +119,11 @@ proc test_same_epoch {delay} { } } -start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000}} { +start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-replica-no-failover yes}} { test_same_epoch 500 } ;# start_cluster -start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000}} { +start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-replica-no-failover yes}} { test_same_epoch 0 } ;# start_cluster From a468af96cf58226bbd37bec517b2e1864a175e13 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 17 Jun 2026 13:44:45 +0800 Subject: [PATCH 09/11] Remove useless notused1 Signed-off-by: Binbin --- src/cluster_legacy.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 90195ad8fcf..98cf5622ee3 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -146,7 +146,6 @@ typedef struct { typedef struct { uint8_t reason; - char notused1[24]; } clusterMsgDataFailoverNack; typedef struct { From 573ece4753dbff1c2130c234f55aefdf308ff33c Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 17 Jun 2026 18:44:37 +0800 Subject: [PATCH 10/11] Remove tmp file adding by mistake Signed-off-by: Binbin --- .../redis62-valkey90-command-compat.md | 85 ------------------- 1 file changed, 85 deletions(-) delete mode 100644 design-docs/redis62-valkey90-command-compat.md diff --git a/design-docs/redis62-valkey90-command-compat.md b/design-docs/redis62-valkey90-command-compat.md deleted file mode 100644 index 65c7309b8fe..00000000000 --- a/design-docs/redis62-valkey90-command-compat.md +++ /dev/null @@ -1,85 +0,0 @@ -# Redis 6.2 与 Valkey 9.0 命令兼容性整理 - -> 本文基于当前仓库 `src/commands/*.json` 的实际命令定义整理。Valkey 由 Redis 7.2.4 分叉而来,因此在命令层面**完全向后兼容 Redis 6.2**,并在其基础上新增了大量命令与子命令。 - -## 一、总体结论 - -- **向后兼容**:Redis 6.2 中的全部命令在 Valkey 9.0 中均可用,行为与返回值保持一致,无需修改客户端代码即可平滑迁移。 -- **协议兼容**:同时支持 RESP2 与 RESP3(`HELLO` 协议协商),与 Redis 6.2(默认 RESP2、可选 RESP3)一致。 -- **超集关系**:Valkey 9.0 是 Redis 6.2 命令集的超集,新增命令主要来自 Redis 7.x 时代特性,以及 Valkey 独立演进(8.0/8.1/9.0)的特性。 -- **迁移注意**:从 Redis 6.2 迁移到 Valkey 9.0 时,命令兼容性不是障碍,需要关注的是 RDB/AOF 文件版本、配置项变化以及部分集群运维命令的差异(详见第四节)。 - -## 二、Redis 6.2 命令在 Valkey 9.0 中的状态 - -下列 Redis 6.2 的代表性命令在 Valkey 9.0 中均原样保留: - -| 类别 | 命令(节选) | -| --- | --- | -| 字符串 | `GET`、`SET`、`GETDEL`、`GETEX`、`SETRANGE`、`GETRANGE`、`APPEND`、`STRLEN`、`INCR`、`INCRBYFLOAT`、`MSET`、`MGET`、`LCS` | -| 哈希 | `HSET`、`HGET`、`HDEL`、`HRANDFIELD`、`HSCAN`、`HMGET`、`HGETALL` | -| 列表 | `LPUSH`、`RPUSH`、`LPOP`、`RPOP`、`LMOVE`、`BLMOVE`、`LPOS`、`LRANGE`、`LINSERT` | -| 集合 | `SADD`、`SREM`、`SMEMBERS`、`SMISMEMBER`、`SINTERSTORE`、`SRANDMEMBER`、`SSCAN` | -| 有序集合 | `ZADD`、`ZRANGE`、`ZRANGESTORE`、`ZDIFF`、`ZUNION`、`ZRANDMEMBER`、`ZPOPMIN`、`BZPOPMAX` | -| 通用键 | `COPY`、`OBJECT`、`EXPIRE`、`PERSIST`、`TTL`、`SCAN`、`TYPE`、`RENAME`、`TOUCH`、`UNLINK` | -| 位图 | `SETBIT`、`GETBIT`、`BITCOUNT`、`BITPOS`、`BITFIELD`、`BITOP` | -| HyperLogLog | `PFADD`、`PFCOUNT`、`PFMERGE` | -| 地理位置 | `GEOADD`、`GEOSEARCH`、`GEOSEARCHSTORE`、`GEODIST`、`GEORADIUS`(已废弃但保留) | -| Stream | `XADD`、`XREAD`、`XRANGE`、`XAUTOCLAIM`、`XACK`、`XGROUP`、`XINFO` | -| 发布订阅 | `PUBLISH`、`SUBSCRIBE`、`PSUBSCRIBE`、`PUBSUB` | -| 脚本 | `EVAL`、`EVALSHA`、`EVAL_RO`、`SCRIPT`、`SUBSCRIBE` | -| 事务 | `MULTI`、`EXEC`、`DISCARD`、`WATCH`、`UNWATCH` | -| 连接/管理 | `HELLO`、`AUTH`、`CLIENT`、`ACL`、`CONFIG`、`SLOWLOG`、`LATENCY`、`MEMORY`、`INFO` | -| 复制/集群 | `REPLICAOF`、`WAIT`、`FAILOVER`、`CLUSTER`、`MIGRATE`、`DUMP`、`RESTORE` | - -> 说明:Redis 6.2 引入的 `SLAVEOF`→`REPLICAOF` 别名、`SMISMEMBER`、`GETDEL`、`GETEX`、`COPY`、`ZRANGESTORE`、`LMPOP` 前身能力等均已包含。 - -## 三、Valkey 9.0 相比 Redis 6.2 新增的命令 - -以下命令在 Redis 6.2 中**不存在**,是 Valkey 9.0 相对 6.2 的增量能力,按来源分组。 - -### 1. 来自 Redis 7.0 时代(Valkey 继承) - -- 列表/有序集合多键弹出:`LMPOP`、`BLMPOP`、`ZMPOP`、`BZMPOP` -- 集合/有序集合交集基数:`SINTERCARD`、`ZINTERCARD` -- 过期时间戳查询:`EXPIRETIME`、`PEXPIRETIME` -- 函数引擎:`FUNCTION`(`LOAD`/`DELETE`/`FLUSH`/`LIST`/`DUMP`/`RESTORE`/`STATS`/`KILL`)、`FCALL`、`FCALL_RO` -- 分片发布订阅:`SPUBLISH`、`SSUBSCRIBE`、`SUNSUBSCRIBE`、`PUBSUB SHARDCHANNELS`、`PUBSUB SHARDNUMSUB` -- 集群可观测:`CLUSTER SHARDS`、`CLUSTER LINKS` -- 连接管理:`CLIENT NO-EVICT` -- 命令自省增强:`COMMAND DOCS`、`COMMAND LIST`、`COMMAND GETKEYSANDFLAGS` - -### 2. 来自 Redis 7.2 时代(Valkey 继承) - -- AOF 持久化等待:`WAITAOF` -- 连接管理:`CLIENT NO-TOUCH`、`CLIENT SETINFO` -- 集群分片标识:`CLUSTER MYSHARDID` - -### 3. 哈希字段 TTL(与 Redis 7.4 能力重叠,Valkey 独立实现) - -- `HEXPIRE`、`HPEXPIRE`、`HEXPIREAT`、`HPEXPIREAT` -- `HTTL`、`HPTTL`、`HEXPIRETIME`、`HPEXPIRETIME` -- `HPERSIST` - -### 4. Valkey 专属新增(8.0/8.1/9.0) - -- 哈希增强:`HGETEX`、`HGETDEL`、`HSETEX` -- 字符串/批量写:`MSETEX`、`DELIFEQ`(条件删除,9.0 新增) -- 脚本自省:`SCRIPT SHOW` -- 命令日志(替代/扩展 SLOWLOG 语义,二者并存):`COMMANDLOG`(`GET`/`LEN`/`RESET`/`HELP`) -- 连接能力协商:`CLIENT CAPA`、`CLIENT IMPORT-SOURCE` -- 集群槽位统计:`CLUSTER SLOT-STATS` -- 集群原子化槽迁移(9.0 新增):`CLUSTER MIGRATESLOTS`、`CLUSTER SYNCSLOTS`、`CLUSTER GETSLOTMIGRATIONS`、`CLUSTER CANCELSLOTMIGRATIONS`、`CLUSTER FLUSHSLOT`、`CLUSTER FLUSHSLOTS` -- 集群键扫描:`CLUSTERSCAN` - -## 四、迁移与兼容性注意事项 - -1. **命令层面无破坏性变更**:Redis 6.2 客户端可直接连接 Valkey 9.0,命令调用不会因为兼容性问题失败。 -2. **数据文件版本**:Valkey 9.0 的 RDB/AOF 版本高于 Redis 6.2,**向上兼容(旧 → 新可加载)**,但反向不可(不要用 Redis 6.2 加载 Valkey 9.0 产生的文件)。 -3. **配置项差异**:部分 Redis 7.x/Valkey 新增配置项在 6.2 中不存在;从 6.2 沿用的旧 `redis.conf` 一般可直接被 Valkey 读取(保留 `redis-*` 兼容别名)。 -4. **`SLOWLOG` 与 `COMMANDLOG` 并存**:`SLOWLOG` 仍可用,新功能建议关注 `COMMANDLOG`。 -5. **已废弃命令仍保留**:如 `GEORADIUS`、`GEORADIUSBYMEMBER`、`SLAVEOF`、`SUBSTR` 等在两边都保留,行为一致。 -6. **许可证差异(非命令兼容性,但需留意)**:Redis 6.2 为 BSD,Valkey 9.0 同样维持开源 BSD-3 协议;这与 Redis 7.4+ 改用的 RSALv2/SSPL 不同。 - -## 五、一句话总结 - -> 从 Redis 6.2 升级到 Valkey 9.0,**命令完全兼容、协议完全兼容**,可视为「Redis 6.2 命令集 + Redis 7.x 全部增量 + Valkey 8.0/8.1/9.0 专属增强」的超集;迁移工作重点不在命令本身,而在持久化文件方向性和少量运维命令的新用法。 From d730a437784b987c45577a6645f4f8d477f47a2b Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 18 Jun 2026 14:04:50 +0800 Subject: [PATCH 11/11] Also counting fail_size, let's see what will happen Signed-off-by: Binbin --- src/cluster_legacy.c | 45 ++++++++++++++++++++------------ src/cluster_legacy.h | 1 + src/debug.c | 14 ++++++++++ src/server.c | 1 + src/server.h | 3 +++ tests/unit/cluster/failover2.tcl | 17 ++++++++++++ 6 files changed, 65 insertions(+), 16 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 49fc3e7e82c..c5fc876201c 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1451,6 +1451,7 @@ void clusterInit(void) { server.cluster->fail_reason = CLUSTER_FAIL_NONE; server.cluster->safe_to_join = 0; server.cluster->size = 0; + server.cluster->size_fail = 0; server.cluster->todo_before_sleep = 0; server.cluster->nodes = dictCreate(&clusterNodesDictType); server.cluster->shards = dictCreate(&clusterSdsToListType); @@ -4430,6 +4431,9 @@ int clusterProcessPacket(clusterLink *link) { * equal to epoch where this node started the election. */ if (clusterNodeIsVotingPrimary(sender) && sender_claimed_current_epoch >= server.cluster->failover_auth_epoch) { server.cluster->failover_auth_count++; + serverLog(LL_NOTICE, "Failover auth ACK from %.40s (%s) for epoch %llu (ACKs %d, quorum %d)", + sender->name, humanNodename(sender), (unsigned long long)server.cluster->failover_auth_epoch, + server.cluster->failover_auth_count, (server.cluster->size / 2) + 1); /* Maybe we reached a quorum here, set a flag to make sure * we check ASAP. */ clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); @@ -5454,28 +5458,25 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* Handle a FAILOVER_AUTH_NACK from a voter. */ void clusterProcessFailoverAuthNack(clusterNode *sender, clusterMsg *request) { - /* The reason is bracketed up front so operators can quickly grep - * for a specific cause; the trailing NACKs k/N gives at-a-glance - * progress towards the fast-fail threshold. */ server.cluster->failover_auth_nack_count++; - serverLog(LL_NOTICE, - "Failover auth NACK [%s] from %.40s (%s) for epoch %llu (NACKs %d/%d)", - clusterNackReasonString(request->data.failover_nack.nack.reason), sender->name, - humanNodename(sender), (unsigned long long)server.cluster->failover_auth_epoch, - server.cluster->failover_auth_nack_count, server.cluster->size); - /* A voter that NACKed us in this epoch will not change its mind, so - * the upper bound on the votes we can ever collect is the voters that - * have not (yet) NACKed. Fast-fail once that bound drops below the - * quorum we need to win. */ + /* A voter that NACKed us in this epoch will not change its mind, so the + * upper bound on the votes we can still collect is the voters that have + * not NACKed, minus FAIL voters that will never reply (they count towards + * size but neither ACK nor NACK). Fast-fail once that bound drops below + * the quorum we need to win.. */ int needed_quorum = (server.cluster->size / 2) + 1; - int max_possible_acks = server.cluster->size - server.cluster->failover_auth_nack_count; + int max_possible_acks = server.cluster->size - server.cluster->size_fail - server.cluster->failover_auth_nack_count; + serverLog(LL_NOTICE, "Failover auth NACK [%s] from %.40s (%s) for epoch %llu (NACKs %d, quorum %d)", + clusterNackReasonString(request->data.failover_nack.nack.reason), sender->name, + humanNodename(sender), (unsigned long long)server.cluster->failover_auth_epoch, + server.cluster->failover_auth_nack_count, needed_quorum); if (max_possible_acks < needed_quorum) { serverLog(LL_NOTICE, - "Failover election for epoch %llu cannot reach quorum %d (NACKs %d/%d). " + "Failover election for epoch %llu cannot reach quorum %d (NACKs %d, dead voters %d). " "Resetting the election since we cannot win an election without quorum.", (unsigned long long)server.cluster->failover_auth_epoch, needed_quorum, - server.cluster->failover_auth_nack_count, server.cluster->size); + server.cluster->failover_auth_nack_count, server.cluster->size_fail); server.cluster->failover_auth_time = 0; /* Maybe we could start a new election, set a flag here to make sure * we check as soon as possible, instead of waiting for a cron. */ @@ -5887,7 +5888,17 @@ void clusterHandleReplicaFailover(void) { /* Ask for votes if needed. */ if (server.cluster->failover_auth_sent == 0) { - server.cluster->currentEpoch++; + if (server.debug_cluster_failover_epoch >= 0) { + /* Testing only: force this election to run in a specific epoch so + * that several replicas can be made to contend in the very same + * epoch, deterministically reproducing a split vote. Consumed + * once; subsequent retries fall back to the normal currentEpoch++ + * so the replicas can eventually win in distinct epochs. */ + server.cluster->currentEpoch = server.debug_cluster_failover_epoch; + server.debug_cluster_failover_epoch = -1; + } else { + server.cluster->currentEpoch++; + } server.cluster->failover_auth_epoch = server.cluster->currentEpoch; serverLog(LL_NOTICE, "Starting a failover election for epoch %llu, node config epoch is %llu", (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself)); @@ -6754,12 +6765,14 @@ void clusterUpdateState(void) { dictEntry *de; server.cluster->size = 0; + server.cluster->size_fail = 0; di = dictGetSafeIterator(server.cluster->nodes); while ((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); if (clusterNodeIsVotingPrimary(node)) { server.cluster->size++; + if (node->flags & CLUSTER_NODE_FAIL) server.cluster->size_fail++; if ((node->flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) == 0) reachable_primaries++; } } diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 98cf5622ee3..ccbb3a48adc 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -457,6 +457,7 @@ struct clusterState { int fail_reason; /* Why the cluster state changes to fail. */ int safe_to_join; /* Can the restarted node safely join the cluster? */ int size; /* Num of primary nodes with at least one slot */ + int size_fail; /* Num of voting primaries currently in FAIL state (subset of size). */ dict *nodes; /* Hash table of name -> clusterNode structures */ dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */ diff --git a/src/debug.c b/src/debug.c index a4ccc635f3a..1c6afad82f2 100644 --- a/src/debug.c +++ b/src/debug.c @@ -451,6 +451,11 @@ void debugCommand(client *c) { "CLUSTER-FAILOVER-DELAY ", " Override the failover delay. -1 is the default value, meaning don't", " override, values >= 0 will be used for the failover delay.", + "CLUSTER-FAILOVER-EPOCH ", + " Force the next failover election started by this replica to run in", + " the given epoch instead of currentEpoch+1. -1 (default) disables the", + " override. It is consumed once: subsequent retries use currentEpoch+1", + " again. Useful to make several replicas contend in the same epoch.", "OOM", " Crash the server simulating an out-of-memory error.", "PANIC", @@ -650,6 +655,15 @@ void debugCommand(client *c) { } server.debug_cluster_failover_delay = delay_ms; addReply(c, shared.ok); + } else if (!strcasecmp(objectGetVal(c->argv[1]), "cluster-failover-epoch") && c->argc == 3) { + long long epoch; + if (getLongLongFromObjectOrReply(c, c->argv[2], &epoch, NULL) != C_OK) return; + if (epoch < -1) { + addReplyError(c, "epoch must be -1 (default) or a non-negative value"); + return; + } + server.debug_cluster_failover_epoch = epoch; + addReply(c, shared.ok); } else if (!strcasecmp(objectGetVal(c->argv[1]), "slotmigration")) { if (!strcasecmp(objectGetVal(c->argv[2]), "prevent-pause")) { server.debug_slot_migration_prevent_pause = atoi(objectGetVal(c->argv[3])); diff --git a/src/server.c b/src/server.c index 0fddedde809..e16b235a42e 100644 --- a/src/server.c +++ b/src/server.c @@ -2964,6 +2964,7 @@ void initServer(void) { server.debug_cluster_disable_random_ping = 0; server.debug_cluster_disable_reconnection = 0; server.debug_cluster_failover_delay = -1; + server.debug_cluster_failover_epoch = -1; server.reply_buffer_peak_reset_time = REPLY_BUFFER_DEFAULT_PEAK_RESET_TIME; server.reply_buffer_resizing_enabled = 1; server.client_mem_usage_buckets = NULL; diff --git a/src/server.h b/src/server.h index 190abfb75fa..a323b011154 100644 --- a/src/server.h +++ b/src/server.h @@ -2319,6 +2319,9 @@ struct valkeyServer { uint32_t debug_slot_migration_prevent_failover : 1; /* Debug config to override the failover delay (in ms). */ int debug_cluster_failover_delay; + /* Debug config to force the next failover election to run in a specific + * epoch (testing only). -1 means don't override; consumed once. */ + long long debug_cluster_failover_epoch; sds cached_cluster_slot_info[CACHE_CONN_TYPE_MAX]; /* Index in array is a bitwise or of CACHE_CONN_TYPE_* */ /* Scripting */ mstime_t busy_reply_threshold; /* Script / module timeout in milliseconds */ diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 3a6875bfb6a..72e5aa05bc0 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -88,11 +88,28 @@ proc test_same_epoch {delay} { fail "Voters did not mark all paused primaries as fail" } + # Force the three replicas to start their election in the very same + # epoch, so the same-epoch split vote is reproduced deterministically + # rather than depending on the timing of the (possibly zero) delay. + if {$delay == 0} { + set epoch [expr [CI 3 cluster_current_epoch] + 1] + R 7 DEBUG CLUSTER-FAILOVER-EPOCH $epoch + R 8 DEBUG CLUSTER-FAILOVER-EPOCH $epoch + R 9 DEBUG CLUSTER-FAILOVER-EPOCH $epoch + } + # Now let the replicas proceed with the election. R 7 CONFIG SET cluster-replica-no-failover no R 8 CONFIG SET cluster-replica-no-failover no R 9 CONFIG SET cluster-replica-no-failover no + # All three must have contended in the very same (forced) epoch. + if {$delay == 0} { + wait_for_log_messages -7 [list "*Starting a failover election for epoch $epoch*"] 0 1000 50 + wait_for_log_messages -8 [list "*Starting a failover election for epoch $epoch*"] 0 1000 50 + wait_for_log_messages -9 [list "*Starting a failover election for epoch $epoch*"] 0 1000 50 + } + # Wait for the failover wait_for_condition 1000 50 { [s -7 role] == "master" &&