From 4fcef86ebfd6fea2680568e0a774b76d8123ba1b Mon Sep 17 00:00:00 2001 From: sushil paneru Date: Mon, 1 Jun 2026 19:37:34 +0000 Subject: [PATCH 1/9] Cluster bus v2 - Added shard level epoch Signed-off-by: Sushil Paneru --- design-docs/cluster-raft.md | 73 +++-- src/cluster.c | 1 + src/cluster_legacy.c | 1 + src/cluster_raft.c | 367 +++++++++++++++++++--- src/cluster_state.c | 51 +++ src/cluster_state.h | 5 + tests/unit/cluster/cluster-raft-proto.tcl | 123 ++++++++ 7 files changed, 558 insertions(+), 63 deletions(-) diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md index 3794f12cdf1..d389142ac84 100644 --- a/design-docs/cluster-raft.md +++ b/design-docs/cluster-raft.md @@ -723,21 +723,29 @@ reuse), but the vars and log lines are raft-specific. The file is not compatible between protocols — switching from gossip to raft (or vice versa) requires removing nodes.conf. -## Shard Epoch (not yet implemented) - -A shard-epoch is a per-shard monotonically increasing counter, bumped -on topology changes within the shard (FAILOVER, SET_REPLICA_OF, -SLOT_CHANGE). Entries that modify shard topology include the current -shard-epoch at proposal time. On apply, if the shard-epoch has -advanced, the entry is stale and becomes a no-op. - -This prevents stale entries from causing inconsistencies when -concurrent operations race in the log. Example: +## Shard Epoch + +Raft ensures entries are applied in a total order, but ordering alone +is not sufficient to prevent stale mutations from corrupting cluster +state. When concurrent operations target the same shard (e.g., a slot +migration racing with a failover), a committed entry may carry +assumptions about shard topology that are no longer true by the time +it is applied. Without additional application-level state to fence +against these stale updates, the apply logic can produce +inconsistencies — such as moving a slot to a node that no longer owns +the corresponding keys. + +A shard-epoch is a per-shard monotonically increasing counter stored +in `server.cluster->shard_epochs`. It is bumped each time a topology +change is applied to the shard. Entries that modify shard topology +include the shard's current epoch at proposal time. On apply, if the +epoch has advanced past the value in the entry, the entry is stale +and becomes a no-op. + +### Example: slot migration racing with failover ``` -Slot migration racing with failover: - -1. Atomic slot migration starts: keys transferred from shard A to B. +1. Slot migration starts: keys transferred from shard A to shard B. 2. Primary of shard A fails. FAILOVER entry is proposed. 3. Migration is rolled back (keys stay on shard A's new primary). 4. SLOT_CHANGE entry (assigning slot to shard B) was proposed before @@ -748,14 +756,39 @@ Slot migration racing with failover: carries the old epoch, so it's a no-op. Slot stays on shard A. ``` -Entries that should carry a shard-epoch: -- FAILOVER (bumps epoch of the shard) -- SET_REPLICA_OF (bumps epoch when changing shard membership) -- SLOT_CHANGE (checked against source and target shard epochs) +### Entry formats with epoch + +``` +FAILOVER +SET_REPLICA_OF +SLOT_CHANGE +NODE_FORGET +``` + +SLOT_CHANGE carries two epochs because it involves two shards (source +and target). NODE_FORGET carries the epoch of the departing node's +shard to guard against removing a node whose role changed (e.g., +promoted to primary via a concurrent FAILOVER). + +### Validation + +Epoch validation happens at two points: + +1. **Pre-validation on the leader** — before appending to the log. + This is a best-effort optimization that rejects obviously stale + proposals early, saving log space and replication bandwidth. It + performs a read-only check without bumping the epoch. + +2. **Apply-time validation** — the authoritative check. Each apply + function validates the entry's epoch against the current shard + epoch. On match (or epoch 0 for a new shard), the epoch is bumped + and the entry is applied. On mismatch, the entry is a no-op and + the error is propagated to the caller's callback. + +### Entries that don't carry an epoch -Entries that don't need a shard-epoch: -- NODE_FAIL / NODE_RECOVER (liveness, not topology) -- NODE_INFO / NODE_JOIN / NODE_FORGET (node-level, not shard-level) +- NODE_FAIL / NODE_RECOVER +- NODE_INFO, NODE_JOIN ## Leader Transfer diff --git a/src/cluster.c b/src/cluster.c index 3a7907fa738..ffc614b70ac 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -70,6 +70,7 @@ void clusterInit(void) { server.cluster->size = 0; server.cluster->nodes = dictCreate(&clusterNodesDictType); server.cluster->shards = dictCreate(&clusterSdsToListType); + server.cluster->shard_epochs = dictCreate(&clusterShardEpochDictType); server.cluster->migrating_slots_to = dictCreate(&clusterSlotDictType); server.cluster->importing_slots_from = dictCreate(&clusterSlotDictType); server.cluster->stat_cluster_links_buffer_limit_exceeded = 0; diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index bb62df1d35f..65b46670389 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -735,6 +735,7 @@ static void clusterLegacyReset(int hard) { /* Recreate shards dict */ dictEmpty(server.cluster->shards, NULL); + dictEmpty(server.cluster->shard_epochs, NULL); /* Forget all the nodes, but myself. */ di = dictGetSafeIterator(server.cluster->nodes); diff --git a/src/cluster_raft.c b/src/cluster_raft.c index 37bbf28d401..30831a540c5 100644 --- a/src/cluster_raft.c +++ b/src/cluster_raft.c @@ -308,6 +308,7 @@ static clusterMsgSendBlock *clusterRaftBuildAllOffsetsMsg(void) { static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, const char *error)); static void clusterRaftDeferPendingProposals(void); +static int clusterRaftPreValidateEpoch(int type, sds data); static void clusterRaftUpdateMyself(int old_flags); static sds clusterRaftBuildMyNodeInfo(void); static void clusterRaftCheckSlotCoverage(void); @@ -315,9 +316,9 @@ static void clusterRaftBroadcastAppendEntries(void); static void clusterRaftSendAppendEntries(clusterLink *link, clusterNode *node); static void clusterRaftSendPreVoteRequest(clusterLink *link, uint64_t term); static void clusterRaftUnblockMeet(clusterNode *node); -static void clusterRaftApplySlotChange(sds data); -static void clusterRaftApplySetReplica(sds data); -static void clusterRaftApplyFailover(sds data); +static const char *clusterRaftApplySlotChange(sds data); +static const char *clusterRaftApplySetReplica(sds data); +static const char *clusterRaftApplyFailover(sds data); static void raftLogApply(raftLogEntry *e); static raftLogEntry *raftLogCreate(uint64_t term, uint64_t index, uint8_t type, sds data); static void raftLogAppend(raftLogEntry *e); @@ -766,6 +767,15 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, sds data = sp ? sdsnew(sp + 1) : sdsempty(); + if (rs->role == RAFT_ROLE_LEADER) { + /* Pre-validate epoch on the leader to reject obviously stale proposals. */ + if (!clusterRaftPreValidateEpoch(type, data)) { + if (callback) callback(ctx, "stale shard epoch"); + sdsfree(data); + return; + } + } + /* Track pending proposal for retry on leader change. */ { raftPendingProposal *pp = zmalloc(sizeof(*pp)); @@ -786,7 +796,6 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, memcmp(data, myself->name, CLUSTER_NAMELEN) == 0)) { clusterRaftSelfJoin(); } - uint64_t idx = raftLogLastIndex() + 1; raftLogAppend(raftLogCreate(rs->current_term, idx, type, data)); serverLog(LL_NOTICE, "Leader appended %s (index %llu).", @@ -815,6 +824,134 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, } } +static int isShardEpochCurrent(const char *shard_id, uint64_t entry_epoch) { + uint64_t current = clusterGetShardEpoch(shard_id); + return (current == 0 || entry_epoch == current); +} + +/* Parsed fields from a SLOT_CHANGE entry's epoch-related data. + * Format: " " */ +typedef struct { + clusterNode *target; /* Target node (NULL if dash). */ + clusterNode *source_owner; /* Current owner of the first slot in the range. */ + const char *source_shard_id; /* source_owner->shard_id or NULL. */ + const char *target_shard_id; /* target->shard_id or NULL. */ + uint64_t source_epoch; + uint64_t target_epoch; + int range_end; /* Index in argv where ranges end (argc - 2). */ + int valid; /* 1 if parsing succeeded, 0 otherwise. */ +} slotChangeEpochInfo; + +/* Parse epoch-related fields from a SLOT_CHANGE entry's split argv. + * Caller must have split data and verified argc >= 4. */ +static slotChangeEpochInfo parseSlotChangeEpochs(sds *argv, int argc) { + slotChangeEpochInfo info = {0}; + if (argc < 4) return info; + + info.target = (sdslen(argv[0]) == CLUSTER_NAMELEN) + ? clusterLookupNode(argv[0], CLUSTER_NAMELEN) + : NULL; + info.source_epoch = strtoull(argv[argc - 2], NULL, 10); + info.target_epoch = strtoull(argv[argc - 1], NULL, 10); + info.range_end = argc - 2; + + /* Derive source shard from current owner of the first slot in the range. */ + int first_start; + char *dash = strchr(argv[1], '-'); + if (dash) { + *dash = '\0'; + first_start = atoi(argv[1]); + *dash = '-'; + } else { + first_start = atoi(argv[1]); + } + info.source_owner = (first_start >= 0 && first_start < CLUSTER_SLOTS) + ? server.cluster->slots[first_start] + : NULL; + info.source_shard_id = info.source_owner ? info.source_owner->shard_id : NULL; + info.target_shard_id = info.target ? info.target->shard_id : NULL; + info.valid = 1; + return info; +} + +/* Pre-validate shard epoch on the leader before appending to the log. + * This is an optimization: reject obviously stale proposals early to avoid + * wasting log space and replication bandwidth. + * Returns 1 if the entry should be appended, 0 if stale. */ +/* Returns 0 if the entry_epoch is stale relative to the shard's current epoch. + * Returns 1 if valid (epoch matches or shard is at epoch 0). */ +static int clusterRaftPreValidateEpoch(int type, sds data) { + int argc; + sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc); + if (!argv) return 1; + + int ok = 1; + switch (type) { + case RAFT_ENTRY_FAILOVER: { + /* Format: */ + if (argc < 3) { + ok = 0; + } else { + clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1])); + if (primary) { + uint64_t epoch = strtoull(argv[2], NULL, 10); + ok = isShardEpochCurrent(primary->shard_id, epoch); + } + } + break; + } + case RAFT_ENTRY_SET_REPLICA_OF: { + /* Format: */ + if (argc < 4) { + ok = 0; + } else { + uint64_t epoch = strtoull(argv[3], NULL, 10); + ok = isShardEpochCurrent(argv[2], epoch); + } + break; + } + case RAFT_ENTRY_SLOT_CHANGE: { + /* Format: */ + if (argc < 4) { + ok = 0; + } else { + slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc); + if (info.valid) { + if (info.source_shard_id) { + ok = isShardEpochCurrent(info.source_shard_id, info.source_epoch); + } + if (ok && info.target_shard_id) { + ok = isShardEpochCurrent(info.target_shard_id, info.target_epoch); + } + } + } + break; + } + case RAFT_ENTRY_NODE_FORGET: { + /* Format: */ + if (argc < 2) { + ok = 0; + } else { + clusterNode *node = clusterLookupNode(argv[0], sdslen(argv[0])); + if (node) { + uint64_t epoch = strtoull(argv[1], NULL, 10); + ok = isShardEpochCurrent(node->shard_id, epoch); + } + } + break; + } + default: + break; + } + + sdsfreesplitres(argv, argc); + if (!ok) { + serverLog(LL_NOTICE, "Leader pre-validation: rejecting %s proposal (missing or stale epoch).", + raftEntryTypeName(type)); + } + return ok; +} + static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) { UNUSED(link); clusterRaftState *rs = RAFT_STATE(); @@ -834,6 +971,13 @@ static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) { /* Data is everything after the type name. */ sds data = (argc >= 3) ? sdsjoinsds(argv + 2, argc - 2, " ", 1) : sdsempty(); + /* Pre-validate epoch on the leader to reject obviously stale proposals. */ + if (!clusterRaftPreValidateEpoch(type, data)) { + sdsfree(data); + sdsfree(entry); + return 1; + } + uint64_t idx = raftLogLastIndex() + 1; raftLogAppend(raftLogCreate(rs->current_term, idx, type, data)); serverLog(LL_NOTICE, "Leader appended proposed %s (index %llu).", @@ -957,9 +1101,63 @@ static uint64_t raftLogTermAt(uint64_t index) { return e ? e->term : 0; } +/* -------------------------------------------------------------------------- + * Shard epoch validation helpers + * -------------------------------------------------------------------------- */ + +/* Validate and bump epoch for a single-shard entry. + * Returns 1 if the entry should be applied, 0 if stale. + * On success, increments the shard epoch (or sets it to 1 if currently 0). + * If shard_id is NULL, always returns 1 (no shard to validate). */ +static int clusterCheckAndBumpShardEpoch(const char *shard_id, uint64_t entry_epoch) { + if (!shard_id) return 1; + uint64_t current = clusterGetShardEpoch(shard_id); + if (current == 0) { + /* First operation on this shard — apply unconditionally, bump to 1. */ + clusterSetShardEpoch(shard_id, 1); + return 1; + } + if (entry_epoch == current) { + clusterSetShardEpoch(shard_id, current + 1); + return 1; + } + return 0; +} + +/* Validate and bump epochs for SLOT_CHANGE (two shards). + * Returns 1 if the entry should be applied, 0 if either epoch is stale. + * On success, increments both source and target epochs. + * source_shard_id may be NULL (unassigned slots). */ +static int clusterCheckAndBumpSlotChangeEpochs(const char *source_shard_id, + uint64_t source_epoch, + const char *target_shard_id, + uint64_t target_epoch) { + uint64_t src_current = source_shard_id ? clusterGetShardEpoch(source_shard_id) : 0; + uint64_t tgt_current = target_shard_id ? clusterGetShardEpoch(target_shard_id) : 0; + + /* Check source. */ + if (source_shard_id && src_current > 0 && source_epoch != src_current) { + return 0; + } + /* Check target. */ + if (target_shard_id && tgt_current > 0 && target_epoch != tgt_current) { + return 0; + } + + /* Both passed — bump both. */ + if (source_shard_id) { + clusterSetShardEpoch(source_shard_id, src_current == 0 ? 1 : src_current + 1); + } + if (target_shard_id) { + clusterSetShardEpoch(target_shard_id, tgt_current == 0 ? 1 : tgt_current + 1); + } + return 1; +} + /* Apply a committed log entry. */ static void raftLogApply(raftLogEntry *e) { clusterRaftState *rs = RAFT_STATE(); + const char *entry_error = NULL; switch (e->type) { case RAFT_ENTRY_NODE_JOIN: { /* data: "
" */ @@ -967,6 +1165,8 @@ static void raftLogApply(raftLogEntry *e) { sds *argv = sdssplitlen(e->data, sdslen(e->data), " ", 1, &argc); if (argv && argc >= 2 && sdslen(argv[0]) == CLUSTER_NAMELEN) { clusterNode *existing = clusterLookupNode(argv[0], CLUSTER_NAMELEN); + /* For new nodes (existing == NULL), epoch should be 0 — no validation needed. */ + if (!existing) { clusterNode *n = createClusterNode(argv[0], CLUSTER_NODE_PRIMARY); if (clusterNodeParseAddressString(n, argv[1]) == C_OK) { @@ -1065,26 +1265,53 @@ static void raftLogApply(raftLogEntry *e) { if (argv) sdsfreesplitres(argv, argc); break; } - case RAFT_ENTRY_SLOT_CHANGE: - clusterRaftApplySlotChange(e->data); - rs->todo_update_slot_coverage = 1; - rs->todo_invalidate_slots_cache = 1; - serverLog(LL_NOTICE, "Applied SLOT_CHANGE (index %llu).", (unsigned long long)e->index); + case RAFT_ENTRY_SLOT_CHANGE: { + entry_error = clusterRaftApplySlotChange(e->data); + if (!entry_error) { + rs->todo_update_slot_coverage = 1; + rs->todo_invalidate_slots_cache = 1; + } + serverLog(LL_NOTICE, "Applied SLOT_CHANGE (index %llu)%s.", (unsigned long long)e->index, + entry_error ? " [stale]" : ""); break; - case RAFT_ENTRY_SET_REPLICA_OF: - clusterRaftApplySetReplica(e->data); - rs->todo_invalidate_slots_cache = 1; - serverLog(LL_NOTICE, "Applied SET_REPLICA_OF (index %llu).", (unsigned long long)e->index); + } + case RAFT_ENTRY_SET_REPLICA_OF: { + entry_error = clusterRaftApplySetReplica(e->data); + if (!entry_error) { + rs->todo_invalidate_slots_cache = 1; + } + serverLog(LL_NOTICE, "Applied SET_REPLICA_OF (index %llu)%s.", (unsigned long long)e->index, + entry_error ? " [stale]" : ""); break; - case RAFT_ENTRY_FAILOVER: - clusterRaftApplyFailover(e->data); - rs->todo_update_slot_coverage = 1; - rs->todo_invalidate_slots_cache = 1; - serverLog(LL_NOTICE, "Applied FAILOVER (index %llu).", (unsigned long long)e->index); + } + case RAFT_ENTRY_FAILOVER: { + entry_error = clusterRaftApplyFailover(e->data); + if (!entry_error) { + rs->todo_update_slot_coverage = 1; + rs->todo_invalidate_slots_cache = 1; + } + serverLog(LL_NOTICE, "Applied FAILOVER (index %llu)%s.", (unsigned long long)e->index, + entry_error ? " [stale]" : ""); break; + } case RAFT_ENTRY_NODE_FORGET: { - clusterNode *node = clusterLookupNode(e->data, sdslen(e->data)); + int argc; + sds *argv = sdssplitlen(e->data, sdslen(e->data), " ", 1, &argc); + if (!argv || argc < 2) { + if (argv) sdsfreesplitres(argv, argc); + break; + } + clusterNode *node = clusterLookupNode(argv[0], sdslen(argv[0])); if (node && node != myself) { + /* Epoch validation. */ + uint64_t epoch = strtoull(argv[1], NULL, 10); + if (!clusterCheckAndBumpShardEpoch(node->shard_id, epoch)) { + entry_error = "stale shard epoch"; + serverLog(LL_NOTICE, "Applied NODE_FORGET %.40s (index %llu) [stale].", + argv[0], (unsigned long long)e->index); + sdsfreesplitres(argv, argc); + break; + } /* Detach link before deleting so clusterReadHandler can detect * that the node it was talking to is gone. */ if (node->link) { @@ -1096,7 +1323,8 @@ static void raftLogApply(raftLogEntry *e) { rs->todo_invalidate_slots_cache = 1; } serverLog(LL_NOTICE, "Applied NODE_FORGET %.40s (index %llu).", - e->data, (unsigned long long)e->index); + argv[0], (unsigned long long)e->index); + sdsfreesplitres(argv, argc); break; } case RAFT_ENTRY_NODE_FAIL: { @@ -1143,8 +1371,9 @@ static void raftLogApply(raftLogEntry *e) { * managed by NODE_JOIN and SET_REPLICA_OF entries. */ int argc; sds *argv = sdssplitlen(e->data, sdslen(e->data), " ", 1, &argc); - if (argv && argc >= 2 && sdslen(argv[0]) == CLUSTER_NAMELEN) { + if (argv && argc >= 3 && sdslen(argv[0]) == CLUSTER_NAMELEN) { clusterNode *node = clusterLookupNode(argv[0], CLUSTER_NAMELEN); + if (node && node != myself) { /* Reset optional fields so absent aux fields get cleared. */ node->announce_client_tcp_port = 0; @@ -1155,8 +1384,7 @@ static void raftLogApply(raftLogEntry *e) { sdsclear(node->announce_client_ipv6); sdsclear(node->availability_zone); clusterNodeParseAddressString(node, argv[1]); - /* Apply self-set flags. TODO: split on comma and compare - * each part individually when more flags are added. */ + /* Apply self-set flags. */ if (argc >= 3) { node->flags &= ~CLUSTER_NODE_NOFAILOVER; if (strstr(argv[2], "nofailover")) { @@ -1191,7 +1419,7 @@ static void raftLogApply(raftLogEntry *e) { while ((ln = listNext(&li)) != NULL) { raftPendingProposal *pp = listNodeValue(ln); if (pp->type == e->type && !sdscmp(pp->data, e->data)) { - if (pp->callback) pp->callback(pp->ctx, NULL); + if (pp->callback) pp->callback(pp->ctx, entry_error); sdsfree(pp->data); zfree(pp); listDelNode(rs->pending_proposals, ln); @@ -1972,6 +2200,9 @@ static void clusterRaftCron(void) { entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); entry = sdscatlen(entry, " ", 1); entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN); + /* Append primary's shard epoch. */ + uint64_t epoch = clusterGetShardEpoch(primary->shard_id); + entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); clusterRaftPropose(entry, rs->mf_ctx, rs->mf_callback); sdsfree(entry); rs->mf_end = 0; @@ -1991,6 +2222,9 @@ static void clusterRaftCron(void) { entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); entry = sdscatlen(entry, " ", 1); entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN); + /* Append primary's shard epoch. */ + uint64_t epoch = clusterGetShardEpoch(primary->shard_id); + entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); clusterRaftPropose(entry, NULL, NULL); sdsfree(entry); } @@ -2760,7 +2994,7 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode return; } - /* Build entry: "SLOT_CHANGE " */ + /* Build entry: "SLOT_CHANGE " */ sds entry = sdsnew("SLOT_CHANGE "); entry = sdscatlen(entry, target ? target->name : "-", target ? CLUSTER_NAMELEN : 1); for (int i = 0; i < numranges; i++) { @@ -2770,6 +3004,12 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode entry = sdscatfmt(entry, " %i-%i", ranges[i].start_slot, ranges[i].end_slot); } + /* Append source-epoch and target-epoch as last two fields. */ + clusterNode *source_owner = server.cluster->slots[ranges[0].start_slot]; + uint64_t source_epoch = (source_owner) ? clusterGetShardEpoch(source_owner->shard_id) : 0; + uint64_t target_epoch = target ? clusterGetShardEpoch(target->shard_id) : 0; + entry = sdscatfmt(entry, " %U %U", (unsigned long long)source_epoch, (unsigned long long)target_epoch); + /* Propose through Raft leader. The callback is invoked when the entry is * committed and applied. We use two chained callbacks, to handle some * side-effects like replica migration, before invoking the original @@ -2782,18 +3022,28 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode sdsfree(entry); } -/* Apply a SLOT_CHANGE entry. Format: " [ ...]" - * Ranges use the same format as nodes.conf: "0-5460" or "5461". */ -static void clusterRaftApplySlotChange(sds data) { +/* Apply a SLOT_CHANGE entry. Format: " " + * Ranges use the same format as nodes.conf: "0-5460" or "5461". + * The last two fields are the source and target shard epochs. */ +static const char *clusterRaftApplySlotChange(sds data) { + const char *error = NULL; int argc; sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc); - if (!argv || argc < 2) goto done; + /* Need at least: target + one_range + source_epoch + target_epoch = 4 */ + if (!argv || argc < 4) goto done; - clusterNode *target = (sdslen(argv[0]) == CLUSTER_NAMELEN) - ? clusterLookupNode(argv[0], CLUSTER_NAMELEN) - : NULL; + slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc); + if (!info.valid) goto done; - for (int i = 1; i < argc; i++) { + /* Epoch validation: validate both source and target epochs. */ + if (!clusterCheckAndBumpSlotChangeEpochs(info.source_shard_id, info.source_epoch, + info.target_shard_id, info.target_epoch)) { + error = "stale shard epoch"; + goto done; + } + + /* Range fields are argv[1] through argv[range_end-1] (excluding last two epoch fields). */ + for (int i = 1; i < info.range_end; i++) { int start, end; char *p = strchr(argv[i], '-'); if (p) { @@ -2804,17 +3054,17 @@ static void clusterRaftApplySlotChange(sds data) { start = end = atoi(argv[i]); } for (int j = start; j <= end; j++) { - if (target == myself || server.cluster->slots[j] == myself) + if (info.target == myself || server.cluster->slots[j] == myself) RAFT_STATE()->todo_save_config = 1; - if (target) { + if (info.target) { /* If this slot is moving away from myself, delete keys. */ - if (server.cluster->slots[j] == myself && target != myself) { + if (server.cluster->slots[j] == myself && info.target != myself) { serverLog(LL_NOTICE, "Deleting keys in dirty slot %d on node %.40s", j, myself->name); delKeysInSlot(j, server.lazyfree_lazy_server_del, true, false); } if (server.cluster->slots[j]) clusterDelSlot(j); - clusterAddSlot(target, j); + clusterAddSlot(info.target, j); } else { if (server.cluster->slots[j] == myself) { delKeysInSlot(j, server.lazyfree_lazy_server_del, true, false); @@ -2825,15 +3075,17 @@ static void clusterRaftApplySlotChange(sds data) { } done: if (argv) sdsfreesplitres(argv, argc); + return error; } /* Apply a SET_REPLICA_OF entry. - * Format: " " */ -static void clusterRaftApplySetReplica(sds data) { + * Format: " " */ +static const char *clusterRaftApplySetReplica(sds data) { clusterRaftState *rs = RAFT_STATE(); + const char *error = NULL; int argc; sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc); - if (!argv || argc != 3) goto done; + if (!argv || argc != 4) goto done; clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0])); if (!replica) goto done; @@ -2842,6 +3094,13 @@ static void clusterRaftApplySetReplica(sds data) { char *shard_id = argv[2]; + /* Epoch validation: validate against the shard-id from the entry. */ + uint64_t epoch = strtoull(argv[3], NULL, 10); + if (!clusterCheckAndBumpShardEpoch(shard_id, epoch)) { + error = "stale shard epoch"; + goto done; + } + if (sdslen(argv[1]) == 1 && argv[1][0] == '-') { /* Promote to primary with the shard-id from the entry. */ if (nodeIsReplica(replica)) { @@ -2884,20 +3143,30 @@ static void clusterRaftApplySetReplica(sds data) { } done: if (argv) sdsfreesplitres(argv, argc); + return error; } -/* Apply a FAILOVER entry. Format: " " +/* Apply a FAILOVER entry. Format: " " * The replica takes over the primary's slots and becomes primary. * The old primary becomes a replica of the new primary. */ -static void clusterRaftApplyFailover(sds data) { +static const char *clusterRaftApplyFailover(sds data) { clusterRaftState *rs = RAFT_STATE(); + const char *error = NULL; int argc; sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc); - if (!argv || argc != 2) goto done; + if (!argv || argc != 3) goto done; clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0])); clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1])); if (!replica || !primary) goto done; + + /* Epoch validation: validate against primary's shard before domain logic. */ + uint64_t epoch = strtoull(argv[2], NULL, 10); + if (!clusterCheckAndBumpShardEpoch(primary->shard_id, epoch)) { + error = "stale shard epoch"; + goto done; + } + if (!nodeIsReplica(replica) || nodeIsReplica(primary) || replica->replicaof != primary) goto done; /* Transfer slots from old primary to new primary. */ @@ -2946,11 +3215,16 @@ static void clusterRaftApplyFailover(sds data) { } done: if (argv) sdsfreesplitres(argv, argc); + return error; } static void clusterRaftForgetNode(const char *node_id, size_t id_len, void *ctx, void (*callback)(void *ctx, const char *error)) { sds entry = sdsnew("NODE_FORGET "); entry = sdscatlen(entry, node_id, id_len); + /* Append departing node's shard epoch. */ + clusterNode *node = clusterLookupNode(node_id, id_len); + uint64_t epoch = node ? clusterGetShardEpoch(node->shard_id) : 0; + entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); clusterRaftPropose(entry, ctx, callback); sdsfree(entry); } @@ -2972,6 +3246,9 @@ static void clusterRaftSetReplicaOf(clusterNode *primary, void *ctx, void (*call getRandomHexChars(new_shard_id, CLUSTER_NAMELEN); entry = sdscatlen(entry, new_shard_id, CLUSTER_NAMELEN); } + /* Append target shard epoch: use primary's shard epoch for assignment, 0 for promotion. */ + uint64_t epoch = primary ? clusterGetShardEpoch(primary->shard_id) : 0; + entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); clusterRaftPropose(entry, ctx, callback); sdsfree(entry); } @@ -2990,6 +3267,9 @@ static void clusterRaftFailover(int force, int takeover, void *ctx, void (*callb entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); entry = sdscatlen(entry, " ", 1); entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN); + /* Append primary's shard epoch. */ + uint64_t epoch = clusterGetShardEpoch(primary->shard_id); + entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); clusterRaftPropose(entry, ctx, callback); sdsfree(entry); } else { @@ -3085,6 +3365,7 @@ static void clusterRaftResetCluster(int hard) { /* Recreate shards dict. */ dictEmpty(server.cluster->shards, NULL); + dictEmpty(server.cluster->shard_epochs, NULL); /* Forget all nodes except myself. */ dictIterator *di = dictGetSafeIterator(server.cluster->nodes); diff --git a/src/cluster_state.c b/src/cluster_state.c index ad8d6ade13f..e9df0344d07 100644 --- a/src/cluster_state.c +++ b/src/cluster_state.c @@ -62,6 +62,14 @@ dictType clusterSlotDictType = { .entryDestructor = zfree, }; +/* Shard epoch table, mapping shard_id to uint64_t epoch (stored inline). */ +dictType clusterShardEpochDictType = { + .entryGetKey = dictEntryGetKey, + .hashFunction = dictSdsHash, + .keyCompare = dictSdsKeyCompare, + .entryDestructor = dictEntryDestructorSdsKey, +}; + /* ----------------------------------------------------------------------------- * Bitmap helpers * -------------------------------------------------------------------------- */ @@ -362,6 +370,14 @@ void clusterAddNodeToShard(const char *shard_id, clusterNode *node) { list *l = listCreate(); listAddNodeTail(l, node); serverAssert(dictAdd(server.cluster->shards, s, l) == DICT_OK); + /* Initialize shard epoch to 0 for the new shard. */ + sds epoch_key = sdsnewlen(shard_id, CLUSTER_NAMELEN); + dictEntry *epoch_de = dictAddRaw(server.cluster->shard_epochs, epoch_key, NULL); + if (epoch_de) { + dictSetUnsignedIntegerVal(epoch_de, 0); + } else { + sdsfree(epoch_key); + } } else { list *l = dictGetVal(de); if (listSearchKey(l, node) == NULL) { @@ -382,11 +398,46 @@ void clusterRemoveNodeFromShard(clusterNode *node) { } if (listLength(l) == 0) { dictDelete(server.cluster->shards, s); + /* Remove shard epoch when shard is destroyed. */ + dictDelete(server.cluster->shard_epochs, s); } } sdsfree(s); } +/* -------------------------------------------------------------------------- + * Shard epoch helpers + * -------------------------------------------------------------------------- */ + +uint64_t clusterGetShardEpoch(const char *shard_id) { + sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); + dictEntry *de = dictFind(server.cluster->shard_epochs, s); + sdsfree(s); + return de ? dictGetUnsignedIntegerVal(de) : 0; +} + +void clusterSetShardEpoch(const char *shard_id, uint64_t epoch) { + sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); + dictEntry *de = dictFind(server.cluster->shard_epochs, s); + if (de) { + dictSetUnsignedIntegerVal(de, epoch); + sdsfree(s); + } else { + de = dictAddRaw(server.cluster->shard_epochs, s, NULL); + if (de) { + dictSetUnsignedIntegerVal(de, epoch); + } else { + sdsfree(s); + } + } +} + +void clusterRemoveShardEpoch(const char *shard_id) { + sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); + dictDelete(server.cluster->shard_epochs, s); + sdsfree(s); +} + /* ----------------------------------------------------------------------------- * Slot migration state * -------------------------------------------------------------------------- */ diff --git a/src/cluster_state.h b/src/cluster_state.h index 3a9bb49ab08..34485d78e7d 100644 --- a/src/cluster_state.h +++ b/src/cluster_state.h @@ -11,6 +11,7 @@ typedef struct clusterLink clusterLink; extern dictType clusterNodesDictType; extern dictType clusterSdsToListType; extern dictType clusterSlotDictType; +extern dictType clusterShardEpochDictType; /* Cluster node flags and macros. */ #define CLUSTER_NODE_PRIMARY (1 << 0) /* The node is a primary */ @@ -85,6 +86,7 @@ struct clusterState { int size; /* Num of primary nodes with at least one slot */ dict *nodes; /* Hash table of name -> clusterNode structures */ dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ + dict *shard_epochs; /* Hash table of shard_id -> uint64_t epoch */ dict *migrating_slots_to; dict *importing_slots_from; clusterNode *slots[CLUSTER_SLOTS]; @@ -127,6 +129,9 @@ int clusterPrimariesHaveReplicas(void); int clusterNodeClearSlotBit(clusterNode *n, int slot); void clusterRemoveNodeFromShard(clusterNode *node); void clusterAddNodeToShard(const char *shard_id, clusterNode *node); +uint64_t clusterGetShardEpoch(const char *shard_id); +void clusterSetShardEpoch(const char *shard_id, uint64_t epoch); +void clusterRemoveShardEpoch(const char *shard_id); void clusterAddNode(clusterNode *node); void clusterDelNode(clusterNode *node); void clusterRenameNode(clusterNode *node, char *newname); diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl index a33a2c7a7b0..565cb15b976 100644 --- a/tests/unit/cluster/cluster-raft-proto.tcl +++ b/tests/unit/cluster/cluster-raft-proto.tcl @@ -515,4 +515,127 @@ test "Raft proto: leader sends REPL_OFFSETS after follower offset changes" { } } +# -------------------------------------------------------------------------- +# Shard epoch: stale proposal rejection tests (parameterized). +# These verify that the leader rejects stale proposals BEFORE appending +# to the raft log (commit index unchanged, cluster state unchanged). +# -------------------------------------------------------------------------- + +proc get_cluster_info_field {client field} { + set info [$client CLUSTER INFO] + foreach line [split $info "\n"] { + set line [string trim $line "\r"] + if {[string match "${field}:*" $line]} { + return [lindex [split $line ":"] 1] + } + } + return "" +} + +start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft cluster-node-timeout 2000}} { + # Shared setup: form cluster and assign slots. + set r0 [srv 0 client] + set r1 [srv -1 client] + + $r0 CLUSTER MEET [srv -1 host] [srv -1 port] + + wait_for_condition 50 100 { + [get_cluster_info_field $r0 cluster_size] == 2 && + [get_cluster_info_field $r1 cluster_size] == 2 + } else { + fail "Cluster did not form" + } + + $r0 CLUSTER ADDSLOTSRANGE 0 16383 + + wait_for_condition 50 100 { + [get_cluster_info_field $r0 cluster_slots_assigned] == 16384 && + [get_cluster_info_field $r1 cluster_slots_assigned] == 16384 + } else { + fail "Slots not assigned" + } + + set node_id [$r0 CLUSTER MYID] + set node1_id [$r1 CLUSTER MYID] + set port [srv 0 port] + set cport [expr {$port + 10000}] + + # Helper: connect to cluster bus and complete HELLO handshake. + proc connect_fake_node {cport} { + set fake_id [string repeat "f" 40] + set fake_shard [string repeat "e" 40] + set fake_addr "127.0.0.1:9999@19999,,tls-port=0,shard-id=$fake_shard" + set fd [raft_connect 127.0.0.1 $cport] + raft_send $fd "HELLO $fake_id $fake_addr" + set reply [raft_recv $fd] + assert_match "HI *" $reply + return $fd + } + + # Each entry: {entry_type propose_msg state_check} + set stale_proposals [list \ + [list "SLOT_CHANGE" "PROPOSE SLOT_CHANGE - 50 0 0" { + assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned] + }] \ + [list "FAILOVER" "PROPOSE FAILOVER $node1_id $node_id 0" { + set nodes [$r0 CLUSTER NODES] + assert_match "*$node_id*myself,master*" $nodes + assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned] + }] \ + ] + + foreach case $stale_proposals { + lassign $case entry_type propose_msg state_check + + test "Raft shard epoch: stale $entry_type is rejected (pre-validation, no log append)" { + # Record commit index before injection. + set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] + + # Connect and inject stale PROPOSE. + set fd [connect_fake_node $cport] + raft_send $fd $propose_msg + after 500 + + # Verify log index unchanged (rejected at pre-validation). + set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] + assert_equal $commit_before $commit_after + + # Verify cluster state unchanged. + eval $state_check + + close $fd + } + } + + # -------------------------------------------------------------------------- + # Missing epoch: proposals without the required epoch field are rejected + # at pre-validation (never appended to the log). + # -------------------------------------------------------------------------- + + set missing_epoch_proposals [list \ + [list "SLOT_CHANGE (missing epoch)" "PROPOSE SLOT_CHANGE - 50"] \ + [list "FAILOVER (missing epoch)" "PROPOSE FAILOVER $node1_id $node_id"] \ + [list "SET_REPLICA_OF (missing epoch)" "PROPOSE SET_REPLICA_OF $node1_id $node_id [string repeat a 40]"] \ + [list "NODE_FORGET (missing epoch)" "PROPOSE NODE_FORGET $node1_id"] \ + ] + + foreach case $missing_epoch_proposals { + lassign $case label propose_msg + + test "Raft shard epoch: $label is rejected (pre-validation, no log append)" { + set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] + + set fd [connect_fake_node $cport] + raft_send $fd $propose_msg + after 500 + + # Verify log index unchanged (rejected at pre-validation). + set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] + assert_equal $commit_before $commit_after + + close $fd + } + } +} + } ;# tags From ebc2456e63fcaad7160f228a121b94110f7a3482 Mon Sep 17 00:00:00 2001 From: Sushil Paneru Date: Fri, 5 Jun 2026 03:58:07 +0000 Subject: [PATCH 2/9] Use existing configEpoch field API for shard epoch Signed-off-by: Sushil Paneru --- design-docs/cluster-raft.md | 12 +++---- src/cluster.c | 1 - src/cluster_legacy.c | 1 - src/cluster_raft.c | 70 +++++++++++++++++++++++++++++++++++-- src/cluster_state.c | 51 --------------------------- src/cluster_state.h | 5 --- 6 files changed, 73 insertions(+), 67 deletions(-) diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md index d389142ac84..67dca8267e3 100644 --- a/design-docs/cluster-raft.md +++ b/design-docs/cluster-raft.md @@ -212,9 +212,9 @@ changes are infrequent. ## PROPOSE and Leader Validation Followers forward proposals to the leader using the PROPOSE message, -sent on the outbound link to the leader. The leader always accepts -proposals without validation — it appends them to the log and -replicates them. Validation happens at apply time, where the apply +sent on the outbound link to the leader. The leader accepts +proposals with best effort pre-validations — it appends them to the log and +replicates them. Authoritative validation happens at apply time, where the apply function can detect conflicts and treat them as no-ops. This design simplifies the leader: it doesn't need to understand the @@ -738,9 +738,9 @@ the corresponding keys. A shard-epoch is a per-shard monotonically increasing counter stored in `server.cluster->shard_epochs`. It is bumped each time a topology change is applied to the shard. Entries that modify shard topology -include the shard's current epoch at proposal time. On apply, if the -epoch has advanced past the value in the entry, the entry is stale -and becomes a no-op. +include the shard's current epoch at proposal time. Epoch is validated +at prepare time and at apply time. If the epoch has advanced past the +value in the entry, the entry is stale and is ignored. ### Example: slot migration racing with failover diff --git a/src/cluster.c b/src/cluster.c index ffc614b70ac..3a7907fa738 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -70,7 +70,6 @@ void clusterInit(void) { server.cluster->size = 0; server.cluster->nodes = dictCreate(&clusterNodesDictType); server.cluster->shards = dictCreate(&clusterSdsToListType); - server.cluster->shard_epochs = dictCreate(&clusterShardEpochDictType); server.cluster->migrating_slots_to = dictCreate(&clusterSlotDictType); server.cluster->importing_slots_from = dictCreate(&clusterSlotDictType); server.cluster->stat_cluster_links_buffer_limit_exceeded = 0; diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 65b46670389..bb62df1d35f 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -735,7 +735,6 @@ static void clusterLegacyReset(int hard) { /* Recreate shards dict */ dictEmpty(server.cluster->shards, NULL); - dictEmpty(server.cluster->shard_epochs, NULL); /* Forget all the nodes, but myself. */ di = dictGetSafeIterator(server.cluster->nodes); diff --git a/src/cluster_raft.c b/src/cluster_raft.c index 30831a540c5..e77c6fae0e8 100644 --- a/src/cluster_raft.c +++ b/src/cluster_raft.c @@ -44,6 +44,14 @@ /* From module.c */ void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len); +/* Shard epoch dict type, mapping shard_id to epoch. */ +static dictType raftShardEpochDictType = { + .entryGetKey = dictEntryGetKey, + .hashFunction = dictSdsHash, + .keyCompare = dictSdsKeyCompare, + .entryDestructor = dictEntryDestructorSdsKey, +}; + /* -------------------------------------------------------------------------- * Raft log entry types — what gets replicated * -------------------------------------------------------------------------- */ @@ -177,6 +185,9 @@ typedef struct { uint64_t stats_pubsub_bytes_received; uint64_t stats_module_bytes_sent; uint64_t stats_module_bytes_received; + + /* Shard epoch tracking (per-shard monotonic counter for stale proposal detection). */ + dict *shard_epochs; } clusterRaftState; #define RAFT_STATE() ((clusterRaftState *)server.cluster->protocol_data) @@ -824,6 +835,38 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, } } +/* -------------------------------------------------------------------------- + * Shard epoch helpers + * -------------------------------------------------------------------------- */ + +static uint64_t clusterGetShardEpoch(const char *shard_id) { + clusterRaftState *rs = RAFT_STATE(); + sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); + dictEntry *de = dictFind(rs->shard_epochs, s); + sdsfree(s); + return de ? dictGetUnsignedIntegerVal(de) : 0; +} + +static void clusterSetShardEpoch(const char *shard_id, uint64_t epoch) { + clusterRaftState *rs = RAFT_STATE(); + sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); + dictEntry *de = dictFind(rs->shard_epochs, s); + if (de) { + /* Existing entry — just update the value. */ + dictSetUnsignedIntegerVal(de, epoch); + sdsfree(s); + } else { + /* Only create a new epoch entry if the shard actually exists. */ + dictEntry *shard_de = dictFind(server.cluster->shards, s); + if (shard_de) { + de = dictAddRaw(rs->shard_epochs, s, NULL); + dictSetUnsignedIntegerVal(de, epoch); + } else { + sdsfree(s); + } + } +} + static int isShardEpochCurrent(const char *shard_id, uint64_t entry_epoch) { uint64_t current = clusterGetShardEpoch(shard_id); return (current == 0 || entry_epoch == current); @@ -1926,7 +1969,11 @@ static void clusterRaftInit(void) { rs->my_last_committed_info = sdsempty(); rs->last_node_info_check = monotonicMs(); rs->last_repl_offsets_broadcast = monotonicMs(); +<<<<<<< HEAD rs->todo_update_slot_coverage = 1; +======= + rs->shard_epochs = dictCreate(&raftShardEpochDictType); +>>>>>>> 4ec207fbe (Use existing configEpoch field API for shard epoch) server.cluster->size = 0; /* Incremented by NODE_JOIN apply */ } @@ -2439,6 +2486,7 @@ static void clusterRaftHandleServerShutdown(void) { } zfree(rs->log); sdsfree(rs->my_last_committed_info); + dictRelease(rs->shard_epochs); zfree(rs); server.cluster->protocol_data = NULL; } @@ -2664,6 +2712,22 @@ static void clusterRaftFreeNodeData(clusterNode *node) { zfree(node->protocol_data); } +/* For raft, the config_epoch field in nodes.conf stores the shard epoch. + * ping_sent and pong_received have no meaning in raft (no gossip). */ +static void clusterRaftGetNodePingPongEpoch(clusterNode *node, long long *ping_sent, long long *pong_received, uint64_t *config_epoch) { + *ping_sent = 0; + *pong_received = 0; + *config_epoch = clusterGetShardEpoch(node->shard_id); +} + +/* On load, the config_epoch field is the shard epoch for this node's shard. + * We use it to restore the shard_epochs dict. ping/pong are ignored. */ +static void clusterRaftSetNodePingPongEpoch(clusterNode *node, int ping_active, int pong_active, uint64_t shard_epoch) { + UNUSED(ping_active); + UNUSED(pong_active); + clusterSetShardEpoch(node->shard_id, shard_epoch); +} + static sds clusterRaftAppendVarsLine(sds config) { clusterRaftState *rs = RAFT_STATE(); config = sdscatprintf(config, "vars currentTerm %llu lastApplied %llu", @@ -3365,7 +3429,7 @@ static void clusterRaftResetCluster(int hard) { /* Recreate shards dict. */ dictEmpty(server.cluster->shards, NULL); - dictEmpty(server.cluster->shard_epochs, NULL); + dictEmpty(rs->shard_epochs, NULL); /* Forget all nodes except myself. */ dictIterator *di = dictGetSafeIterator(server.cluster->nodes); @@ -3478,8 +3542,8 @@ clusterBusType clusterRaftBus = { .resetStats = clusterRaftResetStats, .appendInfoFields = clusterRaftAppendInfoFields, .getFailureReportsCount = clusterRaftGetFailureReportsCount, - .getNodePingPongEpoch = NULL, - .setNodePingPongEpoch = NULL, + .getNodePingPongEpoch = clusterRaftGetNodePingPongEpoch, + .setNodePingPongEpoch = clusterRaftSetNodePingPongEpoch, .setNodeFailed = NULL, .appendVarsLine = clusterRaftAppendVarsLine, .parseVarsLine = clusterRaftParseVarsLine, diff --git a/src/cluster_state.c b/src/cluster_state.c index e9df0344d07..ad8d6ade13f 100644 --- a/src/cluster_state.c +++ b/src/cluster_state.c @@ -62,14 +62,6 @@ dictType clusterSlotDictType = { .entryDestructor = zfree, }; -/* Shard epoch table, mapping shard_id to uint64_t epoch (stored inline). */ -dictType clusterShardEpochDictType = { - .entryGetKey = dictEntryGetKey, - .hashFunction = dictSdsHash, - .keyCompare = dictSdsKeyCompare, - .entryDestructor = dictEntryDestructorSdsKey, -}; - /* ----------------------------------------------------------------------------- * Bitmap helpers * -------------------------------------------------------------------------- */ @@ -370,14 +362,6 @@ void clusterAddNodeToShard(const char *shard_id, clusterNode *node) { list *l = listCreate(); listAddNodeTail(l, node); serverAssert(dictAdd(server.cluster->shards, s, l) == DICT_OK); - /* Initialize shard epoch to 0 for the new shard. */ - sds epoch_key = sdsnewlen(shard_id, CLUSTER_NAMELEN); - dictEntry *epoch_de = dictAddRaw(server.cluster->shard_epochs, epoch_key, NULL); - if (epoch_de) { - dictSetUnsignedIntegerVal(epoch_de, 0); - } else { - sdsfree(epoch_key); - } } else { list *l = dictGetVal(de); if (listSearchKey(l, node) == NULL) { @@ -398,46 +382,11 @@ void clusterRemoveNodeFromShard(clusterNode *node) { } if (listLength(l) == 0) { dictDelete(server.cluster->shards, s); - /* Remove shard epoch when shard is destroyed. */ - dictDelete(server.cluster->shard_epochs, s); } } sdsfree(s); } -/* -------------------------------------------------------------------------- - * Shard epoch helpers - * -------------------------------------------------------------------------- */ - -uint64_t clusterGetShardEpoch(const char *shard_id) { - sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(server.cluster->shard_epochs, s); - sdsfree(s); - return de ? dictGetUnsignedIntegerVal(de) : 0; -} - -void clusterSetShardEpoch(const char *shard_id, uint64_t epoch) { - sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(server.cluster->shard_epochs, s); - if (de) { - dictSetUnsignedIntegerVal(de, epoch); - sdsfree(s); - } else { - de = dictAddRaw(server.cluster->shard_epochs, s, NULL); - if (de) { - dictSetUnsignedIntegerVal(de, epoch); - } else { - sdsfree(s); - } - } -} - -void clusterRemoveShardEpoch(const char *shard_id) { - sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); - dictDelete(server.cluster->shard_epochs, s); - sdsfree(s); -} - /* ----------------------------------------------------------------------------- * Slot migration state * -------------------------------------------------------------------------- */ diff --git a/src/cluster_state.h b/src/cluster_state.h index 34485d78e7d..3a9bb49ab08 100644 --- a/src/cluster_state.h +++ b/src/cluster_state.h @@ -11,7 +11,6 @@ typedef struct clusterLink clusterLink; extern dictType clusterNodesDictType; extern dictType clusterSdsToListType; extern dictType clusterSlotDictType; -extern dictType clusterShardEpochDictType; /* Cluster node flags and macros. */ #define CLUSTER_NODE_PRIMARY (1 << 0) /* The node is a primary */ @@ -86,7 +85,6 @@ struct clusterState { int size; /* Num of primary nodes with at least one slot */ dict *nodes; /* Hash table of name -> clusterNode structures */ dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ - dict *shard_epochs; /* Hash table of shard_id -> uint64_t epoch */ dict *migrating_slots_to; dict *importing_slots_from; clusterNode *slots[CLUSTER_SLOTS]; @@ -129,9 +127,6 @@ int clusterPrimariesHaveReplicas(void); int clusterNodeClearSlotBit(clusterNode *n, int slot); void clusterRemoveNodeFromShard(clusterNode *node); void clusterAddNodeToShard(const char *shard_id, clusterNode *node); -uint64_t clusterGetShardEpoch(const char *shard_id); -void clusterSetShardEpoch(const char *shard_id, uint64_t epoch); -void clusterRemoveShardEpoch(const char *shard_id); void clusterAddNode(clusterNode *node); void clusterDelNode(clusterNode *node); void clusterRenameNode(clusterNode *node, char *newname); From 5219c9c04dbd21da6d76a84560ee823706189043 Mon Sep 17 00:00:00 2001 From: Sushil Paneru Date: Fri, 5 Jun 2026 06:08:33 +0000 Subject: [PATCH 3/9] Changed message format for FAILOVER and SLOT_CHANGE Signed-off-by: Sushil Paneru --- design-docs/cluster-raft.md | 8 +- src/cluster_raft.c | 146 +++++++++++----------- tests/unit/cluster/cluster-raft-proto.tcl | 55 +++++++- 3 files changed, 130 insertions(+), 79 deletions(-) diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md index 67dca8267e3..1898b4fc17f 100644 --- a/design-docs/cluster-raft.md +++ b/design-docs/cluster-raft.md @@ -141,7 +141,7 @@ NODE_JOIN
NODE_FORGET Remove a node from the cluster (CLUSTER FORGET). Not yet implemented. -SLOT_CHANGE [ ...] +SLOT_CHANGE [ ...] Assign or remove slot ownership. A dash means "no owner" (delete slots). Ranges use the nodes.conf format: "0-5460" or "5461". @@ -171,7 +171,7 @@ NODE_RECOVER ``` Ranges in SLOT_CHANGE use the same format as nodes.conf: `0-5460` or -`5461`. A dash as node-id means "no owner" (delete slots) or "no +`5461`. A dash as source/target-id means "no owner" (delete slots) or "no primary" (promote to primary). ### Why typed entries instead of a key-value store? @@ -759,9 +759,9 @@ value in the entry, the entry is stale and is ignored. ### Entry formats with epoch ``` -FAILOVER +FAILOVER SET_REPLICA_OF -SLOT_CHANGE +SLOT_CHANGE NODE_FORGET ``` diff --git a/src/cluster_raft.c b/src/cluster_raft.c index e77c6fae0e8..f879e8bda8d 100644 --- a/src/cluster_raft.c +++ b/src/cluster_raft.c @@ -433,8 +433,9 @@ static void clusterRaftSelfJoin(void) { slots = sdscatfmt(slots, " %i-%i", start, j); } if (sdslen(slots) > 0) { - entry = sdsnew("SLOT_CHANGE "); + entry = sdsnew("SLOT_CHANGE - 0 "); entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); + entry = sdscatfmt(entry, " %U", (unsigned long long)0); entry = sdscatsds(entry, slots); clusterRaftPropose(entry, NULL, NULL); sdsfree(entry); @@ -731,7 +732,7 @@ static int clusterRaftProcessMeetRejected(clusterLink *link, int argc, sds *argv * * Examples: * PROPOSE NODE_JOIN
- * PROPOSE SLOT_CHANGE [ ...] + * PROPOSE SLOT_CHANGE [ ...] * -------------------------------------------------------------------------- */ static int raftEntryTypeByName(const char *name) { if (!strcasecmp(name, "NODE_JOIN")) return RAFT_ENTRY_NODE_JOIN; @@ -761,7 +762,7 @@ static const char *raftEntryTypeName(uint8_t type) { /* Propose a log entry. The entry is an sds containing the type name * followed by the data, e.g. "NODE_JOIN " or - * "SLOT_CHANGE 0-5460". On the leader, it's appended directly + * "SLOT_CHANGE 0-5460". On the leader, it's appended directly * to the log. On followers, it's forwarded to the leader. */ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, const char *error)) { clusterRaftState *rs = RAFT_STATE(); @@ -873,56 +874,48 @@ static int isShardEpochCurrent(const char *shard_id, uint64_t entry_epoch) { } /* Parsed fields from a SLOT_CHANGE entry's epoch-related data. - * Format: " " */ + * Format: " " */ typedef struct { clusterNode *target; /* Target node (NULL if dash). */ - clusterNode *source_owner; /* Current owner of the first slot in the range. */ + clusterNode *source_owner; /* Source node (NULL if dash). */ const char *source_shard_id; /* source_owner->shard_id or NULL. */ const char *target_shard_id; /* target->shard_id or NULL. */ uint64_t source_epoch; uint64_t target_epoch; - int range_end; /* Index in argv where ranges end (argc - 2). */ + int range_end; /* Index in argv where ranges begin (first range element). */ int valid; /* 1 if parsing succeeded, 0 otherwise. */ } slotChangeEpochInfo; -/* Parse epoch-related fields from a SLOT_CHANGE entry's split argv. - * Caller must have split data and verified argc >= 4. */ +/* Parse fields from a SLOT_CHANGE entry's split argv. + * Format: " " + * Caller must have verified argc >= 5. */ static slotChangeEpochInfo parseSlotChangeEpochs(sds *argv, int argc) { slotChangeEpochInfo info = {0}; - if (argc < 4) return info; - info.target = (sdslen(argv[0]) == CLUSTER_NAMELEN) - ? clusterLookupNode(argv[0], CLUSTER_NAMELEN) - : NULL; - info.source_epoch = strtoull(argv[argc - 2], NULL, 10); - info.target_epoch = strtoull(argv[argc - 1], NULL, 10); - info.range_end = argc - 2; - - /* Derive source shard from current owner of the first slot in the range. */ - int first_start; - char *dash = strchr(argv[1], '-'); - if (dash) { - *dash = '\0'; - first_start = atoi(argv[1]); - *dash = '-'; - } else { - first_start = atoi(argv[1]); - } - info.source_owner = (first_start >= 0 && first_start < CLUSTER_SLOTS) - ? server.cluster->slots[first_start] + /* argv[0] = source-id-or-dash */ + info.source_owner = (sdslen(argv[0]) == CLUSTER_NAMELEN) + ? clusterLookupNode(argv[0], CLUSTER_NAMELEN) : NULL; + /* argv[1] = source-epoch */ + info.source_epoch = strtoull(argv[1], NULL, 10); + /* argv[2] = target-id-or-dash */ + info.target = (sdslen(argv[2]) == CLUSTER_NAMELEN) + ? clusterLookupNode(argv[2], CLUSTER_NAMELEN) + : NULL; + /* argv[3] = target-epoch */ + info.target_epoch = strtoull(argv[3], NULL, 10); + /* argv[4..argc-1] = ranges */ + info.range_end = 4; + info.source_shard_id = info.source_owner ? info.source_owner->shard_id : NULL; info.target_shard_id = info.target ? info.target->shard_id : NULL; - info.valid = 1; + info.valid = (argc > 4); return info; } /* Pre-validate shard epoch on the leader before appending to the log. - * This is an optimization: reject obviously stale proposals early to avoid - * wasting log space and replication bandwidth. - * Returns 1 if the entry should be appended, 0 if stale. */ -/* Returns 0 if the entry_epoch is stale relative to the shard's current epoch. - * Returns 1 if valid (epoch matches or shard is at epoch 0). */ + * Rejects stale proposals early to avoid wasting log space. + * Returns 1 if valid, 0 if stale. */ static int clusterRaftPreValidateEpoch(int type, sds data) { int argc; sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc); @@ -931,15 +924,12 @@ static int clusterRaftPreValidateEpoch(int type, sds data) { int ok = 1; switch (type) { case RAFT_ENTRY_FAILOVER: { - /* Format: */ - if (argc < 3) { + /* Format: */ + if (argc < 4) { ok = 0; } else { - clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1])); - if (primary) { - uint64_t epoch = strtoull(argv[2], NULL, 10); - ok = isShardEpochCurrent(primary->shard_id, epoch); - } + uint64_t epoch = strtoull(argv[3], NULL, 10); + ok = isShardEpochCurrent(argv[2], epoch); } break; } @@ -954,8 +944,8 @@ static int clusterRaftPreValidateEpoch(int type, sds data) { break; } case RAFT_ENTRY_SLOT_CHANGE: { - /* Format: */ - if (argc < 4) { + /* Format: */ + if (argc < 5) { ok = 0; } else { slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc); @@ -1276,23 +1266,27 @@ static void raftLogApply(raftLogEntry *e) { /* Propose SLOT_CHANGE for slots assigned before joining * the cluster (from our singleton state). */ - sds entry = sdsnew("SLOT_CHANGE "); - entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); int count = 0; + sds slot_range = sdsempty(); for (int j = 0; j < CLUSTER_SLOTS; j++) { if (server.cluster->slots[j] != myself) continue; int start = j; while (j + 1 < CLUSTER_SLOTS && server.cluster->slots[j + 1] == myself) j++; if (j == start) - entry = sdscatfmt(entry, " %i", start); + slot_range = sdscatfmt(slot_range, " %i", start); else - entry = sdscatfmt(entry, " %i-%i", start, j); + slot_range = sdscatfmt(slot_range, " %i-%i", start, j); count++; } if (count > 0) { + sds entry = sdsnew("SLOT_CHANGE - 0 "); + entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); + entry = sdscatfmt(entry, " %U", (unsigned long long)0); + entry = sdscatsds(entry, slot_range); clusterRaftPropose(entry, NULL, NULL); + sdsfree(entry); } - sdsfree(entry); + sdsfree(slot_range); } } @@ -2247,7 +2241,8 @@ static void clusterRaftCron(void) { entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); entry = sdscatlen(entry, " ", 1); entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN); - /* Append primary's shard epoch. */ + entry = sdscatlen(entry, " ", 1); + entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN); uint64_t epoch = clusterGetShardEpoch(primary->shard_id); entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); clusterRaftPropose(entry, rs->mf_ctx, rs->mf_callback); @@ -2269,7 +2264,8 @@ static void clusterRaftCron(void) { entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); entry = sdscatlen(entry, " ", 1); entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN); - /* Append primary's shard epoch. */ + entry = sdscatlen(entry, " ", 1); + entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN); uint64_t epoch = clusterGetShardEpoch(primary->shard_id); entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); clusterRaftPropose(entry, NULL, NULL); @@ -3058,9 +3054,17 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode return; } - /* Build entry: "SLOT_CHANGE " */ + /* Build entry: "SLOT_CHANGE " */ + clusterNode *source_owner = server.cluster->slots[ranges[0].start_slot]; + serverAssert(source_owner || target); /* At least one side must be set. */ + uint64_t source_epoch = (source_owner) ? clusterGetShardEpoch(source_owner->shard_id) : 0; + uint64_t target_epoch = target ? clusterGetShardEpoch(target->shard_id) : 0; + sds entry = sdsnew("SLOT_CHANGE "); + entry = sdscatlen(entry, source_owner ? source_owner->name : "-", source_owner ? CLUSTER_NAMELEN : 1); + entry = sdscatfmt(entry, " %U ", (unsigned long long)source_epoch); entry = sdscatlen(entry, target ? target->name : "-", target ? CLUSTER_NAMELEN : 1); + entry = sdscatfmt(entry, " %U", (unsigned long long)target_epoch); for (int i = 0; i < numranges; i++) { if (ranges[i].start_slot == ranges[i].end_slot) entry = sdscatfmt(entry, " %i", ranges[i].start_slot); @@ -3068,12 +3072,6 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode entry = sdscatfmt(entry, " %i-%i", ranges[i].start_slot, ranges[i].end_slot); } - /* Append source-epoch and target-epoch as last two fields. */ - clusterNode *source_owner = server.cluster->slots[ranges[0].start_slot]; - uint64_t source_epoch = (source_owner) ? clusterGetShardEpoch(source_owner->shard_id) : 0; - uint64_t target_epoch = target ? clusterGetShardEpoch(target->shard_id) : 0; - entry = sdscatfmt(entry, " %U %U", (unsigned long long)source_epoch, (unsigned long long)target_epoch); - /* Propose through Raft leader. The callback is invoked when the entry is * committed and applied. We use two chained callbacks, to handle some * side-effects like replica migration, before invoking the original @@ -3086,15 +3084,14 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode sdsfree(entry); } -/* Apply a SLOT_CHANGE entry. Format: " " - * Ranges use the same format as nodes.conf: "0-5460" or "5461". - * The last two fields are the source and target shard epochs. */ +/* Apply a SLOT_CHANGE entry. Format: " " + * Ranges use the same format as nodes.conf: "0-5460" or "5461". */ static const char *clusterRaftApplySlotChange(sds data) { const char *error = NULL; int argc; sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc); - /* Need at least: target + one_range + source_epoch + target_epoch = 4 */ - if (!argv || argc < 4) goto done; + /* Need at least: source + source_epoch + target + target_epoch + one_range = 5 */ + if (!argv || argc < 5) goto done; slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc); if (!info.valid) goto done; @@ -3106,8 +3103,8 @@ static const char *clusterRaftApplySlotChange(sds data) { goto done; } - /* Range fields are argv[1] through argv[range_end-1] (excluding last two epoch fields). */ - for (int i = 1; i < info.range_end; i++) { + /* Range fields are argv[range_end] through argv[argc-1]. */ + for (int i = info.range_end; i < argc; i++) { int start, end; char *p = strchr(argv[i], '-'); if (p) { @@ -3210,7 +3207,7 @@ static const char *clusterRaftApplySetReplica(sds data) { return error; } -/* Apply a FAILOVER entry. Format: " " +/* Apply a FAILOVER entry. Format: " " * The replica takes over the primary's slots and becomes primary. * The old primary becomes a replica of the new primary. */ static const char *clusterRaftApplyFailover(sds data) { @@ -3218,19 +3215,27 @@ static const char *clusterRaftApplyFailover(sds data) { const char *error = NULL; int argc; sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc); - if (!argv || argc != 3) goto done; + if (!argv || argc != 4) goto done; clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0])); clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1])); if (!replica || !primary) goto done; - /* Epoch validation: validate against primary's shard before domain logic. */ - uint64_t epoch = strtoull(argv[2], NULL, 10); - if (!clusterCheckAndBumpShardEpoch(primary->shard_id, epoch)) { + /* Epoch validation: validate against the shard-id from the entry. */ + uint64_t epoch = strtoull(argv[3], NULL, 10); + if (!clusterCheckAndBumpShardEpoch(argv[2], epoch)) { error = "stale shard epoch"; goto done; } + /* Validate that the primary still belongs to the shard claimed in the entry. */ + if (memcmp(primary->shard_id, argv[2], CLUSTER_NAMELEN) != 0) { + serverLog(LL_WARNING, "FAILOVER rejected: primary %.40s shard mismatch (expected %.40s, got %.40s).", + primary->name, argv[2], primary->shard_id); + error = "primary shard mismatch"; + goto done; + } + if (!nodeIsReplica(replica) || nodeIsReplica(primary) || replica->replicaof != primary) goto done; /* Transfer slots from old primary to new primary. */ @@ -3331,7 +3336,8 @@ static void clusterRaftFailover(int force, int takeover, void *ctx, void (*callb entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); entry = sdscatlen(entry, " ", 1); entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN); - /* Append primary's shard epoch. */ + entry = sdscatlen(entry, " ", 1); + entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN); uint64_t epoch = clusterGetShardEpoch(primary->shard_id); entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); clusterRaftPropose(entry, ctx, callback); diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl index 565cb15b976..0ef5a8fa573 100644 --- a/tests/unit/cluster/cluster-raft-proto.tcl +++ b/tests/unit/cluster/cluster-raft-proto.tcl @@ -560,6 +560,18 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c set port [srv 0 port] set cport [expr {$port + 10000}] + # Extract shard-id of r0 from CLUSTER SHARDS output. + set node0_shard "" + set shards [$r0 CLUSTER SHARDS] + foreach shard $shards { + foreach node [dict get $shard nodes] { + if {[dict get $node id] eq $node_id} { + set node0_shard [dict get $shard id] + } + } + } + assert {$node0_shard ne ""} + # Helper: connect to cluster bus and complete HELLO handshake. proc connect_fake_node {cport} { set fake_id [string repeat "f" 40] @@ -574,10 +586,10 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c # Each entry: {entry_type propose_msg state_check} set stale_proposals [list \ - [list "SLOT_CHANGE" "PROPOSE SLOT_CHANGE - 50 0 0" { + [list "SLOT_CHANGE" "PROPOSE SLOT_CHANGE $node_id 50 $node1_id 0 0-100" { assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned] }] \ - [list "FAILOVER" "PROPOSE FAILOVER $node1_id $node_id 0" { + [list "FAILOVER" "PROPOSE FAILOVER $node1_id $node_id $node0_shard 50" { set nodes [$r0 CLUSTER NODES] assert_match "*$node_id*myself,master*" $nodes assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned] @@ -600,7 +612,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] assert_equal $commit_before $commit_after - # Verify cluster state unchanged. + # Verify cluster state unchanged (stale proposal had no effect). eval $state_check close $fd @@ -613,8 +625,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c # -------------------------------------------------------------------------- set missing_epoch_proposals [list \ - [list "SLOT_CHANGE (missing epoch)" "PROPOSE SLOT_CHANGE - 50"] \ - [list "FAILOVER (missing epoch)" "PROPOSE FAILOVER $node1_id $node_id"] \ + [list "SLOT_CHANGE (missing epoch)" "PROPOSE SLOT_CHANGE $node_id 1 $node1_id"] \ + [list "FAILOVER (missing epoch)" "PROPOSE FAILOVER $node1_id $node_id $node0_shard"] \ [list "SET_REPLICA_OF (missing epoch)" "PROPOSE SET_REPLICA_OF $node1_id $node_id [string repeat a 40]"] \ [list "NODE_FORGET (missing epoch)" "PROPOSE NODE_FORGET $node1_id"] \ ] @@ -636,6 +648,39 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c close $fd } } + + # -------------------------------------------------------------------------- + # FAILOVER with wrong shard-id: passes pre-validation (unknown shard has + # epoch 0, so isShardEpochCurrent returns true) but is rejected at apply + # because primary's current shard_id doesn't match the entry. + # -------------------------------------------------------------------------- + + test "Raft shard epoch: FAILOVER with wrong shard-id is rejected at apply" { + # Use a bogus shard-id that doesn't match node_id's actual shard. + set wrong_shard [string repeat "1" 40] + + # Get current commit index. + set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] + + # Propose FAILOVER with correct epoch (0, since wrong_shard has no + # epoch tracked) but wrong shard-id. + set fd [connect_fake_node $cport] + raft_send $fd "PROPOSE FAILOVER $node1_id $node_id $wrong_shard 0" + after 500 + + # The entry passes pre-validation (epoch 0 for unknown shard is valid) + # and gets appended + committed. But apply rejects it due to shard mismatch. + # Commit index advances (entry was appended) but state is unchanged. + set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] + assert {$commit_after > $commit_before} + + # Verify the failover did NOT happen: r0 is still primary with all slots. + set nodes [$r0 CLUSTER NODES] + assert_match "*$node_id*myself,master*" $nodes + assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned] + + close $fd + } } } ;# tags From ab5807de3a3022ac49f7186402084e4a4604435a Mon Sep 17 00:00:00 2001 From: Sushil Paneru Date: Fri, 5 Jun 2026 06:32:13 +0000 Subject: [PATCH 4/9] Avoid epoch bump for slot_change Signed-off-by: Sushil Paneru --- src/cluster_raft.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/src/cluster_raft.c b/src/cluster_raft.c index f879e8bda8d..ac96853c8b1 100644 --- a/src/cluster_raft.c +++ b/src/cluster_raft.c @@ -923,18 +923,9 @@ static int clusterRaftPreValidateEpoch(int type, sds data) { int ok = 1; switch (type) { - case RAFT_ENTRY_FAILOVER: { - /* Format: */ - if (argc < 4) { - ok = 0; - } else { - uint64_t epoch = strtoull(argv[3], NULL, 10); - ok = isShardEpochCurrent(argv[2], epoch); - } - break; - } + case RAFT_ENTRY_FAILOVER: case RAFT_ENTRY_SET_REPLICA_OF: { - /* Format: */ + /* Format: <..ids..> (argv[2] and argv[3]) */ if (argc < 4) { ok = 0; } else { @@ -1177,13 +1168,6 @@ static int clusterCheckAndBumpSlotChangeEpochs(const char *source_shard_id, return 0; } - /* Both passed — bump both. */ - if (source_shard_id) { - clusterSetShardEpoch(source_shard_id, src_current == 0 ? 1 : src_current + 1); - } - if (target_shard_id) { - clusterSetShardEpoch(target_shard_id, tgt_current == 0 ? 1 : tgt_current + 1); - } return 1; } @@ -1421,7 +1405,8 @@ static void raftLogApply(raftLogEntry *e) { sdsclear(node->announce_client_ipv6); sdsclear(node->availability_zone); clusterNodeParseAddressString(node, argv[1]); - /* Apply self-set flags. */ + /* Apply self-set flags. TODO: split on comma and compare + * each part individually when more flags are added. */ if (argc >= 3) { node->flags &= ~CLUSTER_NODE_NOFAILOVER; if (strstr(argv[2], "nofailover")) { From 3c8dd26e3205bdc3ad7fec701152bd4cf42a9866 Mon Sep 17 00:00:00 2001 From: Sushil Paneru Date: Fri, 5 Jun 2026 18:29:45 +0000 Subject: [PATCH 5/9] Added proposal rejection stat Signed-off-by: Sushil Paneru --- design-docs/cluster-raft.md | 16 ++--- src/cluster_raft.c | 81 +++++++++++++---------- tests/unit/cluster/cluster-raft-proto.tcl | 52 ++++++++++----- 3 files changed, 87 insertions(+), 62 deletions(-) diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md index 1898b4fc17f..7428a6d113b 100644 --- a/design-docs/cluster-raft.md +++ b/design-docs/cluster-raft.md @@ -138,20 +138,20 @@ NODE_JOIN
via MEET. The node starts as a learner and is promoted to follower when the entry is committed. -NODE_FORGET +NODE_FORGET Remove a node from the cluster (CLUSTER FORGET). Not yet implemented. SLOT_CHANGE [ ...] Assign or remove slot ownership. A dash means "no owner" (delete slots). Ranges use the nodes.conf format: "0-5460" or "5461". -SET_REPLICA_OF +SET_REPLICA_OF Set a node as replica of a primary (CLUSTER REPLICATE). A dash as primary means promote to primary. The shard-id is the target shard: for promotion, a new random id; for assignment, the primary's current shard-id (used as a guard against concurrent changes). -FAILOVER +FAILOVER The replica takes over the primary's slots and becomes primary. The old primary becomes a replica of the new primary. @@ -736,11 +736,11 @@ inconsistencies — such as moving a slot to a node that no longer owns the corresponding keys. A shard-epoch is a per-shard monotonically increasing counter stored -in `server.cluster->shard_epochs`. It is bumped each time a topology -change is applied to the shard. Entries that modify shard topology -include the shard's current epoch at proposal time. Epoch is validated -at prepare time and at apply time. If the epoch has advanced past the -value in the entry, the entry is stale and is ignored. +in `server.cluster->shard_epochs`. It is bumped each time membership or +leadership of the shard changes. Such entries include the shard's +current epoch at proposal time. Epoch is validated at prepare time +and at apply time. If the epoch has advanced past the value in the entry, +the entry is stale and is ignored. ### Example: slot migration racing with failover diff --git a/src/cluster_raft.c b/src/cluster_raft.c index ac96853c8b1..a6f36196a54 100644 --- a/src/cluster_raft.c +++ b/src/cluster_raft.c @@ -185,6 +185,8 @@ typedef struct { uint64_t stats_pubsub_bytes_received; uint64_t stats_module_bytes_sent; uint64_t stats_module_bytes_received; + uint64_t stats_proposals_rejected_prevalidation; + uint64_t stats_proposals_rejected_apply; /* Shard epoch tracking (per-shard monotonic counter for stale proposal detection). */ dict *shard_epochs; @@ -851,21 +853,11 @@ static uint64_t clusterGetShardEpoch(const char *shard_id) { static void clusterSetShardEpoch(const char *shard_id, uint64_t epoch) { clusterRaftState *rs = RAFT_STATE(); sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN); - dictEntry *de = dictFind(rs->shard_epochs, s); - if (de) { - /* Existing entry — just update the value. */ - dictSetUnsignedIntegerVal(de, epoch); + dictEntry *de = dictAddOrFind(rs->shard_epochs, s); + if (dictGetKey(de) != s) { sdsfree(s); - } else { - /* Only create a new epoch entry if the shard actually exists. */ - dictEntry *shard_de = dictFind(server.cluster->shards, s); - if (shard_de) { - de = dictAddRaw(rs->shard_epochs, s, NULL); - dictSetUnsignedIntegerVal(de, epoch); - } else { - sdsfree(s); - } } + dictSetUnsignedIntegerVal(de, epoch); } static int isShardEpochCurrent(const char *shard_id, uint64_t entry_epoch) { @@ -929,13 +921,21 @@ static int clusterRaftPreValidateEpoch(int type, sds data) { if (argc < 4) { ok = 0; } else { - uint64_t epoch = strtoull(argv[3], NULL, 10); - ok = isShardEpochCurrent(argv[2], epoch); + /* Validate shard-id matches the primary's current shard. */ + clusterNode *primary = (sdslen(argv[1]) == CLUSTER_NAMELEN) + ? clusterLookupNode(argv[1], CLUSTER_NAMELEN) + : NULL; + if (primary && memcmp(primary->shard_id, argv[2], CLUSTER_NAMELEN) != 0) { + ok = 0; + } else { + uint64_t epoch = strtoull(argv[3], NULL, 10); + ok = isShardEpochCurrent(argv[2], epoch); + } } break; } case RAFT_ENTRY_SLOT_CHANGE: { - /* Format: */ + /* Format: */ if (argc < 5) { ok = 0; } else { @@ -970,6 +970,7 @@ static int clusterRaftPreValidateEpoch(int type, sds data) { sdsfreesplitres(argv, argc); if (!ok) { + RAFT_STATE()->stats_proposals_rejected_prevalidation++; serverLog(LL_NOTICE, "Leader pre-validation: rejecting %s proposal (missing or stale epoch).", raftEntryTypeName(type)); } @@ -1133,7 +1134,7 @@ static uint64_t raftLogTermAt(uint64_t index) { * Returns 1 if the entry should be applied, 0 if stale. * On success, increments the shard epoch (or sets it to 1 if currently 0). * If shard_id is NULL, always returns 1 (no shard to validate). */ -static int clusterCheckAndBumpShardEpoch(const char *shard_id, uint64_t entry_epoch) { +static int clusterValidateAndBumpShardEpoch(const char *shard_id, uint64_t entry_epoch) { if (!shard_id) return 1; uint64_t current = clusterGetShardEpoch(shard_id); if (current == 0) { @@ -1148,14 +1149,14 @@ static int clusterCheckAndBumpShardEpoch(const char *shard_id, uint64_t entry_ep return 0; } -/* Validate and bump epochs for SLOT_CHANGE (two shards). +/* Validate SLOT_CHANGE (two shards). We do not bump epoch to support + * concurrent slot migration. * Returns 1 if the entry should be applied, 0 if either epoch is stale. - * On success, increments both source and target epochs. * source_shard_id may be NULL (unassigned slots). */ -static int clusterCheckAndBumpSlotChangeEpochs(const char *source_shard_id, - uint64_t source_epoch, - const char *target_shard_id, - uint64_t target_epoch) { +static int clusterValidateSlotChangeEpochs(const char *source_shard_id, + uint64_t source_epoch, + const char *target_shard_id, + uint64_t target_epoch) { uint64_t src_current = source_shard_id ? clusterGetShardEpoch(source_shard_id) : 0; uint64_t tgt_current = target_shard_id ? clusterGetShardEpoch(target_shard_id) : 0; @@ -1326,7 +1327,7 @@ static void raftLogApply(raftLogEntry *e) { if (node && node != myself) { /* Epoch validation. */ uint64_t epoch = strtoull(argv[1], NULL, 10); - if (!clusterCheckAndBumpShardEpoch(node->shard_id, epoch)) { + if (!clusterValidateAndBumpShardEpoch(node->shard_id, epoch)) { entry_error = "stale shard epoch"; serverLog(LL_NOTICE, "Applied NODE_FORGET %.40s (index %llu) [stale].", argv[0], (unsigned long long)e->index); @@ -1433,6 +1434,10 @@ static void raftLogApply(raftLogEntry *e) { break; } + if (entry_error) { + rs->stats_proposals_rejected_apply++; + } + /* Check pending proposals for a match and remove it. */ if (listLength(rs->pending_proposals) > 0) { listIter li; @@ -2903,6 +2908,8 @@ static void clusterRaftResetStats(void) { rs->stats_pubsub_bytes_received = 0; rs->stats_module_bytes_sent = 0; rs->stats_module_bytes_received = 0; + rs->stats_proposals_rejected_prevalidation = 0; + rs->stats_proposals_rejected_apply = 0; } static sds clusterRaftAppendInfoFields(sds info) { @@ -2957,6 +2964,8 @@ static sds clusterRaftAppendInfoFields(sds info) { "cluster_stats_pubsub_bytes_received:%llu\r\n" "cluster_stats_module_bytes_sent:%llu\r\n" "cluster_stats_module_bytes_received:%llu\r\n" + "cluster_stats_proposals_rejected_prevalidation:%llu\r\n" + "cluster_stats_proposals_rejected_apply:%llu\r\n" "total_cluster_links_buffer_limit_exceeded:%llu\r\n", role_str, (unsigned long long)rs->current_term, (unsigned long long)rs->commit_index, (unsigned long long)rs->last_applied, @@ -2969,6 +2978,8 @@ static sds clusterRaftAppendInfoFields(sds info) { (unsigned long long)rs->stats_pubsub_bytes_received, (unsigned long long)rs->stats_module_bytes_sent, (unsigned long long)rs->stats_module_bytes_received, + (unsigned long long)rs->stats_proposals_rejected_prevalidation, + (unsigned long long)rs->stats_proposals_rejected_apply, (unsigned long long)server.cluster->stat_cluster_links_buffer_limit_exceeded); return info; } @@ -3082,8 +3093,8 @@ static const char *clusterRaftApplySlotChange(sds data) { if (!info.valid) goto done; /* Epoch validation: validate both source and target epochs. */ - if (!clusterCheckAndBumpSlotChangeEpochs(info.source_shard_id, info.source_epoch, - info.target_shard_id, info.target_epoch)) { + if (!clusterValidateSlotChangeEpochs(info.source_shard_id, info.source_epoch, + info.target_shard_id, info.target_epoch)) { error = "stale shard epoch"; goto done; } @@ -3139,10 +3150,8 @@ static const char *clusterRaftApplySetReplica(sds data) { if (replica == myself) rs->todo_save_config = 1; char *shard_id = argv[2]; - - /* Epoch validation: validate against the shard-id from the entry. */ uint64_t epoch = strtoull(argv[3], NULL, 10); - if (!clusterCheckAndBumpShardEpoch(shard_id, epoch)) { + if (!clusterValidateAndBumpShardEpoch(shard_id, epoch)) { error = "stale shard epoch"; goto done; } @@ -3206,13 +3215,6 @@ static const char *clusterRaftApplyFailover(sds data) { clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1])); if (!replica || !primary) goto done; - /* Epoch validation: validate against the shard-id from the entry. */ - uint64_t epoch = strtoull(argv[3], NULL, 10); - if (!clusterCheckAndBumpShardEpoch(argv[2], epoch)) { - error = "stale shard epoch"; - goto done; - } - /* Validate that the primary still belongs to the shard claimed in the entry. */ if (memcmp(primary->shard_id, argv[2], CLUSTER_NAMELEN) != 0) { serverLog(LL_WARNING, "FAILOVER rejected: primary %.40s shard mismatch (expected %.40s, got %.40s).", @@ -3221,6 +3223,13 @@ static const char *clusterRaftApplyFailover(sds data) { goto done; } + /* Epoch validation: validate against the shard-id from the entry. */ + uint64_t epoch = strtoull(argv[3], NULL, 10); + if (!clusterValidateAndBumpShardEpoch(argv[2], epoch)) { + error = "stale shard epoch"; + goto done; + } + if (!nodeIsReplica(replica) || nodeIsReplica(primary) || replica->replicaof != primary) goto done; /* Transfer slots from old primary to new primary. */ diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl index 0ef5a8fa573..22f7179dee1 100644 --- a/tests/unit/cluster/cluster-raft-proto.tcl +++ b/tests/unit/cluster/cluster-raft-proto.tcl @@ -557,6 +557,11 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c set node_id [$r0 CLUSTER MYID] set node1_id [$r1 CLUSTER MYID] + + # Make r1 a replica of r0. This commits a SET_REPLICA_OF entry + # which bumps the shard epoch from 0 to 1. + $r1 CLUSTER REPLICATE $node_id + set port [srv 0 port] set cport [expr {$port + 10000}] @@ -600,13 +605,20 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c lassign $case entry_type propose_msg state_check test "Raft shard epoch: stale $entry_type is rejected (pre-validation, no log append)" { - # Record commit index before injection. + # Record commit index and rejection counter before injection. set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] + set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] # Connect and inject stale PROPOSE. set fd [connect_fake_node $cport] raft_send $fd $propose_msg - after 500 + + # Wait for the rejection counter to increase. + wait_for_condition 50 100 { + [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before + } else { + fail "Stale $entry_type proposal was not rejected" + } # Verify log index unchanged (rejected at pre-validation). set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] @@ -636,10 +648,17 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c test "Raft shard epoch: $label is rejected (pre-validation, no log append)" { set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] + set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] set fd [connect_fake_node $cport] raft_send $fd $propose_msg - after 500 + + # Wait for the rejection counter to increase. + wait_for_condition 50 100 { + [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before + } else { + fail "$label proposal was not rejected" + } # Verify log index unchanged (rejected at pre-validation). set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] @@ -649,30 +668,27 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c } } - # -------------------------------------------------------------------------- - # FAILOVER with wrong shard-id: passes pre-validation (unknown shard has - # epoch 0, so isShardEpochCurrent returns true) but is rejected at apply - # because primary's current shard_id doesn't match the entry. - # -------------------------------------------------------------------------- - - test "Raft shard epoch: FAILOVER with wrong shard-id is rejected at apply" { + test "Raft shard epoch: FAILOVER with wrong shard-id is rejected at pre-validation" { # Use a bogus shard-id that doesn't match node_id's actual shard. set wrong_shard [string repeat "1" 40] - # Get current commit index. set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] + set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] - # Propose FAILOVER with correct epoch (0, since wrong_shard has no - # epoch tracked) but wrong shard-id. + # Propose FAILOVER with wrong shard-id. set fd [connect_fake_node $cport] raft_send $fd "PROPOSE FAILOVER $node1_id $node_id $wrong_shard 0" - after 500 - # The entry passes pre-validation (epoch 0 for unknown shard is valid) - # and gets appended + committed. But apply rejects it due to shard mismatch. - # Commit index advances (entry was appended) but state is unchanged. + # Wait for the rejection counter to increase. + wait_for_condition 50 100 { + [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before + } else { + fail "FAILOVER with wrong shard-id was not rejected" + } + + # Verify log index unchanged (rejected at pre-validation). set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] - assert {$commit_after > $commit_before} + assert_equal $commit_before $commit_after # Verify the failover did NOT happen: r0 is still primary with all slots. set nodes [$r0 CLUSTER NODES] From 7893b988fb3f6887eb40d3db00a13520d10e11da Mon Sep 17 00:00:00 2001 From: Sushil Paneru Date: Sun, 7 Jun 2026 22:10:43 +0000 Subject: [PATCH 6/9] Add shard-id for SET_REPLICA_OF Signed-off-by: Sushil Paneru --- design-docs/cluster-raft.md | 9 +- src/cluster_raft.c | 159 +++++++++++++--------- tests/unit/cluster/cluster-raft-proto.tcl | 58 ++++---- 3 files changed, 131 insertions(+), 95 deletions(-) diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md index 7428a6d113b..6fd477a82eb 100644 --- a/design-docs/cluster-raft.md +++ b/design-docs/cluster-raft.md @@ -145,13 +145,14 @@ SLOT_CHANGE +SET_REPLICA_OF Set a node as replica of a primary (CLUSTER REPLICATE). A dash as - primary means promote to primary. The shard-id is the target shard: + primary means promote to primary. Both source and target shard epochs + are validated to guard against concurrent shard changes. for promotion, a new random id; for assignment, the primary's current shard-id (used as a guard against concurrent changes). -FAILOVER +FAILOVER The replica takes over the primary's slots and becomes primary. The old primary becomes a replica of the new primary. @@ -760,7 +761,7 @@ the entry is stale and is ignored. ``` FAILOVER -SET_REPLICA_OF +SET_REPLICA_OF SLOT_CHANGE NODE_FORGET ``` diff --git a/src/cluster_raft.c b/src/cluster_raft.c index a6f36196a54..466372a7119 100644 --- a/src/cluster_raft.c +++ b/src/cluster_raft.c @@ -915,13 +915,11 @@ static int clusterRaftPreValidateEpoch(int type, sds data) { int ok = 1; switch (type) { - case RAFT_ENTRY_FAILOVER: - case RAFT_ENTRY_SET_REPLICA_OF: { - /* Format: <..ids..> (argv[2] and argv[3]) */ + case RAFT_ENTRY_FAILOVER: { + /* Format: */ if (argc < 4) { ok = 0; } else { - /* Validate shard-id matches the primary's current shard. */ clusterNode *primary = (sdslen(argv[1]) == CLUSTER_NAMELEN) ? clusterLookupNode(argv[1], CLUSTER_NAMELEN) : NULL; @@ -934,6 +932,29 @@ static int clusterRaftPreValidateEpoch(int type, sds data) { } break; } + case RAFT_ENTRY_SET_REPLICA_OF: { + /* Format: */ + if (argc < 6) { + ok = 0; + } else { + /* Validate source shard epoch. */ + uint64_t source_epoch = strtoull(argv[2], NULL, 10); + ok = isShardEpochCurrent(argv[1], source_epoch); + /* Validate target shard epoch. */ + if (ok) { + uint64_t target_epoch = strtoull(argv[5], NULL, 10); + ok = isShardEpochCurrent(argv[4], target_epoch); + } + /* Validate primary's shard matches target-shard. */ + if (ok && sdslen(argv[3]) == CLUSTER_NAMELEN) { + clusterNode *primary = clusterLookupNode(argv[3], CLUSTER_NAMELEN); + if (primary && memcmp(primary->shard_id, argv[4], CLUSTER_NAMELEN) != 0) { + ok = 0; + } + } + } + break; + } case RAFT_ENTRY_SLOT_CHANGE: { /* Format: */ if (argc < 5) { @@ -1130,33 +1151,13 @@ static uint64_t raftLogTermAt(uint64_t index) { * Shard epoch validation helpers * -------------------------------------------------------------------------- */ -/* Validate and bump epoch for a single-shard entry. - * Returns 1 if the entry should be applied, 0 if stale. - * On success, increments the shard epoch (or sets it to 1 if currently 0). - * If shard_id is NULL, always returns 1 (no shard to validate). */ -static int clusterValidateAndBumpShardEpoch(const char *shard_id, uint64_t entry_epoch) { - if (!shard_id) return 1; - uint64_t current = clusterGetShardEpoch(shard_id); - if (current == 0) { - /* First operation on this shard — apply unconditionally, bump to 1. */ - clusterSetShardEpoch(shard_id, 1); - return 1; - } - if (entry_epoch == current) { - clusterSetShardEpoch(shard_id, current + 1); - return 1; - } - return 0; -} - -/* Validate SLOT_CHANGE (two shards). We do not bump epoch to support - * concurrent slot migration. - * Returns 1 if the entry should be applied, 0 if either epoch is stale. - * source_shard_id may be NULL (unassigned slots). */ -static int clusterValidateSlotChangeEpochs(const char *source_shard_id, - uint64_t source_epoch, - const char *target_shard_id, - uint64_t target_epoch) { +/* Validate a pair of shard epochs (source and target). + * Returns 1 if both are current, 0 if either is stale. + * Either shard_id may be NULL (skipped). Does not bump. */ +static int clusterValidateShardEpochPair(const char *source_shard_id, + uint64_t source_epoch, + const char *target_shard_id, + uint64_t target_epoch) { uint64_t src_current = source_shard_id ? clusterGetShardEpoch(source_shard_id) : 0; uint64_t tgt_current = target_shard_id ? clusterGetShardEpoch(target_shard_id) : 0; @@ -1325,15 +1326,17 @@ static void raftLogApply(raftLogEntry *e) { } clusterNode *node = clusterLookupNode(argv[0], sdslen(argv[0])); if (node && node != myself) { - /* Epoch validation. */ + /* Epoch validation (no bump yet). */ uint64_t epoch = strtoull(argv[1], NULL, 10); - if (!clusterValidateAndBumpShardEpoch(node->shard_id, epoch)) { + if (!isShardEpochCurrent(node->shard_id, epoch)) { entry_error = "stale shard epoch"; - serverLog(LL_NOTICE, "Applied NODE_FORGET %.40s (index %llu) [stale].", - argv[0], (unsigned long long)e->index); sdsfreesplitres(argv, argc); break; } + /* Save shard_id before deleting the node. */ + char shard_id[CLUSTER_NAMELEN]; + memcpy(shard_id, node->shard_id, CLUSTER_NAMELEN); + /* Detach link before deleting so clusterReadHandler can detect * that the node it was talking to is gone. */ if (node->link) { @@ -1343,6 +1346,10 @@ static void raftLogApply(raftLogEntry *e) { clusterDelNode(node); rs->todo_update_slot_coverage = 1; rs->todo_invalidate_slots_cache = 1; + + /* Bump shard epoch after successful delete. */ + uint64_t current = clusterGetShardEpoch(shard_id); + clusterSetShardEpoch(shard_id, current == 0 ? 1 : current + 1); } serverLog(LL_NOTICE, "Applied NODE_FORGET %.40s (index %llu).", argv[0], (unsigned long long)e->index); @@ -1953,11 +1960,8 @@ static void clusterRaftInit(void) { rs->my_last_committed_info = sdsempty(); rs->last_node_info_check = monotonicMs(); rs->last_repl_offsets_broadcast = monotonicMs(); -<<<<<<< HEAD rs->todo_update_slot_coverage = 1; -======= rs->shard_epochs = dictCreate(&raftShardEpochDictType); ->>>>>>> 4ec207fbe (Use existing configEpoch field API for shard epoch) server.cluster->size = 0; /* Incremented by NODE_JOIN apply */ } @@ -3093,8 +3097,8 @@ static const char *clusterRaftApplySlotChange(sds data) { if (!info.valid) goto done; /* Epoch validation: validate both source and target epochs. */ - if (!clusterValidateSlotChangeEpochs(info.source_shard_id, info.source_epoch, - info.target_shard_id, info.target_epoch)) { + if (!clusterValidateShardEpochPair(info.source_shard_id, info.source_epoch, + info.target_shard_id, info.target_epoch)) { error = "stale shard epoch"; goto done; } @@ -3136,28 +3140,35 @@ static const char *clusterRaftApplySlotChange(sds data) { } /* Apply a SET_REPLICA_OF entry. - * Format: " " */ + * Format: " " */ static const char *clusterRaftApplySetReplica(sds data) { clusterRaftState *rs = RAFT_STATE(); const char *error = NULL; int argc; sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc); - if (!argv || argc != 4) goto done; + if (!argv || argc != 6) goto done; clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0])); if (!replica) goto done; - if (replica == myself) rs->todo_save_config = 1; - char *shard_id = argv[2]; uint64_t epoch = strtoull(argv[3], NULL, 10); - if (!clusterValidateAndBumpShardEpoch(shard_id, epoch)) { + char *source_shard = argv[1]; + uint64_t source_epoch = strtoull(argv[2], NULL, 10); + char *target_shard = argv[4]; + uint64_t target_epoch = strtoull(argv[5], NULL, 10); + + /* Validate both shard epochs without bumping. */ + if (!clusterValidateShardEpochPair(source_shard, source_epoch, + target_shard, target_epoch)) { error = "stale shard epoch"; goto done; } - if (sdslen(argv[1]) == 1 && argv[1][0] == '-') { - /* Promote to primary with the shard-id from the entry. */ + if (replica == myself) rs->todo_save_config = 1; + + if (sdslen(argv[3]) == 1 && argv[3][0] == '-') { + /* Promote to primary with the target-shard from the entry. */ if (nodeIsReplica(replica)) { if (replica->replicaof) clusterNodeRemoveReplica(replica->replicaof, replica); replica->flags &= ~CLUSTER_NODE_REPLICA; @@ -3166,13 +3177,13 @@ static const char *clusterRaftApplySetReplica(sds data) { if (replica == myself) rs->todo_update_replication = 1; } clusterRemoveNodeFromShard(replica); - memcpy(replica->shard_id, shard_id, CLUSTER_NAMELEN); - clusterAddNodeToShard(shard_id, replica); + memcpy(replica->shard_id, target_shard, CLUSTER_NAMELEN); + clusterAddNodeToShard(target_shard, replica); } else { - clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1])); + clusterNode *primary = clusterLookupNode(argv[3], sdslen(argv[3])); if (!primary) goto done; /* Guard: skip if the primary's shard-id has changed. */ - if (memcmp(primary->shard_id, shard_id, CLUSTER_NAMELEN) != 0) goto done; + if (memcmp(primary->shard_id, target_shard, CLUSTER_NAMELEN) != 0) goto done; if (replica == myself) { /* Update cluster state; actual replication change deferred to beforeSleep. */ if (myself->replicaof) clusterNodeRemoveReplica(myself->replicaof, myself); @@ -3196,6 +3207,10 @@ static const char *clusterRaftApplySetReplica(sds data) { clusterAddNodeToShard(primary->shard_id, replica); } } + + /* Bump both shard epochs after successful apply. */ + clusterSetShardEpoch(source_shard, source_epoch + 1); + clusterSetShardEpoch(target_shard, target_epoch + 1); done: if (argv) sdsfreesplitres(argv, argc); return error; @@ -3223,9 +3238,9 @@ static const char *clusterRaftApplyFailover(sds data) { goto done; } - /* Epoch validation: validate against the shard-id from the entry. */ - uint64_t epoch = strtoull(argv[3], NULL, 10); - if (!clusterValidateAndBumpShardEpoch(argv[2], epoch)) { + /* Epoch validation: validate against the shard-id from the entry (no bump yet). */ + uint64_t shard_epoch = strtoull(argv[3], NULL, 10); + if (!isShardEpochCurrent(argv[2], shard_epoch)) { error = "stale shard epoch"; goto done; } @@ -3276,6 +3291,10 @@ static const char *clusterRaftApplyFailover(sds data) { rs->todo_update_replication = 1; rs->todo_save_config = 1; } + + /* Bump shard epoch after successful apply. */ + clusterSetShardEpoch(argv[2], shard_epoch + 1); + done: if (argv) sdsfreesplitres(argv, argc); return error; @@ -3293,25 +3312,29 @@ static void clusterRaftForgetNode(const char *node_id, size_t id_len, void *ctx, } static void clusterRaftSetReplicaOf(clusterNode *primary, void *ctx, void (*callback)(void *ctx, const char *error)) { - /* Propose SET_REPLICA_OF: " " - * For promotion (dash): shard-id is a new random id. - * For assignment: shard-id is the primary's current shard-id, used - * as a guard against concurrent shard changes. */ + /* Propose SET_REPLICA_OF: + * " " + * Source is myself's current shard. Target is the primary's shard (for assignment) + * or a new random shard (for promotion to primary). */ + uint64_t source_epoch = clusterGetShardEpoch(myself->shard_id); + + char target_shard[CLUSTER_NAMELEN]; + if (primary) { + memcpy(target_shard, primary->shard_id, CLUSTER_NAMELEN); + } else { + getRandomHexChars(target_shard, CLUSTER_NAMELEN); + } + uint64_t target_epoch = primary ? clusterGetShardEpoch(primary->shard_id) : 0; + sds entry = sdsnew("SET_REPLICA_OF "); entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); entry = sdscatlen(entry, " ", 1); + entry = sdscatlen(entry, myself->shard_id, CLUSTER_NAMELEN); + entry = sdscatfmt(entry, " %U ", (unsigned long long)source_epoch); entry = sdscatlen(entry, primary ? primary->name : "-", primary ? CLUSTER_NAMELEN : 1); entry = sdscatlen(entry, " ", 1); - if (primary) { - entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN); - } else { - char new_shard_id[CLUSTER_NAMELEN]; - getRandomHexChars(new_shard_id, CLUSTER_NAMELEN); - entry = sdscatlen(entry, new_shard_id, CLUSTER_NAMELEN); - } - /* Append target shard epoch: use primary's shard epoch for assignment, 0 for promotion. */ - uint64_t epoch = primary ? clusterGetShardEpoch(primary->shard_id) : 0; - entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); + entry = sdscatlen(entry, target_shard, CLUSTER_NAMELEN); + entry = sdscatfmt(entry, " %U", (unsigned long long)target_epoch); clusterRaftPropose(entry, ctx, callback); sdsfree(entry); } diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl index 22f7179dee1..14a04e2b4b5 100644 --- a/tests/unit/cluster/cluster-raft-proto.tcl +++ b/tests/unit/cluster/cluster-raft-proto.tcl @@ -639,7 +639,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c set missing_epoch_proposals [list \ [list "SLOT_CHANGE (missing epoch)" "PROPOSE SLOT_CHANGE $node_id 1 $node1_id"] \ [list "FAILOVER (missing epoch)" "PROPOSE FAILOVER $node1_id $node_id $node0_shard"] \ - [list "SET_REPLICA_OF (missing epoch)" "PROPOSE SET_REPLICA_OF $node1_id $node_id [string repeat a 40]"] \ + [list "SET_REPLICA_OF (missing epoch)" "PROPOSE SET_REPLICA_OF $node1_id [string repeat a 40] 1 $node_id [string repeat b 40]"] \ [list "NODE_FORGET (missing epoch)" "PROPOSE NODE_FORGET $node1_id"] \ ] @@ -668,34 +668,46 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c } } - test "Raft shard epoch: FAILOVER with wrong shard-id is rejected at pre-validation" { - # Use a bogus shard-id that doesn't match node_id's actual shard. - set wrong_shard [string repeat "1" 40] + # -------------------------------------------------------------------------- + # Wrong shard-id: proposals where the shard-id doesn't match the node's + # actual shard are rejected at pre-validation. + # -------------------------------------------------------------------------- - set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] - set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] + set wrong_shard [string repeat "1" 40] - # Propose FAILOVER with wrong shard-id. - set fd [connect_fake_node $cport] - raft_send $fd "PROPOSE FAILOVER $node1_id $node_id $wrong_shard 0" + set wrong_shard_proposals [list \ + [list "FAILOVER" "PROPOSE FAILOVER $node1_id $node_id $wrong_shard 0"] \ + [list "SET_REPLICA_OF" "PROPOSE SET_REPLICA_OF $node1_id $node0_shard 1 $node_id $wrong_shard 0"] \ + ] - # Wait for the rejection counter to increase. - wait_for_condition 50 100 { - [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before - } else { - fail "FAILOVER with wrong shard-id was not rejected" - } + foreach case $wrong_shard_proposals { + lassign $case entry_type propose_msg - # Verify log index unchanged (rejected at pre-validation). - set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] - assert_equal $commit_before $commit_after + test "Raft shard epoch: $entry_type with wrong shard-id is rejected at pre-validation" { + set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] + set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] - # Verify the failover did NOT happen: r0 is still primary with all slots. - set nodes [$r0 CLUSTER NODES] - assert_match "*$node_id*myself,master*" $nodes - assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned] + set fd [connect_fake_node $cport] + raft_send $fd $propose_msg - close $fd + # Wait for the rejection counter to increase. + wait_for_condition 50 100 { + [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before + } else { + fail "$entry_type with wrong shard-id was not rejected" + } + + # Verify log index unchanged (rejected at pre-validation, epoch not bumped). + set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] + assert_equal $commit_before $commit_after + + # Verify cluster state unchanged. + set nodes [$r0 CLUSTER NODES] + assert_match "*$node_id*myself,master*" $nodes + assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned] + + close $fd + } } } From b49b879a02ffb0f335427a4d778f5fa0adeed32d Mon Sep 17 00:00:00 2001 From: Sushil Paneru Date: Mon, 8 Jun 2026 04:44:01 +0000 Subject: [PATCH 7/9] Added pre-Validation rejection in raft leader Signed-off-by: Sushil Paneru --- src/cluster_raft.c | 117 +++++++++++----- tests/support/cluster_util.tcl | 35 ++++- tests/unit/cluster/cluster-raft-proto.tcl | 126 +++++++++++++----- tests/unit/cluster/faster-failover.tcl | 4 +- .../cluster/pubsubshard-slot-migration.tcl | 2 +- 5 files changed, 208 insertions(+), 76 deletions(-) diff --git a/src/cluster_raft.c b/src/cluster_raft.c index 466372a7119..ed882a80ef9 100644 --- a/src/cluster_raft.c +++ b/src/cluster_raft.c @@ -41,6 +41,9 @@ #include +/* Generic error message returned to clients when a proposal is rejected. */ +#define GENERIC_PROPOSAL_REJECTION_MSG "proposal rejected by raft leader" + /* From module.c */ void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len); @@ -321,6 +324,7 @@ static clusterMsgSendBlock *clusterRaftBuildAllOffsetsMsg(void) { static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, const char *error)); static void clusterRaftDeferPendingProposals(void); +static void clusterRaftCompletePendingProposal(int type, sds data, const char *error); static int clusterRaftPreValidateEpoch(int type, sds data); static void clusterRaftUpdateMyself(int old_flags); static sds clusterRaftBuildMyNodeInfo(void); @@ -784,7 +788,7 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, if (rs->role == RAFT_ROLE_LEADER) { /* Pre-validate epoch on the leader to reject obviously stale proposals. */ if (!clusterRaftPreValidateEpoch(type, data)) { - if (callback) callback(ctx, "stale shard epoch"); + if (callback) callback(ctx, GENERIC_PROPOSAL_REJECTION_MSG); sdsfree(data); return; } @@ -1019,6 +1023,14 @@ static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) { /* Pre-validate epoch on the leader to reject obviously stale proposals. */ if (!clusterRaftPreValidateEpoch(type, data)) { + /* Send REJECT back to the proposing follower so it can unblock the client. */ + if (link) { + sds msg = wireNewMsg("REJECT"); + msg = sdscatlen(msg, " ", 1); + msg = sdscatlen(msg, entry, sdslen(entry)); + msg = wireFinishMsg(msg); + clusterRaftSendMsg(link, msg); + } sdsfree(data); sdsfree(entry); return 1; @@ -1039,6 +1051,26 @@ static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) { return 1; } +/* Handle a REJECT message from the leader. The leader sends this when it + * rejects a forwarded PROPOSE at pre-validation. We match it against our + * pending_proposals and fire the callback with an error so the client + * gets an immediate reply instead of hanging until timeout. + * Format: "REJECT " (echoes back the original entry). */ +static int clusterRaftProcessReject(clusterLink *link, int argc, sds *argv) { + UNUSED(link); + + /* argv[0]="REJECT", argv[1]=type, argv[2..]=data */ + if (argc < 2) return 1; + + int type = raftEntryTypeByName(argv[1]); + if (type < 0) return 1; + + sds data = (argc >= 3) ? sdsjoinsds(argv + 2, argc - 2, " ", 1) : sdsempty(); + clusterRaftCompletePendingProposal(type, data, GENERIC_PROPOSAL_REJECTION_MSG); + sdsfree(data); + return 1; +} + /* -------------------------------------------------------------------------- * Raft election and heartbeat * -------------------------------------------------------------------------- */ @@ -1173,6 +1205,28 @@ static int clusterValidateShardEpochPair(const char *source_shard_id, return 1; } +/* Find a pending proposal matching type+data, fire its callback, and remove it. + * Called both when an entry is applied (from raftLogApply) and when the leader + * sends a REJECT for a forwarded proposal. */ +static void clusterRaftCompletePendingProposal(int type, sds data, const char *error) { + clusterRaftState *rs = RAFT_STATE(); + if (listLength(rs->pending_proposals) == 0) return; + + listIter li; + listNode *ln; + listRewind(rs->pending_proposals, &li); + while ((ln = listNext(&li)) != NULL) { + raftPendingProposal *pp = listNodeValue(ln); + if (pp->type == type && !sdscmp(pp->data, data)) { + if (pp->callback) pp->callback(pp->ctx, error); + sdsfree(pp->data); + zfree(pp); + listDelNode(rs->pending_proposals, ln); + break; + } + } +} + /* Apply a committed log entry. */ static void raftLogApply(raftLogEntry *e) { clusterRaftState *rs = RAFT_STATE(); @@ -1329,7 +1383,7 @@ static void raftLogApply(raftLogEntry *e) { /* Epoch validation (no bump yet). */ uint64_t epoch = strtoull(argv[1], NULL, 10); if (!isShardEpochCurrent(node->shard_id, epoch)) { - entry_error = "stale shard epoch"; + entry_error = GENERIC_PROPOSAL_REJECTION_MSG; sdsfreesplitres(argv, argc); break; } @@ -1446,21 +1500,7 @@ static void raftLogApply(raftLogEntry *e) { } /* Check pending proposals for a match and remove it. */ - if (listLength(rs->pending_proposals) > 0) { - listIter li; - listNode *ln; - listRewind(rs->pending_proposals, &li); - while ((ln = listNext(&li)) != NULL) { - raftPendingProposal *pp = listNodeValue(ln); - if (pp->type == e->type && !sdscmp(pp->data, e->data)) { - if (pp->callback) pp->callback(pp->ctx, entry_error); - sdsfree(pp->data); - zfree(pp); - listDelNode(rs->pending_proposals, ln); - break; - } - } - } + clusterRaftCompletePendingProposal(e->type, e->data, entry_error); } /* -------------------------------------------------------------------------- @@ -2530,6 +2570,8 @@ static int clusterRaftProcessMessage(struct clusterLink *link) { ret = clusterRaftProcessMeetRejected(link, argc, argv); } else if (!strcasecmp(argv[0], "PROPOSE")) { ret = clusterRaftProcessPropose(link, argc, argv); + } else if (!strcasecmp(argv[0], "REJECT")) { + ret = clusterRaftProcessReject(link, argc, argv); } else if (!strcasecmp(argv[0], "AE")) { ret = clusterRaftProcessAppendEntries(link, argc, argv, lines + 1, line_count - 1); } else if (!strcasecmp(argv[0], "AE_ACK")) { @@ -3091,16 +3133,15 @@ static const char *clusterRaftApplySlotChange(sds data) { int argc; sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc); /* Need at least: source + source_epoch + target + target_epoch + one_range = 5 */ - if (!argv || argc < 5) goto done; + if (!argv || argc < 5) goto reject; slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc); - if (!info.valid) goto done; + if (!info.valid) goto reject; /* Epoch validation: validate both source and target epochs. */ if (!clusterValidateShardEpochPair(info.source_shard_id, info.source_epoch, info.target_shard_id, info.target_epoch)) { - error = "stale shard epoch"; - goto done; + goto reject; } /* Range fields are argv[range_end] through argv[argc-1]. */ @@ -3134,6 +3175,10 @@ static const char *clusterRaftApplySlotChange(sds data) { } } } + goto done; + +reject: + if (!error) error = GENERIC_PROPOSAL_REJECTION_MSG; done: if (argv) sdsfreesplitres(argv, argc); return error; @@ -3146,10 +3191,10 @@ static const char *clusterRaftApplySetReplica(sds data) { const char *error = NULL; int argc; sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc); - if (!argv || argc != 6) goto done; + if (!argv || argc != 6) goto reject; clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0])); - if (!replica) goto done; + if (!replica) goto reject; char *shard_id = argv[2]; uint64_t epoch = strtoull(argv[3], NULL, 10); @@ -3161,8 +3206,7 @@ static const char *clusterRaftApplySetReplica(sds data) { /* Validate both shard epochs without bumping. */ if (!clusterValidateShardEpochPair(source_shard, source_epoch, target_shard, target_epoch)) { - error = "stale shard epoch"; - goto done; + goto reject; } if (replica == myself) rs->todo_save_config = 1; @@ -3181,9 +3225,9 @@ static const char *clusterRaftApplySetReplica(sds data) { clusterAddNodeToShard(target_shard, replica); } else { clusterNode *primary = clusterLookupNode(argv[3], sdslen(argv[3])); - if (!primary) goto done; + if (!primary) goto reject; /* Guard: skip if the primary's shard-id has changed. */ - if (memcmp(primary->shard_id, target_shard, CLUSTER_NAMELEN) != 0) goto done; + if (memcmp(primary->shard_id, target_shard, CLUSTER_NAMELEN) != 0) goto reject; if (replica == myself) { /* Update cluster state; actual replication change deferred to beforeSleep. */ if (myself->replicaof) clusterNodeRemoveReplica(myself->replicaof, myself); @@ -3211,6 +3255,10 @@ static const char *clusterRaftApplySetReplica(sds data) { /* Bump both shard epochs after successful apply. */ clusterSetShardEpoch(source_shard, source_epoch + 1); clusterSetShardEpoch(target_shard, target_epoch + 1); + goto done; + +reject: + if (!error) error = GENERIC_PROPOSAL_REJECTION_MSG; done: if (argv) sdsfreesplitres(argv, argc); return error; @@ -3224,28 +3272,26 @@ static const char *clusterRaftApplyFailover(sds data) { const char *error = NULL; int argc; sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc); - if (!argv || argc != 4) goto done; + if (!argv || argc != 4) goto reject; clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0])); clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1])); - if (!replica || !primary) goto done; + if (!replica || !primary) goto reject; /* Validate that the primary still belongs to the shard claimed in the entry. */ if (memcmp(primary->shard_id, argv[2], CLUSTER_NAMELEN) != 0) { serverLog(LL_WARNING, "FAILOVER rejected: primary %.40s shard mismatch (expected %.40s, got %.40s).", primary->name, argv[2], primary->shard_id); - error = "primary shard mismatch"; - goto done; + goto reject; } /* Epoch validation: validate against the shard-id from the entry (no bump yet). */ uint64_t shard_epoch = strtoull(argv[3], NULL, 10); if (!isShardEpochCurrent(argv[2], shard_epoch)) { - error = "stale shard epoch"; - goto done; + goto reject; } - if (!nodeIsReplica(replica) || nodeIsReplica(primary) || replica->replicaof != primary) goto done; + if (!nodeIsReplica(replica) || nodeIsReplica(primary) || replica->replicaof != primary) goto reject; /* Transfer slots from old primary to new primary. */ for (int j = 0; j < CLUSTER_SLOTS; j++) { @@ -3294,7 +3340,10 @@ static const char *clusterRaftApplyFailover(sds data) { /* Bump shard epoch after successful apply. */ clusterSetShardEpoch(argv[2], shard_epoch + 1); + goto done; +reject: + if (!error) error = GENERIC_PROPOSAL_REJECTION_MSG; done: if (argv) sdsfreesplitres(argv, argc); return error; diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index ec1f91fdf0f..00a7e4f63a8 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -185,13 +185,44 @@ proc cluster_allocate_slots {masters replicas} { } } +# Issue CLUSTER REPLICATE with retry on epoch rejection. In raft mode, +# concurrent REPLICATE commands for the same shard may race with shard +# epoch bumps, causing transient "proposal rejected" errors. +proc cluster_replicate_with_retry {node_id target_id {max_retries 5}} { + for {set attempt 0} {$attempt < $max_retries} {incr attempt} { + if {[catch {R $node_id cluster replicate $target_id} err]} { + if {$attempt < $max_retries - 1 && [string match "*proposal rejected*" $err]} { + after 100 + continue + } + error $err + } + return + } +} + +# Same as above but takes a client object instead of a node index. +# Returns the result on success. +proc cluster_client_replicate_with_retry {client target_id {max_retries 5}} { + for {set attempt 0} {$attempt < $max_retries} {incr attempt} { + if {[catch {$client cluster replicate $target_id} result]} { + if {$attempt < $max_retries - 1 && [string match "*proposal rejected*" $result]} { + after 100 + continue + } + error $result + } + return $result + } +} + proc default_replica_allocation {masters replicas} { # Setup master/replica relationships set node_count [expr $masters + $replicas] for {set i 0} {$i < $masters} {incr i} { set nodeid [R $i CLUSTER MYID] for {set j [expr $i + $masters]} {$j < $node_count} {incr j $masters} { - R $j CLUSTER REPLICATE $nodeid + cluster_replicate_with_retry $j $nodeid } } } @@ -204,7 +235,7 @@ proc cluster_allocate_replicas {masters replicas} { set master_id [expr {$j % $masters}] set replica_id [cluster_find_available_replica $masters] set master_myself [cluster_get_myself $master_id] - R $replica_id cluster replicate [dict get $master_myself id] + cluster_replicate_with_retry $replica_id [dict get $master_myself id] } } diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl index 14a04e2b4b5..6b8919244b4 100644 --- a/tests/unit/cluster/cluster-raft-proto.tcl +++ b/tests/unit/cluster/cluster-raft-proto.tcl @@ -84,6 +84,16 @@ proc raft_listen_and_accept {port_var {timeout 5000} {before_accept {}}} { return $::_raft_accepted } +# Connect a fake node to a cluster bus port and complete HELLO handshake. +# Returns the connected fd. +proc raft_connect_fake_node {host cport fake_id fake_addr} { + set fd [raft_connect $host $cport] + raft_send $fd "HELLO $fake_id $fake_addr" + set reply [raft_recv $fd] + assert_match "HI *" $reply + return $fd +} + # Parse an AE message and reply with AE_ACK. # Returns the last log index after applying the entries. proc raft_reply_ae_ack {fd ae_msg repl_offset} { @@ -577,17 +587,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c } assert {$node0_shard ne ""} - # Helper: connect to cluster bus and complete HELLO handshake. - proc connect_fake_node {cport} { - set fake_id [string repeat "f" 40] - set fake_shard [string repeat "e" 40] - set fake_addr "127.0.0.1:9999@19999,,tls-port=0,shard-id=$fake_shard" - set fd [raft_connect 127.0.0.1 $cport] - raft_send $fd "HELLO $fake_id $fake_addr" - set reply [raft_recv $fd] - assert_match "HI *" $reply - return $fd - } + set fake_id [string repeat "f" 40] + set fake_addr "127.0.0.1:9999@19999,,tls-port=0,shard-id=[string repeat e 40]" # Each entry: {entry_type propose_msg state_check} set stale_proposals [list \ @@ -610,15 +611,16 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] # Connect and inject stale PROPOSE. - set fd [connect_fake_node $cport] + set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr] raft_send $fd $propose_msg - # Wait for the rejection counter to increase. - wait_for_condition 50 100 { - [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before - } else { - fail "Stale $entry_type proposal was not rejected" - } + # Expect a REJECT message back from the leader. + set reply [raft_recv $fd 2000] + assert_match "REJECT *" $reply + + # Verify rejection counter increased. + set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] + assert {$rejected_after > $rejected_before} # Verify log index unchanged (rejected at pre-validation). set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] @@ -650,15 +652,16 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] - set fd [connect_fake_node $cport] + set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr] raft_send $fd $propose_msg - # Wait for the rejection counter to increase. - wait_for_condition 50 100 { - [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before - } else { - fail "$label proposal was not rejected" - } + # Expect a REJECT message back from the leader. + set reply [raft_recv $fd 2000] + assert_match "REJECT *" $reply + + # Verify rejection counter increased. + set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] + assert {$rejected_after > $rejected_before} # Verify log index unchanged (rejected at pre-validation). set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] @@ -687,28 +690,77 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] - set fd [connect_fake_node $cport] + set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr] raft_send $fd $propose_msg - # Wait for the rejection counter to increase. - wait_for_condition 50 100 { - [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before - } else { - fail "$entry_type with wrong shard-id was not rejected" - } + # Expect a REJECT message back from the leader. + set reply [raft_recv $fd 2000] + assert_match "REJECT *" $reply + + # Verify rejection counter increased. + set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] + assert {$rejected_after > $rejected_before} # Verify log index unchanged (rejected at pre-validation, epoch not bumped). set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] assert_equal $commit_before $commit_after - # Verify cluster state unchanged. - set nodes [$r0 CLUSTER NODES] - assert_match "*$node_id*myself,master*" $nodes - assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned] - close $fd } } + + # -------------------------------------------------------------------------- + # Duplicate FAILOVER: two clients on the follower both send FAILOVER FORCE + # simultaneously. The first succeeds, the second is rejected at apply time + # (epoch bumped by the first). Neither client should hang. + # -------------------------------------------------------------------------- + + test "Raft shard epoch: duplicate FAILOVER from follower - second client gets rejection" { + set prevalidation_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] + set apply_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_apply] + + # Two deferred clients on r1 (the follower/replica). + set c1 [valkey_deferring_client -1] + set c2 [valkey_deferring_client -1] + + # Both issue CLUSTER FAILOVER FORCE concurrently. + $c1 CLUSTER FAILOVER FORCE + $c2 CLUSTER FAILOVER FORCE + + # Wait for one of the rejection counters to increase. + wait_for_condition 50 100 { + [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $prevalidation_before || + [get_cluster_info_field $r0 cluster_stats_proposals_rejected_apply] > $apply_before + } else { + fail "Second FAILOVER was not rejected" + } + + # Read both replies. One should succeed (OK), the other should + # get a rejection error. Neither should hang. + # Use catch because error replies throw in the deferring client. + if {[catch {set reply1 [$c1 read]} err1]} { + set reply1 $err1 + } + if {[catch {set reply2 [$c2 read]} err2]} { + set reply2 $err2 + } + + # Exactly one should be OK and the other should be rejected. + set ok_count 0 + set err_count 0 + foreach reply [list $reply1 $reply2] { + if {$reply eq "OK"} { + incr ok_count + } elseif {[string match "*proposal rejected by raft leader*" $reply]} { + incr err_count + } + } + assert_equal 1 $ok_count + assert_equal 1 $err_count + + $c1 close + $c2 close + } } } ;# tags diff --git a/tests/unit/cluster/faster-failover.tcl b/tests/unit/cluster/faster-failover.tcl index 6ecfbf73c99..cb06b8125bc 100644 --- a/tests/unit/cluster/faster-failover.tcl +++ b/tests/unit/cluster/faster-failover.tcl @@ -40,8 +40,8 @@ start_cluster 3 1 {overrides {cluster-ping-interval 1000 cluster-node-timeout 50 # R3 and R4 are both replicas of R0. proc single_primary_replica_allocation {masters replicas} { set master0_id [R 0 CLUSTER MYID] - R 3 CLUSTER REPLICATE $master0_id - R 4 CLUSTER REPLICATE $master0_id + cluster_replicate_with_retry 3 $master0_id + cluster_replicate_with_retry 4 $master0_id } # Test 1: Single primary failure. diff --git a/tests/unit/cluster/pubsubshard-slot-migration.tcl b/tests/unit/cluster/pubsubshard-slot-migration.tcl index c5a324f0949..3d49ca76ed5 100644 --- a/tests/unit/cluster/pubsubshard-slot-migration.tcl +++ b/tests/unit/cluster/pubsubshard-slot-migration.tcl @@ -156,7 +156,7 @@ test "Move a replica to another primary, verify client receives sunsubscribe on $nodefrom(link) spublish $channelname hello assert_equal {smessage mychannel2 hello} [$subscribeclient read] - assert_equal {OK} [$replica_client cluster replicate $nodeto(id)] + assert_equal {OK} [cluster_client_replicate_with_retry $replica_client $nodeto(id)] set msg [$subscribeclient read] assert {"sunsubscribe" eq [lindex $msg 0]} From f4d7d1ecf1149a64fc8389d2e35a03fbe7712d6b Mon Sep 17 00:00:00 2001 From: Sushil Paneru Date: Tue, 16 Jun 2026 06:43:43 +0000 Subject: [PATCH 8/9] Addressed comments Signed-off-by: Sushil Paneru --- design-docs/cluster-raft.md | 2 +- src/cluster_raft.c | 87 ++++++++++++----------- tests/unit/cluster/cli.tcl | 2 +- tests/unit/cluster/cluster-raft-proto.tcl | 39 ++++------ 4 files changed, 62 insertions(+), 68 deletions(-) diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md index 6fd477a82eb..11a1961dafd 100644 --- a/design-docs/cluster-raft.md +++ b/design-docs/cluster-raft.md @@ -139,7 +139,7 @@ NODE_JOIN
when the entry is committed. NODE_FORGET - Remove a node from the cluster (CLUSTER FORGET). Not yet implemented. + Remove a node from the cluster (CLUSTER FORGET). SLOT_CHANGE [ ...] Assign or remove slot ownership. A dash means "no owner" (delete diff --git a/src/cluster_raft.c b/src/cluster_raft.c index ed882a80ef9..209c4694837 100644 --- a/src/cluster_raft.c +++ b/src/cluster_raft.c @@ -43,6 +43,7 @@ /* Generic error message returned to clients when a proposal is rejected. */ #define GENERIC_PROPOSAL_REJECTION_MSG "proposal rejected by raft leader" +#define STALE_SHARD_EPOCH_REJECTION_MSG "proposal rejected due to stale shard epoch" /* From module.c */ void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len); @@ -188,8 +189,6 @@ typedef struct { uint64_t stats_pubsub_bytes_received; uint64_t stats_module_bytes_sent; uint64_t stats_module_bytes_received; - uint64_t stats_proposals_rejected_prevalidation; - uint64_t stats_proposals_rejected_apply; /* Shard epoch tracking (per-shard monotonic counter for stale proposal detection). */ dict *shard_epochs; @@ -441,7 +440,7 @@ static void clusterRaftSelfJoin(void) { if (sdslen(slots) > 0) { entry = sdsnew("SLOT_CHANGE - 0 "); entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); - entry = sdscatfmt(entry, " %U", (unsigned long long)0); + entry = sdscat(entry, " 0"); entry = sdscatsds(entry, slots); clusterRaftPropose(entry, NULL, NULL); sdsfree(entry); @@ -788,7 +787,7 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, if (rs->role == RAFT_ROLE_LEADER) { /* Pre-validate epoch on the leader to reject obviously stale proposals. */ if (!clusterRaftPreValidateEpoch(type, data)) { - if (callback) callback(ctx, GENERIC_PROPOSAL_REJECTION_MSG); + if (callback) callback(ctx, STALE_SHARD_EPOCH_REJECTION_MSG); sdsfree(data); return; } @@ -879,34 +878,33 @@ typedef struct { uint64_t source_epoch; uint64_t target_epoch; int range_end; /* Index in argv where ranges begin (first range element). */ - int valid; /* 1 if parsing succeeded, 0 otherwise. */ } slotChangeEpochInfo; /* Parse fields from a SLOT_CHANGE entry's split argv. * Format: " " - * Caller must have verified argc >= 5. */ -static slotChangeEpochInfo parseSlotChangeEpochs(sds *argv, int argc) { - slotChangeEpochInfo info = {0}; + * Caller must have verified argc >= 5. + * Returns true if parsing succeeded, false otherwise. */ +static bool parseSlotChangeEpochs(sds *argv, int argc, slotChangeEpochInfo *info) { + memset(info, 0, sizeof(*info)); /* argv[0] = source-id-or-dash */ - info.source_owner = (sdslen(argv[0]) == CLUSTER_NAMELEN) - ? clusterLookupNode(argv[0], CLUSTER_NAMELEN) - : NULL; + info->source_owner = (sdslen(argv[0]) == CLUSTER_NAMELEN) + ? clusterLookupNode(argv[0], CLUSTER_NAMELEN) + : NULL; /* argv[1] = source-epoch */ - info.source_epoch = strtoull(argv[1], NULL, 10); + info->source_epoch = strtoull(argv[1], NULL, 10); /* argv[2] = target-id-or-dash */ - info.target = (sdslen(argv[2]) == CLUSTER_NAMELEN) - ? clusterLookupNode(argv[2], CLUSTER_NAMELEN) - : NULL; + info->target = (sdslen(argv[2]) == CLUSTER_NAMELEN) + ? clusterLookupNode(argv[2], CLUSTER_NAMELEN) + : NULL; /* argv[3] = target-epoch */ - info.target_epoch = strtoull(argv[3], NULL, 10); + info->target_epoch = strtoull(argv[3], NULL, 10); /* argv[4..argc-1] = ranges */ - info.range_end = 4; + info->range_end = 4; - info.source_shard_id = info.source_owner ? info.source_owner->shard_id : NULL; - info.target_shard_id = info.target ? info.target->shard_id : NULL; - info.valid = (argc > 4); - return info; + info->source_shard_id = info->source_owner ? info->source_owner->shard_id : NULL; + info->target_shard_id = info->target ? info->target->shard_id : NULL; + return (argc > 4); } /* Pre-validate shard epoch on the leader before appending to the log. @@ -964,8 +962,8 @@ static int clusterRaftPreValidateEpoch(int type, sds data) { if (argc < 5) { ok = 0; } else { - slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc); - if (info.valid) { + slotChangeEpochInfo info; + if (parseSlotChangeEpochs(argv, argc, &info)) { if (info.source_shard_id) { ok = isShardEpochCurrent(info.source_shard_id, info.source_epoch); } @@ -995,8 +993,7 @@ static int clusterRaftPreValidateEpoch(int type, sds data) { sdsfreesplitres(argv, argc); if (!ok) { - RAFT_STATE()->stats_proposals_rejected_prevalidation++; - serverLog(LL_NOTICE, "Leader pre-validation: rejecting %s proposal (missing or stale epoch).", + serverLog(LL_DEBUG, "Leader pre-validation: rejecting %s proposal (missing or stale epoch).", raftEntryTypeName(type)); } return ok; @@ -1321,7 +1318,7 @@ static void raftLogApply(raftLogEntry *e) { if (count > 0) { sds entry = sdsnew("SLOT_CHANGE - 0 "); entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); - entry = sdscatfmt(entry, " %U", (unsigned long long)0); + entry = sdscat(entry, " 0"); entry = sdscatsds(entry, slot_range); clusterRaftPropose(entry, NULL, NULL); sdsfree(entry); @@ -1380,10 +1377,14 @@ static void raftLogApply(raftLogEntry *e) { } clusterNode *node = clusterLookupNode(argv[0], sdslen(argv[0])); if (node && node != myself) { - /* Epoch validation (no bump yet). */ uint64_t epoch = strtoull(argv[1], NULL, 10); if (!isShardEpochCurrent(node->shard_id, epoch)) { - entry_error = GENERIC_PROPOSAL_REJECTION_MSG; + entry_error = STALE_SHARD_EPOCH_REJECTION_MSG; + sdsfreesplitres(argv, argc); + break; + } + if (nodeIsPrimary(node) && (node->num_replicas > 0 || node->numslots > 0)) { + entry_error = "Can't forget a primary with replicas or assigned slots."; sdsfreesplitres(argv, argc); break; } @@ -1496,7 +1497,8 @@ static void raftLogApply(raftLogEntry *e) { } if (entry_error) { - rs->stats_proposals_rejected_apply++; + serverLog(LL_DEBUG, "Proposal rejected at apply: %s (type %s, index %llu).", + entry_error, raftEntryTypeName(e->type), (unsigned long long)e->index); } /* Check pending proposals for a match and remove it. */ @@ -2140,6 +2142,9 @@ static void clusterRaftRetryProposals(void) { } serverLog(LL_NOTICE, "Forwarded %lu deferred proposals to leader.", listLength(rs->pending_proposals)); + } else { + /* Leader link not available yet — retry on next cycle. */ + rs->todo_retry_proposals = 1; } } } @@ -2954,8 +2959,6 @@ static void clusterRaftResetStats(void) { rs->stats_pubsub_bytes_received = 0; rs->stats_module_bytes_sent = 0; rs->stats_module_bytes_received = 0; - rs->stats_proposals_rejected_prevalidation = 0; - rs->stats_proposals_rejected_apply = 0; } static sds clusterRaftAppendInfoFields(sds info) { @@ -3010,8 +3013,6 @@ static sds clusterRaftAppendInfoFields(sds info) { "cluster_stats_pubsub_bytes_received:%llu\r\n" "cluster_stats_module_bytes_sent:%llu\r\n" "cluster_stats_module_bytes_received:%llu\r\n" - "cluster_stats_proposals_rejected_prevalidation:%llu\r\n" - "cluster_stats_proposals_rejected_apply:%llu\r\n" "total_cluster_links_buffer_limit_exceeded:%llu\r\n", role_str, (unsigned long long)rs->current_term, (unsigned long long)rs->commit_index, (unsigned long long)rs->last_applied, @@ -3024,8 +3025,6 @@ static sds clusterRaftAppendInfoFields(sds info) { (unsigned long long)rs->stats_pubsub_bytes_received, (unsigned long long)rs->stats_module_bytes_sent, (unsigned long long)rs->stats_module_bytes_received, - (unsigned long long)rs->stats_proposals_rejected_prevalidation, - (unsigned long long)rs->stats_proposals_rejected_apply, (unsigned long long)server.cluster->stat_cluster_links_buffer_limit_exceeded); return info; } @@ -3127,7 +3126,8 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode } /* Apply a SLOT_CHANGE entry. Format: " " - * Ranges use the same format as nodes.conf: "0-5460" or "5461". */ + * Ranges use the same format as nodes.conf: "0-5460" or "5461". + * Returns NULL on success, or a error string describing the failure. */ static const char *clusterRaftApplySlotChange(sds data) { const char *error = NULL; int argc; @@ -3135,12 +3135,13 @@ static const char *clusterRaftApplySlotChange(sds data) { /* Need at least: source + source_epoch + target + target_epoch + one_range = 5 */ if (!argv || argc < 5) goto reject; - slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc); - if (!info.valid) goto reject; + slotChangeEpochInfo info; + if (!parseSlotChangeEpochs(argv, argc, &info)) goto reject; /* Epoch validation: validate both source and target epochs. */ if (!clusterValidateShardEpochPair(info.source_shard_id, info.source_epoch, info.target_shard_id, info.target_epoch)) { + error = STALE_SHARD_EPOCH_REJECTION_MSG; goto reject; } @@ -3185,7 +3186,8 @@ static const char *clusterRaftApplySlotChange(sds data) { } /* Apply a SET_REPLICA_OF entry. - * Format: " " */ + * Format: " " + * Returns NULL on success, or a error string describing the failure. */ static const char *clusterRaftApplySetReplica(sds data) { clusterRaftState *rs = RAFT_STATE(); const char *error = NULL; @@ -3196,8 +3198,6 @@ static const char *clusterRaftApplySetReplica(sds data) { clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0])); if (!replica) goto reject; - char *shard_id = argv[2]; - uint64_t epoch = strtoull(argv[3], NULL, 10); char *source_shard = argv[1]; uint64_t source_epoch = strtoull(argv[2], NULL, 10); char *target_shard = argv[4]; @@ -3206,6 +3206,7 @@ static const char *clusterRaftApplySetReplica(sds data) { /* Validate both shard epochs without bumping. */ if (!clusterValidateShardEpochPair(source_shard, source_epoch, target_shard, target_epoch)) { + error = STALE_SHARD_EPOCH_REJECTION_MSG; goto reject; } @@ -3266,7 +3267,8 @@ static const char *clusterRaftApplySetReplica(sds data) { /* Apply a FAILOVER entry. Format: " " * The replica takes over the primary's slots and becomes primary. - * The old primary becomes a replica of the new primary. */ + * The old primary becomes a replica of the new primary. + * Returns NULL on success, or a error string describing the failure. */ static const char *clusterRaftApplyFailover(sds data) { clusterRaftState *rs = RAFT_STATE(); const char *error = NULL; @@ -3288,6 +3290,7 @@ static const char *clusterRaftApplyFailover(sds data) { /* Epoch validation: validate against the shard-id from the entry (no bump yet). */ uint64_t shard_epoch = strtoull(argv[3], NULL, 10); if (!isShardEpochCurrent(argv[2], shard_epoch)) { + error = STALE_SHARD_EPOCH_REJECTION_MSG; goto reject; } diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl index 277a8e873b0..efbc2660cb5 100644 --- a/tests/unit/cluster/cli.tcl +++ b/tests/unit/cluster/cli.tcl @@ -565,7 +565,7 @@ start_multiple_servers 3 [list overrides $base_conf] { # Test valkey-cli --cluster del-node # For cluster-raft we need to handle leader transfer when FORGET targets the leader. set base_conf [list cluster-enabled yes cluster-node-timeout 1000] -start_multiple_servers 3 [list overrides $base_conf tags {cluster-raft:skip}] { +start_multiple_servers 3 [list overrides $base_conf] { # Create cluster with 1 primary and 2 replicas for del-node tests exec $::VALKEY_CLI_BIN --cluster-yes --cluster create \ diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl index 6b8919244b4..552f804363d 100644 --- a/tests/unit/cluster/cluster-raft-proto.tcl +++ b/tests/unit/cluster/cluster-raft-proto.tcl @@ -542,7 +542,7 @@ proc get_cluster_info_field {client field} { return "" } -start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft cluster-node-timeout 2000}} { +start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft cluster-node-timeout 2000 loglevel debug}} { # Shared setup: form cluster and assign slots. set r0 [srv 0 client] set r1 [srv -1 client] @@ -606,9 +606,9 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c lassign $case entry_type propose_msg state_check test "Raft shard epoch: stale $entry_type is rejected (pre-validation, no log append)" { - # Record commit index and rejection counter before injection. + # Record commit index before injection. set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] - set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] + set loglines [count_log_lines 0] # Connect and inject stale PROPOSE. set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr] @@ -618,9 +618,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c set reply [raft_recv $fd 2000] assert_match "REJECT *" $reply - # Verify rejection counter increased. - set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] - assert {$rejected_after > $rejected_before} + # Verify rejection logged. + wait_for_log_messages 0 {"*Leader pre-validation: rejecting*"} $loglines 1000 10 # Verify log index unchanged (rejected at pre-validation). set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] @@ -650,7 +649,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c test "Raft shard epoch: $label is rejected (pre-validation, no log append)" { set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] - set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] + set loglines [count_log_lines 0] set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr] raft_send $fd $propose_msg @@ -659,9 +658,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c set reply [raft_recv $fd 2000] assert_match "REJECT *" $reply - # Verify rejection counter increased. - set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] - assert {$rejected_after > $rejected_before} + # Verify rejection logged. + wait_for_log_messages 0 {"*Leader pre-validation: rejecting*"} $loglines 1000 10 # Verify log index unchanged (rejected at pre-validation). set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] @@ -688,7 +686,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c test "Raft shard epoch: $entry_type with wrong shard-id is rejected at pre-validation" { set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index] - set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] + set loglines [count_log_lines 0] set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr] raft_send $fd $propose_msg @@ -697,9 +695,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c set reply [raft_recv $fd 2000] assert_match "REJECT *" $reply - # Verify rejection counter increased. - set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] - assert {$rejected_after > $rejected_before} + # Verify rejection logged. + wait_for_log_messages 0 {"*Leader pre-validation: rejecting*"} $loglines 1000 10 # Verify log index unchanged (rejected at pre-validation, epoch not bumped). set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index] @@ -716,8 +713,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c # -------------------------------------------------------------------------- test "Raft shard epoch: duplicate FAILOVER from follower - second client gets rejection" { - set prevalidation_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] - set apply_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_apply] + set loglines [count_log_lines 0] # Two deferred clients on r1 (the follower/replica). set c1 [valkey_deferring_client -1] @@ -727,13 +723,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c $c1 CLUSTER FAILOVER FORCE $c2 CLUSTER FAILOVER FORCE - # Wait for one of the rejection counters to increase. - wait_for_condition 50 100 { - [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $prevalidation_before || - [get_cluster_info_field $r0 cluster_stats_proposals_rejected_apply] > $apply_before - } else { - fail "Second FAILOVER was not rejected" - } + # Wait for a rejection to be logged (either at pre-validation or apply). + wait_for_log_messages 0 {"*rejecting*proposal*" "*Proposal rejected at apply*"} $loglines 1000 10 # Read both replies. One should succeed (OK), the other should # get a rejection error. Neither should hang. @@ -751,7 +742,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c foreach reply [list $reply1 $reply2] { if {$reply eq "OK"} { incr ok_count - } elseif {[string match "*proposal rejected by raft leader*" $reply]} { + } elseif {[string match "*proposal rejected*" $reply]} { incr err_count } } From 18a1c8bbfbfe16543835389636fa49e414a00f91 Mon Sep 17 00:00:00 2001 From: Sushil Paneru Date: Wed, 17 Jun 2026 06:32:44 +0000 Subject: [PATCH 9/9] Added retry logic for proposal rejection Signed-off-by: Sushil Paneru --- design-docs/cluster-raft.md | 20 ++ src/cluster_raft.c | 231 ++++++++++++++++++++-- tests/unit/cluster/cli.tcl | 2 +- tests/unit/cluster/cluster-raft-proto.tcl | 47 ----- 4 files changed, 235 insertions(+), 65 deletions(-) diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md index 11a1961dafd..2da2bda9fe7 100644 --- a/design-docs/cluster-raft.md +++ b/design-docs/cluster-raft.md @@ -786,6 +786,26 @@ Epoch validation happens at two points: and the entry is applied. On mismatch, the entry is a no-op and the error is propagated to the caller's callback. +### Retry on stale epoch + +Proposals rejected due to a stale shard epoch are automatically retried +with a fresh epoch (up to 5 attempts): + +- **SET_REPLICA_OF / NODE_FORGET / FAILOVER (force) / SLOT_CHANGE** — + the proposal is rebuilt with current epoch(s) and re-submitted. + +- **Automatic failover** — if the FAILOVER proposal is rejected, the + failover is re-scheduled (via `todo_schedule_failover`) as long as + the primary is still failed. The next attempt uses the current epoch. + For automatic failover, no cap on retry attempt to avoid leaderless shard. + +Only `STALE_SHARD_EPOCH_REJECTION_MSG` triggers retry. Other errors +(format errors, invalid state) are forwarded to the client immediately. + +When the leader rejects a forwarded proposal at pre-validation, it sends +a `REJECT retry` message back. The `retry` suffix signals +the follower that the rejection is epoch-related and eligible for retry. + ### Entries that don't carry an epoch - NODE_FAIL / NODE_RECOVER diff --git a/src/cluster_raft.c b/src/cluster_raft.c index 209c4694837..9476ba3e334 100644 --- a/src/cluster_raft.c +++ b/src/cluster_raft.c @@ -216,6 +216,7 @@ typedef struct { #define RAFT_HDR_SIZE 8 #define REPL_OFFSETS_BROADCAST_PERIOD_MS 10000 #define RAFT_LOG_REWRITE_THRESHOLD 100 +#define PROPOSAL_MAX_RETRIES 5 /* Monotonic millisecond clock for timeouts and failure detection. * Unlike gettimeofday(), this is not affected by system clock adjustments. */ @@ -325,6 +326,8 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, static void clusterRaftDeferPendingProposals(void); static void clusterRaftCompletePendingProposal(int type, sds data, const char *error); static int clusterRaftPreValidateEpoch(int type, sds data); +static void clusterRaftAutoFailoverCallback(void *ctx, const char *error); +static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode *target, void *ctx, void (*callback)(void *ctx, const char *error)); static void clusterRaftUpdateMyself(int old_flags); static sds clusterRaftBuildMyNodeInfo(void); static void clusterRaftCheckSlotCoverage(void); @@ -993,14 +996,13 @@ static int clusterRaftPreValidateEpoch(int type, sds data) { sdsfreesplitres(argv, argc); if (!ok) { - serverLog(LL_DEBUG, "Leader pre-validation: rejecting %s proposal (missing or stale epoch).", + serverLog(LL_DEBUG, "Leader pre-validation: rejecting %s proposal (stale epoch).", raftEntryTypeName(type)); } return ok; } static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) { - UNUSED(link); clusterRaftState *rs = RAFT_STATE(); /* argv[0]="PROPOSE", argv[1..] is the entry (type + data). */ @@ -1020,11 +1022,13 @@ static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) { /* Pre-validate epoch on the leader to reject obviously stale proposals. */ if (!clusterRaftPreValidateEpoch(type, data)) { - /* Send REJECT back to the proposing follower so it can unblock the client. */ + /* Send REJECT back to the proposing follower so it can unblock the client. + * Append "retry" hint so the follower knows it can retry with a fresh epoch. */ if (link) { sds msg = wireNewMsg("REJECT"); msg = sdscatlen(msg, " ", 1); msg = sdscatlen(msg, entry, sdslen(entry)); + msg = sdscat(msg, " retry"); msg = wireFinishMsg(msg); clusterRaftSendMsg(link, msg); } @@ -1049,21 +1053,28 @@ static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) { } /* Handle a REJECT message from the leader. The leader sends this when it - * rejects a forwarded PROPOSE at pre-validation. We match it against our + * rejects a forwarded PROPOSE. We match it against our * pending_proposals and fire the callback with an error so the client * gets an immediate reply instead of hanging until timeout. - * Format: "REJECT " (echoes back the original entry). */ + * Format: "REJECT [retry]" (echoes back the original entry). + * If the last token is "retry", the rejection is retryable. */ static int clusterRaftProcessReject(clusterLink *link, int argc, sds *argv) { UNUSED(link); - /* argv[0]="REJECT", argv[1]=type, argv[2..]=data */ + /* argv[0]="REJECT", argv[1]=type, argv[2..]=data, optional last="retry" */ if (argc < 2) return 1; int type = raftEntryTypeByName(argv[1]); if (type < 0) return 1; - sds data = (argc >= 3) ? sdsjoinsds(argv + 2, argc - 2, " ", 1) : sdsempty(); - clusterRaftCompletePendingProposal(type, data, GENERIC_PROPOSAL_REJECTION_MSG); + /* Check if the last token is the "retry" hint. */ + int retryable = (argc >= 3 && sdslen(argv[argc - 1]) == 5 && + memcmp(argv[argc - 1], "retry", 5) == 0); + int data_argc = retryable ? argc - 3 : argc - 2; + + sds data = (data_argc > 0) ? sdsjoinsds(argv + 2, data_argc, " ", 1) : sdsempty(); + const char *error = retryable ? STALE_SHARD_EPOCH_REJECTION_MSG : GENERIC_PROPOSAL_REJECTION_MSG; + clusterRaftCompletePendingProposal(type, data, error); sdsfree(data); return 1; } @@ -1235,8 +1246,6 @@ static void raftLogApply(raftLogEntry *e) { sds *argv = sdssplitlen(e->data, sdslen(e->data), " ", 1, &argc); if (argv && argc >= 2 && sdslen(argv[0]) == CLUSTER_NAMELEN) { clusterNode *existing = clusterLookupNode(argv[0], CLUSTER_NAMELEN); - /* For new nodes (existing == NULL), epoch should be 0 — no validation needed. */ - if (!existing) { clusterNode *n = createClusterNode(argv[0], CLUSTER_NODE_PRIMARY); if (clusterNodeParseAddressString(n, argv[1]) == C_OK) { @@ -2307,7 +2316,7 @@ static void clusterRaftCron(void) { entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN); uint64_t epoch = clusterGetShardEpoch(primary->shard_id); entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); - clusterRaftPropose(entry, NULL, NULL); + clusterRaftPropose(entry, NULL, clusterRaftAutoFailoverCallback); sdsfree(entry); } } @@ -2762,7 +2771,10 @@ static void clusterRaftGetNodePingPongEpoch(clusterNode *node, long long *ping_s static void clusterRaftSetNodePingPongEpoch(clusterNode *node, int ping_active, int pong_active, uint64_t shard_epoch) { UNUSED(ping_active); UNUSED(pong_active); - clusterSetShardEpoch(node->shard_id, shard_epoch); + uint64_t current = clusterGetShardEpoch(node->shard_id); + if (shard_epoch > current) { + clusterSetShardEpoch(node->shard_id, shard_epoch); + } } static sds clusterRaftAppendVarsLine(sds config) { @@ -3060,15 +3072,28 @@ typedef struct { void *orig_ctx; void (*orig_callback)(void *orig_ctx, const char *error); clusterNode *target; + slotRange *ranges; + int numranges; + int retries; } slotChangeCallbackCtx; static void clusterRaftSlotChangeApplyCallback(void *ctx, const char *error) { slotChangeCallbackCtx *sc = (slotChangeCallbackCtx *)ctx; - if (clusterNodeGetPrimary(myself)->numslots == 0 && sc->target && !error) { + if (!error && clusterNodeGetPrimary(myself)->numslots == 0 && sc->target) { clusterHandleLostLastSlot(sc->target); } + if (error && strcmp(error, STALE_SHARD_EPOCH_REJECTION_MSG) == 0 && sc->retries > 0) { + sc->retries--; + /* Retry: re-invoke slot change with fresh epochs. */ + clusterRaftSlotChange(sc->ranges, sc->numranges, sc->target, + sc->orig_ctx, sc->orig_callback); + zfree(sc->ranges); + zfree(sc); + return; + } if (sc->orig_callback) sc->orig_callback(sc->orig_ctx, error); - zfree(ctx); + zfree(sc->ranges); + zfree(sc); } /* Invoked for slot assignments and slot migrations (ADDSLOTS, SETSLOT, etc.) */ @@ -3121,6 +3146,10 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode sc->orig_ctx = ctx; sc->orig_callback = callback; sc->target = target; + sc->ranges = zmalloc(sizeof(slotRange) * numranges); + memcpy(sc->ranges, ranges, sizeof(slotRange) * numranges); + sc->numranges = numranges; + sc->retries = PROPOSAL_MAX_RETRIES; clusterRaftPropose(entry, sc, &clusterRaftSlotChangeApplyCallback); sdsfree(entry); } @@ -3352,14 +3381,162 @@ static const char *clusterRaftApplyFailover(sds data) { return error; } +/* -------------------------------------------------------------------------- + * Proposal retry on stale shard epoch + * + * REPLICATE and FORGET operations can be safely retried when rejected due to + * a stale shard epoch (a concurrent operation bumped the epoch). The retry + * rebuilds the proposal with a fresh epoch. Max retries: 5. + * -------------------------------------------------------------------------- */ +typedef struct { + void *client_ctx; /* Original blocked client handle. */ + void (*client_callback)(void *, const char *); /* Original completion callback. */ + int retries; /* Remaining retry attempts. */ + /* FORGET-specific fields. */ + char node_id[CLUSTER_NAMELEN]; + /* SET_REPLICA_OF-specific fields. */ + char primary_name[CLUSTER_NAMELEN]; /* Primary node name (or "-" if promotion). */ + int has_primary; /* 1 if replicating, 0 if promoting. */ +} proposalRetryCtx; + +/* Callback for automatic failover proposals. On rejection (stale epoch or any + * reason), re-schedule the failover if the primary is still failed. The next + * attempt will rebuild the proposal with a fresh shard epoch. */ +static void clusterRaftAutoFailoverCallback(void *ctx, const char *error) { + UNUSED(ctx); + if (!error) return; /* Success — nothing to do. */ + + clusterNode *myself = getMyClusterNode(); + if (nodeIsReplica(myself) && myself->replicaof && + nodeFailed(myself->replicaof) && !nodeCantFailover(myself)) { + RAFT_STATE()->todo_schedule_failover = 1; + serverLog(LL_NOTICE, "Automatic failover proposal rejected (%s), re-scheduling.", error); + } +} + +static void clusterRaftForgetNodeRetryCallback(void *ctx, const char *error) { + proposalRetryCtx *rc = (proposalRetryCtx *)ctx; + if (error && strcmp(error, STALE_SHARD_EPOCH_REJECTION_MSG) == 0 && rc->retries > 0) { + rc->retries--; + /* Rebuild proposal with fresh epoch. */ + clusterNode *node = clusterLookupNode(rc->node_id, CLUSTER_NAMELEN); + if (!node) { + /* Node already gone — treat as success. */ + if (rc->client_callback) rc->client_callback(rc->client_ctx, NULL); + zfree(rc); + return; + } + sds entry = sdsnew("NODE_FORGET "); + entry = sdscatlen(entry, rc->node_id, CLUSTER_NAMELEN); + uint64_t epoch = clusterGetShardEpoch(node->shard_id); + entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); + clusterRaftPropose(entry, rc, clusterRaftForgetNodeRetryCallback); + sdsfree(entry); + return; + } + /* No retry — forward result to the original callback. */ + if (rc->client_callback) rc->client_callback(rc->client_ctx, error); + zfree(rc); +} + +static void clusterRaftSetReplicaOfRetryCallback(void *ctx, const char *error) { + proposalRetryCtx *rc = (proposalRetryCtx *)ctx; + if (error && strcmp(error, STALE_SHARD_EPOCH_REJECTION_MSG) == 0 && rc->retries > 0) { + rc->retries--; + /* Rebuild proposal with fresh epochs. */ + clusterNode *primary = rc->has_primary + ? clusterLookupNode(rc->primary_name, CLUSTER_NAMELEN) + : NULL; + if (rc->has_primary && !primary) { + if (rc->client_callback) rc->client_callback(rc->client_ctx, "target primary no longer exists"); + zfree(rc); + return; + } + uint64_t source_epoch = clusterGetShardEpoch(myself->shard_id); + char target_shard[CLUSTER_NAMELEN]; + uint64_t target_epoch; + if (primary) { + memcpy(target_shard, primary->shard_id, CLUSTER_NAMELEN); + target_epoch = clusterGetShardEpoch(primary->shard_id); + } else { + getRandomHexChars(target_shard, CLUSTER_NAMELEN); + target_epoch = 0; + } + sds entry = sdsnew("SET_REPLICA_OF "); + entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); + entry = sdscatlen(entry, " ", 1); + entry = sdscatlen(entry, myself->shard_id, CLUSTER_NAMELEN); + entry = sdscatfmt(entry, " %U ", (unsigned long long)source_epoch); + entry = sdscatlen(entry, primary ? primary->name : "-", primary ? CLUSTER_NAMELEN : 1); + entry = sdscatlen(entry, " ", 1); + entry = sdscatlen(entry, target_shard, CLUSTER_NAMELEN); + entry = sdscatfmt(entry, " %U", (unsigned long long)target_epoch); + clusterRaftPropose(entry, rc, clusterRaftSetReplicaOfRetryCallback); + sdsfree(entry); + return; + } + /* No retry — forward result to the original callback. */ + if (rc->client_callback) rc->client_callback(rc->client_ctx, error); + zfree(rc); +} + +static void clusterRaftFailoverRetryCallback(void *ctx, const char *error) { + proposalRetryCtx *rc = (proposalRetryCtx *)ctx; + if (error && strcmp(error, STALE_SHARD_EPOCH_REJECTION_MSG) == 0 && rc->retries > 0) { + rc->retries--; + /* Rebuild proposal with fresh epoch. */ + clusterNode *primary = myself->replicaof; + if (!primary) { + if (rc->client_callback) rc->client_callback(rc->client_ctx, "no primary to fail over"); + zfree(rc); + return; + } + sds entry = sdsnew("FAILOVER "); + entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); + entry = sdscatlen(entry, " ", 1); + entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN); + entry = sdscatlen(entry, " ", 1); + entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN); + uint64_t epoch = clusterGetShardEpoch(primary->shard_id); + entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); + clusterRaftPropose(entry, rc, clusterRaftFailoverRetryCallback); + sdsfree(entry); + return; + } + /* No retry — forward result to the original callback. */ + if (rc->client_callback) rc->client_callback(rc->client_ctx, error); + zfree(rc); +} + static void clusterRaftForgetNode(const char *node_id, size_t id_len, void *ctx, void (*callback)(void *ctx, const char *error)) { + /* Reject forgetting the raft leader — it would crash the cluster. + * The admin should transfer leadership first. Only block if the leader + * node is still a known, non-failed member. */ + clusterRaftState *rs = RAFT_STATE(); + if (id_len == CLUSTER_NAMELEN && memcmp(node_id, rs->leader, CLUSTER_NAMELEN) == 0) { + clusterNode *leader_node = clusterLookupNode(node_id, id_len); + if (leader_node && !nodeFailed(leader_node)) { + if (callback) callback(ctx, "Can't forget the raft leader. Transfer leadership first."); + return; + } + } + + /* Wrap with retry context for stale epoch recovery. */ + proposalRetryCtx *rc = zmalloc(sizeof(*rc)); + rc->client_ctx = ctx; + rc->client_callback = callback; + rc->retries = PROPOSAL_MAX_RETRIES; + memset(rc->node_id, 0, CLUSTER_NAMELEN); + memcpy(rc->node_id, node_id, id_len < CLUSTER_NAMELEN ? id_len : CLUSTER_NAMELEN); + rc->has_primary = 0; + sds entry = sdsnew("NODE_FORGET "); entry = sdscatlen(entry, node_id, id_len); /* Append departing node's shard epoch. */ clusterNode *node = clusterLookupNode(node_id, id_len); uint64_t epoch = node ? clusterGetShardEpoch(node->shard_id) : 0; entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); - clusterRaftPropose(entry, ctx, callback); + clusterRaftPropose(entry, rc, clusterRaftForgetNodeRetryCallback); sdsfree(entry); } @@ -3368,6 +3545,18 @@ static void clusterRaftSetReplicaOf(clusterNode *primary, void *ctx, void (*call * " " * Source is myself's current shard. Target is the primary's shard (for assignment) * or a new random shard (for promotion to primary). */ + + /* Wrap with retry context for stale epoch recovery. */ + proposalRetryCtx *rc = zmalloc(sizeof(*rc)); + rc->client_ctx = ctx; + rc->client_callback = callback; + rc->retries = PROPOSAL_MAX_RETRIES; + memset(rc->node_id, 0, CLUSTER_NAMELEN); + rc->has_primary = (primary != NULL); + if (primary) { + memcpy(rc->primary_name, primary->name, CLUSTER_NAMELEN); + } + uint64_t source_epoch = clusterGetShardEpoch(myself->shard_id); char target_shard[CLUSTER_NAMELEN]; @@ -3387,7 +3576,7 @@ static void clusterRaftSetReplicaOf(clusterNode *primary, void *ctx, void (*call entry = sdscatlen(entry, " ", 1); entry = sdscatlen(entry, target_shard, CLUSTER_NAMELEN); entry = sdscatfmt(entry, " %U", (unsigned long long)target_epoch); - clusterRaftPropose(entry, ctx, callback); + clusterRaftPropose(entry, rc, clusterRaftSetReplicaOfRetryCallback); sdsfree(entry); } @@ -3401,6 +3590,14 @@ static void clusterRaftFailover(int force, int takeover, void *ctx, void (*callb if (force) { /* FORCE/TAKEOVER: propose immediately without coordination. */ + /* Wrap with retry context for stale epoch recovery. */ + proposalRetryCtx *rc = zmalloc(sizeof(*rc)); + rc->client_ctx = ctx; + rc->client_callback = callback; + rc->retries = PROPOSAL_MAX_RETRIES; + memset(rc->node_id, 0, CLUSTER_NAMELEN); + rc->has_primary = 0; + sds entry = sdsnew("FAILOVER "); entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN); entry = sdscatlen(entry, " ", 1); @@ -3409,7 +3606,7 @@ static void clusterRaftFailover(int force, int takeover, void *ctx, void (*callb entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN); uint64_t epoch = clusterGetShardEpoch(primary->shard_id); entry = sdscatfmt(entry, " %U", (unsigned long long)epoch); - clusterRaftPropose(entry, ctx, callback); + clusterRaftPropose(entry, rc, clusterRaftFailoverRetryCallback); sdsfree(entry); } else { /* Coordinated failover: ask primary to pause writes, then wait diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl index efbc2660cb5..277a8e873b0 100644 --- a/tests/unit/cluster/cli.tcl +++ b/tests/unit/cluster/cli.tcl @@ -565,7 +565,7 @@ start_multiple_servers 3 [list overrides $base_conf] { # Test valkey-cli --cluster del-node # For cluster-raft we need to handle leader transfer when FORGET targets the leader. set base_conf [list cluster-enabled yes cluster-node-timeout 1000] -start_multiple_servers 3 [list overrides $base_conf] { +start_multiple_servers 3 [list overrides $base_conf tags {cluster-raft:skip}] { # Create cluster with 1 primary and 2 replicas for del-node tests exec $::VALKEY_CLI_BIN --cluster-yes --cluster create \ diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl index 552f804363d..f48cd2aed6a 100644 --- a/tests/unit/cluster/cluster-raft-proto.tcl +++ b/tests/unit/cluster/cluster-raft-proto.tcl @@ -705,53 +705,6 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c close $fd } } - - # -------------------------------------------------------------------------- - # Duplicate FAILOVER: two clients on the follower both send FAILOVER FORCE - # simultaneously. The first succeeds, the second is rejected at apply time - # (epoch bumped by the first). Neither client should hang. - # -------------------------------------------------------------------------- - - test "Raft shard epoch: duplicate FAILOVER from follower - second client gets rejection" { - set loglines [count_log_lines 0] - - # Two deferred clients on r1 (the follower/replica). - set c1 [valkey_deferring_client -1] - set c2 [valkey_deferring_client -1] - - # Both issue CLUSTER FAILOVER FORCE concurrently. - $c1 CLUSTER FAILOVER FORCE - $c2 CLUSTER FAILOVER FORCE - - # Wait for a rejection to be logged (either at pre-validation or apply). - wait_for_log_messages 0 {"*rejecting*proposal*" "*Proposal rejected at apply*"} $loglines 1000 10 - - # Read both replies. One should succeed (OK), the other should - # get a rejection error. Neither should hang. - # Use catch because error replies throw in the deferring client. - if {[catch {set reply1 [$c1 read]} err1]} { - set reply1 $err1 - } - if {[catch {set reply2 [$c2 read]} err2]} { - set reply2 $err2 - } - - # Exactly one should be OK and the other should be rejected. - set ok_count 0 - set err_count 0 - foreach reply [list $reply1 $reply2] { - if {$reply eq "OK"} { - incr ok_count - } elseif {[string match "*proposal rejected*" $reply]} { - incr err_count - } - } - assert_equal 1 $ok_count - assert_equal 1 $err_count - - $c1 close - $c2 close - } } } ;# tags