From 4fcef86ebfd6fea2680568e0a774b76d8123ba1b Mon Sep 17 00:00:00 2001
From: sushil paneru <spaneru@amazon.com>
Date: Mon, 1 Jun 2026 19:37:34 +0000
Subject: [PATCH 1/9] Cluster bus v2 - Added shard level epoch

Signed-off-by: Sushil Paneru <sushil.paneru1@gmail.com>
---
 design-docs/cluster-raft.md               |  73 +++--
 src/cluster.c                             |   1 +
 src/cluster_legacy.c                      |   1 +
 src/cluster_raft.c                        | 367 +++++++++++++++++++---
 src/cluster_state.c                       |  51 +++
 src/cluster_state.h                       |   5 +
 tests/unit/cluster/cluster-raft-proto.tcl | 123 ++++++++
 7 files changed, 558 insertions(+), 63 deletions(-)
diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md
index 3794f12cdf1..d389142ac84 100644
--- a/design-docs/cluster-raft.md
+++ b/design-docs/cluster-raft.md
@@ -723,21 +723,29 @@ reuse), but the vars and log lines are raft-specific. The file is not
 compatible between protocols — switching from gossip to raft (or vice
 versa) requires removing nodes.conf.
 
-## Shard Epoch (not yet implemented)
-
-A shard-epoch is a per-shard monotonically increasing counter, bumped
-on topology changes within the shard (FAILOVER, SET_REPLICA_OF,
-SLOT_CHANGE). Entries that modify shard topology include the current
-shard-epoch at proposal time. On apply, if the shard-epoch has
-advanced, the entry is stale and becomes a no-op.
-
-This prevents stale entries from causing inconsistencies when
-concurrent operations race in the log. Example:
+## Shard Epoch
+
+Raft ensures entries are applied in a total order, but ordering alone
+is not sufficient to prevent stale mutations from corrupting cluster
+state. When concurrent operations target the same shard (e.g., a slot
+migration racing with a failover), a committed entry may carry
+assumptions about shard topology that are no longer true by the time
+it is applied. Without additional application-level state to fence
+against these stale updates, the apply logic can produce
+inconsistencies — such as moving a slot to a node that no longer owns
+the corresponding keys.
+
+A shard-epoch is a per-shard monotonically increasing counter stored
+in `server.cluster->shard_epochs`. It is bumped each time a topology
+change is applied to the shard. Entries that modify shard topology
+include the shard's current epoch at proposal time. On apply, if the
+epoch has advanced past the value in the entry, the entry is stale
+and becomes a no-op.
+
+### Example: slot migration racing with failover
 
 ```
-Slot migration racing with failover:
-
-1. Atomic slot migration starts: keys transferred from shard A to B.
+1. Slot migration starts: keys transferred from shard A to shard B.
 2. Primary of shard A fails. FAILOVER entry is proposed.
 3. Migration is rolled back (keys stay on shard A's new primary).
 4. SLOT_CHANGE entry (assigning slot to shard B) was proposed before
@@ -748,14 +756,39 @@ Slot migration racing with failover:
    carries the old epoch, so it's a no-op. Slot stays on shard A.
 ```
 
-Entries that should carry a shard-epoch:
-- FAILOVER (bumps epoch of the shard)
-- SET_REPLICA_OF (bumps epoch when changing shard membership)
-- SLOT_CHANGE (checked against source and target shard epochs)
+### Entry formats with epoch
+
+```
+FAILOVER <replica-id> <primary-id> <epoch>
+SET_REPLICA_OF <replica-id> <primary-id-or-dash> <shard-id> <epoch>
+SLOT_CHANGE <target-id-or-dash> <ranges...> <source-epoch> <target-epoch>
+NODE_FORGET <node-id> <epoch>
+```
+
+SLOT_CHANGE carries two epochs because it involves two shards (source
+and target). NODE_FORGET carries the epoch of the departing node's
+shard to guard against removing a node whose role changed (e.g.,
+promoted to primary via a concurrent FAILOVER).
+
+### Validation
+
+Epoch validation happens at two points:
+
+1. **Pre-validation on the leader** — before appending to the log.
+   This is a best-effort optimization that rejects obviously stale
+   proposals early, saving log space and replication bandwidth. It
+   performs a read-only check without bumping the epoch.
+
+2. **Apply-time validation** — the authoritative check. Each apply
+   function validates the entry's epoch against the current shard
+   epoch. On match (or epoch 0 for a new shard), the epoch is bumped
+   and the entry is applied. On mismatch, the entry is a no-op and
+   the error is propagated to the caller's callback.
+
+### Entries that don't carry an epoch
 
-Entries that don't need a shard-epoch:
-- NODE_FAIL / NODE_RECOVER (liveness, not topology)
-- NODE_INFO / NODE_JOIN / NODE_FORGET (node-level, not shard-level)
+- NODE_FAIL / NODE_RECOVER 
+- NODE_INFO, NODE_JOIN
 
 ## Leader Transfer
 
diff --git a/src/cluster.c b/src/cluster.c
index 3a7907fa738..ffc614b70ac 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -70,6 +70,7 @@ void clusterInit(void) {
     server.cluster->size = 0;
     server.cluster->nodes = dictCreate(&clusterNodesDictType);
     server.cluster->shards = dictCreate(&clusterSdsToListType);
+    server.cluster->shard_epochs = dictCreate(&clusterShardEpochDictType);
     server.cluster->migrating_slots_to = dictCreate(&clusterSlotDictType);
     server.cluster->importing_slots_from = dictCreate(&clusterSlotDictType);
     server.cluster->stat_cluster_links_buffer_limit_exceeded = 0;
diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index bb62df1d35f..65b46670389 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -735,6 +735,7 @@ static void clusterLegacyReset(int hard) {
 
     /* Recreate shards dict */
     dictEmpty(server.cluster->shards, NULL);
+    dictEmpty(server.cluster->shard_epochs, NULL);
 
     /* Forget all the nodes, but myself. */
     di = dictGetSafeIterator(server.cluster->nodes);
diff --git a/src/cluster_raft.c b/src/cluster_raft.c
index 37bbf28d401..30831a540c5 100644
--- a/src/cluster_raft.c
+++ b/src/cluster_raft.c
@@ -308,6 +308,7 @@ static clusterMsgSendBlock *clusterRaftBuildAllOffsetsMsg(void) {
 
 static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, const char *error));
 static void clusterRaftDeferPendingProposals(void);
+static int clusterRaftPreValidateEpoch(int type, sds data);
 static void clusterRaftUpdateMyself(int old_flags);
 static sds clusterRaftBuildMyNodeInfo(void);
 static void clusterRaftCheckSlotCoverage(void);
@@ -315,9 +316,9 @@ static void clusterRaftBroadcastAppendEntries(void);
 static void clusterRaftSendAppendEntries(clusterLink *link, clusterNode *node);
 static void clusterRaftSendPreVoteRequest(clusterLink *link, uint64_t term);
 static void clusterRaftUnblockMeet(clusterNode *node);
-static void clusterRaftApplySlotChange(sds data);
-static void clusterRaftApplySetReplica(sds data);
-static void clusterRaftApplyFailover(sds data);
+static const char *clusterRaftApplySlotChange(sds data);
+static const char *clusterRaftApplySetReplica(sds data);
+static const char *clusterRaftApplyFailover(sds data);
 static void raftLogApply(raftLogEntry *e);
 static raftLogEntry *raftLogCreate(uint64_t term, uint64_t index, uint8_t type, sds data);
 static void raftLogAppend(raftLogEntry *e);
@@ -766,6 +767,15 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx,
 
     sds data = sp ? sdsnew(sp + 1) : sdsempty();
 
+    if (rs->role == RAFT_ROLE_LEADER) {
+        /* Pre-validate epoch on the leader to reject obviously stale proposals. */
+        if (!clusterRaftPreValidateEpoch(type, data)) {
+            if (callback) callback(ctx, "stale shard epoch");
+            sdsfree(data);
+            return;
+        }
+    }
+
     /* Track pending proposal for retry on leader change. */
     {
         raftPendingProposal *pp = zmalloc(sizeof(*pp));
@@ -786,7 +796,6 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx,
               memcmp(data, myself->name, CLUSTER_NAMELEN) == 0)) {
             clusterRaftSelfJoin();
         }
-
         uint64_t idx = raftLogLastIndex() + 1;
         raftLogAppend(raftLogCreate(rs->current_term, idx, type, data));
         serverLog(LL_NOTICE, "Leader appended %s (index %llu).",
@@ -815,6 +824,134 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx,
     }
 }
 
+static int isShardEpochCurrent(const char *shard_id, uint64_t entry_epoch) {
+    uint64_t current = clusterGetShardEpoch(shard_id);
+    return (current == 0 || entry_epoch == current);
+}
+
+/* Parsed fields from a SLOT_CHANGE entry's epoch-related data.
+ * Format: "<target-id-or-dash> <ranges...> <source-epoch> <target-epoch>" */
+typedef struct {
+    clusterNode *target;         /* Target node (NULL if dash). */
+    clusterNode *source_owner;   /* Current owner of the first slot in the range. */
+    const char *source_shard_id; /* source_owner->shard_id or NULL. */
+    const char *target_shard_id; /* target->shard_id or NULL. */
+    uint64_t source_epoch;
+    uint64_t target_epoch;
+    int range_end; /* Index in argv where ranges end (argc - 2). */
+    int valid;     /* 1 if parsing succeeded, 0 otherwise. */
+} slotChangeEpochInfo;
+
+/* Parse epoch-related fields from a SLOT_CHANGE entry's split argv.
+ * Caller must have split data and verified argc >= 4. */
+static slotChangeEpochInfo parseSlotChangeEpochs(sds *argv, int argc) {
+    slotChangeEpochInfo info = {0};
+    if (argc < 4) return info;
+
+    info.target = (sdslen(argv[0]) == CLUSTER_NAMELEN)
+                      ? clusterLookupNode(argv[0], CLUSTER_NAMELEN)
+                      : NULL;
+    info.source_epoch = strtoull(argv[argc - 2], NULL, 10);
+    info.target_epoch = strtoull(argv[argc - 1], NULL, 10);
+    info.range_end = argc - 2;
+
+    /* Derive source shard from current owner of the first slot in the range. */
+    int first_start;
+    char *dash = strchr(argv[1], '-');
+    if (dash) {
+        *dash = '\0';
+        first_start = atoi(argv[1]);
+        *dash = '-';
+    } else {
+        first_start = atoi(argv[1]);
+    }
+    info.source_owner = (first_start >= 0 && first_start < CLUSTER_SLOTS)
+                            ? server.cluster->slots[first_start]
+                            : NULL;
+    info.source_shard_id = info.source_owner ? info.source_owner->shard_id : NULL;
+    info.target_shard_id = info.target ? info.target->shard_id : NULL;
+    info.valid = 1;
+    return info;
+}
+
+/* Pre-validate shard epoch on the leader before appending to the log.
+ * This is an optimization: reject obviously stale proposals early to avoid
+ * wasting log space and replication bandwidth.
+ * Returns 1 if the entry should be appended, 0 if stale. */
+/* Returns 0 if the entry_epoch is stale relative to the shard's current epoch.
+ * Returns 1 if valid (epoch matches or shard is at epoch 0). */
+static int clusterRaftPreValidateEpoch(int type, sds data) {
+    int argc;
+    sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc);
+    if (!argv) return 1;
+
+    int ok = 1;
+    switch (type) {
+    case RAFT_ENTRY_FAILOVER: {
+        /* Format: <replica-id> <primary-id> <epoch> */
+        if (argc < 3) {
+            ok = 0;
+        } else {
+            clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1]));
+            if (primary) {
+                uint64_t epoch = strtoull(argv[2], NULL, 10);
+                ok = isShardEpochCurrent(primary->shard_id, epoch);
+            }
+        }
+        break;
+    }
+    case RAFT_ENTRY_SET_REPLICA_OF: {
+        /* Format: <replica-id> <primary-id-or-dash> <shard-id> <epoch> */
+        if (argc < 4) {
+            ok = 0;
+        } else {
+            uint64_t epoch = strtoull(argv[3], NULL, 10);
+            ok = isShardEpochCurrent(argv[2], epoch);
+        }
+        break;
+    }
+    case RAFT_ENTRY_SLOT_CHANGE: {
+        /* Format: <target-id-or-dash> <ranges...> <source-epoch> <target-epoch> */
+        if (argc < 4) {
+            ok = 0;
+        } else {
+            slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc);
+            if (info.valid) {
+                if (info.source_shard_id) {
+                    ok = isShardEpochCurrent(info.source_shard_id, info.source_epoch);
+                }
+                if (ok && info.target_shard_id) {
+                    ok = isShardEpochCurrent(info.target_shard_id, info.target_epoch);
+                }
+            }
+        }
+        break;
+    }
+    case RAFT_ENTRY_NODE_FORGET: {
+        /* Format: <node-id> <epoch> */
+        if (argc < 2) {
+            ok = 0;
+        } else {
+            clusterNode *node = clusterLookupNode(argv[0], sdslen(argv[0]));
+            if (node) {
+                uint64_t epoch = strtoull(argv[1], NULL, 10);
+                ok = isShardEpochCurrent(node->shard_id, epoch);
+            }
+        }
+        break;
+    }
+    default:
+        break;
+    }
+
+    sdsfreesplitres(argv, argc);
+    if (!ok) {
+        serverLog(LL_NOTICE, "Leader pre-validation: rejecting %s proposal (missing or stale epoch).",
+                  raftEntryTypeName(type));
+    }
+    return ok;
+}
+
 static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) {
     UNUSED(link);
     clusterRaftState *rs = RAFT_STATE();
@@ -834,6 +971,13 @@ static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) {
     /* Data is everything after the type name. */
     sds data = (argc >= 3) ? sdsjoinsds(argv + 2, argc - 2, " ", 1) : sdsempty();
 
+    /* Pre-validate epoch on the leader to reject obviously stale proposals. */
+    if (!clusterRaftPreValidateEpoch(type, data)) {
+        sdsfree(data);
+        sdsfree(entry);
+        return 1;
+    }
+
     uint64_t idx = raftLogLastIndex() + 1;
     raftLogAppend(raftLogCreate(rs->current_term, idx, type, data));
     serverLog(LL_NOTICE, "Leader appended proposed %s (index %llu).",
@@ -957,9 +1101,63 @@ static uint64_t raftLogTermAt(uint64_t index) {
     return e ? e->term : 0;
 }
 
+/* --------------------------------------------------------------------------
+ * Shard epoch validation helpers
+ * -------------------------------------------------------------------------- */
+
+/* Validate and bump epoch for a single-shard entry.
+ * Returns 1 if the entry should be applied, 0 if stale.
+ * On success, increments the shard epoch (or sets it to 1 if currently 0).
+ * If shard_id is NULL, always returns 1 (no shard to validate). */
+static int clusterCheckAndBumpShardEpoch(const char *shard_id, uint64_t entry_epoch) {
+    if (!shard_id) return 1;
+    uint64_t current = clusterGetShardEpoch(shard_id);
+    if (current == 0) {
+        /* First operation on this shard — apply unconditionally, bump to 1. */
+        clusterSetShardEpoch(shard_id, 1);
+        return 1;
+    }
+    if (entry_epoch == current) {
+        clusterSetShardEpoch(shard_id, current + 1);
+        return 1;
+    }
+    return 0;
+}
+
+/* Validate and bump epochs for SLOT_CHANGE (two shards).
+ * Returns 1 if the entry should be applied, 0 if either epoch is stale.
+ * On success, increments both source and target epochs.
+ * source_shard_id may be NULL (unassigned slots). */
+static int clusterCheckAndBumpSlotChangeEpochs(const char *source_shard_id,
+                                               uint64_t source_epoch,
+                                               const char *target_shard_id,
+                                               uint64_t target_epoch) {
+    uint64_t src_current = source_shard_id ? clusterGetShardEpoch(source_shard_id) : 0;
+    uint64_t tgt_current = target_shard_id ? clusterGetShardEpoch(target_shard_id) : 0;
+
+    /* Check source. */
+    if (source_shard_id && src_current > 0 && source_epoch != src_current) {
+        return 0;
+    }
+    /* Check target. */
+    if (target_shard_id && tgt_current > 0 && target_epoch != tgt_current) {
+        return 0;
+    }
+
+    /* Both passed — bump both. */
+    if (source_shard_id) {
+        clusterSetShardEpoch(source_shard_id, src_current == 0 ? 1 : src_current + 1);
+    }
+    if (target_shard_id) {
+        clusterSetShardEpoch(target_shard_id, tgt_current == 0 ? 1 : tgt_current + 1);
+    }
+    return 1;
+}
+
 /* Apply a committed log entry. */
 static void raftLogApply(raftLogEntry *e) {
     clusterRaftState *rs = RAFT_STATE();
+    const char *entry_error = NULL;
     switch (e->type) {
     case RAFT_ENTRY_NODE_JOIN: {
         /* data: "<node-id> <address>" */
@@ -967,6 +1165,8 @@ static void raftLogApply(raftLogEntry *e) {
         sds *argv = sdssplitlen(e->data, sdslen(e->data), " ", 1, &argc);
         if (argv && argc >= 2 && sdslen(argv[0]) == CLUSTER_NAMELEN) {
             clusterNode *existing = clusterLookupNode(argv[0], CLUSTER_NAMELEN);
+            /* For new nodes (existing == NULL), epoch should be 0 — no validation needed. */
+
             if (!existing) {
                 clusterNode *n = createClusterNode(argv[0], CLUSTER_NODE_PRIMARY);
                 if (clusterNodeParseAddressString(n, argv[1]) == C_OK) {
@@ -1065,26 +1265,53 @@ static void raftLogApply(raftLogEntry *e) {
         if (argv) sdsfreesplitres(argv, argc);
         break;
     }
-    case RAFT_ENTRY_SLOT_CHANGE:
-        clusterRaftApplySlotChange(e->data);
-        rs->todo_update_slot_coverage = 1;
-        rs->todo_invalidate_slots_cache = 1;
-        serverLog(LL_NOTICE, "Applied SLOT_CHANGE (index %llu).", (unsigned long long)e->index);
+    case RAFT_ENTRY_SLOT_CHANGE: {
+        entry_error = clusterRaftApplySlotChange(e->data);
+        if (!entry_error) {
+            rs->todo_update_slot_coverage = 1;
+            rs->todo_invalidate_slots_cache = 1;
+        }
+        serverLog(LL_NOTICE, "Applied SLOT_CHANGE (index %llu)%s.", (unsigned long long)e->index,
+                  entry_error ? " [stale]" : "");
         break;
-    case RAFT_ENTRY_SET_REPLICA_OF:
-        clusterRaftApplySetReplica(e->data);
-        rs->todo_invalidate_slots_cache = 1;
-        serverLog(LL_NOTICE, "Applied SET_REPLICA_OF (index %llu).", (unsigned long long)e->index);
+    }
+    case RAFT_ENTRY_SET_REPLICA_OF: {
+        entry_error = clusterRaftApplySetReplica(e->data);
+        if (!entry_error) {
+            rs->todo_invalidate_slots_cache = 1;
+        }
+        serverLog(LL_NOTICE, "Applied SET_REPLICA_OF (index %llu)%s.", (unsigned long long)e->index,
+                  entry_error ? " [stale]" : "");
         break;
-    case RAFT_ENTRY_FAILOVER:
-        clusterRaftApplyFailover(e->data);
-        rs->todo_update_slot_coverage = 1;
-        rs->todo_invalidate_slots_cache = 1;
-        serverLog(LL_NOTICE, "Applied FAILOVER (index %llu).", (unsigned long long)e->index);
+    }
+    case RAFT_ENTRY_FAILOVER: {
+        entry_error = clusterRaftApplyFailover(e->data);
+        if (!entry_error) {
+            rs->todo_update_slot_coverage = 1;
+            rs->todo_invalidate_slots_cache = 1;
+        }
+        serverLog(LL_NOTICE, "Applied FAILOVER (index %llu)%s.", (unsigned long long)e->index,
+                  entry_error ? " [stale]" : "");
         break;
+    }
     case RAFT_ENTRY_NODE_FORGET: {
-        clusterNode *node = clusterLookupNode(e->data, sdslen(e->data));
+        int argc;
+        sds *argv = sdssplitlen(e->data, sdslen(e->data), " ", 1, &argc);
+        if (!argv || argc < 2) {
+            if (argv) sdsfreesplitres(argv, argc);
+            break;
+        }
+        clusterNode *node = clusterLookupNode(argv[0], sdslen(argv[0]));
         if (node && node != myself) {
+            /* Epoch validation. */
+            uint64_t epoch = strtoull(argv[1], NULL, 10);
+            if (!clusterCheckAndBumpShardEpoch(node->shard_id, epoch)) {
+                entry_error = "stale shard epoch";
+                serverLog(LL_NOTICE, "Applied NODE_FORGET %.40s (index %llu) [stale].",
+                          argv[0], (unsigned long long)e->index);
+                sdsfreesplitres(argv, argc);
+                break;
+            }
             /* Detach link before deleting so clusterReadHandler can detect
              * that the node it was talking to is gone. */
             if (node->link) {
@@ -1096,7 +1323,8 @@ static void raftLogApply(raftLogEntry *e) {
             rs->todo_invalidate_slots_cache = 1;
         }
         serverLog(LL_NOTICE, "Applied NODE_FORGET %.40s (index %llu).",
-                  e->data, (unsigned long long)e->index);
+                  argv[0], (unsigned long long)e->index);
+        sdsfreesplitres(argv, argc);
         break;
     }
     case RAFT_ENTRY_NODE_FAIL: {
@@ -1143,8 +1371,9 @@ static void raftLogApply(raftLogEntry *e) {
          * managed by NODE_JOIN and SET_REPLICA_OF entries. */
         int argc;
         sds *argv = sdssplitlen(e->data, sdslen(e->data), " ", 1, &argc);
-        if (argv && argc >= 2 && sdslen(argv[0]) == CLUSTER_NAMELEN) {
+        if (argv && argc >= 3 && sdslen(argv[0]) == CLUSTER_NAMELEN) {
             clusterNode *node = clusterLookupNode(argv[0], CLUSTER_NAMELEN);
+
             if (node && node != myself) {
                 /* Reset optional fields so absent aux fields get cleared. */
                 node->announce_client_tcp_port = 0;
@@ -1155,8 +1384,7 @@ static void raftLogApply(raftLogEntry *e) {
                 sdsclear(node->announce_client_ipv6);
                 sdsclear(node->availability_zone);
                 clusterNodeParseAddressString(node, argv[1]);
-                /* Apply self-set flags. TODO: split on comma and compare
-                 * each part individually when more flags are added. */
+                /* Apply self-set flags. */
                 if (argc >= 3) {
                     node->flags &= ~CLUSTER_NODE_NOFAILOVER;
                     if (strstr(argv[2], "nofailover")) {
@@ -1191,7 +1419,7 @@ static void raftLogApply(raftLogEntry *e) {
         while ((ln = listNext(&li)) != NULL) {
             raftPendingProposal *pp = listNodeValue(ln);
             if (pp->type == e->type && !sdscmp(pp->data, e->data)) {
-                if (pp->callback) pp->callback(pp->ctx, NULL);
+                if (pp->callback) pp->callback(pp->ctx, entry_error);
                 sdsfree(pp->data);
                 zfree(pp);
                 listDelNode(rs->pending_proposals, ln);
@@ -1972,6 +2200,9 @@ static void clusterRaftCron(void) {
             entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
             entry = sdscatlen(entry, " ", 1);
             entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN);
+            /* Append primary's shard epoch. */
+            uint64_t epoch = clusterGetShardEpoch(primary->shard_id);
+            entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
             clusterRaftPropose(entry, rs->mf_ctx, rs->mf_callback);
             sdsfree(entry);
             rs->mf_end = 0;
@@ -1991,6 +2222,9 @@ static void clusterRaftCron(void) {
             entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
             entry = sdscatlen(entry, " ", 1);
             entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN);
+            /* Append primary's shard epoch. */
+            uint64_t epoch = clusterGetShardEpoch(primary->shard_id);
+            entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
             clusterRaftPropose(entry, NULL, NULL);
             sdsfree(entry);
         }
@@ -2760,7 +2994,7 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode
         return;
     }
 
-    /* Build entry: "SLOT_CHANGE <node-id-or-dash> <ranges...>" */
+    /* Build entry: "SLOT_CHANGE <node-id-or-dash> <ranges...> <source-epoch> <target-epoch>" */
     sds entry = sdsnew("SLOT_CHANGE ");
     entry = sdscatlen(entry, target ? target->name : "-", target ? CLUSTER_NAMELEN : 1);
     for (int i = 0; i < numranges; i++) {
@@ -2770,6 +3004,12 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode
             entry = sdscatfmt(entry, " %i-%i", ranges[i].start_slot, ranges[i].end_slot);
     }
 
+    /* Append source-epoch and target-epoch as last two fields. */
+    clusterNode *source_owner = server.cluster->slots[ranges[0].start_slot];
+    uint64_t source_epoch = (source_owner) ? clusterGetShardEpoch(source_owner->shard_id) : 0;
+    uint64_t target_epoch = target ? clusterGetShardEpoch(target->shard_id) : 0;
+    entry = sdscatfmt(entry, " %U %U", (unsigned long long)source_epoch, (unsigned long long)target_epoch);
+
     /* Propose through Raft leader. The callback is invoked when the entry is
      * committed and applied. We use two chained callbacks, to handle some
      * side-effects like replica migration, before invoking the original
@@ -2782,18 +3022,28 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode
     sdsfree(entry);
 }
 
-/* Apply a SLOT_CHANGE entry. Format: "<node-id-or-dash> <range> [<range> ...]"
- * Ranges use the same format as nodes.conf: "0-5460" or "5461". */
-static void clusterRaftApplySlotChange(sds data) {
+/* Apply a SLOT_CHANGE entry. Format: "<node-id-or-dash> <ranges...> <source-epoch> <target-epoch>"
+ * Ranges use the same format as nodes.conf: "0-5460" or "5461".
+ * The last two fields are the source and target shard epochs. */
+static const char *clusterRaftApplySlotChange(sds data) {
+    const char *error = NULL;
     int argc;
     sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc);
-    if (!argv || argc < 2) goto done;
+    /* Need at least: target + one_range + source_epoch + target_epoch = 4 */
+    if (!argv || argc < 4) goto done;
 
-    clusterNode *target = (sdslen(argv[0]) == CLUSTER_NAMELEN)
-                              ? clusterLookupNode(argv[0], CLUSTER_NAMELEN)
-                              : NULL;
+    slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc);
+    if (!info.valid) goto done;
 
-    for (int i = 1; i < argc; i++) {
+    /* Epoch validation: validate both source and target epochs. */
+    if (!clusterCheckAndBumpSlotChangeEpochs(info.source_shard_id, info.source_epoch,
+                                             info.target_shard_id, info.target_epoch)) {
+        error = "stale shard epoch";
+        goto done;
+    }
+
+    /* Range fields are argv[1] through argv[range_end-1] (excluding last two epoch fields). */
+    for (int i = 1; i < info.range_end; i++) {
         int start, end;
         char *p = strchr(argv[i], '-');
         if (p) {
@@ -2804,17 +3054,17 @@ static void clusterRaftApplySlotChange(sds data) {
             start = end = atoi(argv[i]);
         }
         for (int j = start; j <= end; j++) {
-            if (target == myself || server.cluster->slots[j] == myself)
+            if (info.target == myself || server.cluster->slots[j] == myself)
                 RAFT_STATE()->todo_save_config = 1;
-            if (target) {
+            if (info.target) {
                 /* If this slot is moving away from myself, delete keys. */
-                if (server.cluster->slots[j] == myself && target != myself) {
+                if (server.cluster->slots[j] == myself && info.target != myself) {
                     serverLog(LL_NOTICE, "Deleting keys in dirty slot %d on node %.40s",
                               j, myself->name);
                     delKeysInSlot(j, server.lazyfree_lazy_server_del, true, false);
                 }
                 if (server.cluster->slots[j]) clusterDelSlot(j);
-                clusterAddSlot(target, j);
+                clusterAddSlot(info.target, j);
             } else {
                 if (server.cluster->slots[j] == myself) {
                     delKeysInSlot(j, server.lazyfree_lazy_server_del, true, false);
@@ -2825,15 +3075,17 @@ static void clusterRaftApplySlotChange(sds data) {
     }
 done:
     if (argv) sdsfreesplitres(argv, argc);
+    return error;
 }
 
 /* Apply a SET_REPLICA_OF entry.
- * Format: "<replica-id> <primary-id-or-dash> <shard-id>" */
-static void clusterRaftApplySetReplica(sds data) {
+ * Format: "<replica-id> <primary-id-or-dash> <shard-id> <epoch>" */
+static const char *clusterRaftApplySetReplica(sds data) {
     clusterRaftState *rs = RAFT_STATE();
+    const char *error = NULL;
     int argc;
     sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc);
-    if (!argv || argc != 3) goto done;
+    if (!argv || argc != 4) goto done;
 
     clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0]));
     if (!replica) goto done;
@@ -2842,6 +3094,13 @@ static void clusterRaftApplySetReplica(sds data) {
 
     char *shard_id = argv[2];
 
+    /* Epoch validation: validate against the shard-id from the entry. */
+    uint64_t epoch = strtoull(argv[3], NULL, 10);
+    if (!clusterCheckAndBumpShardEpoch(shard_id, epoch)) {
+        error = "stale shard epoch";
+        goto done;
+    }
+
     if (sdslen(argv[1]) == 1 && argv[1][0] == '-') {
         /* Promote to primary with the shard-id from the entry. */
         if (nodeIsReplica(replica)) {
@@ -2884,20 +3143,30 @@ static void clusterRaftApplySetReplica(sds data) {
     }
 done:
     if (argv) sdsfreesplitres(argv, argc);
+    return error;
 }
 
-/* Apply a FAILOVER entry. Format: "<replica-id> <primary-id>"
+/* Apply a FAILOVER entry. Format: "<replica-id> <primary-id> <epoch>"
  * The replica takes over the primary's slots and becomes primary.
  * The old primary becomes a replica of the new primary. */
-static void clusterRaftApplyFailover(sds data) {
+static const char *clusterRaftApplyFailover(sds data) {
     clusterRaftState *rs = RAFT_STATE();
+    const char *error = NULL;
     int argc;
     sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc);
-    if (!argv || argc != 2) goto done;
+    if (!argv || argc != 3) goto done;
 
     clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0]));
     clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1]));
     if (!replica || !primary) goto done;
+
+    /* Epoch validation: validate against primary's shard before domain logic. */
+    uint64_t epoch = strtoull(argv[2], NULL, 10);
+    if (!clusterCheckAndBumpShardEpoch(primary->shard_id, epoch)) {
+        error = "stale shard epoch";
+        goto done;
+    }
+
     if (!nodeIsReplica(replica) || nodeIsReplica(primary) || replica->replicaof != primary) goto done;
 
     /* Transfer slots from old primary to new primary. */
@@ -2946,11 +3215,16 @@ static void clusterRaftApplyFailover(sds data) {
     }
 done:
     if (argv) sdsfreesplitres(argv, argc);
+    return error;
 }
 
 static void clusterRaftForgetNode(const char *node_id, size_t id_len, void *ctx, void (*callback)(void *ctx, const char *error)) {
     sds entry = sdsnew("NODE_FORGET ");
     entry = sdscatlen(entry, node_id, id_len);
+    /* Append departing node's shard epoch. */
+    clusterNode *node = clusterLookupNode(node_id, id_len);
+    uint64_t epoch = node ? clusterGetShardEpoch(node->shard_id) : 0;
+    entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
     clusterRaftPropose(entry, ctx, callback);
     sdsfree(entry);
 }
@@ -2972,6 +3246,9 @@ static void clusterRaftSetReplicaOf(clusterNode *primary, void *ctx, void (*call
         getRandomHexChars(new_shard_id, CLUSTER_NAMELEN);
         entry = sdscatlen(entry, new_shard_id, CLUSTER_NAMELEN);
     }
+    /* Append target shard epoch: use primary's shard epoch for assignment, 0 for promotion. */
+    uint64_t epoch = primary ? clusterGetShardEpoch(primary->shard_id) : 0;
+    entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
     clusterRaftPropose(entry, ctx, callback);
     sdsfree(entry);
 }
@@ -2990,6 +3267,9 @@ static void clusterRaftFailover(int force, int takeover, void *ctx, void (*callb
         entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
         entry = sdscatlen(entry, " ", 1);
         entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN);
+        /* Append primary's shard epoch. */
+        uint64_t epoch = clusterGetShardEpoch(primary->shard_id);
+        entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
         clusterRaftPropose(entry, ctx, callback);
         sdsfree(entry);
     } else {
@@ -3085,6 +3365,7 @@ static void clusterRaftResetCluster(int hard) {
 
     /* Recreate shards dict. */
     dictEmpty(server.cluster->shards, NULL);
+    dictEmpty(server.cluster->shard_epochs, NULL);
 
     /* Forget all nodes except myself. */
     dictIterator *di = dictGetSafeIterator(server.cluster->nodes);
diff --git a/src/cluster_state.c b/src/cluster_state.c
index ad8d6ade13f..e9df0344d07 100644
--- a/src/cluster_state.c
+++ b/src/cluster_state.c
@@ -62,6 +62,14 @@ dictType clusterSlotDictType = {
     .entryDestructor = zfree,
 };
 
+/* Shard epoch table, mapping shard_id to uint64_t epoch (stored inline). */
+dictType clusterShardEpochDictType = {
+    .entryGetKey = dictEntryGetKey,
+    .hashFunction = dictSdsHash,
+    .keyCompare = dictSdsKeyCompare,
+    .entryDestructor = dictEntryDestructorSdsKey,
+};
+
 /* -----------------------------------------------------------------------------
  * Bitmap helpers
  * -------------------------------------------------------------------------- */
@@ -362,6 +370,14 @@ void clusterAddNodeToShard(const char *shard_id, clusterNode *node) {
         list *l = listCreate();
         listAddNodeTail(l, node);
         serverAssert(dictAdd(server.cluster->shards, s, l) == DICT_OK);
+        /* Initialize shard epoch to 0 for the new shard. */
+        sds epoch_key = sdsnewlen(shard_id, CLUSTER_NAMELEN);
+        dictEntry *epoch_de = dictAddRaw(server.cluster->shard_epochs, epoch_key, NULL);
+        if (epoch_de) {
+            dictSetUnsignedIntegerVal(epoch_de, 0);
+        } else {
+            sdsfree(epoch_key);
+        }
     } else {
         list *l = dictGetVal(de);
         if (listSearchKey(l, node) == NULL) {
@@ -382,11 +398,46 @@ void clusterRemoveNodeFromShard(clusterNode *node) {
         }
         if (listLength(l) == 0) {
             dictDelete(server.cluster->shards, s);
+            /* Remove shard epoch when shard is destroyed. */
+            dictDelete(server.cluster->shard_epochs, s);
         }
     }
     sdsfree(s);
 }
 
+/* --------------------------------------------------------------------------
+ * Shard epoch helpers
+ * -------------------------------------------------------------------------- */
+
+uint64_t clusterGetShardEpoch(const char *shard_id) {
+    sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN);
+    dictEntry *de = dictFind(server.cluster->shard_epochs, s);
+    sdsfree(s);
+    return de ? dictGetUnsignedIntegerVal(de) : 0;
+}
+
+void clusterSetShardEpoch(const char *shard_id, uint64_t epoch) {
+    sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN);
+    dictEntry *de = dictFind(server.cluster->shard_epochs, s);
+    if (de) {
+        dictSetUnsignedIntegerVal(de, epoch);
+        sdsfree(s);
+    } else {
+        de = dictAddRaw(server.cluster->shard_epochs, s, NULL);
+        if (de) {
+            dictSetUnsignedIntegerVal(de, epoch);
+        } else {
+            sdsfree(s);
+        }
+    }
+}
+
+void clusterRemoveShardEpoch(const char *shard_id) {
+    sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN);
+    dictDelete(server.cluster->shard_epochs, s);
+    sdsfree(s);
+}
+
 /* -----------------------------------------------------------------------------
  * Slot migration state
  * -------------------------------------------------------------------------- */
diff --git a/src/cluster_state.h b/src/cluster_state.h
index 3a9bb49ab08..34485d78e7d 100644
--- a/src/cluster_state.h
+++ b/src/cluster_state.h
@@ -11,6 +11,7 @@ typedef struct clusterLink clusterLink;
 extern dictType clusterNodesDictType;
 extern dictType clusterSdsToListType;
 extern dictType clusterSlotDictType;
+extern dictType clusterShardEpochDictType;
 
 /* Cluster node flags and macros. */
 #define CLUSTER_NODE_PRIMARY (1 << 0)    /* The node is a primary */
@@ -85,6 +86,7 @@ struct clusterState {
     int size;            /* Num of primary nodes with at least one slot */
     dict *nodes;         /* Hash table of name -> clusterNode structures */
     dict *shards;        /* Hash table of shard_id -> list (of nodes) structures */
+    dict *shard_epochs;  /* Hash table of shard_id -> uint64_t epoch */
     dict *migrating_slots_to;
     dict *importing_slots_from;
     clusterNode *slots[CLUSTER_SLOTS];
@@ -127,6 +129,9 @@ int clusterPrimariesHaveReplicas(void);
 int clusterNodeClearSlotBit(clusterNode *n, int slot);
 void clusterRemoveNodeFromShard(clusterNode *node);
 void clusterAddNodeToShard(const char *shard_id, clusterNode *node);
+uint64_t clusterGetShardEpoch(const char *shard_id);
+void clusterSetShardEpoch(const char *shard_id, uint64_t epoch);
+void clusterRemoveShardEpoch(const char *shard_id);
 void clusterAddNode(clusterNode *node);
 void clusterDelNode(clusterNode *node);
 void clusterRenameNode(clusterNode *node, char *newname);
diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl
index a33a2c7a7b0..565cb15b976 100644
--- a/tests/unit/cluster/cluster-raft-proto.tcl
+++ b/tests/unit/cluster/cluster-raft-proto.tcl
@@ -515,4 +515,127 @@ test "Raft proto: leader sends REPL_OFFSETS after follower offset changes" {
     }
 }
 
+# --------------------------------------------------------------------------
+# Shard epoch: stale proposal rejection tests (parameterized).
+# These verify that the leader rejects stale proposals BEFORE appending
+# to the raft log (commit index unchanged, cluster state unchanged).
+# --------------------------------------------------------------------------
+
+proc get_cluster_info_field {client field} {
+    set info [$client CLUSTER INFO]
+    foreach line [split $info "\n"] {
+        set line [string trim $line "\r"]
+        if {[string match "${field}:*" $line]} {
+            return [lindex [split $line ":"] 1]
+        }
+    }
+    return ""
+}
+
+start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft cluster-node-timeout 2000}} {
+    # Shared setup: form cluster and assign slots.
+    set r0 [srv 0 client]
+    set r1 [srv -1 client]
+
+    $r0 CLUSTER MEET [srv -1 host] [srv -1 port]
+
+    wait_for_condition 50 100 {
+        [get_cluster_info_field $r0 cluster_size] == 2 &&
+        [get_cluster_info_field $r1 cluster_size] == 2
+    } else {
+        fail "Cluster did not form"
+    }
+
+    $r0 CLUSTER ADDSLOTSRANGE 0 16383
+
+    wait_for_condition 50 100 {
+        [get_cluster_info_field $r0 cluster_slots_assigned] == 16384 &&
+        [get_cluster_info_field $r1 cluster_slots_assigned] == 16384
+    } else {
+        fail "Slots not assigned"
+    }
+
+    set node_id [$r0 CLUSTER MYID]
+    set node1_id [$r1 CLUSTER MYID]
+    set port [srv 0 port]
+    set cport [expr {$port + 10000}]
+
+    # Helper: connect to cluster bus and complete HELLO handshake.
+    proc connect_fake_node {cport} {
+        set fake_id [string repeat "f" 40]
+        set fake_shard [string repeat "e" 40]
+        set fake_addr "127.0.0.1:9999@19999,,tls-port=0,shard-id=$fake_shard"
+        set fd [raft_connect 127.0.0.1 $cport]
+        raft_send $fd "HELLO $fake_id $fake_addr"
+        set reply [raft_recv $fd]
+        assert_match "HI *" $reply
+        return $fd
+    }
+
+    # Each entry: {entry_type propose_msg state_check}
+    set stale_proposals [list \
+        [list "SLOT_CHANGE" "PROPOSE SLOT_CHANGE - 50 0 0" {
+            assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned]
+        }] \
+        [list "FAILOVER" "PROPOSE FAILOVER $node1_id $node_id 0" {
+            set nodes [$r0 CLUSTER NODES]
+            assert_match "*$node_id*myself,master*" $nodes
+            assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned]
+        }] \
+    ]
+
+    foreach case $stale_proposals {
+        lassign $case entry_type propose_msg state_check
+
+        test "Raft shard epoch: stale $entry_type is rejected (pre-validation, no log append)" {
+            # Record commit index before injection.
+            set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
+
+            # Connect and inject stale PROPOSE.
+            set fd [connect_fake_node $cport]
+            raft_send $fd $propose_msg
+            after 500
+
+            # Verify log index unchanged (rejected at pre-validation).
+            set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
+            assert_equal $commit_before $commit_after
+
+            # Verify cluster state unchanged.
+            eval $state_check
+
+            close $fd
+        }
+    }
+
+    # --------------------------------------------------------------------------
+    # Missing epoch: proposals without the required epoch field are rejected
+    # at pre-validation (never appended to the log).
+    # --------------------------------------------------------------------------
+
+    set missing_epoch_proposals [list \
+        [list "SLOT_CHANGE (missing epoch)" "PROPOSE SLOT_CHANGE - 50"] \
+        [list "FAILOVER (missing epoch)" "PROPOSE FAILOVER $node1_id $node_id"] \
+        [list "SET_REPLICA_OF (missing epoch)" "PROPOSE SET_REPLICA_OF $node1_id $node_id [string repeat a 40]"] \
+        [list "NODE_FORGET (missing epoch)" "PROPOSE NODE_FORGET $node1_id"] \
+    ]
+
+    foreach case $missing_epoch_proposals {
+        lassign $case label propose_msg
+
+        test "Raft shard epoch: $label is rejected (pre-validation, no log append)" {
+            set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
+
+            set fd [connect_fake_node $cport]
+            raft_send $fd $propose_msg
+            after 500
+
+            # Verify log index unchanged (rejected at pre-validation).
+            set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
+            assert_equal $commit_before $commit_after
+
+            close $fd
+        }
+    }
+}
+
 } ;# tags

From ebc2456e63fcaad7160f228a121b94110f7a3482 Mon Sep 17 00:00:00 2001
From: Sushil Paneru <sushil.paneru1@gmail.com>
Date: Fri, 5 Jun 2026 03:58:07 +0000
Subject: [PATCH 2/9] Use existing configEpoch field API for shard epoch

Signed-off-by: Sushil Paneru <sushil.paneru1@gmail.com>
---
 design-docs/cluster-raft.md | 12 +++----
 src/cluster.c               |  1 -
 src/cluster_legacy.c        |  1 -
 src/cluster_raft.c          | 70 +++++++++++++++++++++++++++++++++++--
 src/cluster_state.c         | 51 ---------------------------
 src/cluster_state.h         |  5 ---
 6 files changed, 73 insertions(+), 67 deletions(-)

diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md
index d389142ac84..67dca8267e3 100644
--- a/design-docs/cluster-raft.md
+++ b/design-docs/cluster-raft.md
@@ -212,9 +212,9 @@ changes are infrequent.
 ## PROPOSE and Leader Validation
 
 Followers forward proposals to the leader using the PROPOSE message,
-sent on the outbound link to the leader. The leader always accepts
-proposals without validation — it appends them to the log and
-replicates them. Validation happens at apply time, where the apply
+sent on the outbound link to the leader. The leader accepts
+proposals with best effort pre-validations — it appends them to the log and
+replicates them. Authoritative validation happens at apply time, where the apply
 function can detect conflicts and treat them as no-ops.
 
 This design simplifies the leader: it doesn't need to understand the
@@ -738,9 +738,9 @@ the corresponding keys.
 A shard-epoch is a per-shard monotonically increasing counter stored
 in `server.cluster->shard_epochs`. It is bumped each time a topology
 change is applied to the shard. Entries that modify shard topology
-include the shard's current epoch at proposal time. On apply, if the
-epoch has advanced past the value in the entry, the entry is stale
-and becomes a no-op.
+include the shard's current epoch at proposal time. Epoch is validated
+at prepare time and at apply time. If the epoch has advanced past the
+value in the entry, the entry is stale and is ignored.
 
 ### Example: slot migration racing with failover
 
diff --git a/src/cluster.c b/src/cluster.c
index ffc614b70ac..3a7907fa738 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -70,7 +70,6 @@ void clusterInit(void) {
     server.cluster->size = 0;
     server.cluster->nodes = dictCreate(&clusterNodesDictType);
     server.cluster->shards = dictCreate(&clusterSdsToListType);
-    server.cluster->shard_epochs = dictCreate(&clusterShardEpochDictType);
     server.cluster->migrating_slots_to = dictCreate(&clusterSlotDictType);
     server.cluster->importing_slots_from = dictCreate(&clusterSlotDictType);
     server.cluster->stat_cluster_links_buffer_limit_exceeded = 0;
diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 65b46670389..bb62df1d35f 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -735,7 +735,6 @@ static void clusterLegacyReset(int hard) {
 
     /* Recreate shards dict */
     dictEmpty(server.cluster->shards, NULL);
-    dictEmpty(server.cluster->shard_epochs, NULL);
 
     /* Forget all the nodes, but myself. */
     di = dictGetSafeIterator(server.cluster->nodes);
diff --git a/src/cluster_raft.c b/src/cluster_raft.c
index 30831a540c5..e77c6fae0e8 100644
--- a/src/cluster_raft.c
+++ b/src/cluster_raft.c
@@ -44,6 +44,14 @@
 /* From module.c */
 void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len);
 
+/* Shard epoch dict type, mapping shard_id to epoch. */
+static dictType raftShardEpochDictType = {
+    .entryGetKey = dictEntryGetKey,
+    .hashFunction = dictSdsHash,
+    .keyCompare = dictSdsKeyCompare,
+    .entryDestructor = dictEntryDestructorSdsKey,
+};
+
 /* --------------------------------------------------------------------------
  * Raft log entry types — what gets replicated
  * -------------------------------------------------------------------------- */
@@ -177,6 +185,9 @@ typedef struct {
     uint64_t stats_pubsub_bytes_received;
     uint64_t stats_module_bytes_sent;
     uint64_t stats_module_bytes_received;
+
+    /* Shard epoch tracking (per-shard monotonic counter for stale proposal detection). */
+    dict *shard_epochs;
 } clusterRaftState;
 
 #define RAFT_STATE() ((clusterRaftState *)server.cluster->protocol_data)
@@ -824,6 +835,38 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx,
     }
 }
 
+/* --------------------------------------------------------------------------
+ * Shard epoch helpers
+ * -------------------------------------------------------------------------- */
+
+static uint64_t clusterGetShardEpoch(const char *shard_id) {
+    clusterRaftState *rs = RAFT_STATE();
+    sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN);
+    dictEntry *de = dictFind(rs->shard_epochs, s);
+    sdsfree(s);
+    return de ? dictGetUnsignedIntegerVal(de) : 0;
+}
+
+static void clusterSetShardEpoch(const char *shard_id, uint64_t epoch) {
+    clusterRaftState *rs = RAFT_STATE();
+    sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN);
+    dictEntry *de = dictFind(rs->shard_epochs, s);
+    if (de) {
+        /* Existing entry — just update the value. */
+        dictSetUnsignedIntegerVal(de, epoch);
+        sdsfree(s);
+    } else {
+        /* Only create a new epoch entry if the shard actually exists. */
+        dictEntry *shard_de = dictFind(server.cluster->shards, s);
+        if (shard_de) {
+            de = dictAddRaw(rs->shard_epochs, s, NULL);
+            dictSetUnsignedIntegerVal(de, epoch);
+        } else {
+            sdsfree(s);
+        }
+    }
+}
+
 static int isShardEpochCurrent(const char *shard_id, uint64_t entry_epoch) {
     uint64_t current = clusterGetShardEpoch(shard_id);
     return (current == 0 || entry_epoch == current);
@@ -1926,7 +1969,11 @@ static void clusterRaftInit(void) {
     rs->my_last_committed_info = sdsempty();
     rs->last_node_info_check = monotonicMs();
     rs->last_repl_offsets_broadcast = monotonicMs();
+<<<<<<< HEAD
     rs->todo_update_slot_coverage = 1;
+=======
+    rs->shard_epochs = dictCreate(&raftShardEpochDictType);
+>>>>>>> 4ec207fbe (Use existing configEpoch field API for shard epoch)
     server.cluster->size = 0; /* Incremented by NODE_JOIN apply */
 }
 
@@ -2439,6 +2486,7 @@ static void clusterRaftHandleServerShutdown(void) {
     }
     zfree(rs->log);
     sdsfree(rs->my_last_committed_info);
+    dictRelease(rs->shard_epochs);
     zfree(rs);
     server.cluster->protocol_data = NULL;
 }
@@ -2664,6 +2712,22 @@ static void clusterRaftFreeNodeData(clusterNode *node) {
     zfree(node->protocol_data);
 }
 
+/* For raft, the config_epoch field in nodes.conf stores the shard epoch.
+ * ping_sent and pong_received have no meaning in raft (no gossip). */
+static void clusterRaftGetNodePingPongEpoch(clusterNode *node, long long *ping_sent, long long *pong_received, uint64_t *config_epoch) {
+    *ping_sent = 0;
+    *pong_received = 0;
+    *config_epoch = clusterGetShardEpoch(node->shard_id);
+}
+
+/* On load, the config_epoch field is the shard epoch for this node's shard.
+ * We use it to restore the shard_epochs dict. ping/pong are ignored. */
+static void clusterRaftSetNodePingPongEpoch(clusterNode *node, int ping_active, int pong_active, uint64_t shard_epoch) {
+    UNUSED(ping_active);
+    UNUSED(pong_active);
+    clusterSetShardEpoch(node->shard_id, shard_epoch);
+}
+
 static sds clusterRaftAppendVarsLine(sds config) {
     clusterRaftState *rs = RAFT_STATE();
     config = sdscatprintf(config, "vars currentTerm %llu lastApplied %llu",
@@ -3365,7 +3429,7 @@ static void clusterRaftResetCluster(int hard) {
 
     /* Recreate shards dict. */
     dictEmpty(server.cluster->shards, NULL);
-    dictEmpty(server.cluster->shard_epochs, NULL);
+    dictEmpty(rs->shard_epochs, NULL);
 
     /* Forget all nodes except myself. */
     dictIterator *di = dictGetSafeIterator(server.cluster->nodes);
@@ -3478,8 +3542,8 @@ clusterBusType clusterRaftBus = {
     .resetStats = clusterRaftResetStats,
     .appendInfoFields = clusterRaftAppendInfoFields,
     .getFailureReportsCount = clusterRaftGetFailureReportsCount,
-    .getNodePingPongEpoch = NULL,
-    .setNodePingPongEpoch = NULL,
+    .getNodePingPongEpoch = clusterRaftGetNodePingPongEpoch,
+    .setNodePingPongEpoch = clusterRaftSetNodePingPongEpoch,
     .setNodeFailed = NULL,
     .appendVarsLine = clusterRaftAppendVarsLine,
     .parseVarsLine = clusterRaftParseVarsLine,
diff --git a/src/cluster_state.c b/src/cluster_state.c
index e9df0344d07..ad8d6ade13f 100644
--- a/src/cluster_state.c
+++ b/src/cluster_state.c
@@ -62,14 +62,6 @@ dictType clusterSlotDictType = {
     .entryDestructor = zfree,
 };
 
-/* Shard epoch table, mapping shard_id to uint64_t epoch (stored inline). */
-dictType clusterShardEpochDictType = {
-    .entryGetKey = dictEntryGetKey,
-    .hashFunction = dictSdsHash,
-    .keyCompare = dictSdsKeyCompare,
-    .entryDestructor = dictEntryDestructorSdsKey,
-};
-
 /* -----------------------------------------------------------------------------
  * Bitmap helpers
  * -------------------------------------------------------------------------- */
@@ -370,14 +362,6 @@ void clusterAddNodeToShard(const char *shard_id, clusterNode *node) {
         list *l = listCreate();
         listAddNodeTail(l, node);
         serverAssert(dictAdd(server.cluster->shards, s, l) == DICT_OK);
-        /* Initialize shard epoch to 0 for the new shard. */
-        sds epoch_key = sdsnewlen(shard_id, CLUSTER_NAMELEN);
-        dictEntry *epoch_de = dictAddRaw(server.cluster->shard_epochs, epoch_key, NULL);
-        if (epoch_de) {
-            dictSetUnsignedIntegerVal(epoch_de, 0);
-        } else {
-            sdsfree(epoch_key);
-        }
     } else {
         list *l = dictGetVal(de);
         if (listSearchKey(l, node) == NULL) {
@@ -398,46 +382,11 @@ void clusterRemoveNodeFromShard(clusterNode *node) {
         }
         if (listLength(l) == 0) {
             dictDelete(server.cluster->shards, s);
-            /* Remove shard epoch when shard is destroyed. */
-            dictDelete(server.cluster->shard_epochs, s);
         }
     }
     sdsfree(s);
 }
 
-/* --------------------------------------------------------------------------
- * Shard epoch helpers
- * -------------------------------------------------------------------------- */
-
-uint64_t clusterGetShardEpoch(const char *shard_id) {
-    sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN);
-    dictEntry *de = dictFind(server.cluster->shard_epochs, s);
-    sdsfree(s);
-    return de ? dictGetUnsignedIntegerVal(de) : 0;
-}
-
-void clusterSetShardEpoch(const char *shard_id, uint64_t epoch) {
-    sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN);
-    dictEntry *de = dictFind(server.cluster->shard_epochs, s);
-    if (de) {
-        dictSetUnsignedIntegerVal(de, epoch);
-        sdsfree(s);
-    } else {
-        de = dictAddRaw(server.cluster->shard_epochs, s, NULL);
-        if (de) {
-            dictSetUnsignedIntegerVal(de, epoch);
-        } else {
-            sdsfree(s);
-        }
-    }
-}
-
-void clusterRemoveShardEpoch(const char *shard_id) {
-    sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN);
-    dictDelete(server.cluster->shard_epochs, s);
-    sdsfree(s);
-}
-
 /* -----------------------------------------------------------------------------
  * Slot migration state
  * -------------------------------------------------------------------------- */
diff --git a/src/cluster_state.h b/src/cluster_state.h
index 34485d78e7d..3a9bb49ab08 100644
--- a/src/cluster_state.h
+++ b/src/cluster_state.h
@@ -11,7 +11,6 @@ typedef struct clusterLink clusterLink;
 extern dictType clusterNodesDictType;
 extern dictType clusterSdsToListType;
 extern dictType clusterSlotDictType;
-extern dictType clusterShardEpochDictType;
 
 /* Cluster node flags and macros. */
 #define CLUSTER_NODE_PRIMARY (1 << 0)    /* The node is a primary */
@@ -86,7 +85,6 @@ struct clusterState {
     int size;            /* Num of primary nodes with at least one slot */
     dict *nodes;         /* Hash table of name -> clusterNode structures */
     dict *shards;        /* Hash table of shard_id -> list (of nodes) structures */
-    dict *shard_epochs;  /* Hash table of shard_id -> uint64_t epoch */
     dict *migrating_slots_to;
     dict *importing_slots_from;
     clusterNode *slots[CLUSTER_SLOTS];
@@ -129,9 +127,6 @@ int clusterPrimariesHaveReplicas(void);
 int clusterNodeClearSlotBit(clusterNode *n, int slot);
 void clusterRemoveNodeFromShard(clusterNode *node);
 void clusterAddNodeToShard(const char *shard_id, clusterNode *node);
-uint64_t clusterGetShardEpoch(const char *shard_id);
-void clusterSetShardEpoch(const char *shard_id, uint64_t epoch);
-void clusterRemoveShardEpoch(const char *shard_id);
 void clusterAddNode(clusterNode *node);
 void clusterDelNode(clusterNode *node);
 void clusterRenameNode(clusterNode *node, char *newname);

From 5219c9c04dbd21da6d76a84560ee823706189043 Mon Sep 17 00:00:00 2001
From: Sushil Paneru <sushil.paneru1@gmail.com>
Date: Fri, 5 Jun 2026 06:08:33 +0000
Subject: [PATCH 3/9] Changed message format for FAILOVER and SLOT_CHANGE

Signed-off-by: Sushil Paneru <sushil.paneru1@gmail.com>
---
 design-docs/cluster-raft.md               |   8 +-
 src/cluster_raft.c                        | 146 +++++++++++-----------
 tests/unit/cluster/cluster-raft-proto.tcl |  55 +++++++-
 3 files changed, 130 insertions(+), 79 deletions(-)

diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md
index 67dca8267e3..1898b4fc17f 100644
--- a/design-docs/cluster-raft.md
+++ b/design-docs/cluster-raft.md
@@ -141,7 +141,7 @@ NODE_JOIN <node-id> <address>
 NODE_FORGET <node-id>
     Remove a node from the cluster (CLUSTER FORGET). Not yet implemented.
 
-SLOT_CHANGE <node-id-or-dash> <range> [<range> ...]
+SLOT_CHANGE <source-node-id-or-dash> <source-epoch> <target-node-id-or-dash> <target-epoch> <range> [<range> ...]
     Assign or remove slot ownership. A dash means "no owner" (delete
     slots). Ranges use the nodes.conf format: "0-5460" or "5461".
 
@@ -171,7 +171,7 @@ NODE_RECOVER <node-id>
 ```
 
 Ranges in SLOT_CHANGE use the same format as nodes.conf: `0-5460` or
-`5461`. A dash as node-id means "no owner" (delete slots) or "no
+`5461`. A dash as source/target-id means "no owner" (delete slots) or "no
 primary" (promote to primary).
 
 ### Why typed entries instead of a key-value store?
@@ -759,9 +759,9 @@ value in the entry, the entry is stale and is ignored.
 ### Entry formats with epoch
 
 ```
-FAILOVER <replica-id> <primary-id> <epoch>
+FAILOVER <replica-id> <primary-id> <shard-id> <shard-epoch>
 SET_REPLICA_OF <replica-id> <primary-id-or-dash> <shard-id> <epoch>
-SLOT_CHANGE <target-id-or-dash> <ranges...> <source-epoch> <target-epoch>
+SLOT_CHANGE <source-node-id-or-dash> <source-epoch> <target-node-id-or-dash> <target-epoch> <ranges...>
 NODE_FORGET <node-id> <epoch>
 ```
 
diff --git a/src/cluster_raft.c b/src/cluster_raft.c
index e77c6fae0e8..f879e8bda8d 100644
--- a/src/cluster_raft.c
+++ b/src/cluster_raft.c
@@ -433,8 +433,9 @@ static void clusterRaftSelfJoin(void) {
             slots = sdscatfmt(slots, " %i-%i", start, j);
     }
     if (sdslen(slots) > 0) {
-        entry = sdsnew("SLOT_CHANGE ");
+        entry = sdsnew("SLOT_CHANGE - 0 ");
         entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
+        entry = sdscatfmt(entry, " %U", (unsigned long long)0);
         entry = sdscatsds(entry, slots);
         clusterRaftPropose(entry, NULL, NULL);
         sdsfree(entry);
@@ -731,7 +732,7 @@ static int clusterRaftProcessMeetRejected(clusterLink *link, int argc, sds *argv
  *
  * Examples:
  *   PROPOSE NODE_JOIN <node-id> <address>
- *   PROPOSE SLOT_CHANGE <node-id-or-dash> <range> [<range> ...]
+ *   PROPOSE SLOT_CHANGE <source-id-or-dash> <source-epoch> <target-id-or-dash> <target-epoch> <range> [<range> ...]
  * -------------------------------------------------------------------------- */
 static int raftEntryTypeByName(const char *name) {
     if (!strcasecmp(name, "NODE_JOIN")) return RAFT_ENTRY_NODE_JOIN;
@@ -761,7 +762,7 @@ static const char *raftEntryTypeName(uint8_t type) {
 
 /* Propose a log entry. The entry is an sds containing the type name
  * followed by the data, e.g. "NODE_JOIN <id> <addr>" or
- * "SLOT_CHANGE <id> 0-5460". On the leader, it's appended directly
+ * "SLOT_CHANGE <src-id> <src-epoch> <target-id> <target-epoch> 0-5460". On the leader, it's appended directly
  * to the log. On followers, it's forwarded to the leader. */
 static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, const char *error)) {
     clusterRaftState *rs = RAFT_STATE();
@@ -873,56 +874,48 @@ static int isShardEpochCurrent(const char *shard_id, uint64_t entry_epoch) {
 }
 
 /* Parsed fields from a SLOT_CHANGE entry's epoch-related data.
- * Format: "<target-id-or-dash> <ranges...> <source-epoch> <target-epoch>" */
+ * Format: "<source-id-or-dash> <source-epoch> <target-id-or-dash> <target-epoch> <ranges...>" */
 typedef struct {
     clusterNode *target;         /* Target node (NULL if dash). */
-    clusterNode *source_owner;   /* Current owner of the first slot in the range. */
+    clusterNode *source_owner;   /* Source node (NULL if dash). */
     const char *source_shard_id; /* source_owner->shard_id or NULL. */
     const char *target_shard_id; /* target->shard_id or NULL. */
     uint64_t source_epoch;
     uint64_t target_epoch;
-    int range_end; /* Index in argv where ranges end (argc - 2). */
+    int range_end; /* Index in argv where ranges begin (first range element). */
     int valid;     /* 1 if parsing succeeded, 0 otherwise. */
 } slotChangeEpochInfo;
 
-/* Parse epoch-related fields from a SLOT_CHANGE entry's split argv.
- * Caller must have split data and verified argc >= 4. */
+/* Parse fields from a SLOT_CHANGE entry's split argv.
+ * Format: "<source-id-or-dash> <source-epoch> <target-id-or-dash> <target-epoch> <ranges...>"
+ * Caller must have verified argc >= 5. */
 static slotChangeEpochInfo parseSlotChangeEpochs(sds *argv, int argc) {
     slotChangeEpochInfo info = {0};
-    if (argc < 4) return info;
 
-    info.target = (sdslen(argv[0]) == CLUSTER_NAMELEN)
-                      ? clusterLookupNode(argv[0], CLUSTER_NAMELEN)
-                      : NULL;
-    info.source_epoch = strtoull(argv[argc - 2], NULL, 10);
-    info.target_epoch = strtoull(argv[argc - 1], NULL, 10);
-    info.range_end = argc - 2;
-
-    /* Derive source shard from current owner of the first slot in the range. */
-    int first_start;
-    char *dash = strchr(argv[1], '-');
-    if (dash) {
-        *dash = '\0';
-        first_start = atoi(argv[1]);
-        *dash = '-';
-    } else {
-        first_start = atoi(argv[1]);
-    }
-    info.source_owner = (first_start >= 0 && first_start < CLUSTER_SLOTS)
-                            ? server.cluster->slots[first_start]
+    /* argv[0] = source-id-or-dash */
+    info.source_owner = (sdslen(argv[0]) == CLUSTER_NAMELEN)
+                            ? clusterLookupNode(argv[0], CLUSTER_NAMELEN)
                             : NULL;
+    /* argv[1] = source-epoch */
+    info.source_epoch = strtoull(argv[1], NULL, 10);
+    /* argv[2] = target-id-or-dash */
+    info.target = (sdslen(argv[2]) == CLUSTER_NAMELEN)
+                      ? clusterLookupNode(argv[2], CLUSTER_NAMELEN)
+                      : NULL;
+    /* argv[3] = target-epoch */
+    info.target_epoch = strtoull(argv[3], NULL, 10);
+    /* argv[4..argc-1] = ranges */
+    info.range_end = 4;
+
     info.source_shard_id = info.source_owner ? info.source_owner->shard_id : NULL;
     info.target_shard_id = info.target ? info.target->shard_id : NULL;
-    info.valid = 1;
+    info.valid = (argc > 4);
     return info;
 }
 
 /* Pre-validate shard epoch on the leader before appending to the log.
- * This is an optimization: reject obviously stale proposals early to avoid
- * wasting log space and replication bandwidth.
- * Returns 1 if the entry should be appended, 0 if stale. */
-/* Returns 0 if the entry_epoch is stale relative to the shard's current epoch.
- * Returns 1 if valid (epoch matches or shard is at epoch 0). */
+ * Rejects stale proposals early to avoid wasting log space.
+ * Returns 1 if valid, 0 if stale. */
 static int clusterRaftPreValidateEpoch(int type, sds data) {
     int argc;
     sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc);
@@ -931,15 +924,12 @@ static int clusterRaftPreValidateEpoch(int type, sds data) {
     int ok = 1;
     switch (type) {
     case RAFT_ENTRY_FAILOVER: {
-        /* Format: <replica-id> <primary-id> <epoch> */
-        if (argc < 3) {
+        /* Format: <replica-id> <primary-id> <shard-id> <shard-epoch> */
+        if (argc < 4) {
             ok = 0;
         } else {
-            clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1]));
-            if (primary) {
-                uint64_t epoch = strtoull(argv[2], NULL, 10);
-                ok = isShardEpochCurrent(primary->shard_id, epoch);
-            }
+            uint64_t epoch = strtoull(argv[3], NULL, 10);
+            ok = isShardEpochCurrent(argv[2], epoch);
         }
         break;
     }
@@ -954,8 +944,8 @@ static int clusterRaftPreValidateEpoch(int type, sds data) {
         break;
     }
     case RAFT_ENTRY_SLOT_CHANGE: {
-        /* Format: <target-id-or-dash> <ranges...> <source-epoch> <target-epoch> */
-        if (argc < 4) {
+        /* Format: <source-id-or-dash> <source-epoch> <target-id-or-dash> <target-epoch> <ranges...> */
+        if (argc < 5) {
             ok = 0;
         } else {
             slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc);
@@ -1276,23 +1266,27 @@ static void raftLogApply(raftLogEntry *e) {
 
                     /* Propose SLOT_CHANGE for slots assigned before joining
                      * the cluster (from our singleton state). */
-                    sds entry = sdsnew("SLOT_CHANGE ");
-                    entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
                     int count = 0;
+                    sds slot_range = sdsempty();
                     for (int j = 0; j < CLUSTER_SLOTS; j++) {
                         if (server.cluster->slots[j] != myself) continue;
                         int start = j;
                         while (j + 1 < CLUSTER_SLOTS && server.cluster->slots[j + 1] == myself) j++;
                         if (j == start)
-                            entry = sdscatfmt(entry, " %i", start);
+                            slot_range = sdscatfmt(slot_range, " %i", start);
                         else
-                            entry = sdscatfmt(entry, " %i-%i", start, j);
+                            slot_range = sdscatfmt(slot_range, " %i-%i", start, j);
                         count++;
                     }
                     if (count > 0) {
+                        sds entry = sdsnew("SLOT_CHANGE - 0 ");
+                        entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
+                        entry = sdscatfmt(entry, " %U", (unsigned long long)0);
+                        entry = sdscatsds(entry, slot_range);
                         clusterRaftPropose(entry, NULL, NULL);
+                        sdsfree(entry);
                     }
-                    sdsfree(entry);
+                    sdsfree(slot_range);
                 }
             }
 
@@ -2247,7 +2241,8 @@ static void clusterRaftCron(void) {
             entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
             entry = sdscatlen(entry, " ", 1);
             entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN);
-            /* Append primary's shard epoch. */
+            entry = sdscatlen(entry, " ", 1);
+            entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN);
             uint64_t epoch = clusterGetShardEpoch(primary->shard_id);
             entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
             clusterRaftPropose(entry, rs->mf_ctx, rs->mf_callback);
@@ -2269,7 +2264,8 @@ static void clusterRaftCron(void) {
             entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
             entry = sdscatlen(entry, " ", 1);
             entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN);
-            /* Append primary's shard epoch. */
+            entry = sdscatlen(entry, " ", 1);
+            entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN);
             uint64_t epoch = clusterGetShardEpoch(primary->shard_id);
             entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
             clusterRaftPropose(entry, NULL, NULL);
@@ -3058,9 +3054,17 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode
         return;
     }
 
-    /* Build entry: "SLOT_CHANGE <node-id-or-dash> <ranges...> <source-epoch> <target-epoch>" */
+    /* Build entry: "SLOT_CHANGE <source-id-or-dash> <source-epoch> <target-id-or-dash> <target-epoch> <ranges...>" */
+    clusterNode *source_owner = server.cluster->slots[ranges[0].start_slot];
+    serverAssert(source_owner || target); /* At least one side must be set. */
+    uint64_t source_epoch = (source_owner) ? clusterGetShardEpoch(source_owner->shard_id) : 0;
+    uint64_t target_epoch = target ? clusterGetShardEpoch(target->shard_id) : 0;
+
     sds entry = sdsnew("SLOT_CHANGE ");
+    entry = sdscatlen(entry, source_owner ? source_owner->name : "-", source_owner ? CLUSTER_NAMELEN : 1);
+    entry = sdscatfmt(entry, " %U ", (unsigned long long)source_epoch);
     entry = sdscatlen(entry, target ? target->name : "-", target ? CLUSTER_NAMELEN : 1);
+    entry = sdscatfmt(entry, " %U", (unsigned long long)target_epoch);
     for (int i = 0; i < numranges; i++) {
         if (ranges[i].start_slot == ranges[i].end_slot)
             entry = sdscatfmt(entry, " %i", ranges[i].start_slot);
@@ -3068,12 +3072,6 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode
             entry = sdscatfmt(entry, " %i-%i", ranges[i].start_slot, ranges[i].end_slot);
     }
 
-    /* Append source-epoch and target-epoch as last two fields. */
-    clusterNode *source_owner = server.cluster->slots[ranges[0].start_slot];
-    uint64_t source_epoch = (source_owner) ? clusterGetShardEpoch(source_owner->shard_id) : 0;
-    uint64_t target_epoch = target ? clusterGetShardEpoch(target->shard_id) : 0;
-    entry = sdscatfmt(entry, " %U %U", (unsigned long long)source_epoch, (unsigned long long)target_epoch);
-
     /* Propose through Raft leader. The callback is invoked when the entry is
      * committed and applied. We use two chained callbacks, to handle some
      * side-effects like replica migration, before invoking the original
@@ -3086,15 +3084,14 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode
     sdsfree(entry);
 }
 
-/* Apply a SLOT_CHANGE entry. Format: "<node-id-or-dash> <ranges...> <source-epoch> <target-epoch>"
- * Ranges use the same format as nodes.conf: "0-5460" or "5461".
- * The last two fields are the source and target shard epochs. */
+/* Apply a SLOT_CHANGE entry. Format: "<source-id-or-dash> <source-epoch> <target-id-or-dash> <target-epoch> <ranges...>"
+ * Ranges use the same format as nodes.conf: "0-5460" or "5461". */
 static const char *clusterRaftApplySlotChange(sds data) {
     const char *error = NULL;
     int argc;
     sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc);
-    /* Need at least: target + one_range + source_epoch + target_epoch = 4 */
-    if (!argv || argc < 4) goto done;
+    /* Need at least: source + source_epoch + target + target_epoch + one_range = 5 */
+    if (!argv || argc < 5) goto done;
 
     slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc);
     if (!info.valid) goto done;
@@ -3106,8 +3103,8 @@ static const char *clusterRaftApplySlotChange(sds data) {
         goto done;
     }
 
-    /* Range fields are argv[1] through argv[range_end-1] (excluding last two epoch fields). */
-    for (int i = 1; i < info.range_end; i++) {
+    /* Range fields are argv[range_end] through argv[argc-1]. */
+    for (int i = info.range_end; i < argc; i++) {
         int start, end;
         char *p = strchr(argv[i], '-');
         if (p) {
@@ -3210,7 +3207,7 @@ static const char *clusterRaftApplySetReplica(sds data) {
     return error;
 }
 
-/* Apply a FAILOVER entry. Format: "<replica-id> <primary-id> <epoch>"
+/* Apply a FAILOVER entry. Format: "<replica-id> <primary-id> <shard-id> <shard-epoch>"
  * The replica takes over the primary's slots and becomes primary.
  * The old primary becomes a replica of the new primary. */
 static const char *clusterRaftApplyFailover(sds data) {
@@ -3218,19 +3215,27 @@ static const char *clusterRaftApplyFailover(sds data) {
     const char *error = NULL;
     int argc;
     sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc);
-    if (!argv || argc != 3) goto done;
+    if (!argv || argc != 4) goto done;
 
     clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0]));
     clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1]));
     if (!replica || !primary) goto done;
 
-    /* Epoch validation: validate against primary's shard before domain logic. */
-    uint64_t epoch = strtoull(argv[2], NULL, 10);
-    if (!clusterCheckAndBumpShardEpoch(primary->shard_id, epoch)) {
+    /* Epoch validation: validate against the shard-id from the entry. */
+    uint64_t epoch = strtoull(argv[3], NULL, 10);
+    if (!clusterCheckAndBumpShardEpoch(argv[2], epoch)) {
         error = "stale shard epoch";
         goto done;
     }
 
+    /* Validate that the primary still belongs to the shard claimed in the entry. */
+    if (memcmp(primary->shard_id, argv[2], CLUSTER_NAMELEN) != 0) {
+        serverLog(LL_WARNING, "FAILOVER rejected: primary %.40s shard mismatch (expected %.40s, got %.40s).",
+                  primary->name, argv[2], primary->shard_id);
+        error = "primary shard mismatch";
+        goto done;
+    }
+
     if (!nodeIsReplica(replica) || nodeIsReplica(primary) || replica->replicaof != primary) goto done;
 
     /* Transfer slots from old primary to new primary. */
@@ -3331,7 +3336,8 @@ static void clusterRaftFailover(int force, int takeover, void *ctx, void (*callb
         entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
         entry = sdscatlen(entry, " ", 1);
         entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN);
-        /* Append primary's shard epoch. */
+        entry = sdscatlen(entry, " ", 1);
+        entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN);
         uint64_t epoch = clusterGetShardEpoch(primary->shard_id);
         entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
         clusterRaftPropose(entry, ctx, callback);
diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl
index 565cb15b976..0ef5a8fa573 100644
--- a/tests/unit/cluster/cluster-raft-proto.tcl
+++ b/tests/unit/cluster/cluster-raft-proto.tcl
@@ -560,6 +560,18 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
     set port [srv 0 port]
     set cport [expr {$port + 10000}]
 
+    # Extract shard-id of r0 from CLUSTER SHARDS output.
+    set node0_shard ""
+    set shards [$r0 CLUSTER SHARDS]
+    foreach shard $shards {
+        foreach node [dict get $shard nodes] {
+            if {[dict get $node id] eq $node_id} {
+                set node0_shard [dict get $shard id]
+            }
+        }
+    }
+    assert {$node0_shard ne ""}
+
     # Helper: connect to cluster bus and complete HELLO handshake.
     proc connect_fake_node {cport} {
         set fake_id [string repeat "f" 40]
@@ -574,10 +586,10 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
 
     # Each entry: {entry_type propose_msg state_check}
     set stale_proposals [list \
-        [list "SLOT_CHANGE" "PROPOSE SLOT_CHANGE - 50 0 0" {
+        [list "SLOT_CHANGE" "PROPOSE SLOT_CHANGE $node_id 50 $node1_id 0 0-100" {
             assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned]
         }] \
-        [list "FAILOVER" "PROPOSE FAILOVER $node1_id $node_id 0" {
+        [list "FAILOVER" "PROPOSE FAILOVER $node1_id $node_id $node0_shard 50" {
             set nodes [$r0 CLUSTER NODES]
             assert_match "*$node_id*myself,master*" $nodes
             assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned]
@@ -600,7 +612,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
             set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
             assert_equal $commit_before $commit_after
 
-            # Verify cluster state unchanged.
+            # Verify cluster state unchanged (stale proposal had no effect).
             eval $state_check
 
             close $fd
@@ -613,8 +625,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
     # --------------------------------------------------------------------------
 
     set missing_epoch_proposals [list \
-        [list "SLOT_CHANGE (missing epoch)" "PROPOSE SLOT_CHANGE - 50"] \
-        [list "FAILOVER (missing epoch)" "PROPOSE FAILOVER $node1_id $node_id"] \
+        [list "SLOT_CHANGE (missing epoch)" "PROPOSE SLOT_CHANGE $node_id 1 $node1_id"] \
+        [list "FAILOVER (missing epoch)" "PROPOSE FAILOVER $node1_id $node_id $node0_shard"] \
         [list "SET_REPLICA_OF (missing epoch)" "PROPOSE SET_REPLICA_OF $node1_id $node_id [string repeat a 40]"] \
         [list "NODE_FORGET (missing epoch)" "PROPOSE NODE_FORGET $node1_id"] \
     ]
@@ -636,6 +648,39 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
             close $fd
         }
     }
+
+    # --------------------------------------------------------------------------
+    # FAILOVER with wrong shard-id: passes pre-validation (unknown shard has
+    # epoch 0, so isShardEpochCurrent returns true) but is rejected at apply
+    # because primary's current shard_id doesn't match the entry.
+    # --------------------------------------------------------------------------
+
+    test "Raft shard epoch: FAILOVER with wrong shard-id is rejected at apply" {
+        # Use a bogus shard-id that doesn't match node_id's actual shard.
+        set wrong_shard [string repeat "1" 40]
+
+        # Get current commit index.
+        set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
+
+        # Propose FAILOVER with correct epoch (0, since wrong_shard has no
+        # epoch tracked) but wrong shard-id.
+        set fd [connect_fake_node $cport]
+        raft_send $fd "PROPOSE FAILOVER $node1_id $node_id $wrong_shard 0"
+        after 500
+
+        # The entry passes pre-validation (epoch 0 for unknown shard is valid)
+        # and gets appended + committed. But apply rejects it due to shard mismatch.
+        # Commit index advances (entry was appended) but state is unchanged.
+        set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
+        assert {$commit_after > $commit_before}
+
+        # Verify the failover did NOT happen: r0 is still primary with all slots.
+        set nodes [$r0 CLUSTER NODES]
+        assert_match "*$node_id*myself,master*" $nodes
+        assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned]
+
+        close $fd
+    }
 }
 
 } ;# tags

From ab5807de3a3022ac49f7186402084e4a4604435a Mon Sep 17 00:00:00 2001
From: Sushil Paneru <sushil.paneru1@gmail.com>
Date: Fri, 5 Jun 2026 06:32:13 +0000
Subject: [PATCH 4/9] Avoid epoch bump for slot_change

Signed-off-by: Sushil Paneru <sushil.paneru1@gmail.com>
---
 src/cluster_raft.c | 23 ++++-------------------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/src/cluster_raft.c b/src/cluster_raft.c
index f879e8bda8d..ac96853c8b1 100644
--- a/src/cluster_raft.c
+++ b/src/cluster_raft.c
@@ -923,18 +923,9 @@ static int clusterRaftPreValidateEpoch(int type, sds data) {
 
     int ok = 1;
     switch (type) {
-    case RAFT_ENTRY_FAILOVER: {
-        /* Format: <replica-id> <primary-id> <shard-id> <shard-epoch> */
-        if (argc < 4) {
-            ok = 0;
-        } else {
-            uint64_t epoch = strtoull(argv[3], NULL, 10);
-            ok = isShardEpochCurrent(argv[2], epoch);
-        }
-        break;
-    }
+    case RAFT_ENTRY_FAILOVER:
     case RAFT_ENTRY_SET_REPLICA_OF: {
-        /* Format: <replica-id> <primary-id-or-dash> <shard-id> <epoch> */
+        /* Format: <..ids..> <shard-id> <shard-epoch> (argv[2] and argv[3]) */
         if (argc < 4) {
             ok = 0;
         } else {
@@ -1177,13 +1168,6 @@ static int clusterCheckAndBumpSlotChangeEpochs(const char *source_shard_id,
         return 0;
     }
 
-    /* Both passed — bump both. */
-    if (source_shard_id) {
-        clusterSetShardEpoch(source_shard_id, src_current == 0 ? 1 : src_current + 1);
-    }
-    if (target_shard_id) {
-        clusterSetShardEpoch(target_shard_id, tgt_current == 0 ? 1 : tgt_current + 1);
-    }
     return 1;
 }
 
@@ -1421,7 +1405,8 @@ static void raftLogApply(raftLogEntry *e) {
                 sdsclear(node->announce_client_ipv6);
                 sdsclear(node->availability_zone);
                 clusterNodeParseAddressString(node, argv[1]);
-                /* Apply self-set flags. */
+                /* Apply self-set flags. TODO: split on comma and compare
+                 * each part individually when more flags are added. */
                 if (argc >= 3) {
                     node->flags &= ~CLUSTER_NODE_NOFAILOVER;
                     if (strstr(argv[2], "nofailover")) {

From 3c8dd26e3205bdc3ad7fec701152bd4cf42a9866 Mon Sep 17 00:00:00 2001
From: Sushil Paneru <sushil.paneru1@gmail.com>
Date: Fri, 5 Jun 2026 18:29:45 +0000
Subject: [PATCH 5/9] Added proposal rejection stat

Signed-off-by: Sushil Paneru <sushil.paneru1@gmail.com>
---
 design-docs/cluster-raft.md               | 16 ++---
 src/cluster_raft.c                        | 81 +++++++++++++----------
 tests/unit/cluster/cluster-raft-proto.tcl | 52 ++++++++++-----
 3 files changed, 87 insertions(+), 62 deletions(-)

diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md
index 1898b4fc17f..7428a6d113b 100644
--- a/design-docs/cluster-raft.md
+++ b/design-docs/cluster-raft.md
@@ -138,20 +138,20 @@ NODE_JOIN <node-id> <address>
     via MEET. The node starts as a learner and is promoted to follower
     when the entry is committed.
 
-NODE_FORGET <node-id>
+NODE_FORGET <node-id> <shard-epoch>
     Remove a node from the cluster (CLUSTER FORGET). Not yet implemented.
 
 SLOT_CHANGE <source-node-id-or-dash> <source-epoch> <target-node-id-or-dash> <target-epoch> <range> [<range> ...]
     Assign or remove slot ownership. A dash means "no owner" (delete
     slots). Ranges use the nodes.conf format: "0-5460" or "5461".
 
-SET_REPLICA_OF <replica-id> <primary-id-or-dash> <shard-id>
+SET_REPLICA_OF <replica-id> <primary-id-or-dash> <shard-id> <target-shard-epoch>
     Set a node as replica of a primary (CLUSTER REPLICATE). A dash as
     primary means promote to primary. The shard-id is the target shard:
     for promotion, a new random id; for assignment, the primary's
     current shard-id (used as a guard against concurrent changes).
 
-FAILOVER <replica-id> <primary-id>
+FAILOVER <replica-id> <primary-id> <shard-epoch>
     The replica takes over the primary's slots and becomes primary.
     The old primary becomes a replica of the new primary.
 
@@ -736,11 +736,11 @@ inconsistencies — such as moving a slot to a node that no longer owns
 the corresponding keys.
 
 A shard-epoch is a per-shard monotonically increasing counter stored
-in `server.cluster->shard_epochs`. It is bumped each time a topology
-change is applied to the shard. Entries that modify shard topology
-include the shard's current epoch at proposal time. Epoch is validated
-at prepare time and at apply time. If the epoch has advanced past the
-value in the entry, the entry is stale and is ignored.
+in `server.cluster->shard_epochs`. It is bumped each time membership or
+leadership of the shard changes. Such entries include the shard's
+current epoch at proposal time. Epoch is validated at prepare time
+and at apply time. If the epoch has advanced past the value in the entry,
+the entry is stale and is ignored.
 
 ### Example: slot migration racing with failover
 
diff --git a/src/cluster_raft.c b/src/cluster_raft.c
index ac96853c8b1..a6f36196a54 100644
--- a/src/cluster_raft.c
+++ b/src/cluster_raft.c
@@ -185,6 +185,8 @@ typedef struct {
     uint64_t stats_pubsub_bytes_received;
     uint64_t stats_module_bytes_sent;
     uint64_t stats_module_bytes_received;
+    uint64_t stats_proposals_rejected_prevalidation;
+    uint64_t stats_proposals_rejected_apply;
 
     /* Shard epoch tracking (per-shard monotonic counter for stale proposal detection). */
     dict *shard_epochs;
@@ -851,21 +853,11 @@ static uint64_t clusterGetShardEpoch(const char *shard_id) {
 static void clusterSetShardEpoch(const char *shard_id, uint64_t epoch) {
     clusterRaftState *rs = RAFT_STATE();
     sds s = sdsnewlen(shard_id, CLUSTER_NAMELEN);
-    dictEntry *de = dictFind(rs->shard_epochs, s);
-    if (de) {
-        /* Existing entry — just update the value. */
-        dictSetUnsignedIntegerVal(de, epoch);
+    dictEntry *de = dictAddOrFind(rs->shard_epochs, s);
+    if (dictGetKey(de) != s) {
         sdsfree(s);
-    } else {
-        /* Only create a new epoch entry if the shard actually exists. */
-        dictEntry *shard_de = dictFind(server.cluster->shards, s);
-        if (shard_de) {
-            de = dictAddRaw(rs->shard_epochs, s, NULL);
-            dictSetUnsignedIntegerVal(de, epoch);
-        } else {
-            sdsfree(s);
-        }
     }
+    dictSetUnsignedIntegerVal(de, epoch);
 }
 
 static int isShardEpochCurrent(const char *shard_id, uint64_t entry_epoch) {
@@ -929,13 +921,21 @@ static int clusterRaftPreValidateEpoch(int type, sds data) {
         if (argc < 4) {
             ok = 0;
         } else {
-            uint64_t epoch = strtoull(argv[3], NULL, 10);
-            ok = isShardEpochCurrent(argv[2], epoch);
+            /* Validate shard-id matches the primary's current shard. */
+            clusterNode *primary = (sdslen(argv[1]) == CLUSTER_NAMELEN)
+                                       ? clusterLookupNode(argv[1], CLUSTER_NAMELEN)
+                                       : NULL;
+            if (primary && memcmp(primary->shard_id, argv[2], CLUSTER_NAMELEN) != 0) {
+                ok = 0;
+            } else {
+                uint64_t epoch = strtoull(argv[3], NULL, 10);
+                ok = isShardEpochCurrent(argv[2], epoch);
+            }
         }
         break;
     }
     case RAFT_ENTRY_SLOT_CHANGE: {
-        /* Format: <source-id-or-dash> <source-epoch> <target-id-or-dash> <target-epoch> <ranges...> */
+        /* Format: <source-node-id-or-dash> <source-epoch> <target-node-id-or-dash> <target-epoch> <ranges...> */
         if (argc < 5) {
             ok = 0;
         } else {
@@ -970,6 +970,7 @@ static int clusterRaftPreValidateEpoch(int type, sds data) {
 
     sdsfreesplitres(argv, argc);
     if (!ok) {
+        RAFT_STATE()->stats_proposals_rejected_prevalidation++;
         serverLog(LL_NOTICE, "Leader pre-validation: rejecting %s proposal (missing or stale epoch).",
                   raftEntryTypeName(type));
     }
@@ -1133,7 +1134,7 @@ static uint64_t raftLogTermAt(uint64_t index) {
  * Returns 1 if the entry should be applied, 0 if stale.
  * On success, increments the shard epoch (or sets it to 1 if currently 0).
  * If shard_id is NULL, always returns 1 (no shard to validate). */
-static int clusterCheckAndBumpShardEpoch(const char *shard_id, uint64_t entry_epoch) {
+static int clusterValidateAndBumpShardEpoch(const char *shard_id, uint64_t entry_epoch) {
     if (!shard_id) return 1;
     uint64_t current = clusterGetShardEpoch(shard_id);
     if (current == 0) {
@@ -1148,14 +1149,14 @@ static int clusterCheckAndBumpShardEpoch(const char *shard_id, uint64_t entry_ep
     return 0;
 }
 
-/* Validate and bump epochs for SLOT_CHANGE (two shards).
+/* Validate SLOT_CHANGE (two shards). We do not bump epoch to support
+ * concurrent slot migration.
  * Returns 1 if the entry should be applied, 0 if either epoch is stale.
- * On success, increments both source and target epochs.
  * source_shard_id may be NULL (unassigned slots). */
-static int clusterCheckAndBumpSlotChangeEpochs(const char *source_shard_id,
-                                               uint64_t source_epoch,
-                                               const char *target_shard_id,
-                                               uint64_t target_epoch) {
+static int clusterValidateSlotChangeEpochs(const char *source_shard_id,
+                                           uint64_t source_epoch,
+                                           const char *target_shard_id,
+                                           uint64_t target_epoch) {
     uint64_t src_current = source_shard_id ? clusterGetShardEpoch(source_shard_id) : 0;
     uint64_t tgt_current = target_shard_id ? clusterGetShardEpoch(target_shard_id) : 0;
 
@@ -1326,7 +1327,7 @@ static void raftLogApply(raftLogEntry *e) {
         if (node && node != myself) {
             /* Epoch validation. */
             uint64_t epoch = strtoull(argv[1], NULL, 10);
-            if (!clusterCheckAndBumpShardEpoch(node->shard_id, epoch)) {
+            if (!clusterValidateAndBumpShardEpoch(node->shard_id, epoch)) {
                 entry_error = "stale shard epoch";
                 serverLog(LL_NOTICE, "Applied NODE_FORGET %.40s (index %llu) [stale].",
                           argv[0], (unsigned long long)e->index);
@@ -1433,6 +1434,10 @@ static void raftLogApply(raftLogEntry *e) {
         break;
     }
 
+    if (entry_error) {
+        rs->stats_proposals_rejected_apply++;
+    }
+
     /* Check pending proposals for a match and remove it. */
     if (listLength(rs->pending_proposals) > 0) {
         listIter li;
@@ -2903,6 +2908,8 @@ static void clusterRaftResetStats(void) {
     rs->stats_pubsub_bytes_received = 0;
     rs->stats_module_bytes_sent = 0;
     rs->stats_module_bytes_received = 0;
+    rs->stats_proposals_rejected_prevalidation = 0;
+    rs->stats_proposals_rejected_apply = 0;
 }
 
 static sds clusterRaftAppendInfoFields(sds info) {
@@ -2957,6 +2964,8 @@ static sds clusterRaftAppendInfoFields(sds info) {
                         "cluster_stats_pubsub_bytes_received:%llu\r\n"
                         "cluster_stats_module_bytes_sent:%llu\r\n"
                         "cluster_stats_module_bytes_received:%llu\r\n"
+                        "cluster_stats_proposals_rejected_prevalidation:%llu\r\n"
+                        "cluster_stats_proposals_rejected_apply:%llu\r\n"
                         "total_cluster_links_buffer_limit_exceeded:%llu\r\n",
                         role_str, (unsigned long long)rs->current_term,
                         (unsigned long long)rs->commit_index, (unsigned long long)rs->last_applied,
@@ -2969,6 +2978,8 @@ static sds clusterRaftAppendInfoFields(sds info) {
                         (unsigned long long)rs->stats_pubsub_bytes_received,
                         (unsigned long long)rs->stats_module_bytes_sent,
                         (unsigned long long)rs->stats_module_bytes_received,
+                        (unsigned long long)rs->stats_proposals_rejected_prevalidation,
+                        (unsigned long long)rs->stats_proposals_rejected_apply,
                         (unsigned long long)server.cluster->stat_cluster_links_buffer_limit_exceeded);
     return info;
 }
@@ -3082,8 +3093,8 @@ static const char *clusterRaftApplySlotChange(sds data) {
     if (!info.valid) goto done;
 
     /* Epoch validation: validate both source and target epochs. */
-    if (!clusterCheckAndBumpSlotChangeEpochs(info.source_shard_id, info.source_epoch,
-                                             info.target_shard_id, info.target_epoch)) {
+    if (!clusterValidateSlotChangeEpochs(info.source_shard_id, info.source_epoch,
+                                         info.target_shard_id, info.target_epoch)) {
         error = "stale shard epoch";
         goto done;
     }
@@ -3139,10 +3150,8 @@ static const char *clusterRaftApplySetReplica(sds data) {
     if (replica == myself) rs->todo_save_config = 1;
 
     char *shard_id = argv[2];
-
-    /* Epoch validation: validate against the shard-id from the entry. */
     uint64_t epoch = strtoull(argv[3], NULL, 10);
-    if (!clusterCheckAndBumpShardEpoch(shard_id, epoch)) {
+    if (!clusterValidateAndBumpShardEpoch(shard_id, epoch)) {
         error = "stale shard epoch";
         goto done;
     }
@@ -3206,13 +3215,6 @@ static const char *clusterRaftApplyFailover(sds data) {
     clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1]));
     if (!replica || !primary) goto done;
 
-    /* Epoch validation: validate against the shard-id from the entry. */
-    uint64_t epoch = strtoull(argv[3], NULL, 10);
-    if (!clusterCheckAndBumpShardEpoch(argv[2], epoch)) {
-        error = "stale shard epoch";
-        goto done;
-    }
-
     /* Validate that the primary still belongs to the shard claimed in the entry. */
     if (memcmp(primary->shard_id, argv[2], CLUSTER_NAMELEN) != 0) {
         serverLog(LL_WARNING, "FAILOVER rejected: primary %.40s shard mismatch (expected %.40s, got %.40s).",
@@ -3221,6 +3223,13 @@ static const char *clusterRaftApplyFailover(sds data) {
         goto done;
     }
 
+    /* Epoch validation: validate against the shard-id from the entry. */
+    uint64_t epoch = strtoull(argv[3], NULL, 10);
+    if (!clusterValidateAndBumpShardEpoch(argv[2], epoch)) {
+        error = "stale shard epoch";
+        goto done;
+    }
+
     if (!nodeIsReplica(replica) || nodeIsReplica(primary) || replica->replicaof != primary) goto done;
 
     /* Transfer slots from old primary to new primary. */
diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl
index 0ef5a8fa573..22f7179dee1 100644
--- a/tests/unit/cluster/cluster-raft-proto.tcl
+++ b/tests/unit/cluster/cluster-raft-proto.tcl
@@ -557,6 +557,11 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
 
     set node_id [$r0 CLUSTER MYID]
     set node1_id [$r1 CLUSTER MYID]
+
+    # Make r1 a replica of r0. This commits a SET_REPLICA_OF entry
+    # which bumps the shard epoch from 0 to 1.
+    $r1 CLUSTER REPLICATE $node_id
+
     set port [srv 0 port]
     set cport [expr {$port + 10000}]
 
@@ -600,13 +605,20 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
         lassign $case entry_type propose_msg state_check
 
         test "Raft shard epoch: stale $entry_type is rejected (pre-validation, no log append)" {
-            # Record commit index before injection.
+            # Record commit index and rejection counter before injection.
             set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
+            set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
 
             # Connect and inject stale PROPOSE.
             set fd [connect_fake_node $cport]
             raft_send $fd $propose_msg
-            after 500
+
+            # Wait for the rejection counter to increase.
+            wait_for_condition 50 100 {
+                [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before
+            } else {
+                fail "Stale $entry_type proposal was not rejected"
+            }
 
             # Verify log index unchanged (rejected at pre-validation).
             set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
@@ -636,10 +648,17 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
 
         test "Raft shard epoch: $label is rejected (pre-validation, no log append)" {
             set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
+            set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
 
             set fd [connect_fake_node $cport]
             raft_send $fd $propose_msg
-            after 500
+
+            # Wait for the rejection counter to increase.
+            wait_for_condition 50 100 {
+                [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before
+            } else {
+                fail "$label proposal was not rejected"
+            }
 
             # Verify log index unchanged (rejected at pre-validation).
             set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
@@ -649,30 +668,27 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
         }
     }
 
-    # --------------------------------------------------------------------------
-    # FAILOVER with wrong shard-id: passes pre-validation (unknown shard has
-    # epoch 0, so isShardEpochCurrent returns true) but is rejected at apply
-    # because primary's current shard_id doesn't match the entry.
-    # --------------------------------------------------------------------------
-
-    test "Raft shard epoch: FAILOVER with wrong shard-id is rejected at apply" {
+    test "Raft shard epoch: FAILOVER with wrong shard-id is rejected at pre-validation" {
         # Use a bogus shard-id that doesn't match node_id's actual shard.
         set wrong_shard [string repeat "1" 40]
 
-        # Get current commit index.
         set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
+        set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
 
-        # Propose FAILOVER with correct epoch (0, since wrong_shard has no
-        # epoch tracked) but wrong shard-id.
+        # Propose FAILOVER with wrong shard-id.
         set fd [connect_fake_node $cport]
         raft_send $fd "PROPOSE FAILOVER $node1_id $node_id $wrong_shard 0"
-        after 500
 
-        # The entry passes pre-validation (epoch 0 for unknown shard is valid)
-        # and gets appended + committed. But apply rejects it due to shard mismatch.
-        # Commit index advances (entry was appended) but state is unchanged.
+        # Wait for the rejection counter to increase.
+        wait_for_condition 50 100 {
+            [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before
+        } else {
+            fail "FAILOVER with wrong shard-id was not rejected"
+        }
+
+        # Verify log index unchanged (rejected at pre-validation).
         set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
-        assert {$commit_after > $commit_before}
+        assert_equal $commit_before $commit_after
 
         # Verify the failover did NOT happen: r0 is still primary with all slots.
         set nodes [$r0 CLUSTER NODES]

From 7893b988fb3f6887eb40d3db00a13520d10e11da Mon Sep 17 00:00:00 2001
From: Sushil Paneru <sushil.paneru1@gmail.com>
Date: Sun, 7 Jun 2026 22:10:43 +0000
Subject: [PATCH 6/9] Add shard-id for SET_REPLICA_OF

Signed-off-by: Sushil Paneru <sushil.paneru1@gmail.com>
---
 design-docs/cluster-raft.md               |   9 +-
 src/cluster_raft.c                        | 159 +++++++++++++---------
 tests/unit/cluster/cluster-raft-proto.tcl |  58 ++++----
 3 files changed, 131 insertions(+), 95 deletions(-)

diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md
index 7428a6d113b..6fd477a82eb 100644
--- a/design-docs/cluster-raft.md
+++ b/design-docs/cluster-raft.md
@@ -145,13 +145,14 @@ SLOT_CHANGE <source-node-id-or-dash> <source-epoch> <target-node-id-or-dash> <ta
     Assign or remove slot ownership. A dash means "no owner" (delete
     slots). Ranges use the nodes.conf format: "0-5460" or "5461".
 
-SET_REPLICA_OF <replica-id> <primary-id-or-dash> <shard-id> <target-shard-epoch>
+SET_REPLICA_OF <replica-id> <source-shard> <source-epoch> <primary-id-or-dash> <target-shard> <target-epoch>
     Set a node as replica of a primary (CLUSTER REPLICATE). A dash as
-    primary means promote to primary. The shard-id is the target shard:
+    primary means promote to primary. Both source and target shard epochs
+    are validated to guard against concurrent shard changes.
     for promotion, a new random id; for assignment, the primary's
     current shard-id (used as a guard against concurrent changes).
 
-FAILOVER <replica-id> <primary-id> <shard-epoch>
+FAILOVER <replica-id> <primary-id> <shard-id> <shard-epoch>
     The replica takes over the primary's slots and becomes primary.
     The old primary becomes a replica of the new primary.
 
@@ -760,7 +761,7 @@ the entry is stale and is ignored.
 
 ```
 FAILOVER <replica-id> <primary-id> <shard-id> <shard-epoch>
-SET_REPLICA_OF <replica-id> <primary-id-or-dash> <shard-id> <epoch>
+SET_REPLICA_OF <replica-id> <source-shard> <source-epoch> <primary-id-or-dash> <target-shard> <target-epoch>
 SLOT_CHANGE <source-node-id-or-dash> <source-epoch> <target-node-id-or-dash> <target-epoch> <ranges...>
 NODE_FORGET <node-id> <epoch>
 ```
diff --git a/src/cluster_raft.c b/src/cluster_raft.c
index a6f36196a54..466372a7119 100644
--- a/src/cluster_raft.c
+++ b/src/cluster_raft.c
@@ -915,13 +915,11 @@ static int clusterRaftPreValidateEpoch(int type, sds data) {
 
     int ok = 1;
     switch (type) {
-    case RAFT_ENTRY_FAILOVER:
-    case RAFT_ENTRY_SET_REPLICA_OF: {
-        /* Format: <..ids..> <shard-id> <shard-epoch> (argv[2] and argv[3]) */
+    case RAFT_ENTRY_FAILOVER: {
+        /* Format: <replica-id> <primary-id> <shard-id> <shard-epoch> */
         if (argc < 4) {
             ok = 0;
         } else {
-            /* Validate shard-id matches the primary's current shard. */
             clusterNode *primary = (sdslen(argv[1]) == CLUSTER_NAMELEN)
                                        ? clusterLookupNode(argv[1], CLUSTER_NAMELEN)
                                        : NULL;
@@ -934,6 +932,29 @@ static int clusterRaftPreValidateEpoch(int type, sds data) {
         }
         break;
     }
+    case RAFT_ENTRY_SET_REPLICA_OF: {
+        /* Format: <replica-id> <source-shard> <source-epoch> <primary-id-or-dash> <target-shard> <target-epoch> */
+        if (argc < 6) {
+            ok = 0;
+        } else {
+            /* Validate source shard epoch. */
+            uint64_t source_epoch = strtoull(argv[2], NULL, 10);
+            ok = isShardEpochCurrent(argv[1], source_epoch);
+            /* Validate target shard epoch. */
+            if (ok) {
+                uint64_t target_epoch = strtoull(argv[5], NULL, 10);
+                ok = isShardEpochCurrent(argv[4], target_epoch);
+            }
+            /* Validate primary's shard matches target-shard. */
+            if (ok && sdslen(argv[3]) == CLUSTER_NAMELEN) {
+                clusterNode *primary = clusterLookupNode(argv[3], CLUSTER_NAMELEN);
+                if (primary && memcmp(primary->shard_id, argv[4], CLUSTER_NAMELEN) != 0) {
+                    ok = 0;
+                }
+            }
+        }
+        break;
+    }
     case RAFT_ENTRY_SLOT_CHANGE: {
         /* Format: <source-node-id-or-dash> <source-epoch> <target-node-id-or-dash> <target-epoch> <ranges...> */
         if (argc < 5) {
@@ -1130,33 +1151,13 @@ static uint64_t raftLogTermAt(uint64_t index) {
  * Shard epoch validation helpers
  * -------------------------------------------------------------------------- */
 
-/* Validate and bump epoch for a single-shard entry.
- * Returns 1 if the entry should be applied, 0 if stale.
- * On success, increments the shard epoch (or sets it to 1 if currently 0).
- * If shard_id is NULL, always returns 1 (no shard to validate). */
-static int clusterValidateAndBumpShardEpoch(const char *shard_id, uint64_t entry_epoch) {
-    if (!shard_id) return 1;
-    uint64_t current = clusterGetShardEpoch(shard_id);
-    if (current == 0) {
-        /* First operation on this shard — apply unconditionally, bump to 1. */
-        clusterSetShardEpoch(shard_id, 1);
-        return 1;
-    }
-    if (entry_epoch == current) {
-        clusterSetShardEpoch(shard_id, current + 1);
-        return 1;
-    }
-    return 0;
-}
-
-/* Validate SLOT_CHANGE (two shards). We do not bump epoch to support
- * concurrent slot migration.
- * Returns 1 if the entry should be applied, 0 if either epoch is stale.
- * source_shard_id may be NULL (unassigned slots). */
-static int clusterValidateSlotChangeEpochs(const char *source_shard_id,
-                                           uint64_t source_epoch,
-                                           const char *target_shard_id,
-                                           uint64_t target_epoch) {
+/* Validate a pair of shard epochs (source and target).
+ * Returns 1 if both are current, 0 if either is stale.
+ * Either shard_id may be NULL (skipped). Does not bump. */
+static int clusterValidateShardEpochPair(const char *source_shard_id,
+                                         uint64_t source_epoch,
+                                         const char *target_shard_id,
+                                         uint64_t target_epoch) {
     uint64_t src_current = source_shard_id ? clusterGetShardEpoch(source_shard_id) : 0;
     uint64_t tgt_current = target_shard_id ? clusterGetShardEpoch(target_shard_id) : 0;
 
@@ -1325,15 +1326,17 @@ static void raftLogApply(raftLogEntry *e) {
         }
         clusterNode *node = clusterLookupNode(argv[0], sdslen(argv[0]));
         if (node && node != myself) {
-            /* Epoch validation. */
+            /* Epoch validation (no bump yet). */
             uint64_t epoch = strtoull(argv[1], NULL, 10);
-            if (!clusterValidateAndBumpShardEpoch(node->shard_id, epoch)) {
+            if (!isShardEpochCurrent(node->shard_id, epoch)) {
                 entry_error = "stale shard epoch";
-                serverLog(LL_NOTICE, "Applied NODE_FORGET %.40s (index %llu) [stale].",
-                          argv[0], (unsigned long long)e->index);
                 sdsfreesplitres(argv, argc);
                 break;
             }
+            /* Save shard_id before deleting the node. */
+            char shard_id[CLUSTER_NAMELEN];
+            memcpy(shard_id, node->shard_id, CLUSTER_NAMELEN);
+
             /* Detach link before deleting so clusterReadHandler can detect
              * that the node it was talking to is gone. */
             if (node->link) {
@@ -1343,6 +1346,10 @@ static void raftLogApply(raftLogEntry *e) {
             clusterDelNode(node);
             rs->todo_update_slot_coverage = 1;
             rs->todo_invalidate_slots_cache = 1;
+
+            /* Bump shard epoch after successful delete. */
+            uint64_t current = clusterGetShardEpoch(shard_id);
+            clusterSetShardEpoch(shard_id, current == 0 ? 1 : current + 1);
         }
         serverLog(LL_NOTICE, "Applied NODE_FORGET %.40s (index %llu).",
                   argv[0], (unsigned long long)e->index);
@@ -1953,11 +1960,8 @@ static void clusterRaftInit(void) {
     rs->my_last_committed_info = sdsempty();
     rs->last_node_info_check = monotonicMs();
     rs->last_repl_offsets_broadcast = monotonicMs();
-<<<<<<< HEAD
     rs->todo_update_slot_coverage = 1;
-=======
     rs->shard_epochs = dictCreate(&raftShardEpochDictType);
->>>>>>> 4ec207fbe (Use existing configEpoch field API for shard epoch)
     server.cluster->size = 0; /* Incremented by NODE_JOIN apply */
 }
 
@@ -3093,8 +3097,8 @@ static const char *clusterRaftApplySlotChange(sds data) {
     if (!info.valid) goto done;
 
     /* Epoch validation: validate both source and target epochs. */
-    if (!clusterValidateSlotChangeEpochs(info.source_shard_id, info.source_epoch,
-                                         info.target_shard_id, info.target_epoch)) {
+    if (!clusterValidateShardEpochPair(info.source_shard_id, info.source_epoch,
+                                       info.target_shard_id, info.target_epoch)) {
         error = "stale shard epoch";
         goto done;
     }
@@ -3136,28 +3140,35 @@ static const char *clusterRaftApplySlotChange(sds data) {
 }
 
 /* Apply a SET_REPLICA_OF entry.
- * Format: "<replica-id> <primary-id-or-dash> <shard-id> <epoch>" */
+ * Format: "<replica-id> <source-shard> <source-epoch> <primary-id-or-dash> <target-shard> <target-epoch>" */
 static const char *clusterRaftApplySetReplica(sds data) {
     clusterRaftState *rs = RAFT_STATE();
     const char *error = NULL;
     int argc;
     sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc);
-    if (!argv || argc != 4) goto done;
+    if (!argv || argc != 6) goto done;
 
     clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0]));
     if (!replica) goto done;
 
-    if (replica == myself) rs->todo_save_config = 1;
-
     char *shard_id = argv[2];
     uint64_t epoch = strtoull(argv[3], NULL, 10);
-    if (!clusterValidateAndBumpShardEpoch(shard_id, epoch)) {
+    char *source_shard = argv[1];
+    uint64_t source_epoch = strtoull(argv[2], NULL, 10);
+    char *target_shard = argv[4];
+    uint64_t target_epoch = strtoull(argv[5], NULL, 10);
+
+    /* Validate both shard epochs without bumping. */
+    if (!clusterValidateShardEpochPair(source_shard, source_epoch,
+                                       target_shard, target_epoch)) {
         error = "stale shard epoch";
         goto done;
     }
 
-    if (sdslen(argv[1]) == 1 && argv[1][0] == '-') {
-        /* Promote to primary with the shard-id from the entry. */
+    if (replica == myself) rs->todo_save_config = 1;
+
+    if (sdslen(argv[3]) == 1 && argv[3][0] == '-') {
+        /* Promote to primary with the target-shard from the entry. */
         if (nodeIsReplica(replica)) {
             if (replica->replicaof) clusterNodeRemoveReplica(replica->replicaof, replica);
             replica->flags &= ~CLUSTER_NODE_REPLICA;
@@ -3166,13 +3177,13 @@ static const char *clusterRaftApplySetReplica(sds data) {
             if (replica == myself) rs->todo_update_replication = 1;
         }
         clusterRemoveNodeFromShard(replica);
-        memcpy(replica->shard_id, shard_id, CLUSTER_NAMELEN);
-        clusterAddNodeToShard(shard_id, replica);
+        memcpy(replica->shard_id, target_shard, CLUSTER_NAMELEN);
+        clusterAddNodeToShard(target_shard, replica);
     } else {
-        clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1]));
+        clusterNode *primary = clusterLookupNode(argv[3], sdslen(argv[3]));
         if (!primary) goto done;
         /* Guard: skip if the primary's shard-id has changed. */
-        if (memcmp(primary->shard_id, shard_id, CLUSTER_NAMELEN) != 0) goto done;
+        if (memcmp(primary->shard_id, target_shard, CLUSTER_NAMELEN) != 0) goto done;
         if (replica == myself) {
             /* Update cluster state; actual replication change deferred to beforeSleep. */
             if (myself->replicaof) clusterNodeRemoveReplica(myself->replicaof, myself);
@@ -3196,6 +3207,10 @@ static const char *clusterRaftApplySetReplica(sds data) {
             clusterAddNodeToShard(primary->shard_id, replica);
         }
     }
+
+    /* Bump both shard epochs after successful apply. */
+    clusterSetShardEpoch(source_shard, source_epoch + 1);
+    clusterSetShardEpoch(target_shard, target_epoch + 1);
 done:
     if (argv) sdsfreesplitres(argv, argc);
     return error;
@@ -3223,9 +3238,9 @@ static const char *clusterRaftApplyFailover(sds data) {
         goto done;
     }
 
-    /* Epoch validation: validate against the shard-id from the entry. */
-    uint64_t epoch = strtoull(argv[3], NULL, 10);
-    if (!clusterValidateAndBumpShardEpoch(argv[2], epoch)) {
+    /* Epoch validation: validate against the shard-id from the entry (no bump yet). */
+    uint64_t shard_epoch = strtoull(argv[3], NULL, 10);
+    if (!isShardEpochCurrent(argv[2], shard_epoch)) {
         error = "stale shard epoch";
         goto done;
     }
@@ -3276,6 +3291,10 @@ static const char *clusterRaftApplyFailover(sds data) {
         rs->todo_update_replication = 1;
         rs->todo_save_config = 1;
     }
+
+    /* Bump shard epoch after successful apply. */
+    clusterSetShardEpoch(argv[2], shard_epoch + 1);
+
 done:
     if (argv) sdsfreesplitres(argv, argc);
     return error;
@@ -3293,25 +3312,29 @@ static void clusterRaftForgetNode(const char *node_id, size_t id_len, void *ctx,
 }
 
 static void clusterRaftSetReplicaOf(clusterNode *primary, void *ctx, void (*callback)(void *ctx, const char *error)) {
-    /* Propose SET_REPLICA_OF: "<myself-id> <primary-id-or-dash> <shard-id>"
-     * For promotion (dash): shard-id is a new random id.
-     * For assignment: shard-id is the primary's current shard-id, used
-     * as a guard against concurrent shard changes. */
+    /* Propose SET_REPLICA_OF:
+     * "<replica-id> <source-shard> <source-epoch> <primary-id-or-dash> <target-shard> <target-epoch>"
+     * Source is myself's current shard. Target is the primary's shard (for assignment)
+     * or a new random shard (for promotion to primary). */
+    uint64_t source_epoch = clusterGetShardEpoch(myself->shard_id);
+
+    char target_shard[CLUSTER_NAMELEN];
+    if (primary) {
+        memcpy(target_shard, primary->shard_id, CLUSTER_NAMELEN);
+    } else {
+        getRandomHexChars(target_shard, CLUSTER_NAMELEN);
+    }
+    uint64_t target_epoch = primary ? clusterGetShardEpoch(primary->shard_id) : 0;
+
     sds entry = sdsnew("SET_REPLICA_OF ");
     entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
     entry = sdscatlen(entry, " ", 1);
+    entry = sdscatlen(entry, myself->shard_id, CLUSTER_NAMELEN);
+    entry = sdscatfmt(entry, " %U ", (unsigned long long)source_epoch);
     entry = sdscatlen(entry, primary ? primary->name : "-", primary ? CLUSTER_NAMELEN : 1);
     entry = sdscatlen(entry, " ", 1);
-    if (primary) {
-        entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN);
-    } else {
-        char new_shard_id[CLUSTER_NAMELEN];
-        getRandomHexChars(new_shard_id, CLUSTER_NAMELEN);
-        entry = sdscatlen(entry, new_shard_id, CLUSTER_NAMELEN);
-    }
-    /* Append target shard epoch: use primary's shard epoch for assignment, 0 for promotion. */
-    uint64_t epoch = primary ? clusterGetShardEpoch(primary->shard_id) : 0;
-    entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
+    entry = sdscatlen(entry, target_shard, CLUSTER_NAMELEN);
+    entry = sdscatfmt(entry, " %U", (unsigned long long)target_epoch);
     clusterRaftPropose(entry, ctx, callback);
     sdsfree(entry);
 }
diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl
index 22f7179dee1..14a04e2b4b5 100644
--- a/tests/unit/cluster/cluster-raft-proto.tcl
+++ b/tests/unit/cluster/cluster-raft-proto.tcl
@@ -639,7 +639,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
     set missing_epoch_proposals [list \
         [list "SLOT_CHANGE (missing epoch)" "PROPOSE SLOT_CHANGE $node_id 1 $node1_id"] \
         [list "FAILOVER (missing epoch)" "PROPOSE FAILOVER $node1_id $node_id $node0_shard"] \
-        [list "SET_REPLICA_OF (missing epoch)" "PROPOSE SET_REPLICA_OF $node1_id $node_id [string repeat a 40]"] \
+        [list "SET_REPLICA_OF (missing epoch)" "PROPOSE SET_REPLICA_OF $node1_id [string repeat a 40] 1 $node_id [string repeat b 40]"] \
         [list "NODE_FORGET (missing epoch)" "PROPOSE NODE_FORGET $node1_id"] \
     ]
 
@@ -668,34 +668,46 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
         }
     }
 
-    test "Raft shard epoch: FAILOVER with wrong shard-id is rejected at pre-validation" {
-        # Use a bogus shard-id that doesn't match node_id's actual shard.
-        set wrong_shard [string repeat "1" 40]
+    # --------------------------------------------------------------------------
+    # Wrong shard-id: proposals where the shard-id doesn't match the node's
+    # actual shard are rejected at pre-validation.
+    # --------------------------------------------------------------------------
 
-        set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
-        set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
+    set wrong_shard [string repeat "1" 40]
 
-        # Propose FAILOVER with wrong shard-id.
-        set fd [connect_fake_node $cport]
-        raft_send $fd "PROPOSE FAILOVER $node1_id $node_id $wrong_shard 0"
+    set wrong_shard_proposals [list \
+        [list "FAILOVER" "PROPOSE FAILOVER $node1_id $node_id $wrong_shard 0"] \
+        [list "SET_REPLICA_OF" "PROPOSE SET_REPLICA_OF $node1_id $node0_shard 1 $node_id $wrong_shard 0"] \
+    ]
 
-        # Wait for the rejection counter to increase.
-        wait_for_condition 50 100 {
-            [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before
-        } else {
-            fail "FAILOVER with wrong shard-id was not rejected"
-        }
+    foreach case $wrong_shard_proposals {
+        lassign $case entry_type propose_msg
 
-        # Verify log index unchanged (rejected at pre-validation).
-        set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
-        assert_equal $commit_before $commit_after
+        test "Raft shard epoch: $entry_type with wrong shard-id is rejected at pre-validation" {
+            set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
+            set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
 
-        # Verify the failover did NOT happen: r0 is still primary with all slots.
-        set nodes [$r0 CLUSTER NODES]
-        assert_match "*$node_id*myself,master*" $nodes
-        assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned]
+            set fd [connect_fake_node $cport]
+            raft_send $fd $propose_msg
 
-        close $fd
+            # Wait for the rejection counter to increase.
+            wait_for_condition 50 100 {
+                [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before
+            } else {
+                fail "$entry_type with wrong shard-id was not rejected"
+            }
+
+            # Verify log index unchanged (rejected at pre-validation, epoch not bumped).
+            set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
+            assert_equal $commit_before $commit_after
+
+            # Verify cluster state unchanged.
+            set nodes [$r0 CLUSTER NODES]
+            assert_match "*$node_id*myself,master*" $nodes
+            assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned]
+
+            close $fd
+        }
     }
 }
 

From b49b879a02ffb0f335427a4d778f5fa0adeed32d Mon Sep 17 00:00:00 2001
From: Sushil Paneru <sushil.paneru1@gmail.com>
Date: Mon, 8 Jun 2026 04:44:01 +0000
Subject: [PATCH 7/9] Added pre-Validation rejection in raft leader

Signed-off-by: Sushil Paneru <sushil.paneru1@gmail.com>
---
 src/cluster_raft.c                            | 117 +++++++++++-----
 tests/support/cluster_util.tcl                |  35 ++++-
 tests/unit/cluster/cluster-raft-proto.tcl     | 126 +++++++++++++-----
 tests/unit/cluster/faster-failover.tcl        |   4 +-
 .../cluster/pubsubshard-slot-migration.tcl    |   2 +-
 5 files changed, 208 insertions(+), 76 deletions(-)

diff --git a/src/cluster_raft.c b/src/cluster_raft.c
index 466372a7119..ed882a80ef9 100644
--- a/src/cluster_raft.c
+++ b/src/cluster_raft.c
@@ -41,6 +41,9 @@
 
 #include <arpa/inet.h>
 
+/* Generic error message returned to clients when a proposal is rejected. */
+#define GENERIC_PROPOSAL_REJECTION_MSG "proposal rejected by raft leader"
+
 /* From module.c */
 void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len);
 
@@ -321,6 +324,7 @@ static clusterMsgSendBlock *clusterRaftBuildAllOffsetsMsg(void) {
 
 static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx, const char *error));
 static void clusterRaftDeferPendingProposals(void);
+static void clusterRaftCompletePendingProposal(int type, sds data, const char *error);
 static int clusterRaftPreValidateEpoch(int type, sds data);
 static void clusterRaftUpdateMyself(int old_flags);
 static sds clusterRaftBuildMyNodeInfo(void);
@@ -784,7 +788,7 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx,
     if (rs->role == RAFT_ROLE_LEADER) {
         /* Pre-validate epoch on the leader to reject obviously stale proposals. */
         if (!clusterRaftPreValidateEpoch(type, data)) {
-            if (callback) callback(ctx, "stale shard epoch");
+            if (callback) callback(ctx, GENERIC_PROPOSAL_REJECTION_MSG);
             sdsfree(data);
             return;
         }
@@ -1019,6 +1023,14 @@ static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) {
 
     /* Pre-validate epoch on the leader to reject obviously stale proposals. */
     if (!clusterRaftPreValidateEpoch(type, data)) {
+        /* Send REJECT back to the proposing follower so it can unblock the client. */
+        if (link) {
+            sds msg = wireNewMsg("REJECT");
+            msg = sdscatlen(msg, " ", 1);
+            msg = sdscatlen(msg, entry, sdslen(entry));
+            msg = wireFinishMsg(msg);
+            clusterRaftSendMsg(link, msg);
+        }
         sdsfree(data);
         sdsfree(entry);
         return 1;
@@ -1039,6 +1051,26 @@ static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) {
     return 1;
 }
 
+/* Handle a REJECT message from the leader. The leader sends this when it
+ * rejects a forwarded PROPOSE at pre-validation. We match it against our
+ * pending_proposals and fire the callback with an error so the client
+ * gets an immediate reply instead of hanging until timeout.
+ * Format: "REJECT <type> <data...>" (echoes back the original entry). */
+static int clusterRaftProcessReject(clusterLink *link, int argc, sds *argv) {
+    UNUSED(link);
+
+    /* argv[0]="REJECT", argv[1]=type, argv[2..]=data */
+    if (argc < 2) return 1;
+
+    int type = raftEntryTypeByName(argv[1]);
+    if (type < 0) return 1;
+
+    sds data = (argc >= 3) ? sdsjoinsds(argv + 2, argc - 2, " ", 1) : sdsempty();
+    clusterRaftCompletePendingProposal(type, data, GENERIC_PROPOSAL_REJECTION_MSG);
+    sdsfree(data);
+    return 1;
+}
+
 /* --------------------------------------------------------------------------
  * Raft election and heartbeat
  * -------------------------------------------------------------------------- */
@@ -1173,6 +1205,28 @@ static int clusterValidateShardEpochPair(const char *source_shard_id,
     return 1;
 }
 
+/* Find a pending proposal matching type+data, fire its callback, and remove it.
+ * Called both when an entry is applied (from raftLogApply) and when the leader
+ * sends a REJECT for a forwarded proposal. */
+static void clusterRaftCompletePendingProposal(int type, sds data, const char *error) {
+    clusterRaftState *rs = RAFT_STATE();
+    if (listLength(rs->pending_proposals) == 0) return;
+
+    listIter li;
+    listNode *ln;
+    listRewind(rs->pending_proposals, &li);
+    while ((ln = listNext(&li)) != NULL) {
+        raftPendingProposal *pp = listNodeValue(ln);
+        if (pp->type == type && !sdscmp(pp->data, data)) {
+            if (pp->callback) pp->callback(pp->ctx, error);
+            sdsfree(pp->data);
+            zfree(pp);
+            listDelNode(rs->pending_proposals, ln);
+            break;
+        }
+    }
+}
+
 /* Apply a committed log entry. */
 static void raftLogApply(raftLogEntry *e) {
     clusterRaftState *rs = RAFT_STATE();
@@ -1329,7 +1383,7 @@ static void raftLogApply(raftLogEntry *e) {
             /* Epoch validation (no bump yet). */
             uint64_t epoch = strtoull(argv[1], NULL, 10);
             if (!isShardEpochCurrent(node->shard_id, epoch)) {
-                entry_error = "stale shard epoch";
+                entry_error = GENERIC_PROPOSAL_REJECTION_MSG;
                 sdsfreesplitres(argv, argc);
                 break;
             }
@@ -1446,21 +1500,7 @@ static void raftLogApply(raftLogEntry *e) {
     }
 
     /* Check pending proposals for a match and remove it. */
-    if (listLength(rs->pending_proposals) > 0) {
-        listIter li;
-        listNode *ln;
-        listRewind(rs->pending_proposals, &li);
-        while ((ln = listNext(&li)) != NULL) {
-            raftPendingProposal *pp = listNodeValue(ln);
-            if (pp->type == e->type && !sdscmp(pp->data, e->data)) {
-                if (pp->callback) pp->callback(pp->ctx, entry_error);
-                sdsfree(pp->data);
-                zfree(pp);
-                listDelNode(rs->pending_proposals, ln);
-                break;
-            }
-        }
-    }
+    clusterRaftCompletePendingProposal(e->type, e->data, entry_error);
 }
 
 /* --------------------------------------------------------------------------
@@ -2530,6 +2570,8 @@ static int clusterRaftProcessMessage(struct clusterLink *link) {
         ret = clusterRaftProcessMeetRejected(link, argc, argv);
     } else if (!strcasecmp(argv[0], "PROPOSE")) {
         ret = clusterRaftProcessPropose(link, argc, argv);
+    } else if (!strcasecmp(argv[0], "REJECT")) {
+        ret = clusterRaftProcessReject(link, argc, argv);
     } else if (!strcasecmp(argv[0], "AE")) {
         ret = clusterRaftProcessAppendEntries(link, argc, argv, lines + 1, line_count - 1);
     } else if (!strcasecmp(argv[0], "AE_ACK")) {
@@ -3091,16 +3133,15 @@ static const char *clusterRaftApplySlotChange(sds data) {
     int argc;
     sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc);
     /* Need at least: source + source_epoch + target + target_epoch + one_range = 5 */
-    if (!argv || argc < 5) goto done;
+    if (!argv || argc < 5) goto reject;
 
     slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc);
-    if (!info.valid) goto done;
+    if (!info.valid) goto reject;
 
     /* Epoch validation: validate both source and target epochs. */
     if (!clusterValidateShardEpochPair(info.source_shard_id, info.source_epoch,
                                        info.target_shard_id, info.target_epoch)) {
-        error = "stale shard epoch";
-        goto done;
+        goto reject;
     }
 
     /* Range fields are argv[range_end] through argv[argc-1]. */
@@ -3134,6 +3175,10 @@ static const char *clusterRaftApplySlotChange(sds data) {
             }
         }
     }
+    goto done;
+
+reject:
+    if (!error) error = GENERIC_PROPOSAL_REJECTION_MSG;
 done:
     if (argv) sdsfreesplitres(argv, argc);
     return error;
@@ -3146,10 +3191,10 @@ static const char *clusterRaftApplySetReplica(sds data) {
     const char *error = NULL;
     int argc;
     sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc);
-    if (!argv || argc != 6) goto done;
+    if (!argv || argc != 6) goto reject;
 
     clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0]));
-    if (!replica) goto done;
+    if (!replica) goto reject;
 
     char *shard_id = argv[2];
     uint64_t epoch = strtoull(argv[3], NULL, 10);
@@ -3161,8 +3206,7 @@ static const char *clusterRaftApplySetReplica(sds data) {
     /* Validate both shard epochs without bumping. */
     if (!clusterValidateShardEpochPair(source_shard, source_epoch,
                                        target_shard, target_epoch)) {
-        error = "stale shard epoch";
-        goto done;
+        goto reject;
     }
 
     if (replica == myself) rs->todo_save_config = 1;
@@ -3181,9 +3225,9 @@ static const char *clusterRaftApplySetReplica(sds data) {
         clusterAddNodeToShard(target_shard, replica);
     } else {
         clusterNode *primary = clusterLookupNode(argv[3], sdslen(argv[3]));
-        if (!primary) goto done;
+        if (!primary) goto reject;
         /* Guard: skip if the primary's shard-id has changed. */
-        if (memcmp(primary->shard_id, target_shard, CLUSTER_NAMELEN) != 0) goto done;
+        if (memcmp(primary->shard_id, target_shard, CLUSTER_NAMELEN) != 0) goto reject;
         if (replica == myself) {
             /* Update cluster state; actual replication change deferred to beforeSleep. */
             if (myself->replicaof) clusterNodeRemoveReplica(myself->replicaof, myself);
@@ -3211,6 +3255,10 @@ static const char *clusterRaftApplySetReplica(sds data) {
     /* Bump both shard epochs after successful apply. */
     clusterSetShardEpoch(source_shard, source_epoch + 1);
     clusterSetShardEpoch(target_shard, target_epoch + 1);
+    goto done;
+
+reject:
+    if (!error) error = GENERIC_PROPOSAL_REJECTION_MSG;
 done:
     if (argv) sdsfreesplitres(argv, argc);
     return error;
@@ -3224,28 +3272,26 @@ static const char *clusterRaftApplyFailover(sds data) {
     const char *error = NULL;
     int argc;
     sds *argv = sdssplitlen(data, sdslen(data), " ", 1, &argc);
-    if (!argv || argc != 4) goto done;
+    if (!argv || argc != 4) goto reject;
 
     clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0]));
     clusterNode *primary = clusterLookupNode(argv[1], sdslen(argv[1]));
-    if (!replica || !primary) goto done;
+    if (!replica || !primary) goto reject;
 
     /* Validate that the primary still belongs to the shard claimed in the entry. */
     if (memcmp(primary->shard_id, argv[2], CLUSTER_NAMELEN) != 0) {
         serverLog(LL_WARNING, "FAILOVER rejected: primary %.40s shard mismatch (expected %.40s, got %.40s).",
                   primary->name, argv[2], primary->shard_id);
-        error = "primary shard mismatch";
-        goto done;
+        goto reject;
     }
 
     /* Epoch validation: validate against the shard-id from the entry (no bump yet). */
     uint64_t shard_epoch = strtoull(argv[3], NULL, 10);
     if (!isShardEpochCurrent(argv[2], shard_epoch)) {
-        error = "stale shard epoch";
-        goto done;
+        goto reject;
     }
 
-    if (!nodeIsReplica(replica) || nodeIsReplica(primary) || replica->replicaof != primary) goto done;
+    if (!nodeIsReplica(replica) || nodeIsReplica(primary) || replica->replicaof != primary) goto reject;
 
     /* Transfer slots from old primary to new primary. */
     for (int j = 0; j < CLUSTER_SLOTS; j++) {
@@ -3294,7 +3340,10 @@ static const char *clusterRaftApplyFailover(sds data) {
 
     /* Bump shard epoch after successful apply. */
     clusterSetShardEpoch(argv[2], shard_epoch + 1);
+    goto done;
 
+reject:
+    if (!error) error = GENERIC_PROPOSAL_REJECTION_MSG;
 done:
     if (argv) sdsfreesplitres(argv, argc);
     return error;
diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl
index ec1f91fdf0f..00a7e4f63a8 100644
--- a/tests/support/cluster_util.tcl
+++ b/tests/support/cluster_util.tcl
@@ -185,13 +185,44 @@ proc cluster_allocate_slots {masters replicas} {
     }
 }
 
+# Issue CLUSTER REPLICATE with retry on epoch rejection. In raft mode,
+# concurrent REPLICATE commands for the same shard may race with shard
+# epoch bumps, causing transient "proposal rejected" errors.
+proc cluster_replicate_with_retry {node_id target_id {max_retries 5}} {
+    for {set attempt 0} {$attempt < $max_retries} {incr attempt} {
+        if {[catch {R $node_id cluster replicate $target_id} err]} {
+            if {$attempt < $max_retries - 1 && [string match "*proposal rejected*" $err]} {
+                after 100
+                continue
+            }
+            error $err
+        }
+        return
+    }
+}
+
+# Same as above but takes a client object instead of a node index.
+# Returns the result on success.
+proc cluster_client_replicate_with_retry {client target_id {max_retries 5}} {
+    for {set attempt 0} {$attempt < $max_retries} {incr attempt} {
+        if {[catch {$client cluster replicate $target_id} result]} {
+            if {$attempt < $max_retries - 1 && [string match "*proposal rejected*" $result]} {
+                after 100
+                continue
+            }
+            error $result
+        }
+        return $result
+    }
+}
+
 proc default_replica_allocation {masters replicas} {
     # Setup master/replica relationships
     set node_count [expr $masters + $replicas]
     for {set i 0} {$i < $masters} {incr i} {
         set nodeid [R $i CLUSTER MYID]
         for {set j [expr $i + $masters]} {$j < $node_count} {incr j $masters} {
-            R $j CLUSTER REPLICATE $nodeid
+            cluster_replicate_with_retry $j $nodeid
         }
     }
 }
@@ -204,7 +235,7 @@ proc cluster_allocate_replicas {masters replicas} {
         set master_id [expr {$j % $masters}]
         set replica_id [cluster_find_available_replica $masters]
         set master_myself [cluster_get_myself $master_id]
-        R $replica_id cluster replicate [dict get $master_myself id]
+        cluster_replicate_with_retry $replica_id [dict get $master_myself id]
     }
 }
 
diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl
index 14a04e2b4b5..6b8919244b4 100644
--- a/tests/unit/cluster/cluster-raft-proto.tcl
+++ b/tests/unit/cluster/cluster-raft-proto.tcl
@@ -84,6 +84,16 @@ proc raft_listen_and_accept {port_var {timeout 5000} {before_accept {}}} {
     return $::_raft_accepted
 }
 
+# Connect a fake node to a cluster bus port and complete HELLO handshake.
+# Returns the connected fd.
+proc raft_connect_fake_node {host cport fake_id fake_addr} {
+    set fd [raft_connect $host $cport]
+    raft_send $fd "HELLO $fake_id $fake_addr"
+    set reply [raft_recv $fd]
+    assert_match "HI *" $reply
+    return $fd
+}
+
 # Parse an AE message and reply with AE_ACK.
 # Returns the last log index after applying the entries.
 proc raft_reply_ae_ack {fd ae_msg repl_offset} {
@@ -577,17 +587,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
     }
     assert {$node0_shard ne ""}
 
-    # Helper: connect to cluster bus and complete HELLO handshake.
-    proc connect_fake_node {cport} {
-        set fake_id [string repeat "f" 40]
-        set fake_shard [string repeat "e" 40]
-        set fake_addr "127.0.0.1:9999@19999,,tls-port=0,shard-id=$fake_shard"
-        set fd [raft_connect 127.0.0.1 $cport]
-        raft_send $fd "HELLO $fake_id $fake_addr"
-        set reply [raft_recv $fd]
-        assert_match "HI *" $reply
-        return $fd
-    }
+    set fake_id [string repeat "f" 40]
+    set fake_addr "127.0.0.1:9999@19999,,tls-port=0,shard-id=[string repeat e 40]"
 
     # Each entry: {entry_type propose_msg state_check}
     set stale_proposals [list \
@@ -610,15 +611,16 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
             set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
 
             # Connect and inject stale PROPOSE.
-            set fd [connect_fake_node $cport]
+            set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr]
             raft_send $fd $propose_msg
 
-            # Wait for the rejection counter to increase.
-            wait_for_condition 50 100 {
-                [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before
-            } else {
-                fail "Stale $entry_type proposal was not rejected"
-            }
+            # Expect a REJECT message back from the leader.
+            set reply [raft_recv $fd 2000]
+            assert_match "REJECT *" $reply
+
+            # Verify rejection counter increased.
+            set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
+            assert {$rejected_after > $rejected_before}
 
             # Verify log index unchanged (rejected at pre-validation).
             set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
@@ -650,15 +652,16 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
             set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
             set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
 
-            set fd [connect_fake_node $cport]
+            set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr]
             raft_send $fd $propose_msg
 
-            # Wait for the rejection counter to increase.
-            wait_for_condition 50 100 {
-                [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before
-            } else {
-                fail "$label proposal was not rejected"
-            }
+            # Expect a REJECT message back from the leader.
+            set reply [raft_recv $fd 2000]
+            assert_match "REJECT *" $reply
+
+            # Verify rejection counter increased.
+            set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
+            assert {$rejected_after > $rejected_before}
 
             # Verify log index unchanged (rejected at pre-validation).
             set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
@@ -687,28 +690,77 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
             set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
             set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
 
-            set fd [connect_fake_node $cport]
+            set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr]
             raft_send $fd $propose_msg
 
-            # Wait for the rejection counter to increase.
-            wait_for_condition 50 100 {
-                [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $rejected_before
-            } else {
-                fail "$entry_type with wrong shard-id was not rejected"
-            }
+            # Expect a REJECT message back from the leader.
+            set reply [raft_recv $fd 2000]
+            assert_match "REJECT *" $reply
+
+            # Verify rejection counter increased.
+            set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
+            assert {$rejected_after > $rejected_before}
 
             # Verify log index unchanged (rejected at pre-validation, epoch not bumped).
             set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
             assert_equal $commit_before $commit_after
 
-            # Verify cluster state unchanged.
-            set nodes [$r0 CLUSTER NODES]
-            assert_match "*$node_id*myself,master*" $nodes
-            assert_equal 16384 [get_cluster_info_field $r0 cluster_slots_assigned]
-
             close $fd
         }
     }
+
+    # --------------------------------------------------------------------------
+    # Duplicate FAILOVER: two clients on the follower both send FAILOVER FORCE
+    # simultaneously. The first succeeds, the second is rejected at apply time
+    # (epoch bumped by the first). Neither client should hang.
+    # --------------------------------------------------------------------------
+
+    test "Raft shard epoch: duplicate FAILOVER from follower - second client gets rejection" {
+        set prevalidation_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
+        set apply_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_apply]
+
+        # Two deferred clients on r1 (the follower/replica).
+        set c1 [valkey_deferring_client -1]
+        set c2 [valkey_deferring_client -1]
+
+        # Both issue CLUSTER FAILOVER FORCE concurrently.
+        $c1 CLUSTER FAILOVER FORCE
+        $c2 CLUSTER FAILOVER FORCE
+
+        # Wait for one of the rejection counters to increase.
+        wait_for_condition 50 100 {
+            [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $prevalidation_before ||
+            [get_cluster_info_field $r0 cluster_stats_proposals_rejected_apply] > $apply_before
+        } else {
+            fail "Second FAILOVER was not rejected"
+        }
+
+        # Read both replies. One should succeed (OK), the other should
+        # get a rejection error. Neither should hang.
+        # Use catch because error replies throw in the deferring client.
+        if {[catch {set reply1 [$c1 read]} err1]} {
+            set reply1 $err1
+        }
+        if {[catch {set reply2 [$c2 read]} err2]} {
+            set reply2 $err2
+        }
+
+        # Exactly one should be OK and the other should be rejected.
+        set ok_count 0
+        set err_count 0
+        foreach reply [list $reply1 $reply2] {
+            if {$reply eq "OK"} {
+                incr ok_count
+            } elseif {[string match "*proposal rejected by raft leader*" $reply]} {
+                incr err_count
+            }
+        }
+        assert_equal 1 $ok_count
+        assert_equal 1 $err_count
+
+        $c1 close
+        $c2 close
+    }
 }
 
 } ;# tags
diff --git a/tests/unit/cluster/faster-failover.tcl b/tests/unit/cluster/faster-failover.tcl
index 6ecfbf73c99..cb06b8125bc 100644
--- a/tests/unit/cluster/faster-failover.tcl
+++ b/tests/unit/cluster/faster-failover.tcl
@@ -40,8 +40,8 @@ start_cluster 3 1 {overrides {cluster-ping-interval 1000 cluster-node-timeout 50
 # R3 and R4 are both replicas of R0.
 proc single_primary_replica_allocation {masters replicas} {
     set master0_id [R 0 CLUSTER MYID]
-    R 3 CLUSTER REPLICATE $master0_id
-    R 4 CLUSTER REPLICATE $master0_id
+    cluster_replicate_with_retry 3 $master0_id
+    cluster_replicate_with_retry 4 $master0_id
 }
 
 # Test 1: Single primary failure.
diff --git a/tests/unit/cluster/pubsubshard-slot-migration.tcl b/tests/unit/cluster/pubsubshard-slot-migration.tcl
index c5a324f0949..3d49ca76ed5 100644
--- a/tests/unit/cluster/pubsubshard-slot-migration.tcl
+++ b/tests/unit/cluster/pubsubshard-slot-migration.tcl
@@ -156,7 +156,7 @@ test "Move a replica to another primary, verify client receives sunsubscribe on
     $nodefrom(link) spublish $channelname hello
     assert_equal {smessage mychannel2 hello} [$subscribeclient read]
 
-    assert_equal {OK} [$replica_client cluster replicate $nodeto(id)]
+    assert_equal {OK} [cluster_client_replicate_with_retry $replica_client $nodeto(id)]
 
     set msg [$subscribeclient read]
     assert {"sunsubscribe" eq [lindex $msg 0]}

From f4d7d1ecf1149a64fc8389d2e35a03fbe7712d6b Mon Sep 17 00:00:00 2001
From: Sushil Paneru <sushil.paneru1@gmail.com>
Date: Tue, 16 Jun 2026 06:43:43 +0000
Subject: [PATCH 8/9] Addressed comments

Signed-off-by: Sushil Paneru <sushil.paneru1@gmail.com>
---
 design-docs/cluster-raft.md               |  2 +-
 src/cluster_raft.c                        | 87 ++++++++++++-----------
 tests/unit/cluster/cli.tcl                |  2 +-
 tests/unit/cluster/cluster-raft-proto.tcl | 39 ++++------
 4 files changed, 62 insertions(+), 68 deletions(-)

diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md
index 6fd477a82eb..11a1961dafd 100644
--- a/design-docs/cluster-raft.md
+++ b/design-docs/cluster-raft.md
@@ -139,7 +139,7 @@ NODE_JOIN <node-id> <address>
     when the entry is committed.
 
 NODE_FORGET <node-id> <shard-epoch>
-    Remove a node from the cluster (CLUSTER FORGET). Not yet implemented.
+    Remove a node from the cluster (CLUSTER FORGET).
 
 SLOT_CHANGE <source-node-id-or-dash> <source-epoch> <target-node-id-or-dash> <target-epoch> <range> [<range> ...]
     Assign or remove slot ownership. A dash means "no owner" (delete
diff --git a/src/cluster_raft.c b/src/cluster_raft.c
index ed882a80ef9..209c4694837 100644
--- a/src/cluster_raft.c
+++ b/src/cluster_raft.c
@@ -43,6 +43,7 @@
 
 /* Generic error message returned to clients when a proposal is rejected. */
 #define GENERIC_PROPOSAL_REJECTION_MSG "proposal rejected by raft leader"
+#define STALE_SHARD_EPOCH_REJECTION_MSG "proposal rejected due to stale shard epoch"
 
 /* From module.c */
 void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len);
@@ -188,8 +189,6 @@ typedef struct {
     uint64_t stats_pubsub_bytes_received;
     uint64_t stats_module_bytes_sent;
     uint64_t stats_module_bytes_received;
-    uint64_t stats_proposals_rejected_prevalidation;
-    uint64_t stats_proposals_rejected_apply;
 
     /* Shard epoch tracking (per-shard monotonic counter for stale proposal detection). */
     dict *shard_epochs;
@@ -441,7 +440,7 @@ static void clusterRaftSelfJoin(void) {
     if (sdslen(slots) > 0) {
         entry = sdsnew("SLOT_CHANGE - 0 ");
         entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
-        entry = sdscatfmt(entry, " %U", (unsigned long long)0);
+        entry = sdscat(entry, " 0");
         entry = sdscatsds(entry, slots);
         clusterRaftPropose(entry, NULL, NULL);
         sdsfree(entry);
@@ -788,7 +787,7 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx,
     if (rs->role == RAFT_ROLE_LEADER) {
         /* Pre-validate epoch on the leader to reject obviously stale proposals. */
         if (!clusterRaftPreValidateEpoch(type, data)) {
-            if (callback) callback(ctx, GENERIC_PROPOSAL_REJECTION_MSG);
+            if (callback) callback(ctx, STALE_SHARD_EPOCH_REJECTION_MSG);
             sdsfree(data);
             return;
         }
@@ -879,34 +878,33 @@ typedef struct {
     uint64_t source_epoch;
     uint64_t target_epoch;
     int range_end; /* Index in argv where ranges begin (first range element). */
-    int valid;     /* 1 if parsing succeeded, 0 otherwise. */
 } slotChangeEpochInfo;
 
 /* Parse fields from a SLOT_CHANGE entry's split argv.
  * Format: "<source-id-or-dash> <source-epoch> <target-id-or-dash> <target-epoch> <ranges...>"
- * Caller must have verified argc >= 5. */
-static slotChangeEpochInfo parseSlotChangeEpochs(sds *argv, int argc) {
-    slotChangeEpochInfo info = {0};
+ * Caller must have verified argc >= 5.
+ * Returns true if parsing succeeded, false otherwise. */
+static bool parseSlotChangeEpochs(sds *argv, int argc, slotChangeEpochInfo *info) {
+    memset(info, 0, sizeof(*info));
 
     /* argv[0] = source-id-or-dash */
-    info.source_owner = (sdslen(argv[0]) == CLUSTER_NAMELEN)
-                            ? clusterLookupNode(argv[0], CLUSTER_NAMELEN)
-                            : NULL;
+    info->source_owner = (sdslen(argv[0]) == CLUSTER_NAMELEN)
+                             ? clusterLookupNode(argv[0], CLUSTER_NAMELEN)
+                             : NULL;
     /* argv[1] = source-epoch */
-    info.source_epoch = strtoull(argv[1], NULL, 10);
+    info->source_epoch = strtoull(argv[1], NULL, 10);
     /* argv[2] = target-id-or-dash */
-    info.target = (sdslen(argv[2]) == CLUSTER_NAMELEN)
-                      ? clusterLookupNode(argv[2], CLUSTER_NAMELEN)
-                      : NULL;
+    info->target = (sdslen(argv[2]) == CLUSTER_NAMELEN)
+                       ? clusterLookupNode(argv[2], CLUSTER_NAMELEN)
+                       : NULL;
     /* argv[3] = target-epoch */
-    info.target_epoch = strtoull(argv[3], NULL, 10);
+    info->target_epoch = strtoull(argv[3], NULL, 10);
     /* argv[4..argc-1] = ranges */
-    info.range_end = 4;
+    info->range_end = 4;
 
-    info.source_shard_id = info.source_owner ? info.source_owner->shard_id : NULL;
-    info.target_shard_id = info.target ? info.target->shard_id : NULL;
-    info.valid = (argc > 4);
-    return info;
+    info->source_shard_id = info->source_owner ? info->source_owner->shard_id : NULL;
+    info->target_shard_id = info->target ? info->target->shard_id : NULL;
+    return (argc > 4);
 }
 
 /* Pre-validate shard epoch on the leader before appending to the log.
@@ -964,8 +962,8 @@ static int clusterRaftPreValidateEpoch(int type, sds data) {
         if (argc < 5) {
             ok = 0;
         } else {
-            slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc);
-            if (info.valid) {
+            slotChangeEpochInfo info;
+            if (parseSlotChangeEpochs(argv, argc, &info)) {
                 if (info.source_shard_id) {
                     ok = isShardEpochCurrent(info.source_shard_id, info.source_epoch);
                 }
@@ -995,8 +993,7 @@ static int clusterRaftPreValidateEpoch(int type, sds data) {
 
     sdsfreesplitres(argv, argc);
     if (!ok) {
-        RAFT_STATE()->stats_proposals_rejected_prevalidation++;
-        serverLog(LL_NOTICE, "Leader pre-validation: rejecting %s proposal (missing or stale epoch).",
+        serverLog(LL_DEBUG, "Leader pre-validation: rejecting %s proposal (missing or stale epoch).",
                   raftEntryTypeName(type));
     }
     return ok;
@@ -1321,7 +1318,7 @@ static void raftLogApply(raftLogEntry *e) {
                     if (count > 0) {
                         sds entry = sdsnew("SLOT_CHANGE - 0 ");
                         entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
-                        entry = sdscatfmt(entry, " %U", (unsigned long long)0);
+                        entry = sdscat(entry, " 0");
                         entry = sdscatsds(entry, slot_range);
                         clusterRaftPropose(entry, NULL, NULL);
                         sdsfree(entry);
@@ -1380,10 +1377,14 @@ static void raftLogApply(raftLogEntry *e) {
         }
         clusterNode *node = clusterLookupNode(argv[0], sdslen(argv[0]));
         if (node && node != myself) {
-            /* Epoch validation (no bump yet). */
             uint64_t epoch = strtoull(argv[1], NULL, 10);
             if (!isShardEpochCurrent(node->shard_id, epoch)) {
-                entry_error = GENERIC_PROPOSAL_REJECTION_MSG;
+                entry_error = STALE_SHARD_EPOCH_REJECTION_MSG;
+                sdsfreesplitres(argv, argc);
+                break;
+            }
+            if (nodeIsPrimary(node) && (node->num_replicas > 0 || node->numslots > 0)) {
+                entry_error = "Can't forget a primary with replicas or assigned slots.";
                 sdsfreesplitres(argv, argc);
                 break;
             }
@@ -1496,7 +1497,8 @@ static void raftLogApply(raftLogEntry *e) {
     }
 
     if (entry_error) {
-        rs->stats_proposals_rejected_apply++;
+        serverLog(LL_DEBUG, "Proposal rejected at apply: %s (type %s, index %llu).",
+                  entry_error, raftEntryTypeName(e->type), (unsigned long long)e->index);
     }
 
     /* Check pending proposals for a match and remove it. */
@@ -2140,6 +2142,9 @@ static void clusterRaftRetryProposals(void) {
             }
             serverLog(LL_NOTICE, "Forwarded %lu deferred proposals to leader.",
                       listLength(rs->pending_proposals));
+        } else {
+            /* Leader link not available yet — retry on next cycle. */
+            rs->todo_retry_proposals = 1;
         }
     }
 }
@@ -2954,8 +2959,6 @@ static void clusterRaftResetStats(void) {
     rs->stats_pubsub_bytes_received = 0;
     rs->stats_module_bytes_sent = 0;
     rs->stats_module_bytes_received = 0;
-    rs->stats_proposals_rejected_prevalidation = 0;
-    rs->stats_proposals_rejected_apply = 0;
 }
 
 static sds clusterRaftAppendInfoFields(sds info) {
@@ -3010,8 +3013,6 @@ static sds clusterRaftAppendInfoFields(sds info) {
                         "cluster_stats_pubsub_bytes_received:%llu\r\n"
                         "cluster_stats_module_bytes_sent:%llu\r\n"
                         "cluster_stats_module_bytes_received:%llu\r\n"
-                        "cluster_stats_proposals_rejected_prevalidation:%llu\r\n"
-                        "cluster_stats_proposals_rejected_apply:%llu\r\n"
                         "total_cluster_links_buffer_limit_exceeded:%llu\r\n",
                         role_str, (unsigned long long)rs->current_term,
                         (unsigned long long)rs->commit_index, (unsigned long long)rs->last_applied,
@@ -3024,8 +3025,6 @@ static sds clusterRaftAppendInfoFields(sds info) {
                         (unsigned long long)rs->stats_pubsub_bytes_received,
                         (unsigned long long)rs->stats_module_bytes_sent,
                         (unsigned long long)rs->stats_module_bytes_received,
-                        (unsigned long long)rs->stats_proposals_rejected_prevalidation,
-                        (unsigned long long)rs->stats_proposals_rejected_apply,
                         (unsigned long long)server.cluster->stat_cluster_links_buffer_limit_exceeded);
     return info;
 }
@@ -3127,7 +3126,8 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode
 }
 
 /* Apply a SLOT_CHANGE entry. Format: "<source-id-or-dash> <source-epoch> <target-id-or-dash> <target-epoch> <ranges...>"
- * Ranges use the same format as nodes.conf: "0-5460" or "5461". */
+ * Ranges use the same format as nodes.conf: "0-5460" or "5461".
+ * Returns NULL on success, or a error string describing the failure. */
 static const char *clusterRaftApplySlotChange(sds data) {
     const char *error = NULL;
     int argc;
@@ -3135,12 +3135,13 @@ static const char *clusterRaftApplySlotChange(sds data) {
     /* Need at least: source + source_epoch + target + target_epoch + one_range = 5 */
     if (!argv || argc < 5) goto reject;
 
-    slotChangeEpochInfo info = parseSlotChangeEpochs(argv, argc);
-    if (!info.valid) goto reject;
+    slotChangeEpochInfo info;
+    if (!parseSlotChangeEpochs(argv, argc, &info)) goto reject;
 
     /* Epoch validation: validate both source and target epochs. */
     if (!clusterValidateShardEpochPair(info.source_shard_id, info.source_epoch,
                                        info.target_shard_id, info.target_epoch)) {
+        error = STALE_SHARD_EPOCH_REJECTION_MSG;
         goto reject;
     }
 
@@ -3185,7 +3186,8 @@ static const char *clusterRaftApplySlotChange(sds data) {
 }
 
 /* Apply a SET_REPLICA_OF entry.
- * Format: "<replica-id> <source-shard> <source-epoch> <primary-id-or-dash> <target-shard> <target-epoch>" */
+ * Format: "<replica-id> <source-shard> <source-epoch> <primary-id-or-dash> <target-shard> <target-epoch>"
+ * Returns NULL on success, or a error string describing the failure. */
 static const char *clusterRaftApplySetReplica(sds data) {
     clusterRaftState *rs = RAFT_STATE();
     const char *error = NULL;
@@ -3196,8 +3198,6 @@ static const char *clusterRaftApplySetReplica(sds data) {
     clusterNode *replica = clusterLookupNode(argv[0], sdslen(argv[0]));
     if (!replica) goto reject;
 
-    char *shard_id = argv[2];
-    uint64_t epoch = strtoull(argv[3], NULL, 10);
     char *source_shard = argv[1];
     uint64_t source_epoch = strtoull(argv[2], NULL, 10);
     char *target_shard = argv[4];
@@ -3206,6 +3206,7 @@ static const char *clusterRaftApplySetReplica(sds data) {
     /* Validate both shard epochs without bumping. */
     if (!clusterValidateShardEpochPair(source_shard, source_epoch,
                                        target_shard, target_epoch)) {
+        error = STALE_SHARD_EPOCH_REJECTION_MSG;
         goto reject;
     }
 
@@ -3266,7 +3267,8 @@ static const char *clusterRaftApplySetReplica(sds data) {
 
 /* Apply a FAILOVER entry. Format: "<replica-id> <primary-id> <shard-id> <shard-epoch>"
  * The replica takes over the primary's slots and becomes primary.
- * The old primary becomes a replica of the new primary. */
+ * The old primary becomes a replica of the new primary.
+ * Returns NULL on success, or a error string describing the failure. */
 static const char *clusterRaftApplyFailover(sds data) {
     clusterRaftState *rs = RAFT_STATE();
     const char *error = NULL;
@@ -3288,6 +3290,7 @@ static const char *clusterRaftApplyFailover(sds data) {
     /* Epoch validation: validate against the shard-id from the entry (no bump yet). */
     uint64_t shard_epoch = strtoull(argv[3], NULL, 10);
     if (!isShardEpochCurrent(argv[2], shard_epoch)) {
+        error = STALE_SHARD_EPOCH_REJECTION_MSG;
         goto reject;
     }
 
diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl
index 277a8e873b0..efbc2660cb5 100644
--- a/tests/unit/cluster/cli.tcl
+++ b/tests/unit/cluster/cli.tcl
@@ -565,7 +565,7 @@ start_multiple_servers 3 [list overrides $base_conf] {
 # Test valkey-cli --cluster del-node
 # For cluster-raft we need to handle leader transfer when FORGET targets the leader.
 set base_conf [list cluster-enabled yes cluster-node-timeout 1000]
-start_multiple_servers 3 [list overrides $base_conf tags {cluster-raft:skip}] {
+start_multiple_servers 3 [list overrides $base_conf] {
 
     # Create cluster with 1 primary and 2 replicas for del-node tests
     exec $::VALKEY_CLI_BIN --cluster-yes --cluster create \
diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl
index 6b8919244b4..552f804363d 100644
--- a/tests/unit/cluster/cluster-raft-proto.tcl
+++ b/tests/unit/cluster/cluster-raft-proto.tcl
@@ -542,7 +542,7 @@ proc get_cluster_info_field {client field} {
     return ""
 }
 
-start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft cluster-node-timeout 2000}} {
+start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft cluster-node-timeout 2000 loglevel debug}} {
     # Shared setup: form cluster and assign slots.
     set r0 [srv 0 client]
     set r1 [srv -1 client]
@@ -606,9 +606,9 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
         lassign $case entry_type propose_msg state_check
 
         test "Raft shard epoch: stale $entry_type is rejected (pre-validation, no log append)" {
-            # Record commit index and rejection counter before injection.
+            # Record commit index before injection.
             set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
-            set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
+            set loglines [count_log_lines 0]
 
             # Connect and inject stale PROPOSE.
             set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr]
@@ -618,9 +618,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
             set reply [raft_recv $fd 2000]
             assert_match "REJECT *" $reply
 
-            # Verify rejection counter increased.
-            set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
-            assert {$rejected_after > $rejected_before}
+            # Verify rejection logged.
+            wait_for_log_messages 0 {"*Leader pre-validation: rejecting*"} $loglines 1000 10
 
             # Verify log index unchanged (rejected at pre-validation).
             set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
@@ -650,7 +649,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
 
         test "Raft shard epoch: $label is rejected (pre-validation, no log append)" {
             set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
-            set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
+            set loglines [count_log_lines 0]
 
             set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr]
             raft_send $fd $propose_msg
@@ -659,9 +658,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
             set reply [raft_recv $fd 2000]
             assert_match "REJECT *" $reply
 
-            # Verify rejection counter increased.
-            set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
-            assert {$rejected_after > $rejected_before}
+            # Verify rejection logged.
+            wait_for_log_messages 0 {"*Leader pre-validation: rejecting*"} $loglines 1000 10
 
             # Verify log index unchanged (rejected at pre-validation).
             set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
@@ -688,7 +686,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
 
         test "Raft shard epoch: $entry_type with wrong shard-id is rejected at pre-validation" {
             set commit_before [get_cluster_info_field $r0 cluster_raft_commit_index]
-            set rejected_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
+            set loglines [count_log_lines 0]
 
             set fd [raft_connect_fake_node 127.0.0.1 $cport $fake_id $fake_addr]
             raft_send $fd $propose_msg
@@ -697,9 +695,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
             set reply [raft_recv $fd 2000]
             assert_match "REJECT *" $reply
 
-            # Verify rejection counter increased.
-            set rejected_after [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
-            assert {$rejected_after > $rejected_before}
+            # Verify rejection logged.
+            wait_for_log_messages 0 {"*Leader pre-validation: rejecting*"} $loglines 1000 10
 
             # Verify log index unchanged (rejected at pre-validation, epoch not bumped).
             set commit_after [get_cluster_info_field $r0 cluster_raft_commit_index]
@@ -716,8 +713,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
     # --------------------------------------------------------------------------
 
     test "Raft shard epoch: duplicate FAILOVER from follower - second client gets rejection" {
-        set prevalidation_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation]
-        set apply_before [get_cluster_info_field $r0 cluster_stats_proposals_rejected_apply]
+        set loglines [count_log_lines 0]
 
         # Two deferred clients on r1 (the follower/replica).
         set c1 [valkey_deferring_client -1]
@@ -727,13 +723,8 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
         $c1 CLUSTER FAILOVER FORCE
         $c2 CLUSTER FAILOVER FORCE
 
-        # Wait for one of the rejection counters to increase.
-        wait_for_condition 50 100 {
-            [get_cluster_info_field $r0 cluster_stats_proposals_rejected_prevalidation] > $prevalidation_before ||
-            [get_cluster_info_field $r0 cluster_stats_proposals_rejected_apply] > $apply_before
-        } else {
-            fail "Second FAILOVER was not rejected"
-        }
+        # Wait for a rejection to be logged (either at pre-validation or apply).
+        wait_for_log_messages 0 {"*rejecting*proposal*" "*Proposal rejected at apply*"} $loglines 1000 10
 
         # Read both replies. One should succeed (OK), the other should
         # get a rejection error. Neither should hang.
@@ -751,7 +742,7 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
         foreach reply [list $reply1 $reply2] {
             if {$reply eq "OK"} {
                 incr ok_count
-            } elseif {[string match "*proposal rejected by raft leader*" $reply]} {
+            } elseif {[string match "*proposal rejected*" $reply]} {
                 incr err_count
             }
         }

From 18a1c8bbfbfe16543835389636fa49e414a00f91 Mon Sep 17 00:00:00 2001
From: Sushil Paneru <sushil.paneru1@gmail.com>
Date: Wed, 17 Jun 2026 06:32:44 +0000
Subject: [PATCH 9/9] Added retry logic for proposal rejection

Signed-off-by: Sushil Paneru <sushil.paneru1@gmail.com>
---
 design-docs/cluster-raft.md               |  20 ++
 src/cluster_raft.c                        | 231 ++++++++++++++++++++--
 tests/unit/cluster/cli.tcl                |   2 +-
 tests/unit/cluster/cluster-raft-proto.tcl |  47 -----
 4 files changed, 235 insertions(+), 65 deletions(-)

diff --git a/design-docs/cluster-raft.md b/design-docs/cluster-raft.md
index 11a1961dafd..2da2bda9fe7 100644
--- a/design-docs/cluster-raft.md
+++ b/design-docs/cluster-raft.md
@@ -786,6 +786,26 @@ Epoch validation happens at two points:
    and the entry is applied. On mismatch, the entry is a no-op and
    the error is propagated to the caller's callback.
 
+### Retry on stale epoch
+
+Proposals rejected due to a stale shard epoch are automatically retried
+with a fresh epoch (up to 5 attempts):
+
+- **SET_REPLICA_OF / NODE_FORGET / FAILOVER (force) / SLOT_CHANGE** —
+  the proposal is rebuilt with current epoch(s) and re-submitted.
+
+- **Automatic failover** — if the FAILOVER proposal is rejected, the
+  failover is re-scheduled (via `todo_schedule_failover`) as long as
+  the primary is still failed. The next attempt uses the current epoch.
+  For automatic failover, no cap on retry attempt to avoid leaderless shard.
+
+Only `STALE_SHARD_EPOCH_REJECTION_MSG` triggers retry. Other errors
+(format errors, invalid state) are forwarded to the client immediately.
+
+When the leader rejects a forwarded proposal at pre-validation, it sends
+a `REJECT <type> <data> retry` message back. The `retry` suffix signals
+the follower that the rejection is epoch-related and eligible for retry.
+
 ### Entries that don't carry an epoch
 
 - NODE_FAIL / NODE_RECOVER 
diff --git a/src/cluster_raft.c b/src/cluster_raft.c
index 209c4694837..9476ba3e334 100644
--- a/src/cluster_raft.c
+++ b/src/cluster_raft.c
@@ -216,6 +216,7 @@ typedef struct {
 #define RAFT_HDR_SIZE 8
 #define REPL_OFFSETS_BROADCAST_PERIOD_MS 10000
 #define RAFT_LOG_REWRITE_THRESHOLD 100
+#define PROPOSAL_MAX_RETRIES 5
 
 /* Monotonic millisecond clock for timeouts and failure detection.
  * Unlike gettimeofday(), this is not affected by system clock adjustments. */
@@ -325,6 +326,8 @@ static void clusterRaftPropose(sds entry, void *ctx, void (*callback)(void *ctx,
 static void clusterRaftDeferPendingProposals(void);
 static void clusterRaftCompletePendingProposal(int type, sds data, const char *error);
 static int clusterRaftPreValidateEpoch(int type, sds data);
+static void clusterRaftAutoFailoverCallback(void *ctx, const char *error);
+static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode *target, void *ctx, void (*callback)(void *ctx, const char *error));
 static void clusterRaftUpdateMyself(int old_flags);
 static sds clusterRaftBuildMyNodeInfo(void);
 static void clusterRaftCheckSlotCoverage(void);
@@ -993,14 +996,13 @@ static int clusterRaftPreValidateEpoch(int type, sds data) {
 
     sdsfreesplitres(argv, argc);
     if (!ok) {
-        serverLog(LL_DEBUG, "Leader pre-validation: rejecting %s proposal (missing or stale epoch).",
+        serverLog(LL_DEBUG, "Leader pre-validation: rejecting %s proposal (stale epoch).",
                   raftEntryTypeName(type));
     }
     return ok;
 }
 
 static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) {
-    UNUSED(link);
     clusterRaftState *rs = RAFT_STATE();
 
     /* argv[0]="PROPOSE", argv[1..] is the entry (type + data). */
@@ -1020,11 +1022,13 @@ static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) {
 
     /* Pre-validate epoch on the leader to reject obviously stale proposals. */
     if (!clusterRaftPreValidateEpoch(type, data)) {
-        /* Send REJECT back to the proposing follower so it can unblock the client. */
+        /* Send REJECT back to the proposing follower so it can unblock the client.
+         * Append "retry" hint so the follower knows it can retry with a fresh epoch. */
         if (link) {
             sds msg = wireNewMsg("REJECT");
             msg = sdscatlen(msg, " ", 1);
             msg = sdscatlen(msg, entry, sdslen(entry));
+            msg = sdscat(msg, " retry");
             msg = wireFinishMsg(msg);
             clusterRaftSendMsg(link, msg);
         }
@@ -1049,21 +1053,28 @@ static int clusterRaftProcessPropose(clusterLink *link, int argc, sds *argv) {
 }
 
 /* Handle a REJECT message from the leader. The leader sends this when it
- * rejects a forwarded PROPOSE at pre-validation. We match it against our
+ * rejects a forwarded PROPOSE. We match it against our
  * pending_proposals and fire the callback with an error so the client
  * gets an immediate reply instead of hanging until timeout.
- * Format: "REJECT <type> <data...>" (echoes back the original entry). */
+ * Format: "REJECT <type> <data...> [retry]" (echoes back the original entry).
+ * If the last token is "retry", the rejection is retryable. */
 static int clusterRaftProcessReject(clusterLink *link, int argc, sds *argv) {
     UNUSED(link);
 
-    /* argv[0]="REJECT", argv[1]=type, argv[2..]=data */
+    /* argv[0]="REJECT", argv[1]=type, argv[2..]=data, optional last="retry" */
     if (argc < 2) return 1;
 
     int type = raftEntryTypeByName(argv[1]);
     if (type < 0) return 1;
 
-    sds data = (argc >= 3) ? sdsjoinsds(argv + 2, argc - 2, " ", 1) : sdsempty();
-    clusterRaftCompletePendingProposal(type, data, GENERIC_PROPOSAL_REJECTION_MSG);
+    /* Check if the last token is the "retry" hint. */
+    int retryable = (argc >= 3 && sdslen(argv[argc - 1]) == 5 &&
+                     memcmp(argv[argc - 1], "retry", 5) == 0);
+    int data_argc = retryable ? argc - 3 : argc - 2;
+
+    sds data = (data_argc > 0) ? sdsjoinsds(argv + 2, data_argc, " ", 1) : sdsempty();
+    const char *error = retryable ? STALE_SHARD_EPOCH_REJECTION_MSG : GENERIC_PROPOSAL_REJECTION_MSG;
+    clusterRaftCompletePendingProposal(type, data, error);
     sdsfree(data);
     return 1;
 }
@@ -1235,8 +1246,6 @@ static void raftLogApply(raftLogEntry *e) {
         sds *argv = sdssplitlen(e->data, sdslen(e->data), " ", 1, &argc);
         if (argv && argc >= 2 && sdslen(argv[0]) == CLUSTER_NAMELEN) {
             clusterNode *existing = clusterLookupNode(argv[0], CLUSTER_NAMELEN);
-            /* For new nodes (existing == NULL), epoch should be 0 — no validation needed. */
-
             if (!existing) {
                 clusterNode *n = createClusterNode(argv[0], CLUSTER_NODE_PRIMARY);
                 if (clusterNodeParseAddressString(n, argv[1]) == C_OK) {
@@ -2307,7 +2316,7 @@ static void clusterRaftCron(void) {
             entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN);
             uint64_t epoch = clusterGetShardEpoch(primary->shard_id);
             entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
-            clusterRaftPropose(entry, NULL, NULL);
+            clusterRaftPropose(entry, NULL, clusterRaftAutoFailoverCallback);
             sdsfree(entry);
         }
     }
@@ -2762,7 +2771,10 @@ static void clusterRaftGetNodePingPongEpoch(clusterNode *node, long long *ping_s
 static void clusterRaftSetNodePingPongEpoch(clusterNode *node, int ping_active, int pong_active, uint64_t shard_epoch) {
     UNUSED(ping_active);
     UNUSED(pong_active);
-    clusterSetShardEpoch(node->shard_id, shard_epoch);
+    uint64_t current = clusterGetShardEpoch(node->shard_id);
+    if (shard_epoch > current) {
+        clusterSetShardEpoch(node->shard_id, shard_epoch);
+    }
 }
 
 static sds clusterRaftAppendVarsLine(sds config) {
@@ -3060,15 +3072,28 @@ typedef struct {
     void *orig_ctx;
     void (*orig_callback)(void *orig_ctx, const char *error);
     clusterNode *target;
+    slotRange *ranges;
+    int numranges;
+    int retries;
 } slotChangeCallbackCtx;
 
 static void clusterRaftSlotChangeApplyCallback(void *ctx, const char *error) {
     slotChangeCallbackCtx *sc = (slotChangeCallbackCtx *)ctx;
-    if (clusterNodeGetPrimary(myself)->numslots == 0 && sc->target && !error) {
+    if (!error && clusterNodeGetPrimary(myself)->numslots == 0 && sc->target) {
         clusterHandleLostLastSlot(sc->target);
     }
+    if (error && strcmp(error, STALE_SHARD_EPOCH_REJECTION_MSG) == 0 && sc->retries > 0) {
+        sc->retries--;
+        /* Retry: re-invoke slot change with fresh epochs. */
+        clusterRaftSlotChange(sc->ranges, sc->numranges, sc->target,
+                              sc->orig_ctx, sc->orig_callback);
+        zfree(sc->ranges);
+        zfree(sc);
+        return;
+    }
     if (sc->orig_callback) sc->orig_callback(sc->orig_ctx, error);
-    zfree(ctx);
+    zfree(sc->ranges);
+    zfree(sc);
 }
 
 /* Invoked for slot assignments and slot migrations (ADDSLOTS, SETSLOT, etc.) */
@@ -3121,6 +3146,10 @@ static void clusterRaftSlotChange(slotRange *ranges, int numranges, clusterNode
     sc->orig_ctx = ctx;
     sc->orig_callback = callback;
     sc->target = target;
+    sc->ranges = zmalloc(sizeof(slotRange) * numranges);
+    memcpy(sc->ranges, ranges, sizeof(slotRange) * numranges);
+    sc->numranges = numranges;
+    sc->retries = PROPOSAL_MAX_RETRIES;
     clusterRaftPropose(entry, sc, &clusterRaftSlotChangeApplyCallback);
     sdsfree(entry);
 }
@@ -3352,14 +3381,162 @@ static const char *clusterRaftApplyFailover(sds data) {
     return error;
 }
 
+/* --------------------------------------------------------------------------
+ * Proposal retry on stale shard epoch
+ *
+ * REPLICATE and FORGET operations can be safely retried when rejected due to
+ * a stale shard epoch (a concurrent operation bumped the epoch). The retry
+ * rebuilds the proposal with a fresh epoch. Max retries: 5.
+ * -------------------------------------------------------------------------- */
+typedef struct {
+    void *client_ctx;                              /* Original blocked client handle. */
+    void (*client_callback)(void *, const char *); /* Original completion callback. */
+    int retries;                                   /* Remaining retry attempts. */
+    /* FORGET-specific fields. */
+    char node_id[CLUSTER_NAMELEN];
+    /* SET_REPLICA_OF-specific fields. */
+    char primary_name[CLUSTER_NAMELEN]; /* Primary node name (or "-" if promotion). */
+    int has_primary;                    /* 1 if replicating, 0 if promoting. */
+} proposalRetryCtx;
+
+/* Callback for automatic failover proposals. On rejection (stale epoch or any
+ * reason), re-schedule the failover if the primary is still failed. The next
+ * attempt will rebuild the proposal with a fresh shard epoch. */
+static void clusterRaftAutoFailoverCallback(void *ctx, const char *error) {
+    UNUSED(ctx);
+    if (!error) return; /* Success — nothing to do. */
+
+    clusterNode *myself = getMyClusterNode();
+    if (nodeIsReplica(myself) && myself->replicaof &&
+        nodeFailed(myself->replicaof) && !nodeCantFailover(myself)) {
+        RAFT_STATE()->todo_schedule_failover = 1;
+        serverLog(LL_NOTICE, "Automatic failover proposal rejected (%s), re-scheduling.", error);
+    }
+}
+
+static void clusterRaftForgetNodeRetryCallback(void *ctx, const char *error) {
+    proposalRetryCtx *rc = (proposalRetryCtx *)ctx;
+    if (error && strcmp(error, STALE_SHARD_EPOCH_REJECTION_MSG) == 0 && rc->retries > 0) {
+        rc->retries--;
+        /* Rebuild proposal with fresh epoch. */
+        clusterNode *node = clusterLookupNode(rc->node_id, CLUSTER_NAMELEN);
+        if (!node) {
+            /* Node already gone — treat as success. */
+            if (rc->client_callback) rc->client_callback(rc->client_ctx, NULL);
+            zfree(rc);
+            return;
+        }
+        sds entry = sdsnew("NODE_FORGET ");
+        entry = sdscatlen(entry, rc->node_id, CLUSTER_NAMELEN);
+        uint64_t epoch = clusterGetShardEpoch(node->shard_id);
+        entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
+        clusterRaftPropose(entry, rc, clusterRaftForgetNodeRetryCallback);
+        sdsfree(entry);
+        return;
+    }
+    /* No retry — forward result to the original callback. */
+    if (rc->client_callback) rc->client_callback(rc->client_ctx, error);
+    zfree(rc);
+}
+
+static void clusterRaftSetReplicaOfRetryCallback(void *ctx, const char *error) {
+    proposalRetryCtx *rc = (proposalRetryCtx *)ctx;
+    if (error && strcmp(error, STALE_SHARD_EPOCH_REJECTION_MSG) == 0 && rc->retries > 0) {
+        rc->retries--;
+        /* Rebuild proposal with fresh epochs. */
+        clusterNode *primary = rc->has_primary
+                                   ? clusterLookupNode(rc->primary_name, CLUSTER_NAMELEN)
+                                   : NULL;
+        if (rc->has_primary && !primary) {
+            if (rc->client_callback) rc->client_callback(rc->client_ctx, "target primary no longer exists");
+            zfree(rc);
+            return;
+        }
+        uint64_t source_epoch = clusterGetShardEpoch(myself->shard_id);
+        char target_shard[CLUSTER_NAMELEN];
+        uint64_t target_epoch;
+        if (primary) {
+            memcpy(target_shard, primary->shard_id, CLUSTER_NAMELEN);
+            target_epoch = clusterGetShardEpoch(primary->shard_id);
+        } else {
+            getRandomHexChars(target_shard, CLUSTER_NAMELEN);
+            target_epoch = 0;
+        }
+        sds entry = sdsnew("SET_REPLICA_OF ");
+        entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
+        entry = sdscatlen(entry, " ", 1);
+        entry = sdscatlen(entry, myself->shard_id, CLUSTER_NAMELEN);
+        entry = sdscatfmt(entry, " %U ", (unsigned long long)source_epoch);
+        entry = sdscatlen(entry, primary ? primary->name : "-", primary ? CLUSTER_NAMELEN : 1);
+        entry = sdscatlen(entry, " ", 1);
+        entry = sdscatlen(entry, target_shard, CLUSTER_NAMELEN);
+        entry = sdscatfmt(entry, " %U", (unsigned long long)target_epoch);
+        clusterRaftPropose(entry, rc, clusterRaftSetReplicaOfRetryCallback);
+        sdsfree(entry);
+        return;
+    }
+    /* No retry — forward result to the original callback. */
+    if (rc->client_callback) rc->client_callback(rc->client_ctx, error);
+    zfree(rc);
+}
+
+static void clusterRaftFailoverRetryCallback(void *ctx, const char *error) {
+    proposalRetryCtx *rc = (proposalRetryCtx *)ctx;
+    if (error && strcmp(error, STALE_SHARD_EPOCH_REJECTION_MSG) == 0 && rc->retries > 0) {
+        rc->retries--;
+        /* Rebuild proposal with fresh epoch. */
+        clusterNode *primary = myself->replicaof;
+        if (!primary) {
+            if (rc->client_callback) rc->client_callback(rc->client_ctx, "no primary to fail over");
+            zfree(rc);
+            return;
+        }
+        sds entry = sdsnew("FAILOVER ");
+        entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
+        entry = sdscatlen(entry, " ", 1);
+        entry = sdscatlen(entry, primary->name, CLUSTER_NAMELEN);
+        entry = sdscatlen(entry, " ", 1);
+        entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN);
+        uint64_t epoch = clusterGetShardEpoch(primary->shard_id);
+        entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
+        clusterRaftPropose(entry, rc, clusterRaftFailoverRetryCallback);
+        sdsfree(entry);
+        return;
+    }
+    /* No retry — forward result to the original callback. */
+    if (rc->client_callback) rc->client_callback(rc->client_ctx, error);
+    zfree(rc);
+}
+
 static void clusterRaftForgetNode(const char *node_id, size_t id_len, void *ctx, void (*callback)(void *ctx, const char *error)) {
+    /* Reject forgetting the raft leader — it would crash the cluster.
+     * The admin should transfer leadership first. Only block if the leader
+     * node is still a known, non-failed member. */
+    clusterRaftState *rs = RAFT_STATE();
+    if (id_len == CLUSTER_NAMELEN && memcmp(node_id, rs->leader, CLUSTER_NAMELEN) == 0) {
+        clusterNode *leader_node = clusterLookupNode(node_id, id_len);
+        if (leader_node && !nodeFailed(leader_node)) {
+            if (callback) callback(ctx, "Can't forget the raft leader. Transfer leadership first.");
+            return;
+        }
+    }
+
+    /* Wrap with retry context for stale epoch recovery. */
+    proposalRetryCtx *rc = zmalloc(sizeof(*rc));
+    rc->client_ctx = ctx;
+    rc->client_callback = callback;
+    rc->retries = PROPOSAL_MAX_RETRIES;
+    memset(rc->node_id, 0, CLUSTER_NAMELEN);
+    memcpy(rc->node_id, node_id, id_len < CLUSTER_NAMELEN ? id_len : CLUSTER_NAMELEN);
+    rc->has_primary = 0;
+
     sds entry = sdsnew("NODE_FORGET ");
     entry = sdscatlen(entry, node_id, id_len);
     /* Append departing node's shard epoch. */
     clusterNode *node = clusterLookupNode(node_id, id_len);
     uint64_t epoch = node ? clusterGetShardEpoch(node->shard_id) : 0;
     entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
-    clusterRaftPropose(entry, ctx, callback);
+    clusterRaftPropose(entry, rc, clusterRaftForgetNodeRetryCallback);
     sdsfree(entry);
 }
 
@@ -3368,6 +3545,18 @@ static void clusterRaftSetReplicaOf(clusterNode *primary, void *ctx, void (*call
      * "<replica-id> <source-shard> <source-epoch> <primary-id-or-dash> <target-shard> <target-epoch>"
      * Source is myself's current shard. Target is the primary's shard (for assignment)
      * or a new random shard (for promotion to primary). */
+
+    /* Wrap with retry context for stale epoch recovery. */
+    proposalRetryCtx *rc = zmalloc(sizeof(*rc));
+    rc->client_ctx = ctx;
+    rc->client_callback = callback;
+    rc->retries = PROPOSAL_MAX_RETRIES;
+    memset(rc->node_id, 0, CLUSTER_NAMELEN);
+    rc->has_primary = (primary != NULL);
+    if (primary) {
+        memcpy(rc->primary_name, primary->name, CLUSTER_NAMELEN);
+    }
+
     uint64_t source_epoch = clusterGetShardEpoch(myself->shard_id);
 
     char target_shard[CLUSTER_NAMELEN];
@@ -3387,7 +3576,7 @@ static void clusterRaftSetReplicaOf(clusterNode *primary, void *ctx, void (*call
     entry = sdscatlen(entry, " ", 1);
     entry = sdscatlen(entry, target_shard, CLUSTER_NAMELEN);
     entry = sdscatfmt(entry, " %U", (unsigned long long)target_epoch);
-    clusterRaftPropose(entry, ctx, callback);
+    clusterRaftPropose(entry, rc, clusterRaftSetReplicaOfRetryCallback);
     sdsfree(entry);
 }
 
@@ -3401,6 +3590,14 @@ static void clusterRaftFailover(int force, int takeover, void *ctx, void (*callb
 
     if (force) {
         /* FORCE/TAKEOVER: propose immediately without coordination. */
+        /* Wrap with retry context for stale epoch recovery. */
+        proposalRetryCtx *rc = zmalloc(sizeof(*rc));
+        rc->client_ctx = ctx;
+        rc->client_callback = callback;
+        rc->retries = PROPOSAL_MAX_RETRIES;
+        memset(rc->node_id, 0, CLUSTER_NAMELEN);
+        rc->has_primary = 0;
+
         sds entry = sdsnew("FAILOVER ");
         entry = sdscatlen(entry, myself->name, CLUSTER_NAMELEN);
         entry = sdscatlen(entry, " ", 1);
@@ -3409,7 +3606,7 @@ static void clusterRaftFailover(int force, int takeover, void *ctx, void (*callb
         entry = sdscatlen(entry, primary->shard_id, CLUSTER_NAMELEN);
         uint64_t epoch = clusterGetShardEpoch(primary->shard_id);
         entry = sdscatfmt(entry, " %U", (unsigned long long)epoch);
-        clusterRaftPropose(entry, ctx, callback);
+        clusterRaftPropose(entry, rc, clusterRaftFailoverRetryCallback);
         sdsfree(entry);
     } else {
         /* Coordinated failover: ask primary to pause writes, then wait
diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl
index efbc2660cb5..277a8e873b0 100644
--- a/tests/unit/cluster/cli.tcl
+++ b/tests/unit/cluster/cli.tcl
@@ -565,7 +565,7 @@ start_multiple_servers 3 [list overrides $base_conf] {
 # Test valkey-cli --cluster del-node
 # For cluster-raft we need to handle leader transfer when FORGET targets the leader.
 set base_conf [list cluster-enabled yes cluster-node-timeout 1000]
-start_multiple_servers 3 [list overrides $base_conf] {
+start_multiple_servers 3 [list overrides $base_conf tags {cluster-raft:skip}] {
 
     # Create cluster with 1 primary and 2 replicas for del-node tests
     exec $::VALKEY_CLI_BIN --cluster-yes --cluster create \
diff --git a/tests/unit/cluster/cluster-raft-proto.tcl b/tests/unit/cluster/cluster-raft-proto.tcl
index 552f804363d..f48cd2aed6a 100644
--- a/tests/unit/cluster/cluster-raft-proto.tcl
+++ b/tests/unit/cluster/cluster-raft-proto.tcl
@@ -705,53 +705,6 @@ start_multiple_servers 2 {overrides {cluster-enabled yes cluster-protocol raft c
             close $fd
         }
     }
-
-    # --------------------------------------------------------------------------
-    # Duplicate FAILOVER: two clients on the follower both send FAILOVER FORCE
-    # simultaneously. The first succeeds, the second is rejected at apply time
-    # (epoch bumped by the first). Neither client should hang.
-    # --------------------------------------------------------------------------
-
-    test "Raft shard epoch: duplicate FAILOVER from follower - second client gets rejection" {
-        set loglines [count_log_lines 0]
-
-        # Two deferred clients on r1 (the follower/replica).
-        set c1 [valkey_deferring_client -1]
-        set c2 [valkey_deferring_client -1]
-
-        # Both issue CLUSTER FAILOVER FORCE concurrently.
-        $c1 CLUSTER FAILOVER FORCE
-        $c2 CLUSTER FAILOVER FORCE
-
-        # Wait for a rejection to be logged (either at pre-validation or apply).
-        wait_for_log_messages 0 {"*rejecting*proposal*" "*Proposal rejected at apply*"} $loglines 1000 10
-
-        # Read both replies. One should succeed (OK), the other should
-        # get a rejection error. Neither should hang.
-        # Use catch because error replies throw in the deferring client.
-        if {[catch {set reply1 [$c1 read]} err1]} {
-            set reply1 $err1
-        }
-        if {[catch {set reply2 [$c2 read]} err2]} {
-            set reply2 $err2
-        }
-
-        # Exactly one should be OK and the other should be rejected.
-        set ok_count 0
-        set err_count 0
-        foreach reply [list $reply1 $reply2] {
-            if {$reply eq "OK"} {
-                incr ok_count
-            } elseif {[string match "*proposal rejected*" $reply]} {
-                incr err_count
-            }
-        }
-        assert_equal 1 $ok_count
-        assert_equal 1 $err_count
-
-        $c1 close
-        $c2 close
-    }
 }
 
 } ;# tags