diff --git a/.config/typos.toml b/.config/typos.toml
index 10103279c57..ff90d3a679d 100644
--- a/.config/typos.toml
+++ b/.config/typos.toml
@@ -15,11 +15,12 @@ optin = "optin"
 smove = "smove"
 Parth = "Parth" # seems like the spellchecker does not like it is similar to "Path"
 nd = "nd"
+threadsave = "threadsave"
 
 [default]
 extend-ignore-re = [
-    "SELECTed",
-    "WATCHed",
+    "[A-Z]{2,}ed", # SELECTed, WATCHed, etc.
+    "[A-Z]{2,}s",  # SELECTs, etc.
 ]
 
 [type.c]
@@ -64,6 +65,7 @@ pathc = "pathc"
 pn = "pn"
 seeked = "seeked"
 tre = "tre"
+dbe = "dbe"
 
 [type.systemd.extend-words]
 # systemd = .conf
diff --git a/src/Makefile b/src/Makefile
index 2c78f95986e..98f49108e46 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -457,6 +457,7 @@ ENGINE_SERVER_OBJ = \
     allocator_defrag.o \
     anet.o \
     aof.o \
+	bgiteration.o \
     bio.o \
     bitops.o \
     blocked.o \
diff --git a/src/bgiteration.c b/src/bgiteration.c
new file mode 100644
index 00000000000..ed6ac40bddc
--- /dev/null
+++ b/src/bgiteration.c
@@ -0,0 +1,2728 @@
+#include "fmacros.h"
+#include "bgiteration.h"
+#include "dict.h"
+#include "fifo.h"
+#include "kvstore.h"
+#include "monotonic.h"
+#include "mutexqueue.h"
+#include "server.h"
+
+int getFlushCommandFlags(client *c, int *flags);                                                        // in db.c
+uint64_t dictObjHash(const void *key);                                                                  // in server.c
+int dictObjKeyCompare(const void *key1, const void *key2);                                              // in server.c
+size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid);                             // in object.c
+robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds key, long long expire); // in object.c
+
+
+// Non-public hashtable/kvstore functions...
+bool hashtableInternalFindBucketIdx(hashtable *ht, void *key, int *table_idx, size_t *bucket_idx);
+void hashtableInternalIteratorGetBucketIdx(hashtableIterator *iterator, int *table_idx, size_t *bucket_idx);
+bool hashtableInternalIteratorIsBucketIdxComplete(hashtableIterator *iterator);
+hashtableIterator *kvstoreInternalIteratorGetCurrentHashtableIterator(kvstoreIterator *kvs_it);
+
+
+static bool receiveItemsBackFromOneIterator(bgIterator *it); // in bgiteration.c - forward declaration
+
+// ################  TEMP COMPILE HACKS   ###########################
+// Issue found.  server.db has changed from an array of db to an array of pointers to db (change all refs to server.db)
+// Issue: iterators (kvstore/hashtable) are not safe across event loop invocations.  Hashtable (kvstore?) needs to track and maintain safe iterators.
+
+
+// Placeholder hook: reports whether a key should be left out of iteration.  Nothing is excluded today.
+static bool ignoreKeyForSave(const_sds key) {
+    UNUSED(key);
+    return false;
+}
+
+//------- END OF COMPILE HACKS -------------------
+
+
+// True when cmd is one of the script-invoking commands (FCALL, EVAL, EVALSHA), which may replicate.
+static bool isScriptCallWriteCmd(struct serverCommand *cmd) {
+    return cmd->proc == fcallCommand || cmd->proc == evalCommand || cmd->proc == evalShaCommand;
+}
+
+// Commands treated as writes: anything flagged CMD_WRITE, EXEC, the script-calling commands, and
+//  PFCOUNT (not flagged CMD_WRITE, yet it modifies the underlying string and replicates as a write).
+static bool isWriteCmd(struct serverCommand *cmd) {
+    return (cmd->flags & CMD_WRITE) != 0 || cmd->proc == pfcountCommand || cmd->proc == execCommand || isScriptCallWriteCmd(cmd);
+}
+
+// True for the deletion based commands DEL and UNLINK.
+static bool isDeleteCmd(struct serverCommand *cmd) {
+    return cmd->proc == delCommand || cmd->proc == unlinkCommand;
+}
+
+
+static bool onValkeyMainThread(void) {
+    return pthread_equal(pthread_self(), server.main_thread_id) != 0; // pthread_equal() is non-zero on a match
+}
+
+/* Extract a database id from a command argument.
+ * Returns true and stores the id in *db_id when the argument is an integer within
+ * [0, server.dbnum); otherwise returns false and leaves *db_id untouched.
+ */
+static bool getDbIdFromRobj(robj *obj, int *db_id) {
+    long long parsed;
+    bool valid = (getLongLongFromObject(obj, &parsed) == C_OK) && (parsed >= 0) && (parsed < server.dbnum);
+    if (valid) *db_id = (int)parsed;
+    return valid;
+}
+
+/* Determine which DB a COPY command would write to.
+ * *target_dbid starts as selected_dbid and is overwritten by each valid "DB <id>" option.
+ * Returns false when the optional arguments fail to parse (the command would not run). */
+static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid, int *target_dbid) {
+    const int first_option_idx = 3; // optional arguments start at argv[3]
+
+    *target_dbid = selected_dbid;
+
+    int i = first_option_idx;
+    while (i < argc) {
+        const char *opt = (char *)objectGetVal(argv[i]);
+        if (strcasecmp(opt, "replace") == 0) {
+            i++;
+            continue;
+        }
+        /* Anything else must be "DB <id>".  The parsing here needs to perfectly match what we
+         * have in Valkey OSS for COPY.  The following command is considered OK by Valkey 8.1,
+         * so we can't stop at the first DB option; the last one is the one effectively used:
+         *    COPY key1 key2 db 1 db 2 db 3    // (This will use db 3)
+         */
+        bool has_value = (i + 1 < argc);
+        if (strcasecmp(opt, "db") != 0 || !has_value) return false;   // parse failure
+        if (!getDbIdFromRobj(argv[i + 1], target_dbid)) return false; // parse failure
+        i += 2; // option name plus its value
+    }
+    return true;
+}
+
+/* Validate a SWAPDB invocation and extract its two DB ids.
+ * When permission_client is non-NULL, that client's user must pass the ACL check for SWAPDB.
+ * Returns true (filling *id1_p / *id2_p) only if the command would be executed. */
+bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, int *id1_p, int *id2_p) {
+    static struct serverCommand *swapdb_cmd = NULL; // looked up on first use, then cached
+
+    // Permissions are not checked in the replication phase (no client supplied)
+    if (permission_client != NULL) {
+        if (swapdb_cmd == NULL) {
+            swapdb_cmd = lookupCommandByCString("swapdb");
+            serverAssert(swapdb_cmd != NULL);
+        }
+
+        int idx_out;
+        int acl_result = ACLCheckAllUserCommandPerm(permission_client->user, swapdb_cmd, argv, argc,
+                                                    permission_client->db->id, &idx_out);
+        if (acl_result != ACL_OK) return false;
+    }
+
+    if (argc != 3 || server.cluster_enabled) return false;
+
+    long long first, second;
+    if (getLongLongFromObject(argv[1], &first) != C_OK) return false;
+    if (getLongLongFromObject(argv[2], &second) != C_OK) return false;
+    if (first < 0 || first >= server.dbnum) return false;
+    if (second < 0 || second >= server.dbnum) return false;
+    if (first == second) return false;  // Valid, but doesn't do anything
+
+    *id1_p = (int)first;
+    *id2_p = (int)second;
+    return true;
+}
+
+/* Validate a SELECT invocation and extract the requested DB id.
+ * When permission_client is non-NULL, that client's user must pass the ACL check for SELECT.
+ * Returns true (filling *dbid_p) only if the command would be executed. */
+bool getParamsForSelect(int argc, robj **argv, client *permission_client, int *dbid_p) {
+    static struct serverCommand *select_cmd = NULL; // looked up on first use, then cached
+
+    // Permissions are not checked in the replication phase (no client supplied)
+    if (permission_client != NULL) {
+        if (select_cmd == NULL) {
+            select_cmd = lookupCommandByCString("select");
+            serverAssert(select_cmd != NULL);
+        }
+
+        int idx_out;
+        int acl_result = ACLCheckAllUserCommandPerm(permission_client->user, select_cmd, argv, argc,
+                                                    permission_client->db->id, &idx_out);
+        if (acl_result != ACL_OK) return false;
+    }
+
+    if (argc != 2) return false;
+    long long requested;
+    if (getLongLongFromObject(argv[1], &requested) != C_OK) return false;
+    if (requested < 0 || requested >= server.dbnum) return false;
+
+    *dbid_p = (int)requested;
+    return true;
+}
+
+
+/* DictType for SDS->ptr.  The SDS key is only referenced, so no destructor is set. */
+static dictType sdsrefToPtrDictType = {
+    .entryGetKey = dictEntryGetKey,
+    .hashFunction = dictSdsHash,
+    .keyCompare = dictSdsKeyCompare
+};
+
+
+/* Adapter that lets decrRefCount() be used where a callback taking void* is required. */
+static void decrRefCountVoid(void *o) {
+    decrRefCount((robj *)o);
+}
+
+
+/* Build a debugging string from argc/argv: each argument wrapped in single quotes, then a space. */
+static sds createSdsFromClientArgv(int argc, robj **argv) {
+    sds joined = sdsempty();
+    for (int remaining = argc; remaining > 0; remaining--, argv++) {
+        robj *decoded = getDecodedObject(*argv); // some objects are int encoded
+        joined = sdscatprintf(joined, "'%s' ", (char *)objectGetVal(decoded));
+        decrRefCount(decoded);
+    }
+    return joined;
+}
+
+
+//###########################################################################
+
+
+/* bgIteration internal (compile time) configuration values; the two CLONE limits seed the unit-test tweakable statics */
+enum {
+    BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE = 16384,    // Prevent initial rehashing
+    BGITER_MAX_CLONE_ITEM_BYTES = 512,                 // Max size item to clone
+    BGITER_MAX_CLONE_POOL_BYTES = (1 * 1024 * 1024),   // Total limit for all cloned items
+    BGITER_QUEUE_INCREASE_INCR = 100,                  // Step size when increasing queue target
+    BGITER_CYCLE_DELAY_MS = 2,                         // Delay between calls on bgIteration timer
+    BGITER_CYCLE_BUDGET_MS = 1,                        // Normal time limit for timer processing
+    BGITER_CYCLE_BUDGET_MAX_MS = 10                    // Maximum time limit when starvation seen
+};
+
+// Effective cloning limits: start at the compile-time defaults, can be tweaked by unit tests
+static int bgiter_max_clone_item_bytes = BGITER_MAX_CLONE_ITEM_BYTES;
+static int bgiter_max_clone_pool_bytes = BGITER_MAX_CLONE_POOL_BYTES;
+
+void bgIteration_unitTestDisableCloning(void) { // both limits drop to zero
+    bgiter_max_clone_pool_bytes = 0;
+    bgiter_max_clone_item_bytes = 0;
+}
+void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes) { // install caller-chosen limits
+    bgiter_max_clone_pool_bytes = pool_bytes;
+    bgiter_max_clone_item_bytes = item_bytes;
+}
+
+typedef enum { // presumably the values held by bgIterator.iteration_type -- TODO confirm
+    BGITERATION_TYPE_NONE,
+    BGITERATION_TYPE_FULLSCAN,   // full scan
+    BGITERATION_TYPE_CLUSTERSLOT // cluster slot
+} bgIterationType;
+
+/* Extensions to bgIteratorItemType.  These enumerations are used internally, and are not part of
+ *  the published interface.  These allow for extensibility in the internal information-passing
+ *  between the Valkey main thread and the iteration client thread. */
+typedef enum {
+    /* Indicates that the iteration client has completed use of the bgIterator and that the
+     * bgIterator should be cleaned up and freed by the Valkey main thread. */
+    BGITERATOR_ITEMEXT_ITER_CLOSED = 10 // NOTE(review): presumably 10 avoids the public bgIteratorItemType values - confirm
+} bgIteratorItemTypeExtended;
+
+/* Item for bgIteratorItemTypeExtended.BGITERATOR_ITEMEXT_ITER_CLOSED.  Used to pass a bgIterator
+ * back to the Valkey main thread for cleanup/release. */
+typedef struct {
+    bgIteratorItemTypeExtended type; // item type tag
+    bgIterator *iter;                // the iterator being handed back
+} bgIteratorItemExtClose;
+
+/* resizeAllowed callback for dictEntryPtrDictType.  That dict grows and shrinks constantly during
+ * the iteration, so there is no point in rehashing it all the time. */
+static int neverShrink(size_t moreMem, double usedRatio) {
+    UNUSED(moreMem);
+    return (usedRatio > 0.5) ? 1 : 0; // Allow the resize only if expanding
+}
+
+// A dictionary with a pointer (itself) as a key (the address pointed to is NOT referenced).
+//  Nothing is duplicated, this is a very fast dictionary, but potentially unsafe if the original
+//  items are deleted or moved.
+// WARNING:  Can't have active defrag running!  It might reallocate memory blocks, swapping their
+//           pointer values!  A check must be made in active defrag to ensure that no iteration is
+//           active.
+
+// Thomas Wang's 64-bit mix, applied to the pointer value itself (the pointee is never read)
+static uint64_t pointerHash(const void *key) {
+    uint64_t h = (uint64_t)(uintptr_t)key;
+    h = (h << 21) - h - 1; // identical to (~h) + (h << 21) in modulo-2^64 arithmetic
+    h ^= h >> 24;
+    h *= 265; // (h + (h << 3)) + (h << 8)
+    h ^= h >> 14;
+    h *= 21; // (h + (h << 2)) + (h << 4)
+    h ^= h >> 28;
+    h += h << 31;
+    return h;
+}
+
+static int pointerCompare(const void *key1, const void *key2) {
+    return (key1 == key2) ? 1 : 0; // identity comparison: non-zero only for the very same address
+}
+
+static dictType dictEntryPtrDictType = { // keys are raw pointers, hashed and compared by address
+    .entryGetKey = dictEntryGetKey,
+    .hashFunction = pointerHash,
+    .keyCompare = pointerCompare,
+    .resizeAllowed = neverShrink // resize permitted only when usedRatio > 0.5
+};
+
+// A TEMP set of robj's (of type sds), used as robj->NULL.  Only suitable for temporary sets: no
+//  destructor is configured and the robj's are not ref-counted at insertion/deletion.
+static dictType tempKeysetDictType = {
+    .entryGetKey = dictEntryGetKey,
+    .hashFunction = dictObjHash,
+    .keyCompare = dictObjKeyCompare
+};
+
+typedef struct genericIterator genericIterator;
+typedef void   (*iteratorReleaseFunc)       (genericIterator *genIt);
+typedef fifo * (*iteratorGetEntriesFunc)    (genericIterator *genIt, int *orig_dbid, int *cur_dbid);
+typedef void   (*iteratorSwapDbFunc)        (genericIterator *genIt, int db1, int db2);
+typedef void   (*iteratorFlushDbFunc)       (genericIterator *genIt, int cur_dbid);
+typedef bool   (*iteratorHasPassedItemFunc) (genericIterator *genIt, const_sds key, int cur_dbid);
+typedef int    (*iteratorOriginalDbFunc)    (genericIterator *genIt, int cur_dbid);
+typedef bool   (*iteratorIsKeyInScopeFunc)  (genericIterator *genIt, const_sds key);
+
+// Function pointers supporting polymorphic iterator implementation.  Concrete iterators embed this as their first member.
+struct genericIterator {
+    iteratorReleaseFunc         release;       // free the iterator and everything it owns
+    iteratorGetEntriesFunc      getEntries;    // next fifo of entries plus its original/current DB id; NULL when done
+    iteratorSwapDbFunc          swapDb;        // record that the DBs at indexes db1 and db2 were swapped
+    iteratorFlushDbFunc         flushDb;       // record that the DB currently at cur_dbid was flushed
+    iteratorHasPassedItemFunc   hasPassedItem; // has the scan position already moved beyond this key?
+    iteratorOriginalDbFunc      originalDb;    // map a current DB index to its index at iteration start
+    iteratorIsKeyInScopeFunc    isKeyInScope;  // is this key covered by the iteration at all?
+};
+
+typedef struct itemListNode { // link overlaid on a released bgIteratorItem while it sits on the free list
+    struct itemListNode *next;
+} itemListNode;
+
+static itemListNode *freeItemStackHead = NULL; // top of the stack of recycled items (NULL when empty)
+
+static void itemFreeList_returnItemBackToFreeList(bgIteratorItem* item) {
+    itemListNode *recycled = (itemListNode*)item; // the item's own storage doubles as the list link
+    recycled->next = freeItemStackHead;
+    freeItemStackHead = recycled;
+}
+
+/* Obtain storage for a bgIteratorItem: a recycled item from the free list when one is available,
+ * otherwise a fresh allocation.  The returned item is not initialized here. */
+static bgIteratorItem *itemFreeList_getElementOrAllocate(void) {
+    itemListNode *top = freeItemStackHead;
+    if (top == NULL) {
+        return zmalloc(sizeof(bgIteratorItem)); // free list is empty
+    }
+
+    // Pop the top node off the free stack
+    freeItemStackHead = top->next;
+    if (freeItemStackHead != NULL) {
+        // Prefetch the new head (presumably to speed up the next pop)
+        valkey_prefetch(freeItemStackHead);
+    }
+
+    return (bgIteratorItem*)top;
+}
+ 
+static void itemFreeList_release(void) {
+    // Free every cached item; the free list is empty afterwards
+    for (itemListNode *node = freeItemStackHead; node != NULL; node = freeItemStackHead) {
+        freeItemStackHead = node->next;
+        zfree(node);
+    }
+}
+
+// This struct is used across threads.  Unless otherwise noted, the fields are initialized at
+//  iterator creation (within the main thread) and are read-only by the client thread.
+struct bgIterator {
+    sds name;                        // Iterator name
+    bgIteratorReplDoneFunc repldone; // Optional repldone function to be run on the main thread
+    bgIteratorCleanupFunc cleanup;   // Optional cleanup function to be run on main thread
+    void *privdata;                  // Client's private data to be passed to cleanup function
+
+    int iteration_flags;                 // Consistent and/or Replication
+    int iteration_type;                  // Full scan or cluster slot
+    uint32_t consistent_modification_id; // iterator epoch at time of iterator creation
+
+    genericIterator *keyset_iter; // Low-level iterator (polymorphic)
+
+    dict *early_iterate_entries; // Used to keep track of what items have already been iterated
+                                 // over by out-of-order expedited process, ensuring a bgIterator
+                                 // does not try to reprocess items.
+                                 // Used only by main thread.
+                                 // dictEntry -> NULL
+
+    mutexQueue *items_for_iterator; // Created/Destroyed in main thread, used in both (threadsafe)
+
+    mutexQueue *return_to_valkey; // Queue of items to be returned to the Valkey main thread (threadsafe)
+
+    unsigned int item_count_target; // Used only by main thread
+
+    bgIteratorItem *volatile current_item; // current_item is normally only used in the iteration client.
+                                           //  It's marked volatile here only to support snooping from the
+                                           //  main thread when handling a FLUSHDB command.  This prevents
+                                           //  the compiler from generating code which might read the
+                                           //  pointer multiple times (when it's coded to read only once).
+                                           // This is a volatile POINTER to a non-volatile item ("volatile"
+                                           //  before the type would instead mean a pointer to a volatile item).
+                                           // NOTE(review): volatile gives no atomicity or ordering between
+                                           //  threads; confirm the FLUSHDB snoop does not depend on either.
+
+    bool client_is_active; // Set to true when client performs 1st read
+    bool completed;        // Set to true in main thread when last item from iteration has
+                           //  been queued to the client.  No additional items will be
+                           //  enqueued to the client after this has been set.
+
+    volatile bool terminated; // Set to true in main thread when iteration is to be killed
+                              // Set to true in iteration client when it decides to end early
+
+    bool cur_cmd_may_replicate; // Used only in main thread during command processing
+
+    // Variables maintaining runtime statistics
+    unsigned long dbentries_queued;         // Updated by main thread
+    unsigned long dbentries_processed;      // Updated by client thread
+    unsigned long replication_queued;       // Updated by main thread
+    unsigned long replication_processed;    // Updated by client thread
+    unsigned long swapdb_queued;            // Updated by main thread
+    unsigned long swapdb_processed;         // Updated by client thread
+    unsigned long flushdb_queued;           // Updated by main thread
+    unsigned long flushdb_processed;        // Updated by client thread
+    unsigned long dbentry_clones_queued;    // Updated by main thread
+    unsigned long dbentry_clones_processed; // Updated by client thread
+    monotime monotonic_start_time;          // Time iteration started
+
+    volatile monotime monotonic_item_start_time; // The item start time is set in the iteration client.  It is
+                                                 //  marked volatile as it can be read from the main thread by
+                                                 //  bgIteratorGetStatus.  If 0, this indicates that the
+                                                 //  iteration client is waiting for an item to process.
+};
+
+
+// These static values are only accessed from the main Valkey thread.
+
+static list *allIterators;   // list of bgIterator
+static dict *nameToIterator; // bgIterator->name -> bgIterator
+
+// Shared by all iterators: maps a dbEntry pointer to the number of in-use references held on it
+static dict *inUseEntries; // dbEntry -> ref count
+
+// Key values in the current command which don't exist in the DB yet.  Needed for determination of
+//  replication for NON-consistent iterations.
+static list *curCmdMissingKeys; // list of robj
+
+// A counter of the total amount of memory used for buffered replication data.
+//  This amount is excluded when computing the need for evictions.
+static ssize_t bufferedReplicationBytes;
+
+// Running total (in bytes) of the memory currently allocated for cloned items
+static ssize_t bgiteration_current_clone_memory_pool_size;
+
+// Snapshot of the last queue size, used to seed the next queue.
+// We assume all bgIterators consume items at the same rate.
+static int last_item_count_target;
+
+// Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID)
+static long long bgIterator_timeproc_id;
+
+// Incremented on each new iteration, this is updated in dbEntry metadata whenever an entry is modified.
+static uint32_t bgIteration_epoch = 1;
+
+
+// BgIteration debug captures BgIteration activity to a large sds buffer.  When an iterator is
+//  completed, the entire buffer is written to a file in the current working directory.  Note that
+//  enough memory must be available to hold the ENTIRE capture.  It isn't written incrementally to
+//  a file as the file I/O is more likely to affect timing.
+// Future implementation: the current design is most useful for a single iterator.  When items are
+//  queued to an iterator, the iterator name is not recorded (to save space).
+// Developer note: using a CONST value here allows the compiler to completely remove all of the
+//  debugging code at compile time.  There is no run-time performance overhead when set to FALSE.
+//  This is essentially like an IFDEF, however, it's better as it forces the compiler to validate
+//  syntax.
+static const bool BGITERATION_DEBUG = false; // DO NOT SUBMIT WITH THIS SYMBOL SET TO TRUE!
+static sds debugBuffer;
+
+
+
+//=============================================================================================
+//                        Full Scan Iterator
+//=============================================================================================
+/* The full scan iterator performs the actual iteration over the Valkey keyset.  The iterator is
+ * only used from within the Valkey main thread.  Iteration proceeds one DB at a time, based on
+ * the DB ordering at the time of iterator creation.  Each time the iterator returns items, all
+ * of the dictionary entries from a single hash bucket are returned.
+ */
+
+struct fullScanIterator {
+    genericIterator callbacks; // (must be first item)
+
+    // Array of mapping from original DB ID (at the time of iteration start) to that DB's
+    //  current index.  So, if the DB which was DB-0 is now at index 6, orig_to_cur_db[0]==6.
+    int *orig_to_cur_db;
+
+    // The reverse of the above array.  This maps a current DB index to its original index
+    //  (at the time of iteration start).
+    int *cur_to_orig_db;
+
+    // This is the DB we are currently iterating over.  This is relative to the ORIGINAL
+    //  DB ordering, at the time of iterator creation.  Iteration proceeds from 0..N based on
+    //  the original ordering.  (Starts at -1, before the first DB has been opened.)
+    int iter_db;
+
+    // Iterator for the DB orig_to_cur_db[iter_db]; NULL before the first DB is opened and once a DB's scan has ended
+    kvstore *kvs; // kvstore that iter_dbi walks (set and cleared together with iter_dbi)
+    kvstoreIterator *iter_dbi;
+};
+
+static void fullScanIteratorRelease(genericIterator *genIt) {
+    struct fullScanIterator *fs = (struct fullScanIterator *)genIt;
+    if (fs->iter_dbi != NULL) kvstoreIteratorRelease(fs->iter_dbi); // only present while a DB scan is open
+    zfree(fs->cur_to_orig_db);
+    zfree(fs->orig_to_cur_db);
+    zfree(fs);
+}
+
+// Returns a fifo holding the next batch of dbEntry pointers and sets *orig_dbid / *cur_dbid, or NULL once the scan is complete.
+static fifo * fullScanIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    if (it->iter_db >= server.dbnum) return NULL; // Finished scanning
+    fifo *dbEntryFifo = fifoCreate();
+    while (fifoLength(dbEntryFifo) == 0) { // keep going until at least one entry has been collected
+        while (it->iter_dbi == NULL) { // no DB scan open: advance (in ORIGINAL order) to the next DB
+            if (++it->iter_db >= server.dbnum) {
+                fifoRelease(dbEntryFifo);
+                return NULL; // Iteration complete
+            }
+            serverDb *db = server.db[it->orig_to_cur_db[it->iter_db]];
+            if (db != NULL) { // a NULL slot is skipped: iter_dbi stays NULL and the loop moves on
+                it->kvs = db->keys;
+                it->iter_dbi = kvstoreIteratorInit(it->kvs, HASHTABLE_ITER_SAFE);
+            }
+        }
+        // Collect entries until the hashtable iterator reports the current bucket index complete
+        hashtableIterator *ht_it = NULL;
+        do {
+            dbEntry *de;
+            if (!kvstoreIteratorNext(it->iter_dbi, (void **)&de)) {
+                kvstoreIteratorRelease(it->iter_dbi); // this DB is exhausted; iter_db keeps naming it
+                it->kvs = NULL, it->iter_dbi = NULL;
+                break;
+            }
+
+            ht_it = kvstoreInternalIteratorGetCurrentHashtableIterator(it->iter_dbi);
+            if (ignoreKeyForSave(objectGetKey(de))) continue; // slot migration: keys being purged
+            fifoPush(dbEntryFifo, de);
+        } while (!hashtableInternalIteratorIsBucketIdxComplete(ht_it)); // ht_it was assigned before any path reaching here
+    }
+    *orig_dbid = it->iter_db;
+    *cur_dbid = it->orig_to_cur_db[*orig_dbid];
+    return dbEntryFifo;
+}
+
+static void fullScanIteratorSwapDb(genericIterator *genIt, int db1, int db2) {
+    struct fullScanIterator *fs = (struct fullScanIterator *)genIt;
+    int orig_in_db1 = fs->cur_to_orig_db[db1];
+    int orig_in_db2 = fs->cur_to_orig_db[db2];
+    fs->cur_to_orig_db[db1] = orig_in_db2; // the two current indexes exchange their original DBs...
+    fs->cur_to_orig_db[db2] = orig_in_db1;
+    fs->orig_to_cur_db[orig_in_db2] = db1; // ...and the reverse mapping is updated to match
+    fs->orig_to_cur_db[orig_in_db1] = db2;
+}
+
+static void fullScanIteratorFlushDb(genericIterator *genIt, int cur_dbid) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    int orig_db = it->cur_to_orig_db[cur_dbid];
+    // iter_dbi is NULL when the scan of iter_db has just finished: nothing is left to release then.
+    if (orig_db == it->iter_db && it->iter_dbi != NULL) {
+        // We are currently iterating on the DB that's being flushed.
+        kvstoreIteratorRelease(it->iter_dbi);
+        it->kvs = NULL, it->iter_dbi = NULL;
+    } // Iteration will continue with the next DB.
+}
+
+// Returns true when the scan position is already beyond where `key` lives (or would live) in the
+//  DB currently at index cur_dbid, and false when the scan has yet to reach it.
+static bool fullScanIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) {
+    struct fullScanIterator *it = (struct fullScanIterator *) genIt;
+    int orig_dbid = it->cur_to_orig_db[cur_dbid];
+
+    if (orig_dbid < it->iter_db) return true;  // Entire DB has already been processed
+    if (orig_dbid > it->iter_db) return false; // Haven't started this DB yet
+    // Now, orig_dbid == it->iter_db
+
+    if (it->iter_dbi == NULL) return true; // just finished this DB
+
+    // We're in the middle of processing a DB.  In cluster-mode, the DB is divided into 1 hashtable
+    //  per slot.  In cluster-mode-disabled, we treat all keys as in slot 0.
+    int keySlot = server.cluster_enabled ? getKeySlot((sds)key) : 0;
+    if (keySlot < kvstoreIteratorGetCurrentHashtableIndex(it->iter_dbi)) return true;
+    if (keySlot > kvstoreIteratorGetCurrentHashtableIndex(it->iter_dbi)) return false;
+
+    // At this point, we're down to a specific hashtable.
+
+    hashtable *iter_current_ht = kvstoreGetHashtable(it->kvs, keySlot);
+    int table; // 0 or 1 (supporting rehashing)
+    size_t index; // bucket number within the hashtable
+    // If key doesn't exist, we consider it passed - we MIGHT have iterated over it had it existed.
+    if (!hashtableInternalFindBucketIdx(iter_current_ht, (void *)key, &table, &index)) return true;
+
+    hashtableIterator *htIter = kvstoreInternalIteratorGetCurrentHashtableIterator(it->iter_dbi);
+    int iter_table;
+    size_t iter_index;
+    hashtableInternalIteratorGetBucketIdx(htIter, &iter_table, &iter_index);
+    if (table < iter_table) return true;  // iteration in table 1, but item is in table 0
+    if (table > iter_table) return false; // iteration in table 0, but item is in table 1
+    // bgIterator processes buckets atomically and the hashtableIterator points to the last returned
+    //  position, so the bucket at iter_index (and every earlier one) has already been processed.
+    if (index <= iter_index) return true;
+    if (ignoreKeyForSave(key)) return true; // if slot being purged, pretend we have passed it
+    return false;
+}
+
+static int fullScanIteratorOriginalDb(genericIterator *genIt, int cur_dbid) {
+    // Translate a current DB index into the index that DB had when the iteration started
+    return ((struct fullScanIterator *)genIt)->cur_to_orig_db[cur_dbid];
+}
+
+static bool fullScanIteratorIsKeyInScope(genericIterator *genIt, const_sds key) {
+    // All keys are in scope, so neither argument is consulted
+    UNUSED(key);
+    UNUSED(genIt);
+    return true;
+}
+
+static genericIterator * fullScanIteratorCreate(void) {
+    struct fullScanIterator *fs = zmalloc(sizeof(*fs));
+    fs->orig_to_cur_db = zmalloc(sizeof(int) * server.dbnum);
+    fs->cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum);
+    for (int db = 0; db < server.dbnum; db++) { // both mappings start out as the identity
+        fs->orig_to_cur_db[db] = fs->cur_to_orig_db[db] = db;
+    }
+    fs->iter_db = -1; // no DB opened yet; getEntries pre-increments this to 0
+    fs->kvs = NULL;
+    fs->iter_dbi = NULL;
+
+    fs->callbacks = (genericIterator){
+        .release = fullScanIteratorRelease,
+        .getEntries = fullScanIteratorGetEntries,
+        .swapDb = fullScanIteratorSwapDb,
+        .flushDb = fullScanIteratorFlushDb,
+        .hasPassedItem = fullScanIteratorHasPassedItem,
+        .originalDb = fullScanIteratorOriginalDb,
+        .isKeyInScope = fullScanIteratorIsKeyInScope,
+    };
+    return &fs->callbacks; // callbacks is the first member, so this is also the object's address
+}
+
+
+
+//=============================================================================================
+//                        Cluster Slot Iterator
+//=============================================================================================
+/* The cluster slot iterator performs iteration over one cluster slot of the Valkey keyset.  The
+ * iterator is only used from within the Valkey main thread.
+ */
+struct clusterSlotIterator { // placeholder: carries only the callback table so far
+    genericIterator callbacks; // (must be first item)
+};
+
+static void clusterSlotIteratorRelease(genericIterator *genIt) {
+    UNUSED(genIt);
+    serverAssert(false); // Not yet implemented
+}
+
+static fifo * clusterSlotIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) {
+    UNUSED(genIt);
+    UNUSED(orig_dbid);
+    UNUSED(cur_dbid);
+    serverAssert(false); // Not yet implemented (no return value: presumably serverAssert(false) never returns)
+}
+
+static void clusterSlotIteratorSwapDb(genericIterator *genIt, int db1, int db2) {
+    UNUSED(genIt);
+    UNUSED(db1);
+    UNUSED(db2);
+    serverAssert(false); // swap not valid in cluster mode
+}
+
+static void clusterSlotIteratorFlushDb(genericIterator *genIt, int cur_dbid) {
+    UNUSED(genIt);
+    UNUSED(cur_dbid);
+    serverAssert(false); // Not yet implemented
+}
+
+static bool clusterSlotIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) {
+    UNUSED(genIt);
+    UNUSED(key);
+    UNUSED(cur_dbid);
+    serverAssert(false); // Not yet implemented (no return value: presumably serverAssert(false) never returns)
+}
+
+static int clusterSlotIteratorOriginalDb(genericIterator *genIt, int cur_dbid) {
+    UNUSED(genIt);
+    UNUSED(cur_dbid);
+    return cur_dbid; // swap not supported in cluster mode
+}
+
+/* When checking if a command is in scope for this iterator, all of its keys should be either in
+ * scope or not. In cluster mode enabled a command cannot reference keys from different slots, so
+ * this assumption will always be true. */
+static bool clusterSlotIteratorIsKeyInScope(genericIterator *genIt, const_sds key) {
+    UNUSED(genIt);
+    UNUSED(key);
+    serverAssert(false); // Not yet implemented (no return value: presumably serverAssert(false) never returns)
+}
+
+static genericIterator * clusterSlotIteratorCreate(const int *slots, size_t slots_count) {
+    struct clusterSlotIterator *it = zmalloc(sizeof(struct clusterSlotIterator));
+    it->callbacks.release = clusterSlotIteratorRelease;
+    it->callbacks.getEntries = clusterSlotIteratorGetEntries;
+    it->callbacks.swapDb = clusterSlotIteratorSwapDb;
+    it->callbacks.flushDb = clusterSlotIteratorFlushDb;
+    it->callbacks.hasPassedItem = clusterSlotIteratorHasPassedItem;
+    it->callbacks.originalDb = clusterSlotIteratorOriginalDb;
+    it->callbacks.isKeyInScope = clusterSlotIteratorIsKeyInScope;
+
+    UNUSED(slots);
+    UNUSED(slots_count);
+    serverAssert(false); // Not yet implemented: the slot list is ignored and creation always asserts
+
+    return (genericIterator *)it;
+}
+
+
+
+//=============================================================================================
+//                        General iteration support (across all iterators)
+//=============================================================================================
+
+// While an item is potentially in use by a background thread, we can't have
+//  rehashing by the main thread.  Returns true if rehashing was paused, false when the
+//  entry's encoding is neither of the two handled here.
+static bool pauseRehashing(dbEntry *de) {
+    hashtable *ht;
+
+    if (de->encoding == OBJ_ENCODING_HASHTABLE) { // SET or HASH
+        ht = objectGetVal(de);
+    } else if (de->encoding == OBJ_ENCODING_SKIPLIST) { // SORTED SET
+        zset *zs = objectGetVal(de);
+        ht = zs->ht;
+    } else {
+        return false;
+    }
+
+    hashtablePauseRehashing(ht);
+    return true;
+}
+
+// Counterpart of pauseRehashing: resumes rehashing on the hashtable that backs the entry.
+// Encodings without a hashtable are left untouched.
+static void resumeRehashing(dbEntry *de) {
+    if (de->encoding == OBJ_ENCODING_HASHTABLE) {
+        // SET or HASH: the value itself is the hashtable
+        hashtableResumeRehashing(objectGetVal(de));
+        return;
+    }
+
+    if (de->encoding == OBJ_ENCODING_SKIPLIST) {
+        // SORTED SET: the hashtable is a member of the zset
+        zset *sortedSet = objectGetVal(de);
+        hashtableResumeRehashing(sortedSet->ht);
+    }
+}
+
+// `inUseEntries` maps each entry that is currently in use to a use count.  Entries found
+//  there should not be modified.
+// The first use of an entry takes a reference on it; later uses only bump the count.
+static void incrementEntryInuse(dbEntry *de) {
+    dictEntry *alreadyTracked;
+    dictEntry *freshlyTracked = dictAddRaw(inUseEntries, de, &alreadyTracked);
+
+    if (freshlyTracked == NULL) {
+        // Entry was already being tracked: one more user
+        long long users = dictGetSignedIntegerVal(alreadyTracked);
+        dictSetSignedIntegerVal(alreadyTracked, users + 1);
+        return;
+    }
+
+    // First user: pin the entry for as long as it is tracked
+    incrRefCount(de);
+    dictSetSignedIntegerVal(freshlyTracked, 1);
+}
+
+
+// Drops one use of `de` from `inUseEntries`.
+// When the last use goes away the entry is removed from the table and the reference taken
+//  by incrementEntryInuse is released (which may free the entry).
+// The entry must currently be tracked; calling this for an untracked entry is a bug.
+static void decrementEntryInuse(dbEntry *de) {
+    dictEntry *entry = dictFind(inUseEntries, de);
+    // dictFind returns NULL for an untracked entry.  Fail with a clear assertion rather
+    //  than dereferencing NULL below.
+    serverAssert(entry != NULL);
+
+    if (dictGetSignedIntegerVal(entry) == 1) {
+        dictDelete(inUseEntries, de);
+        decrRefCount(de);
+    } else {
+        serverAssert(dictGetSignedIntegerVal(entry) > 1);
+        dictSetSignedIntegerVal(entry, dictGetSignedIntegerVal(entry) - 1);
+    }
+}
+
+// Returns true when `de` is tracked in `inUseEntries` with a use count of exactly 1.
+// The entry must currently be tracked; calling this for an untracked entry is a bug.
+static bool isEntryInuseBySingleIterator(dbEntry *de) {
+    dictEntry *entry = dictFind(inUseEntries, de);
+    // dictFind returns NULL for an untracked entry.  Fail with a clear assertion rather
+    //  than dereferencing NULL below.
+    serverAssert(entry != NULL);
+    return dictGetSignedIntegerVal(entry) == 1;
+}
+
+// Returns true when `de` is present in `inUseEntries`, whatever its use count.
+static bool isEntryInuseByAnyIterator(dbEntry *de) {
+    dictEntry *tracked = dictFind(inUseEntries, de);
+    return tracked != NULL;
+}
+
+
+// Approximate size of a string entry: key length plus value length.
+// The remaining per-object overhead is ignored, it's minor & transient.
+static ssize_t computeStringDbEntrySize(dbEntry *de) {
+    sds key = objectGetKey(de);
+    size_t total = stringObjectLen(de);
+    total += sdslen(key);
+    return total;
+}
+
+
+// Attempts to make a private copy of `de`, charging its size to the clone memory pool.
+// Returns the clone, or NULL when the entry is not cloned because:
+//   - the pool could not absorb another maximum-sized item,
+//   - the entry is not a string, or is an int-encoded string,
+//   - the entry is larger than the per-item clone limit.
+static dbEntry *tryCloneDbEntry(dbEntry *de) {
+    // Require headroom for a maximum-sized item before even looking at the entry
+    if (bgiteration_current_clone_memory_pool_size + bgiter_max_clone_item_bytes
+            > bgiter_max_clone_pool_bytes) {
+        return NULL;
+    }
+
+    // Future optimization: Incorporate small ziplists, sorted sets, etc.
+    if (de->type != OBJ_STRING) return NULL;
+    // OBJ_ENCODING_INT is omitted only because there isn't a good API for cloning it yet.
+    if (de->encoding == OBJ_ENCODING_INT) return NULL;
+
+    ssize_t itemSize = computeStringDbEntrySize(de);
+    if (itemSize > bgiter_max_clone_item_bytes) return NULL;
+
+    // Charge the pool, then build the copy with the same key, value and expire
+    bgiteration_current_clone_memory_pool_size += itemSize;
+    sds value = objectGetVal(de);
+    dbEntry *clone = createStringObjectWithKeyAndExpire((char *)value, sdslen(value), objectGetKey(de), objectGetExpire(de));
+
+    // The clone carries over the source's iterator epoch
+    bgIterationEntryMetadata *sourceMeta = objectGetMetadata(de);
+    bgIterationEntryMetadata *cloneMeta = objectGetMetadata(clone);
+    cloneMeta->iterator_epoch = sourceMeta->iterator_epoch;
+
+    return clone;
+}
+
+
+// Releases a clone made by tryCloneDbEntry and credits its size back to the clone pool.
+// Only string entries are ever cloned, which is asserted here.
+static void freeClonedDictEntry(dbEntry *clonedEntry) {
+    serverAssert(clonedEntry->type == OBJ_STRING);
+
+    // Measure before dropping the reference, then add back to memory pool
+    ssize_t reclaimed = computeStringDbEntrySize(clonedEntry);
+    bgiteration_current_clone_memory_pool_size -= reclaimed;
+
+    decrRefCount(clonedEntry);
+}
+
+// Wraps a dbEntry in a DBENTRY iterator item taken from the item free list.
+// Non-cloned entries are registered as in use.  Rehashing of the entry's hashtable (if it
+//  has one) is paused, and whether it was paused is recorded on the item.
+static bgIteratorItem * makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) {
+    if (!isCloned) {
+        incrementEntryInuse(de);
+    }
+
+    bgIteratorItem *wrapped = itemFreeList_getElementOrAllocate();
+    bool rehashingPaused = pauseRehashing(de);
+
+    wrapped->type = BGITERATOR_ITEM_DBENTRY;
+    wrapped->dbid = dbid;
+    wrapped->u.dbe.de = de;
+    wrapped->u.dbe.is_cloned = isCloned;
+    wrapped->u.dbe.is_rehashing_paused = rehashingPaused;
+
+    return wrapped;
+}
+
+// Makes a shallow copy of an robj* array: the array is new, the objects are shared and
+//  each gains one reference.  Release the result with freeRobjArray.
+static robj ** cloneRobjArray(int argc, robj **argv) {
+    robj **copy = zmalloc(sizeof(*copy) * argc);
+
+    int idx = 0;
+    while (idx < argc) {
+        robj *shared = argv[idx];
+        copy[idx] = shared;
+        incrRefCount(shared);
+        idx++;
+    }
+
+    return copy;
+}
+
+
+// Drops one reference on each object in the array, then frees the array itself.
+static void freeRobjArray(int argc, robj **argv) {
+    robj **cursor = argv;
+    for (int remaining = argc; remaining > 0; remaining--) {
+        decrRefCount(*cursor);
+        cursor++;
+    }
+
+    zfree(argv);
+}
+
+
+// Called by iterator thread to release an item.
+// Updates the per-type "processed" counter for the iterator's current item and places the
+//  item on the return_to_valkey queue.  Does nothing when there is no current item.
+static void returnCurrentItemToValkey(bgIterator *it) {
+    bgIteratorItem *finished = it->current_item;
+    if (finished == NULL) return;
+
+    // Step 1: bookkeeping by item type
+    switch (finished->type) {
+        case BGITERATOR_ITEM_DBENTRY:
+            it->dbentries_processed++;
+            if (finished->u.dbe.is_cloned) it->dbentry_clones_processed++;
+            break;
+        case BGITERATOR_ITEM_REPLICATION:
+            it->replication_processed++;
+            break;
+        case BGITERATOR_ITEM_SWAPDB:
+            it->swapdb_processed++;
+            break;
+        case BGITERATOR_ITEM_FLUSHDB:
+            it->flushdb_processed++;
+            break;
+
+        case BGITERATOR_ITEM_COMPLETE:
+        case BGITERATOR_ITEM_TERMINATED:
+            // These are static and just used to wake the iterator - they should never be returned.
+            serverAssert(false);
+            break;
+
+        default:
+            serverAssert(false);
+    }
+
+    // Step 2: hand the item back
+    mutexQueueAdd(it->return_to_valkey, finished);
+
+    // Step 3: clear the slot.  Do this AFTER placing into return_to_valkey.  This is
+    //  volatile and snooped when there is a flushall event.  Don't want an item to be missed.
+    it->current_item = NULL;
+}
+
+
+
+//=============================================================================================
+//                        Background Iterator (private)
+//=============================================================================================
+
+// Tears down an iterator: removes it from the name table and the iterator list, then
+//  releases its queues, keyset iterator, early-iterate dict, name and the struct itself.
+// Must run on the main thread, with no current item and both queues empty.
+static void bgIteratorRelease(bgIterator *it) {
+    // Preconditions
+    serverAssert(onValkeyMainThread());
+    serverAssert(it->current_item == NULL);
+    serverAssert(mutexQueueLength(it->items_for_iterator) == 0);
+    serverAssert(mutexQueueLength(it->return_to_valkey) == 0);
+
+    // Unregister from the global bookkeeping
+    dictDelete(nameToIterator, it->name);
+    listNode *registration = listSearchKey(allIterators, it);
+    listDelNode(allIterators, registration);
+
+    // Release the two queues
+    mutexQueue *outbound = it->items_for_iterator;
+    mutexQueueRelease(outbound);
+    it->items_for_iterator = NULL;
+
+    mutexQueue *inbound = it->return_to_valkey;
+    mutexQueueRelease(inbound);
+    it->return_to_valkey = NULL;
+
+    // Release the keyset iterator through its own release callback
+    it->keyset_iter->release(it->keyset_iter);
+    it->keyset_iter = NULL;
+
+    dictRelease(it->early_iterate_entries);
+    it->early_iterate_entries = NULL;
+
+    // Finally the name and the iterator itself
+    sdsfree(it->name);
+    zfree(it);
+}
+
+
+// True while the iterator is neither completed nor terminated and its outbound queue is
+//  still shorter than the current target length.
+static bool shouldFeedIteratorMore(bgIterator *it) {
+    if (it->completed) return false;
+    if (it->terminated) return false;
+
+    return mutexQueueLength(it->items_for_iterator) < it->item_count_target;
+}
+
+
+// Debugging routine
+// Builds a printable description of an entry: "(dbid)'key'" followed by either the first
+//  characters of a string value (with "..." if truncated) or the numeric type.
+// The caller owns the returned sds.
+static sds createEntryString(int dbid, dbEntry *de) {
+    sds key = objectGetKey(de);
+    sds text = sdscatprintf(sdsempty(), "(%d)'%s'", dbid, key);
+
+    if (de->type != OBJ_STRING) {
+        return sdscatprintf(text, " : type(%d)", de->type);
+    }
+
+    const unsigned valuePrintLen = 20;
+    robj *decoded = getDecodedObject(de); // might be encoded as int
+    sds value = objectGetVal(decoded);
+
+    text = sdscatprintf(text, " : '%.*s'", valuePrintLen, (char *)value);
+    if (sdslen(value) > valuePrintLen) {
+        text = sdscat(text, "...");
+    }
+
+    decrRefCount(decoded);
+    return text;
+}
+
+
+// Tops up an iterator's outbound queue (items_for_iterator) with DB entries.
+//
+// Batches of entries are pulled from the keyset iterator and queued until one of:
+//   - shouldFeedIteratorMore() turns false (queue at target, or iterator completed/terminated)
+//   - the deadline end_time_us (compared against getMonotonicUs()) has passed
+//   - the keyset iterator has no more entries (getEntries returns NULL)
+//
+// When the keyset is exhausted a COMPLETE item is queued and the iterator is marked
+//  completed - unless replication is enabled and the client is not yet ready for that
+//  (see the checks below), in which case the function just returns and completion is
+//  left for a later call.
+//
+// The queue target length (item_count_target) is also adapted on each call: it shrinks when
+//  the queue was still well stocked on entry and grows when the queue was found empty.
+static void feedIterator(bgIterator *it, monotime end_time_us) {
+    // Smart logic to dynamically adjust the size of the queue
+    unsigned int initial_queue_len = mutexQueueLength(it->items_for_iterator);
+
+    // Items were still waiting on entry: reduce the target by half of the backlog
+    if (initial_queue_len > 2 && it->item_count_target >= initial_queue_len) {
+        it->item_count_target -= initial_queue_len / 2;
+    }
+
+    // Now do some feeding
+    bool have_time = (getMonotonicUs() < end_time_us);
+    int timeCheckCounter = 0;
+    while (shouldFeedIteratorMore(it) && have_time) {
+        int orig_dbid, cur_dbid;
+        fifo *dbEntryFifo = it->keyset_iter->getEntries(it->keyset_iter, &orig_dbid, &cur_dbid);
+
+        if (dbEntryFifo == NULL) {
+            // Iteration of items is complete for this iterator
+            serverAssert(it->dbentries_queued      >= it->dbentries_processed);
+            serverAssert(it->replication_queued    >= it->replication_processed);
+            serverAssert(it->swapdb_queued         >= it->swapdb_processed);
+            serverAssert(it->flushdb_queued        >= it->flushdb_processed);
+            serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed);
+
+            // Snapshot queue size to seed next iterator when terminated
+            last_item_count_target = it->item_count_target;
+
+            if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) {
+                if (!it->client_is_active || (it->dbentries_queued > it->dbentries_processed)) {
+                    // We are done feeding dict entries to the iterator, but before ending the
+                    //  replication processing make sure that the iterator has become active (has
+                    //  started reading) and make sure that all of the dict entries have been processed
+                    //  by the client.
+                    break;
+                }
+                // Optional client callback: a false return means the client still wants
+                //  replication, so completion is postponed.
+                if (it->repldone) {
+                    bool clientWantsMoreReplication = (!it->repldone(it->privdata));
+                    if (clientWantsMoreReplication) break;
+                }
+            }
+            // Build the COMPLETE item.  All fields other than the type start out zeroed.
+            bgIteratorItem *completionItem = itemFreeList_getElementOrAllocate();
+            *completionItem = (bgIteratorItem){ .type = BGITERATOR_ITEM_COMPLETE };
+            if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) {
+                // With replication, the completion item also carries the replication stream
+                //  DB (0 if rdbPopulateSaveInfo provides none) and the primary replication offset.
+                rdbSaveInfo rsi;
+                completionItem->dbid = (rdbPopulateSaveInfo(&rsi)) ? rsi.repl_stream_db : 0;
+                completionItem->u.master_repl_offset = server.primary_repl_offset;
+                if (BGITERATION_DEBUG) {
+                    debugBuffer = sdscat(debugBuffer, "REPLDONE FN\n");
+                }
+            }
+
+            if (BGITERATION_DEBUG) {
+                debugBuffer = sdscat(debugBuffer, "SENDING COMPLETE\n");
+            }
+
+            mutexQueueAdd(it->items_for_iterator, completionItem);
+            it->completed = true;
+            break;
+        }
+
+        // Consistent iteration reports the original DB id; otherwise the current one is used.
+        int dbid = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) ? orig_dbid : cur_dbid;
+
+        // Filter the batch and convert the survivors into iterator items
+        fifo *itemsToAdd = fifoCreate();
+        while (fifoLength(dbEntryFifo) > 0) {
+            dbEntry *de;
+            fifoPop(dbEntryFifo, (void **)&de);
+
+            // Remove new/modified items during consistent iteration.
+            if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT
+                    && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch > it->consistent_modification_id) {
+                continue;
+            }
+
+            // Remove any items which have been processed early
+            //  (the early-iterate record is dropped at the same time)
+            if (dictFind(it->early_iterate_entries, de) != NULL) {
+                dictDelete(it->early_iterate_entries, de);
+                if (BGITERATION_DEBUG) {
+                    sds entryString = createEntryString(dbid, de);
+                    debugBuffer = sdscatprintf(debugBuffer, "SKIPPING ITEM(early iterate): %s\n", entryString);
+                    sdsfree(entryString);
+                }
+                continue;
+            }
+
+            // For items which are left, convert them from dbEntry to iteratorItem
+            if (BGITERATION_DEBUG) {
+                sds entryString = createEntryString(dbid, de);
+                debugBuffer = sdscatprintf(debugBuffer, "ITEM: %s\n", entryString);
+                sdsfree(entryString);
+            }
+
+            // Not a clone: the entry itself is handed over and marked as in use
+            bgIteratorItem *item = makeDbEntryItem(de, dbid, false);
+
+            fifoPush(itemsToAdd, item);
+        }
+        fifoRelease(dbEntryFifo);
+
+        // Queue the whole batch in one operation, counting it as queued first
+        if (fifoLength(itemsToAdd) > 0) {
+            it->dbentries_queued += fifoLength(itemsToAdd);
+            mutexQueueAddMultiple(it->items_for_iterator, itemsToAdd);
+        }
+        fifoRelease(itemsToAdd);
+
+        // This is a predictably fast loop.  We don't need to check the time on every pass.
+        if (++timeCheckCounter % 32 == 0) {
+            have_time = (getMonotonicUs() < end_time_us);
+        }
+    }
+
+    // Smart logic to dynamically adjust the size of the queue
+    //  (queue was empty on entry and the time budget is not yet known to be exhausted)
+    if (initial_queue_len == 0 && have_time) {
+        it->item_count_target += BGITER_QUEUE_INCREASE_INCR;
+    }
+}
+
+
+// Queues `earlyEntry` for the iterator ahead of its normal turn and records it in
+//  early_iterate_entries.  The entry must not already be recorded there (asserted).
+//
+// If the entry can be cloned, the clone is queued instead of the entry itself.
+// Returns true when the original entry was queued (the caller must block until the
+//  background thread is done with it) and false when a clone was queued.
+static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_dbid) {
+    const bool consistent = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) != 0;
+
+    int rc = dictAdd(it->early_iterate_entries, earlyEntry, NULL);
+    serverAssert(rc == DICT_OK);
+
+    // Consistent iteration reports the DB id as seen by the keyset iterator's originalDb
+    int dbid = cur_dbid;
+    if (consistent) {
+        dbid = it->keyset_iter->originalDb(it->keyset_iter, cur_dbid);
+    }
+
+    // Prefer handing over a clone; fall back to the entry itself
+    dbEntry *entryToQueue = tryCloneDbEntry(earlyEntry);
+    bool isClonedEntry = true;
+    if (entryToQueue == NULL) {
+        entryToQueue = earlyEntry;
+        isClonedEntry = false;
+    }
+    bgIteratorItem *item = makeDbEntryItem(entryToQueue, dbid, isClonedEntry);
+
+    it->dbentries_queued += 1;
+    if (isClonedEntry) it->dbentry_clones_queued += 1;
+
+    if (BGITERATION_DEBUG) {
+        sds entryString = createEntryString(dbid, item->u.dbe.de);
+        if (consistent) {
+            debugBuffer = sdscatprintf(debugBuffer, "EARLY_1: %s\n", entryString);
+        } else {
+            debugBuffer = sdscatprintf(debugBuffer, "EARLY: %s\n", entryString);
+        }
+        sdsfree(entryString);
+    }
+
+    if (consistent) {
+        // On consistent iteration, SWAPDB events are not provided.  So there is no requirement
+        //  to keep items in order or synchronized with SWAPDB, and the item goes in with priority.
+        // JHB - can we optimize here in cluster mode (no swap)
+        mutexQueuePushPriority(it->items_for_iterator, item);
+    } else {
+        mutexQueueAdd(it->items_for_iterator, item);
+    }
+
+    return !isClonedEntry; // Block if the entry will be used by the background thread
+}
+
+
+// Expedites one key, with none of the shortcuts used for ordinary write commands.
+//
+// A key that does not exist in server.db[dbid] needs nothing.  Otherwise:
+//   - if the iterator has not reached the entry yet (and it was not already iterated
+//     early), the entry is queued for early iteration
+//   - else the entry only matters if some iterator currently has it in use
+// When the client has to wait, oKey is added to waitingOnKeys and true is returned.
+static bool expediteSingleKeyWithoutOptimization(
+        bgIterator *it,
+        int dbid,
+        robj *oKey,
+        dict *waitingOnKeys) {
+
+    bool iterComplete = it->completed || it->terminated;
+
+    sds key = objectGetVal(oKey);
+    dbEntry *de = dbFind(server.db[dbid], key);
+    if (de == NULL) return false;
+
+    bool stillToIterate =
+            !(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid))
+            && (dictFind(it->early_iterate_entries, de) == NULL);
+
+    bool mustBlock;
+    if (stillToIterate) {
+        mustBlock = addEarlyIterationKey(it, de, dbid);
+    } else {
+        mustBlock = isEntryInuseByAnyIterator(de);
+    }
+
+    if (mustBlock) {
+        dictAdd(waitingOnKeys, oKey, NULL);
+    }
+    return mustBlock;
+}
+
+
+// MOVE/COPY are unfortunate special commands.  They work on 2 DBs at once.
+// For MOVE the destination DB id is the argument at this index.
+const int MOVE_COMMAND_DBID_ARG_INDEX = 2;
+
+// Expedites the key of a MOVE command in both the source and the destination DB.
+// Returns false without doing anything when the destination DB argument is missing or
+//  cannot be converted to a DB id.  Otherwise marks the command as one that may replicate
+//  and returns true if the client must block on either key.
+static bool expediteKeysForMove(
+        bgIterator *it,
+        int dbid,
+        int argc,
+        robj **argv,
+        dict *waitingOnKeys) {
+    int destDbid;
+    if (argc <= MOVE_COMMAND_DBID_ARG_INDEX
+            || !getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &destDbid)) {
+        return false;
+    }
+
+    robj *movedKey = argv[1];
+
+    // Not looking for special cases to optimize here.  Just try to expedite both src and dest
+    //  keys.  Note that the dest key might exist (and need iteration) but could be expired and
+    //  could be overwritten by MOVE.  In this case, a DEL would replicate due to the expiry.  So
+    //  even if the target is expired, we need to replicate it before executing the command.
+    bool srcBlocks = expediteSingleKeyWithoutOptimization(it, dbid, movedKey, waitingOnKeys);
+    bool destBlocks = expediteSingleKeyWithoutOptimization(it, destDbid, movedKey, waitingOnKeys);
+
+    it->cur_cmd_may_replicate = true;
+    return srcBlocks || destBlocks;
+}
+
+
+// MOVE/COPY are unfortunate special commands.  They work on 2 DBs at once.
+//
+// Expedites the source key (in the current DB) and the destination key (in the target DB)
+//  of a COPY command.  Returns false without doing anything when the target DB cannot be
+//  determined.  Otherwise marks the command as one that may replicate and returns true if
+//  the client must block on either key.
+static bool expediteKeysForCopy(
+        bgIterator *it,
+        int dbid,
+        int argc,
+        robj **argv,
+        dict *waitingOnKeys) {
+
+    int destDbid;
+    bool haveTarget = getTargetDbIdForCopyCommand(argc, argv, dbid, &destDbid);
+    if (!haveTarget) return false;
+
+    // Not trying to optimize COPY.  Just expedite source and destination (if it exists).  We
+    //  don't really care if the value is overwritten or not (so no need to parse REPLACE option).
+    bool srcBlocks = expediteSingleKeyWithoutOptimization(it, dbid, argv[1], waitingOnKeys);
+    bool destBlocks = expediteSingleKeyWithoutOptimization(it, destDbid, argv[2], waitingOnKeys);
+
+    it->cur_cmd_may_replicate = true;
+    return srcBlocks || destBlocks;
+}
+
+
+/* There are several cases where a client must be blocked on write operations.  (Clients never need
+ * to be blocked for read operations.)
+ *
+ * Note:  An Amazon extension to the Valkey command structure allows us to identify commands where
+ *        the first key is for write and the rest are for read.  This allows us to make the
+ *        following optimizations:
+ *   - for keys which are read only, there's no need to block if the key is in-use by an iterator
+ *   - without replication, there's no need to immediately queue read keys on a consistent iteration
+ *
+ * Iterator:  CONSISTENT = NO,  REPLICATION = NO
+ *   - Block if any write-key is in use by the iterator
+ *
+ * Iterator:  CONSISTENT = NO,  REPLICATION = YES
+ *   - Block if any write-key is in use by the iterator
+ *   - If ANY key has already been iterated (but some keys have not), then
+ *       - Block and immediately queue any key (read or write) that has not
+ *         already been iterated
+ *         Example:  SDIFFSTORE KEY_A KEY_B KEY_C
+ *           In this case, KEY_A is written, KEY_B and KEY_C are read.  If KEY_A has already been
+ *           iterated over, the replication stream will contain this command.  The receiver of this
+ *           replication will need KEY_B and KEY_C in order to process the replication stream.  So
+ *           these need to be iterated and the client blocked.
+ *
+ * Iterator:  CONSISTENT = YES, REPLICATION = NO
+ *   - Block if any write-key is in use by the iterator
+ *   - Block and immediately queue any WRITE-key that has not already been iterated
+ *
+ * Iterator:  CONSISTENT = YES, REPLICATION = YES
+ *   (Combination only valid in cluster mode - no SWAPDB possible)
+ *   - Block if any write-key is in use by the iterator
+ *   - Block and immediately queue any key (read or write) that has not already been iterated
+ *
+ * Parameters:
+ *   it            - the iterator the command is being checked against
+ *   dbid          - DB in which the command's keys are looked up
+ *   cmd           - the command (its flags and proc select the special cases below)
+ *   argc / argv   - the command's arguments
+ *   keyrefs       - key references; keyrefs[i].pos is the index of key i within argv
+ *   numKeys       - number of entries in keyrefs (must be > 0)
+ *   waitingOnKeys - receives every key object the client has to wait for
+ *
+ * Returns true if the client must block.  As side effects this may queue entries for early
+ * iteration, set it->cur_cmd_may_replicate, and record missing keys in curCmdMissingKeys.
+ */
+static bool expediteKeysForWrite(
+        bgIterator *it,
+        int dbid,
+        struct serverCommand *cmd,
+        int argc,
+        robj **argv,
+        keyReference *keyrefs,
+        int numKeys,
+        dict *waitingOnKeys) {
+    serverAssert(numKeys > 0);
+
+    bool mustBlock = false;
+
+    // All keys of the command should either be in scope or not since in cluster mode enabled they
+    // should all be in the same slot. So we just check the first key.
+    robj *oKey = argv[keyrefs[0].pos];
+    sds key = objectGetVal(oKey);
+    // If it's not in the iteration scope for the current iterator, then we don't need to do
+    // anything with this command.
+    if (!it->keyset_iter->isKeyInScope(it->keyset_iter, key)) return false;
+
+    // Note: performance optimization for commands which only modify the first key.  If this flag
+    //  is not available, we can safely remove this `if` statement.
+    if ((cmd->flags & CMD_WRITE_FIRSTKEY_ONLY)
+            && !(it->iteration_flags & BGITERATOR_FLAG_REPLICATION)) {
+        // If this write command only modifies the 1st key, we don't need to expedite others
+        //  unless replication enabled.
+        numKeys = 1;
+    }
+
+    if (cmd->proc == moveCommand) {
+        // Unfortunate special case for MOVE
+        return expediteKeysForMove(it, dbid, argc, argv, waitingOnKeys);
+    }
+
+    if (cmd->proc == copyCommand) {
+        // Similar special case for COPY
+        return expediteKeysForCopy(it, dbid, argc, argv, waitingOnKeys);
+    }
+
+    // Once the iterator is completed or terminated, every key counts as already iterated.
+    bool iterComplete = it->completed || it->terminated;
+
+    if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) {
+        // CONSISTENT = YES, REPLICATION = YES / NO
+        for (int i = 0; i < numKeys; i++) {
+            robj *oKey = argv[keyrefs[i].pos];
+            sds key = objectGetVal(oKey);
+            dbEntry *de = dbFind(server.db[dbid], key);
+            if (de == NULL) continue; // New key, no need to expedite
+            // Expedite only entries that are still ahead of the iterator, were not already
+            //  iterated early, and are not newer than the consistent snapshot.
+            if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid))
+                    && dictFind(it->early_iterate_entries, de) == NULL
+                    && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) {
+                if (addEarlyIterationKey(it, de, dbid)) {
+                    mustBlock = true;
+                    dictAdd(waitingOnKeys, oKey, NULL); 
+                }
+            } else {
+                if (isEntryInuseByAnyIterator(de)) {
+                    mustBlock = true;
+                    dictAdd(waitingOnKeys, oKey, NULL);
+                }
+            }
+        }
+        it->cur_cmd_may_replicate = true; // Will replicate only if replication enabled
+    } else {
+        // Identification of missing keys is only needed for non-consistent iteration.  This only
+        //  needs to be collected once (on the 1st non-consistent iteration)
+        bool collectMissing = (listLength(curCmdMissingKeys) == 0);
+
+        if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) {
+            // CONSISTENT = NO,  REPLICATION = YES
+            bool someIterated = false;
+            // dict containing the keys that have not been iterated yet.
+            //  Using a dict dedupes the keys in case the command contains duplicated keys.
+            dict *notIteratedKeys = dictCreate(&dictEntryPtrDictType); // dict of dbEntry* -> robj*
+
+            // Pass 1: classify each key as missing, already iterated, or not yet iterated.
+            //  Keys in use by an iterator force a block regardless of the classification.
+            for (int i = 0; i < numKeys; i++) {
+                robj *oKey = argv[keyrefs[i].pos];
+                sds key = objectGetVal(oKey);
+                dbEntry *de = dbFind(server.db[dbid], key);
+                if (de == NULL) {
+                    if (collectMissing) {
+                        incrRefCount(oKey);
+                        listAddNodeHead(curCmdMissingKeys, oKey);
+                    }
+                    continue;
+                }
+                if (iterComplete
+                        || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)
+                        || (dictFind(it->early_iterate_entries, de) != NULL)) {
+                    someIterated = true;
+                } else {
+                    dictAdd(notIteratedKeys, de, oKey);
+                }
+                if (isEntryInuseByAnyIterator(de)) {
+                    mustBlock = true;
+                    dictAdd(waitingOnKeys, oKey, NULL);
+                }
+            }
+
+            // Since missing keys are considered as already iterated, if there are any missing keys
+            //  we must consider that some keys have been iterated, and make sure all other keys
+            //  will be expedited if needed.
+            if (listLength(curCmdMissingKeys) > 0) someIterated = true;
+
+            // This command may be executing as part of a larger transaction.  If some parts of the
+            //  transaction have already been identified to replicate, we must wait on all keys and
+            //  replicate here as well.  (Take care not to set cur_cmd_may_replicate to false.)
+            if (someIterated) {
+                if (server.in_exec) {
+                    // We are now executing the commands in a multi-exec block.
+                    //
+                    // Regarding MULTI/EXEC:  Remember that this code is executed twice for commands
+                    //  within a MULTI/EXEC block.  First, we parse all the commands when deciding
+                    //  if the EXEC should be blocked.  Then, as each command is executed, it's
+                    //  re-parsed so that we can maintain the early iterated list as the commands
+                    //  execute.  In this second pass, as each command is executed, we can't change
+                    //  the replication decision which was made earlier (when the EXEC was processed).
+                    // We don't want to get tricked (by a key being removed and recreated) into
+                    //  starting to replicate in the middle of a MULTI/EXEC block.
+                } else {
+                    it->cur_cmd_may_replicate = true;
+                }
+            }
+            // Pass 2: if the command may replicate, expedite every key not yet iterated.
+            if (it->cur_cmd_may_replicate) {
+                dictEntry *de;
+                dictIterator *di = dictGetIterator(notIteratedKeys);
+                while ((de = dictNext(di)) != NULL) {
+                    dbEntry *notIteratedEntry = dictGetKey(de);
+                    robj *oKey = dictGetVal(de);
+
+                    if (addEarlyIterationKey(it, notIteratedEntry, dbid)) {
+                        mustBlock = true;
+                        dictAdd(waitingOnKeys, oKey, NULL); 
+                    }
+                }
+                dictReleaseIterator(di);
+            }
+            dictRelease(notIteratedKeys);
+        } else {
+            // CONSISTENT = NO,  REPLICATION = NO
+            //  Nothing is expedited here; only in-use keys cause a block.
+            for (int i = 0; i < numKeys; i++) {
+                robj *oKey = argv[keyrefs[i].pos];
+                sds key = objectGetVal(oKey);
+                dbEntry *de = dbFind(server.db[dbid], key);
+                if (de == NULL) {
+                    if (collectMissing) {
+                        incrRefCount(oKey);
+                        listAddNodeHead(curCmdMissingKeys, oKey);
+                    }
+                    continue;
+                }
+                if (isEntryInuseByAnyIterator(de)) {
+                    mustBlock = true;
+                    dictAdd(waitingOnKeys, oKey, NULL);
+                }
+            }
+        }
+    }
+
+    return mustBlock;
+}
+
+
+// Called when an iterator is terminated.  Drains everything still waiting in
+//  items_for_iterator and hands it back to Valkey before it can reach the iterator.
+//
+// The "queued" statistic of each drained item is backed out, since the iterator will never
+//  process it.  COMPLETE / TERMINATED markers go straight back to the item free list; all
+//  other items are placed on return_to_valkey in one batch.
+static void returnAllItemsToValkey(bgIterator *it) {
+    serverAssert(onValkeyMainThread());
+
+    fifo *drained = mutexQueuePopAll(it->items_for_iterator, false);
+    if (drained == NULL) return; // Nothing to return
+
+    fifo *itemsToReturn = fifoCreate();
+    while (fifoLength(drained) > 0) {
+        bgIteratorItem *item;
+        fifoPop(drained, (void **)&item);
+
+        bool isMarker = false;
+        switch (item->type) {
+            // back out the "queued" statistic
+            case BGITERATOR_ITEM_DBENTRY:
+                it->dbentries_queued--;
+                if (item->u.dbe.is_cloned) it->dbentry_clones_queued--;
+                break;
+            case BGITERATOR_ITEM_REPLICATION:
+                it->replication_queued--;
+                break;
+            case BGITERATOR_ITEM_SWAPDB:
+                it->swapdb_queued--;
+                break;
+            case BGITERATOR_ITEM_FLUSHDB:
+                it->flushdb_queued--;
+                break;
+
+            case BGITERATOR_ITEM_COMPLETE:
+                // This can only happen if the completion item has been enqueued and
+                //  the iterator is terminated before reaching the completion item.
+                isMarker = true;
+                break;
+
+            case BGITERATOR_ITEM_TERMINATED:
+                // This can only happen if there is a race when terminating between
+                //  the iteration client and main thread.
+                isMarker = true;
+                break;
+
+            default:
+                serverAssert(false);
+        }
+
+        if (isMarker) {
+            // Markers are recycled directly and never returned through the queue
+            itemFreeList_returnItemBackToFreeList(item);
+        } else {
+            fifoPush(itemsToReturn, item);
+        }
+    }
+    fifoRelease(drained);
+
+    // Now release items all at once...
+    if (fifoLength(itemsToReturn) > 0) {
+        mutexQueueAddMultiple(it->return_to_valkey, itemsToReturn);
+    }
+    fifoRelease(itemsToReturn);
+}
+
+
+
+//=============================================================================================
+//                        Foreground support functions (private)
+//=============================================================================================
+
+// Size attributed to a REPLICATION item: the item struct itself plus the computed size of
+//  every object in its argv.  Asserts that the item really is a REPLICATION item.
+static size_t replicationItemSize(bgIteratorItem *item) {
+    serverAssert(item->type == BGITERATOR_ITEM_REPLICATION);
+
+    size_t total = sizeof(*item);
+    int idx = 0;
+    while (idx < item->u.repl.argc) {
+        total += objectComputeSize(NULL, item->u.repl.argv[idx], 0, 0);
+        idx++;
+    }
+    return total;
+}
+
+static void processReturnOfItemToValkey(bgIteratorItem *item, bgIterator *iter) {
+    serverAssert(onValkeyMainThread());
+    switch ((int)item->type) {
+        case BGITERATOR_ITEM_REPLICATION:
+            bufferedReplicationBytes -= replicationItemSize(item);
+            freeRobjArray(item->u.repl.argc, item->u.repl.argv);
+            break;
+
+        case BGITERATOR_ITEM_DBENTRY:
+            {
+                if (item->u.dbe.is_cloned) {
+                    freeClonedDictEntry(item->u.dbe.de);
+                } else {
+                    if (isEntryInuseBySingleIterator(item->u.dbe.de)) {
+                        // This blocking mechanism isn't the best.  Written for slot-migration,
+                        //  it assumes a single DB so if the same key appears in multiple DBs,
+                        //  commands might get unblocked only to get blocked again.  (This would
+                        //  happen only rarely, and with minimal impact.)
+                        robj *key = createStringObjectFromSds(objectGetKey(item->u.dbe.de));
+                        unblockClientsInUseOnKey(key);
+                        decrRefCount(key);
+                    }
+                    // resumeRehashing must be called before decrementEntryInuse, since decrementEntryInuse can free
+                    if (item->u.dbe.is_rehashing_paused) resumeRehashing(item->u.dbe.de);
+                    decrementEntryInuse(item->u.dbe.de);
+                }
+            }
+            break;
+
+        case BGITERATOR_ITEM_SWAPDB:
+        case BGITERATOR_ITEM_FLUSHDB:
+            break;
+
+        case BGITERATOR_ITEMEXT_ITER_CLOSED:
+            {
+                bgIterator *it = ((bgIteratorItemExtClose*)item)->iter;
+                serverAssert(it == iter);
+                if (it->terminated) {
+                    // Abnormal termination
+                    //  Normally the item is TERMINATED, but might be COMPLETE in race
+                    serverAssert(it->current_item->type == BGITERATOR_ITEM_TERMINATED
+                            || it->current_item->type == BGITERATOR_ITEM_COMPLETE);
+                    // Release any items stranded on the iterator after early termination
+                    returnAllItemsToValkey(it);
+                    receiveItemsBackFromOneIterator(it);
+                } else {
+                    // Normal completion
+                    serverAssert(it->current_item->type == BGITERATOR_ITEM_COMPLETE);
+                }
+                serverAssert(mutexQueueLength(it->items_for_iterator) == 0);
+                serverAssert(it->dbentries_queued      == it->dbentries_processed);
+                serverAssert(it->replication_queued    == it->replication_processed);
+                serverAssert(it->swapdb_queued         == it->swapdb_processed);
+                serverAssert(it->flushdb_queued        == it->flushdb_processed);
+                serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed);
+
+                listEmpty(curCmdMissingKeys); // Just in case any remain
+
+                itemFreeList_returnItemBackToFreeList(it->current_item);
+                it->current_item = NULL;
+
+                bool terminated = it->terminated;
+                void *privdata = it->privdata;
+                bgIteratorCleanupFunc cleanup = it->cleanup;
+                bgIteratorRelease(it); // Fully release the iterator before calling cleanup
+
+                if (BGITERATION_DEBUG) {
+                    if (cleanup) debugBuffer = sdscatprintf(debugBuffer, "CLEANUP FN (%s)\n",
+                            (terminated) ? "terminated" : "success");
+
+                    sds filename = sdscatprintf(sdsempty(), "bgiteration_debug.%d", getpid());
+                    FILE *f = fopen(filename, "w");
+                    sdsfree(filename);
+
+                    fputs(debugBuffer, f);
+
+                    fclose(f);
+                    sdsfree(debugBuffer);
+                    debugBuffer = sdsempty();
+                }
+
+                if (cleanup) cleanup(terminated, privdata);
+            }
+            break;
+
+        default:
+            serverAssert(false); // Not expecting any other type of item!
+    }
+
+    // We don't allocate extension items from the pool so we manually free them
+    if((int)item->type == BGITERATOR_ITEMEXT_ITER_CLOSED) {
+        zfree(item);
+    } else {
+        itemFreeList_returnItemBackToFreeList(item);
+    }
+}
+// Prefetch a batch of returned items in three passes (item, dbEntry, key), then process each item in order.
+static void prepareAndProcessReturnedItems(int n, bgIteratorItem **items, bgIterator *iter) {
+    // Pass 1: the item structs themselves.
+    for (int idx = 0; idx < n; idx++) valkey_prefetch(items[idx]);
+    // Pass 2: the dbEntry behind each DBENTRY item.  Prefetching NULL can be costly, but
+    //  u.dbe.de is never expected to be NULL here.
+    for (int idx = 0; idx < n; idx++) {
+        bgIteratorItem *cur = items[idx];
+        if (cur->type == BGITERATOR_ITEM_DBENTRY) valkey_prefetch(cur->u.dbe.de);
+    }
+    // Pass 3: the key of each dbEntry (likewise assumed to be non-NULL).
+    for (int idx = 0; idx < n; idx++) {
+        bgIteratorItem *cur = items[idx];
+        if (cur->type == BGITERATOR_ITEM_DBENTRY) valkey_prefetch(objectGetKey(cur->u.dbe.de));
+    }
+    for (int idx = 0; idx < n; idx++) processReturnOfItemToValkey(items[idx], iter);
+}
+// Upper bound on how many returned items are prefetched and processed as one batch.
+#define PREFETCH_BATCH_SIZE 16
+// Drain one iterator's return_to_valkey queue in batches.  Returns true when the pop yielded a fifo, false otherwise.
+static bool receiveItemsBackFromOneIterator(bgIterator *it) {
+    // Non-blocking grab of everything currently queued for return.  A NULL result is
+    //  treated as "nothing was waiting".
+    fifo *returned = mutexQueuePopAll(it->return_to_valkey, false);
+    if (returned == NULL) return false;
+
+    bgIteratorItem *batch[PREFETCH_BATCH_SIZE];
+    int count = 0;
+    while (fifoLength(returned) > 0) {
+        fifoPop(returned, (void **)&batch[count]);
+        count++;
+        if (count < PREFETCH_BATCH_SIZE) continue;
+        // Batch is full: hand it off and start a new one.
+        prepareAndProcessReturnedItems(count, batch, it);
+        count = 0;
+    }
+    // Flush the final, partially filled batch (if any).
+    if (count > 0) prepareAndProcessReturnedItems(count, batch, it);
+
+    fifoRelease(returned);
+    return true;
+}
+// Drain every iterator's return_to_valkey queue (main thread only).  With `blocking`, poll until some queue yields items.
+static void receiveItemsBackFromIterators(bool blocking) {
+    serverAssert(onValkeyMainThread());
+    listIter li;
+    listNode *node;
+    bool processedItems = false;
+    do {
+        listRewind(allIterators, &li);
+        while ((node = listNext(&li)) != NULL) {
+            bgIterator *it = listNodeValue(node);
+            processedItems |= receiveItemsBackFromOneIterator(it);
+        }
+        // Back off only when this pass found nothing and we are going to retry; sleeping after
+        //  a successful pass would just stall the main thread.  Note usleep() takes microseconds,
+        //  so this is a 100us pause (not 1ms).
+        if (blocking && !processedItems) usleep(100);
+    } while (blocking && !processedItems);
+}
+
+// Timer task (main thread): take back returned items, then feed live iterators until a time budget runs out.  Returns AE_NOMORE once no iteration is active.
+static long long bgIteration_feedIterators_task(
+        struct aeEventLoop *eventLoop,
+        long long id,
+        void *clientData) {
+    UNUSED(eventLoop);
+    UNUSED(id);
+    UNUSED(clientData);
+    serverAssert(onValkeyMainThread());
+
+    static monotime lastFeedEndTime; // STATIC: persists across calls so that starvation can be measured
+    monotime startTime = getMonotonicUs();
+
+    if (!bgIteration_iterationActive()) {
+        // No more iterators exist.  Self-check, and terminate the "feed" task.
+        serverAssert(dictSize(nameToIterator) == 0);
+        serverAssert(dictSize(inUseEntries) == 0);
+        serverAssert(bufferedReplicationBytes == 0);
+
+        // Shrink dict back to zero (doesn't normally shrink)
+        dictRelease(inUseEntries);
+        inUseEntries = dictCreate(&dictEntryPtrDictType);
+
+        itemFreeList_release();
+
+        bgIterator_timeproc_id = AE_DELETED_EVENT_ID; // bgIteratorCreate() checks this id to decide whether to start a new task
+        lastFeedEndTime = 0; // 0 switches off starvation compensation for the first run of the next task
+        return AE_NOMORE;
+    }
+
+    long dutyTimeUs = BGITER_CYCLE_BUDGET_MS * 1000; // base budget, converted to microseconds
+    if (lastFeedEndTime > 0) {
+        // If the timer was delayed, compute the proportional time we should have had, and increase
+        //  the duty cycle to compensate (up to a limit).
+        long starvationUs = (startTime - lastFeedEndTime) - BGITER_CYCLE_DELAY_MS * 1000;
+        if (starvationUs > 0) {
+            long starvationCompensationUs = starvationUs * BGITER_CYCLE_BUDGET_MS
+                / (BGITER_CYCLE_BUDGET_MS + BGITER_CYCLE_DELAY_MS);
+            dutyTimeUs += starvationCompensationUs;
+            dutyTimeUs = MIN(dutyTimeUs, BGITER_CYCLE_BUDGET_MAX_MS * 1000);
+        }
+    }
+    monotime endTime = startTime + dutyTimeUs; // deadline for the feeding loop below
+
+    // Run this part regardless of time limit...
+    receiveItemsBackFromIterators(false);
+
+    // Feeding iterators (below) respects endTime.  The stuff above always runs to completion.
+
+    listIter li;
+    listNode *node;
+    listRewind(allIterators, &li);
+    while ((node = listNext(&li)) != NULL && getMonotonicUs() < endTime) {
+        bgIterator *it = listNodeValue(node);
+        if (it->completed || it->terminated) continue; // nothing more to feed to this one
+        feedIterator(it, endTime);
+    }
+
+    lastFeedEndTime = getMonotonicUs(); // lets the next run measure how late it was started
+    return BGITER_CYCLE_DELAY_MS; // presumably the delay (ms) before the event loop re-runs this task - confirm against ae
+}
+
+
+// Test hook (non-static, yet not part of the API): runs one feed pass without relying on the event loop.
+void bgIteration_feedIterators(void) {
+    // Pin every iterator's item_count_target to 1 so that each call feeds only a minimal,
+    //  deterministic amount.
+    listIter iter_cursor;
+    listRewind(allIterators, &iter_cursor);
+    for (listNode *ln = listNext(&iter_cursor); ln != NULL; ln = listNext(&iter_cursor)) {
+        bgIterator *target = listNodeValue(ln);
+        target->item_count_target = 1;
+    }
+
+    // Run the feed task directly; the timer would normally do this.  The task ignores all of
+    //  its arguments, so placeholders are passed.
+    bgIteration_feedIterators_task(NULL, 0, NULL);
+}
+
+// Reset each iterator's cur_cmd_may_replicate for the command in `c` (flags are left untouched inside MULTI/scripts, except at EXEC).
+static void resetReplicationFlagForIterators(client *c) {
+    // For any given command, the command may or may not need to be replicated based on the status
+    //  and flags of each iterator.  Furthermore, if a command does need to be replicated, this
+    //  replication must occur for an entire atomic unit; we can't replicate only part of a script
+    //  or multi/exec.
+    // This function is the only place where the replication flag is cleared.
+
+    if (c->flag.multi || c->flag.script) {
+        // REGARDING MULTI/EXEC
+        // --------------------
+        // When processing a MULTI/EXEC, blockClientIfRequired is called first for the MULTI.  Then,
+        //  all of the commands are queued up in server.c:processCommand().  It's only when EXEC is
+        //  encountered, that server.c:call() is fired to begin execution.
+        // AFTER the EXEC is processed by call(), then each of the commands in the MULTI/EXEC block
+        //  will be processed through call().
+        // If write commands are present, MULTI & EXEC will be passed to the replication stream
+        //  before/after the transaction commands.  Note that MULTI & EXEC are not actually
+        //  "executed" at the time when their replication is passed to the replication stream.
+        //
+        // Example:  MULTI; SET A B; EXEC
+        //  1. blockClientIfRequired() called for MULTI.  MULTI flag IS NOT set.  (Won't block.)
+        //  2. blockClientIfRequired() called for EXEC.  MULTI flag IS set.  (Might block.)
+        //  3. blockClientIfRequired() called for SET.  MULTI flag IS set.  (Won't block.)
+        //  4. handleCommandReplication() is called for MULTI.
+        //  5. handleCommandReplication() is called for SET.
+        //  6. handleCommandReplication() is called for EXEC.
+        //
+        // SO - if the MULTI flag is set, we DON'T clear the flag.  It should only be cleared at the
+        //  start of the transaction, when MULTI is received - and the flag isn't set yet.
+
+        // REGARDING SCRIPTS
+        // -----------------
+        // When processing a script, blockClientIfRequired is called first for the EVAL/EVALSHA/FCALL.
+        //  Then, all of the commands are processed using a special script client.  The script
+        //  client has the CLIENT_SCRIPT flag set.  For scripts, the replication flag is set when
+        //  processing the EVAL/EVALSHA/FCALL and should not be cleared when executing individual
+        //  commands in the script.
+
+        // If it's the EXEC command, we fall through and clear the flag below.  But for all other
+        //  commands within the transaction, we don't clear the flag.
+        if (c->cmd->proc != execCommand) return;
+    }
+
+    // For most commands, the replication flag is cleared and we determine if replication is needed
+    //  based on the keys being used and their state in each iterator. If a modified key hasn't been
+    //  processed yet, there's no need to expedite the key or send the replication.  The key will be
+    //  sent later, when reached by the iterator.
+    // However, for scripts, it is not possible to perform this optimization.  There is no way to
+    //  know if an undeclared key might be modified.  Since the entire script needs to be replicated
+    //  (or not replicated) atomically, we can't take the chance that an undeclared key might be
+    //  hit which requires replication.
+    bool isScript = isScriptCallWriteCmd(c->cmd);
+
+    getKeysResult result;
+    initGetKeysResult(&result);
+    getKeysFromCommand(c->cmd, c->argv, c->argc, &result);
+
+    // [sm-bgiterator] TODO: ELMO-108525, This assumes all keys are in the same slot, should consider cross-slot script case.
+    sds check_key = (result.numkeys > 0) ? objectGetVal(c->argv[result.keys[0].pos]) : NULL; // first declared key, or NULL when none is declared
+
+    listIter li;
+    listNode *node;
+    listRewind(allIterators, &li);
+    while ((node = listNext(&li)) != NULL) {
+        bgIterator *it = listNodeValue(node);
+        if (it->completed || it->terminated) {
+            it->cur_cmd_may_replicate = false; // a finished iterator never replicates
+        } else {
+            // Set initial state of the replication flag for this transaction
+            // For full scan iterators, write commands within scripts must always be replicated.
+            // For cluster slot iterators, replication of script write commands depends on whether
+            // the key is in scope of the current iterator.
+            // NOTE(review): check_key is NULL for a script that declares no keys - confirm isKeyInScope() accepts NULL.
+            it->cur_cmd_may_replicate = isScript && it->keyset_iter->isKeyInScope(it->keyset_iter, check_key);
+        }
+    }
+    getKeysFreeResult(&result); // check_key points into c->argv, so it is not affected by this release
+}
+
+// SWAPDB hook (main thread, non-cluster only): tell every live iterator that db1 and db2 traded places.
+static void handleSwapdb(int db1, int db2) {
+    serverAssert(onValkeyMainThread());
+    serverAssert(bgIteration_iterationActive());
+    serverAssert(!server.cluster_enabled);
+
+    listIter cursor;
+    listRewind(allIterators, &cursor);
+    for (listNode *ln = listNext(&cursor); ln != NULL; ln = listNext(&cursor)) {
+        bgIterator *live = listNodeValue(ln);
+        bool finished = live->completed || live->terminated;
+        if (finished) continue;
+
+        // First, the keyset iterator adjusts its own bookkeeping.
+        live->keyset_iter->swapDb(live->keyset_iter, db1, db2);
+
+        // Consistent iterations get no SWAPDB item; only the others are notified.
+        if (live->iteration_flags & BGITERATOR_FLAG_CONSISTENT) continue;
+
+        if (BGITERATION_DEBUG) {
+            debugBuffer = sdscatprintf(debugBuffer, "SWAP: %d %d\n", db1, db2);
+        }
+        // Queue the notification for the background client.
+        bgIteratorItem *swap_item = itemFreeList_getElementOrAllocate();
+        swap_item->type = BGITERATOR_ITEM_SWAPDB;
+        swap_item->dbid = db1;
+        swap_item->u.dbid2 = db2;
+        live->swapdb_queued++;
+        mutexQueueAdd(live->items_for_iterator, swap_item);
+    }
+}
+
+// Forget `de` in every iterator's early_iterate_entries dict; meant for entries that are being released.
+static void removePtrFromEarlyIterate(dbEntry *de) {
+    // Purging is an optimization rather than a requirement: it frees a little memory and
+    //  keeps each dictionary small.
+    listIter cursor;
+    listRewind(allIterators, &cursor);
+    for (listNode *ln = listNext(&cursor); ln != NULL; ln = listNext(&cursor)) {
+        bgIterator *owner = listNodeValue(ln);
+        // Best-effort delete: most iterators will not hold this pointer, and the result
+        //  of the delete is intentionally ignored.
+        (void)dictDelete(owner->early_iterate_entries, de);
+    }
+}
+
+// Return the index of the DB whose lookup of de's key yields `de` itself; asserts when no allocated DB holds it.
+static int findDbForEntry(dbEntry *de) {
+    for (int i = 0; i < server.dbnum; i++) {
+        if (server.db[i] && dbFind(server.db[i], objectGetKey(de)) == de) return i; // NULL (unallocated) DB slots are skipped, as in isDbSignificant()
+    }
+    serverAssert(false); // the entry MUST be in one of the DBs
+}
+
+// Flush path for an iterator that cannot survive: terminate it (once), then purge its in-flight entry from the early-iterate dicts if that entry is in a flushed DB.
+static void terminateIteratorForFlush(bgIterator *it, int dbid) {
+    if (!it->terminated) bgIteratorTerminate(it); // skip when already terminated, so no second TERMINATED item is queued
+
+    // Snoop on the iterator.  There might be 1 item still being processed.  If that item is in the
+    //  DB being flushed, the item is removed from the dict and held for deferred deletion.  This
+    //  allows the iterator to complete processing on the current item without the item being
+    //  deleted unexpectedly.
+    // Since this is running in parallel with a background thread, the results are volatile.  This
+    //  is OK as when the iterator completes processing the item, it still won't have been accepted
+    //  back to Valkey yet, meaning the item will still be in inUseEntries.
+    bgIteratorItem *item = it->current_item;
+    if (item && item->type == BGITERATOR_ITEM_DBENTRY) {
+        dbEntry *de = item->u.dbe.de;
+        int deDb = findDbForEntry(de); // asserts when no DB holds de; the lookup also runs when dbid == -1
+        if (dbid == -1 || dbid == deDb) { // -1: every DB is being flushed
+            removePtrFromEarlyIterate(de); // NOTE(review): the comment above mentions deferred deletion; only this purge is visible here
+        }
+    }
+}
+
+// Before FLUSHDB of `dbid` (non-consistent iterators only): purge early-iterate pointers for queued and current entries that live in that DB.
+static void preserveIteratorItemsForFlush(bgIterator *it, int dbid) {
+    serverAssert(onValkeyMainThread());
+    serverAssert(!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT));
+    serverAssert(dbid >= 0);
+    // Since this is not a consistent iteration, it's OK if the early_iterate_entries contains
+    //  pointers to items being deleted.  The item is not actually accessed from the pointer.  And
+    //  if the pointer gets reused for a new item, there's no guarantee that we would iterate it
+    //  anyway.  If replication is enabled, both new items and early_iterate_entries are treated the
+    //  same (replication is processed).  So this is safe in all cases.
+    // Given this, we will just worry about preserving items in the iterator's processing queue.
+    //  Because of commands like SWAPDB and MOVE, there's no attempt to remove unnecessary items
+    //  from the queue.  This is also safer to future Valkey extensions.
+    bool dbAllocated = server.db[dbid] != NULL; // DB slots may be NULL (see isDbSignificant); a NULL DB holds no entries
+    // Temporarily yank all items from the iterator's queue
+    fifo *poppedFifo = mutexQueuePopAll(it->items_for_iterator, false);
+    if (poppedFifo != NULL) {
+        fifo *readdFifo = fifoCreate();
+        while(fifoLength(poppedFifo) > 0) {
+            bgIteratorItem *item;
+            fifoPop(poppedFifo, (void **)&item);
+            if (item->type == BGITERATOR_ITEM_DBENTRY) {
+                dbEntry *de = item->u.dbe.de;
+                if (dbAllocated && dbFind(server.db[dbid], objectGetKey(de)) == de) {
+                    // Found the entry in the DB about to be flushed
+                    removePtrFromEarlyIterate(de);
+                }
+            }
+            fifoPush(readdFifo, item); // every item goes back, in its original order
+        }
+        fifoRelease(poppedFifo);
+
+        // Now give the list back to the iterator
+        mutexQueueAddMultiple(it->items_for_iterator, readdFifo);
+        fifoRelease(readdFifo);
+    }
+
+    // And snoop on the active item.  Even if the background task finishes with this item as we look
+    //  at it, the item can't have been returned to Valkey yet.
+    bgIteratorItem *item = it->current_item;
+    if (item && item->type == BGITERATOR_ITEM_DBENTRY) {
+        dbEntry *de = item->u.dbe.de;
+        if (dbAllocated && dbFind(server.db[dbid], objectGetKey(de)) == de) {
+            // Found the entry in the DB about to be flushed
+            removePtrFromEarlyIterate(de);
+        }
+    }
+}
+
+// True when DB `dbid` is allocated and holds more than half of all keys across every DB (unallocated DBs count as empty).
+static bool isDbSignificant(int dbid) {
+    unsigned long long keys_everywhere = 0;
+    for (int db = 0; db < server.dbnum; db++) {
+        if (server.db[db]) keys_everywhere += dbSize(server.db[db]);
+    }
+    return server.db[dbid] && dbSize(server.db[dbid]) > keys_everywhere / 2;
+}
+
+// Pre-flush hook for FLUSHDB/FLUSHALL: each iterator is either terminated or kept running with a FLUSHDB item queued for its client.
+static void handleFlushdb(int dbid) {
+    // Invoked BEFORE the actual flush.  -1 indicates FLUSHALL.
+    bool should_abort_iterators = server.cluster_enabled || dbid == -1 || isDbSignificant(dbid);
+
+    listIter li;
+    listNode *node;
+    listRewind(allIterators, &li);
+    while ((node = listNext(&li)) != NULL) {
+        bgIterator *it = listNodeValue(node); // NOTE(review): completed/terminated iterators are not skipped here, unlike in handleSwapdb() - confirm intended
+
+        if (should_abort_iterators || it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { // '&' binds tighter than '||'
+            terminateIteratorForFlush(it, dbid);
+        } else {
+            // In this (limited) case, we're only flushing a single DB that contains < half the
+            //  keys.  We don't want to kill a full-sync replication.  We will just continue with
+            //  iteration, knowing that a replication client will also receive the FLUSHDB on the
+            //  replication stream.
+            // It would be nice to do this with consistent snapshot also, but given that this is a
+            //  very rare condition, development is not justified to save off the DB for deferred
+            //  delete.  This would add a lot of complexity as well as memory implications.
+            preserveIteratorItemsForFlush(it, dbid); // dbid >= 0 on this path, because -1 forces the abort branch
+            it->keyset_iter->flushDb(it->keyset_iter, dbid);
+
+            // Send a flushdb event to notify the client
+            if (BGITERATION_DEBUG) {
+                debugBuffer = sdscatprintf(debugBuffer, "FLUSH: %d\n", dbid);
+            }
+
+            bgIteratorItem *item = itemFreeList_getElementOrAllocate();
+            item->type = BGITERATOR_ITEM_FLUSHDB;
+            item->dbid = dbid;
+            it->flushdb_queued++;
+            mutexQueueAdd(it->items_for_iterator, item);
+        }
+    }
+    receiveItemsBackFromIterators(false); // Receive items back before flushing the items
+}
+
+// Run expediteKeysForWrite() on every iterator (never short-circuiting); true if any of them requires blocking.
+static bool expediteKeysForWriteOnAllIterators(
+        int dbid,
+        struct serverCommand *cmd,
+        int argc,
+        robj **argv,
+        keyReference *keyrefs,
+        int numKeys,
+        dict *waitingOnKeys) {
+    bool anyBlocked = false;
+
+    listIter cursor;
+    listRewind(allIterators, &cursor);
+    for (listNode *ln = listNext(&cursor); ln != NULL; ln = listNext(&cursor)) {
+        // Call first, then fold into the flag, so that every iterator is always visited.
+        bool blocked = expediteKeysForWrite(
+                listNodeValue(ln), dbid, cmd, argc, argv, keyrefs, numKeys, waitingOnKeys);
+        anyBlocked = anyBlocked || blocked;
+    }
+
+    return anyBlocked;
+}
+
+// True if at least one iterator has cur_cmd_may_replicate set for the command being handled.
+static bool anIteratorWillReplicateForThisCommand(void) {
+    listIter cursor;
+    listRewind(allIterators, &cursor);
+    for (listNode *ln = listNext(&cursor); ln != NULL; ln = listNext(&cursor)) {
+        // One hit is enough; stop scanning as soon as it is found.
+        const bgIterator *candidate = listNodeValue(ln);
+        if (candidate->cur_cmd_may_replicate) return true;
+    }
+    return false;
+}
+
+// At EXEC: expedite the keys of every queued write command on all iterators, tracking SELECT/SWAPDB.  Returns true if the client must block.
+static bool expediteKeysForMultiExec(client *c, dict *waitingOnKeys) {
+    serverAssert(c->cmd->proc == execCommand);
+
+    /* For MULTI/EXEC, Valkey buffers all of the commands until hitting the EXEC.
+     * At this point, the client holds all of the commands to be executed.  This function searches
+     * for all of the keys used by any of the buffered write commands.  In addition, if SWAPDB or
+     * SELECT is used, this tracks the DBIDs through various swap/select operations.
+     */
+
+    /* There's a special concern for a NON-consistent iteration with replication.  If the keys are
+     * all "future" keys (which haven't been processed by the iterator yet), then we don't expedite
+     * the keys or replicate.  However, if some keys have already been processed, we need to
+     * expedite the remaining keys and replicate everything.
+     *
+     * When processing a single command, this is all handled.  But in this function, for MULTI/EXEC,
+     * we process 1 command at a time.  There's an issue if the first command modifies a "future"
+     * key, we don't know (without reading ahead) if a later command will modify a prior key.  This
+     * would require the future key to be expedited.
+     *
+     * This COULD be addressed by collecting all of the keys into a single structure and then
+     * analyzing them all at once.  However, this won't share code well with the single commands.
+     * Also, building this structure is a little complex/time-consuming as we need to track both
+     * key AND dictID.  One way to do this might be with a dict of dicts, where the first dict maps
+     * a dictID to a dict of keys.
+     *
+     * ALTERNATIVELY (and it's the simpler approach that's taken here) we can just check if the
+     * MULTI will be replicated.  If so, we re-process the MULTI, just in case there were commands
+     * prior to deciding that replication was required that might have missed expediting.  If so,
+     * these will be caught on the 2nd time around.
+     *
+     * Checking replication status before/after ensures that there can only be a single recursive
+     * call.
+     */
+    bool initiallyAnIteratorWillReplicate = anIteratorWillReplicateForThisCommand();
+
+    bool mustBlock = false;
+    int *cur_to_orig_db = NULL; // lazily built map: current DB index -> DB index before any queued SWAPDB
+
+    int curDb = c->db->id;
+    for (int cmdNum = 0; cmdNum < c->mstate->count; cmdNum++) {
+        struct serverCommand *cmd = c->mstate->commands[cmdNum].cmd;
+        robj **argv = c->mstate->commands[cmdNum].argv;
+        int argc = c->mstate->commands[cmdNum].argc;
+
+        if (cmd->proc == swapdbCommand) {
+            int id1, id2;
+            if (getParamsForSwapdb(argc, argv, c, &id1, &id2)) {
+                if (cur_to_orig_db == NULL) {
+                    cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum);
+                    for (int i = 0; i < server.dbnum; i++) cur_to_orig_db[i] = i;
+                }
+                int temp = cur_to_orig_db[id1];
+                cur_to_orig_db[id1] = cur_to_orig_db[id2];
+                cur_to_orig_db[id2] = temp;
+            }
+            continue;
+        }
+
+        if (cmd->proc == selectCommand) {
+            int id;
+            if (getParamsForSelect(argc, argv, c, &id)) {
+                curDb = id;
+            }
+            continue;
+        }
+
+        if (!isWriteCmd(cmd)) continue;
+
+        getKeysResult result;
+        initGetKeysResult(&result);
+        int numkeys = getKeysFromCommand(cmd, argv, argc, &result);
+        keyReference *keyrefs = result.keys;
+        // A write command without keys (like FLUSHDB) has nothing to expedite, but its result
+        //  must still be released below; bailing out early with `continue` would skip that.
+        if (numkeys > 0 && expediteKeysForWriteOnAllIterators(
+                cur_to_orig_db ? cur_to_orig_db[curDb] : curDb,
+                cmd, argc, argv, keyrefs, numkeys, waitingOnKeys)) {
+            mustBlock = true;
+        }
+        getKeysFreeResult(&result);
+    }
+
+    zfree(cur_to_orig_db);
+
+    if (!initiallyAnIteratorWillReplicate && anIteratorWillReplicateForThisCommand()) {
+        // We've decided to replicate.  Re-process the MULTI/EXEC just once more to make sure that
+        //  we didn't miss any keys at the beginning.  This can't continue to recurse because
+        //  `initiallyAnIteratorWillReplicate` will be TRUE in the recursive call.  Note that the
+        //  recursive call may add additional entries to `waitingOnKeys`.
+        if (expediteKeysForMultiExec(c, waitingOnKeys)) mustBlock = true;
+    }
+
+    return mustBlock;
+}
+// Shared constructor (main thread): allocate and register an iterator around `keyset_iter`, starting the feed timer when needed.  Asserts on a duplicate name.
+static bgIterator * bgIteratorCreate(
+        const char *name,
+        int flags,
+        bgIteratorReplDoneFunc repldone,
+        bgIteratorCleanupFunc cleanup,
+        void *privdata,
+        bgIterationType iter_type,
+        genericIterator *keyset_iter) {
+    serverAssert(onValkeyMainThread());
+    serverAssert(server.cluster_enabled || iter_type == BGITERATION_TYPE_FULLSCAN);
+    serverAssert(server.cluster_enabled                 // Don't allow CONSISTENT & REPLICATION
+            || !(flags & BGITERATOR_FLAG_CONSISTENT)    //  unless cluster mode (avoids
+            || !(flags & BGITERATOR_FLAG_REPLICATION)); //  complications with SWAPDB & FLUSHDB)
+
+    bgIterator *it = zmalloc(sizeof(bgIterator));
+    it->name = sdsnew(name); // private sds copy; also used as the nameToIterator key below
+    it->repldone = repldone;
+    it->cleanup = cleanup;
+    it->privdata = privdata;
+    it->items_for_iterator = mutexQueueCreate(); // items queued here are popped by bgIteratorRead()
+    it->return_to_valkey = mutexQueueCreate(); // drained by receiveItemsBackFromOneIterator()
+
+    // Floor queue size to bgiteration_queue_increase_incr or use last queue size value
+    if (last_item_count_target < BGITER_QUEUE_INCREASE_INCR) {
+        last_item_count_target = BGITER_QUEUE_INCREASE_INCR;
+    }
+    it->item_count_target = last_item_count_target;
+    it->iteration_flags = flags;
+    it->iteration_type = iter_type;
+    it->consistent_modification_id = bgIteration_epoch++; // post-increment: this iterator takes the current epoch value
+    it->keyset_iter = keyset_iter;
+    it->early_iterate_entries = dictCreate(&dictEntryPtrDictType);
+    dictExpand(it->early_iterate_entries, BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE);
+    it->current_item = NULL;
+    it->client_is_active = false;
+    it->completed = false;
+    it->terminated = false;
+    it->cur_cmd_may_replicate = false;
+
+    it->dbentries_queued = 0;
+    it->dbentries_processed = 0;
+    it->replication_queued = 0;
+    it->replication_processed = 0;
+    it->swapdb_queued = 0;
+    it->swapdb_processed = 0;
+    it->flushdb_queued = 0;
+    it->flushdb_processed = 0;
+    it->dbentry_clones_queued = 0;
+    it->dbentry_clones_processed = 0;
+
+    elapsedStart(&it->monotonic_start_time);
+    it->monotonic_item_start_time = 0; // 0 = no item is being timed yet
+
+
+    if (bgIterator_timeproc_id <= 0) { // NOTE(review): 0 is treated as "no timer" - confirm 0 can never be a live time-event id here
+        // If iteration is not currently active, start the feeding task.  (Runs in main thread.)
+        bgIterator_timeproc_id = aeCreateTimeEvent(server.el, 1, bgIteration_feedIterators_task, NULL, NULL);
+        serverAssert(bgIterator_timeproc_id != AE_ERR);
+    }
+
+    if (dictAdd(nameToIterator, (void*)it->name, it) != DICT_OK) {
+        // Can't have 2 iterators with the same name!
+        serverAssert(false);
+    }
+
+    listAddNodeTail(allIterators, it);
+
+    dictExpand(inUseEntries, listLength(allIterators) * it->item_count_target); // room for one full queue per registered iterator
+
+    return it;
+}
+
+
+
+//=============================================================================================
+//                        PUBLIC INTERFACE:  Iterator creation and use
+//=============================================================================================
+
+// PUBLIC API: create a BGITERATION_TYPE_FULLSCAN iterator backed by fullScanIteratorCreate().
+bgIterator * bgIteratorCreateFullScanIter(
+        const char *name,
+        int flags,
+        bgIteratorReplDoneFunc repldone,
+        bgIteratorCleanupFunc cleanup,
+        void *privdata) {
+    genericIterator *scan = fullScanIteratorCreate();
+    return bgIteratorCreate(name, flags, repldone, cleanup, privdata, BGITERATION_TYPE_FULLSCAN, scan);
+}
+
+// PUBLIC API: create a BGITERATION_TYPE_CLUSTERSLOT iterator over the `slots_count` slots listed in `slots`.
+bgIterator * bgIteratorCreateSlotsIter(
+        const char *name,
+        int flags,
+        const int *slots,
+        int slots_count,
+        bgIteratorReplDoneFunc repldone,
+        bgIteratorCleanupFunc cleanup,
+        void *privdata) {
+    genericIterator *slot_walker = clusterSlotIteratorCreate(slots, slots_count);
+    return bgIteratorCreate(name, flags, repldone, cleanup, privdata, BGITERATION_TYPE_CLUSTERSLOT, slot_walker);
+}
+
+// PUBLIC API: look up an iterator by name (main thread only).  Yields whatever dictFetchValue() gives for an unknown name.
+bgIterator * bgIteratorFind(const char *name) {
+    serverAssert(onValkeyMainThread());
+
+    // nameToIterator is keyed by sds, so wrap the C string in a temporary sds for the lookup.
+    sds lookup_key = sdsnew(name);
+    bgIterator *found = dictFetchValue(nameToIterator, lookup_key);
+    sdsfree(lookup_key);
+    return found;
+}
+
+
+// PUBLIC API: the iterator's name, returned as the stored it->name pointer (no copy is made).
+const char *bgIteratorName(bgIterator *it) {
+    return it->name;
+}
+
+
+// PUBLIC API: copy the iterator's counters, queue length/target and timings into *status.
+void bgIteratorGetStatus(bgIterator *it, bgIteratorStatus *status) {
+    status->dbentries_queued      = it->dbentries_queued;
+    status->dbentries_processed   = it->dbentries_processed;
+    status->replication_queued    = it->replication_queued;
+    status->replication_processed = it->replication_processed;
+    status->swapdb_queued         = it->swapdb_queued;
+    status->swapdb_processed      = it->swapdb_processed;
+    status->flushdb_queued        = it->flushdb_queued;
+    status->flushdb_processed     = it->flushdb_processed;
+    status->dbentry_clones_queued = it->dbentry_clones_queued;
+    status->dbentry_clones_processed = it->dbentry_clones_processed;
+
+    status->queue_length = mutexQueueLength(it->items_for_iterator); // items still waiting for the reader
+    status->queue_length_target = it->item_count_target;
+
+    status->runtime_ms = elapsedMs(it->monotonic_start_time); // time since elapsedStart() in bgIteratorCreate()
+
+    monotime nonvolatile_item_start_time = it->monotonic_item_start_time; // read once, so the zero test and elapsedMs() see the same value
+    status->current_item_ms =
+            (nonvolatile_item_start_time == 0) ? 0 : elapsedMs(nonvolatile_item_start_time); // 0 marks "idle" (bgIteratorRead() is waiting)
+}
+
+
+// PUBLIC API: terminate from the main thread; queued items are taken back and a TERMINATED item is sent to the reader.
+void bgIteratorTerminate(bgIterator *it) { // NOTE(review): no guard against a repeated call; terminateIteratorForFlush() checks it->terminated first
+    serverAssert(onValkeyMainThread());
+
+    // Remove any items in the queue, but doesn't affect the 1 item that's being processed.
+    returnAllItemsToValkey(it);
+
+    // We have to add an item, just in case the READER is waiting on the mutex.
+    if (BGITERATION_DEBUG) {
+        debugBuffer = sdscat(debugBuffer, "SENDING TERMINATE\n");
+    }
+
+    bgIteratorItem *terminationItem = itemFreeList_getElementOrAllocate();
+    *terminationItem = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; // all other fields are zeroed by the compound literal
+    mutexQueueAdd(it->items_for_iterator, terminationItem);
+
+    it->terminated = true; // set only after the TERMINATED item has been queued
+}
+
+
+// PUBLIC API: reports it->terminated, the flag that bgIteratorTerminate() and bgIteratorClose() set.
+bool bgIteratorIsTerminating(bgIterator *it) {
+    return it->terminated;
+}
+
+
+// PUBLIC API: hand back the previous item (if any), then wait for the next one; the result is also kept in it->current_item.
+bgIteratorItem * bgIteratorRead(bgIterator *it) {
+    serverAssert(it->current_item == NULL
+            || (it->current_item->type != BGITERATOR_ITEM_COMPLETE
+                && it->current_item->type != BGITERATOR_ITEM_TERMINATED)); // no further reads once COMPLETE/TERMINATED was returned
+
+    // First, clean up the previous item read
+    if (it->current_item != NULL) {
+        returnCurrentItemToValkey(it);
+
+        // To support unit tests.  Normal clients call bgIteratorRead from an alternate thread.
+        //  Without this, a unit test could get stuck waiting on the completion event because
+        //  feed won't get invoked.  For production, this is called regularly from the main thread.
+        if (onValkeyMainThread()) bgIteration_feedIterators_task(NULL, 0, NULL);
+    } else {
+        it->client_is_active = true; // no previous item: mark the client as active
+    }
+
+    it->monotonic_item_start_time = 0; // idle until blocking pop returns
+    it->current_item = mutexQueuePop(it->items_for_iterator, true);
+    it->monotonic_item_start_time = getMonotonicUs(); // start timing the new item (reported by bgIteratorGetStatus())
+
+    return it->current_item;
+}
+
+
+// PUBLIC API:  the iteration client is finished with the iterator (normally or by giving up).
+void bgIteratorClose(bgIterator *it) {
+    bgIteratorItem *last = it->current_item;
+    bool sawFinalItem = (last != NULL)
+            && (last->type == BGITERATOR_ITEM_COMPLETE || last->type == BGITERATOR_ITEM_TERMINATED);
+
+    if (!sawFinalItem) {
+        // The client is the one ending the iteration - either part-way through, or before it
+        //  ever read an item.  (Otherwise this is the normal confirmation of background
+        //  completion and there is nothing to fix up.)
+        it->terminated = true;
+
+        // An item still held by the client has to go back to Valkey first.
+        if (last != NULL) returnCurrentItemToValkey(it);
+
+        // Leave a TERMINATED marker as the current item.
+        it->current_item = itemFreeList_getElementOrAllocate();
+        *(it->current_item) = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED };
+    }
+
+    // Tell Valkey, through the return queue, that the iterator was closed.
+    // We don't allocate extension items from the free list
+    bgIteratorItemExtClose *closeMsg = zmalloc(sizeof(bgIteratorItemExtClose));
+    closeMsg->type = BGITERATOR_ITEMEXT_ITER_CLOSED;
+    closeMsg->iter = it;
+    mutexQueueAdd(it->return_to_valkey, closeMsg);
+}
+
+
+
+//=============================================================================================
+//                        PUBLIC INTERFACE:  Valkey main-thread support hooks
+//=============================================================================================
+
+// PUBLIC API:  one-time creation of the file-level bookkeeping structures.
+void bgIteration_init(void) {
+    serverAssert(onValkeyMainThread());
+
+    /* Expected to run exactly once, on the Valkey main thread.  That is deliberately not
+     * enforced:  unit tests may call it repeatedly, and every call after the first is a no-op.  */
+    if (nameToIterator != NULL) return;
+
+    // Iterator registry:  lookup by name, plus the list walked by the main-thread hooks.
+    nameToIterator = dictCreate(&sdsrefToPtrDictType);
+    serverAssert(nameToIterator != NULL);
+    allIterators = listCreate();
+    serverAssert(allIterators != NULL);
+
+    // Set of entries currently marked as in use.
+    inUseEntries = dictCreate(&dictEntryPtrDictType);
+    serverAssert(inUseEntries != NULL);
+
+    // Keys recorded as missing for the current command; elements are released via decrRefCountVoid.
+    curCmdMissingKeys = listCreate();
+    serverAssert(curCmdMissingKeys != NULL);
+    listSetFreeMethod(curCmdMissingKeys, decrRefCountVoid);
+
+    bufferedReplicationBytes = 0;
+
+    if (BGITERATION_DEBUG) debugBuffer = sdsMakeRoomFor(sdsempty(), SDS_MAX_PREALLOC);
+}
+
+
+// PUBLIC API:  true when allIterators holds at least one iterator; safe before bgIteration_init().
+bool bgIteration_iterationActive(void) {
+    return (allIterators != NULL && listLength(allIterators) > 0);
+}
+
+
+// PUBLIC API:  main-thread hook for a key of DB `dbid` that is about to be deleted.
+void bgIteration_keyDelete(int dbid, const_sds key) {
+    if (!bgIteration_iterationActive()) return;
+    serverAssert(onValkeyMainThread());
+
+    if (BGITERATION_DEBUG) debugBuffer = sdscatprintf(debugBuffer, "KEYDEL: (%d)%s\n", dbid, key);
+
+    dbEntry *de = dbFind(server.db[dbid], (sds)key);
+    if (de == NULL) return; // nothing stored under this key - nothing to protect or record
+
+    // A consistent iterator must still get to write the entry before it is deleted, so any such
+    //  iterator that has not reached the key yet gets it scheduled for early iteration.
+    listIter iterCursor;
+    listNode *ln;
+    listRewind(allIterators, &iterCursor);
+    while ((ln = listNext(&iterCursor)) != NULL) {
+        bgIterator *it = listNodeValue(ln);
+        if (it->completed || it->terminated) continue;
+        if (!it->keyset_iter->isKeyInScope(it->keyset_iter, key)) continue;
+        if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) continue;
+        // Only entries whose epoch is not newer than the iterator's consistent point matter.
+        bgIterationEntryMetadata *meta = (bgIterationEntryMetadata *)objectGetMetadata(de);
+        if (meta->iterator_epoch > it->consistent_modification_id) continue;
+
+        // Already passed, or already scheduled early:  nothing more to do for this iterator.
+        if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) continue;
+        if (dictFind(it->early_iterate_entries, de) != NULL) continue;
+        addEarlyIterationKey(it, de, dbid); // (may also add to inUseEntries)
+    }
+
+    removePtrFromEarlyIterate(de);
+
+    // The delete may be happening inside a command (the key was found expired when the command
+    //  touched it).  Record the key as missing so that, if it exists again once the command has
+    //  run, it is treated as a new key.  Outside a command this is harmless:  the list is reset
+    //  at the beginning of command execution.
+    listAddNodeHead(curCmdMissingKeys, createObject(OBJ_STRING, sdsdup(key)));
+}
+
+
+// PUBLIC API
+// Notify bgIteration that a FLUSHALL is being performed outside of the normal client interface.
+void bgIteration_flushall(void) {
+    handleFlushdb(-1); // -1 is the same value the FLUSHALL command path passes (vs. a DB id for FLUSHDB)
+}
+
+
+// PUBLIC API:  pre-execution hook for client commands; returns true if the client was blocked.
+bool bgIteration_blockClientIfRequired(client *c) {
+    serverAssert(onValkeyMainThread());
+    if (!bgIteration_iterationActive()) return false;
+    if (!isWriteCmd(c->cmd)) return false;
+
+    if (BGITERATION_DEBUG) {
+        // Free the temporary once it is logged (same handling as createEntryString() results).
+        sds argvString = createSdsFromClientArgv(c->argc, c->argv);
+        debugBuffer = sdscatprintf(debugBuffer, "BLCK?: (%d)%s\n", c->db->id, argvString);
+        sdsfree(argvString);
+    }
+
+    // Before executing a command or atomic transaction, the replication flag is cleared for each
+    //  iterator.  If it's determined that the command should replicate, the flag will be set
+    //  as the command and keys are examined for expedite.
+    resetReplicationFlagForIterators(c);
+
+    if (c->cmd->proc == flushdbCommand || c->cmd->proc == flushallCommand) {
+        // Handle flush commands prior to execution
+        int flags;
+        if (getFlushCommandFlags(c, &flags) == C_OK) {
+            // The command parsed ok - we WILL flush
+            handleFlushdb((c->cmd->proc == flushdbCommand) ? c->db->id : -1);
+        }
+    }
+
+    bool mustBlock = false;
+    dict *waitOnKeys = dictCreate(&tempKeysetDictType); // dict of robj(sds)->NULL
+    listEmpty(curCmdMissingKeys);
+
+    if (c->cmd->proc == execCommand) {
+        mustBlock = expediteKeysForMultiExec(c, waitOnKeys);
+    } else {
+        getKeysResult result;
+        initGetKeysResult(&result);
+        int numkeys = getKeysFromCommand(c->cmd, c->argv, c->argc, &result);
+        keyReference *keyrefs = result.keys;
+        if (numkeys > 0) {
+            mustBlock = expediteKeysForWriteOnAllIterators(
+                            c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys);
+            serverAssert(!(mustBlock && (c->flag.multi) && !(c->flag.script)));
+
+            if (mustBlock && (c->flag.script)) {
+                /* For scripts, we will block for keys declared in EVAL/EVALSHA/FCALL.
+                 *  However, scripts are NOT required to declare keys.  Even if it declares keys,
+                 *  it's not declaring the DB for the key.  After a SELECT or SWAPDB, we might be on
+                 *  a key we haven't blocked for.  In this case, there is no option but to execute a
+                 *  synchronous block and wait for the iterator(s) to be done with the key(s).
+                 *  (Yuck.)  */
+                while (mustBlock) {
+                    receiveItemsBackFromIterators(true); // Blocking
+                    dictEmpty(waitOnKeys, NULL);
+                    mustBlock = expediteKeysForWriteOnAllIterators(
+                                    c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys);
+                }
+            }
+        } else {
+            // WRITE commands with no keys should always be replicated.  SWAPDB, FLUSH, FUNCTION, etc.
+            listIter li;
+            listNode *node;
+            listRewind(allIterators, &li);
+            while ((node = listNext(&li)) != NULL) {
+                bgIterator *it = listNodeValue(node);
+                it->cur_cmd_may_replicate = true;
+            }
+        }
+        getKeysFreeResult(&result); // released on every path, not only when keys were reported
+    }
+
+    if (mustBlock) {
+        serverAssert(dictSize(waitOnKeys) > 0);
+        robj **waitKeysArgv = zmalloc(sizeof(robj*) * dictSize(waitOnKeys));
+
+        dictEntry *de;
+        dictIterator *di = dictGetIterator(waitOnKeys);
+        unsigned long argvCount = 0;
+        while((de = dictNext(di)) != NULL) {
+            waitKeysArgv[argvCount++] = dictGetKey(de);
+        }
+        dictReleaseIterator(di);
+        serverAssert(argvCount == dictSize(waitOnKeys));
+
+        blockClientInUseOnKeys(c, argvCount, waitKeysArgv);
+
+        zfree(waitKeysArgv);
+    }
+
+    dictRelease(waitOnKeys);
+
+    if (BGITERATION_DEBUG && mustBlock) debugBuffer = sdscat(debugBuffer, " (blocked)\n");
+
+    return mustBlock;
+}
+
+
+// PUBLIC API:  post-execution hook; queues a replicated write command to each iterator that needs it.
+void bgIteration_handleCommandReplication(
+        int dbid,
+        struct serverCommand *cmd,
+        int argc,
+        robj **argv) {
+    if (BGITERATION_DEBUG) {
+        // DEBUG - enable this to capture replication not queued because iteration is inactive
+        if (0 && !bgIteration_iterationActive() && (isWriteCmd(cmd) || cmd->proc == multiCommand)) {
+            sds argvString = createSdsFromClientArgv(argc, argv);
+            debugBuffer = sdscatprintf(debugBuffer, "REPL? INACT: (%d)%s\n", dbid, argvString);
+            sdsfree(argvString);
+        }
+    }
+
+    if (!bgIteration_iterationActive()) return;
+    serverAssert(onValkeyMainThread());
+
+    // Some commands are replicated which are not writes (like publish) these can be ignored.
+    //  Be careful with MULTI which is not a write command, but must be replicated.
+    if (!isWriteCmd(cmd) && cmd->proc != multiCommand) return;
+
+    if (BGITERATION_DEBUG) {
+        sds argvString = createSdsFromClientArgv(argc, argv); // freed below, like createEntryString()
+        debugBuffer = sdscatprintf(debugBuffer, "REPL?: (%d)%s\n", dbid, argvString);
+        sdsfree(argvString);
+    }
+
+    if (cmd->proc == swapdbCommand) {
+        // All iterators and clients must be informed of swapdb
+        int id1, id2;
+        // command has been processed, but Valkey allows "swapdb 0 0" (which can be ignored)
+        if (getParamsForSwapdb(argc, argv, NULL, &id1, &id2))
+            handleSwapdb(id1, id2);
+    }
+
+    // In the case that a key is touched in a different DB (COPY/MOVE) the key is recorded as
+    //  a "special" key and then handled below.
+    int special_dbid = 0;
+    sds special_key = NULL;
+    dbEntry *special_dbEntry = NULL;
+    if (cmd->proc == moveCommand) {
+        // The MOVE command succeeded.  However MOVE requires special handling as it creates a new
+        //  key in a different database.  We need to make sure that we don't later try to iterate
+        //  on the key as it would be a duplicate key at that point.  So, instead, we will mark the
+        //  newly created key as "early iterated".
+        bool success = getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &special_dbid);
+        serverAssert(success); // the command already succeeded, so this should work!
+
+        robj *oKey = argv[1];
+        special_key = (sds)objectGetVal(oKey);
+
+        special_dbEntry = dbFind(server.db[special_dbid], special_key);
+    }
+    if (cmd->proc == copyCommand) {
+        // The COPY command succeeded.  However COPY requires special handling (like MOVE).
+        bool success = getTargetDbIdForCopyCommand(argc, argv, dbid, &special_dbid);
+        serverAssert(success); // the command already succeeded, so this should work!
+
+        // Find the newly created entry.
+        robj *oKey = argv[2];
+        special_key = (sds)objectGetVal(oKey);
+
+        special_dbEntry = dbFind(server.db[special_dbid], special_key);
+    }
+
+    /* Implementation note regarding LUA and MULTI:  LUA scripts and MULTI-EXEC blocks must be
+     *  treated atomically.  We need to ensure that either ALL of the replication (or none of the
+     *  replication) for the atomic operation is processed by the iterator(s).  This is handled
+     *  naturally as we can only "complete" the iteration during the feeding process - and feeding
+     *  is only performed when handling timer events (after the LUA/MULTI has completed).  */
+
+    const bool isDelCommand = isDeleteCmd(cmd); // identical for every iterator, so evaluated once
+    listIter li;
+    listNode *node;
+    listRewind(allIterators, &li);
+    while ((node = listNext(&li)) != NULL) {
+        bgIterator *it = listNodeValue(node);
+        if (it->completed || it->terminated) continue;
+
+        // For consistent iteration, we only iterate values based on version.  But for
+        //  non-consistent iteration, we don't need to explicitly iterate any values newly created
+        //  during the iteration.  So we mark them as expedited.  We know we have a new key if it
+        //  was missing before the command, and exists now.
+        if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) {
+            // Handle the special case of a key moved to a different DB
+            if (special_dbEntry != NULL) {
+                if (it->cur_cmd_may_replicate
+                        && !it->keyset_iter->hasPassedItem(it->keyset_iter, special_key, special_dbid)) {
+                    dictAdd(it->early_iterate_entries, special_dbEntry, NULL);
+                    if (BGITERATION_DEBUG) {
+                        sds entryString = createEntryString(special_dbid, special_dbEntry);
+                        debugBuffer = sdscatprintf(debugBuffer, "EARLY(special): %s\n", entryString);
+                        sdsfree(entryString);
+                    }
+                }
+
+                // Note: In the cases where there's a special command, we are copying or moving an
+                //       item to a different DB.  In these limited cases, we can only possibly be
+                //       creating a single key.  And if we've handled it here, we don't need to
+                //       handle it as a "missing key" below.  If we were to try to handle it as a
+                //       standard "missing key", we would get the DBID incorrect.
+            } else if (listLength(curCmdMissingKeys) > 0) {
+                listIter missingIt;
+                listNode *missingNode;
+                listRewind(curCmdMissingKeys, &missingIt);
+                while ((missingNode = listNext(&missingIt)) != NULL) {
+                    robj *oKey = listNodeValue(missingNode);
+                    const_sds key = objectGetVal(oKey);
+                    dbEntry *de = dbFind(server.db[dbid], (sds)key);
+                    if (de != NULL) {
+                        // It exists now!
+                        if (it->cur_cmd_may_replicate
+                                && !it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) {
+                            // If the current command is allowed to replicate, and there is a new
+                            //  key which we haven't yet reached in iteration, it needs to be added
+                            //  to the set of early iterate entries.  (We know that it's not already
+                            //  in that set because it's a newly created key!)
+                            dictAdd(it->early_iterate_entries, de, NULL);
+                            if (BGITERATION_DEBUG) {
+                                sds entryString = createEntryString(dbid, de);
+                                debugBuffer = sdscatprintf(debugBuffer, "EARLY(NEW): %s\n", entryString);
+                                sdsfree(entryString);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        /* Deletes (and unlinks) are special.
+         * Developer context:  For most commands, we call bgIteration_blockClientIfRequired before
+         *  the command and then call bgIteration_handleCommandReplication after the command.  While
+         *  the "before" logic is determining the need to block, it can also determine (mostly) the
+         *  need for replication (on each iterator).  Doing this all in one place saves us from
+         *  performing some of the same logic twice.  When we get to this point in the code, we just
+         *  use the previously determined information regarding replication.  This works because
+         *  Valkey is single-threaded and only processes one command at a time.
+         *
+         * But deletes (and unlinks) happen multiple ways - and occur outside the normal
+         *  before/after logic for commands.  These situations must be handled:
+         *    - A normal (client-driven) DEL/UNLINK command will use the standard before/after
+         *      logic.  If the key is in use by bgIteration, the command will be blocked.
+         *    - An EVICTION generates a DEL/UNLINK which happens outside of the context of a client
+         *      issued command.  The replication flags on the iterators are stale and relate to the
+         *      prior command executed.
+         *    - An EXPIRATION in the context of a client-driven WRITE command occurs when the client
+         *      command attempts to access a key and it is found to be expired.  In this case, the
+         *      client-command has already gone through the blocking process, so it should be OK to
+         *      use it->cur_cmd_may_replicate.
+         *    - An EXPIRATION in the context of a client-driven READ command occurs when the client
+         *      command attempts to access a key and it is found to be expired.  In this case, the
+         *      client-command has NOT gone through the blocking process.  The replication flags on
+         *      the iterators are stale and relate to the prior (write) command executed.
+         *    - An EXPIRATION outside of a client-driven command occurs due to active expiry.  In
+         *      this case, the replication flags on the iterator are stale and relate to the prior
+         *      command executed.
+         *
+         * In the case of EXPIRE/EVICT occurring outside the context of a write command, this is
+         *  handled.  If the key is in-use by bgIterator, increment of robj's refcount prevents the
+         *  key from deletion. In this case the key will be removed from the main dictionary, but
+         *  held inside bgIteration until no longer needed.
+         *  Even though the entry is not physically deleted yet, it is logically deleted and it is
+         *  safe to replicate the DEL/UNLINK.  Since iterators process items FIFO, the replication
+         *  for DEL/UNLINK won't actually get processed until other queued replication is processed.
+         *
+         * In the case of a client driven DEL command, the key will have already been deleted when
+         *  we hit this routine.  In the case of EXPIRE/EVICT, the propagation happens before the key
+         *  is deleted.  So if the key is missing, we can use the cached replication decision.  But
+         *  if the key still exists (indicating EXPIRE/EVICT) we evaluate it specially.
+         */
+        bool shouldReplicateDelCommand = false;
+        if (isDelCommand) {
+            sds key = objectGetVal(argv[1]);
+            if (it->keyset_iter->isKeyInScope(it->keyset_iter, key)) {
+                dbEntry *de = dbFind(server.db[dbid], key);
+                if (de) {
+                    // NOTE:  It's weird, but helpful, for both EXPIRE and EVICT the propagation happens
+                    //        BEFORE the actual delete.  So if the dbEntry still exists, we are doing
+                    //        an expire/evict which is not preceded by blockClientIfRequired().
+                    if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)
+                            || (dictFind(it->early_iterate_entries, de) != NULL)) {
+                        shouldReplicateDelCommand = true;
+                    }
+                } else {
+                    // The dbEntry has already been deleted, this must be part of normal command
+                    //  processing.
+                    shouldReplicateDelCommand = it->cur_cmd_may_replicate;
+                }
+            }
+        }
+
+        bool replicate = (it->iteration_flags & BGITERATOR_FLAG_REPLICATION &&
+                ((!isDelCommand && it->cur_cmd_may_replicate)
+                        || shouldReplicateDelCommand));
+
+        if (replicate) {
+            /* We will replicate the command in these cases:
+             * 1) For consistent iteration - it->cur_cmd_may_replicate is always true
+             * 2) For non-consistent, if any of the keys have been processed, expediteKeysForWrite
+             *    will ensure that ALL of the keys have been expedited - and we should replicate
+             * 3) For non-consistent, if NONE of the keys have been processed, no need to replicate
+             */
+
+            if (BGITERATION_DEBUG) debugBuffer = sdscat(debugBuffer, " (queued)\n");
+
+            bgIteratorItem *item = itemFreeList_getElementOrAllocate();
+            item->type = BGITERATOR_ITEM_REPLICATION;
+            item->dbid = dbid;
+            item->u.repl.cmd = cmd;
+            item->u.repl.argv = cloneRobjArray(argc, argv);
+            item->u.repl.argc = argc;
+            bufferedReplicationBytes += replicationItemSize(item);
+            it->replication_queued++;
+            mutexQueueAdd(it->items_for_iterator, item);
+        }
+    } // allIterators loop
+}
+
+
+// PUBLIC API:  running byte total for buffered replication items (grown as items are queued).
+size_t bgIteration_memoryInuseForReplication(void) {
+    return bufferedReplicationBytes;
+}
+
+
+// PUBLIC API:  main-thread-only wrapper around isEntryInuseByAnyIterator().
+bool bgIteration_isEntryInuse(dbEntry *de) {
+    serverAssert(onValkeyMainThread());
+    return isEntryInuseByAnyIterator(de);
+}
+
+
+// PUBLIC API:  returns the current value of the file-level bgIteration_epoch.
+uint32_t bgIteration_getEpoch(void) {
+    return bgIteration_epoch;
+}
+
+
+// PUBLIC API:  an entry's pointer changed (old -> new); re-key every early-iterate set holding it.
+void bgIteration_updateDbEntryPtr(dbEntry *old, dbEntry *new) {
+    if (!bgIteration_iterationActive() || old == new) return;
+    serverAssert(onValkeyMainThread());
+    serverAssert(!isEntryInuseByAnyIterator(old));
+
+    listIter cursor;
+    listNode *ln;
+    listRewind(allIterators, &cursor);
+    while ((ln = listNext(&cursor)) != NULL) {
+        bgIterator *it = listNodeValue(ln);
+        // dictDelete reports whether `old` was a member; only those sets need the new pointer.
+        if (dictDelete(it->early_iterate_entries, old) != DICT_OK) continue;
+        if (BGITERATION_DEBUG) {
+            debugBuffer = sdscatprintf(debugBuffer, "EARLY LIST UPDATE %p -> %p\n", (void *)old, (void *)new);
+        }
+        dictAdd(it->early_iterate_entries, new, NULL);
+    }
+}
diff --git a/src/bgiteration.h b/src/bgiteration.h
new file mode 100644
index 00000000000..35a4b988857
--- /dev/null
+++ b/src/bgiteration.h
@@ -0,0 +1,363 @@
+#ifndef __BGITERATION_H
+#define __BGITERATION_H
+
+#include <stdbool.h>
+#include "sds.h"
+
+/* A mechanism for creating iteration clients which iterate over the main dictionary in a
+ * background thread.
+ *
+ * This mechanism passes keys to the iteration client, while blocking the keys from write by the
+ * Valkey main thread.  Once an iteration client is done with a key, it is returned to the Valkey
+ * main thread and any pending writers are unblocked.
+ *
+ * A bgIterator must be created on the main Valkey thread, and then passed to another thread which
+ * implements the logic of the iteration client.
+ *
+ * Iteration clients are expected to read through the keyspace until the iteration is complete or
+ * terminated.  An iteration client may not perform modifications on a key.
+ *
+ * Future enhancement:  Certain types of modifications may be passed back to the Valkey main thread.
+ *                      Use case: A background compression thread wants to compress a string value.
+ */
+
+/* Avoids dependency on server.h */
+typedef struct serverObject dbEntry;    // An object with key/value inserted into main dictionary
+typedef struct serverObject robj;       // An object with a value used for command parameters
+typedef struct client client;
+
+/* The bgIterator is an opaque structure.  */
+typedef struct bgIterator bgIterator;
+
+
+/* Flag indicates that a consistent iteration is required.  This is used to create a point-in-time
+ * iteration.  The iteration client will see all keys AS THEY EXISTED at the time when the iterator
+ * was created.
+ * Note:  The DBID provided with the DICTENTRY events is the original DBID (at the time of iteration
+ *        start).  SWAPDB events are NOT provided during a consistent iteration.  */
+#define BGITERATOR_FLAG_CONSISTENT (1 << 0)
+
+/* Flag indicating that the replication stream for keys which have already been processed should be
+ * forwarded to the iteration client.  Most useful for non-consistent iteration to track changes
+ * to keys already processed.  By tracking changes, this allows a non-consistent iteration client
+ * to achieve a consistent view at the END of the iteration.
+ * NOTE:  Replication events will be provided ordered and synchronized with any SWAPDB events.
+ * LIMITATION:  Since SWAPDB events are not provided during CONSISTENT iteration, it is not
+ *              permitted to use both CONSISTENT and REPLICATION on a non-clustermode instance.  */
+#define BGITERATOR_FLAG_REPLICATION (1 << 1)
+
+
+/* When running an iterator with replication, a replication-done function (callback) may be
+ * provided.  This function will be executed after the last replication item has been fed into the
+ * queue for the client.  This function will be run on the Valkey main thread, and allows a client
+ * to recognize the point where no additional replication data will be sent for processing.
+ *
+ * PRIVDATA:    this pointer is for data private to the iteration client.
+ *
+ * Returns true when an iterator stops accepting any replication item into the queue for the client.
+ * If false is returned, replication will continue, and bgiteration will periodically call the callback
+ * until true is returned. In this context, returning false indicates that the client is not ready to
+ * stop receiving replication, it is requesting that replication be continued.
+ */
+typedef bool (*bgIteratorReplDoneFunc)(void *privdata);
+
+
+/* When creating a bgIterator, a cleanup function (callback) may be provided.  This function will be
+ * executed once iteration has completed and this will run on the Valkey main thread.
+ *
+ * TERMINATED:  will be passed as TRUE if the iteration process was terminated early (either by
+ *              the main thread calling bgIteratorTerminate() or the iteration client calling
+ *              bgIteratorClose()).
+ * PRIVDATA:    this pointer is for data private to the iteration client.
+ */
+typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata);
+
+
+/* Create a background full-scan iterator (bgIterator).
+ * This bgIterator will iterate through the entire keyspace (across all DBs).
+ *
+ * NAME:        a human readable name for the iterator (must be unique)
+ * FLAGS:       creation flags indicate iteration options
+ * REPLDONE:    if provided, called after the last replication item has been queued (on the Valkey main thread)
+ * CLEANUP:     if provided, called at the end of iteration (on the Valkey main thread)
+ * PRIVDATA:    passed to cleanup function
+ *
+ * This method creates and initializes the bgIterator.  It does not perform any thread management.
+ * It is expected that the main Valkey thread will call this method, and then start a new thread to
+ * implement the iteration client which will read from the returned bgIterator.
+ *
+ * There is no need to delete/destroy a bgIterator.  It will automatically be cleaned up after the
+ * last item is read.
+ */
+bgIterator * bgIteratorCreateFullScanIter(
+        const char *name,
+        int flags,
+        bgIteratorReplDoneFunc repldone,
+        bgIteratorCleanupFunc cleanup,
+        void *privdata);
+
+
+/* Create a background slots iterator (bgIterator).
+ * This bgIterator will iterate through the keys belonging to a set of cluster slots.
+ *
+ * NAME:        a human readable name for the iterator (must be unique)
+ * FLAGS:       creation flags indicate iteration options
+ * SLOTS:       array of cluster slots to iterate over
+ * SLOTS_COUNT: size of the array of slots
+ * REPLDONE:    if provided, called after the last replication item has been queued (on the Valkey main thread)
+ * CLEANUP:     if provided, called at the end of iteration (on the Valkey main thread)
+ * PRIVDATA:    passed to cleanup function
+ *
+ * This method creates and initializes the bgIterator.  It does not perform any thread management.
+ * It is expected that the main Valkey thread will call this method, and then start a new thread to
+ * implement the iteration client which will read from the returned bgIterator.
+ *
+ * The caller of this function has the ownership of the `slots` array's memory. This function will
+ * just copy its data and leave the array untouched.
+ *
+ * There is no need to delete/destroy a bgIterator.  It will automatically be cleaned up after the
+ * last item is read.
+ */
+bgIterator * bgIteratorCreateSlotsIter(
+        const char *name,
+        int flags,
+        const int *slots,
+        int slots_count,
+        bgIteratorReplDoneFunc repldone,
+        bgIteratorCleanupFunc cleanup,
+        void *privdata);
+
+
+/* Find an existing bgIterator by name.
+ * Returns NULL if the iterator does not exist (or has completed).
+ */
+bgIterator * bgIteratorFind(const char *name);
+
+
+/* Get the name of an existing iterator.  */
+const char * bgIteratorName(bgIterator *iter);
+
+
+/* Snapshot of status information for an iteration client; filled in by bgIteratorGetStatus().  */
+typedef struct {
+    unsigned long dbentries_queued;         // Cumulative BGITERATOR_ITEM_DBENTRY queued
+    unsigned long dbentries_processed;      // Cumulative BGITERATOR_ITEM_DBENTRY processed
+    unsigned long replication_queued;       // Cumulative BGITERATOR_ITEM_REPLICATION queued
+    unsigned long replication_processed;    // Cumulative BGITERATOR_ITEM_REPLICATION processed
+    unsigned long swapdb_queued;            // Cumulative BGITERATOR_ITEM_SWAPDB queued
+    unsigned long swapdb_processed;         // Cumulative BGITERATOR_ITEM_SWAPDB processed
+    unsigned long flushdb_queued;           // Cumulative BGITERATOR_ITEM_FLUSHDB queued
+    unsigned long flushdb_processed;        // Cumulative BGITERATOR_ITEM_FLUSHDB processed
+    unsigned long dbentry_clones_queued;    // A subset of dbentries_queued for cloned entries
+    unsigned long dbentry_clones_processed; // A subset of dbentries_processed for cloned entries
+    unsigned long queue_length;             // Current length of queue to iteration client
+    unsigned long queue_length_target;      // Dynamic target length for queue to iteration client
+    unsigned long runtime_ms;               // Milliseconds elapsed since the iterator's start time
+    unsigned long current_item_ms;          // Milliseconds on the current item; 0 while idle (no item)
+} bgIteratorStatus;
+
+
+/* Get the status of a background iteration.
+ *
+ * The caller-provided bgIteratorStatus will be populated.
+ */
+void bgIteratorGetStatus(bgIterator *iter, bgIteratorStatus *status);
+
+
+/* Terminate a background iteration.
+ *
+ * An iteration is terminated by the Valkey main thread.  It is expected that the iteration client
+ * will continue to read, receiving BGITERATOR_ITEM_TERMINATED or BGITERATOR_ITEM_COMPLETE to
+ * complete the iteration.  (This is necessary to ensure proper cleanup.)
+ * NOTE:  If the iteration client wants to terminate iteration, it may call bgIteratorClose().
+ */
+void bgIteratorTerminate(bgIterator *iter);
+
+
+/* Check if an iterator is being terminated.
+ *
+ * This checks if the iterator is in the process of terminating.  For the Valkey main thread, this
+ * can be used to determine if a call has already been made to bgIteratorTerminate.  For an
+ * iteration client, it normally learns about terminate by reading the next item, this allows
+ * out-of-band detection of termination which can be useful when processing a large key.
+ */
+bool bgIteratorIsTerminating(bgIterator *iter);
+
+
+typedef enum {
+    /* Indicates that the iteration has completed normally.  No more items to read.
+     * If replication is enabled, on completion, the final replication offset is recorded in
+     *  'u.master_repl_offset' and 'dbid' is set to the selected replication db.  The iteration
+     *  client will have received all *applicable* replication data to this point.  */
+    BGITERATOR_ITEM_COMPLETE = 1,
+
+    /* Indicates that the iteration has been terminated before completion.  No more items to read. */
+    BGITERATOR_ITEM_TERMINATED,
+
+    /* A dbEntry for DB=dbid.
+     * NOTE:  The dbEntry MAY be expired.  It is up to the client to decide how to handle
+     *        expired entries.  */
+    BGITERATOR_ITEM_DBENTRY,
+
+    /* A replication command for DB=dbid.  cmd, argv, & argc provided.
+     * NOTE:  The command may have been re-written before replication.  */
+    BGITERATOR_ITEM_REPLICATION,
+
+    /* A SWAPDB event.  dbid swapped with dbid2.
+     * Note that SWAPDB events are not provided during consistent iteration.  */
+    BGITERATOR_ITEM_SWAPDB,
+
+    /* A FLUSHDB event.  In most cases, iteration will be terminated, and this event will NOT be
+     * sent.  However, in the case of a single minor DB being flushed, non-consistent iteration is
+     * permitted to continue.  */
+    BGITERATOR_ITEM_FLUSHDB
+} bgIteratorItemType;
+
+
+typedef struct {                // Payload of bgIteratorItem.u.dbe (BGITERATOR_ITEM_DBENTRY)
+    dbEntry *de;                // The entry being delivered; it MAY be expired (client decides)
+    bool is_cloned;             // NOTE(review): presumably set when 'de' is a cloned entry - confirm
+    bool is_rehashing_paused;   // NOTE(review): presumably tracks a rehash pause held for 'de' - confirm
+} dbEntryData;
+
+typedef struct {                // Payload of bgIteratorItem.u.repl (BGITERATOR_ITEM_REPLICATION)
+    struct serverCommand *cmd;  // Command being replicated (may have been re-written)
+    robj **argv;                // Argument vector supplied with cmd
+    int argc;                   // Number of entries in argv
+} replicationData;
+
+typedef struct {
+    bgIteratorItemType type;
+    int dbid;       /* orig DB ID for CONSISTENT, queue-time DB ID for !CONSISTENT.  */
+    union {
+        dbEntryData dbe;                // for BGITERATOR_ITEM_DBENTRY
+        replicationData repl;           // for BGITERATOR_ITEM_REPLICATION
+        long long master_repl_offset;   // for BGITERATOR_ITEM_COMPLETE
+        int dbid2;                      // for BGITERATOR_ITEM_SWAPDB
+    } u;
+} bgIteratorItem;
+
+
+/* Read the next bgIteratorItem from the bgIterator.
+ *
+ * The iteration client is expected to call this function in a loop.  After reading
+ * BGITERATOR_ITEM_COMPLETE or BGITERATOR_ITEM_TERMINATED, the iteration client must call
+ * bgIteratorClose to finalize the iteration process.
+ *
+ * This is a blocking call.  If the main Valkey thread has been too busy to send items to the
+ * iterator, the iteration client's queue may run dry and this call will block until data is
+ * available.
+ *
+ * NOTE: Reading an item returns previously read items to Valkey.  It is unsafe to reference an item
+ * previously read.
+ *
+ * (All memory management is the responsibility of the bgIterator - not the reader.)
+ */
+bgIteratorItem * bgIteratorRead(bgIterator *iter);
+
+
+/* Close the bgIterator, allowing the bgIterator to be deallocated.
+ *
+ * This must be called by an iteration client to release the bgIterator.
+ *
+ * It is required that this is called after receiving BGITERATOR_ITEM_COMPLETE or
+ * BGITERATOR_ITEM_TERMINATED and signals that the background activity is complete.
+ *
+ * This may also be called by the iteration client to force terminate an iteration early.  The
+ * bgIterator will be marked as terminated.
+ */
+void bgIteratorClose(bgIterator *iter);
+
+
+/********************************************************************************************
+ * BGITERATION HOOKS REQUIRED TO SUPPORT ITERATION - CALLS INSERTED INTO MAIN VALKEY CODE
+ ********************************************************************************************/
+
+typedef struct {
+    uint32_t iterator_epoch;    // iteration epoch (bgIteration_getEpoch) at last modification
+} bgIterationEntryMetadata;
+
+
+/* Must be called once (and only once) at server startup.  */
+void bgIteration_init(void);
+
+
+/* Returns true if any iterators are currently active. */
+bool bgIteration_iterationActive(void);
+
+
+/* Notify bgIteration that a key is being deleted.  In Valkey, key deletion can occur in a READ
+ * command if the key is expired.  Note that this notification is more about status than memory.
+ * Since the dbEntry is a reference counted object, the dbEntry can't be physically deleted if
+ * bgIteration is still actively using it.
+ */
+void bgIteration_keyDelete(int dbid, const_sds key);
+
+
+/* Iteration needs to know if a FLUSHALL is being performed.  For normal clients, this comes through
+ * the standard "blockClientIfRequired" interface.  This interface is for cases where Valkey
+ * performs the FLUSHALL operation independently of clients (e.g. when syncing with master).
+ */
+void bgIteration_flushall(void);
+
+
+/* Updating value or expiration of an existing key may lead to reallocation of the dbEntry (robj).
+ * BgIteration keeps track of expedited keys (by pointer) to avoid repeated iteration.  BgIteration
+ * must be notified when dbEntries are reallocated.  BgIteration will not dereference the pointers;
+ * it is safe to have deallocated the old dbEntry before calling this function.
+ *
+ * We can't update the dbEntry if the entry is actually in use (bgIteration_isEntryInuse)!
+ *
+ * To simplify calling code, this function does nothing if old_entry == new_entry.
+ */
+void bgIteration_updateDbEntryPtr(dbEntry *old_entry, dbEntry *new_entry);
+
+
+/* Before executing any command, the Valkey main thread must call this function.  If the key(s) are
+ * blocked for writes by an iterator, the function returns true and the client is blocked.  A
+ * blocked client will be unblocked once the key becomes available for write.
+ *
+ * This should be called for all commands - even commands which are executed as part of a MULTI/EXEC
+ * or LUA script.
+ *
+ * For MULTI/EXEC - This function is called when hitting the EXEC - after all of the commands
+ *                  have been queued.  This may block the EXEC, but will NOT block individual
+ *                  commands as they are executed in the MULTI/EXEC block.
+ *
+ * For LUA script - This function is first called for EVAL/EVALSHA.  It may block the script while
+ *                  waiting on declared keys.  However, if the script accesses undeclared keys or
+ *                  performs SWAPDB, a synchronous block may be performed (returning false) on
+ *                  individual commands within the script.
+ *
+ * Note: this function should be called for all commands (not just writes).
+ */
+bool bgIteration_blockClientIfRequired(client *c);
+
+
+/* After execution of a write command, the Valkey main thread must provide the command to iterators
+ * which are interested in the replication feed.  It is required that all commands have been passed
+ * through bgIteration_blockClientIfRequired(), however, it is permitted that the command can be
+ * re-written for propagation.
+ */
+void bgIteration_handleCommandReplication(
+        int dbid,
+        struct serverCommand *cmd,
+        int argc,
+        robj **argv);
+
+
+/* The memory that bgIteration uses while temporarily buffering replication data is not included in
+ * the maxmemory computation used for eviction.  This function provides insight into the current
+ * amount of memory used for buffered replication data.
+ */
+size_t bgIteration_memoryInuseForReplication(void);
+
+
+/* Check if a dbEntry is currently in-use/locked by bgIteration. */
+bool bgIteration_isEntryInuse(dbEntry *de);
+
+
+/* Get the current iteration epoch, for tagging metadata on keys. */
+uint32_t bgIteration_getEpoch(void);
+
+#endif
diff --git a/src/db.c b/src/db.c
index ba9d25c2fa6..d48bc4b935a 100644
--- a/src/db.c
+++ b/src/db.c
@@ -37,6 +37,7 @@
 #include "module.h"
 #include "vector.h"
 #include "expire.h"
+#include "bgiteration.h"
 
 /*-----------------------------------------------------------------------------
  * C-level DB API
@@ -361,6 +362,7 @@ static void dbSetValue(serverDb *db, robj *key, robj **valref, int overwrite, vo
         val->lru = old->lru;
         long long expire = objectGetExpire(old);
         new = objectSetKeyAndExpire(val, objectGetVal(key), expire);
+        bgIteration_updateDbEntryPtr(old, new);
         *oldref = new;
         /* Replace the old value at its location in the expire space. */
         if (expire >= 0) {
@@ -430,6 +432,8 @@ void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags) {
     } else {
         dbSetValue(db, key, valref, 1, NULL);
     }
+    bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(*valref);
+    if (md) md->iterator_epoch = bgIteration_getEpoch();
     if (!(flags & SETKEY_KEEPTTL)) removeExpire(db, key);
     if (!(flags & SETKEY_NO_SIGNAL)) signalModifiedKey(c, db, key);
 }
@@ -475,6 +479,8 @@ int dbGenericDeleteWithDictIndex(serverDb *db, robj *key, int async, int flags,
     hashtablePosition pos;
     void **ref = kvstoreHashtableTwoPhasePopFindRef(db->keys, dict_index, objectGetVal(key), &pos);
     if (ref != NULL) {
+        bgIteration_keyDelete(db->id, (sds)objectGetVal(key));
+
         robj *val = *ref;
         /* VM_StringDMA may call dbUnshareStringValue which may free val, so we
          * need to incr to retain val */
@@ -753,6 +759,15 @@ long long dbTotalServerKeyCount(void) {
 void signalModifiedKey(client *c, serverDb *db, robj *key) {
     touchWatchedKey(db, key);
     trackingInvalidateKey(c, key, 1);
+
+    /* If bgIteration is running, need to maintain the iteration epoch. */
+    if (bgIteration_iterationActive()) {
+        dbEntry *o = dbFind(db, objectGetVal(key));
+        if (o) {
+            bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(o);
+            if (md) md->iterator_epoch = bgIteration_getEpoch();
+        }
+    }
 }
 
 void signalFlushedDb(int dbid, int async) {
@@ -2255,7 +2270,7 @@ robj *dbFindExpires(serverDb *db, sds key) {
 }
 
 unsigned long long dbSize(serverDb *db) {
-    return kvstoreSize(db->keys);
+    return (db->keys) ? kvstoreSize(db->keys) : 0;
 }
 
 unsigned long long dbScan(serverDb *db, unsigned long long cursor, kvstoreScanFunction scan_cb, void *privdata) {
diff --git a/src/hashtable.c b/src/hashtable.c
index dcae6dfa014..1dcb8038030 100644
--- a/src/hashtable.c
+++ b/src/hashtable.c
@@ -214,6 +214,8 @@ static_assert(100 * BUCKET_DIVISOR / BUCKET_FACTOR / ENTRIES_PER_BUCKET <= MAX_F
               "Expand must result in a fill below the soft max fill factor");
 static_assert(MAX_FILL_PERCENT_SOFT <= MAX_FILL_PERCENT_HARD, "Soft vs hard fill factor");
 
+#define ITERATOR_DONE_WITH_BUCKET_IDX (ENTRIES_PER_BUCKET + 1)
+
 /* --- Random entry --- */
 
 #define FAIR_RANDOM_SAMPLE_SIZE (ENTRIES_PER_BUCKET * 10)
@@ -344,7 +346,7 @@ typedef struct {
 } position;
 
 static_assert(sizeof(hashtablePosition) >= sizeof(position),
-              "Opaque iterator size");
+              "Opaque position size");
 
 /* State for incremental find. */
 typedef struct {
@@ -612,7 +614,8 @@ static bucket *fetchEntriesForExpand(bucket *b, void *buf[], int *size, int max_
 
 /* Processes one bucket chain during incremental table expansion.
  * Uses batch processing to optimize memory access patterns. */
-static void rehashStepExpand(hashtable *ht) {
+// Not API, but not static - used in unit testing
+void rehashStepExpand(hashtable *ht) {
     void *entry_buf[FETCH_ENTRY_BUFFER_SIZE_WHEN_EXPAND];
     const void *key_buf[FETCH_ENTRY_BUFFER_SIZE_WHEN_EXPAND];
     size_t idx = ht->rehash_idx;
@@ -1377,13 +1380,13 @@ void hashtableResumeAutoShrink(hashtable *ht) {
  * spaces, "holes", in the bucket chains, which wastes memory. Additionally, we
  * pause auto shrink when rehashing is paused, meaning the hashtable will not
  * shrink the bucket count. */
-static void hashtablePauseRehashing(hashtable *ht) {
+void hashtablePauseRehashing(hashtable *ht) {
     ht->pause_rehash++;
     hashtablePauseAutoShrink(ht);
 }
 
 /* Resumes incremental rehashing, after pausing it. */
-static void hashtableResumeRehashing(hashtable *ht) {
+void hashtableResumeRehashing(hashtable *ht) {
     ht->pause_rehash--;
     assert(ht->pause_rehash >= 0);
     hashtableResumeAutoShrink(ht);
@@ -2268,7 +2271,9 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) {
              * child bucket in a chain, or to the next bucket index, or to the
              * next table. */
             iter->pos_in_bucket++;
-            if (iter->bucket->chained && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1) {
+            if (iter->bucket->chained
+                    && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1
+                    && iter->pos_in_bucket != ITERATOR_DONE_WITH_BUCKET_IDX) {
                 iter->pos_in_bucket = 0;
                 iter->bucket = getChildBucket(iter->bucket);
             } else if (iter->pos_in_bucket >= ENTRIES_PER_BUCKET) {
@@ -2562,3 +2567,68 @@ int hashtableLongestBucketChain(hashtable *ht) {
     }
     return maxlen;
 }
+
+/* Internal helper - deliberately NOT part of the public hashtable API.  Callers must declare it
+ * themselves; it must not be added to any .h (API) file.  Avoid using it where possible, since it
+ * exposes internal structure which may change.
+ *
+ * Looks up 'key' and reports where it currently lives:
+ *   table_idx - the index of the internal table (0 or 1)
+ *   bucket_idx - the bucket index within that table (0..n)
+ *
+ * Returns true if the key exists in the table.
+ * Returns false if the key doesn't exist (table_idx/bucket_idx are not written; don't use them).
+ */
+bool hashtableInternalFindBucketIdx(hashtable *ht, void *key, int *table_idx, size_t *bucket_idx) {
+    int found_table;
+    int found_pos; /* Required by findBucket; the slot position is not reported to the caller. */
+    uint64_t key_hash = hashKey(ht, key);
+
+    if (findBucket(ht, key_hash, key, &found_pos, &found_table) == NULL) return false;
+
+    *table_idx = found_table;
+    *bucket_idx = key_hash & expToMask(ht->bucket_exp[found_table]);
+    return true;
+}
+
+/* Internal helper - deliberately NOT part of the public hashtable API.  Callers must declare it
+ * themselves; it must not be added to any .h (API) file.  Avoid using it where possible, since it
+ * exposes internal structure which may change.
+ *
+ * Reports the iterator's current location:
+ *   table_idx - the index of the internal table (0 or 1)
+ *   bucket_idx - the bucket index within that table (0..n)
+ *
+ * NOTE: hashtableIterator position is based on the LAST item returned.
+ */
+void hashtableInternalIteratorGetBucketIdx(hashtableIterator *iterator, int *table_idx, size_t *bucket_idx) {
+    iter *state = iteratorFromOpaque(iterator);
+    *table_idx = state->table;
+    *bucket_idx = state->index;
+}
+
+/* This is an internal function - not part of the standard API.  It must be explicitly declared
+ * where used.  It shouldn't be included in any .h (API) file.  Use of this interface is discouraged
+ * as it depends on the internal structure, which may change.
+ *
+ * Returns TRUE if the iterator is ready to move to the next bucket index (if it has completed the
+ * current bucket index).  Note: hashtableIterator bucket_idx is the bucket index of the last item
+ * returned by hashtableNext.  Always returns FALSE while the current bucket is chained.
+ *
+ * Note: If this function returns true, the iterator commits to move onto the next bucket index,
+ * even if something new is added to the end of the current bucket before hashtableNext is called.
+ */
+bool hashtableInternalIteratorIsBucketIdxComplete(hashtableIterator *iterator) {
+    iter *it = iteratorFromOpaque(iterator);
+
+    if (it->bucket->chained) return false;
+
+    if (!(it->bucket->presence >> (it->pos_in_bucket + 1))) {
+        /* There's CURRENTLY nothing else to return at this bucket index.  hashtableNext increments
+         * pos_in_bucket BEFORE comparing it with ITERATOR_DONE_WITH_BUCKET_IDX, so store one less
+         * than the marker; otherwise the check never matches once this bucket becomes chained. */
+        it->pos_in_bucket = ITERATOR_DONE_WITH_BUCKET_IDX - 1;
+        return true;
+    }
+    return false;
+}
diff --git a/src/hashtable.h b/src/hashtable.h
index 8bbf5d8c05b..97ecab68518 100644
--- a/src/hashtable.h
+++ b/src/hashtable.h
@@ -129,6 +129,8 @@ size_t hashtableMemUsage(const hashtable *ht);
 void hashtablePauseAutoShrink(hashtable *ht);
 void hashtableResumeAutoShrink(hashtable *ht);
 bool hashtableIsRehashing(hashtable *ht);
+void hashtablePauseRehashing(hashtable *ht);
+void hashtableResumeRehashing(hashtable *ht);
 bool hashtableIsRehashingPaused(hashtable *ht);
 ssize_t hashtableGetRehashingIndex(hashtable *ht);
 void hashtableRehashingInfo(hashtable *ht, size_t *from_size, size_t *to_size);
diff --git a/src/kvstore.c b/src/kvstore.c
index 86078cfc1ab..1ac72a01dc2 100644
--- a/src/kvstore.c
+++ b/src/kvstore.c
@@ -689,6 +689,16 @@ int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it) {
     return kvs_it->didx;
 }
 
+/* This is an internal function - not part of the standard API.  It must be explicitly declared
+ * where used.  It shouldn't be included in any .h (API) file.  Use of this interface is discouraged
+ * as it depends on the internal structure, which may change.
+ *
+ * Return a pointer to the hashtableIterator embedded in the kvstoreIterator ('di'), not a copy.
+ */
+hashtableIterator *kvstoreInternalIteratorGetCurrentHashtableIterator(kvstoreIterator *kvs_it) {
+    return &kvs_it->di;
+}
+
 /* Fetches the next element and returns true. Returns false if there are no more elements. */
 bool kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next) {
     if (kvs_it->didx != KVSTORE_INDEX_NOT_FOUND && hashtableNext(&kvs_it->di, next)) {
diff --git a/src/module.c b/src/module.c
index c2511dbb54e..3bcfa2d3aae 100644
--- a/src/module.c
+++ b/src/module.c
@@ -70,6 +70,7 @@
 #include "io_threads.h"
 #include "scripting_engine.h"
 #include "cluster_migrateslots.h"
+#include "bgiteration.h"
 #include <dlfcn.h>
 #include <sys/stat.h>
 #include <sys/wait.h>
@@ -4464,6 +4465,7 @@ int VM_SetAbsExpire(ValkeyModuleKey *key, mstime_t expire) {
  * When async is set to true, db contents will be freed by a background thread. */
 void VM_ResetDataset(int restart_aof, int async) {
     if (restart_aof && server.aof_state != AOF_OFF) stopAppendOnly();
+    bgIteration_flushall();
     flushAllDataAndResetRDB((async ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS) | EMPTYDB_NOFUNCTIONS);
     if (server.aof_enabled && restart_aof) restartAOFAfterSYNC();
 }
diff --git a/src/object.c b/src/object.c
index 21eb57e5cbd..f4545cf8025 100644
--- a/src/object.c
+++ b/src/object.c
@@ -38,6 +38,7 @@
 #include "zmalloc.h"
 #include "sds.h"
 #include "module.h"
+#include "bgiteration.h"
 #include <math.h>
 #include <ctype.h>
 
@@ -340,7 +341,7 @@ robj *createStringObjectFromSds(const_sds s) {
     return createStringObject(s, sdslen(s));
 }
 
-static robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const_sds key, long long expire) {
+robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const_sds key, long long expire) {
     if (shouldEmbedStringObject(len, key, expire)) {
         return createEmbeddedStringObjectWithKeyAndExpire(ptr, len, key, expire);
     } else {
@@ -447,6 +448,7 @@ void objectUnembedVal(robj *o) {
 robj *objectSetKeyAndExpire(robj *o, const_sds key, long long expire) {
     if (o->type == OBJ_STRING && o->encoding == OBJ_ENCODING_EMBSTR) {
         robj *new = createStringObjectWithKeyAndExpire(objectGetVal(o), sdslen(objectGetVal(o)), key, expire);
+        bgIteration_updateDbEntryPtr(o, new);
         new->lru = o->lru;
         decrRefCount(o);
         return new;
@@ -471,6 +473,7 @@ robj *objectSetKeyAndExpire(robj *o, const_sds key, long long expire) {
         serverPanic("Not implemented");
     }
     robj *new = createUnembeddedObjectWithKeyAndExpire(o->type, ptr, key, expire);
+    bgIteration_updateDbEntryPtr(o, new);
     new->encoding = o->encoding;
     new->lru = o->lru;
     decrRefCount(o);
diff --git a/src/rdb.c b/src/rdb.c
index e4e006a16ec..ae16f62bd26 100644
--- a/src/rdb.c
+++ b/src/rdb.c
@@ -46,6 +46,7 @@
 #include "module.h"
 #include "cluster.h"
 #include "cluster_migrateslots.h"
+#include "bgiteration.h"
 
 #include <math.h>
 #include <fcntl.h>
@@ -3171,6 +3172,7 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin
     if (rdbflags & RDBFLAGS_EMPTY_DATA) {
         int empty_db_flags = server.repl_replica_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS;
         serverLog(LL_NOTICE, "RDB signature and version check passed. Flushing old data");
+        bgIteration_flushall();
         emptyData(-1, empty_db_flags, replicationEmptyDbCallback);
 
         /* functionsLibCtx is cleared when we call emptyData, reinitialize here. */
diff --git a/src/replication.c b/src/replication.c
index 9c8c56d44d2..9f1e00087e6 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -41,6 +41,7 @@
 #include "connection.h"
 #include "module.h"
 #include "cluster_migrateslots.h"
+#include "bgiteration.h"
 
 #include <memory.h>
 #include <sys/time.h>
@@ -2482,6 +2483,7 @@ int replicaLoadPrimaryRDBFromSocket(connection *conn, char *buf, char *eofmark,
             } else {
                 /* Remove the half-loaded data in case the load failed for other reasons. */
                 serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data");
+                bgIteration_flushall();
                 emptyData(-1, empty_db_flags, replicationEmptyDbCallback);
             }
         }
@@ -2585,6 +2587,7 @@ int replicaLoadPrimaryRDBFromDisk(rdbSaveInfo *rsi) {
         } else {
             /* If disk-based RDB loading fails, remove the half-loaded dataset. */
             serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data");
+            bgIteration_flushall();
             emptyData(-1, empty_db_flags, replicationEmptyDbCallback);
         }
 
diff --git a/src/server.c b/src/server.c
index 4eb7798a924..ecbae40c2f5 100644
--- a/src/server.c
+++ b/src/server.c
@@ -54,6 +54,7 @@
 #include "util.h"
 
 #include "eval.h"
+#include "bgiteration.h"
 
 #include "trace/trace_commands.h"
 
@@ -3018,8 +3019,11 @@ void initServer(void) {
 
     /* Set object metadata size before creating any database key objects */
     if (server.forkless_options_supported) {
-        objectSetMetadataSize(sizeof(uint32_t)); /* This is a placeholder until Threadsave defines a metadata structure */
-                                                 /* 4 bytes for iterator_epoch for now*/
+        /* NOTE: At this time, there is only one reason for dbEntry metadata.  bgIteration.  However,
+         * if/when new metadata options are added, we will need to compute the size of a variable
+         * size metadata, and provide appropriate accessors to access the specific portion of the
+         * metadata (each of which may/may not exist, based on immutable startup parameters).  */
+        objectSetMetadataSize(sizeof(bgIterationEntryMetadata));
     }
 
     createDatabaseIfNeeded(0); /* The default database should always exist */
@@ -3141,6 +3145,7 @@ void initServer(void) {
     commandlogInit();
     latencyMonitorInit();
     initSharedQueryBuf();
+    bgIteration_init();
 
     /* Initialize ACL default password if it exists */
     ACLUpdateDefaultUserPassword(server.requirepass);
@@ -3702,6 +3707,11 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot)
     if (propagate_to_slot_migration) clusterFeedSlotExportJobs(dbid, argv, argc, slot);
 }
 
+// If true, a MULTI has been sent to bgIterator.
+//  Remember to send the matching EXEC in propagatePendingCommands().
+static bool sentMultiToBgIterator = false;
+static int lastDbidSentToBgIterator;
+
 /* Used inside commands to schedule the propagation of additional commands
  * after the current command is propagated to AOF / Replication.
  *
@@ -3714,6 +3724,29 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot)
  * stack allocated).  The function automatically increments ref count of
  * passed objects, so the caller does not need to. */
 void alsoPropagate(int dbid, robj **argv, int argc, int target, int slot) {
+    if (target & PROPAGATE_REPL && bgIteration_iterationActive()) {
+        // Note that bgIterator must be invoked immediately after each command.  This is required
+        //  for proper processing in the bgIterator state machine.  It's NOT ok to call bgIterator
+        //  from propagateNow as that handles all of the commands for a transaction at the end.
+        // THIS FUNCTION (alsoPropagate) is called after each command.
+        if (!sentMultiToBgIterator && (scriptIsRunning() || server.in_exec)) {
+            // For a script or multi/exec, we should be sending the MULTI at the beginning of the
+            //  execution unit.  There shouldn't be any commands in the propagation queue yet.
+            serverAssert(server.also_propagate.numops == 0);
+            // If this is the first propagated command of a script or multi, make it a transaction.
+            //  It may turn out that there is only 1 command in the MULTI block, but we can't know
+            //  that now.  Unlike regular replication, we can't defer all of the replication until
+            //  we know for sure.  We must call bgIterator after each command.
+            static struct serverCommand* cmd_multi = NULL;   // STATIC to avoid repeated lookups
+            if (cmd_multi == NULL) cmd_multi = lookupCommandOrOriginal(&shared.multi, 1);
+            bgIteration_handleCommandReplication(dbid, cmd_multi, 1, &shared.multi);
+            sentMultiToBgIterator = true;
+        }
+        struct serverCommand* cmd = lookupCommandOrOriginal(argv, argc);
+        bgIteration_handleCommandReplication(dbid, cmd, argc, argv);
+        lastDbidSentToBgIterator = dbid;
+    }
+
     robj **argvcopy;
     int j;
 
@@ -3780,6 +3813,17 @@ void updateCommandLatencyHistogram(struct hdr_histogram **latency_histogram, int
  * multiple separated commands. Note that alsoPropagate() is not affected
  * by CLIENT_PREVENT_PROP flag. */
 static void propagatePendingCommands(void) {
+    // Note: This is done before the check on server.also_propagate.numops.  Numops might be zero
+    //       if there is no replica but we might be running bgIteration for something other than
+    //       replication.  If we sent the multi (to bgIteration), we need to send the matching exec.
+    if (sentMultiToBgIterator) {
+        // If a MULTI was sent to bgIterator via alsoPropagate(), then send the matching EXEC.
+        static struct serverCommand* cmd_exec = NULL;    // STATIC to avoid repeated lookups
+        if (cmd_exec == NULL) cmd_exec = lookupCommandOrOriginal(&shared.exec, 1);
+        bgIteration_handleCommandReplication(lastDbidSentToBgIterator, cmd_exec, 1, &shared.exec);
+        sentMultiToBgIterator = false;
+    }
+
     if (server.also_propagate.numops == 0) return;
 
     int j;
@@ -3909,6 +3953,8 @@ int incrCommandStatsOnError(struct serverCommand *cmd, int flags) {
  *
  */
 void call(client *c, int flags) {
+    if (bgIteration_blockClientIfRequired(c)) return;
+
     long long dirty;
     struct ClientFlags client_old_flags = c->flag;
 
diff --git a/src/server.h b/src/server.h
index 51db9a38baa..c68dd524592 100644
--- a/src/server.h
+++ b/src/server.h
@@ -103,7 +103,19 @@ static_assert(sizeof(off_t) >= 8, "off_t must be 64-bit; ensure _FILE_OFFSET_BIT
 #define dismissMemory zmadvise_dontneed
 
 #define VALKEYMODULE_CORE 1
-typedef struct serverObject robj;
+
+/* serverObject (aka robj) is currently overloaded for 2 purposes.  This is a legacy artifact.
+ *   1. It carries a reference counted STRING (a keyless value) during parsing and command execution.
+ *   2. It's also used to carry a key/value pair which is inserted into the DB.  In this form, the
+ *      value is not limited to being a string.
+ *
+ * The typedef "dbEntry" is used to explicitly connote the latter form.  It indicates a key/value
+ * pair which is suitable to exist in the DB.  It might be active in the DB, or may be unlinked from
+ * the DB (but still contains a key/value).  The value may be any of the Valkey data types/encodings.
+ */
+typedef struct serverObject robj;       // A keyless string OR a key/value pair
+typedef struct serverObject dbEntry;    // Explicitly a key/value pair
+
 #include "valkeymodule.h" /* Modules API defines. */
 
 /* Following includes allow test functions to be called from main() */
diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp
new file mode 100644
index 00000000000..7499e53ca52
--- /dev/null
+++ b/src/unit/test_bgiteration.cpp
@@ -0,0 +1,3747 @@
+//#include <algorithm>
+#include "generated_wrappers.hpp"
+#include <vector>
+//#include "amz_assert.h"
+
+//                                                                                                        
+//                                                                                                        
+//                                                ##                                                      
+//     ######:                                    ##                                                      
+//     #######:                                   ##                                                      
+//     ##   :##                                                                                           
+//     ##    ##   ##.####   .####:   ##:  :##   ####      .####:  ##      ##                              
+//     ##   :##   #######  .######:   ##  ##    ####     .######: ##.    .##                              
+//     #######:   ###.     ##:  :##  :##  ##:     ##     ##:  :##  #: ## :#                               
+//     ######:    ##       ########   ##..##      ##     ######## :#:.##.:#:                              
+//     ##         ##       ########   ##::##      ##     ########  # :##:##                               
+//     ##         ##       ##         :####:      ##     ##        ## ## ##                               
+//     ##         ##       ###.  :#    ####       ##     ###.  :#  ###::##                                
+//     ##         ##       .#######    ####    ########  .#######  :##..##:                               
+//     ##         ##        .#####:    :##:    ########   .#####:  .##  ##                                
+//                                                                                                        
+//                                                                                                        
+//                                                                                                        
+//                                                                                                        
+//                                                                                                        
+//                                                                                                        
+//                                                                                                        
+//                .####.             ####                                                                 
+//                ######             ####                                                                 
+//               :##  ##:              ##                                                                 
+//               ##:  :##  ##.####     ##      ##    ##                                                   
+//               ##    ##  #######     ##      :##  ##                                                    
+//               ##    ##  ###  :##    ##       ##: ##.                                                   
+//               ##    ##  ##    ##    ##       ###:##                                                    
+//               ##    ##  ##    ##    ##       .## #                                                     
+//               ##:  :##  ##    ##    ##        ####.                                                    
+//               :##  ##:  ##    ##    ##:       :###                                                     
+//                ######   ##    ##    #####      ##                                                      
+//                .####.   ##    ##    .####      ##.                                                     
+//                                               :##                                                      
+//                                              ###:                                                      
+//                                              ###                                                       
+//                                                                                                        
+//                                                                                                        
+//                                                                                                        
+//                                                                                                        
+//     ###   ##                                                                    ##                     
+//     ###   ##              ##                                                    ##                     
+//     ###:  ##              ##                                                    ##                     
+//     ####  ##   .####.   #######              ##.####   .####:    :####     :###.##  ##    ##           
+//     ##:#: ##  .######.  #######              #######  .######:   ######   :#######  :##  ##            
+//     ## ## ##  ###  ###    ##                 ###.     ##:  :##   #:  :##  ###  ###   ##: ##.           
+//     ## ## ##  ##.  .##    ##                 ##       ########    :#####  ##.  .##   ###:##            
+//     ## :#:##  ##    ##    ##                 ##       ########  .#######  ##    ##   .## #             
+//     ##  ####  ##.  .##    ##                 ##       ##        ## .  ##  ##.  .##    ####.            
+//     ##  :###  ###  ###    ##.                ##       ###.  :#  ##:  ###  ###  ###    :###             
+//     ##   ###  .######.    #####              ##       .#######  ########  :#######     ##              
+//     ##   ###   .####.     .####              ##        .#####:    ###.##   :###.##     ##.             
+//                                                                                       :##              
+//                                                                                      ###:              
+//                                                                                      ###               
+//                                                                                                        
+//                                                                                                        
+//                                                                                                        
+//                                                                              ##                        
+//       :####                                                                  ##                        
+//       #####                                                                  ##                        
+//       ##                                                                                               
+//     #######    .####.    ##.####             ##.####   .####:   ##:  :##   ####      .####:  ##      ##
+//     #######   .######.   #######             #######  .######:   ##  ##    ####     .######: ##.    .##
+//       ##      ###  ###   ###.                ###.     ##:  :##  :##  ##:     ##     ##:  :##  #: ## :# 
+//       ##      ##.  .##   ##                  ##       ########   ##..##      ##     ######## :#:.##.:#:
+//       ##      ##    ##   ##                  ##       ########   ##::##      ##     ########  # :##:## 
+//       ##      ##.  .##   ##                  ##       ##         :####:      ##     ##        ## ## ## 
+//       ##      ###  ###   ##                  ##       ###.  :#    ####       ##     ###.  :#  ###::##  
+//       ##      .######.   ##                  ##       .#######    ####    ########  .#######  :##..##: 
+//       ##       .####.    ##                  ##        .#####:    :##:    ########   .#####:  .##  ##  
+//                                                                                                        
+//                                                                                                        
+//                                                                                                        
+//                                                                                                        
+
+
+
+using namespace ::testing;
+
+extern "C" {
+    #include "stdlib.h"
+    #include "bgiteration.h"
+    #include "server.h"
+    //#include "serverassert.h"
+    #define using usingvar // compile hack
+    #include "module.h"
+    #undef using
+    extern hashtableType commandSetType;
+    extern dictType keylistDictType;
+    bool iteratorRepldoneFn(void *privdata);
+    void iteratorCleanupFn(bool terminated, void *privdata);
+    void bgIteration_feedIterators(void);
+    void createSharedObjects(void);
+    void hashtableDump(hashtable *ht);
+    void rehashStepExpand(hashtable *ht); // in hashtable.c (non-API)
+    void bgIteration_unitTestDisableCloning(void);
+    void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes);
+}
+
+
+// The private data is a pointer to arbitrary data.  This value is used just to
+//  test that the correct value is passed through.
+#define PRIVDATA reinterpret_cast<void*>(12345)
+
+// A bgIteration cleanup function used for testing.
+int cleanupCount;
+bool cleanupTerminated;
+void iteratorCleanupFn(bool terminated, void *privdata) {
+    EXPECT_EQ(privdata, PRIVDATA);
+    cleanupCount++;
+    cleanupTerminated = terminated;
+}
+
+
+// A bgIteration repldone function used for testing.
+int repldoneCount;
+bool iteratorRepldoneFn(void *privdata) {
+    EXPECT_EQ(privdata, PRIVDATA);
+    repldoneCount++;
+    return true;
+}
+
+
+// A more complicated repldone function that can delay the replication done condition.
+bool isReplDoneReady;
+bool iteratorRepldoneFnNotBeingReadyInitially(void *privdata) {
+    EXPECT_EQ(privdata, PRIVDATA);
+    // This is to test the behavior when Repl Done function is not ready to be executed.
+    if (!isReplDoneReady) {
+        isReplDoneReady = true;
+        return false;
+    }
+    repldoneCount++;
+    return true;
+}
+
+
+static const char *logfile = "";
+
+/* Most of the bgIteration unit tests are based on a CMD instance with 2 DBs.  There are 8 keys in
+ * each DB.  The keys are named A0, B0, C0, D0, E0, F0, G0, H0 for DB-0 and A1, B1, C1, D1, E1, F1,
+ * G1, H1 for DB-1.  There are a number of helper functions to simulate certain key modification
+ * actions within our test configuration.  Note that this is isolated from the actual call to
+ * processCommand.
+ * 
+ * Because most of bgIteration is based on an ordered processing of keys, it doesn't matter if we
+ * are simulating CMD or CME, full scan, or slot-based.  The majority of tests are independent of
+ * these concerns.
+ * 
+ * However, there are some tests which are unique to these configurations and use a specialized
+ * derived class to handle the differences.  We do not want to duplicate all of the tests for
+ * the different configurations, but we do want to ensure that each configuration works properly.
+ *   - bgIterationTestCluster - handles tests unique to full scan in cluster mode
+ *   - bgIterationTestClusterSlots - handles tests unique to cluster slot-based iteration
+ */
+class BgIterationTest : public ::testing::Test {
+    private:
+        static const int DB_COUNT = 2;
+        static const int ITEMS_PER_DB = 8;
+
+        // This is the expected order of the keys when hashed
+        const char *keys[DB_COUNT][ITEMS_PER_DB] = {{"D0", "G0", "H0", "C0", "F0", "A0", "B0", "E0"},
+                                                    {"B1", "C1", "F1", "G1", "E1", "D1", "A1", "H1"}};
+
+    protected:
+        static const int TOTAL_ITEMS = DB_COUNT * ITEMS_PER_DB;
+        static const int LAST_ITEM = TOTAL_ITEMS - 1;
+
+        MockValkey mock;
+        RealValkey real;
+
+        struct serverCommand dummy_cmd = {0};
+
+        // Helper functions for accessing the keys.  We can access by db(0..1) and seq(0..7)
+        //  or by item number (0..15).
+        // NOTE: These virtual functions can be overridden in subclasses which may have different item layout.
+        virtual const char * getKeyAtDbSeq(int db, int seq) {
+            assert(db < DB_COUNT);
+            assert(seq < ITEMS_PER_DB);
+            return keys[db][seq];
+        }
+
+        virtual int getDbFromItemNum(int itemNum) {
+            assert(itemNum < DB_COUNT * ITEMS_PER_DB);
+            return itemNum / ITEMS_PER_DB;
+        }
+
+        virtual int getSeqFromItemNum(int itemNum) {
+            assert(itemNum < DB_COUNT * ITEMS_PER_DB);
+            return itemNum % ITEMS_PER_DB;
+        }
+
+        const char * keyStr(int itemNum) {
+            return getKeyAtDbSeq(getDbFromItemNum(itemNum), getSeqFromItemNum(itemNum));
+        }
+
+        int itemNumFromKey(const char * key) {
+            for (int itemNum = 0;  itemNum < DB_COUNT * ITEMS_PER_DB;  itemNum++) {
+                if (strcmp(key, keyStr(itemNum)) == 0) return itemNum;
+            }
+            return -1;
+        }
+
+
+        // Do some general initialization before starting the suite.  Normally, the tests are run in
+        //  isolation - and this isn't much different than SetUp().  But if running the
+        //  entire test suite together (just manually running the test executable), this gets called
+        //  only once.
+        static void SetUpTestSuite() {
+            monotonicInit();
+            
+            bzero(&server, sizeof(server));
+            server.hz = 100;
+            server.logfile = const_cast<char*>(logfile);
+            createSharedObjects();
+
+            moduleInitModulesSystem();
+
+            server.commands = hashtableCreate(&commandSetType);
+            server.orig_commands = hashtableCreate(&commandSetType);
+            populateCommandTable();
+        }
+
+
+        static void TearDownTestSuite() {
+            hashtableRelease(server.commands);
+            hashtableRelease(server.orig_commands);
+        }
+
+
+        void initializeServerDb(int dbid, int slot_count_bits = 0) {
+            server.db[dbid] = static_cast<serverDb *>(zcalloc(sizeof(serverDb)));
+            server.db[dbid]->id = dbid;
+            server.db[dbid]->keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, 0);
+            server.db[dbid]->expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, 0);
+            server.db[dbid]->watched_keys = dictCreate(&keylistDictType);
+            kvstoreExpand(server.db[dbid]->keys, 8, 0, NULL);
+        }
+
+
+        void addKeyAndValObjsToDb(int dbid, sds key, sds val) {
+            robj *key_obj = createStringObjectFromSds(key);
+            robj *val_obj = createStringObjectFromSds(val);
+            dbAdd(server.db[dbid], key_obj, &val_obj);
+            decrRefCount(key_obj);
+        }
+
+
+        void addKeyToDb(int dbid, const char *key, const char *val) {
+            addKeyAndValObjsToDb(dbid, sdsnew(key), sdsnew(val));
+        }
+
+
+        virtual void setupDatabase() {
+            // For these unit tests, a standard database is constructed.  The order of items in the
+            //  hash table is important, and this is validated here.  If the hash table
+            //  implementation changes, we will find out quickly at this point.  All other tests
+            //  will become invalid!
+
+            server.dbnum = 2;
+            server.cluster_enabled = false;
+            server.db = static_cast<serverDb **>(zcalloc(sizeof(serverDb *) * server.dbnum));
+
+            for (int dbid = 0;  dbid < server.dbnum;  dbid++) {
+                initializeServerDb(dbid);
+            }
+
+            // With hashtable, it can be difficult to get our keys spread across different buckets.
+            //  Here we play with hashtable size and rehashing to get comfortable scenarios for testing.
+            // NOTE: If the hashtable bucketization changes, we'll need to evaluate the tests for
+            //       changes.  Since bgIteration processes a bucket at a time, we need to evaluate
+            //       all the tests when bucketization changes.
+            // As an alternative, we could mock all of the hashtable activity, but it's better if we
+            //  can use the real functionality as much as possible.
+
+            kvstoreExpand(server.db[0]->keys, 16, 0, NULL);
+            addKeyToDb(0, "A0", "A0");
+            addKeyToDb(0, "B0", "B0");
+            addKeyToDb(0, "C0", "C0");
+            addKeyToDb(0, "D0", "D0");
+            addKeyToDb(0, "E0", "E0");
+            addKeyToDb(0, "F0", "F0");
+            addKeyToDb(0, "G0", "G0");
+            addKeyToDb(0, "H0", "H0");
+            hashtable *ht = kvstoreGetHashtable(server.db[1]->keys, 0);
+            hashtablePauseRehashing(ht);
+
+            kvstoreExpand(server.db[1]->keys, 16, 0, NULL);
+            addKeyToDb(1, "A1", "A1");
+            addKeyToDb(1, "B1", "B1");
+            addKeyToDb(1, "C1", "C1");
+            addKeyToDb(1, "D1", "D1");
+            addKeyToDb(1, "E1", "E1");
+            addKeyToDb(1, "F1", "F1");
+            addKeyToDb(1, "G1", "G1");
+            addKeyToDb(1, "H1", "H1");
+            // Now, let's increase the size and start a rehash on the 2nd DB.  This ensures that
+            //  iteration is working even if a hashtable is in the middle of rehashing.  We choose
+            //  a 128 size so that rehashed keys all get unique buckets.
+            kvstoreExpand(server.db[1]->keys, 128, 0, NULL);
+            ht = kvstoreGetHashtable(server.db[1]->keys, 0);
+            rehashStepExpand(ht); // in hashtable.c (non-API)
+            rehashStepExpand(ht); // and rehash the 2nd bucket also
+            hashtablePauseRehashing(ht);
+
+            // The bucketization should look like this.  Remember that DB-1 is in
+            //  the middle of a rehash, so it has 2 tables.
+            //
+            // DB: 0  SLOT: 0
+            // Table 0, used 8, exp 2, top-level buckets 4, child buckets 0
+            //   Bucket 0:1 level:0
+            //     0 h2 63, key "D0"
+            //     1 h2 a5, key "G0"
+            //     2 h2 ca, key "H0"
+            //   Bucket 0:2 level:0
+            //     0 h2 91, key "C0"
+            //     1 h2 88, key "F0"
+            //   Bucket 0:3 level:0
+            //     0 h2 b8, key "A0"
+            //     1 h2 f5, key "B0"
+            //     2 h2 13, key "E0"
+            // Table 1, used 0, exp -1, top-level buckets 0, child buckets 0
+            //
+            // DB: 1  SLOT: 0
+            // Table 0, used 3, exp 2, top-level buckets 4, child buckets 0
+            //   Bucket 0:0 level:0   <- rehashed into table 1
+            //   Bucket 0:1 level:0   <- rehashed into table 1
+            //   Bucket 0:2 level:0
+            //     0 h2 18, key "B1"
+            //     1 h2 fd, key "C1"
+            //   Bucket 0:3 level:0
+            //     0 h2 6f, key "F1"
+            // Table 1, used 5, exp 5, top-level buckets 32, child buckets 0
+            //   Bucket 1:1 level:0
+            //     0 h2 ad, key "G1"
+            //   Bucket 1:5 level:0
+            //     0 h2 0c, key "E1"
+            //   Bucket 1:12 level:0
+            //     0 h2 e9, key "D1"
+            //   Bucket 1:17 level:0
+            //     0 h2 36, key "A1"
+            //   Bucket 1:29 level:0
+            //     0 h2 9e, key "H1"
+            //   Bucket 1:30 level:0
+
+
+            // In case we need to debug...
+            // Used to generate comment above, showing bucketization.
+            if (0) debugPrintBucketInfo();
+
+            // Validate that the iteration order matches the expected order
+            for (int db = 0;  db < server.dbnum;  db++) {
+                ht = kvstoreGetHashtable(server.db[db]->keys, 0);
+                hashtableIterator *it = hashtableCreateIterator(ht, 0);
+                robj *next;
+                int i = 0;
+                while (hashtableNext(it, reinterpret_cast<void**>(&next))) {
+                    ASSERT_THAT(next, robjEqualsStr(getKeyAtDbSeq(db, i++)));
+                }
+                hashtableReleaseIterator(it);
+            }
+        }
+
+
+        void SetUp() override {
+            server.main_thread_id = pthread_self();
+            server.forkless_options_supported = 1;
+            objectSetMetadataSize(sizeof(bgIterationEntryMetadata));
+
+            bgIteration_unitTestDisableCloning();
+
+            setupDatabase();
+
+            EXPECT_CALL(mock, aeCreateTimeEvent(_,_,_,_,_)).WillRepeatedly(Return(0));
+            bgIteration_init();
+
+            cleanupCount = 0;
+            repldoneCount = 0;
+            isReplDoneReady = false;
+
+            // By default, in tests, we treat items as not having an expiration
+            //JHB EXPECT_CALL(mock, getExpire(_,_)).WillRepeatedly(Return(-1));
+
+            // By default, do nothing for these
+            EXPECT_CALL(mock, blockClientInUseOnKeys(_,_,_)).WillRepeatedly(Return());
+            EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).WillRepeatedly(Return());
+
+            // By default, expect no permission issues
+            EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_,_,_,_,_,_)).WillRepeatedly(Return(ACL_OK));
+
+            //JHB EXPECT_CALL(mock, lookupCommandOrOriginal(_)).WillRepeatedly(Return(&dummy_cmd));
+        }
+
+
+        void TearDown() override {
+            bgIteration_feedIterators();    // process returning stuff before deleting DB
+            bgIteration_feedIterators();    // in case an iterator was closed there might be more
+            for (int i = 0;  i < server.dbnum;  i++) {
+                if (server.db[i]->keys) kvstoreRelease(server.db[i]->keys);
+                if (server.db[i]->expires) kvstoreRelease(server.db[i]->expires);
+                dictRelease(server.db[i]->watched_keys);
+                zfree(server.db[i]);
+            }
+            zfree(server.db);
+        }
+
+
+        // void update_keys(const char **new_keys, int db, int len) {
+        //     memcpy(keys[db], new_keys, len * sizeof(const char *));
+        // }
+
+
+
+
+
+
+        // Deletes an item from the DB (often at the start of a test) - but does NOT notify
+        //  bgIteration.  bgIteration_keyDelete() should be explicitly called where needed.
+        void simpleDelItem(int itemNum) {
+            int db = getDbFromItemNum(itemNum);
+
+            sds delKey = sdsnew(keyStr(itemNum));
+            int rc = kvstoreHashtableDelete(server.db[db]->keys, 0, delKey);
+            ASSERT_EQ(rc, 1);
+            sdsfree(delKey);
+        }
+
+
+        // Find the actual dbEntry object by itemNum
+        dbEntry * getItem(int itemNum) {
+            int db = getDbFromItemNum(itemNum);
+            sds key = sdsnew(keyStr(itemNum));
+            dbEntry *de = dbFind(server.db[db], key);
+            sdsfree(key);
+            return de;
+        }
+
+
+        // The test expects that the next item read will be BGITERATOR_ITEM_COMPLETE
+        void expectReadComplete(bgIterator *iter) {
+            bgIteration_feedIterators();
+            bgIteratorItem *item = bgIteratorRead(iter);
+            EXPECT_EQ(item->type, BGITERATOR_ITEM_COMPLETE);
+            bgIteratorClose(iter);
+
+            int oldCleanupCount = cleanupCount;
+            bgIteration_feedIterators();
+            EXPECT_EQ(cleanupCount, oldCleanupCount + 1);
+        }
+
+
+        // The test is cleaning up and isn't validating the remaining cleanup
+        void expectAnythingCleanup(bgIterator *iter) {
+            while (true) {
+                bgIteration_feedIterators();
+                bgIteratorItem *item = bgIteratorRead(iter);
+                if ((item->type == BGITERATOR_ITEM_COMPLETE
+                        || item->type == BGITERATOR_ITEM_TERMINATED)) {
+                    bgIteratorClose(iter);
+                    break;
+                }
+            }
+            bgIteration_feedIterators();    // Recognize the closed iterator
+            EXPECT_EQ(cleanupCount, 1);
+        }
+
+
+        void expectDictEntryMetadataMatch(dbEntry *de1, dbEntry *de2) {
+            bgIterationEntryMetadata *dm1 = static_cast<bgIterationEntryMetadata *>(objectGetMetadata(de1));
+            bgIterationEntryMetadata *dm2 = static_cast<bgIterationEntryMetadata *>(objectGetMetadata(de2));
+
+            EXPECT_NE(dm1, nullptr);
+            EXPECT_NE(dm2, nullptr);
+            EXPECT_EQ(dm1->iterator_epoch, dm2->iterator_epoch);
+        }
+
+
+        // Useful when debugging new tests.  It reads/prints all remaining items then crashes.
+        void cleanupIteratorDebugPrint(bgIterator *iter) {
+            bool done = false;
+            printf("[DEBUG] Printing bgIterator '%s' items:\n", bgIteratorName(iter));
+            while (!done) {
+                bgIteration_feedIterators();
+                bgIteratorItem *item = bgIteratorRead(iter);
+                switch (item->type) {
+                    case BGITERATOR_ITEM_DBENTRY:
+                        {
+                            auto obj = item->u.dbe.de;
+                            const char * keyStr = objectGetKey(obj);
+                            printf("Entry: %s -> %s [itemNum: %i]\n",
+                                   keyStr,
+                                   static_cast<char *>(objectGetVal(obj)),
+                                   itemNumFromKey(keyStr));
+                            break;
+                    }
+                    case BGITERATOR_ITEM_REPLICATION:
+                        printf("Repl: DB=%d : ", item->dbid);
+                        for (int i = 0;  i < item->u.repl.argc;  i++)
+                            printf("%s ", static_cast<char*>(objectGetVal(item->u.repl.argv[i])));
+                        printf("\n");
+                        break;
+                    case BGITERATOR_ITEM_COMPLETE:
+                    case BGITERATOR_ITEM_TERMINATED:
+                        bgIteratorClose(iter);
+                        done = true;
+                        break;
+                    default:
+                        printf("unhandled: %d\n", item->type);
+                }
+            }
+            bgIteration_feedIterators();    // Recognize the closed iterator
+            ASSERT_TRUE(false); // Halt the test here
+        }
+
+
+        // Make a copy of the metadata
+        void * cloneMetadata(dbEntry *de) {
+            int size = objectGetMetadataSize(de);
+            void *metadata = zmalloc(size);
+            memcpy(metadata, objectGetMetadata(de), size);
+            return metadata;
+        }
+
+
+        // Compare a previous metadata copy to an existing entry
+        void compareAndFreeClonedMetadata(dbEntry *de, void *metadata) {
+            EXPECT_EQ(memcmp(objectGetMetadata(de), metadata, objectGetMetadataSize(de)), 0);
+            zfree(metadata);
+        }
+
+
+        // The test expects the next item will be a specific key
+        //  The item value is verified against the default unless provided as a parameter.
+        void expectReadKey(bgIterator *iter, int itemNum, const char *value=nullptr) {
+            int db = getDbFromItemNum(itemNum);
+
+            bgIteration_feedIterators();
+            bgIteratorItem *item = bgIteratorRead(iter);
+            bgIteration_feedIterators();
+
+            ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY);
+            EXPECT_EQ(item->dbid, db);
+            EXPECT_FALSE(item->u.dbe.is_cloned);
+            // if (item->u.dbe.is_cloned) {  // JHB - wrong place to check this.
+            //     // If the entry is cloned, make sure we copied the metadata
+            //     dbEntry *cloned_dbEntry = item->u.dbe.de;
+            //     dbEntry *original_dbEntry = getItem(itemNum);
+            //     expectDictEntryMetadataMatch(original_dbEntry, cloned_dbEntry);
+            // }
+            EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum));
+            if (value) {
+                EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value));
+            } else {
+                EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum)));
+            }
+        }
+
+
+        // The test expects the next item will be a specific key and that the item is cloned.
+        //  Metadata is tested (to make sure the clone includes the proper metadata).
+        //  The item value is verified against the default unless provided as a parameter.
+        void expectReadClonedKey(bgIterator *iter, int itemNum, void *metadata, const char *value=nullptr) {
+            int db = getDbFromItemNum(itemNum);
+
+            bgIteration_feedIterators();
+            bgIteratorItem *item = bgIteratorRead(iter);
+            bgIteration_feedIterators();
+
+            ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY);
+            EXPECT_EQ(item->dbid, db);
+            EXPECT_TRUE(item->u.dbe.is_cloned);
+            compareAndFreeClonedMetadata(item->u.dbe.de, metadata);
+            EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum));
+            if (value) {
+                EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value));
+            } else {
+                EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum)));
+            }
+        }
+
+
+        // Test expects the next key, but specified by key name, not itemNum.
+        void expectReadDbKeyValue(bgIterator *iter, int db, const char *key, const char *value) {
+            bgIteration_feedIterators();
+            bgIteratorItem *item = bgIteratorRead(iter);
+            bgIteration_feedIterators();
+
+            ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY);
+            EXPECT_EQ(item->dbid, db);
+            EXPECT_STREQ(objectGetKey(item->u.dbe.de), key);
+            EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value));
+        }
+
+
+        // Test expect to read a sequence of key items
+        void expectReadKeySequence(bgIterator *iter, int startItem, int endItem) {
+            for (int i = startItem;  i <= endItem;  i++) expectReadKey(iter, i);
+        }
+
+
+        // Just like expectReadKey, but also tests that a previous item is becoming unblocked.
+        void expectReadKeyWithUnblock(bgIterator *iter, int itemNum, int unblockItem, const char *value=nullptr) {
+            bool blocked = true;
+            EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(unblockItem))))
+                    .WillOnce(Assign(&blocked, false));
+            expectReadKey(iter, itemNum, value);
+            EXPECT_FALSE(blocked);
+        }
+
+
+        // Test expects to read a replication item matching the command held by client 'c'
+        void expectReadReplication(bgIterator *iter, client *c) {
+            bgIteration_feedIterators();
+            bgIteratorItem *item = bgIteratorRead(iter);
+            bgIteration_feedIterators();
+
+            ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION);
+            EXPECT_EQ(item->dbid, c->db->id);
+            EXPECT_EQ(item->u.repl.cmd, c->cmd);
+            EXPECT_EQ(item->u.repl.argc, c->argc);
+            for (int i = 0;  i < c->argc;  i++) {
+                EXPECT_STREQ(static_cast<char*>(objectGetVal(item->u.repl.argv[i])),
+                             static_cast<char*>(objectGetVal(c->argv[i])));
+            }
+        }
+
+
+        // We expect to read a MULTI command which should have been inserted.
+        void expectReadMultiReplication(bgIterator *iter) {
+            bgIteration_feedIterators();
+            bgIteratorItem *item = bgIteratorRead(iter);
+            bgIteration_feedIterators();
+
+            ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION);
+            EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("multi"));
+        }
+
+
+        // Expects the next item read to be a replication item for the EXEC command (which
+        //  should have been inserted).
+        void expectReadExecReplication(bgIterator *iter) {
+            bgIteration_feedIterators();
+            bgIteratorItem *execItem = bgIteratorRead(iter);
+            bgIteration_feedIterators();
+
+            ASSERT_EQ(execItem->type, BGITERATOR_ITEM_REPLICATION);
+
+            struct serverCommand *execCmd = lookupCommandByCString("exec");
+            EXPECT_EQ(execItem->u.repl.cmd, execCmd);
+        }
+
+
+        // Expects the next item read to be a replicated "DEL <key>" for the given item, tagged
+        //  with the DB that the item belongs to.
+        void expectReadReplicationDel(bgIterator *iter, int itemNum) {
+            const int expectedDb = getDbFromItemNum(itemNum);
+
+            bgIteration_feedIterators();
+            bgIteratorItem *delItem = bgIteratorRead(iter);
+            bgIteration_feedIterators();
+
+            ASSERT_EQ(delItem->type, BGITERATOR_ITEM_REPLICATION);
+            EXPECT_EQ(delItem->dbid, expectedDb);
+
+            // Command identity and argument count
+            struct serverCommand *delCmd = lookupCommandByCString("DEL");
+            EXPECT_EQ(delItem->u.repl.cmd, delCmd);
+            EXPECT_EQ(delItem->u.repl.argc, 2);
+
+            // argv[0] is the command name, argv[1] is the key
+            auto replArgv = delItem->u.repl.argv;
+            EXPECT_THAT(replArgv[0], robjEqualsStr("DEL"));
+            EXPECT_THAT(replArgv[1], robjEqualsStr(keyStr(itemNum)));
+        }
+
+
+        // Expects the next item read to be the special SWAPDB item naming db1 and db2 (in
+        //  that order).
+        void expectReadSwapDB(bgIterator *iter, int db1, int db2) {
+            bgIteration_feedIterators();
+            bgIteratorItem *swapItem = bgIteratorRead(iter);
+            bgIteration_feedIterators();
+
+            ASSERT_EQ(swapItem->type, BGITERATOR_ITEM_SWAPDB);
+
+            // First DB travels in the common dbid field, the second in the union payload.
+            EXPECT_EQ(swapItem->dbid, db1);
+            EXPECT_EQ(swapItem->u.dbid2, db2);
+        }
+
+
+        // Debug aid for examining the physical bucket layout of the hash tables.  It was used to
+        //  generate the comment above which shows each item in each bucket, and is needed again
+        //  whenever the hash table layout changes.
+        //  num_slots - number of slots to dump per DB; -1 dumps every hashtable of the kvstore.
+        void debugPrintBucketInfo(int num_slots = -1) {
+            for (int dbIdx = 0;  dbIdx < server.dbnum;  dbIdx++) {
+                int slotCount = num_slots;
+                if (slotCount == -1) slotCount = kvstoreNumHashtables(server.db[dbIdx]->keys);
+
+                int slot = 0;
+                while (slot < slotCount) {
+                    hashtable *table = kvstoreGetHashtable(server.db[dbIdx]->keys, slot);
+                    printf("DB: %d  SLOT: %d\n", dbIdx, slot);
+                    hashtableDump(table);
+                    slot++;
+                }
+            }
+        }
+
+
+        // Builds a heap-allocated test client holding the write command
+        //      SET <key of itemNum> <value>
+        //  bound to the DB that itemNum maps to.  Tests release it with freeTestClient().
+        client * getWriteClient(int itemNum, const char *value) {
+            const int dbid = getDbFromItemNum(itemNum);
+
+            client *writer = static_cast<client*>(zcalloc(sizeof(client)));
+            writer->cmd = lookupCommandByCString("set");
+            writer->db = server.db[dbid];
+
+            // argv = { <command name>, <key>, <value> }
+            sds parts[] = {
+                sdsnew(writer->cmd->fullname),
+                sdsnew(keyStr(itemNum)),
+                sdsnew(value),
+            };
+
+            writer->argc = 3;
+            writer->argv = static_cast<robj**>(zcalloc(sizeof(robj*) * writer->argc));
+            for (int i = 0;  i < writer->argc;  i++) {
+                writer->argv[i] = createStringObjectFromSds(parts[i]);
+            }
+
+            return writer;
+        }
+
+
+        // Builds a test client for a write command that touches multiple keys, laid out as
+        //      <cmdName> <dst key> <src key> [<src key> ...]
+        //  At least one source item is required, the command must exist, and every source item
+        //  must map to the same DB as the destination (all enforced with assert).
+        client * getWriteMultiKeysClient(
+                const char * cmdName,
+                int dstItemNum,
+                const std::vector<int> & srcItemsNum) {
+
+            assert(!srcItemsNum.empty());
+
+            const size_t numSrc = srcItemsNum.size();
+            const int db = getDbFromItemNum(dstItemNum);
+            for (size_t s = 0;  s < numSrc;  s++) {
+                assert(db == getDbFromItemNum(srcItemsNum[s]));
+            }
+
+            client *c = static_cast<client*>(zcalloc(sizeof(client)));
+
+            c->cmd = lookupCommandByCString(cmdName);
+            assert(c->cmd != nullptr);
+            c->db = server.db[db];
+
+            // Command name + destination key + one entry per source key
+            c->argc = 2 + numSrc;
+            c->argv = static_cast<robj**>(zcalloc(sizeof(robj*) * c->argc));
+
+            robj **slot = c->argv;
+            *slot++ = createStringObjectFromSds(sdsnew(c->cmd->fullname));
+            *slot++ = createStringObjectFromSds(sdsnew(keyStr(dstItemNum)));
+            for (size_t s = 0;  s < numSrc;  s++) {
+                *slot++ = createStringObjectFromSds(sdsnew(keyStr(srcItemsNum[s])));
+            }
+
+            return c;
+        }
+
+
+        // Convenience wrapper around getWriteMultiKeysClient(): one destination key and a
+        //  single source key.
+        client * getWrite2KeysClient(const char * cmdName, int dstItemNum, int srcItemNum) {
+            const std::vector<int> sources(1, srcItemNum);
+            return getWriteMultiKeysClient(cmdName, dstItemNum, sources);
+        }
+
+
+        // Convenience wrapper around getWriteMultiKeysClient(): one destination key and two
+        //  source keys (passed on in the order given).
+        client * getWrite3KeysClient(
+                const char * cmdName, int dstItemNum, int src1ItemNum, int src2ItemNum) {
+            const std::vector<int> sources = {src1ItemNum, src2ItemNum};
+            return getWriteMultiKeysClient(cmdName, dstItemNum, sources);
+        }
+
+
+        // Create a client with a MULTI/EXEC block.
+        //  This parses a series of commands separated by ';'.  Within a command, tokens are
+        //  separated by spaces (there is no quoting support).
+        //  Example: getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx")
+        //
+        //  commands - the command list; it is copied, so the caller's string is not modified
+        //  dbid     - DB the client is bound to
+        //
+        //  Each command may have at most maxTokens tokens (command + 4 args).  A longer command
+        //  trips an assert instead of writing past the end of the argv array.
+        //
+        //  The returned client holds the EXEC command itself; each parsed command was handed to
+        //  queueMultiCommand().  Tests release the client with freeTestClient().
+        client * getMultiClient(const char *commands, int dbid = 0) {
+            const int maxTokens = 5;                 // command + 4 args
+            char *commandsCopy = zstrdup(commands);  // a mutable copy
+            char *commandStr, *commandStrSave;
+            char *token, *tokenSave;
+
+            client *c = static_cast<client*>(zcalloc(sizeof(client)));
+            c->db = server.db[dbid];
+            initClientMultiState(c);
+            c->flag.multi = 1;
+            c->mstate->cmd_flags |= CMD_WRITE;
+
+            // Outer tokenizer walks the ';'-separated commands, inner one the words of a command.
+            //  Two independent strtok_r save pointers keep the scans from disturbing each other.
+            commandStr = strtok_r(commandsCopy, ";", &commandStrSave);
+            while (commandStr != NULL) {
+
+                token = strtok_r(commandStr, " ", &tokenSave);
+                c->cmd = lookupCommandByCString(token);
+
+                c->argv = static_cast<robj**>(zcalloc(sizeof(robj*) * maxTokens));
+
+                for (int i = 0;  token != NULL;  i++) {
+                    assert(i < maxTokens);   // argv only has room for maxTokens entries
+                    c->argv[i] = createStringObject(token, strlen(token));
+                    c->argc = i+1;
+                    token = strtok_r(NULL, " ", &tokenSave);
+                }
+
+                queueMultiCommand(c, 0);
+                // NOTE(review): presumably queueMultiCommand() has taken over argv by now, leaving
+                //               nothing for this call to free - confirm.
+                freeClientArgv(c);
+
+                commandStr = strtok_r(NULL, ";", &commandStrSave);
+            }
+
+            // Leave the client holding the EXEC command
+            c->cmd = lookupCommandByCString("exec");
+            c->argc = 1;
+            c->argv = static_cast<robj**>(zcalloc(sizeof(robj*) * c->argc));
+            c->argv[0] = createStringObjectFromSds(sdsnew("EXEC"));
+
+            zfree(commandsCopy);
+            return c;
+        }
+
+
+        // A MULTI client starts out holding the EXEC command (which examines the contents of the
+        //  multi/exec block).  This function re-points the client at queued command number
+        //  cmdNum, so that it appears to be executing that individual command.
+        //  Note that argv is aliased from the queued command, not copied.
+        void advanceMultiClientToCommand(client *c, int cmdNum) {
+            assert(cmdNum >= 0 && cmdNum < c->mstate->count);
+
+            const auto &queued = c->mstate->commands[cmdNum];
+            c->cmd = c->realcmd = queued.cmd;
+            c->argv = queued.argv;
+            c->argc = queued.argc;
+            c->argv_len = queued.argv_len;
+        }
+
+
+        // Builds a test client for a fictional command:
+        //      SETGET <write_key> <value> <read_key>
+        //  - writes a value to the first key (making this CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY)
+        //  - reads a second key
+        //  Both items must map to the same DB.  The serverCommand is allocated here, and
+        //  freeTestClient() frees it along with the client.
+        client * getSetGetClient(int itemNum1, const char *value1, int itemNum2) {
+            const int dbid = getDbFromItemNum(itemNum1);
+            assert(dbid == getDbFromItemNum(itemNum2));   // (this would be a testcase error)
+
+            client *c = static_cast<client*>(zcalloc(sizeof(client)));
+
+            // Describe the made-up command
+            struct serverCommand *fake
+                    = static_cast<struct serverCommand*>(zcalloc(sizeof(struct serverCommand)));
+            fake->fullname = const_cast<char*>("SETGET");
+            fake->flags = CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY;
+            fake->arity = 4;
+
+            // Key spec: search begins at argv[1], stepping 2 arguments at a time
+            auto &keySpec = fake->legacy_range_key_spec;
+            keySpec.begin_search_type = KSPEC_BS_INDEX;
+            keySpec.bs.index.pos = 1;    // firstkey
+            keySpec.fk.range.lastkey = -1;
+            keySpec.fk.range.keystep = 2;
+
+            c->db = server.db[dbid];
+            c->cmd = fake;
+
+            // argv = { "SETGET", <write key>, <value>, <read key> }
+            c->argc = 4;
+            c->argv = static_cast<robj**>(zcalloc(sizeof(robj*) * c->argc));
+            robj **slot = c->argv;
+            *slot++ = createStringObjectFromSds(sdsnew(fake->fullname));
+            *slot++ = createStringObjectFromSds(sdsnew(keyStr(itemNum1)));
+            *slot++ = createStringObjectFromSds(sdsnew(value1));
+            *slot++ = createStringObjectFromSds(sdsnew(keyStr(itemNum2)));
+
+            return c;
+        }
+
+
+        // Builds a test client (on DB 0) for a fictional command, NOKEYSWRITE, which is flagged
+        //  CMD_WRITE but specifies no keys.  The serverCommand is allocated here, and
+        //  freeTestClient() frees it along with the client.
+        client * getNoKeysWriteClient() {
+            client *c = static_cast<client*>(zcalloc(sizeof(client)));
+
+            // Describe the made-up command
+            struct serverCommand *fake
+                    = static_cast<struct serverCommand*>(zcalloc(sizeof(struct serverCommand)));
+            fake->fullname = const_cast<char*>("NOKEYSWRITE");
+            fake->flags = CMD_WRITE;
+            fake->arity = 1;
+            fake->legacy_range_key_spec.begin_search_type = KSPEC_BS_INVALID;    // No keys
+
+            c->db = server.db[0];
+            c->cmd = fake;
+
+            // The only argument is the command name itself
+            c->argc = 1;
+            c->argv = static_cast<robj**>(zcalloc(sizeof(robj*) * c->argc));
+            c->argv[0] = createStringObjectFromSds(sdsnew(fake->fullname));
+
+            return c;
+        }
+
+
+        // Drops one reference on each of the client's arguments, frees the argv array, and
+        //  leaves the client with an empty argument list (argv NULL, argc 0).
+        void freeClientArgv(client *c) {
+            robj **args = c->argv;
+            const int count = c->argc;
+
+            for (int idx = 0; idx < count; idx++) {
+                decrRefCount(args[idx]);
+            }
+            zfree(args);
+
+            c->argc = 0;
+            c->argv = NULL;
+        }
+
+
+        // Tests create some fake commands; this tells them apart from real ones.  A fake command
+        //  is dynamically allocated and can be freed, whereas real commands are static.
+        //  A command counts as real when a lookup by its declared_name finds a command.
+        bool isRealValkeyCommand(struct serverCommand *cmd) {
+            struct serverCommand *registered = lookupCommandByCString(cmd->declared_name);
+            return registered != nullptr;
+        }
+
+
+        // Releases a client built by one of the get*Client() helpers: its multi state, its
+        //  argv, its command (only when it is a test-allocated fake), and the client itself.
+        void freeTestClient(client *c) {
+            freeClientMultiState(c);
+            freeClientArgv(c);
+
+            // Real commands are static and must not be freed
+            const bool fakeCommand = !isRealValkeyCommand(c->cmd);
+            if (fakeCommand) {
+                zfree(c->cmd);
+            }
+
+            zfree(c);
+        }
+
+
+        // Simulate a write command that gets blocked: bgIteration must report the client as
+        //  blocked and call blockClientInUseOnKeys() exactly once, for the expected number of
+        //  keys.
+        void simulateBlockedWrite(client *c, int expectedNumberBlockedKeys = 1) {
+            EXPECT_CALL(mock, blockClientInUseOnKeys(c, expectedNumberBlockedKeys, _)).Times(1);
+            EXPECT_TRUE(bgIteration_blockClientIfRequired(c));
+        }
+
+
+        // Simulate a write command that isn't blocked: bgIteration must report the client as
+        //  not blocked and must never call blockClientInUseOnKeys() for it.
+        void simulateUnblockedWrite(client *c) {
+            EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0);
+            EXPECT_FALSE(bgIteration_blockClientIfRequired(c));
+        }
+
+
+        // Simulate a write command which is NOT blocked because the key can be cloned and
+        //  expedited.  This requires a scenario where the client would normally have to be
+        //  blocked so that bgIteration can process the item.
+        //  Verifies three things: the client isn't blocked, the iterator's clone counter went up
+        //  by exactly one, and the real DB entry is not marked inuse.
+        void simulateClonedWrite(bgIterator *it, client *c) {
+            bgIteratorStatus status;
+
+            // Clone counter before the write
+            bgIteratorGetStatus(it, &status);
+            const unsigned long clonesBefore = status.dbentry_clones_queued;
+
+            // Client should not get blocked
+            EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0);
+            EXPECT_FALSE(bgIteration_blockClientIfRequired(c));
+
+            // Ensure that cloning took place
+            bgIteratorGetStatus(it, &status);
+            EXPECT_EQ(status.dbentry_clones_queued, (clonesBefore + 1));
+
+            // Ensure that the real item isn't inuse (because we cloned it instead)
+            sds key = static_cast<sds>(objectGetVal(c->argv[1]));
+            dbEntry *entry = dbFind(c->db, key);
+            ASSERT_FALSE(bgIteration_isEntryInuse(entry));
+        }
+
+
+        // Simulates what happens when a write command (SET) actually executes.  This requires a
+        //  scenario where we would NOT be blocked on the write.  It actually alters the value of
+        //  the key and updates the metadata.
+        //  Steps: check that the client isn't blocked, store argv[2] under key argv[1] via
+        //  setKey(), verify the entry's iterator_epoch, then hand the command to bgIteration
+        //  for replication.
+        void simulateUnblockedWriteWithModification(client *c) {
+            EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0);
+            bool blocked = bgIteration_blockClientIfRequired(c);
+            EXPECT_FALSE(blocked);
+
+            // Fake execution of the command - touch the iterator_epoch counter and swap the value
+            // We need to duplicate the value because setKey() can reallocate it.
+            robj *value = dupStringObject(c->argv[2]);
+            setKey(c, c->db, c->argv[1], &value, SETKEY_ADD_OR_UPDATE);
+
+            // Let's make sure that setKey updated the iteration epoch (as it should have)
+            dbEntry *de = dbFind(c->db, static_cast<sds>(objectGetVal(c->argv[1])));
+            bgIterationEntryMetadata *md = static_cast<bgIterationEntryMetadata*>(objectGetMetadata(de));
+            EXPECT_EQ(md->iterator_epoch, bgIteration_getEpoch());
+
+            // Replication is reported only after the write has been applied
+            bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv);
+        }
+
+
+        // Simulate execution of a MULTI/EXEC transaction for a client `c` without blocking.
+        //  This requires a scenario where we don't expect the client to be blocked.  Sequence:
+        //   1. Run the block check for the EXEC command which the client currently holds.
+        //   2. With server.in_exec set, for each queued command: advance the client to it, run
+        //      the block check again, and hand the command to bgIteration for replication.
+        //      A MULTI is replicated just before the first queued command.
+        //   3. Replicate EXEC, then clear server.in_exec.
+        //  NOTE(review): the client is not restored to its EXEC argv afterwards.  If any command
+        //                was queued, c->argv/c->cmd still alias the last queued command from
+        //                c->mstate - confirm callers account for this before freeTestClient().
+        void simulateUnblockedMultiExec(client *c) {
+
+            // simulate EXEC command of the multi/exec client
+            simulateUnblockedWrite(c);
+            server.in_exec = 1;
+
+            // If there are other commands, call both blockClientIfRequired and handleCommandReplication for each of the command.
+            for (int i = 0;  i < c->mstate->count;  i++) {
+                advanceMultiClientToCommand(c, i);
+                simulateUnblockedWrite(c);
+
+                // Replicate MULTI if this is the first instruction inside MULTI/EXEC
+                if (i == 0) {
+                    robj *argv[1];
+                    argv[0] = createStringObjectFromSds(sdsnew("multi"));
+                    bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("multi"), 1, argv);
+                    decrRefCount(argv[0]);
+                }
+                bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv);
+            }
+
+            // Call handleCommandReplication for EXEC
+            //  (this happens even when no command was queued, in which case no MULTI precedes it)
+            robj *argv[1];
+            argv[0] = createStringObjectFromSds(sdsnew("EXEC"));
+            bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("exec"), 1, argv);
+            server.in_exec = 0;
+            decrRefCount(argv[0]);
+        }
+
+
+        // Simulate the expiration (active expiration) of a key.  This is independent of command execution.
+        //  itemNum - the item to expire; it must currently exist in the DB.
+        //  Order of events: propagate "DEL <key>", notify bgIteration of the key delete, then
+        //  actually remove the item.
+        void simulateExpiration(int itemNum) {
+            ASSERT_NE(getItem(itemNum), nullptr); // Should be there before expire
+
+            // NOTE: This seems weird, but Valkey propagates the delete before actually expiring the
+            //       key.  BgIterator expects this behavior and expects the key to exist when the
+            //       DEL is received for propagation.
+
+            // Send bgIteration the DEL
+            int db = getDbFromItemNum(itemNum);
+            sds sdsKey = sdsnew(keyStr(itemNum));
+            robj *argv[2];
+            argv[0] = createStringObjectFromSds(sdsnew("DEL"));
+            // argv[1] gets its own copy: sdsKey is still needed below and is freed separately
+            argv[1] = createStringObjectFromSds(sdsdup(sdsKey));
+            serverCommand *cmd = lookupCommandByCString("DEL");
+            bgIteration_handleCommandReplication(db, cmd, 2, argv);
+            decrRefCount(argv[0]);
+            decrRefCount(argv[1]);
+
+            bgIteration_keyDelete(db, sdsKey);
+            simpleDelItem(itemNum);     // Simulate the actual del
+
+            EXPECT_EQ(getItem(itemNum), nullptr);
+            sdsfree(sdsKey);
+        }
+
+
+        // Simulates an expiration, but validates behavior for an item inuse by bgIteration.
+        //  itemNum - an item which must exist and currently be inuse (refcount 2).
+        void simulateExpirationOfInuse(int itemNum) {
+            // An inuse item will have a refcount > 1.  BgIteration should have incremented the
+            //  refcount while it is inuse.
+            dbEntry *de = getItem(itemNum);
+            ASSERT_NE(de, nullptr); // Should be there before expire
+            EXPECT_TRUE(bgIteration_isEntryInuse(de));
+            EXPECT_EQ(de->refcount, 2u);
+
+            simulateExpiration(itemNum);
+
+            // At this point, the item is removed from the DB, but still exists, and the refcount
+            //  has been reduced to 1.  This allows a background thread to continue using the item.
+            //  (`de` is deliberately read after the delete - the remaining reference keeps it valid.)
+            EXPECT_EQ(de->refcount, 1u);
+        }
+
+
+        // Simulates an expiration, but the item is a future item which will be expedited.
+        //  itemNum - an item which must exist and must NOT yet be inuse (refcount 1).
+        void simulateExpirationWithExpedite(int itemNum) {
+            // A future item hasn't been picked up by bgIteration yet, so it is not inuse and
+            //  its refcount is still 1.
+            dbEntry *de = getItem(itemNum);
+            ASSERT_NE(de, nullptr); // Should be there before expire
+            EXPECT_FALSE(bgIteration_isEntryInuse(de)); // Not yet inuse
+            EXPECT_EQ(de->refcount, 1u);
+
+            simulateExpiration(itemNum);
+
+            // At this point, the item is removed from the DB, but still exists: it is now marked
+            //  inuse and its refcount is still 1.  (Presumably bgIteration took a reference when
+            //  expediting the item and the DB dropped its own on delete - confirm.)  This allows
+            //  a background thread to continue using the item.
+            EXPECT_TRUE(bgIteration_isEntryInuse(de)); // It's inuse now
+            EXPECT_EQ(getItem(itemNum), nullptr);      // but it's not in the DB anymore
+            EXPECT_EQ(de->refcount, 1u);
+        }
+
+
+        // Simulate execution of a SWAPDB command:  SWAPDB <dbid0> <dbid1>
+        //  dbid0, dbid1 - indexes of the two DBs to exchange in server.db[]
+        //  A temporary client (bound to DB 0) carries the command.  The block check must report
+        //  "not blocked", the two server.db[] entries are exchanged, and the command is then
+        //  handed to bgIteration for replication.
+        void simulateSwapDB(int dbid0, int dbid1) {
+            client *c = static_cast<client*>(zcalloc(sizeof(client)));
+
+            c->cmd = lookupCommandByCString("swapdb");
+            c->db = server.db[0];
+
+            c->argc = 3;
+            c->argv = static_cast<robj**>(zcalloc(sizeof(robj*) * c->argc));
+            c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname));
+            // Format the DB ids as decimal strings, so that ids above 9 work as well
+            c->argv[1] = createStringObjectFromSds(sdsfromlonglong(dbid0));
+            c->argv[2] = createStringObjectFromSds(sdsfromlonglong(dbid1));
+
+            bool blocked = bgIteration_blockClientIfRequired(c);
+            EXPECT_FALSE(blocked);  // SWAPDB should never block
+
+            // The real SWAP does more than this, but this is enough for unit tests
+            serverDb *aux = server.db[dbid0];
+            server.db[dbid0] = server.db[dbid1];
+            server.db[dbid1] = aux;
+
+            bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv);
+
+            freeTestClient(c);
+        }
+
+
+        // Simulate execution of a FLUSHDB or FLUSHALL command
+        //  db          - the DB to flush (FLUSHDB), or -1 to flush every DB (FLUSHALL)
+        //  anInUseItem - an item expected to have refcount 2 before the flush (per the parameter
+        //                name, one that is inuse by bgIteration) and refcount 1 afterwards
+        //  NOTE(review): the result of getItem(anInUseItem) is dereferenced without a null
+        //                check - confirm that callers always pass an existing item.
+        void simulateFlushDB(int db, int anInUseItem) {
+            client *c = static_cast<client*>(zcalloc(sizeof(client)));
+
+            if (db == -1) {
+                c->cmd = lookupCommandByCString("flushall");
+                c->db = server.db[0];
+            } else {
+                c->cmd = lookupCommandByCString("flushdb");
+                c->db = server.db[db];
+            }
+
+            c->argc = 1;
+            c->argv = static_cast<robj**>(zcalloc(sizeof(robj*) * c->argc));
+            c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname));
+
+            dbEntry *de_in_use = getItem(anInUseItem);
+            EXPECT_EQ(de_in_use->refcount, 2u);
+
+            bool blocked = bgIteration_blockClientIfRequired(c);
+            EXPECT_FALSE(blocked);  // FLUSHDB should never block
+
+            // The real FLUSH does more than this, but this is enough for unit tests
+
+            // Now flush the items
+            //  (each flushed DB is left with keys == NULL; no new kvstore is created here)
+            for (int d = 0;  d < server.dbnum;  d++) {
+                if (db == -1 || db == d) {
+                    kvstoreRelease(server.db[d]->keys);
+                    server.db[d]->keys = NULL;
+                }
+            }
+
+            // The entry must have survived the flush, with one reference fewer
+            EXPECT_EQ(de_in_use->refcount, 1u);
+
+            // and replicate
+
+            bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv);
+
+            freeTestClient(c);
+        }
+};
+
+// Alias of the fixture, used for tests containing EXPECT_DEATH.  GoogleTest's convention is to
+//  name suites holding death tests "*DeathTest"; such suites are run before all others.
+using BgIterationDeathTest = BgIterationTest;
+
+
+// Smoke test for the fixture itself; the body is intentionally empty.
+TEST_F(BgIterationTest, dbIsOK) {
+    // Just run the setup/teardown code to make sure the DB is OK.
+}
+
+
+/////////////////////////////////////////////////////
+// Simple Full-scan iterator tests
+/////////////////////////////////////////////////////
+
+// A simple full scan that just checks basic flow: a freshly created iterator can be found by
+//  name, reports zeroed counters, and can no longer be found once it has been cleaned up.
+TEST_F(BgIterationTest, createAndCleanup) {
+    bgIterator *it = bgIteratorCreateFullScanIter("simple",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+    EXPECT_EQ(bgIteratorFind("simple"), it);
+    EXPECT_STREQ(bgIteratorName(it), "simple");
+
+    bgIteratorStatus status;
+    bgIteratorGetStatus(it, &status);
+
+    // Nothing has been read yet, so every queued/processed counter must still be zero
+    EXPECT_EQ(status.dbentries_queued, 0u);
+    EXPECT_EQ(status.dbentries_processed, 0u);
+    EXPECT_EQ(status.replication_queued, 0u);
+    EXPECT_EQ(status.replication_processed, 0u);
+    EXPECT_EQ(status.swapdb_queued, 0u);
+    EXPECT_EQ(status.swapdb_processed, 0u);
+    EXPECT_EQ(status.flushdb_queued, 0u);
+    EXPECT_EQ(status.flushdb_processed, 0u);
+
+    // The queue is empty, but a non-zero target length must already be set
+    EXPECT_EQ(status.queue_length, 0u);
+    EXPECT_GT(status.queue_length_target, 0u);
+
+    // NOTE(review): the 5ms bound depends on wall-clock time and could be flaky on a heavily
+    //               loaded machine - confirm this is acceptable.
+    EXPECT_LT(status.runtime_ms, 5u);
+    EXPECT_EQ(status.current_item_ms, 0u);
+
+    expectAnythingCleanup(it);
+
+    // After cleanup, the iterator must no longer be registered under its name
+    EXPECT_EQ(bgIteratorFind("simple"), nullptr);
+}
+
+
+// Close the iterator before reading anything from it.
+//  The cleanup callback must run exactly once and must report the iteration as terminated.
+TEST_F(BgIterationTest, testClientCloseBeforeRead) {
+    bgIterator *it = bgIteratorCreateFullScanIter("simple",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+    bgIteration_feedIterators();
+
+    bgIteratorClose(it); // Immediately close before reading
+
+    bgIteration_feedIterators(); // Recognize the closed iterator
+
+    // Check that the cleanup callback was executed properly
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+
+// Test that the full scan hits each item in the expected sequence.
+//  Also checks the queued/processed counters just before completion, and that the cleanup
+//  callback reports a normal (non-terminated) finish.
+TEST_F(BgIterationTest, orderedIteration) {
+    bgIterator *it = bgIteratorCreateFullScanIter("simple",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKeySequence(it, 0, LAST_ITEM);
+
+    // Quick status check.  At this point, item #9 hasn't been returned yet.
+    //  (Item #9 is presumably LAST_ITEM: every item is queued, but the one most recently
+    //  read doesn't count as processed until the next read.)
+    bgIteratorStatus status;
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentries_queued, static_cast<unsigned int>(TOTAL_ITEMS));
+    EXPECT_EQ(status.dbentries_processed, static_cast<unsigned int>(TOTAL_ITEMS) - 1);
+
+    expectReadComplete(it); // Returns item #9, and reads the completion item
+
+    // Check that the cleanup callback was executed properly
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_FALSE(cleanupTerminated);
+}
+
+
+// Test that two simultaneous iterations work properly.
+//  Reads are interleaved pseudo-randomly between two full-scan iterators; each of them must
+//  still see every item in order, and each must complete with its own cleanup callback.
+TEST_F(BgIterationTest, twoOrderedIterations) {
+    bgIterator *first = bgIteratorCreateFullScanIter("simple1",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+    bgIterator *second = bgIteratorCreateFullScanIter("simple2",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+    EXPECT_EQ(bgIteratorFind("simple1"), first);
+    EXPECT_EQ(bgIteratorFind("simple2"), second);
+
+    int nextForFirst = 0;
+    int nextForSecond = 0;
+    for (;;) {
+        const bool firstDone = nextForFirst >= TOTAL_ITEMS;
+        const bool secondDone = nextForSecond >= TOTAL_ITEMS;
+        if (firstDone && secondDone) break;
+
+        // A coin flip picks the iterator.  A flip landing on an exhausted iterator is a no-op.
+        const bool pickFirst = (rand() % 2) == 0;
+        if (pickFirst && !firstDone) {
+            expectReadKey(first, nextForFirst);
+            nextForFirst++;
+        } else if (!pickFirst && !secondDone) {
+            expectReadKey(second, nextForSecond);
+            nextForSecond++;
+        }
+    }
+
+    // All items are read; only the final completions remain
+    expectReadComplete(first);
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_FALSE(cleanupTerminated);
+
+    expectReadComplete(second);
+    EXPECT_EQ(cleanupCount, 2);
+    EXPECT_FALSE(cleanupTerminated);
+}
+
+
+/////////////////////////////////////////////////////
+// MODIFY A FUTURE ITEM
+// The next tests validate the basic pattern when a key, not yet iterated, is modified.
+// Each variation of iteration flags is tested.
+// Note that these tests execute without cloning (cloning is tested elsewhere).
+/////////////////////////////////////////////////////
+
+// Modify a future item, without replication or consistency.
+// Our expectation for this case is that the modification should proceed without blocking, the item
+//  shouldn't be expedited, and we will see the modified item once the iterator reaches it.
+TEST_F(BgIterationTest, modFutureItem_NoReplication_NoConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    client *c = getWriteClient(6, "xxx");
+
+    // We DON'T expect the client to be blocked - not consistent
+    simulateUnblockedWriteWithModification(c);
+
+    // Now continue reading, 1, 2, 3, 4, 5
+    expectReadKeySequence(it, 1, 5);
+
+    // Let's validate that key 6 shows the new value
+    expectReadKey(it, 6, "xxx");
+
+    // Continue...
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// Modify a future item, without replication but with consistency.  (Like a SAVE operation)
+// Our expectation for this case is that the modification SHOULD be blocked, as we have to save
+//  the item in its state before the modification.  To reduce blocking time, the item should be
+//  moved to the head of the queue - there's no replication in this case, so out-of-order processing
+//  isn't a concern.
+TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    client *c = getWriteClient(6, "xxx");
+    // Since this is consistent, we will block the client, disallowing the write.
+    simulateBlockedWrite(c);
+
+    // On a consistent iterator, the event is expedited in front of items already in queue!
+    //  Read key 6 out of order.
+    expectReadKey(it, 6);
+
+    // Now, when we read key 1, key 6 is released back to Valkey, and the client will be unblocked.
+    expectReadKeyWithUnblock(it, 1, 6);
+    simulateUnblockedWriteWithModification(c); // Now the write can proceed
+
+    // Continue...
+    expectReadKeySequence(it, 2, 5);
+    // 6 has already been processed
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// Modify a future item, with replication but without consistency.  (Like a Threadsave Full Sync operation)
+// Our expectation for this case is that the modification should proceed without blocking, as the
+//  mode is inconsistent.  We don't expect replication, as we haven't reached the item yet.  We'll
+//  see the modified item later.
+TEST_F(BgIterationTest, modFutureItem_YesReplication_NoConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    client *c = getWriteClient(6, "xxx");
+
+    // We DON'T expect the client to be blocked - not consistent
+    simulateUnblockedWriteWithModification(c);
+
+    // NOTE:  Since we haven't reached this item yet, and consistency is not required, there's no
+    //        need to replicate this command.  So everything should wrap up just fine - we will see
+    //        the new value when we get to it.
+
+    // Now continue reading, 1, 2, 3, 4, 5
+    //  (no replication item shows up in between, or these reads would fail)
+    expectReadKeySequence(it, 1, 5);
+
+    // Let's validate that key 6 shows the new value
+    expectReadKey(it, 6, "xxx");
+
+    // Continue...
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// There's no current use case for CONSISTENT with REPLICATION.  It's included for completeness
+//  and to clarify the functionality of the design.  However, if this combination were to be used,
+//  it would be invalid in the presence of SWAPDB.
+// This test expects that creating such an iterator terminates the process.
+TEST_F(BgIterationDeathTest, modFutureItem_YesReplication_YesConsistent_fail) {
+    // Note:  This configuration (CONSISTENT with REPLICATION) is invalid unless in cluster mode.
+    //        The issue is that SWAPDB, which is supported with multiple databases, creates a
+    //        problem.  How is it possible to maintain a CONSISTENT view with a SWAPDB impacting
+    //        the values seen in the replication stream?  (Cluster mode doesn't support SWAPDB,
+    //        so no issue there.)
+    // The empty pattern matches any death message.
+    EXPECT_DEATH(bgIteratorCreateFullScanIter("iter", BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT,
+            NULL, NULL, NULL), "");
+}
+
+
+/////////////////////////////////////////////////////
+// MODIFY A CURRENT ITEM
+// The next tests validate the basic pattern when a key, currently in use, is modified.
+// Each variation of iteration flags is tested.
+// Note that these tests execute without cloning (cloning is tested elsewhere).
+/////////////////////////////////////////////////////
+
+// Modify a current item, without replication or consistency.
+// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't
+//  be expedited (it's already in use).
+TEST_F(BgIterationTest, modCurrentItem_NoReplication_NoConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    client *c = getWriteClient(2, "xxx");
+
+    // Must be blocked since key is queued
+    simulateBlockedWrite(c);
+
+    // Now continue reading
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    // Reading key 3 is expected to trigger the unblock call for key 2
+    expectReadKeyWithUnblock(it, 3, 2);
+    simulateUnblockedWriteWithModification(c);     // the actual write won't affect anything (past key, no replication)
+
+    // Continue...
+    expectReadKeySequence(it, 4, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// Modify a current item, without replication but with consistency.  (Like a SAVE operation)
+// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't
+//  be expedited (it's already in use).
+TEST_F(BgIterationTest, modCurrentItem_NoReplication_YesConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    client *c = getWriteClient(2, "xxx");
+
+    // Must be blocked since key is queued
+    simulateBlockedWrite(c);
+
+    // Now continue reading
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    // Reading key 3 is expected to trigger the unblock call for key 2
+    expectReadKeyWithUnblock(it, 3, 2);
+    simulateUnblockedWriteWithModification(c);     // the actual write won't affect anything (past key, no replication)
+
+    // Continue...
+    expectReadKeySequence(it, 4, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// Modify a current item, with replication but without consistency.  (Like a Threadsave Full Sync operation)
+// Our expectation for this case is that the modification SHOULD be blocked.  After the key is processed,
+//  the write will proceed, and the replication will be sent.
+TEST_F(BgIterationTest, modCurrentItem_YesReplication_NoConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Prepare a write against key 2, which is one of the queued keys.
+    client *c = getWriteClient(2, "xxx");
+
+    // Must be blocked since key 2 is queued
+    simulateBlockedWrite(c);
+
+    // Now continue reading.  The client stays blocked until the read of key 3, which is expected
+    //  to trigger the unblock for key 2.
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKeyWithUnblock(it, 3, 2);
+    simulateUnblockedWriteWithModification(c);     // the actual write will cause replication
+
+    expectReadKey(it, 4);  // 4 got put in queue when 3 was read
+
+    // The replicated write is expected only after the key that was already queued (4).
+    expectReadReplication(it, c);
+
+    // Continue...
+    expectReadKeySequence(it, 5, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+#ifdef CODE_NOT_READY_YET
+// Modify a current item, with both replication and consistency.
+// This test is only compiled when CODE_NOT_READY_YET is defined.
+TEST_F(BgIterationTestCluster, modCurrentItem_YesReplication_YesConsistent_cluster) {
+    // Cluster test.  REPLICATION + CONSISTENT only supported in cluster mode
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT,
+            NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read. All other keys are queued.
+    client *c = getWriteClient(1, "xxx");
+
+    // Since this is consistent, we will block the client, disallowing the write.
+    simulateBlockedWrite(c);
+
+    // Not expedited because item is already in queue
+    expectReadKey(it, 1);
+    // NOTE(review): this call passes four arguments, while the enabled tests in this file call
+    //  expectReadKeyWithUnblock with three - confirm the helper signature before enabling.
+    expectReadKeyWithUnblock(it, 2, nullptr, 1);  // reading original/unmodified item
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKey(it, 3);  // 2, 3 & 4 are in the same bucket, so the replication comes after
+    expectReadKey(it, 4);
+    expectReadReplication(it, c);
+
+    // Continue...
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+#endif
+
+
+/////////////////////////////////////////////////////
+// MODIFY A PAST ITEM
+// The next tests validate the basic pattern when a key, already iterated on, is modified.
+// Each variation of iteration flags is tested.
+// Note that these tests execute without cloning (cloning is tested elsewhere).
+/////////////////////////////////////////////////////
+
+// Modify a past item, without replication or consistency.
+// Our expectation for this case is that the modification should proceed without blocking.
+//  No replication is generated and keys are processed similar to no modification.
+TEST_F(BgIterationTest, modPastItem_NoReplication_NoConsistent) {
+    // Flags == 0: neither BGITERATOR_FLAG_REPLICATION nor BGITERATOR_FLAG_CONSISTENT.
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // Reading key 1 returns key 0 (making key 0 a past item)
+    expectReadKey(it, 1);
+
+    // At this point, key 0 is returned.  Write to it; the test expects no blocking.
+    client *c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // Continue...  The remaining keys are expected in their normal order.
+    expectReadKeySequence(it, 2, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// Modify a past item, without replication but with consistency.  (Like a SAVE operation)
+// Our expectation for this case is that the modification should proceed without blocking.
+//  No replication is generated and keys are processed similar to no modification.
+TEST_F(BgIterationTest, modPastItem_NoReplication_YesConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // Reading key 1 returns key 0 (making key 0 a past item)
+    expectReadKey(it, 1);
+
+    // At this point, key 0 is returned.  Write to it; the test expects no blocking.
+    client *c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // Continue...  The remaining keys are expected in their normal order.
+    expectReadKeySequence(it, 2, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// Modify a past item, with replication but without consistency.  (Like a Threadsave Full Sync operation)
+// Our expectation for this case is that the modification should proceed without blocking.
+//  Replication will be sent.
+TEST_F(BgIterationTest, modPastItem_YesReplication_NoConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // Reading key 1 returns key 0 (making key 0 a past item)
+    expectReadKey(it, 1);
+
+    // At this point, key 0 is returned.  Write to it; the test expects no blocking.
+    client *c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 2 was already in queue (same bucket as key 1).  The replication will follow.
+    expectReadKey(it, 2);
+    expectReadReplication(it, c);
+
+    // Continue...
+    expectReadKeySequence(it, 3, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+#ifdef CODE_NOT_READY_YET
+// Modify a past item, with both replication and consistency.
+// This test is only compiled when CODE_NOT_READY_YET is defined.
+TEST_F(BgIterationTestCluster, modPastItem_YesReplication_YesConsistent_cluster) {
+    // Cluster test.  REPLICATION + CONSISTENT only supported in cluster mode
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT,
+            NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // Reading key 1 returns key 0 (making key 0 a past item)
+    expectReadKey(it, 1);
+
+    // At this point, key 0 is returned.  Write to it; the test expects no blocking.
+    client *c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // Keys 2, 3, and 4 were already in queue.  The replication will follow.
+    expectReadKey(it, 2);
+    expectReadKey(it, 3);
+    expectReadKey(it, 4);
+    expectReadReplication(it, c);
+
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+#endif
+
+
+/////////////////////////////////////////////////////
+// TESTS FOR ITEM CLONING
+/////////////////////////////////////////////////////
+
+// In a consistent iteration, verify that a simple string is properly cloned, and that a write can
+//  occur without blocking.  Validate the cloned item and metadata.
+TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent_CloneExpeditedItem) {
+    // Initialize cloning configurations.
+    // NOTE(review): the arguments are presumably (max clonable item size, clone pool size) -
+    //  confirm against bgIteration_unitTestEnableCloning.
+    bgIteration_unitTestEnableCloning(50, 100);
+
+    bgIteratorStatus status;
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    client *c = getWriteClient(6, "xxx");
+
+    // Quick status check.  At this point, no clones exist yet.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 0u);
+
+    // Since item 6 should be cloned, it will not block the client, allowing the write.
+    // Snapshot the metadata of item 6 first so the clone can be compared against it later.
+    void *de6_md = cloneMetadata(getItem(6));
+    simulateClonedWrite(it, c); // This wouldn't block, and queues the cloned value
+    simulateUnblockedWriteWithModification(c); // This modifies the real entry in the de (touching metadata)
+
+    // At this point, one clone is in the queue.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 1u);
+
+    // On a consistent iterator, the item is expedited in front of items already in queue!
+    //  Read key 6 (which is cloned) out of order.  The value will still match the key.
+    expectReadClonedKey(it, 6, de6_md); // Also validates and frees the metadata
+
+    // Quick status check.  At this point, cloned items have not been marked as processed yet.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 0u);
+
+    // Reading key 1 will release key 6, and the clone will finish processing.
+    expectReadKey(it, 1);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 1u);
+
+    // Reading key 2 should not have an impact on the number of processed clones.
+    expectReadKey(it, 2);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 1u);
+
+    // Continue...
+    expectReadKeySequence(it, 3, 5);
+    // 6 has already been processed
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// Check that cloning for simple strings is respecting the size limits and pool size.  On a
+//  consistent iteration, we expect to block or clone on all future keys.  We validate that we can
+//  clone if the item is small enough and the cloning pool has more space left.
+TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent_LargeItemOrClonePoolFull) {
+    // Initialize cloning configurations to test the clone pool functionality first.
+    // NOTE(review): the arguments are presumably (max clonable item size, clone pool size) -
+    //  confirm against bgIteration_unitTestEnableCloning.
+    bgIteration_unitTestEnableCloning(50, 50);
+
+    bgIteratorStatus status;
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake modifications to later keys so that we can see if they get processed out of order.
+    client *c6 = getWriteClient(6, "xxx");
+    client *c7 = getWriteClient(7, "xxx");
+    client *c8 = getWriteClient(8, "xxx");
+
+    // Quick status check.  At this point, no clones exist yet.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 0u);
+
+    // Since item 6 should be cloned, it will not block the client, allowing the write.
+    // Snapshot the metadata of item 6 first so the clone can be compared against it later.
+    void *de6_md = cloneMetadata(getItem(6));
+    simulateClonedWrite(it, c6);
+    simulateUnblockedWriteWithModification(c6);
+
+    // At this point, one clone is in the queue.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 1u);
+
+    // Now that the cloning pool is full, item 7 will not be cloned and the client will be blocked.
+    simulateBlockedWrite(c7);
+    ASSERT_TRUE(bgIteration_isEntryInuse(getItem(7)));
+
+    // There is still only one cloned item in the queue.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 1u);
+
+    // Now change cloning configurations to test that large items will not be cloned. We adjust
+    //  the clone pool size to allow two items, but set the maximum item size to be smaller than
+    //  the size of item 8. The clone pool size must be larger than the total size of the existing
+    //  clones plus the maximum item clone size.
+    bgIteration_unitTestEnableCloning(1, 101);
+
+    // This write will pass the clone pool check but fail the item size check, blocking the client.
+    simulateBlockedWrite(c8);
+    ASSERT_TRUE(bgIteration_isEntryInuse(getItem(8)));
+
+    // On a consistent iterator, the expedited item is placed in front of items already in queue!
+    //  Read key 6 out of order.
+    expectReadClonedKey(it, 6, de6_md);
+
+    // Now, when we read key 7, which was expedited, key 6 will be released back to Valkey
+    //  and the clone will be deallocated here.
+    expectReadKey(it, 7);
+
+    // Now, when we read key 8, which was expedited, key 7 is released back to Valkey, and the client
+    //  will be unblocked.
+    // (actually, unblock is called after every key [just in case] - but functionally we only care
+    //  about this one)
+    expectReadKeyWithUnblock(it, 8, 7);
+    simulateUnblockedWriteWithModification(c7);
+
+    // Now, when we read key 1, key 8 is released back to Valkey, and the client will be unblocked.
+    expectReadKeyWithUnblock(it, 1, 8);
+    simulateUnblockedWriteWithModification(c8);
+
+    // Since only one item was cloned, there should be one clone processed
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 1u);
+
+    // Continue...
+    expectReadKeySequence(it, 2, 5);
+    // 6, 7, and 8 have already been processed
+    expectReadKeySequence(it, 9, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c6);
+    freeTestClient(c7);
+    freeTestClient(c8);
+}
+
+
+/////////////////////////////////////////////////////
+// TESTS RELATED TO MODIFICATION OF TWO ITEMS
+// When 2 keys are modified, we need to ensure that both keys have been sent before we can send
+//  replication.  This means that if replication is present, we may have to block/expedite for
+//  future keys, even in the inconsistent scenario.
+/////////////////////////////////////////////////////
+
+// Replication enabled, but NOT consistent.  In this case, if ANY of the keys have been iterated,
+//  ALL of the keys must be replicated so that the command can be processed properly on the replica.
+TEST_F(BgIterationTest, modPastFutureItem_YesReplication_NoConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // In this test, we need a past and future key IN THE SAME DB (they're used in the same command).
+    //  DB1 has lots of buckets.  After reading item 9,
+    //    8 will be past, 10 will be in queue, 11-15 will be future.
+    expectReadKeySequence(it, 0, 9);
+
+    // We're going to write to key 8 (past) and read from key 12 (future)
+    // Even though key 12 is for READ in this command, it must be expedited so that it exists before
+    //  the associated replication is sent.
+    client *c = getSetGetClient(8, "xxx", 12);
+    simulateBlockedWrite(c);
+
+    // Key 12 will be expedited, but not in front of existing items in queue (can only do that for
+    //  consistent iterators) - JHB How about cluster mode?
+
+    expectReadKey(it, 10); // was already in queue
+    expectReadKey(it, 12); // expedited
+    // Reading key 11 is expected to trigger the unblock for key 12.
+    expectReadKeyWithUnblock(it, 11, 12); // 13 is now in queue
+
+    simulateUnblockedWriteWithModification(c);
+
+    // Continue...  The replication is expected after key 13, which was already in queue.
+    expectReadKey(it, 13);
+    expectReadReplication(it, c);
+
+    expectReadKeySequence(it, 14, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// Replication NOT enabled.  A read-only key doesn't need to be expedited, even if other keys have
+//  been processed already.  (This should work identically for both consistent/non-consistent.)
+// This is the consistent variant.
+TEST_F(BgIterationTest, modPastFutureItem_NoReplication_YesConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter1",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // In this test, we need a past and future key IN THE SAME DB (they're used in the same command).
+    //  DB1 has lots of buckets.  After reading item 9,
+    //    8 will be past, 10 will be in queue, 11-15 will be future.
+    expectReadKeySequence(it, 0, 9);
+
+    // We're going to write to key 8 (past) and read from key 12 (future)
+    // Since there's no replication, we don't have to worry about expediting 12.  The write will
+    //  proceed without blocking.
+    client *c = getSetGetClient(8, "xxx", 12);
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 12 will not be expedited.  Remaining keys should be received in normal order.
+    expectReadKeySequence(it, 10, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+// Replication NOT enabled, non-consistent variant (flags == 0).  A command that writes a past key
+//  and reads a future key is expected to proceed without blocking, and the future key is not
+//  expedited.
+TEST_F(BgIterationTest, modPastFutureItem_NoReplication_NoConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter2",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // In this test, we need a past and future key IN THE SAME DB (they're used in the same command).
+    //  DB1 has lots of buckets.  After reading item 9,
+    //    8 will be past, 10 will be in queue, 11-15 will be future.
+    expectReadKeySequence(it, 0, 9);
+
+    // We're going to write to key 8 (past) and read from key 12 (future)
+    // Since there's no replication, we don't have to worry about expediting 12.  The write will
+    //  proceed without blocking.
+    client *c = getSetGetClient(8, "xxx", 12);
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 12 will not be expedited.  Remaining keys should be received in normal order.
+    expectReadKeySequence(it, 10, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+/////////////////////////////////////////////////////
+// TESTS RELATED TO MISSING ITEMS
+// Missing items are tricky.  A missing item might be logically located in the past or future, in
+//  relation to the current iteration position.  The command may (or may not) create the "missing"
+//  key.  Some general considerations:
+//    * In a consistent iteration, a missing key didn't exist at the time of consistency, or it was
+//      already processed (saved) at the time of the deletion.  If the missing key gets created, we
+//      must be sure to skip it if we later iterate over it.
+//    * In a non-consistent iteration with replication:
+//        * If the key location is already passed, the replication is sent, allowing the key to be
+//          created (or not) based on the replication.
+//        * If the key location is in the future, we can allow the command to proceed, without
+//          replication.  If the key is created, we will process it when the iterator gets to it.
+//
+// We expect:
+//  no-repl, no-consist:  past items are ignored - future items are processed when iterated
+//  no-repl, yes-consist:  past items are ignored - future items are ignored
+//  yes-repl, no-consist:  past item skipped, but replicated - future items are created by replication and skipped later
+//  yes-repl, yes-consist:  past item skipped, but replicated - future items are processed when iterated
+/////////////////////////////////////////////////////
+
+// no-repl, no-consist: creation of PAST item has no impact
+TEST_F(BgIterationTest, missingPastItem_NoReplication_NoConsistent) {
+    simpleDelItem(0); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Key 0 is missing, so iteration is expected to start with key 1.
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+
+    // Re-create the missing key 0; the write is expected to proceed without blocking.
+    client *c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // The remaining keys are expected in normal order; the re-created key 0 is not read.
+    expectReadKeySequence(it, 3, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// no-repl, yes-consist: creation of PAST item has no impact
+TEST_F(BgIterationTest, missingPastItem_NoReplication_YesConsistent) {
+    simpleDelItem(0); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Key 0 is missing, so iteration is expected to start with key 1.
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+
+    // Re-create the missing key 0; the write is expected to proceed without blocking.
+    client *c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // The remaining keys are expected in normal order; the re-created key 0 is not read.
+    expectReadKeySequence(it, 3, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// yes-repl, no-consist: creation of a PAST item will be replicated
+TEST_F(BgIterationTest, missingPastItem_YesReplication_NoConsistent) {
+    simpleDelItem(0); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Key 0 is missing, so iteration is expected to start with key 1.
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKey(it, 3);
+
+    // Re-create the missing key 0; the write is expected to proceed without blocking.
+    client *c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c); // replication will be added after item 4 (3,4 in same bucket)
+
+    expectReadKey(it, 4);
+
+    // The replication of the write that created key 0 is expected here.
+    expectReadReplication(it, c);
+
+    expectReadKeySequence(it, 5, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+#ifdef CODE_NOT_READY_YET
+// yes-repl, yes-consist: creation of a PAST item will be replicated
+// This test is only compiled when CODE_NOT_READY_YET is defined.
+TEST_F(BgIterationTestCluster, missingPastItem_YesReplication_YesConsistent) {
+    // Cluster test.  REPLICATION + CONSISTENT only supported in cluster mode
+    simpleDelItem(0); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT,
+            NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Key 0 is missing, so iteration is expected to start with key 1.
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+
+    // Re-create the missing key 0; the write is expected to proceed without blocking.
+    client *c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);     // replication will be added after item 4 (2, 3, and 4 in same bucket)
+
+    expectReadKey(it, 3);
+    expectReadKey(it, 4);
+    expectReadReplication(it, c);
+
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+#endif
+
+
+// no-repl, no-consist: creation of FUTURE item is seen when reached by the iteration.
+TEST_F(BgIterationTest, missingFutureItem_NoReplication_NoConsistent) {
+    // Using DB1 so we have lots of buckets
+    // Note:  Choosing item 14 because it's in the portion of DB1 that's already rehashed.  So we
+    //  know that the item won't be moving when we re-add it.
+    simpleDelItem(14); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0);
+
+    // Re-create key 14 with a known value; the write is expected to proceed without blocking.
+    const char * newValue = "xxx";
+    client *c = getWriteClient(14, newValue);
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKeySequence(it, 1, 13);
+
+    // We expect to see item 14, carrying the newly written value.
+    //  Note that for an inconsistent DB view, it is logically undefined if this value is seen (or not).
+    //  But as implemented, we should see it and the test is helpful to understand if/when the
+    //  functionality changes.
+    expectReadKey(it, 14, newValue);
+
+    expectReadKey(it, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// no-repl, yes-consist: creation of FUTURE item is ignored by consistent iteration.
+TEST_F(BgIterationTest, missingFutureItem_NoReplication_YesConsistent) {
+    // Using DB1 so we have lots of buckets
+    // Note:  Choosing item 14 because it's in the portion of DB1 that's already rehashed.  So we
+    //  know that the item won't be moving when we re-add it.
+    simpleDelItem(14); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0);
+
+    // Re-create key 14; the write is expected to proceed without blocking.
+    client *c = getWriteClient(14, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKeySequence(it, 1, 13);
+    // Key 14 is not expected - it didn't exist at start of consistent iteration
+    expectReadKey(it, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// yes-repl, no-consist: creation of FUTURE item is handled by the replication, and then the key is
+//  later skipped (treated like an early iteration case).
+TEST_F(BgIterationTest, missingFutureItem_YesReplication_NoConsistent) {
+    // Using DB1 so we have lots of buckets
+    // Note:  Choosing item 14 because it's in the portion of DB1 that's already rehashed.  So we
+    //  know that the item won't be moving when we re-add it.
+    simpleDelItem(14); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0); // Items 1 & 2 are in queue (same bucket)
+
+    // Re-create key 14; the write is expected to proceed without blocking.
+    client *c = getWriteClient(14, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // The already-queued items come first...
+    expectReadKeySequence(it, 1, 2);
+
+    expectReadReplication(it, c); // Here's the replication creating item 14
+
+    expectReadKeySequence(it, 3, 13);
+    // We expect item 14 to be skipped, because it was created by the earlier replication
+    expectReadKey(it, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+#ifdef CODE_NOT_READY_YET
+// yes-repl, yes-consist: a missing FUTURE item is created while iterating; the test expects the
+//  replication to be read and the key itself to be skipped.
+// This test is only compiled when CODE_NOT_READY_YET is defined.
+TEST_F(BgIterationTestCluster, missingFutureItem_YesReplication_YesConsistent) {
+    // Cluster test.  REPLICATION + CONSISTENT only supported in cluster mode
+    simpleDelItem(4); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT,
+            NULL, iteratorCleanupFn, PRIVDATA);
+
+    bgIteration_feedIterators();    // Make sure we get key 0 and 1 into the queue
+
+    // Re-create key 4; the write is expected to proceed without blocking.
+    client *c = getWriteClient(4, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // Keys 0 and 1 were queued before the write, so they are expected ahead of the replication.
+    expectReadKey(it, 0);
+    expectReadKey(it, 1);
+
+    expectReadReplication(it, c);
+
+    expectReadKey(it, 2);
+    expectReadKey(it, 3);
+
+    // The replication was read - we don't want to see the key now - #4 should be skipped
+
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+#endif
+
+
+/////////////////////////////////////////////////////
+// TESTS RELATED TO EXPIRATION
+// Expiration can be tricky.  When pre-evaluating a command with bgIteration_blockClientIfRequired,
+//  a key might exist, but be ready for expiration.  Then, as the command executes, the key expires
+//  and gets deleted before the write operation.  Consider SET K V.
+//  In the unexpired case, this appears to bgIteration as a single SET command (which replaces the value).
+//  In the expired case, bgIteration will receive a DEL followed by a SET.
+//
+// Another case is a READ command.  A read command won't cause the client to be blocked.  However,
+//  if the key is expired, this will cause a DEL.  For consistent processing, this key might need to
+//  be expedited so that it can be processed before it gets deleted.  In this case, the key is
+//  unlinked from the main Valkey dictionary, but the actual deletion is deferred.
+/////////////////////////////////////////////////////
+
+// no-repl, no-consist: expire a past key, a current (in-use) key and a future key while iterating.
+//  The test expects the in-use key 2 to still be read, and the future key 5 to be absent.
+TEST_F(BgIterationTest, expireKeys_NoReplication_NoConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0);
+    expectReadKey(it, 1);
+
+    // At this point, key 1 is active, key 2 is in queue.
+
+    simulateExpiration(0);        // Past - we no longer care
+    simulateExpirationOfInuse(2); // Current - it's inuse
+    simulateExpiration(5);        // Future - we don't care (non-consistent)
+
+    expectReadKeySequence(it, 2, 4);
+    // key 5 has been deleted
+    expectReadKeySequence(it, 6, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// yes-repl, no-consist: expire a past key, a current (in-use) key and a future key while iterating.
+//  The test expects DEL replication for the past and current keys, none for the future key, and
+//  the future key 5 to be absent from the iteration.
+TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0);
+    expectReadKey(it, 1);
+
+    // At this point, key 1 is active, key 2 is in queue.
+
+    simulateExpiration(0);        // Past - we expect replication
+    simulateExpirationOfInuse(2); // Current - it's inuse, but we expect replication
+    simulateExpiration(5);        // Future - we don't care (non-consistent)
+
+    expectReadKey(it, 2); // this was already queued
+    expectReadReplicationDel(it, 0); // Past item should replicate
+    expectReadReplicationDel(it, 2); // Current item should replicate
+    // Item 5 is a future item and doesn't need to replicate
+
+    expectReadKeySequence(it, 3, 4);
+    // Item 5 has been deleted
+    expectReadKeySequence(it, 6, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// no-repl, yes-consist: expire a past key, a current (in-use) key and a future key while iterating.
+//  The test expects the future key 5 to be expedited and read ahead of the queued keys.
+TEST_F(BgIterationTest, expireKeys_NoReplication_YesConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0);
+    expectReadKey(it, 1);
+
+    // At this point, key 1 is active, key 2 is in queue.
+
+    simulateExpiration(0);             // Past - we no longer care
+    simulateExpirationOfInuse(2);      // Current - we must defer
+    simulateExpirationWithExpedite(5); // Future - will become inuse and expedited for consistency
+
+    expectReadKey(it, 5);  // Expedited to front
+
+    expectReadKeySequence(it, 2, 4);
+    // Item 5 was already read above (expedited), so it is not expected again here
+    expectReadKeySequence(it, 6, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Special case during a non-consistent iteration with replication and expiration.
+//  1. A future key is created (and processed by its replication) - considered early iterated
+//  2. Later the key is expired and deleted during command processing (causes DEL to be sent) - no longer early iterated
+//  3. The key is recreated as part of the command processing (and this command was replicated) - again early iterated
+//  4. Finally, when we iterate to the key, it shouldn't be sent, because it was replicated in step 3.
+// NOTE: "Exipred" in the test name is a typo for "Expired"; the name is kept unchanged so that
+//  existing test filters keep matching.
+TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent_FutureKeyCreatedThenExipredDuringSet) {
+    simpleDelItem(8);   // Start with a missing future item
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0);  // Get the iterator started
+
+    client *c = getWriteClient(8, "xxx");
+    simulateUnblockedWriteWithModification(c); // Not blocked because this is a future key (but we expect repl)
+
+    // Now do it again, but break out the steps so that we can simulate an expiration
+    bool blocked = bgIteration_blockClientIfRequired(c);
+    EXPECT_FALSE(blocked); // Shouldn't be blocked because this is a future key
+
+    // Now, as the SET command tries to execute, simulate that the key is expired.  Expiration
+    //  processing sends the replication FIRST!
+    // argv[0] is owned by this test (released below); argv[1] is borrowed from the client.
+    robj *argv[2];
+    argv[0] = createStringObjectFromSds(sdsnew("DEL"));
+    argv[1] = c->argv[1];
+    serverCommand *cmd = lookupCommandByCString("DEL");
+    bgIteration_handleCommandReplication(getDbFromItemNum(8), cmd, 2, argv);
+    decrRefCount(argv[0]);
+
+    // Now the call to keyDelete happens (after the replication).
+    bgIteration_keyDelete(getDbFromItemNum(8), static_cast<sds>(objectGetVal(c->argv[1])));
+    simpleDelItem(8);     // Simulate the actual del
+
+    // Now the SET will run, re-creating the item (which is still a future item)
+    // We need to duplicate the value because setKey() can reallocate it.
+    robj *value = dupStringObject(c->argv[2]);
+    setKey(c, c->db, c->argv[1], &(value), SETKEY_ADD_OR_UPDATE);
+
+    // Finally, replication will be sent because this is creating a new key
+    bgIteration_handleCommandReplication(getDbFromItemNum(8), c->cmd, c->argc, c->argv);
+
+    // Test that everything comes as expected
+    expectReadKeySequence(it, 1, 2);    // All one bucket - queued after key 0 read
+
+    expectReadReplication(it, c);       // Repl from the first SET command
+    expectReadReplicationDel(it, 8);    // This is the expected replication of the DEL from expire
+    expectReadReplication(it, c);       // Repl from the second SET command (recreating deleted key)
+
+    expectReadKeySequence(it, 3, 7);    // continue with normal iteration
+    // KEY 8 SHOULD BE OMITTED - This was already replicated
+    expectReadKeySequence(it, 9, LAST_ITEM);
+
+    expectReadComplete(it);
+    // Release the test client, as every other test that calls getWriteClient() does; it was
+    //  previously leaked here.  Done last because the replication checks above compare against c.
+    freeTestClient(c);
+}
+
+
+#ifdef CODE_NOT_READY_YET
+/////////////////////////////////////////////////////
+// THE REMAINING TESTS ARE GENERAL / UNCATEGORIZED
+/////////////////////////////////////////////////////
+
+// Iteration can be terminated from the main thread or from the child client.
+//  This tests termination driven from the main thread.
+TEST_F(BgIterationTest, earlyTerminationFromMain) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+
+    // At this point, keys 1 & 2 are in queue.  A termination should release those keys.
+    bool blocked1 = true;           // cleared by the mock action when key 1 is unblocked
+    bool blocked2 = true;           // cleared by the mock action when key 2 is unblocked
+    // We expect no general unblocks, we account for each specific unblock below.
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0);
+    // We should expect to see unblock called for items 1 & 2, as they are released from the queue.
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1))))
+            .WillOnce(Assign(&blocked1, false));
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2))))
+            .WillOnce(Assign(&blocked2, false));
+    bgIteratorTerminate(it);        // queues the items for release
+    EXPECT_TRUE(bgIteratorIsTerminating(it));
+    bgIteration_feedIterators();    // actually performs the release
+    EXPECT_FALSE(blocked1);
+    EXPECT_FALSE(blocked2);
+
+    bool blocked0 = true;           // key 0 is expected to be unblocked by the read below
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0))))
+            .WillOnce(Assign(&blocked0, false));
+    bgIteratorItem *item = bgIteratorRead(it);
+    EXPECT_FALSE(blocked0);
+    EXPECT_EQ(item->type, BGITERATOR_ITEM_TERMINATED);
+
+    bgIteratorClose(it);            // background thread completes the termination
+
+    EXPECT_EQ(cleanupCount, 0);     // cleanup callback must not have run yet
+    bgIteration_feedIterators();    // main thread, cleans up iterator and calls cleanup function
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+
+// Iteration can be terminated from the main thread or from the child client.
+//  This tests termination driven from the child client (the background thread).
+TEST_F(BgIterationTest, earlyTerminationFromChild) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0);
+
+    // At this point, keys 1 & 2 are in queue.  A termination should release those keys.
+    bgIteratorClose(it);            // background thread initiates the termination
+    EXPECT_TRUE(bgIteratorIsTerminating(it));
+
+    bool blocked0 = true;           // each flag is cleared by the matching mock action below
+    bool blocked1 = true;
+    bool blocked2 = true;
+    // Expecting no extra unblocks
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0);
+    // We expect item 0 (the in progress item) to be released
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0))))
+            .WillOnce(Assign(&blocked0, false));
+    // We expect items 1 & 2 (the queued items) to be released
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1))))
+            .WillOnce(Assign(&blocked1, false));
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2))))
+            .WillOnce(Assign(&blocked2, false));
+    bgIteration_feedIterators();    // main thread: expected to release the keys and run cleanup
+    EXPECT_FALSE(blocked0);
+    EXPECT_FALSE(blocked1);
+    EXPECT_FALSE(blocked2);
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+
+// Edge case.  Executing a command (like SUNIONSTORE) which REPLACES the first key and reads the
+//  second key.  In this case, bgIteration will get notified of the key deletion during execution of
+//  SUNIONSTORE.  Given that both keys are in the future (not iterated yet), we'll allow the
+//  command to execute, unblocked.  We won't replicate as we'll pick up the key when we get to it.
+TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_keyDeletedDuringSetReplace) {
+    // Using DB1 so we have lots of buckets
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKeySequence(it, 0, 8); // 9 is in queue
+
+    // Write command that has 2 keys. 1 existing key that we write to and 1 dependant future key.
+    client *c = getWrite2KeysClient("sunionstore", 12, 13);
+
+    simulateUnblockedWrite(c);
+
+    // Now the call to keyDelete happens
+    bgIteration_keyDelete(getDbFromItemNum(12), keyStr(12));
+    simpleDelItem(12);     // So simulate the actual del
+
+    // Now the write will run, re-creating the item (which is still a future item)
+    const char * const newValueStr = "new value";
+    robj *newValueRobj = createStringObjectFromSds(sdsnew(newValueStr));
+    setKey(c, c->db, c->argv[1], &newValueRobj, SETKEY_ADD_OR_UPDATE);
+
+    // Finally, we are letting bgIteration know that the write command was executed
+    bgIteration_handleCommandReplication(getDbFromItemNum(12), c->cmd, c->argc, c->argv);
+
+    // Since the write command was not replicated, we expect all the keys to be read in the normal
+    //  order from the dictionary.
+    expectReadKeySequence(it, 9, 11);
+    expectReadKey(it, 12, newValueStr);     // key 12 is read with the value written above
+    expectReadKeySequence(it, 13, LAST_ITEM);
+
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// Edge case.  When we have a new key which is created by a command, AND replication is enabled, we
+//  expect that we will replicate the command rather than serializing the key/value later.  As an
+//  example, consider SUNIONSTORE A B.  We want to create A by replicating the command.  We don't
+//  want to have to process A as a key later on.  But in this case, we can't run the command until
+//  B has been sent.  We expect the command to be blocked while we send B.
+TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_setNewKey_DependantFuture) {
+    // Using DB1 so we have lots of buckets
+    simpleDelItem(12);   // Deleting key 12 to then create it with a write command
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKeySequence(it, 0, 8); // 9 is in queue
+
+    // Write command that has 2 keys. 1 new key (12) and 1 dependant future key (13).
+    client *c = getWrite2KeysClient("sunionstore", 12, 13);
+
+    // We are simulating a new key in the dict. This command should block on the dependant key.
+    // This adds key 13 in the queue since the command depends on it.
+    simulateBlockedWrite(c);
+
+    // Key 9 was already in the queue
+    expectReadKey(it, 9);
+
+    // Key 13 is processed out of order since the write depends on it
+    expectReadKey(it, 13);
+
+    // Reading key 10 will unblock key 13, allowing us to write.
+    expectReadKey(it, 10);
+
+    // Now that key 13 was processed and released by the iterator, the write command can be executed.
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 11 was queued when we read key 10
+    expectReadKey(it, 11);
+
+    // The replication of the write command was enqueued after key 11
+    expectReadReplication(it, c);
+
+    // We shouldn't see key 12 - as that was processed via replication.
+    // We shouldn't see key 13 - as that was expedited earlier
+
+    // Now resuming processing of dict entries
+    expectReadKeySequence(it, 14, LAST_ITEM);
+
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// A new key is being created, but is dependent on another key which has already been processed.
+//  In this case, the command shouldn't be blocked.
+TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_setNewKey_DependantPast) {
+    // Using DB1 so we have lots of buckets
+    simpleDelItem(12);   // Deleting key 12 to then create it with a write command
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKeySequence(it, 0, 9); // 10 is in queue, done with 8
+
+    // Write command that has 2 keys. 1 new key (12) and 1 dependant past key (8).
+    client *c = getWrite2KeysClient("sunionstore", 12, 8);
+
+    // We are simulating a new key in the dict.
+    // This command should not block since the dependant key has already been processed.
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 10 was put in the queue before the write
+    expectReadKey(it, 10);
+
+    expectReadReplication(it, c);   // the write's replication follows the already-queued key 10
+
+    expectReadKey(it, 11);
+
+    // Key 12 should be missing - it was processed by replication
+
+    expectReadKeySequence(it, 13, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+// A new key is being created, and has dependencies on 2 other keys - one already processed, one not.
+//  In this case, the command should be blocked so that the future key can be sent first.
+TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_setNewKey_1DependantPast1DependantFuture) {
+    // Using DB1 so we have lots of buckets
+    simpleDelItem(12);   // Deleting key 12 to then create it with a write command
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKeySequence(it, 0, 9); // 8 has been returned, 9 is active, 10 is in queue
+
+    // Write command that has 1 new key (12) and 2 dependencies (8 = past, 13 = future)
+    client *c = getWrite3KeysClient("sunionstore", 12, 8, 13);
+
+    // The write should be blocked, so that item 13 can be processed.
+    simulateBlockedWrite(c);
+
+    expectReadKey(it, 10); // 10 was already in queue
+    expectReadKey(it, 13); // 13 was expedited since the write depends on it
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(13)))).Times(1);
+    expectReadKey(it, 11); // Releases 13 so the command can execute
+
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKey(it, 14); // was queued when reading 11 (12 is missing, 13 was expedited)
+
+    expectReadReplication(it, c);   // replication of the write arrives after the queued key 14
+
+    expectReadKey(it, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+// Test an edge case with the same (future) key being repeated in the command, like:
+//  SUNIONSTORE A B B
+// In this test, A is a previously handled key, and B is a future key.  We expect the future key B to
+//  be expedited (once).
+TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_repeatedKey_1DependantPast1RepeatedFuture) {
+    // Using DB1 so we have lots of buckets
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKeySequence(it, 0, 9); // We're done with 8, and 10 is in queue
+
+    // Write command that has 3 keys. 1 past key (8) and 1 repeated key in the future (12).
+    client *c = getWrite3KeysClient("sunionstore", 8, 12, 12);
+
+    // This command should block because 12 needs to be expedited.
+    simulateBlockedWrite(c);
+
+    expectReadKey(it, 10); // was already in queue
+    expectReadKey(it, 12); // expedited (read only once, although it appears twice in the command)
+    expectReadKey(it, 11); // releases 12 (unblocking the command)
+
+    // Now that key 12 was processed and released by the iterator, the write command can be executed.
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKey(it, 13); // queued when we read 11
+
+    expectReadReplication(it, c);
+
+    // Now resuming processing of dict entries.
+    expectReadKeySequence(it, 14, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+#endif
+#ifdef CODE_NOT_READY_YET
+
+
+TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_repeatedKey_1newKey1RepeatedFuture) {
+    // This tests the replication of a write command that creates a new key and depends on 1 other
+    //  key which is repeated in the command. The repeated key is in the future.
+    // This test is meant to replicate this bug: https://issues.amazon.com/ELMO-46572
+
+    // Expected sequence of events for this test:
+    //  ITEM: (0)'D0' : 'D0'
+    //  BLCK?: (0)'sunionstore' 'E0' 'C0' 'C0'
+    //  EARLY: (0)'C0' : 'C0'
+    //   (blocked)
+    //  ITEM: (0)'B0' : 'B0'
+    //  ITEM: (0)'A0' : 'A0'
+    //  BLCK?: (0)'sunionstore' 'E0' 'C0' 'C0'
+    //  REPL?: (0)'sunionstore' 'E0' 'C0' 'C0'
+    //   (queued)
+    //  SKIPPING ITEM(early iterate): (0)'C0' : 'C0'
+    //  ITEM: (1)'E1' : 'E1'
+    //  ITEM: (1)'C1' : 'C1'
+    //  ITEM: (1)'B1' : 'B1'
+    //  ITEM: (1)'A1' : 'A1'
+    //  ITEM: (1)'D1' : 'D1'
+    //  SENDING COMPLETE
+    //  CLEANUP FN (success)
+
+    simpleDelItem(1);   // Deleting key 1 to then create it with a write command
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Start with this to load 0 into the queue - but don't read 0 as that would load 1,2,3 into the queue!
+    bgIteration_feedIterators();
+
+    // Write command that has 3 keys. 1 new key and 1 repeated key in the future.
+    client *c = getWrite3KeysClient(1, 4, 4);  // NOTE(review): siblings pass the command name first ("sunionstore", ...) - confirm
+
+    // This command should block on key 4.
+    // This adds key 4 in the queue because:
+    // - the command depends on key 4 which hasn't been processed yet
+    // - the command depends on a new key (key 1).
+    simulateBlockedWrite(c);
+
+    // Key 0 was already enqueued.
+    expectReadKey(it, 0);
+
+    // Key 4 is processed out of order since the write depends on it
+    expectReadKey(it, 4);
+
+    // Keys 2,3 are next in the queue (they are all in the same bucket).
+    // Only reading key 2 for now to release key 4 from the iterator.
+    expectReadKey(it, 2);
+
+    // Now that key 4 was processed and released by the iterator, the write command can be executed.
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 3 is next in the queue (it was put in the queue at the same time as key 2).
+    expectReadKey(it, 3);
+
+    // The replication of the write command was enqueued after keys 2,3 (key 1 was deleted above).
+    expectReadReplication(it, c);
+
+    // Now resuming processing of dict entries.
+    expectReadKeySequence(it, 5, 9);
+
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+TEST_F(BgIterationTest, writeWith3Keys_NoReplication_Consistent_repeatedKey_1DependantPast1RepeatedFuture) {
+    // This tests a write command (on a CONSISTENT iterator, no replication flag) that updates
+    //  multiple keys and depends on a key which is repeated in the command. The repeated key is
+    //  in the future; the other key is still held by the iterator.
+
+    // Expected sequence of events for this test:
+    //  ITEM: (0)'D0' : 'D0'
+    //  BLCK?: (0)'blpop' 'D0' 'C0' 'C0' 'D0'
+    //  EARLY_1: (0)'C0' : 'C0'
+    //   (blocked)
+    //  ITEM: (0)'E0' : 'E0'
+    //  ITEM: (0)'B0' : 'B0'
+    //  ITEM: (0)'A0' : 'A0'
+    //  BLCK?: (0)'blpop' 'D0' 'C0' 'C0' 'D0'
+    //  REPL?: (0)'blpop' 'D0' 'C0' 'C0' 'D0'
+    //  SKIPPING ITEM(early iterate): (0)'C0' : 'C0'
+    //  ITEM: (1)'E1' : 'E1'
+    //  ITEM: (1)'C1' : 'C1'
+    //  ITEM: (1)'B1' : 'B1'
+    //  ITEM: (1)'A1' : 'A1'
+    //  ITEM: (1)'D1' : 'D1'
+    //  SENDING COMPLETE
+    //  CLEANUP FN (success)
+
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Start with this to load 0 into the queue - but don't read 0 as that would load 1,2,3 into the queue!
+    bgIteration_feedIterators();
+
+    // Write command that has 3 keys. 1 past key and 1 repeated key in the future.
+    // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a
+    //  multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY).
+    client *c = getWriteMultiKeysClient(0, {4, 4, 0}, "blpop");
+
+    // This command should block on 2 keys (0 and 4), since:
+    //  - key 0 is in use by the iterator (still in the queue since it has not been processed by the consumer yet)
+    //  - key 4 is in the future
+    // This adds key 4 in the queue since the command depends on it and it hasn't been processed yet.
+    simulateBlockedWrite(c, 2);
+
+    // Key 4 is processed out of order since the write depends on it.
+    // Key 4 is processed before key 0 even though key 0 was already in the queue
+    //  because key 4 was enqueued as a priority item.
+    expectReadKey(it, 4);
+
+    // Key 0 was already enqueued.
+    // Reading key 0 releases key 4 from the iterator.
+    expectReadKey(it, 0);
+
+    // Keys 1,2,3 are next in the queue (they are all in the same bucket).
+    // Only reading key 1 for now to release key 0 from the iterator.
+    expectReadKey(it, 1);
+
+    // Now that keys 4 and 0 were processed and released by the iterator, the write command can be executed.
+    simulateUnblockedWriteWithModification(c);
+
+    // Keys 2, 3 are next in the queue (they were put in the queue at the same time as key 1).
+    expectReadKeySequence(it, 2, 3);
+
+    // Now resuming processing of dict entries.
+    expectReadKeySequence(it, 5, 9);
+
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+TEST_F(BgIterationTest, writeWith3Keys_NoReplication_NoConsistent_repeatedKey_1repeatedNewKey) {
+    // This tests a write command that creates a new key where the new key is repeated in the
+    //  command. The repeated key is in the future.
+
+    // Expected sequence of events for this test:
+    // ITEM: (0)'D0' : 'D0'
+    // ITEM: (0)'A0' : 'A0'
+    // ITEM: (0)'B0' : 'B0'
+    // ITEM: (0)'E0' : 'E0'
+    // BLCK?: (0)'blpop' 'C0' 'D0' 'C0' 'D0'
+    // REPL?: (0)'blpop' 'C0' 'D0' 'C0' 'D0'
+    // ITEM: (0)'C0' : 'D0'
+    // ITEM: (1)'B1' : 'B1'
+    // ITEM: (1)'C1' : 'C1'
+    // ITEM: (1)'D1' : 'D1'
+    // ITEM: (1)'A1' : 'A1'
+    // ITEM: (1)'E1' : 'E1'
+    // SENDING COMPLETE
+    //  CLEANUP FN (success)
+
+    server.db[0]->keys->dtype->resizeAllowed = NULL;    // presumably lifts any resize veto - confirm
+    kvstoreExpand(server.db[0]->keys, 32, 0, NULL);
+    hashtableRehash(server.db[0]->keys->hashtables[0], 32);
+
+    // The table looks this way now:
+    // Table 0, used 5, exp 3, top-level buckets 8, child buckets 0
+    // Bucket 0:0 level:0
+    //   0 (empty)
+    //   1 (empty)
+    //   2 (empty)
+    //   3 (empty)
+    //   4 (empty)
+    //   5 (empty)
+    //   6 (empty)
+    // Bucket 0:1 level:0
+    //   0 h2 63, key "D0"
+    //   1 (empty)
+    //   2 (empty)
+    //   3 (empty)
+    //   4 (empty)
+    //   5 (empty)
+    //   6 (empty)
+    // Bucket 0:2 level:0
+    //   0 (empty)
+    //   1 (empty)
+    //   2 (empty)
+    //   3 (empty)
+    //   4 (empty)
+    //   5 (empty)
+    //   6 (empty)
+    // Bucket 0:3 level:0
+    //   0 h2 b8, key "A0"
+    //   1 h2 f5, key "B0"
+    //   2 h2 13, key "E0"
+    //   3 (empty)
+    //   4 (empty)
+    //   5 (empty)
+    //   6 (empty)
+    // Bucket 0:4 level:0
+    //   0 (empty)
+    //   1 (empty)
+    //   2 (empty)
+    //   3 (empty)
+    //   4 (empty)
+    //   5 (empty)
+    //   6 (empty)
+    // Bucket 0:5 level:0
+    //   0 (empty)
+    //   1 (empty)
+    //   2 (empty)
+    //   3 (empty)
+    //   4 (empty)
+    //   5 (empty)
+    //   6 (empty)
+    // Bucket 0:6 level:0
+    //   0 h2 91, key "C0"
+    //   1 (empty)
+    //   2 (empty)
+    //   3 (empty)
+    //   4 (empty)
+    //   5 (empty)
+    //   6 (empty)
+    // Bucket 0:7 level:0
+    //   0 (empty)
+    //   1 (empty)
+    //   2 (empty)
+    //   3 (empty)
+    //   4 (empty)
+    //   5 (empty)
+    //   6 (empty)
+
+    const char *new_keys[5] = {"D0", "A0", "B0", "E0", "C0"};   // item numbers 0..4, in the bucket order shown above
+    update_keys(new_keys, 0, 5);
+
+    simpleDelItem(4);   // Deleting key 4 to then create it with a write command
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Getting started
+    // The first bucket is empty
+    bgIteration_feedIterators();
+    expectReadKey(it, 0);
+
+    // Key 1 is the next in the queue.
+    // Reading key 1 to release key 0 from the iterator.
+    expectReadKey(it, 1);
+
+    // Write command that has 3 keys. 1 new repeated key and 1 key in the past.
+    // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a
+    //  multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY).
+    client *c = getWriteMultiKeysClient(4, {0, 4, 0}, "blpop");
+
+    // The write command is not blocked since key 0 is not in use by the iterator
+    simulateUnblockedWriteWithModification(c);
+
+    // Keys 2, 3 are next in the queue (they were put in the queue at the same time as key 1).
+    expectReadKeySequence(it, 2, 3);
+
+    // Key 4 is now in the dict with the value of key 0.
+    expectReadKey(it, 4, keyStr(0));
+
+    // Processing the rest of the dict entries.
+    expectReadKeySequence(it, 5, 9);
+
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+TEST_F(BgIterationTest, copyHandlesProperDb_Replication_NoConsistent) {
+    // In this test, the COPY command is copying from one DB to another.  We will create the
+    //  same key in both DBs.  We make sure that the proper key is created via replication, and
+    //  the proper key is created by iteration.
+
+    // NOTE:  Adding E0 to dict 1.  Now there is a E0 in both dict 0 and dict 1.
+    addKeyToDb(1, "E0", "E0");
+
+    // The test:
+    //  We will simulate (with DB0 selected): COPY C0 E0 DB 1 REPLACE
+    //  This will overwrite DB1:E0 that was created above.
+    //  Since DB0:C0 is the first iterated key we expect that DB1:E0 will be expedited.
+    //  After DB1:E0 is "overwritten", it should be marked early iterate.
+    //  We expect DB0:E0 to NOT be marked early iterate, and should get processed normally.
+
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Start with this to load 0 (C0) into the queue - but don't read 0 as that would load 1,2,3 into the queue!
+    bgIteration_feedIterators();
+
+    // COPY C0 E0 DB 1 REPLACE
+    client *c = static_cast<client*>(zcalloc(sizeof(client)));
+    c->cmd = lookupCommandByCString("copy");
+    c->db = server.db[0];
+    c->argc = 6;
+    c->argv = static_cast<robj**>(zcalloc(sizeof(robj*) * c->argc));
+    c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname));
+    c->argv[1] = createStringObjectFromSds(sdsnew("C0"));
+    c->argv[2] = createStringObjectFromSds(sdsnew("E0"));
+    c->argv[3] = createStringObjectFromSds(sdsnew("DB"));
+    c->argv[4] = createStringObjectFromSds(sdsnew("1"));
+    c->argv[5] = createStringObjectFromSds(sdsnew("REPLACE"));
+
+    // This should block on 2 keys.  DB0:C0 is in queue.  DB1:E0 needs to be expedited.
+    simulateBlockedWrite(c, 2);
+    expectReadKey(it, 0);              // DB0:C0
+    expectReadDbKeyValue(it, 1, "E0", "E0");    // DB1:E0 is expedited
+    expectReadKey(it, 1);              // (to release DB1:E0)
+    // Now keys 2 & 3 & 4 are in the queue
+
+    simulateUnblockedWrite(c);  // We shouldn't be blocked this time
+
+    // Now, we'll simulate the actual activity of the COPY.  DB1:E0 will be deleted in order to
+    //  be overwritten.  The key name is borrowed from argv[2] ("E0") so no temporary sds is leaked.
+    bgIteration_keyDelete(1, static_cast<sds>(objectGetVal(c->argv[2])));
+    // At this point the key would actually be deleted and recreated by COPY (no need to actually do this)
+
+    // And finally the replication (this should queue replication)
+    bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv);
+
+    // Now let's read everything...
+    expectReadKeySequence(it, 2, 4);    // These were in queue already
+    expectReadReplication(it, c);       // This is the new replication (creating DB1:E0)
+
+    expectReadKeySequence(it, 5, 9);    // These are all normal
+
+    expectReadComplete(it);     // At this point, we should be done.  We should NOT see DB1:E0.
+    freeTestClient(c);
+}
+
+
+// Just check that termination with replication in queue works OK.
+TEST_F(BgIterationTest, terminateWithReplication) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0);
+    expectReadKey(it, 1);  // makes sure we are done with key 0 (don't want to block)
+
+    client *c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);     // Should replicate
+    freeTestClient(c);
+
+    bgIteratorTerminate(it);        // main thread requests termination while items are still queued
+
+    bgIteratorItem *item = bgIteratorRead(it);
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED);
+
+    bgIteratorClose(it);            // background thread completes the termination
+
+    bgIteration_feedIterators();    // main thread, cleans up iterator and calls cleanup function
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+
+// SWAPDB tests - Get ready for the mind-bend...
+
+TEST_F(BgIterationTest, swapDB_NoReplication_NoConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+    bgIteratorStatus status;    // refreshed via bgIteratorGetStatus() before each counter check
+
+    // In the non-consistent iterator (without replication), items are identified with the DBID at
+    //  the time they are placed into the queue.  The SWAPDB event signals the change to the
+    //  iterating process - and this is properly sequenced with the DB info for each item.
+
+    expectReadKey(it, 0);
+
+    // Keys 1,2,3, and 4 are in queue
+    simulateSwapDB(0, 1);       // The swap event will be queued after item 4
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 1u);
+    EXPECT_EQ(status.swapdb_processed, 0u);
+
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKey(it, 3);
+    expectReadKey(it, 4);
+
+    expectReadSwapDB(it, 0, 1);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 1u);
+    EXPECT_EQ(status.swapdb_processed, 0u);     // still processing it...
+
+    // Since we've seen the swap event, items now have the new DBID
+    expectReadDbKeyValue(it, 0, keyStr(5), keyStr(5));  // item 5 is in DB0
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 1u);
+    EXPECT_EQ(status.swapdb_processed, 1u);     // done processing the swapdb
+
+    // Keys 6 & 7 are in the queue - let's swap back!
+    simulateSwapDB(1, 0);       // The swap event will be queued after item 7
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 2u);        // 2nd one queued
+    EXPECT_EQ(status.swapdb_processed, 1u);
+
+    expectReadDbKeyValue(it, 0, keyStr(6), keyStr(6));  // Still appears as DB0
+    expectReadDbKeyValue(it, 0, keyStr(7), keyStr(7));  // Still appears as DB0
+
+    expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 2u);
+    EXPECT_EQ(status.swapdb_processed, 1u);     // still processing it...
+
+    expectReadKey(it, 8);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 2u);
+    EXPECT_EQ(status.swapdb_processed, 2u);     // done processing all swaps
+
+    expectReadKey(it, 9);
+    expectReadComplete(it);
+}
+
+TEST_F(BgIterationTest, swapDB_NoReplication_YesConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // In the consistent iterator (without replication) all items are presented to the iterating
+    //  process using the DBID at the time of the iterator creation.  No changes are evident.
+
+    expectReadKey(it, 0);
+
+    // Keys 1,2,3,4 are in queue
+    simulateSwapDB(0, 1);       // The swap occurs, but the iterator sees no change
+
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKey(it, 3);
+    expectReadKey(it, 4);
+
+    // Heck, let's go crazy with those swaps... (one swap before every remaining read)
+    for (int itemNum = 5;  itemNum <= 9;  itemNum++) {
+        simulateSwapDB(0, 1);
+        expectReadKey(it, itemNum);
+    }
+
+    expectReadComplete(it);
+}
+
+TEST_F(BgIterationTest, swapDB_YesReplication_NoConsistent) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // In the non-consistent iterator WITH replication, items are identified with the DBID at the
+    //  time they are placed into the queue.  The SWAPDB event signals the change to the iterating
+    //  process - and this is properly sequenced with the DB info for each item.
+
+    expectReadKey(it, 0);
+
+    // Keys 1,2,3,4 are in queue
+    simulateSwapDB(0, 1);       // The swap event will be queued after item 4
+
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKey(it, 3);
+    expectReadKey(it, 4);
+
+    expectReadSwapDB(it, 0, 1);                         // We should see a SWAPDB event
+    bgIteratorItem *item = bgIteratorRead(it);          // followed by the associated replication
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION);
+    bgIteration_feedIterators();
+
+    // Since we've seen the swap event, items now have the new DBID
+    expectReadDbKeyValue(it, 0, keyStr(5), keyStr(5));  // item 5 is in DB0
+
+    // Keys 6 & 7 are in the queue - let's swap back!
+    simulateSwapDB(1, 0);       // The swap event will be queued after item 7
+
+    expectReadDbKeyValue(it, 0, keyStr(6), keyStr(6));  // Still appears as DB0
+    expectReadDbKeyValue(it, 0, keyStr(7), keyStr(7));  // Still appears as DB0
+
+    expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap
+    item = bgIteratorRead(it);                          // again followed by the associated replication
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION);
+    bgIteration_feedIterators();
+
+    expectReadKey(it, 8);
+    expectReadKey(it, 9);
+    expectReadComplete(it);
+}
+
+// There is no test for swapDB_YesReplication_YesConsistent because this configuration is not
+//  permitted with multiple DBs (not permitted with swaps).
+
+
+// FLUSHDB & FLUSHALL Tests
+TEST_F(BgIterationTest, flushDB_flushAll) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0);
+    expectReadKey(it, 1);
+
+    // key 1 is active in the iterator - this key will be removed from the DB before flush.
+    // keys 2 & 3 & 4 are in queue - but will be returned to Valkey before the flush.  These are yanked
+    //  back by Valkey and will not be seen by iterator.
+    simulateFlushDB(-1, 1);         // presumably dbid -1 selects all DBs (FLUSHALL) - confirm
+
+    bgIteratorItem *item = bgIteratorRead(it);      // the next item read is the termination marker
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED);
+
+    bgIteratorClose(it);            // background thread completes the termination
+
+    bgIteration_feedIterators();    // main thread, cleans up iterator and calls cleanup function
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+TEST_F(BgIterationTest, flushDB_flushOne) {
+    bgIterator *it1 = bgIteratorCreateFullScanIter("iter1",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+    bgIterator *it2 = bgIteratorCreateFullScanIter("iter2",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+    bgIteratorStatus status;    // refreshed via bgIteratorGetStatus() before each counter check
+
+    // The test flushes DB0.  This is half the data.  Since <= half, a non-consistent iterator is
+    //  allowed to proceed.  But the consistent iterator will be terminated.
+
+    expectReadKey(it1, 0);
+    expectReadKey(it2, 0);
+    expectReadKey(it1, 1);
+    expectReadKey(it2, 1);
+
+    // key 1 is active in the iterators - this key will be removed from the DB before flush.
+    // keys 2 & 3 & 4 are in queue.  The non-consistent iterator (it1) still reads them below;
+    //  the consistent iterator (it2) reads a TERMINATED item next and never sees them.
+    simulateFlushDB(0, 1);
+    bgIteratorGetStatus(it1, &status);
+    EXPECT_EQ(status.flushdb_queued, 1u);
+    EXPECT_EQ(status.flushdb_processed, 0u);
+
+    // Testing the non-consistent one continues...
+    // Everything already on the iterator queue should be preserved (deleted from the DB).
+    //  Keys 2 & 3 & 4 are already queued (and preserved).
+    expectReadKey(it1, 2);
+    expectReadKey(it1, 3);
+    expectReadKey(it1, 4);
+
+    bgIteratorItem *item = bgIteratorRead(it1);     // the FLUSHDB event follows the preserved keys
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_FLUSHDB);
+    ASSERT_EQ(item->dbid, 0);
+    bgIteratorGetStatus(it1, &status);
+    EXPECT_EQ(status.flushdb_queued, 1u);
+    EXPECT_EQ(status.flushdb_processed, 0u);    // still processing it
+
+    expectReadKey(it1, 5);
+    bgIteratorGetStatus(it1, &status);
+    EXPECT_EQ(status.flushdb_queued, 1u);
+    EXPECT_EQ(status.flushdb_processed, 1u);    // done with all flushdb's
+    expectReadKey(it1, 6);
+    expectReadKey(it1, 7);
+    expectReadKey(it1, 8);
+    expectReadKey(it1, 9);
+    expectReadComplete(it1);
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_FALSE(cleanupTerminated);
+
+    // But the consistent iterator should be terminated
+    item = bgIteratorRead(it2);
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED);
+    bgIteratorClose(it2);           // background thread completes the termination
+    bgIteration_feedIterators();    // main thread, cleans up iterator and calls cleanup function
+    EXPECT_EQ(cleanupCount, 2);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+// Cluster mode, 2 iterators, CONSISTENT+REPLICATION and NONCONSISTENT+REPLICATION
+//  Modify a missing key (key 4 is deleted first, then written while both iterators are open).
+TEST_F(BgIterationTestCluster, modMissingKey_2iter_cluster) {
+    // Cluster test.  REPLICATION + CONSISTENT only supported in cluster mode
+    // For this test, we only have 5 keys since not using DB[1].  Remove the last one.
+    simpleDelItem(4);
+
+    bgIterator *it1 = bgIteratorCreateFullScanIter("iter1",
+            BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT,
+            NULL, iteratorCleanupFn, PRIVDATA);
+    bgIterator *it2 = bgIteratorCreateFullScanIter("iter2",
+            BGITERATOR_FLAG_REPLICATION,
+            NULL, iteratorCleanupFn, PRIVDATA);
+
+    client *c = getWriteClient(4, "xxx");
+    simulateUnblockedWriteWithModification(c);     // Wouldn't be blocked since key doesn't exist
+
+    bgIteration_feedIterators();    // Prime the feed - key 0 and 1 are now enqueued
+
+    // Process the consistent iteration
+    expectReadReplication(it1, c);  // replication happened before feeding (should be 1st)
+    expectReadKeySequence(it1, 0, 3);   // only keys 0..3 remain after the delete above
+    expectReadComplete(it1);
+
+    // Process the non-consistent iteration
+    expectReadReplication(it2, c);  // replication happened before feeding (should be 1st)
+    expectReadKeySequence(it2, 0, 3);   // only keys 0..3 remain after the delete above
+    expectReadComplete(it2);
+
+    freeTestClient(c);
+}
+
+TEST_F(BgIterationTest, twoKeys_firstFuture) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION,
+            NULL, iteratorCleanupFn, PRIVDATA);
+    // Scenario: a MULTI mixes a key described below as "in the past" with future keys to expedite.
+    bgIteration_feedIterators();    // Prime the feed - key 0
+    expectReadKey(it, 0);  // Causes keys 1, 2, 3, 4 to be queued (same bucket)
+    expectReadKey(it, 1);  // Causes key 0 to be released
+
+    // This must replicate, because A0 is in the past.  B1 (future) wouldn't need replication except
+    //  for the modification to B1.  We try to trip up bgIterator by giving a key that doesn't need
+    //  replication except for the later command that does.  Make this a little trickier by adding
+    //  the set for A1 - unnecessary, but more clearly shows the expediting in progress.
+    client *c = getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx", 1);
+
+    // The EXEC should block on 2 keys, because B1(5) & A1(8) should be expedited
+    simulateBlockedWrite(c, 2);
+
+    expectReadKeySequence(it, 2, 4);    // These were already in queue
+
+    // Note - it would be OK if these 2 were reversed, but this is how the current algorithm works.
+    expectReadKey(it, 8);  // Key 8 (A1) was expedited
+    expectReadKey(it, 5);  // Key 5 (B1) was expedited
+
+    // and clean up the rest...
+    expectReadKeySequence(it, 6, 7);    // Key 8 was already read above (expedited)
+    expectReadKey(it, 9);
+    expectReadComplete(it);
+    freeTestClient(c);     // release the test client, as the other tests in this file do
+}
+
+TEST_F(BgIterationTest, multiBlocksOnFutureKey) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1,2,3,4 are queued (they are all in the same bucket).
+    // If we fake a modification to key 5, we won't know if it's handled out of order.
+    // So we fake a modification to key 6
+    // Dummy up a MULTI...
+    client *c = getMultiClient("SET C1 xxx", 1);
+
+    // Since this is consistent, we will block the client, disallowing the write.
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+
+    // C1 (key 6) will be expedited to the front of the list
+    expectReadKey(it, 6);
+
+    // Now that we've read key 6, key 0 (C0) is passed and should not block
+    client *c2 = getMultiClient("SET C0 xxx");
+    simulateUnblockedWrite(c2);
+    freeTestClient(c2);
+
+
+    expectReadKeySequence(it, 1, 5);    // key 6 was already read above (expedited)
+    expectReadKeySequence(it, 7, 9);
+    expectReadComplete(it);
+}
+
+TEST_F(BgIterationTest, multiNotReplicatedButDelRecreateAccess) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Scenario.  We have a multi that doesn't need to be replicated because all of the keys exist
+    //  but are all future keys.  Note that missing keys are considered already-iterated, so all
+    //  must exist for this test.  Then:
+    //   - we delete a key
+    //   - we re-create the deleted (future) key - normally this would be replicated
+    //   - we access another (future) key - we don't expect to get blocked!
+
+    // We use DB 1 only because the hash table buckets are better broken up there.
+    client *c = getMultiClient("DEL A1; SET A1 xxx; SET E1 yyy", 1);
+
+    // For DB[1]:
+    // Bucket 0:0 level:0
+    //   0 h2 18, key "B1"
+    //   1 h2 fd, key "C1"
+    //   2 h2 e9, key "D1"
+    //   3 (empty)
+    //   4 (empty)
+    //   5 (empty)
+    //   6 (empty)
+    // Bucket 0:1 level:0
+    //   0 h2 36, key "A1"
+    //   1 h2 0c, key "E1"
+    //   2 (empty)
+    //   3 (empty)
+    //   4 (empty)
+    //   5 (empty)
+    //   6 (empty)
+
+    // Read through DB 0 and into DB 1
+    expectReadKeySequence(it, 0, 5);    // D0, E0, B0, A0, C0, B1
+    // Now, C1 and D1 are in the queue (in use) and A1 & E1 are future
+
+    // Now let's process the multi.  Since A1 & E1 are both future (existing) items, we shouldn't
+    //  block or replicate.
+    simulateUnblockedWrite(c);  // the EXEC
+
+    // Simulate the DEL A1
+    server.in_exec = 1;    // Simulate actual execution of the MULTI/EXEC
+    advanceMultiClientToCommand(c, 0);  // DEL A1
+    EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0);
+    bool blocked = bgIteration_blockClientIfRequired(c);
+    EXPECT_FALSE(blocked);
+    simpleDelItem(8);
+    sds delKey = sdsnew(keyStr(8));
+    bgIteration_keyDelete(1, delKey);
+    sdsfree(delKey);
+    bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); // shouldn't replicate
+
+    // Simulate SET A1 - the key doesn't exist, and would normally replicate and mark early iterate,
+    //  but this is in a transaction, and we are not replicating this transaction.
+    advanceMultiClientToCommand(c, 1);  // SET A1 xxx
+    simulateUnblockedWriteWithModification(c);
+
+    // Now write to another existing future key - this should work if we weren't confused by the DEL
+    advanceMultiClientToCommand(c, 2);  // SET E1 yyy
+    simulateUnblockedWriteWithModification(c);
+    server.in_exec = 0;
+    // Now we can continue iterating, and we should pick up keys 6-9.  (and no replication!)
+    expectReadKeySequence(it, 6, 7);
+    expectReadKey(it, 8, "xxx");
+    expectReadKey(it, 9, "yyy");
+    expectReadComplete(it);
+    freeTestClient(c);     // release the test client, as the other tests in this file do
+}
+
+TEST_F(BgIterationTest, multiHandlesSelectProperly) {
+    // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1.  We will process it
+    //  in DB0, but it will be unprocessed in DB1.  See if we track select properly.
+    addKeyToDb(1, "C0", "C0");
+
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - C0 in DB 0.
+    expectReadKey(it, 0);
+
+    // Now, we are done with C0 in DB0, but not in DB1
+    expectReadKey(it, 1);
+
+    // These cases should NOT block...  (they access C0 in DB0)
+    client *c;
+    c = getMultiClient("SET C0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 0; SET C0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET C0 xxx; SELECT 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SELECT 0; SET C0 xxx; SELECT 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+
+    // These cases SHOULD block...  (they access C0 in DB1)
+    c = getMultiClient("SET C0 xxx");
+    c->db = server.db[1];      // already starting on DB1, no SELECT needed
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SET C0 xxx");
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SET C0 xxx; SELECT 0");
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 0; SELECT 1; SET C0 xxx; SELECT 1");
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+
+    expectAnythingCleanup(it);
+}
+
+
+TEST_F(BgIterationTest, multiHandlesSelectNoPermissionProperly) {
+    // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1.  We will process it
+    //  in DB0, but it will be unprocessed in DB1.  See if we track select properly.
+    addKeyToDb(1, "C0", "C0");
+
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - C0 in DB 0.
+    expectReadKey(it, 0);
+
+    // Now, we are done with C0 in DB0, but not in DB1
+    expectReadKey(it, 1);
+
+    // No permission for any commands (specifically select/swapdb)
+    EXPECT_CALL(mock, amzCanClientExecuteCommand(_,_,_,_))
+        .Times(AtLeast(1)).WillRepeatedly(Return(false));
+
+    // These cases should NOT block...  (they access C0 in DB0)
+    //  The SELECTs below are inconsequential - with/without select, same result.
+    client *c;
+    c = getMultiClient("SET C0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 0; SET C0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET C0 xxx; SELECT 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SELECT 0; SET C0 xxx; SELECT 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+
+    // These cases SHOULD block IF SELECT IS WORKING...  (they access C0 in DB1)
+    c = getMultiClient("SET C0 xxx");
+    c->db = server.db[1];      // already starting on DB1
+    simulateBlockedWrite(c);    // will block, no select
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SET C0 xxx");
+    simulateUnblockedWrite(c);  // will not block because accessing DB0 (select fails)
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SET C0 xxx; SELECT 0");
+    simulateUnblockedWrite(c);  // will not block because accessing DB0 (select fails)
+    freeTestClient(c);
+    c = getMultiClient("SELECT 0; SELECT 1; SET C0 xxx; SELECT 1");
+    simulateUnblockedWrite(c);  // will not block because accessing DB0 (select fails)
+    freeTestClient(c);
+
+    expectAnythingCleanup(it);
+}
+
+
+TEST_F(BgIterationTest, multiHandlesSwapdbProperly) {
+    // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1.  We will process it
+    //  in DB0, but it will be unprocessed in DB1.  See if we track swapdb (and select) properly.
+    addKeyToDb(1, "C0", "C0");
+
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - C0 in DB 0.
+    expectReadKey(it, 0);
+
+    // Now, we are done with C0 in DB0, but not in DB1
+    expectReadKey(it, 1);
+
+    // These cases should NOT block...  (they access C0 in DB0)
+    client *c;
+    c = getMultiClient("SET C0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET C0 xxx; SWAPDB 0 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET C0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET C0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 0 1; SELECT 1; SET C0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+
+    // These cases SHOULD block...  (they access C0 in DB1)
+    c = getMultiClient("SET C0 xxx");
+    c->db = server.db[1];      // already starting on DB1, no SWAPDB/SELECT needed
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SET C0 xxx; SWAPDB 0 1");
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SELECT 0; SET C0 xxx; SWAPDB 0 1");
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET C0 xxx; SELECT 1");
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+
+    expectAnythingCleanup(it);
+}
+
+
+TEST_F(BgIterationTest, multiHandlesSwapdbNoPermissionProperly) {
+    // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1.  We will process it
+    //  in DB0, but it will be unprocessed in DB1.  See if we track swapdb (and select) properly.
+    addKeyToDb(1, "C0", "C0");
+
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - C0 in DB 0.
+    expectReadKey(it, 0);
+
+    // Now, we are done with C0 in DB0, but not in DB1
+    expectReadKey(it, 1);
+
+    // No permission for any commands (specifically select/swapdb)
+    EXPECT_CALL(mock, amzCanClientExecuteCommand(_,_,_,_))
+        .Times(AtLeast(1)).WillRepeatedly(Return(false));
+
+    // These cases should NOT block...  (they access C0 in DB0)
+    //  The SELECTs & SWAPDBs below are inconsequential - with/without select/swapdb, same result.
+    client *c;
+    c = getMultiClient("SET C0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET C0 xxx; SWAPDB 0 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET C0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET C0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 0 1; SELECT 1; SET C0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+
+    // These cases SHOULD block IF SELECT/SWAPDB IS WORKING...  (they access C0 in DB1)
+    c = getMultiClient("SET C0 xxx");
+    c->db = server.db[1];      // already starting on DB1
+    simulateBlockedWrite(c);    // will block, no select/swapdb
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SET C0 xxx; SWAPDB 0 1");
+    simulateUnblockedWrite(c);  // will not block because accessing DB0 (swapdb fails)
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SELECT 0; SET C0 xxx; SWAPDB 0 1");
+    simulateUnblockedWrite(c);  // will not block because accessing DB0 (swapdb/select fails)
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET C0 xxx; SELECT 1");
+    simulateUnblockedWrite(c);  // will not block because accessing DB0 (swapdb/select fails)
+    freeTestClient(c);
+
+    expectAnythingCleanup(it);
+}
+
+void * pthreadWait200msAndReadTwoKeys(void *arg) {
+    // Thread entry point: sleep 200ms, then read two items from the iterator passed via arg.
+    bgIterator *iter = static_cast<bgIterator*>(arg);
+    usleep(200000);
+    bgIteratorRead(iter);
+    bgIteratorRead(iter);
+    return nullptr;
+}
+
+void asyncWait200msAndReadTwoKeys(bgIterator *it) {
+    // Start a detached thread running pthreadWait200msAndReadTwoKeys on `it`; it is never joined.
+    pthread_attr_t threadAttr;
+    pthread_t tid;
+
+    int err = pthread_attr_init(&threadAttr);
+    assert(err == 0);
+    err = pthread_attr_setdetachstate(&threadAttr, PTHREAD_CREATE_DETACHED);
+    assert(err == 0);
+
+    err = pthread_create(&tid, &threadAttr, pthreadWait200msAndReadTwoKeys, it);
+    assert(err == 0);
+
+    err = pthread_attr_destroy(&threadAttr);
+    assert(err == 0);
+}
+
+
+TEST_F(BgIterationTest, testLuaWithUndeclaredKey) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1,2,3 are queued (they are all in the same bucket).
+    // If we fake a modification to key 4, we won't know if it's handled out of order.
+    // So we fake a modification to key 5
+    client *c = getWriteClient(5, "xxx");
+    c->flag.script = 1;
+
+    // Now for a LUA script, we have already blocked (on the eval/evalsha) for any declared keys
+    //  But here, we're about to modify an undeclared key.  We can't actually block in the middle
+    //  of the LUA script.  So this will behave as unblocked, but incur a synchronous wait.
+
+    // Key 5 will get expedited when we simulate the write.  After reading key 5, key 1 will need
+    //  to be read to return key 5 to Valkey, unblocking the synchronous wait.
+    asyncWait200msAndReadTwoKeys(it);   // the two reads happen on a detached helper thread
+
+    monotime blockTimer;
+    elapsedStart(&blockTimer);
+    simulateUnblockedWrite(c);
+    // Must have delayed at least 150ms (some time may have passed before timer start)
+    EXPECT_GT(elapsedMs(blockTimer), 150u);
+
+    // Continue...
+    expectReadKeySequence(it, 2, 4);
+    // 5 has already been processed
+    expectReadKeySequence(it, 6, 9);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+
+
+TEST_F(BgIterationTest, replicationReceivedWhileProcessingLastKey) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+    // Scenario: a write lands after key 9 was read but before the iterator reports completion.
+    client *c = getWriteClient(0, "xxx");
+
+    expectReadKeySequence(it, 0, 9);
+    simulateUnblockedWriteWithModification(c);         // Wouldn't be blocked because done with key 0
+    expectReadReplication(it, c);   // Replication happened while processing key 9, should be here.
+
+    simulateUnblockedWriteWithModification(c);         // This won't replicate because we are done processing key 9
+    expectReadComplete(it);         // We expect to see the completion instead
+
+    freeTestClient(c);
+}
+
+
+TEST_F(BgIterationTest, repldoneFunctionCalled) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA);
+    // With a repldone callback registered, check repldoneCount is 1 once the last replication is read.
+    client *c = getWriteClient(0, "xxx");
+
+    expectReadKeySequence(it, 0, 9);
+    simulateUnblockedWriteWithModification(c);         // Wouldn't be blocked because done with key 0
+    expectReadReplication(it, c);   // Replication happened while processing key 9, should be here.
+    EXPECT_EQ(repldoneCount, 1);    // Last key released, now done feeding replication
+
+    simulateUnblockedWriteWithModification(c);         // This won't replicate because we are done processing key 9
+    expectReadComplete(it);         // We expect to see the completion instead
+
+    freeTestClient(c);
+}
+
+
+TEST_F(BgIterationTest, repldoneFunctionCalledTwice) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFnNotBeingReadyInitially, iteratorCleanupFn, PRIVDATA);
+
+    client *c = getWriteClient(0, "xxx");
+
+    expectReadKeySequence(it, 0, 9);
+    simulateUnblockedWriteWithModification(c);         // Wouldn't be blocked because done with key 0
+    expectReadReplication(it, c);   // Replication happened while processing key 9, should be here.
+    EXPECT_EQ(repldoneCount, 0);    // Not counted yet - the repldone fn was not ready on its first call
+    EXPECT_EQ(isReplDoneReady, 1);
+    bgIteration_feedIterators();    // Need to call it as RepldoneFnNotBeingReadyInitially returns false in first call
+    EXPECT_EQ(repldoneCount, 1);    // Counted after the feedIterators call above
+
+    simulateUnblockedWriteWithModification(c);         // This won't replicate because we are done processing key 9
+    expectReadComplete(it);         // We expect to see the completion instead
+
+    freeTestClient(c);
+}
+
+
+TEST_F(BgIterationTest, queuingitemFunctionCalled) {
+    bgIterator *it = bgIteratorCreateFullScanIter("simple",
+            0, NULL, iteratorCleanupFn, iteratorBeforeAndAfterQueuingItemFn, PRIVDATA);
+    EXPECT_EQ(beforeQueuingItemCount, 0);   // nothing has been fed yet
+    EXPECT_EQ(afterQueuingItemCount, 0);
+    expectReadKeySequence(it, 0, 9);
+    expectReadComplete(it);
+    // Callback is invoked when item is fed to and returned from an iterator
+    EXPECT_EQ(beforeQueuingItemCount, 10);  // one per key 0..9
+    EXPECT_EQ(afterQueuingItemCount, 10);   // one per key 0..9
+}
+
+TEST_F(BgIterationTest, checkReplicationByteCount) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA);
+    // Verifies the replication memory accounting as replication items are queued and returned.
+    client *c = getWriteClient(0, "xxx");
+    size_t expectedReplicationSize = sizeof(bgIteratorItem);   // size_t: no narrowing, no signed/unsigned compare
+    for (int i = 0;  i < c->argc;  i++) {
+        expectedReplicationSize += objectComputeSize(NULL, c->argv[i], 0, 0);
+    }
+
+    expectReadKey(it, 0);
+    expectReadKey(it, 1);  // Releases and unblocks 0
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u);
+
+    simulateUnblockedWriteWithModification(c);         // Wouldn't be blocked because done with key 0
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize);
+    simulateUnblockedWriteWithModification(c);         // and write again (2nd replication)
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize);
+
+    expectReadKeySequence(it, 2, 4);    // Keys 1..4 all in same bucket
+
+    expectReadReplication(it, c);
+    // After reading the 1st replication, it hasn't been returned yet (it's the active item)
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize);
+    expectReadReplication(it, c);
+    // After reading the 2nd replication, the 1st has been returned
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize);
+
+    expectReadKey(it, 5);
+    // Now all replication has been returned/freed
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u);
+
+    expectReadKeySequence(it, 6, 9);
+    expectReadComplete(it);
+
+    freeTestClient(c);
+}
+
+// Test that for an arbitrary write command having no keys, replication should occur.
+TEST_F(BgIterationTest, checkNoKeysWriteIsReplicated) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0);
+
+    client *c = getNoKeysWriteClient();
+    EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0);    // must never block: no keys involved
+    bool blocked = bgIteration_blockClientIfRequired(c);
+    EXPECT_FALSE(blocked);
+    bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv);  // read back below
+
+    expectReadKeySequence(it, 1, 4);    // These were already in queue
+
+    expectReadReplication(it, c);
+
+    expectReadKeySequence(it, 5, 9);
+    expectReadComplete(it);
+    freeTestClient(c);
+}
+TEST_F(BgIterationTestClusterSlots, testAmzKeyIsLogicallyDeletedInOrderedIteration3Slots) {
+    bgIterator *it = bgIteratorCreateSlotsIter("simple",
+            0, slots_to_iterate, slots_to_iterate_size, NULL, iteratorCleanupFn, PRIVDATA);
+    EXPECT_CALL(mock, amzKeyIsLogicallyDeleted(_)).WillOnce(Return(true)).WillRepeatedly(Return(false)); // true for 1st call only
+    expectReadKeySequence(it, 1, n_keys_to_read - 1);   // starts at 1: key 0 is never delivered
+
+    // Quick status check.  At this point, the last item hasn't been returned yet.
+    bgIteratorStatus status;
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentries_queued, n_keys_to_read - 1); // The first item should be skipped from the queue
+    EXPECT_EQ(status.dbentries_processed, n_keys_to_read - 2);
+
+    expectReadComplete(it);
+    EXPECT_FALSE(cleanupTerminated);
+}
+
+TEST_F(BgIterationTest, testAmzKeyIsLogicallyDeletedInOrderedFullScanIteration) {
+    bgIterator *it = bgIteratorCreateFullScanIter("simple",
+            0, NULL, iteratorCleanupFn, PRIVDATA);
+    EXPECT_CALL(mock, amzKeyIsLogicallyDeleted(_)).WillOnce(Return(true)).WillRepeatedly(Return(false)); // true for 1st call only
+    expectReadKeySequence(it, 1, 9);    // starts at 1: key 0 is never delivered
+
+    // Quick status check.  At this point, item #9 hasn't been returned yet.
+    bgIteratorStatus status;
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentries_queued, 9u); // The first item should be skipped from the queue
+    EXPECT_EQ(status.dbentries_processed, 8u);
+
+    expectReadComplete(it);
+    EXPECT_FALSE(cleanupTerminated);
+}
+#endif
+
+#ifdef CODE_NOT_READY_YET
+class BgIterationTestCluster : public BgIterationTest {
+    private:
+        // This is the expected order of the keys when hashed into a single dict at slot 0 having size 8.
+        //  The "{06S}" prefix ensures use of only slot 0.
+        const char *keys[1][5] = {{"{06S}C0", "{06S}D0", "{06S}A0", "{06S}B0", "{06S}E0"}};
+
+    protected:
+        // Furthermore, the bucketization will look like this:
+        // db 0 slot 0
+        // Table 0, used 5, exp 1, top-level buckets 2, child buckets 0
+        // Bucket 0:0 level:0
+        //   0 h2 1a, key "{06S}C0"
+        //   1 h2 7b, key "{06S}D0"
+        //   2 (empty)
+        //   3 (empty)
+        //   4 (empty)
+        //   5 (empty)
+        //   6 (empty)
+        // Bucket 0:1 level:0
+        //   0 h2 5c, key "{06S}A0"
+        //   1 h2 bf, key "{06S}B0"
+        //   2 h2 57, key "{06S}E0"
+        //   3 (empty)
+        //   4 (empty)
+        //   5 (empty)
+        //   6 (empty)
+
+        virtual const char * getKeyAtDbSeq(int db, int seq) override {
+            assert(db == 0);    // this fixture only has DB 0; seq is not range-checked (valid: 0..4)
+            return keys[db][seq];
+        }
+
+
+        virtual void setupDatabase() override {
+            // For these unit tests, a standard database is constructed.  The order of items in the
+            //  hash table is important, and this is validated here.  If the hash table
+            //  implementation changes, we will find out quickly at this point.  All other tests
+            //  will become invalid!
+
+            // Note that the cluster_enabled tests are designed for the purpose of testing
+            //  CONSISTENT iteration WITH REPLICATION.  This type of iteration is not supported
+            //  in non-cluster-mode.  At the time of writing, there is no known use-case for this
+            //  combination.  But it is tested for completeness and to ensure future availability.
+
+            // Note also that the cluster_enabled tests are not designed to address issues specific
+            //  to per-slot-dictionaries.  The tests are simplified by ensuring that all keys are
+            //  mapped to slot-0.  It is assumed that iteration would progress in slot order, and
+            //  failure in this regard will be caught in integration tests (amztests).
+
+            server.dbnum = 1;   // cluster-mode means 1 DB
+            server.cluster_enabled = true;
+            server.db = static_cast<serverDb **>(zcalloc(sizeof(serverDb *) * server.dbnum));
+
+            // Yes, it's cluster mode, but we're mapping all keys to slot 0 - so we cheat and create only 1 dict (just like CMD).
+            initializeServerDb(0, CLUSTER_SLOT_MASK_BITS);
+
+            // Note "06S" is a prefix that maps to slot 0.  We're not testing slots here.
+
+            addKeyToDb(0, "{06S}A0", "{06S}A0");
+            addKeyToDb(0, "{06S}B0", "{06S}B0");
+            addKeyToDb(0, "{06S}C0", "{06S}C0");
+            addKeyToDb(0, "{06S}D0", "{06S}D0");
+            addKeyToDb(0, "{06S}E0", "{06S}E0");
+
+            // In case we need to debug...
+            if (0) debugPrintBucketInfo();
+
+            // Validate that the iteration order matches the expected order
+            hashtableIterator *it = hashtableCreateIterator(server.db[0]->keys->hashtables[0], 0);
+            for (int i = 0;  i < 5;  i++) {     // 5 == number of keys added above
+                void *nextEntry;
+                hashtableNext(it, &nextEntry);  // NOTE(review): return value is not checked here
+                dbEntry *de = static_cast<dbEntry *>(nextEntry);
+                ASSERT_STREQ(static_cast<const char*>(objectGetKey(de)), getKeyAtDbSeq(0, i));
+            }
+            hashtableReleaseIterator(it);
+        }
+};
+#endif
+
+#ifdef CODE_NOT_READY_YET
+TEST_F(BgIterationTestCluster, dictIsOK) {
+    // Intentionally empty: just run the fixture setup/teardown code to make sure the dict is OK.
+}
+
+
+TEST_F(BgIterationTestCluster, modFutureItem_YesReplication_YesConsistent_cluster) {
+    // Cluster test.  REPLICATION + CONSISTENT only supported in cluster mode
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT,
+            NULL, iteratorCleanupFn, PRIVDATA);
+    bgIteratorStatus status;
+
+    // For this test, don't read the 1st key - we only have 5 keys since not using DB[1]
+    bgIteration_feedIterators();    // Prime the feed - key 0 and 1 are now enqueued
+
+    // At this point, key 0, and 1 are queued.  Fake a modification to key 2 & 4 - two keys to ensure
+    //  that replication is ordered
+    client *c1 = getWriteClient(2, "xxx");
+    client *c2 = getWriteClient(4, "yyy");
+
+    // Since this is consistent, we will block the client, disallowing the write.
+    simulateBlockedWrite(c1);
+    simulateBlockedWrite(c2);
+
+    // On a consistent iterator, the event is expedited in-front of items already in queue!
+    //  Read keys 2&4 out of order.
+    expectReadKey(it, 2);  // reading original/unmodified item
+
+    // This call is expected to unblock the client waiting on #2
+    expectReadKeyWithUnblock(it, 4, nullptr, 2);  // reading original/unmodified item
+    simulateUnblockedWriteWithModification(c1);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.replication_queued, 1u);
+    EXPECT_EQ(status.replication_processed, 0u);
+
+    // Now read items 0 and 1 - these were actually already queued before keys 2 & 4 were expedited.
+    // This call is expected to unblock the client waiting on #4
+    expectReadKeyWithUnblock(it, 0, nullptr, 4);
+    simulateUnblockedWriteWithModification(c2);
+    expectReadKey(it, 1);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.replication_queued, 2u);
+    EXPECT_EQ(status.replication_processed, 0u);
+
+    // And now the 2 replications are queued
+    expectReadReplication(it, c1);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.replication_queued, 2u);       // 1st replication still being processed
+    EXPECT_EQ(status.replication_processed, 0u);    //  (no change in these metrics yet)
+
+    expectReadReplication(it, c2);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.replication_queued, 2u);
+    EXPECT_EQ(status.replication_processed, 1u);    // Done with 1st, processing 2nd
+
+    // Continue...
+    expectReadKey(it, 3);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.replication_queued, 2u);
+    EXPECT_EQ(status.replication_processed, 2u);    // Done processing both repl items
+    expectReadComplete(it);
+    freeTestClient(c1);
+    freeTestClient(c2);
+}
+#endif
+
+
+
+// JHB - need test that hashing is paused when an entry is in use.
diff --git a/src/unit/wrappers.h b/src/unit/wrappers.h
index 0f4fb388b98..0f80919d6f7 100644
--- a/src/unit/wrappers.h
+++ b/src/unit/wrappers.h
@@ -61,6 +61,12 @@ extern "C" {
 long long __wrap_aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, aeTimeProc *proc, void *clientData, aeEventFinalizerProc *finalizerProc);
 int __wrap_processPendingCommandAndInputBuffer(client *c);
 void __wrap_beforeNextClient(client *c);
+
+void __wrap_blockClientInUseOnKeys(client *c, int nKeys, robj **keys);
+void __wrap_unblockClientsInUseOnKey(robj *key);
+
+int __wrap_ACLCheckAllUserCommandPerm(user *u, struct serverCommand *cmd, robj **argv, int argc, int dbid, int *idxptr);
+
 #undef protected
 #undef _Bool
 #undef typename