diff --git a/.config/typos.toml b/.config/typos.toml
index 10103279c57..da8175e4ff9 100644
--- a/.config/typos.toml
+++ b/.config/typos.toml
@@ -15,11 +15,15 @@ optin = "optin"
 smove = "smove"
 Parth = "Parth" # seems like the spellchecker does not like it is similar to "Path"
 nd = "nd"
+threadsave = "threadsave"
+
+[default.extend-identifiers]
+dbe = "dbe"
 
 [default]
 extend-ignore-re = [
-    "SELECTed",
-    "WATCHed",
+    "[A-Z]{2,}ed", # SELECTed, WATCHed, etc.
+    "[A-Z]{2,}s",  # SELECTs, etc.
 ]
 
 [type.c]
@@ -65,6 +69,9 @@ pn = "pn"
 seeked = "seeked"
 tre = "tre"
 
+[type.cpp.extend-words]
+fo = "fo"
+
 [type.systemd.extend-words]
 # systemd = .conf
 ake = "ake"
@@ -72,6 +79,3 @@ ake = "ake"
 [type.tcl.extend-words]
 fo = "fo"
 tre = "tre"
-
-[type.cpp.extend-words]
-fo = "fo"
diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake
index e2cd375bcc7..54964d079cd 100644
--- a/cmake/Modules/SourceFiles.cmake
+++ b/cmake/Modules/SourceFiles.cmake
@@ -36,6 +36,7 @@ set(VALKEY_SERVER_SRCS
     ${CMAKE_SOURCE_DIR}/src/t_hash.c
     ${CMAKE_SOURCE_DIR}/src/config.c
     ${CMAKE_SOURCE_DIR}/src/aof.c
+    ${CMAKE_SOURCE_DIR}/src/bgiteration.c
     ${CMAKE_SOURCE_DIR}/src/pubsub.c
     ${CMAKE_SOURCE_DIR}/src/multi.c
     ${CMAKE_SOURCE_DIR}/src/debug.c
diff --git a/src/Makefile b/src/Makefile
index 2c78f95986e..98f49108e46 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -457,6 +457,7 @@ ENGINE_SERVER_OBJ = \
     allocator_defrag.o \
     anet.o \
     aof.o \
+	bgiteration.o \
     bio.o \
     bitops.o \
     blocked.o \
diff --git a/src/bgiteration.c b/src/bgiteration.c
new file mode 100644
index 00000000000..3756ba0a60b
--- /dev/null
+++ b/src/bgiteration.c
@@ -0,0 +1,2698 @@
+/*
+ * Copyright Valkey Contributors.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD 3-Clause
+ */
+
+#include "fmacros.h"
+#include "bgiteration.h"
+#include "dict.h"
+#include "fifo.h"
+#include "kvstore.h"
+#include "monotonic.h"
+#include "mutexqueue.h"
+#include "server.h"
+
+int getFlushCommandFlags(client *c, int *flags);                                                        // in db.c
+uint64_t dictObjHash(const void *key);                                                                  // in server.c
+int dictObjKeyCompare(const void *key1, const void *key2);                                              // in server.c
+size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid);                             // in object.c
+robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds key, long long expire); // in object.c
+
+
+static bool receiveItemsBackFromOneIterator(bgIterator *it);
+
+
+// Extension hook for skipping keys; currently a stub that ignores its argument and returns false.
+static bool ignoreKeyForSave(const_sds key) {
+    UNUSED(key);
+    return false;
+}
+
+// Returns true if cmd's handler is FCALL, EVAL or EVALSHA, i.e. a script call that may replicate.
+static bool isScriptCallWriteCmd(struct serverCommand *cmd) {
+    return ((cmd->proc == fcallCommand) || (cmd->proc == evalCommand) || (cmd->proc == evalShaCommand));
+}
+
+/* Returns true for commands flagged CMD_WRITE, plus EXEC, the script-call commands, and PFCOUNT
+ * (no CMD_WRITE flag, but it modifies the underlying string and is replicated as a write). */
+static bool isWriteCmd(struct serverCommand *cmd) {
+    return ((cmd->flags & CMD_WRITE) || (cmd->proc == pfcountCommand) || (cmd->proc == execCommand) || (isScriptCallWriteCmd(cmd)));
+}
+
+// Returns true if cmd's handler is delCommand or unlinkCommand (i.e. the DEL or UNLINK command).
+static bool isDeleteCmd(struct serverCommand *cmd) {
+    return ((cmd->proc == delCommand) || (cmd->proc == unlinkCommand));
+}
+
+
+/* This utility utilizes the main thread and background threads for processing.  The API is split,
+ * with some functions intended for the main thread and others for the background clients.  This
+ * sanity check returns true when the caller is (or is treated as) the Valkey main thread. */
+static bool onValkeyMainThread(void) {
+    /* Modules interact with the main thread using a mutex.  A module might own it whenever
+     * module_gil_acquired reads 0; NOTE(review): in that state every caller is accepted. */
+    bool mightBeInModule = (atomic_load_explicit(&server.module_gil_acquired, memory_order_relaxed) == 0);
+    return (mightBeInModule || pthread_equal(server.main_thread_id, pthread_self()) != 0);
+}
+
+
+/* Parse a parameter robj as a DB ID.  On success, stores the ID in *db_id and returns true.
+ * Returns false (leaving *db_id untouched) unless it is an integer in [0, server.dbnum). */
+static bool getDbIdFromRobj(robj *obj, int *db_id) {
+    long long value;
+    if (getLongLongFromObject(obj, &value) != C_OK) return false;
+    if ((value < 0) || (value >= server.dbnum)) return false;
+    *db_id = (int)value;
+    return true;
+}
+
+/* Parse the optional arguments of the COPY command, extracting the target DBID (which defaults to
+ * selected_dbid when no DB option is present).  Returns FALSE if the command would not run. */
+static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid, int *target_dbid) {
+    const int COPY_COMMAND_OPTIONAL_ARG_START_INDEX = 3;
+
+    *target_dbid = selected_dbid;
+
+    for (int i = COPY_COMMAND_OPTIONAL_ARG_START_INDEX; i < argc; i++) {
+        if (!strcasecmp((char *)objectGetVal(argv[i]), "replace")) {
+            continue; // REPLACE has no effect on the target DB
+        } else if (!strcasecmp((char *)objectGetVal(argv[i]), "db") && (i + 1 < argc)) {
+            /* Note the parsing here needs to perfectly match what we have in Valkey OSS for COPY.
+             * The following command is considered OK by Valkey 8.1 so we can't return here, but
+             * must continue to parse till the last db which is the one that's effectively used.
+             *    COPY key1 key2 db 1 db 2 db 3    (This will use db 3) */
+            if (!getDbIdFromRobj(argv[i + 1], target_dbid)) {
+                return false; // parse failure: DB value is not a valid DB index
+            }
+            i++; // Consume additional argument
+        } else {
+            return false; // parse failure: unrecognized option, or DB without a following value
+        }
+    }
+    return true;
+}
+
+/* Validate a SWAPDB command and extract its two DB indexes into *id1_p and *id2_p.
+ * If permission_client is non-NULL, that client's user must also pass the ACL check for swapdb.
+ * Returns true only if the command would be executed AND would swap two distinct DBs. */
+static bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, int *id1_p, int *id2_p) {
+    static struct serverCommand *swapdb_cmd = NULL; // looked up on first use, then cached
+
+    // We don't need to check permissions in the replication phase
+    if (permission_client != NULL) {
+        if (swapdb_cmd == NULL) {
+            swapdb_cmd = lookupCommandByCString("swapdb");
+            serverAssert(swapdb_cmd != NULL);
+        }
+
+        int idxptr; // required output argument of the ACL check; its value is not used here
+        if (ACLCheckAllUserCommandPerm(permission_client->user, swapdb_cmd, argv, argc,
+                                       permission_client->db->id, &idxptr) != ACL_OK) return false;
+    }
+
+    long long dbid1, dbid2;
+    if (argc != 3) return false;
+    if (server.cluster_enabled) return false; // rejected whenever cluster mode is enabled
+    if (getLongLongFromObject(argv[1], &dbid1) != C_OK) return false;
+    if (getLongLongFromObject(argv[2], &dbid2) != C_OK) return false;
+    if (dbid1 < 0 || dbid1 >= server.dbnum) return false;
+    if (dbid2 < 0 || dbid2 >= server.dbnum) return false;
+    if (dbid1 == dbid2) return false; // Valid, but doesn't do anything
+
+    *id1_p = (int)dbid1;
+    *id2_p = (int)dbid2;
+    return true;
+}
+
+/* Validate a SELECT command and extract its DB index into *dbid_p.
+ * If permission_client is non-NULL, that client's user must also pass the ACL check for select.
+ * Returns true if the command would be executed; *dbid_p is written only in that case. */
+static bool getParamsForSelect(int argc, robj **argv, client *permission_client, int *dbid_p) {
+    static struct serverCommand *select_cmd = NULL; // looked up on first use, then cached
+
+    // We don't need to check permissions in the replication phase
+    if (permission_client != NULL) {
+        if (select_cmd == NULL) {
+            select_cmd = lookupCommandByCString("select");
+            serverAssert(select_cmd != NULL);
+        }
+
+        int idxptr; // required output argument of the ACL check; its value is not used here
+        if (ACLCheckAllUserCommandPerm(permission_client->user, select_cmd, argv, argc,
+                                       permission_client->db->id, &idxptr) != ACL_OK) return false;
+    }
+
+    long long dbid;
+    if (argc != 2) return false;
+    if (getLongLongFromObject(argv[1], &dbid) != C_OK) return false;
+    if (dbid < 0 || dbid >= server.dbnum) return false;
+
+    *dbid_p = (int)dbid;
+    return true;
+}
+
+static void pauseRehashForKvsHashtable(kvstore *kvs, int didx) {
+    hashtable *ht = kvstoreGetHashtable(kvs, didx); // hashtable at index didx of the kvstore
+    if (ht != NULL) hashtablePauseRehashing(ht);    // no-op when no hashtable is returned
+}
+
+static void resumeRehashForKvsHashtable(kvstore *kvs, int didx) {
+    hashtable *ht = kvstoreGetHashtable(kvs, didx); // hashtable at index didx of the kvstore
+    if (ht != NULL) hashtableResumeRehashing(ht);   // no-op when no hashtable is returned
+}
+
+
+/* DictType for SDS->ptr.  The SDS key is only referenced (no key destructor); entries use zfree. */
+static dictType sdsrefToPtrDictType = {
+    .entryGetKey = dictEntryGetKey,
+    .hashFunction = dictSdsHash,
+    .keyCompare = dictSdsKeyCompare,
+    .entryDestructor = zfree};
+
+
+/* Adapter exposing decrRefCount() through a void * parameter, for use as a generic callback. */
+static void decrRefCountVoid(void *o) {
+    decrRefCount(o);
+}
+
+
+/* Build a new sds for debugging from argc/argv: each argument as 'arg' followed by a space. */
+static sds createSdsFromClientArgv(int argc, robj **argv) {
+    sds cmd = sdsempty();
+    for (int i = 0; i < argc; i++) {
+        robj *arg = getDecodedObject(argv[i]); // some objects are int encoded
+        cmd = sdscatprintf(cmd, "'%s' ", (char *)objectGetVal(arg)); // %s: output stops at a NUL byte
+        decrRefCount(arg); // drop the reference obtained from getDecodedObject()
+    }
+    return cmd;
+}
+
+
+// ###########################################################################
+
+
+/* bgIteration internal (compile time) configuration values */
+enum {
+    BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE = 16384,  // Prevent initial rehashing
+    BGITER_MAX_CLONE_ITEM_BYTES = 512,               // Max size item to clone
+    BGITER_MAX_CLONE_POOL_BYTES = (1 * 1024 * 1024), // Total limit for all cloned items
+    BGITER_QUEUE_INCREASE_INCR = 100,                // Step size when increasing queue target
+    BGITER_CYCLE_DELAY_MS = 2,                       // Delay between calls on bgIteration timer
+    BGITER_CYCLE_BUDGET_MS = 1,                      // Normal time limit for timer processing
+    BGITER_CYCLE_BUDGET_MAX_MS = 10                  // Maximum time limit when starvation seen
+};
+
+// dbEntry metadata; the assert below pins its size to BGITERATION_ENTRY_METADATA_SIZE
+typedef struct {
+    uint32_t iterator_epoch; // iterator epoch of last modification
+} bgIterationEntryMetadata;
+static_assert(sizeof(bgIterationEntryMetadata) == BGITERATION_ENTRY_METADATA_SIZE, "");
+
+
+// These can be tweaked by unit tests
+static int bgiter_max_clone_item_bytes = BGITER_MAX_CLONE_ITEM_BYTES;
+static int bgiter_max_clone_pool_bytes = BGITER_MAX_CLONE_POOL_BYTES;
+
+void bgIteration_unitTestDisableCloning(void) { // unit-test hook: zero both clone limits
+    bgiter_max_clone_item_bytes = 0;
+    bgiter_max_clone_pool_bytes = 0;
+}
+void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes) { // unit-test hook
+    bgiter_max_clone_item_bytes = item_bytes; // overrides the BGITER_MAX_CLONE_ITEM_BYTES default
+    bgiter_max_clone_pool_bytes = pool_bytes; // overrides the BGITER_MAX_CLONE_POOL_BYTES default
+}
+
+typedef enum {
+    BGITERATION_TYPE_NONE,
+    BGITERATION_TYPE_FULLSCAN,
+    BGITERATION_TYPE_CLUSTERSLOT
+} bgIterationType;
+
+
+/* Flag indicates that a consistent iteration is required.  This is used to create a point-in-time
+ * iteration.  The iteration client will see all keys AS THEY EXISTED at the time when the iterator
+ * was created.
+ * Note:  The DBID provided with the DICTENTRY events is the original DBID (at the time of iteration
+ *        start).  SWAPDB events are NOT provided during a consistent iteration.  */
+#define BGITERATOR_FLAG_CONSISTENT (1 << 0)
+
+/* Flag indicating that the replication stream for keys which have already been processed should be
+ * forwarded to the iteration client.  Used for non-consistent iteration to track changes
+ * to keys already processed.  By tracking changes, this allows a non-consistent iteration client
+ * to achieve a consistent view at the END of the iteration.
+ * NOTE:  Replication events will be provided ordered and synchronized with any SWAPDB events. */
+#define BGITERATOR_FLAG_REPLICATION (1 << 1)
+
+
+/* Extensions to bgIteratorItemType.  These enumerations are used internally, and are not part of
+ *  the published interface.  These allow for extensibility in the internal information-passing
+ *  between the Valkey main thread and the iteration client thread. */
+typedef enum {
+    /* Indicates that the iteration client has completed use of the bgIterator and that the
+     * bgIterator should be cleaned up and freed by the Valkey main thread. */
+    BGITERATOR_ITEMEXT_ITER_CLOSED = 10
+} bgIteratorItemTypeExtended;
+
+/* Item for bgIteratorItemTypeExtended.BGITERATOR_ITEMEXT_ITER_CLOSED.  Used to pass a bgIterator
+ * back to the Valkey main thread for cleanup/release. */
+typedef struct {
+    bgIteratorItemTypeExtended type;
+    bgIterator *iter;
+} bgIteratorItemExtClose;
+
+
+/* A dictionary with a pointer (itself) as a key (the address pointed to is NOT referenced).
+ * Nothing is duplicated, this is a very fast dictionary, but potentially unsafe if the original
+ * items are deleted or moved.
+ * WARNING:  This needs to maintain safety with things that may move the object.
+ *   + In db.c, if the object is reallocated, bgIteration_updateDbEntryPtr() is called.
+ *   + In defrag.c, we don't defrag if there are multiple references (and we incr the refcount). */
+
+// Thomas Wang's 64-bit mix, applied to the pointer value itself (the pointee is never read)
+static uint64_t pointerHash(const void *key) {
+    uint64_t h = (uint64_t)(uintptr_t)key;
+    h = (~h) + (h << 21); // h = (h << 21) - h - 1;
+    h = h ^ (h >> 24);
+    h = (h + (h << 3)) + (h << 8); // h * 265
+    h = h ^ (h >> 14);
+    h = (h + (h << 2)) + (h << 4); // h * 21
+    h = h ^ (h >> 28);
+    h = h + (h << 31);
+    return h;
+}
+
+static int pointerCompare(const void *key1, const void *key2) {
+    return key1 == key2; // NOTE(review): nonzero means EQUAL; confirm the keyCompare convention of both table types
+}
+
+// This dict grows and shrinks constantly during the iteration.  Avoid constant rehashing.
+static int onlyAllowExpansion(size_t moreMem, double usedRatio) {
+    UNUSED(moreMem);
+    return (usedRatio > 0.5); // Nonzero only when usedRatio exceeds 0.5, which is treated as expanding
+}
+
+static dictType dictEntryPtrDictType = {
+    .entryGetKey = dictEntryGetKey,
+    .hashFunction = pointerHash,         // hashes the pointer value
+    .keyCompare = pointerCompare,        // compares the pointer values
+    .resizeAllowed = onlyAllowExpansion, // avoids constant rehashing as the dict grows and shrinks
+    .entryDestructor = zfree};
+
+static hashtableType dbEntryPtrHashtableType = {
+    .hashFunction = pointerHash,          // hashes the pointer value
+    .keyCompare = pointerCompare,         // compares the pointer values
+    .resizeAllowed = onlyAllowExpansion}; // no destructor is set in this type
+
+
+// A free list for bgIteratorItem's - avoids churning zmalloc calls
+typedef struct itemListNode {
+    struct itemListNode *next;
+} itemListNode;
+
+static const int FREE_ITEM_MAX = 500;
+static itemListNode *freeItemStackHead = NULL;
+static int freeItemStackCount = 0;
+
+static void itemFreeList_returnItemBackToFreeList(bgIteratorItem *item) {
+    itemListNode *freedNode = (itemListNode *)item; // the item's own memory is reused as the link
+    if (freeItemStackCount < FREE_ITEM_MAX) {
+        freedNode->next = freeItemStackHead; // push onto the stack of free items
+        freeItemStackHead = freedNode;
+        freeItemStackCount++;
+    } else {
+        zfree(freedNode); // free list already holds FREE_ITEM_MAX items: release the memory
+    }
+}
+
+// Pop an item from the free list, or zmalloc one if the list is empty.  Contents are not reset.
+static bgIteratorItem *itemFreeList_getElementOrAllocate(void) {
+    bgIteratorItem *item;
+    if (freeItemStackHead) {
+        item = (bgIteratorItem *)freeItemStackHead;
+        freeItemStackHead = freeItemStackHead->next;
+        freeItemStackCount--;
+        if (freeItemStackHead) valkey_prefetch(freeItemStackHead); // prefetch the new head node
+    } else {
+        serverAssert(freeItemStackCount == 0);
+        // Free list is empty: allocate a fresh item
+        item = zmalloc(sizeof(bgIteratorItem));
+    }
+    return item;
+}
+
+static void itemFreeList_release(void) {
+    while (freeItemStackHead) { // zfree every cached item, leaving the free list empty
+        itemListNode *node = freeItemStackHead;
+        freeItemStackHead = node->next;
+        freeItemStackCount--;
+        zfree(node);
+    }
+    serverAssert(freeItemStackCount == 0); // the counter must agree with the now-empty list
+}
+
+
+/* A TEMPORARY set of robj's (of type sds).  This is only for temporary sets as the robj's are not
+ * ref-counted at insertion/deletion. */
+static hashtableType tempKeysetHashtableType = {
+    .hashFunction = dictObjHash,
+    .keyCompare = dictObjKeyCompare};
+
+
+typedef struct genericIterator genericIterator;
+typedef void (*iteratorReleaseFunc)(genericIterator *genIt);
+typedef fifo *(*iteratorGetEntriesFunc)(genericIterator *genIt, int *orig_dbid, int *cur_dbid);
+typedef void (*iteratorSwapDbFunc)(genericIterator *genIt, int db1, int db2);
+typedef void (*iteratorFlushDbFunc)(genericIterator *genIt, int cur_dbid);
+typedef bool (*iteratorHasPassedItemFunc)(genericIterator *genIt, const_sds key, int cur_dbid);
+typedef int (*iteratorOriginalDbFunc)(genericIterator *genIt, int cur_dbid);
+typedef bool (*iteratorIsKeyInScopeFunc)(genericIterator *genIt, const_sds key);
+
+// Function pointers supporting polymorphic iterator implementation
+struct genericIterator {
+    iteratorReleaseFunc release;
+    iteratorGetEntriesFunc getEntries;
+    iteratorSwapDbFunc swapDb;
+    iteratorFlushDbFunc flushDb;
+    iteratorHasPassedItemFunc hasPassedItem;
+    iteratorOriginalDbFunc originalDb;
+    iteratorIsKeyInScopeFunc isKeyInScope;
+};
+
+
+/* This struct is used across threads.  Unless otherwise noted, the fields are initialized at
+ *  iterator creation (within the main thread) and are read-only by the client thread. */
+struct bgIterator {
+    sds name;                        // Iterator name
+    bgIteratorReplDoneFunc repldone; // Optional repldone function to be run on the main thread
+    bgIteratorCleanupFunc cleanup;   // Optional cleanup function to be run on main thread
+    void *privdata;                  // Client's private data to be passed to cleanup function
+
+    int iteration_flags;                 // Consistent and/or Replication
+    int iteration_type;                  // Full scan or cluster slot
+    uint32_t consistent_modification_id; // iterator epoch at time of iterator creation
+
+    genericIterator *keyset_iter; // Low-level iterator (polymorphic)
+
+    /* A set of dbEntry, compared by pointer.  Used to track items which have already been iterated
+     * over by out-of-order expedited processing.  Ensures a bgIterator does not try to reprocess
+     * items.  Used only by main thread. */
+    hashtable *early_iterate_entries;
+
+    mutexQueue *items_for_iterator; // Created/Destroyed in main thread, used in both (threadsafe)
+
+    mutexQueue *return_to_valkey; // Queue of items to be returned to the Valkey main thread (threadsafe)
+
+    unsigned int item_count_target; // Used only by main thread
+
+    /* current_item is normally only used in the iteration client.  It's marked volatile here only
+     * to support snooping from the main thread when handling a FLUSHDB command.  This prevents the
+     * compiler from generating code which might read the pointer multiple times (when it's coded to
+     * read only once).
+     * (A volatile POINTER to a non-volatile item.) */
+    bgIteratorItem *volatile current_item;
+
+    bool client_is_active; // Set to true when client performs 1st read
+
+    /* Set to true in main thread when last item from iteration has been queued to the client.  No
+     * additional items will be enqueued to the client after this has been set. */
+    bool completed;
+
+    /* Set to true in main thread when iteration is to be killed.
+     * Set to true in iteration client when it decides to end early. */
+    volatile bool terminated;
+
+    bool cur_cmd_may_replicate; // Used only in main thread during command processing
+
+    // Variables maintaining runtime statistics
+    unsigned long dbentries_queued;         // Updated by main thread
+    unsigned long dbentries_processed;      // Updated by client thread
+    unsigned long replication_queued;       // Updated by main thread
+    unsigned long replication_processed;    // Updated by client thread
+    unsigned long swapdb_queued;            // Updated by main thread
+    unsigned long swapdb_processed;         // Updated by client thread
+    unsigned long flushdb_queued;           // Updated by main thread
+    unsigned long flushdb_processed;        // Updated by client thread
+    unsigned long dbentry_clones_queued;    // Updated by main thread
+    unsigned long dbentry_clones_processed; // Updated by client thread
+    monotime monotonic_start_time;          // Time iteration started
+
+    /* The item start time is set in the iteration client.  It is marked volatile as it can be read
+     * from the main thread by bgIteratorGetStatus.  If 0, this indicates that the iteration client
+     * is waiting for an item to process. */
+    volatile monotime monotonic_item_start_time;
+};
+
+
+// These static values are only accessed from the main Valkey thread.
+
+static list *allIterators;   // list of bgIterator
+static dict *nameToIterator; // bgIterator->name -> bgIterator
+
+// Global, across all iterators, dict contains a dbEntry pointer -> ref count
+static dict *inUseEntries; // dbEntry -> ref count
+
+/* Key values in the current command which don't exist in the DB yet.  Needed for determination of
+ * replication for NON-consistent iterations. */
+static list *curCmdMissingKeys; // list of robj
+
+/* A counter of the total amount of memory used for buffered replication data.  This amount is
+ * excluded when computing the need for evictions. */
+static ssize_t bufferedReplicationBytes;
+
+// Memory pool to track current allocated memory of cloned items (in bytes)
+static ssize_t bgiteration_current_clone_memory_pool_size;
+
+/* Snapshot of the last queue size to seed the next queue.  We assume all bgIterators consume items
+ * at roughly the same rate. */
+static int last_item_count_target;
+
+// Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID)
+static long long bgIterator_timeproc_id;
+
+// Incremented on each new iteration, this is updated in dbEntry metadata whenever an entry is modified.
+static uint32_t bgIteration_epoch = 1;
+
+/* If true, the iterators' cur_cmd_may_replicate flag was determined in the last call to
+ * blockClientIfRequired.  Otherwise, we skipped over computing this flag (maybe because it was a
+ * READ command).
+ * If this is true, AND we are in the context of executing a command inside of call(), then we
+ * should respect the setting of cur_cmd_may_replicate. */
+static bool iteratorReplicationFlagsWereUpdated;
+
+
+/* BgIteration debug captures BgIteration activity to a large sds buffer.  When an iterator is
+ * completed, the entire buffer is written to a file in the current working directory.  Note that
+ * memory must be available for the ENTIRE debug in memory.  This isn't captured incrementally to
+ * a file as the file I/O is more likely to affect timing.
+ *
+ * Future implementation: the current design is most useful for a single iterator.  When items are
+ * queued to an iterator, the iterator name is not recorded (to save space).
+ *
+ * Developer note: using a CONST value here allows the compiler to completely remove all of the
+ * debugging code at compile time.  There is no run-time performance overhead when set to FALSE.
+ * This is essentially like an IFDEF, however, it's better as it forces the compiler to validate
+ * syntax. */
+static const bool BGITERATION_DEBUG = false; // DO NOT SUBMIT WITH THIS SYMBOL SET TO TRUE!
+static sds debugBuffer;
+
+
+/* =============================================================================================
+ *                        Full Scan Iterator
+ * =============================================================================================
+ * The full scan iterator performs the actual iteration over the Valkey keyset.  The iterator is
+ * only used from within the Valkey main thread.  Iteration proceeds one DB at a time, based on
+ * the DB ordering at the time of iterator creation.  Each time the iterator returns items, all
+ * of the dictionary entries from a single hash bucket are returned. */
+
+struct fullScanIterator {
+    genericIterator callbacks; // (must be first item)
+
+    /* Array of mapping from original DB ID (at the time of iteration start) to that DB's current
+     * index.  So, if the DB which was DB-0 is now at index 6, orig_to_cur_db[0]==6. */
+    int *orig_to_cur_db;
+
+    /* The reverse of the above array.  This maps a current DB index to its original index (at the
+     * time of iteration start). */
+    int *cur_to_orig_db;
+
+    /* This is the DB we are currently iterating over.  This is relative to the ORIGINAL DB
+     * ordering, at the time of iterator creation.  Iteration proceeds from 0..N based on the
+     * original ordering. */
+    int iter_db;
+
+    // Iterator for the DB orig_to_cur_db[iter_db]
+    kvstore *kvs;     // keep track of kvs associated with iter_db (NULL when no DB is in progress)
+    int kvs_didx;     // hashtable index within the kvstore
+    size_t ht_cursor; // cursor for scanning hashtable
+};
+
+static void fullScanIteratorRelease(genericIterator *genIt) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    if (it->kvs) resumeRehashForKvsHashtable(it->kvs, it->kvs_didx); // undo the pause held mid-scan
+    zfree(it->orig_to_cur_db);
+    zfree(it->cur_to_orig_db);
+    zfree(it); // the iterator struct itself
+}
+
+/* Scan callback used by fullScanIteratorGetEntries to collect entries into a fifo (privdata). */
+static void fullScanIteratorScanCallback(void *privdata, void *entry) {
+    fifo *dbEntryFifo = (fifo *)privdata;
+    dbEntry *de = (dbEntry *)entry;
+    if (ignoreKeyForSave(objectGetKey(de))) return; // slot migration: keys being purged
+    fifoPush(dbEntryFifo, de);
+}
+
+static fifo *fullScanIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    if (it->iter_db >= server.dbnum) return NULL; // Finished scanning
+    // Returns a non-empty fifo of dbEntry and sets both DB ids, or NULL once iteration is complete.
+    fifo *dbEntryFifo = fifoCreate();
+    while (fifoLength(dbEntryFifo) == 0) { // keep stepping until a scan step yields an entry
+        while (it->kvs == NULL) {          // no DB in progress: advance to the next original DB
+            if (++it->iter_db >= server.dbnum) {
+                fifoRelease(dbEntryFifo);
+                return NULL; // Iteration complete
+            }
+            serverDb *db = server.db[it->orig_to_cur_db[it->iter_db]]; // current location of that DB
+            if (db != NULL) {
+                it->kvs = db->keys;
+                it->kvs_didx = kvstoreGetFirstNonEmptyHashtableIndex(it->kvs);
+                it->ht_cursor = 0;
+                if (it->kvs_didx == KVSTORE_INDEX_NOT_FOUND) it->kvs = NULL; // empty DB: skip it
+                if (it->kvs != NULL) pauseRehashForKvsHashtable(it->kvs, it->kvs_didx);
+            }
+        }
+
+        hashtable *ht = kvstoreGetHashtable(it->kvs, it->kvs_didx);
+        if (ht) {
+            it->ht_cursor = hashtableScan(ht, it->ht_cursor, fullScanIteratorScanCallback, dbEntryFifo);
+        } else {
+            it->ht_cursor = 0; // no hashtable at this index: treat it as fully scanned
+        }
+
+        if (it->ht_cursor == 0) {
+            /* Done with this hashtable, move to next. */
+            resumeRehashForKvsHashtable(it->kvs, it->kvs_didx);
+            it->kvs_didx = kvstoreGetNextNonEmptyHashtableIndex(it->kvs, it->kvs_didx);
+            if (it->kvs_didx == KVSTORE_INDEX_NOT_FOUND) it->kvs = NULL; // DB done; next call advances
+            if (it->kvs != NULL) pauseRehashForKvsHashtable(it->kvs, it->kvs_didx);
+        }
+    }
+    *orig_dbid = it->iter_db;
+    *cur_dbid = it->orig_to_cur_db[*orig_dbid];
+    return dbEntryFifo;
+}
+
+static void fullScanIteratorSwapDb(genericIterator *genIt, int db1, int db2) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    int temp = it->cur_to_orig_db[db1]; // swap the current->original entries of the two indexes
+    it->cur_to_orig_db[db1] = it->cur_to_orig_db[db2];
+    it->cur_to_orig_db[db2] = temp;
+    // Then update original->current for the two originals that moved, keeping the maps inverse.
+    it->orig_to_cur_db[it->cur_to_orig_db[db1]] = db1;
+    it->orig_to_cur_db[it->cur_to_orig_db[db2]] = db2;
+}
+
+static void fullScanIteratorFlushDb(genericIterator *genIt, int cur_dbid) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    int orig_db = (cur_dbid == -1) ? it->iter_db : it->cur_to_orig_db[cur_dbid]; // -1 always matches
+    if (orig_db == it->iter_db) {
+        // We are currently iterating on the DB that's being flushed.
+        it->kvs = NULL; // NOTE(review): no resumeRehashForKvsHashtable() here; confirm the flush makes it moot
+        // Iteration will continue with the next DB.
+    }
+}
+
+static bool fullScanIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    int orig_dbid = it->cur_to_orig_db[cur_dbid];
+    // Position is compared coarse-to-fine: original DB, then hashtable index, then scan cursor.
+    if (orig_dbid < it->iter_db) return true;  // Entire DB has already been processed
+    if (orig_dbid > it->iter_db) return false; // Haven't started this DB yet
+    // Now, orig_dbid == it->iter_db
+
+    if (it->kvs == NULL) return true; // just finished this DB
+
+    /* We're in the middle of processing a DB.  In cluster-mode, the DB is divided into 1 hashtable
+     * per slot.  In cluster-mode-disabled, we treat all keys as in slot 0. */
+    int keySlot = server.cluster_enabled ? getKeySlot((sds)key) : 0;
+    if (keySlot < it->kvs_didx) return true;  // that hashtable index is already behind the scan
+    if (keySlot > it->kvs_didx) return false; // that hashtable index has not been reached yet
+
+    // At this point, we're down to a specific hashtable.
+
+    hashtable *ht = kvstoreGetHashtable(it->kvs, keySlot);
+    if (hashtableScanHasPassedKey(ht, key, it->ht_cursor)) return true;
+
+    if (ignoreKeyForSave(key)) return true; // if slot being purged, pretend we have passed it
+    return false;
+}
+
+static int fullScanIteratorOriginalDb(genericIterator *genIt, int cur_dbid) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    return it->cur_to_orig_db[cur_dbid]; // original index (at iteration start) of the current DB
+}
+
+static bool fullScanIteratorIsKeyInScope(genericIterator *genIt, const_sds key) {
+    UNUSED(genIt);
+    UNUSED(key);
+    return true; // All keys are in scope of a full scan, so neither argument is consulted
+}
+
+static genericIterator *fullScanIteratorCreate(void) {
+    struct fullScanIterator *it = zmalloc(sizeof(struct fullScanIterator));
+    it->orig_to_cur_db = zmalloc(sizeof(int) * server.dbnum);
+    it->cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum);
+    for (int i = 0; i < server.dbnum; i++) { // both maps start as the identity mapping
+        it->orig_to_cur_db[i] = i;
+        it->cur_to_orig_db[i] = i;
+    }
+    it->iter_db = -1; // getEntries pre-increments iter_db, so scanning starts at DB 0
+    it->kvs = NULL;   // kvs_didx / ht_cursor stay unset here; getEntries assigns them with kvs
+
+    it->callbacks.release = fullScanIteratorRelease;
+    it->callbacks.getEntries = fullScanIteratorGetEntries;
+    it->callbacks.swapDb = fullScanIteratorSwapDb;
+    it->callbacks.flushDb = fullScanIteratorFlushDb;
+    it->callbacks.hasPassedItem = fullScanIteratorHasPassedItem;
+    it->callbacks.originalDb = fullScanIteratorOriginalDb;
+    it->callbacks.isKeyInScope = fullScanIteratorIsKeyInScope;
+
+    return (genericIterator *)it; // valid cast: callbacks is the first member of the struct
+}
+
+
+/* =============================================================================================
+ *                        Cluster Slot Iterator
+ * =============================================================================================
+ * The cluster slot iterator performs iteration over one cluster slot of the Valkey keyset.  The
+ * iterator is only used from within the Valkey main thread. */
+struct clusterSlotIterator {
+    genericIterator callbacks; // (must be first item)
+};
+
+static void clusterSlotIteratorRelease(genericIterator *genIt) { // stub: asserts if called
+    UNUSED(genIt);
+    serverAssert(false); // Not yet implemented
+}
+
+static fifo *clusterSlotIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) {
+    UNUSED(genIt);
+    UNUSED(orig_dbid);
+    UNUSED(cur_dbid);
+    serverAssert(false); // Not yet implemented (stub: no value is returned, the assert always fires)
+}
+
+static void clusterSlotIteratorSwapDb(genericIterator *genIt, int db1, int db2) {
+    UNUSED(genIt);
+    UNUSED(db1);
+    UNUSED(db2);
+    serverAssert(false); // swap not valid in cluster mode, so this callback must never be reached
+}
+
+static void clusterSlotIteratorFlushDb(genericIterator *genIt, int cur_dbid) { // stub: asserts if called
+    UNUSED(genIt);
+    UNUSED(cur_dbid);
+    serverAssert(false); // Not yet implemented
+}
+
+static bool clusterSlotIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) {
+    UNUSED(genIt);
+    UNUSED(key);
+    UNUSED(cur_dbid);
+    serverAssert(false); // Not yet implemented (stub: no value is returned, the assert always fires)
+}
+
+static int clusterSlotIteratorOriginalDb(genericIterator *genIt, int cur_dbid) {
+    // Identity mapping: the current db id is returned unchanged as the original db id.
+    UNUSED(genIt);
+    return cur_dbid; // swap not supported in cluster mode
+}
+
+/* When checking if a command is in scope for this iterator, all of its keys should be either in
+ * scope or not. With cluster mode enabled a command cannot reference keys from different slots,
+ * so this assumption will always be true.  (The callback itself is still a stub.) */
+static bool clusterSlotIteratorIsKeyInScope(genericIterator *genIt, const_sds key) {
+    UNUSED(genIt);
+    UNUSED(key);
+    serverAssert(false); // Not yet implemented - no bool is returned; presumably the assert never returns
+}
+
+static genericIterator *clusterSlotIteratorCreate(const int *slots, size_t slots_count) {
+    // The requested slots are not consumed yet; they are only marked unused for now.
+    UNUSED(slots);
+    UNUSED(slots_count);
+    struct clusterSlotIterator *slotIter = zmalloc(sizeof(*slotIter));
+    genericIterator *vtable = &slotIter->callbacks; // callback table embedded as the first member
+    vtable->release = clusterSlotIteratorRelease;
+    vtable->getEntries = clusterSlotIteratorGetEntries;
+    vtable->swapDb = clusterSlotIteratorSwapDb;
+    vtable->flushDb = clusterSlotIteratorFlushDb;
+    vtable->hasPassedItem = clusterSlotIteratorHasPassedItem;
+    vtable->originalDb = clusterSlotIteratorOriginalDb;
+    vtable->isKeyInScope = clusterSlotIteratorIsKeyInScope;
+    serverAssert(false); // Not yet implemented
+    return (genericIterator *)slotIter;
+}
+
+
+/* =============================================================================================
+ *                        General iteration support (across all iterators)
+ * ============================================================================================= */
+
+/* While an item is potentially in use by a background thread, the main thread must not rehash
+ * the hashtable behind its value.  Returns true if rehashing was paused, false when the encoding
+ * has no hashtable to pause. */
+static bool pauseRehashing(dbEntry *de) {
+    if (de->encoding == OBJ_ENCODING_HASHTABLE) {
+        // SET or HASH: the value is the hashtable itself
+        hashtablePauseRehashing((hashtable *)objectGetVal(de));
+        return true;
+    }
+    if (de->encoding == OBJ_ENCODING_SKIPLIST) {
+        // SORTED SET: the hashtable is reached through the zset
+        zset *sortedSet = objectGetVal(de);
+        hashtablePauseRehashing(sortedSet->ht);
+        return true;
+    }
+
+    return false;
+}
+
+// Counterpart of pauseRehashing: resumes rehashing for the hashtable-backed encodings.
+static void resumeRehashing(dbEntry *de) {
+    if (de->encoding == OBJ_ENCODING_HASHTABLE) {
+        // SET or HASH: the value is the hashtable itself
+        hashtableResumeRehashing((hashtable *)objectGetVal(de));
+        return;
+    }
+    if (de->encoding == OBJ_ENCODING_SKIPLIST) {
+        // SORTED SET: the hashtable is reached through the zset
+        zset *sortedSet = objectGetVal(de);
+        hashtableResumeRehashing(sortedSet->ht);
+        return;
+    }
+
+    // Any other encoding has no hashtable handled here, so there is nothing to resume.
+}
+
+// Track `de` as in-use (entries in inUseEntries should not be modified); first use takes a ref.
+static void incrementEntryInuse(dbEntry *de) {
+    dictEntry *existing = NULL;
+    dictEntry *added = dictAddRaw(inUseEntries, de, &existing);
+    if (added == NULL) { // already tracked: just bump the use count
+        dictSetSignedIntegerVal(existing, dictGetSignedIntegerVal(existing) + 1);
+        return;
+    }
+    incrRefCount(de);
+    dictSetSignedIntegerVal(added, 1);
+}
+
+// Drop one use of `de`; the last use untracks it and releases the reference taken on first use.
+static void decrementEntryInuse(dbEntry *de) {
+    dictEntry *entry = dictFind(inUseEntries, de);
+    serverAssert(entry != NULL); // unbalanced decrement: assert instead of reading through NULL
+    int64_t remaining = dictGetSignedIntegerVal(entry) - 1;
+    serverAssert(remaining >= 0);
+    dictSetSignedIntegerVal(entry, remaining);
+    if (remaining > 0) return; // the entry is still in use by another item
+    dictDelete(inUseEntries, de);
+    decrRefCount(de);
+}
+
+static bool isEntryInuseBySingleIterator(dbEntry *de) {
+    dictEntry *entry = dictFind(inUseEntries, de); // NULL when `de` is not tracked at all
+    return entry != NULL && dictGetSignedIntegerVal(entry) == 1; // untracked => false, no NULL read
+}
+
+static bool isEntryInuseByAnyIterator(dbEntry *de) {
+    return (dictFind(inUseEntries, de) != NULL); // present in inUseEntries => use count is at least 1
+}
+
+
+static ssize_t computeStringDbEntrySize(dbEntry *de) {
+    // Approximate size of a string entry: key length plus value length.
+    size_t keyBytes = sdslen(objectGetKey(de));
+    size_t valueBytes = stringObjectLen(de);
+    return keyBytes + valueBytes; // ignore the rest of the overhead, it's minor & transient
+}
+
+
+static dbEntry *tryCloneDbEntry(dbEntry *de) {
+    // No clone unless the pool can still absorb one maximum-size item.
+    if (bgiteration_current_clone_memory_pool_size + bgiter_max_clone_item_bytes >
+        bgiter_max_clone_pool_bytes) return NULL;
+
+    /* Future optimization: Incorporate small ziplists, sorted sets, etc.
+     * OBJ_ENCODING_INT is omitted only because there isn't a good API for cloning it yet. */
+    bool isCloneableString = (de->type == OBJ_STRING && de->encoding != OBJ_ENCODING_INT);
+    if (!isCloneableString) return NULL;
+
+    ssize_t entryBytes = computeStringDbEntrySize(de);
+    if (entryBytes > bgiter_max_clone_item_bytes) return NULL;
+
+    // Charge the pool first, then build the copy from the value, key and expire of `de`.
+    bgiteration_current_clone_memory_pool_size += entryBytes;
+    sds value = objectGetVal(de);
+    dbEntry *copy = createStringObjectWithKeyAndExpire((char *)value, sdslen(value),
+                                                       objectGetKey(de), objectGetExpire(de));
+    // The copy carries the same iterator epoch as the entry it was made from.
+    bgIterationEntryMetadata *copyMeta = (bgIterationEntryMetadata *)objectGetMetadata(copy);
+    copyMeta->iterator_epoch = ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch;
+    return copy;
+}
+
+static void freeClonedDictEntry(dbEntry *clonedEntry) {
+    serverAssert(clonedEntry->type == OBJ_STRING);
+    // Give back the bytes that were charged to the clone memory pool for this entry.
+    bgiteration_current_clone_memory_pool_size -= computeStringDbEntrySize(clonedEntry);
+    // Drop the reference on the clone.
+    decrRefCount(clonedEntry);
+}
+
+
+static bgIteratorItem *makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) {
+    if (!isCloned) incrementEntryInuse(de); // only non-cloned entries are tracked as in-use
+    // Take an item from the free list (or allocate one) and fill in the DBENTRY fields.
+    bgIteratorItem *item = itemFreeList_getElementOrAllocate();
+    item->type = BGITERATOR_ITEM_DBENTRY;
+    item->dbid = dbid;
+    item->u.dbe.de = de;
+    item->u.dbe.is_cloned = isCloned;
+    item->u.dbe.is_rehashing_paused = pauseRehashing(de); // remembered so it can be resumed on return
+
+    return item;
+}
+
+static robj **cloneRobjArray(int argc, robj **argv) {
+    robj **copy = zmalloc(sizeof(*copy) * argc); // shallow copy: the objects themselves are shared
+    for (int idx = 0; idx < argc; idx++) {
+        incrRefCount(argv[idx]); // the copy holds its own reference on every element
+        copy[idx] = argv[idx];
+    }
+    return copy;
+}
+
+
+static void freeRobjArray(int argc, robj **argv) {
+    for (int i = 0; i < argc; i++) {
+        decrRefCount(argv[i]); // drop the reference held through this array slot
+    }
+    zfree(argv); // then release the array itself
+}
+
+
+// Called by iterator thread to release an item: counts it as processed and queues it for return.
+static void returnCurrentItemToValkey(bgIterator *it) {
+    bgIteratorItem *item = it->current_item;
+    if (item == NULL) return; // no item is currently held
+    // Bump the per-type processed counter, then hand the item back through return_to_valkey.
+    switch (item->type) {
+    case BGITERATOR_ITEM_DBENTRY:
+        it->dbentries_processed++;
+        if (item->u.dbe.is_cloned) it->dbentry_clones_processed++;
+        mutexQueueAdd(it->return_to_valkey, item);
+        break;
+    case BGITERATOR_ITEM_REPLICATION:
+        it->replication_processed++;
+        mutexQueueAdd(it->return_to_valkey, item);
+        break;
+    case BGITERATOR_ITEM_SWAPDB:
+        it->swapdb_processed++;
+        mutexQueueAdd(it->return_to_valkey, item);
+        break;
+    case BGITERATOR_ITEM_FLUSHDB:
+        it->flushdb_processed++;
+        mutexQueueAdd(it->return_to_valkey, item);
+        break;
+
+    case BGITERATOR_ITEM_COMPLETE:
+    case BGITERATOR_ITEM_TERMINATED:
+        // These are static and just used to wake the iterator - they should never be returned.
+        serverAssert(false);
+        break;
+
+    default:
+        serverAssert(false); // unknown item type
+    }
+
+    /* Do this AFTER placing into return_to_valkey.  This is volatile and snooped when there is a
+     *  flushall event.  Don't want an item to be missed. */
+    it->current_item = NULL;
+}
+
+
+/* =============================================================================================
+ *                        Background Iterator (private)
+ * ============================================================================================= */
+
+static void bgIteratorRelease(bgIterator *it) {
+    // Preconditions: main thread, no item currently held, and both queues already drained.
+    serverAssert(onValkeyMainThread());
+    serverAssert(it->current_item == NULL);
+    serverAssert(mutexQueueLength(it->items_for_iterator) == 0);
+    serverAssert(mutexQueueLength(it->return_to_valkey) == 0);
+
+    // Unregister from the global lookups; assert on a missing list node instead of passing NULL on.
+    dictDelete(nameToIterator, it->name);
+    listNode *node = listSearchKey(allIterators, it);
+    serverAssert(node != NULL);
+    listDelNode(allIterators, node);
+
+    mutexQueueRelease(it->items_for_iterator);
+    it->items_for_iterator = NULL;
+    mutexQueueRelease(it->return_to_valkey);
+    it->return_to_valkey = NULL;
+    it->keyset_iter->release(it->keyset_iter);
+    it->keyset_iter = NULL;
+    hashtableRelease(it->early_iterate_entries);
+    it->early_iterate_entries = NULL;
+    sdsfree(it->name);
+    zfree(it);
+}
+
+
+static bool shouldFeedIteratorMore(bgIterator *it) {
+    // A completed or terminated iterator takes no more items.
+    if (it->completed || it->terminated) return false;
+    return mutexQueueLength(it->items_for_iterator) < it->item_count_target; // below target: feed
+}
+
+
+// Debugging routine: builds a new sds "(dbid)'key' : 'value'" (or " : type(N)" for non-strings).
+static sds createEntryString(int dbid, dbEntry *de) {
+    sds key = objectGetKey(de);
+
+    sds entrySds = sdsempty();
+    entrySds = sdscatprintf(entrySds, "(%d)'%s'", dbid, key);
+    if (de->type == OBJ_STRING) {
+        robj *o = getDecodedObject(de); // might be encoded as int
+        const int valuePrintLen = 20;   // int: the '*' precision of "%.*s" consumes an int argument
+        entrySds = sdscatprintf(entrySds, " : '%.*s'", valuePrintLen, (char *)objectGetVal(o));
+        if (sdslen((sds)objectGetVal(o)) > (size_t)valuePrintLen) entrySds = sdscat(entrySds, "...");
+        decrRefCount(o);
+    } else {
+        entrySds = sdscatprintf(entrySds, " : type(%d)", de->type);
+    }
+    return entrySds; // callers release the result with sdsfree()
+}
+
+// Tops up it->items_for_iterator from the keyset iterator, or queues the COMPLETE item when done.
+static void feedIterator(bgIterator *it, monotime end_time_us) {
+    // Queue sizing: if a backlog was still waiting on entry, shrink the target by half the backlog.
+    unsigned int initial_queue_len = mutexQueueLength(it->items_for_iterator);
+
+    if (initial_queue_len > 2 && it->item_count_target >= initial_queue_len) {
+        it->item_count_target -= initial_queue_len / 2;
+    }
+
+    // Now do some feeding
+    bool have_time = (getMonotonicUs() < end_time_us);
+    int timeCheckCounter = 0;
+    while (shouldFeedIteratorMore(it) && have_time) {
+        int orig_dbid, cur_dbid;
+        fifo *dbEntryFifo = it->keyset_iter->getEntries(it->keyset_iter, &orig_dbid, &cur_dbid);
+
+        if (dbEntryFifo == NULL) {
+            // Iteration of items is complete for this iterator
+            serverAssert(it->dbentries_queued >= it->dbentries_processed);
+            serverAssert(it->replication_queued >= it->replication_processed);
+            serverAssert(it->swapdb_queued >= it->swapdb_processed);
+            serverAssert(it->flushdb_queued >= it->flushdb_processed);
+            serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed);
+
+            // Snapshot queue size to seed next iterator when terminated
+            last_item_count_target = it->item_count_target;
+
+            if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) {
+                if (!it->client_is_active || (it->dbentries_queued > it->dbentries_processed)) {
+                    /* We are done feeding dict entries to the iterator, but before ending the
+                     * replication processing make sure that the iterator has become active (has
+                     * started reading) and make sure that all of the dict entries have been
+                     * processed by the client. */
+                    break;
+                }
+                if (it->repldone) {
+                    bool clientWantsMoreReplication = (!it->repldone(it->privdata));
+                    if (clientWantsMoreReplication) break;
+                }
+            }
+            bgIteratorItem *completionItem = itemFreeList_getElementOrAllocate();
+            *completionItem = (bgIteratorItem){.type = BGITERATOR_ITEM_COMPLETE};
+            if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) {
+                rdbSaveInfo rsi;
+                completionItem->dbid = (rdbPopulateSaveInfo(&rsi)) ? rsi.repl_stream_db : 0;
+                completionItem->u.master_repl_offset = server.primary_repl_offset;
+                if (BGITERATION_DEBUG) {
+                    debugBuffer = sdscat(debugBuffer, "REPLDONE FN\n");
+                }
+            }
+
+            if (BGITERATION_DEBUG) {
+                debugBuffer = sdscat(debugBuffer, "SENDING COMPLETE\n");
+            }
+
+            mutexQueueAdd(it->items_for_iterator, completionItem);
+            it->completed = true;
+            break;
+        }
+
+        int dbid = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) ? orig_dbid : cur_dbid;
+        // Filter the returned entries and batch the survivors into a single queue insertion.
+        fifo *itemsToAdd = fifoCreate();
+        while (fifoLength(dbEntryFifo) > 0) {
+            dbEntry *de;
+            fifoPop(dbEntryFifo, (void **)&de);
+
+            // Remove new/modified items during consistent iteration.
+            if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT &&
+                ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch > it->consistent_modification_id) {
+                continue;
+            }
+
+            // Remove any items which have been processed early
+            if (hashtableDelete(it->early_iterate_entries, de)) {
+                if (BGITERATION_DEBUG) {
+                    sds entryString = createEntryString(dbid, de);
+                    debugBuffer = sdscatprintf(debugBuffer, "SKIPPING ITEM(early iterate): %s\n", entryString);
+                    sdsfree(entryString);
+                }
+                continue;
+            }
+
+            // For items which are left, convert them from dbEntry to iteratorItem
+            if (BGITERATION_DEBUG) {
+                sds entryString = createEntryString(dbid, de);
+                debugBuffer = sdscatprintf(debugBuffer, "ITEM: %s\n", entryString);
+                sdsfree(entryString);
+            }
+
+            bgIteratorItem *item = makeDbEntryItem(de, dbid, false);
+            fifoPush(itemsToAdd, item);
+        }
+        fifoRelease(dbEntryFifo);
+
+        if (fifoLength(itemsToAdd) > 0) {
+            it->dbentries_queued += fifoLength(itemsToAdd);
+            mutexQueueAddMultiple(it->items_for_iterator, itemsToAdd);
+        }
+        fifoRelease(itemsToAdd);
+
+        // This is a predictably fast loop.  We don't need to check the time on every pass.
+        if (++timeCheckCounter % 32 == 0) {
+            have_time = (getMonotonicUs() < end_time_us);
+        }
+    }
+
+    // Queue sizing: the queue was empty on entry and time remains, so raise the target.
+    if (initial_queue_len == 0 && have_time) {
+        it->item_count_target += BGITER_QUEUE_INCREASE_INCR;
+    }
+}
+
+
+static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_dbid) {
+    // Record the entry as early-iterated; it must not have been recorded before.
+    bool wasAdded = hashtableAdd(it->early_iterate_entries, earlyEntry);
+    serverAssert(wasAdded);
+
+    bool isConsistent = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) != 0;
+    int dbid = cur_dbid;
+    if (isConsistent) dbid = it->keyset_iter->originalDb(it->keyset_iter, cur_dbid);
+
+    // Queue a clone when one can be made; otherwise queue the live entry.
+    dbEntry *cloneEntry = tryCloneDbEntry(earlyEntry);
+    bool isClonedEntry = (cloneEntry != NULL);
+    dbEntry *queuedEntry = isClonedEntry ? cloneEntry : earlyEntry;
+    bgIteratorItem *item = makeDbEntryItem(queuedEntry, dbid, isClonedEntry);
+
+    it->dbentries_queued++;
+    if (isClonedEntry) it->dbentry_clones_queued++;
+
+    /* On consistent iteration, SWAPDB events are not provided.  So there is no requirement to
+     * keep items in order or synchronized with SWAPDB.  In cluster mode, SWAPDB isn't supported. */
+    bool usePriority = isConsistent || server.cluster_enabled;
+    if (BGITERATION_DEBUG) {
+        sds entryString = createEntryString(dbid, item->u.dbe.de);
+        debugBuffer = sdscatprintf(debugBuffer, usePriority ? "EARLY_1: %s\n" : "EARLY: %s\n", entryString);
+        sdsfree(entryString);
+    }
+
+    if (usePriority) {
+        mutexQueuePushPriority(it->items_for_iterator, item);
+    } else {
+        mutexQueueAdd(it->items_for_iterator, item);
+    }
+    return !isClonedEntry; // Block if the entry will be used by the background thread
+}
+
+
+// This expedites a single key and doesn't attempt to avoid expediting through optimization.
+// Returns true (after recording oKey in waitingOnKeys) when the caller has to block on the key.
+static bool expediteSingleKeyWithoutOptimization(bgIterator *it,
+                                                 int dbid,
+                                                 robj *oKey,
+                                                 hashtable *waitingOnKeys) {
+    bool iterComplete = it->completed || it->terminated;
+
+    sds key = objectGetVal(oKey);
+    dbEntry *de = dbFind(server.db[dbid], key);
+    if (de == NULL) return false; // key not found: nothing to expedite or wait for
+
+    // "Already handled": iteration is over, the scan has passed the key, or it was expedited before.
+    bool alreadyHandled = iterComplete ||
+                          it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) ||
+                          hashtableFind(it->early_iterate_entries, de, NULL);
+
+    bool mustBlock;
+    if (alreadyHandled) {
+        mustBlock = isEntryInuseByAnyIterator(de);
+    } else {
+        mustBlock = addEarlyIterationKey(it, de, dbid);
+    }
+
+    // Remember which key the caller is waiting on.
+    if (mustBlock) hashtableAdd(waitingOnKeys, oKey);
+    return mustBlock;
+}
+
+
+// MOVE/COPY are unfortunate special commands.  They work on 2 DBs at once.
+static const int MOVE_COMMAND_DBID_ARG_INDEX = 2; // file-local: argv slot holding MOVE's target db
+static bool expediteKeysForMove(bgIterator *it,
+                                int dbid,
+                                int argc,
+                                robj **argv,
+                                hashtable *waitingOnKeys) {
+    if (argc <= MOVE_COMMAND_DBID_ARG_INDEX) return false;
+    // Without a usable destination db id there is nothing to expedite.
+    int destDbid;
+    if (!getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &destDbid)) return false;
+
+    bool mustBlock = false;
+    robj *key = argv[1];
+
+    /* Not looking for special cases to optimize here.  Just try to expedite both src and dest
+     * keys.  Note that the dest key might exist (and need iteration) but could be expired and
+     * could be overwritten by MOVE.  In this case, a DEL would replicate due to the expiry.  So
+     * even if the target is expired, we need to replicate it before executing the command. */
+    if (expediteSingleKeyWithoutOptimization(it, dbid, key, waitingOnKeys)) mustBlock = true;
+    if (expediteSingleKeyWithoutOptimization(it, destDbid, key, waitingOnKeys)) mustBlock = true;
+
+    it->cur_cmd_may_replicate = true;
+    return mustBlock; // true when at least one of the two lookups requires blocking
+}
+
+
+// MOVE/COPY are unfortunate special commands.  They work on 2 DBs at once.
+static bool expediteKeysForCopy(bgIterator *it,
+                                int dbid,
+                                int argc,
+                                robj **argv,
+                                hashtable *waitingOnKeys) {
+    int destDbid;
+    if (argc < 3) return false; // argv[1] (source) and argv[2] (destination) are both read below
+    if (!getTargetDbIdForCopyCommand(argc, argv, dbid, &destDbid)) return false;
+    bool mustBlock = false;
+    robj *srcKey = argv[1];
+    robj *destKey = argv[2];
+
+    /* Not trying to optimize COPY.  Just expedite source and destination (if it exists).  We
+     * don't really care if the value is overwritten or not (so no need to parse REPLACE option). */
+    if (expediteSingleKeyWithoutOptimization(it, dbid, srcKey, waitingOnKeys)) mustBlock = true;
+    if (expediteSingleKeyWithoutOptimization(it, destDbid, destKey, waitingOnKeys)) mustBlock = true;
+
+    it->cur_cmd_may_replicate = true;
+    return mustBlock; // true when at least one of the two lookups requires blocking
+}
+
+
+/* There are several cases where a client must be blocked on write operations.  (Clients never need
+ * to be blocked for read operations.)
+ *
+ * Note:  An Amazon extension to the Valkey command structure allows us to identify commands where
+ *        the first key is for write and the rest are for read.  This allows us to make the
+ *        following optimizations:
+ *   - for keys which are read only, there's no need to block if the key is in-use by an iterator
+ *   - without replication, there's no need to immediately queue read keys on a consistent iteration
+ *
+ * Iterator:  CONSISTENT = NO,  REPLICATION = NO
+ *   - Block if any write-key is in use by an iterator
+ *
+ * Iterator:  CONSISTENT = NO,  REPLICATION = YES
+ *   - Block if any write-key is in use by an iterator
+ *   - If ANY key has already been iterated (but some keys have not), then
+ *       - Block and immediately queue any key (read or write) that has not
+ *         already been iterated
+ *         Example:  SDIFFSTORE KEY_A KEY_B KEY_C
+ *           In this case, KEY_A is written, KEY_B and KEY_C are read.  If KEY_A has already been
+ *           iterated over, the replication stream will contain this command.  The receiver of this
+ *           replication will need KEY_B and KEY_C in order to process the replication stream.  So
+ *           these need to be iterated and the client blocked.
+ *
+ * Iterator:  CONSISTENT = YES, REPLICATION = NO
+ *   - Block if any write-key is in use by an iterator
+ *   - Block and immediately queue any WRITE-key that has not already been iterated
+ *
+ * Iterator:  CONSISTENT = YES, REPLICATION = YES
+ *   (Combination only valid in cluster mode - no SWAPDB possible)
+ *   - Block if any write-key is in use by an iterator
+ *   - Block and immediately queue any key (read or write) that has not already been iterated */
+static bool expediteKeysForWrite(bgIterator *it,
+                                 int dbid,
+                                 struct serverCommand *cmd,
+                                 int argc,
+                                 robj **argv,
+                                 keyReference *keyrefs,
+                                 int numKeys,
+                                 hashtable *waitingOnKeys) {
+    serverAssert(numKeys > 0);
+
+    bool mustBlock = false;
+
+    /* All keys of the command should either be in scope or not since in cluster mode enabled they
+     * should all be in the same slot. So we just check the first key. */
+    robj *firstKeyObj = argv[keyrefs[0].pos]; // named apart from the per-key locals in the loops below
+    sds firstKey = objectGetVal(firstKeyObj);
+    /* If it's not in the iteration scope for the current iterator, then we don't need to do
+     * anything with this command. */
+    if (!it->keyset_iter->isKeyInScope(it->keyset_iter, firstKey)) return false;
+
+    /* Note: performance optimization for commands which only modify the first key.  If this flag
+     * is not available, we can safely remove this `if` statement. */
+    if ((cmd->flags & CMD_WRITE_FIRSTKEY_ONLY) &&
+        !(it->iteration_flags & BGITERATOR_FLAG_REPLICATION)) {
+        /* If this write command only modifies the 1st key, we don't need to expedite others
+         * unless replication enabled. */
+        numKeys = 1;
+    }
+
+    if (cmd->proc == moveCommand) {
+        // Special case for MOVE
+        return expediteKeysForMove(it, dbid, argc, argv, waitingOnKeys);
+    }
+
+    if (cmd->proc == copyCommand) {
+        // Similar special case for COPY
+        return expediteKeysForCopy(it, dbid, argc, argv, waitingOnKeys);
+    }
+
+    bool iterComplete = it->completed || it->terminated;
+
+    if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) {
+        // CONSISTENT = YES, REPLICATION = YES / NO
+        for (int i = 0; i < numKeys; i++) {
+            robj *oKey = argv[keyrefs[i].pos];
+            sds key = objectGetVal(oKey);
+            dbEntry *de = dbFind(server.db[dbid], key);
+            if (de == NULL) continue; // New key, no need to expedite
+            if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) &&
+                !hashtableFind(it->early_iterate_entries, de, NULL) &&
+                ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) {
+                if (addEarlyIterationKey(it, de, dbid)) {
+                    mustBlock = true;
+                    hashtableAdd(waitingOnKeys, oKey);
+                }
+            } else {
+                if (isEntryInuseByAnyIterator(de)) {
+                    mustBlock = true;
+                    hashtableAdd(waitingOnKeys, oKey);
+                }
+            }
+        }
+        it->cur_cmd_may_replicate = true; // Will replicate only if replication enabled
+    } else {
+        /* Identification of missing keys is only needed for non-consistent iteration.  This only
+         * needs to be collected once (on the 1st non-consistent iteration). */
+        bool collectMissing = (listLength(curCmdMissingKeys) == 0);
+
+        if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) {
+            // CONSISTENT = NO,  REPLICATION = YES
+            bool someIterated = false;
+            /* dict containing the keys that have not been iterated yet.
+             * Using a dict dedupes the keys in case the command contains duplicated keys. */
+            dict *notIteratedKeys = dictCreate(&dictEntryPtrDictType); // dict of dbEntry* -> robj*
+
+            for (int i = 0; i < numKeys; i++) {
+                robj *oKey = argv[keyrefs[i].pos];
+                sds key = objectGetVal(oKey);
+                dbEntry *de = dbFind(server.db[dbid], key);
+                if (de == NULL) {
+                    if (collectMissing) {
+                        incrRefCount(oKey);
+                        listAddNodeHead(curCmdMissingKeys, oKey);
+                    }
+                    continue;
+                }
+                if (iterComplete ||
+                    it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) ||
+                    hashtableFind(it->early_iterate_entries, de, NULL)) {
+                    someIterated = true;
+                } else {
+                    dictAdd(notIteratedKeys, de, oKey);
+                }
+                if (isEntryInuseByAnyIterator(de)) {
+                    mustBlock = true;
+                    hashtableAdd(waitingOnKeys, oKey);
+                }
+            }
+
+            /* Since missing keys are considered as already iterated, if there are any missing keys
+             * we must consider that some keys have been iterated, and make sure all other keys
+             * will be expedited if needed. */
+            if (listLength(curCmdMissingKeys) > 0) someIterated = true;
+
+            /* This command may be executing as part of a larger transaction.  If some parts of the
+             * transaction have already been identified to replicate, we must wait on all keys and
+             * replicate here as well.  (Take care not to set cur_cmd_may_replicate to false.) */
+            if (someIterated) {
+                if (server.in_exec) {
+                    /* We are now executing the commands in a multi-exec block.
+                     *
+                     * Regarding MULTI/EXEC:  Remember that this code is executed twice for commands
+                     * within a MULTI/EXEC block.  First, we parse all the commands when deciding
+                     * if the EXEC should be blocked.  Then, as each command is executed, it's
+                     * re-parsed so that we can maintain the early iterated list as the commands
+                     * execute.  In this second pass, as each command is executed, we can't change
+                     * the replication decision which was made earlier (when the EXEC was processed).
+                     * We don't want to get tricked (by a key being removed and recreated) into
+                     * starting to replicate in the middle of a MULTI/EXEC block. */
+                } else {
+                    it->cur_cmd_may_replicate = true;
+                }
+            }
+            if (it->cur_cmd_may_replicate) {
+                dictEntry *pending; // dict node, distinct from the dbEntry locals named `de` above
+                dictIterator *di = dictGetIterator(notIteratedKeys);
+                while ((pending = dictNext(di)) != NULL) {
+                    dbEntry *notIteratedEntry = dictGetKey(pending);
+                    robj *oKey = dictGetVal(pending);
+
+                    if (addEarlyIterationKey(it, notIteratedEntry, dbid)) {
+                        mustBlock = true;
+                        hashtableAdd(waitingOnKeys, oKey);
+                    }
+                }
+                dictReleaseIterator(di);
+            }
+            dictRelease(notIteratedKeys);
+        } else {
+            // CONSISTENT = NO,  REPLICATION = NO
+            for (int i = 0; i < numKeys; i++) {
+                robj *oKey = argv[keyrefs[i].pos];
+                sds key = objectGetVal(oKey);
+                dbEntry *de = dbFind(server.db[dbid], key);
+                if (de == NULL) {
+                    if (collectMissing) {
+                        incrRefCount(oKey);
+                        listAddNodeHead(curCmdMissingKeys, oKey);
+                    }
+                    continue;
+                }
+                if (isEntryInuseByAnyIterator(de)) {
+                    mustBlock = true;
+                    hashtableAdd(waitingOnKeys, oKey);
+                }
+            }
+        }
+    }
+
+    return mustBlock;
+}
+
+
+/* Called when an iterator is terminated.  Pulls everything out of the items_for_iterator queue
+ * and returns the items to Valkey via return_to_valkey (before they hit the iterator). */
+static void returnAllItemsToValkey(bgIterator *it) {
+    serverAssert(onValkeyMainThread());
+
+    fifo *poppedFifo = mutexQueuePopAll(it->items_for_iterator, false);
+    if (poppedFifo == NULL) return; // Nothing to return
+
+    // Back out the "queued" statistics and collect the items which must go back to Valkey...
+    fifo *itemsToReturn = fifoCreate();
+    while (fifoLength(poppedFifo) > 0) {
+        bgIteratorItem *item;
+        fifoPop(poppedFifo, (void **)&item);
+        switch (item->type) {
+        // back out the "queued" statistic
+        case BGITERATOR_ITEM_DBENTRY:
+            it->dbentries_queued--;
+            if (item->u.dbe.is_cloned) it->dbentry_clones_queued--;
+            break;
+        case BGITERATOR_ITEM_REPLICATION:
+            it->replication_queued--;
+            break;
+        case BGITERATOR_ITEM_SWAPDB:
+            it->swapdb_queued--;
+            break;
+        case BGITERATOR_ITEM_FLUSHDB:
+            it->flushdb_queued--;
+            break;
+
+        case BGITERATOR_ITEM_COMPLETE:
+            /* This can only happen if the completion item has been enqueued and
+             * the iterator is terminated before reaching the completion item. */
+            itemFreeList_returnItemBackToFreeList(item);
+            continue; // Skip pushing this onto itemsToReturn
+
+        case BGITERATOR_ITEM_TERMINATED:
+            /* This can only happen if there is a race when terminating between
+             *  the iteration client and main thread. */
+            itemFreeList_returnItemBackToFreeList(item);
+            continue; // Skip pushing this onto itemsToReturn
+
+        default:
+            serverAssert(false); // unknown item type
+        }
+
+        fifoPush(itemsToReturn, item);
+    }
+    fifoRelease(poppedFifo);
+
+    // Now hand the collected items to return_to_valkey all at once...
+    if (fifoLength(itemsToReturn) > 0) {
+        mutexQueueAddMultiple(it->return_to_valkey, itemsToReturn);
+    }
+    fifoRelease(itemsToReturn);
+}
+
+
+/* =============================================================================================
+ *                        Foreground support functions (private)
+ * ============================================================================================= */
+
+static size_t replicationItemSize(bgIteratorItem *item) {
+    serverAssert(item->type == BGITERATOR_ITEM_REPLICATION);
+    size_t totalBytes = sizeof(*item); // the item struct itself, plus every argument object below
+    for (int idx = 0; idx < item->u.repl.argc; idx++) {
+        totalBytes += objectComputeSize(NULL, item->u.repl.argv[idx], 0, 0);
+    }
+    return totalBytes;
+}
+
+/* Main-thread handler for one item handed back on an iterator's return_to_valkey queue.
+ * Releases whatever the item holds, then recycles the item itself.  The ITER_CLOSED extension
+ * item releases `iter` and runs its cleanup callback, so `iter` must not be used after that. */
+static void processReturnOfItemToValkey(bgIteratorItem *item, bgIterator *iter) {
+    serverAssert(onValkeyMainThread());
+    switch ((int)item->type) {
+    case BGITERATOR_ITEM_REPLICATION:
+        bufferedReplicationBytes -= replicationItemSize(item);
+        freeRobjArray(item->u.repl.argc, item->u.repl.argv);
+        break;
+
+    case BGITERATOR_ITEM_DBENTRY:
+        if (item->u.dbe.is_cloned) {
+            freeClonedDictEntry(item->u.dbe.de);
+        } else {
+            if (isEntryInuseBySingleIterator(item->u.dbe.de)) {
+                /* This blocking mechanism assumes a single DB so if the same key appears in
+                 * multiple DBs, commands might get unblocked only to get blocked again.  (This
+                 * would happen only rarely, and with minimal impact.) */
+                robj *key = createStringObjectFromSds(objectGetKey(item->u.dbe.de));
+                unblockClientsInUseOnKey(key);
+                decrRefCount(key);
+            }
+            // resumeRehashing must be called before decrementEntryInuse, since decrementEntryInuse can free
+            if (item->u.dbe.is_rehashing_paused) resumeRehashing(item->u.dbe.de);
+            decrementEntryInuse(item->u.dbe.de);
+        }
+        break;
+
+    case BGITERATOR_ITEM_SWAPDB:
+    case BGITERATOR_ITEM_FLUSHDB:
+        break;
+
+    case BGITERATOR_ITEMEXT_ITER_CLOSED: {
+        bgIterator *it = ((bgIteratorItemExtClose *)item)->iter;
+        serverAssert(it == iter);
+        if (it->terminated) {
+            /* Abnormal termination
+             * Normally the item is TERMINATED, but might be COMPLETE in race */
+            serverAssert(it->current_item->type == BGITERATOR_ITEM_TERMINATED ||
+                         it->current_item->type == BGITERATOR_ITEM_COMPLETE);
+            // Release any items stranded on the iterator after early termination
+            returnAllItemsToValkey(it);
+            receiveItemsBackFromOneIterator(it);
+        } else {
+            // Normal completion
+            serverAssert(it->current_item->type == BGITERATOR_ITEM_COMPLETE);
+        }
+        serverAssert(mutexQueueLength(it->items_for_iterator) == 0);
+        serverAssert(it->dbentries_queued == it->dbentries_processed);
+        serverAssert(it->replication_queued == it->replication_processed);
+        serverAssert(it->swapdb_queued == it->swapdb_processed);
+        serverAssert(it->flushdb_queued == it->flushdb_processed);
+        serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed);
+
+        listEmpty(curCmdMissingKeys); // Just in case any remain
+        itemFreeList_returnItemBackToFreeList(it->current_item);
+        it->current_item = NULL;
+
+        bool terminated = it->terminated;
+        void *privdata = it->privdata;
+        bgIteratorCleanupFunc cleanup = it->cleanup;
+        bgIteratorRelease(it); // Fully release the iterator before calling cleanup
+
+        if (BGITERATION_DEBUG) {
+            if (cleanup) debugBuffer = sdscatprintf(debugBuffer, "CLEANUP FN (%s)\n",
+                                                    (terminated) ? "terminated" : "success");
+            sds filename = sdscatprintf(sdsempty(), "bgiteration_debug.%d", getpid());
+            FILE *f = fopen(filename, "w");
+            sdsfree(filename);
+            if (f != NULL) { // The dump is best-effort: fopen() fails e.g. in an unwritable directory
+                fputs(debugBuffer, f);
+                fclose(f);
+            }
+            sdsfree(debugBuffer);
+            debugBuffer = sdsempty();
+        }
+
+        if (cleanup) cleanup(terminated, privdata);
+    } break;
+
+    default:
+        serverAssert(false); // Not expecting any other type of item!
+    }
+    // We don't allocate extension items from the pool so we manually free them
+    if ((int)item->type == BGITERATOR_ITEMEXT_ITER_CLOSED) {
+        zfree(item);
+    } else {
+        itemFreeList_returnItemBackToFreeList(item);
+    }
+}
+
+/* Processes a batch of returned items.  Three prefetch passes (each item, then the dbEntry of
+ * DBENTRY items, then that entry's key) run over the whole batch before any item is processed. */
+static void prepareAndProcessReturnedItems(int n, bgIteratorItem **items, bgIterator *iter) {
+    for (int k = 0; k < n; k++) valkey_prefetch(items[k]);
+    for (int k = 0; k < n; k++) {
+        if (items[k]->type == BGITERATOR_ITEM_DBENTRY) valkey_prefetch(items[k]->u.dbe.de);
+    }
+    for (int k = 0; k < n; k++) {
+        if (items[k]->type == BGITERATOR_ITEM_DBENTRY) valkey_prefetch(objectGetKey(items[k]->u.dbe.de));
+    }
+    for (int k = 0; k < n; k++) processReturnOfItemToValkey(items[k], iter);
+}
+
+#define PREFETCH_BATCH_SIZE 16
+
+/* Drains one iterator's return_to_valkey queue, processing the items in batches of at most
+ * PREFETCH_BATCH_SIZE.  Returns true when the pop yielded a fifo, false when it returned NULL. */
+static bool receiveItemsBackFromOneIterator(bgIterator *it) {
+    fifo *returned = mutexQueuePopAll(it->return_to_valkey, false);
+    if (returned == NULL) return false;
+
+    bgIteratorItem *batch[PREFETCH_BATCH_SIZE];
+    int filled = 0;
+    while (fifoLength(returned) > 0) {
+        fifoPop(returned, (void **)&batch[filled]);
+        filled++;
+        if (filled < PREFETCH_BATCH_SIZE) continue;
+        // Batch is full: process it and start refilling from slot 0
+        prepareAndProcessReturnedItems(filled, batch, it);
+        filled = 0;
+    }
+    // Flush the final, partially filled batch (if any)
+    if (filled > 0) prepareAndProcessReturnedItems(filled, batch, it);
+    fifoRelease(returned);
+    return true;
+}
+
+/* Drain the return_to_valkey queue of every iterator (main thread only).
+ * With `blocking` set, keep sweeping (sleeping 100us between sweeps) until some queue yielded items. */
+static void receiveItemsBackFromIterators(bool blocking) {
+    serverAssert(onValkeyMainThread());
+    for (;;) {
+        bool gotItems = false;
+        listIter iter;
+        listNode *ln;
+        listRewind(allIterators, &iter);
+        while ((ln = listNext(&iter)) != NULL) {
+            if (receiveItemsBackFromOneIterator(listNodeValue(ln))) gotItems = true;
+        }
+        if (!blocking || gotItems) return;
+        usleep(100); // Nothing came back yet - short sleep before the next sweep
+    }
+}
+
+
+static long long bgIteration_feedIterators_task(struct aeEventLoop *eventLoop,
+                                                long long id,
+                                                void *clientData) {
+    UNUSED(eventLoop);
+    UNUSED(id);
+    UNUSED(clientData);
+    serverAssert(onValkeyMainThread());
+    // One feed cycle: collect returned items, then feed active iterators until the time budget is spent.
+    static monotime lastFeedEndTime; // STATIC: persists across calls so that timer starvation can be detected
+    monotime startTime = getMonotonicUs();
+
+    if (!bgIteration_iterationActive()) {
+        // No more iterators exist.  Self-check, and terminate the "feed" task.
+        serverAssert(dictSize(nameToIterator) == 0);
+        serverAssert(dictSize(inUseEntries) == 0);
+        serverAssert(bufferedReplicationBytes == 0);
+
+        // Shrink dict back to zero (doesn't normally shrink)
+        dictRelease(inUseEntries);
+        inUseEntries = dictCreate(&dictEntryPtrDictType);
+
+        itemFreeList_release();
+
+        bgIterator_timeproc_id = AE_DELETED_EVENT_ID;
+        lastFeedEndTime = 0; // Forget the last cycle so a later restart isn't measured as starvation
+        return AE_NOMORE;
+    }
+
+    long dutyTimeUs = BGITER_CYCLE_BUDGET_MS * 1000; // Base time budget for this cycle, in microseconds
+    if (lastFeedEndTime > 0) {
+        /* If the timer was delayed, compute the proportional time we should have had, and increase
+         *  the duty cycle to compensate (up to a limit). */
+        long starvationUs = (startTime - lastFeedEndTime) - BGITER_CYCLE_DELAY_MS * 1000;
+        if (starvationUs > 0) {
+            long starvationCompensationUs = starvationUs * BGITER_CYCLE_BUDGET_MS /
+                                            (BGITER_CYCLE_BUDGET_MS + BGITER_CYCLE_DELAY_MS);
+            dutyTimeUs += starvationCompensationUs;
+            dutyTimeUs = MIN(dutyTimeUs, BGITER_CYCLE_BUDGET_MAX_MS * 1000);
+        }
+    }
+    monotime endTime = startTime + dutyTimeUs;
+
+    // Run this part regardless of time limit...
+    receiveItemsBackFromIterators(false);
+
+    // Feeding iterators (below) respects endTime.  The stuff above always runs to completion.
+
+    listIter li;
+    listNode *node;
+    listRewind(allIterators, &li);
+    while ((node = listNext(&li)) != NULL && getMonotonicUs() < endTime) {
+        bgIterator *it = listNodeValue(node);
+        if (it->completed || it->terminated) continue;
+        feedIterator(it, endTime);
+    }
+
+    lastFeedEndTime = getMonotonicUs();
+    return BGITER_CYCLE_DELAY_MS; // Presumably the delay before the next invocation - confirm against the ae time-event contract
+}
+
+
+// Unit-test hook (non-static but not part of the API): runs one feed cycle without the event loop.
+void bgIteration_feedIterators(void) {
+    listIter iter;
+    listRewind(allIterators, &iter);
+
+    /* Pin every iterator's item_count_target to 1 first, so that a test sees a minimal,
+     * deterministic amount of feeding per call instead of a non-deterministic amount. */
+    for (listNode *ln = listNext(&iter); ln != NULL; ln = listNext(&iter)) {
+        bgIterator *cur = listNodeValue(ln);
+        cur->item_count_target = 1;
+    }
+
+    // Same entry point the timer normally invokes; it ignores its three arguments.
+    bgIteration_feedIterators_task(NULL, 0, NULL);
+}
+
+
+static void resetReplicationFlagForIterators(client *c) {
+    /* Re-initializes cur_cmd_may_replicate on every iterator before a command (or atomic unit) runs.
+     * Whether a command needs to be replicated depends on the status and flags of each iterator.
+     * When replication is needed it must cover an entire atomic unit; we can't replicate only part
+     * of a script or multi/exec.
+     * This function is the only place where the replication flag is cleared. */
+
+    if (c->flag.multi || c->flag.script) {
+        /* REGARDING MULTI/EXEC
+         * --------------------
+         * When processing a MULTI/EXEC, blockClientIfRequired is called first for the MULTI.  Then,
+         * all of the commands are queued up in server.c:processCommand().  It's only when EXEC is
+         * encountered, that server.c:call() is fired to begin execution.
+         *
+         * AFTER the EXEC is processed by call(), then each of the commands in the MULTI/EXEC block
+         * will be processed through call().
+         *
+         * If write commands are present, MULTI & EXEC will be passed to the replication stream
+         * before/after the transaction commands.  Note that MULTI & EXEC are not actually
+         * "executed" at the time when their replication is passed to the replication stream.
+         *
+         * Example:  MULTI; SET A B; EXEC
+         *  1. blockClientIfRequired() called for MULTI.  MULTI flag IS NOT set.  (Won't block.)
+         *  2. blockClientIfRequired() called for EXEC.  MULTI flag IS set.  (Might block.)
+         *  3. blockClientIfRequired() called for SET.  MULTI flag IS set.  (Won't block.)
+         *  4. handleCommandReplication() is called for MULTI.
+         *  5. handleCommandReplication() is called for SET.
+         *  6. handleCommandReplication() is called for EXEC.
+         *
+         * SO - if the MULTI flag is set, we DON'T clear the flag.  It should only be cleared at the
+         * start of the transaction, when MULTI is received - and the flag isn't set yet. */
+
+        /* REGARDING SCRIPTS
+         * -----------------
+         * When processing a script, blockClientIfRequired is called first for the EVAL/EVALSHA/FCALL.
+         * Then, all of the commands are processed using a special script client.  The script
+         * client has the CLIENT_SCRIPT flag set.  For scripts, the replication flag is set when
+         * processing the EVAL/EVALSHA/FCALL and should not be cleared when executing individual
+         * commands in the script. */
+
+        /* If it's the EXEC command, we fall through and clear the flag below.  But for all other
+         *  commands within the transaction, we don't clear the flag. */
+        if (c->cmd->proc != execCommand) return;
+    }
+
+    /* For most commands, the replication flag is cleared and we determine if replication is needed
+     * based on the keys being used and their state in each iterator. If a modified key hasn't been
+     * processed yet, there's no need to expedite the key or send the replication.  The key will be
+     * sent later, when reached by the iterator.
+     *
+     * However, for scripts, it is not possible to perform this optimization.  There is no way to
+     * know if an undeclared key might be modified.  Since the entire script needs to be replicated
+     * (or not replicated) atomically, we can't take the chance that an undeclared key might be
+     * hit which requires replication. */
+    bool isScript = isScriptCallWriteCmd(c->cmd);
+
+    sds firstScriptKey = NULL;
+    if (isScript) {
+        /* If it's a script, we will normally replicate.  But if the keys are out of scope for the
+         * iteration, we shouldn't.  The use-case for this is with slot iteration, when the script
+         * is acting on keys from a different slot.  Here, we just check the first declared key, and
+         * if it's out of scope for the iteration, we won't replicate it.  This might cause issues
+         * for cross-slot scripts (anti-pattern), but the alternative is replicating all scripts,
+         * regardless of slot. */
+        getKeysResult result;
+        initGetKeysResult(&result);
+        getKeysFromCommand(c->cmd, c->argv, c->argc, &result);
+        if (result.numkeys > 0) firstScriptKey = objectGetVal(c->argv[result.keys[0].pos]);
+        getKeysFreeResult(&result); // firstScriptKey was taken from the c->argv object, not from `result`
+    }
+
+    listIter li;
+    listNode *node;
+    listRewind(allIterators, &li);
+    while ((node = listNext(&li)) != NULL) {
+        bgIterator *it = listNodeValue(node);
+        if (it->completed || it->terminated) {
+            it->cur_cmd_may_replicate = false;
+        } else {
+            /* For normal commands, the flag is initialized to false (not to replicate).  For these
+             * commands, we decide later based on the actual commands.
+             *
+             * However, for scripts, we don't know what commands will be executed.  So IF it's a
+             * script, and the keys are in scope (on the right slot) we initialize the replication
+             * flag to true. */
+            it->cur_cmd_may_replicate = isScript && firstScriptKey &&
+                                        it->keyset_iter->isKeyInScope(it->keyset_iter, firstScriptKey);
+        }
+    }
+}
+
+
+/* Main-thread hook for SWAPDB (asserts non-cluster mode and an active iteration).  Each iterator
+ * that is neither completed nor terminated has its keyset iterator informed of the swap;
+ * non-consistent iterators are additionally sent a SWAPDB item for the background client. */
+static void handleSwapdb(int db1, int db2) {
+    serverAssert(onValkeyMainThread());
+    serverAssert(bgIteration_iterationActive());
+    serverAssert(!server.cluster_enabled);
+
+    listIter iter;
+    listRewind(allIterators, &iter);
+    for (listNode *ln = listNext(&iter); ln != NULL; ln = listNext(&iter)) {
+        bgIterator *cur = listNodeValue(ln);
+        if (cur->completed || cur->terminated) continue;
+
+        // Internal bookkeeping first...
+        cur->keyset_iter->swapDb(cur->keyset_iter, db1, db2);
+
+        // ...then the client notification, which consistent iterators don't get.
+        if (cur->iteration_flags & BGITERATOR_FLAG_CONSISTENT) continue;
+
+        if (BGITERATION_DEBUG) debugBuffer = sdscatprintf(debugBuffer, "SWAP: %d %d\n", db1, db2);
+
+        bgIteratorItem *swapItem = itemFreeList_getElementOrAllocate();
+        swapItem->type = BGITERATOR_ITEM_SWAPDB;
+        swapItem->dbid = db1;
+        swapItem->u.dbid2 = db2;
+        cur->swapdb_queued++;
+        mutexQueueAdd(cur->items_for_iterator, swapItem);
+    }
+}
+
+
+/* Drops `de` from every iterator's early_iterate_entries table when the entry is being released.
+ * Not strictly necessary, but it frees some memory and keeps those tables small. */
+static void removePtrFromEarlyIterate(dbEntry *de) {
+    listIter iter;
+    listRewind(allIterators, &iter);
+    for (listNode *ln = listNext(&iter); ln != NULL; ln = listNext(&iter)) {
+        bgIterator *cur = listNodeValue(ln);
+        // Delete unconditionally: the entry might not be present in this table.
+        hashtableDelete(cur->early_iterate_entries, de);
+    }
+}
+
+
+// True when database `dbid` exists and holds more than half of the keys across all databases.
+static bool isDbSignificant(int dbid) {
+    if (!server.db[dbid]) return false;
+    unsigned long long allKeys = 0;
+    for (int db = 0; db < server.dbnum; db++) allKeys += (server.db[db]) ? dbSize(server.db[db]) : 0;
+    return dbSize(server.db[dbid]) > allKeys / 2;
+}
+
+
+/* Main-thread hook invoked BEFORE the actual flush; dbid == -1 indicates FLUSHALL.
+ * Each iterator is either terminated or (small single-DB flush) sent a FLUSHDB item. */
+static void handleFlushdb(int dbid) {
+    bool should_abort_iterators = (dbid == -1 || isDbSignificant(dbid));
+
+    listIter li;
+    listNode *node;
+    listRewind(allIterators, &li);
+    while ((node = listNext(&li)) != NULL) {
+        bgIterator *it = listNodeValue(node);
+
+        // Let the low-level iterator know the DB is being flushed
+        it->keyset_iter->flushDb(it->keyset_iter, dbid);
+
+        if (should_abort_iterators || it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) {
+            if (!it->terminated) bgIteratorTerminate(it);
+        } else if (!it->completed && !it->terminated) {
+            /* In this (limited) case, we're only flushing a single DB that contains < half the
+             * keys.  We don't want to kill a full-sync replication.  We will just continue with
+             * iteration, knowing that a replication client will also receive the FLUSHDB on the
+             * replication stream.  There's no need to worry about the items themselves.  Since
+             * we've incremented the refcount, the items still in queue won't be physically deleted. */
+            /* A completed/terminated iterator gets no item (as in handleSwapdb): its reader is
+             * finishing, and close processing asserts an empty queue on normal completion. */
+            // Send a flushdb event to notify the client
+            if (BGITERATION_DEBUG) debugBuffer = sdscatprintf(debugBuffer, "FLUSH: %d\n", dbid);
+            bgIteratorItem *item = itemFreeList_getElementOrAllocate();
+            item->type = BGITERATOR_ITEM_FLUSHDB;
+            item->dbid = dbid;
+            it->flushdb_queued++;
+            mutexQueueAdd(it->items_for_iterator, item);
+        }
+    }
+    receiveItemsBackFromIterators(false); // Receive items back before flushing the items
+}
+
+
+// Runs expediteKeysForWrite() on EVERY iterator (no short-circuit); true if any one must block.
+static bool expediteKeysForWriteOnAllIterators(int dbid,
+                                               struct serverCommand *cmd,
+                                               int argc,
+                                               robj **argv,
+                                               keyReference *keyrefs,
+                                               int numKeys,
+                                               hashtable *waitingOnKeys) {
+    bool anyBlocked = false;
+
+    listIter iter;
+    listRewind(allIterators, &iter);
+    for (listNode *ln = listNext(&iter); ln != NULL; ln = listNext(&iter)) {
+        bool blocked = expediteKeysForWrite(listNodeValue(ln), dbid, cmd, argc, argv,
+                                            keyrefs, numKeys, waitingOnKeys);
+        anyBlocked = anyBlocked || blocked;
+    }
+
+    return anyBlocked;
+}
+
+
+// True if at least one iterator currently has cur_cmd_may_replicate set.
+static bool anIteratorWillReplicateForThisCommand(void) {
+    listIter iter;
+    listRewind(allIterators, &iter);
+    listNode *ln = listNext(&iter);
+    while (ln != NULL && !((bgIterator *)listNodeValue(ln))->cur_cmd_may_replicate) {
+        ln = listNext(&iter);
+    }
+    return ln != NULL;
+}
+
+
+static bool expediteKeysForMultiExec(client *c, hashtable *waitingOnKeys) {
+    serverAssert(c->cmd->proc == execCommand);
+
+    /* For MULTI/EXEC, Valkey buffers all of the commands until hitting the EXEC.
+     * At this point, the client holds all of the commands to be executed.  This function searches
+     * for all of the keys used by any of the buffered write commands.  In addition, if SWAPDB or
+     * SELECT is used, this tracks the DBIDs through various swap/select operations. */
+
+    /* There's a special concern for a NON-consistent iteration with replication.  If the keys are
+     * all "future" keys (which haven't been processed by the iterator yet), then we don't expedite
+     * the keys or replicate.  However, if some keys have already been processed, we need to
+     * expedite the remaining keys and replicate everything.
+     *
+     * When processing a single command, this is all handled.  But in this function, for MULTI/EXEC,
+     * we process 1 command at a time.  There's an issue if the first command modifies a "future"
+     * key, we don't know (without reading ahead) if a later command will modify a prior key.  This
+     * would require the future key to be expedited.
+     *
+     * This COULD be addressed by collecting all of the keys into a single structure and then
+     * analyzing them all at once.  However, this won't share code well with the single commands.
+     * Also, building this structure is a little complex/time-consuming as we need to track both
+     * key AND dictID.  One way to do this might be with a dict of dicts, where the first dict maps
+     * a dictID to a dict of keys.
+     *
+     * ALTERNATIVELY (and it's the simpler approach that's taken here) we can just check if the
+     * MULTI will be replicated.  If so, we re-process the MULTI, just in case there were commands
+     * prior to deciding that replication was required that might have missed expediting.  If so,
+     * these will be caught on the 2nd time around.
+     *
+     * Checking replication status before/after ensures that there can only be a single recursive
+     * call. */
+    bool initiallyAnIteratorWillReplicate = anIteratorWillReplicateForThisCommand();
+
+    bool mustBlock = false;
+    int *cur_to_orig_db = NULL; // Allocated on the first valid SWAPDB; maps a current DB index to the original DB it now refers to
+
+    int curDb = c->db->id; // Follows the selected DB as queued SELECT commands are encountered
+    for (int cmdNum = 0; cmdNum < c->mstate->count; cmdNum++) {
+        struct serverCommand *cmd = c->mstate->commands[cmdNum].cmd;
+        robj **argv = c->mstate->commands[cmdNum].argv;
+        int argc = c->mstate->commands[cmdNum].argc;
+
+        if (cmd->proc == swapdbCommand) {
+            int id1, id2;
+            if (getParamsForSwapdb(argc, argv, c, &id1, &id2)) {
+                if (cur_to_orig_db == NULL) {
+                    cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum);
+                    for (int i = 0; i < server.dbnum; i++) cur_to_orig_db[i] = i; // Start from the identity mapping
+                }
+                int temp = cur_to_orig_db[id1];
+                cur_to_orig_db[id1] = cur_to_orig_db[id2];
+                cur_to_orig_db[id2] = temp;
+            }
+            continue;
+        }
+
+        if (cmd->proc == selectCommand) {
+            int id;
+            if (getParamsForSelect(argc, argv, c, &id)) {
+                curDb = id;
+            }
+            continue;
+        }
+
+        if (!isWriteCmd(cmd)) continue;
+
+        getKeysResult result;
+        initGetKeysResult(&result);
+        int numkeys = getKeysFromCommand(cmd, argv, argc, &result);
+        keyReference *keyrefs = result.keys;
+        if (numkeys == 0) {
+            getKeysFreeResult(&result);
+            continue; // Write command with no keys - like FLUSHDB
+        }
+
+        if (expediteKeysForWriteOnAllIterators(
+                cur_to_orig_db ? cur_to_orig_db[curDb] : curDb,
+                cmd, argc, argv, keyrefs, numkeys, waitingOnKeys)) {
+            mustBlock = true;
+        }
+        getKeysFreeResult(&result);
+    }
+
+    zfree(cur_to_orig_db); // Still NULL when no SWAPDB was queued (assumes zfree(NULL) is a no-op - confirm)
+
+    if (!initiallyAnIteratorWillReplicate && anIteratorWillReplicateForThisCommand()) {
+        /* We've decided to replicate.  Re-process the MULTI/EXEC just once more to make sure that
+         * we didn't miss any keys at the beginning.  This can't continue to recurse because
+         * `initiallyAnIteratorWillReplicate` will be TRUE in the recursive call.  Note that the
+         * recursive call may add additional entries to `waitingOnKeys`. */
+        if (expediteKeysForMultiExec(c, waitingOnKeys)) mustBlock = true;
+    }
+
+    return mustBlock;
+}
+
+
+/* Allocates and registers a new background iterator (main thread only).  Starts the feed timer
+ * when bgIterator_timeproc_id shows it isn't running, and asserts if `name` is already in use.
+ * Anything other than a full scan is only accepted when cluster mode is enabled. */
+static bgIterator *bgIteratorCreate(const char *name,
+                                    bgIteratorConsistency consistency,
+                                    bgIteratorReplDoneFunc repldone,
+                                    bgIteratorCleanupFunc cleanup,
+                                    void *privdata,
+                                    bgIterationType iter_type,
+                                    genericIterator *keyset_iter) {
+    serverAssert(onValkeyMainThread());
+    serverAssert(server.cluster_enabled || iter_type == BGITERATION_TYPE_FULLSCAN);
+
+    // Translate the requested consistency level into iteration flags
+    int flags = 0; // Stays 0 for BGITERATOR_CONSISTENCY_NONE
+    if (consistency == BGITERATOR_CONSISTENCY_START) {
+        flags = BGITERATOR_FLAG_CONSISTENT;
+    } else if (consistency == BGITERATOR_CONSISTENCY_EVENTUAL) {
+        flags = BGITERATOR_FLAG_REPLICATION;
+    } else if (consistency != BGITERATOR_CONSISTENCY_NONE) {
+        serverAssert(false);
+    }
+    // Consistent, with replication - doesn't make sense.
+    serverAssert(!((flags & BGITERATOR_FLAG_CONSISTENT) && (flags & BGITERATOR_FLAG_REPLICATION)));
+
+    // The queue target starts at the last value used, floored to BGITER_QUEUE_INCREASE_INCR
+    if (last_item_count_target < BGITER_QUEUE_INCREASE_INCR) {
+        last_item_count_target = BGITER_QUEUE_INCREASE_INCR;
+    }
+
+    bgIterator *it = zmalloc(sizeof(*it));
+
+    // Identity and callbacks
+    it->name = sdsnew(name);
+    it->repldone = repldone;
+    it->cleanup = cleanup;
+    it->privdata = privdata;
+
+    // Queues in each direction, and iteration configuration
+    it->items_for_iterator = mutexQueueCreate();
+    it->return_to_valkey = mutexQueueCreate();
+    it->item_count_target = last_item_count_target;
+    it->iteration_flags = flags;
+    it->iteration_type = iter_type;
+    it->consistent_modification_id = bgIteration_epoch++;
+    it->keyset_iter = keyset_iter;
+    it->early_iterate_entries = hashtableCreate(&dbEntryPtrHashtableType);
+    hashtableExpand(it->early_iterate_entries, BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE);
+
+    // Lifecycle state: no item held, nothing started or finished yet
+    it->current_item = NULL;
+    it->client_is_active = it->completed = it->terminated = it->cur_cmd_may_replicate = false;
+
+    // Queued/processed statistics all start from zero
+    it->dbentries_queued = it->dbentries_processed = 0;
+    it->replication_queued = it->replication_processed = 0;
+    it->swapdb_queued = it->swapdb_processed = 0;
+    it->flushdb_queued = it->flushdb_processed = 0;
+    it->dbentry_clones_queued = it->dbentry_clones_processed = 0;
+
+    elapsedStart(&it->monotonic_start_time);
+    it->monotonic_item_start_time = 0;
+
+    // If iteration is not currently active, start the feeding task.  (Runs in main thread.)
+    if (bgIterator_timeproc_id <= 0) {
+        bgIterator_timeproc_id = aeCreateTimeEvent(server.el, 1, bgIteration_feedIterators_task, NULL, NULL);
+        serverAssert(bgIterator_timeproc_id != AE_ERR);
+    }
+
+    // Can't have 2 iterators with the same name!
+    if (dictAdd(nameToIterator, it->name, it) != DICT_OK) serverAssert(false);
+    listAddNodeTail(allIterators, it);
+
+    // Pre-size inUseEntries for the registered iterators at this iterator's queue target
+    dictExpand(inUseEntries, listLength(allIterators) * it->item_count_target);
+    return it;
+}
+
+
+/* =============================================================================================
+ *                        PUBLIC INTERFACE:  Iterator creation and use
+ * ============================================================================================= */
+
+// PUBLIC API - creates a BGITERATION_TYPE_FULLSCAN iterator on top of a fresh full-scan keyset iterator.
+bgIterator *bgIteratorCreateFullScanIter(const char *name,
+                                         bgIteratorConsistency consistency,
+                                         bgIteratorReplDoneFunc repldone,
+                                         bgIteratorCleanupFunc cleanup,
+                                         void *privdata) {
+    genericIterator *keyset = fullScanIteratorCreate();
+    return bgIteratorCreate(name, consistency, repldone, cleanup, privdata, BGITERATION_TYPE_FULLSCAN, keyset);
+}
+
+// PUBLIC API - creates a BGITERATION_TYPE_CLUSTERSLOT iterator restricted to the given `slots` array.
+bgIterator *bgIteratorCreateSlotsIter(const char *name,
+                                      bgIteratorConsistency consistency,
+                                      const int *slots,
+                                      int slots_count,
+                                      bgIteratorReplDoneFunc repldone,
+                                      bgIteratorCleanupFunc cleanup,
+                                      void *privdata) {
+    genericIterator *keyset = clusterSlotIteratorCreate(slots, slots_count);
+    return bgIteratorCreate(name, consistency, repldone, cleanup, privdata, BGITERATION_TYPE_CLUSTERSLOT, keyset);
+}
+
+// PUBLIC API - main-thread lookup of an iterator by name in the nameToIterator dict.
+bgIterator *bgIteratorFind(const char *name) {
+    serverAssert(onValkeyMainThread());
+
+    // The dict is populated with sds keys, so wrap the C string in a temporary sds
+    sds lookupKey = sdsnew(name);
+    bgIterator *found = dictFetchValue(nameToIterator, lookupKey);
+    sdsfree(lookupKey);
+    return found;
+}
+
+
+// PUBLIC API - returns the iterator's name (the `name` field itself, not a copy).
+const char *bgIteratorName(bgIterator *it) {
+    return it->name;
+}
+
+
+// PUBLIC API - copies the iterator's counters, queue length/target and timings into `status`.
+void bgIteratorGetStatus(bgIterator *it, bgIteratorStatus *status) {
+    // Queued vs. processed counters, one pair per item kind
+    status->dbentries_queued = it->dbentries_queued;
+    status->dbentries_processed = it->dbentries_processed;
+    status->replication_queued = it->replication_queued;
+    status->replication_processed = it->replication_processed;
+    status->swapdb_queued = it->swapdb_queued;
+    status->swapdb_processed = it->swapdb_processed;
+    status->flushdb_queued = it->flushdb_queued;
+    status->flushdb_processed = it->flushdb_processed;
+    status->dbentry_clones_queued = it->dbentry_clones_queued;
+    status->dbentry_clones_processed = it->dbentry_clones_processed;
+
+    status->queue_length = mutexQueueLength(it->items_for_iterator);
+    status->queue_length_target = it->item_count_target;
+    status->runtime_ms = elapsedMs(it->monotonic_start_time);
+
+    /* Read the item start time exactly once, so the test and the use see the same value.
+     * bgIteratorRead keeps it at 0 while it waits for the next item. */
+    monotime itemStart = it->monotonic_item_start_time;
+    status->current_item_ms = (itemStart != 0) ? elapsedMs(itemStart) : 0;
+}
+
+
+// PUBLIC API - main-thread request to stop an iteration early.  Queued items are reclaimed and a
+// TERMINATED item is sent to the reader; the 1 item that's being processed is not affected.
+void bgIteratorTerminate(bgIterator *it) {
+    serverAssert(onValkeyMainThread());
+
+    // Pull back everything still waiting in the queue
+    returnAllItemsToValkey(it);
+
+    if (BGITERATION_DEBUG) debugBuffer = sdscat(debugBuffer, "SENDING TERMINATE\n");
+
+    // The READER might be waiting on the queue's mutex, so an item has to be added for it.
+    const bgIteratorItem terminatedTemplate = {.type = BGITERATOR_ITEM_TERMINATED};
+    bgIteratorItem *marker = itemFreeList_getElementOrAllocate();
+    *marker = terminatedTemplate;
+    mutexQueueAdd(it->items_for_iterator, marker);
+
+    it->terminated = true;
+}
+
+
+// PUBLIC API - reports the iterator's `terminated` flag.
+bool bgIteratorIsTerminating(bgIterator *it) {
+    return it->terminated;
+}
+
+
+// PUBLIC API - returns the previously read item (if any) to Valkey, then blocks for the next one.
+bgIteratorItem *bgIteratorRead(bgIterator *it) {
+    serverAssert(it->current_item == NULL ||
+                 (it->current_item->type != BGITERATOR_ITEM_COMPLETE &&
+                  it->current_item->type != BGITERATOR_ITEM_TERMINATED));
+
+    if (it->current_item == NULL) {
+        it->client_is_active = true; // No previous item to clean up
+    } else {
+        // Hand back the item obtained by the previous read
+        returnCurrentItemToValkey(it);
+
+        /* To support unit tests.  Normal clients call bgIteratorRead from an alternate thread.
+         * Without this, a unit test could get stuck waiting on the completion event because
+         * feed won't get invoked.  For production, feed is called regularly from the main thread.
+         * Note - this is checking that the exact same thread is used and shouldn't count modules. */
+        if (pthread_equal(server.main_thread_id, pthread_self())) bgIteration_feedIterators_task(NULL, 0, NULL);
+    }
+
+    it->monotonic_item_start_time = 0; // idle until blocking pop returns
+    bgIteratorItem *next = mutexQueuePop(it->items_for_iterator, true);
+    it->current_item = next;
+    it->monotonic_item_start_time = getMonotonicUs();
+    return next;
+}
+
+
+/* PUBLIC API - called by the iteration client when it is finished with the iterator.  Unless the
+ * client has already read COMPLETE/TERMINATED, the iterator is flagged as terminated.  In all
+ * cases an ITER_CLOSED extension item is queued on return_to_valkey. */
+void bgIteratorClose(bgIterator *it) {
+    bgIteratorItem *last = it->current_item;
+    bool sawFinalItem = (last != NULL) && (last->type == BGITERATOR_ITEM_COMPLETE ||
+                                           last->type == BGITERATOR_ITEM_TERMINATED);
+
+    if (!sawFinalItem) {
+        /* Client-initiated termination (possibly before the first read).  Flag it, hand back
+         * any item still held, and leave a TERMINATED item in current_item. */
+        it->terminated = true;
+        if (last != NULL) returnCurrentItemToValkey(it);
+
+        it->current_item = itemFreeList_getElementOrAllocate();
+        *(it->current_item) = (bgIteratorItem){.type = BGITERATOR_ITEM_TERMINATED};
+    }
+    // Otherwise: normal confirmation of background completion, nothing to adjust
+
+    /* We don't allocate extension items from the free list.  The main thread releases the
+     * iterator when it processes this item, so `it` must not be touched after the enqueue. */
+    bgIteratorItemExtClose *closeItem = zmalloc(sizeof(*closeItem));
+    closeItem->type = BGITERATOR_ITEMEXT_ITER_CLOSED;
+    closeItem->iter = it;
+
+    mutexQueueAdd(it->return_to_valkey, closeItem);
+}
+
+
+/* =============================================================================================
+ *                        PUBLIC INTERFACE:  Valkey main-thread support hooks
+ * ============================================================================================= */
+
+// PUBLIC API - creates the module-level bookkeeping structures (main thread only).
+void bgIteration_init(void) {
+    serverAssert(onValkeyMainThread());
+
+    /* Expected to run exactly once.  That is deliberately not enforced: unit tests may call this
+     * repeatedly, so any call after the first is a no-op (nameToIterator doubles as the marker). */
+    if (nameToIterator != NULL) return;
+
+    bufferedReplicationBytes = 0;
+
+    // Iterator registry: a name-keyed dict plus the list of all iterators.
+    nameToIterator = dictCreate(&sdsrefToPtrDictType);
+    serverAssert(nameToIterator != NULL);
+    allIterators = listCreate();
+    serverAssert(allIterators != NULL);
+
+    inUseEntries = dictCreate(&dictEntryPtrDictType);
+    serverAssert(inUseEntries != NULL);
+
+    // The list releases its elements itself: decrRefCountVoid is installed as the free method.
+    curCmdMissingKeys = listCreate();
+    serverAssert(curCmdMissingKeys != NULL);
+    listSetFreeMethod(curCmdMissingKeys, decrRefCountVoid);
+
+    // With BGITERATION_DEBUG enabled, start with a preallocated trace buffer.
+    if (BGITERATION_DEBUG) debugBuffer = sdsMakeRoomFor(sdsempty(), SDS_MAX_PREALLOC);
+}
+
+
+// PUBLIC API - true while at least one bgIterator is present in allIterators.
+bool bgIteration_iterationActive(void) {
+    return !(allIterators == NULL || listLength(allIterators) == 0);
+}
+
+
+// PUBLIC API - main-thread hook run for a key deletion while its entry can still be looked up.
+void bgIteration_keyDelete(int dbid, const_sds key) {
+    if (!bgIteration_iterationActive()) return;
+    serverAssert(onValkeyMainThread());
+
+    if (BGITERATION_DEBUG) debugBuffer = sdscatprintf(debugBuffer, "KEYDEL: (%d)%s\n", dbid, key);
+
+    dbEntry *doomed = dbFind(server.db[dbid], (sds)key);
+    if (doomed == NULL) return;
+
+    /* A consistent iterator must still be given an entry that is unchanged since its consistent
+     * point and not yet delivered, so such entries are queued for early iteration before the delete.
+     * NOTE(review): metadata is dereferenced unchecked; bgIteration_dbEntryModified() allows NULL. */
+    listIter cursor;
+    listNode *ln;
+    listRewind(allIterators, &cursor);
+    while ((ln = listNext(&cursor)) != NULL) {
+        bgIterator *it = listNodeValue(ln);
+        if (it->completed || it->terminated) continue;
+        if (!it->keyset_iter->isKeyInScope(it->keyset_iter, key)) continue;
+        if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) continue;
+
+        bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(doomed);
+        if (md->iterator_epoch > it->consistent_modification_id) continue; // stamped after that point
+        if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) continue;
+        if (hashtableFind(it->early_iterate_entries, doomed, NULL)) continue;
+        addEarlyIterationKey(it, doomed, dbid); // (may also add to inUseEntries)
+    }
+
+    removePtrFromEarlyIterate(doomed);
+
+    /* This can run in the middle of a command: the command touched the key and found it expired.
+     * Record the key as missing so that, if it exists again once the command has executed, it is
+     * treated as a newly created key.  Outside of command execution the extra record is harmless
+     * because the list is reset when the next command starts. */
+    robj *missing_key = createObject(OBJ_STRING, sdsdup(key));
+    listAddNodeHead(curCmdMissingKeys, missing_key);
+}
+
+
+// PUBLIC API - FLUSHALL hook: forwards to handleFlushdb() with -1 in place of a DB id.
+void bgIteration_flushall(void) {
+    handleFlushdb(-1);
+}
+
+
+// PUBLIC API - pre-execution hook for a client command; returns true if the client was blocked.
+bool bgIteration_blockClientIfRequired(client *c) {
+    serverAssert(onValkeyMainThread());
+    iteratorReplicationFlagsWereUpdated = false; // set again below once the per-iterator flags are reset
+    if (!bgIteration_iterationActive()) return false;
+    if (!isWriteCmd(c->cmd)) return false;
+
+    if (BGITERATION_DEBUG) {
+        sds sdsArgv = createSdsFromClientArgv(c->argc, c->argv);
+        debugBuffer = sdscatprintf(debugBuffer, "BLCK?: (%d)%s\n", c->db->id, sdsArgv);
+        sdsfree(sdsArgv);
+    }
+
+    /* Before executing a command or atomic transaction, the replication flag is cleared for each
+     * iterator.  If it's determined that the command should replicate, the flag will be set
+     * as the command and keys are examined for expedite. */
+    resetReplicationFlagForIterators(c);
+    iteratorReplicationFlagsWereUpdated = true;
+
+    if (c->cmd->proc == flushdbCommand || c->cmd->proc == flushallCommand) {
+        // Handle flush commands prior to execution
+        int flags;
+        if (getFlushCommandFlags(c, &flags) == C_OK) {
+            // The command parsed ok - we WILL flush (FLUSHDB: the client's DB; FLUSHALL: -1)
+            handleFlushdb((c->cmd->proc == flushdbCommand) ? c->db->id : -1);
+        }
+    }
+
+    bool mustBlock = false;
+    hashtable *waitOnKeys = hashtableCreate(&tempKeysetHashtableType); // set of robj(sds)
+    listEmpty(curCmdMissingKeys); // drop the missing-key records left over from the previous command
+
+    if (c->cmd->proc == execCommand) {
+        mustBlock = expediteKeysForMultiExec(c, waitOnKeys);
+    } else {
+        getKeysResult result;
+        initGetKeysResult(&result);
+        int numkeys = getKeysFromCommand(c->cmd, c->argv, c->argc, &result);
+        keyReference *keyrefs = result.keys;
+        if (numkeys > 0) {
+            mustBlock = expediteKeysForWriteOnAllIterators(
+                c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys);
+            serverAssert(!(mustBlock && (c->flag.multi) && !(c->flag.script)));
+
+            if (mustBlock && (c->flag.script)) {
+                /* For scripts, we will block for keys declared in EVAL/EVALSHA/FCALL.
+                 *  However, scripts are NOT required to declare keys.  Even if it declares keys,
+                 *  it's not declaring the DB for the key.  After a SELECT or SWAPDB, we might be on
+                 *  a key we haven't blocked for.  In this case, there is no option but to execute a
+                 *  synchronous block and wait for the iterator(s) to be done with the key(s).
+                 *  (Yuck.)  */
+                while (mustBlock) {
+                    receiveItemsBackFromIterators(true); // Blocking
+                    hashtableEmpty(waitOnKeys, NULL);
+                    mustBlock = expediteKeysForWriteOnAllIterators(
+                        c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys);
+                }
+            }
+        } else {
+            // WRITE commands with no keys should always be replicated.  SWAPDB, FLUSH, FUNCTION, etc.
+            listIter li;
+            listNode *node;
+            listRewind(allIterators, &li);
+            while ((node = listNext(&li)) != NULL) {
+                bgIterator *it = listNodeValue(node);
+                it->cur_cmd_may_replicate = true;
+            }
+        }
+        getKeysFreeResult(&result);
+    }
+
+    if (mustBlock) {
+        // Flatten the wait set into a temporary array, which is what blockClientInUseOnKeys() takes.
+        serverAssert(hashtableSize(waitOnKeys) > 0);
+        robj **waitKeysArgv = zmalloc(sizeof(robj *) * hashtableSize(waitOnKeys));
+
+        robj *key;
+        hashtableIterator hi;
+        hashtableInitIterator(&hi, waitOnKeys, 0);
+        unsigned long argvCount = 0;
+        while (hashtableNext(&hi, (void **)&key)) {
+            waitKeysArgv[argvCount++] = key;
+        }
+        hashtableCleanupIterator(&hi);
+        serverAssert(argvCount == hashtableSize(waitOnKeys));
+
+        blockClientInUseOnKeys(c, argvCount, waitKeysArgv);
+
+        zfree(waitKeysArgv); // only the temporary array is freed here
+    }
+
+    hashtableRelease(waitOnKeys);
+
+    if (BGITERATION_DEBUG) {
+        if (mustBlock) debugBuffer = sdscat(debugBuffer, " (blocked)\n");
+    }
+
+    return mustBlock;
+}
+
+
+// PUBLIC API - queues a propagated write command (or MULTI) to each iterator that should see it.
+void bgIteration_handleCommandReplication(int dbid,
+                                          struct serverCommand *cmd,
+                                          int argc,
+                                          robj **argv) {
+    if (BGITERATION_DEBUG) {
+        // DEBUG - enable this to capture replication not queued because iteration is inactive
+        if (0 && !bgIteration_iterationActive() && (isWriteCmd(cmd) || cmd->proc == multiCommand)) {
+            sds sdsArgv = createSdsFromClientArgv(argc, argv);
+            debugBuffer = sdscatprintf(debugBuffer, "REPL? INACT: (%d)%s\n", dbid, sdsArgv);
+            sdsfree(sdsArgv);
+        }
+    }
+
+    if (!bgIteration_iterationActive()) return;
+    serverAssert(onValkeyMainThread());
+
+    /* Some commands are replicated which are not writes (like publish); these can be ignored.
+     *  Be careful with MULTI which is not a write command, but must be replicated. */
+    if (!isWriteCmd(cmd) && cmd->proc != multiCommand) return;
+
+    if (BGITERATION_DEBUG) {
+        sds sdsArgv = createSdsFromClientArgv(argc, argv);
+        debugBuffer = sdscatprintf(debugBuffer, "REPL?: (%d)%s\n", dbid, sdsArgv);
+        sdsfree(sdsArgv);
+    }
+
+    if (cmd->proc == swapdbCommand) {
+        // All iterators and clients must be informed of swapdb
+        int id1, id2;
+        // command has been processed, but Valkey allows "swapdb 0 0" (which can be ignored)
+        if (getParamsForSwapdb(argc, argv, NULL, &id1, &id2))
+            handleSwapdb(id1, id2);
+    }
+
+    /* In the case that a key is touched in a different DB (COPY/MOVE) the key is recorded as
+     *  a "special" key and then handled below. */
+    int special_dbid = 0;
+    sds special_key = NULL;
+    dbEntry *special_dbEntry = NULL;
+    if (cmd->proc == moveCommand) {
+        /* The MOVE command succeeded.  However MOVE requires special handling as it creates a new
+         * key in a different database.  We need to make sure that we don't later try to iterate
+         * on the key as it would be a duplicate key at that point.  So, instead, we will mark the
+         * newly created key as "early iterated". */
+        bool success = getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &special_dbid);
+        serverAssert(success); // the command already succeeded, so this should work!
+
+        robj *oKey = argv[1];
+        special_key = (sds)objectGetVal(oKey);
+
+        special_dbEntry = dbFind(server.db[special_dbid], special_key);
+    }
+    if (cmd->proc == copyCommand) {
+        // The COPY command succeeded.  However COPY requires special handling (like MOVE).
+        bool success = getTargetDbIdForCopyCommand(argc, argv, dbid, &special_dbid);
+        serverAssert(success); // the command already succeeded, so this should work!
+
+        // Find the newly created entry (argv[2] is looked up in the target DB).
+        robj *oKey = argv[2];
+        special_key = (sds)objectGetVal(oKey);
+
+        special_dbEntry = dbFind(server.db[special_dbid], special_key);
+    }
+
+    /* Implementation note regarding LUA and MULTI:  LUA scripts and MULTI-EXEC blocks must be
+     * treated atomically.  We need to ensure that either ALL of the replication (or none of the
+     * replication) for the atomic operation is processed by the iterator(s).  This is handled
+     * naturally as we can only "complete" the iteration during the feeding process - and feeding
+     * is only performed when handling timer events (after the LUA/MULTI has completed).  */
+
+    listIter li;
+    listNode *node;
+    listRewind(allIterators, &li);
+    while ((node = listNext(&li)) != NULL) {
+        bgIterator *it = listNodeValue(node);
+        if (it->completed || it->terminated) continue;
+
+        /* For consistent iteration, we only iterate values based on version.  But for
+         * non-consistent iteration, we don't need to explicitly iterate any values newly created
+         * during the iteration.  So we mark them as expedited.  We know we have a new key if it
+         * was missing before the command, and exists now. */
+
+        if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) {
+            // Handle the special case of a key moved to a different DB
+            if (special_dbEntry != NULL) {
+                if (it->cur_cmd_may_replicate &&
+                    !it->keyset_iter->hasPassedItem(it->keyset_iter, special_key, special_dbid)) {
+                    hashtableAdd(it->early_iterate_entries, special_dbEntry);
+                    if (BGITERATION_DEBUG) {
+                        sds entryString = createEntryString(special_dbid, special_dbEntry);
+                        debugBuffer = sdscatprintf(debugBuffer, "EARLY(special): %s\n", entryString);
+                        sdsfree(entryString);
+                    }
+                }
+
+                /* Note: In the cases where there's a special command, we are copying or moving an
+                 *       item to a different DB.  In these limited cases, we can only possibly be
+                 *       creating a single key.  And if we've handled it here, we don't need to
+                 *       handle it as a "missing key" below.  If we were to try to handle it as a
+                 *       standard "missing key", we would get the DBID incorrect. */
+
+            } else if (listLength(curCmdMissingKeys) > 0) {
+                listIter missingIt;
+                listNode *missingNode;
+                listRewind(curCmdMissingKeys, &missingIt);
+                while ((missingNode = listNext(&missingIt)) != NULL) {
+                    robj *oKey = listNodeValue(missingNode);
+                    const_sds key = objectGetVal(oKey);
+                    dbEntry *de = dbFind(server.db[dbid], (sds)key);
+                    if (de != NULL) {
+                        // It exists now!
+                        if (it->cur_cmd_may_replicate &&
+                            !it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) {
+                            /* If the current command is allowed to replicate, and there is a new
+                             * key which we haven't yet reached in iteration, it needs to be added
+                             * to the set of early iterate entries.  (We know that it's not already
+                             * in that set because it's a newly created key!) */
+                            bool wasAdded = hashtableAdd(it->early_iterate_entries, de);
+                            serverAssert(wasAdded); // NOTE(review): relies on each missing key being offered only once - confirm
+                            if (BGITERATION_DEBUG) {
+                                sds entryString = createEntryString(dbid, de);
+                                debugBuffer = sdscatprintf(debugBuffer, "EARLY(NEW): %s\n", entryString);
+                                sdsfree(entryString);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        /* Deletes (and unlinks) are special.
+         * Developer context:  For most commands, we call bgIteration_blockClientIfRequired before
+         *  the command and then call bgIteration_handleCommandReplication after the command.  While
+         *  the "before" logic is determining the need to block, it can also determine (mostly) the
+         *  need for replication (on each iterator).  Doing this all in one place saves us from
+         *  performing some of the same logic twice.  When we get to this point in the code, we just
+         *  use the previously determined information regarding replication.  This works because
+         *  Valkey is single-threaded and only processes one command at a time.
+         *
+         * But deletes (and unlinks) happen multiple ways - and occur outside the normal
+         *  before/after logic for commands.  These situations must be handled:
+         *    - A normal (client-driven) DEL/UNLINK command will use the standard before/after
+         *      logic.  If the key is in use by bgIteration, the command will be blocked.
+         *    - An EVICTION generates a DEL/UNLINK which happens outside of the context of a client
+         *      issued command.  The replication flags on the iterators are stale and relate to the
+         *      prior command executed.
+         *    - An EXPIRATION in the context of a client-driven WRITE command occurs when the client
+         *      command attempts to access a key and it is found to be expired.  In this case, the
+         *      client-command has already gone through the blocking process, so it should be OK to
+         *      use it->cur_cmd_may_replicate.
+         *    - An EXPIRATION in the context of a client-driven READ command occurs when the client
+         *      command attempts to access a key and it is found to be expired.  In this case, the
+         *      client-command has NOT gone through the blocking process.  The replication flags on
+         *      the iterators are stale and relate to the prior (write) command executed.
+         *    - An EXPIRATION outside of a client-driven command occurs due to active expiry.  In
+         *      this case, the replication flags on the iterator are stale and relate to the prior
+         *      command executed.
+         *
+         * In the case of EXPIRE/EVICT occurring outside the context of a write command, this is
+         *  handled.  If the key is in-use by bgIterator, increment of robj's refcount prevents the
+         *  key from deletion. In this case the key will be removed from the main dictionary, but
+         *  held inside bgIteration until no longer needed.
+         *  Even though the entry is not physically deleted yet, it is logically deleted and it is
+         *  safe to replicate the DEL/UNLINK.  Since iterators process items FIFO, the replication
+         *  for DEL/UNLINK won't actually get processed until other queued replication is processed.
+         *
+         * In the case of a client driven DEL command, the key will have already been deleted when
+         *  we hit this routine.  In the case of EXPIRE/EVICT, the propagation happens before the key
+         *  is deleted.  So if the key is missing, we can use the cached replication decision.  But
+         *  if the key still exists (indicating EXPIRE/EVICT) we evaluate it specially. */
+        bool shouldReplicateDelCommand = false;
+        bool isDelCommand = isDeleteCmd(cmd);
+        if (isDelCommand) {
+            sds key = objectGetVal(argv[1]); // NOTE(review): only the first key argument is examined - confirm
+            if (it->keyset_iter->isKeyInScope(it->keyset_iter, key)) {
+                bool blockClientIfRequiredWasCalled = (server.in_call > 0);
+                if (blockClientIfRequiredWasCalled && iteratorReplicationFlagsWereUpdated) {
+                    // Here we know that the DEL is related to the running command
+                    shouldReplicateDelCommand = it->cur_cmd_may_replicate;
+                } else {
+                    // Otherwise, it's something like active expiration or eviction (unrelated)
+                    dbEntry *de = dbFind(server.db[dbid], key);
+                    if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) ||
+                        (de && hashtableFind(it->early_iterate_entries, de, NULL))) {
+                        shouldReplicateDelCommand = true;
+                    }
+                }
+            }
+        }
+
+        bool replicate = (it->iteration_flags & BGITERATOR_FLAG_REPLICATION &&
+                          ((!isDelCommand && it->cur_cmd_may_replicate) || shouldReplicateDelCommand));
+
+        if (replicate) {
+            /* We will replicate the command in these cases:
+             * 1) For consistent iteration - it->cur_cmd_may_replicate is always true
+             * 2) For non-consistent, if any of the keys have been processed, expediteKeysForWrite
+             *    will ensure that ALL of the keys have been expedited - and we should replicate
+             * 3) For non-consistent, if NONE of the keys have been processed, no need to replicate */
+            if (BGITERATION_DEBUG) {
+                debugBuffer = sdscat(debugBuffer, " (queued)\n");
+            }
+
+            bgIteratorItem *item = itemFreeList_getElementOrAllocate();
+            item->type = BGITERATOR_ITEM_REPLICATION;
+            item->dbid = dbid;
+            item->u.repl.cmd = cmd;
+            item->u.repl.argv = cloneRobjArray(argc, argv); // the item gets its own copy of argv
+            item->u.repl.argc = argc;
+            bufferedReplicationBytes += replicationItemSize(item);
+            it->replication_queued++;
+            mutexQueueAdd(it->items_for_iterator, item);
+        }
+    } // allIterators loop
+}
+
+
+// PUBLIC API - running byte total kept in bufferedReplicationBytes for queued replication items.
+size_t bgIteration_memoryInuseForReplication(void) {
+    return (size_t)bufferedReplicationBytes;
+}
+
+
+// PUBLIC API - main-thread query: is 'de' currently in use by any iterator?
+bool bgIteration_isEntryInuse(dbEntry *de) {
+    serverAssert(onValkeyMainThread());
+    // Short-circuit: when no iteration is active the per-iterator lookup is skipped.
+    return bgIteration_iterationActive() && isEntryInuseByAnyIterator(de);
+}
+
+
+// PUBLIC API - stamps a modified entry with the current bgIteration epoch (only while iterating).
+void bgIteration_dbEntryModified(dbEntry *de) {
+    if (!bgIteration_iterationActive()) return;
+
+    bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(de);
+    if (md != NULL) md->iterator_epoch = bgIteration_epoch; // entries without metadata are skipped
+}
+
+
+// PUBLIC API - key-based variant of bgIteration_dbEntryModified(); keys not found are ignored.
+void bgIteration_keyModified(int dbid, const_sds key) {
+    if (!bgIteration_iterationActive()) return;
+
+    dbEntry *found = dbFind(server.db[dbid], (sds)key);
+    if (found != NULL) bgIteration_dbEntryModified(found);
+}
+
+
+// PUBLIC API - swaps entry pointer 'old' for 'new' in the iterators' early-iterate sets.
+void bgIteration_updateDbEntryPtr(dbEntry *old, dbEntry *new) {
+    if (!bgIteration_iterationActive()) return;
+    if (old == new) return;
+    serverAssert(onValkeyMainThread());
+    serverAssert(!isEntryInuseByAnyIterator(old));
+
+    // Every iterator whose early-iterate set holds the old pointer gets the new one instead.
+    listIter cursor;
+    listNode *ln;
+    listRewind(allIterators, &cursor);
+    while ((ln = listNext(&cursor)) != NULL) {
+        bgIterator *it = listNodeValue(ln);
+        if (!hashtableDelete(it->early_iterate_entries, old)) continue;
+        if (BGITERATION_DEBUG)
+            debugBuffer = sdscatprintf(debugBuffer, "EARLY LIST UPDATE %p -> %p\n", (void *)old, (void *)new);
+        bool wasAdded = hashtableAdd(it->early_iterate_entries, new);
+        serverAssert(wasAdded);
+    }
+}
diff --git a/src/bgiteration.h b/src/bgiteration.h
new file mode 100644
index 00000000000..78643311fe0
--- /dev/null
+++ b/src/bgiteration.h
@@ -0,0 +1,358 @@
+/*
+ * Copyright Valkey Contributors.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD 3-Clause
+ */
+
+#ifndef __BGITERATION_H
+#define __BGITERATION_H
+
+#include <stdbool.h>
+#include "sds.h"
+
+/* A mechanism for creating iteration clients which iterate over the main dictionary in a
+ * background thread.
+ *
+ * This mechanism passes keys to the iteration client, while blocking the keys from write by the
+ * Valkey main thread.  Once an iteration client is done with a key, it is returned to the Valkey
+ * main thread and any pending writers are unblocked.
+ *
+ * A bgIterator must be created on the main Valkey thread, and then passed to another thread which
+ * implements the logic of the iteration client.
+ *
+ * Iteration clients are expected to read through the keyspace until the iteration is complete or
+ * terminated.  An iteration client may not perform modifications on a key. */
+
+/* Avoids dependency on server.h */
+typedef struct serverObject dbEntry; // An object with key/value inserted into main dictionary
+typedef struct serverObject robj;    // An object with a value used for command parameters
+typedef struct client client;
+
+/* The bgIterator is an opaque structure.  */
+typedef struct bgIterator bgIterator;
+
+
+/* Consistency type for iteration (the 'consistency' argument of the bgIteratorCreate* functions). */
+typedef enum {
+    /* With no consistency requirements, dbEntries are provided to the iteration client as they
+     * appear at the time of iteration.  No replication is provided.  The only guarantee is that
+     * dbEntries which existed at the start of iteration, and remained through the duration of
+     * iteration, will be provided to the iteration client once (and only once).  If a dbEntry is
+     * modified during iteration, either the old or the new value may be provided. */
+    BGITERATOR_CONSISTENCY_NONE = 0,
+
+    /* With consistency at the start of iteration, a point-in-time iteration is performed.  The
+     * iteration client will see all keys AS THEY EXISTED at the time when the iterator was created.
+     * Note:  The DBID provided with BGITERATOR_ITEM_DBENTRY items is the original DBID (at the time
+     *        of iteration start).  SWAPDB events will not be provided.  */
+    BGITERATOR_CONSISTENCY_START = 1,
+
+    /* With an eventually consistent iteration, dbEntries will be followed by relevant replication.
+     * This will allow a client to achieve a consistent state at the END of the iteration.  Once a
+     * dbEntry has been provided to the iteration client, any replication related to that entry will
+     * also be forwarded to the iteration client.  With eventual consistency, keys are provided as
+     * they are at the time of iteration.  This mode requires that the iteration client be aware of
+     * SWAPDB events.  If a SWAPDB is performed, the client will receive a SWAPDB event.
+     * Replication events will be provided ordered and synchronized with any SWAPDB events. */
+    BGITERATOR_CONSISTENCY_EVENTUAL = 2
+} bgIteratorConsistency;
+
+
+/* When running an iterator with replication, a replication-done function (callback) may be
+ * provided.  This function will be executed after the last replication item has been fed into the
+ * queue for the client.  This function will be run on the Valkey main thread, and allows a client
+ * to recognize the point where no additional replication data will be sent for processing.
+ *
+ * PRIVDATA:    this pointer is for data private to the iteration client.
+ *
+ * The callback returns true to confirm that the iterator may stop queuing replication items for the
+ * client.  If it returns false, replication continues and bgiteration calls the callback again
+ * periodically until true is returned.  In other words, returning false means the client is not yet
+ * ready to stop receiving replication and is requesting that replication be continued. */
+typedef bool (*bgIteratorReplDoneFunc)(void *privdata);
+
+
+/* When creating a bgIterator, a cleanup function (callback) may be provided.  This function will be
+ * executed once iteration has completed and this will run on the Valkey main thread.
+ *
+ * TERMINATED:  will be passed as TRUE if the iteration process was terminated early (either by
+ *              the main thread calling bgIteratorTerminate() or the iteration client calling
+ *              bgIteratorClose()).
+ * PRIVDATA:    this pointer is for data private to the iteration client. */
+typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata);
+
+
+/* Create a background full-scan iterator (bgIterator).
+ * This bgIterator will iterate through the entire keyspace (across all DBs).
+ *
+ * NAME:        a human readable name for the iterator (must be unique)
+ * CONSISTENCY: consistency mode for the iteration (see bgIteratorConsistency)
+ * REPLDONE:    if provided, called after the last replication item has been queued (on the Valkey main thread)
+ * CLEANUP:     if provided, called at the end of iteration (on the Valkey main thread)
+ * PRIVDATA:    passed to cleanup function
+ *
+ * This method creates and initializes the bgIterator.  It does not perform any thread management.
+ * It is expected that the main Valkey thread will call this method, and then start a new thread
+ * to implement the iteration client which will read from the returned bgIterator.
+ *
+ * There is no need to delete/destroy a bgIterator.  It will automatically be cleaned up after the
+ * last item is read. */
+bgIterator *bgIteratorCreateFullScanIter(
+    const char *name,
+    bgIteratorConsistency consistency,
+    bgIteratorReplDoneFunc repldone,
+    bgIteratorCleanupFunc cleanup,
+    void *privdata);
+
+
+/* Create a background slots iterator (bgIterator).
+ * This bgIterator will iterate through the keys belonging to a set of cluster slots.
+ *
+ * NAME:        a human readable name for the iterator (must be unique)
+ * CONSISTENCY: consistency mode for the iteration (see bgIteratorConsistency)
+ * SLOTS:       array of cluster slots to iterate over
+ * SLOTS_COUNT: size of the array of slots
+ * REPLDONE:    if provided, called after the last replication item has been queued (on the Valkey main thread)
+ * CLEANUP:     if provided, called at the end of iteration (on the Valkey main thread)
+ * PRIVDATA:    passed to cleanup function
+ *
+ * This method creates and initializes the bgIterator.  It does not perform any thread management.
+ * It is expected that the main Valkey thread will call this method, and then start a new thread
+ * to implement the iteration client which will read from the returned bgIterator.
+ *
+ * The caller of this function has the ownership of the `slots` array's memory. This function will
+ * just copy its data and leave the array untouched.
+ *
+ * There is no need to delete/destroy a bgIterator.  It will automatically be cleaned up after the
+ * last item is read. */
+bgIterator *bgIteratorCreateSlotsIter(
+    const char *name,
+    bgIteratorConsistency consistency,
+    const int *slots,
+    int slots_count,
+    bgIteratorReplDoneFunc repldone,
+    bgIteratorCleanupFunc cleanup,
+    void *privdata);
+
+
+/* Find an existing bgIterator by name.
+ * Returns NULL if the iterator does not exist (or has completed). */
+bgIterator *bgIteratorFind(const char *name);
+
+
+/* Get the name of an existing iterator.  */
+const char *bgIteratorName(bgIterator *iter);
+
+
+/* Status snapshot for an active iteration client; filled in by bgIteratorGetStatus().  */
+typedef struct {
+    unsigned long dbentries_queued;         // Cumulative BGITERATOR_ITEM_DBENTRY queued
+    unsigned long dbentries_processed;      // Cumulative BGITERATOR_ITEM_DBENTRY processed
+    unsigned long replication_queued;       // Cumulative BGITERATOR_ITEM_REPLICATION queued
+    unsigned long replication_processed;    // Cumulative BGITERATOR_ITEM_REPLICATION processed
+    unsigned long swapdb_queued;            // Cumulative BGITERATOR_ITEM_SWAPDB queued
+    unsigned long swapdb_processed;         // Cumulative BGITERATOR_ITEM_SWAPDB processed
+    unsigned long flushdb_queued;           // Cumulative BGITERATOR_ITEM_FLUSHDB queued
+    unsigned long flushdb_processed;        // Cumulative BGITERATOR_ITEM_FLUSHDB processed
+    unsigned long dbentry_clones_queued;    // A subset of dbentries_queued for cloned entries
+    unsigned long dbentry_clones_processed; // A subset of dbentries_processed for cloned entries
+    unsigned long queue_length;             // Current length of queue to iteration client
+    unsigned long queue_length_target;      // Dynamic target length for queue to iteration client
+    unsigned long runtime_ms;               // Time, in milliseconds, that iterator has been running
+    unsigned long current_item_ms;          // Time, in milliseconds, spent processing current item
+} bgIteratorStatus;
+
+
+/* Get the status of a background iteration.
+ *
+ * The caller-provided bgIteratorStatus will be populated. */
+void bgIteratorGetStatus(bgIterator *iter, bgIteratorStatus *status);
+
+
+/* Terminate a background iteration.
+ *
+ * An iteration is terminated by the Valkey main thread.  It is expected that the iteration client
+ * will continue to read, receiving BGITERATOR_ITEM_TERMINATED or BGITERATOR_ITEM_COMPLETE to
+ * complete the iteration.  (This is necessary to ensure proper cleanup.)
+ * NOTE:  If the iteration client wants to terminate iteration, it may call bgIteratorClose(). */
+void bgIteratorTerminate(bgIterator *iter);
+
+
+/* Check if an iterator is being terminated.
+ *
+ * This checks if the iterator is in the process of terminating.  For the Valkey main thread, this
+ * can be used to determine if a call has already been made to bgIteratorTerminate.  For an
+ * iteration client, it normally learns about termination by reading the next item; this allows
+ * out-of-band detection of termination, which can be useful when processing a large key. */
+bool bgIteratorIsTerminating(bgIterator *iter);
+
+
+typedef enum {
+    /* Indicates that the iteration has completed normally.  No more items to read.
+     * If replication is enabled, on completion, the final replication offset is recorded in
+     *  'u.master_repl_offset' and 'dbid' is set to the selected replication db.  The iteration
+     *  client will have received all *applicable* replication data to this point.  */
+    BGITERATOR_ITEM_COMPLETE = 1,
+
+    /* Indicates that the iteration has been terminated before completion.  No more items to read.*/
+    BGITERATOR_ITEM_TERMINATED,
+
+    /* A dbEntry for DB=dbid.
+     * NOTE:  The dbEntry MAY be expired.  It is up to the client to decide how to handle
+     *        expired entries.  */
+    BGITERATOR_ITEM_DBENTRY,
+
+    /* A replication command for DB=dbid.  cmd, argv, & argc provided.
+     * NOTE:  The command may have been re-written before replication.  */
+    BGITERATOR_ITEM_REPLICATION,
+
+    /* A SWAPDB event.  dbid swapped with dbid2.
+     * Note that SWAPDB events are not provided during consistent iteration.  */
+    BGITERATOR_ITEM_SWAPDB,
+
+    /* A FLUSHDB event.  In most cases, iteration will be terminated, and this event will NOT be
+     * sent.  However, in the case of a single minor DB being flushed, non-consistent iteration is
+     * permitted to continue.  */
+    BGITERATOR_ITEM_FLUSHDB
+} bgIteratorItemType;
+
+
+typedef struct {
+    dbEntry *de;
+    bool is_cloned;
+    bool is_rehashing_paused;
+} dbEntryData;
+
+typedef struct {
+    struct serverCommand *cmd;
+    robj **argv;
+    int argc;
+} replicationData;
+
+typedef struct {
+    bgIteratorItemType type;
+    int dbid; // orig DB ID for CONSISTENT, queue-time DB ID for !CONSISTENT.
+    union {
+        dbEntryData dbe;              // for BGITERATOR_ITEM_DBENTRY
+        replicationData repl;         // for BGITERATOR_ITEM_REPLICATION
+        long long master_repl_offset; // for BGITERATOR_ITEM_COMPLETE
+        int dbid2;                    // for BGITERATOR_ITEM_SWAPDB
+    } u;
+} bgIteratorItem;
+
+
+/* Read the next bgIteratorItem from the bgIterator.
+ *
+ * The iteration client is expected to call this function in a loop.  After reading
+ * BGITERATOR_ITEM_COMPLETE or BGITERATOR_ITEM_TERMINATED, the iteration client must call
+ * bgIteratorClose to finalize the iteration process.
+ *
+ * This is a blocking call.  If the main Valkey thread has been too busy to send items to the
+ * iterator, the iteration client's queue may run dry and this call will block until data is
+ * available.
+ *
+ * NOTE: Reading an item returns previously read items to Valkey.  It is unsafe to reference an item
+ * previously read.
+ *
+ * (All memory management is the responsibility of the bgIterator - not the reader.) */
+bgIteratorItem *bgIteratorRead(bgIterator *iter);
+
+
+/* Close the bgIterator, allowing the bgIterator to be deallocated.
+ *
+ * This must be called by an iteration client to release the bgIterator.
+ *
+ * It is required that this is called after receiving BGITERATOR_ITEM_COMPLETE or
+ * BGITERATOR_ITEM_TERMINATED and signals that the background activity is complete.
+ *
+ * This may also be called by the iteration client to force terminate an iteration early.  The
+ * bgIterator will be marked as terminated. */
+void bgIteratorClose(bgIterator *iter);
+
+
+/********************************************************************************************
+ * BGITERATION HOOKS REQUIRED TO SUPPORT ITERATION - CALLS INSERTED INTO MAIN VALKEY CODE
+ ********************************************************************************************/
+
+#define BGITERATION_ENTRY_METADATA_SIZE 4
+
+/* Must be called once (and only once) at server startup.  */
+void bgIteration_init(void);
+
+
+/* Returns true if any iterators are currently active. */
+bool bgIteration_iterationActive(void);
+
+
+/* Notify bgIteration that a key is being deleted.  In Valkey, key deletion can occur in a READ
+ * command if the key is expired.  Note that this notification is more about status than memory.
+ * Since the dbEntry is a reference counted object, the dbEntry can't be physically deleted if
+ * bgIteration is still actively using it. */
+void bgIteration_keyDelete(int dbid, const_sds key);
+
+
+/* Iteration needs to know if a FLUSHALL is being performed.  For normal clients, this comes through
+ * the standard "blockClientIfRequired" interface.  This interface is for cases where Valkey
+ * performs the FLUSHALL operation independently of clients (e.g. when syncing with master). */
+void bgIteration_flushall(void);
+
+
+/* Updating value or expiration of an existing key may lead to reallocation of the dbEntry (robj).
+ * BgIteration keeps track of expedited keys (by pointer) to avoid repeated iteration.  BgIteration
+ * must be notified when dbEntries are reallocated.  BgIteration will not dereference the pointers;
+ * it is safe to have deallocated the old dbEntry before calling this function.
+ *
+ * We can't update the dbEntry if the entry is actually in use (bgIteration_isEntryInuse)!
+ *
+ * To simplify calling code, this function does nothing if old_entry == new_entry. */
+void bgIteration_updateDbEntryPtr(dbEntry *old_entry, dbEntry *new_entry);
+
+
+/* Before executing any command, the Valkey main thread must call this function.  If the key(s) are
+ * blocked for writes by an iterator, the function returns true and the client is blocked.  A
+ * blocked client will be unblocked once the key becomes available for write.
+ *
+ * This should be called for all commands - even commands which are executed as part of a MULTI/EXEC
+ * or LUA script.
+ *
+ * For MULTI/EXEC - This function is called when hitting the EXEC - after all of the commands
+ *                  have been queued.  This may block the EXEC, but will NOT block individual
+ *                  commands as they are executed in the MULTI/EXEC block.
+ *
+ * For LUA script - This function is first called for EVAL/EVALSHA.  It may block the script while
+ *                  waiting on declared keys.  However, if the script accesses undeclared keys or
+ *                  performs SWAPDB, a synchronous block may be performed (returning false) on
+ *                  individual commands within the script.
+ *
+ * Note: this function should be called for all commands (not just writes). */
+bool bgIteration_blockClientIfRequired(client *c);
+
+
+/* After execution of a write command, the Valkey main thread must provide the command to iterators
+ * which are interested in the replication feed.  It is required that all commands have been passed
+ * through bgIteration_blockClientIfRequired(), however, it is permitted that the command can be
+ * re-written for propagation. */
+void bgIteration_handleCommandReplication(
+    int dbid,
+    struct serverCommand *cmd,
+    int argc,
+    robj **argv);
+
+
+/* The memory that bgIteration uses while temporarily buffering replication data is not included in
+ * the maxmemory computation used for eviction.  This function provides insight into the current
+ * amount of memory used for buffered replication data. */
+size_t bgIteration_memoryInuseForReplication(void);
+
+
+/* Check if a dbEntry is currently in-use/locked by bgIteration. */
+bool bgIteration_isEntryInuse(dbEntry *de);
+
+
+/* Notify bgIteration that a dbEntry has been added/modified.
+ *  - If caller has a dbEntry*, dbEntryModified is more efficient
+ *  - If caller has a dbid/key, a lookup is performed to find the dbEntry */
+void bgIteration_dbEntryModified(dbEntry *de);
+void bgIteration_keyModified(int dbid, const_sds key);
+
+#endif
diff --git a/src/db.c b/src/db.c
index ed906f22c4e..4b6f3c11a39 100644
--- a/src/db.c
+++ b/src/db.c
@@ -37,6 +37,7 @@
 #include "module.h"
 #include "vector.h"
 #include "expire.h"
+#include "bgiteration.h"
 
 /*-----------------------------------------------------------------------------
  * C-level DB API
@@ -361,6 +362,7 @@ static void dbSetValue(serverDb *db, robj *key, robj **valref, int overwrite, vo
         val->lru = old->lru;
         long long expire = objectGetExpire(old);
         new = objectSetKeyAndExpire(val, objectGetVal(key), expire);
+        bgIteration_updateDbEntryPtr(old, new);
         *oldref = new;
         /* Replace the old value at its location in the expire space. */
         if (expire >= 0) {
@@ -430,6 +432,7 @@ void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags) {
     } else {
         dbSetValue(db, key, valref, 1, NULL);
     }
+    bgIteration_dbEntryModified(*valref);
     if (!(flags & SETKEY_KEEPTTL)) removeExpire(db, key);
     if (!(flags & SETKEY_NO_SIGNAL)) signalModifiedKey(c, db, key);
 }
@@ -475,6 +478,8 @@ int dbGenericDeleteWithDictIndex(serverDb *db, robj *key, int async, int flags,
     hashtablePosition pos;
     void **ref = kvstoreHashtableTwoPhasePopFindRef(db->keys, dict_index, objectGetVal(key), &pos);
     if (ref != NULL) {
+        bgIteration_keyDelete(db->id, (sds)objectGetVal(key));
+
         robj *val = *ref;
         /* VM_StringDMA may call dbUnshareStringValue which may free val, so we
          * need to incr to retain val */
@@ -662,6 +667,9 @@ long long emptyData(int dbnum, int flags, void(callback)(hashtable *)) {
         return -1;
     }
 
+    /* bgIteration must be notified for flushall. */
+    if (dbnum == -1) bgIteration_flushall();
+
     /* Fire the flushdb modules event. */
     moduleFireServerEvent(VALKEYMODULE_EVENT_FLUSHDB, VALKEYMODULE_SUBEVENT_FLUSHDB_START, &fi);
 
@@ -753,6 +761,7 @@ long long dbTotalServerKeyCount(void) {
 void signalModifiedKey(client *c, serverDb *db, robj *key) {
     touchWatchedKey(db, key);
     trackingInvalidateKey(c, key, 1);
+    bgIteration_keyModified(db->id, objectGetVal(key));
 }
 
 void signalFlushedDb(int dbid, int async) {
@@ -2257,7 +2266,7 @@ robj *dbFindExpires(serverDb *db, sds key) {
 }
 
 unsigned long long dbSize(serverDb *db) {
-    return kvstoreSize(db->keys);
+    return (db->keys) ? kvstoreSize(db->keys) : 0;
 }
 
 unsigned long long dbScan(serverDb *db, unsigned long long cursor, kvstoreScanFunction scan_cb, void *privdata) {
diff --git a/src/defrag.c b/src/defrag.c
index 670f83bee73..e6ecad0227e 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -43,6 +43,7 @@
 #include "eval.h"
 #include "script.h"
 #include "module.h"
+#include "bgiteration.h"
 #include <stdbool.h>
 #include <stddef.h>
 
@@ -708,6 +709,8 @@ static void defragKey(defragKeysCtx *ctx, robj **elemref) {
     unsigned char *newzl;
     ob = *elemref;
 
+    if (bgIteration_isEntryInuse(ob)) return;
+
     /* Try to defrag robj and/or string value. */
     if ((newob = activeDefragStringOb(ob))) {
         *elemref = newob;
@@ -815,6 +818,11 @@ static void defragPubsubScanCallback(void *privdata, void *elemref) {
  * and 1 if time is up and more work is needed. */
 static int defragLaterItem(robj *ob, unsigned long *cursor, monotime endtime, int dbid) {
     if (ob) {
+        if (bgIteration_isEntryInuse(ob)) {
+            *cursor = 0;
+            return 0;
+        }
+
         if (ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST) {
             return scanLaterList(ob, cursor, endtime);
         } else if (ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HASHTABLE) {
diff --git a/src/expire.c b/src/expire.c
index b31f57465cc..efa027dad8c 100644
--- a/src/expire.c
+++ b/src/expire.c
@@ -39,6 +39,7 @@
 #include "cluster.h"
 #include "cluster_migrateslots.h"
 #include "util.h"
+#include "bgiteration.h"
 
 /*-----------------------------------------------------------------------------
  * Incremental collection of expired keys.
@@ -167,13 +168,18 @@ void fieldExpireScanCallback(void *privdata, void *volaKey, int didx) {
     robj *o = volaKey;
     serverAssert(o);
     serverAssert(hashTypeHasVolatileFields(o));
+
+    data->has_more_expired_entries = false;
+    data->sampled++;
+
+    if (bgIteration_isEntryInuse(o)) return;
+
     mstime_t now = server.mstime;
     size_t expired_fields = dbReclaimExpiredFields(o, data->db, now, data->max_entries, didx);
     if (expired_fields) {
         data->has_more_expired_entries = (expired_fields == data->max_entries);
         data->expired++;
     }
-    data->sampled++;
 }
 
 static int expireShouldSkipTableForSamplingCb(hashtable *ht) {
diff --git a/src/hashtable.c b/src/hashtable.c
index dcae6dfa014..940adcc8c01 100644
--- a/src/hashtable.c
+++ b/src/hashtable.c
@@ -344,7 +344,7 @@ typedef struct {
 } position;
 
 static_assert(sizeof(hashtablePosition) >= sizeof(position),
-              "Opaque iterator size");
+              "Opaque position size");
 
 /* State for incremental find. */
 typedef struct {
@@ -1377,13 +1377,13 @@ void hashtableResumeAutoShrink(hashtable *ht) {
  * spaces, "holes", in the bucket chains, which wastes memory. Additionally, we
  * pause auto shrink when rehashing is paused, meaning the hashtable will not
  * shrink the bucket count. */
-static void hashtablePauseRehashing(hashtable *ht) {
+void hashtablePauseRehashing(hashtable *ht) {
     ht->pause_rehash++;
     hashtablePauseAutoShrink(ht);
 }
 
 /* Resumes incremental rehashing, after pausing it. */
-static void hashtableResumeRehashing(hashtable *ht) {
+void hashtableResumeRehashing(hashtable *ht) {
     ht->pause_rehash--;
     assert(ht->pause_rehash >= 0);
     hashtableResumeAutoShrink(ht);
@@ -2562,3 +2562,19 @@ int hashtableLongestBucketChain(hashtable *ht) {
     }
     return maxlen;
 }
+
+// Temporary, waiting on PR #3803
+bool hashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor) {
+    if (cursor == 0) return false;
+    if (hashtableSize(ht) == 0) return true;
+
+    /* The scan visits buckets in reverse-binary order based on the smallest
+     * table. During rehashing, a small-table bucket and its corresponding
+     * large-table buckets are processed together, so the small-table mask
+     * determines ordering in both cases. */
+    int exp = ht->bucket_exp[0];
+    if (hashtableIsRehashing(ht) && ht->bucket_exp[1] < exp) exp = ht->bucket_exp[1];
+    size_t mask = expToMask(exp);
+    size_t bucket_idx = hashKey(ht, key) & mask;
+    return rev(bucket_idx) < rev(cursor & mask);
+}
diff --git a/src/hashtable.h b/src/hashtable.h
index 8bbf5d8c05b..289bc183db1 100644
--- a/src/hashtable.h
+++ b/src/hashtable.h
@@ -129,6 +129,8 @@ size_t hashtableMemUsage(const hashtable *ht);
 void hashtablePauseAutoShrink(hashtable *ht);
 void hashtableResumeAutoShrink(hashtable *ht);
 bool hashtableIsRehashing(hashtable *ht);
+void hashtablePauseRehashing(hashtable *ht);
+void hashtableResumeRehashing(hashtable *ht);
 bool hashtableIsRehashingPaused(hashtable *ht);
 ssize_t hashtableGetRehashingIndex(hashtable *ht);
 void hashtableRehashingInfo(hashtable *ht, size_t *from_size, size_t *to_size);
@@ -161,6 +163,7 @@ bool hashtableIncrementalFindGetResult(hashtableIncrementalFindState *state, voi
 /* Iteration & scan */
 size_t hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata);
 size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata, void *(*defragfn)(void *), int flags);
+bool hashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor);
 void hashtableInitIterator(hashtableIterator *iter, hashtable *ht, uint8_t flags);
 void hashtableRetargetIterator(hashtableIterator *iterator, hashtable *ht);
 void hashtableCleanupIterator(hashtableIterator *iter);
diff --git a/src/object.c b/src/object.c
index 21eb57e5cbd..f4545cf8025 100644
--- a/src/object.c
+++ b/src/object.c
@@ -38,6 +38,7 @@
 #include "zmalloc.h"
 #include "sds.h"
 #include "module.h"
+#include "bgiteration.h"
 #include <math.h>
 #include <ctype.h>
 
@@ -340,7 +341,7 @@ robj *createStringObjectFromSds(const_sds s) {
     return createStringObject(s, sdslen(s));
 }
 
-static robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const_sds key, long long expire) {
+robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const_sds key, long long expire) {
     if (shouldEmbedStringObject(len, key, expire)) {
         return createEmbeddedStringObjectWithKeyAndExpire(ptr, len, key, expire);
     } else {
@@ -447,6 +448,7 @@ void objectUnembedVal(robj *o) {
 robj *objectSetKeyAndExpire(robj *o, const_sds key, long long expire) {
     if (o->type == OBJ_STRING && o->encoding == OBJ_ENCODING_EMBSTR) {
         robj *new = createStringObjectWithKeyAndExpire(objectGetVal(o), sdslen(objectGetVal(o)), key, expire);
+        bgIteration_updateDbEntryPtr(o, new);
         new->lru = o->lru;
         decrRefCount(o);
         return new;
@@ -471,6 +473,7 @@ robj *objectSetKeyAndExpire(robj *o, const_sds key, long long expire) {
         serverPanic("Not implemented");
     }
     robj *new = createUnembeddedObjectWithKeyAndExpire(o->type, ptr, key, expire);
+    bgIteration_updateDbEntryPtr(o, new);
     new->encoding = o->encoding;
     new->lru = o->lru;
     decrRefCount(o);
diff --git a/src/server.c b/src/server.c
index 4eb7798a924..ff854b72873 100644
--- a/src/server.c
+++ b/src/server.c
@@ -54,6 +54,7 @@
 #include "util.h"
 
 #include "eval.h"
+#include "bgiteration.h"
 
 #include "trace/trace_commands.h"
 
@@ -2052,7 +2053,10 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
     /* Before we are going to sleep, let the threads access the dataset by
      * releasing the GIL. The server main thread will not touch anything at this
      * time. */
-    if (moduleCount()) moduleReleaseGIL();
+    if (moduleCount()) {
+        atomic_store_explicit(&server.module_gil_acquired, 0, memory_order_relaxed);
+        moduleReleaseGIL();
+    }
     /********************* WARNING ********************
      * Do NOT add anything below moduleReleaseGIL !!! *
      ***************************** ********************/
@@ -2074,6 +2078,7 @@ void afterSleep(struct aeEventLoop *eventLoop, int numevents) {
             atomic_store_explicit(&server.module_gil_acquiring, 1, memory_order_relaxed);
             moduleAcquireGIL();
             atomic_store_explicit(&server.module_gil_acquiring, 0, memory_order_relaxed);
+            atomic_store_explicit(&server.module_gil_acquired, 1, memory_order_relaxed);
             moduleFireServerEvent(VALKEYMODULE_EVENT_EVENTLOOP, VALKEYMODULE_SUBEVENT_EVENTLOOP_AFTER_SLEEP, NULL);
             latencyEndMonitor(latency);
             latencyAddSampleIfNeeded("module-acquire-GIL", latency);
@@ -3018,8 +3023,11 @@ void initServer(void) {
 
     /* Set object metadata size before creating any database key objects */
     if (server.forkless_options_supported) {
-        objectSetMetadataSize(sizeof(uint32_t)); /* This is a placeholder until Threadsave defines a metadata structure */
-                                                 /* 4 bytes for iterator_epoch for now*/
+        /* NOTE: At this time, there is only one reason for dbEntry metadata: bgIteration.  However,
+         * if/when new metadata options are added, we will need to compute the size of a variable
+         * size metadata, and provide appropriate accessors to access the specific portion of the
+         * metadata (each of which may/may not exist, based on immutable startup parameters).  */
+        objectSetMetadataSize(BGITERATION_ENTRY_METADATA_SIZE);
     }
 
     createDatabaseIfNeeded(0); /* The default database should always exist */
@@ -3036,6 +3044,7 @@ void initServer(void) {
     server.watching_clients = 0;
     server.cronloops = 0;
     server.in_exec = 0;
+    server.in_call = 0;
     server.busy_module_yield_flags = BUSY_MODULE_YIELD_NONE;
     server.busy_module_yield_reply = NULL;
     server.client_pause_in_transaction = 0;
@@ -3141,6 +3150,7 @@ void initServer(void) {
     commandlogInit();
     latencyMonitorInit();
     initSharedQueryBuf();
+    bgIteration_init();
 
     /* Initialize ACL default password if it exists */
     ACLUpdateDefaultUserPassword(server.requirepass);
@@ -3702,6 +3712,58 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot)
     if (propagate_to_slot_migration) clusterFeedSlotExportJobs(dbid, argv, argc, slot);
 }
 
+/* BgIteration requires that replication is sent after each command, however the
+ * alsoPropagate mechanism queues replication until the end of the transaction
+ * (when propagatePendingCommands is invoked).  Also, the propagation mechanism
+ * strips out multi/exec, adding them back during propagatePendingCommands (if
+ * necessary).  This function ensures that replication, including multi/exec are
+ * sequenced with the commands for bgIteration.
+ *
+ * Called from alsoPropagate with regular params.
+ * Called from propagatePendingCommands with dbid = -1 (to close multi/exec). */
+static void propagateToBgIteration(int dbid, int argc, robj **argv, int target) {
+    /* STATIC indicates that we have sent the MULTI, and need to match it with
+     *  an EXEC during propagatePendingCommands. */
+    static bool sentMultiToBgIterator = false;
+    /* STATIC records the last DBID that was sent, so that we can use the
+     *  same DBID when sending a generated EXEC. */
+    static int lastDbidSentToBgIterator;
+
+    if (dbid >= 0) {
+        // Called from alsoPropagate() to replicate a command
+        if (target & PROPAGATE_REPL && bgIteration_iterationActive()) {
+            if (!sentMultiToBgIterator && (scriptIsRunning() || server.in_exec)) {
+                /* For a script or multi/exec, we should be sending the MULTI at
+                 * the beginning of the execution unit.  There shouldn't be any
+                 * commands in the propagation queue yet. */
+                serverAssert(server.also_propagate.numops == 0);
+                /* If this is the first propagated command of a script or multi,
+                 * make it a transaction.  It may turn out that there is only 1
+                 * command in the MULTI block, but we can't know that now.
+                 * Unlike regular replication, we can't defer all of the
+                 * replication until we know for sure.  We must call bgIteration
+                 * after each command. */
+                static struct serverCommand *cmd_multi = NULL; // STATIC
+                if (cmd_multi == NULL) cmd_multi = lookupCommandOrOriginal(&shared.multi, 1);
+                bgIteration_handleCommandReplication(dbid, cmd_multi, 1, &shared.multi);
+                sentMultiToBgIterator = true;
+            }
+            struct serverCommand *cmd = lookupCommandOrOriginal(argv, argc);
+            bgIteration_handleCommandReplication(dbid, cmd, argc, argv);
+            lastDbidSentToBgIterator = dbid;
+        }
+    } else {
+        // Called from propagatePendingCommands() to finalize a transaction
+        if (sentMultiToBgIterator) {
+            // If a MULTI was sent to bgIterator via alsoPropagate(), then send the matching EXEC.
+            static struct serverCommand *cmd_exec = NULL; // STATIC
+            if (cmd_exec == NULL) cmd_exec = lookupCommandOrOriginal(&shared.exec, 1);
+            bgIteration_handleCommandReplication(lastDbidSentToBgIterator, cmd_exec, 1, &shared.exec);
+            sentMultiToBgIterator = false;
+        }
+    }
+}
+
 /* Used inside commands to schedule the propagation of additional commands
  * after the current command is propagated to AOF / Replication.
  *
@@ -3714,6 +3776,8 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot)
  * stack allocated).  The function automatically increments ref count of
  * passed objects, so the caller does not need to. */
 void alsoPropagate(int dbid, robj **argv, int argc, int target, int slot) {
+    propagateToBgIteration(dbid, argc, argv, target);
+
     robj **argvcopy;
     int j;
 
@@ -3780,6 +3844,12 @@ void updateCommandLatencyHistogram(struct hdr_histogram **latency_histogram, int
  * multiple separated commands. Note that alsoPropagate() is not affected
  * by CLIENT_PREVENT_PROP flag. */
 static void propagatePendingCommands(void) {
+    /* This is done before the check on server.also_propagate.numops.  Numops
+     * might be zero if there is no replica but we might be running bgIteration
+     * for something other than replication.  If we sent the multi (to
+     * bgIteration), we need to send the matching exec. */
+    propagateToBgIteration(-1, 0, NULL, 0);
+
     if (server.also_propagate.numops == 0) return;
 
     int j;
@@ -3909,6 +3979,10 @@ int incrCommandStatsOnError(struct serverCommand *cmd, int flags) {
  *
  */
 void call(client *c, int flags) {
+    if (bgIteration_blockClientIfRequired(c)) return;
+
+    server.in_call++;
+
     long long dirty;
     struct ClientFlags client_old_flags = c->flag;
 
@@ -4123,6 +4197,7 @@ void call(client *c, int flags) {
     }
 
     server.executing_client = prev_client;
+    server.in_call--;
 }
 
 /* Used when a command that is ready for execution needs to be rejected, due to
diff --git a/src/server.h b/src/server.h
index 51db9a38baa..9cf59dfae0a 100644
--- a/src/server.h
+++ b/src/server.h
@@ -103,7 +103,19 @@ static_assert(sizeof(off_t) >= 8, "off_t must be 64-bit; ensure _FILE_OFFSET_BIT
 #define dismissMemory zmadvise_dontneed
 
 #define VALKEYMODULE_CORE 1
-typedef struct serverObject robj;
+
+/* serverObject (aka robj) is currently overloaded for 2 purposes.  This is a legacy artifact.
+ *   1. It carries a reference counted STRING (a keyless value) during parsing and command execution.
+ *   2. It's also used to carry a key/value pair which is inserted into the DB.  In this form, the
+ *      value is not limited to being a string.
+ *
+ * The typedef "dbEntry" is used to explicitly connote the latter form.  It indicates a key/value
+ * pair which is suitable to exist in the DB.  It might be active in the DB, or may be unlinked from
+ * the DB (but still contains a key/value).  The value may be any of the Valkey data types/encodings.
+ */
+typedef struct serverObject robj;    // A keyless string OR a key/value pair
+typedef struct serverObject dbEntry; // Explicitly a key/value pair
+
 #include "valkeymodule.h" /* Modules API defines. */
 
 /* Following includes allow test functions to be called from main() */
@@ -1767,6 +1779,7 @@ struct valkeyServer {
     size_t initial_memory_usage;         /* Bytes used after initialization. */
     int always_show_logo;                /* Show logo even for non-stdout logging. */
     int in_exec;                         /* Are we inside EXEC? */
+    int in_call;                         /* Nesting level within the call() function. */
     int busy_module_yield_flags;         /* Are we inside a busy module? (triggered by RM_Yield). see BUSY_MODULE_YIELD_ flags. */
     const char *busy_module_yield_reply; /* When non-null, we are inside RM_Yield. */
     char *ignore_warnings;               /* Config: warnings that should be ignored. */
@@ -1785,6 +1798,7 @@ struct valkeyServer {
     pid_t child_pid;                   /* PID of current child */
     int child_type;                    /* Type of current child */
     _Atomic(int) module_gil_acquiring; /* Indicates whether the GIL is being acquiring by the main thread. */
+    _Atomic(int) module_gil_acquired;  /* Indicates if the main thread has the GIL acquired. */
     /* Networking */
     int port;                              /* TCP listening port */
     int tls_port;                          /* TLS listening port */
diff --git a/src/unit/custom_matchers.hpp b/src/unit/custom_matchers.hpp
index 2d9c8193d29..edc83bf33ea 100644
--- a/src/unit/custom_matchers.hpp
+++ b/src/unit/custom_matchers.hpp
@@ -14,7 +14,11 @@
 MATCHER_P(robjEqualsStr, str, "robj string matcher") {
     assert(arg->type == OBJ_STRING);
     assert(sdsEncodedObject(arg));
-    return strcmp(static_cast<const char *>(objectGetVal(arg)), str) == 0;
+
+    if (strcmp(static_cast<const char *>(objectGetVal(arg)), str) == 0) return true;
+
+    *result_listener << "robj(\"" << (char *)objectGetVal(arg) << "\") doesn't match \"" << str << "\"";
+    return false;
 }
 
 #endif // _CUSTOM_MATCHERS_HPP_
diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp
new file mode 100644
index 00000000000..1faff9e7c20
--- /dev/null
+++ b/src/unit/test_bgiteration.cpp
@@ -0,0 +1,3051 @@
+/*
+ * Copyright Valkey Contributors.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD 3-Clause
+ */
+
+#include "generated_wrappers.hpp"
+#include <vector>
+
+using namespace ::testing;
+
+extern "C" {
+#include "bgiteration.h"
+#include "server.h"
+#include "stdlib.h"
+#define using usingvar // compile hack
+#include "module.h"    // uses "using" keyword
+#undef using
+extern hashtableType commandSetType;
+extern dictType keylistDictType;
+void bgIteration_feedIterators(void);
+void createSharedObjects(void);
+void hashtableDump(hashtable *ht);
+void bgIteration_unitTestDisableCloning(void);
+void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes);
+static size_t mockHashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata);
+size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid);
+}
+
+
+// The private data is a pointer to arbitrary data.  This value is used just to
+//  test that the correct value is passed through.
+#define PRIVDATA reinterpret_cast<void *>(12345)
+
+typedef int32_t bgIterationEntryMetadata; // opaque 4 bytes
+static_assert(sizeof(bgIterationEntryMetadata) == BGITERATION_ENTRY_METADATA_SIZE);
+
+// A bgIteration cleanup function used for testing.
+static int cleanupCount;
+static bool cleanupTerminated;
+static void iteratorCleanupFn(bool terminated, void *privdata) {
+    EXPECT_EQ(privdata, PRIVDATA); // the registered privdata must be passed through
+    cleanupTerminated = terminated;
+    cleanupCount += 1;
+}
+
+// A bgIteration repldone function used for testing.
+static int replDoneConfirmed;
+static bool iteratorRepldoneFn(void *privdata) {
+    EXPECT_EQ(privdata, PRIVDATA); // the registered privdata must be passed through
+    replDoneConfirmed += 1;        // every call confirms
+    return true;
+}
+
+// A more complicated repldone function that can delay the replication done condition.
+static int replDoneRejected;
+static bool iteratorRepldoneFnNotBeingReadyInitially(void *privdata) {
+    EXPECT_EQ(privdata, PRIVDATA);
+    // Reject only the very first call, to exercise the "repl done not ready yet" path.
+    if (replDoneRejected != 0) {
+        replDoneConfirmed += 1;
+        return true;
+    }
+    replDoneRejected += 1;
+    return false;
+}
+
+
+/* This mock for hashtableScan will return the items in lexical order.  It assumes that the entries
+ * are robjs containing an sds string for the key.  The key is expected to begin with a capital
+ * letter [A-Z].  The caller passes 0 as the cursor to start the iteration.  The returned cursor
+ * value will indicate the prior letter returned (1=A, ...).  After entries starting with 'Z' have
+ * been returned, the cursor of 0 will indicate that the scan is complete.  Note that all entries
+ * starting with the same letter will be returned in a single call. */
+static size_t mockHashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata) {
+    // Hashtables that don't belong to one of the two test DBs go to the unmocked function
+    bool is_test_table = (server.db[0]->keys && ht == kvstoreGetHashtable(server.db[0]->keys, 0)) ||
+                         (server.db[1]->keys && ht == kvstoreGetHashtable(server.db[1]->keys, 0));
+    if (!is_test_table) return __real_hashtableScan(ht, cursor, fn, privdata);
+
+    // Snapshot every entry; each key is asserted to begin with a capital letter
+    std::vector<dbEntry *> sorted;
+    hashtableIterator *it = hashtableCreateIterator(ht, 0);
+    dbEntry *cur;
+    while (hashtableNext(it, (void **)&cur)) {
+        const char lead = objectGetKey(cur)[0];
+        assert(lead >= 'A' && lead <= 'Z');
+        sorted.push_back(cur);
+    }
+    hashtableReleaseIterator(it);
+
+    // Order the snapshot lexicographically by key
+    std::sort(sorted.begin(), sorted.end(), [](dbEntry *lhs, dbEntry *rhs) {
+        return strcmp(objectGetKey(lhs), objectGetKey(rhs)) < 0;
+    });
+
+    // cursor 0 means start at 'A', otherwise start after the cursor letter
+    const char firstAllowed = (char)('A' + cursor);
+
+    // The smallest key whose first letter is at or beyond firstAllowed selects the
+    // letter that is emitted by this call
+    char emitLetter = 0;
+    for (dbEntry *e : sorted) {
+        if (objectGetKey(e)[0] < firstAllowed) continue;
+        emitLetter = objectGetKey(e)[0];
+        break;
+    }
+
+    if (emitLetter == 0) return 0; // nothing at or after the cursor: the scan is complete
+
+    // Deliver every entry sharing that first letter within this single call
+    for (dbEntry *e : sorted) {
+        if (objectGetKey(e)[0] != emitLetter) continue;
+        fn(privdata, (void *)e);
+    }
+
+    // The cursor encodes the letter just emitted (1=A ... 25=Y); after 'Z' it wraps to 0
+    const size_t nextCursor = (size_t)(emitLetter - 'A' + 1);
+    return (nextCursor > 25) ? 0 : nextCursor;
+}
+
+
+static bool mockHashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor) {
+    // Not one of our hashtables (or that DB has no 'keys' kvstore): use the unmocked function
+    bool our_ht = (server.db[0]->keys && ht == kvstoreGetHashtable(server.db[0]->keys, 0)) ||
+                  (server.db[1]->keys && ht == kvstoreGetHashtable(server.db[1]->keys, 0));
+    if (!our_ht) return __real_hashtableScanHasPassedKey(ht, key, cursor);
+
+    return ((const char *)key)[0] < (char)('A' + cursor); // first letter lies before the cursor letter
+}
+
+
+static const char *logfile = "";
+
+/* Most of the bgIteration unit tests are based on a CMD instance with 2 DBs.  There are 8 keys in
+ * each DB.  The hashtableScan function is mocked to return the keys in a predictable order.
+ *
+ * There are a number of helper functions to simulate certain key modification actions within our
+ * test configuration.  Note that this is isolated from the actual call to processCommand.
+ *
+ * Because most of bgIteration is based on an ordered processing of keys, it doesn't matter if we
+ * are simulating CMD or CME, full scan, or slot-based.  The majority of tests are independent of
+ * these concerns.
+ *
+ * However, there are some tests which are unique to these configurations and use a specialized
+ * derived class to handle the differences.  We do not want to duplicate all of the tests for
+ * the different configurations, but we do want to ensure that each configuration works properly.
+ *   - bgIterationTestCluster - handles tests unique to full scan in cluster mode
+ *   - bgIterationTestClusterSlots - handles tests unique to cluster slot-based iteration */
+class BgIterationTest : public ::testing::Test {
+  protected:
+    static const int DB_COUNT = 2;
+    static const int ITEMS_PER_DB = 8;
+
+  private:
+    /* With the mock hashtableScan, we get keys in a predictable order.  DB0 works with buckets
+     * containing groups of keys (which hashtableScan returns in a single call).  DB1 returns
+     * each key individually, as more separate buckets.  Convention (for test readability) is
+     * that keys beginning [A-M] would be in DB0 and keys beginning [N-Z] in DB1.  Letters are
+     * intentionally skipped to allow for possible insertions. */
+    const char *keys[DB_COUNT][ITEMS_PER_DB] = {{"B0", "B1", "B2", "E0", "E1", "H0", "H1", "H2"},
+                                                {"N0", "O0", "Q0", "R0", "T0", "U0", "W0", "Y0"}};
+
+  protected:
+    static const int TOTAL_ITEMS = DB_COUNT * ITEMS_PER_DB;
+    static const int LAST_ITEM = TOTAL_ITEMS - 1;
+
+    MockValkey mock;
+    RealValkey real;
+    client *c = nullptr;        // for general use in the tests (with common cleanup)
+    robj **orig_argv = nullptr; // Used when simulating multi
+    int orig_argc = 0;          // Used when simulating multi
+
+
+    struct serverCommand dummy_cmd = {0};
+
+    // Helper functions for accessing the keys.  We can access by db(0..1) and seq(0..7)
+    //  or by item number (0..15).
+    // NOTE: These virtual functions can be overridden in subclasses which may have different item layout.
+    virtual const char *getKeyAtDbSeq(int db, int seq) {
+        assert(db >= 0 && db < DB_COUNT); // a negative index would read outside keys[][]
+        assert(seq >= 0 && seq < ITEMS_PER_DB);
+        return keys[db][seq];
+    }
+
+    virtual int getDbFromItemNum(int itemNum) {
+        assert(itemNum >= 0 && itemNum < DB_COUNT * ITEMS_PER_DB); // e.g. -1 from itemNumFromKey()
+        return itemNum / ITEMS_PER_DB;
+    }
+
+    virtual int getSeqFromItemNum(int itemNum) {
+        assert(itemNum >= 0 && itemNum < DB_COUNT * ITEMS_PER_DB); // negative would yield a negative seq
+        return itemNum % ITEMS_PER_DB;
+    }
+
+    const char *keyStr(int itemNum) { // key name for a flat item number, via the virtual lookups
+        return getKeyAtDbSeq(getDbFromItemNum(itemNum), getSeqFromItemNum(itemNum));
+    }
+
+    int itemNumFromKey(const char *key) {
+        const int total = DB_COUNT * ITEMS_PER_DB;
+        int n = 0; // linear search; yields -1 when the key is not one of the fixture items
+        while (n < total && strcmp(key, keyStr(n)) != 0) n++;
+        return (n < total) ? n : -1;
+    }
+
+
+    // Do some general initialization before starting the suite.  Normally, the tests are run in
+    //  isolation - and this isn't much different than SetUp().  But if running the
+    //  entire test suite together (just manually running the test executable), this gets called
+    //  only once.
+    static void SetUpTestSuite() {
+        monotonicInit();
+
+        bzero(&server, sizeof(server)); // start from an all-zero global server struct
+        server.hz = 100;
+        server.logfile = const_cast<char *>(logfile); // logfile is the file-level "" string
+        createSharedObjects();
+
+        moduleInitModulesSystem();
+
+        server.commands = hashtableCreate(&commandSetType); // both tables are released in TearDownTestSuite()
+        server.orig_commands = hashtableCreate(&commandSetType);
+        populateCommandTable();
+    }
+
+
+    static void TearDownTestSuite() {
+        hashtable *tables[] = {server.commands, server.orig_commands}; // created in SetUpTestSuite()
+        for (hashtable *table : tables) hashtableRelease(table);
+    }
+
+
+    void initializeServerDb(int dbid, int slot_count_bits = 0) {
+        serverDb *db = server.db[dbid] = static_cast<serverDb *>(zcalloc(sizeof(serverDb)));
+        db->id = dbid;
+        db->keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, 0);
+        db->expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, 0);
+        db->watched_keys = dictCreate(&keylistDictType); // released again in TearDown()
+    }
+
+
+    robj *createStringObjectFromCString(const char *s) { // s must be NUL-terminated (strlen is used)
+        return createStringObject(s, strlen(s));
+    }
+
+
+    void addKeyToDb(int dbid, const char *key, const char *val) {
+        robj *valueObj = createStringObjectFromCString(val);
+        robj *keyObj = createStringObjectFromCString(key);
+        dbAdd(server.db[dbid], keyObj, &valueObj);
+        decrRefCount(keyObj); // only the temporary key object is released by this helper
+    }
+
+
+    virtual void setupDatabase() {
+        /* Build the standard fixture: DB_COUNT databases whose keys carry their own name as
+         * the value.  Scans are routed to the mocks to get a consistent iteration order. */
+
+        server.cluster_enabled = false;
+        server.dbnum = DB_COUNT;
+        server.db = static_cast<serverDb **>(zcalloc(sizeof(serverDb *) * server.dbnum));
+
+        for (int db = 0; db < server.dbnum; db++) {
+            initializeServerDb(db);
+            for (const char *name : keys[db]) {
+                addKeyToDb(db, name, name);
+            }
+        }
+
+        EXPECT_CALL(mock, hashtableScanHasPassedKey(_, _, _))
+            .WillRepeatedly(Invoke(mockHashtableScanHasPassedKey));
+        EXPECT_CALL(mock, hashtableScan(_, _, _, _))
+            .WillRepeatedly(Invoke(mockHashtableScan));
+
+        if (0) debugPrintBucketInfo(); // flip to 1 to dump the bucket layout (this fails the test)
+    }
+
+
+    void SetUp() override {
+        server.main_thread_id = pthread_self();
+        server.forkless_options_supported = 1;
+        objectSetMetadataSize(BGITERATION_ENTRY_METADATA_SIZE);
+
+        bgIteration_unitTestDisableCloning();
+
+        setupDatabase();
+
+        EXPECT_CALL(mock, aeCreateTimeEvent(_, _, _, _, _)).WillRepeatedly(Return(0));
+        bgIteration_init();
+
+        // Reset every file-level counter/flag so that no state leaks in from a previous test.
+        cleanupCount = 0;
+        cleanupTerminated = false;
+        replDoneConfirmed = 0;
+        replDoneRejected = 0;
+
+        // By default, do nothing for these
+        EXPECT_CALL(mock, blockClientInUseOnKeys(_, _, _)).WillRepeatedly(Return());
+        EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).WillRepeatedly(Return());
+        // By default, expect no permission issues
+        EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_, _, _, _, _, _)).WillRepeatedly(Return(ACL_OK));
+    }
+
+
+    void TearDown() override {
+        // Feed twice: return outstanding items first, then handle iterators closed by that.
+        for (int pass = 0; pass < 2; pass++) bgIteration_feedIterators();
+        for (int i = 0; i < server.dbnum; i++) {
+            serverDb *db = server.db[i];
+            if (db->keys) kvstoreRelease(db->keys);
+            if (db->expires) kvstoreRelease(db->expires);
+            dictRelease(db->watched_keys);
+            zfree(db);
+        }
+        zfree(server.db);
+        if (c != NULL) freeTestClient(c);
+    }
+
+
+    // Deletes an item from the DB (often at the start of a test) - but does NOT notify
+    //  bgIteration.  bgIteration_keyDelete() should be explicitly called where needed.
+    void simpleDelItem(int itemNum) {
+        int db = getDbFromItemNum(itemNum);
+
+        sds delKey = sdsnew(keyStr(itemNum));
+        int rc = kvstoreHashtableDelete(server.db[db]->keys, 0, delKey);
+        sdsfree(delKey); // release before asserting: ASSERT_EQ returns early on failure
+        ASSERT_EQ(rc, 1);
+    }
+
+
+    // Find the actual dbEntry object by itemNum
+    dbEntry *getItem(int itemNum) {
+        sds lookupKey = sdsnew(keyStr(itemNum));
+        serverDb *db = server.db[getDbFromItemNum(itemNum)];
+        dbEntry *found = dbFind(db, lookupKey);
+        sdsfree(lookupKey); // the sds was only needed for the lookup
+        return found;
+    }
+
+
+    // The test expects that the next item read will be BGITERATOR_ITEM_COMPLETE
+    void expectReadComplete(bgIterator *iter) {
+        bgIteration_feedIterators();
+        bgIteratorItem *last = bgIteratorRead(iter);
+        EXPECT_EQ(last->type, BGITERATOR_ITEM_COMPLETE);
+        bgIteratorClose(iter);
+        // Processing the close must run the cleanup callback exactly once.
+        const int cleanupsBefore = cleanupCount;
+        bgIteration_feedIterators();
+        EXPECT_EQ(cleanupCount, cleanupsBefore + 1);
+    }
+
+
+    // The test is cleaning up and isn't validating the remaining cleanup
+    void expectAnythingCleanup(bgIterator *iter) {
+        // Drain and discard items until the iterator reports COMPLETE or TERMINATED.
+        for (bool finished = false; !finished;) {
+            bgIteration_feedIterators();
+            auto type = bgIteratorRead(iter)->type;
+            finished = (type == BGITERATOR_ITEM_COMPLETE || type == BGITERATOR_ITEM_TERMINATED);
+        }
+        bgIteratorClose(iter);
+
+        bgIteration_feedIterators(); // Recognize the closed iterator
+        // NOTE: checks for exactly one cleanup in total, not for "one more than before".
+        EXPECT_EQ(cleanupCount, 1);
+    }
+
+
+    void expectDictEntryMetadataMatch(dbEntry *de1, dbEntry *de2) {
+        bgIterationEntryMetadata *dm1 = static_cast<bgIterationEntryMetadata *>(objectGetMetadata(de1));
+        bgIterationEntryMetadata *dm2 = static_cast<bgIterationEntryMetadata *>(objectGetMetadata(de2));
+
+        ASSERT_NE(dm1, nullptr); // fatal: both pointers are dereferenced below
+        ASSERT_NE(dm2, nullptr);
+        EXPECT_EQ(*dm1, *dm2);
+    }
+
+
+    // Useful when debugging new tests.  It reads/prints all remaining items then crashes.
+    void cleanupIteratorDebugPrint(bgIterator *iter) {
+        bool done = false; // set once a COMPLETE or TERMINATED item has been read
+        printf("[DEBUG] Printing bgIterator '%s' items:\n", bgIteratorName(iter));
+        while (!done) {
+            bgIteration_feedIterators();
+            bgIteratorItem *item = bgIteratorRead(iter);
+            switch (item->type) {
+            case BGITERATOR_ITEM_DBENTRY: {
+                auto obj = item->u.dbe.de;
+                const char *keyStr = objectGetKey(obj); // NOTE: this local shadows the keyStr() member
+                printf("Entry: %s -> %s [itemNum: %i]\n",
+                       keyStr,
+                       static_cast<char *>(objectGetVal(obj)),
+                       itemNumFromKey(keyStr)); // -1 when the key is not one of the fixture items
+                break;
+            }
+            case BGITERATOR_ITEM_REPLICATION:
+                printf("Repl: DB=%d : ", item->dbid);
+                for (int i = 0; i < item->u.repl.argc; i++) // print each argument of the command
+                    printf("%s ", static_cast<char *>(objectGetVal(item->u.repl.argv[i])));
+                printf("\n");
+                break;
+            case BGITERATOR_ITEM_COMPLETE:
+            case BGITERATOR_ITEM_TERMINATED:
+                bgIteratorClose(iter); // end of stream: close the iterator and leave the loop
+                done = true;
+                break;
+            default:
+                printf("unhandled: %d\n", item->type); // any other item type is only reported
+            }
+        }
+        bgIteration_feedIterators(); // Recognize the closed iterator
+        ASSERT_TRUE(false);          // Halt the test here
+    }
+
+
+    // Make a copy of the metadata
+    void *cloneMetadata(dbEntry *de) {
+        int nbytes = objectGetMetadataSize(de);
+        void *copy = zmalloc(nbytes); // released later by compareAndFreeClonedMetadata()
+        memcpy(copy, objectGetMetadata(de), nbytes);
+        return copy;
+    }
+
+
+    // Compare a previous metadata copy to an existing entry
+    void compareAndFreeClonedMetadata(dbEntry *de, void *metadata) {
+        EXPECT_EQ(memcmp(objectGetMetadata(de), metadata, objectGetMetadataSize(de)), 0); // byte-wise equality
+        zfree(metadata); // the copy is always released here, whether or not it matched
+    }
+
+
+    // The test expects the next item will be a specific key
+    //  The item value is verified against the default unless provided as a parameter.
+    void expectReadKey(bgIterator *iter, int itemNum, const char *value = nullptr) {
+        // With no explicit value, the fixture default applies: the value equals the key name.
+        const char *wantValue = (value != nullptr) ? value : keyStr(itemNum);
+        const char *wantKey = keyStr(itemNum);
+        const int wantDb = getDbFromItemNum(itemNum);
+
+        bgIteration_feedIterators();
+        bgIteratorItem *got = bgIteratorRead(iter);
+        bgIteration_feedIterators();
+
+        // Only the type check is fatal (ASSERT); the field checks below are soft (EXPECT).
+        ASSERT_EQ(got->type, BGITERATOR_ITEM_DBENTRY);
+        EXPECT_EQ(got->dbid, wantDb);
+        EXPECT_FALSE(got->u.dbe.is_cloned);
+        EXPECT_STREQ(objectGetKey(got->u.dbe.de), wantKey);
+        EXPECT_THAT(got->u.dbe.de, robjEqualsStr(wantValue));
+    }
+
+
+    // The test expects the next item will be a specific key and that the item is cloned.
+    //  Metadata is tested (to make sure the clone includes the proper metadata).
+    //  The item value is verified against the default unless provided as a parameter.
+    void expectReadClonedKey(bgIterator *iter, int itemNum, void *metadata, const char *value = nullptr) {
+        // With no explicit value, the fixture default applies: the value equals the key name.
+        const char *wantValue = (value != nullptr) ? value : keyStr(itemNum);
+        const char *wantKey = keyStr(itemNum);
+        const int wantDb = getDbFromItemNum(itemNum);
+
+        bgIteration_feedIterators();
+        bgIteratorItem *got = bgIteratorRead(iter);
+        bgIteration_feedIterators();
+
+        // Only the type check is fatal; 'metadata' is compared and freed once it has passed.
+        ASSERT_EQ(got->type, BGITERATOR_ITEM_DBENTRY);
+        EXPECT_EQ(got->dbid, wantDb);
+        EXPECT_TRUE(got->u.dbe.is_cloned);
+        compareAndFreeClonedMetadata(got->u.dbe.de, metadata);
+        EXPECT_STREQ(objectGetKey(got->u.dbe.de), wantKey);
+        EXPECT_THAT(got->u.dbe.de, robjEqualsStr(wantValue));
+    }
+
+
+    // Test expects the next key, but specified by key name, not itemNum.
+    void expectReadDbKeyValue(bgIterator *iter, int db, const char *key, const char *value) {
+        bgIteration_feedIterators();
+        bgIteratorItem *got = bgIteratorRead(iter);
+        bgIteration_feedIterators();
+        ASSERT_EQ(got->type, BGITERATOR_ITEM_DBENTRY); // fatal: the checks below need a DB entry
+        auto entry = got->u.dbe.de;
+        EXPECT_EQ(got->dbid, db);
+        EXPECT_STREQ(objectGetKey(entry), key);
+        EXPECT_THAT(entry, robjEqualsStr(value));
+    }
+
+
+    // Test expect to read a sequence of key items
+    void expectReadKeySequence(bgIterator *iter, int startItem, int endItem) {
+        for (int n = startItem; n <= endItem; ++n) expectReadKey(iter, n); // both ends inclusive
+    }
+
+
+    // Just like expectReadKey, but also tests that a previous item is becoming unblocked.
+    void expectReadKeyWithUnblock(bgIterator *iter, int itemNum, int unblockItem, const char *value = nullptr) {
+        bool stillBlocked = true; // cleared by the mocked unblock call expected below
+        EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(unblockItem))))
+            .WillOnce(Assign(&stillBlocked, false));
+        expectReadKey(iter, itemNum, value);
+        EXPECT_FALSE(stillBlocked);
+    }
+
+
+    // Test expects to read a replication item matching the command held by client 'c'
+    void expectReadReplication(bgIterator *iter, client *c) {
+        bgIteration_feedIterators();
+        bgIteratorItem *item = bgIteratorRead(iter);
+        bgIteration_feedIterators();
+
+        ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION);
+        EXPECT_EQ(item->dbid, c->db->id);
+        EXPECT_EQ(item->u.repl.cmd, c->cmd);
+        ASSERT_EQ(item->u.repl.argc, c->argc); // fatal: the loop below indexes both argv arrays up to c->argc
+        for (int i = 0; i < c->argc; i++) {
+            EXPECT_STREQ(static_cast<char *>(objectGetVal(item->u.repl.argv[i])),
+                         static_cast<char *>(objectGetVal(c->argv[i])));
+        }
+    }
+
+
+    // We expect to read a MULTI command which should have been inserted.
+    void expectReadMultiReplication(bgIterator *iter) {
+        bgIteration_feedIterators();
+        bgIteratorItem *got = bgIteratorRead(iter);
+        bgIteration_feedIterators();
+        // The replicated command must be the MULTI entry of the command table.
+        struct serverCommand *multiCmd = lookupCommandByCString("multi");
+        ASSERT_EQ(got->type, BGITERATOR_ITEM_REPLICATION);
+        EXPECT_EQ(got->u.repl.cmd, multiCmd);
+    }
+
+
+    // We expect to read an EXEC command which should have been inserted.
+    void expectReadExecReplication(bgIterator *iter) {
+        bgIteration_feedIterators();
+        bgIteratorItem *got = bgIteratorRead(iter);
+        bgIteration_feedIterators();
+        // The replicated command must be the EXEC entry of the command table.
+        struct serverCommand *execCmd = lookupCommandByCString("exec");
+        ASSERT_EQ(got->type, BGITERATOR_ITEM_REPLICATION);
+        EXPECT_EQ(got->u.repl.cmd, execCmd);
+    }
+
+
+    // Expecting that a DEL command should have been replicated.
+    void expectReadReplicationDel(bgIterator *iter, int itemNum) {
+        int db = getDbFromItemNum(itemNum);
+
+        bgIteration_feedIterators();
+        bgIteratorItem *item = bgIteratorRead(iter);
+        bgIteration_feedIterators();
+
+        ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION);
+        EXPECT_EQ(item->dbid, db);
+        EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("DEL"));
+        ASSERT_EQ(item->u.repl.argc, 2); // fatal: argv[0] and argv[1] are read below
+        EXPECT_THAT(item->u.repl.argv[0], robjEqualsStr("DEL"));
+        EXPECT_THAT(item->u.repl.argv[1], robjEqualsStr(keyStr(itemNum)));
+    }
+
+
+    // Expecting that a special SWAPDB item has been inserted.
+    void expectReadSwapDB(bgIterator *iter, int db1, int db2) {
+        bgIteration_feedIterators();
+        bgIteratorItem *item = bgIteratorRead(iter);
+        bgIteration_feedIterators();
+
+        ASSERT_EQ(item->type, BGITERATOR_ITEM_SWAPDB); // fatal: the union member below needs a SWAPDB item
+        EXPECT_EQ(item->dbid, db1);    // first DB is carried in the common dbid field
+        EXPECT_EQ(item->u.dbid2, db2); // second DB is carried in the union
+    }
+
+
+    static void debugPrintBucketInfoCb(void *privdata, void *entry) {
+        UNUSED(privdata);
+        // Scan callback: print the key of the visited entry.
+        printf("--- %s\n", objectGetKey(static_cast<dbEntry *>(entry)));
+    }
+
+    void debugPrintBucketInfo() {
+        printf("*******DEBUG*******\n");
+        for (int db = 0; db < server.dbnum; db++) {
+            int num_ht = kvstoreNumHashtables(server.db[db]->keys);
+            for (int slot = 0; slot < num_ht; slot++) {
+                hashtable *ht = kvstoreGetHashtable(server.db[db]->keys, slot);
+                if (!ht) continue; // no hashtable for this slot
+
+                printf("DB: %d, slot: %d\n", db, slot);
+                size_t cursor = 0;
+                do { // one separator line is printed after every scan call
+                    cursor = hashtableScan(ht, cursor, debugPrintBucketInfoCb, NULL);
+                    printf("-----------\n");
+                } while (cursor != 0); // a returned cursor of 0 ends the scan
+            }
+        }
+        ASSERT_TRUE(false); // debugging aid only: always fails the calling test
+    }
+
+
+    // Creates a client with a write command (SET) for the given itemNum
+    client *getWriteClient(int itemNum, const char *value) {
+        const int db = getDbFromItemNum(itemNum);
+        client *wc = static_cast<client *>(zcalloc(sizeof(client)));
+        wc->db = server.db[db];
+        wc->cmd = lookupCommandByCString("set");
+
+        // argv is: <command name> <key> <value>
+        const char *words[] = {wc->cmd->fullname, keyStr(itemNum), value};
+        wc->argc = 3;
+        wc->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * wc->argc));
+        for (int i = 0; i < wc->argc; i++) {
+            wc->argv[i] = createStringObjectFromCString(words[i]);
+        }
+
+        return wc;
+    }
+
+
+    // Create a client with a write command that touches multiple keys
+    client *getWriteMultiKeysClient(const char *cmdName,
+                                    int dstItemNum,
+                                    const std::vector<int> &srcItemsNum) {
+        assert(!srcItemsNum.empty());
+
+        // Every source key is asserted to live in the same DB as the destination key.
+        const int db = getDbFromItemNum(dstItemNum);
+        for (int srcItemNum : srcItemsNum) {
+            assert(db == getDbFromItemNum(srcItemNum));
+        }
+
+        client *mc = static_cast<client *>(zcalloc(sizeof(client)));
+        mc->cmd = lookupCommandByCString(cmdName);
+        assert(mc->cmd != nullptr);
+        mc->db = server.db[db];
+
+        // argv layout: <command name> <destination key> <source key>...
+        mc->argc = 2 + srcItemsNum.size();
+        mc->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * mc->argc));
+        robj **slot = mc->argv;
+        *slot++ = createStringObjectFromCString(mc->cmd->fullname);
+        *slot++ = createStringObjectFromCString(keyStr(dstItemNum));
+        for (int srcItemNum : srcItemsNum) *slot++ = createStringObjectFromCString(keyStr(srcItemNum));
+
+        return mc;
+    }
+
+
+    client *getWrite2KeysClient(const char *cmdName, int dstItemNum, int srcItemNum) {
+        return getWriteMultiKeysClient(cmdName, dstItemNum, {srcItemNum}); // destination + one source key
+    }
+
+
+    client *getWrite3KeysClient(const char *cmdName, int dstItemNum, int src1ItemNum, int src2ItemNum) {
+        return getWriteMultiKeysClient(cmdName, dstItemNum, {src1ItemNum, src2ItemNum}); // destination + two source keys
+    }
+
+
+    // Create a client with a MULTI/EXEC block.
+    //  This parses a series of commands separated by ';'
+    //  Example: getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx")
+    client *getMultiClient(const char *commands, int dbid = 0) {
+        char *commandsCopy = zstrdup(commands); // a mutable copy
+        char *commandStr, *commandStrSave;
+        char *token, *tokenSave;
+        const int maxArgs = 5; // argv capacity per queued command: name + 4 arguments
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+        c->db = server.db[dbid];
+        initClientMultiState(c);
+        c->flag.multi = 1;
+        c->mstate->cmd_flags |= CMD_WRITE;
+
+        commandStr = strtok_r(commandsCopy, ";", &commandStrSave);
+        while (commandStr != NULL) {
+            token = strtok_r(commandStr, " ", &tokenSave);
+            c->cmd = lookupCommandByCString(token);
+
+            c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * maxArgs));
+            for (int i = 0; token != NULL; i++) {
+                assert(i < maxArgs); // a longer command would write past the end of argv
+                c->argv[i] = createStringObjectFromCString(token);
+                c->argc = i + 1;
+                token = strtok_r(NULL, " ", &tokenSave);
+            }
+
+            queueMultiCommand(c, 0);
+            freeClientArgv(c);
+
+            commandStr = strtok_r(NULL, ";", &commandStrSave);
+        }
+
+        c->cmd = lookupCommandByCString("exec");
+        c->argc = 1;
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+        c->argv[0] = createStringObjectFromCString("EXEC");
+
+        zfree(commandsCopy);
+        return c;
+    }
+
+
+    // Initially, a MULTI client is set up to execute the EXEC command (which examines the
+    //  contents of the multi/exec block).  This function advances the client to begin executing
+    //  the individual commands within the multi/exec block.
+    void advanceMultiClientToCommand(client *c, int cmdNum) {
+        assert(cmdNum >= 0 && cmdNum < c->mstate->count);
+        if (cmdNum == 0) { // NOTE(review): assumes the first call uses cmdNum 0 while c still holds EXEC
+            // Save off the EXEC
+            orig_argc = c->argc;
+            orig_argv = c->argv;
+        }
+        c->argc = c->mstate->commands[cmdNum].argc; // the client now points at (does not copy) the queued arguments
+        c->argv = c->mstate->commands[cmdNum].argv;
+        c->argv_len = c->mstate->commands[cmdNum].argv_len;
+        c->cmd = c->realcmd = c->mstate->commands[cmdNum].cmd;
+    }
+
+
+    // A client with a fictional command:
+    //  SETGET <write_key> <value> <read_key>
+    //  - writes a value to the first key (making this CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY)
+    //  - reads a second key
+    client *getSetGetClient(int itemNum1, const char *value1, int itemNum2) {
+        // Fictional command which writes to 1st key and reads the 2nd
+        int db = getDbFromItemNum(itemNum1);
+        assert(db == getDbFromItemNum(itemNum2)); // (this would be a testcase error)
+
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+        struct serverCommand *cmd = static_cast<struct serverCommand *>(zcalloc(sizeof(struct serverCommand)));
+
+        cmd->fullname = sdsnew("SETGET"); // heap-allocated name; freeTestClient() frees it together with cmd
+        cmd->arity = 4;
+        cmd->flags = CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY;
+
+        cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INDEX;
+        cmd->legacy_range_key_spec.bs.index.pos = 1; // firstkey
+        cmd->legacy_range_key_spec.fk.range.lastkey = -1;
+        cmd->legacy_range_key_spec.fk.range.keystep = 2; // presumably selects argv[1] and argv[3] — TODO confirm
+
+        c->cmd = cmd;
+        c->db = server.db[db];
+
+        c->argc = 4; // SETGET <write_key> <value> <read_key>
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+        c->argv[0] = createStringObjectFromCString(cmd->fullname);
+        c->argv[1] = createStringObjectFromCString(keyStr(itemNum1));
+        c->argv[2] = createStringObjectFromCString(value1);
+        c->argv[3] = createStringObjectFromCString(keyStr(itemNum2));
+
+        return c;
+    }
+
+
+    // Client with a fictional write command with no keys specified
+    client *getNoKeysWriteClient() {
+        // Fictional command which is marked WRITE, but has no keys.
+        struct serverCommand *fake = static_cast<struct serverCommand *>(zcalloc(sizeof(struct serverCommand)));
+        fake->fullname = sdsnew("NOKEYSWRITE");
+        fake->flags = CMD_WRITE;
+        fake->arity = 1;
+        fake->legacy_range_key_spec.begin_search_type = KSPEC_BS_INVALID; // No keys
+
+        // The heap-allocated command hangs off the client; freeTestClient() releases it.
+        client *wc = static_cast<client *>(zcalloc(sizeof(client)));
+        wc->db = server.db[0];
+        wc->cmd = fake;
+
+        // argv holds just the command name
+        wc->argc = 1;
+        wc->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * wc->argc));
+        wc->argv[0] = createStringObjectFromCString(fake->fullname);
+
+        return wc;
+    }
+
+
+    void freeClientArgv(client *c) {
+        for (int count = c->argc, i = 0; i < count; i++) decrRefCount(c->argv[i]);
+        zfree(c->argv);
+        c->argc = 0; // leave the client with an empty argument vector
+        c->argv = NULL;
+    }
+
+
+    // During testing, we create some fake commands.  This checks if the command is real or fake.
+    //  A fake command is dynamically allocated and can be freed.  Real commands are static.
+    bool isRealValkeyCommand(struct serverCommand *cmd) {
+        return lookupCommandByCString(cmd->declared_name) != nullptr; // a non-null lookup result => real
+    }
+
+
+    void freeTestClient(client *c) {
+        // If the current command references one of the multi commands, set it back to the EXEC
+        if (c->mstate != NULL) {
+            for (int i = 0; i < c->mstate->count; i++) {
+                if (c->argv == c->mstate->commands[i].argv) { // argv currently aliases a queued command
+                    c->argc = orig_argc; // values saved by advanceMultiClientToCommand()
+                    c->argv = orig_argv;
+                    orig_argc = 0;
+                    orig_argv = nullptr;
+                    break;
+                }
+            }
+        }
+        freeClientMultiState(c);
+        freeClientArgv(c); // runs after the restore above, so it frees the argv the client holds now
+
+        if (!isRealValkeyCommand(c->cmd)) { // fake commands were heap-allocated by the test helpers
+            sdsfree(c->cmd->fullname);
+            zfree(c->cmd);
+        }
+
+        zfree(c);
+    }
+
+
+    // Simulate what happens when a write command is blocked
+    void simulateBlockedWrite(client *c, int expectedNumberBlockedKeys = 1) {
+        // The block hook must be invoked exactly once, with the expected number of keys.
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c, expectedNumberBlockedKeys, _)).Times(1);
+        EXPECT_TRUE(bgIteration_blockClientIfRequired(c));
+    }
+
+
+    // Simulate what happens when a write command isn't blocked
+    void simulateUnblockedWrite(client *c) {
+        // The block hook must never be invoked for this client.
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0);
+        EXPECT_FALSE(bgIteration_blockClientIfRequired(c));
+    }
+
+
+    // Drives a write that would ordinarily block the client (bgIteration still needs the item)
+    //  but is let through because the key can be cloned and the clone expedited instead.
+    void simulateClonedWrite(bgIterator *it, client *c) {
+        // Snapshot the clone counter so the increment can be verified afterwards
+        bgIteratorStatus before;
+        bgIteratorGetStatus(it, &before);
+        const unsigned long clonesBefore = before.dbentry_clones_queued;
+
+        // The client must pass straight through without a block request
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0);
+        EXPECT_FALSE(bgIteration_blockClientIfRequired(c));
+
+        // Exactly one additional clone must have been queued
+        bgIteratorStatus after;
+        bgIteratorGetStatus(it, &after);
+        EXPECT_EQ(after.dbentry_clones_queued, (clonesBefore + 1));
+
+        // The real entry must not be marked inuse, since the clone took its place
+        sds key = static_cast<sds>(objectGetVal(c->argv[1]));
+        dbEntry *entry = dbFind(c->db, key);
+        ASSERT_FALSE(bgIteration_isEntryInuse(entry));
+    }
+
+
+    // Simulates what happens when a write command (SET) actually executes.  This requires a
+    //  scenario where we would NOT be blocked on the write.  It actually alters the value of
+    //  the key (via setKey), checks the entry metadata, and reports the command for replication.
+    void simulateUnblockedWriteWithModification(client *c) {
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0); // no block request may be issued
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_FALSE(blocked);
+
+        // Fake execution of the command - touch the iterator_epoch counter and swap the value
+        // We need to duplicate the value because setKey() can reallocate it.
+        robj *value = dupStringObject(c->argv[2]);
+        setKey(c, c->db, c->argv[1], &value, SETKEY_ADD_OR_UPDATE);
+
+        // Let's make sure that setKey updated the iteration epoch (as it should have)
+        dbEntry *de = dbFind(c->db, static_cast<sds>(objectGetVal(c->argv[1])));
+        bgIterationEntryMetadata *md = static_cast<bgIterationEntryMetadata *>(objectGetMetadata(de));
+        bgIterationEntryMetadata md_after_setkey = *md; // snapshot by value for the comparison below
+        // Now notify the modification a second time; the metadata should still match the snapshot
+        bgIteration_dbEntryModified(de);
+        EXPECT_EQ(md, objectGetMetadata(de)); // the md location shouldn't have changed
+        EXPECT_EQ(md_after_setkey, *md);      // the md value should still be the same
+
+        server.in_call++; // NOTE(review): presumably mimics being inside a command call - confirm
+        bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv);
+        server.in_call--;
+    }
+
+
+    // Simulate the expiration (active expiration) of a key.  This is independent of command execution.
+    //  bgIteration is notified in two steps: keyDelete before the item is removed, and a DEL
+    //  replication event after it is removed.
+    void simulateExpiration(int itemNum) {
+        // Run the fatal precondition checks before allocating anything.  A failing ASSERT_* returns
+        //  from this function immediately, which would otherwise leak the argv objects below.
+        ASSERT_NE(getItem(itemNum), nullptr); // Should be there before expire
+        ASSERT_EQ(server.in_call, 0);         // test sanity check
+
+        // Build the DEL that bgIteration will be sent
+        int db = getDbFromItemNum(itemNum);
+        robj *argv[2];
+        argv[0] = createStringObjectFromCString("DEL");
+        argv[1] = createStringObjectFromCString(keyStr(itemNum));
+        serverCommand *cmd = lookupCommandByCString("DEL");
+
+        // KeyDelete should be called before the deletion occurs
+        bgIteration_keyDelete(db, static_cast<sds>(objectGetVal(argv[1])));
+
+        simpleDelItem(itemNum); // Simulate the actual del
+
+        // Replication happens after the deletion occurs
+        bgIteration_handleCommandReplication(db, cmd, 2, argv);
+        decrRefCount(argv[0]);
+        decrRefCount(argv[1]);
+
+        EXPECT_EQ(getItem(itemNum), nullptr);
+    }
+
+
+    // Expires an item that bgIteration currently holds, checking the reference count before and
+    //  after the expiration.
+    void simulateExpirationOfInuse(int itemNum) {
+        dbEntry *held = getItem(itemNum);
+        ASSERT_NE(held, nullptr); // Must exist before it can expire
+
+        // While bgIteration holds the item, an extra reference is expected on top of the DB's own.
+        EXPECT_TRUE(bgIteration_isEntryInuse(held));
+        EXPECT_EQ(held->refcount, 2u);
+
+        simulateExpiration(itemNum);
+
+        // The item is out of the DB now but still allocated, with one reference left.  This allows
+        //  a background thread to continue using the item.
+        EXPECT_EQ(held->refcount, 1u);
+    }
+
+
+    // Simulates an expiration, but the item is a future item which will be expedited.
+    void simulateExpirationWithExpedite(int itemNum) {
+        // A future item is not yet inuse by bgIteration, so no extra reference has been taken
+        //  and the refcount is still 1.
+        dbEntry *de = getItem(itemNum);
+        ASSERT_NE(de, nullptr);                     // Should be there before expire
+        EXPECT_FALSE(bgIteration_isEntryInuse(de)); // Not yet inuse
+        EXPECT_EQ(de->refcount, 1u);
+
+        simulateExpiration(itemNum);
+
+        // At this point, the item is removed from the DB, but still exists: it is now marked inuse
+        //  and the refcount is still 1.  This allows a background thread to continue using the item.
+        EXPECT_TRUE(bgIteration_isEntryInuse(de)); // It's inuse now
+        EXPECT_EQ(getItem(itemNum), nullptr);      // but it's not in the DB anymore
+        EXPECT_EQ(de->refcount, 1u);
+    }
+
+
+    // Simulate execution of a SWAPDB command between the two given databases.
+    //  NOTE: each database id is rendered as the single character ('0' + id), so only ids 0-9
+    //  yield a numeric argument.
+    void simulateSwapDB(int dbid0, int dbid1) {
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+        c->db = server.db[0];
+        c->cmd = lookupCommandByCString("swapdb");
+
+        // One-character, NUL-terminated strings for the two database ids
+        char first[2] = {0};
+        char second[2] = {0};
+        first[0] = '0' + dbid0;
+        second[0] = '0' + dbid1;
+
+        // argv = { <command name>, <dbid0>, <dbid1> }
+        c->argc = 3;
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+        c->argv[0] = createStringObjectFromCString(c->cmd->fullname);
+        c->argv[1] = createStringObjectFromCString(first);
+        c->argv[2] = createStringObjectFromCString(second);
+
+        EXPECT_FALSE(bgIteration_blockClientIfRequired(c)); // SWAPDB should never block
+
+        // The real SWAP does more than this, but this is enough for unit tests
+        serverDb *held = server.db[dbid0];
+        server.db[dbid0] = server.db[dbid1];
+        server.db[dbid1] = held;
+
+        bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv);
+
+        freeTestClient(c);
+    }
+
+
+    // Simulate execution of a FLUSHDB or FLUSHALL command
+    //  db:          database to flush, or -1 to flush every database (FLUSHALL).
+    //  anInUseItem: number of an item expected to have a refcount of 2 before the flush.  The
+    //               helper checks that the flush leaves the entry with a refcount of 1.
+    void simulateFlushDB(int db, int anInUseItem) {
+        // Look the entry up (and fail fatally if it is missing) before allocating the client, so
+        //  that an early return from ASSERT_* neither dereferences null nor leaks the client.
+        dbEntry *de_in_use = getItem(anInUseItem);
+        ASSERT_NE(de_in_use, nullptr);
+        EXPECT_EQ(de_in_use->refcount, 2u);
+
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+
+        if (db == -1) {
+            c->cmd = lookupCommandByCString("flushall");
+            c->db = server.db[0];
+        } else {
+            c->cmd = lookupCommandByCString("flushdb");
+            c->db = server.db[db];
+        }
+
+        c->argc = 1;
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+        c->argv[0] = createStringObjectFromCString(c->cmd->fullname);
+
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_FALSE(blocked); // FLUSHDB should never block
+
+        // The real FLUSH does more than this, but this is enough for unit tests
+
+        // Now flush the items
+        for (int d = 0; d < server.dbnum; d++) {
+            if (db == -1 || db == d) {
+                kvstoreRelease(server.db[d]->keys);
+                server.db[d]->keys = NULL;
+            }
+        }
+
+        // One reference is gone after the flush, but the entry itself is still readable
+        EXPECT_EQ(de_in_use->refcount, 1u);
+
+        // and replicate
+
+        bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv);
+
+        freeTestClient(c);
+    }
+};
+
+
+TEST_F(BgIterationTest, dbIsOK) {
+    // Intentionally empty: only the fixture's setup/teardown code runs, to make sure the DB is OK.
+}
+
+
+/////////////////////////////////////////////////////
+// Simple Full-scan iterator tests
+/////////////////////////////////////////////////////
+
+// A simple full scan that just checks basic flow: create, find by name, check initial status, clean up.
+TEST_F(BgIterationTest, createAndCleanup) {
+    bgIterator *it = bgIteratorCreateFullScanIter("simple", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    EXPECT_EQ(bgIteratorFind("simple"), it); // the iterator can be found by its name
+    EXPECT_STREQ(bgIteratorName(it), "simple");
+
+    bgIteratorStatus status;
+    bgIteratorGetStatus(it, &status);
+
+    EXPECT_EQ(status.dbentries_queued, 0u); // nothing has been read yet, so all counters are zero
+    EXPECT_EQ(status.dbentries_processed, 0u);
+    EXPECT_EQ(status.replication_queued, 0u);
+    EXPECT_EQ(status.replication_processed, 0u);
+    EXPECT_EQ(status.swapdb_queued, 0u);
+    EXPECT_EQ(status.swapdb_processed, 0u);
+    EXPECT_EQ(status.flushdb_queued, 0u);
+    EXPECT_EQ(status.flushdb_processed, 0u);
+
+    EXPECT_EQ(status.queue_length, 0u);
+    EXPECT_GT(status.queue_length_target, 0u); // a non-zero target is expected right after creation
+
+    EXPECT_LT(status.runtime_ms, 5u); // NOTE(review): looks time-based; may be flaky on a slow host
+    EXPECT_EQ(status.current_item_ms, 0u);
+
+    expectAnythingCleanup(it);
+
+    EXPECT_EQ(bgIteratorFind("simple"), nullptr); // after cleanup the name no longer resolves
+}
+
+
+// Close the iterator before reading anything from it
+TEST_F(BgIterationTest, testClientCloseBeforeRead) {
+    bgIterator *it = bgIteratorCreateFullScanIter("simple", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    bgIteration_feedIterators();
+
+    bgIteratorClose(it); // Immediately close before reading
+
+    bgIteration_feedIterators(); // Recognize the closed iterator
+
+    // Check that the cleanup callback ran exactly once and was flagged as terminated
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+
+// Test that the full scan hits each item in the expected sequence.
+TEST_F(BgIterationTest, orderedIteration) {
+    bgIterator *it = bgIteratorCreateFullScanIter("simple", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKeySequence(it, 0, LAST_ITEM);
+
+    // Quick status check.  At this point, the final item hasn't been returned yet, so the
+    //  processed count is expected to be one less than the queued count.
+    bgIteratorStatus status;
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentries_queued, static_cast<unsigned int>(TOTAL_ITEMS));
+    EXPECT_EQ(status.dbentries_processed, static_cast<unsigned int>(TOTAL_ITEMS) - 1);
+
+    expectReadComplete(it); // Returns the final item, and reads the completion item
+
+    // Check that the cleanup callback ran exactly once and was not flagged as terminated
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_FALSE(cleanupTerminated);
+}
+
+
+// Run two full scans side by side, interleaving their reads at random, and check that each one
+//  still sees every item in order and gets its own cleanup callback.
+TEST_F(BgIterationTest, twoOrderedIterations) {
+    bgIterator *first = bgIteratorCreateFullScanIter("simple1", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                     iteratorCleanupFn, PRIVDATA);
+    bgIterator *second = bgIteratorCreateFullScanIter("simple2", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                      iteratorCleanupFn, PRIVDATA);
+    EXPECT_EQ(bgIteratorFind("simple1"), first);
+    EXPECT_EQ(bgIteratorFind("simple2"), second);
+
+    int nextForFirst = 0;
+    int nextForSecond = 0;
+    while (nextForFirst < TOTAL_ITEMS || nextForSecond < TOTAL_ITEMS) {
+        // Flip a coin to pick the iterator; an iterator that is already exhausted skips its turn.
+        const bool pickFirst = (rand() % 2) == 0;
+        if (pickFirst && nextForFirst < TOTAL_ITEMS) {
+            expectReadKey(first, nextForFirst++);
+        } else if (!pickFirst && nextForSecond < TOTAL_ITEMS) {
+            expectReadKey(second, nextForSecond++);
+        }
+    }
+
+    // All that remains is the completion of each iterator, one cleanup callback apiece
+    expectReadComplete(first);
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_FALSE(cleanupTerminated);
+    expectReadComplete(second);
+    EXPECT_EQ(cleanupCount, 2);
+    EXPECT_FALSE(cleanupTerminated);
+}
+
+
+/////////////////////////////////////////////////////
+// MODIFY A FUTURE ITEM
+// The next tests validate the basic pattern when a key, not yet iterated, is modified.
+// Each variation of iteration flags is tested.
+// Note that these tests execute without cloning (cloning is tested elsewhere).
+/////////////////////////////////////////////////////
+
+// Modify a future item, without replication or consistency.
+// Our expectation for this case is that the modification should proceed without blocking, the item
+//  shouldn't be expedited, and we will see the modified item once the iterator reaches it.
+TEST_F(BgIterationTest, modFutureItem) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    c = getWriteClient(6, "xxx");
+
+    // We DON'T expect the client to be blocked - the iterator is not consistent
+    simulateUnblockedWriteWithModification(c);
+
+    // Now continue reading 1, 2, 3, 4, 5 in the normal order (key 6 was not expedited)
+    expectReadKeySequence(it, 1, 5);
+
+    // Let's validate that key 6 shows the new value
+    expectReadKey(it, 6, "xxx");
+
+    // Continue with the remaining keys and the completion
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a future item, without replication but with consistency.  (Like a SAVE operation)
+// Our expectation for this case is that the modification SHOULD be blocked, as we have to save
+//  the item in its state before the modification.  To reduce blocking time, the item should be
+//  moved to the head of the queue - there's no replication in this case, so out-of-order processing
+//  isn't a concern.
+TEST_F(BgIterationTest, modFutureItem_start) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    c = getWriteClient(6, "xxx");
+    // Since this is consistent, we will block the client, disallowing the write.
+    simulateBlockedWrite(c);
+
+    // On a consistent iterator, the event is expedited in front of items already in queue!
+    //  Read key 6 out of order (still with its original value, since the write was blocked).
+    expectReadKey(it, 6);
+
+    // Now, when we read key 1, key 6 is released back to Valkey, and the client will be unblocked.
+    expectReadKeyWithUnblock(it, 1, 6);
+    simulateUnblockedWriteWithModification(c); // Now the write can proceed
+
+    // Continue...
+    expectReadKeySequence(it, 2, 5);
+    // 6 has already been processed, so it is skipped here
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a future item, with replication but without consistency.  (Like a Threadsave Full Sync operation)
+// Our expectation for this case is that the modification should proceed without blocking, as the
+//  mode is inconsistent.  We don't expect replication, as we haven't reached the item yet.  We'll
+//  see the modified item later.
+TEST_F(BgIterationTest, modFutureItem_eventual) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    c = getWriteClient(6, "xxx");
+
+    // We DON'T expect the client to be blocked - the iterator is not consistent
+    simulateUnblockedWriteWithModification(c);
+
+    // NOTE:  Since we haven't reached this item yet, and consistency is not required, there's no
+    //        need to replicate this command.  So everything should wrap up just fine - we will see
+    //        the new value when we get to it.
+
+    // Now continue reading 1, 2, 3, 4, 5 (no replication event is expected in between)
+    expectReadKeySequence(it, 1, 5);
+
+    // Let's validate that key 6 shows the new value
+    expectReadKey(it, 6, "xxx");
+
+    // Continue with the remaining keys and the completion
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/////////////////////////////////////////////////////
+// MODIFY A CURRENT ITEM
+// The next tests validate the basic pattern when a key, currently in use, is modified.
+// Each variation of iteration flags is tested.
+// Note that these tests execute without cloning (cloning is tested elsewhere).
+/////////////////////////////////////////////////////
+
+// Modify a current item, without replication or consistency.
+// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't
+//  be expedited (it's already in use).
+TEST_F(BgIterationTest, modCurrentItem) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    c = getWriteClient(2, "xxx");
+
+    // Must be blocked since key 2 is already queued
+    simulateBlockedWrite(c);
+
+    // Now continue reading; the unblock for key 2 is expected together with the read of key 3
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKeyWithUnblock(it, 3, 2);
+    simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication)
+
+    // Continue with the remaining keys and the completion
+    expectReadKeySequence(it, 4, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a current item, without replication but with consistency.  (Like a SAVE operation)
+// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't
+//  be expedited (it's already in use).
+TEST_F(BgIterationTest, modCurrentItem_start) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    c = getWriteClient(2, "xxx");
+
+    // Must be blocked since key 2 is already queued
+    simulateBlockedWrite(c);
+
+    // Now continue reading; the unblock for key 2 is expected together with the read of key 3
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKeyWithUnblock(it, 3, 2);
+    simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication)
+
+    // Continue with the remaining keys and the completion
+    expectReadKeySequence(it, 4, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a current item, with replication but without consistency.  (Like a Threadsave Full Sync operation)
+// Our expectation for this case is that the modification SHOULD be blocked.  After the key is processed,
+//  the write will proceed, and the replication will be sent.
+TEST_F(BgIterationTest, modCurrentItem_eventual) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    c = getWriteClient(2, "xxx");
+
+    // Must be blocked since key 2 is already queued
+    simulateBlockedWrite(c);
+
+    // Now continue reading; the unblock for key 2 is expected together with the read of key 3
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKeyWithUnblock(it, 3, 2);
+    simulateUnblockedWriteWithModification(c); // the actual write will cause replication
+
+    expectReadKey(it, 4); // 4 got put in queue when 3 was read
+
+    expectReadReplication(it, c); // the replicated write arrives after key 4
+
+    // Continue with the remaining keys and the completion
+    expectReadKeySequence(it, 5, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/////////////////////////////////////////////////////
+// MODIFY A PAST ITEM
+// The next tests validate the basic pattern when a key, already iterated on, is modified.
+// Each variation of iteration flags is tested.
+// Note that these tests execute without cloning (cloning is tested elsewhere).
+/////////////////////////////////////////////////////
+
+// Modify a past item, without replication or consistency.
+// Our expectation for this case is that the modification should proceed without blocking.
+//  No replication is generated and keys are processed similar to no modification.
+TEST_F(BgIterationTest, modPastItem) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // This read returns key 0 (making it a past item)
+    expectReadKey(it, 1);
+
+    // At this point, key 0 is returned, so this write targets a past item.
+    c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // Continue in the normal order; no replication event is expected
+    expectReadKeySequence(it, 2, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a past item, without replication but with consistency.  (Like a SAVE operation)
+// Our expectation for this case is that the modification should proceed without blocking.
+//  No replication is generated and keys are processed similar to no modification.
+TEST_F(BgIterationTest, modPastItem_start) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // This read returns key 0 (making it a past item)
+    expectReadKey(it, 1);
+
+    // At this point, key 0 is returned, so this write targets a past item.
+    c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // Continue in the normal order; no replication event is expected
+    expectReadKeySequence(it, 2, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a past item, with replication but without consistency.  (Like a Threadsave Full Sync operation)
+// Our expectation for this case is that the modification should proceed without blocking.
+//  Replication will be sent.
+TEST_F(BgIterationTest, modPastItem_eventual) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // This read returns key 0 (making it a past item)
+    expectReadKey(it, 1);
+
+    // At this point, key 0 is returned, so this write targets a past item.
+    c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 2 was already in queue (same bucket as key 1).  The replication will follow.
+    expectReadKey(it, 2);
+    expectReadReplication(it, c);
+
+    // Continue with the remaining keys and the completion
+    expectReadKeySequence(it, 3, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/////////////////////////////////////////////////////
+// TESTS FOR ITEM CLONING
+/////////////////////////////////////////////////////
+
+// In a consistent iteration, verify that a simple string is properly cloned, and that a write can
+//  occur without blocking.  Validate the cloned item and metadata.
+TEST_F(BgIterationTest, modFutureItem_start_CloneExpeditedItem) {
+    // Initialize cloning configurations.
+    bgIteration_unitTestEnableCloning(50, 100); // presumably (max item size, clone pool size) - confirm
+
+    bgIteratorStatus status;
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    c = getWriteClient(6, "xxx");
+
+    // Quick status check.  At this point, no clones exist yet.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 0u);
+
+    // Since item 6 should be cloned, it will not block the client, allowing the write.
+    void *de6_md = cloneMetadata(getItem(6)); // copy taken before the write, checked by expectReadClonedKey
+    simulateClonedWrite(it, c);                // This wouldn't block, and queues the cloned value
+    simulateUnblockedWriteWithModification(c); // This modifies the real entry in the de (touching metadata)
+
+    // At this point, one clone is in the queue.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 1u);
+
+    // On a consistent iterator, the event is expedited in front of items already in queue!
+    //  Read key 6 (which is cloned) out of order.  The value will still match the key.
+    expectReadClonedKey(it, 6, de6_md); // Also validates and frees the metadata
+
+    // Quick status check.  At this point, cloned items have not been marked as processed yet.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 0u);
+
+    // Reading key 1 will release key 6, and the clone will finish processing.
+    expectReadKey(it, 1);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 1u);
+
+    // Reading key 2 should have no impact on the number of processed clones.
+    expectReadKey(it, 2);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 1u);
+
+    // Continue...
+    expectReadKeySequence(it, 3, 5);
+    // 6 has already been processed, so it is skipped here
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Check that cloning for simple strings respects the size limits and pool size.  On a
+//  consistent iteration, we expect to block or clone on all future keys.  We validate that we can
+//  clone if the item is small enough and the cloning pool has more space left.
+TEST_F(BgIterationTest, modFutureItem_start_LargeItemOrClonePoolFull) {
+    // Initialize cloning configurations to test the clone pool functionality first.
+    bgIteration_unitTestEnableCloning(50, 50); // presumably (max item size, clone pool size) - confirm
+
+    bgIteratorStatus status;
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake modifications to later keys so that we can see if they get processed out of order.
+    client *c6 = getWriteClient(6, "xxx"); // local clients; each is freed explicitly at the end of the test
+    client *c7 = getWriteClient(7, "xxx");
+    client *c8 = getWriteClient(8, "xxx");
+
+    // Quick status check.  At this point, no clones exist yet.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 0u);
+
+    // Since item 6 should be cloned, it will not block the client, allowing the write.
+    void *de6_md = cloneMetadata(getItem(6));
+    simulateClonedWrite(it, c6);
+    simulateUnblockedWriteWithModification(c6);
+
+    // At this point, one clone is in the queue.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 1u);
+
+    // Now that cloning pool is full, item 7 will not be cloned and the client will be blocked.
+    simulateBlockedWrite(c7);
+    ASSERT_TRUE(bgIteration_isEntryInuse(getItem(7)));
+
+    // There is still only one cloned item in the queue.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 1u);
+
+    // Now change cloning configurations to test that large items will not be cloned. We adjust
+    //  the clone pool size to allow two items, but set the maximum item size to be smaller than
+    //  the size of item 8. The clone pool size must be larger than the total size of the existing
+    //  clones plus the maximum item clone size.
+    bgIteration_unitTestEnableCloning(1, 101);
+
+    // This write will pass the clone pool check but fail the item size check, blocking the client.
+    simulateBlockedWrite(c8);
+    ASSERT_TRUE(bgIteration_isEntryInuse(getItem(8)));
+
+    // On a consistent iterator, the expedited item is placed in front of items already in queue!
+    //  Read key 6 out of order.
+    expectReadClonedKey(it, 6, de6_md);
+
+    // Now, when we read key 7, which was expedited, key 6 will be released back to Valkey
+    //  and the clone will be deallocated here.
+    expectReadKey(it, 7);
+
+    // Now, when we read key 8, which was expedited, key 7 is released back to Valkey, and the client
+    // will be unblocked.
+    // (actually, unblock is called after every key [just in case] - but functionally we only care
+    //  about this one)
+    expectReadKeyWithUnblock(it, 8, 7);
+    simulateUnblockedWriteWithModification(c7);
+
+    // Now, when we read key 1, key 8 is released back to Valkey, and the client will be unblocked.
+    expectReadKeyWithUnblock(it, 1, 8);
+    simulateUnblockedWriteWithModification(c8);
+
+    // Since only one item was cloned, there should be one clone processed
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 1u);
+
+    // Continue...
+    expectReadKeySequence(it, 2, 5);
+    // 6, 7, and 8 have already been processed
+    expectReadKeySequence(it, 9, LAST_ITEM);
+    expectReadComplete(it);
+    freeTestClient(c6);
+    freeTestClient(c7);
+    freeTestClient(c8);
+}
+
+
+/////////////////////////////////////////////////////
+// TESTS RELATED TO MODIFICATION OF TWO ITEMS
+// When 2 keys are modified, we need to ensure that both keys have been sent before we can send
+//  replication.  This means that if replication is present, we may have to block/expedite for
+//  future keys, even in the inconsistent scenario.
+/////////////////////////////////////////////////////
+
+// Replication enabled, but NOT consistent.  In this case, if ANY of the keys have been iterated,
+//  ALL of the keys must be replicated so that the command can be processed properly on the replica.
+TEST_F(BgIterationTest, modPastFutureItem_eventual) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // In this test, we need a past and future key IN THE SAME DB (they're used in the same command).
+    //  DB1 has lots of buckets.  After reading item 9,
+    //    8 will be past, 10 will be in queue, 11-15 will be future.
+    expectReadKeySequence(it, 0, 9);
+
+    // We're going to write to key 8 (past) and read from key 12 (future)
+    // Even though key 12 is for READ in this command, it must be expedited so that it exists before
+    //  the associated replication is sent.
+    c = getSetGetClient(8, "xxx", 12);
+    simulateBlockedWrite(c); // blocked until the expedited key 12 has been processed
+
+    // Key 12 will be expedited, but not in front of existing items in queue (can only do that for
+    //  consistent iterators)
+
+    expectReadKey(it, 10);
+    expectReadKey(it, 12); // expedited
+
+    expectReadKeyWithUnblock(it, 11, 12); // 13 is now in queue
+
+    simulateUnblockedWriteWithModification(c);
+
+    // Continue: key 13 was queued before the write, so the replication arrives after it
+    expectReadKey(it, 13);
+    expectReadReplication(it, c);
+
+    expectReadKeySequence(it, 14, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Replication NOT enabled.  A read-only key doesn't need to be expedited, even if other keys have
+//  been processed already.  (This should work identically for both consistent/non-consistent.)
+TEST_F(BgIterationTest, modPastFutureItem_start) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter1", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // In this test, we need a past and future key IN THE SAME DB (they're used in the same command).
+    //  DB1 has lots of buckets.  After reading item 9,
+    //    8 will be past, 10 will be in queue, 11-15 will be future.
+    expectReadKeySequence(it, 0, 9);
+
+    // We're going to write to key 8 (past) and read from key 12 (future)
+    // Since there's no replication, we don't have to worry about expediting 12.  The write will
+    //  proceed without blocking.
+    c = getSetGetClient(8, "xxx", 12);
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 12 will not be expedited.  Remaining keys should be received in normal order.
+    expectReadKeySequence(it, 10, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+// No replication, no consistency.  A read-only future key doesn't need to be expedited.
+TEST_F(BgIterationTest, modPastFutureItem) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter2", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // In this test, we need a past and future key IN THE SAME DB (they're used in the same command).
+    //  DB1 has lots of buckets.  After reading item 9,
+    //    8 will be past, 10 will be in queue, 11-15 will be future.
+    expectReadKeySequence(it, 0, 9);
+
+    // We're going to write to key 8 (past) and read from key 12 (future)
+    // Since there's no replication, we don't have to worry about expediting 12.  The write will
+    //  proceed without blocking.
+    c = getSetGetClient(8, "xxx", 12);
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 12 will not be expedited.  Remaining keys should be received in normal order.
+    expectReadKeySequence(it, 10, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/////////////////////////////////////////////////////
+// TESTS RELATED TO MISSING ITEMS
+// Missing items are tricky.  A missing item might be logically located in the past or future, in
+//  relation to the current iteration position.  The command may (or may not) create the "missing"
+//  key.  Some general considerations:
+//    * In a consistent iteration, a missing key didn't exist at the time of consistency, or it was
+//      already processed (saved) at the time of the deletion.  If the missing key gets created, we
+//      must be sure to skip it if we later iterate over it.
+//    * In a non-consistent iteration with replication:
+//        * If the key location is already passed, the replication is sent, allowing the key to be
+//          created (or not) based on the replication.
+//        * If the key location is in the future, we can allow the command to proceed unblocked.  If
+//          the key is created, its replication is sent and the key is skipped when iterated later.
+//
+// We expect:
+//  no-repl, no-consist:  past items are ignored - future items are processed when iterated
+//  no-repl, yes-consist:  past items are ignored - future items are ignored
+//  yes-repl, no-consist:  past item skipped, but replicated - future items are created by replication and skipped later
+//  yes-repl, yes-consist:  past item skipped, but replicated - future items are processed when iterated
+/////////////////////////////////////////////////////
+
+// no-repl, no-consist: creation of PAST item has no impact
+TEST_F(BgIterationTest, missingPastItem) {
+    simpleDelItem(0); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+
+    c = getWriteClient(0, "xxx"); // Re-creates key 0, whose location is already in the past
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKeySequence(it, 3, LAST_ITEM); // key 0 is never reported
+    expectReadComplete(it);
+}
+
+
+// no-repl, yes-consist: creation of PAST item has no impact
+TEST_F(BgIterationTest, missingPastItem_start) {
+    simpleDelItem(0); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+
+    c = getWriteClient(0, "xxx"); // Re-creates key 0, whose location is already in the past
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKeySequence(it, 3, LAST_ITEM); // key 0 is never reported
+    expectReadComplete(it);
+}
+
+
+// yes-repl, no-consist: creation of a PAST item will be replicated
+TEST_F(BgIterationTest, missingPastItem_eventual) {
+    simpleDelItem(0); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKey(it, 3);
+
+    c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c); // replication will be added after item 4 (3,4 in same bucket)
+
+    expectReadKey(it, 4);
+
+    expectReadReplication(it, c);
+
+    expectReadKeySequence(it, 5, LAST_ITEM); // key 0 itself is never iterated
+    expectReadComplete(it);
+}
+
+
+// no-repl, no-consist: creation of FUTURE item is seen when reached by the iteration.
+TEST_F(BgIterationTest, missingFutureItem) {
+    // Using DB1 so we have lots of buckets
+    simpleDelItem(14); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+
+    const char *newValue = "xxx";
+    c = getWriteClient(14, newValue);
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKeySequence(it, 1, 13);
+
+    // We expect to see item 14.
+    //  Note that for an inconsistent DB view, it is logically undefined whether this value is seen.
+    //  But as implemented, we should see it, and the test is helpful to detect if/when the
+    //  functionality changes.
+    expectReadKey(it, 14, newValue);
+
+    expectReadKey(it, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// no-repl, yes-consist: creation of FUTURE item is ignored by consistent iteration.
+TEST_F(BgIterationTest, missingFutureItem_start) {
+    // Using DB1 so we have lots of buckets
+    simpleDelItem(14); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+
+    c = getWriteClient(14, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKeySequence(it, 1, 13);
+    // Key 14 is not reported - it didn't exist at the start of the consistent iteration
+    expectReadKey(it, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// yes-repl, no-consist: creation of FUTURE item is handled by the replication, and then the key is
+//  later skipped (treated like an early iteration case).
+TEST_F(BgIterationTest, missingFutureItem_eventual) {
+    // Using DB1 so we have lots of buckets
+    simpleDelItem(14); // Delete the item before iterator creation
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0); // Items 1 & 2 are in queue (same bucket)
+
+    c = getWriteClient(14, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKeySequence(it, 1, 2); // were already in queue, so they precede the replication
+
+    expectReadReplication(it, c); // Here's the replication creating item 14
+
+    expectReadKeySequence(it, 3, 13);
+    // We expect item 14 to be skipped, because it was created by the earlier replication
+    expectReadKey(it, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/////////////////////////////////////////////////////
+// TESTS RELATED TO EXPIRATION
+// Expiration can be tricky.  When pre-evaluating a command with bgIteration_blockClientIfRequired,
+//  a key might exist, but be ready for expiration.  Then, as the command executes, the key expires
+//  and gets deleted before the write operation.  Consider SET K V.
+//  In the unexpired case, this appears to bgIteration as a single SET command (which replaces the value).
+//  In the expired case, bgIteration will receive a DEL followed by a SET.
+//
+// Another case is a READ command.  A read command won't cause the client to be blocked.  However,
+//  if the key is expired, this will cause a DEL.  For consistent processing, this key might need to
+//  be expedited so that it can be processed before it gets deleted.  In this case, the key is
+//  unlinked from the main Valkey dictionary, but the actual deletion is deferred.
+/////////////////////////////////////////////////////
+
+TEST_F(BgIterationTest, expireKeys) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+    expectReadKey(it, 1);
+
+    // At this point, key 1 is active, key 2 is in queue.
+
+    simulateExpiration(0);        // Past - we no longer care
+    simulateExpirationOfInuse(2); // Current - it's inuse
+    simulateExpiration(5);        // Future - we don't care (non-consistent)
+
+    expectReadKeySequence(it, 2, 4);
+    // Key 5 is not reported - it was deleted by the expiration above
+    expectReadKeySequence(it, 6, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+TEST_F(BgIterationTest, expireKeys_eventual) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+    expectReadKey(it, 1);
+
+    // At this point, key 1 is active, key 2 is in queue.
+
+    simulateExpiration(0);        // Past - we expect replication
+    simulateExpirationOfInuse(2); // Current - it's inuse, but we expect replication
+    simulateExpiration(5);        // Future - we don't care (non-consistent)
+
+    expectReadKey(it, 2); // this was already queued
+
+    expectReadReplicationDel(it, 0); // Past item should replicate
+    expectReadReplicationDel(it, 2); // Current item should replicate
+    // Item 5 is a future item and doesn't need to replicate
+
+    expectReadKeySequence(it, 3, 4);
+    // Item 5 is not reported - it was deleted by the expiration above
+    expectReadKeySequence(it, 6, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+TEST_F(BgIterationTest, expireKeys_start) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+    expectReadKey(it, 1);
+
+    // At this point, key 1 is active, key 2 is in queue.
+
+    simulateExpiration(0);             // Past - we no longer care
+    simulateExpirationOfInuse(2);      // Current - we must defer
+    simulateExpirationWithExpedite(5); // Future - will become inuse and expedited for consistency
+
+    expectReadKey(it, 5); // Expedited to front
+
+    expectReadKeySequence(it, 2, 4);
+    // Item 5 is not repeated - it was already read (expedited) above
+    expectReadKeySequence(it, 6, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Special case during a non-consistent iteration with replication and expiration.
+//  1. A future key is created (and processed by its replication) - considered early iterated
+//  2. Later the key is expired and deleted during command processing (causes DEL to be sent) - no longer early iterated
+//  3. The key is recreated as part of the command processing (and this command was replicated) - again early iterated
+//  4. Finally, when we iterate to the key, it shouldn't be sent, because it was replicated in step 3.
+TEST_F(BgIterationTest, expireKeys_eventual_FutureKeyCreatedThenExpiredDuringSet) {
+    simpleDelItem(8); // Start with a missing future item
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0); // Get the iterator started
+
+    c = getWriteClient(8, "xxx");
+    simulateUnblockedWriteWithModification(c); // Not blocked because this is a future key (but we expect repl)
+
+    // Now do it again, but break out the steps so that we can simulate an expiration
+    bool blocked = bgIteration_blockClientIfRequired(c);
+    EXPECT_FALSE(blocked); // Shouldn't be blocked because this is a future key
+
+    // Now, as the SET command tries to execute, simulate that the key is expired.  Expiration
+    //  processing sends the replication FIRST!
+    robj *argv[2];
+    argv[0] = createStringObjectFromCString("DEL");
+    argv[1] = c->argv[1];
+    serverCommand *cmd = lookupCommandByCString("DEL");
+    bgIteration_handleCommandReplication(getDbFromItemNum(8), cmd, 2, argv);
+    decrRefCount(argv[0]); // argv[1] is borrowed from c->argv, so only argv[0] is released here
+
+    // Now the call to keyDelete happens (after the replication).
+    bgIteration_keyDelete(getDbFromItemNum(8), static_cast<sds>(objectGetVal(c->argv[1])));
+    simpleDelItem(8); // Simulate the actual del
+
+    // Now the SET will run, re-creating the item (which is still a future item)
+    // We need to duplicate the value because setKey() can reallocate it.
+    robj *value = dupStringObject(c->argv[2]);
+    setKey(c, c->db, c->argv[1], &(value), SETKEY_ADD_OR_UPDATE);
+
+    // Finally, replication will be sent because this is creating a new key
+    bgIteration_handleCommandReplication(getDbFromItemNum(8), c->cmd, c->argc, c->argv);
+
+    // Test that everything comes as expected
+    expectReadKeySequence(it, 1, 2); // All one bucket - queued after key 0 read
+
+    expectReadReplication(it, c);    // Repl from the first SET command
+    expectReadReplicationDel(it, 8); // This is the expected replication of the DEL from expire
+    expectReadReplication(it, c);    // Repl from the second SET command (recreating deleted key)
+
+    expectReadKeySequence(it, 3, 7); // continue with normal iteration
+    // KEY 8 SHOULD BE OMITTED - This was already replicated
+    expectReadKeySequence(it, 9, LAST_ITEM);
+
+    expectReadComplete(it);
+}
+
+
+/////////////////////////////////////////////////////
+// THE REMAINING TESTS ARE GENERAL / UNCATEGORIZED
+/////////////////////////////////////////////////////
+
+// Iteration can be terminated from the main thread or from the child client.
+//  This tests termination driven from the main thread.
+TEST_F(BgIterationTest, earlyTerminationFromMain) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+
+    // At this point, keys 1 & 2 are in queue.  A termination should release those keys.
+    bool blocked1 = true;
+    bool blocked2 = true;
+    // We expect no general unblocks, we account for each specific unblock below.
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0);
+    // We should expect to see unblock called for items 1 & 2, as they are released from the queue.
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1))))
+        .WillOnce(Assign(&blocked1, false));
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2))))
+        .WillOnce(Assign(&blocked2, false));
+    bgIteratorTerminate(it); // queues the items for release
+    EXPECT_TRUE(bgIteratorIsTerminating(it));
+    bgIteration_feedIterators(); // actually performs the release
+    EXPECT_FALSE(blocked1);
+    EXPECT_FALSE(blocked2);
+
+    bool blocked0 = true;
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0))))
+        .WillOnce(Assign(&blocked0, false));
+    bgIteratorItem *item = bgIteratorRead(it); // releases the in-progress item 0
+    EXPECT_FALSE(blocked0);
+    EXPECT_EQ(item->type, BGITERATOR_ITEM_TERMINATED);
+
+    bgIteratorClose(it); // background thread completes the termination
+
+    EXPECT_EQ(cleanupCount, 0);
+    bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+
+// Iteration can be terminated from the main thread or from the child client.
+//  This tests termination driven from the child client (the background thread).
+TEST_F(BgIterationTest, earlyTerminationFromChild) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+
+    // At this point, keys 1 & 2 are in queue.  A termination should release those keys.
+    bgIteratorClose(it); // background thread initiates the termination
+    EXPECT_TRUE(bgIteratorIsTerminating(it));
+
+    bool blocked0 = true;
+    bool blocked1 = true;
+    bool blocked2 = true;
+    // Expecting no extra unblocks
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0);
+    // We expect item 0 (the in progress item) to be released
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0))))
+        .WillOnce(Assign(&blocked0, false));
+    // We expect items 1 & 2 (the queued items) to be released
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1))))
+        .WillOnce(Assign(&blocked1, false));
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2))))
+        .WillOnce(Assign(&blocked2, false));
+    bgIteration_feedIterators(); // performs the releases and runs the cleanup function
+    EXPECT_FALSE(blocked0);
+    EXPECT_FALSE(blocked1);
+    EXPECT_FALSE(blocked2);
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+
+// Edge case.  Executing a command (like SUNIONSTORE) which REPLACES the first key and reads the
+//  second key.  In this case, bgIteration will get notified of the key deletion during execution of
+//  SUNIONSTORE.  Given that both keys are in the future (not iterated yet), we'll allow the
+//  command to execute, unblocked.  We won't replicate as we'll pick up the key when we get to it.
+TEST_F(BgIterationTest, writeWith2Keys_eventual_keyDeletedDuringSetReplace) {
+    // Using DB1 so we have lots of buckets
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKeySequence(it, 0, 8); // 9 is in queue
+
+    // Write command that has 2 keys. 1 existing key that we write to and 1 dependant future key.
+    c = getWrite2KeysClient("sunionstore", 12, 13);
+
+    simulateUnblockedWrite(c);
+
+    // Now the call to keyDelete happens
+    sds sdskey = sdsnew(keyStr(12));
+    bgIteration_keyDelete(getDbFromItemNum(12), sdskey);
+    sdsfree(sdskey);
+    simpleDelItem(12); // Simulate the actual del
+
+    // Now the write will run, re-creating the item (which is still a future item)
+    const char *const newValueStr = "new value";
+    robj *newValueRobj = createStringObjectFromCString(newValueStr);
+    setKey(c, c->db, c->argv[1], &newValueRobj, SETKEY_ADD_OR_UPDATE);
+
+    // Finally, we are letting bgIteration know that the write command was executed
+    bgIteration_handleCommandReplication(getDbFromItemNum(12), c->cmd, c->argc, c->argv);
+
+    // Since the write command was not replicated, we expect all the keys to be read in the normal
+    //  order from the dictionary.
+    expectReadKeySequence(it, 9, 11);
+    expectReadKey(it, 12, newValueStr);
+    expectReadKeySequence(it, 13, LAST_ITEM);
+
+    expectReadComplete(it);
+}
+
+
+// Edge case.  When we have a new key which is created by a command, AND replication is enabled, we
+//  expect that we will replicate the command rather than serializing the key/value later.  As an
+//  example, consider SUNIONSTORE A B.  We want to create A by replicating the command.  We don't
+//  want to have to process A as a key later on.  But in this case, we can't run the command until
+//  B has been sent.  We expect the command to be blocked while we send B.
+TEST_F(BgIterationTest, writeWith2Keys_eventual_setNewKey_DependantFuture) {
+    // Using DB1 so we have lots of buckets
+    simpleDelItem(12); // Deleting key 12 to then create it with a write command
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKeySequence(it, 0, 8); // 9 is in queue
+
+    // Write command that has 2 keys. 1 new key and 1 dependant future key.
+    c = getWrite2KeysClient("sunionstore", 12, 13);
+
+    // We are simulating a new key in the dict. This command should block on the dependant key.
+    // This adds key 13 in the queue since the command depends on it.
+    simulateBlockedWrite(c);
+
+    // Key 9 was already in the queue
+    expectReadKey(it, 9);
+
+    // Key 13 is processed out of order since the write depends on it
+    expectReadKey(it, 13);
+
+    // Reading key 10 releases key 13, which unblocks the write command.
+    expectReadKey(it, 10);
+
+    // Now that key 13 was processed and released by the iterator, the write command can be executed.
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 11 was queued when we read key 10
+    expectReadKey(it, 11);
+
+    // The replication of the write command was enqueued after key 11
+    expectReadReplication(it, c);
+
+    // We shouldn't see key 12 - as that was processed via replication.
+    // We shouldn't see key 13 - as that was expedited earlier
+
+    // Now resuming processing of dict entries
+    expectReadKeySequence(it, 14, LAST_ITEM);
+
+    expectReadComplete(it);
+}
+
+
+// A new key is being created, but is dependent on another key which has already been processed.
+//  In this case, the command shouldn't be blocked.
+TEST_F(BgIterationTest, writeWith2Keys_eventual_setNewKey_DependantPast) {
+    // Using DB1 so we have lots of buckets
+    simpleDelItem(12); // Deleting key 12 to then create it with a write command
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    expectReadKeySequence(it, 0, 9); // 10 is in queue, done with 8
+
+    // Write command that has 2 keys. 1 new key and 1 dependant past key.
+    c = getWrite2KeysClient("sunionstore", 12, 8);
+
+    // We are simulating a new key in the dict.
+    // This command should not block since the dependant key has already been processed.
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 10 was put in the queue before the write
+    expectReadKey(it, 10);
+
+    expectReadReplication(it, c); // replication of the write that created key 12
+
+    expectReadKey(it, 11);
+
+    // Key 12 should be missing - it was processed by replication
+
+    expectReadKeySequence(it, 13, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// A new key is being created, and has dependencies on 2 other keys - one already processed, one not.
+//  In this case, the command should be blocked so that the future key can be sent first.
+TEST_F(BgIterationTest, writeWith3Keys_eventual_setNewKey_1DependantPast1DependantFuture) {
+    // Using DB1 so we have lots of buckets
+    simpleDelItem(12); // Deleting key 12 to then create it with a write command
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    expectReadKeySequence(it, 0, 9); // 8 has been returned, 9 is active, 10 is in queue
+
+    // Write command that has 1 new key and 2 dependencies (past/future)
+    c = getWrite3KeysClient("sunionstore", 12, 8, 13);
+
+    // The write should be blocked, so that item 13 can be processed.
+    simulateBlockedWrite(c);
+
+    expectReadKey(it, 10); // 10 was already in queue
+    expectReadKey(it, 13); // 13 was expedited since the write depends on it
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(13)))).Times(1);
+    expectReadKey(it, 11); // Releases 13 so the command can execute
+
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKey(it, 14); // was queued when reading 11 (12 is missing, 13 was expedited)
+
+    expectReadReplication(it, c); // replication of the write that created key 12
+
+    expectReadKey(it, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Test an edge case with the same (future) key being repeated in the command, like:
+//  SUNIONSTORE A B B
+// In this test, A is a previously handled key, and B is a future key.  We expect the future key B to
+//  be expedited (once).
+TEST_F(BgIterationTest, writeWith3Keys_eventual_repeatedKey_1DependantPast1RepeatedFuture) {
+    // Using DB1 so we have lots of buckets
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    expectReadKeySequence(it, 0, 9); // We're done with 8, and 10 is in queue
+
+    // Write command that has 3 keys. 1 past key and 1 repeated key in the future.
+    c = getWrite3KeysClient("sunionstore", 8, 12, 12);
+
+    // This command should block because 12 needs to be expedited.
+    simulateBlockedWrite(c);
+
+    expectReadKey(it, 10); // was already in queue
+    expectReadKey(it, 12); // expedited (read only once, despite appearing twice in the command)
+    expectReadKey(it, 11); // releases 12 (unblocking the command)
+
+    // Now that key 12 was processed and released by the iterator, the write command can be executed.
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKey(it, 13); // queued when we read 11
+
+    expectReadReplication(it, c);
+
+    // Now resuming processing of dict entries.
+    expectReadKeySequence(it, 14, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/* Tests the replication of a write command that creates a new key and depends on a
+ * future key which is duplicated in the command. */
+TEST_F(BgIterationTest, writeWith3Keys_eventual_repeatedKey_1newKey1RepeatedFuture) {
+    simpleDelItem(3); // Deleting key 3 to then create it with a write command
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+    // At this point, keys 1 & 2 are in queue.
+
+    // Write command that has 3 keys. 1 new key and 1 repeated key in the future.
+    c = getWrite3KeysClient("sunionstore", 3, 5, 5);
+
+    // This command should block on key 5.
+    // This adds key 5 in the queue because:
+    // - the command depends on key 5 which hasn't been processed yet
+    // - the command creates a new key (key 3).
+    simulateBlockedWrite(c);
+
+    expectReadKeySequence(it, 1, 2); // These were already in queue
+
+    // Key 5 is processed out of order since the write depends on it
+    expectReadKey(it, 5);
+
+    // Key 4 is next in queue, and reading it releases the expedited key 5
+    expectReadKey(it, 4);
+
+    // Now that key 5 was processed and released by the iterator, the write command can be executed.
+    simulateUnblockedWriteWithModification(c);
+
+    // Keys 6 & 7 are next, having been queued after reading key 4.
+    expectReadKeySequence(it, 6, 7);
+
+    // The replication of the write command was enqueued after 5 was released (unblocking the command)
+    expectReadReplication(it, c);
+
+    // Now resuming processing of dict entries.
+    expectReadKeySequence(it, 8, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/* A command modifying an in-progress key, but dependent on a future (repeated) key. */
+TEST_F(BgIterationTest, writeWith3Keys_start_repeatedKey_1DependantPast1RepeatedFuture) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+    // At this point, keys 1 & 2 are in queue.
+
+    // Write command that has 3 keys. 0 is in progress.  4 is still future.
+    // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a
+    //  multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY).
+    c = getWriteMultiKeysClient("blpop", 0, {4, 4, 0});
+
+    // This command should block on 2 keys (0 and 4), since:
+    //  - key 0 is in use by the iterator (still in the queue since it has not been processed by the consumer yet)
+    //  - key 4 is in the future
+    // This adds key 4 in the queue since the command depends on it and it hasn't been processed yet.
+    simulateBlockedWrite(c, 2);
+
+    // Key 4 is processed out of order since the write depends on it.
+    // Key 4 is processed before key 1 even though key 1 was already in the queue
+    //  because key 4 was enqueued as a priority item with a no-replication iterator.
+    // Reading key 4 will release key 0 - releasing that lock on the command
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))).Times(1);
+    expectReadKey(it, 4); // This unblocks key 0
+
+    EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(4)))).Times(1);
+    expectReadKey(it, 1); // Key 1 was already in queue; reading it releases key 4
+
+    // Now that keys 4 and 0 were processed and released by the iterator, the write command can be executed.
+    simulateUnblockedWriteWithModification(c);
+
+    expectReadKeySequence(it, 2, 3);
+
+    // 4 is skipped because it was already expedited
+
+    expectReadKeySequence(it, 5, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/* Test that creates a new key, repeating the future key in the command. */
+TEST_F(BgIterationTest, writeWith3Keys_repeatedKey_1repeatedNewKey) {
+    simpleDelItem(6); // Deleting key 6 to then create it with a write command
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    // Getting started
+    expectReadKeySequence(it, 0, 3);
+    // Now, 0,1,2 are in the past.  3 is being processed, and 4 is in queue.
+
+    // Write command that has 3 keys. 1 new repeated key and 1 key in the past.
+    // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a
+    //  multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY).
+    c = getWriteMultiKeysClient("blpop", 6, {0, 6, 0});
+
+    // The write command is not blocked since key 0 & 6 are not in use, and no consistency requirements
+    simulateUnblockedWriteWithModification(c);
+
+    // Keys 4 & 5 are read next, ahead of the newly created key 6.
+    expectReadKeySequence(it, 4, 5);
+
+    // There are no consistency requirements - so the new key should just be iterated.
+    // Key 6 is now in the dict with the value of key 0.
+    expectReadKey(it, 6, keyStr(0));
+
+    // Processing the rest of the dict entries.
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/* In this test, the COPY command is copying from one DB to another.  We will create the
+ *  same key in both DBs.  We make sure that the proper key is created via replication, and
+ *  the proper key is created by iteration. */
+TEST_F(BgIterationTest, copyHandlesProperDb_eventual) {
+    // NOTE:  Adding H0 to dict 1.  Now there is an H0 in both dict 0 and dict 1.
+    addKeyToDb(1, "H0", "H0");
+
+    // The test:
+    //  We will simulate (with DB0 selected): COPY B1 H0 DB 1 REPLACE
+    //  This will overwrite DB1:H0 that was created above.
+    //  Since DB0:B1 is already in queue, we need to expedite the target (DB1:H0) as well
+    //  After DB1:H0 is "overwritten", it should be marked early iterate.
+    //  We expect DB0:H0 to NOT be marked early iterate, and should get processed normally.
+
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0); // B0
+    // At this point, keys 1(B1) & 2(B2) are in queue.
+
+    // COPY B1 H0 DB 1 REPLACE
+    c = static_cast<client *>(zcalloc(sizeof(client)));
+    c->cmd = lookupCommandByCString("copy");
+    c->db = server.db[0];
+    c->argc = 6;
+    c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+    c->argv[0] = createStringObjectFromCString(c->cmd->fullname);
+    c->argv[1] = createStringObjectFromCString("B1");
+    c->argv[2] = createStringObjectFromCString("H0");
+    c->argv[3] = createStringObjectFromCString("DB");
+    c->argv[4] = createStringObjectFromCString("1");
+    c->argv[5] = createStringObjectFromCString("REPLACE");
+
+    // This should block on 2 keys.  DB0:B1 is in queue.  DB1:H0 needs to be expedited.
+    simulateBlockedWrite(c, 2);
+
+    // These 2 keys were already in queue
+    expectReadKey(it, 1); // DB0:B1
+    expectReadKey(it, 2); // DB0:B2
+
+    // And now we expect to see the expedited DB1:H0
+    expectReadDbKeyValue(it, 1, "H0", "H0");
+
+    expectReadKey(it, 3); // releases DB1:H0
+
+    // Now key 4 is still in the queue
+
+    simulateUnblockedWrite(c); // We shouldn't be blocked this time
+
+    // Now, we'll simulate the actual activity of the COPY.  DB1:H0 will be deleted in order to
+    //  be overwritten.
+    sds sdskey = sdsnew("H0");
+    bgIteration_keyDelete(1, sdskey); // bgIteration would be signaled about the deletion
+    sdsfree(sdskey);
+    // At this point the key would actually be deleted and recreated by COPY (no need to actually do this)
+
+    // And finally the replication (this should queue replication)
+    bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv);
+
+    // Now let's read everything...
+    expectReadKey(it, 4);         // (this was previously in queue)
+    expectReadReplication(it, c); // This is the new replication (creating DB1:H0)
+
+    // The rest should be normal.  We shouldn't see DB1:H0 as it was recreated by replication
+    expectReadKeySequence(it, 5, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Check that termination with replication in queue works OK.
+TEST_F(BgIterationTest, terminateWithReplication) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+    expectReadKey(it, 1); // makes sure we are done with key 0 (don't want to block)
+
+    c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c); // Should replicate
+
+    bgIteratorTerminate(it); // terminate while that replication has not been read yet
+
+    bgIteratorItem *item = bgIteratorRead(it);
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); // next item must be TERMINATED, not the replication
+
+    bgIteratorClose(it); // background thread completes the termination
+
+    bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+
+// SWAPDB tests - Get ready for the mind-bend...
+
+/* In the non-consistent iterator (without replication), items are identified with the DBID at
+ *  the time they are placed into the queue.  The SWAPDB event signals the change to the
+ *  iterating process - and this is properly sequenced with the DB info for each item. */
+TEST_F(BgIterationTest, swapDB) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    bgIteratorStatus status;
+
+    expectReadKey(it, 0);
+    // Keys 1 & 2 are in queue
+
+    simulateSwapDB(0, 1); // The swap event will be queued after item 2
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 1u);
+    EXPECT_EQ(status.swapdb_processed, 0u);
+
+    expectReadKey(it, 1); // These were already in queue,
+    expectReadKey(it, 2); //  ... and the iteration client hasn't seen the swap yet
+
+    expectReadSwapDB(it, 0, 1);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 1u);
+    EXPECT_EQ(status.swapdb_processed, 0u); // still processing it...
+
+    // Since we've seen the swap event, items now have the new DBID
+
+    expectReadDbKeyValue(it, 1, keyStr(3), keyStr(3)); // item 3 should show in DB1
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 1u);
+    EXPECT_EQ(status.swapdb_processed, 1u); // done processing the swapdb
+
+    // Key 4 is in the queue - let's swap back!
+    simulateSwapDB(1, 0); // The swap event will be queued after item 4
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 2u); // 2nd one queued
+    EXPECT_EQ(status.swapdb_processed, 1u);
+
+    expectReadDbKeyValue(it, 1, keyStr(4), keyStr(4)); // item 4 should still show in DB1
+
+    expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 2u);
+    EXPECT_EQ(status.swapdb_processed, 1u); // still processing it...
+
+    // Since we've seen the second swap, items should now show with their original DB
+
+    expectReadKey(it, 5);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.swapdb_queued, 2u);
+    EXPECT_EQ(status.swapdb_processed, 2u); // done processing all swaps
+
+    expectReadKeySequence(it, 6, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/* In the consistent iterator (without replication) all items are presented to the iterating
+ * process using the DBID at the time of the iterator creation.  No changes are evident.
+ * Swap events are not presented to the iteration client. */
+TEST_F(BgIterationTest, swapDB_start) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+    // Keys 1 & 2 are in queue
+
+    simulateSwapDB(0, 1); // The swap occurs, but the iterator sees no change
+
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKey(it, 3);
+
+    // Heck, let's go crazy with those swaps...
+    for (int itemNum = 4; itemNum <= LAST_ITEM; itemNum++) {
+        simulateSwapDB(0, 1);       // one more swap before every remaining read
+        expectReadKey(it, itemNum); // keys must still arrive in the original order
+    }
+
+    expectReadComplete(it);
+}
+
+
+/* In the non-consistent iterator WITH replication, items are identified with the DBID at the
+ *  time they are placed into the queue.  The SWAPDB event signals the change to the iterating
+ *  process - and this is properly sequenced with the DB info for each item. */
+TEST_F(BgIterationTest, swapDB_eventual) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+    // Keys 1 & 2 are in queue
+
+    simulateSwapDB(0, 1); // The swap event will be queued after item 2
+
+    expectReadKey(it, 1); // These were already in queue,
+    expectReadKey(it, 2); //  ... and the iteration client hasn't seen the swap yet
+
+    expectReadSwapDB(it, 0, 1);                // We should see a SWAPDB event
+    bgIteratorItem *item = bgIteratorRead(it); // followed by the associated replication
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION);
+    bgIteration_feedIterators(); // NOTE(review): presumably needed because the raw read above bypasses the expectRead* helpers - confirm
+
+    // Since we've seen the swap event, items now have the new DBID
+    expectReadDbKeyValue(it, 1, keyStr(3), keyStr(3)); // item 3 is now in DB1
+
+    // Key 4 is in the queue - let's swap back!
+    simulateSwapDB(1, 0); // The swap event will be queued after item 4
+
+    expectReadDbKeyValue(it, 1, keyStr(4), keyStr(4)); // Still appears as DB1
+
+    expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap
+    item = bgIteratorRead(it);  // again followed by the associated replication
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION);
+    bgIteration_feedIterators();
+
+    expectReadKeySequence(it, 5, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+// There is no test for swapDB_YesReplication_YesConsistent because this configuration is not
+//  permitted with multiple DBs (not permitted with swaps).
+
+
+// FLUSHDB & FLUSHALL Tests
+
+TEST_F(BgIterationTest, flushDB_flushAll) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+    expectReadKey(it, 1);
+
+    // key 1 is active in the iterator - this key won't be deallocated because of the refcount.
+    // key 2 is in queue - but will be returned to Valkey before the flush.  It is yanked
+    //  back by Valkey and will not be seen by iterator.
+    simulateFlushDB(-1, 1); // NOTE(review): dbid -1 presumably means "all DBs" (per the test name) - confirm
+
+    bgIteratorItem *item = bgIteratorRead(it);
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); // even a non-consistent iterator is terminated here
+
+    bgIteratorClose(it); // background thread completes the termination
+
+    bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+TEST_F(BgIterationTest, flushDB_flushOne) {
+    bgIterator *it1 = bgIteratorCreateFullScanIter("iter1", BGITERATOR_CONSISTENCY_NONE, NULL,
+                                                   iteratorCleanupFn, PRIVDATA);
+    bgIterator *it2 = bgIteratorCreateFullScanIter("iter2", BGITERATOR_CONSISTENCY_START, NULL,
+                                                   iteratorCleanupFn, PRIVDATA);
+    bgIteratorStatus status;
+
+    // The test flushes DB0.  This is half the data.  Since <= half, a non-consistent iterator is
+    //  allowed to proceed.  But the consistent iterator will be terminated.
+
+    expectReadKey(it1, 0);
+    expectReadKey(it2, 0);
+    expectReadKey(it1, 1);
+    expectReadKey(it2, 1);
+
+    // key 1 is active in the iterator - this key won't be deallocated because of the refcount.
+    // key 2 is in queue - but will be returned to Valkey before the flush.  These are yanked
+    //  back by Valkey and will not be seen by iterator.
+    simulateFlushDB(0, 1); // flush DB0 only
+    bgIteratorGetStatus(it1, &status);
+    EXPECT_EQ(status.flushdb_queued, 1u);
+    EXPECT_EQ(status.flushdb_processed, 0u);
+
+    // Testing the non-consistent one continues...
+    // Everything already on the iterator queue should be preserved (deleted from the DB).
+    //  Key 2 is already queued (and preserved).
+    expectReadKey(it1, 2);
+
+    // Read the flushdb item on iterator 1.
+    bgIteratorItem *item = bgIteratorRead(it1);
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_FLUSHDB);
+    ASSERT_EQ(item->dbid, 0);
+    bgIteratorGetStatus(it1, &status);
+    EXPECT_EQ(status.flushdb_queued, 1u);
+    EXPECT_EQ(status.flushdb_processed, 0u); // still processing it
+
+    // And iterator 1 keeps processing with the 2nd DB
+    expectReadKey(it1, ITEMS_PER_DB);
+    bgIteratorGetStatus(it1, &status);
+    EXPECT_EQ(status.flushdb_queued, 1u);
+    EXPECT_EQ(status.flushdb_processed, 1u); // done with all flushdb's
+
+    expectReadKeySequence(it1, ITEMS_PER_DB + 1, LAST_ITEM);
+    expectReadComplete(it1);
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_FALSE(cleanupTerminated);
+
+    // But the consistent iterator should be terminated
+    item = bgIteratorRead(it2);
+    ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED);
+    bgIteratorClose(it2);        // background thread completes the termination
+    bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function
+    EXPECT_EQ(cleanupCount, 2);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+
+/* A multi with one future and one past key must expedite and replicate. */
+TEST_F(BgIterationTest, multiTwoKeysFirstFuture) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    expectReadKey(it, 0); // Causes keys 1 & 2 to be queued (same bucket)
+    expectReadKey(it, 1); // Causes key 0 to be released
+
+    // Now, B0(0) is in the past.  H0(5) is in the future.  R0(11) [in DB1] is also future.
+
+    /* For a non-consistent iteration, with replication...
+     * Normally, H0 (future) wouldn't need to expedite - we'd just modify it in place (without
+     * replication) and iterate on it later.  But, in this case, since it's wrapped in a multi, with
+     * B0 (past) - we need to expedite H0 so that the multi can all be handled in the same way.
+     * Key R0(11) [DB1] just makes things a little trickier. */
+    c = getMultiClient("SET B0 xxx; SET H0 xxx; SELECT 1; SET R0 xxx");
+
+    // The EXEC should block on 2 keys, because H0(5) & R0(11) should be expedited
+    simulateBlockedWrite(c, 2);
+
+    expectReadKey(it, 2); // (was already in queue)
+
+    // Note - it would be logically OK if these 2 were reversed, but this is how the current algorithm works.
+    expectReadKey(it, 5);  // Key 5 (H0) was expedited
+    expectReadKey(it, 11); // Key 11 (R0) was expedited
+
+    // We don't need to actually simulate the multi.  Just checking that the keys were expedited.
+
+    // and clean up the rest...
+    expectReadKeySequence(it, 3, 4);
+    // Key 5 was already read above (expedited)
+    expectReadKeySequence(it, 6, 10);
+    // Key 11 was already read above (expedited)
+    expectReadKeySequence(it, 12, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+// Multi blocking on future items.  Consistent.
+TEST_F(BgIterationTest, multiBlocksOnFutureKey) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+    // Keys 1 & 2 are in queue
+
+    // Since there's no replication, an expedited key will be moved to the front of the queue.
+    // Let's fake a modification to key 6 (H1)
+    // Dummy up a MULTI...
+    c = getMultiClient("SET H1 xxx");
+
+    // Since this is consistent, we will block the client, disallowing the write.
+    simulateBlockedWrite(c);
+
+    // H1 (key 6) will be expedited to the front of the queue (because no replication)
+    expectReadKey(it, 6);
+
+    // Now that we've read key 6, key 0 (B0) is in the past and should not block
+    freeTestClient(c);
+    c = getMultiClient("SET B0 xxx");
+    simulateUnblockedWrite(c);
+
+    // and clean up the rest...
+    expectReadKeySequence(it, 1, 5);
+    expectReadKeySequence(it, 7, LAST_ITEM); // key 6 was already read above (expedited)
+    expectReadComplete(it);
+}
+
+
+// Scenario.  We have a multi that doesn't need to be replicated because all of the keys exist
+//  but are all future keys.  Note that missing keys are considered already-iterated, so all
+//  must exist for this test.  Then:
+//   - we delete a key
+//   - we re-create the deleted (future) key - normally this would be replicated
+//   - we access another (future) key - we don't expect to get blocked!
+TEST_F(BgIterationTest, multiNotReplicatedButDelRecreateAccess) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+    // Keys 1 & 2 are in queue
+
+    c = getMultiClient("DEL H1; SET H1 xxx; SET H2 yyy");
+    // Now let's process the multi.  Since H1 & H2 are both future (existing) items, we shouldn't
+    //  block or replicate.
+    simulateUnblockedWrite(c); // the EXEC
+
+    // Simulate the DEL H1
+    server.in_exec = 1; // Simulate actual execution of the MULTI/EXEC
+
+    advanceMultiClientToCommand(c, 0); // DEL H1
+    EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0); // the DEL must not try to block the client
+    bool blocked = bgIteration_blockClientIfRequired(c);
+    EXPECT_FALSE(blocked);
+    simpleDelItem(6); // H1
+    sds delKey = sdsnew(keyStr(6));
+    bgIteration_keyDelete(0, delKey); // signal the deletion of DB0:H1 to bgIteration
+    sdsfree(delKey);
+    bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); // shouldn't replicate
+
+    // Simulate SET H1 - the key doesn't exist, and would normally replicate and mark early iterate,
+    //  but this is in a transaction, and we are not replicating this transaction.
+    advanceMultiClientToCommand(c, 1); // SET H1 xxx
+    simulateUnblockedWriteWithModification(c);
+
+    // Now write to another existing future key - this should work if we weren't confused by the DEL
+    advanceMultiClientToCommand(c, 2); // SET H2 yyy
+    simulateUnblockedWriteWithModification(c);
+    server.in_exec = 0;
+
+    // Now we can continue iterating, and we should pick up keys 1...  (and no replication!)
+    expectReadKeySequence(it, 1, 5);
+    expectReadKey(it, 6, "xxx"); // H1 carries the value written inside the multi
+    expectReadKey(it, 7, "yyy"); // H2 likewise
+    expectReadKeySequence(it, 8, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// For this test, B0 is added into DB1 - so it exists in both DB 0 and 1.  We will process it
+//  in DB0, but it will be unprocessed in DB1.  See if we track SELECT properly.
+TEST_F(BgIterationTest, multiHandlesSelectProperly) {
+    addKeyToDb(1, "B0", "B0");
+
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    // Read the 1st key - B0 in DB0.
+    expectReadKey(it, 0);
+    // Now, we are done with B0 in DB0, but not in DB1
+    expectReadKey(it, 1); // Reads B1, and releases B0 in DB0
+
+    // These cases should NOT block...  (they access B0 in DB0)
+    c = getMultiClient("SET B0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 0; SET B0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET B0 xxx; SELECT 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SELECT 0; SET B0 xxx; SELECT 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+
+    // These cases SHOULD block...  (they access B0 in DB1)
+    c = getMultiClient("SET B0 xxx");
+    c->db = server.db[1]; // already starting on DB1
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SET B0 xxx");
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SET B0 xxx; SELECT 0");
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 0; SELECT 1; SET B0 xxx; SELECT 1");
+    simulateBlockedWrite(c);
+
+    expectAnythingCleanup(it);
+}
+
+// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1.  We will process it
+//  in DB0, but it will be unprocessed in DB1.  See if we track SELECT properly - WHEN WE HAVE NO
+//  PERMISSION TO EXECUTE SELECT!
+TEST_F(BgIterationTest, multiHandlesSelectNoPermissionProperly) {
+    addKeyToDb(1, "B0", "B0");
+
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    // Read the 1st key - B0 in DB0.
+    expectReadKey(it, 0);
+    // Now, we are done with B0 in DB0, but not in DB1
+    expectReadKey(it, 1); // Reads B1, and releases B0 in DB0
+
+    // No permission for any commands (specifically select/swapdb)
+    EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_, _, _, _, _, _))
+        .Times(AtLeast(1))
+        .WillRepeatedly(Return(ACL_DENIED_CMD));
+
+    // These cases should NOT block...  (they access B0 in DB0)
+    //  The SELECTs below are inconsequential - with/without select, same result.
+    c = getMultiClient("SET B0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 0; SET B0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET B0 xxx; SELECT 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SELECT 0; SET B0 xxx; SELECT 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+
+    // These cases SHOULD block IF SELECT IS WORKING...  (they access B0 in DB1)
+    c = getMultiClient("SET B0 xxx");
+    c->db = server.db[1];    // already starting on DB1
+    simulateBlockedWrite(c); // will block, no select
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SET B0 xxx");
+    simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails)
+    freeTestClient(c);
+    c = getMultiClient("SELECT 1; SET B0 xxx; SELECT 0");
+    simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails)
+    freeTestClient(c);
+    c = getMultiClient("SELECT 0; SELECT 1; SET B0 xxx; SELECT 1");
+    simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails)
+
+    expectAnythingCleanup(it);
+}
+
+// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1.  We will process it
+//  in DB0, but it will be unprocessed in DB1.  See if we track SWAPDB properly.
+TEST_F(BgIterationTest, multiHandlesSwapdbProperly) {
+    addKeyToDb(1, "B0", "B0");
+
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    // Read the 1st key - B0 in DB0.
+    expectReadKey(it, 0);
+    // Now, we are done with B0 in DB0, but not in DB1
+    expectReadKey(it, 1); // Reads B1, and releases B0 in DB0
+
+    // These cases should NOT block...  (they access B0 in DB0)
+    c = getMultiClient("SET B0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET B0 xxx; SWAPDB 0 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET B0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET B0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 0 1; SELECT 1; SET B0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+
+    // These cases SHOULD block...  (they access B0 in DB1)
+    c = getMultiClient("SET B0 xxx");
+    c->db = server.db[1]; // already starting on DB1
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SET B0 xxx; SWAPDB 0 1");
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SELECT 0; SET B0 xxx; SWAPDB 0 1");
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET B0 xxx; SELECT 1");
+    simulateBlockedWrite(c);
+
+    expectAnythingCleanup(it);
+}
+
+// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1.  We will process it
+//  in DB0, but it will be unprocessed in DB1.  See if we track SWAPDB properly - WHEN WE HAVE NO
+//  PERMISSION TO EXECUTE SWAPDB!
+TEST_F(BgIterationTest, multiHandlesSwapdbNoPermissionProperly) {
+    addKeyToDb(1, "B0", "B0");
+
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    // Read the 1st key - B0 in DB0.
+    expectReadKey(it, 0);
+    // Now, we are done with B0 in DB0, but not in DB1
+    expectReadKey(it, 1); // Reads B1, and releases B0 in DB0
+
+    // No permission for any commands (specifically select/swapdb)
+    EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_, _, _, _, _, _))
+        .Times(AtLeast(1))
+        .WillRepeatedly(Return(ACL_DENIED_CMD));
+
+    // These cases should NOT block...  (they access B0 in DB0)
+    //  The SELECTs & SWAPDBs below are inconsequential - with/without select/swapdb, same result.
+    c = getMultiClient("SET B0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET B0 xxx; SWAPDB 0 1");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SET B0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET B0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 0 1; SELECT 1; SET B0 xxx");
+    simulateUnblockedWrite(c);
+    freeTestClient(c);
+
+    // These cases SHOULD block IF SELECT/SWAPDB IS WORKING...  (they access B0 in DB1)
+    c = getMultiClient("SET B0 xxx");
+    c->db = server.db[1]; // already starting on DB1, so this one blocks without any swapdb/select
+    simulateBlockedWrite(c);
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SET B0 xxx; SWAPDB 0 1");
+    simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb fails)
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SELECT 0; SET B0 xxx; SWAPDB 0 1");
+    simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails)
+    freeTestClient(c);
+    c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET B0 xxx; SELECT 1");
+    simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails)
+
+    expectAnythingCleanup(it);
+}
+
+
+static void *pthreadWait200msAndReadTwoKeys(void *arg) {
+    bgIterator *it = static_cast<bgIterator *>(arg);
+    // Thread entry point: sleep 200 ms, then read two items from 'it' and discard them.
+    usleep(200000);
+    bgIteratorRead(it);
+    bgIteratorRead(it);
+    return nullptr;
+}
+
+static void asyncWait200msAndReadTwoKeys(bgIterator *it) {
+    int rc;
+    pthread_attr_t attr;
+    pthread_t thread;
+    // Runs pthreadWait200msAndReadTwoKeys(it) on a new detached thread, without waiting for it.
+    rc = pthread_attr_init(&attr);
+    assert(rc == 0);
+    rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); // detached: the thread is never joined
+    assert(rc == 0);
+
+    rc = pthread_create(&thread, &attr, pthreadWait200msAndReadTwoKeys, it);
+    assert(rc == 0);
+
+    rc = pthread_attr_destroy(&attr); // attributes are no longer needed once the thread exists
+    assert(rc == 0);
+}
+
+TEST_F(BgIterationTest, testLuaWithUndeclaredKey) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // If we fake a modification to key 3, we won't know if it's handled out of order.
+    // So we fake a modification to key 4
+    c = getWriteClient(4, "xxx");
+    c->flag.script = 1; // mark the client as running inside a script
+
+    // Now for a Lua script, we have already blocked (on the eval/evalsha) for any declared keys
+    //  But here, we're about to modify an undeclared key.  We can't actually block in the middle
+    //  of the Lua script.  So this will behave as unblocked, but incur a synchronous wait.
+
+    // Key 4 will get expedited when we simulate the write.  After reading key 4, key 1 will need
+    //  to be read to return key 4 to Valkey, unblocking the synchronous wait.
+    asyncWait200msAndReadTwoKeys(it);
+
+    monotime blockTimer;
+    elapsedStart(&blockTimer);
+    simulateUnblockedWrite(c); // Not blocked, but delays internally
+    // Must have delayed at least 150ms (some time may have passed before timer start)
+    EXPECT_GT(elapsedMs(blockTimer), 150u);
+
+    // Continue...
+    expectReadKeySequence(it, 2, 3); // key 1 was consumed by the helper thread
+    // 4 has already been processed
+    expectReadKeySequence(it, 5, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Make sure that replication received while processing the last key is sent
+TEST_F(BgIterationTest, replicationReceivedWhileProcessingLastKey) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKeySequence(it, 0, LAST_ITEM); // the last key has been read and is still being processed
+
+    c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0
+
+    expectReadReplication(it, c); // Replication happened while processing the last item, should be here.
+
+    simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing
+
+    expectReadComplete(it); // We expect to see the completion instead
+}
+
+TEST_F(BgIterationTest, repldoneFunctionCalled) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL,
+                                                  iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA);
+    expectReadKeySequence(it, 0, LAST_ITEM); // the last key has been read but not yet released
+    c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0
+
+    // Since in testing, we are only feeding one item at a time, and synchronously, we won't call
+    //  the repldone function until after we release the last item.
+    EXPECT_EQ(replDoneConfirmed, 0);
+    expectReadReplication(it, c);    // Replication happened while processing the last item, should be here.
+    EXPECT_EQ(replDoneConfirmed, 1); // Last key released, now done feeding replication
+
+    simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing
+
+    expectReadComplete(it); // We expect to see the completion instead
+}
+
+TEST_F(BgIterationTest, repldoneFunctionCalledTwice) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL,
+                                                  iteratorRepldoneFnNotBeingReadyInitially, iteratorCleanupFn, PRIVDATA);
+    expectReadKeySequence(it, 0, LAST_ITEM); // the last key has been read but not yet released
+    c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0
+
+    // Won't signal replDone until we've released the final item (which happens when reading the replication)
+    EXPECT_EQ(replDoneRejected, 0);
+    EXPECT_EQ(replDoneConfirmed, 0);
+    expectReadReplication(it, c);   // Releases the final item
+    EXPECT_EQ(replDoneRejected, 1); // replDone called once (and rejected by client)
+    EXPECT_EQ(replDoneConfirmed, 0);
+    simulateUnblockedWriteWithModification(c); // This will replicate (because replDone returned false)
+
+    expectReadReplication(it, c); // ReplDone gets called again (and accepted this time)
+    EXPECT_EQ(replDoneConfirmed, 1);
+
+    simulateUnblockedWriteWithModification(c); // This won't replicate because replication is done
+
+    expectReadComplete(it); // We expect to see the completion instead
+}
+
+// Check that the memory reported for replication is correct
+TEST_F(BgIterationTest, checkReplicationByteCount) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL,
+                                                  iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA);
+    c = getWriteClient(0, "xxx");
+    size_t expectedReplicationSize = sizeof(bgIteratorItem); // expected cost of one replication: the item itself...
+    for (int i = 0; i < c->argc; i++) {
+        expectedReplicationSize += objectComputeSize(NULL, c->argv[i], 0, 0); // ...plus each argv object
+    }
+
+    expectReadKey(it, 0);
+    expectReadKey(it, 1); // Releases and unblocks 0
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u);
+
+    simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize);
+    simulateUnblockedWriteWithModification(c); // and write again (2nd replication)
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize);
+
+    expectReadKey(it, 2); // Keys 0..2 all in same bucket
+
+    expectReadReplication(it, c);
+    // After reading the 1st replication, it hasn't been returned yet (it's the active item)
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize);
+    expectReadReplication(it, c);
+    // After reading the 2nd replication, the 1st has been returned
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize);
+
+    expectReadKey(it, 3);
+    // Now all replication has been returned/freed
+    EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u);
+
+    expectReadKeySequence(it, 4, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+// Test that for an arbitrary write command having no keys, replication should occur.
+TEST_F(BgIterationTest, checkNoKeysWriteIsReplicated) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL,
+                                                  iteratorCleanupFn, PRIVDATA);
+    expectReadKey(it, 0);
+
+    c = getNoKeysWriteClient();
+    EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0); // no keys, so the client must never be blocked
+    bool blocked = bgIteration_blockClientIfRequired(c);
+    EXPECT_FALSE(blocked);
+    bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv);
+
+    expectReadKeySequence(it, 1, 2); // These were already in queue
+
+    expectReadReplication(it, c); // the key-less write arrives as a replication item
+
+    expectReadKeySequence(it, 3, LAST_ITEM);
+    expectReadComplete(it);
+}
diff --git a/src/unit/wrappers.h b/src/unit/wrappers.h
index 0f4fb388b98..5bfc117fab2 100644
--- a/src/unit/wrappers.h
+++ b/src/unit/wrappers.h
@@ -61,6 +61,15 @@ extern "C" {
 long long __wrap_aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, aeTimeProc *proc, void *clientData, aeEventFinalizerProc *finalizerProc);
 int __wrap_processPendingCommandAndInputBuffer(client *c);
 void __wrap_beforeNextClient(client *c);
+
+void __wrap_blockClientInUseOnKeys(client *c, int nKeys, robj **keys);
+void __wrap_unblockClientsInUseOnKey(robj *key);
+
+int __wrap_ACLCheckAllUserCommandPerm(user *u, struct serverCommand *cmd, robj **argv, int argc, int dbid, int *idxptr);
+
+size_t __wrap_hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata);
+bool __wrap_hashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor);
+
 #undef protected
 #undef _Bool
 #undef typename