From 6b8bd418ad0d27f53db47c28216f3b023b6ca3dd Mon Sep 17 00:00:00 2001 From: Harry Lin <49881386+harrylin98@users.noreply.github.com> Date: Wed, 27 May 2026 08:58:53 -0700 Subject: [PATCH 01/40] Set pending_command flag consistently across all command execution paths (#3600) The `pending_command` flag indicates that a client has a fully parsed command ready for execution. This update ensures that the flag is set/cleared consistently across different execution paths. --------- Signed-off-by: harrylin98 --- src/blocked.c | 17 ++++++----------- src/db.c | 2 ++ src/module.c | 7 +++++++ src/networking.c | 3 ++- src/replication.c | 10 ++++++++-- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index 9562f2e3a4b..326c37a1136 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -495,10 +495,10 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo } } c->bstate->unblock_on_nokey = unblock_on_nokey; - /* Currently we assume key blocking will require reprocessing the command. - * However in case of modules, they have a different way to handle the reprocessing - * which does not require setting the pending command flag */ - if (btype != BLOCKED_MODULE) c->flag.pending_command = 1; + /* Key-blocked clients require pending_command for reprocessing on unblock. + * The caller must have set it (processInputBuffer for real clients, + * RM_Call for module fake clients). 
*/ + serverAssert(c->flag.pending_command == 1); blockClient(c, btype); } @@ -699,8 +699,7 @@ void blockPostponeClient(client *c) { listAddNodeTail(server.postponed_clients, c); serverAssert(c->bstate->postponed_list_node == NULL); c->bstate->postponed_list_node = listLast(server.postponed_clients); - /* Mark this client to execute its command */ - c->flag.pending_command = 1; + serverAssert(c->flag.pending_command == 1); } /* Block client due to shutdown command */ @@ -731,7 +730,6 @@ static void unblockClientOnKey(client *c, robj *key) { /* In case this client was blocked on keys during command * we need to re process the command again */ if (c->flag.pending_command) { - c->flag.pending_command = 0; c->flag.reexecuting_command = 1; /* We want the command processing and the unblock handler (see RM_Call 'K' option) * to run atomically, this is why we must enter the execution unit here before @@ -898,10 +896,7 @@ static bool isClientBlockedInUse(client *c) { * The client remains blocked until ALL of its keys are unblocked via * unblockClientsInUseOnKey(). * - * The caller MUST set c->flag.pending_command = 1 before calling this function. - * This ensures the pending command is executed when the client is later - * unblocked via processPendingCommandAndInputBuffer(). - * The caller should then return without executing the command. */ + * The caller should return without executing the command after calling this. */ void blockClientInUseOnKeys(client *c, int num_keys, robj *keys[]) { serverAssert(!c->flag.blocked && !c->flag.unblocked); serverAssert(c->flag.pending_command == 1); diff --git a/src/db.c b/src/db.c index ba9d25c2fa6..ed906f22c4e 100644 --- a/src/db.c +++ b/src/db.c @@ -1469,6 +1469,8 @@ void shutdownCommand(client *c) { return; } + /* Clear pending_command to avoid re-execution. 
*/ + c->flag.pending_command = 0; blockClientShutdown(c); if (prepareForShutdown(c, flags) == C_OK) exit(0); /* If we're here, then shutdown is ongoing (the client is still blocked) or diff --git a/src/module.c b/src/module.c index c2511dbb54e..af6b9324f62 100644 --- a/src/module.c +++ b/src/module.c @@ -6809,6 +6809,9 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const if (!(flags & VALKEYMODULE_ARGV_NO_AOF)) call_flags |= CMD_CALL_PROPAGATE_AOF; if (!(flags & VALKEYMODULE_ARGV_NO_REPLICAS)) call_flags |= CMD_CALL_PROPAGATE_REPL; } + /* Mirror processInputBuffer: set pending_command so that if the command + * blocks on keys, unblockClientOnKey will reprocess it on unblock. */ + c->flag.pending_command = 1; call(c, call_flags); /* Propagate database changes from the temporary client back to the context client @@ -8171,6 +8174,10 @@ ValkeyModuleBlockedClient *moduleBlockClient(ValkeyModuleCtx *ctx, c->bstate->timeout = timeout; blockClient(c, BLOCKED_MODULE); } + /* Module handles its own reply on unblock, so clear pending_command + * to prevent re-execution. Auth clients are the exception — they + * need re-execution after auth completes. */ + if (!auth_reply_callback) c->flag.pending_command = 0; /* Defer response until after being unblocked for a context originated from * keyspace notification events */ if (is_keyspace_notification) { diff --git a/src/networking.c b/src/networking.c index a0bed240588..2540bce1700 100644 --- a/src/networking.c +++ b/src/networking.c @@ -3765,6 +3765,7 @@ void commandProcessed(client *c) { * since we have not applied the command. 
*/ if (c->flag.blocked) return; + c->flag.pending_command = 0; reqresAppendResponse(c); clusterSlotStatsAddNetworkBytesInForUserClient(c); resetClient(c); @@ -3837,7 +3838,6 @@ int processPendingCommandAndInputBuffer(client *c) { * blocked client as well */ if (c->flag.close_asap) return C_ERR; if (c->flag.pending_command) { - c->flag.pending_command = 0; if (processCommandAndResetClient(c) == C_ERR) { return C_ERR; } @@ -4108,6 +4108,7 @@ int processInputBuffer(client *c) { } /* We are finally ready to execute the command. */ + c->flag.pending_command = 1; if (processCommandAndResetClient(c) == C_ERR) { /* If the client is no longer valid, we avoid exiting this * loop and trimming the client buffer later. So we return diff --git a/src/replication.c b/src/replication.c index 9c8c56d44d2..131b4fa2797 100644 --- a/src/replication.c +++ b/src/replication.c @@ -5017,7 +5017,10 @@ void waitCommand(client *c) { } /* Otherwise block the client and put it into our list of clients - * waiting for ack from replicas. */ + * waiting for ack from replicas. WAIT handles its own reply in + * processClientsWaitingReplicas, so clear pending_command to avoid + * being mistaken for a command that needs re-execution. */ + c->flag.pending_command = 0; blockClientForReplicaAck(c, timeout, offset, numreplicas, 0); /* Make sure that the server will send an ACK request to all the replicas @@ -5059,7 +5062,10 @@ void waitaofCommand(client *c) { } /* Otherwise block the client and put it into our list of clients - * waiting for ack from replicas. */ + * waiting for ack from replicas. WAITAOF handles its own reply in + * processClientsWaitingReplicas, so clear pending_command to avoid + * being mistaken for a command that needs re-execution. 
*/ + c->flag.pending_command = 0; blockClientForReplicaAck(c, timeout, offset, numreplicas, numlocal); /* Make sure that the server will send an ACK request to all the replicas From 738bf48a32ca5a7269531ab8ffb1f08e211df3b5 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 21 Apr 2026 22:43:31 +0000 Subject: [PATCH 02/40] Forkless Save Signed-off-by: Jim Brunner --- .config/typos.toml | 6 +- src/Makefile | 1 + src/bgiteration.c | 2728 ++++++++++++++++++++++++ src/bgiteration.h | 363 ++++ src/db.c | 17 +- src/hashtable.c | 80 +- src/hashtable.h | 2 + src/kvstore.c | 10 + src/module.c | 2 + src/object.c | 5 +- src/rdb.c | 2 + src/replication.c | 3 + src/server.c | 50 +- src/server.h | 14 +- src/unit/test_bgiteration.cpp | 3747 +++++++++++++++++++++++++++++++++ src/unit/wrappers.h | 6 + 16 files changed, 7024 insertions(+), 12 deletions(-) create mode 100644 src/bgiteration.c create mode 100644 src/bgiteration.h create mode 100644 src/unit/test_bgiteration.cpp diff --git a/.config/typos.toml b/.config/typos.toml index 10103279c57..ff90d3a679d 100644 --- a/.config/typos.toml +++ b/.config/typos.toml @@ -15,11 +15,12 @@ optin = "optin" smove = "smove" Parth = "Parth" # seems like the spellchecker does not like it is similar to "Path" nd = "nd" +threadsave = "threadsave" [default] extend-ignore-re = [ - "SELECTed", - "WATCHed", + "[A-Z]{2,}ed", # SELECTed, WATCHed, etc. + "[A-Z]{2,}s", # SELECTs, etc. 
] [type.c] @@ -64,6 +65,7 @@ pathc = "pathc" pn = "pn" seeked = "seeked" tre = "tre" +dbe = "dbe" [type.systemd.extend-words] # systemd = .conf diff --git a/src/Makefile b/src/Makefile index 2c78f95986e..98f49108e46 100644 --- a/src/Makefile +++ b/src/Makefile @@ -457,6 +457,7 @@ ENGINE_SERVER_OBJ = \ allocator_defrag.o \ anet.o \ aof.o \ + bgiteration.o \ bio.o \ bitops.o \ blocked.o \ diff --git a/src/bgiteration.c b/src/bgiteration.c new file mode 100644 index 00000000000..ed6ac40bddc --- /dev/null +++ b/src/bgiteration.c @@ -0,0 +1,2728 @@ +#include "fmacros.h" +#include "bgiteration.h" +#include "dict.h" +#include "fifo.h" +#include "kvstore.h" +#include "monotonic.h" +#include "mutexqueue.h" +#include "server.h" + +int getFlushCommandFlags(client *c, int *flags); // in db.c +uint64_t dictObjHash(const void *key); // in server.c +int dictObjKeyCompare(const void *key1, const void *key2); // in server.c +size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid); // in object.c +robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds key, long long expire); // in object.c + + +// Non-public hashtable/kvstore functions... +bool hashtableInternalFindBucketIdx(hashtable *ht, void *key, int *table_idx, size_t *bucket_idx); +void hashtableInternalIteratorGetBucketIdx(hashtableIterator *iterator, int *table_idx, size_t *bucket_idx); +bool hashtableInternalIteratorIsBucketIdxComplete(hashtableIterator *iterator); +hashtableIterator *kvstoreInternalIteratorGetCurrentHashtableIterator(kvstoreIterator *kvs_it); + + +static bool receiveItemsBackFromOneIterator(bgIterator *it); // in bgiteration.c - forward declaration + +// ################ TEMP COMPILE HACKS ########################### +// Issue found. server.db has changed from an array of db to an array of pointers to db (change all refs to server.db) +// Issue: iterators (kvstore/hashtable) are not safe across event loop invocations. Hashtable (kvstore?) 
needs to track and maintain safe iterators. + + +// Don't think there's any current need for this... +static bool ignoreKeyForSave(const_sds key) { + UNUSED(key); + return false; +} + +//------- END OF COMPILE HACKS ------------------- + + +// Returns true if the cmd is a script command that may replicate. +static bool isScriptCallWriteCmd(struct serverCommand *cmd) { + return ((cmd->proc == fcallCommand) || (cmd->proc == evalCommand) || (cmd->proc == evalShaCommand)); +} + +// The PFCOUNT command (which does NOT have the CMD_WRITE flag) modifies the underlying string and +// is replicated as a write. So it needs to be detected and handled specially. +static bool isWriteCmd(struct serverCommand *cmd) { + return ((cmd->flags & CMD_WRITE) || (cmd->proc == pfcountCommand) || (cmd->proc == execCommand) || (isScriptCallWriteCmd(cmd))); +} + +// Returns true if the command is a deletion based command (DEL or UNLINK) +static bool isDeleteCmd(struct serverCommand *cmd) { + return ((cmd->proc == delCommand) || (cmd->proc == unlinkCommand)); +} + + +static bool onValkeyMainThread(void) { + return (pthread_equal(server.main_thread_id, pthread_self()) != 0); +} + +/* Parse a parameters robj, extracting a valid DBID. + * Returns FALSE if DBID isn't valid. + */ +static bool getDbIdFromRobj(robj *obj, int *db_id) { + long long value; + if (getLongLongFromObject(obj, &value) != C_OK) return false; + if ((value < 0) || (value >= server.dbnum)) return false; + *db_id = (int)value; + return true; +} + +/* Parse the parameters of the COPY command, extracting the target DBID. + * Returns FALSE if the command would not run. 
+ */ +static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid, int *target_dbid) { + const int COPY_COMMAND_OPTIONAL_ARG_START_INDEX = 3; + + *target_dbid = selected_dbid; + + for (int i = COPY_COMMAND_OPTIONAL_ARG_START_INDEX; i < argc; i++) { + if (!strcasecmp((char *)objectGetVal(argv[i]), "replace")) { + continue; + } else if (!strcasecmp((char *)objectGetVal(argv[i]), "db") && (i + 1 < argc)) { + /* Note the parsing here needs to perfectly match what we have in Valkey OSS for COPY. + * The following command is considered OK by Valkey 8.1 so we can't return here, but + * must continue to parse till the last db which is the one that's effectively used. + * COPY key1 key2 db 1 db 2 db 3 // (This will use db 3) + */ + if (!getDbIdFromRobj(argv[i + 1], target_dbid)) { + return false; // parse failure + } + i++; // Consume additional argument + } else { + return false; // parse failure + } + } + return true; +} + +/* Get parameters for the SWAPDB command. + * The optional permission_client allows for checking of a client's permission for swapdb. + * Returns true if command would be executed. 
+ */ +bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, int *id1_p, int *id2_p) { + static struct serverCommand *swapdb_cmd = NULL; + + // We don't need to check permissions in the replication phase + if (permission_client != NULL) { + if (swapdb_cmd == NULL) { + swapdb_cmd = lookupCommandByCString("swapdb"); + serverAssert(swapdb_cmd != NULL); + } + + int idxptr; + if (ACLCheckAllUserCommandPerm(permission_client->user, swapdb_cmd, argv, argc, + permission_client->db->id, &idxptr) != ACL_OK) return false; + } + + long long dbid1, dbid2; + if (argc != 3) return false; + if (server.cluster_enabled) return false; + if (getLongLongFromObject(argv[1], &dbid1) != C_OK) return false; + if (getLongLongFromObject(argv[2], &dbid2) != C_OK) return false; + if (dbid1 < 0 || dbid1 >= server.dbnum) return false; + if (dbid2 < 0 || dbid2 >= server.dbnum) return false; + if (dbid1 == dbid2) return false; // Valid, but doesn't do anything + + *id1_p = (int)dbid1; + *id2_p = (int)dbid2; + return true; +} + +/* Get parameters for the SELECT command. + * The optional permission_client allows for checking of a client's permission for select. + * Returns true if command would be executed. + */ +bool getParamsForSelect(int argc, robj **argv, client *permission_client, int *dbid_p) { + static struct serverCommand *select_cmd = NULL; + + // We don't need to check permissions in the replication phase + if (permission_client != NULL) { + if (select_cmd == NULL) { + select_cmd = lookupCommandByCString("select"); + serverAssert(select_cmd != NULL); + } + + int idxptr; + if (ACLCheckAllUserCommandPerm(permission_client->user, select_cmd, argv, argc, + permission_client->db->id, &idxptr) != ACL_OK) return false; + } + + long long dbid; + if (argc != 2) return false; + if (getLongLongFromObject(argv[1], &dbid) != C_OK) return false; + if (dbid < 0 || dbid >= server.dbnum) return false; + + *dbid_p = (int)dbid; + return true; +} + + +/* DictType for SDS->ptr. 
The SDS is referenced, no destructor. */ +static dictType sdsrefToPtrDictType = { + .entryGetKey = dictEntryGetKey, + .hashFunction = dictSdsHash, + .keyCompare = dictSdsKeyCompare +}; + + +/* Wrap decrRefCount() so that it can be used as a callback requiring void. */ +static void decrRefCountVoid(void *o) { + decrRefCount(o); +} + + +/* Concatenate argc/argv into a command string for debugging. */ +static sds createSdsFromClientArgv(int argc, robj **argv) { + sds cmd = sdsempty(); + for (int i = 0; i < argc; i++) { + robj *arg = getDecodedObject(argv[i]); // some objects are int encoded + cmd = sdscatprintf(cmd, "'%s' ", (char *)objectGetVal(arg)); + decrRefCount(arg); + } + return cmd; +} + + +//########################################################################### + + +/* bgIteration internal (compile time) configuration values */ +enum { + BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE = 16384, // Prevent initial rehashing + BGITER_MAX_CLONE_ITEM_BYTES = 512, // Max size item to clone + BGITER_MAX_CLONE_POOL_BYTES = (1 * 1024 * 1024), // Total limit for all cloned items + BGITER_QUEUE_INCREASE_INCR = 100, // Step size when increasing queue target + BGITER_CYCLE_DELAY_MS = 2, // Delay between calls on bgIteration timer + BGITER_CYCLE_BUDGET_MS = 1, // Normal time limit for timer processing + BGITER_CYCLE_BUDGET_MAX_MS = 10 // Maximum time limit when starvation seen +}; + +// These can be tweaked by unit tests +static int bgiter_max_clone_item_bytes = BGITER_MAX_CLONE_ITEM_BYTES; +static int bgiter_max_clone_pool_bytes = BGITER_MAX_CLONE_POOL_BYTES; + +void bgIteration_unitTestDisableCloning(void) { + bgiter_max_clone_item_bytes = 0; + bgiter_max_clone_pool_bytes = 0; +} +void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes) { + bgiter_max_clone_item_bytes = item_bytes; + bgiter_max_clone_pool_bytes = pool_bytes; +} + +typedef enum { + BGITERATION_TYPE_NONE, + BGITERATION_TYPE_FULLSCAN, + BGITERATION_TYPE_CLUSTERSLOT +} bgIterationType; + +/* 
Extensions to bgIteratorItemType. These enumerations are used internally, and are not part of + * the published interface. These allow for extensibility in the internal information-passing + * between the Valkey main thread and the iteration client thread. */ +typedef enum { + /* Indicates that the iteration client has completed use of the bgIterator and that the + * bgIterator should be cleaned up and freed by the Valkey main thread. */ + BGITERATOR_ITEMEXT_ITER_CLOSED = 10 +} bgIteratorItemTypeExtended; + +/* Item for bgIteratorItemTypeExtended.BGITERATOR_ITEMEXT_ITER_CLOSED. Used to pass a bgIterator + * back to the Valkey main thread for cleanup/release. */ +typedef struct { + bgIteratorItemTypeExtended type; + bgIterator *iter; +} bgIteratorItemExtClose; + +/* Used for dictEntryPtrDictType. This dict grows and shrinks constantly during the iteration. + * There is no point to rehash it all the time. */ +static int neverShrink(size_t moreMem, double usedRatio) { + UNUSED(moreMem); + return (usedRatio > 0.5); // Return true only if expanding +} + +// A dictionary with a pointer (itself) as a key (the address pointed to is NOT referenced). +// Nothing is duplicated, this is a very fast dictionary, but potentially unsafe if the original +// items are deleted or moved. +// WARNING: Can't have active defrag running! It might reallocate memory blocks, swapping their +// pointer values! A check must be made in active defrag to ensure that no iteration is +// active. 
+ +// Thomas Wang's 64-bit mix +static uint64_t pointerHash(const void *key) { + uint64_t h = (uint64_t)(uintptr_t)key; + h = (~h) + (h << 21); // h = (h << 21) - h - 1; + h = h ^ (h >> 24); + h = (h + (h << 3)) + (h << 8); // h * 265 + h = h ^ (h >> 14); + h = (h + (h << 2)) + (h << 4); // h * 21 + h = h ^ (h >> 28); + h = h + (h << 31); + return h; +} + +static int pointerCompare(const void *key1, const void *key2) { + return key1 == key2; +} + +static dictType dictEntryPtrDictType = { + .entryGetKey = dictEntryGetKey, + .hashFunction = pointerHash, + .keyCompare = pointerCompare, + .resizeAllowed = neverShrink +}; + +// A TEMP set of robj's (of type sds). This is only for temporary sets as the robj's are not +// ref-counted at insertion/deletion. Used for robj->NULL. +static dictType tempKeysetDictType = { + .entryGetKey = dictEntryGetKey, + .hashFunction = dictObjHash, + .keyCompare = dictObjKeyCompare +}; + +typedef struct genericIterator genericIterator; +typedef void (*iteratorReleaseFunc) (genericIterator *genIt); +typedef fifo * (*iteratorGetEntriesFunc) (genericIterator *genIt, int *orig_dbid, int *cur_dbid); +typedef void (*iteratorSwapDbFunc) (genericIterator *genIt, int db1, int db2); +typedef void (*iteratorFlushDbFunc) (genericIterator *genIt, int cur_dbid); +typedef bool (*iteratorHasPassedItemFunc) (genericIterator *genIt, const_sds key, int cur_dbid); +typedef int (*iteratorOriginalDbFunc) (genericIterator *genIt, int cur_dbid); +typedef bool (*iteratorIsKeyInScopeFunc) (genericIterator *genIt, const_sds key); + +// Function pointers supporting polymorphic iterator implementation +struct genericIterator { + iteratorReleaseFunc release; + iteratorGetEntriesFunc getEntries; + iteratorSwapDbFunc swapDb; + iteratorFlushDbFunc flushDb; + iteratorHasPassedItemFunc hasPassedItem; + iteratorOriginalDbFunc originalDb; + iteratorIsKeyInScopeFunc isKeyInScope; +}; + +typedef struct itemListNode { + struct itemListNode *next; +} itemListNode; + +static 
itemListNode *freeItemStackHead = NULL; + +static void itemFreeList_returnItemBackToFreeList(bgIteratorItem* item) { + itemListNode *freedNode = (itemListNode*)item; + freedNode->next = freeItemStackHead; + freeItemStackHead = freedNode; +} + +static bgIteratorItem *itemFreeList_getElementOrAllocate(void) { + + bgIteratorItem *item; + // Pop a free node from the free list or allocate if none free + if (freeItemStackHead) { + item = (bgIteratorItem*)freeItemStackHead; + freeItemStackHead = freeItemStackHead->next; + if (freeItemStackHead) { + valkey_prefetch(freeItemStackHead); + } + } + else { + // Create new listNode and item + item = zmalloc(sizeof(bgIteratorItem)); + } + return item; +} + +static void itemFreeList_release(void) { + while(freeItemStackHead) { + itemListNode *node = freeItemStackHead; + freeItemStackHead = node->next; + zfree((bgIteratorItem*)node); + } +} + +// This struct is used across threads. Unless otherwise noted, the fields are initialized at +// iterator creation (within the main thread) and are read-only by the client thread. +struct bgIterator { + sds name; // Iterator name + bgIteratorReplDoneFunc repldone; // Optional repldone function to be run on the main thread + bgIteratorCleanupFunc cleanup; // Optional cleanup function to be run on main thread + void *privdata; // Client's private data to be passed to cleanup function + + int iteration_flags; // Consistent and/or Replication + int iteration_type; // Full scan or cluster slot + uint32_t consistent_modification_id; // iterator epoch at time of iterator creation + + genericIterator *keyset_iter; // Low-level iterator (polymorphic) + + dict *early_iterate_entries; // Used to keep track of what items have already been iterated + // over by out-of-order expedited process, ensuring a bgIterator + // does not try to reprocess items. + // Used only by main thread. 
+ // dictEntry -> NULL + + mutexQueue *items_for_iterator; // Created/Destroyed in main thread, used in both (threadsafe) + + mutexQueue *return_to_valkey; // Queue of items to be returned to the Valkey main thread (threadsafe) + + unsigned int item_count_target; // Used only by main thread + + bgIteratorItem *volatile current_item; // current_item is normally only used in the iteration client. + // It's marked volatile here only to support snooping from the + // main thread when handling a FLUSHDB command. This prevents + // the compiler from generating code which might read the + // pointer multiple times (when it's coded to read only once). + // Also - this syntax is for a volatile POINTER to a + // non-volatile item. "volatile" at the beginning of the + // declaration, would indicate a (non-volatile) pointer to a + // volatile item. + + bool client_is_active; // Set to true when client performs 1st read + bool completed; // Set to true in main thread when last item from iteration has + // been queued to the client. No additional items will be + // enqueued to the client after this has been set. 
+ + volatile bool terminated; // Set to true in main thread when iteration is to be killed + // Set to true in iteration client when it decides to end early + + bool cur_cmd_may_replicate; // Used only in main thread during command processing + + // Variables maintaining runtime statistics + unsigned long dbentries_queued; // Updated by main thread + unsigned long dbentries_processed; // Updated by client thread + unsigned long replication_queued; // Updated by main thread + unsigned long replication_processed; // Updated by client thread + unsigned long swapdb_queued; // Updated by main thread + unsigned long swapdb_processed; // Updated by client thread + unsigned long flushdb_queued; // Updated by main thread + unsigned long flushdb_processed; // Updated by client thread + unsigned long dbentry_clones_queued; // Updated by main thread + unsigned long dbentry_clones_processed; // Updated by client thread + monotime monotonic_start_time; // Time iteration started + + volatile monotime monotonic_item_start_time; // The item start time is set in the iteration client. It is + // marked volatile as it can be read from the main thread by + // bgIteratorGetStatus. If 0, this indicates that the + // iteration client is waiting for an item to process. +}; + + +// These static values are only accessed from the main Valkey thread. + +static list *allIterators; // list of bgIterator +static dict *nameToIterator; // bgIterator->name -> bgIterator + +// Global, across all iterators, dict contains a dbEntry pointer -> ref count +static dict *inUseEntries; // dbEntry -> ref count + +// Key values in the current command which don't exist in the DB yet. Needed for determination of +// replication for NON-consistent iterations. +static list *curCmdMissingKeys; // list of robj + +// A counter of the total amount of memory used for buffered replication data. +// This amount is excluded when computing the need for evictions. 
+static ssize_t bufferedReplicationBytes; + +// Memory pool to track current allocated memory of cloned items (in bytes) +static ssize_t bgiteration_current_clone_memory_pool_size; + +// Snapshot of the last queue size to seed the next queue +// We assume all bgIterators consume items at the same rate +static int last_item_count_target; + +// Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) +static long long bgIterator_timeproc_id; + +// Incremented on each new iteration, this is updated in dbEntry metadata whenever an entry is modified. +static uint32_t bgIteration_epoch = 1; + + +// BgIteration debug captures BgIteration activity to a large sds buffer. When an iterator is +// completed, the entire buffer is written to a file in the current working directory. Note that +// memory must be available for the ENTIRE debug in memory. This isn't captured incrementally to +// a file as the file I/O is more likely to affect timing. +// Future implementation: the current design is most useful for a single iterator. When items are +// queued to an iterator, the iterator name is not recorded (to save space). +// Developer note: using a CONST value here allows the compiler to completely remove all of the +// debugging code at compile time. There is no run-time performance overhead when set to FALSE. +// This is essentially like an IFDEF, however, it's better as it forces the compiler to validate +// syntax. +static const bool BGITERATION_DEBUG = false; // DO NOT SUBMIT WITH THIS SYMBOL SET TO TRUE! +static sds debugBuffer; + + + +//============================================================================================= +// Full Scan Iterator +//============================================================================================= +/* The full scan iterator performs the actual iteration over the Valkey keyset. The iterator is + * only used from within the Valkey main thread. 
Iteration proceeds one DB at a time, based on + * the DB ordering at the time of iterator creation. Each time the iterator returns items, all + * of the dictionary entries from a single hash bucket are returned. + */ + +struct fullScanIterator { + genericIterator callbacks; // (must be first item) + + // Array of mapping from original DB ID (at the time of iteration start) to that DB's + // current index. So, if the DB which was DB-0 is now at index 6, orig_to_cur_db[0]==6. + int *orig_to_cur_db; + + // The reverse of the above array. This maps a current DB index to its original index + // (at the time of iteration start). + int *cur_to_orig_db; + + // This is the DB we are currently iterating over. This is relative to the ORIGINAL + // DB ordering, at the time of iterator creation. Iteration proceeds from 0..N based on + // the original ordering. + int iter_db; + + // Iterator for the DB orig_to_cur_db[iter_db] + kvstore *kvs; // keep track of kvs associated with iter_dbi + kvstoreIterator *iter_dbi; +}; + +static void fullScanIteratorRelease(genericIterator *genIt) { + struct fullScanIterator *it = (struct fullScanIterator *)genIt; + if (it->iter_dbi) kvstoreIteratorRelease(it->iter_dbi); + zfree(it->orig_to_cur_db); + zfree(it->cur_to_orig_db); + zfree(it); +} + +static fifo * fullScanIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) { + struct fullScanIterator *it = (struct fullScanIterator *)genIt; + if (it->iter_db >= server.dbnum) return NULL; // Finished scanning + + fifo *dbEntryFifo = fifoCreate(); + while (fifoLength(dbEntryFifo) == 0) { + while (it->iter_dbi == NULL) { + if (++it->iter_db >= server.dbnum) { + fifoRelease(dbEntryFifo); + return NULL; // Iteration complete + } + serverDb *db = server.db[it->orig_to_cur_db[it->iter_db]]; + if (db != NULL) { + it->kvs = db->keys; + it->iter_dbi = kvstoreIteratorInit(it->kvs, HASHTABLE_ITER_SAFE); + } + } + + hashtableIterator *ht_it = NULL; + do { + dbEntry *de; + if 
(!kvstoreIteratorNext(it->iter_dbi, (void **)&de)) { + kvstoreIteratorRelease(it->iter_dbi); + it->kvs = NULL, it->iter_dbi = NULL; + break; + } + + ht_it = kvstoreInternalIteratorGetCurrentHashtableIterator(it->iter_dbi); + if (ignoreKeyForSave(objectGetKey(de))) continue; // slot migration: keys being purged + fifoPush(dbEntryFifo, de); + } while (!hashtableInternalIteratorIsBucketIdxComplete(ht_it)); + } + *orig_dbid = it->iter_db; + *cur_dbid = it->orig_to_cur_db[*orig_dbid]; + return dbEntryFifo; +} + +static void fullScanIteratorSwapDb(genericIterator *genIt, int db1, int db2) { + struct fullScanIterator *it = (struct fullScanIterator *)genIt; + int temp = it->cur_to_orig_db[db1]; + it->cur_to_orig_db[db1] = it->cur_to_orig_db[db2]; + it->cur_to_orig_db[db2] = temp; + + it->orig_to_cur_db[it->cur_to_orig_db[db1]] = db1; + it->orig_to_cur_db[it->cur_to_orig_db[db2]] = db2; +} + +static void fullScanIteratorFlushDb(genericIterator *genIt, int cur_dbid) { + struct fullScanIterator *it = (struct fullScanIterator *)genIt; + int orig_db = it->cur_to_orig_db[cur_dbid]; + if (orig_db == it->iter_db && it->iter_dbi != NULL) { + // We are currently iterating on the DB that's being flushed. iter_dbi is already NULL (nothing to release) if this DB was just finished or was already flushed since the last getEntries call. + kvstoreIteratorRelease(it->iter_dbi); + it->kvs = NULL, it->iter_dbi = NULL; + // Iteration will continue with the next DB. + } +} + +static bool fullScanIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) { + struct fullScanIterator *it = (struct fullScanIterator *) genIt; + int orig_dbid = it->cur_to_orig_db[cur_dbid]; + + if (orig_dbid < it->iter_db) return true; // Entire DB has already been processed + if (orig_dbid > it->iter_db) return false; // Haven't started this DB yet + // Now, orig_dbid == it->iter_db + + if (it->iter_dbi == NULL) return true; // just finished this DB + + // We're in the middle of processing a DB. In cluster-mode, the DB is divided into 1 hashtable + // per slot. In cluster-mode-disabled, we treat all keys as in slot 0. 
+ int keySlot = server.cluster_enabled ? getKeySlot((sds)key) : 0; + if (keySlot < kvstoreIteratorGetCurrentHashtableIndex(it->iter_dbi)) return true; + if (keySlot > kvstoreIteratorGetCurrentHashtableIndex(it->iter_dbi)) return false; + + // At this point, we're down to a specific hashtable. + + hashtable *iter_current_ht = kvstoreGetHashtable(it->kvs, keySlot); + int table; // 0 or 1 (supporting rehashing) + size_t index; // bucket number within the hashtable + // If key doesn't exist, we consider it passed - we MIGHT have iterated over it had it existed. + if (!hashtableInternalFindBucketIdx(iter_current_ht, (void *)key, &table, &index)) return true; + + hashtableIterator *htIter = kvstoreInternalIteratorGetCurrentHashtableIterator(it->iter_dbi); + int iter_table; + size_t iter_index; + hashtableInternalIteratorGetBucketIdx(htIter, &iter_table, &iter_index); + if (table < iter_table) return true; // iteration in table 1, but item is in table 0 + if (table > iter_table) return false; // iteration in table 0, but item is in table 1 + // if index <= iterator index, it has been passed. bgIterator + // processes buckets atomically. hashtableIterator points to the + // last returned position. It means bucket at iter_index has + // already been processed. 
+ if (index <= iter_index) return true; + if (ignoreKeyForSave(key)) return true; // if slot being purged, pretend we have passed it + return false; +} + +static int fullScanIteratorOriginalDb(genericIterator *genIt, int cur_dbid) { + struct fullScanIterator *it = (struct fullScanIterator *)genIt; + return it->cur_to_orig_db[cur_dbid]; +} + +static bool fullScanIteratorIsKeyInScope(genericIterator *genIt, const_sds key) { + UNUSED(genIt); + UNUSED(key); + return true; // All keys are in scope +} + +static genericIterator * fullScanIteratorCreate(void) { + struct fullScanIterator *it = zmalloc(sizeof(struct fullScanIterator)); + it->orig_to_cur_db = zmalloc(sizeof(int) * server.dbnum); + it->cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum); + for (int i = 0; i < server.dbnum; i++) { + it->orig_to_cur_db[i] = i; + it->cur_to_orig_db[i] = i; + } + it->iter_db = -1; + it->kvs = NULL; + it->iter_dbi = NULL; + + it->callbacks.release = fullScanIteratorRelease; + it->callbacks.getEntries = fullScanIteratorGetEntries; + it->callbacks.swapDb = fullScanIteratorSwapDb; + it->callbacks.flushDb = fullScanIteratorFlushDb; + it->callbacks.hasPassedItem = fullScanIteratorHasPassedItem; + it->callbacks.originalDb = fullScanIteratorOriginalDb; + it->callbacks.isKeyInScope = fullScanIteratorIsKeyInScope; + + return (genericIterator *)it; +} + + + +//============================================================================================= +// Cluster Slot Iterator +//============================================================================================= +/* The cluster slot iterator performs iteration over one cluster slot of the Valkey keyset. The + * iterator is only used from within the Valkey main thread. 
+ */ +struct clusterSlotIterator { + genericIterator callbacks; // (must be first item) +}; + +static void clusterSlotIteratorRelease(genericIterator *genIt) { + UNUSED(genIt); + serverAssert(false); // Not yet implemented +} + +static fifo * clusterSlotIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) { + UNUSED(genIt); + UNUSED(orig_dbid); + UNUSED(cur_dbid); + serverAssert(false); // Not yet implemented +} + +static void clusterSlotIteratorSwapDb(genericIterator *genIt, int db1, int db2) { + UNUSED(genIt); + UNUSED(db1); + UNUSED(db2); + serverAssert(false); // swap not valid in cluster mode +} + +static void clusterSlotIteratorFlushDb(genericIterator *genIt, int cur_dbid) { + UNUSED(genIt); + UNUSED(cur_dbid); + serverAssert(false); // Not yet implemented +} + +static bool clusterSlotIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) { + UNUSED(genIt); + UNUSED(key); + UNUSED(cur_dbid); + serverAssert(false); // Not yet implemented +} + +static int clusterSlotIteratorOriginalDb(genericIterator *genIt, int cur_dbid) { + UNUSED(genIt); + UNUSED(cur_dbid); + return cur_dbid; // swap not supported in cluster mode +} + +/* When checking if a command is in scope for this iterator, all of its keys should be either in + * scope or not. In cluster mode enabled a command cannot reference keys from different slots, so + * this assumption will always be true. 
*/ +static bool clusterSlotIteratorIsKeyInScope(genericIterator *genIt, const_sds key) { + UNUSED(genIt); + UNUSED(key); + serverAssert(false); // Not yet implemented +} + +static genericIterator * clusterSlotIteratorCreate(const int *slots, size_t slots_count) { + struct clusterSlotIterator *it = zmalloc(sizeof(struct clusterSlotIterator)); + it->callbacks.release = clusterSlotIteratorRelease; + it->callbacks.getEntries = clusterSlotIteratorGetEntries; + it->callbacks.swapDb = clusterSlotIteratorSwapDb; + it->callbacks.flushDb = clusterSlotIteratorFlushDb; + it->callbacks.hasPassedItem = clusterSlotIteratorHasPassedItem; + it->callbacks.originalDb = clusterSlotIteratorOriginalDb; + it->callbacks.isKeyInScope = clusterSlotIteratorIsKeyInScope; + + UNUSED(slots); + UNUSED(slots_count); + serverAssert(false); // Not yet implemented + + return (genericIterator *)it; +} + + + +//============================================================================================= +// General iteration support (across all iterators) +//============================================================================================= + +// While an item is potentially in use by a background thread, we can't have +// rehashing by the main thread. Returns true if rehashing was paused. 
+static bool pauseRehashing(dbEntry *de) { + switch (de->encoding) { + case OBJ_ENCODING_HASHTABLE: { // SET or HASH + hashtable *ht = objectGetVal(de); + hashtablePauseRehashing(ht); + return true; + } + case OBJ_ENCODING_SKIPLIST: { // SORTED SET + zset *zs = objectGetVal(de); + hashtablePauseRehashing(zs->ht); + return true; + } + default: + return false; + } +} + +static void resumeRehashing(dbEntry *de) { + switch (de->encoding) { + case OBJ_ENCODING_HASHTABLE: { // SET or HASH + hashtable *ht = objectGetVal(de); + hashtableResumeRehashing(ht); + break; + } + case OBJ_ENCODING_SKIPLIST: { // SORTED SET + zset *zs = objectGetVal(de); + hashtableResumeRehashing(zs->ht); + break; + } + default: + break; + } +} + +// Maintain a list of entries which are currently in-use. These items should not be modified. +static void incrementEntryInuse(dbEntry *de) { + dictEntry *existingEntry; + dictEntry *newEntry = dictAddRaw(inUseEntries, de, &existingEntry); + if (newEntry) { + incrRefCount(de); + dictSetSignedIntegerVal(newEntry, 1); + } else { + dictSetSignedIntegerVal(existingEntry, dictGetSignedIntegerVal(existingEntry) + 1); + } +} + + +static void decrementEntryInuse(dbEntry *de) { + dictEntry *entry = dictFind(inUseEntries, de); + if (dictGetSignedIntegerVal(entry) == 1) { + dictDelete(inUseEntries, de); + decrRefCount(de); + } else { + serverAssert(dictGetSignedIntegerVal(entry) > 1); + dictSetSignedIntegerVal(entry, dictGetSignedIntegerVal(entry) - 1); + } +} + +static bool isEntryInuseBySingleIterator(dbEntry *de) { + dictEntry *entry = dictFind(inUseEntries, de); + return dictGetSignedIntegerVal(entry) == 1; +} + +static bool isEntryInuseByAnyIterator(dbEntry *de) { + return (dictFind(inUseEntries, de) != NULL); +} + + +static ssize_t computeStringDbEntrySize(dbEntry *de) { + sds key = objectGetKey(de); + size_t valueSize = stringObjectLen(de); + + return sdslen(key) + valueSize; // ignore the rest of the overhead, it's minor & transient +} + + +static dbEntry 
*tryCloneDbEntry(dbEntry *de) { + if (bgiteration_current_clone_memory_pool_size + bgiter_max_clone_item_bytes + > bgiter_max_clone_pool_bytes) { + return NULL; + } + + // Future optimization: Incorporate small ziplists, sorted sets, etc. + // OBJ_ENCODING_INT is omitted only because there isn't a good API for cloning it yet. + if (de->type == OBJ_STRING && de->encoding != OBJ_ENCODING_INT) { + ssize_t itemSize = computeStringDbEntrySize(de); + + if (itemSize <= bgiter_max_clone_item_bytes) { + bgiteration_current_clone_memory_pool_size += itemSize; + dbEntry *clone = createStringObjectWithKeyAndExpire((char *)objectGetVal(de), sdslen(objectGetVal(de)), objectGetKey(de), objectGetExpire(de)); + ((bgIterationEntryMetadata *)objectGetMetadata(clone))->iterator_epoch + = ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch; + return clone; + } + } + + return NULL; +} + + +static void freeClonedDictEntry(dbEntry *clonedEntry) { + serverAssert(clonedEntry->type == OBJ_STRING); + + // Add back to memory pool + bgiteration_current_clone_memory_pool_size -= computeStringDbEntrySize(clonedEntry); + + decrRefCount(clonedEntry); +} + +static bgIteratorItem * makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) { + if (!isCloned) incrementEntryInuse(de); + + bgIteratorItem *item = itemFreeList_getElementOrAllocate(); + item->type = BGITERATOR_ITEM_DBENTRY; + item->dbid = dbid; + item->u.dbe.de = de; + item->u.dbe.is_cloned = isCloned; + item->u.dbe.is_rehashing_paused = pauseRehashing(de); + + return item; +} + +static robj ** cloneRobjArray(int argc, robj **argv) { + robj **newarray = zmalloc(sizeof(robj*) * argc); + for (int i = 0; i < argc; i++) { + newarray[i] = argv[i]; + incrRefCount(argv[i]); + } + return newarray; +} + + +static void freeRobjArray(int argc, robj **argv) { + for (int i = 0; i < argc; i++) { + decrRefCount(argv[i]); + } + zfree(argv); +} + + +// Called by iterator thread to release an item. 
+static void returnCurrentItemToValkey(bgIterator *it) { + bgIteratorItem *item = it->current_item; + if (item == NULL) return; + + switch (item->type) { + case BGITERATOR_ITEM_DBENTRY: + it->dbentries_processed++; + if (item->u.dbe.is_cloned) it->dbentry_clones_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_REPLICATION: + it->replication_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_SWAPDB: + it->swapdb_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_FLUSHDB: + it->flushdb_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + + case BGITERATOR_ITEM_COMPLETE: + case BGITERATOR_ITEM_TERMINATED: + // These are static and just used to wake the iterator - they should never be returned. + serverAssert(false); + break; + + default: + serverAssert(false); + } + + // Do this AFTER placing into return_to_valkey. This is volatile and snooped when there is a + // flushall event. Don't want an item to be missed. 
+ it->current_item = NULL; +} + + + +//============================================================================================= +// Background Iterator (private) +//============================================================================================= + +static void bgIteratorRelease(bgIterator *it) { + serverAssert(onValkeyMainThread()); + serverAssert(it->current_item == NULL); + serverAssert(mutexQueueLength(it->items_for_iterator) == 0); + serverAssert(mutexQueueLength(it->return_to_valkey) == 0); + + dictDelete(nameToIterator, it->name); + listDelNode(allIterators, listSearchKey(allIterators, it)); + + mutexQueueRelease(it->items_for_iterator); + it->items_for_iterator = NULL; + + mutexQueueRelease(it->return_to_valkey); + it->return_to_valkey = NULL; + + it->keyset_iter->release(it->keyset_iter); + it->keyset_iter = NULL; + + dictRelease(it->early_iterate_entries); + it->early_iterate_entries = NULL; + + sdsfree(it->name); + zfree(it); +} + + +static bool shouldFeedIteratorMore(bgIterator *it) { + return (!it->completed + && !it->terminated + && mutexQueueLength(it->items_for_iterator) < it->item_count_target); +} + + +// Debugging routine +static sds createEntryString(int dbid, dbEntry *de) { + sds key = objectGetKey(de); + + sds entrySds = sdsempty(); + entrySds = sdscatprintf(entrySds, "(%d)'%s'", dbid, key); + if (de->type == OBJ_STRING) { + robj *o = getDecodedObject(de); // might be encoded as int + const unsigned valuePrintLen = 20; + entrySds = sdscatprintf(entrySds, " : '%.*s'", valuePrintLen, (char *)objectGetVal(o)); + if (sdslen((sds)objectGetVal(o)) > valuePrintLen) entrySds = sdscat(entrySds, "..."); + decrRefCount(o); + } else { + entrySds = sdscatprintf(entrySds, " : type(%d)", de->type); + } + return entrySds; +} + + +static void feedIterator(bgIterator *it, monotime end_time_us) { + // Smart logic to dynamically adjust the size of the queue + unsigned int initial_queue_len = mutexQueueLength(it->items_for_iterator); + + if 
(initial_queue_len > 2 && it->item_count_target >= initial_queue_len) { + it->item_count_target -= initial_queue_len / 2; + } + + // Now do some feeding + bool have_time = (getMonotonicUs() < end_time_us); + int timeCheckCounter = 0; + while (shouldFeedIteratorMore(it) && have_time) { + int orig_dbid, cur_dbid; + fifo *dbEntryFifo = it->keyset_iter->getEntries(it->keyset_iter, &orig_dbid, &cur_dbid); + + if (dbEntryFifo == NULL) { + // Iteration of items is complete for this iterator + serverAssert(it->dbentries_queued >= it->dbentries_processed); + serverAssert(it->replication_queued >= it->replication_processed); + serverAssert(it->swapdb_queued >= it->swapdb_processed); + serverAssert(it->flushdb_queued >= it->flushdb_processed); + serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); + + // Snapshot queue size to seed next iterator when terminated + last_item_count_target = it->item_count_target; + + if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { + if (!it->client_is_active || (it->dbentries_queued > it->dbentries_processed)) { + // We are done feeding dict entries to the iterator, but before ending the + // replication processing make sure that the iterator has become active (has + // started reading) and make sure that all of the dict entries have been processed + // by the client. + break; + } + if (it->repldone) { + bool clientWantsMoreReplication = (!it->repldone(it->privdata)); + if (clientWantsMoreReplication) break; + } + } + bgIteratorItem *completionItem = itemFreeList_getElementOrAllocate(); + *completionItem = (bgIteratorItem){ .type = BGITERATOR_ITEM_COMPLETE }; + if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { + rdbSaveInfo rsi; + completionItem->dbid = (rdbPopulateSaveInfo(&rsi)) ? 
rsi.repl_stream_db : 0; + completionItem->u.master_repl_offset = server.primary_repl_offset; + if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, "REPLDONE FN\n"); + } + } + + if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, "SENDING COMPLETE\n"); + } + + mutexQueueAdd(it->items_for_iterator, completionItem); + it->completed = true; + break; + } + + int dbid = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) ? orig_dbid : cur_dbid; + + fifo *itemsToAdd = fifoCreate(); + while (fifoLength(dbEntryFifo) > 0) { + dbEntry *de; + fifoPop(dbEntryFifo, (void **)&de); + + // Remove new/modified items during consistent iteration. + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT + && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch > it->consistent_modification_id) { + continue; + } + + // Remove any items which have been processed early + if (dictFind(it->early_iterate_entries, de) != NULL) { + dictDelete(it->early_iterate_entries, de); + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, de); + debugBuffer = sdscatprintf(debugBuffer, "SKIPPING ITEM(early iterate): %s\n", entryString); + sdsfree(entryString); + } + continue; + } + + // For items which are left, convert them from dbEntry to iteratorItem + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, de); + debugBuffer = sdscatprintf(debugBuffer, "ITEM: %s\n", entryString); + sdsfree(entryString); + } + + bgIteratorItem *item = makeDbEntryItem(de, dbid, false); + + fifoPush(itemsToAdd, item); + } + fifoRelease(dbEntryFifo); + + if (fifoLength(itemsToAdd) > 0) { + it->dbentries_queued += fifoLength(itemsToAdd); + mutexQueueAddMultiple(it->items_for_iterator, itemsToAdd); + } + fifoRelease(itemsToAdd); + + // This is a predictably fast loop. We don't need to check the time on every pass. 
+ if (++timeCheckCounter % 32 == 0) { + have_time = (getMonotonicUs() < end_time_us); + } + } + + // Smart logic to dynamically adjust the size of the queue + if (initial_queue_len == 0 && have_time) { + it->item_count_target += BGITER_QUEUE_INCREASE_INCR; + } +} + + +static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_dbid) { + int rc = dictAdd(it->early_iterate_entries, earlyEntry, NULL); + serverAssert(rc == DICT_OK); + + int dbid = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) + ? it->keyset_iter->originalDb(it->keyset_iter, cur_dbid) + : cur_dbid; + + dbEntry *cloneEntry = tryCloneDbEntry(earlyEntry); + bool isClonedEntry = (cloneEntry != NULL); + bgIteratorItem *item = makeDbEntryItem(isClonedEntry ? cloneEntry : earlyEntry, dbid, isClonedEntry); + + it->dbentries_queued++; + if (isClonedEntry) it->dbentry_clones_queued++; + + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { // JHB - can we optimize here in cluster mode (no swap) + // On consistent iteration, SWAPDB events are not provided. So there is no requirement to + // keep items in order or synchronized with SWAPDB. + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, item->u.dbe.de); + debugBuffer = sdscatprintf(debugBuffer, "EARLY_1: %s\n", entryString); + sdsfree(entryString); + } + mutexQueuePushPriority(it->items_for_iterator, item); + } else { + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, item->u.dbe.de); + debugBuffer = sdscatprintf(debugBuffer, "EARLY: %s\n", entryString); + sdsfree(entryString); + } + mutexQueueAdd(it->items_for_iterator, item); + } + return !isClonedEntry; // Block if the entry will be used by the background thread +} + + +// This expedites a single key and doesn't attempt to avoid expediting through optimization. 
+static bool expediteSingleKeyWithoutOptimization( + bgIterator *it, + int dbid, + robj *oKey, + dict *waitingOnKeys) { + + bool mustBlock = false; + + bool iterComplete = it->completed || it->terminated; + + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de != NULL) { + if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) + && (dictFind(it->early_iterate_entries, de) == NULL)) { + if (addEarlyIterationKey(it, de, dbid)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } else { + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } + } + + return mustBlock; +} + + +// MOVE/COPY are unfortunate special commands. They work on 2 DBs at once. +const int MOVE_COMMAND_DBID_ARG_INDEX = 2; +static bool expediteKeysForMove( + bgIterator *it, + int dbid, + int argc, + robj **argv, + dict *waitingOnKeys) { + if (argc <= MOVE_COMMAND_DBID_ARG_INDEX) return false; + + int destDbid; + if (!getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &destDbid)) return false; + + bool mustBlock = false; + robj *key = argv[1]; + + // Not looking for special cases to optimize here. Just try to expedite both src and dest + // keys. Note that the dest key might exist (and need iteration) but could be expired and + // could be overwritten by MOVE. In this case, a DEL would replicate due to the expiry. So + // even if the target is expired, we need to replicate it before executing the command. + if (expediteSingleKeyWithoutOptimization(it, dbid, key, waitingOnKeys)) mustBlock = true; + if (expediteSingleKeyWithoutOptimization(it, destDbid, key, waitingOnKeys)) mustBlock = true; + + it->cur_cmd_may_replicate = true; + return mustBlock; +} + + +// MOVE/COPY are unfortunate special commands. They work on 2 DBs at once. 
+static bool expediteKeysForCopy( + bgIterator *it, + int dbid, + int argc, + robj **argv, + dict *waitingOnKeys) { + + int destDbid; + if (!getTargetDbIdForCopyCommand(argc, argv, dbid, &destDbid)) return false; + + bool mustBlock = false; + robj *srcKey = argv[1]; + robj *destKey = argv[2]; + + // Not trying to optimize COPY. Just expedite source and destination (if it exists). We + // don't really care if the value is overwritten or not (so no need to parse REPLACE option). + if (expediteSingleKeyWithoutOptimization(it, dbid, srcKey, waitingOnKeys)) mustBlock = true; + if (expediteSingleKeyWithoutOptimization(it, destDbid, destKey, waitingOnKeys)) mustBlock = true; + + it->cur_cmd_may_replicate = true; + return mustBlock; +} + + +/* There are several cases where a client must be blocked on write operations. (Clients never need + * to be blocked for read operations.) + * + * Note: An Amazon extension to the Valkey command structure allows us to identify commands where + * the first key is for write and the rest are for read. This allows us to make the + * following optimizations: + * - for keys which are read only, there's no need to block if the key is in-use by an iterator + * - without replication, there's no need to immediately queue read keys on a consistent iteration + * + * Iterator: CONSISTENT = NO, REPLICATION = NO + * - Block if any write-key is in use by an the iterator + * + * Iterator: CONSISTENT = NO, REPLICATION = YES + * - Block if any write-key is in use by an the iterator + * - If ANY key has already been iterated (but some keys have not), then + * - Block and immediately queue any key (read or write) that has not + * already been iterated + * Example: SDIFFSTORE KEY_A KEY_B KEY_C + * In this case, KEY_A is written, KEY_B and KEY_C are read. If KEY_A has already been + * iterated over, the replication stream will contain this command. The receiver of this + * replication will need KEY_B and KEY_C in order to process the replication stream. 
So + * these need to be iterated and the client blocked. + * + * Iterator: CONSISTENT = YES, REPLICATION = NO + * - Block if any write-key is in use by an the iterator + * - Block and immediately queue any WRITE-key that has not already been iterated + * + * Iterator: CONSISTENT = YES, REPLICATION = YES + * (Combination only valid in cluster mode - no SWAPDB possible) + * - Block if any write-key is in use by an the iterator + * - Block and immediately queue any key (read or write) that has not already been iterated + */ +static bool expediteKeysForWrite( + bgIterator *it, + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv, + keyReference *keyrefs, + int numKeys, + dict *waitingOnKeys) { + serverAssert(numKeys > 0); + + bool mustBlock = false; + + // All keys of the command should either be in scope or not since in cluster mode enabled they + // should all be in the same slot. So we just check the first key. + robj *oKey = argv[keyrefs[0].pos]; + sds key = objectGetVal(oKey); + // If it's not in the iteration scope for the current iterator, then we don't need to do + // anything with this command. + if (!it->keyset_iter->isKeyInScope(it->keyset_iter, key)) return false; + + // Note: performance optimization for commands which only modify the first key. If this flag + // is not available, we can safely remove this `if` statement. + if ((cmd->flags & CMD_WRITE_FIRSTKEY_ONLY) + && !(it->iteration_flags & BGITERATOR_FLAG_REPLICATION)) { + // If this write command only modifies the 1st key, we don't need to expedite others + // unless replication enabled. 
+ numKeys = 1; + } + + if (cmd->proc == moveCommand) { + // Unfortunate special case for MOVE + return expediteKeysForMove(it, dbid, argc, argv, waitingOnKeys); + } + + if (cmd->proc == copyCommand) { + // Similar special case for COPY + return expediteKeysForCopy(it, dbid, argc, argv, waitingOnKeys); + } + + bool iterComplete = it->completed || it->terminated; + + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { + // CONSISTENT = YES, REPLICATION = YES / NO + for (int i = 0; i < numKeys; i++) { + robj *oKey = argv[keyrefs[i].pos]; + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de == NULL) continue; // New key, no need to expedite + if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) + && dictFind(it->early_iterate_entries, de) == NULL + && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { + if (addEarlyIterationKey(it, de, dbid)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } else { + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } + } + it->cur_cmd_may_replicate = true; // Will replicate only if replication enabled + } else { + // Identification of missing keys is only needed for non-consistent iteration. This only + // needs to be collected once (on the 1st non-consistent iteration) + bool collectMissing = (listLength(curCmdMissingKeys) == 0); + + if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { + // CONSISTENT = NO, REPLICATION = YES + bool someIterated = false; + // dict containing the keys that have not been iterated yet. + // Using a dict dedupes the keys in case the command contains duplicated keys. 
+ dict *notIteratedKeys = dictCreate(&dictEntryPtrDictType); // dict of dbEntry* -> robj* + + for (int i = 0; i < numKeys; i++) { + robj *oKey = argv[keyrefs[i].pos]; + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de == NULL) { + if (collectMissing) { + incrRefCount(oKey); + listAddNodeHead(curCmdMissingKeys, oKey); + } + continue; + } + if (iterComplete + || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) + || (dictFind(it->early_iterate_entries, de) != NULL)) { + someIterated = true; + } else { + dictAdd(notIteratedKeys, de, oKey); + } + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } + + // Since missing keys are considered as already iterated, if there are any missing keys + // we must consider that some keys have been iterated, and make sure all other keys + // will be expedited if needed. + if (listLength(curCmdMissingKeys) > 0) someIterated = true; + + // This command may be executing as part of a larger transaction. If some parts of the + // transaction have already been identified to replicate, we must wait on all keys and + // replicate here as well. (Take care not to set cur_cmd_may_replicate to false.) + if (someIterated) { + if (server.in_exec) { + // We are now executing the commands in a multi-exec block. + // + // Regarding MULTI/EXEC: Remember that this code is executed twice for commands + // within a MULTI/EXEC block. First, we parse all the commands when deciding + // if the EXEC should be blocked. Then, as each command is executed, it's + // re-parsed so that we can maintain the early iterated list as the commands + // execute. In this second pass, as each command is executed, we can't change + // the replication decision which was made earlier (when the EXEC was processed). + // We don't want to get tricked (by a key being removed and recreated) into + // into starting to replicate in the middle of a MULTI/EXEC block. 
+ } else { + it->cur_cmd_may_replicate = true; + } + } + if (it->cur_cmd_may_replicate) { + dictEntry *de; + dictIterator *di = dictGetIterator(notIteratedKeys); + while ((de = dictNext(di)) != NULL) { + dbEntry *notIteratedEntry = dictGetKey(de); + robj *oKey = dictGetVal(de); + + if (addEarlyIterationKey(it, notIteratedEntry, dbid)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } + dictReleaseIterator(di); + } + dictRelease(notIteratedKeys); + } else { + // CONSISTENT = NO, REPLICATION = NO + for (int i = 0; i < numKeys; i++) { + robj *oKey = argv[keyrefs[i].pos]; + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de == NULL) { + if (collectMissing) { + incrRefCount(oKey); + listAddNodeHead(curCmdMissingKeys, oKey); + } + continue; + } + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } + } + } + + return mustBlock; +} + + +// Called when an iterator is terminated. Pulls everything out of the queue +// and returns the items to Valkey (before they hit the iterator). +static void returnAllItemsToValkey(bgIterator *it) { + serverAssert(onValkeyMainThread()); + + fifo *poppedFifo = mutexQueuePopAll(it->items_for_iterator, false); + if (poppedFifo == NULL) return; // Nothing to return + + // Release non-dictentry items first... 
+ fifo *itemsToReturn = fifoCreate(); + while (fifoLength(poppedFifo) > 0) { + bgIteratorItem *item; + fifoPop(poppedFifo, (void **)&item); + switch (item->type) { + // back out the "queued" statistic + case BGITERATOR_ITEM_DBENTRY: + it->dbentries_queued--; + if (item->u.dbe.is_cloned) it->dbentry_clones_queued--; + break; + case BGITERATOR_ITEM_REPLICATION: + it->replication_queued--; + break; + case BGITERATOR_ITEM_SWAPDB: + it->swapdb_queued--; + break; + case BGITERATOR_ITEM_FLUSHDB: + it->flushdb_queued--; + break; + + case BGITERATOR_ITEM_COMPLETE: + // This can only happen if the completion item has been enqueued and + // the iterator is terminated before reaching the completion item. + itemFreeList_returnItemBackToFreeList(item); + continue; // Skip pushing this onto itemsToReturn + + case BGITERATOR_ITEM_TERMINATED: + // This can only happen if there is a race when terminating between + // the iteration client and main thread. + itemFreeList_returnItemBackToFreeList(item); + continue; // Skip pushing this onto itemsToReturn + + default: + serverAssert(false); + } + + fifoPush(itemsToReturn, item); + } + fifoRelease(poppedFifo); + + // Now release items all at once... 
+ if (fifoLength(itemsToReturn) > 0) { + mutexQueueAddMultiple(it->return_to_valkey, itemsToReturn); + } + fifoRelease(itemsToReturn); +} + + + +//============================================================================================= +// Foreground support functions (private) +//============================================================================================= + +static size_t replicationItemSize(bgIteratorItem *item) { + serverAssert(item->type == BGITERATOR_ITEM_REPLICATION); + size_t itemSize = sizeof(bgIteratorItem); + for (int i = 0; i < item->u.repl.argc; i++) { + itemSize += objectComputeSize(NULL, item->u.repl.argv[i], 0, 0); + } + return itemSize; +} + +static void processReturnOfItemToValkey(bgIteratorItem *item, bgIterator *iter) { + serverAssert(onValkeyMainThread()); + switch ((int)item->type) { + case BGITERATOR_ITEM_REPLICATION: + bufferedReplicationBytes -= replicationItemSize(item); + freeRobjArray(item->u.repl.argc, item->u.repl.argv); + break; + + case BGITERATOR_ITEM_DBENTRY: + { + if (item->u.dbe.is_cloned) { + freeClonedDictEntry(item->u.dbe.de); + } else { + if (isEntryInuseBySingleIterator(item->u.dbe.de)) { + // This blocking mechanism isn't the best. Written for slot-migration, + // it assumes a single DB so if the same key appears in multiple DBs, + // commands might get unblocked only to get blocked again. (This would + // happen only rarely, and with minimal impact.) 
+ robj *key = createStringObjectFromSds(objectGetKey(item->u.dbe.de)); + unblockClientsInUseOnKey(key); + decrRefCount(key); + } + // resumeRehashing must be called before decrementEntryInuse, since decrementEntryInuse can free + if (item->u.dbe.is_rehashing_paused) resumeRehashing(item->u.dbe.de); + decrementEntryInuse(item->u.dbe.de); + } + } + break; + + case BGITERATOR_ITEM_SWAPDB: + case BGITERATOR_ITEM_FLUSHDB: + break; + + case BGITERATOR_ITEMEXT_ITER_CLOSED: + { + bgIterator *it = ((bgIteratorItemExtClose*)item)->iter; + serverAssert(it == iter); + if (it->terminated) { + // Abnormal termination + // Normally the item is TERMINATED, but might be COMPLETE in race + serverAssert(it->current_item->type == BGITERATOR_ITEM_TERMINATED + || it->current_item->type == BGITERATOR_ITEM_COMPLETE); + // Release any items stranded on the iterator after early termination + returnAllItemsToValkey(it); + receiveItemsBackFromOneIterator(it); + } else { + // Normal completion + serverAssert(it->current_item->type == BGITERATOR_ITEM_COMPLETE); + } + serverAssert(mutexQueueLength(it->items_for_iterator) == 0); + serverAssert(it->dbentries_queued == it->dbentries_processed); + serverAssert(it->replication_queued == it->replication_processed); + serverAssert(it->swapdb_queued == it->swapdb_processed); + serverAssert(it->flushdb_queued == it->flushdb_processed); + serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); + + listEmpty(curCmdMissingKeys); // Just in case any remain + + itemFreeList_returnItemBackToFreeList(it->current_item); + it->current_item = NULL; + + bool terminated = it->terminated; + void *privdata = it->privdata; + bgIteratorCleanupFunc cleanup = it->cleanup; + bgIteratorRelease(it); // Fully release the iterator before calling cleanup + + if (BGITERATION_DEBUG) { + if (cleanup) debugBuffer = sdscatprintf(debugBuffer, "CLEANUP FN (%s)\n", + (terminated) ? 
"terminated" : "success"); + + sds filename = sdscatprintf(sdsempty(), "bgiteration_debug.%d", getpid()); + FILE *f = fopen(filename, "w"); + sdsfree(filename); + + fputs(debugBuffer, f); + + fclose(f); + sdsfree(debugBuffer); + debugBuffer = sdsempty(); + } + + if (cleanup) cleanup(terminated, privdata); + } + break; + + default: + serverAssert(false); // Not expecting any other type of item! + } + + // We don't allocate extension items from the pool so we manually free them + if((int)item->type == BGITERATOR_ITEMEXT_ITER_CLOSED) { + zfree(item); + } else { + itemFreeList_returnItemBackToFreeList(item); + } +} + +static void prepareAndProcessReturnedItems(int n, bgIteratorItem **items, bgIterator *iter) { + int i = 0; + for (i = 0; i < n; i++) valkey_prefetch(items[i]); + for (i = 0; i < n; i++) { + if (items[i]->type != BGITERATOR_ITEM_DBENTRY) continue; + // Prefetch can have a significant perf hit on NULL + // but we never expect items[i]->u.dbe.de to be NULL + valkey_prefetch(items[i]->u.dbe.de); + } + for (i = 0; i < n; i++) { + if (items[i]->type != BGITERATOR_ITEM_DBENTRY) continue; + // Same as above, assume key is never NULL + valkey_prefetch(objectGetKey(items[i]->u.dbe.de)); + } + for (i = 0; i < n; i++) processReturnOfItemToValkey(items[i], iter); +} + +#define PREFETCH_BATCH_SIZE 16 + +static bool receiveItemsBackFromOneIterator(bgIterator *it) { + bgIteratorItem* batchPool[PREFETCH_BATCH_SIZE]; + int n = 0; + // Returns true if we process at least one item from + // a given iterator's return_to_valkey queue, false otherwise. 
+ fifo *poppedFifo = mutexQueuePopAll(it->return_to_valkey, false); + if (poppedFifo != NULL) { + while (fifoLength(poppedFifo) > 0) { + fifoPop(poppedFifo, (void **)&batchPool[n++]); + if (n == PREFETCH_BATCH_SIZE) { + prepareAndProcessReturnedItems(n, batchPool, it); + n = 0; + } + } + if (n > 0) { + prepareAndProcessReturnedItems(n, batchPool, it); + } + fifoRelease(poppedFifo); + return true; + } + return false; +} + +static void receiveItemsBackFromIterators(bool blocking) { + // Drain each iterator's return_to_valkey queue (main thread only). + // If `blocking` is true, keep polling until at least one queue + // yielded items; otherwise make exactly one pass over the iterators. + serverAssert(onValkeyMainThread()); + listIter li; + listNode *node; + bool processedItems = false; + do { + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + processedItems |= receiveItemsBackFromOneIterator(it); + } + if (blocking && !processedItems) usleep(100); // Nothing received yet: back off 100us (usleep takes microseconds), then retry + } while (blocking && !processedItems); +} + + +static long long bgIteration_feedIterators_task( + struct aeEventLoop *eventLoop, + long long id, + void *clientData) { + UNUSED(eventLoop); + UNUSED(id); + UNUSED(clientData); + serverAssert(onValkeyMainThread()); + + static monotime lastFeedEndTime; // STATIC: Persists For checking starvation + monotime startTime = getMonotonicUs(); + + if (!bgIteration_iterationActive()) { + // No more iterators exist. Self-check, and terminate the "feed" task. 
+ serverAssert(dictSize(nameToIterator) == 0); + serverAssert(dictSize(inUseEntries) == 0); + serverAssert(bufferedReplicationBytes == 0); + + // Shrink dict back to zero (doesn't normally shrink) + dictRelease(inUseEntries); + inUseEntries = dictCreate(&dictEntryPtrDictType); + + itemFreeList_release(); + + bgIterator_timeproc_id = AE_DELETED_EVENT_ID; + lastFeedEndTime = 0; + return AE_NOMORE; + } + + long dutyTimeUs = BGITER_CYCLE_BUDGET_MS * 1000; + if (lastFeedEndTime > 0) { + // If the timer was delayed, compute the proportional time we should have had, and increase + // the duty cycle to compensate (up to a limit). + long starvationUs = (startTime - lastFeedEndTime) - BGITER_CYCLE_DELAY_MS * 1000; + if (starvationUs > 0) { + long starvationCompensationUs = starvationUs * BGITER_CYCLE_BUDGET_MS + / (BGITER_CYCLE_BUDGET_MS + BGITER_CYCLE_DELAY_MS); + dutyTimeUs += starvationCompensationUs; + dutyTimeUs = MIN(dutyTimeUs, BGITER_CYCLE_BUDGET_MAX_MS * 1000); + } + } + monotime endTime = startTime + dutyTimeUs; + + // Run this part regardless of time limit... + receiveItemsBackFromIterators(false); + + // Feeding iterators (below) respects endTime. The stuff above always runs to completion. + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL && getMonotonicUs() < endTime) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) continue; + feedIterator(it, endTime); + } + + lastFeedEndTime = getMonotonicUs(); + return BGITER_CYCLE_DELAY_MS; +} + + +// Not static, but not API. Intended for unit tests where the event loop may not be active. +void bgIteration_feedIterators(void) { + // For unit testing, force the item_count_target to 1 in each call. This ensures that we only + // feed a minimal amount to the iterators rather than a non-deterministic amount. 
+ listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + it->item_count_target = 1; + } + + // Invoke the feeding task (normally invoked by timer). + bgIteration_feedIterators_task(NULL, 0, NULL); +} + + +static void resetReplicationFlagForIterators(client *c) { + // For any given command, the command may or may not need to be replicated based on the status + // and flags of each iterator. Furthermore, if a command does need to be replicated, this + // replication must occur for an entire atomic unit; we can't replicate only part of a script + // or multi/exec. + // This function is the only place where the replication flag is cleared. + + if (c->flag.multi || c->flag.script) { + // REGARDING MULTI/EXEC + // -------------------- + // When processing a MULTI/EXEC, blockClientIfRequired is called first for the MULTI. Then, + // all of the commands are queued up in server.c:processCommand(). It's only when EXEC is + // encountered, that server.c:call() is fired to begin execution. + // AFTER the EXEC is processed by call(), then each of the commands in the MULTI/EXEC block + // will be processed through call(). + // If write commands are present, MULTI & EXEC will be passed to the replication stream + // before/after the transaction commands. Note that MULTI & EXEC are not actually + // "executed" at the time when their replication is passed to the replication stream. + // + // Example: MULTI; SET A B; EXEC + // 1. blockClientIfRequired() called for MULTI. MULTI flag IS NOT set. (Won't block.) + // 2. blockClientIfRequired() called for EXEC. MULTI flag IS set. (Might block.) + // 3. blockClientIfRequired() called for SET. MULTI flag IS set. (Won't block.) + // 4. handleCommandReplication() is called for MULTI. + // 5. handleCommandReplication() is called for SET. + // 6. handleCommandReplication() is called for EXEC. 
+ // + // SO - if the MULTI flag is set, we DON'T clear the flag. It should only be cleared at the + // start of the transaction, when MULTI is received - and the flag isn't set yet. + + // REGARDING SCRIPTS + // ----------------- + // When processing a script, blockClientIfRequired is called first for the EVAL/EVALSHA/FCALL. + // Then, all of the commands are processed using a special script client. The script + // client has the CLIENT_SCRIPT flag set. For scripts, the replication flag is set when + // processing the EVAL/EVALSHA/FCALL and should not be cleared when executing individual + // commands in the script. + + // If it's the EXEC command, we fall through and clear the flag below. But for all other + // commands within the transaction, we don't clear the flag. + if (c->cmd->proc != execCommand) return; + } + + // For most commands, the replication flag is cleared and we determine if replication is needed + // based on the keys being used and their state in each iterator. If a modified key hasn't been + // processed yet, there's no need to expedite the key or send the replication. The key will be + // sent later, when reached by the iterator. + // However, for scripts, it is not possible to perform this optimization. There is no way to + // know if an undeclared key might be modified. Since the entire script needs to be replicated + // (or not replicated) atomically, we can't take the chance that an undeclared key might be + // hit which requires replication. + bool isScript = isScriptCallWriteCmd(c->cmd); + + getKeysResult result; + initGetKeysResult(&result); + getKeysFromCommand(c->cmd, c->argv, c->argc, &result); + + // [sm-bgiterator] TODO: ELMO-108525, This assumes all keys are in the same slot, should consider cross-slot script case. + sds check_key = (result.numkeys > 0) ? 
objectGetVal(c->argv[result.keys[0].pos]) : NULL; + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) { + it->cur_cmd_may_replicate = false; + } else { + // Set initial state of the replication flag for this transaction + // For full scan iterators, write commands within scripts must always be replicated. + // For cluster slot iterators, replication of script write commands depends on whether + // the key is in scope of the current iterator. + it->cur_cmd_may_replicate = isScript && it->keyset_iter->isKeyInScope(it->keyset_iter, check_key); + } + } + getKeysFreeResult(&result); +} + + +static void handleSwapdb(int db1, int db2) { + serverAssert(onValkeyMainThread()); + serverAssert(bgIteration_iterationActive()); + serverAssert(!server.cluster_enabled); + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) continue; + + // Let the iterator internal mechanism know + it->keyset_iter->swapDb(it->keyset_iter, db1, db2); + + // Let the background client know + if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) { + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "SWAP: %d %d\n", db1, db2); + } + + bgIteratorItem *item = itemFreeList_getElementOrAllocate(); + item->type = BGITERATOR_ITEM_SWAPDB; + item->dbid = db1; + item->u.dbid2 = db2; + it->swapdb_queued++; + mutexQueueAdd(it->items_for_iterator, item); + } + } +} + + +static void removePtrFromEarlyIterate(dbEntry *de) { + // If the item is being released, let's get the pointer out of our early_iterate_entries. + // Note that this is not strictly necessary, but it frees some memory and keeps the + // dictionary small. 
+ listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + dictDelete(it->early_iterate_entries, de); // just try delete (might not be here) + } +} + + +static int findDbForEntry(dbEntry *de) { + for (int i = 0; i < server.dbnum; i++) { + if (dbFind(server.db[i], objectGetKey(de)) == de) return i; + } + serverAssert(false); // the entry MUST be in one of the DBs +} + + +static void terminateIteratorForFlush(bgIterator *it, int dbid) { + if (!it->terminated) bgIteratorTerminate(it); + + // Snoop on the iterator. There might be 1 item still being processed. If that item is in the + // DB being flushed, the item is removed from the dict and held for deferred deletion. This + // allows the iterator to complete processing on the current item without the item being + // deleted unexpectedly. + // Since this is running in parallel with a background thread, the results are volatile. This + // is OK as when the iterator completes processing the item, it still won't have been accepted + // back to Valkey yet, meaning the item will still be in inUseEntries. + bgIteratorItem *item = it->current_item; + if (item && item->type == BGITERATOR_ITEM_DBENTRY) { + dbEntry *de = item->u.dbe.de; + int deDb = findDbForEntry(de); + if (dbid == -1 || dbid == deDb) { + removePtrFromEarlyIterate(de); + } + } +} + + +static void preserveIteratorItemsForFlush(bgIterator *it, int dbid) { + serverAssert(onValkeyMainThread()); + serverAssert(!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)); + serverAssert(dbid >= 0); + // Since this is not a consistent iteration, it's OK if the early_iterate_entries contains + // pointers to items being deleted. The item is not actually accessed from the pointer. And + // if the pointer gets reused for a new item, there's no guarantee that we would iterate it + // anyway. 
If replication is enabled, both new items and early_iterate_entries are treated the + // same (replication is processed). So this is safe in all cases. + // Given this, we will just worry about preserving items in the iterator's processing queue. + // Because of commands like SWAPDB and MOVE, there's no attempt to remove unnecessary items + // from the queue. This is also safer to future Valkey extensions. + + // Temporarily yank all items from the iterator's queue + fifo *poppedFifo = mutexQueuePopAll(it->items_for_iterator, false); + if (poppedFifo != NULL) { + fifo *readdFifo = fifoCreate(); + while(fifoLength(poppedFifo) > 0) { + bgIteratorItem *item; + fifoPop(poppedFifo, (void **)&item); + if (item->type == BGITERATOR_ITEM_DBENTRY) { + dbEntry *de = item->u.dbe.de; + if (dbFind(server.db[dbid], objectGetKey(de)) == de) { + // Found the entry in the DB about to be flushed + removePtrFromEarlyIterate(de); + } + } + fifoPush(readdFifo, item); + } + fifoRelease(poppedFifo); + + // Now give the list back to the iterator + mutexQueueAddMultiple(it->items_for_iterator, readdFifo); + fifoRelease(readdFifo); + } + + // And snoop on the active item. Even if the background task finishes with this item as we look + // at it, the item can't have been returned to Valkey yet. + bgIteratorItem *item = it->current_item; + if (item && item->type == BGITERATOR_ITEM_DBENTRY) { + dbEntry *de = item->u.dbe.de; + if (dbFind(server.db[dbid], objectGetKey(de)) == de) { + // Found the entry in the DB about to be flushed + removePtrFromEarlyIterate(de); + } + } +} + + +static bool isDbSignificant(int dbid) { + unsigned long long totalKeys = 0; + for (int i = 0; i < server.dbnum; i++) { + totalKeys += (server.db[i]) ? dbSize(server.db[i]) : 0; + } + return (server.db[dbid]) ? (dbSize(server.db[dbid]) > totalKeys / 2) : false; +} + + +static void handleFlushdb(int dbid) { + // Invoked BEFORE the actual flush. -1 indicates FLUSHALL. 
+ bool should_abort_iterators = server.cluster_enabled || dbid == -1 || isDbSignificant(dbid); + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + + if (should_abort_iterators || it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { + terminateIteratorForFlush(it, dbid); + } else { + // In this (limited) case, we're only flushing a single DB that contains < half the + // keys. We don't want to kill a full-sync replication. We will just continue with + // iteration, knowing that a replication client will also receive the FLUSHDB on the + // replication stream. + // It would be nice to do this with consistent snapshot also, but given that this is a + // very rare condition, development is not justified to save off the DB for deferred + // delete. This would add a lot of complexity as well as memory implications. + preserveIteratorItemsForFlush(it, dbid); + it->keyset_iter->flushDb(it->keyset_iter, dbid); + + // Send a flushdb event to notify the client + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "FLUSH: %d\n", dbid); + } + + bgIteratorItem *item = itemFreeList_getElementOrAllocate(); + item->type = BGITERATOR_ITEM_FLUSHDB; + item->dbid = dbid; + it->flushdb_queued++; + mutexQueueAdd(it->items_for_iterator, item); + } + } + receiveItemsBackFromIterators(false); // Receive items back before flushing the items +} + + +static bool expediteKeysForWriteOnAllIterators( + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv, + keyReference *keyrefs, + int numKeys, + dict *waitingOnKeys) { + bool mustBlock = false; + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (expediteKeysForWrite(it, dbid, cmd, argc, argv, keyrefs, numKeys, waitingOnKeys)) + mustBlock = true; + } + + return mustBlock; +} + + +static bool 
anIteratorWillReplicateForThisCommand(void) { + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->cur_cmd_may_replicate) return true; + } + return false; +} + + +static bool expediteKeysForMultiExec(client *c, dict *waitingOnKeys) { + serverAssert(c->cmd->proc == execCommand); + + /* For MULTI/EXEC, Valkey buffers all of the commands until hitting the EXEC. + * At this point, the client holds all of the commands to be executed. This function searches + * for all of the keys used by any of the buffered write commands. In addition, if SWAPDB or + * SELECT is used, this tracks the DBIDs through various swap/select operations. + */ + + /* There's a special concern for a NON-consistent iteration with replication. If the keys are + * all "future" keys (which haven't been processed by the iterator yet), then we don't expedite + * the keys or replicate. However, if some keys have already been processed, we need to + * expedite the remaining keys and replicate everything. + * + * When processing a single command, this is all handled. But in this function, for MULTI/EXEC, + * we process 1 command at a time. There's an issue if the first command modifies a "future" + * key, we don't know (without reading ahead) if a later command will modify a prior key. This + * would require the future key to be expedited. + * + * This COULD be addressed by collecting all of the keys into a single structure and then + * analyzing them all at once. However, this won't share code well with the single commands. + * Also, building this structure is a little complex/time-consuming as we need to track both + * key AND dictID. One way to do this might be with a dict of dicts, where the first dict maps + * a dictID to a dict of keys. + * + * ALTERNATIVELY (and it's the simpler approach that's taken here) we can just check if the + * MULTI will be replicated. 
If so, we re-process the MULTI, just in case there were commands + * prior to deciding that replication was required that might have missed expediting. If so, + * these will be caught on the 2nd time around. + * + * Checking replication status before/after ensures that there can only be a single recursive + * call. + */ + bool initiallyAnIteratorWillReplicate = anIteratorWillReplicateForThisCommand(); + + bool mustBlock = false; + int *cur_to_orig_db = NULL; + + int curDb = c->db->id; + for (int cmdNum = 0; cmdNum < c->mstate->count; cmdNum++) { + struct serverCommand *cmd = c->mstate->commands[cmdNum].cmd; + robj **argv = c->mstate->commands[cmdNum].argv; + int argc = c->mstate->commands[cmdNum].argc; + + if (cmd->proc == swapdbCommand) { + int id1, id2; + if (getParamsForSwapdb(argc, argv, c, &id1, &id2)) { + if (cur_to_orig_db == NULL) { + cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum); + for (int i = 0; i < server.dbnum; i++) cur_to_orig_db[i] = i; + } + int temp = cur_to_orig_db[id1]; + cur_to_orig_db[id1] = cur_to_orig_db[id2]; + cur_to_orig_db[id2] = temp; + } + continue; + } + + if (cmd->proc == selectCommand) { + int id; + if (getParamsForSelect(argc, argv, c, &id)) { + curDb = id; + } + continue; + } + + if (!isWriteCmd(cmd)) continue; + + getKeysResult result; + initGetKeysResult(&result); + int numkeys = getKeysFromCommand(cmd, argv, argc, &result); + keyReference *keyrefs = result.keys; + if (numkeys == 0) continue; // Write command with no keys - like FLUSHDB + + if (expediteKeysForWriteOnAllIterators( + cur_to_orig_db ? cur_to_orig_db[curDb] : curDb, + cmd, argc, argv, keyrefs, numkeys, waitingOnKeys)) { + mustBlock = true; + } + getKeysFreeResult(&result); + } + + zfree(cur_to_orig_db); + + if (!initiallyAnIteratorWillReplicate && anIteratorWillReplicateForThisCommand()) { + // We've decided to replicate. Re-process the MULTI/EXEC just once more to make sure that + // we didn't miss any keys at the beginning. 
This can't continue to recurse because + // `initiallyAnIteratorWillReplicate` will be TRUE in the recursive call. Note that the + // recursive call may add additional entries to `waitingOnKeys`. + if (expediteKeysForMultiExec(c, waitingOnKeys)) mustBlock = true; + } + + return mustBlock; +} + +static bgIterator * bgIteratorCreate( + const char *name, + int flags, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata, + bgIterationType iter_type, + genericIterator *keyset_iter) { + serverAssert(onValkeyMainThread()); + serverAssert(server.cluster_enabled || iter_type == BGITERATION_TYPE_FULLSCAN); + serverAssert(server.cluster_enabled // Don't allow CONSISTENT & REPLICATION + || !(flags & BGITERATOR_FLAG_CONSISTENT) // unless cluster mode (avoids + || !(flags & BGITERATOR_FLAG_REPLICATION)); // complications with SWAPDB & FLUSHDB) + + bgIterator *it = zmalloc(sizeof(bgIterator)); + it->name = sdsnew(name); + it->repldone = repldone; + it->cleanup = cleanup; + it->privdata = privdata; + it->items_for_iterator = mutexQueueCreate(); + it->return_to_valkey = mutexQueueCreate(); + + // Floor queue size to bgiteration_queue_increase_incr or use last queue size value + if (last_item_count_target < BGITER_QUEUE_INCREASE_INCR) { + last_item_count_target = BGITER_QUEUE_INCREASE_INCR; + } + it->item_count_target = last_item_count_target; + it->iteration_flags = flags; + it->iteration_type = iter_type; + it->consistent_modification_id = bgIteration_epoch++; + it->keyset_iter = keyset_iter; + it->early_iterate_entries = dictCreate(&dictEntryPtrDictType); + dictExpand(it->early_iterate_entries, BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE); + it->current_item = NULL; + it->client_is_active = false; + it->completed = false; + it->terminated = false; + it->cur_cmd_may_replicate = false; + + it->dbentries_queued = 0; + it->dbentries_processed = 0; + it->replication_queued = 0; + it->replication_processed = 0; + it->swapdb_queued = 0; + it->swapdb_processed 
= 0; + it->flushdb_queued = 0; + it->flushdb_processed = 0; + it->dbentry_clones_queued = 0; + it->dbentry_clones_processed = 0; + + elapsedStart(&it->monotonic_start_time); + it->monotonic_item_start_time = 0; + + + if (bgIterator_timeproc_id <= 0) { + // If iteration is not currently active, start the feeding task. (Runs in main thread.) + bgIterator_timeproc_id = aeCreateTimeEvent(server.el, 1, bgIteration_feedIterators_task, NULL, NULL); + serverAssert(bgIterator_timeproc_id != AE_ERR); + } + + if (dictAdd(nameToIterator, (void*)it->name, it) != DICT_OK) { + // Can't have 2 iterators with the same name! + serverAssert(false); + } + + listAddNodeTail(allIterators, it); + + dictExpand(inUseEntries, listLength(allIterators) * it->item_count_target); + + return it; +} + + + +//============================================================================================= +// PUBLIC INTERFACE: Iterator creation and use +//============================================================================================= + +// PUBLIC API +bgIterator * bgIteratorCreateFullScanIter( + const char *name, + int flags, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata) { + return bgIteratorCreate(name, flags, repldone, cleanup, privdata, BGITERATION_TYPE_FULLSCAN, + fullScanIteratorCreate()); +} + +// PUBLIC API +bgIterator * bgIteratorCreateSlotsIter( + const char *name, + int flags, + const int *slots, + int slots_count, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata) { + return bgIteratorCreate(name, flags, repldone, cleanup, privdata, BGITERATION_TYPE_CLUSTERSLOT, + clusterSlotIteratorCreate(slots, slots_count)); +} + +// PUBLIC API +bgIterator * bgIteratorFind(const char *name) { + serverAssert(onValkeyMainThread()); + + sds sdsname = sdsnew(name); + bgIterator *it = dictFetchValue(nameToIterator, sdsname); + sdsfree(sdsname); + + return it; +} + + +// PUBLIC API +const char *bgIteratorName(bgIterator 
*it) { + return it->name; +} + + +// PUBLIC API +void bgIteratorGetStatus(bgIterator *it, bgIteratorStatus *status) { + status->dbentries_queued = it->dbentries_queued; + status->dbentries_processed = it->dbentries_processed; + status->replication_queued = it->replication_queued; + status->replication_processed = it->replication_processed; + status->swapdb_queued = it->swapdb_queued; + status->swapdb_processed = it->swapdb_processed; + status->flushdb_queued = it->flushdb_queued; + status->flushdb_processed = it->flushdb_processed; + status->dbentry_clones_queued = it->dbentry_clones_queued; + status->dbentry_clones_processed = it->dbentry_clones_processed; + + status->queue_length = mutexQueueLength(it->items_for_iterator); + status->queue_length_target = it->item_count_target; + + status->runtime_ms = elapsedMs(it->monotonic_start_time); + + monotime nonvolatile_item_start_time = it->monotonic_item_start_time; + status->current_item_ms = + (nonvolatile_item_start_time == 0) ? 0 : elapsedMs(nonvolatile_item_start_time); +} + + +// PUBLIC API +void bgIteratorTerminate(bgIterator *it) { + serverAssert(onValkeyMainThread()); + + // Remove any items in the queue, but doesn't affect the 1 item that's being processed. + returnAllItemsToValkey(it); + + // We have to add an item, just in case the READER is waiting on the mutex. 
+ if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, "SENDING TERMINATE\n"); + } + + bgIteratorItem *terminationItem = itemFreeList_getElementOrAllocate(); + *terminationItem = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + mutexQueueAdd(it->items_for_iterator, terminationItem); + + it->terminated = true; +} + + +// PUBLIC API +bool bgIteratorIsTerminating(bgIterator *it) { + return it->terminated; +} + + +// PUBLIC API +bgIteratorItem * bgIteratorRead(bgIterator *it) { + serverAssert(it->current_item == NULL + || (it->current_item->type != BGITERATOR_ITEM_COMPLETE + && it->current_item->type != BGITERATOR_ITEM_TERMINATED)); + + // First, clean up the previous item read + if (it->current_item != NULL) { + returnCurrentItemToValkey(it); + + // To support unit tests. Normal clients call bgIteratorRead from an alternate thread. + // Without this, a unit test could get stuck waiting on the completion event because + // feed won't get invoked. For production, this is called regularly from the main thread. 
+ if (onValkeyMainThread()) bgIteration_feedIterators_task(NULL, 0, NULL); + } else { + it->client_is_active = true; + } + + it->monotonic_item_start_time = 0; // idle until blocking pop returns + it->current_item = mutexQueuePop(it->items_for_iterator, true); + it->monotonic_item_start_time = getMonotonicUs(); + + return it->current_item; +} + + +// PUBLIC API +void bgIteratorClose(bgIterator *it) { + if (it->current_item != NULL) { + if (it->current_item->type == BGITERATOR_ITEM_COMPLETE + || it->current_item->type == BGITERATOR_ITEM_TERMINATED) { + // Normal confirmation of background completion + } else { + // Client is initiating the termination + it->terminated = true; + returnCurrentItemToValkey(it); + + it->current_item = itemFreeList_getElementOrAllocate(); + *(it->current_item) = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + } + } else { + // terminated before first item read + it->terminated = true; + it->current_item = itemFreeList_getElementOrAllocate(); + *(it->current_item) = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + } + + // We don't allocate extension items from the free list + bgIteratorItemExtClose *itemClose = zmalloc(sizeof(bgIteratorItemExtClose)); + itemClose->type = BGITERATOR_ITEMEXT_ITER_CLOSED; + itemClose->iter = it; + mutexQueueAdd(it->return_to_valkey, itemClose); +} + + + +//============================================================================================= +// PUBLIC INTERFACE: Valkey main-thread support hooks +//============================================================================================= + +// PUBLIC API +void bgIteration_init(void) { + serverAssert(onValkeyMainThread()); + + /* This should be called once and only once from the Valkey main thread. However to support + * unit tests, this is not validated, and multiple invocations are ignored. 
*/ + if (nameToIterator) return; // If already initialized, ignore (unit tests) + + nameToIterator = dictCreate(&sdsrefToPtrDictType); + serverAssert(nameToIterator != NULL); + + allIterators = listCreate(); + serverAssert(allIterators != NULL); + + inUseEntries = dictCreate(&dictEntryPtrDictType); + serverAssert(inUseEntries != NULL); + + curCmdMissingKeys = listCreate(); + serverAssert(curCmdMissingKeys != NULL); + listSetFreeMethod(curCmdMissingKeys, decrRefCountVoid); + + bufferedReplicationBytes = 0; + + if (BGITERATION_DEBUG) { + debugBuffer = sdsMakeRoomFor(sdsempty(), SDS_MAX_PREALLOC); + } +} + + +// PUBLIC API +bool bgIteration_iterationActive(void) { + return (allIterators != NULL && listLength(allIterators) > 0); +} + + +// PUBLIC API +void bgIteration_keyDelete(int dbid, const_sds key) { + if (!bgIteration_iterationActive()) return; + serverAssert(onValkeyMainThread()); + + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "KEYDEL: (%d)%s\n", dbid, key); + } + + dbEntry *de = dbFind(server.db[dbid], (sds)key); + if (de == NULL) return; + + // For consistent iterators, we need to make sure the item gets written before delete + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated || !it->keyset_iter->isKeyInScope(it->keyset_iter, key)) continue; + + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT + && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { + if (!it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) + && !(dictFind(it->early_iterate_entries, de) != NULL)) { + addEarlyIterationKey(it, de, dbid); // (may also add to inUseEntries) + } + } + } + + removePtrFromEarlyIterate(de); + + // We might be within the context of a command execution. This happens if the key is found to + // be expired when attempting to execute the command. 
In this case, we should treat the key as + // missing. If the key exists after the command executes, we can treat it like a new key. + // (If not in command execution, this is ok - it's reset at the beginning of command execution.) + robj *oKey = createObject(OBJ_STRING, sdsdup(key)); + listAddNodeHead(curCmdMissingKeys, oKey); +} + + +// PUBLIC API +// Notify bgIteration that a FLUSHALL is being performed outside of the normal client interface. +void bgIteration_flushall(void) { + handleFlushdb(-1); +} + + +// PUBLIC API +bool bgIteration_blockClientIfRequired(client *c) { + serverAssert(onValkeyMainThread()); + if (!bgIteration_iterationActive()) return false; + if (!isWriteCmd(c->cmd)) return false; + + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "BLCK?: (%d)%s\n", c->db->id, + createSdsFromClientArgv(c->argc, c->argv)); + } + + // Before executing a command or atomic transaction, the replication flag is cleared for each + // iterator. If it's determined that the command should replicate, the flag will be set + // as the command and keys are examined for expedite. + resetReplicationFlagForIterators(c); + + if (c->cmd->proc == flushdbCommand || c->cmd->proc == flushallCommand) { + // Handle flush commands prior to execution + int flags; + if (getFlushCommandFlags(c, &flags) == C_OK) { + // The command parsed ok - we WILL flush + handleFlushdb((c->cmd->proc == flushdbCommand) ? 
c->db->id : -1); + } + } + + bool mustBlock = false; + dict *waitOnKeys = dictCreate(&tempKeysetDictType); // dict of robj(sds)->NULL + listEmpty(curCmdMissingKeys); + + if (c->cmd->proc == execCommand) { + mustBlock = expediteKeysForMultiExec(c, waitOnKeys); + } else { + getKeysResult result; + initGetKeysResult(&result); + int numkeys = getKeysFromCommand(c->cmd, c->argv, c->argc, &result); + keyReference *keyrefs = result.keys; + if (numkeys > 0) { + mustBlock = expediteKeysForWriteOnAllIterators( + c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); + serverAssert(!(mustBlock && (c->flag.multi) && !(c->flag.script))); + + if (mustBlock && (c->flag.script)) { + /* For scripts, we will block for keys declared in EVAL/EVALSHA/FCALL. + * However, scripts are NOT required to declare keys. Even if it declares keys, + * it's not declaring the DB for the key. After a SELECT or SWAPDB, we might be on + * a key we haven't blocked for. In this case, there is no option but to execute a + * synchronous block and wait for the iterator(s) to be done with the key(s). + * (Yuck.) */ + while (mustBlock) { + receiveItemsBackFromIterators(true); // Blocking + dictEmpty(waitOnKeys, NULL); + mustBlock = expediteKeysForWriteOnAllIterators( + c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); + } + } + getKeysFreeResult(&result); + } else { + // WRITE commands with no keys should always be replicated. SWAPDB, FLUSH, FUNCTION, etc. 
+ listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + it->cur_cmd_may_replicate = true; + } + } + } + + if (mustBlock) { + serverAssert(dictSize(waitOnKeys) > 0); + robj **waitKeysArgv = zmalloc(sizeof(robj*) * dictSize(waitOnKeys)); + + dictEntry *de; + dictIterator *di = dictGetIterator(waitOnKeys); + unsigned long argvCount = 0; + while((de = dictNext(di)) != NULL) { + waitKeysArgv[argvCount++] = dictGetKey(de); + } + dictReleaseIterator(di); + serverAssert(argvCount == dictSize(waitOnKeys)); + + blockClientInUseOnKeys(c, argvCount, waitKeysArgv); + + zfree(waitKeysArgv); + } + + dictRelease(waitOnKeys); + + if (BGITERATION_DEBUG) { + if (mustBlock) debugBuffer = sdscat(debugBuffer, " (blocked)\n"); + } + + return mustBlock; +} + + +// PUBLIC API +void bgIteration_handleCommandReplication( + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv) { + if (BGITERATION_DEBUG) { + // DEBUG - enable this to capture replication not queued because iteration is inactive + if (0 && !bgIteration_iterationActive() && (isWriteCmd(cmd) || cmd->proc == multiCommand)) { + debugBuffer = sdscatprintf(debugBuffer, "REPL? INACT: (%d)%s\n", dbid, + createSdsFromClientArgv(argc, argv)); + } + } + + if (!bgIteration_iterationActive()) return; + serverAssert(onValkeyMainThread()); + + // Some commands are replicated which are not writes (like publish) these can be ignored. + // Be careful with MULTI which is not a write command, but must be replicated. 
+ if (!isWriteCmd(cmd) && cmd->proc != multiCommand) return; + + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "REPL?: (%d)%s\n", dbid, + createSdsFromClientArgv(argc, argv)); + } + + if (cmd->proc == swapdbCommand) { + // All iterators and clients must be informed of swapdb + int id1, id2; + // command has been processed, but Valkey allows "swapdb 0 0" (which can be ignored) + if (getParamsForSwapdb(argc, argv, NULL, &id1, &id2)) + handleSwapdb(id1, id2); + } + + // In the case that a key is touched in a different DB (COPY/MOVE) the key is recorded as + // a "special" key and then handled below. + int special_dbid = 0; + sds special_key = NULL; + dbEntry *special_dbEntry = NULL; + if (cmd->proc == moveCommand) { + // The MOVE command succeeded. However MOVE requires special handling as it creates a new + // key in a different database. We need to make sure that we don't later try to iterate + // on the key as it would be a duplicate key at that point. So, instead, we will mark the + // newly created key as "early iterated". + bool success = getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &special_dbid); + serverAssert(success); // the command already succeeded, so this should work! + + robj *oKey = argv[1]; + special_key = (sds)objectGetVal(oKey); + + special_dbEntry = dbFind(server.db[special_dbid], special_key); + } + if (cmd->proc == copyCommand) { + // The COPY command succeeded. However COPY requires special handling (like MOVE). + bool success = getTargetDbIdForCopyCommand(argc, argv, dbid, &special_dbid); + serverAssert(success); // the command already succeeded, so this should work! + + // Find the newly created entry. + robj *oKey = argv[2]; + special_key = (sds)objectGetVal(oKey); + + special_dbEntry = dbFind(server.db[special_dbid], special_key); + } + + /* Implementation note regarding LUA and MULTI: LUA scripts and MULTI-EXEC blocks must be + * treated atomically.
We need to ensure that either ALL of the replication (or none of the + * replication) for the atomic operation is processed by the iterator(s). This is handled + * naturally as we can only "complete" the iteration during the feeding process - and feeding + * is only performed when handling timer events (after the LUA/MULTI has completed). */ + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) continue; + + // For consistent iteration, we only iterate values based on version. But for + // non-consistent iteration, we don't need to explicitly iterate any values newly created + // during the iteration. So we mark them as expedited. We know we have a new key if it + // was missing before the command, and exists now. + if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) { + // Handle the special case of a key moved to a different DB + if (special_dbEntry != NULL) { + if (it->cur_cmd_may_replicate + && !it->keyset_iter->hasPassedItem(it->keyset_iter, special_key, special_dbid)) { + dictAdd(it->early_iterate_entries, special_dbEntry, NULL); + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(special_dbid, special_dbEntry); + debugBuffer = sdscatprintf(debugBuffer, "EARLY(special): %s\n", entryString); + sdsfree(entryString); + } + } + + // Note: In the cases where there's a special command, we are copying or moving an + // item to a different DB. In these limited cases, we can only possibly be + // creating a single key. And if we've handled it here, we don't need to + // handle it as a "missing key" below. If we were to try to handle it as a + // standard "missing key", we would get the DBID incorrect. 
+ } else if (listLength(curCmdMissingKeys) > 0) { + listIter missingIt; + listNode *missingNode; + listRewind(curCmdMissingKeys, &missingIt); + while ((missingNode = listNext(&missingIt)) != NULL) { + robj *oKey = listNodeValue(missingNode); + const_sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], (sds)key); + if (de != NULL) { + // It exists now! + if (it->cur_cmd_may_replicate + && !it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) { + // If the current command is allowed to replicate, and there is a new + // key which we haven't yet reached in iteration, it needs to be added + // to the set of early iterate entries. (We know that it's not already + // in that set because it's a newly created key!) + dictAdd(it->early_iterate_entries, de, NULL); + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, de); + debugBuffer = sdscatprintf(debugBuffer, "EARLY(NEW): %s\n", entryString); + sdsfree(entryString); + } + } + } + } + } + } + + /* Deletes (and unlinks) are special. + * Developer context: For most commands, we call bgIteration_blockClientIfRequired before + * the command and then call bgIteration_handleCommandReplication after the command. While + * the "before" logic is determining the need to block, it can also determine (mostly) the + * need for replication (on each iterator). Doing this all in one place saves us from + * performing some of the same logic twice. When we get to this point in the code, we just + * use the previously determined information regarding replication. This works because + * Valkey is single-threaded and only processes one command at a time. + * + * But deletes (and unlinks) happen multiple ways - and occur outside the normal + * before/after logic for commands. These situations must be handled: + * - A normal (client-driven) DEL/UNLINK command will use the standard before/after + * logic. If the key is in use by bgIteration, the command will be blocked. 
+ * - An EVICTION generates a DEL/UNLINK which happens outside of the context of a client + * issued command. The replication flags on the iterators are stale and relate to the + * prior command executed. + * - An EXPIRATION in the context of a client-driven WRITE command occurs when the client + * command attempts to access a key and it is found to be expired. In this case, the + * client-command has already gone through the blocking process, so it should be OK to + * use it->cur_cmd_may_replicate. + * - An EXPIRATION in the context of a client-driven READ command occurs when the client + * command attempts to access a key and it is found to be expired. In this case, the + * client-command has NOT gone through the blocking process. The replication flags on + * the iterators are stale and relate to the prior (write) command executed. + * - An EXPIRATION outside of a client-driven command occurs due to active expiry. In + * this case, the replication flags on the iterator are stale and relate to the prior + * command executed. + * + * In the case of EXPIRE/EVICT occurring outside the context of a write command, this is + * handled. If the key is in-use by bgIterator, increment of robj's refcount prevents the + * key from deletion. In this case the key will be removed from the main dictionary, but + * held inside bgIteration until no longer needed. + * Even though the entry is not physically deleted yet, it is logically deleted and it is + * safe to replicate the DEL/UNLINK. Since iterators process items FIFO, the replication + * for DEL/UNLINK won't actually get processed until other queued replication is processed. + * + * In the case of a client driven DEL command, the key will have already been deleted when + * we hit this routine. In the case of EXPIRE/EVICT, the propagation happens before the key + * is deleted. So if the key is missing, we can use the cached replication decision. But + * if the key still exists (indicating EXPIRE/EVICT) we evaluate it specially.
+ */ + bool shouldReplicateDelCommand = false; + bool isDelCommand = isDeleteCmd(cmd); + if (isDelCommand) { + sds key = objectGetVal(argv[1]); + if (it->keyset_iter->isKeyInScope(it->keyset_iter, key)) { + dbEntry *de = dbFind(server.db[dbid], key); + if (de) { + // NOTE: It's weird, but helpful, for both EXPIRE and EVICT the propagation happens + // BEFORE the actual delete. So if the dbEntry still exists, we are doing + // an expire/evict which is not preceded by blockClientIfRequired(). + if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) + || (dictFind(it->early_iterate_entries, de) != NULL)) { + shouldReplicateDelCommand = true; + } + } else { + // The dbEntry has already been deleted, this must be part of normal command + // processing. + shouldReplicateDelCommand = it->cur_cmd_may_replicate; + } + } + } + + bool replicate = (it->iteration_flags & BGITERATOR_FLAG_REPLICATION && + ((!isDelCommand && it->cur_cmd_may_replicate) + || shouldReplicateDelCommand)); + + if (replicate) { + /* We will replicate the command in these cases: + * 1) For consistent iteration - it->cur_cmd_may_replicate is always true + * 2) For non-consistent, if any of the keys have been processed, expediteKeysForWrite + * will ensure that ALL of the keys have been expedited - and we should replicate + * 3) For non-consistent, if NONE of the keys have been processed, no need to replicate + */ + + if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, " (queued)\n"); + } + + bgIteratorItem *item = itemFreeList_getElementOrAllocate(); + item->type = BGITERATOR_ITEM_REPLICATION; + item->dbid = dbid; + item->u.repl.cmd = cmd; + item->u.repl.argv = cloneRobjArray(argc, argv); + item->u.repl.argc = argc; + bufferedReplicationBytes += replicationItemSize(item); + it->replication_queued++; + mutexQueueAdd(it->items_for_iterator, item); + } + } // allIterators loop +} + + +// PUBLIC API +size_t bgIteration_memoryInuseForReplication(void) { + return bufferedReplicationBytes; +} + 
+ +// PUBLIC API +bool bgIteration_isEntryInuse(dbEntry *de) { + serverAssert(onValkeyMainThread()); + return isEntryInuseByAnyIterator(de); +} + + +// PUBLIC API +uint32_t bgIteration_getEpoch(void) { + return bgIteration_epoch; +} + + +// PUBLIC API +void bgIteration_updateDbEntryPtr(dbEntry *old, dbEntry *new) { + if (!bgIteration_iterationActive() || old == new) return; + serverAssert(onValkeyMainThread()); + serverAssert(!isEntryInuseByAnyIterator(old)); + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (dictDelete(it->early_iterate_entries, old) == DICT_OK) { + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "EARLY LIST UPDATE %p -> %p\n", (void *)old, (void *)new); + } + dictAdd(it->early_iterate_entries, new, NULL); + } + } +} diff --git a/src/bgiteration.h b/src/bgiteration.h new file mode 100644 index 00000000000..35a4b988857 --- /dev/null +++ b/src/bgiteration.h @@ -0,0 +1,363 @@ +#ifndef __BGITERATION_H +#define __BGITERATION_H + +#include <stdbool.h> +#include "sds.h" + +/* A mechanism for creating iteration clients which iterate over the main dictionary in a + * background thread. + * + * This mechanism passes keys to the iteration client, while blocking the keys from write by the + * Valkey main thread. Once an iteration client is done with a key, it is returned to the Valkey + * main thread and any pending writers are unblocked. + * + * A bgIterator must be created on the main Valkey thread, and then passed to another thread which + * implements the logic of the iteration client. + * + * Iteration clients are expected to read through the keyspace until the iteration is complete or + * terminated. An iteration client may not perform modifications on a key. + * + * Future enhancement: Certain types of modifications may be passed back to the Valkey main thread. + * Use case: A background compression thread wants to compress a string value.
+ */ + +/* Avoids dependency on server.h */ +typedef struct serverObject dbEntry; // An object with key/value inserted into main dictionary +typedef struct serverObject robj; // An object with a value used for command parameters +typedef struct client client; + +/* The bgIterator is an opaque structure. */ +typedef struct bgIterator bgIterator; + + +/* Flag indicates that a consistent iteration is required. This is used to create a point-in-time + * iteration. The iteration client will see all keys AS THEY EXISTED at the time when the iterator + * was created. + * Note: The DBID provided with BGITERATOR_ITEM_DBENTRY items is the original DBID (at the time of + * iteration start). SWAPDB events are NOT provided during a consistent iteration. */ +#define BGITERATOR_FLAG_CONSISTENT (1 << 0) + +/* Flag indicating that the replication stream for keys which have already been processed should be + * forwarded to the iteration client. Most useful for non-consistent iteration to track changes + * to keys already processed. By tracking changes, this allows a non-consistent iteration client + * to achieve a consistent view at the END of the iteration. + * NOTE: Replication events will be provided ordered and synchronized with any SWAPDB events. + * LIMITATION: Since SWAPDB events are not provided during CONSISTENT iteration, it is not + * permitted to use both CONSISTENT and REPLICATION on a non-clustermode instance. */ +#define BGITERATOR_FLAG_REPLICATION (1 << 1) + + +/* When running an iterator with replication, a replication-done function (callback) may be + * provided. This function will be executed after the last replication item has been fed into the + * queue for the client. This function will be run on the Valkey main thread, and allows a client + * to recognize the point where no additional replication data will be sent for processing. + * + * PRIVDATA: this pointer is for data private to the iteration client.
+ * + * The callback returns true to let the iterator stop accepting replication items into the queue + * for the client. If false is returned, replication will continue, and bgiteration will + * periodically call the callback until true is returned. In this context, returning false + * indicates that the client is not ready to stop receiving replication; it is requesting that + * replication be continued. + */ +typedef bool (*bgIteratorReplDoneFunc)(void *privdata); + + +/* When creating a bgIterator, a cleanup function (callback) may be provided. This function will be + * executed once iteration has completed, and it will run on the Valkey main thread. + * + * TERMINATED: will be passed as TRUE if the iteration process was terminated early (either by + * the main thread calling bgIteratorTerminate() or the iteration client calling + * bgIteratorClose()). + * PRIVDATA: this pointer is for data private to the iteration client. + */ +typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata); + + +/* Create a background full-scan iterator (bgIterator). + * This bgIterator will iterate through the entire keyspace (across all DBs). + * + * NAME: a human readable name for the iterator (must be unique) + * FLAGS: creation flags indicate iteration options + * REPLDONE: if provided, called after the last replication item has been queued (on the Valkey main thread) + * CLEANUP: if provided, called at the end of iteration (on the Valkey main thread) + * PRIVDATA: passed to cleanup function + * + * This method creates and initializes the bgIterator. It does not perform any thread management. + * It is expected that the main Valkey thread will call this method, and then start a new thread to + * implement the iteration client which will read from the returned bgIterator. + * + * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the + * last item is read.
+ */ +bgIterator * bgIteratorCreateFullScanIter( + const char *name, + int flags, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata); + + +/* Create a background slots iterator (bgIterator). + * This bgIterator will iterate through the keys belonging to a set of cluster slots. + * + * NAME: a human readable name for the iterator (must be unique) + * FLAGS: creation flags indicate iteration options + * SLOTS: array of cluster slots to iterate over + * SLOTS_COUNT: size of the array of slots + * REPLDONE: if provided, called after the last replication item has been queued (on the Valkey main thread) + * CLEANUP: if provided, called at the end of iteration (on the Valkey main thread) + * PRIVDATA: passed to cleanup function + * + * This method creates and initializes the bgIterator. It does not perform any thread management. + * It is expected that the main Valkey thread will call this method, and then start a new thread to + * implement the iteration client which will read from the returned bgIterator. + * + * The caller of this function has the ownership of the `slots` array's memory. This function will + * just copy its data and leave the array untouched. + * + * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the + * last item is read. + */ +bgIterator * bgIteratorCreateSlotsIter( + const char *name, + int flags, + const int *slots, + int slots_count, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata); + + +/* Find an existing bgIterator by name. + * Returns NULL if the iterator does not exist (or has completed). + */ +bgIterator * bgIteratorFind(const char *name); + + +/* Get the name of an existing iterator. */ +const char * bgIteratorName(bgIterator *iter); + + +/* Struct to retrieve status information for an active iteration client.
*/ +typedef struct { + unsigned long dbentries_queued; // Cumulative BGITERATOR_ITEM_DBENTRY queued + unsigned long dbentries_processed; // Cumulative BGITERATOR_ITEM_DBENTRY processed + unsigned long replication_queued; // Cumulative BGITERATOR_ITEM_REPLICATION queued + unsigned long replication_processed; // Cumulative BGITERATOR_ITEM_REPLICATION processed + unsigned long swapdb_queued; // Cumulative BGITERATOR_ITEM_SWAPDB queued + unsigned long swapdb_processed; // Cumulative BGITERATOR_ITEM_SWAPDB processed + unsigned long flushdb_queued; // Cumulative BGITERATOR_ITEM_FLUSHDB queued + unsigned long flushdb_processed; // Cumulative BGITERATOR_ITEM_FLUSHDB processed + unsigned long dbentry_clones_queued; // A subset of dbentries_queued for cloned entries + unsigned long dbentry_clones_processed; // A subset of dbentries_processed for cloned entries + unsigned long queue_length; // Current length of queue to iteration client + unsigned long queue_length_target; // Dynamic target length for queue to iteration client + unsigned long runtime_ms; // Time, in milliseconds, that iterator has been running + unsigned long current_item_ms; // Time, in milliseconds, spent processing current item +} bgIteratorStatus; + + +/* Get the status of a background iteration. + * + * The caller-provided bgIteratorStatus will be populated. + */ +void bgIteratorGetStatus(bgIterator *iter, bgIteratorStatus *status); + + +/* Terminate a background iteration. + * + * An iteration is terminated by the Valkey main thread. It is expected that the iteration client + * will continue to read, receiving BGITERATOR_ITEM_TERMINATED or BGITERATOR_ITEM_COMPLETE to + * complete the iteration. (This is necessary to ensure proper cleanup.) + * NOTE: If the iteration client wants to terminate iteration, it may call bgIteratorClose(). + */ +void bgIteratorTerminate(bgIterator *iter); + + +/* Check if an iterator is being terminated. + * + * This checks if the iterator is in the process of terminating. 
For the Valkey main thread, this + * can be used to determine if a call has already been made to bgIteratorTerminate. For an + * iteration client, it normally learns about terminate by reading the next item, this allows + * out-of-band detection of termination which can be useful when processing a large key. + */ +bool bgIteratorIsTerminating(bgIterator *iter); + + +typedef enum { + /* Indicates that the iteration has completed normally. No more items to read. + * If replication is enabled, on completion, the final replication offset is recorded in + * 'u.master_repl_offset' and 'dbid' is set to the selected replication db. The iteration + * client will have received all *applicable* replication data to this point. */ + BGITERATOR_ITEM_COMPLETE = 1, + + /* Indicates that the iteration has been terminated before completion. No more items to read.*/ + BGITERATOR_ITEM_TERMINATED, + + /* A dbEntry for DB=dbid. + * NOTE: The dbEntry MAY be expired. It is up to the client to decide how to handle + * expired entries. */ + BGITERATOR_ITEM_DBENTRY, + + /* A replication command for DB=dbid. cmd, argv, & argc provided. + * NOTE: The command may have been re-written before replication. */ + BGITERATOR_ITEM_REPLICATION, + + /* A SWAPDB event. dbid swapped with dbid2. + * Note that SWAPDB events are not provided during consistent iteration. */ + BGITERATOR_ITEM_SWAPDB, + + /* A FLUSHDB event. In most cases, iteration will be terminated, and this event will NOT be + * sent. However, in the case of a single minor DB being flushed, non-consistent iteration is + * permitted to continue. */ + BGITERATOR_ITEM_FLUSHDB +} bgIteratorItemType; + + +typedef struct { + dbEntry *de; + bool is_cloned; + bool is_rehashing_paused; +} dbEntryData; + +typedef struct { + struct serverCommand *cmd; + robj **argv; + int argc; +} replicationData; + +typedef struct { + bgIteratorItemType type; + int dbid; /* orig DB ID for CONSISTENT, queue-time DB ID for !CONSISTENT. 
*/ + union { + dbEntryData dbe; // for BGITERATOR_ITEM_DBENTRY + replicationData repl; // for BGITERATOR_ITEM_REPLICATION + long long master_repl_offset; // for BGITERATOR_ITEM_COMPLETE + int dbid2; // for BGITERATOR_ITEM_SWAPDB + } u; +} bgIteratorItem; + + +/* Read the next bgIteratorItem from the bgIterator. + * + * The iteration client is expected to call this function in a loop. After reading + * BGITERATOR_ITEM_COMPLETE or BGITERATOR_ITEM_TERMINATED, the iteration client must call + * bgIteratorClose to finalize the iteration process. + * + * This is a blocking call. If the main Valkey thread has been too busy to send items to the + * iterator, the iteration client's queue may run dry and this call will block until data is + * available. + * + * NOTE: Reading an item returns previously read items to Valkey. It is unsafe to reference an item + * previously read. + * + * (All memory management is the responsibility of the bgIterator - not the reader.) + */ +bgIteratorItem * bgIteratorRead(bgIterator *iter); + + +/* Close the bgIterator, allowing the bgIterator to be deallocated. + * + * This must be called by an iteration client to release the bgIterator. + * + * It is required that this is called after receiving BGITERATOR_ITEM_COMPLETE or + * BGITERATOR_ITEM_TERMINATED and signals that the background activity is complete. + * + * This may also be called by the iteration client to force terminate an iteration early. The + * bgIterator will be marked as terminated. 
+ */ +void bgIteratorClose(bgIterator *iter); + + +/******************************************************************************************** + * BGITERATION HOOKS REQUIRED TO SUPPORT ITERATION - CALLS INSERTED INTO MAIN VALKEY CODE + ********************************************************************************************/ + +typedef struct { + uint32_t iterator_epoch; // iterator epoch of last modification +} bgIterationEntryMetadata; + + +/* Must be called once (and only once) at server startup. */ +void bgIteration_init(void); + + +/* Returns true if any iterators are currently active. */ +bool bgIteration_iterationActive(void); + + +/* Notify bgIteration that a key is being deleted. In Valkey, key deletion can occur in a READ + * command if the key is expired. Note that this notification is more about status than memory. + * Since the dbEntry is a reference counted object, the dbEntry can't be physically deleted if + * bgIteration is still actively using it. + */ +void bgIteration_keyDelete(int dbid, const_sds key); + + +/* Iteration needs to know if a FLUSHALL is being performed. For normal clients, this comes through + * the standard "blockClientIfRequired" interface. This interface is for cases where Valkey + * performs the FLUSHALL operation independently of clients (e.g. when syncing with master). + */ +void bgIteration_flushall(void); + + +/* Updating value or expiration of an existing key may lead to reallocation of the dbEntry (robj). + * BgIteration keeps track of expedited keys (by pointer) to avoid repeated iteration. BgIteration + * must be notified when dbEntries are reallocated. BgIteration will not dereference the pointers; + * it is safe to have deallocated the old dbEntry before calling this function. + * + * We can't update the dbEntry if the entry is actually in use (bgIteration_isEntryInuse)! + * + * To simplify calling code, this function does nothing if old_entry == new_entry. 
+ */ +void bgIteration_updateDbEntryPtr(dbEntry *old_entry, dbEntry *new_entry); + + +/* Before executing any command, the Valkey main thread must call this function. If the key(s) are + * blocked for writes by an iterator, the function returns true and the client is blocked. A + * blocked client will be unblocked once the key becomes available for write. + * + * This should be called for all commands - even commands which are executed as part of a MULTI/EXEC + * or LUA script. + * + * For MULTI/EXEC - This function is called when hitting the EXEC - after all of the commands + * have been queued. This may block the EXEC, but will NOT block individual + * commands as they are executed in the MULTI/EXEC block. + * + * For LUA script - This function is first called for EVAL/EVALSHA. It may block the script while + * waiting on declared keys. However, if the script accesses undeclared keys or + * performs SWAPDB, a synchronous block may be performed (returning false) on + * individual commands within the script. + * + * Note: this function should be called for all commands (not just writes). + */ +bool bgIteration_blockClientIfRequired(client *c); + + +/* After execution of a write command, the Valkey main thread must provide the command to iterators + * which are interested in the replication feed. It is required that all commands have been passed + * through bgIteration_blockClientIfRequired(), however, it is permitted that the command can be + * re-written for propagation. + */ +void bgIteration_handleCommandReplication( + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv); + + +/* The memory that bgIteration uses while temporarily buffering replication data is not included in + * the maxmemory computation used for eviction. This function provides insight into the current + * amount of memory used for buffered replication data. 
+ */ +size_t bgIteration_memoryInuseForReplication(void); + + +/* Check if a dbEntry is currently in-use/locked by bgIteration. */ +bool bgIteration_isEntryInuse(dbEntry *de); + + +/* Get the current iteration epoch, for tagging metadata on keys. */ +uint32_t bgIteration_getEpoch(void); + +#endif diff --git a/src/db.c b/src/db.c index ed906f22c4e..97706d7f2c0 100644 --- a/src/db.c +++ b/src/db.c @@ -37,6 +37,7 @@ #include "module.h" #include "vector.h" #include "expire.h" +#include "bgiteration.h" /*----------------------------------------------------------------------------- * C-level DB API @@ -361,6 +362,7 @@ static void dbSetValue(serverDb *db, robj *key, robj **valref, int overwrite, vo val->lru = old->lru; long long expire = objectGetExpire(old); new = objectSetKeyAndExpire(val, objectGetVal(key), expire); + bgIteration_updateDbEntryPtr(old, new); *oldref = new; /* Replace the old value at its location in the expire space. */ if (expire >= 0) { @@ -430,6 +432,8 @@ void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags) { } else { dbSetValue(db, key, valref, 1, NULL); } + bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(*valref); + if (md) md->iterator_epoch = bgIteration_getEpoch(); if (!(flags & SETKEY_KEEPTTL)) removeExpire(db, key); if (!(flags & SETKEY_NO_SIGNAL)) signalModifiedKey(c, db, key); } @@ -475,6 +479,8 @@ int dbGenericDeleteWithDictIndex(serverDb *db, robj *key, int async, int flags, hashtablePosition pos; void **ref = kvstoreHashtableTwoPhasePopFindRef(db->keys, dict_index, objectGetVal(key), &pos); if (ref != NULL) { + bgIteration_keyDelete(db->id, (sds)objectGetVal(key)); + robj *val = *ref; /* VM_StringDMA may call dbUnshareStringValue which may free val, so we * need to incr to retain val */ @@ -753,6 +759,15 @@ long long dbTotalServerKeyCount(void) { void signalModifiedKey(client *c, serverDb *db, robj *key) { touchWatchedKey(db, key); trackingInvalidateKey(c, key, 1); + + /* If bgIteration is 
running, need to maintain the iteration epoch. */ + if (bgIteration_iterationActive()) { + dbEntry *o = dbFind(db, objectGetVal(key)); + if (o) { + bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(o); + if (md) md->iterator_epoch = bgIteration_getEpoch(); + } + } } void signalFlushedDb(int dbid, int async) { @@ -2257,7 +2272,7 @@ robj *dbFindExpires(serverDb *db, sds key) { } unsigned long long dbSize(serverDb *db) { - return kvstoreSize(db->keys); + return (db->keys) ? kvstoreSize(db->keys) : 0; } unsigned long long dbScan(serverDb *db, unsigned long long cursor, kvstoreScanFunction scan_cb, void *privdata) { diff --git a/src/hashtable.c b/src/hashtable.c index dcae6dfa014..1dcb8038030 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -214,6 +214,8 @@ static_assert(100 * BUCKET_DIVISOR / BUCKET_FACTOR / ENTRIES_PER_BUCKET <= MAX_F "Expand must result in a fill below the soft max fill factor"); static_assert(MAX_FILL_PERCENT_SOFT <= MAX_FILL_PERCENT_HARD, "Soft vs hard fill factor"); +#define ITERATOR_DONE_WITH_BUCKET_IDX (ENTRIES_PER_BUCKET + 1) + /* --- Random entry --- */ #define FAIR_RANDOM_SAMPLE_SIZE (ENTRIES_PER_BUCKET * 10) @@ -344,7 +346,7 @@ typedef struct { } position; static_assert(sizeof(hashtablePosition) >= sizeof(position), - "Opaque iterator size"); + "Opaque position size"); /* State for incremental find. */ typedef struct { @@ -612,7 +614,8 @@ static bucket *fetchEntriesForExpand(bucket *b, void *buf[], int *size, int max_ /* Processes one bucket chain during incremental table expansion. * Uses batch processing to optimize memory access patterns. 
*/ -static void rehashStepExpand(hashtable *ht) { +// Not API, but not static - used in unit testing +void rehashStepExpand(hashtable *ht) { void *entry_buf[FETCH_ENTRY_BUFFER_SIZE_WHEN_EXPAND]; const void *key_buf[FETCH_ENTRY_BUFFER_SIZE_WHEN_EXPAND]; size_t idx = ht->rehash_idx; @@ -1377,13 +1380,13 @@ void hashtableResumeAutoShrink(hashtable *ht) { * spaces, "holes", in the bucket chains, which wastes memory. Additionally, we * pause auto shrink when rehashing is paused, meaning the hashtable will not * shrink the bucket count. */ -static void hashtablePauseRehashing(hashtable *ht) { +void hashtablePauseRehashing(hashtable *ht) { ht->pause_rehash++; hashtablePauseAutoShrink(ht); } /* Resumes incremental rehashing, after pausing it. */ -static void hashtableResumeRehashing(hashtable *ht) { +void hashtableResumeRehashing(hashtable *ht) { ht->pause_rehash--; assert(ht->pause_rehash >= 0); hashtableResumeAutoShrink(ht); @@ -2268,7 +2271,9 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { * child bucket in a chain, or to the next bucket index, or to the * next table. */ iter->pos_in_bucket++; - if (iter->bucket->chained && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1) { + if (iter->bucket->chained + && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1 + && iter->pos_in_bucket != ITERATOR_DONE_WITH_BUCKET_IDX) { iter->pos_in_bucket = 0; iter->bucket = getChildBucket(iter->bucket); } else if (iter->pos_in_bucket >= ENTRIES_PER_BUCKET) { @@ -2562,3 +2567,68 @@ int hashtableLongestBucketChain(hashtable *ht) { } return maxlen; } + +/* This is an internal function - not part of the standard API. It must be explicitly declared + * where used. It shouldn't be included in any .h (API) file. Use of this interface is discouraged + * as it depends on the internal structure, which may change. 
+ * + * For a given key, return: + * table_idx - the index of the internal table (0 or 1) + * bucket_idx - the bucket index within the table (0..n) + * + * Returns TRUE if the key exists in the table. + * Returns FALSE if the key doesn't exist (and table/index are undefined) + */ +bool hashtableInternalFindBucketIdx(hashtable *ht, void *key, int *table_idx, size_t *bucket_idx) { + uint64_t hash = hashKey(ht, key); + int pos_in_bucket; + int table; + bucket *b = findBucket(ht, hash, key, &pos_in_bucket, &table); + if (!b) return false; + + *table_idx = table; + *bucket_idx = hash & expToMask(ht->bucket_exp[table]); + return true; +} + +/* This is an internal function - not part of the standard API. It must be explicitly declared + * where used. It shouldn't be included in any .h (API) file. Use of this interface is discouraged + * as it depends on the internal structure, which may change. + * + * For a given iterator, return: + * table_idx - the index of the internal table (0 or 1) + * bucket_idx - the bucket index within the table (0..n) + * + * NOTE: hashtableIterator position is based on the LAST item returned. + */ +void hashtableInternalIteratorGetBucketIdx(hashtableIterator *iterator, int *table_idx, size_t *bucket_idx) { + iter *it = iteratorFromOpaque(iterator); + *table_idx = it->table; + *bucket_idx = it->index; +} + +/* This is an internal function - not part of the standard API. It must be explicitly declared + * where used. It shouldn't be included in any .h (API) file. Use of this interface is discouraged + * as it depends on the internal structure, which may change. + * + * Returns TRUE if the iterator is ready to move to the next bucket index (if it has completed the + * current bucket index). Note: hashtableIterator bucket_idx is the bucket index of the last item + * returned by hashtableNext.
+ * + * Note: If this function returns true, the iterator commits to move onto the next bucket index, + * even if something new is added to the end of the current bucket before hashtableNext is called. + */ +bool hashtableInternalIteratorIsBucketIdxComplete(hashtableIterator *iterator) { + iter *it = iteratorFromOpaque(iterator); + + if (it->bucket->chained) return false; + + if (!(it->bucket->presence >> (it->pos_in_bucket + 1))) { + /* There's CURRENTLY nothing else to return at this bucket index. Mark pos_in_bucket + * so that hashtableNext will move to the next bucket index, regardless of items which may + * be added in the future. */ + it->pos_in_bucket = ITERATOR_DONE_WITH_BUCKET_IDX; + return true; + } + return false; +} diff --git a/src/hashtable.h b/src/hashtable.h index 8bbf5d8c05b..97ecab68518 100644 --- a/src/hashtable.h +++ b/src/hashtable.h @@ -129,6 +129,8 @@ size_t hashtableMemUsage(const hashtable *ht); void hashtablePauseAutoShrink(hashtable *ht); void hashtableResumeAutoShrink(hashtable *ht); bool hashtableIsRehashing(hashtable *ht); +void hashtablePauseRehashing(hashtable *ht); +void hashtableResumeRehashing(hashtable *ht); bool hashtableIsRehashingPaused(hashtable *ht); ssize_t hashtableGetRehashingIndex(hashtable *ht); void hashtableRehashingInfo(hashtable *ht, size_t *from_size, size_t *to_size); diff --git a/src/kvstore.c b/src/kvstore.c index 86078cfc1ab..1ac72a01dc2 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -689,6 +689,16 @@ int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it) { return kvs_it->didx; } +/* This is an internal function - not part of the standard API. It must be explicitly declared + * where used. It shouldn't be included in any .h (API) file. Use of this interface is discouraged + * as it depends on the internal structure, which may change. + * + * Return the current hashtableIterator from within the kvstoreIterator.
+ */ +hashtableIterator *kvstoreInternalIteratorGetCurrentHashtableIterator(kvstoreIterator *kvs_it) { + return &kvs_it->di; +} + /* Fetches the next element and returns true. Returns false if there are no more elements. */ bool kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next) { if (kvs_it->didx != KVSTORE_INDEX_NOT_FOUND && hashtableNext(&kvs_it->di, next)) { diff --git a/src/module.c b/src/module.c index af6b9324f62..6efd517a501 100644 --- a/src/module.c +++ b/src/module.c @@ -70,6 +70,7 @@ #include "io_threads.h" #include "scripting_engine.h" #include "cluster_migrateslots.h" +#include "bgiteration.h" #include #include #include @@ -4464,6 +4465,7 @@ int VM_SetAbsExpire(ValkeyModuleKey *key, mstime_t expire) { * When async is set to true, db contents will be freed by a background thread. */ void VM_ResetDataset(int restart_aof, int async) { if (restart_aof && server.aof_state != AOF_OFF) stopAppendOnly(); + bgIteration_flushall(); flushAllDataAndResetRDB((async ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS) | EMPTYDB_NOFUNCTIONS); if (server.aof_enabled && restart_aof) restartAOFAfterSYNC(); } diff --git a/src/object.c b/src/object.c index 21eb57e5cbd..f4545cf8025 100644 --- a/src/object.c +++ b/src/object.c @@ -38,6 +38,7 @@ #include "zmalloc.h" #include "sds.h" #include "module.h" +#include "bgiteration.h" #include #include @@ -340,7 +341,7 @@ robj *createStringObjectFromSds(const_sds s) { return createStringObject(s, sdslen(s)); } -static robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const_sds key, long long expire) { +robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const_sds key, long long expire) { if (shouldEmbedStringObject(len, key, expire)) { return createEmbeddedStringObjectWithKeyAndExpire(ptr, len, key, expire); } else { @@ -447,6 +448,7 @@ void objectUnembedVal(robj *o) { robj *objectSetKeyAndExpire(robj *o, const_sds key, long long expire) { if (o->type == OBJ_STRING && o->encoding == OBJ_ENCODING_EMBSTR) 
{ robj *new = createStringObjectWithKeyAndExpire(objectGetVal(o), sdslen(objectGetVal(o)), key, expire); + bgIteration_updateDbEntryPtr(o, new); new->lru = o->lru; decrRefCount(o); return new; @@ -471,6 +473,7 @@ robj *objectSetKeyAndExpire(robj *o, const_sds key, long long expire) { serverPanic("Not implemented"); } robj *new = createUnembeddedObjectWithKeyAndExpire(o->type, ptr, key, expire); + bgIteration_updateDbEntryPtr(o, new); new->encoding = o->encoding; new->lru = o->lru; decrRefCount(o); diff --git a/src/rdb.c b/src/rdb.c index e4e006a16ec..ae16f62bd26 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -46,6 +46,7 @@ #include "module.h" #include "cluster.h" #include "cluster_migrateslots.h" +#include "bgiteration.h" #include #include @@ -3171,6 +3172,7 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin if (rdbflags & RDBFLAGS_EMPTY_DATA) { int empty_db_flags = server.repl_replica_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS; serverLog(LL_NOTICE, "RDB signature and version check passed. Flushing old data"); + bgIteration_flushall(); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); /* functionsLibCtx is cleared when we call emptyData, reinitialize here. */ diff --git a/src/replication.c b/src/replication.c index 131b4fa2797..0172987434d 100644 --- a/src/replication.c +++ b/src/replication.c @@ -41,6 +41,7 @@ #include "connection.h" #include "module.h" #include "cluster_migrateslots.h" +#include "bgiteration.h" #include #include @@ -2482,6 +2483,7 @@ int replicaLoadPrimaryRDBFromSocket(connection *conn, char *buf, char *eofmark, } else { /* Remove the half-loaded data in case the load failed for other reasons. 
*/ serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); + bgIteration_flushall(); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); } } @@ -2585,6 +2587,7 @@ int replicaLoadPrimaryRDBFromDisk(rdbSaveInfo *rsi) { } else { /* If disk-based RDB loading fails, remove the half-loaded dataset. */ serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); + bgIteration_flushall(); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); } diff --git a/src/server.c b/src/server.c index 4eb7798a924..ecbae40c2f5 100644 --- a/src/server.c +++ b/src/server.c @@ -54,6 +54,7 @@ #include "util.h" #include "eval.h" +#include "bgiteration.h" #include "trace/trace_commands.h" @@ -3018,8 +3019,11 @@ void initServer(void) { /* Set object metadata size before creating any database key objects */ if (server.forkless_options_supported) { - objectSetMetadataSize(sizeof(uint32_t)); /* This is a placeholder until Threadsave defines a metadata structure */ - /* 4 bytes for iterator_epoch for now*/ + /* NOTE: At this time, there is only one reason for dbEntry metadata. bgIteration. However, + * if/when new metadata options are added, we will need to compute the size of a variable + * size metadata, and provide appropriate accessors to access the specific portion of the + * metadata (each of which may/may not exist, based on immutable startup parameters). 
*/ + objectSetMetadataSize(sizeof(bgIterationEntryMetadata)); } createDatabaseIfNeeded(0); /* The default database should always exist */ @@ -3141,6 +3145,7 @@ void initServer(void) { commandlogInit(); latencyMonitorInit(); initSharedQueryBuf(); + bgIteration_init(); /* Initialize ACL default password if it exists */ ACLUpdateDefaultUserPassword(server.requirepass); @@ -3702,6 +3707,11 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot) if (propagate_to_slot_migration) clusterFeedSlotExportJobs(dbid, argv, argc, slot); } +// If true, a MULTI has been sent to bgIterator. +// Remember to send the matching EXEC in propagatePendingCommands(). +static bool sentMultiToBgIterator = false; +static int lastDbidSentToBgIterator; + /* Used inside commands to schedule the propagation of additional commands * after the current command is propagated to AOF / Replication. * @@ -3714,6 +3724,29 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot) * stack allocated). The function automatically increments ref count of * passed objects, so the caller does not need to. */ void alsoPropagate(int dbid, robj **argv, int argc, int target, int slot) { + if (target & PROPAGATE_REPL && bgIteration_iterationActive()) { + // Note that bgIterator must be invoked immediately after each command. This is required + // for proper processing in the bgIterator state machine. It's NOT ok to call bgIterator + // from propagateNow as that handles all of the commands for a transaction at the end. + // THIS FUNCTION (alsoPropagate) is called after each command. + if (!sentMultiToBgIterator && (scriptIsRunning() || server.in_exec)) { + // For a script or multi/exec, we should be sending the MULTI at the beginning of the + // execution unit. There shouldn't be any commands in the propagation queue yet. + serverAssert(server.also_propagate.numops == 0); + // If this is the first propagated command of a script or multi, make it a transaction. 
+ // It may turn out that there is only 1 command in the MULTI block, but we can't know + // that now. Unlike regular replication, we can't defer all of the replication until + // we know for sure. We must call bgIterator after each command. + static struct serverCommand* cmd_multi = NULL; // STATIC to avoid repeated lookups + if (cmd_multi == NULL) cmd_multi = lookupCommandOrOriginal(&shared.multi, 1); + bgIteration_handleCommandReplication(dbid, cmd_multi, 1, &shared.multi); + sentMultiToBgIterator = true; + } + struct serverCommand* cmd = lookupCommandOrOriginal(argv, argc); + bgIteration_handleCommandReplication(dbid, cmd, argc, argv); + lastDbidSentToBgIterator = dbid; + } + robj **argvcopy; int j; @@ -3780,6 +3813,17 @@ void updateCommandLatencyHistogram(struct hdr_histogram **latency_histogram, int * multiple separated commands. Note that alsoPropagate() is not affected * by CLIENT_PREVENT_PROP flag. */ static void propagatePendingCommands(void) { + // Note: This is done before the check on server.also_propagate.numops. Numops might be zero + // if there is no replica but we might be running bgIteration for something other than + // replication. If we sent the multi (to bgIteration), we need to send the matching exec. + if (sentMultiToBgIterator) { + // If a MULTI was sent to bgIterator via alsoPropagate(), then send the matching EXEC. 
+ static struct serverCommand* cmd_exec = NULL; // STATIC to avoid repeated lookups + if (cmd_exec == NULL) cmd_exec = lookupCommandOrOriginal(&shared.exec, 1); + bgIteration_handleCommandReplication(lastDbidSentToBgIterator, cmd_exec, 1, &shared.exec); + sentMultiToBgIterator = false; + } + if (server.also_propagate.numops == 0) return; int j; @@ -3909,6 +3953,8 @@ int incrCommandStatsOnError(struct serverCommand *cmd, int flags) { * */ void call(client *c, int flags) { + if (bgIteration_blockClientIfRequired(c)) return; + long long dirty; struct ClientFlags client_old_flags = c->flag; diff --git a/src/server.h b/src/server.h index 51db9a38baa..c68dd524592 100644 --- a/src/server.h +++ b/src/server.h @@ -103,7 +103,19 @@ static_assert(sizeof(off_t) >= 8, "off_t must be 64-bit; ensure _FILE_OFFSET_BIT #define dismissMemory zmadvise_dontneed #define VALKEYMODULE_CORE 1 -typedef struct serverObject robj; + +/* serverObject (aka robj) is currently overloaded for 2 purposes. This is a legacy artifact. + * 1. It carries a reference counted STRING (a keyless value) during parsing and command execution. + * 2. It's also used to carry a key/value pair which is inserted into the DB. In this form, the + * value is not limited to being a string. + * + * The typedef "dbEntry" is used to explicitly connote the latter form. It indicates a key/value + * pair which is suitable to exist in the DB. It might be active in the DB, or may be unlinked from + * the DB (but still contains a key/value). The value may be any of the Valkey data types/encodings. + */ +typedef struct serverObject robj; // A keyless string OR a key/value pair +typedef struct serverObject dbEntry; // Explicitly a key/value pair + #include "valkeymodule.h" /* Modules API defines.
*/ /* Following includes allow test functions to be called from main() */ diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp new file mode 100644 index 00000000000..7499e53ca52 --- /dev/null +++ b/src/unit/test_bgiteration.cpp @@ -0,0 +1,3747 @@ +//#include +#include "generated_wrappers.hpp" +#include +//#include "amz_assert.h" + +// +// +// ## +// ######: ## +// #######: ## +// ## :## +// ## ## ##.#### .####: ##: :## #### .####: ## ## +// ## :## ####### .######: ## ## #### .######: ##. .## +// #######: ###. ##: :## :## ##: ## ##: :## #: ## :# +// ######: ## ######## ##..## ## ######## :#:.##.:#: +// ## ## ######## ##::## ## ######## # :##:## +// ## ## ## :####: ## ## ## ## ## +// ## ## ###. :# #### ## ###. :# ###::## +// ## ## .####### #### ######## .####### :##..##: +// ## ## .#####: :##: ######## .#####: .## ## +// +// +// +// +// +// +// +// .####. #### +// ###### #### +// :## ##: ## +// ##: :## ##.#### ## ## ## +// ## ## ####### ## :## ## +// ## ## ### :## ## ##: ##. +// ## ## ## ## ## ###:## +// ## ## ## ## ## .## # +// ##: :## ## ## ## ####. +// :## ##: ## ## ##: :### +// ###### ## ## ##### ## +// .####. ## ## .#### ##. +// :## +// ###: +// ### +// +// +// +// +// ### ## ## +// ### ## ## ## +// ###: ## ## ## +// #### ## .####. ####### ##.#### .####: :#### :###.## ## ## +// ##:#: ## .######. ####### ####### .######: ###### :####### :## ## +// ## ## ## ### ### ## ###. ##: :## #: :## ### ### ##: ##. +// ## ## ## ##. .## ## ## ######## :##### ##. .## ###:## +// ## :#:## ## ## ## ## ######## .####### ## ## .## # +// ## #### ##. .## ## ## ## ## . ## ##. .## ####. +// ## :### ### ### ##. ## ###. :# ##: ### ### ### :### +// ## ### .######. ##### ## .####### ######## :####### ## +// ## ### .####. .#### ## .#####: ###.## :###.## ##. +// :## +// ###: +// ### +// +// +// +// ## +// :#### ## +// ##### ## +// ## +// ####### .####. ##.#### ##.#### .####: ##: :## #### .####: ## ## +// ####### .######. ####### ####### .######: ## ## #### .######: ##. 
.## +// ## ### ### ###. ###. ##: :## :## ##: ## ##: :## #: ## :# +// ## ##. .## ## ## ######## ##..## ## ######## :#:.##.:#: +// ## ## ## ## ## ######## ##::## ## ######## # :##:## +// ## ##. .## ## ## ## :####: ## ## ## ## ## +// ## ### ### ## ## ###. :# #### ## ###. :# ###::## +// ## .######. ## ## .####### #### ######## .####### :##..##: +// ## .####. ## ## .#####: :##: ######## .#####: .## ## +// +// +// +// + + + +using namespace ::testing; + +extern "C" { + #include "stdlib.h" + #include "bgiteration.h" + #include "server.h" + //#include "serverassert.h" + #define using usingvar // compile hack + #include "module.h" + #undef using + extern hashtableType commandSetType; + extern dictType keylistDictType; + bool iteratorRepldoneFn(void *privdata); + void iteratorCleanupFn(bool terminated, void *privdata); + void bgIteration_feedIterators(void); + void createSharedObjects(void); + void hashtableDump(hashtable *ht); + void rehashStepExpand(hashtable *ht); // in hashtable.c (non-API) + void bgIteration_unitTestDisableCloning(void); + void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes); +} + + +// The private data is a pointer to arbitrary data. This value is used just to +// test that the correct value is passed through. +#define PRIVDATA reinterpret_cast(12345) + +// A bgIteration cleanup function used for testing. +int cleanupCount; +bool cleanupTerminated; +void iteratorCleanupFn(bool terminated, void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + cleanupCount++; + cleanupTerminated = terminated; +} + + +// A bgIteration repldone function used for testing. +int repldoneCount; +bool iteratorRepldoneFn(void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + repldoneCount++; + return true; +} + + +// A more complicated repldone function that can delay the replcation done condition. 
+bool isReplDoneReady; +bool iteratorRepldoneFnNotBeingReadyInitially(void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + // This is to test the behavior when Repl Done function is not ready to be executed. + if (!isReplDoneReady) { + isReplDoneReady = true; + return false; + } + repldoneCount++; + return true; +} + + +static const char *logfile = ""; + +/* Most of the bgIteration unit tests are based on a CMD instance with 2 DBs. There are 8 keys in + * each DB. The keys are named A0, B0, C0, D0, E0, F0, G0, H0 for DB-0 and A1, B1, C1, D1, E1, F1, + * G1, H1 for DB-1. There are a number of helper functions to simulate certain key modification + * actions within our test configuration. Note that this is isolated from the actual call to + * processCommand. + * + * Because most of bgIteration is based on an ordered processing of keys, it doesn't matter if we + * are simulating CMD or CME, full scan, or slot-based. The majority of tests are independent of + * these concerns. + * + * However, there are some tests which are unique to these configurations and use a specialized + * derived class to handle the differences. We do not want to duplicate all of the tests for + * the different configurations, but we do want to ensure that each configuration works properly.
+ * - bgIterationTestCluster - handles tests unique to full scan in cluster mode + * - bgIterationTestClusterSlots - handles tests unique to cluster slot-based iteration + */ +class BgIterationTest : public ::testing::Test { + private: + static const int DB_COUNT = 2; + static const int ITEMS_PER_DB = 8; + + // This is the expected order of the keys when hashed + const char *keys[DB_COUNT][ITEMS_PER_DB] = {{"D0", "G0", "H0", "C0", "F0", "A0", "B0", "E0"}, + {"B1", "C1", "F1", "G1", "E1", "D1", "A1", "H1"}}; + + protected: + static const int TOTAL_ITEMS = DB_COUNT * ITEMS_PER_DB; + static const int LAST_ITEM = TOTAL_ITEMS - 1; + + MockValkey mock; + RealValkey real; + + struct serverCommand dummy_cmd = {0}; + + // Helper functions for accessing the keys. We can access by db(0..1) and seq(0..7) + // or by item number (0..15). + // NOTE: These virtual functions can be overridden in subclasses which may have different item layout. + virtual const char * getKeyAtDbSeq(int db, int seq) { + assert(db < DB_COUNT); + assert(seq < ITEMS_PER_DB); + return keys[db][seq]; + } + + virtual int getDbFromItemNum(int itemNum) { + assert(itemNum < DB_COUNT * ITEMS_PER_DB); + return itemNum / ITEMS_PER_DB; + } + + virtual int getSeqFromItemNum(int itemNum) { + assert(itemNum < DB_COUNT * ITEMS_PER_DB); + return itemNum % ITEMS_PER_DB; + } + + const char * keyStr(int itemNum) { + return getKeyAtDbSeq(getDbFromItemNum(itemNum), getSeqFromItemNum(itemNum)); + } + + int itemNumFromKey(const char * key) { + for (int itemNum = 0; itemNum < DB_COUNT * ITEMS_PER_DB; itemNum++) { + if (strcmp(key, keyStr(itemNum)) == 0) return itemNum; + } + return -1; + } + + + // Do some general initialization before starting the suite. Normally, the tests are run in + // isolation - and this isn't much different than SetUp(). But if running the + // entire test suite together (just manually running the test executable), this gets called + // only once.
+ static void SetUpTestSuite() { + monotonicInit(); + + bzero(&server, sizeof(server)); + server.hz = 100; + server.logfile = const_cast(logfile); + createSharedObjects(); + + moduleInitModulesSystem(); + + server.commands = hashtableCreate(&commandSetType); + server.orig_commands = hashtableCreate(&commandSetType); + populateCommandTable(); + } + + + static void TearDownTestSuite() { + hashtableRelease(server.commands); + hashtableRelease(server.orig_commands); + } + + + void initializeServerDb(int dbid, int slot_count_bits = 0) { + server.db[dbid] = static_cast(zcalloc(sizeof(serverDb))); + server.db[dbid]->id = dbid; + server.db[dbid]->keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, 0); + server.db[dbid]->expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, 0); + server.db[dbid]->watched_keys = dictCreate(&keylistDictType); + kvstoreExpand(server.db[dbid]->keys, 8, 0, NULL); + } + + + void addKeyAndValObjsToDb(int dbid, sds key, sds val) { + robj *key_obj = createStringObjectFromSds(key); + robj *val_obj = createStringObjectFromSds(val); + dbAdd(server.db[dbid], key_obj, &val_obj); + decrRefCount(key_obj); + } + + + void addKeyToDb(int dbid, const char *key, const char *val) { + addKeyAndValObjsToDb(dbid, sdsnew(key), sdsnew(val)); + } + + + virtual void setupDatabase() { + // For these unit tests, a standard database is constructed. The order of items in the + // hash table is important, and this is validated here. If the hash table + // implementation changes, we will find out quickly at this point. All other tests + // will become invalid! + + server.dbnum = 2; + server.cluster_enabled = false; + server.db = static_cast(zcalloc(sizeof(serverDb *) * server.dbnum)); + + for (int dbid = 0; dbid < server.dbnum; dbid++) { + initializeServerDb(dbid); + } + + // With hashtable, it can be difficult to get our keys spread across different buckets. 
+ // Here we play with hashtable size and rehashing to get comfortable scenarios for testing. + // NOTE: If the hashtable bucketization changes, we'll need to evaluate the tests for + // changes. Since bgIteration processes a bucket at a time, we need to evaluate + // all the tests when bucketization changes. + // As an alternative, we could mock all of the hashtable activity, but it's better if we + // can use the real functionality as much as possible. + + kvstoreExpand(server.db[0]->keys, 16, 0, NULL); + addKeyToDb(0, "A0", "A0"); + addKeyToDb(0, "B0", "B0"); + addKeyToDb(0, "C0", "C0"); + addKeyToDb(0, "D0", "D0"); + addKeyToDb(0, "E0", "E0"); + addKeyToDb(0, "F0", "F0"); + addKeyToDb(0, "G0", "G0"); + addKeyToDb(0, "H0", "H0"); + hashtable *ht = kvstoreGetHashtable(server.db[1]->keys, 0); + hashtablePauseRehashing(ht); + + kvstoreExpand(server.db[1]->keys, 16, 0, NULL); + addKeyToDb(1, "A1", "A1"); + addKeyToDb(1, "B1", "B1"); + addKeyToDb(1, "C1", "C1"); + addKeyToDb(1, "D1", "D1"); + addKeyToDb(1, "E1", "E1"); + addKeyToDb(1, "F1", "F1"); + addKeyToDb(1, "G1", "G1"); + addKeyToDb(1, "H1", "H1"); + // Now, let's increase the size and start a rehash on the 2nd DB. This ensures that + // iteration is working even if a hashtable is in the middle of rehashing. We choose + // a 128 size so that rehashed keys all get unique buckets. + kvstoreExpand(server.db[1]->keys, 128, 0, NULL); + ht = kvstoreGetHashtable(server.db[1]->keys, 0); + rehashStepExpand(ht); // in hashtable.c (non-API) + rehashStepExpand(ht); // and rehash the 2nd bucket also + hashtablePauseRehashing(ht); + + // The bucketization should look like this. Remember that DB-1 is in + // the middle of a rehash, so it has 2 tables. 
+ // + // DB: 0 SLOT: 0 + // Table 0, used 8, exp 2, top-level buckets 4, child buckets 0 + // Bucket 0:1 level:0 + // 0 h2 63, key "D0" + // 1 h2 a5, key "G0" + // 2 h2 ca, key "H0" + // Bucket 0:2 level:0 + // 0 h2 91, key "C0" + // 1 h2 88, key "F0" + // Bucket 0:3 level:0 + // 0 h2 b8, key "A0" + // 1 h2 f5, key "B0" + // 2 h2 13, key "E0" + // Table 1, used 0, exp -1, top-level buckets 0, child buckets 0 + // + // DB: 1 SLOT: 0 + // Table 0, used 3, exp 2, top-level buckets 4, child buckets 0 + // Bucket 0:0 level:0 <- rehashed into table 1 + // Bucket 0:1 level:0 <- rehashed into table 1 + // Bucket 0:2 level:0 + // 0 h2 18, key "B1" + // 1 h2 fd, key "C1" + // Bucket 0:3 level:0 + // 0 h2 6f, key "F1" + // Table 1, used 5, exp 5, top-level buckets 32, child buckets 0 + // Bucket 1:1 level:0 + // 0 h2 ad, key "G1" + // Bucket 1:5 level:0 + // 0 h2 0c, key "E1" + // Bucket 1:12 level:0 + // 0 h2 e9, key "D1" + // Bucket 1:17 level:0 + // 0 h2 36, key "A1" + // Bucket 1:29 level:0 + // 0 h2 9e, key "H1" + // Bucket 1:30 level:0 + + + // In case we need to debug... + // Used to generate comment above, showing bucketization. 
+ if (0) debugPrintBucketInfo(); + + // Validate that the iteration order matches the expected order + for (int db = 0; db < server.dbnum; db++) { + ht = kvstoreGetHashtable(server.db[db]->keys, 0); + hashtableIterator *it = hashtableCreateIterator(ht, 0); + robj *next; + int i = 0; + while (hashtableNext(it, reinterpret_cast(&next))) { + ASSERT_THAT(next, robjEqualsStr(getKeyAtDbSeq(db, i++))); + } + hashtableReleaseIterator(it); + } + } + + + void SetUp() override { + server.main_thread_id = pthread_self(); + server.forkless_options_supported = 1; + objectSetMetadataSize(sizeof(bgIterationEntryMetadata)); + + bgIteration_unitTestDisableCloning(); + + setupDatabase(); + + EXPECT_CALL(mock, aeCreateTimeEvent(_,_,_,_,_)).WillRepeatedly(Return(0)); + bgIteration_init(); + + cleanupCount = 0; + repldoneCount = 0; + isReplDoneReady = false; + + // By default, in tests, we treat items as not having an expiration + //JHB EXPECT_CALL(mock, getExpire(_,_)).WillRepeatedly(Return(-1)); + + // By default, do nothing for these + EXPECT_CALL(mock, blockClientInUseOnKeys(_,_,_)).WillRepeatedly(Return()); + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).WillRepeatedly(Return()); + + // By default, expect no permission issues + EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_,_,_,_,_,_)).WillRepeatedly(Return(ACL_OK)); + + //JHB EXPECT_CALL(mock, lookupCommandOrOriginal(_)).WillRepeatedly(Return(&dummy_cmd)); + } + + + void TearDown() override { + bgIteration_feedIterators(); // process returning stuff before deleting DB + bgIteration_feedIterators(); // in case an iterator was closed there might be more + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i]->keys) kvstoreRelease(server.db[i]->keys); + if (server.db[i]->expires) kvstoreRelease(server.db[i]->expires); + dictRelease(server.db[i]->watched_keys); + zfree(server.db[i]); + } + zfree(server.db); + } + + + // void update_keys(const char **new_keys, int db, int len) { + // memcpy(keys[db], new_keys, len * sizeof(const 
char *)); + // } + + + + + + + // Deletes an item from the DB (often at the start of a test) - but does NOT notify + // bgIteration. bgIteration_keyDelete() should be explicitly called where needed. + void simpleDelItem(int itemNum) { + int db = getDbFromItemNum(itemNum); + + sds delKey = sdsnew(keyStr(itemNum)); + int rc = kvstoreHashtableDelete(server.db[db]->keys, 0, delKey); + ASSERT_EQ(rc, 1); + sdsfree(delKey); + } + + + // Find the actual dbEntry object by itemNum + dbEntry * getItem(int itemNum) { + int db = getDbFromItemNum(itemNum); + sds key = sdsnew(keyStr(itemNum)); + dbEntry *de = dbFind(server.db[db], key); + sdsfree(key); + return de; + } + + + // The test expects that the next item read will be BGITERATOR_ITEM_COMPLETE + void expectReadComplete(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + EXPECT_EQ(item->type, BGITERATOR_ITEM_COMPLETE); + bgIteratorClose(iter); + + int oldCleanupCount = cleanupCount; + bgIteration_feedIterators(); + EXPECT_EQ(cleanupCount, oldCleanupCount + 1); + } + + + // The test is cleaning up and isn't validating the remaining cleanup + void expectAnythingCleanup(bgIterator *iter) { + while (true) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + if ((item->type == BGITERATOR_ITEM_COMPLETE + || item->type == BGITERATOR_ITEM_TERMINATED)) { + bgIteratorClose(iter); + break; + } + } + bgIteration_feedIterators(); // Recognize the closed iterator + EXPECT_EQ(cleanupCount, 1); + } + + + void expectDictEntryMetadataMatch(dbEntry *de1, dbEntry *de2) { + bgIterationEntryMetadata *dm1 = static_cast(objectGetMetadata(de1)); + bgIterationEntryMetadata *dm2 = static_cast(objectGetMetadata(de2)); + + EXPECT_NE(dm1, nullptr); + EXPECT_NE(dm2, nullptr); + EXPECT_EQ(dm1->iterator_epoch, dm2->iterator_epoch); + } + + + // Useful when debugging new tests. It reads/prints all remaining items then crashes. 
+ void cleanupIteratorDebugPrint(bgIterator *iter) { + bool done = false; + printf("[DEBUG] Printing bgIterator '%s' items:\n", bgIteratorName(iter)); + while (!done) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + switch (item->type) { + case BGITERATOR_ITEM_DBENTRY: + { + auto obj = item->u.dbe.de; + const char * keyStr = objectGetKey(obj); + printf("Entry: %s -> %s [itemNum: %i]\n", + keyStr, + static_cast(objectGetVal(obj)), + itemNumFromKey(keyStr)); + break; + } + case BGITERATOR_ITEM_REPLICATION: + printf("Repl: DB=%d : ", item->dbid); + for (int i = 0; i < item->u.repl.argc; i++) + printf("%s ", static_cast(objectGetVal(item->u.repl.argv[i]))); + printf("\n"); + break; + case BGITERATOR_ITEM_COMPLETE: + case BGITERATOR_ITEM_TERMINATED: + bgIteratorClose(iter); + done = true; + break; + default: + printf("unhandled: %d\n", item->type); + } + } + bgIteration_feedIterators(); // Recognize the closed iterator + ASSERT_TRUE(false); // Halt the test here + } + + + // Make a copy of the metadata + void * cloneMetadata(dbEntry *de) { + int size = objectGetMetadataSize(de); + void *metadata = zmalloc(size); + memcpy(metadata, objectGetMetadata(de), size); + return metadata; + } + + + // Compare a previous metadata copy to an existing entry + void compareAndFreeClonedMetadata(dbEntry *de, void *metadata) { + EXPECT_EQ(memcmp(objectGetMetadata(de), metadata, objectGetMetadataSize(de)), 0); + zfree(metadata); + } + + + // The test expects the next item will be a specific key + // The item value is verified against the default unless provided as a parameter. 
+ void expectReadKey(bgIterator *iter, int itemNum, const char *value=nullptr) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_FALSE(item->u.dbe.is_cloned); + // if (item->u.dbe.is_cloned) { // JHB - wrong place to check this. + // // If the entry is cloned, make sure we copied the metadata + // dbEntry *cloned_dbEntry = item->u.dbe.de; + // dbEntry *original_dbEntry = getItem(itemNum); + // expectDictEntryMetadataMatch(original_dbEntry, cloned_dbEntry); + // } + EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); + if (value) { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } else { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); + } + } + + + // The test expects the next item will be a specific key and that the item is cloned. + // The clone's metadata is compared against the caller's saved copy `metadata`, which is then freed. + // The item value is verified against the default unless provided as a parameter. + void expectReadClonedKey(bgIterator *iter, int itemNum, void *metadata, const char *value=nullptr) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_TRUE(item->u.dbe.is_cloned); + compareAndFreeClonedMetadata(item->u.dbe.de, metadata); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); + if (value) { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } else { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); + } + } + + + // Test expects the next key, but specified by key name, not itemNum.
+ void expectReadDbKeyValue(bgIterator *iter, int db, const char *key, const char *value) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), key); + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } + + + // Test expects to read key items startItem..endItem (inclusive), in order + void expectReadKeySequence(bgIterator *iter, int startItem, int endItem) { + for (int i = startItem; i <= endItem; i++) expectReadKey(iter, i); + } + + + // Just like expectReadKey, but also tests that a previous item is becoming unblocked. + void expectReadKeyWithUnblock(bgIterator *iter, int itemNum, int unblockItem, const char *value=nullptr) { + bool blocked = true; + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(unblockItem)))) + .WillOnce(Assign(&blocked, false)); + expectReadKey(iter, itemNum, value); + EXPECT_FALSE(blocked); + } + + + // Test expects to read a replication item matching the command held by client 'c' + void expectReadReplication(bgIterator *iter, client *c) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->dbid, c->db->id); + EXPECT_EQ(item->u.repl.cmd, c->cmd); + EXPECT_EQ(item->u.repl.argc, c->argc); + for (int i = 0; i < c->argc; i++) { + EXPECT_STREQ(static_cast(objectGetVal(item->u.repl.argv[i])), + static_cast(objectGetVal(c->argv[i]))); + } + } + + + // We expect to read a MULTI command which should have been inserted.
+ void expectReadMultiReplication(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("multi")); + } + + + // We expect to read an EXEC command which should have been inserted. + void expectReadExecReplication(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("exec")); + } + + + // Expecting that a DEL command should have been replicated. + void expectReadReplicationDel(bgIterator *iter, int itemNum) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->dbid, db); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("DEL")); + EXPECT_EQ(item->u.repl.argc, 2); + EXPECT_THAT(item->u.repl.argv[0], robjEqualsStr("DEL")); + EXPECT_THAT(item->u.repl.argv[1], robjEqualsStr(keyStr(itemNum))); + } + + + // Expecting that a special SWAPDB item has been inserted. + void expectReadSwapDB(bgIterator *iter, int db1, int db2) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_SWAPDB); + EXPECT_EQ(item->dbid, db1); + EXPECT_EQ(item->u.dbid2, db2); + } + + + // Used to examine the physical bucket layout in the hash table. Generated the comment + // above which shows each item in each bucket. Necessary if hash table layout changes. + void debugPrintBucketInfo(int num_slots = -1) { + for (int db = 0; db < server.dbnum; db++) { + int n = (num_slots == -1) ? 
kvstoreNumHashtables(server.db[db]->keys) : num_slots; + for (int slot = 0; slot < n; slot++) { + hashtable *ht = kvstoreGetHashtable(server.db[db]->keys, slot); + printf("DB: %d SLOT: %d\n", db, slot); + hashtableDump(ht); + } + } + } + + + // Creates a client with a write command (SET) for the given itemNum + client * getWriteClient(int itemNum, const char *value) { + int db = getDbFromItemNum(itemNum); + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString("set"); + c->db = server.db[db]; + + c->argc = 3; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + c->argv[1] = createStringObjectFromSds(sdsnew(keyStr(itemNum))); + c->argv[2] = createStringObjectFromSds(sdsnew(value)); + + return c; + } + + + // Create a client with a write command that touches multiple keys + client * getWriteMultiKeysClient( + const char * cmdName, + int dstItemNum, + const std::vector & srcItemsNum) { + + assert(!srcItemsNum.empty()); + + const int db = getDbFromItemNum(dstItemNum); + std::for_each(srcItemsNum.cbegin(), srcItemsNum.cend(), [&db, this](int srcItemNum) { + assert(db == getDbFromItemNum(srcItemNum)); + }); + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString(cmdName); + assert(c->cmd != nullptr); + c->db = server.db[db]; + + c->argc = 2 + srcItemsNum.size(); + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + c->argv[1] = createStringObjectFromSds(sdsnew(keyStr(dstItemNum))); + for (unsigned int i = 0; i < srcItemsNum.size(); i++) { + c->argv[2 + i] = createStringObjectFromSds(sdsnew(keyStr(srcItemsNum[i]))); + } + + return c; + } + + + client * getWrite2KeysClient(const char * cmdName, int dstItemNum, int srcItemNum) { + return getWriteMultiKeysClient(cmdName, dstItemNum, {srcItemNum}); + } + + + client * getWrite3KeysClient( + const char * cmdName, 
int dstItemNum, int src1ItemNum, int src2ItemNum) { + return getWriteMultiKeysClient(cmdName, dstItemNum, {src1ItemNum, src2ItemNum}); + } + + + // Create a client with a MULTI/EXEC block. + // This parses a series of commands separated by ';' + // Example: getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx") + client * getMultiClient(const char *commands, int dbid = 0) { + char *commandsCopy = zstrdup(commands); // a mutable copy + char *commandStr, *commandStrSave; + char *token, *tokenSave; + + client *c = static_cast(zcalloc(sizeof(client))); + c->db = server.db[dbid]; + initClientMultiState(c); + c->flag.multi = 1; + c->mstate->cmd_flags |= CMD_WRITE; + + commandStr = strtok_r(commandsCopy, ";", &commandStrSave); + while (commandStr != NULL) { + + token = strtok_r(commandStr, " ", &tokenSave); + c->cmd = lookupCommandByCString(token); + + c->argv = static_cast(zcalloc(sizeof(robj*) * 5)); // command + 4 args + + for (int i = 0; token != NULL; i++) { + c->argv[i] = createStringObject(token, strlen(token)); + c->argc = i+1; + token = strtok_r(NULL, " ", &tokenSave); + } + + queueMultiCommand(c, 0); + freeClientArgv(c); + + commandStr = strtok_r(NULL, ";", &commandStrSave); + } + + c->cmd = lookupCommandByCString("exec"); + c->argc = 1; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew("EXEC")); + + zfree(commandsCopy); + return c; + } + + + // Initially, a MULTI client is set up to execute the EXEC command (which examines the + // contents of the multi/exec block). This function advances the client to begin executing + // the individual commands within the multi/exec block. 
+ void advanceMultiClientToCommand(client *c, int cmdNum) { + assert(cmdNum >= 0 && cmdNum < c->mstate->count); + c->argc = c->mstate->commands[cmdNum].argc; + c->argv = c->mstate->commands[cmdNum].argv; + c->argv_len = c->mstate->commands[cmdNum].argv_len; + c->cmd = c->realcmd = c->mstate->commands[cmdNum].cmd; + } + + + // A client with a fictional command: + // SETGET + // - writes a value to the first key (making this CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY) + // - reads a second key + client * getSetGetClient(int itemNum1, const char *value1, int itemNum2) { + // Fictional command which writes to 1st key and reads the 2nd + int db = getDbFromItemNum(itemNum1); + assert(db == getDbFromItemNum(itemNum2)); // (this would be a testcase error) + + client *c = static_cast(zcalloc(sizeof(client))); + struct serverCommand *cmd + = static_cast(zcalloc(sizeof(struct serverCommand))); + + cmd->fullname = const_cast("SETGET"); + cmd->arity = 4; + cmd->flags = CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY; + + cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INDEX; + cmd->legacy_range_key_spec.bs.index.pos = 1; // firstkey + cmd->legacy_range_key_spec.fk.range.lastkey = -1; + cmd->legacy_range_key_spec.fk.range.keystep = 2; + + c->cmd = cmd; + c->db = server.db[db]; + + c->argc = 4; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(cmd->fullname)); + c->argv[1] = createStringObjectFromSds(sdsnew(keyStr(itemNum1))); + c->argv[2] = createStringObjectFromSds(sdsnew(value1)); + c->argv[3] = createStringObjectFromSds(sdsnew(keyStr(itemNum2))); + + return c; + } + + + // Client with a fictional write command with no keys specified + client * getNoKeysWriteClient() { + // Fictional command which is marked WRITE, but has no keys. 
+ client *c = static_cast(zcalloc(sizeof(client))); + struct serverCommand *cmd + = static_cast(zcalloc(sizeof(struct serverCommand))); + + cmd->fullname = const_cast("NOKEYSWRITE"); + cmd->arity = 1; + cmd->flags = CMD_WRITE; + + cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INVALID; // No keys + + c->cmd = cmd; + c->db = server.db[0]; + + c->argc = 1; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(cmd->fullname)); + + return c; + } + + + void freeClientArgv(client *c) { + for (int i = 0; i < c->argc; i++) decrRefCount(c->argv[i]); + zfree(c->argv); + c->argv = NULL; + c->argc = 0; + } + + + // During testing, we create some fake commands. This checks if the command is real or fake. + // A fake command is dynamically allocated and can be freed. Real commands are static. + bool isRealValkeyCommand(struct serverCommand *cmd) { + return lookupCommandByCString(cmd->declared_name); + } + + + void freeTestClient(client *c) { + freeClientMultiState(c); + freeClientArgv(c); + + if (!isRealValkeyCommand(c->cmd)) zfree(c->cmd); + + zfree(c); + } + + + // Simulate what happens when a write command is blocked + void simulateBlockedWrite(client *c, int expectedNumberBlockedKeys = 1) { + EXPECT_CALL(mock, blockClientInUseOnKeys(c,expectedNumberBlockedKeys,_)).Times(1); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_TRUE(blocked); + } + + + // Simulate what happens when a write command isn't blocked + void simulateUnblockedWrite(client *c) { + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + } + + + // Simulate what happens when a write command is NOT blocked, because the key can be cloned + // and expedited. This requires a scenario where we would normally need to block the + // client so that bgIteration can process the item. 
+ void simulateClonedWrite(bgIterator *it, client *c) { + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + unsigned long initialClones = status.dbentry_clones_queued; + + // Client should not get blocked + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + + // Ensure that cloning took place + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, (initialClones + 1)); + + // Ensure that the real item isn't inuse (because we cloned it instead) + dbEntry *de = dbFind(c->db, static_cast(objectGetVal(c->argv[1]))); + ASSERT_FALSE(bgIteration_isEntryInuse(de)); + } + + + // Simulates what happens when a write command (SET) actually executes. This requires a + // scenario where we would NOT be blocked on the write. It actually alters the value of + // the key and updates the metadata. + void simulateUnblockedWriteWithModification(client *c) { + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + + //dbFind(c->db, static_cast(objectGetVal(c->argv[1]))); JHB + + // Fake execution of the command - touch the iterator_epoch counter and swap the value + // We need to duplicate the value because setKey() can reallocate it. + robj *value = dupStringObject(c->argv[2]); + setKey(c, c->db, c->argv[1], &value, SETKEY_ADD_OR_UPDATE); + + // Let's make sure that setKey updated the iteration epoch (as it should have) + dbEntry *de = dbFind(c->db, static_cast(objectGetVal(c->argv[1]))); + bgIterationEntryMetadata *md = static_cast(objectGetMetadata(de)); + EXPECT_EQ(md->iterator_epoch, bgIteration_getEpoch()); + + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + } + + + // Simulate execution of a MULTI/EXEC transaction for a client `c` without blocking. + // It replays all queued commands and ensures replication matches a real transaction. 
+ // command replication flag is revalidated when exec command is processed. + // This requires a scenario where we don't expect the client to be blocked. + void simulateUnblockedMultiExec(client *c) { + + // simulate EXEC command of the multi/exec client + simulateUnblockedWrite(c); + server.in_exec = 1; + + // If there are other commands, call both blockClientIfRequired and handleCommandReplication for each of the command. + for (int i = 0; i < c->mstate->count; i++) { + advanceMultiClientToCommand(c, i); + simulateUnblockedWrite(c); + + // Replicate MULTI if this is the first instruction inside MULTI/EXEC + if (i == 0) { + robj *argv[1]; + argv[0] = createStringObjectFromSds(sdsnew("multi")); + bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("multi"), 1, argv); + decrRefCount(argv[0]); + } + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + } + + // Call handleCommandReplication for EXEC + robj *argv[1]; + argv[0] = createStringObjectFromSds(sdsnew("EXEC")); + bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("exec"), 1, argv); + server.in_exec = 0; + decrRefCount(argv[0]); + } + + + // Simulate the expiration (active expiration) of a key. This is independent of command execution. + void simulateExpiration(int itemNum) { + ASSERT_NE(getItem(itemNum), nullptr); // Should be there before expire + + // NOTE: This seems weird, but Valkey propagates the delete before actually expiring the + // key. BgIterator expects this behavior and expects the key to exist when the + // DEL is received for propagation. 
+ + // Send bgIteration the DEL + int db = getDbFromItemNum(itemNum); + sds sdsKey = sdsnew(keyStr(itemNum)); + robj *argv[2]; + argv[0] = createStringObjectFromSds(sdsnew("DEL")); + argv[1] = createStringObjectFromSds(sdsdup(sdsKey)); + serverCommand *cmd = lookupCommandByCString("DEL"); + bgIteration_handleCommandReplication(db, cmd, 2, argv); + decrRefCount(argv[0]); + decrRefCount(argv[1]); + + bgIteration_keyDelete(db, sdsKey); + simpleDelItem(itemNum); // Simulate the actual del + + EXPECT_EQ(getItem(itemNum), nullptr); + sdsfree(sdsKey); + } + + + // Simulates an expiration, but validates behavior for an item inuse by bgIteration. + void simulateExpirationOfInuse(int itemNum) { + // An inuse item will have a refcount > 1. BgIteration should have incremented the + // refcount while it is inuse. + dbEntry *de = getItem(itemNum); + ASSERT_NE(de, nullptr); // Should be there before expire + EXPECT_TRUE(bgIteration_isEntryInuse(de)); + EXPECT_EQ(de->refcount, 2u); + + simulateExpiration(itemNum); + + // At this point, the item is removed from the DB, but still exists, and the refcount + // has been reduced to 1. This allows a background thread to continue using the item. + EXPECT_EQ(de->refcount, 1u); + } + + + // Simulates an expiration, but the item is a future item which will be expedited. + void simulateExpirationWithExpedite(int itemNum) { + // A future item has not been picked up by bgIteration yet, so it is not inuse and its + // refcount is expected to be 1 before the expiration. + dbEntry *de = getItem(itemNum); + ASSERT_NE(de, nullptr); // Should be there before expire + EXPECT_FALSE(bgIteration_isEntryInuse(de)); // Not yet inuse + EXPECT_EQ(de->refcount, 1u); + + simulateExpiration(itemNum); + + // At this point, the item is removed from the DB, but still exists and is now inuse. The + // refcount is still 1 (presumably bgIteration's reference replaced the DB's - TODO confirm).
+ EXPECT_TRUE(bgIteration_isEntryInuse(de)); // It's inuse now + EXPECT_EQ(getItem(itemNum), nullptr); // but it's not in the DB anymore + EXPECT_EQ(de->refcount, 1u); + } + + + // Simulate execution of a SWAPDB command + void simulateSwapDB(int dbid0, int dbid1) { + char dbStr[2] = {0}; + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString("swapdb"); + c->db = server.db[0]; + + c->argc = 3; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + dbStr[0] = '0' + dbid0; + c->argv[1] = createStringObjectFromSds(sdsnew(dbStr)); + dbStr[0] = '0' + dbid1; + c->argv[2] = createStringObjectFromSds(sdsnew(dbStr)); + + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // SWAPDB should never block + + // The real SWAP does more than this, but this is enough for unit tests + serverDb *aux = server.db[dbid0]; + server.db[dbid0] = server.db[dbid1]; + server.db[dbid1] = aux; + + bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv); + + freeTestClient(c); + } + + + // Simulate execution of a FLUSHDB or FLUSHALL command + void simulateFlushDB(int db, int anInUseItem) { + client *c = static_cast(zcalloc(sizeof(client))); + + if (db == -1) { + c->cmd = lookupCommandByCString("flushall"); + c->db = server.db[0]; + } else { + c->cmd = lookupCommandByCString("flushdb"); + c->db = server.db[db]; + } + + c->argc = 1; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + + dbEntry *de_in_use = getItem(anInUseItem); + EXPECT_EQ(de_in_use->refcount, 2u); + + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // FLUSHDB should never block + + // The real FLUSH does more than this, but this is enough for unit tests + + // Now flush the items + for (int d = 0; d < server.dbnum; d++) { + if (db == -1 || db == d) { + 
kvstoreRelease(server.db[d]->keys); + server.db[d]->keys = NULL; + } + } + + EXPECT_EQ(de_in_use->refcount, 1u); + + // and replicate + + bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv); + + freeTestClient(c); + } +}; + +using BgIterationDeathTest = BgIterationTest; + + +TEST_F(BgIterationTest, dbIsOK) { + // Just run the setup/teardown code to make sure the DB is OK. +} + + +///////////////////////////////////////////////////// +// Simple Full-scan iterator tests +///////////////////////////////////////////////////// + +// A simple full scan that just checks basic flow. +TEST_F(BgIterationTest, createAndCleanup) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + 0, NULL, iteratorCleanupFn, PRIVDATA); + EXPECT_EQ(bgIteratorFind("simple"), it); + EXPECT_STREQ(bgIteratorName(it), "simple"); + + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + + EXPECT_EQ(status.dbentries_queued, 0u); + EXPECT_EQ(status.dbentries_processed, 0u); + EXPECT_EQ(status.replication_queued, 0u); + EXPECT_EQ(status.replication_processed, 0u); + EXPECT_EQ(status.swapdb_queued, 0u); + EXPECT_EQ(status.swapdb_processed, 0u); + EXPECT_EQ(status.flushdb_queued, 0u); + EXPECT_EQ(status.flushdb_processed, 0u); + + EXPECT_EQ(status.queue_length, 0u); + EXPECT_GT(status.queue_length_target, 0u); + + EXPECT_LT(status.runtime_ms, 5u); + EXPECT_EQ(status.current_item_ms, 0u); + + expectAnythingCleanup(it); + + EXPECT_EQ(bgIteratorFind("simple"), nullptr); +} + + +// Close client before reading anything +TEST_F(BgIterationTest, testClientCloseBeforeRead) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + 0, NULL, iteratorCleanupFn, PRIVDATA); + bgIteration_feedIterators(); + + bgIteratorClose(it); // Immediately close before reading + + bgIteration_feedIterators(); // Recognize the closed iterator + + // Check that the cleanup callback was executed properly + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Test that the full 
scan hits each item in the expected sequence. +TEST_F(BgIterationTest, orderedIteration) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, LAST_ITEM); + + // Quick status check. At this point, item #9 hasn't been returned yet. + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentries_queued, static_cast(TOTAL_ITEMS)); + EXPECT_EQ(status.dbentries_processed, static_cast(TOTAL_ITEMS) - 1); + + expectReadComplete(it); // Returns item #9, and reads the completion item + + // Check that the cleanup callback was executed properly + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); +} + + +// Test that two simultaneous iterations work properly. +TEST_F(BgIterationTest, twoOrderedIterations) { + bgIterator *it1 = bgIteratorCreateFullScanIter("simple1", + 0, NULL, iteratorCleanupFn, PRIVDATA); + bgIterator *it2 = bgIteratorCreateFullScanIter("simple2", + 0, NULL, iteratorCleanupFn, PRIVDATA); + EXPECT_EQ(bgIteratorFind("simple1"), it1); + EXPECT_EQ(bgIteratorFind("simple2"), it2); + + int it1Count = 0; + int it2Count = 0; + while (it1Count < TOTAL_ITEMS || it2Count < TOTAL_ITEMS) { + // Randomly read from either iterator + if ((rand() % 2) == 0) { + if (it1Count < TOTAL_ITEMS) expectReadKey(it1, it1Count++); + } else { + if (it2Count < TOTAL_ITEMS) expectReadKey(it2, it2Count++); + } + } + + // Nothing left but to read the final completions + expectReadComplete(it1); + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); + expectReadComplete(it2); + EXPECT_EQ(cleanupCount, 2); + EXPECT_FALSE(cleanupTerminated); +} + + +///////////////////////////////////////////////////// +// MODIFY A FUTURE ITEM +// The next tests validate the basic pattern when a key, not yet iterated, is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). 
+///////////////////////////////////////////////////// + +// Modify a future item, without replication or consistency. +// Our expectation for this case is that the modification should proceed without blocking, the item +// shouldn't be expedited, and we will see the modified item once the iterator reaches it. +TEST_F(BgIterationTest, modFutureItem_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c = getWriteClient(6, "xxx"); + + // We DONT expect the client to be blocked - not consistent + simulateUnblockedWriteWithModification(c); + + // Now continue reading, 1, 2, 3, 4, 5 + expectReadKeySequence(it, 1, 5); + + // Let's validate that key 6 shows the new value + expectReadKey(it, 6, "xxx"); + + // Continue... + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a future item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification SHOULD be blocked, as we have to save +// the item in its state before the modification. To reduce blocking time, the item should be +// moved to the head of the queue - there's no replication in this case, so out-of-order processing +// isn't a concern. +TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket).
+ // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c = getWriteClient(6, "xxx"); + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c); + + // On a consistent iterator, the event is expedited in-front of items already in queue! + // Read key 6 out of order. + expectReadKey(it, 6); + + // Now, when we read key 1, key 6 is released back to Valkey, and the client will be unblocked. + expectReadKeyWithUnblock(it, 1, 6); + simulateUnblockedWriteWithModification(c); // Now the write can proceed + + // Continue... + expectReadKeySequence(it, 2, 5); + // 6 has already been processed + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a future item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification should proceed without blocking, as the +// mode is inconsistent. We don't expect replication, as we haven't reached the item yet. We'll +// see the modified item later. +TEST_F(BgIterationTest, modFutureItem_YesReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c = getWriteClient(6, "xxx"); + + // We DONT expect the client to be blocked - not consistent + simulateUnblockedWriteWithModification(c); + + // NOTE: Since we haven't reached this item yet, and consistency is not required, there's no + // need to replicate this command. So everything should wrap up just fine - we will see + // the new value when we get to it. 
+ + // Now continue reading, 1, 2, 3, 4, 5 + expectReadKeySequence(it, 1, 5); + + // Let's validate that key 6 shows the new value + expectReadKey(it, 6, "xxx"); + + // Continue... + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// There's no current use case for CONSISTENT with REPLICATION. It's included for completeness +// and to clarify the functionality of the design. However, if this combination were to be used, +// it would be invalid in the presence of SWAPDB. +TEST_F(BgIterationDeathTest, modFutureItem_YesReplication_YesConsistent_fail) { + // Note: This configuration (CONSISTENT with REPLICATION) is invalid unless in cluster mode. + // The issue is that, with multiple databases, supporting SWAPDB creates a problem. How is it + // possible to maintain a CONSISTENT view with a SWAPDB impacting the values seen in the + // replication stream? (Cluster mode doesn't support SWAPDB, so no issue there.) + EXPECT_DEATH(bgIteratorCreateFullScanIter("iter", BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, NULL, NULL), ""); +} + + +///////////////////////////////////////////////////// +// MODIFY A CURRENT ITEM +// The next tests validate the basic pattern when a key, currently in use, is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). +///////////////////////////////////////////////////// + +// Modify a current item, without replication or consistency. +// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't +// be expedited (it's already in use). +TEST_F(BgIterationTest, modCurrentItem_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read.
Keys 1 & 2 are queued (they are all in the same bucket). + client *c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication) + + // Continue... + expectReadKeySequence(it, 4, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a current item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't +// be expedited (it's already in use). +TEST_F(BgIterationTest, modCurrentItem_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + client *c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication) + + // Continue... + expectReadKeySequence(it, 4, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a current item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification SHOULD be blocked. After the key is processed, +// the write will proceed, and the replication will be sent. 
+TEST_F(BgIterationTest, modCurrentItem_YesReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + client *c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write will cause replication + + expectReadKey(it, 4); // 4 got put in queue when 3 was read + + expectReadReplication(it, c); + + // Continue... + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +#ifdef CODE_NOT_READY_YET +TEST_F(BgIterationTestCluster, modCurrentItem_YesReplication_YesConsistent_cluster) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. All other keys are queued. + client *c = getWriteClient(1, "xxx"); + + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c); + + // Not expedited because item is already in queue + expectReadKey(it, 1); + expectReadKeyWithUnblock(it, 2, nullptr, 1); // reading original/unmodified item + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 3); // 2, 3 & 4 are in the same bucket, so the replication comes after + expectReadKey(it, 4); + expectReadReplication(it, c); + + // Continue... 
+ expectReadComplete(it); + freeTestClient(c); +} +#endif + + +///////////////////////////////////////////////////// +// MODIFY A PAST ITEM +// The next tests validate the basic pattern when a key, already iterated on, is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). +///////////////////////////////////////////////////// + +// Modify a past item, without replication or consistency. +// Our expectation for this case is that the modification should proceed without blocking. +// No replication is generated and keys are processed similar to no modification. +TEST_F(BgIterationTest, modPastItem_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Continue... + expectReadKeySequence(it, 2, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a past item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification should proceed without blocking. +// No replication is generated and keys are processed similar to no modification. +TEST_F(BgIterationTest, modPastItem_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Continue... 
+ expectReadKeySequence(it, 2, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a past item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification should proceed without blocking. +// Replication will be sent. +TEST_F(BgIterationTest, modPastItem_YesReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Key 2 was already in queue (same bucket as key 1). The replication will follow. + expectReadKey(it, 2); + expectReadReplication(it, c); + + // Continue... + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +#ifdef CODE_NOT_READY_YET +TEST_F(BgIterationTestCluster, modPastItem_YesReplication_YesConsistent_cluster) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Keys 2, 3, and 4 were already in queue. The replication will follow. 
+ expectReadKey(it, 2); + expectReadKey(it, 3); + expectReadKey(it, 4); + expectReadReplication(it, c); + + expectReadComplete(it); + freeTestClient(c); +} +#endif + + +///////////////////////////////////////////////////// +// TESTS FOR ITEM CLONING +///////////////////////////////////////////////////// + +// In a consistent iteration, verify that a simple string is properly cloned, and that a write can +// occur without blocking. Validate the cloned item and metadata. +TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent_CloneExpeditedItem) { + // Initialize cloning configurations. + bgIteration_unitTestEnableCloning(50, 100); + + bgIteratorStatus status; + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c = getWriteClient(6, "xxx"); + + // Quick status check. At this point, no clones exist yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 0u); + + // Since item 6 should be cloned, it will not block the client, allowing the write. + void *de6_md = cloneMetadata(getItem(6)); + simulateClonedWrite(it, c); // This wouldn't block, and queues the cloned value + simulateUnblockedWriteWithModification(c); // This modifies the real entry in the de (touching metadata) + + // At this point, one clone is in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // On a consistent iterator, the event is expedited in-front of items already in queue! + // Read key 6 (which is cloned) out of order. The value will still match the key. + expectReadClonedKey(it, 6, de6_md); // Also validates and frees the metadata + + // Quick status check. 
At this point, cloned items have not been marked as processed yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 0u); + + // Reading key 1 will release key 6, and the clone will finish processing. + expectReadKey(it, 1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Now, reading key 2 should not have an impact on the number of processed clones. + expectReadKey(it, 2); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Continue... + expectReadKeySequence(it, 3, 5); + // 6 has already been processed + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Check that cloning for simple strings is respecting the size limits and pool size. On a +// consistent iteration, we expect to block or clone on all future keys. We validate that we can +// clone if the item is small enough and the cloning pool has more space left. +TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent_LargeItemOrClonePoolFull) { + // Initialize cloning configurations to test the clone pool functionality first. + bgIteration_unitTestEnableCloning(50, 50); + + bgIteratorStatus status; + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c6 = getWriteClient(6, "xxx"); + client *c7 = getWriteClient(7, "xxx"); + client *c8 = getWriteClient(8, "xxx"); + + // Quick status check. At this point, no clones exist yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 0u); + + // Since item 6 should be cloned, it will not block the client, allowing the write. 
+ void *de6_md = cloneMetadata(getItem(6)); + simulateClonedWrite(it, c6); + simulateUnblockedWriteWithModification(c6); + + // At this point, one clone is in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // Now that cloning pool is full, item 7 will not be cloned and the client will be blocked. + simulateBlockedWrite(c7); + ASSERT_TRUE(bgIteration_isEntryInuse(getItem(7))); + + // There is still only one cloned item in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // Now change cloning configurations to test that large items will not be cloned. We adjust + // the clone pool size to allow two items, but set the maximum item size to be smaller than + // the size of item 8. The clone pool size must be larger than the total size of the existing + // clones plus the maximum item clone size. + bgIteration_unitTestEnableCloning(1, 101); + + // This write will pass the clone pool check but fail the item size check, blocking the client. + simulateBlockedWrite(c8); + ASSERT_TRUE(bgIteration_isEntryInuse(getItem(8))); + + // On a consistent iterator, the expedited item is read in-front of items already in queue! + // Read key 6 out of order. + expectReadClonedKey(it, 6, de6_md); + + // Now, when we expect to read key 7, which was expedited, key 6 will be released back to Valkey + // and the clone will be deallocated here. + expectReadKey(it, 7); + + // Now, when we read key 8, which was expedited, key 7 is released back to Valkey, and the client + // will be unblocked. + // (actually, unblock is called after every key [just in case] - but functionally we only care + // about this one) + expectReadKeyWithUnblock(it, 8, 7); + simulateUnblockedWriteWithModification(c7); + + // Now, when we read key 1, key 8 is released back to Valkey, and the client will be unblocked. 
+ expectReadKeyWithUnblock(it, 1, 8); + simulateUnblockedWriteWithModification(c8); + + // Since only one item was cloned, there should be one clone processed + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Continue... + expectReadKeySequence(it, 2, 5); + // 6, 7, and 8 have already been processed + expectReadKeySequence(it, 9, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c6); + freeTestClient(c7); + freeTestClient(c8); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO MODIFICATION OF TWO ITEMS +// When 2 keys are modified, we need to ensure that both keys have been sent before we can send +// replication. This means that if replication is present, we may have to block/expedite for +// future keys, even in the inconsistent scenario. +///////////////////////////////////////////////////// + +// Replication enabled, but NOT consistent. In this case, if ANY of the keys have been iterated, +// ALL of the keys must be replicated so that the command can be processed properly on the replica. +TEST_F(BgIterationTest, modPastFutureItem_YesReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Even though key 12 is for READ in this command, it must be expedited so that it exists before + // the associated replication is sent. + client *c = getSetGetClient(8, "xxx", 12); + simulateBlockedWrite(c); + + // Key 12 will be expedited, but not in front of existing items in queue (can only do that for + // consistent iterators) - JHB How about cluster mode? 
+ + expectReadKey(it, 10); + expectReadKey(it, 12); // expedited + expectReadKeyWithUnblock(it, 11, 12); // 13 is now in queue + + simulateUnblockedWriteWithModification(c); + + // Continue... + expectReadKey(it, 13); + expectReadReplication(it, c); + + expectReadKeySequence(it, 14, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Replication NOT enabled. A read-only key doesn't need to be expedited, even if other keys have +// been processed already. (This should work identically for both consistent/non-consistent.) +TEST_F(BgIterationTest, modPastFutureItem_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter1", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Since there's no replication, we don't have to worry about expediting 12. The write will + // proceed without blocking. + client *c = getSetGetClient(8, "xxx", 12); + simulateUnblockedWriteWithModification(c); + + // Key 12 will not be expedited. Remaining keys should be received in normal order. + expectReadKeySequence(it, 10, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + +TEST_F(BgIterationTest, modPastFutureItem_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter2", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. 
+ expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Since there's no replication, we don't have to worry about expediting 12. The write will + // proceed without blocking. + client *c = getSetGetClient(8, "xxx", 12); + simulateUnblockedWriteWithModification(c); + + // Key 12 will not be expedited. Remaining keys should be received in normal order. + expectReadKeySequence(it, 10, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO MISSING ITEMS +// Missing items are tricky. A missing item might be logically located in the past or future, in +// relation to the current iteration position. The command may (or may not) create the "missing" +// key. Some general considerations: +// * In a consistent iteration, a missing key didn't exist at the time of consistency, or it was +// already processed (saved) at the time of the deletion. If the missing key gets created, we +// must be sure to skip it if we later iterate over it. +// * In a non-consistent iteration with replication: +// * If the key location is already passed, the replication is sent, allowing the key to be +// created (or not) based on the replication. +// * If the key location is in the future, we can allow the command to proceed without +// blocking. If the key is created, the replication is sent and we skip the key when the iterator gets to it. 
+// +// We expect: +// no-repl, no-consist: past items are ignored - future items are processed when iterated +// no-repl, yes-consist: past items are ignored - future items are ignored +// yes-repl, no-consist: past item skipped, but replicated - future items are created by replication and skipped later +// yes-repl, yes-consist: past item skipped, but replicated - future items are processed when iterated +///////////////////////////////////////////////////// + +// no-repl, no-consist: creation of PAST item has no impact +TEST_F(BgIterationTest, missingPastItem_NoReplication_NoConsistent) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// no-repl, yes-consist: creation of PAST item has no impact +TEST_F(BgIterationTest, missingPastItem_NoReplication_YesConsistent) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// yes-repl, no-consist: creation of a PAST item will be replicated +TEST_F(BgIterationTest, missingPastItem_YesReplication_NoConsistent) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + + client *c = getWriteClient(0, "xxx"); + 
simulateUnblockedWriteWithModification(c); // replication will be added after item 4 (3,4 in same bucket) + + expectReadKey(it, 4); + + expectReadReplication(it, c); + + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +#ifdef CODE_NOT_READY_YET +// yes-repl, yes-consist: creation of a PAST item will be replicated +TEST_F(BgIterationTestCluster, missingPastItem_YesReplication_YesConsistent) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // replication will be added after item 4 (2, 3, and 4 in same bucket) + + expectReadKey(it, 3); + expectReadKey(it, 4); + expectReadReplication(it, c); + + expectReadComplete(it); + freeTestClient(c); +} +#endif + + +// no-repl, no-consist: creation of FUTURE item is seen when reached by the iteration. +TEST_F(BgIterationTest, missingFutureItem_NoReplication_NoConsistent) { + // Using DB1 so we have lots of buckets + // Note: Choosing item 14 because it's in the portion of DB1 that's already rehashed. So we + // know that the item won't be moving when we re-add it. + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + const char * newValue = "xxx"; + client *c = getWriteClient(14, newValue); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 13); + + // We expect to see item 14. + // Note that for an inconsistent DB view, it is logically undefined if this value is seen (or not). 
+ // But as implemented, we should see it and the test is helpful to understand if/when the + // functionality changes. + expectReadKey(it, 14, newValue); + + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// no-repl, yes-consist: creation of FUTURE item is ignored by consistent iteration. +TEST_F(BgIterationTest, missingFutureItem_NoReplication_YesConsistent) { + // Using DB1 so we have lots of buckets + // Note: Choosing item 14 because it's in the portion of DB1 that's already rehashed. So we + // know that the item won't be moving when we re-add it. + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + client *c = getWriteClient(14, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 13); + // Key 14 is missing - it didn't exist at start of consistent iteration + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// yes-repl, no-consist: creation of FUTURE item is handled by the replication, and then the key is +// later skipped (treated like an early iteration case). +TEST_F(BgIterationTest, missingFutureItem_YesReplication_NoConsistent) { + // Using DB1 so we have lots of buckets + // Note: Choosing item 14 because it's in the portion of DB1 that's already rehashed. So we + // know that the item won't be moving when we re-add it. 
+ simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Items 1 & 2 are in queue (same bucket) + + client *c = getWriteClient(14, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 2); + + expectReadReplication(it, c); // Here's the replication creating item 14 + + expectReadKeySequence(it, 3, 13); + // We expect item 14 to be skipped, because it was created by the earlier replication + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + +#ifdef CODE_NOT_READY_YET +TEST_F(BgIterationTestCluster, missingFutureItem_YesReplication_YesConsistent) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + simpleDelItem(4); + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + + bgIteration_feedIterators(); // Make sure we get key 0 and 1 into the queue + + client *c = getWriteClient(4, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + expectReadReplication(it, c); + + expectReadKey(it, 2); + expectReadKey(it, 3); + + // The replication was read - we don't want to see the key now - #4 should be skipped + + expectReadComplete(it); + freeTestClient(c); +} +#endif + + +///////////////////////////////////////////////////// +// TESTS RELATED TO EXPIRATION +// Expiration can be tricky. When pre-evaluating a command with bgIteration_blockClientIfRequired, +// a key might exist, but be ready for expiration. Then, as the command executes, the key expires +// and gets deleted before the write operation. Consider SET K V. +// In the unexpired case, this appears to bgIteration as a single SET command (which replaces the value). 
+// In the expired case, bgIteration will receive a DEL followed by a SET. +// +// Another case is a READ command. A read command won't cause the client to be blocked. However, +// if the key is expired, this will cause a DEL. For consistent processing, this key might need to +// be expedited so that it can be processed before it gets deleted. In this case, the key is +// unlinked from the main Valkey dictionary, but the actual deletion is deferred. +///////////////////////////////////////////////////// + +TEST_F(BgIterationTest, expireKeys_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we no longer care + simulateExpirationOfInuse(2); // Current - it's inuse + simulateExpiration(5); // Future - we don't care (non-consistent) + + expectReadKeySequence(it, 2, 4); + // key 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. 
+ + simulateExpiration(0); // Past - we expect replication + simulateExpirationOfInuse(2); // Current - it's inuse, but we expect replication + simulateExpiration(5); // Future - we don't care (non-consistent) + + expectReadKey(it, 2); // this was already queued + expectReadReplicationDel(it, 0); // Past item should replicate + expectReadReplicationDel(it, 2); // Current item should replicate + // Item 5 is a future item and doesn't need to replicate + + expectReadKeySequence(it, 3, 4); + // Item 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, expireKeys_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we no longer care + simulateExpirationOfInuse(2); // Current - we must defer + simulateExpirationWithExpedite(5); // Future - will become inuse and expedited for consistency + + expectReadKey(it, 5); // Expedited to front + + expectReadKeySequence(it, 2, 4); + // Item 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +// Special case during a non-consistent iteration with replication and expiration. +// 1. A future key is created (and processed by its replication) - considered early iterated +// 2. Later the key is expired and deleted during command processing (causes DEL to be sent) - no longer early iterated +// 3. The key is recreated as part of the command processing (and this command was replicated) - again early iterated +// 4. Finally, when we iterate to the key, it shouldn't be sent, because it was replicated in step 3. 
+TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent_FutureKeyCreatedThenExipredDuringSet) { + simpleDelItem(8); // Start with a missing future item + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Get the iterator started + + client *c = getWriteClient(8, "xxx"); + simulateUnblockedWriteWithModification(c); // Not blocked because this is a future key (but we expect repl) + + // Now do it again, but break out the steps so that we can simulate an expiration + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // Shouldn't be blocked because this is a future key + + // Now, as the SET command tries to execute, simulate that the key is expired. Expiration + // processing sends the replication FIRST! + robj *argv[2]; + argv[0] = createStringObjectFromSds(sdsnew("DEL")); + argv[1] = c->argv[1]; + serverCommand *cmd = lookupCommandByCString("DEL"); + bgIteration_handleCommandReplication(getDbFromItemNum(8), cmd, 2, argv); + decrRefCount(argv[0]); + + // Now the call to keyDelete happens (after the replication). + bgIteration_keyDelete(getDbFromItemNum(8), static_cast(objectGetVal(c->argv[1]))); + simpleDelItem(8); // Simulate the actual del + + // Now the SET will run, re-creating the item (which is still a future item) + // We need to duplicate the value because setKey() can reallocate it. 
+ robj *value = dupStringObject(c->argv[2]); + setKey(c, c->db, c->argv[1], &(value), SETKEY_ADD_OR_UPDATE); + + // Finally, replication will be sent because this is creating a new key + bgIteration_handleCommandReplication(getDbFromItemNum(8), c->cmd, c->argc, c->argv); + + // Test that everything comes as expected + expectReadKeySequence(it, 1, 2); // All one bucket - queued after key 0 read + + expectReadReplication(it, c); // Repl from the first SET command + expectReadReplicationDel(it, 8); // This is the expected replication of the DEL from expire + expectReadReplication(it, c); // Repl from the second SET command (recreating deleted key) + + expectReadKeySequence(it, 3, 7); // continue with normal iteration + // KEY 8 SHOULD BE OMITTED - This was already replicated + expectReadKeySequence(it, 9, LAST_ITEM); + + expectReadComplete(it); +} + + +#ifdef CODE_NOT_READY_YET +///////////////////////////////////////////////////// +// THE REMAINING TESTS ARE GENERAL / UNCATEGORIZED +///////////////////////////////////////////////////// + +// Iteration can be terminated from the main thread or from the child client. +// This tests termination driven from the main thread. +TEST_F(BgIterationTest, earlyTerminationFromMain) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + + // At this point, keys 1 & 2 are in queue. A termination should release those keys. + bool blocked1 = true; + bool blocked2 = true; + // We expect no general unblocks, we account for each specific unblock below. + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); + // We should expect to see unblock called for items 1 & 2, as they are released from the queue. 
+ EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) + .WillOnce(Assign(&blocked1, false)); + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) + .WillOnce(Assign(&blocked2, false)); + bgIteratorTerminate(it); // queues the items for release + EXPECT_TRUE(bgIteratorIsTerminating(it)); + bgIteration_feedIterators(); // actually performs the release + EXPECT_FALSE(blocked1); + EXPECT_FALSE(blocked2); + + bool blocked0 = true; + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) + .WillOnce(Assign(&blocked0, false)); + bgIteratorItem *item = bgIteratorRead(it); + EXPECT_FALSE(blocked0); + EXPECT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + EXPECT_EQ(cleanupCount, 0); + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Iteration can be terminated from the main thread or from the child client. +// This tests termination driven from the child client (the background thread). +TEST_F(BgIterationTest, earlyTerminationFromChild) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + // At this point, keys 1 & 2 are in queue. A termination should release those keys. 
+ bgIteratorClose(it); // background thread initiates the termination + EXPECT_TRUE(bgIteratorIsTerminating(it)); + + bool blocked0 = true; + bool blocked1 = true; + bool blocked2 = true; + // Expecting no extra unblocks + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); + // We expect item 0 (the in progress item) to be released + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) + .WillOnce(Assign(&blocked0, false)); + // We expect items 1 and 2 (the queued items) to be released + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) + .WillOnce(Assign(&blocked1, false)); + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) + .WillOnce(Assign(&blocked2, false)); + bgIteration_feedIterators(); + EXPECT_FALSE(blocked0); + EXPECT_FALSE(blocked1); + EXPECT_FALSE(blocked2); + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Edge case. Executing a command (like SUNIONSTORE) which REPLACES the first key and reads the +// second key. In this case, bgIteration will get notified of the key deletion during execution of +// SUNIONSTORE. Given that both keys are in the future (not iterated yet), we'll allow the +// command to execute, unblocked. We won't replicate as we'll pick up the key when we get to it. +TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_keyDeletedDuringSetReplace) { + // Using DB1 so we have lots of buckets + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 8); // 9 is in queue + + // Write command that has 2 keys. 1 existing key that we write to and 1 dependant future key.
+ client *c = getWrite2KeysClient("sunionstore", 12, 13); + + simulateUnblockedWrite(c); + + // Now the call to keyDelete happens + bgIteration_keyDelete(getDbFromItemNum(12), keyStr(12)); + simpleDelItem(12); // So simulate the actual del + + // Now the write will run, re-creating the item (which is still a future item) + const char * const newValueStr = "new value"; + robj *newValueRobj = createStringObjectFromSds(sdsnew(newValueStr)); + setKey(c, c->db, c->argv[1], &newValueRobj, SETKEY_ADD_OR_UPDATE); + + // Finally, we are letting bgIteration know that the write command was executed + bgIteration_handleCommandReplication(getDbFromItemNum(12), c->cmd, c->argc, c->argv); + + // Since the write command was not replicated, we expect all the keys to be read in the normal + // order from the dictionary. + expectReadKeySequence(it, 9, 11); + expectReadKey(it, 12, newValueStr); + expectReadKeySequence(it, 13, LAST_ITEM); + + expectReadComplete(it); + freeTestClient(c); +} + + +// Edge case. When we have a new key which is created by a command, AND replication is enabled, we +// expect that we will replicate the command rather than serializing the key/value later. As an +// example, consider SUNIONSTORE A B. We want to create A by replicating the command. We don't +// want to have to process A as a key later on. But in this case, we can't run the command until +// B has been sent. We expect the command to be blocked while we send B. +TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_setNewKey_DependantFuture) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 8); // 9 is in queue + + // Write command that has 2 keys. 1 new key and 1 dependant future key. 
+ client *c = getWrite2KeysClient("sunionstore", 12, 13); + + // We are simulating a new key in the dict. This command should block on the dependant key. + // This adds key 13 in the queue since the command depends on it. + simulateBlockedWrite(c); + + // Key 9 was already in the queue + expectReadKey(it, 9); + + // Key 13 is processed out of order since the write depends on it + expectReadKey(it, 13); + + // Reading key 10 will unblock key 13, allowing us to write. + expectReadKey(it, 10); + + // Now that key 13 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Key 11 was queued when we read key 10 + expectReadKey(it, 11); + + // The replication of the write command was enqueued after key 11 + expectReadReplication(it, c); + + // We shouldn't see key 12 - as that was processed via replication. + // We shouldn't see key 13 - as that was expedited earlier + + // Now resuming processing of dict entries + expectReadKeySequence(it, 14, LAST_ITEM); + + expectReadComplete(it); + freeTestClient(c); +} + + +// A new key is being created, but is dependent on another key which has already been processed. +// In this case, the command shouldn't be blocked. +TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_setNewKey_DependantPast) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // 10 is in queue, done with 8 + + // Write command that has 2 keys. 1 new key and 1 dependant past key. + client *c = getWrite2KeysClient("sunionstore", 12, 8); + + // We are simulating a new key in the dict. + // This command should not block since the dependant key has already been processed. 
+ simulateUnblockedWriteWithModification(c); + + // Key 10 was put in the queue before the write + expectReadKey(it, 10); + + expectReadReplication(it, c); + + expectReadKey(it, 11); + + // Key 12 should be missing - it was processed by replication + + expectReadKeySequence(it, 13, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// A new key is being created, and has dependencies on 2 other keys - one already processed, one not. +// In this case, the command should be blocked so that the future key can be sent first. +TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_setNewKey_1DependantPast1DependantFuture) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // 8 has been returned, 9 is active, 10 is in queue + + // Write command that has 1 new key and 2 dependencies (past/future) + client *c = getWrite3KeysClient("sunionstore", 12, 8, 13); + + // The write should be blocked, so that item 13 can be processed. + simulateBlockedWrite(c); + + expectReadKey(it, 10); // 10 was already in queue + expectReadKey(it, 13); // 13 was expedited since the write depends on it + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(13)))).Times(1); + expectReadKey(it, 11); // Releases 13 so the command can execute + + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 14); // was queued when reading 11 (12 is missing, 13 was expedited) + + expectReadReplication(it, c); + + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + +// Test an edge case with the same (future) key being repeated in the command, like: +// SUNIONSTORE A B B +// In this test, A is a previously handled key, and B is a future key. We expect the future key B to +// be expedited (once). 
+TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_repeatedKey_1DependantPast1RepeatedFuture) { + // Using DB1 so we have lots of buckets + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // We're done with 8, and 10 is in queue + + // Write command that has 3 keys. 1 past key and 1 repeated key in the future. + client *c = getWrite3KeysClient("sunionstore", 8, 12, 12); + + // This command should block because 12 needs to be expedited. + simulateBlockedWrite(c); + + expectReadKey(it, 10); // was already in queue + expectReadKey(it, 12); // expedited + expectReadKey(it, 11); // releases 12 (unblocking the command) + + // Now that key 12 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 13); // queued when we read 11 + + expectReadReplication(it, c); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 14, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} +#endif +#ifdef CODE_NOT_READY_YET + + +TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_repeatedKey_1newKey1RepeatedFuture) { + // This tests the replication of a write command that creates a new key and depends on 1 other + // key which is repeated in the command. The repeated key is in the future. 
+ // This test is meant to replicate this bug: https://issues.amazon.com/ELMO-46572 + + // Expected sequence of event for this test: + // ITEM: (0)'D0' : 'D0' + // BLCK?: (0)'sunionstore' 'E0' 'C0' 'C0' + // EARLY: (0)'C0' : 'C0' + // (blocked) + // ITEM: (0)'B0' : 'B0' + // ITEM: (0)'A0' : 'A0' + // BLCK?: (0)'sunionstore' 'E0' 'C0' 'C0' + // REPL?: (0)'sunionstore' 'E0' 'C0' 'C0' + // (queued) + // SKIPPING ITEM(early iterate): (0)'C0' : 'C0' + // ITEM: (1)'E1' : 'E1' + // ITEM: (1)'C1' : 'C1' + // ITEM: (1)'B1' : 'B1' + // ITEM: (1)'A1' : 'A1' + // ITEM: (1)'D1' : 'D1' + // SENDING COMPLETE + // CLEANUP FN (success) + + simpleDelItem(1); // Deleting key 1 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Start with this to load 0 into the queue - but don't read 0 as that would load 1,2,3 into the queue! + bgIteration_feedIterators(); + + // Write command that has 3 keys. 1 new key and 1 repeated key in the future. + client *c = getWrite3KeysClient(1, 4, 4); + + // This command should block on key 4. + // This adds key 4 in the queue because: + // - the command depends on key 4 which hasn't been processed yet + // - the command depends on a new key (key 1). + simulateBlockedWrite(c); + + // Key 0 was already enqueued. + expectReadKey(it, 0); + + // Key 4 is processed out of order since the write depends on it + expectReadKey(it, 4); + + // Keys 2,3 are next in the queue (they are all in the same bucket). + // Only reading key 2 for now to release key 4 from the iterator. + expectReadKey(it, 2); + + // Now that key 4 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Key 3 is next in the queue (it was put in the queue at the same time as key 2). + expectReadKey(it, 3); + + // The replication of the write command was enqueued after keys 1,2,3. 
+ expectReadReplication(it, c); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 5, 9); + + expectReadComplete(it); + freeTestClient(c); +} + + +TEST_F(BgIterationTest, writeWith3Keys_NoReplication_Consistent_repeatedKey_1DependantPast1RepeatedFuture) { + // This tests the replication of a write command that updates multiple keys and depends on a key + // which is repeated in the command. The repeated key is in the future and the other key is in + // the past. + + // Expected sequence of event for this test: + // ITEM: (0)'D0' : 'D0' + // BLCK?: (0)'blpop' 'D0' 'C0' 'C0' 'D0' + // EARLY_1: (0)'C0' : 'C0' + // (blocked) + // ITEM: (0)'E0' : 'E0' + // ITEM: (0)'B0' : 'B0' + // ITEM: (0)'A0' : 'A0' + // BLCK?: (0)'blpop' 'D0' 'C0' 'C0' 'D0' + // REPL?: (0)'blpop' 'D0' 'C0' 'C0' 'D0' + // SKIPPING ITEM(early iterate): (0)'C0' : 'C0' + // ITEM: (1)'E1' : 'E1' + // ITEM: (1)'C1' : 'C1' + // ITEM: (1)'B1' : 'B1' + // ITEM: (1)'A1' : 'A1' + // ITEM: (1)'D1' : 'D1' + // SENDING COMPLETE + // CLEANUP FN (success) + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Start with this to load 0 into the queue - but don't read 0 as that would load 1,2,3 into the queue! + bgIteration_feedIterators(); + + // Write command that has 3 keys. 1 past key and 1 repeated key in the future. + // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a + // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). + client *c = getWriteMultiKeysClient(0, {4, 4, 0}, "blpop"); + + // This command should block on 2 keys (0 and 4), since: + // - key 0 is in use by the iterator (still in the queue since it has not been processed by the consumer yet) + // - key 4 is in the future + // This adds key 4 in the queue since the command depends on it and it hasn't been processed yet. 
+ simulateBlockedWrite(c, 2); + + // Key 4 is processed out of order since the write depends on it. + // Key 4 is processed before key 0 even though key 0 was already in the queue + // because key 4 was enqueued as a priority item. + expectReadKey(it, 4); + + // Key 0 was already enqueued. + // Reading key 0 releases key 4 from the iterator. + expectReadKey(it, 0); + + // Keys 1,2,3 are next in the queue (they are all in the same bucket). + // Only reading key 1 for now to release key 0 from the iterator. + expectReadKey(it, 1); + + // Now that keys 4 and 0 were processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Keys 2, 3 are next in the queue (it was put in the queue at the same time as key 1). + expectReadKeySequence(it, 2, 3); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 5, 9); + + expectReadComplete(it); + freeTestClient(c); +} + + +TEST_F(BgIterationTest, writeWith3Keys_NoReplication_NoConsistent_repeatedKey_1repeatedNewKey) { + // This tests a write command that creates a new key where the new key is repeated in the + // command. The repeated key is in the future. 
+ + // Expected sequence of event for this test: + // ITEM: (0)'D0' : 'D0' + // ITEM: (0)'A0' : 'A0' + // ITEM: (0)'B0' : 'B0' + // ITEM: (0)'E0' : 'E0' + // BLCK?: (0)'blpop' 'C0' 'D0' 'C0' 'D0' + // REPL?: (0)'blpop' 'C0' 'D0' 'C0' 'D0' + // ITEM: (0)'C0' : 'D0' + // ITEM: (1)'B1' : 'B1' + // ITEM: (1)'C1' : 'C1' + // ITEM: (1)'D1' : 'D1' + // ITEM: (1)'A1' : 'A1' + // ITEM: (1)'E1' : 'E1' + // SENDING COMPLETE + // CLEANUP FN (success) + + server.db[0]->keys->dtype->resizeAllowed = NULL; + kvstoreExpand(server.db[0]->keys, 32, 0, NULL); + hashtableRehash(server.db[0]->keys->hashtables[0], 32); + + // The table looks this way now: + // Table 0, used 5, exp 3, top-level buckets 8, child buckets 0 + // Bucket 0:0 level:0 + // 0 (empty) + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:1 level:0 + // 0 h2 63, key "D0" + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:2 level:0 + // 0 (empty) + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:3 level:0 + // 0 h2 b8, key "A0" + // 1 h2 f5, key "B0" + // 2 h2 13, key "E0" + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:4 level:0 + // 0 (empty) + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:5 level:0 + // 0 (empty) + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:6 level:0 + // 0 h2 91, key "C0" + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:7 level:0 + // 0 (empty) + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + + const char *new_keys[5] = {"D0", "A0", "B0", "E0", "C0"}; + update_keys(new_keys, 0, 5); + + simpleDelItem(4); // Deleting key 4 to then create it with a write command + bgIterator *it = 
bgIteratorCreateFullScanIter("iter", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + // Getting started + // The first bucket is empty + bgIteration_feedIterators(); + expectReadKey(it, 0); + + // Key 1 is the next in the queue. + // Reading key 1 to release key 0 from the iterator. + expectReadKey(it, 1); + + // Write command that has 3 keys. 1 new repeated key and 1 key in the past. + // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a + // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). + client *c = getWriteMultiKeysClient(4, {0, 4, 0}, "blpop"); + + // The write command is not blocked since key 0 is not in use by the iterator + simulateUnblockedWriteWithModification(c); + + // Keys 2, 3 are next in the queue (it was put in the queue at the same time as key 1). + expectReadKeySequence(it, 2, 3); + + // Key 4 is now in the dict with the value of key 0. + expectReadKey(it, 4, keyStr(0)); + + // Processing the rest of the dict entries. + expectReadKeySequence(it, 5, 9); + + expectReadComplete(it); + freeTestClient(c); +} + +TEST_F(BgIterationTest, copyHandlesProperDb_Replication_NoConsistent) { + // In this test, the COPY command is copying from one DB to another. We will create the + // same key in both DBs. We make sure that the proper key is created via replication, and + // the proper key is created by iteration. + + // NOTE: Adding E0 to dict 1. Now there is an E0 in both dict 0 and dict 1. + addKeyToDb(1, "E0", "E0"); + + // The test: + // We will simulate (with DB0 selected): COPY C0 E0 DB 1 REPLACE + // This will overwrite DB1:E0 that was created above. + // Since DB0:C0 is the first iterated key we expect that DB1:E0 will be expedited. + // After DB1:E0 is "overwritten", it should be marked early iterate. + // We expect DB0:E0 to NOT be marked early iterate, and should get processed normally.
+ + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Start with this to load 0 (C0) into the queue - but don't read 0 as that would load 1,2,3 into the queue! + bgIteration_feedIterators(); + + // COPY C0 E0 DB 1 REPLACE + client *c = static_cast<client *>(zcalloc(sizeof(client))); + c->cmd = lookupCommandByCString("copy"); + c->db = server.db[0]; + c->argc = 6; + c->argv = static_cast<robj **>(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + c->argv[1] = createStringObjectFromSds(sdsnew("C0")); + c->argv[2] = createStringObjectFromSds(sdsnew("E0")); + c->argv[3] = createStringObjectFromSds(sdsnew("DB")); + c->argv[4] = createStringObjectFromSds(sdsnew("1")); + c->argv[5] = createStringObjectFromSds(sdsnew("REPLACE")); + + // This should block on 2 keys. DB0:C0 is in queue. DB1:E0 needs to be expedited. + simulateBlockedWrite(c, 2); + expectReadKey(it, 0); // DB0:C0 + expectReadDbKeyValue(it, 1, "E0", "E0"); // DB1:E0 is expedited + expectReadKey(it, 1); // (to release DB1:E0) + // Now keys 2 & 3 & 4 are in the queue + + simulateUnblockedWrite(c); // We shouldn't be blocked this time + + // Now, we'll simulate the actual activity of the COPY. DB1:E0 will be deleted in order to + // be overwritten. + bgIteration_keyDelete(1, sdsnew("E0")); + // At this point the key would actually be deleted and recreated by COPY (no need to actually do this) + + // And finally the replication (this should queue replication) + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + + // Now let's read everything... + expectReadKeySequence(it, 2, 4); // These were in queue already + expectReadReplication(it, c); // This is the new replication (re-creating DB1:E0) + + expectReadKeySequence(it, 5, 9); // These are all normal + + expectReadComplete(it); // At this point, we should be done. We should NOT see DB1:E0.
+ freeTestClient(c); +} + + +// Just check that termination with replication in queue works OK. +TEST_F(BgIterationTest, terminateWithReplication) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); // makes sure we are done with key 0 (don't want to block) + + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Should replicate + freeTestClient(c); + + bgIteratorTerminate(it); + + bgIteratorItem *item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// SWAPDB tests - Get ready for the mind-bend... + +TEST_F(BgIterationTest, swapDB_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + 0, NULL, iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + // In the non-consistent iterator (without replication), items are identified with the DBID at + // the time they are placed into the queue. The SWAPDB event signals the change to the + // iterating process - and this is properly sequenced with the DB info for each item. + + expectReadKey(it, 0); + + // Keys 1,2,3, and 4 are in queue + simulateSwapDB(0, 1); // The swap event will be queued after item 4 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 0u); + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + expectReadKey(it, 4); + + expectReadSwapDB(it, 0, 1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 0u); // still processing it...
+ + // Since we've seen the swap event, items now have the new DBID + expectReadDbKeyValue(it, 0, keyStr(5), keyStr(5)); // item 5 is in DB0 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 1u); // done processing the swapdb + + // Keys 6 & 7 are in the queue - let's swap back! + simulateSwapDB(1, 0); // The swap event will be queued after item 7 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); // 2nd one queued + EXPECT_EQ(status.swapdb_processed, 1u); + + expectReadDbKeyValue(it, 0, keyStr(6), keyStr(6)); // Still appears as DB0 + expectReadDbKeyValue(it, 0, keyStr(7), keyStr(7)); // Still appears as DB0 + + expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); + EXPECT_EQ(status.swapdb_processed, 1u); // still processing it... + + expectReadKey(it, 8); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); + EXPECT_EQ(status.swapdb_processed, 2u); // done processing all swaps + + expectReadKey(it, 9); + expectReadComplete(it); +} + +TEST_F(BgIterationTest, swapDB_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // In the consistent iterator (without replication) all items are presented to the iterating + // process using the DBID at the time of the iterator creation. No changes are evident. + + expectReadKey(it, 0); + + // Keys 1,2,3,4 are in queue + simulateSwapDB(0, 1); // The swap occurs, but the iterator sees no change + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + expectReadKey(it, 4); + + // Heck, let's go crazy with those swaps... 
+ for (int itemNum = 5; itemNum <= 9; itemNum++) { + simulateSwapDB(0, 1); + expectReadKey(it, itemNum); + } + + expectReadComplete(it); +} + +TEST_F(BgIterationTest, swapDB_YesReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // In the non-consistent iterator WITH replication, items are identified with the DBID at the + // time they are placed into the queue. The SWAPDB event signals the change to the iterating + // process - and this is properly sequenced with the DB info for each item. + + expectReadKey(it, 0); + + // Keys 1,2,3,4 are in queue + simulateSwapDB(0, 1); // The swap event will be queued after item 4 + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + expectReadKey(it, 4); + + expectReadSwapDB(it, 0, 1); // We should see a SWAPDB event + bgIteratorItem *item = bgIteratorRead(it); // followed by the associated replication + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + bgIteration_feedIterators(); + + // Since we've seen the swap event, items now have the new DBID + expectReadDbKeyValue(it, 0, keyStr(5), keyStr(5)); // item 5 is in DB0 + + // Keys 6 & 7 are in the queue - let's swap back! + simulateSwapDB(1, 0); // The swap event will be queued after item 7 + + expectReadDbKeyValue(it, 0, keyStr(6), keyStr(6)); // Still appears as DB0 + expectReadDbKeyValue(it, 0, keyStr(7), keyStr(7)); // Still appears as DB0 + + expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap + item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + bgIteration_feedIterators(); + + expectReadKey(it, 8); + expectReadKey(it, 9); + expectReadComplete(it); +} + +// There is no test for swapDB_YesReplication_YesConsistent because this configuration is not +// permitted with multiple DBs (not permitted with swaps).
+ + +// FLUSHDB & FLUSHALL Tests +TEST_F(BgIterationTest, flushDB_flushAll) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // key 1 is active in the iterator - this key will be removed from the DB before flush. + // keys 2 & 3 & 4 are in queue - but will be returned to Valkey before the flush. These are yanked + // back by Valkey and will not be seen by iterator. + simulateFlushDB(-1, 1); + + bgIteratorItem *item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + +TEST_F(BgIterationTest, flushDB_flushOne) { + bgIterator *it1 = bgIteratorCreateFullScanIter("iter1", + 0, NULL, iteratorCleanupFn, PRIVDATA); + bgIterator *it2 = bgIteratorCreateFullScanIter("iter2", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + // The test flushes DB0. This is half the data. Since <= half, a non-consistent iterator is + // allowed to proceed. But the consistent iterator will be terminated. + + expectReadKey(it1, 0); + expectReadKey(it2, 0); + expectReadKey(it1, 1); + expectReadKey(it2, 1); + + // key 1 is active in the iterator - this key will be removed from the DB before flush. + // keys 2 & 3 & 4 are in queue. The non-consistent iterator (it1) keeps them and still reads them + // after the flush (see below); the consistent iterator (it2) is terminated and never sees them. + simulateFlushDB(0, 1); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 0u); + + // Testing the non-consistent one continues... + // Everything already on the iterator queue should be preserved (deleted from the DB). + // Keys 2 & 3 & 4 are already queued (and preserved).
+ expectReadKey(it1, 2); + expectReadKey(it1, 3); + expectReadKey(it1, 4); + + bgIteratorItem *item = bgIteratorRead(it1); + ASSERT_EQ(item->type, BGITERATOR_ITEM_FLUSHDB); + ASSERT_EQ(item->dbid, 0); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 0u); // still processing it + + expectReadKey(it1, 5); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 1u); // done with all flushdb's + expectReadKey(it1, 6); + expectReadKey(it1, 7); + expectReadKey(it1, 8); + expectReadKey(it1, 9); + expectReadComplete(it1); + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); + + // But the consistent iterator should be terminated + item = bgIteratorRead(it2); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + bgIteratorClose(it2); // background thread completes the termination + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 2); + EXPECT_TRUE(cleanupTerminated); +} + +// Cluster mode, 2 iterators, CONSISTENT+REPLICATION and NONCONSISTENT+REPLICATION +// Modify a missing key. +TEST_F(BgIterationTestCluster, modMissingKey_2iter_cluster) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + // For this test, we only have 5 keys since not using DB[1]. Remove the last one. 
+ simpleDelItem(4); + + bgIterator *it1 = bgIteratorCreateFullScanIter("iter1", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + bgIterator *it2 = bgIteratorCreateFullScanIter("iter2", + BGITERATOR_FLAG_REPLICATION, + NULL, iteratorCleanupFn, PRIVDATA); + + client *c = getWriteClient(4, "xxx"); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked since key doesn't exist + + bgIteration_feedIterators(); // Prime the feed - key 0 and 1 are now enqueued + + // Process the consistent iteration + expectReadReplication(it1, c); // replication happened before feeding (should be 1st) + expectReadKeySequence(it1, 0, 3); + expectReadComplete(it1); + + // Process the non-consistent iteration + expectReadReplication(it2, c); // replication happened before feeding (should be 1st) + expectReadKeySequence(it2, 0, 3); + expectReadComplete(it2); + + freeTestClient(c); +} + +TEST_F(BgIterationTest, twoKeys_firstFuture) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, + NULL, iteratorCleanupFn, PRIVDATA); + + bgIteration_feedIterators(); // Prime the feed - key 0 + expectReadKey(it, 0); // Causes keys 1, 2, 3, 4 to be queued (same bucket) + expectReadKey(it, 1); // Causes key 0 to be released + + // This must replicate, because A0 is in the past. B1 (future) wouldn't need replication except + // for the modification to B1. We try to trip up bgIterator by giving a key that doesn't need + // replication except for the later command that does. Make this a little trickier by adding + // the set for A1 - unnecessary, but more clearly shows the expediting in progress. 
+ client *c = getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx", 1); + + // The EXEC should block on 2 keys, because B1(5) & A1(8) should be expedited + simulateBlockedWrite(c, 2); + + expectReadKeySequence(it, 2, 4); // These were already in queue + + // Note - it would be OK if these 2 were reversed, but this is how the current algorithm works. + expectReadKey(it, 8); // Key 8 (A1) was expedited + expectReadKey(it, 5); // Key 5 (B1) was expedited + + // and clean up the rest... + expectReadKeySequence(it, 6, 7); + // Key 8 was already read above (expedited) + expectReadKey(it, 9); + expectReadComplete(it); +} + +TEST_F(BgIterationTest, multiBlocksOnFutureKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1,2,3,4 are queued (they are all in the same bucket). + // If we fake a modification to key 5, we won't know if it's handled out of order. + // So we fake a modification to key 6 + // Dummy up a MULTI... + client *c = getMultiClient("SET C1 xxx", 1); + + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c); + freeTestClient(c); + + // C1 (key 6) will be expedited to the front of the list + expectReadKey(it, 6); + + // Now that we've read key 5, key 0 (C0) is passed and should not block + client *c2 = getMultiClient("SET C0 xxx"); + simulateUnblockedWrite(c2); + freeTestClient(c2); + + + expectReadKeySequence(it, 1, 5); + expectReadKeySequence(it, 7, 9); + expectReadComplete(it); +} + +TEST_F(BgIterationTest, multiNotReplicatedButDelRecreateAccess) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Scenario. We have a multi that doesn't need to be replicated because all of the keys exist + // but are all future keys. 
Note that missing keys are considered already-iterated, so all + // must exist for this test. Then: + // - we delete a key + // - we re-create the deleted (future) key - normally this would be replicated + // - we access another (future) key - we don't expect to get blocked! + + // We use DB 1 only because the hash table buckets are better broken up there. + client *c = getMultiClient("DEL A1; SET A1 xxx; SET E1 yyy", 1); + + // For DB[1]: + // Bucket 0:0 level:0 + // 0 h2 18, key "B1" + // 1 h2 fd, key "C1" + // 2 h2 e9, key "D1" + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:1 level:0 + // 0 h2 36, key "A1" + // 1 h2 0c, key "E1" + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + + // Read through DB 0 and into DB 1 + expectReadKeySequence(it, 0, 5); // D0, E0, B0, A0, C0, B1 + // Now, C1 and D1 are in the queue (in use) and A1 & E1 are future + + // Now let's process the multi. Since A1 & E1 are both future (existing) items, we shouldn't + // block or replicate. + simulateUnblockedWrite(c); // the EXEC + + // Simulate the DEL A1 + server.in_exec = 1; // Simulate actual execution of the MULTI/EXEC + advanceMultiClientToCommand(c, 0); // DEL A1 + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + simpleDelItem(8); + sds delKey = sdsnew(keyStr(8)); + bgIteration_keyDelete(1, delKey); + sdsfree(delKey); + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); // shouldn't replicate + + // Simulate SET A1 - the key doesn't exist, and would normally replicate and mark early iterate, + // but this is in a transaction, and we are not replicating this transaction. 
+ advanceMultiClientToCommand(c, 1); // SET A1 xxx + simulateUnblockedWriteWithModification(c); + + // Now write to another existing future key - this should work if we weren't confused by the DEL + advanceMultiClientToCommand(c, 2); // SET E1 yyy + simulateUnblockedWriteWithModification(c); + server.in_exec = 0; + + // Now we can continue iterating, and we should pick up keys 6-9. (and no replication!) + expectReadKeySequence(it, 6, 7); + expectReadKey(it, 8, "xxx"); + expectReadKey(it, 9, "yyy"); + expectReadComplete(it); +} + +TEST_F(BgIterationTest, multiHandlesSelectProperly) { + // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it + // in DB0, but it will be unprocessed in DB1. See if we track select properly. + addKeyToDb(1, "C0", "C0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - C0 in DB 0. + expectReadKey(it, 0); + + // Now, we are done with C0 in DB0, but not in DB1 + expectReadKey(it, 1); + + // These cases should NOT block... (they access C0 in DB0) + client *c; + c = getMultiClient("SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SELECT 0; SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block... 
(they access C0 in DB1) + c = getMultiClient("SET C0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SET C0 xxx"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SET C0 xxx; SELECT 0"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SELECT 1; SET C0 xxx; SELECT 1"); + simulateBlockedWrite(c); + freeTestClient(c); + + expectAnythingCleanup(it); +} + + +TEST_F(BgIterationTest, multiHandlesSelectNoPermissionProperly) { + // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it + // in DB0, but it will be unprocessed in DB1. See if we track select properly. + addKeyToDb(1, "C0", "C0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - C0 in DB 0. + expectReadKey(it, 0); + + // Now, we are done with C0 in DB0, but not in DB1 + expectReadKey(it, 1); + + // No permission for any commands (specifically select/swapdb) + EXPECT_CALL(mock, amzCanClientExecuteCommand(_,_,_,_)) + .Times(AtLeast(1)).WillRepeatedly(Return(false)); + + // These cases should NOT block... (they access C0 in DB0) + // The SELECTs below are inconsequential - with/without select, same result. + client *c; + c = getMultiClient("SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SELECT 0; SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block IF SELECT IS WORKING... 
(they access C0 in DB1) + c = getMultiClient("SET C0 xxx"); + c->db = server.db[1]; // already starting on DB1 + simulateBlockedWrite(c); // will block, no select + freeTestClient(c); + c = getMultiClient("SELECT 1; SET C0 xxx"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + c = getMultiClient("SELECT 1; SET C0 xxx; SELECT 0"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + c = getMultiClient("SELECT 0; SELECT 1; SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + + expectAnythingCleanup(it); +} + + +TEST_F(BgIterationTest, multiHandlesSwapdbProperly) { + // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it + // in DB0, but it will be unprocessed in DB1. See if we track select properly. + addKeyToDb(1, "C0", "C0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - C0 in DB 0. + expectReadKey(it, 0); + + // Now, we are done with C0 in DB0, but not in DB1 + expectReadKey(it, 1); + + // These cases should NOT block... (they access C0 in DB0) + client *c; + c = getMultiClient("SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 0 1; SELECT 1; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block... 
(they access C0 in DB1) + c = getMultiClient("SET C0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SET C0 xxx; SWAPDB 0 1"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SELECT 0; SET C0 xxx; SWAPDB 0 1"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET C0 xxx; SELECT 1"); + simulateBlockedWrite(c); + freeTestClient(c); + + expectAnythingCleanup(it); +} + + +TEST_F(BgIterationTest, multiHandlesSwapdbNoPermissionProperly) { + // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it + // in DB0, but it will be unprocessed in DB1. See if we track select properly. + addKeyToDb(1, "C0", "C0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - C0 in DB 0. + expectReadKey(it, 0); + + // Now, we are done with C0 in DB0, but not in DB1 + expectReadKey(it, 1); + + // No permission for any commands (specifically select/swapdb) + EXPECT_CALL(mock, amzCanClientExecuteCommand(_,_,_,_)) + .Times(AtLeast(1)).WillRepeatedly(Return(false)); + + // These cases should NOT block... (they access C0 in DB0) + // The SELECTs & SWAPDBs below are inconsequential - with/without select/swapdb, same result. + client *c; + c = getMultiClient("SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 0 1; SELECT 1; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block IF SELECT/SWAPDB IS WORKING... 
(they access C0 in DB1) + c = getMultiClient("SET C0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SET C0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb fails) + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SELECT 0; SET C0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) + freeTestClient(c); + + expectAnythingCleanup(it); +} + +void * pthreadWait200msAndReadTwoKeys(void *arg) { + bgIterator *it = static_cast(arg); + + usleep(200000); + bgIteratorRead(it); + bgIteratorRead(it); + return nullptr; +} + +void asyncWait200msAndReadTwoKeys(bgIterator *it) { + int rc; + pthread_attr_t attr; + pthread_t thread; + + rc = pthread_attr_init(&attr); + assert(rc == 0); + rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + assert(rc == 0); + + rc = pthread_create(&thread, &attr, pthreadWait200msAndReadTwoKeys, it); + assert(rc == 0); + + rc = pthread_attr_destroy(&attr); + assert(rc == 0); +} + + +TEST_F(BgIterationTest, testLuaWithUndeclaredKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1,2,3 are queued (they are all in the same bucket). + // If we fake a modification to key 4, we won't know if it's handled out of order. + // So we fake a modification to key 5 + client *c = getWriteClient(5, "xxx"); + c->flag.script = 1; + + // Now for a LUA script, we have already blocked (on the eval/evalsha) for any declared keys + // But here, we're about to modify an undeclared key. 
We can't actually block in the middle + // of the LUA script. So this will behave as unblocked, but incur a synchronous wait. + + // Key 5 will get expedited when we simulate the write. After reading key 5, key 1 will need + // to be read to return key 5 to Valkey, unblocking the synchronous wait. + asyncWait200msAndReadTwoKeys(it); + + monotime blockTimer; + elapsedStart(&blockTimer); + simulateUnblockedWrite(c); + // Must have delayed at least 150ms (some time may have passed before timer start) + EXPECT_GT(elapsedMs(blockTimer), 150u); + + // Continue... + expectReadKeySequence(it, 2, 4); + // 5 has already been processed + expectReadKeySequence(it, 6, 9); + expectReadComplete(it); + freeTestClient(c); +} + + +TEST_F(BgIterationTest, replicationReceivedWhileProcessingLastKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + client *c = getWriteClient(0, "xxx"); + + expectReadKeySequence(it, 0, 9); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + expectReadReplication(it, c); // Replication happened while processing key 9, should be here. + + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing key 9 + expectReadComplete(it); // We expect to see the completion instead + + freeTestClient(c); +} + + +TEST_F(BgIterationTest, repldoneFunctionCalled) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); + + client *c = getWriteClient(0, "xxx"); + + expectReadKeySequence(it, 0, 9); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + expectReadReplication(it, c); // Replication happened while processing key 9, should be here. 
+ EXPECT_EQ(repldoneCount, 1); // Last key released, now done feeding replication + + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing key 9 + expectReadComplete(it); // We expect to see the completion instead + + freeTestClient(c); +} + + +TEST_F(BgIterationTest, repldoneFunctionCalledTwice) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFnNotBeingReadyInitially, iteratorCleanupFn, PRIVDATA); + + client *c = getWriteClient(0, "xxx"); + + expectReadKeySequence(it, 0, 9); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + expectReadReplication(it, c); // Replication happened while processing key 9, should be here. + EXPECT_EQ(repldoneCount, 0); // Last key released, now done feeding replication + EXPECT_EQ(isReplDoneReady, 1); + bgIteration_feedIterators(); // Need to call it as RepldoneFnNotBeingReadyInitially returns false in first call + EXPECT_EQ(repldoneCount, 1); + + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing key 9 + expectReadComplete(it); // We expect to see the completion instead + + freeTestClient(c); +} + + +TEST_F(BgIterationTest, queuingitemFunctionCalled) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + 0, NULL, iteratorCleanupFn, iteratorBeforeAndAfterQueuingItemFn, PRIVDATA); + EXPECT_EQ(beforeQueuingItemCount, 0); + EXPECT_EQ(afterQueuingItemCount, 0); + expectReadKeySequence(it, 0, 9); + expectReadComplete(it); + // Callback is invoked when item is fed to and returned from an iterator + EXPECT_EQ(beforeQueuingItemCount, 10); + EXPECT_EQ(afterQueuingItemCount, 10); +} + +TEST_F(BgIterationTest, checkReplicationByteCount) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); + + client *c = getWriteClient(0, "xxx"); + int expectedReplicationSize = 
sizeof(bgIteratorItem); + for (int i = 0; i < c->argc; i++) { + expectedReplicationSize += objectComputeSize(NULL, c->argv[i], 0, 0); + } + + expectReadKey(it, 0); + expectReadKey(it, 1); // Releases and unblocks 0 + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); + + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); + simulateUnblockedWriteWithModification(c); // and write again (2nd replication) + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); + + expectReadKeySequence(it, 2, 4); // Keys 1..4 all in same bucket + + expectReadReplication(it, c); + // After reading the 1st replication, it hasn't been returned yet (it's the active item) + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); + expectReadReplication(it, c); + // After reading the 2nd replication, the 1st has been returned + EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); + + expectReadKey(it, 5); + // Now all replication has been returned/freed + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); + + expectReadKeySequence(it, 6, 9); + expectReadComplete(it); + + freeTestClient(c); +} + +// Test that for an arbitrary write command having no keys, replication should occur. 
+TEST_F(BgIterationTest, checkNoKeysWriteIsReplicated) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + client *c = getNoKeysWriteClient(); + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + + expectReadKeySequence(it, 1, 4); // These were already in queue + + expectReadReplication(it, c); + + expectReadKeySequence(it, 5, 9); + expectReadComplete(it); + freeTestClient(c); +} +TEST_F(BgIterationTestClusterSlots, testAmzKeyIsLogicallyDeletedInOrderedIteration3Slots) { + bgIterator *it = bgIteratorCreateSlotsIter("simple", + 0, slots_to_iterate, slots_to_iterate_size, NULL, iteratorCleanupFn, PRIVDATA); + EXPECT_CALL(mock, amzKeyIsLogicallyDeleted(_)).WillOnce(Return(true)).WillRepeatedly(Return(false)); + expectReadKeySequence(it, 1, n_keys_to_read - 1); + + // Quick status check. At this point, the last item hasn't been returned yet. + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentries_queued, n_keys_to_read - 1); // The first item should be skipped from the queue + EXPECT_EQ(status.dbentries_processed, n_keys_to_read - 2); + + expectReadComplete(it); + EXPECT_FALSE(cleanupTerminated); +} + +TEST_F(BgIterationTest, testAmzKeyIsLogicallyDeletedInOrderedFullScanIteration) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + 0, NULL, iteratorCleanupFn, PRIVDATA); + EXPECT_CALL(mock, amzKeyIsLogicallyDeleted(_)).WillOnce(Return(true)).WillRepeatedly(Return(false)); + expectReadKeySequence(it, 1, 9); + + // Quick status check. At this point, item #9 hasn't been returned yet. 
+ bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentries_queued, 9u); // The first item should be skipped from the queue + EXPECT_EQ(status.dbentries_processed, 8u); + + expectReadComplete(it); + EXPECT_FALSE(cleanupTerminated); +} +#endif + +#ifdef CODE_NOT_READY_YET +class BgIterationTestCluster : public BgIterationTest { + private: + // This is the expected order of the keys when hashed into a single dict at slot 0 having size 8. + // The "{06S}" prefix ensures use of only slot 0. + const char *keys[1][5] = {{"{06S}C0", "{06S}D0", "{06S}A0", "{06S}B0", "{06S}E0"}}; + + protected: + // Furthermore, the bucketization will look like this: + // db 0 slot 0 + // Table 0, used 5, exp 1, top-level buckets 2, child buckets 0 + // Bucket 0:0 level:0 + // 0 h2 1a, key "{06S}C0" + // 1 h2 7b, key "{06S}D0" + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:1 level:0 + // 0 h2 5c, key "{06S}A0" + // 1 h2 bf, key "{06S}B0" + // 2 h2 57, key "{06S}E0" + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + + virtual const char * getKeyAtDbSeq(int db, int seq) override { + assert(db == 0); + return keys[db][seq]; + } + + + virtual void setupDatabase() override { + // For these unit tests, a standard database is constructed. The order of items in the + // hash table is important, and this is validated here. If the hash table + // implementation changes, we will find out quickly at this point. All other tests + // will become invalid! + + // Note that the cluster_enabled tests are designed for the purpose of testing + // CONSISTENT iteration WITH REPLICATION. This type of iteration is not supported + // in non-cluster-mode. At the time of writing, there is no-known use-case for this + // combination. But it is tested for completeness and to ensure future availability. + + // Note also that the cluster_enabled tests are not designed to address issues specific + // to per-slot-dictionaries. 
The tests are simplified by ensuring that all keys are + // mapped to slot-0. It is assumed that iteration would progress in slot order, and + // failure in this regard will be caught in integration tests (amztests). + + server.dbnum = 1; // cluster-mode means 1 DB + server.cluster_enabled = true; + server.db = static_cast(zcalloc(sizeof(serverDb *) * server.dbnum)); + + // Yes, it's cluster mode, but we're mapping all keys to slot 0 - so we cheat and create only 1 dict (just like CMD). + initializeServerDb(0, CLUSTER_SLOT_MASK_BITS); + + // Note "06S" is a prefix that maps to slot 0. We're not testing slots here. + + addKeyToDb(0, "{06S}A0", "{06S}A0"); + addKeyToDb(0, "{06S}B0", "{06S}B0"); + addKeyToDb(0, "{06S}C0", "{06S}C0"); + addKeyToDb(0, "{06S}D0", "{06S}D0"); + addKeyToDb(0, "{06S}E0", "{06S}E0"); + + // In case we need to debug... + if (0) debugPrintBucketInfo(); + + // Validate that the iteration order matches the expected order + hashtableIterator *it = hashtableCreateIterator(server.db[0]->keys->hashtables[0], 0); + for (int i = 0; i < 5; i++) { + void *nextEntry; + hashtableNext(it, &nextEntry); + dbEntry *de = static_cast(nextEntry); + ASSERT_STREQ(static_cast(objectGetKey(de)), getKeyAtDbSeq(0, i)); + } + hashtableReleaseIterator(it); + } +}; +#endif + +#ifdef CODE_NOT_READY_YET +TEST_F(BgIterationTestCluster, dictIsOK) { + // Just run the setup/teardown code to make sure the dict is OK. +} + + +TEST_F(BgIterationTestCluster, modFutureItem_YesReplication_YesConsistent_cluster) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + // For this test, don't read the 1st key - we only have 5 keys since not using DB[1] + bgIteration_feedIterators(); // Prime the feed - key 0 and 1 are now enqueued + + // At this point, key 0, and 1 are queued. 
Fake a modification to key 2 & 4 - two keys to ensure + // that replication is ordered + client *c1 = getWriteClient(2, "xxx"); + client *c2 = getWriteClient(4, "yyy"); + + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c1); + simulateBlockedWrite(c2); + + // On a consistent iterator, the event is expedited in-front of items already in queue! + // Read keys 2&4 out of order. + expectReadKey(it, 2); // reading original/unmodified item + + // This call is expected to unblock the client waiting on #2 + expectReadKeyWithUnblock(it, 4, nullptr, 2); // reading original/unmodified item + simulateUnblockedWriteWithModification(c1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.replication_queued, 1u); + EXPECT_EQ(status.replication_processed, 0u); + + // Now read items 0 and 1 - these were actually already queued before keys 1 & 4 were expedited. + // This call is expected to unblock the client waiting on #4 + expectReadKeyWithUnblock(it, 0, nullptr, 4); + simulateUnblockedWriteWithModification(c2); + expectReadKey(it, 1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.replication_queued, 2u); + EXPECT_EQ(status.replication_processed, 0u); + + // And now the 2 replications are queued + expectReadReplication(it, c1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.replication_queued, 2u); // 1st replication still being processed + EXPECT_EQ(status.replication_processed, 0u); // (no change in these metrics yet) + + expectReadReplication(it, c2); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.replication_queued, 2u); + EXPECT_EQ(status.replication_processed, 1u); // Done with 1st, processing 2nd + + // Continue... 
+ expectReadKey(it, 3); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.replication_queued, 2u); + EXPECT_EQ(status.replication_processed, 2u); // Done processing both repl items + expectReadComplete(it); + freeTestClient(c1); + freeTestClient(c2); +} +#endif + + + +// JHB - need test that hashing is paused when an entry is in use. diff --git a/src/unit/wrappers.h b/src/unit/wrappers.h index 0f4fb388b98..0f80919d6f7 100644 --- a/src/unit/wrappers.h +++ b/src/unit/wrappers.h @@ -61,6 +61,12 @@ extern "C" { long long __wrap_aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, aeTimeProc *proc, void *clientData, aeEventFinalizerProc *finalizerProc); int __wrap_processPendingCommandAndInputBuffer(client *c); void __wrap_beforeNextClient(client *c); + +void __wrap_blockClientInUseOnKeys(client *c, int nKeys, robj **keys); +void __wrap_unblockClientsInUseOnKey(robj *key); + +int __wrap_ACLCheckAllUserCommandPerm(user *u, struct serverCommand *cmd, robj **argv, int argc, int dbid, int *idxptr); + #undef protected #undef _Bool #undef typename From 4bd498f1970387e332d8a166c011e845bca0ad36 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 28 Apr 2026 15:25:14 +0000 Subject: [PATCH 03/40] continue on unit tests Signed-off-by: Jim Brunner --- src/unit/test_bgiteration.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index 7499e53ca52..b76cd06cb45 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -289,7 +289,7 @@ class BgIterationTest : public ::testing::Test { addKeyToDb(0, "F0", "F0"); addKeyToDb(0, "G0", "G0"); addKeyToDb(0, "H0", "H0"); - hashtable *ht = kvstoreGetHashtable(server.db[1]->keys, 0); + hashtable *ht = kvstoreGetHashtable(server.db[0]->keys, 0); hashtablePauseRehashing(ht); kvstoreExpand(server.db[1]->keys, 16, 0, NULL); From e08abcc85325d60b5962ad848e8e7175c1c6660e Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 28 
Apr 2026 22:37:17 +0000 Subject: [PATCH 04/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index ed6ac40bddc..626130f6fa2 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -1806,7 +1806,7 @@ static void removePtrFromEarlyIterate(dbEntry *de) { static int findDbForEntry(dbEntry *de) { for (int i = 0; i < server.dbnum; i++) { - if (dbFind(server.db[i], objectGetKey(de)) == de) return i; + if (server.db[i] && dbFind(server.db[i], objectGetKey(de)) == de) return i; } serverAssert(false); // the entry MUST be in one of the DBs } From 90644e4e9d380d30a40030714aa73b5c4db492f8 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 14 May 2026 18:18:36 +0000 Subject: [PATCH 05/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 3 +++ src/hashtable.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index 626130f6fa2..1f544cdeea2 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -7,6 +7,9 @@ #include "mutexqueue.h" #include "server.h" +// Just for the moment, until https://github.com/valkey-io/valkey/issues/3450 is resolved +// clang-format off + int getFlushCommandFlags(client *c, int *flags); // in db.c uint64_t dictObjHash(const void *key); // in server.c int dictObjKeyCompare(const void *key1, const void *key2); // in server.c diff --git a/src/hashtable.c b/src/hashtable.c index 1dcb8038030..f9b4683832d 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -2273,7 +2273,7 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { iter->pos_in_bucket++; if (iter->bucket->chained && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1 - && iter->pos_in_bucket != ITERATOR_DONE_WITH_BUCKET_IDX) { + && iter->pos_in_bucket != ITERATOR_DONE_WITH_BUCKET_IDX+1) { iter->pos_in_bucket = 0; iter->bucket = getChildBucket(iter->bucket); } else if (iter->pos_in_bucket >= 
ENTRIES_PER_BUCKET) { From 7a949165d5258215d9feedd8d274ffdca47a74b5 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 14 May 2026 19:26:32 +0000 Subject: [PATCH 06/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.h | 3 --- src/db.c | 5 ++++- src/module.c | 2 -- src/rdb.c | 2 -- src/replication.c | 3 --- src/server.c | 4 ++-- 6 files changed, 6 insertions(+), 13 deletions(-) diff --git a/src/bgiteration.h b/src/bgiteration.h index 35a4b988857..a7a14fdb4a0 100644 --- a/src/bgiteration.h +++ b/src/bgiteration.h @@ -16,9 +16,6 @@ * * Iteration clients are expected to read through the keyspace until the iteration is complete or * terminated. An iteration client may not perform modifications on a key. - * - * Future enhancement: Certain types of modifications may be passed back to the Valkey main thread. - * Use case: A background compression thread wants to compress a string value. */ /* Avoids dependency on server.h */ diff --git a/src/db.c b/src/db.c index 97706d7f2c0..3292de34b51 100644 --- a/src/db.c +++ b/src/db.c @@ -668,9 +668,12 @@ long long emptyData(int dbnum, int flags, void(callback)(hashtable *)) { return -1; } + /* bgIteration must be notified for flushall. */ + if (dbnum == -1) bgIteration_flushall(); + /* Fire the flushdb modules event. */ moduleFireServerEvent(VALKEYMODULE_EVENT_FLUSHDB, VALKEYMODULE_SUBEVENT_FLUSHDB_START, &fi); - + /* Make sure the WATCHed keys are affected by the FLUSH* commands. * Note that we need to call the function while the keys are still * there. */ diff --git a/src/module.c b/src/module.c index 6efd517a501..af6b9324f62 100644 --- a/src/module.c +++ b/src/module.c @@ -70,7 +70,6 @@ #include "io_threads.h" #include "scripting_engine.h" #include "cluster_migrateslots.h" -#include "bgiteration.h" #include #include #include @@ -4465,7 +4464,6 @@ int VM_SetAbsExpire(ValkeyModuleKey *key, mstime_t expire) { * When async is set to true, db contents will be freed by a background thread. 
*/ void VM_ResetDataset(int restart_aof, int async) { if (restart_aof && server.aof_state != AOF_OFF) stopAppendOnly(); - bgIteration_flushall(); flushAllDataAndResetRDB((async ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS) | EMPTYDB_NOFUNCTIONS); if (server.aof_enabled && restart_aof) restartAOFAfterSYNC(); } diff --git a/src/rdb.c b/src/rdb.c index ae16f62bd26..e4e006a16ec 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -46,7 +46,6 @@ #include "module.h" #include "cluster.h" #include "cluster_migrateslots.h" -#include "bgiteration.h" #include #include @@ -3172,7 +3171,6 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin if (rdbflags & RDBFLAGS_EMPTY_DATA) { int empty_db_flags = server.repl_replica_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS; serverLog(LL_NOTICE, "RDB signature and version check passed. Flushing old data"); - bgIteration_flushall(); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); /* functionsLibCtx is cleared when we call emptyData, reinitialize here. */ diff --git a/src/replication.c b/src/replication.c index 0172987434d..131b4fa2797 100644 --- a/src/replication.c +++ b/src/replication.c @@ -41,7 +41,6 @@ #include "connection.h" #include "module.h" #include "cluster_migrateslots.h" -#include "bgiteration.h" #include #include @@ -2483,7 +2482,6 @@ int replicaLoadPrimaryRDBFromSocket(connection *conn, char *buf, char *eofmark, } else { /* Remove the half-loaded data in case the load failed for other reasons. */ serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); - bgIteration_flushall(); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); } } @@ -2587,7 +2585,6 @@ int replicaLoadPrimaryRDBFromDisk(rdbSaveInfo *rsi) { } else { /* If disk-based RDB loading fails, remove the half-loaded dataset. 
*/ serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); - bgIteration_flushall(); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); } diff --git a/src/server.c b/src/server.c index ecbae40c2f5..860a46a9f08 100644 --- a/src/server.c +++ b/src/server.c @@ -3726,8 +3726,8 @@ static int lastDbidSentToBgIterator; void alsoPropagate(int dbid, robj **argv, int argc, int target, int slot) { if (target & PROPAGATE_REPL && bgIteration_iterationActive()) { // Note that bgIterator must be invoked immediately after each command. This is required - // for proper processing in the bgIterator state machine. It's NOT ok to call bgIterator - // from propagateNow as that handles all of the commands for a transaction at the end. + // by the bgIterator state machine. It's NOT ok to call bgIterator from propagateNow as + // that handles all of the commands for a transaction at the end. // THIS FUNCTION (alsoPropagate) is called after each command. if (!sentMultiToBgIterator && (scriptIsRunning() || server.in_exec)) { // For a script or multi/exec, we should be sending the MULTI at the beginning of the From b95abdca3f82dd945ead9124b31d04b0a8086fa5 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 14 May 2026 20:27:38 +0000 Subject: [PATCH 07/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 7 +++++-- src/server.c | 6 +++++- src/server.h | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index 1f544cdeea2..c0f966c9cb4 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -58,7 +58,10 @@ static bool isDeleteCmd(struct serverCommand *cmd) { static bool onValkeyMainThread(void) { - return (pthread_equal(server.main_thread_id, pthread_self()) != 0); + // Modules interact with the main thread using a mutex. If a module owns the mutex, consider + // that equivalent to being on the main thread. 
+ bool inModule = (atomic_load_explicit(&server.module_gil_acquired, memory_order_relaxed) == 0); + return (inModule || pthread_equal(server.main_thread_id, pthread_self()) != 0); } /* Parse a parameters robj, extracting a valid DBID. @@ -1896,7 +1899,7 @@ static bool isDbSignificant(int dbid) { static void handleFlushdb(int dbid) { // Invoked BEFORE the actual flush. -1 indicates FLUSHALL. - bool should_abort_iterators = server.cluster_enabled || dbid == -1 || isDbSignificant(dbid); + bool should_abort_iterators = (dbid == -1 || isDbSignificant(dbid)); listIter li; listNode *node; diff --git a/src/server.c b/src/server.c index 860a46a9f08..4321bcb5e98 100644 --- a/src/server.c +++ b/src/server.c @@ -2053,7 +2053,10 @@ void beforeSleep(struct aeEventLoop *eventLoop) { /* Before we are going to sleep, let the threads access the dataset by * releasing the GIL. The server main thread will not touch anything at this * time. */ - if (moduleCount()) moduleReleaseGIL(); + if (moduleCount()) { + atomic_store_explicit(&server.module_gil_acquired, 0, memory_order_relaxed); + moduleReleaseGIL(); + } /********************* WARNING ******************** * Do NOT add anything below moduleReleaseGIL !!! 
* ***************************** ********************/ @@ -2075,6 +2078,7 @@ void afterSleep(struct aeEventLoop *eventLoop, int numevents) { atomic_store_explicit(&server.module_gil_acquiring, 1, memory_order_relaxed); moduleAcquireGIL(); atomic_store_explicit(&server.module_gil_acquiring, 0, memory_order_relaxed); + atomic_store_explicit(&server.module_gil_acquired, 1, memory_order_relaxed); moduleFireServerEvent(VALKEYMODULE_EVENT_EVENTLOOP, VALKEYMODULE_SUBEVENT_EVENTLOOP_AFTER_SLEEP, NULL); latencyEndMonitor(latency); latencyAddSampleIfNeeded("module-acquire-GIL", latency); diff --git a/src/server.h b/src/server.h index c68dd524592..245b21c2468 100644 --- a/src/server.h +++ b/src/server.h @@ -1797,6 +1797,7 @@ struct valkeyServer { pid_t child_pid; /* PID of current child */ int child_type; /* Type of current child */ _Atomic(int) module_gil_acquiring; /* Indicates whether the GIL is being acquiring by the main thread. */ + _Atomic(int) module_gil_acquired; /* Indicates if the main thread has the GIL acquired. 
*/ /* Networking */ int port; /* TCP listening port */ int tls_port; /* TLS listening port */ From 4c46bfaa7da5170466b8da0fd44b3d64faaa575f Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 14 May 2026 21:43:20 +0000 Subject: [PATCH 08/40] Forkless Save Signed-off-by: Jim Brunner --- src/unit/test_bgiteration.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index b76cd06cb45..9c4bdf42fa0 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -245,19 +245,14 @@ class BgIterationTest : public ::testing::Test { } - void addKeyAndValObjsToDb(int dbid, sds key, sds val) { - robj *key_obj = createStringObjectFromSds(key); - robj *val_obj = createStringObjectFromSds(val); + void addKeyToDb(int dbid, const char *key, const char *val) { + robj *key_obj = createStringObject(key, strlen(key)); + robj *val_obj = createStringObject(val, strlen(val)); dbAdd(server.db[dbid], key_obj, &val_obj); decrRefCount(key_obj); } - void addKeyToDb(int dbid, const char *key, const char *val) { - addKeyAndValObjsToDb(dbid, sdsnew(key), sdsnew(val)); - } - - virtual void setupDatabase() { // For these unit tests, a standard database is constructed. The order of items in the // hash table is important, and this is validated here. 
If the hash table From 786166934727e56bb58361e1f1f088377b6b984b Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Fri, 15 May 2026 17:12:36 +0000 Subject: [PATCH 09/40] Forkless Save Signed-off-by: Jim Brunner --- .config/typos.toml | 3 +++ cmake/Modules/SourceFiles.cmake | 1 + src/db.c | 2 +- src/server.c | 6 +++--- src/server.h | 4 ++-- src/unit/test_bgiteration.cpp | 4 +++- 6 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.config/typos.toml b/.config/typos.toml index ff90d3a679d..a300ef2c61b 100644 --- a/.config/typos.toml +++ b/.config/typos.toml @@ -67,6 +67,9 @@ seeked = "seeked" tre = "tre" dbe = "dbe" +[type.cpp.extend-words] +dbe = "dbe" + [type.systemd.extend-words] # systemd = .conf ake = "ake" diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake index e2cd375bcc7..54964d079cd 100644 --- a/cmake/Modules/SourceFiles.cmake +++ b/cmake/Modules/SourceFiles.cmake @@ -36,6 +36,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/t_hash.c ${CMAKE_SOURCE_DIR}/src/config.c ${CMAKE_SOURCE_DIR}/src/aof.c + ${CMAKE_SOURCE_DIR}/src/bgiteration.c ${CMAKE_SOURCE_DIR}/src/pubsub.c ${CMAKE_SOURCE_DIR}/src/multi.c ${CMAKE_SOURCE_DIR}/src/debug.c diff --git a/src/db.c b/src/db.c index 3292de34b51..cdf41c6a8b7 100644 --- a/src/db.c +++ b/src/db.c @@ -673,7 +673,7 @@ long long emptyData(int dbnum, int flags, void(callback)(hashtable *)) { /* Fire the flushdb modules event. */ moduleFireServerEvent(VALKEYMODULE_EVENT_FLUSHDB, VALKEYMODULE_SUBEVENT_FLUSHDB_START, &fi); - + /* Make sure the WATCHed keys are affected by the FLUSH* commands. * Note that we need to call the function while the keys are still * there. */ diff --git a/src/server.c b/src/server.c index 4321bcb5e98..6b2942c3714 100644 --- a/src/server.c +++ b/src/server.c @@ -3741,12 +3741,12 @@ void alsoPropagate(int dbid, robj **argv, int argc, int target, int slot) { // It may turn out that there is only 1 command in the MULTI block, but we can't know // that now. 
Unlike regular replication, we can't defer all of the replication until // we know for sure. We must call bgIterator after each command. - static struct serverCommand* cmd_multi = NULL; // STATIC to avoid repeated lookups + static struct serverCommand *cmd_multi = NULL; // STATIC to avoid repeated lookups if (cmd_multi == NULL) cmd_multi = lookupCommandOrOriginal(&shared.multi, 1); bgIteration_handleCommandReplication(dbid, cmd_multi, 1, &shared.multi); sentMultiToBgIterator = true; } - struct serverCommand* cmd = lookupCommandOrOriginal(argv, argc); + struct serverCommand *cmd = lookupCommandOrOriginal(argv, argc); bgIteration_handleCommandReplication(dbid, cmd, argc, argv); lastDbidSentToBgIterator = dbid; } @@ -3822,7 +3822,7 @@ static void propagatePendingCommands(void) { // replication. If we sent the multi (to bgIteration), we need to send the matching exec. if (sentMultiToBgIterator) { // If a MULTI was sent to bgIterator via alsoPropagate(), then send the matching EXEC. - static struct serverCommand* cmd_exec = NULL; // STATIC to avoid repeated lookups + static struct serverCommand *cmd_exec = NULL; // STATIC to avoid repeated lookups if (cmd_exec == NULL) cmd_exec = lookupCommandOrOriginal(&shared.exec, 1); bgIteration_handleCommandReplication(lastDbidSentToBgIterator, cmd_exec, 1, &shared.exec); sentMultiToBgIterator = false; diff --git a/src/server.h b/src/server.h index 245b21c2468..b683a950e6f 100644 --- a/src/server.h +++ b/src/server.h @@ -113,8 +113,8 @@ static_assert(sizeof(off_t) >= 8, "off_t must be 64-bit; ensure _FILE_OFFSET_BIT * pair which is suitable to exist in the DB. It might be active in the DB, or may be unlinked from * the DB (but still contains a key/value). The value may be any of the Valkey data types/encodings. 
*/ -typedef struct serverObject robj; // A keyless string OR a key/value pair -typedef struct serverObject dbEntry; // Explicitly a key/value pair +typedef struct serverObject robj; // A keyless string OR a key/value pair +typedef struct serverObject dbEntry; // Explicitly a key/value pair #include "valkeymodule.h" /* Modules API defines. */ diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index 9c4bdf42fa0..c54ce3c8515 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -1,3 +1,5 @@ +// Just for the moment, until https://github.com/valkey-io/valkey/issues/3450 is resolved +// clang-format off //#include #include "generated_wrappers.hpp" #include @@ -1841,7 +1843,7 @@ TEST_F(BgIterationTest, modPastFutureItem_NoReplication_NoConsistent) { // * In a non-consistent iteration with replication: // * If the key location is already passed, the replication is sent, allowing the key to be // created (or not) based on the replication. -// * If the key location is in the furure, we can allow the command to proceed, without +// * If the key location is in the future, we can allow the command to proceed, without // replication. If the key is created, we will process it when the iterator gets to it. 
// // We expect: From 2e3806a5e7159175e0780b721e21115e813bb2e0 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Fri, 15 May 2026 17:14:47 +0000 Subject: [PATCH 10/40] Forkless Save Signed-off-by: Jim Brunner --- .config/typos.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.config/typos.toml b/.config/typos.toml index a300ef2c61b..e1d582c3f8c 100644 --- a/.config/typos.toml +++ b/.config/typos.toml @@ -68,6 +68,7 @@ tre = "tre" dbe = "dbe" [type.cpp.extend-words] +fo = "fo" dbe = "dbe" [type.systemd.extend-words] @@ -77,6 +78,3 @@ ake = "ake" [type.tcl.extend-words] fo = "fo" tre = "tre" - -[type.cpp.extend-words] -fo = "fo" From 312b0abcc6dd9cda5dce053f6b81654ca19cd25d Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Mon, 18 May 2026 21:20:24 +0000 Subject: [PATCH 11/40] Forkless Save Signed-off-by: Jim Brunner --- .config/typos.toml | 5 +-- src/bgiteration.c | 60 ++++++++++++++++++----------------- src/bgiteration.h | 56 ++++++++++++++++---------------- src/hashtable.c | 4 ++- src/server.h | 2 +- src/unit/test_bgiteration.cpp | 2 +- 6 files changed, 67 insertions(+), 62 deletions(-) diff --git a/.config/typos.toml b/.config/typos.toml index e1d582c3f8c..da8175e4ff9 100644 --- a/.config/typos.toml +++ b/.config/typos.toml @@ -17,6 +17,9 @@ Parth = "Parth" # seems like the spellchecker does not like it is similar to "Pa nd = "nd" threadsave = "threadsave" +[default.extend-identifiers] +dbe = "dbe" + [default] extend-ignore-re = [ "[A-Z]{2,}ed", # SELECTed, WATCHed, etc. 
@@ -65,11 +68,9 @@ pathc = "pathc" pn = "pn" seeked = "seeked" tre = "tre" -dbe = "dbe" [type.cpp.extend-words] fo = "fo" -dbe = "dbe" [type.systemd.extend-words] # systemd = .conf diff --git a/src/bgiteration.c b/src/bgiteration.c index c0f966c9cb4..c70bd129015 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -169,7 +169,8 @@ bool getParamsForSelect(int argc, robj **argv, client *permission_client, int *d static dictType sdsrefToPtrDictType = { .entryGetKey = dictEntryGetKey, .hashFunction = dictSdsHash, - .keyCompare = dictSdsKeyCompare + .keyCompare = dictSdsKeyCompare, + .entryDestructor = zfree }; @@ -275,13 +276,13 @@ static dictType dictEntryPtrDictType = { .entryGetKey = dictEntryGetKey, .hashFunction = pointerHash, .keyCompare = pointerCompare, - .resizeAllowed = neverShrink + .resizeAllowed = neverShrink, + .entryDestructor = zfree }; // A TEMP set of robj's (of type sds). This is only for temporary sets as the robj's are not -// ref-counted at insertion/deletion. Used for robj->NULL. -static dictType tempKeysetDictType = { - .entryGetKey = dictEntryGetKey, +// ref-counted at insertion/deletion. 
+static hashtableType tempKeysetHashtableType = { .hashFunction = dictObjHash, .keyCompare = dictObjKeyCompare }; @@ -1101,7 +1102,7 @@ static bool expediteSingleKeyWithoutOptimization( bgIterator *it, int dbid, robj *oKey, - dict *waitingOnKeys) { + hashtable *waitingOnKeys) { bool mustBlock = false; @@ -1114,12 +1115,12 @@ static bool expediteSingleKeyWithoutOptimization( && (dictFind(it->early_iterate_entries, de) == NULL)) { if (addEarlyIterationKey(it, de, dbid)) { mustBlock = true; - dictAdd(waitingOnKeys, oKey, NULL); + hashtableAdd(waitingOnKeys, oKey); } } else { if (isEntryInuseByAnyIterator(de)) { mustBlock = true; - dictAdd(waitingOnKeys, oKey, NULL); + hashtableAdd(waitingOnKeys, oKey); } } } @@ -1135,7 +1136,7 @@ static bool expediteKeysForMove( int dbid, int argc, robj **argv, - dict *waitingOnKeys) { + hashtable *waitingOnKeys) { if (argc <= MOVE_COMMAND_DBID_ARG_INDEX) return false; int destDbid; @@ -1162,7 +1163,7 @@ static bool expediteKeysForCopy( int dbid, int argc, robj **argv, - dict *waitingOnKeys) { + hashtable *waitingOnKeys) { int destDbid; if (!getTargetDbIdForCopyCommand(argc, argv, dbid, &destDbid)) return false; @@ -1221,7 +1222,7 @@ static bool expediteKeysForWrite( robj **argv, keyReference *keyrefs, int numKeys, - dict *waitingOnKeys) { + hashtable *waitingOnKeys) { serverAssert(numKeys > 0); bool mustBlock = false; @@ -1267,12 +1268,12 @@ static bool expediteKeysForWrite( && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { if (addEarlyIterationKey(it, de, dbid)) { mustBlock = true; - dictAdd(waitingOnKeys, oKey, NULL); + hashtableAdd(waitingOnKeys, oKey); } } else { if (isEntryInuseByAnyIterator(de)) { mustBlock = true; - dictAdd(waitingOnKeys, oKey, NULL); + hashtableAdd(waitingOnKeys, oKey); } } } @@ -1309,7 +1310,7 @@ static bool expediteKeysForWrite( } if (isEntryInuseByAnyIterator(de)) { mustBlock = true; - dictAdd(waitingOnKeys, oKey, NULL); + 
hashtableAdd(waitingOnKeys, oKey); } } @@ -1346,7 +1347,7 @@ static bool expediteKeysForWrite( if (addEarlyIterationKey(it, notIteratedEntry, dbid)) { mustBlock = true; - dictAdd(waitingOnKeys, oKey, NULL); + hashtableAdd(waitingOnKeys, oKey); } } dictReleaseIterator(di); @@ -1367,7 +1368,7 @@ static bool expediteKeysForWrite( } if (isEntryInuseByAnyIterator(de)) { mustBlock = true; - dictAdd(waitingOnKeys, oKey, NULL); + hashtableAdd(waitingOnKeys, oKey); } } } @@ -1943,7 +1944,7 @@ static bool expediteKeysForWriteOnAllIterators( robj **argv, keyReference *keyrefs, int numKeys, - dict *waitingOnKeys) { + hashtable *waitingOnKeys) { bool mustBlock = false; listIter li; @@ -1971,7 +1972,7 @@ static bool anIteratorWillReplicateForThisCommand(void) { } -static bool expediteKeysForMultiExec(client *c, dict *waitingOnKeys) { +static bool expediteKeysForMultiExec(client *c, hashtable *waitingOnKeys) { serverAssert(c->cmd->proc == execCommand); /* For MULTI/EXEC, Valkey buffers all of the commands until hitting the EXEC. @@ -2404,7 +2405,7 @@ bool bgIteration_blockClientIfRequired(client *c) { } bool mustBlock = false; - dict *waitOnKeys = dictCreate(&tempKeysetDictType); // dict of robj(sds)->NULL + hashtable *waitOnKeys = hashtableCreate(&tempKeysetHashtableType); // set of robj(sds) listEmpty(curCmdMissingKeys); if (c->cmd->proc == execCommand) { @@ -2428,7 +2429,7 @@ bool bgIteration_blockClientIfRequired(client *c) { * (Yuck.) 
*/ while (mustBlock) { receiveItemsBackFromIterators(true); // Blocking - dictEmpty(waitOnKeys, NULL); + hashtableEmpty(waitOnKeys, NULL); mustBlock = expediteKeysForWriteOnAllIterators( c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); } @@ -2447,24 +2448,25 @@ bool bgIteration_blockClientIfRequired(client *c) { } if (mustBlock) { - serverAssert(dictSize(waitOnKeys) > 0); - robj **waitKeysArgv = zmalloc(sizeof(robj*) * dictSize(waitOnKeys)); + serverAssert(hashtableSize(waitOnKeys) > 0); + robj **waitKeysArgv = zmalloc(sizeof(robj*) * hashtableSize(waitOnKeys)); - dictEntry *de; - dictIterator *di = dictGetIterator(waitOnKeys); + robj *key; + hashtableIterator hi; + hashtableInitIterator(&hi, waitOnKeys, 0); unsigned long argvCount = 0; - while((de = dictNext(di)) != NULL) { - waitKeysArgv[argvCount++] = dictGetKey(de); + while (hashtableNext(&hi, (void **)&key)) { + waitKeysArgv[argvCount++] = key; } - dictReleaseIterator(di); - serverAssert(argvCount == dictSize(waitOnKeys)); + hashtableCleanupIterator(&hi); + serverAssert(argvCount == hashtableSize(waitOnKeys)); blockClientInUseOnKeys(c, argvCount, waitKeysArgv); zfree(waitKeysArgv); } - dictRelease(waitOnKeys); + hashtableRelease(waitOnKeys); if (BGITERATION_DEBUG) { if (mustBlock) debugBuffer = sdscat(debugBuffer, " (blocked)\n"); diff --git a/src/bgiteration.h b/src/bgiteration.h index a7a14fdb4a0..ed361f3ca03 100644 --- a/src/bgiteration.h +++ b/src/bgiteration.h @@ -19,8 +19,8 @@ */ /* Avoids dependency on server.h */ -typedef struct serverObject dbEntry; // An object with key/value inserted into main dictionary -typedef struct serverObject robj; // An object with a value used for command parameters +typedef struct serverObject dbEntry; // An object with key/value inserted into main dictionary +typedef struct serverObject robj; // An object with a value used for command parameters typedef struct client client; /* The bgIterator is an opaque structure. 
*/ @@ -87,11 +87,11 @@ typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata); * last item is read. */ bgIterator * bgIteratorCreateFullScanIter( - const char *name, - int flags, - bgIteratorReplDoneFunc repldone, - bgIteratorCleanupFunc cleanup, - void *privdata); + const char *name, + int flags, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata); /* Create a background slots iterator (bgIterator). @@ -116,23 +116,23 @@ bgIterator * bgIteratorCreateFullScanIter( * last item is read. */ bgIterator * bgIteratorCreateSlotsIter( - const char *name, - int flags, - const int *slots, - int slots_count, - bgIteratorReplDoneFunc repldone, - bgIteratorCleanupFunc cleanup, - void *privdata); + const char *name, + int flags, + const int *slots, + int slots_count, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata); /* Find an existing bgIterator by name. * Returns NULL if the iterator does not exist (or has completed). */ -bgIterator * bgIteratorFind(const char *name); +bgIterator *bgIteratorFind(const char *name); /* Get the name of an existing iterator. */ -const char * bgIteratorName(bgIterator *iter); +const char *bgIteratorName(bgIterator *iter); /* Struct to retrieve status information for an active iteration client. */ @@ -225,12 +225,12 @@ typedef struct { typedef struct { bgIteratorItemType type; - int dbid; /* orig DB ID for CONSISTENT, queue-time DB ID for !CONSISTENT. */ + int dbid; // orig DB ID for CONSISTENT, queue-time DB ID for !CONSISTENT. 
union { - dbEntryData dbe; // for BGITERATOR_ITEM_DBENTRY - replicationData repl; // for BGITERATOR_ITEM_REPLICATION - long long master_repl_offset; // for BGITERATOR_ITEM_COMPLETE - int dbid2; // for BGITERATOR_ITEM_SWAPDB + dbEntryData dbe; // for BGITERATOR_ITEM_DBENTRY + replicationData repl; // for BGITERATOR_ITEM_REPLICATION + long long master_repl_offset; // for BGITERATOR_ITEM_COMPLETE + int dbid2; // for BGITERATOR_ITEM_SWAPDB } u; } bgIteratorItem; @@ -250,7 +250,7 @@ typedef struct { * * (All memory management is the responsibility of the bgIterator - not the reader.) */ -bgIteratorItem * bgIteratorRead(bgIterator *iter); +bgIteratorItem *bgIteratorRead(bgIterator *iter); /* Close the bgIterator, allowing the bgIterator to be deallocated. @@ -271,7 +271,7 @@ void bgIteratorClose(bgIterator *iter); ********************************************************************************************/ typedef struct { - uint32_t iterator_epoch; // iterator epoch of last modification + uint32_t iterator_epoch; // iterator epoch of last modification } bgIterationEntryMetadata; @@ -302,7 +302,7 @@ void bgIteration_flushall(void); * BgIteration keeps track of expedited keys (by pointer) to avoid repeated iteration. BgIteration * must be notified when dbEntries are reallocated. BgIteration will not dereference the pointers; * it is safe to have deallocated the old dbEntry before calling this function. - * + * * We can't update the dbEntry if the entry is actually in use (bgIteration_isEntryInuse)! * * To simplify calling code, this function does nothing if old_entry == new_entry. @@ -337,10 +337,10 @@ bool bgIteration_blockClientIfRequired(client *c); * re-written for propagation. 
*/ void bgIteration_handleCommandReplication( - int dbid, - struct serverCommand *cmd, - int argc, - robj **argv); + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv); /* The memory that bgIteration uses while temporarily buffering replication data is not included in diff --git a/src/hashtable.c b/src/hashtable.c index f9b4683832d..2a54835a51e 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -2271,11 +2271,13 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { * child bucket in a chain, or to the next bucket index, or to the * next table. */ iter->pos_in_bucket++; + // clang format off if (iter->bucket->chained && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1 - && iter->pos_in_bucket != ITERATOR_DONE_WITH_BUCKET_IDX+1) { + && iter->pos_in_bucket != ITERATOR_DONE_WITH_BUCKET_IDX + 1) { iter->pos_in_bucket = 0; iter->bucket = getChildBucket(iter->bucket); + // clang format on } else if (iter->pos_in_bucket >= ENTRIES_PER_BUCKET) { /* Bucket index done. */ if (isSafe(iter)) { diff --git a/src/server.h b/src/server.h index b683a950e6f..f7e5c55ac19 100644 --- a/src/server.h +++ b/src/server.h @@ -108,7 +108,7 @@ static_assert(sizeof(off_t) >= 8, "off_t must be 64-bit; ensure _FILE_OFFSET_BIT * 1. It's carries a reference counted STRING (a keyless value) during parsing and command execution. * 2. It's also used to carry a key/value pair which is inserted into the DB. In this form, the * value is not limited to being a string. - * + * * The typedef "dbEntry" is used to explicitly connote the latter form. It indicates a key/value * pair which is suitable to exist in the DB. It might be active in the DB, or may be unlinked from * the DB (but still contains a key/value). The value may be any of the Valkey data types/encodings. 
diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index c54ce3c8515..85241b26951 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -2129,7 +2129,7 @@ TEST_F(BgIterationTest, expireKeys_NoReplication_YesConsistent) { // 2. Later the key is expired and deleted during command processing (causes DEL to be sent) - no longer early iterated // 3. The key is recreated as part of the command processing (and this command was replicated) - again early iterated // 4. Finally, when we iterate to the key, it shouldn't be sent, because it was replicated in step 3. -TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent_FutureKeyCreatedThenExipredDuringSet) { +TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent_FutureKeyCreatedThenExpiredDuringSet) { simpleDelItem(8); // Start with a missing future item bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); From e2c82a8748e195bbac17b28cb8048f81d6198eca Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Mon, 18 May 2026 21:55:30 +0000 Subject: [PATCH 12/40] Forkless Save Signed-off-by: Jim Brunner --- src/hashtable.c | 4 +- src/unit/test_bgiteration.cpp | 71 ++++++++++++++++++++--------------- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/src/hashtable.c b/src/hashtable.c index 2a54835a51e..5fefc0b45a2 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -2246,6 +2246,7 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { /* Check if iterator has been invalidated */ if (iter->hashtable == NULL) return false; + // clang format off while (1) { if (iter->index == -1 && iter->table == 0) { /* It's the first call to next. */ @@ -2271,13 +2272,11 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { * child bucket in a chain, or to the next bucket index, or to the * next table. 
*/ iter->pos_in_bucket++; - // clang format off if (iter->bucket->chained && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1 && iter->pos_in_bucket != ITERATOR_DONE_WITH_BUCKET_IDX + 1) { iter->pos_in_bucket = 0; iter->bucket = getChildBucket(iter->bucket); - // clang format on } else if (iter->pos_in_bucket >= ENTRIES_PER_BUCKET) { /* Bucket index done. */ if (isSafe(iter)) { @@ -2327,6 +2326,7 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { } return true; } + // clang format on return false; } diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index 85241b26951..21b5da11679 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -1,5 +1,7 @@ // Just for the moment, until https://github.com/valkey-io/valkey/issues/3450 is resolved // clang-format off +// The hashtable uses different ordering for 32-bit. Need consider mocking. +#if SIZE_MAX == UINT64_MAX /* 64-bit version */ //#include #include "generated_wrappers.hpp" #include @@ -247,9 +249,14 @@ class BgIterationTest : public ::testing::Test { } + robj *createStringObjectFromCString(const char *s) { + return createStringObject(s, strlen(s)); + } + + void addKeyToDb(int dbid, const char *key, const char *val) { - robj *key_obj = createStringObject(key, strlen(key)); - robj *val_obj = createStringObject(val, strlen(val)); + robj *key_obj = createStringObjectFromCString(key); + robj *val_obj = createStringObjectFromCString(val); dbAdd(server.db[dbid], key_obj, &val_obj); decrRefCount(key_obj); } @@ -705,9 +712,9 @@ class BgIterationTest : public ::testing::Test { c->argc = 3; c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); - c->argv[1] = createStringObjectFromSds(sdsnew(keyStr(itemNum))); - c->argv[2] = createStringObjectFromSds(sdsnew(value)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + c->argv[1] = createStringObjectFromCString(keyStr(itemNum)); + 
c->argv[2] = createStringObjectFromCString(value); return c; } @@ -734,10 +741,10 @@ class BgIterationTest : public ::testing::Test { c->argc = 2 + srcItemsNum.size(); c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); - c->argv[1] = createStringObjectFromSds(sdsnew(keyStr(dstItemNum))); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + c->argv[1] = createStringObjectFromCString(keyStr(dstItemNum)); for (unsigned int i = 0; i < srcItemsNum.size(); i++) { - c->argv[2 + i] = createStringObjectFromSds(sdsnew(keyStr(srcItemsNum[i]))); + c->argv[2 + i] = createStringObjectFromCString(keyStr(srcItemsNum[i])); } return c; @@ -778,7 +785,7 @@ class BgIterationTest : public ::testing::Test { c->argv = static_cast(zcalloc(sizeof(robj*) * 5)); // command + 4 args for (int i = 0; token != NULL; i++) { - c->argv[i] = createStringObject(token, strlen(token)); + c->argv[i] = createStringObjectFromCString(token); c->argc = i+1; token = strtok_r(NULL, " ", &tokenSave); } @@ -792,7 +799,7 @@ class BgIterationTest : public ::testing::Test { c->cmd = lookupCommandByCString("exec"); c->argc = 1; c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromSds(sdsnew("EXEC")); + c->argv[0] = createStringObjectFromCString("EXEC"); zfree(commandsCopy); return c; @@ -838,10 +845,10 @@ class BgIterationTest : public ::testing::Test { c->argc = 4; c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromSds(sdsnew(cmd->fullname)); - c->argv[1] = createStringObjectFromSds(sdsnew(keyStr(itemNum1))); - c->argv[2] = createStringObjectFromSds(sdsnew(value1)); - c->argv[3] = createStringObjectFromSds(sdsnew(keyStr(itemNum2))); + c->argv[0] = createStringObjectFromCString(cmd->fullname); + c->argv[1] = createStringObjectFromCString(keyStr(itemNum1)); + c->argv[2] = createStringObjectFromCString(value1); + c->argv[3] = 
createStringObjectFromCString(keyStr(itemNum2)); return c; } @@ -865,7 +872,7 @@ class BgIterationTest : public ::testing::Test { c->argc = 1; c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromSds(sdsnew(cmd->fullname)); + c->argv[0] = createStringObjectFromCString(cmd->fullname); return c; } @@ -977,7 +984,7 @@ class BgIterationTest : public ::testing::Test { // Replicate MULTI if this is the first instruction inside MULTI/EXEC if (i == 0) { robj *argv[1]; - argv[0] = createStringObjectFromSds(sdsnew("multi")); + argv[0] = createStringObjectFromCString("multi"); bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("multi"), 1, argv); decrRefCount(argv[0]); } @@ -986,7 +993,7 @@ class BgIterationTest : public ::testing::Test { // Call handleCommandReplication for EXEC robj *argv[1]; - argv[0] = createStringObjectFromSds(sdsnew("EXEC")); + argv[0] = createStringObjectFromCString("EXEC"); bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("exec"), 1, argv); server.in_exec = 0; decrRefCount(argv[0]); @@ -1005,8 +1012,8 @@ class BgIterationTest : public ::testing::Test { int db = getDbFromItemNum(itemNum); sds sdsKey = sdsnew(keyStr(itemNum)); robj *argv[2]; - argv[0] = createStringObjectFromSds(sdsnew("DEL")); - argv[1] = createStringObjectFromSds(sdsdup(sdsKey)); + argv[0] = createStringObjectFromCString("DEL"); + argv[1] = createStringObjectFromCString(sdsKey); serverCommand *cmd = lookupCommandByCString("DEL"); bgIteration_handleCommandReplication(db, cmd, 2, argv); decrRefCount(argv[0]); @@ -1067,11 +1074,11 @@ class BgIterationTest : public ::testing::Test { c->argc = 3; c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); dbStr[0] = '0' + dbid0; - c->argv[1] = createStringObjectFromSds(sdsnew(dbStr)); + c->argv[1] = 
createStringObjectFromCString(dbStr); dbStr[0] = '0' + dbid1; - c->argv[2] = createStringObjectFromSds(sdsnew(dbStr)); + c->argv[2] = createStringObjectFromCString(dbStr); bool blocked = bgIteration_blockClientIfRequired(c); EXPECT_FALSE(blocked); // SWAPDB should never block @@ -1101,7 +1108,7 @@ class BgIterationTest : public ::testing::Test { c->argc = 1; c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); dbEntry *de_in_use = getItem(anInUseItem); EXPECT_EQ(de_in_use->refcount, 2u); @@ -2146,7 +2153,7 @@ TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent_FutureKeyCreatedThen // Now, as the SET command tries to execute, simulate that the key is expired. Expiration // processing sends the replication FIRST! robj *argv[2]; - argv[0] = createStringObjectFromSds(sdsnew("DEL")); + argv[0] = createStringObjectFromCString("DEL"); argv[1] = c->argv[1]; serverCommand *cmd = lookupCommandByCString("DEL"); bgIteration_handleCommandReplication(getDbFromItemNum(8), cmd, 2, argv); @@ -2277,7 +2284,7 @@ TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_keyDeletedDuring // Now the write will run, re-creating the item (which is still a future item) const char * const newValueStr = "new value"; - robj *newValueRobj = createStringObjectFromSds(sdsnew(newValueStr)); + robj *newValueRobj = createStringObjectFromCString(newValueStr); setKey(c, c->db, c->argv[1], &newValueRobj, SETKEY_ADD_OR_UPDATE); // Finally, we are letting bgIteration know that the write command was executed @@ -2734,12 +2741,12 @@ TEST_F(BgIterationTest, copyHandlesProperDb_Replication_NoConsistent) { c->db = server.db[0]; c->argc = 6; c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); - c->argv[1] = createStringObjectFromSds(sdsnew("C0")); - c->argv[2] = 
createStringObjectFromSds(sdsnew("E0")); - c->argv[3] = createStringObjectFromSds(sdsnew("DB")); - c->argv[4] = createStringObjectFromSds(sdsnew("1")); - c->argv[5] = createStringObjectFromSds(sdsnew("REPLACE")); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname)); + c->argv[1] = createStringObjectFromCString("C0"); + c->argv[2] = createStringObjectFromCString("E0"); + c->argv[3] = createStringObjectFromCString("DB"); + c->argv[4] = createStringObjectFromCString("1"); + c->argv[5] = createStringObjectFromCString("REPLACE"); // This should block on 2 keys. DB0:C0 is in queue. DB1:E0 needs to be expedited. simulateBlockedWrite(c, 2); @@ -3742,3 +3749,5 @@ TEST_F(BgIterationTestCluster, modFutureItem_YesReplication_YesConsistent_cluste // JHB - need test that hashing is paused when an entry is in use. + +#endif // if SIZE_MAX == UINT64_MAX /* 64-bit version */ From 798eef9ea3479c599555011f6c820572afc5b733 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 19 May 2026 16:26:22 +0000 Subject: [PATCH 13/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.h | 4 ++-- src/hashtable.c | 4 ++-- src/unit/custom_matchers.hpp | 6 +++++- src/unit/test_bgiteration.cpp | 1 + 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/bgiteration.h b/src/bgiteration.h index ed361f3ca03..7be91e4534b 100644 --- a/src/bgiteration.h +++ b/src/bgiteration.h @@ -86,7 +86,7 @@ typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata); * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the * last item is read. */ -bgIterator * bgIteratorCreateFullScanIter( +bgIterator *bgIteratorCreateFullScanIter( const char *name, int flags, bgIteratorReplDoneFunc repldone, @@ -115,7 +115,7 @@ bgIterator * bgIteratorCreateFullScanIter( * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the * last item is read. 
*/ -bgIterator * bgIteratorCreateSlotsIter( +bgIterator *bgIteratorCreateSlotsIter( const char *name, int flags, const int *slots, diff --git a/src/hashtable.c b/src/hashtable.c index 5fefc0b45a2..b5791f347f0 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -2246,7 +2246,7 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { /* Check if iterator has been invalidated */ if (iter->hashtable == NULL) return false; - // clang format off + // clang-format off while (1) { if (iter->index == -1 && iter->table == 0) { /* It's the first call to next. */ @@ -2326,7 +2326,7 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { } return true; } - // clang format on + // clang-format on return false; } diff --git a/src/unit/custom_matchers.hpp b/src/unit/custom_matchers.hpp index 2d9c8193d29..9aa7dfc7454 100644 --- a/src/unit/custom_matchers.hpp +++ b/src/unit/custom_matchers.hpp @@ -14,7 +14,11 @@ MATCHER_P(robjEqualsStr, str, "robj string matcher") { assert(arg->type == OBJ_STRING); assert(sdsEncodedObject(arg)); - return strcmp(static_cast(objectGetVal(arg)), str) == 0; + + if (strcmp(static_cast(objectGetVal(arg)), str) == 0) return true; + + *result_listener << "robj(\"" << (char*)objectGetVal(arg) << "\") doesn't match \"" << str << "\""; + return false; } #endif // _CUSTOM_MATCHERS_HPP_ diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index 21b5da11679..a322faae301 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -1,6 +1,7 @@ // Just for the moment, until https://github.com/valkey-io/valkey/issues/3450 is resolved // clang-format off // The hashtable uses different ordering for 32-bit. Need consider mocking. 
+#include #if SIZE_MAX == UINT64_MAX /* 64-bit version */ //#include #include "generated_wrappers.hpp" From 243a170bbcbaf65ce6bd3f02578c522f86cf3267 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 19 May 2026 16:38:12 +0000 Subject: [PATCH 14/40] Forkless Save Signed-off-by: Jim Brunner --- src/unit/custom_matchers.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unit/custom_matchers.hpp b/src/unit/custom_matchers.hpp index 9aa7dfc7454..edc83bf33ea 100644 --- a/src/unit/custom_matchers.hpp +++ b/src/unit/custom_matchers.hpp @@ -17,7 +17,7 @@ MATCHER_P(robjEqualsStr, str, "robj string matcher") { if (strcmp(static_cast(objectGetVal(arg)), str) == 0) return true; - *result_listener << "robj(\"" << (char*)objectGetVal(arg) << "\") doesn't match \"" << str << "\""; + *result_listener << "robj(\"" << (char *)objectGetVal(arg) << "\") doesn't match \"" << str << "\""; return false; } From 239126022d4a66db227efea032115a73f4357e29 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 19 May 2026 17:56:20 +0000 Subject: [PATCH 15/40] Forkless Save Signed-off-by: Jim Brunner --- src/unit/test_bgiteration.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index a322faae301..d56f5da80ad 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -2184,6 +2184,7 @@ TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent_FutureKeyCreatedThen expectReadKeySequence(it, 9, LAST_ITEM); expectReadComplete(it); + freeTestClient(c); } From df33d897cc27bd309ec84ab7f33643d325df07c9 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Wed, 20 May 2026 22:00:26 +0000 Subject: [PATCH 16/40] Forkless Save Signed-off-by: Jim Brunner --- src/unit/test_bgiteration.cpp | 3755 --------------------------------- 1 file changed, 3755 deletions(-) delete mode 100644 src/unit/test_bgiteration.cpp diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp deleted file 
mode 100644 index d56f5da80ad..00000000000 --- a/src/unit/test_bgiteration.cpp +++ /dev/null @@ -1,3755 +0,0 @@ -// Just for the moment, until https://github.com/valkey-io/valkey/issues/3450 is resolved -// clang-format off -// The hashtable uses different ordering for 32-bit. Need consider mocking. -#include -#if SIZE_MAX == UINT64_MAX /* 64-bit version */ -//#include -#include "generated_wrappers.hpp" -#include -//#include "amz_assert.h" - -// -// -// ## -// ######: ## -// #######: ## -// ## :## -// ## ## ##.#### .####: ##: :## #### .####: ## ## -// ## :## ####### .######: ## ## #### .######: ##. .## -// #######: ###. ##: :## :## ##: ## ##: :## #: ## :# -// ######: ## ######## ##..## ## ######## :#:.##.:#: -// ## ## ######## ##::## ## ######## # :##:## -// ## ## ## :####: ## ## ## ## ## -// ## ## ###. :# #### ## ###. :# ###::## -// ## ## .####### #### ######## .####### :##..##: -// ## ## .#####: :##: ######## .#####: .## ## -// -// -// -// -// -// -// -// .####. #### -// ###### #### -// :## ##: ## -// ##: :## ##.#### ## ## ## -// ## ## ####### ## :## ## -// ## ## ### :## ## ##: ##. -// ## ## ## ## ## ###:## -// ## ## ## ## ## .## # -// ##: :## ## ## ## ####. -// :## ##: ## ## ##: :### -// ###### ## ## ##### ## -// .####. ## ## .#### ##. -// :## -// ###: -// ### -// -// -// -// -// ### ## ## -// ### ## ## ## -// ###: ## ## ## -// #### ## .####. ####### ##.#### .####: :#### :###.## ## ## -// ##:#: ## .######. ####### ####### .######: ###### :####### :## ## -// ## ## ## ### ### ## ###. ##: :## #: :## ### ### ##: ##. -// ## ## ## ##. .## ## ## ######## :##### ##. .## ###:## -// ## :#:## ## ## ## ## ######## .####### ## ## .## # -// ## #### ##. .## ## ## ## ## . ## ##. .## ####. -// ## :### ### ### ##. ## ###. :# ##: ### ### ### :### -// ## ### .######. ##### ## .####### ######## :####### ## -// ## ### .####. .#### ## .#####: ###.## :###.## ##. -// :## -// ###: -// ### -// -// -// -// ## -// :#### ## -// ##### ## -// ## -// ####### .####. 
##.#### ##.#### .####: ##: :## #### .####: ## ## -// ####### .######. ####### ####### .######: ## ## #### .######: ##. .## -// ## ### ### ###. ###. ##: :## :## ##: ## ##: :## #: ## :# -// ## ##. .## ## ## ######## ##..## ## ######## :#:.##.:#: -// ## ## ## ## ## ######## ##::## ## ######## # :##:## -// ## ##. .## ## ## ## :####: ## ## ## ## ## -// ## ### ### ## ## ###. :# #### ## ###. :# ###::## -// ## .######. ## ## .####### #### ######## .####### :##..##: -// ## .####. ## ## .#####: :##: ######## .#####: .## ## -// -// -// -// - - - -using namespace ::testing; - -extern "C" { - #include "stdlib.h" - #include "bgiteration.h" - #include "server.h" - //#include "serverassert.h" - #define using usingvar // compile hack - #include "module.h" - #undef using - extern hashtableType commandSetType; - extern dictType keylistDictType; - bool iteratorRepldoneFn(void *privdata); - void iteratorCleanupFn(bool terminated, void *privdata); - void bgIteration_feedIterators(void); - void createSharedObjects(void); - void hashtableDump(hashtable *ht); - void rehashStepExpand(hashtable *ht); // in hashtable.c (non-API) - void bgIteration_unitTestDisableCloning(void); - void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes); -} - - -// The private data is a pointer to arbitrary data. This value is used just to -// test that the correct value is passed through. -#define PRIVDATA reinterpret_cast(12345) - -// A bgIteration cleanup function used for testing. -int cleanupCount; -bool cleanupTerminated; -void iteratorCleanupFn(bool terminated, void *privdata) { - EXPECT_EQ(privdata, PRIVDATA); - cleanupCount++; - cleanupTerminated = terminated; -} - - -// A bgIteration repldone function used for testing. -int repldoneCount; -bool iteratorRepldoneFn(void *privdata) { - EXPECT_EQ(privdata, PRIVDATA); - repldoneCount++; - return true; -} - - -// A more complicated repldone function that can delay the replcation done condition. 
-bool isReplDoneReady; -bool iteratorRepldoneFnNotBeingReadyInitially(void *privdata) { - EXPECT_EQ(privdata, PRIVDATA); - // This is to test the behavior when Repl Done function is not ready to be executed. - if (!isReplDoneReady) { - isReplDoneReady = true; - return false; - } - repldoneCount++; - return true; -} - - -static const char *logfile = ""; - -/* Most of the bgIteration unit tests are based on a CMD instance with 2 DBs. There are 8 keys in - * each DB. The keys are named A0, B0, C0, D0, E0, F0, G0, H0 for DB-0 and A1, B1, C1, D1, E1, F1, - * G1, H1 for DB-1. There are a number of helper functions to simulate certain key modification - * actions within our test configuration. Note that this is isolated from the actual call to - * processCommand. - * - * Because most of bgIteration is based on an ordered processing of keys, it doesn't matter if we - * are simulating CMD or CME, full scan, or slot-based. The majority of tests are independent of - * these concerns. - * - * However, there are some tests which are are unique to these configurations and use a specialized - * derived class to handle the differences. We do not want to duplicate all of the tests for - * the different configurations, but we do want to ensure that each configuration works properly. 
- * - bgIterationTestCluster - handles tests unique to full scan in cluster mode - * - bgIterationTestClusterSlots - handles tests unique to cluster slot-based iteration - */ -class BgIterationTest : public ::testing::Test { - private: - static const int DB_COUNT = 2; - static const int ITEMS_PER_DB = 8; - - // This is the expected order of the keys when hashed - const char *keys[DB_COUNT][ITEMS_PER_DB] = {{"D0", "G0", "H0", "C0", "F0", "A0", "B0", "E0"}, - {"B1", "C1", "F1", "G1", "E1", "D1", "A1", "H1"}}; - - protected: - static const int TOTAL_ITEMS = DB_COUNT * ITEMS_PER_DB; - static const int LAST_ITEM = TOTAL_ITEMS - 1; - - MockValkey mock; - RealValkey real; - - struct serverCommand dummy_cmd = {0}; - - // Helper functions for accessing the keys. We can access by db(0..1) and seq(0..4) - // or by item number (0..9). - // NOTE: These virtual functions can be overridden in subclasses which may have different item layout. - virtual const char * getKeyAtDbSeq(int db, int seq) { - assert(db < DB_COUNT); - assert(seq < ITEMS_PER_DB); - return keys[db][seq]; - } - - virtual int getDbFromItemNum(int itemNum) { - assert(itemNum < DB_COUNT * ITEMS_PER_DB); - return itemNum / ITEMS_PER_DB; - } - - virtual int getSeqFromItemNum(int itemNum) { - assert(itemNum < DB_COUNT * ITEMS_PER_DB); - return itemNum % ITEMS_PER_DB; - } - - const char * keyStr(int itemNum) { - return getKeyAtDbSeq(getDbFromItemNum(itemNum), getSeqFromItemNum(itemNum)); - } - - int itemNumFromKey(const char * key) { - for (int itemNum = 0; itemNum < DB_COUNT * ITEMS_PER_DB; itemNum++) { - if (strcmp(key, keyStr(itemNum)) == 0) return itemNum; - } - return -1; - } - - - // Do some general initialization before starting the suite. Normally, the tests are run in - // isolation - and this isn't much different than SetUp(). But if running the - // entire test suite together (just manually running the test executable), this gets called - // only once. 
- static void SetUpTestSuite() { - monotonicInit(); - - bzero(&server, sizeof(server)); - server.hz = 100; - server.logfile = const_cast(logfile); - createSharedObjects(); - - moduleInitModulesSystem(); - - server.commands = hashtableCreate(&commandSetType); - server.orig_commands = hashtableCreate(&commandSetType); - populateCommandTable(); - } - - - static void TearDownTestSuite() { - hashtableRelease(server.commands); - hashtableRelease(server.orig_commands); - } - - - void initializeServerDb(int dbid, int slot_count_bits = 0) { - server.db[dbid] = static_cast(zcalloc(sizeof(serverDb))); - server.db[dbid]->id = dbid; - server.db[dbid]->keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, 0); - server.db[dbid]->expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, 0); - server.db[dbid]->watched_keys = dictCreate(&keylistDictType); - kvstoreExpand(server.db[dbid]->keys, 8, 0, NULL); - } - - - robj *createStringObjectFromCString(const char *s) { - return createStringObject(s, strlen(s)); - } - - - void addKeyToDb(int dbid, const char *key, const char *val) { - robj *key_obj = createStringObjectFromCString(key); - robj *val_obj = createStringObjectFromCString(val); - dbAdd(server.db[dbid], key_obj, &val_obj); - decrRefCount(key_obj); - } - - - virtual void setupDatabase() { - // For these unit tests, a standard database is constructed. The order of items in the - // hash table is important, and this is validated here. If the hash table - // implementation changes, we will find out quickly at this point. All other tests - // will become invalid! - - server.dbnum = 2; - server.cluster_enabled = false; - server.db = static_cast(zcalloc(sizeof(serverDb *) * server.dbnum)); - - for (int dbid = 0; dbid < server.dbnum; dbid++) { - initializeServerDb(dbid); - } - - // With hashtable, it can be difficult to get our keys spread across different buckets. - // Here we play with hashtable size and rehashing to get comfortable scenarios for testing. 
- // NOTE: If the hashtable bucketization changes, we'll need to evaluate the tests for - // changes. Since bgIteration processes a bucket at a time, we need to evaluate - // all the tests when bucketization changes. - // As an alternative, we could mock all of the hashtable activity, but it's better if we - // can use the real functionality as much as possible. - - kvstoreExpand(server.db[0]->keys, 16, 0, NULL); - addKeyToDb(0, "A0", "A0"); - addKeyToDb(0, "B0", "B0"); - addKeyToDb(0, "C0", "C0"); - addKeyToDb(0, "D0", "D0"); - addKeyToDb(0, "E0", "E0"); - addKeyToDb(0, "F0", "F0"); - addKeyToDb(0, "G0", "G0"); - addKeyToDb(0, "H0", "H0"); - hashtable *ht = kvstoreGetHashtable(server.db[0]->keys, 0); - hashtablePauseRehashing(ht); - - kvstoreExpand(server.db[1]->keys, 16, 0, NULL); - addKeyToDb(1, "A1", "A1"); - addKeyToDb(1, "B1", "B1"); - addKeyToDb(1, "C1", "C1"); - addKeyToDb(1, "D1", "D1"); - addKeyToDb(1, "E1", "E1"); - addKeyToDb(1, "F1", "F1"); - addKeyToDb(1, "G1", "G1"); - addKeyToDb(1, "H1", "H1"); - // Now, let's increase the size and start a rehash on the 2nd DB. This ensures that - // iteration is working even if a hashtable is in the middle of rehashing. We choose - // a 128 size so that rehashed keys all get unique buckets. - kvstoreExpand(server.db[1]->keys, 128, 0, NULL); - ht = kvstoreGetHashtable(server.db[1]->keys, 0); - rehashStepExpand(ht); // in hashtable.c (non-API) - rehashStepExpand(ht); // and rehash the 2nd bucket also - hashtablePauseRehashing(ht); - - // The bucketization should look like this. Remember that DB-1 is in - // the middle of a rehash, so it has 2 tables. 
- // - // DB: 0 SLOT: 0 - // Table 0, used 8, exp 2, top-level buckets 4, child buckets 0 - // Bucket 0:1 level:0 - // 0 h2 63, key "D0" - // 1 h2 a5, key "G0" - // 2 h2 ca, key "H0" - // Bucket 0:2 level:0 - // 0 h2 91, key "C0" - // 1 h2 88, key "F0" - // Bucket 0:3 level:0 - // 0 h2 b8, key "A0" - // 1 h2 f5, key "B0" - // 2 h2 13, key "E0" - // Table 1, used 0, exp -1, top-level buckets 0, child buckets 0 - // - // DB: 1 SLOT: 0 - // Table 0, used 3, exp 2, top-level buckets 4, child buckets 0 - // Bucket 0:0 level:0 <- rehashed into table 1 - // Bucket 0:1 level:0 <- rehashed into table 1 - // Bucket 0:2 level:0 - // 0 h2 18, key "B1" - // 1 h2 fd, key "C1" - // Bucket 0:3 level:0 - // 0 h2 6f, key "F1" - // Table 1, used 5, exp 5, top-level buckets 32, child buckets 0 - // Bucket 1:1 level:0 - // 0 h2 ad, key "G1" - // Bucket 1:5 level:0 - // 0 h2 0c, key "E1" - // Bucket 1:12 level:0 - // 0 h2 e9, key "D1" - // Bucket 1:17 level:0 - // 0 h2 36, key "A1" - // Bucket 1:29 level:0 - // 0 h2 9e, key "H1" - // Bucket 1:30 level:0 - - - // In case we need to debug... - // Used to generate comment above, showing bucketization. 
- if (0) debugPrintBucketInfo(); - - // Validate that the iteration order matches the expected order - for (int db = 0; db < server.dbnum; db++) { - ht = kvstoreGetHashtable(server.db[db]->keys, 0); - hashtableIterator *it = hashtableCreateIterator(ht, 0); - robj *next; - int i = 0; - while (hashtableNext(it, reinterpret_cast(&next))) { - ASSERT_THAT(next, robjEqualsStr(getKeyAtDbSeq(db, i++))); - } - hashtableReleaseIterator(it); - } - } - - - void SetUp() override { - server.main_thread_id = pthread_self(); - server.forkless_options_supported = 1; - objectSetMetadataSize(sizeof(bgIterationEntryMetadata)); - - bgIteration_unitTestDisableCloning(); - - setupDatabase(); - - EXPECT_CALL(mock, aeCreateTimeEvent(_,_,_,_,_)).WillRepeatedly(Return(0)); - bgIteration_init(); - - cleanupCount = 0; - repldoneCount = 0; - isReplDoneReady = false; - - // By default, in tests, we treat items as not having an expiration - //JHB EXPECT_CALL(mock, getExpire(_,_)).WillRepeatedly(Return(-1)); - - // By default, do nothing for these - EXPECT_CALL(mock, blockClientInUseOnKeys(_,_,_)).WillRepeatedly(Return()); - EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).WillRepeatedly(Return()); - - // By default, expect no permission issues - EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_,_,_,_,_,_)).WillRepeatedly(Return(ACL_OK)); - - //JHB EXPECT_CALL(mock, lookupCommandOrOriginal(_)).WillRepeatedly(Return(&dummy_cmd)); - } - - - void TearDown() override { - bgIteration_feedIterators(); // process returning stuff before deleting DB - bgIteration_feedIterators(); // in case an iterator was closed there might be more - for (int i = 0; i < server.dbnum; i++) { - if (server.db[i]->keys) kvstoreRelease(server.db[i]->keys); - if (server.db[i]->expires) kvstoreRelease(server.db[i]->expires); - dictRelease(server.db[i]->watched_keys); - zfree(server.db[i]); - } - zfree(server.db); - } - - - // void update_keys(const char **new_keys, int db, int len) { - // memcpy(keys[db], new_keys, len * sizeof(const 
char *)); - // } - - - - - - - // Deletes an item from the DB (often at the start of a test) - but does NOT notify - // bgIteration. bgIteration_keyDelete() should be explicitly called where needed. - void simpleDelItem(int itemNum) { - int db = getDbFromItemNum(itemNum); - - sds delKey = sdsnew(keyStr(itemNum)); - int rc = kvstoreHashtableDelete(server.db[db]->keys, 0, delKey); - ASSERT_EQ(rc, 1); - sdsfree(delKey); - } - - - // Find the actual dbEntry object by itemNum - dbEntry * getItem(int itemNum) { - int db = getDbFromItemNum(itemNum); - sds key = sdsnew(keyStr(itemNum)); - dbEntry *de = dbFind(server.db[db], key); - sdsfree(key); - return de; - } - - - // The test expects that the next item read will be BGITERATOR_ITEM_COMPLETE - void expectReadComplete(bgIterator *iter) { - bgIteration_feedIterators(); - bgIteratorItem *item = bgIteratorRead(iter); - EXPECT_EQ(item->type, BGITERATOR_ITEM_COMPLETE); - bgIteratorClose(iter); - - int oldCleanupCount = cleanupCount; - bgIteration_feedIterators(); - EXPECT_EQ(cleanupCount, oldCleanupCount + 1); - } - - - // The test is cleaning up and isn't validating the remaining cleanup - void expectAnythingCleanup(bgIterator *iter) { - while (true) { - bgIteration_feedIterators(); - bgIteratorItem *item = bgIteratorRead(iter); - if ((item->type == BGITERATOR_ITEM_COMPLETE - || item->type == BGITERATOR_ITEM_TERMINATED)) { - bgIteratorClose(iter); - break; - } - } - bgIteration_feedIterators(); // Recognize the closed iterator - EXPECT_EQ(cleanupCount, 1); - } - - - void expectDictEntryMetadataMatch(dbEntry *de1, dbEntry *de2) { - bgIterationEntryMetadata *dm1 = static_cast(objectGetMetadata(de1)); - bgIterationEntryMetadata *dm2 = static_cast(objectGetMetadata(de2)); - - EXPECT_NE(dm1, nullptr); - EXPECT_NE(dm2, nullptr); - EXPECT_EQ(dm1->iterator_epoch, dm2->iterator_epoch); - } - - - // Useful when debugging new tests. It reads/prints all remaining items then crashes. 
- void cleanupIteratorDebugPrint(bgIterator *iter) { - bool done = false; - printf("[DEBUG] Printing bgIterator '%s' items:\n", bgIteratorName(iter)); - while (!done) { - bgIteration_feedIterators(); - bgIteratorItem *item = bgIteratorRead(iter); - switch (item->type) { - case BGITERATOR_ITEM_DBENTRY: - { - auto obj = item->u.dbe.de; - const char * keyStr = objectGetKey(obj); - printf("Entry: %s -> %s [itemNum: %i]\n", - keyStr, - static_cast(objectGetVal(obj)), - itemNumFromKey(keyStr)); - break; - } - case BGITERATOR_ITEM_REPLICATION: - printf("Repl: DB=%d : ", item->dbid); - for (int i = 0; i < item->u.repl.argc; i++) - printf("%s ", static_cast(objectGetVal(item->u.repl.argv[i]))); - printf("\n"); - break; - case BGITERATOR_ITEM_COMPLETE: - case BGITERATOR_ITEM_TERMINATED: - bgIteratorClose(iter); - done = true; - break; - default: - printf("unhandled: %d\n", item->type); - } - } - bgIteration_feedIterators(); // Recognize the closed iterator - ASSERT_TRUE(false); // Halt the test here - } - - - // Make a copy of the metadata - void * cloneMetadata(dbEntry *de) { - int size = objectGetMetadataSize(de); - void *metadata = zmalloc(size); - memcpy(metadata, objectGetMetadata(de), size); - return metadata; - } - - - // Compare a previous metadata copy to an existing entry - void compareAndFreeClonedMetadata(dbEntry *de, void *metadata) { - EXPECT_EQ(memcmp(objectGetMetadata(de), metadata, objectGetMetadataSize(de)), 0); - zfree(metadata); - } - - - // The test expects the next item will be a specific key - // The item value is verified against the default unless provided as a parameter. 
- void expectReadKey(bgIterator *iter, int itemNum, const char *value=nullptr) { - int db = getDbFromItemNum(itemNum); - - bgIteration_feedIterators(); - bgIteratorItem *item = bgIteratorRead(iter); - bgIteration_feedIterators(); - - ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); - EXPECT_EQ(item->dbid, db); - EXPECT_FALSE(item->u.dbe.is_cloned); - // if (item->u.dbe.is_cloned) { // JHB - wrong place to check this. - // // If the entry is cloned, make sure we copied the metadata - // dbEntry *cloned_dbEntry = item->u.dbe.de; - // dbEntry *original_dbEntry = getItem(itemNum); - // expectDictEntryMetadataMatch(original_dbEntry, cloned_dbEntry); - // } - EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); - if (value) { - EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); - } else { - EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); - } - } - - - // The test expects the next item will be a specific key amd that the item is cloned. - // Metadata is tested (to make sure the clone includes the proper metadata). - // The item value is verified against the default unless provided as a parameter. - void expectReadClonedKey(bgIterator *iter, int itemNum, void *metadata, const char *value=nullptr) { - int db = getDbFromItemNum(itemNum); - - bgIteration_feedIterators(); - bgIteratorItem *item = bgIteratorRead(iter); - bgIteration_feedIterators(); - - ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); - EXPECT_EQ(item->dbid, db); - EXPECT_TRUE(item->u.dbe.is_cloned); - compareAndFreeClonedMetadata(item->u.dbe.de, metadata); - EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); - if (value) { - EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); - } else { - EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); - } - } - - - // Test expects the next key, but specified by key name, not itemNum. 
- void expectReadDbKeyValue(bgIterator *iter, int db, const char *key, const char *value) { - bgIteration_feedIterators(); - bgIteratorItem *item = bgIteratorRead(iter); - bgIteration_feedIterators(); - - ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); - EXPECT_EQ(item->dbid, db); - EXPECT_STREQ(objectGetKey(item->u.dbe.de), key); - EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); - } - - - // Test expect to read a sequence of key items - void expectReadKeySequence(bgIterator *iter, int startItem, int endItem) { - for (int i = startItem; i <= endItem; i++) expectReadKey(iter, i); - } - - - // Just like expectReadKey, but also tests that a previous item is becoming unblocked. - void expectReadKeyWithUnblock(bgIterator *iter, int itemNum, int unblockItem, const char *value=nullptr) { - bool blocked = true; - EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(unblockItem)))) - .WillOnce(Assign(&blocked, false)); - expectReadKey(iter, itemNum, value); - EXPECT_FALSE(blocked); - } - - - // Test expects to read a replication item matching the command help by client 'c' - void expectReadReplication(bgIterator *iter, client *c) { - bgIteration_feedIterators(); - bgIteratorItem *item = bgIteratorRead(iter); - bgIteration_feedIterators(); - - ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); - EXPECT_EQ(item->dbid, c->db->id); - EXPECT_EQ(item->u.repl.cmd, c->cmd); - EXPECT_EQ(item->u.repl.argc, c->argc); - for (int i = 0; i < c->argc; i++) { - EXPECT_STREQ(static_cast(objectGetVal(item->u.repl.argv[i])), - static_cast(objectGetVal(c->argv[i]))); - } - } - - - // We expect to read a MULTI command which should have been inserted. 
- void expectReadMultiReplication(bgIterator *iter) { - bgIteration_feedIterators(); - bgIteratorItem *item = bgIteratorRead(iter); - bgIteration_feedIterators(); - - ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); - EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("multi")); - } - - - // We expect to read an EXEC command which should have been inserted. - void expectReadExecReplication(bgIterator *iter) { - bgIteration_feedIterators(); - bgIteratorItem *item = bgIteratorRead(iter); - bgIteration_feedIterators(); - - ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); - EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("exec")); - } - - - // Expecting that a DEL command should have been replicated. - void expectReadReplicationDel(bgIterator *iter, int itemNum) { - int db = getDbFromItemNum(itemNum); - - bgIteration_feedIterators(); - bgIteratorItem *item = bgIteratorRead(iter); - bgIteration_feedIterators(); - - ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); - EXPECT_EQ(item->dbid, db); - EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("DEL")); - EXPECT_EQ(item->u.repl.argc, 2); - EXPECT_THAT(item->u.repl.argv[0], robjEqualsStr("DEL")); - EXPECT_THAT(item->u.repl.argv[1], robjEqualsStr(keyStr(itemNum))); - } - - - // Expecting that a special SWAPDB item has been inserted. - void expectReadSwapDB(bgIterator *iter, int db1, int db2) { - bgIteration_feedIterators(); - bgIteratorItem *item = bgIteratorRead(iter); - bgIteration_feedIterators(); - - ASSERT_EQ(item->type, BGITERATOR_ITEM_SWAPDB); - EXPECT_EQ(item->dbid, db1); - EXPECT_EQ(item->u.dbid2, db2); - } - - - // Used to examine the physical bucket layout in the hash table. Generated the comment - // above which shows each item in each bucket. Necessary if hash table layout changes. - void debugPrintBucketInfo(int num_slots = -1) { - for (int db = 0; db < server.dbnum; db++) { - int n = (num_slots == -1) ? 
kvstoreNumHashtables(server.db[db]->keys) : num_slots; - for (int slot = 0; slot < n; slot++) { - hashtable *ht = kvstoreGetHashtable(server.db[db]->keys, slot); - printf("DB: %d SLOT: %d\n", db, slot); - hashtableDump(ht); - } - } - } - - - // Creates a client with a write command (SET) for the given itemNum - client * getWriteClient(int itemNum, const char *value) { - int db = getDbFromItemNum(itemNum); - - client *c = static_cast(zcalloc(sizeof(client))); - - c->cmd = lookupCommandByCString("set"); - c->db = server.db[db]; - - c->argc = 3; - c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromCString(c->cmd->fullname); - c->argv[1] = createStringObjectFromCString(keyStr(itemNum)); - c->argv[2] = createStringObjectFromCString(value); - - return c; - } - - - // Create a client with a write command that touches multiple keys - client * getWriteMultiKeysClient( - const char * cmdName, - int dstItemNum, - const std::vector & srcItemsNum) { - - assert(!srcItemsNum.empty()); - - const int db = getDbFromItemNum(dstItemNum); - std::for_each(srcItemsNum.cbegin(), srcItemsNum.cend(), [&db, this](int srcItemNum) { - assert(db == getDbFromItemNum(srcItemNum)); - }); - - client *c = static_cast(zcalloc(sizeof(client))); - - c->cmd = lookupCommandByCString(cmdName); - assert(c->cmd != nullptr); - c->db = server.db[db]; - - c->argc = 2 + srcItemsNum.size(); - c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromCString(c->cmd->fullname); - c->argv[1] = createStringObjectFromCString(keyStr(dstItemNum)); - for (unsigned int i = 0; i < srcItemsNum.size(); i++) { - c->argv[2 + i] = createStringObjectFromCString(keyStr(srcItemsNum[i])); - } - - return c; - } - - - client * getWrite2KeysClient(const char * cmdName, int dstItemNum, int srcItemNum) { - return getWriteMultiKeysClient(cmdName, dstItemNum, {srcItemNum}); - } - - - client * getWrite3KeysClient( - const char * cmdName, int dstItemNum, int 
src1ItemNum, int src2ItemNum) { - return getWriteMultiKeysClient(cmdName, dstItemNum, {src1ItemNum, src2ItemNum}); - } - - - // Create a client with a MULTI/EXEC block. - // This parses a series of commands separated by ';' - // Example: getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx") - client * getMultiClient(const char *commands, int dbid = 0) { - char *commandsCopy = zstrdup(commands); // a mutable copy - char *commandStr, *commandStrSave; - char *token, *tokenSave; - - client *c = static_cast(zcalloc(sizeof(client))); - c->db = server.db[dbid]; - initClientMultiState(c); - c->flag.multi = 1; - c->mstate->cmd_flags |= CMD_WRITE; - - commandStr = strtok_r(commandsCopy, ";", &commandStrSave); - while (commandStr != NULL) { - - token = strtok_r(commandStr, " ", &tokenSave); - c->cmd = lookupCommandByCString(token); - - c->argv = static_cast(zcalloc(sizeof(robj*) * 5)); // command + 4 args - - for (int i = 0; token != NULL; i++) { - c->argv[i] = createStringObjectFromCString(token); - c->argc = i+1; - token = strtok_r(NULL, " ", &tokenSave); - } - - queueMultiCommand(c, 0); - freeClientArgv(c); - - commandStr = strtok_r(NULL, ";", &commandStrSave); - } - - c->cmd = lookupCommandByCString("exec"); - c->argc = 1; - c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromCString("EXEC"); - - zfree(commandsCopy); - return c; - } - - - // Initially, a MULTI client is set up to execute the EXEC command (which examines the - // contents of the multi/exec block). This function advances the client to begin executing - // the individual commands within the multi/exec block. 
- void advanceMultiClientToCommand(client *c, int cmdNum) { - assert(cmdNum >= 0 && cmdNum < c->mstate->count); - c->argc = c->mstate->commands[cmdNum].argc; - c->argv = c->mstate->commands[cmdNum].argv; - c->argv_len = c->mstate->commands[cmdNum].argv_len; - c->cmd = c->realcmd = c->mstate->commands[cmdNum].cmd; - } - - - // A client with a fictional command: - // SETGET - // - writes a value to the first key (making this CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY) - // - reads a second key - client * getSetGetClient(int itemNum1, const char *value1, int itemNum2) { - // Fictional command which writes to 1st key and reads the 2nd - int db = getDbFromItemNum(itemNum1); - assert(db == getDbFromItemNum(itemNum2)); // (this would be a testcase error) - - client *c = static_cast(zcalloc(sizeof(client))); - struct serverCommand *cmd - = static_cast(zcalloc(sizeof(struct serverCommand))); - - cmd->fullname = const_cast("SETGET"); - cmd->arity = 4; - cmd->flags = CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY; - - cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INDEX; - cmd->legacy_range_key_spec.bs.index.pos = 1; // firstkey - cmd->legacy_range_key_spec.fk.range.lastkey = -1; - cmd->legacy_range_key_spec.fk.range.keystep = 2; - - c->cmd = cmd; - c->db = server.db[db]; - - c->argc = 4; - c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromCString(cmd->fullname); - c->argv[1] = createStringObjectFromCString(keyStr(itemNum1)); - c->argv[2] = createStringObjectFromCString(value1); - c->argv[3] = createStringObjectFromCString(keyStr(itemNum2)); - - return c; - } - - - // Client with a fictional write command with no keys specified - client * getNoKeysWriteClient() { - // Fictional command which is marked WRITE, but has no keys. 
- client *c = static_cast(zcalloc(sizeof(client))); - struct serverCommand *cmd - = static_cast(zcalloc(sizeof(struct serverCommand))); - - cmd->fullname = const_cast("NOKEYSWRITE"); - cmd->arity = 1; - cmd->flags = CMD_WRITE; - - cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INVALID; // No keys - - c->cmd = cmd; - c->db = server.db[0]; - - c->argc = 1; - c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromCString(cmd->fullname); - - return c; - } - - - void freeClientArgv(client *c) { - for (int i = 0; i < c->argc; i++) decrRefCount(c->argv[i]); - zfree(c->argv); - c->argv = NULL; - c->argc = 0; - } - - - // During testing, we create some fake commands. This checks if the command is real or fake. - // A fake command is dynamically allocated and can be freed. Real commands are static. - bool isRealValkeyCommand(struct serverCommand *cmd) { - return lookupCommandByCString(cmd->declared_name); - } - - - void freeTestClient(client *c) { - freeClientMultiState(c); - freeClientArgv(c); - - if (!isRealValkeyCommand(c->cmd)) zfree(c->cmd); - - zfree(c); - } - - - // Simulate what happens when a write command is blocked - void simulateBlockedWrite(client *c, int expectedNumberBlockedKeys = 1) { - EXPECT_CALL(mock, blockClientInUseOnKeys(c,expectedNumberBlockedKeys,_)).Times(1); - bool blocked = bgIteration_blockClientIfRequired(c); - EXPECT_TRUE(blocked); - } - - - // Simulate what happens when a write command isn't blocked - void simulateUnblockedWrite(client *c) { - EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); - bool blocked = bgIteration_blockClientIfRequired(c); - EXPECT_FALSE(blocked); - } - - - // Simulate what happens when a write command is NOT blocked, because the key can be cloned - // and expedited. This requires a scenario where we would normally need to block the - // client so that bgIteration can process the item. 
- void simulateClonedWrite(bgIterator *it, client *c) { - bgIteratorStatus status; - bgIteratorGetStatus(it, &status); - unsigned long initialClones = status.dbentry_clones_queued; - - // Client should not get blocked - EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); - bool blocked = bgIteration_blockClientIfRequired(c); - EXPECT_FALSE(blocked); - - // Ensure that cloning took place - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentry_clones_queued, (initialClones + 1)); - - // Ensure that the real item isn't inuse (because we cloned it instead) - dbEntry *de = dbFind(c->db, static_cast(objectGetVal(c->argv[1]))); - ASSERT_FALSE(bgIteration_isEntryInuse(de)); - } - - - // Simulates what happens when a write command (SET) actually executes. This requires a - // scenario where we would NOT be blocked on the write. It actually alters the value of - // the key and updates the metadata. - void simulateUnblockedWriteWithModification(client *c) { - EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); - bool blocked = bgIteration_blockClientIfRequired(c); - EXPECT_FALSE(blocked); - - //dbFind(c->db, static_cast(objectGetVal(c->argv[1]))); JHB - - // Fake execution of the command - touch the iterator_epoch counter and swap the value - // We need to duplicate the value because setKey() can reallocate it. - robj *value = dupStringObject(c->argv[2]); - setKey(c, c->db, c->argv[1], &value, SETKEY_ADD_OR_UPDATE); - - // Let's make sure that setKey updated the iteration epoch (as it should have) - dbEntry *de = dbFind(c->db, static_cast(objectGetVal(c->argv[1]))); - bgIterationEntryMetadata *md = static_cast(objectGetMetadata(de)); - EXPECT_EQ(md->iterator_epoch, bgIteration_getEpoch()); - - bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); - } - - - // Simulate execution of a MULTI/EXEC transaction for a client `c` without blocking. - // It replays all queued commands and ensures replication matches a real transaction. 
- // command replication flag is revalidated when exec command is processed. - // This requires a scenario where we don't expect the client to be blocked. - void simulateUnblockedMultiExec(client *c) { - - // simulate EXEC command of the multi/exec client - simulateUnblockedWrite(c); - server.in_exec = 1; - - // If there are other commands, call both blockClientIfRequired and handleCommandReplication for each of the command. - for (int i = 0; i < c->mstate->count; i++) { - advanceMultiClientToCommand(c, i); - simulateUnblockedWrite(c); - - // Replicate MULTI if this is the first instruction inside MULTI/EXEC - if (i == 0) { - robj *argv[1]; - argv[0] = createStringObjectFromCString("multi"); - bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("multi"), 1, argv); - decrRefCount(argv[0]); - } - bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); - } - - // Call handleCommandReplication for EXEC - robj *argv[1]; - argv[0] = createStringObjectFromCString("EXEC"); - bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("exec"), 1, argv); - server.in_exec = 0; - decrRefCount(argv[0]); - } - - - // Simulate the expiration (active expiration) of a key. This is independent of command execution. - void simulateExpiration(int itemNum) { - ASSERT_NE(getItem(itemNum), nullptr); // Should be there before expire - - // NOTE: This seems weird, but Valkey propagates the delete before actually expiring the - // key. BgIterator expects this behavior and expects the key to exist when the - // DEL is received for propagation. 
- - // Send bgIteration the DEL - int db = getDbFromItemNum(itemNum); - sds sdsKey = sdsnew(keyStr(itemNum)); - robj *argv[2]; - argv[0] = createStringObjectFromCString("DEL"); - argv[1] = createStringObjectFromCString(sdsKey); - serverCommand *cmd = lookupCommandByCString("DEL"); - bgIteration_handleCommandReplication(db, cmd, 2, argv); - decrRefCount(argv[0]); - decrRefCount(argv[1]); - - bgIteration_keyDelete(db, sdsKey); - simpleDelItem(itemNum); // Simulate the actual del - - EXPECT_EQ(getItem(itemNum), nullptr); - sdsfree(sdsKey); - } - - - // Simulates an expiration, but validates behavior for an item inuse by bgIteration. - void simulateExpirationOfInuse(int itemNum) { - // An inuse item will have a refcount > 1. BgIteration should have incremented the - // refcount while it is inuse. - dbEntry *de = getItem(itemNum); - ASSERT_NE(de, nullptr); // Should be there before expire - EXPECT_TRUE(bgIteration_isEntryInuse(de)); - EXPECT_EQ(de->refcount, 2u); - - simulateExpiration(itemNum); - - // At this point, the item is removed from the DB, but still exists, and the refcount - // has been reduced to 1. This allows a background thread to continue using the item. - EXPECT_EQ(de->refcount, 1u); - } - - - // Simulates an expiration, but the item is a future item which will be expedited. - void simulateExpirationWithExpedite(int itemNum) { - // An inuse item will have a refcount > 1. BgIteration should have incremented the - // refcount while it is inuse. - dbEntry *de = getItem(itemNum); - ASSERT_NE(de, nullptr); // Should be there before expire - EXPECT_FALSE(bgIteration_isEntryInuse(de)); // Not yet inuse - EXPECT_EQ(de->refcount, 1u); - - simulateExpiration(itemNum); - - // At this point, the item is removed from the DB, but still exists, and the refcount - // has been reduced to 1. This allows a background thread to continue using the item. 
- EXPECT_TRUE(bgIteration_isEntryInuse(de)); // It's inuse now - EXPECT_EQ(getItem(itemNum), nullptr); // but it's not in the DB anymore - EXPECT_EQ(de->refcount, 1u); - } - - - // Simulate execution of a SWAPDB command - void simulateSwapDB(int dbid0, int dbid1) { - char dbStr[2] = {0}; - - client *c = static_cast(zcalloc(sizeof(client))); - - c->cmd = lookupCommandByCString("swapdb"); - c->db = server.db[0]; - - c->argc = 3; - c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromCString(c->cmd->fullname); - dbStr[0] = '0' + dbid0; - c->argv[1] = createStringObjectFromCString(dbStr); - dbStr[0] = '0' + dbid1; - c->argv[2] = createStringObjectFromCString(dbStr); - - bool blocked = bgIteration_blockClientIfRequired(c); - EXPECT_FALSE(blocked); // SWAPDB should never block - - // The real SWAP does more than this, but this is enough for unit tests - serverDb *aux = server.db[dbid0]; - server.db[dbid0] = server.db[dbid1]; - server.db[dbid1] = aux; - - bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv); - - freeTestClient(c); - } - - - // Simulate execution of a FLUSHDB or FLUSHALL command - void simulateFlushDB(int db, int anInUseItem) { - client *c = static_cast(zcalloc(sizeof(client))); - - if (db == -1) { - c->cmd = lookupCommandByCString("flushall"); - c->db = server.db[0]; - } else { - c->cmd = lookupCommandByCString("flushdb"); - c->db = server.db[db]; - } - - c->argc = 1; - c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromCString(c->cmd->fullname); - - dbEntry *de_in_use = getItem(anInUseItem); - EXPECT_EQ(de_in_use->refcount, 2u); - - bool blocked = bgIteration_blockClientIfRequired(c); - EXPECT_FALSE(blocked); // FLUSHDB should never block - - // The real FLUSH does more than this, but this is enough for unit tests - - // Now flush the items - for (int d = 0; d < server.dbnum; d++) { - if (db == -1 || db == d) { - kvstoreRelease(server.db[d]->keys); - 
server.db[d]->keys = NULL; - } - } - - EXPECT_EQ(de_in_use->refcount, 1u); - - // and replicate - - bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv); - - freeTestClient(c); - } -}; - -using BgIterationDeathTest = BgIterationTest; - - -TEST_F(BgIterationTest, dbIsOK) { - // Just run the setup/teardown code to make sure the DB is OK. -} - - -///////////////////////////////////////////////////// -// Simple Full-scan iterator tests -///////////////////////////////////////////////////// - -// A simple full scan that just checks basic flow. -TEST_F(BgIterationTest, createAndCleanup) { - bgIterator *it = bgIteratorCreateFullScanIter("simple", - 0, NULL, iteratorCleanupFn, PRIVDATA); - EXPECT_EQ(bgIteratorFind("simple"), it); - EXPECT_STREQ(bgIteratorName(it), "simple"); - - bgIteratorStatus status; - bgIteratorGetStatus(it, &status); - - EXPECT_EQ(status.dbentries_queued, 0u); - EXPECT_EQ(status.dbentries_processed, 0u); - EXPECT_EQ(status.replication_queued, 0u); - EXPECT_EQ(status.replication_processed, 0u); - EXPECT_EQ(status.swapdb_queued, 0u); - EXPECT_EQ(status.swapdb_processed, 0u); - EXPECT_EQ(status.flushdb_queued, 0u); - EXPECT_EQ(status.flushdb_processed, 0u); - - EXPECT_EQ(status.queue_length, 0u); - EXPECT_GT(status.queue_length_target, 0u); - - EXPECT_LT(status.runtime_ms, 5u); - EXPECT_EQ(status.current_item_ms, 0u); - - expectAnythingCleanup(it); - - EXPECT_EQ(bgIteratorFind("simple"), nullptr); -} - - -// Close client before reading anything -TEST_F(BgIterationTest, testClientCloseBeforeRead) { - bgIterator *it = bgIteratorCreateFullScanIter("simple", - 0, NULL, iteratorCleanupFn, PRIVDATA); - bgIteration_feedIterators(); - - bgIteratorClose(it); // Immediately close before reading - - bgIteration_feedIterators(); // Recognize the closed iterator - - // Check that the cleanup callback was executed properly - EXPECT_EQ(cleanupCount, 1); - EXPECT_TRUE(cleanupTerminated); -} - - -// Test that the full scan hits each item in the expected 
sequence. -TEST_F(BgIterationTest, orderedIteration) { - bgIterator *it = bgIteratorCreateFullScanIter("simple", - 0, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKeySequence(it, 0, LAST_ITEM); - - // Quick status check. At this point, item #9 hasn't been returned yet. - bgIteratorStatus status; - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentries_queued, static_cast(TOTAL_ITEMS)); - EXPECT_EQ(status.dbentries_processed, static_cast(TOTAL_ITEMS) - 1); - - expectReadComplete(it); // Returns item #9, and reads the completion item - - // Check that the cleanup callback was executed properly - EXPECT_EQ(cleanupCount, 1); - EXPECT_FALSE(cleanupTerminated); -} - - -// Test that two simultaneous iterations work properly. -TEST_F(BgIterationTest, twoOrderedIterations) { - bgIterator *it1 = bgIteratorCreateFullScanIter("simple1", - 0, NULL, iteratorCleanupFn, PRIVDATA); - bgIterator *it2 = bgIteratorCreateFullScanIter("simple2", - 0, NULL, iteratorCleanupFn, PRIVDATA); - EXPECT_EQ(bgIteratorFind("simple1"), it1); - EXPECT_EQ(bgIteratorFind("simple2"), it2); - - int it1Count = 0; - int it2Count = 0; - while (it1Count < TOTAL_ITEMS || it2Count < TOTAL_ITEMS) { - // Randomly read from either iterator - if ((rand() % 2) == 0) { - if (it1Count < TOTAL_ITEMS) expectReadKey(it1, it1Count++); - } else { - if (it2Count < TOTAL_ITEMS) expectReadKey(it2, it2Count++); - } - } - - // Nothing left but to read the final completions - expectReadComplete(it1); - EXPECT_EQ(cleanupCount, 1); - EXPECT_FALSE(cleanupTerminated); - expectReadComplete(it2); - EXPECT_EQ(cleanupCount, 2); - EXPECT_FALSE(cleanupTerminated); -} - - -///////////////////////////////////////////////////// -// MODIFY A FUTURE ITEM -// The next tests validate the basic pattern when a key, not yet iterated, is modified. -// Each variation of iteration flags is tested. -// Note that these tests execute without cloning (cloning is tested elsewhere). 
-///////////////////////////////////////////////////// - -// Modify a future item, without replication or consistency. -// Our expectation for this case is that the modification should proceed without blocking, the item -// shouldn't be expedited, and we will see the modified item once the iterator reaches it. -TEST_F(BgIterationTest, modFutureItem_NoReplication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). - // Fake a modification to a later key so that we can see if it gets processed out of order. - client *c = getWriteClient(6, "xxx"); - - // We DONT expect the client to be blocked - not consistent - simulateUnblockedWriteWithModification(c); - - // Now continue reading, 1, 2, 3, 4, 5 - expectReadKeySequence(it, 1, 5); - - // Let's validate that key 6 shows the new value - expectReadKey(it, 6, "xxx"); - - // Continue... - expectReadKeySequence(it, 7, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// Modify a future item, without replication but with consistency. (Like a SAVE operation) -// Our expectation for this case is that the modification SHOULD be blocked, as we have to save the -// the item in it's state before the modification. To reduce blocking time, the item should be -// moved to the head of the queue - there's no replication in this case, so out-of-order processing -// isn't a concern. -TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). 
- // Fake a modification to a later key so that we can see if it gets processed out of order. - client *c = getWriteClient(6, "xxx"); - // Since this is consistent, we will block the client, disallowing the write. - simulateBlockedWrite(c); - - // On a consistent iterator, the event is expedited in-front of items already in queue! - // Read key 6 out of order. - expectReadKey(it, 6); - - // Now, when we read key 1, key 6 is released back to Valkey, and the client will be unblocked. - expectReadKeyWithUnblock(it, 1, 6); - simulateUnblockedWriteWithModification(c); // Now the write can proceed - - // Continue... - expectReadKeySequence(it, 2, 5); - // 6 has already been processed - expectReadKeySequence(it, 7, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// Modify a future item, with replication but without consistency. (Like a Threadsave Full Sync operation) -// Our expectation for this case is that the modification should proceed without blocking, as the -// mode is inconsistent. We don't expect replication, as we haven't reached the item yet. We'll -// see the modified item later. -TEST_F(BgIterationTest, modFutureItem_YesReplication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). - // Fake a modification to a later key so that we can see if it gets processed out of order. - client *c = getWriteClient(6, "xxx"); - - // We DONT expect the client to be blocked - not consistent - simulateUnblockedWriteWithModification(c); - - // NOTE: Since we haven't reached this item yet, and consistency is not required, there's no - // need to replicate this command. So everything should wrap up just fine - we will see - // the new value when we get to it. 
- - // Now continue reading, 1, 2, 3, 4, 5 - expectReadKeySequence(it, 1, 5); - - // Let's validate that key 6 shows the new value - expectReadKey(it, 6, "xxx"); - - // Continue... - expectReadKeySequence(it, 7, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// There's no current use case for CONSISTENT with REPLICATION. It's included for completeness -// and to clarify the functionality of the design. However, if this combination were to be used, -// it would be invalid in the presence of SWAPDB. -TEST_F(BgIterationDeathTest, modFutureItem_YesReplication_YesConsistent_fail) { - // Note: This configuration (CONSISTENT with REPLICATION) is invalid unless in cluster mode. - // The issue is that with multiple database supporting SWAPDB creates a problem. How is it - // possible to maintain a CONSISTENT view with a SWAPDB impacting the values seen in the - // replication stream? (Cluster mode doesn't support SWAPDB, so no issue there.) - EXPECT_DEATH(bgIteratorCreateFullScanIter("iter", BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, - NULL, NULL, NULL), ""); -} - - -///////////////////////////////////////////////////// -// MODIFY A CURRENT ITEM -// The next tests validate the basic pattern when a key, currently in use, is modified. -// Each variation of iteration flags is tested. -// Note that these tests execute without cloning (cloning is tested elsewhere). -///////////////////////////////////////////////////// - -// Modify a current item, without replication or consistency. -// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't -// be expedited (it's already in use). -TEST_F(BgIterationTest, modCurrentItem_NoReplication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // At this point, key 0 is read. 
Keys 1 & 2 are queued (they are all in the same bucket). - client *c = getWriteClient(2, "xxx"); - - // Must be blocked since key is queued - simulateBlockedWrite(c); - - // Now continue reading - expectReadKey(it, 1); - expectReadKey(it, 2); - expectReadKeyWithUnblock(it, 3, 2); - simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication) - - // Continue... - expectReadKeySequence(it, 4, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// Modify a current item, without replication but with consistency. (Like a SAVE operation) -// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't -// be expedited (it's already in use). -TEST_F(BgIterationTest, modCurrentItem_NoReplication_YesConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). - client *c = getWriteClient(2, "xxx"); - - // Must be blocked since key is queued - simulateBlockedWrite(c); - - // Now continue reading - expectReadKey(it, 1); - expectReadKey(it, 2); - expectReadKeyWithUnblock(it, 3, 2); - simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication) - - // Continue... - expectReadKeySequence(it, 4, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// Modify a current item, with replication but without consistency. (Like a Threadsave Full Sync operation) -// Our expectation for this case is that the modification SHOULD be blocked. After the key is processed, -// the write will proceed, and the replication will be sent. 
-TEST_F(BgIterationTest, modCurrentItem_YesReplication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). - client *c = getWriteClient(2, "xxx"); - - // Must be blocked since key is queued - simulateBlockedWrite(c); - - // Now continue reading - expectReadKey(it, 1); - expectReadKey(it, 2); - expectReadKeyWithUnblock(it, 3, 2); - simulateUnblockedWriteWithModification(c); // the actual write will cause replication - - expectReadKey(it, 4); // 4 got put in queue when 3 was read - - expectReadReplication(it, c); - - // Continue... - expectReadKeySequence(it, 5, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -#ifdef CODE_NOT_READY_YET -TEST_F(BgIterationTestCluster, modCurrentItem_YesReplication_YesConsistent_cluster) { - // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, - NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // At this point, key 0 is read. All other keys are queued. - client *c = getWriteClient(1, "xxx"); - - // Since this is consistent, we will block the client, disallowing the write. - simulateBlockedWrite(c); - - // Not expedited because item is already in queue - expectReadKey(it, 1); - expectReadKeyWithUnblock(it, 2, nullptr, 1); // reading original/unmodified item - simulateUnblockedWriteWithModification(c); - - expectReadKey(it, 3); // 2, 3 & 4 are in the same bucket, so the replication comes after - expectReadKey(it, 4); - expectReadReplication(it, c); - - // Continue... 
- expectReadComplete(it); - freeTestClient(c); -} -#endif - - -///////////////////////////////////////////////////// -// MODIFY A PAST ITEM -// The next tests validate the basic pattern when a key, not yet iterated on, is modified. -// Each variation of iteration flags is tested. -// Note that these tests execute without cloning (cloning is tested elsewhere). -///////////////////////////////////////////////////// - -// Modify a past item, without replication or consistency. -// Our expectation for this case is that the modification should proceed without blocking. -// No replication is generated and keys are processed similar to no modification. -TEST_F(BgIterationTest, modPastItem_NoReplication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // This read returns key 0 (making it a past item) - expectReadKey(it, 1); - - // At this point, key 0 is returned. - client *c = getWriteClient(0, "xxx"); - simulateUnblockedWriteWithModification(c); - - // Continue... - expectReadKeySequence(it, 2, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// Modify a past item, without replication but with consistency. (Like a SAVE operation) -// Our expectation for this case is that the modification should proceed without blocking. -// No replication is generated and keys are processed similar to no modification. -TEST_F(BgIterationTest, modPastItem_NoReplication_YesConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // This read returns key 0 (making it a past item) - expectReadKey(it, 1); - - // At this point, key 0 is returned. - client *c = getWriteClient(0, "xxx"); - simulateUnblockedWriteWithModification(c); - - // Continue... 
- expectReadKeySequence(it, 2, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// Modify a past item, with replication but without consistency. (Like a Threadsave Full Sync operation) -// Our expectation for this case is that the modification should proceed without blocking. -// Replication will be sent. -TEST_F(BgIterationTest, modPastItem_YesReplication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // This read returns key 0 (making it a past item) - expectReadKey(it, 1); - - // At this point, key 0 is returned. - client *c = getWriteClient(0, "xxx"); - simulateUnblockedWriteWithModification(c); - - // Key 2 was already in queue (same bucket as key 1). The replication will follow. - expectReadKey(it, 2); - expectReadReplication(it, c); - - // Continue... - expectReadKeySequence(it, 3, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -#ifdef CODE_NOT_READY_YET -TEST_F(BgIterationTestCluster, modPastItem_YesReplication_YesConsistent_cluster) { - // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, - NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // This read returns key 0 (making it a past item) - expectReadKey(it, 1); - - // At this point, key 0 is returned. - client *c = getWriteClient(0, "xxx"); - simulateUnblockedWriteWithModification(c); - - // Keys 2, 3, and 4 were already in queue. The replication will follow. 
- expectReadKey(it, 2); - expectReadKey(it, 3); - expectReadKey(it, 4); - expectReadReplication(it, c); - - expectReadComplete(it); - freeTestClient(c); -} -#endif - - -///////////////////////////////////////////////////// -// TESTS FOR ITEM CLONING -///////////////////////////////////////////////////// - -// In a consistent iteration, verify that a simple string is properly cloned, and that a write can -// occur without blocking. Validate the cloned item and metadata. -TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent_CloneExpeditedItem) { - // Initialize cloning configurations. - bgIteration_unitTestEnableCloning(50, 100); - - bgIteratorStatus status; - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). - // Fake a modification to a later key so that we can see if it gets processed out of order. - client *c = getWriteClient(6, "xxx"); - - // Quick status check. At this point, no clones exist yet. - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentry_clones_queued, 0u); - - // Since item 6 should be cloned, it will not block the client, allowing the write. - void *de6_md = cloneMetadata(getItem(6)); - simulateClonedWrite(it, c); // This wouldn't block, and queues the cloned value - simulateUnblockedWriteWithModification(c); // This modifies the real entry in the de (touching metadata) - - // At this point, one clone is in the queue. - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentry_clones_queued, 1u); - - // On a consistent iterator, the event is expedited in-front of items already in queue! - // Read key 6 (which is cloned) out of order. The value will still match the key. - expectReadClonedKey(it, 6, de6_md); // Also validates and frees the metadata - - // Quick status check. 
At this point, cloned items have not been marked as processed yet. - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentry_clones_processed, 0u); - - // Reading key 1 will release key 6, and the clone will finish processing. - expectReadKey(it, 1); - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentry_clones_processed, 1u); - - // Now, when we read key 2 should not have an impact on number of processed clones. - expectReadKey(it, 2); - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentry_clones_processed, 1u); - - // Continue... - expectReadKeySequence(it, 3, 5); - // 6 has already been processed - expectReadKeySequence(it, 7, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// Check that cloning for simple strings is respecting the size limits and pool size. On a -// consistent iteration, we expect to block or clone on all future keys. We validate that we can -// clone if the item is small enough and the cloning pool has more space left. -TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent_LargeItemOrClonePoolFull) { - // Initialize cloning configurations to test the clone pool functionality first. - bgIteration_unitTestEnableCloning(50, 50); - - bgIteratorStatus status; - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). - // Fake a modification to a later key so that we can see if it gets processed out of order. - client *c6 = getWriteClient(6, "xxx"); - client *c7 = getWriteClient(7, "xxx"); - client *c8 = getWriteClient(8, "xxx"); - - // Quick status check. At this point, no clones exist yet. - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentry_clones_queued, 0u); - - // Since item 6 should be cloned, it will not block the client, allowing the write. 
- void *de6_md = cloneMetadata(getItem(6)); - simulateClonedWrite(it, c6); - simulateUnblockedWriteWithModification(c6); - - // At this point, one clone is in the queue. - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentry_clones_queued, 1u); - - // Now that cloning pool is full, item 7 will not be cloned and the client will be blocked. - simulateBlockedWrite(c7); - ASSERT_TRUE(bgIteration_isEntryInuse(getItem(7))); - - // There is still only one cloned item in the queue. - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentry_clones_queued, 1u); - - // Now change cloning configurations to test that large items will not be cloned. We adjust - // the clone pool size to allow two items, but set the maximum item size to be smaller than - // the size of item 8. The clone pool size must be larger than the total size of the existing - // clones plus the maximum item clone size. - bgIteration_unitTestEnableCloning(1, 101); - - // This write will pass the clone pool check but fail the item size check, blocking the client. - simulateBlockedWrite(c8); - ASSERT_TRUE(bgIteration_isEntryInuse(getItem(8))); - - // On a consistent iterator, the expedited item in-front of items already in queue! - // Read key 6 out of order. - expectReadClonedKey(it, 6, de6_md); - - // Now, when we expect to read key 7, which was expedited, key 6 will be released back to Valkey - // and the clone will be deallocated here. - expectReadKey(it, 7); - - // Now, when we read key 8, which was expedited, key 7 is released back to Valkey, and the client - // will be unblocked. - // (actually, unblock is called after every key [just in case] - but functionally we only care - // about this one) - expectReadKeyWithUnblock(it, 8, 7); - simulateUnblockedWriteWithModification(c7); - - // Now, when we read key 1, key 8 is released back to Valkey, and the client will be unblocked. 
- expectReadKeyWithUnblock(it, 1, 8); - simulateUnblockedWriteWithModification(c8); - - // Since only one item was cloned, there should be one clone processed - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentry_clones_processed, 1u); - - // Continue... - expectReadKeySequence(it, 2, 5); - // 6, 7, and 8 have already been processed - expectReadKeySequence(it, 9, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c6); - freeTestClient(c7); - freeTestClient(c8); -} - - -///////////////////////////////////////////////////// -// TESTS RELATED TO MODIFICATION OF TWO ITEMS -// When 2 keys are modified, we need to ensure that both keys have been sent before we can send -// replication. This means that if replication is present, we may have to block/expedite for -// future keys, even in the inconsistent scenario. -///////////////////////////////////////////////////// - -// Replication enabled, but NOT consistent. In this case, if ANY of the keys have been iterated, -// ALL of the keys must be replicated so that the command can be processed properly on the replica. -TEST_F(BgIterationTest, modPastFutureItem_YesReplication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). - // DB1 has lots of buckets. After reading item 9, - // 8 will be past, 10 will be in queue, 11-15 will be future. - expectReadKeySequence(it, 0, 9); - - // We're going to write to key 8 (past) and read from key 12 (future) - // Even though key 12 is for READ in this command, it must be expedited so that it exists before - // the associated replication is sent. - client *c = getSetGetClient(8, "xxx", 12); - simulateBlockedWrite(c); - - // Key 12 will be expedited, but not in front of existing items in queue (can only do that for - // consistent iterators) - JHB How about cluster mode? 
- - expectReadKey(it, 10); - expectReadKey(it, 12); // expedited - expectReadKeyWithUnblock(it, 11, 12); // 13 is now in queue - - simulateUnblockedWriteWithModification(c); - - // Continue... - expectReadKey(it, 13); - expectReadReplication(it, c); - - expectReadKeySequence(it, 14, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// Replication NOT enabled. A read-only key doesn't need to be expedited, even if other keys have -// been processed already. (This should work identically for both consistent/non-consistent. -TEST_F(BgIterationTest, modPastFutureItem_NoReplication_YesConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter1", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). - // DB1 has lots of buckets. After reading item 9, - // 8 will be past, 10 will be in queue, 11-15 will be future. - expectReadKeySequence(it, 0, 9); - - // We're going to write to key 8 (past) and read from key 12 (future) - // Since there's no replication, we don't have to worry about expediting 12. The write will - // proceed without blocking. - client *c = getSetGetClient(8, "xxx", 12); - simulateUnblockedWriteWithModification(c); - - // Key 12 will not be expedited. Remaining keys should be received in normal order. - expectReadKeySequence(it, 10, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - -TEST_F(BgIterationTest, modPastFutureItem_NoReplication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter2", - 0, NULL, iteratorCleanupFn, PRIVDATA); - - // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). - // DB1 has lots of buckets. After reading item 9, - // 8 will be past, 10 will be in queue, 11-15 will be future. 
- expectReadKeySequence(it, 0, 9); - - // We're going to write to key 8 (past) and read from key 12 (future) - // Since there's no replication, we don't have to worry about expediting 12. The write will - // proceed without blocking. - client *c = getSetGetClient(8, "xxx", 12); - simulateUnblockedWriteWithModification(c); - - // Key 9 will not be expedited. Remaining keys should be received in normal order. - expectReadKeySequence(it, 10, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -///////////////////////////////////////////////////// -// TESTS RELATED TO MISSING ITEMS -// Missing items are tricky. A missing item might be logically located in the past or future, in -// relation to the current iteration position. The command may (or may not) create the "missing" -// key. Some general considerations: -// * In a consistent iteration, a missing key didn't exist at the time of consistency, or it was -// already processed (saved) at the time of the deletion. If the missing key gets created, we -// must be sure to skip it if we later iterate over it. -// * In a non-consistent iteration with replication: -// * If the key location is already passed, the replication is sent, allowing the key to be -// created (or not) based on the replication. -// * If the key location is in the future, we can allow the command to proceed, without -// replication. If the key is created, we will process it when the iterator gets to it. 
-// -// We expect: -// no-repl, no-consist: past items are ignored - future items are processed when iterated -// no-repl, yes-consist: past items are ignored - future items are ignored -// yes-repl, no-consist: past item skipped, but replicated - future items are created by replication and skipped later -// yes-repl, yes-consist: past item skipped, but replicated - future items are processed when iterated -///////////////////////////////////////////////////// - -// no-repl, no-consist: creation of PAST item has no impact -TEST_F(BgIterationTest, missingPastItem_NoReplication_NoConsistent) { - simpleDelItem(0); // Delete the item before iterator creation - bgIterator *it = bgIteratorCreateFullScanIter("iter", - 0, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 1); - expectReadKey(it, 2); - - client *c = getWriteClient(0, "xxx"); - simulateUnblockedWriteWithModification(c); - - expectReadKeySequence(it, 3, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// no-repl, yes-consist: creation of PAST item has no impact -TEST_F(BgIterationTest, missingPastItem_NoReplication_YesConsistent) { - simpleDelItem(0); // Delete the item before iterator creation - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 1); - expectReadKey(it, 2); - - client *c = getWriteClient(0, "xxx"); - simulateUnblockedWriteWithModification(c); - - expectReadKeySequence(it, 3, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// yes-repl, no-consist: creation of a PAST item will be replicated -TEST_F(BgIterationTest, missingPastItem_YesReplication_NoConsistent) { - simpleDelItem(0); // Delete the item before iterator creation - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 1); - expectReadKey(it, 2); - expectReadKey(it, 3); - - client *c = getWriteClient(0, "xxx"); - 
simulateUnblockedWriteWithModification(c); // replication will be added after item 4 (3,4 in same bucket) - - expectReadKey(it, 4); - - expectReadReplication(it, c); - - expectReadKeySequence(it, 5, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -#ifdef CODE_NOT_READY_YET -// yes-repl, yes-consist: creation of a PAST item will be replicated -TEST_F(BgIterationTestCluster, missingPastItem_YesReplication_YesConsistent) { - // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode - simpleDelItem(0); // Delete the item before iterator creation - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, - NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 1); - expectReadKey(it, 2); - - client *c = getWriteClient(0, "xxx"); - simulateUnblockedWriteWithModification(c); // replication will be added after item 4 (2, 3, and 4 in same bucket) - - expectReadKey(it, 3); - expectReadKey(it, 4); - expectReadReplication(it, c); - - expectReadComplete(it); - freeTestClient(c); -} -#endif - - -// no-repl, no-consist: creation of FUTURE item is seen when reached by the iteration. -TEST_F(BgIterationTest, missingFutureItem_NoReplication_NoConsistent) { - // Using DB1 so we have lots of buckets - // Note: Choosing item 14 because it's in the portion of DB1 that's already rehashed. So we - // know that the item won't be moving when we re-add it. - simpleDelItem(14); // Delete the item before iterator creation - bgIterator *it = bgIteratorCreateFullScanIter("iter", - 0, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 0); - - const char * newValue = "xxx"; - client *c = getWriteClient(14, newValue); - simulateUnblockedWriteWithModification(c); - - expectReadKeySequence(it, 1, 13); - - // We expect to see item 14. - // Note that for an inconsistent DB view, it is logically undefined if this value is seen (or not). 
- // But as implemented, we should see it and the test is helpful to understand if/when the - // functionality changes. - expectReadKey(it, 14, newValue); - - expectReadKey(it, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// no-repl, yes-consist: creation of FUTURE item is ignored by consistent iteration. -TEST_F(BgIterationTest, missingFutureItem_NoReplication_YesConsistent) { - // Using DB1 so we have lots of buckets - // Note: Choosing item 14 because it's in the portion of DB1 that's already rehashed. So we - // know that the item won't be moving when we re-add it. - simpleDelItem(14); // Delete the item before iterator creation - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 0); - - client *c = getWriteClient(14, "xxx"); - simulateUnblockedWriteWithModification(c); - - expectReadKeySequence(it, 1, 13); - // Key 14 is missing - it didn't exist at start of consistent iteration - expectReadKey(it, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// yes-repl, no-consist: creation of FUTURE item is handled by the replication, and then the key is -// later skipped (treated like an early iteration case). -TEST_F(BgIterationTest, missingFutureItem_YesReplication_NoConsistent) { - // Using DB1 so we have lots of buckets - // Note: Choosing item 14 because it's in the portion of DB1 that's already rehashed. So we - // know that the item won't be moving when we re-add it. 
- simpleDelItem(14); // Delete the item before iterator creation - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 0); // Items 1 & 2 are in queue (same bucket) - - client *c = getWriteClient(14, "xxx"); - simulateUnblockedWriteWithModification(c); - - expectReadKeySequence(it, 1, 2); - - expectReadReplication(it, c); // Here's the replication creating item 14 - - expectReadKeySequence(it, 3, 13); - // We expect item 14 to be skipped, because it was created by the earlier replication - expectReadKey(it, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - -#ifdef CODE_NOT_READY_YET -TEST_F(BgIterationTestCluster, missingFutureItem_YesReplication_YesConsistent) { - // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode - simpleDelItem(4); - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, - NULL, iteratorCleanupFn, PRIVDATA); - - bgIteration_feedIterators(); // Make sure we get key 0 and 1 into the queue - - client *c = getWriteClient(4, "xxx"); - simulateUnblockedWriteWithModification(c); - - expectReadKey(it, 0); - expectReadKey(it, 1); - - expectReadReplication(it, c); - - expectReadKey(it, 2); - expectReadKey(it, 3); - - // The replication was read - we don't want to see the key now - #4 should be skipped - - expectReadComplete(it); - freeTestClient(c); -} -#endif - - -///////////////////////////////////////////////////// -// TESTS RELATED TO EXPIRATION -// Expiration can be tricky. When pre-evaluating a command with bgIteration_blockClientIfRequired, -// a key might exist, but be ready for expiration. Then, as the command executes, the key expires -// and gets deleted before the write operation. Consider SET K V. -// In the unexpired case, this appears to bgIteration as a single SET command (which replaces the value). 
-// In the expired case, bgIteration will receive a DEL followed by a SET. -// -// Another case is a READ command. A read command won't cause the client to be blocked. However, -// if the key is expired, this will cause a DEL. For consistent processing, this key might need to -// be expedited so that it can be processed before it gets deleted. In this case, the key is -// unlinked from the main Valkey dictionary, but the actual deletion is deferred. -///////////////////////////////////////////////////// - -TEST_F(BgIterationTest, expireKeys_NoReplication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - 0, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 0); - expectReadKey(it, 1); - - // At this point, key 1 is active, key 2 is in queue. - - simulateExpiration(0); // Past - we no longer care - simulateExpirationOfInuse(2); // Current - it's inuse - simulateExpiration(5); // Future - we don't care (non-consistent) - - expectReadKeySequence(it, 2, 4); - // key 5 has been deleted - expectReadKeySequence(it, 6, LAST_ITEM); - expectReadComplete(it); -} - - -TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 0); - expectReadKey(it, 1); - - // At this point, key 1 is active, key 2 is in queue. 
- - simulateExpiration(0); // Past - we expect replication - simulateExpirationOfInuse(2); // Current - it's inuse, but we expect replication - simulateExpiration(5); // Future - we don't care (non-consistent) - - expectReadKey(it, 2); // this was already queued - expectReadReplicationDel(it, 0); // Past item should replicate - expectReadReplicationDel(it, 2); // Current item should replicate - // Item 5 is a future item and doesn't need to replicate - - expectReadKeySequence(it, 3, 4); - // Item 5 has been deleted - expectReadKeySequence(it, 6, LAST_ITEM); - expectReadComplete(it); -} - - -TEST_F(BgIterationTest, expireKeys_NoReplication_YesConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 0); - expectReadKey(it, 1); - - // At this point, key 1 is active, key 2 is in queue. - - simulateExpiration(0); // Past - we no longer care - simulateExpirationOfInuse(2); // Current - we must defer - simulateExpirationWithExpedite(5); // Future - will become inuse and expedited for consistency - - expectReadKey(it, 5); // Expedited to front - - expectReadKeySequence(it, 2, 4); - // Item 5 has been deleted - expectReadKeySequence(it, 6, LAST_ITEM); - expectReadComplete(it); -} - - -// Special case during a non-consistent iteration with replication and expiration. -// 1. A future key is created (and processed by its replication) - considered early iterated -// 2. Later the key is expired and deleted during command processing (causes DEL to be sent) - no longer early iterated -// 3. The key is recreated as part of the command processing (and this command was replicated) - again early iterated -// 4. Finally, when we iterate to the key, it shouldn't be sent, because it was replicated in step 3. 
-TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent_FutureKeyCreatedThenExpiredDuringSet) { - simpleDelItem(8); // Start with a missing future item - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 0); // Get the iterator started - - client *c = getWriteClient(8, "xxx"); - simulateUnblockedWriteWithModification(c); // Not blocked because this is a future key (but we expect repl) - - // Now do it again, but break out the steps so that we can simulate an expiration - bool blocked = bgIteration_blockClientIfRequired(c); - EXPECT_FALSE(blocked); // Shouldn't be blocked because this is a future key - - // Now, as the SET command tries to execute, simulate that the key is expired. Expiration - // processing sends the replication FIRST! - robj *argv[2]; - argv[0] = createStringObjectFromCString("DEL"); - argv[1] = c->argv[1]; - serverCommand *cmd = lookupCommandByCString("DEL"); - bgIteration_handleCommandReplication(getDbFromItemNum(8), cmd, 2, argv); - decrRefCount(argv[0]); - - // Now the call to keyDelete happens (after the replication). - bgIteration_keyDelete(getDbFromItemNum(8), static_cast(objectGetVal(c->argv[1]))); - simpleDelItem(8); // Simulate the actual del - - // Now the SET will run, re-creating the item (which is still a future item) - // We need to duplicate the value because setKey() can reallocate it. 
- robj *value = dupStringObject(c->argv[2]); - setKey(c, c->db, c->argv[1], &(value), SETKEY_ADD_OR_UPDATE); - - // Finally, replication will be sent because this is creating a new key - bgIteration_handleCommandReplication(getDbFromItemNum(8), c->cmd, c->argc, c->argv); - - // Test that everything comes as expected - expectReadKeySequence(it, 1, 2); // All one bucket - queued after key 0 read - - expectReadReplication(it, c); // Repl from the first SET command - expectReadReplicationDel(it, 8); // This is the expected replication of the DEL from expire - expectReadReplication(it, c); // Repl from the second SET command (recreating deleted key) - - expectReadKeySequence(it, 3, 7); // continue with normal iteration - // KEY 8 SHOULD BE OMITTED - This was already replicated - expectReadKeySequence(it, 9, LAST_ITEM); - - expectReadComplete(it); - freeTestClient(c); -} - - -#ifdef CODE_NOT_READY_YET -///////////////////////////////////////////////////// -// THE REMAINING TESTS ARE GENERAL / UNCATEGORIZED -///////////////////////////////////////////////////// - -// Iteration can be terminated from the main thread or from the child client. -// This tests termination driven from the main thread. -TEST_F(BgIterationTest, earlyTerminationFromMain) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); - expectReadKey(it, 0); - - // At this point, keys 1 & 2 are in queue. A termination should release those keys. - bool blocked1 = true; - bool blocked2 = true; - // We expect no general unblocks, we account for each specific unblock below. - EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); - // We should expect to see unblock called for items 1-4, as they are released from the queue. 
- EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) - .WillOnce(Assign(&blocked1, false)); - EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) - .WillOnce(Assign(&blocked2, false)); - bgIteratorTerminate(it); // queues the items for release - EXPECT_TRUE(bgIteratorIsTerminating(it)); - bgIteration_feedIterators(); // actually performs the release - EXPECT_FALSE(blocked1); - EXPECT_FALSE(blocked2); - - bool blocked0 = true; - EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) - .WillOnce(Assign(&blocked0, false)); - bgIteratorItem *item = bgIteratorRead(it); - EXPECT_FALSE(blocked0); - EXPECT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); - - bgIteratorClose(it); // background thread completes the termination - - EXPECT_EQ(cleanupCount, 0); - bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function - EXPECT_EQ(cleanupCount, 1); - EXPECT_TRUE(cleanupTerminated); -} - - -// Iteration can be terminated from the main thread or from the child client. -// This tests termination driven from the child client (the background thread). -TEST_F(BgIterationTest, earlyTerminationFromChild) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 0); - - // At this point, keys 1 & 2 are in queue. A termination should release those keys. 
- bgIteratorClose(it); // background thread initiates the termination - EXPECT_TRUE(bgIteratorIsTerminating(it)); - - bool blocked0 = true; - bool blocked1 = true; - bool blocked2 = true; - // Expecting no extra unblocks - EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); - // We expect item 0 (the in progress item) to be released - EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) - .WillOnce(Assign(&blocked0, false)); - // We expect items 1-4 (the queued items) to be released - EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) - .WillOnce(Assign(&blocked1, false)); - EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) - .WillOnce(Assign(&blocked2, false)); - bgIteration_feedIterators(); - EXPECT_FALSE(blocked0); - EXPECT_FALSE(blocked1); - EXPECT_FALSE(blocked2); - EXPECT_EQ(cleanupCount, 1); - EXPECT_TRUE(cleanupTerminated); -} - - -// Edge case. Executing a command (like SUNIONSTORE) which REPLACES the first key and reads the -// second key. In this case, bgIteration will get notified of the key deletion during execution of -// SETUNIONSTORE. Given that both keys are in the future (not iterated yet), we'll allow the -// command to execute, unblocked. We won't replicate as we'll pick up the key when we get to it. -TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_keyDeletedDuringSetReplace) { - // Using DB1 so we have lots of buckets - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKeySequence(it, 0, 8); // 9 is in queue - - // Write command that has 2 keys. 1 existing key that we write to and 1 dependant future key. 
- client *c = getWrite2KeysClient("sunionstore", 12, 13); - - simulateUnblockedWrite(c); - - // Now the call to keyDelete happens - bgIteration_keyDelete(getDbFromItemNum(12), keyStr(12)); - simpleDelItem(12); // So simulate the actual del - - // Now the write will run, re-creating the item (which is still a future item) - const char * const newValueStr = "new value"; - robj *newValueRobj = createStringObjectFromCString(newValueStr); - setKey(c, c->db, c->argv[1], &newValueRobj, SETKEY_ADD_OR_UPDATE); - - // Finally, we are letting bgIteration know that the write command was executed - bgIteration_handleCommandReplication(getDbFromItemNum(12), c->cmd, c->argc, c->argv); - - // Since the write command was not replicated, we expect all the keys to be read in the normal - // order from the dictionary. - expectReadKeySequence(it, 9, 11); - expectReadKey(it, 12, newValueStr); - expectReadKeySequence(it, 13, LAST_ITEM); - - expectReadComplete(it); - freeTestClient(c); -} - - -// Edge case. When we have a new key which is created by a command, AND replication is enabled, we -// expect that we will replicate the command rather than serializing the key/value later. As an -// example, consider SUNIONSTORE A B. We want to create A by replicating the command. We don't -// want to have to process A as a key later on. But in this case, we can't run the command until -// B has been sent. We expect the command to be blocked while we send B. -TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_setNewKey_DependantFuture) { - // Using DB1 so we have lots of buckets - simpleDelItem(12); // Deleting key 12 to then create it with a write command - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKeySequence(it, 0, 8); // 9 is in queue - - // Write command that has 2 keys. 1 new key and 1 dependant future key. 
- client *c = getWrite2KeysClient("sunionstore", 12, 13); - - // We are simulating a new key in the dict. This command should block on the dependant key. - // This adds key 13 in the queue since the command depends on it. - simulateBlockedWrite(c); - - // Key 9 was already in the queue - expectReadKey(it, 9); - - // Key 13 is processed out of order since the write depends on it - expectReadKey(it, 13); - - // Reading key 10 will unblock key 13, allowing us to write. - expectReadKey(it, 10); - - // Now that key 13 was processed and released by the iterator, the write command can be executed. - simulateUnblockedWriteWithModification(c); - - // Key 11 was queued when we read key 10 - expectReadKey(it, 11); - - // The replication of the write command was enqueued after key 11 - expectReadReplication(it, c); - - // We shouldn't see key 12 - as that was processed via replication. - // We shouldn't see key 13 - as that was expedited earlier - - // Now resuming processing of dict entries - expectReadKeySequence(it, 14, LAST_ITEM); - - expectReadComplete(it); - freeTestClient(c); -} - - -// A new key is being created, but is dependent on another key which has already been processed. -// In this case, the command shouldn't be blocked. -TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_setNewKey_DependantPast) { - // Using DB1 so we have lots of buckets - simpleDelItem(12); // Deleting key 12 to then create it with a write command - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKeySequence(it, 0, 9); // 10 is in queue, done with 8 - - // Write command that has 2 keys. 1 new key and 1 dependant past key. - client *c = getWrite2KeysClient("sunionstore", 12, 8); - - // We are simulating a new key in the dict. - // This command should not block since the dependant key has already been processed. 
- simulateUnblockedWriteWithModification(c); - - // Key 10 was put in the queue before the write - expectReadKey(it, 10); - - expectReadReplication(it, c); - - expectReadKey(it, 11); - - // Key 12 should be missing - it was processed by replication - - expectReadKeySequence(it, 13, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - - -// A new key is being created, and has dependencies on 2 other keys - one already processed, one not. -// In this case, the command should be blocked so that the future key can be sent first. -TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_setNewKey_1DependantPast1DependantFuture) { - // Using DB1 so we have lots of buckets - simpleDelItem(12); // Deleting key 12 to then create it with a write command - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKeySequence(it, 0, 9); // 8 has been returned, 9 is active, 10 is in queue - - // Write command that has 1 new key and 2 dependencies (past/future) - client *c = getWrite3KeysClient("sunionstore", 12, 8, 13); - - // The write should be blocked, so that item 13 can be processed. - simulateBlockedWrite(c); - - expectReadKey(it, 10); // 10 was already in queue - expectReadKey(it, 13); // 13 was expedited since the write depends on it - EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(13)))).Times(1); - expectReadKey(it, 11); // Releases 13 so the command can execute - - simulateUnblockedWriteWithModification(c); - - expectReadKey(it, 14); // was queued when reading 11 (12 is missing, 13 was expedited) - - expectReadReplication(it, c); - - expectReadKey(it, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} - -// Test an edge case with the same (future) key being repeated in the command, like: -// SUNIONSTORE A B B -// In this test, A is a previously handled key, and B is a future key. We expect the future key B to -// be expedited (once). 
-TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_repeatedKey_1DependantPast1RepeatedFuture) { - // Using DB1 so we have lots of buckets - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKeySequence(it, 0, 9); // We're done with 8, and 10 is in queue - - // Write command that has 3 keys. 1 past key and 1 repeated key in the future. - client *c = getWrite3KeysClient("sunionstore", 8, 12, 12); - - // This command should block because 12 needs to be expedited. - simulateBlockedWrite(c); - - expectReadKey(it, 10); // was already in queue - expectReadKey(it, 12); // expedited - expectReadKey(it, 11); // releases 12 (unblocking the command) - - // Now that key 12 was processed and released by the iterator, the write command can be executed. - simulateUnblockedWriteWithModification(c); - - expectReadKey(it, 13); // queued when we read 11 - - expectReadReplication(it, c); - - // Now resuming processing of dict entries. - expectReadKeySequence(it, 14, LAST_ITEM); - expectReadComplete(it); - freeTestClient(c); -} -#endif -#ifdef CODE_NOT_READY_YET - - -TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_repeatedKey_1newKey1RepeatedFuture) { - // This tests the replication of a write command that creates a new key and depends on 1 other - // key which is repeated in the command. The repeated key is in the future. 
- // This test is meant to replicate this bug: https://issues.amazon.com/ELMO-46572 - - // Expected sequence of event for this test: - // ITEM: (0)'D0' : 'D0' - // BLCK?: (0)'sunionstore' 'E0' 'C0' 'C0' - // EARLY: (0)'C0' : 'C0' - // (blocked) - // ITEM: (0)'B0' : 'B0' - // ITEM: (0)'A0' : 'A0' - // BLCK?: (0)'sunionstore' 'E0' 'C0' 'C0' - // REPL?: (0)'sunionstore' 'E0' 'C0' 'C0' - // (queued) - // SKIPPING ITEM(early iterate): (0)'C0' : 'C0' - // ITEM: (1)'E1' : 'E1' - // ITEM: (1)'C1' : 'C1' - // ITEM: (1)'B1' : 'B1' - // ITEM: (1)'A1' : 'A1' - // ITEM: (1)'D1' : 'D1' - // SENDING COMPLETE - // CLEANUP FN (success) - - simpleDelItem(1); // Deleting key 1 to then create it with a write command - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - // Start with this to load 0 into the queue - but don't read 0 as that would load 1,2,3 into the queue! - bgIteration_feedIterators(); - - // Write command that has 3 keys. 1 new key and 1 repeated key in the future. - client *c = getWrite3KeysClient(1, 4, 4); - - // This command should block on key 4. - // This adds key 4 in the queue because: - // - the command depends on key 4 which hasn't been processed yet - // - the command depends on a new key (key 1). - simulateBlockedWrite(c); - - // Key 0 was already enqueued. - expectReadKey(it, 0); - - // Key 4 is processed out of order since the write depends on it - expectReadKey(it, 4); - - // Keys 2,3 are next in the queue (they are all in the same bucket). - // Only reading key 2 for now to release key 4 from the iterator. - expectReadKey(it, 2); - - // Now that key 4 was processed and released by the iterator, the write command can be executed. - simulateUnblockedWriteWithModification(c); - - // Key 3 is next in the queue (it was put in the queue at the same time as key 2). - expectReadKey(it, 3); - - // The replication of the write command was enqueued after keys 1,2,3. 
- expectReadReplication(it, c); - - // Now resuming processing of dict entries. - expectReadKeySequence(it, 5, 9); - - expectReadComplete(it); - freeTestClient(c); -} - - -TEST_F(BgIterationTest, writeWith3Keys_NoReplication_Consistent_repeatedKey_1DependantPast1RepeatedFuture) { - // This tests the replication of a write command that updates multiple keys and depends on a key - // which is repeated in the command. The repeated key is in the future and the other key is in - // the past. - - // Expected sequence of event for this test: - // ITEM: (0)'D0' : 'D0' - // BLCK?: (0)'blpop' 'D0' 'C0' 'C0' 'D0' - // EARLY_1: (0)'C0' : 'C0' - // (blocked) - // ITEM: (0)'E0' : 'E0' - // ITEM: (0)'B0' : 'B0' - // ITEM: (0)'A0' : 'A0' - // BLCK?: (0)'blpop' 'D0' 'C0' 'C0' 'D0' - // REPL?: (0)'blpop' 'D0' 'C0' 'C0' 'D0' - // SKIPPING ITEM(early iterate): (0)'C0' : 'C0' - // ITEM: (1)'E1' : 'E1' - // ITEM: (1)'C1' : 'C1' - // ITEM: (1)'B1' : 'B1' - // ITEM: (1)'A1' : 'A1' - // ITEM: (1)'D1' : 'D1' - // SENDING COMPLETE - // CLEANUP FN (success) - - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Start with this to load 0 into the queue - but don't read 0 as that would load 1,2,3 into the queue! - bgIteration_feedIterators(); - - // Write command that has 3 keys. 1 past key and 1 repeated key in the future. - // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a - // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). - client *c = getWriteMultiKeysClient(0, {4, 4, 0}, "blpop"); - - // This command should block on 2 keys (0 and 4), since: - // - key 0 is in use by the iterator (still in the queue since it has not been processed by the consumer yet) - // - key 4 is in the future - // This adds key 4 in the queue since the command depends on it and it hasn't been processed yet. 
- simulateBlockedWrite(c, 2); - - // Key 4 is processed out of order since the write depends on it. - // Key 4 is processed before key 0 even though key 0 was already in the queue - // because key 4 was enqueued as a priority item. - expectReadKey(it, 4); - - // Key 0 was already enqueued. - // Reading key 0 releases key 4 from the iterator. - expectReadKey(it, 0); - - // Keys 1,2,3 are next in the queue (they are all in the same bucket). - // Only reading key 1 for now to release key 0 from the iterator. - expectReadKey(it, 1); - - // Now that keys 4 and 0 were processed and released by the iterator, the write command can be executed. - simulateUnblockedWriteWithModification(c); - - // Keys 2, 3 are next in the queue (it was put in the queue at the same time as key 1). - expectReadKeySequence(it, 2, 3); - - // Now resuming processing of dict entries. - expectReadKeySequence(it, 5, 9); - - expectReadComplete(it); - freeTestClient(c); -} - - -TEST_F(BgIterationTest, writeWith3Keys_NoReplication_NoConsistent_repeatedKey_1repeatedNewKey) { - // This tests a write command that creates a new key where the new key is repeated in the - // command. The repeated key is in the future. 
- - // Expected sequence of event for this test: - // ITEM: (0)'D0' : 'D0' - // ITEM: (0)'A0' : 'A0' - // ITEM: (0)'B0' : 'B0' - // ITEM: (0)'E0' : 'E0' - // BLCK?: (0)'blpop' 'C0' 'D0' 'C0' 'D0' - // REPL?: (0)'blpop' 'C0' 'D0' 'C0' 'D0' - // ITEM: (0)'C0' : 'D0' - // ITEM: (1)'B1' : 'B1' - // ITEM: (1)'C1' : 'C1' - // ITEM: (1)'D1' : 'D1' - // ITEM: (1)'A1' : 'A1' - // ITEM: (1)'E1' : 'E1' - // SENDING COMPLETE - // CLEANUP FN (success) - - server.db[0]->keys->dtype->resizeAllowed = NULL; - kvstoreExpand(server.db[0]->keys, 32, 0, NULL); - hashtableRehash(server.db[0]->keys->hashtables[0], 32); - - // The table looks this way now: - // Table 0, used 5, exp 3, top-level buckets 8, child buckets 0 - // Bucket 0:0 level:0 - // 0 (empty) - // 1 (empty) - // 2 (empty) - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - // Bucket 0:1 level:0 - // 0 h2 63, key "D0" - // 1 (empty) - // 2 (empty) - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - // Bucket 0:2 level:0 - // 0 (empty) - // 1 (empty) - // 2 (empty) - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - // Bucket 0:3 level:0 - // 0 h2 b8, key "A0" - // 1 h2 f5, key "B0" - // 2 h2 13, key "E0" - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - // Bucket 0:4 level:0 - // 0 (empty) - // 1 (empty) - // 2 (empty) - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - // Bucket 0:5 level:0 - // 0 (empty) - // 1 (empty) - // 2 (empty) - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - // Bucket 0:6 level:0 - // 0 h2 91, key "C0" - // 1 (empty) - // 2 (empty) - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - // Bucket 0:7 level:0 - // 0 (empty) - // 1 (empty) - // 2 (empty) - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - - const char *new_keys[5] = {"D0", "A0", "B0", "E0", "C0"}; - update_keys(new_keys, 0, 5); - - simpleDelItem(4); // Deleting key 4 to then create it with a write command - bgIterator *it = 
bgIteratorCreateFullScanIter("iter", - 0, NULL, iteratorCleanupFn, PRIVDATA); - - // Getting started - // The first bucket is empty - bgIteration_feedIterators(); - expectReadKey(it, 0); - - // Key 1 is the next in the queue. - // Reading key 1 to release key 0 from the iterator. - expectReadKey(it, 1); - - // Write command that has 3 keys. 1 new repeated key and 1 key in the past. - // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a - // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). - client *c = getWriteMultiKeysClient(4, {0, 4, 0}, "blpop"); - - // The write command is not blocked since key 0 is not in use by the iterator - simulateUnblockedWriteWithModification(c); - - // Keys 2, 3 are next in the queue (it was put in the queue at the same time as key 1). - expectReadKeySequence(it, 2, 3); - - // Key 4 is now in the dict with the value of key 0. - expectReadKey(it, 4, keyStr(0)); - - // Processing the rest of the dict entries. - expectReadKeySequence(it, 5, 9); - - expectReadComplete(it); - freeTestClient(c); -} - -TEST_F(BgIterationTest, copyHandlesProperDb_Replication_NoConsistent) { - // In this test, the COPY command is copying from one DB to another. We will create the - // same key in both DBs. We make sure that the proper key is created via replication, and - // the proper key is created by iteration. - - // NOTE: Adding E0 to dict 1. Now there is a E0 in both dict 0 and dict 1. - addKeyToDb(1, "E0", "E0"); - - // The test: - // We will simulate (with DB0 selected): COPY D0 C0 DB 1 REPLACE - // This will overwrite DB1:C0 that was created above. - // Since DB0:D0 is the first iterated key we expect that DB1:C0 will be expedited. - // After DB1:C0 is "overwritten", it should be marked early iterate. - // We expect DB0:C0 to NOT be marked early iterate, and should get processed normally. 
- - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - // Start with this to load 0 (C0) into the queue - but don't read 0 as that would load 1,2,3 into the queue! - bgIteration_feedIterators(); - - // COPY C0 E0 DB 1 REPLACE - client *c = static_cast(zcalloc(sizeof(client))); - c->cmd = lookupCommandByCString("copy"); - c->db = server.db[0]; - c->argc = 6; - c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); - c->argv[0] = createStringObjectFromCString(c->cmd->fullname)); - c->argv[1] = createStringObjectFromCString("C0"); - c->argv[2] = createStringObjectFromCString("E0"); - c->argv[3] = createStringObjectFromCString("DB"); - c->argv[4] = createStringObjectFromCString("1"); - c->argv[5] = createStringObjectFromCString("REPLACE"); - - // This should block on 2 keys. DB0:C0 is in queue. DB1:E0 needs to be expedited. - simulateBlockedWrite(c, 2); - expectReadKey(it, 0); // DB0:C0 - expectReadDbKeyValue(it, 1, "E0", "E0"); // DB1:E0 is expedited - expectReadKey(it, 1); // (to release DB1:E0) - // Now keys 2 & 3 & 4 are in the queue - - simulateUnblockedWrite(c); // We shouldn't be blocked this time - - // Now, we'll simulate the actual activity of the COPY. DB1:C0 will be deleted in order to - // be overwritten. - bgIteration_keyDelete(1, sdsnew("E0")); - // At this point the key would actually be deleted and recreated by COPY (no need to actually do this) - - // And finally the replication (this should queue replication) - bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); - - // Now let's read everything... - expectReadKeySequence(it, 2, 4); // These were in queue already - expectReadReplication(it, c); // This is the new replication (creating DB1:C0) - - expectReadKeySequence(it, 5, 9); // These are all normal - - expectReadComplete(it); // At this point, we should be done. We should NOT see DB1:C0. 
- freeTestClient(c); -} - - -// Just check that termination with replication in queue works OK. -TEST_F(BgIterationTest, terminateWithReplication) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 0); - expectReadKey(it, 1); // makes sure we are done with key 0 (don't want to block) - - client *c = getWriteClient(0, "xxx"); - simulateUnblockedWriteWithModification(c); // Should replicate - freeTestClient(c); - - bgIteratorTerminate(it); - - bgIteratorItem *item = bgIteratorRead(it); - ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); - - bgIteratorClose(it); // background thread completes the termination - - bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function - EXPECT_EQ(cleanupCount, 1); - EXPECT_TRUE(cleanupTerminated); -} - - -// SWAPDB tests - Get ready for the mind-bend... - -TEST_F(BgIterationTest, swapDB_NoReplication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - 0, NULL, iteratorCleanupFn, PRIVDATA); - bgIteratorStatus status; - - // In the non-consistent iterator (without replication), items are identified with the DBID at - // the time they are placed into the queue. The SWAPDB event signals the change to the - // iterating process - and this is properly sequenced with the DB info for each item. - - expectReadKey(it, 0); - - // Keys 1,2,3, and 4 are in queue - simulateSwapDB(0, 1); // The swap event will be queued after item 3 - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.swapdb_queued, 1u); - EXPECT_EQ(status.swapdb_processed, 0u); - - expectReadKey(it, 1); - expectReadKey(it, 2); - expectReadKey(it, 3); - expectReadKey(it, 4); - - expectReadSwapDB(it, 0, 1); - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.swapdb_queued, 1u); - EXPECT_EQ(status.swapdb_processed, 0u); // still processing it... 
- - // Since we've seen the swap event, items now have the new DBID - expectReadDbKeyValue(it, 0, keyStr(5), keyStr(5)); // item 5 is in DB0 - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.swapdb_queued, 1u); - EXPECT_EQ(status.swapdb_processed, 1u); // done processing the swapdb - - // Keys 6 & 7 are in the queue - let's swap back! - simulateSwapDB(1, 0); // The swap event will be queued after item 7 - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.swapdb_queued, 2u); // 2nd one queued - EXPECT_EQ(status.swapdb_processed, 1u); - - expectReadDbKeyValue(it, 0, keyStr(6), keyStr(6)); // Still appears as DB0 - expectReadDbKeyValue(it, 0, keyStr(7), keyStr(7)); // Still appears as DB0 - - expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.swapdb_queued, 2u); - EXPECT_EQ(status.swapdb_processed, 1u); // still processing it... - - expectReadKey(it, 8); - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.swapdb_queued, 2u); - EXPECT_EQ(status.swapdb_processed, 2u); // done processing all swaps - - expectReadKey(it, 9); - expectReadComplete(it); -} - -TEST_F(BgIterationTest, swapDB_NoReplication_YesConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // In the consistent iterator (without replication) all items are presented to the iterating - // process using the DBID at the time of the iterator creation. No changes are evident. - - expectReadKey(it, 0); - - // Keys 1,2,3,4 are in queue - simulateSwapDB(0, 1); // The swap occurs, but the iterator sees no change - - expectReadKey(it, 1); - expectReadKey(it, 2); - expectReadKey(it, 3); - expectReadKey(it, 4); - - // Heck, let's go crazy with those swaps... 
- for (int itemNum = 5; itemNum <= 9; itemNum++) { - simulateSwapDB(0, 1); - expectReadKey(it, itemNum); - } - - expectReadComplete(it); -} - -TEST_F(BgIterationTest, swapDB_YesReplication_NoConsistent) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - // In the non-consistent iterator WITH replication, items are identified with the DBID at the - // time they are placed into the queue. The SWAPDB event signals the change to the iterating - // process - and this is properly sequenced with the DB info for each item. - - expectReadKey(it, 0); - - // Keys 1,2,3,4 are in queue - simulateSwapDB(0, 1); // The swap event will be queued after item 3 - - expectReadKey(it, 1); - expectReadKey(it, 2); - expectReadKey(it, 3); - expectReadKey(it, 4); - - expectReadSwapDB(it, 0, 1); // We should see a SWAPDB event - bgIteratorItem *item = bgIteratorRead(it); // followed by the associated replication - ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); - bgIteration_feedIterators(); - - // Since we've seen the swap event, items now have the new DBID - expectReadDbKeyValue(it, 0, keyStr(5), keyStr(5)); // item 5 is in DB0 - - // Keys 6 & 7 are in the queue - let's swap back! - simulateSwapDB(1, 0); // The swap event will be queued after item 7 - - expectReadDbKeyValue(it, 0, keyStr(6), keyStr(6)); // Still appears as DB0 - expectReadDbKeyValue(it, 0, keyStr(7), keyStr(7)); // Still appears as DB0 - - expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap - item = bgIteratorRead(it); - ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); - bgIteration_feedIterators(); - - expectReadKey(it, 8); - expectReadKey(it, 9); - expectReadComplete(it); -} - -// There is no test for swapDB_YesReplication_YesConsistent because this configuration is not -// permitted with multiple DBs (not permitted with swaps). 
- - -// FLUSHDB & FLUSHALL Tests -TEST_F(BgIterationTest, flushDB_flushAll) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 0); - expectReadKey(it, 1); - - // key 1 is active in the iterator - this key will be removed from the DB before flush. - // keys 2 & 3 & 4 are in queue - but will be returned to Valkey before the flush. These are yanked - // back by Valkey and will not be seen by iterator. - simulateFlushDB(-1, 1); - - bgIteratorItem *item = bgIteratorRead(it); - ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); - - bgIteratorClose(it); // background thread completes the termination - - bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function - EXPECT_EQ(cleanupCount, 1); - EXPECT_TRUE(cleanupTerminated); -} - -TEST_F(BgIterationTest, flushDB_flushOne) { - bgIterator *it1 = bgIteratorCreateFullScanIter("iter1", - 0, NULL, iteratorCleanupFn, PRIVDATA); - bgIterator *it2 = bgIteratorCreateFullScanIter("iter2", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - bgIteratorStatus status; - - // The test flushes DB0. This is half the data. Since <= half, a non-consistent iterator is - // allowed to proceed. But the consistent iterator will be terminated. - - expectReadKey(it1, 0); - expectReadKey(it2, 0); - expectReadKey(it1, 1); - expectReadKey(it2, 1); - - // key 1 is active in the iterator - this key will be removed from the DB before flush. - // keys 2 & 3 & 4 are in queue - but will be returned to Valkey before the flush. These are yanked - // back by Valkey and will not be seen by iterator. - simulateFlushDB(0, 1); - bgIteratorGetStatus(it1, &status); - EXPECT_EQ(status.flushdb_queued, 1u); - EXPECT_EQ(status.flushdb_processed, 0u); - - // Testing the non-consistent one continues... - // Everything already on the iterator queue should be preserved (deleted from the DB). - // Keys 2 & 3 & 4 are already queued (and preserved). 
- expectReadKey(it1, 2); - expectReadKey(it1, 3); - expectReadKey(it1, 4); - - bgIteratorItem *item = bgIteratorRead(it1); - ASSERT_EQ(item->type, BGITERATOR_ITEM_FLUSHDB); - ASSERT_EQ(item->dbid, 0); - bgIteratorGetStatus(it1, &status); - EXPECT_EQ(status.flushdb_queued, 1u); - EXPECT_EQ(status.flushdb_processed, 0u); // still processing it - - expectReadKey(it1, 5); - bgIteratorGetStatus(it1, &status); - EXPECT_EQ(status.flushdb_queued, 1u); - EXPECT_EQ(status.flushdb_processed, 1u); // done with all flushdb's - expectReadKey(it1, 6); - expectReadKey(it1, 7); - expectReadKey(it1, 8); - expectReadKey(it1, 9); - expectReadComplete(it1); - EXPECT_EQ(cleanupCount, 1); - EXPECT_FALSE(cleanupTerminated); - - // But the consistent iterator should be terminated - item = bgIteratorRead(it2); - ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); - bgIteratorClose(it2); // background thread completes the termination - bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function - EXPECT_EQ(cleanupCount, 2); - EXPECT_TRUE(cleanupTerminated); -} - -// Cluster mode, 2 iterators, CONSISTENT+REPLICATION and NONCONSISTENT+REPLICATION -// Modify a missing key. -TEST_F(BgIterationTestCluster, modMissingKey_2iter_cluster) { - // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode - // For this test, we only have 5 keys since not using DB[1]. Remove the last one. 
- simpleDelItem(4); - - bgIterator *it1 = bgIteratorCreateFullScanIter("iter1", - BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, - NULL, iteratorCleanupFn, PRIVDATA); - bgIterator *it2 = bgIteratorCreateFullScanIter("iter2", - BGITERATOR_FLAG_REPLICATION, - NULL, iteratorCleanupFn, PRIVDATA); - - client *c = getWriteClient(4, "xxx"); - simulateUnblockedWriteWithModification(c); // Wouldn't be blocked since key doesn't exist - - bgIteration_feedIterators(); // Prime the feed - key 0 and 1 are now enqueued - - // Process the consistent iteration - expectReadReplication(it1, c); // replication happened before feeding (should be 1st) - expectReadKeySequence(it1, 0, 3); - expectReadComplete(it1); - - // Process the non-consistent iteration - expectReadReplication(it2, c); // replication happened before feeding (should be 1st) - expectReadKeySequence(it2, 0, 3); - expectReadComplete(it2); - - freeTestClient(c); -} - -TEST_F(BgIterationTest, twoKeys_firstFuture) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, - NULL, iteratorCleanupFn, PRIVDATA); - - bgIteration_feedIterators(); // Prime the feed - key 0 - expectReadKey(it, 0); // Causes keys 1, 2, 3, 4 to be queued (same bucket) - expectReadKey(it, 1); // Causes key 0 to be released - - // This must replicate, because A0 is in the past. B1 (future) wouldn't need replication except - // for the modification to B1. We try to trip up bgIterator by giving a key that doesn't need - // replication except for the later command that does. Make this a little trickier by adding - // the set for A1 - unnecessary, but more clearly shows the expediting in progress. 
- client *c = getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx", 1); - - // The EXEC should block on 2 keys, because B1(5) & A1(8) should be expedited - simulateBlockedWrite(c, 2); - - expectReadKeySequence(it, 2, 4); // These were already in queue - - // Note - it would be OK if these 2 were reversed, but this is how the current algorithm works. - expectReadKey(it, 8); // Key 8 (A1) was expedited - expectReadKey(it, 5); // Key 5 (B1) was expedited - - // and clean up the rest... - expectReadKeySequence(it, 6, 7); - // Key 8 was already read above (expedited) - expectReadKey(it, 9); - expectReadComplete(it); -} - -TEST_F(BgIterationTest, multiBlocksOnFutureKey) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // At this point, key 0 is read. Keys 1,2,3,4 are queued (they are all in the same bucket). - // If we fake a modification to key 5, we won't know if it's handled out of order. - // So we fake a modification to key 6 - // Dummy up a MULTI... - client *c = getMultiClient("SET C1 xxx", 1); - - // Since this is consistent, we will block the client, disallowing the write. - simulateBlockedWrite(c); - freeTestClient(c); - - // C1 (key 6) will be expedited to the front of the list - expectReadKey(it, 6); - - // Now that we've read key 5, key 0 (C0) is passed and should not block - client *c2 = getMultiClient("SET C0 xxx"); - simulateUnblockedWrite(c2); - freeTestClient(c2); - - - expectReadKeySequence(it, 1, 5); - expectReadKeySequence(it, 7, 9); - expectReadComplete(it); -} - -TEST_F(BgIterationTest, multiNotReplicatedButDelRecreateAccess) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - // Scenario. We have a multi that doesn't need to be replicated because all of the keys exist - // but are all future keys. 
Note that missing keys are considered already-iterated, so all - // must exist for this test. Then: - // - we delete a key - // - we re-create the deleted (future) key - normally this would be replicated - // - we access another (future) key - we don't expect to get blocked! - - // We use DB 1 only because the hash table buckets are better broken up there. - client *c = getMultiClient("DEL A1; SET A1 xxx; SET E1 yyy", 1); - - // For DB[1]: - // Bucket 0:0 level:0 - // 0 h2 18, key "B1" - // 1 h2 fd, key "C1" - // 2 h2 e9, key "D1" - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - // Bucket 0:1 level:0 - // 0 h2 36, key "A1" - // 1 h2 0c, key "E1" - // 2 (empty) - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - - // Read through DB 0 and into DB 1 - expectReadKeySequence(it, 0, 5); // D0, E0, B0, A0, C0, B1 - // Now, C1 and D1 are in the queue (in use) and A1 & E1 are future - - // Now let's process the multi. Since A1 & D1 are both future (existing) items, we shouldn't - // block or replicate. - simulateUnblockedWrite(c); // the EXEC - - // Simulate the DEL A1 - server.in_exec = 1; // Simulate actual execution of the MULTI/EXEC - advanceMultiClientToCommand(c, 0); // DEL A1 - EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); - bool blocked = bgIteration_blockClientIfRequired(c); - EXPECT_FALSE(blocked); - simpleDelItem(8); - sds delKey = sdsnew(keyStr(8)); - bgIteration_keyDelete(1, delKey); - sdsfree(delKey); - bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); // shouldn't replicate - - // Simulate SET A1 - the key doesn't exist, and would normally replicate and mark early iterate, - // but this is in a transaction, and we are not replicating this transaction. 
- advanceMultiClientToCommand(c, 1); // SET A1 xxx - simulateUnblockedWriteWithModification(c); - - // Now write to another existing future key - this should work if we weren't confused by the DEL - advanceMultiClientToCommand(c, 2); // SET E1 yyy - simulateUnblockedWriteWithModification(c); - server.in_exec = 0; - - // Now we can continue iterating, and we should pick up keys 6-9. (and no replication!) - expectReadKeySequence(it, 6, 7); - expectReadKey(it, 8, "xxx"); - expectReadKey(it, 9, "yyy"); - expectReadComplete(it); -} - -TEST_F(BgIterationTest, multiHandlesSelectProperly) { - // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it - // in DB0, but it will be unprocessed in DB1. See if we track select properly. - addKeyToDb(1, "C0", "C0"); - - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - C0 in DB 0. - expectReadKey(it, 0); - - // Now, we are done with C0 in DB0, but not in DB1 - expectReadKey(it, 1); - - // These cases should NOT block... (they access C0 in DB0) - client *c; - c = getMultiClient("SET C0 xxx"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SELECT 0; SET C0 xxx"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SET C0 xxx; SELECT 1"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SELECT 1; SELECT 0; SET C0 xxx; SELECT 1"); - simulateUnblockedWrite(c); - freeTestClient(c); - - // These cases SHOULD block... 
(they access C0 in DB1) - c = getMultiClient("SET C0 xxx"); - c->db = server.db[1]; - simulateBlockedWrite(c); - freeTestClient(c); - c = getMultiClient("SELECT 1; SET C0 xxx"); - simulateBlockedWrite(c); - freeTestClient(c); - c = getMultiClient("SELECT 1; SET C0 xxx; SELECT 0"); - simulateBlockedWrite(c); - freeTestClient(c); - c = getMultiClient("SELECT 0; SELECT 1; SET C0 xxx; SELECT 1"); - simulateBlockedWrite(c); - freeTestClient(c); - - expectAnythingCleanup(it); -} - - -TEST_F(BgIterationTest, multiHandlesSelectNoPermissionProperly) { - // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it - // in DB0, but it will be unprocessed in DB1. See if we track select properly. - addKeyToDb(1, "C0", "C0"); - - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - C0 in DB 0. - expectReadKey(it, 0); - - // Now, we are done with DC00 in DB0, but not in DB1 - expectReadKey(it, 1); - - // No permission for any commands (specifically select/swapdb) - EXPECT_CALL(mock, amzCanClientExecuteCommand(_,_,_,_)) - .Times(AtLeast(1)).WillRepeatedly(Return(false)); - - // These cases should NOT block... (they access C0 in DB0) - // The SELECTs below are inconsequential - with/without select, same result. - client *c; - c = getMultiClient("SET C0 xxx"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SELECT 0; SET C0 xxx"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SET C0 xxx; SELECT 1"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SELECT 1; SELECT 0; SET C0 xxx; SELECT 1"); - simulateUnblockedWrite(c); - freeTestClient(c); - - // These cases SHOULD block IF SELECT IS WORKING... 
(they access C0 in DB1) - c = getMultiClient("SET C0 xxx"); - c->db = server.db[1]; // already starting on DB1 - simulateBlockedWrite(c); // will block, no select - freeTestClient(c); - c = getMultiClient("SELECT 1; SET C0 xxx"); - simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) - freeTestClient(c); - c = getMultiClient("SELECT 1; SET C0 xxx; SELECT 0"); - simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) - freeTestClient(c); - c = getMultiClient("SELECT 0; SELECT 1; SET C0 xxx; SELECT 1"); - simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) - freeTestClient(c); - - expectAnythingCleanup(it); -} - - -TEST_F(BgIterationTest, multiHandlesSwapdbProperly) { - // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it - // in DB0, but it will be unprocessed in DB1. See if we track select properly. - addKeyToDb(1, "C0", "C0"); - - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - C0 in DB 0. - expectReadKey(it, 0); - - // Now, we are done with C0 in DB0, but not in DB1 - expectReadKey(it, 1); - - // These cases should NOT block... (they access C0 in DB0) - client *c; - c = getMultiClient("SET C0 xxx"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SET C0 xxx; SWAPDB 0 1"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SET C0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET C0 xxx"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SWAPDB 0 1; SELECT 1; SET C0 xxx"); - simulateUnblockedWrite(c); - freeTestClient(c); - - // These cases SHOULD block... 
(they access C0 in DB1) - c = getMultiClient("SET C0 xxx"); - c->db = server.db[1]; - simulateBlockedWrite(c); - freeTestClient(c); - c = getMultiClient("SWAPDB 1 0; SET C0 xxx; SWAPDB 0 1"); - simulateBlockedWrite(c); - freeTestClient(c); - c = getMultiClient("SWAPDB 1 0; SELECT 0; SET C0 xxx; SWAPDB 0 1"); - simulateBlockedWrite(c); - freeTestClient(c); - c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET C0 xxx; SELECT 1"); - simulateBlockedWrite(c); - freeTestClient(c); - - expectAnythingCleanup(it); -} - - -TEST_F(BgIterationTest, multiHandlesSwapdbNoPermissionProperly) { - // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it - // in DB0, but it will be unprocessed in DB1. See if we track select properly. - addKeyToDb(1, "C0", "C0"); - - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - C0 in DB 0. - expectReadKey(it, 0); - - // Now, we are done with C0 in DB0, but not in DB1 - expectReadKey(it, 1); - - // No permission for any commands (specifically select/swapdb) - EXPECT_CALL(mock, amzCanClientExecuteCommand(_,_,_,_)) - .Times(AtLeast(1)).WillRepeatedly(Return(false)); - - // These cases should NOT block... (they access C0 in DB0) - // The SELECTs & SWAPDBs below are inconsequential - with/without select/swapdb, same result. - client *c; - c = getMultiClient("SET C0 xxx"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SET C0 xxx; SWAPDB 0 1"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SET C0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET C0 xxx"); - simulateUnblockedWrite(c); - freeTestClient(c); - c = getMultiClient("SWAPDB 0 1; SELECT 1; SET C0 xxx"); - simulateUnblockedWrite(c); - freeTestClient(c); - - // These cases SHOULD block IF SELECT/SWAPDB IS WORKING... 
(they access C0 in DB1) - c = getMultiClient("SET C0 xxx"); - c->db = server.db[1]; - simulateBlockedWrite(c); - freeTestClient(c); - c = getMultiClient("SWAPDB 1 0; SET C0 xxx; SWAPDB 0 1"); - simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb fails) - freeTestClient(c); - c = getMultiClient("SWAPDB 1 0; SELECT 0; SET C0 xxx; SWAPDB 0 1"); - simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) - freeTestClient(c); - c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET C0 xxx; SELECT 1"); - simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) - freeTestClient(c); - - expectAnythingCleanup(it); -} - -void * pthreadWait200msAndReadTwoKeys(void *arg) { - bgIterator *it = static_cast(arg); - - usleep(200000); - bgIteratorRead(it); - bgIteratorRead(it); - return nullptr; -} - -void asyncWait200msAndReadTwoKeys(bgIterator *it) { - int rc; - pthread_attr_t attr; - pthread_t thread; - - rc = pthread_attr_init(&attr); - assert(rc == 0); - rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); - assert(rc == 0); - - rc = pthread_create(&thread, &attr, pthreadWait200msAndReadTwoKeys, it); - assert(rc == 0); - - rc = pthread_attr_destroy(&attr); - assert(rc == 0); -} - - -TEST_F(BgIterationTest, testLuaWithUndeclaredKey) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); - - // Read the 1st key - let's get the party started - expectReadKey(it, 0); - - // At this point, key 0 is read. Keys 1,2,3 are queued (they are all in the same bucket). - // If we fake a modification to key 4, we won't know if it's handled out of order. - // So we fake a modification to key 5 - client *c = getWriteClient(5, "xxx"); - c->flag.script = 1; - - // Now for a LUA script, we have already blocked (on the eval/evalsha) for any declared keys - // But here, we're about to modify an undeclared key. 
We can't actually block in the middle - // of the LUA script. So this will behave as unblocked, but incur a synchronous wait. - - // Key 5 will get expedited when we simulate the write. After reading key 5, key 1 will need - // to be read to return key 5 to Valkey, unbloking the synchronous wait. - asyncWait200msAndReadTwoKeys(it); - - monotime blockTimer; - elapsedStart(&blockTimer); - simulateUnblockedWrite(c); - // Must have delayed at least 150ms (some time may have passed before timer start) - EXPECT_GT(elapsedMs(blockTimer), 150u); - - // Continue... - expectReadKeySequence(it, 2, 4); - // 5 has already been processed - expectReadKeySequence(it, 6, 9); - expectReadComplete(it); - freeTestClient(c); -} - - -TEST_F(BgIterationTest, replicationReceivedWhileProcessingLastKey) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - client *c = getWriteClient(0, "xxx"); - - expectReadKeySequence(it, 0, 9); - simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 - expectReadReplication(it, c); // Replication happened while processing key 9, should be here. - - simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing key 9 - expectReadComplete(it); // We expect to see the completion instead - - freeTestClient(c); -} - - -TEST_F(BgIterationTest, repldoneFunctionCalled) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); - - client *c = getWriteClient(0, "xxx"); - - expectReadKeySequence(it, 0, 9); - simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 - expectReadReplication(it, c); // Replication happened while processing key 9, should be here. 
- EXPECT_EQ(repldoneCount, 1); // Last key released, now done feeding replication - - simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing key 9 - expectReadComplete(it); // We expect to see the completion instead - - freeTestClient(c); -} - - -TEST_F(BgIterationTest, repldoneFunctionCalledTwice) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFnNotBeingReadyInitially, iteratorCleanupFn, PRIVDATA); - - client *c = getWriteClient(0, "xxx"); - - expectReadKeySequence(it, 0, 9); - simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 - expectReadReplication(it, c); // Replication happened while processing key 9, should be here. - EXPECT_EQ(repldoneCount, 0); // Last key released, now done feeding replication - EXPECT_EQ(isReplDoneReady, 1); - bgIteration_feedIterators(); // Need to call it as RepldoneFnNotBeingReadyInitially returns false in first call - EXPECT_EQ(repldoneCount, 1); - - simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing key 9 - expectReadComplete(it); // We expect to see the completion instead - - freeTestClient(c); -} - - -TEST_F(BgIterationTest, queuingitemFunctionCalled) { - bgIterator *it = bgIteratorCreateFullScanIter("simple", - 0, NULL, iteratorCleanupFn, iteratorBeforeAndAfterQueuingItemFn, PRIVDATA); - EXPECT_EQ(beforeQueuingItemCount, 0); - EXPECT_EQ(afterQueuingItemCount, 0); - expectReadKeySequence(it, 0, 9); - expectReadComplete(it); - // Callback is invoked when item is fed to and returned from an iterator - EXPECT_EQ(beforeQueuingItemCount, 10); - EXPECT_EQ(afterQueuingItemCount, 10); -} - -TEST_F(BgIterationTest, checkReplicationByteCount) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); - - client *c = getWriteClient(0, "xxx"); - int expectedReplicationSize = 
sizeof(bgIteratorItem); - for (int i = 0; i < c->argc; i++) { - expectedReplicationSize += objectComputeSize(NULL, c->argv[i], 0, 0); - } - - expectReadKey(it, 0); - expectReadKey(it, 1); // Releases and unblocks 0 - EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); - - simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 - EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); - simulateUnblockedWriteWithModification(c); // and write again (2nd replication) - EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); - - expectReadKeySequence(it, 2, 4); // Keys 1..4 all in same bucket - - expectReadReplication(it, c); - // After reading the 1st replication, it hasn't been returned yet (it's the active item) - EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); - expectReadReplication(it, c); - // After reading the 2nd replication, the 1st has been returned - EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); - - expectReadKey(it, 5); - // Now all replication has been returned/freed - EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); - - expectReadKeySequence(it, 6, 9); - expectReadComplete(it); - - freeTestClient(c); -} - -// Test that for an arbitrary write command having no keys, replication should occur. 
-TEST_F(BgIterationTest, checkNoKeysWriteIsReplicated) { - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); - - expectReadKey(it, 0); - - client *c = getNoKeysWriteClient(); - EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); - bool blocked = bgIteration_blockClientIfRequired(c); - EXPECT_FALSE(blocked); - bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); - - expectReadKeySequence(it, 1, 4); // These were already in queue - - expectReadReplication(it, c); - - expectReadKeySequence(it, 5, 9); - expectReadComplete(it); - freeTestClient(c); -} -TEST_F(BgIterationTestClusterSlots, testAmzKeyIsLogicallyDeletedInOrderedIteration3Slots) { - bgIterator *it = bgIteratorCreateSlotsIter("simple", - 0, slots_to_iterate, slots_to_iterate_size, NULL, iteratorCleanupFn, PRIVDATA); - EXPECT_CALL(mock, amzKeyIsLogicallyDeleted(_)).WillOnce(Return(true)).WillRepeatedly(Return(false)); - expectReadKeySequence(it, 1, n_keys_to_read - 1); - - // Quick status check. At this point, the last item hasn't been returned yet. - bgIteratorStatus status; - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentries_queued, n_keys_to_read - 1); // The first item should be skipped from the queue - EXPECT_EQ(status.dbentries_processed, n_keys_to_read - 2); - - expectReadComplete(it); - EXPECT_FALSE(cleanupTerminated); -} - -TEST_F(BgIterationTest, testAmzKeyIsLogicallyDeletedInOrderedFullScanIteration) { - bgIterator *it = bgIteratorCreateFullScanIter("simple", - 0, NULL, iteratorCleanupFn, PRIVDATA); - EXPECT_CALL(mock, amzKeyIsLogicallyDeleted(_)).WillOnce(Return(true)).WillRepeatedly(Return(false)); - expectReadKeySequence(it, 1, 9); - - // Quick status check. At this point, item #9 hasn't been returned yet. 
- bgIteratorStatus status; - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.dbentries_queued, 9u); // The first item should be skipped from the queue - EXPECT_EQ(status.dbentries_processed, 8u); - - expectReadComplete(it); - EXPECT_FALSE(cleanupTerminated); -} -#endif - -#ifdef CODE_NOT_READY_YET -class BgIterationTestCluster : public BgIterationTest { - private: - // This is the expected order of the keys when hashed into a single dict at slot 0 having size 8. - // The "{06S}" prefix ensures use of only slot 0. - const char *keys[1][5] = {{"{06S}C0", "{06S}D0", "{06S}A0", "{06S}B0", "{06S}E0"}}; - - protected: - // Furthermore, the bucketization will look like this: - // db 0 slot 0 - // Table 0, used 5, exp 1, top-level buckets 2, child buckets 0 - // Bucket 0:0 level:0 - // 0 h2 1a, key "{06S}C0" - // 1 h2 7b, key "{06S}D0" - // 2 (empty) - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - // Bucket 0:1 level:0 - // 0 h2 5c, key "{06S}A0" - // 1 h2 bf, key "{06S}B0" - // 2 h2 57, key "{06S}E0" - // 3 (empty) - // 4 (empty) - // 5 (empty) - // 6 (empty) - - virtual const char * getKeyAtDbSeq(int db, int seq) override { - assert(db == 0); - return keys[db][seq]; - } - - - virtual void setupDatabase() override { - // For these unit tests, a standard database is constructed. The order of items in the - // hash table is important, and this is validated here. If the hash table - // implementation changes, we will find out quickly at this point. All other tests - // will become invalid! - - // Note that the cluster_enabled tests are designed for the purpose of testing - // CONSISTENT iteration WITH REPLICATION. This type of iteration is not supported - // in non-cluster-mode. At the time of writing, there is no-known use-case for this - // combination. But it is tested for completeness and to ensure future availability. - - // Note also that the cluster_enabled tests are not designed to address issues specific - // to per-slot-dictionaries. 
The tests are simplified by ensuring that all keys are - // mapped to slot-0. It is assumed that iteration would progress in slot order, and - // failure in this regard will be caught in integration tests (amztests). - - server.dbnum = 1; // cluster-mode means 1 DB - server.cluster_enabled = true; - server.db = static_cast(zcalloc(sizeof(serverDb *) * server.dbnum)); - - // Yes, it's cluster mode, but we're mapping all keys to slot 0 - so we cheat and create only 1 dict (just like CMD). - initializeServerDb(0, CLUSTER_SLOT_MASK_BITS); - - // Note "06S" is a prefix that maps to slot 0. We're not testing slots here. - - addKeyToDb(0, "{06S}A0", "{06S}A0"); - addKeyToDb(0, "{06S}B0", "{06S}B0"); - addKeyToDb(0, "{06S}C0", "{06S}C0"); - addKeyToDb(0, "{06S}D0", "{06S}D0"); - addKeyToDb(0, "{06S}E0", "{06S}E0"); - - // In case we need to debug... - if (0) debugPrintBucketInfo(); - - // Validate that the iteration order matches the expected order - hashtableIterator *it = hashtableCreateIterator(server.db[0]->keys->hashtables[0], 0); - for (int i = 0; i < 5; i++) { - void *nextEntry; - hashtableNext(it, &nextEntry); - dbEntry *de = static_cast(nextEntry); - ASSERT_STREQ(static_cast(objectGetKey(de)), getKeyAtDbSeq(0, i)); - } - hashtableReleaseIterator(it); - } -}; -#endif - -#ifdef CODE_NOT_READY_YET -TEST_F(BgIterationTestCluster, dictIsOK) { - // Just run the setup/teardown code to make sure the dict is OK. -} - - -TEST_F(BgIterationTestCluster, modFutureItem_YesReplication_YesConsistent_cluster) { - // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode - bgIterator *it = bgIteratorCreateFullScanIter("iter", - BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, - NULL, iteratorCleanupFn, PRIVDATA); - bgIteratorStatus status; - - // For this test, don't read the 1st key - we only have 5 keys since not using DB[1] - bgIteration_feedIterators(); // Prime the feed - key 0 and 1 are now enqueued - - // At this point, key 0, and 1 are queued. 
Fake a modification to key 2 & 4 - two keys to ensure - // that replication is ordered - client *c1 = getWriteClient(2, "xxx"); - client *c2 = getWriteClient(4, "yyy"); - - // Since this is consistent, we will block the client, disallowing the write. - simulateBlockedWrite(c1); - simulateBlockedWrite(c2); - - // On a consistent iterator, the event is expedited in-front of items already in queue! - // Read keys 2&4 out of order. - expectReadKey(it, 2); // reading original/unmodified item - - // This call is expected to unblock the client waiting on #2 - expectReadKeyWithUnblock(it, 4, nullptr, 2); // reading original/unmodified item - simulateUnblockedWriteWithModification(c1); - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.replication_queued, 1u); - EXPECT_EQ(status.replication_processed, 0u); - - // Now read items 0 and 1 - these were actually already queued before keys 1 & 4 were expedited. - // This call is expected to unblock the client waiting on #4 - expectReadKeyWithUnblock(it, 0, nullptr, 4); - simulateUnblockedWriteWithModification(c2); - expectReadKey(it, 1); - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.replication_queued, 2u); - EXPECT_EQ(status.replication_processed, 0u); - - // And now the 2 replications are queued - expectReadReplication(it, c1); - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.replication_queued, 2u); // 1st replication still being processed - EXPECT_EQ(status.replication_processed, 0u); // (no change in these metrics yet) - - expectReadReplication(it, c2); - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.replication_queued, 2u); - EXPECT_EQ(status.replication_processed, 1u); // Done with 1st, processing 2nd - - // Continue... 
- expectReadKey(it, 3); - bgIteratorGetStatus(it, &status); - EXPECT_EQ(status.replication_queued, 2u); - EXPECT_EQ(status.replication_processed, 2u); // Done processing both repl items - expectReadComplete(it); - freeTestClient(c1); - freeTestClient(c2); -} -#endif - - - -// JHB - need test that hashing is paused when an entry is in use. - -#endif // if SIZE_MAX == UINT64_MAX /* 64-bit version */ From 10b08c81b37916b0ed21f50f7cc5bd0c5d5a0b31 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 28 May 2026 20:32:15 +0000 Subject: [PATCH 17/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 97 +++++++++++++++++++++++++---------------------- src/hashtable.c | 76 +++++++------------------------------ src/hashtable.h | 1 + 3 files changed, 66 insertions(+), 108 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index c70bd129015..a7e5b50d38a 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -18,9 +18,6 @@ robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds // Non-public hashtable/kvstore functions... -bool hashtableInternalFindBucketIdx(hashtable *ht, void *key, int *table_idx, size_t *bucket_idx); -void hashtableInternalIteratorGetBucketIdx(hashtableIterator *iterator, int *table_idx, size_t *bucket_idx); -bool hashtableInternalIteratorIsBucketIdxComplete(hashtableIterator *iterator); hashtableIterator *kvstoreInternalIteratorGetCurrentHashtableIterator(kvstoreIterator *kvs_it); @@ -107,7 +104,7 @@ static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid * The optional permission_client allows for checking of a client's permission for swapdb. * Returns true if command would be executed. 
*/ -bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, int *id1_p, int *id2_p) { +static bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, int *id1_p, int *id2_p) { static struct serverCommand *swapdb_cmd = NULL; // We don't need to check permissions in the replication phase @@ -140,7 +137,7 @@ bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, int *i * The optional permission_client allows for checking of a client's permission for select. * Returns true if command would be executed. */ -bool getParamsForSelect(int argc, robj **argv, client *permission_client, int *dbid_p) { +static bool getParamsForSelect(int argc, robj **argv, client *permission_client, int *dbid_p) { static struct serverCommand *select_cmd = NULL; // We don't need to check permissions in the replication phase @@ -164,6 +161,15 @@ bool getParamsForSelect(int argc, robj **argv, client *permission_client, int *d return true; } +static void pauseReshahForKvsHashtable(kvstore *kvs, int didx) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (ht != NULL) hashtablePauseRehashing(ht); +} + +static void resumeReshahForKvsHashtable(kvstore *kvs, int didx) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (ht != NULL) hashtableResumeRehashing(ht); +} /* DictType for SDS->ptr. The SDS is referenced, no destructor. 
*/ static dictType sdsrefToPtrDictType = { @@ -482,25 +488,34 @@ struct fullScanIterator { int iter_db; // Iterator for the DB orig_to_cur_db[iter_db] - kvstore *kvs; // keep track of kvs associated with iter_dbi - kvstoreIterator *iter_dbi; + kvstore *kvs; // keep track of kvs associated with iter_dbi + int kvs_didx; // hashtable index within the kvstore + size_t ht_cursor; // cursor for scanning hashtable }; static void fullScanIteratorRelease(genericIterator *genIt) { struct fullScanIterator *it = (struct fullScanIterator *)genIt; - if (it->iter_dbi) kvstoreIteratorRelease(it->iter_dbi); + if (it->kvs) resumeReshahForKvsHashtable(it->kvs, it->kvs_didx); zfree(it->orig_to_cur_db); zfree(it->cur_to_orig_db); zfree(it); } -static fifo * fullScanIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) { +/* Scan callback used by fullScanIteratorGetEntries2 to collect entries into a fifo. */ +static void fullScanIteratorScanCallback(void *privdata, void *entry) { + fifo *dbEntryFifo = (fifo *)privdata; + dbEntry *de = (dbEntry *)entry; + if (ignoreKeyForSave(objectGetKey(de))) return; // slot migration: keys being purged + fifoPush(dbEntryFifo, de); +} + +static fifo *fullScanIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) { struct fullScanIterator *it = (struct fullScanIterator *)genIt; if (it->iter_db >= server.dbnum) return NULL; // Finished scanning fifo *dbEntryFifo = fifoCreate(); while (fifoLength(dbEntryFifo) == 0) { - while (it->iter_dbi == NULL) { + while (it->kvs == NULL) { if (++it->iter_db >= server.dbnum) { fifoRelease(dbEntryFifo); return NULL; // Iteration complete @@ -508,23 +523,27 @@ static fifo * fullScanIteratorGetEntries(genericIterator *genIt, int *orig_dbid, serverDb *db = server.db[it->orig_to_cur_db[it->iter_db]]; if (db != NULL) { it->kvs = db->keys; - it->iter_dbi = kvstoreIteratorInit(it->kvs, HASHTABLE_ITER_SAFE); + it->kvs_didx = kvstoreGetFirstNonEmptyHashtableIndex(it->kvs); + it->ht_cursor = 
0; + if (it->kvs_didx == KVSTORE_INDEX_NOT_FOUND) it->kvs = NULL; + if (it->kvs != NULL) pauseReshahForKvsHashtable(it->kvs, it->kvs_didx); } } - hashtableIterator *ht_it = NULL; - do { - dbEntry *de; - if (!kvstoreIteratorNext(it->iter_dbi, (void **)&de)) { - kvstoreIteratorRelease(it->iter_dbi); - it->kvs = NULL, it->iter_dbi = NULL; - break; - } + hashtable *ht = kvstoreGetHashtable(it->kvs, it->kvs_didx); + if (ht) { + it->ht_cursor = hashtableScan(ht, it->ht_cursor, fullScanIteratorScanCallback, dbEntryFifo); + } else { + it->ht_cursor = 0; + } - ht_it = kvstoreInternalIteratorGetCurrentHashtableIterator(it->iter_dbi); - if (ignoreKeyForSave(objectGetKey(de))) continue; // slot migration: keys being purged - fifoPush(dbEntryFifo, de); - } while (!hashtableInternalIteratorIsBucketIdxComplete(ht_it)); + if (it->ht_cursor == 0) { + /* Done with this hashtable, move to next. */ + resumeReshahForKvsHashtable(it->kvs, it->kvs_didx); + it->kvs_didx = kvstoreGetNextNonEmptyHashtableIndex(it->kvs, it->kvs_didx); + if (it->kvs_didx == KVSTORE_INDEX_NOT_FOUND) it->kvs = NULL; + if (it->kvs != NULL) pauseReshahForKvsHashtable(it->kvs, it->kvs_didx); + } } *orig_dbid = it->iter_db; *cur_dbid = it->orig_to_cur_db[*orig_dbid]; @@ -546,8 +565,7 @@ static void fullScanIteratorFlushDb(genericIterator *genIt, int cur_dbid) { int orig_db = it->cur_to_orig_db[cur_dbid]; if (orig_db == it->iter_db) { // We are currently iterating on the DB that's being flushed. - kvstoreIteratorRelease(it->iter_dbi); - it->kvs = NULL, it->iter_dbi = NULL; + it->kvs = NULL; // Iteration will continue with the next DB. } } @@ -560,33 +578,22 @@ static bool fullScanIteratorHasPassedItem(genericIterator *genIt, const_sds key, if (orig_dbid > it->iter_db) return false; // Haven't started this DB yet // Now, orig_dbid == it->iter_db - if (it->iter_dbi == NULL) return true; // just finished this DB + if (it->kvs == NULL) return true; // just finished this DB // We're in the middle of processing a DB. 
In cluster-mode, the DB is divided into 1 hashtable // per slot. In cluster-mode-disabled, we treat all keys as in slot 0. int keySlot = server.cluster_enabled ? getKeySlot((sds)key) : 0; - if (keySlot < kvstoreIteratorGetCurrentHashtableIndex(it->iter_dbi)) return true; - if (keySlot > kvstoreIteratorGetCurrentHashtableIndex(it->iter_dbi)) return false; + if (keySlot < it->kvs_didx) return true; + if (keySlot > it->kvs_didx) return false; // At this point, we're down to a specific hashtable. - hashtable *iter_current_ht = kvstoreGetHashtable(it->kvs, keySlot); - int table; // 0 or 1 (supporting rehashing) - size_t index; // bucket number within the hashtable + hashtable *ht = kvstoreGetHashtable(it->kvs, keySlot); // If key doesn't exist, we consider it passed - we MIGHT have iterated over it had it existed. - if (!hashtableInternalFindBucketIdx(iter_current_ht, (void *)key, &table, &index)) return true; - - hashtableIterator *htIter = kvstoreInternalIteratorGetCurrentHashtableIterator(it->iter_dbi); - int iter_table; - size_t iter_index; - hashtableInternalIteratorGetBucketIdx(htIter, &iter_table, &iter_index); - if (table < iter_table) return true; // iteration in table 1, but item is in table 0 - if (table > iter_table) return false; // iteration in table 0, but item is in table 1 - // if index <= iterator index, it has been passed. bgIterator - // processes buckets atomically. hashtableIterator points to the - // last returned position. It means bucket at iter_index has - // already been processed. 
- if (index <= iter_index) return true; + if (!hashtableFind(ht, key, NULL)) return true; + + if (hashtableScanHasPassedKey(ht, key, it->ht_cursor)) return true; + if (ignoreKeyForSave(key)) return true; // if slot being purged, pretend we have passed it return false; } @@ -612,7 +619,6 @@ static genericIterator * fullScanIteratorCreate(void) { } it->iter_db = -1; it->kvs = NULL; - it->iter_dbi = NULL; it->callbacks.release = fullScanIteratorRelease; it->callbacks.getEntries = fullScanIteratorGetEntries; @@ -1037,7 +1043,6 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { } bgIteratorItem *item = makeDbEntryItem(de, dbid, false); - fifoPush(itemsToAdd, item); } fifoRelease(dbEntryFifo); diff --git a/src/hashtable.c b/src/hashtable.c index b5791f347f0..5634d48bb16 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -2570,67 +2570,19 @@ int hashtableLongestBucketChain(hashtable *ht) { return maxlen; } -/* This is an internal function - not part of the standard API. It must be explicitly declared - * where used. It shouldn't be included in any .h (API) file. Use of this interface is discouraged - * as it depends on the internal structure, which may change. - * - * For a given key, return: - * table_idx - the index of the internal table (0 or 1) - * bucket_idx - the bucket index within the table (0..n) - * - * Returns TRUE if the the key exists in the table. - * Returns FALSE if the key doesn't exist (and table/index are undefined) - */ -bool hashtableInternalFindBucketIdx(hashtable *ht, void *key, int *table_idx, size_t *bucket_idx) { - uint64_t hash = hashKey(ht, key); - int pos_in_bucket; - int table; - bucket *b = findBucket(ht, hash, key, &pos_in_bucket, &table); - if (!b) return false; - - *table_idx = table; - *bucket_idx = hash & expToMask(ht->bucket_exp[table]); - return true; -} - -/* This is an internal function - not part of the standard API. It must be explicitly declared - * where used. It shouldn't be included in any .h (API) file. 
Use of this interface is discouraged - * as it depends on the internal structure, which may change. - * - * For a given iterator, return: - * table_idx - the index of the internal table (0 or 1) - * bucket_idx - the bucket index within the table (0..n) - * - * NOTE: hashtableIterator position is based on the LAST item returned. - */ -void hashtableInternalIteratorGetBucketIdx(hashtableIterator *iterator, int *table_idx, size_t *bucket_idx) { - iter *it = iteratorFromOpaque(iterator); - *table_idx = it->table; - *bucket_idx = it->index; +// Temporary, waiting on PR #3803 +bool hashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor) { + if (cursor == 0) return false; + if (hashtableSize(ht) == 0) return true; + + /* The scan visits buckets in reverse-binary order based on the smallest + * table. During rehashing, a small-table bucket and its corresponding + * large-table buckets are processed together, so the small-table mask + * determines ordering in both cases. */ + int exp = ht->bucket_exp[0]; + if (hashtableIsRehashing(ht) && ht->bucket_exp[1] < exp) exp = ht->bucket_exp[1]; + size_t mask = expToMask(exp); + size_t bucket_idx = hashKey(ht, key) & mask; + return rev(bucket_idx) < rev(cursor & mask); } -/* This is an internal function - not part of the standard API. It must be explicitly declared - * where used. It shouldn't be included in any .h (API) file. Use of this interface is discouraged - * as it depends on the internal structure, which may change. - * - * Returns TRUE if the iterator is ready to move to the next bucket index (if it has completed the - * current bucket index). Note: hashtableIterator bucket_idx is the bucket index of the last item - * returned by hashtableNext. - * - * Note: If this function returns true, the iterator commits to move onto the next bucket index, - * even if something new is added to the end of the current bucket before hashtableNext is called. 
- */ -bool hashtableInternalIteratorIsBucketIdxComplete(hashtableIterator *iterator) { - iter *it = iteratorFromOpaque(iterator); - - if (it->bucket->chained) return false; - - if (!(it->bucket->presence >> (it->pos_in_bucket + 1))) { - /* There's CURRENTLY nothing else to return at this bucket index. Mark pos_in_bucket so - * so that hashtableNext will move to the next bucket index, regardless of items which may - * be added in the future. */ - it->pos_in_bucket = ITERATOR_DONE_WITH_BUCKET_IDX; - return true; - } - return false; -} diff --git a/src/hashtable.h b/src/hashtable.h index 97ecab68518..289bc183db1 100644 --- a/src/hashtable.h +++ b/src/hashtable.h @@ -163,6 +163,7 @@ bool hashtableIncrementalFindGetResult(hashtableIncrementalFindState *state, voi /* Iteration & scan */ size_t hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata, void *(*defragfn)(void *), int flags); +bool hashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor); void hashtableInitIterator(hashtableIterator *iter, hashtable *ht, uint8_t flags); void hashtableRetargetIterator(hashtableIterator *iterator, hashtable *ht); void hashtableCleanupIterator(hashtableIterator *iter); From a13558962d64b1fa08fc1e246af702f4fdfcb788 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 28 May 2026 20:36:41 +0000 Subject: [PATCH 18/40] Forkless Save Signed-off-by: Jim Brunner --- src/hashtable.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/hashtable.c b/src/hashtable.c index 5634d48bb16..982292112c6 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -214,8 +214,6 @@ static_assert(100 * BUCKET_DIVISOR / BUCKET_FACTOR / ENTRIES_PER_BUCKET <= MAX_F "Expand must result in a fill below the soft max fill factor"); static_assert(MAX_FILL_PERCENT_SOFT <= MAX_FILL_PERCENT_HARD, "Soft vs hard fill factor"); -#define 
ITERATOR_DONE_WITH_BUCKET_IDX (ENTRIES_PER_BUCKET + 1) - /* --- Random entry --- */ #define FAIR_RANDOM_SAMPLE_SIZE (ENTRIES_PER_BUCKET * 10) @@ -2246,7 +2244,6 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { /* Check if iterator has been invalidated */ if (iter->hashtable == NULL) return false; - // clang-format off while (1) { if (iter->index == -1 && iter->table == 0) { /* It's the first call to next. */ @@ -2272,9 +2269,7 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { * child bucket in a chain, or to the next bucket index, or to the * next table. */ iter->pos_in_bucket++; - if (iter->bucket->chained - && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1 - && iter->pos_in_bucket != ITERATOR_DONE_WITH_BUCKET_IDX + 1) { + if (iter->bucket->chained && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1) { iter->pos_in_bucket = 0; iter->bucket = getChildBucket(iter->bucket); } else if (iter->pos_in_bucket >= ENTRIES_PER_BUCKET) { @@ -2326,7 +2321,6 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { } return true; } - // clang-format on return false; } From 9a4feefbc441e61bd08d34fe12112905c5e32d0d Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 28 May 2026 20:38:42 +0000 Subject: [PATCH 19/40] Forkless Save Signed-off-by: Jim Brunner --- src/hashtable.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/hashtable.c b/src/hashtable.c index 982292112c6..20aa906cc25 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -612,8 +612,7 @@ static bucket *fetchEntriesForExpand(bucket *b, void *buf[], int *size, int max_ /* Processes one bucket chain during incremental table expansion. * Uses batch processing to optimize memory access patterns. 
*/ -// Not API, but not static - used in unit testing -void rehashStepExpand(hashtable *ht) { +static void rehashStepExpand(hashtable *ht) { void *entry_buf[FETCH_ENTRY_BUFFER_SIZE_WHEN_EXPAND]; const void *key_buf[FETCH_ENTRY_BUFFER_SIZE_WHEN_EXPAND]; size_t idx = ht->rehash_idx; From 648565b586adccdf3ae2d39e662bc2e0276aae9e Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 28 May 2026 20:58:43 +0000 Subject: [PATCH 20/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 6 ++++++ src/bgiteration.h | 6 ++++++ src/hashtable.c | 1 - 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index a7e5b50d38a..bb514daed1e 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -1,3 +1,9 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + #include "fmacros.h" #include "bgiteration.h" #include "dict.h" diff --git a/src/bgiteration.h b/src/bgiteration.h index 7be91e4534b..dd6da71608f 100644 --- a/src/bgiteration.h +++ b/src/bgiteration.h @@ -1,3 +1,9 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD 3-Clause + */ + #ifndef __BGITERATION_H #define __BGITERATION_H diff --git a/src/hashtable.c b/src/hashtable.c index 20aa906cc25..940adcc8c01 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -2578,4 +2578,3 @@ bool hashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor) { size_t bucket_idx = hashKey(ht, key) & mask; return rev(bucket_idx) < rev(cursor & mask); } - From 5935e9a295c29f2888cc22e6a07c94986c8708ca Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Fri, 29 May 2026 17:01:19 +0000 Subject: [PATCH 21/40] Forkless save Signed-off-by: Jim Brunner --- src/bgiteration.c | 23 +++++++++++++++++++++-- src/bgiteration.h | 12 ++++++------ src/db.c | 13 ++----------- src/server.c | 2 +- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index bb514daed1e..c8ce6da18ad 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -218,6 +218,13 @@ enum { BGITER_CYCLE_BUDGET_MAX_MS = 10 // Maximum time limit when starvation seen }; +// dbEntry metadata +typedef struct { + uint32_t iterator_epoch; // iterator epoch of last modification +} bgIterationEntryMetadata; +static_assert(sizeof(bgIterationEntryMetadata) == BGITERATION_ENTRY_METADATA_SIZE, ""); + + // These can be tweaked by unit tests static int bgiter_max_clone_item_bytes = BGITER_MAX_CLONE_ITEM_BYTES; static int bgiter_max_clone_pool_bytes = BGITER_MAX_CLONE_POOL_BYTES; @@ -2721,8 +2728,20 @@ bool bgIteration_isEntryInuse(dbEntry *de) { // PUBLIC API -uint32_t bgIteration_getEpoch(void) { - return bgIteration_epoch; +void bgIteration_dbEntryModified(dbEntry *de) { + if (bgIteration_iterationActive()) { + bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(de); + if (md) md->iterator_epoch = bgIteration_epoch; + } +} + + +// PUBLIC API +void bgIteration_keyModified(int dbid, const_sds key) { + if (bgIteration_iterationActive()) { + dbEntry *de = dbFind(server.db[dbid], (sds)key); + if 
(de) bgIteration_dbEntryModified(de); + } } diff --git a/src/bgiteration.h b/src/bgiteration.h index dd6da71608f..b2be82a54fa 100644 --- a/src/bgiteration.h +++ b/src/bgiteration.h @@ -276,10 +276,7 @@ void bgIteratorClose(bgIterator *iter); * BGITERATION HOOKS REQUIRED TO SUPPORT ITERATION - CALLS INSERTED INTO MAIN VALKEY CODE ********************************************************************************************/ -typedef struct { - uint32_t iterator_epoch; // iterator epoch of last modification -} bgIterationEntryMetadata; - +#define BGITERATION_ENTRY_METADATA_SIZE 4 /* Must be called once (and only once) at server startup. */ void bgIteration_init(void); @@ -360,7 +357,10 @@ size_t bgIteration_memoryInuseForReplication(void); bool bgIteration_isEntryInuse(dbEntry *de); -/* Get the current iteration epoch, for tagging metadata on keys. */ -uint32_t bgIteration_getEpoch(void); +/* Notify bgIteration that a dbEntry has been added/modified. + * - If caller has a dbEntry*, dbEntryModified is more efficient + * - If caller has a dbid/key, a lookup is performed to find the dbEntry */ +void bgIteration_dbEntryModified(dbEntry *de); +void bgIteration_keyModified(int dbid, const_sds key); #endif diff --git a/src/db.c b/src/db.c index cdf41c6a8b7..4b6f3c11a39 100644 --- a/src/db.c +++ b/src/db.c @@ -432,8 +432,7 @@ void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags) { } else { dbSetValue(db, key, valref, 1, NULL); } - bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(*valref); - if (md) md->iterator_epoch = bgIteration_getEpoch(); + bgIteration_dbEntryModified(*valref); if (!(flags & SETKEY_KEEPTTL)) removeExpire(db, key); if (!(flags & SETKEY_NO_SIGNAL)) signalModifiedKey(c, db, key); } @@ -762,15 +761,7 @@ long long dbTotalServerKeyCount(void) { void signalModifiedKey(client *c, serverDb *db, robj *key) { touchWatchedKey(db, key); trackingInvalidateKey(c, key, 1); - - /* If bgIteration is running, need to maintain 
the iteration epoch. */ - if (bgIteration_iterationActive()) { - dbEntry *o = dbFind(db, objectGetVal(key)); - if (o) { - bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(o); - if (md) md->iterator_epoch = bgIteration_getEpoch(); - } - } + bgIteration_keyModified(db->id, objectGetVal(key)); } void signalFlushedDb(int dbid, int async) { diff --git a/src/server.c b/src/server.c index 6b2942c3714..ec529182b09 100644 --- a/src/server.c +++ b/src/server.c @@ -3027,7 +3027,7 @@ void initServer(void) { * if/when new metadata options are added, we will need to compute the size of a variable * size metadata, and provide appropriate accessors to access the specific portion of the * metadata (each of which may/may not exist, based on immutable startup parameters). */ - objectSetMetadataSize(sizeof(bgIterationEntryMetadata)); + objectSetMetadataSize(BGITERATION_ENTRY_METADATA_SIZE); } createDatabaseIfNeeded(0); /* The default database should always exist */ From 0f1782263fe4ac0322fdc8c382f824a104b80d60 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Fri, 29 May 2026 21:06:07 +0000 Subject: [PATCH 22/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 45 +++++++++++++++++++++++++++++++++------------ src/bgiteration.h | 43 ++++++++++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 29 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index c8ce6da18ad..30d21ea307f 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -244,6 +244,20 @@ typedef enum { BGITERATION_TYPE_CLUSTERSLOT } bgIterationType; +/* Flag indicates that a consistent iteration is required. This is used to create a point-in-time + * iteration. The iteration client will see all keys AS THEY EXISTED at the time when the iterator + * was created. + * Note: The DBID provided with the DICTENTRY events is the original DBID (at the time of iteration + * start). SWAPDB events are NOT provided during a consistent iteration. 
*/ +#define BGITERATOR_FLAG_CONSISTENT (1 << 0) + +/* Flag indicating that the replication stream for keys which have already been processed should be + * forwarded to the iteration client. Used for non-consistent iteration to track changes + * to keys already processed. By tracking changes, this allows a non-consistent iteration client + * to achieve a consistent view at the END of the iteration. + * NOTE: Replication events will be provided ordered and synchronized with any SWAPDB events. */ +#define BGITERATOR_FLAG_REPLICATION (1 << 1) + /* Extensions to bgIteratorItemType. These enumerations are used internally, and are not part of * the published interface. These allow for extensibility in the internal information-passing * between the Valkey main thread and the iteration client thread. */ @@ -2087,7 +2101,7 @@ static bool expediteKeysForMultiExec(client *c, hashtable *waitingOnKeys) { static bgIterator * bgIteratorCreate( const char *name, - int flags, + bgIteratorConsistency consistency, bgIteratorReplDoneFunc repldone, bgIteratorCleanupFunc cleanup, void *privdata, @@ -2095,9 +2109,16 @@ static bgIterator * bgIteratorCreate( genericIterator *keyset_iter) { serverAssert(onValkeyMainThread()); serverAssert(server.cluster_enabled || iter_type == BGITERATION_TYPE_FULLSCAN); - serverAssert(server.cluster_enabled // Don't allow CONSISTENT & REPLICATION - || !(flags & BGITERATOR_FLAG_CONSISTENT) // unless cluster mode (avoids - || !(flags & BGITERATOR_FLAG_REPLICATION)); // complications with SWAPDB & FLUSHDB) + + int flags; + switch (consistency) { + case BGITERATOR_CONSISTENCY_NONE: flags = 0; break; + case BGITERATOR_CONSISTENCY_START: flags = BGITERATOR_FLAG_CONSISTENT; break; + case BGITERATOR_CONSISTENCY_EVENTUAL: flags = BGITERATOR_FLAG_REPLICATION; break; + default: serverAssert(false); + } + // Consistent, with replication - doesn't make sense.
+ serverAssert(!((flags & BGITERATOR_FLAG_CONSISTENT) && (flags & BGITERATOR_FLAG_REPLICATION))); bgIterator *it = zmalloc(sizeof(bgIterator)); it->name = sdsnew(name); @@ -2164,27 +2185,27 @@ static bgIterator * bgIteratorCreate( //============================================================================================= // PUBLIC API -bgIterator * bgIteratorCreateFullScanIter( +bgIterator *bgIteratorCreateFullScanIter( const char *name, - int flags, + bgIteratorConsistency consistency, bgIteratorReplDoneFunc repldone, bgIteratorCleanupFunc cleanup, void *privdata) { - return bgIteratorCreate(name, flags, repldone, cleanup, privdata, BGITERATION_TYPE_FULLSCAN, - fullScanIteratorCreate()); + return bgIteratorCreate(name, consistency, repldone, cleanup, privdata, + BGITERATION_TYPE_FULLSCAN, fullScanIteratorCreate()); } // PUBLIC API -bgIterator * bgIteratorCreateSlotsIter( +bgIterator *bgIteratorCreateSlotsIter( const char *name, - int flags, + bgIteratorConsistency consistency, const int *slots, int slots_count, bgIteratorReplDoneFunc repldone, bgIteratorCleanupFunc cleanup, void *privdata) { - return bgIteratorCreate(name, flags, repldone, cleanup, privdata, BGITERATION_TYPE_CLUSTERSLOT, - clusterSlotIteratorCreate(slots, slots_count)); + return bgIteratorCreate(name, consistency, repldone, cleanup, privdata, + BGITERATION_TYPE_CLUSTERSLOT, clusterSlotIteratorCreate(slots, slots_count)); } // PUBLIC API diff --git a/src/bgiteration.h b/src/bgiteration.h index b2be82a54fa..8d247fa7bbf 100644 --- a/src/bgiteration.h +++ b/src/bgiteration.h @@ -33,21 +33,30 @@ typedef struct client client; typedef struct bgIterator bgIterator; -/* Flag indicates that a consistent iteration is required. This is used to create a point-in-time - * iteration. The iteration client will see all keys AS THEY EXISTED at the time when the iterator - * was created. - * Note: The DBID provided with the DICTENTRY events is the original DBID (at the time of iteration - * start). 
SWAPDB events are NOT provided during a consistent iteration. */ -#define BGITERATOR_FLAG_CONSISTENT (1 << 0) - -/* Flag indicating that the replication stream for keys which have already been processed should be - * forwarded to the iteration client. Most useful for non-consistent iteration to track changes - * to keys already processed. By tracking changes, this allows an non-consistent iteration client - * to achieve a consistent view at the END of the iteration. - * NOTE: Replication events will be provided ordered and synchronized with any SWAPDB events. - * LIMITATION: Since SWAPDB events are not provided during CONSISTENT iteration, it is not - * permitted to use both CONSISTENT and REPLICATION on a non-clustermode instance. */ -#define BGITERATOR_FLAG_REPLICATION (1 << 1) +/* Consistency type for iteration. */ +typedef enum { + /* With no consistency requirements, dbEntries are provided to the iteration client as they + * appear at the time of iteration. No replication is provided. The only guarantee is that + * dbEntries which existed at the start of iteration, and remained through the duration of + * iteration, will be provided to the iteration client once (and only once). If a dbEntry is + * modified during iteration, either the old or the new value may be provided. */ + BGITERATOR_CONSISTENCY_NONE = 0, + + /* With consistency at the start of iteration, a point-in-time iteration is performed. The + * iteration client will see all keys AS THEY EXISTED at the time when the iterator was created. + * Note: The DBID provided with the DICTENTRY events is the original DBID (at the time of iteration + * start). SWAPDB events will not be provided. */ + BGITERATOR_CONSISTENCY_START = 1, + + /* With an eventually consistent iteration, dbEntries will be followed by relevant replication. + * This will allow a client to achieve a consistent state at the END of the iteration. 
Once a + * dbEntry has been provided to the iteration client, any replication related to that entry will + * also be forwarded to the iteration client. With eventual consistency, keys are provided as + * they are at the time of iteration. This mode requires that the iteration client be aware of + * SWAPDB events. If a SWAPDB is performed, the client will receive a SWAPDB event. + * Replication events will be provided ordered and synchronized with any SWAPDB events. */ + BGITERATOR_CONSISTENCY_EVENTUAL = 2 +} bgIteratorConsistency; /* When running an iterator with replication, a replication-done function (callback) may be @@ -94,7 +103,7 @@ typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata); */ bgIterator *bgIteratorCreateFullScanIter( const char *name, - int flags, + bgIteratorConsistency consistency, bgIteratorReplDoneFunc repldone, bgIteratorCleanupFunc cleanup, void *privdata); @@ -123,7 +132,7 @@ bgIterator *bgIteratorCreateFullScanIter( */ bgIterator *bgIteratorCreateSlotsIter( const char *name, - int flags, + bgIteratorConsistency consistency, const int *slots, int slots_count, bgIteratorReplDoneFunc repldone, From a88d079f426a6699f67657aca0e09e04051782c9 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Fri, 29 May 2026 22:29:54 +0000 Subject: [PATCH 23/40] Forkless Save Signed-off-by: Jim Brunner --- src/unit/test_bgiteration.cpp | 2661 +++++++++++++++++++++++++++++++++ src/unit/wrappers.h | 3 + 2 files changed, 2664 insertions(+) create mode 100644 src/unit/test_bgiteration.cpp diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp new file mode 100644 index 00000000000..c2caca75e25 --- /dev/null +++ b/src/unit/test_bgiteration.cpp @@ -0,0 +1,2661 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD 3-Clause + */ + + // Just for the moment, until https://github.com/valkey-io/valkey/issues/3450 is resolved +// clang-format off +#include "generated_wrappers.hpp" +#include + +using namespace ::testing; + +extern "C" { + #include "stdlib.h" + #include "bgiteration.h" + #include "server.h" + #define using usingvar // compile hack + #include "module.h" // uses "using" keyword + #undef using + extern hashtableType commandSetType; + extern dictType keylistDictType; + void bgIteration_feedIterators(void); + void createSharedObjects(void); + void hashtableDump(hashtable *ht); + void bgIteration_unitTestDisableCloning(void); + void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes); + static size_t mockHashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); +} + + +// The private data is a pointer to arbitrary data. This value is used just to +// test that the correct value is passed through. +#define PRIVDATA reinterpret_cast(12345) + +typedef int32_t bgIterationEntryMetadata; // opaque 4 bytes +static_assert(sizeof(bgIterationEntryMetadata) == BGITERATION_ENTRY_METADATA_SIZE); + +// A bgIteration cleanup function used for testing. +static int cleanupCount; +static bool cleanupTerminated; +static void iteratorCleanupFn(bool terminated, void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + cleanupCount++; + cleanupTerminated = terminated; +} + + +// A bgIteration repldone function used for testing. +static int repldoneCount; +static bool iteratorRepldoneFn(void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + repldoneCount++; + return true; +} + + +/* This mock for hashtableScan will return the items in lexical order. It assumes that the entries + * are robjs containing an sds string for the key. The key is expected to begin with a capital + * letter [A-Z]. The caller passes 0 as the cursor to start the iteration. The returned cursor + * value will indicate the prior letter returned (1=A, ...). 
After entries starting with 'Z' have + * been returned, the cursor of 0 will indicate that the scan is complete. Note that all entries + * starting with the same letter will be returned in a single call. */ +static size_t mockHashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata) { + // Just in case, if it's not one of our hashtables, use the unmocked function + bool our_ht = (server.db[0]->keys && ht == kvstoreGetHashtable(server.db[0]->keys, 0)) + || (server.db[1]->keys && ht == kvstoreGetHashtable(server.db[1]->keys, 0)); + if (!our_ht) return __real_hashtableScan(ht, cursor, fn, privdata); + + // Collect all entries from the hashtable + std::vector entries; + hashtableIterator *iter = hashtableCreateIterator(ht, 0); + dbEntry *entry; + while (hashtableNext(iter, (void **)&entry)) { + char first = objectGetKey(entry)[0]; + assert(first >= 'A' && first <= 'Z'); + entries.push_back(entry); + } + hashtableReleaseIterator(iter); + + // Sort by key lexicographically + std::sort(entries.begin(), entries.end(), [](dbEntry *a, dbEntry *b) { + return strcmp(objectGetKey(a), objectGetKey(b)) < 0; + }); + + // cursor 0 means start at 'A', otherwise start after the cursor letter + char startLetter = (char)('A' + cursor); + + // Find the first letter to emit + char emitLetter = 0; + for (dbEntry *e : entries) { + char first = objectGetKey(e)[0]; + if (first >= startLetter) { + emitLetter = first; + break; + } + } + + if (emitLetter == 0) return 0; + + // Call fn for all entries starting with emitLetter + for (dbEntry *e : entries) { + char first = objectGetKey(e)[0]; + if (first == emitLetter) fn(privdata, (void *)e); + } + + size_t nextCursor = (size_t)(emitLetter - 'A' + 1); + return (nextCursor > 25) ? 
0 : nextCursor; +} + + +static bool mockHashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor) { + // Just in case, if it's not one of our hashtables, use the unmocked function + if (ht != kvstoreGetHashtable(server.db[0]->keys, 0) + && ht != kvstoreGetHashtable(server.db[1]->keys, 0)) + return __real_hashtableScanHasPassedKey(ht, key, cursor); + + return ((const char *)key)[0] < (char)('A' + cursor); +} + + +static const char *logfile = ""; + +/* Most of the bgIteration unit tests are based on a CMD instance with 2 DBs. There are 8 keys in + * each DB. The hashtableScan function is mocked to return the keys in a predictable order. + * + * There are a number of helper functions to simulate certain key modification actions within our + * test configuration. Note that this is isolated from the actual call to processCommand. + * + * Because most of bgIteration is based on an ordered processing of keys, it doesn't matter if we + * are simulating CMD or CME, full scan, or slot-based. The majority of tests are independent of + * these concerns. + * + * However, there are some tests which are unique to these configurations and use a specialized + * derived class to handle the differences. We do not want to duplicate all of the tests for + * the different configurations, but we do want to ensure that each configuration works properly. + * - bgIterationTestCluster - handles tests unique to full scan in cluster mode + * - bgIterationTestClusterSlots - handles tests unique to cluster slot-based iteration + */ +class BgIterationTest : public ::testing::Test { + protected: + static const int DB_COUNT = 2; + static const int ITEMS_PER_DB = 8; + + private: + /* With the mock hashtableScan, we get keys in a predictable order. DB0 works with buckets + * containing groups of keys (which hashtableScan returns in a single call). DB1 returns + * each key individually, as more separate buckets.
Convention (for test readability) is + * that keys beginning [A-M] would be in DB0 and keys beginning [N-Z] in DB1. Letters are + * intentionally skipped to allow for possible insertions. */ + const char *keys[DB_COUNT][ITEMS_PER_DB] = {{"B0", "B1", "B2", "E0", "E1", "H0", "H1", "H2"}, + {"N0", "O0", "Q0", "R0", "T0", "U0", "W0", "Y0"}}; + + protected: + static const int TOTAL_ITEMS = DB_COUNT * ITEMS_PER_DB; + static const int LAST_ITEM = TOTAL_ITEMS - 1; + + MockValkey mock; + RealValkey real; + client *c = nullptr; // for general use in the tests (with common cleanup) + + struct serverCommand dummy_cmd = {0}; + + // Helper functions for accessing the keys. We can access by db(0..1) and seq(0..7) + // or by item number (0..15). + // NOTE: These virtual functions can be overridden in subclasses which may have different item layout. + virtual const char * getKeyAtDbSeq(int db, int seq) { + assert(db < DB_COUNT); + assert(seq < ITEMS_PER_DB); + return keys[db][seq]; + } + + virtual int getDbFromItemNum(int itemNum) { + assert(itemNum < DB_COUNT * ITEMS_PER_DB); + return itemNum / ITEMS_PER_DB; + } + + virtual int getSeqFromItemNum(int itemNum) { + assert(itemNum < DB_COUNT * ITEMS_PER_DB); + return itemNum % ITEMS_PER_DB; + } + + const char * keyStr(int itemNum) { + return getKeyAtDbSeq(getDbFromItemNum(itemNum), getSeqFromItemNum(itemNum)); + } + + int itemNumFromKey(const char *key) { + for (int itemNum = 0; itemNum < DB_COUNT * ITEMS_PER_DB; itemNum++) { + if (strcmp(key, keyStr(itemNum)) == 0) return itemNum; + } + return -1; + } + + + // Do some general initialization before starting the suite. Normally, the tests are run in + // isolation - and this isn't much different than SetUp(). But if running the + // entire test suite together (just manually running the test executable), this gets called + // only once. 
+ static void SetUpTestSuite() { + monotonicInit(); + + bzero(&server, sizeof(server)); + server.hz = 100; + server.logfile = const_cast(logfile); + createSharedObjects(); + + moduleInitModulesSystem(); + + server.commands = hashtableCreate(&commandSetType); + server.orig_commands = hashtableCreate(&commandSetType); + populateCommandTable(); + } + + + static void TearDownTestSuite() { + hashtableRelease(server.commands); + hashtableRelease(server.orig_commands); + } + + + void initializeServerDb(int dbid, int slot_count_bits = 0) { + server.db[dbid] = static_cast(zcalloc(sizeof(serverDb))); + server.db[dbid]->id = dbid; + server.db[dbid]->keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, 0); + server.db[dbid]->expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, 0); + server.db[dbid]->watched_keys = dictCreate(&keylistDictType); + } + + + robj *createStringObjectFromCString(const char *s) { + return createStringObject(s, strlen(s)); + } + + + void addKeyToDb(int dbid, const char *key, const char *val) { + robj *key_obj = createStringObjectFromCString(key); + robj *val_obj = createStringObjectFromCString(val); + dbAdd(server.db[dbid], key_obj, &val_obj); + decrRefCount(key_obj); + } + + + virtual void setupDatabase() { + /* For these unit tests, a standard database is constructed. 
But we will use our own + * mocked scan function to ensure a consistent iteration order */ + + server.dbnum = DB_COUNT; + server.cluster_enabled = false; + server.db = static_cast(zcalloc(sizeof(serverDb *) * server.dbnum)); + + for (int dbid = 0; dbid < server.dbnum; dbid++) { + initializeServerDb(dbid); + for (int keynum = 0; keynum < ITEMS_PER_DB; keynum++) { + addKeyToDb(dbid, keys[dbid][keynum], keys[dbid][keynum]); + } + } + + EXPECT_CALL(mock, hashtableScan(_,_,_,_)).WillRepeatedly(Invoke(mockHashtableScan)); + EXPECT_CALL(mock, hashtableScanHasPassedKey(_,_,_)).WillRepeatedly(Invoke(mockHashtableScanHasPassedKey)); + + if (0) debugPrintBucketInfo(); + } + + + void SetUp() override { + server.main_thread_id = pthread_self(); + server.forkless_options_supported = 1; + objectSetMetadataSize(BGITERATION_ENTRY_METADATA_SIZE); + + bgIteration_unitTestDisableCloning(); + + setupDatabase(); + + EXPECT_CALL(mock, aeCreateTimeEvent(_,_,_,_,_)).WillRepeatedly(Return(0)); + bgIteration_init(); + + cleanupCount = 0; + repldoneCount = 0; + + // By default, do nothing for these + EXPECT_CALL(mock, blockClientInUseOnKeys(_,_,_)).WillRepeatedly(Return()); + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).WillRepeatedly(Return()); + + // By default, expect no permission issues + EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_,_,_,_,_,_)).WillRepeatedly(Return(ACL_OK)); + } + + + void TearDown() override { + bgIteration_feedIterators(); // process returning stuff before deleting DB + bgIteration_feedIterators(); // in case an iterator was closed there might be more + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i]->keys) kvstoreRelease(server.db[i]->keys); + if (server.db[i]->expires) kvstoreRelease(server.db[i]->expires); + dictRelease(server.db[i]->watched_keys); + zfree(server.db[i]); + } + zfree(server.db); + + if (c != NULL) freeTestClient(c); + } + + + // Deletes an item from the DB (often at the start of a test) - but does NOT notify + // bgIteration. 
bgIteration_keyDelete() should be explicitly called where needed. + void simpleDelItem(int itemNum) { + int db = getDbFromItemNum(itemNum); + + sds delKey = sdsnew(keyStr(itemNum)); + int rc = kvstoreHashtableDelete(server.db[db]->keys, 0, delKey); + ASSERT_EQ(rc, 1); + sdsfree(delKey); + } + + + // Find the actual dbEntry object by itemNum + dbEntry * getItem(int itemNum) { + int db = getDbFromItemNum(itemNum); + sds key = sdsnew(keyStr(itemNum)); + dbEntry *de = dbFind(server.db[db], key); + sdsfree(key); + return de; + } + + + // The test expects that the next item read will be BGITERATOR_ITEM_COMPLETE + void expectReadComplete(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + EXPECT_EQ(item->type, BGITERATOR_ITEM_COMPLETE); + bgIteratorClose(iter); + + int oldCleanupCount = cleanupCount; + bgIteration_feedIterators(); + EXPECT_EQ(cleanupCount, oldCleanupCount + 1); + } + + + // The test is cleaning up and isn't validating the remaining cleanup + void expectAnythingCleanup(bgIterator *iter) { + while (true) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + if ((item->type == BGITERATOR_ITEM_COMPLETE + || item->type == BGITERATOR_ITEM_TERMINATED)) { + bgIteratorClose(iter); + break; + } + } + bgIteration_feedIterators(); // Recognize the closed iterator + EXPECT_EQ(cleanupCount, 1); + } + + + void expectDictEntryMetadataMatch(dbEntry *de1, dbEntry *de2) { + bgIterationEntryMetadata *dm1 = static_cast(objectGetMetadata(de1)); + bgIterationEntryMetadata *dm2 = static_cast(objectGetMetadata(de2)); + + EXPECT_NE(dm1, nullptr); + EXPECT_NE(dm2, nullptr); + EXPECT_EQ(*dm1, *dm2); + } + + + // Useful when debugging new tests. It reads/prints all remaining items then crashes. 
+ void cleanupIteratorDebugPrint(bgIterator *iter) { + bool done = false; + printf("[DEBUG] Printing bgIterator '%s' items:\n", bgIteratorName(iter)); + while (!done) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + switch (item->type) { + case BGITERATOR_ITEM_DBENTRY: + { + auto obj = item->u.dbe.de; + const char * keyStr = objectGetKey(obj); + printf("Entry: %s -> %s [itemNum: %i]\n", + keyStr, + static_cast(objectGetVal(obj)), + itemNumFromKey(keyStr)); + break; + } + case BGITERATOR_ITEM_REPLICATION: + printf("Repl: DB=%d : ", item->dbid); + for (int i = 0; i < item->u.repl.argc; i++) + printf("%s ", static_cast(objectGetVal(item->u.repl.argv[i]))); + printf("\n"); + break; + case BGITERATOR_ITEM_COMPLETE: + case BGITERATOR_ITEM_TERMINATED: + bgIteratorClose(iter); + done = true; + break; + default: + printf("unhandled: %d\n", item->type); + } + } + bgIteration_feedIterators(); // Recognize the closed iterator + ASSERT_TRUE(false); // Halt the test here + } + + + // Make a copy of the metadata + void * cloneMetadata(dbEntry *de) { + int size = objectGetMetadataSize(de); + void *metadata = zmalloc(size); + memcpy(metadata, objectGetMetadata(de), size); + return metadata; + } + + + // Compare a previous metadata copy to an existing entry + void compareAndFreeClonedMetadata(dbEntry *de, void *metadata) { + EXPECT_EQ(memcmp(objectGetMetadata(de), metadata, objectGetMetadataSize(de)), 0); + zfree(metadata); + } + + + // The test expects the next item will be a specific key + // The item value is verified against the default unless provided as a parameter. 
+ void expectReadKey(bgIterator *iter, int itemNum, const char *value=nullptr) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_FALSE(item->u.dbe.is_cloned); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); + if (value) { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } else { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); + } + } + + + // The test expects the next item will be a specific key and that the item is cloned. + // Metadata is tested (to make sure the clone includes the proper metadata). + // The item value is verified against the default unless provided as a parameter. + void expectReadClonedKey(bgIterator *iter, int itemNum, void *metadata, const char *value=nullptr) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_TRUE(item->u.dbe.is_cloned); + compareAndFreeClonedMetadata(item->u.dbe.de, metadata); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); + if (value) { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } else { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); + } + } + + + // Test expects the next key, but specified by key name, not itemNum.
+ void expectReadDbKeyValue(bgIterator *iter, int db, const char *key, const char *value) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), key); + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } + + + // Test expects to read a sequence of key items + void expectReadKeySequence(bgIterator *iter, int startItem, int endItem) { + for (int i = startItem; i <= endItem; i++) expectReadKey(iter, i); + } + + + // Just like expectReadKey, but also tests that a previous item is becoming unblocked. + void expectReadKeyWithUnblock(bgIterator *iter, int itemNum, int unblockItem, const char *value=nullptr) { + bool blocked = true; + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(unblockItem)))) + .WillOnce(Assign(&blocked, false)); + expectReadKey(iter, itemNum, value); + EXPECT_FALSE(blocked); + } + + + // Test expects to read a replication item matching the command held by client 'c' + void expectReadReplication(bgIterator *iter, client *c) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->dbid, c->db->id); + EXPECT_EQ(item->u.repl.cmd, c->cmd); + EXPECT_EQ(item->u.repl.argc, c->argc); + for (int i = 0; i < c->argc; i++) { + EXPECT_STREQ(static_cast(objectGetVal(item->u.repl.argv[i])), + static_cast(objectGetVal(c->argv[i]))); + } + } + + + // We expect to read a MULTI command which should have been inserted.
+ void expectReadMultiReplication(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("multi")); + } + + + // We expect to read an EXEC command which should have been inserted. + void expectReadExecReplication(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("exec")); + } + + + // Expecting that a DEL command should have been replicated. + void expectReadReplicationDel(bgIterator *iter, int itemNum) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->dbid, db); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("DEL")); + EXPECT_EQ(item->u.repl.argc, 2); + EXPECT_THAT(item->u.repl.argv[0], robjEqualsStr("DEL")); + EXPECT_THAT(item->u.repl.argv[1], robjEqualsStr(keyStr(itemNum))); + } + + + // Expecting that a special SWAPDB item has been inserted. 
+    void expectReadSwapDB(bgIterator *iter, int db1, int db2) {
+        bgIteration_feedIterators();
+        bgIteratorItem *item = bgIteratorRead(iter);
+        bgIteration_feedIterators();
+
+        // Fail the test cleanly rather than dereferencing a null item.
+        ASSERT_NE(item, nullptr);
+        ASSERT_EQ(item->type, BGITERATOR_ITEM_SWAPDB);
+        EXPECT_EQ(item->dbid, db1);
+        EXPECT_EQ(item->u.dbid2, db2);
+    }
+
+
+    // hashtableScan callback used by debugPrintBucketInfo(): prints the key of one entry.
+    static void debugPrintBucketInfoCb(void *privdata, void *entry) {
+        UNUSED(privdata);
+        dbEntry *de = (dbEntry *)entry;
+        printf("--- %s\n", objectGetKey(de));
+    }
+
+    // Debugging aid: dumps the keys of every hashtable of every DB. A separator line is
+    // printed after each hashtableScan() call. The helper always ends with a failing
+    // assertion, so a test that calls it is reported as failed.
+    void debugPrintBucketInfo() {
+        printf("*******DEBUG*******\n");
+        for (int db = 0; db < server.dbnum; db++) {
+            int num_ht = kvstoreNumHashtables(server.db[db]->keys);
+            for (int slot = 0; slot < num_ht; slot++) {
+                hashtable *ht = kvstoreGetHashtable(server.db[db]->keys, slot);
+                if (!ht) continue;
+
+                printf("DB: %d, slot: %d\n", db, slot);
+                size_t cursor = 0;
+                do {
+                    cursor = hashtableScan(ht, cursor, debugPrintBucketInfoCb, NULL);
+                    printf("-----------\n");
+                } while (cursor != 0);
+            }
+        }
+        ASSERT_TRUE(false);
+    }
+
+
+    // Creates a client with a write command (SET) for the given itemNum
+    // The returned client is zero-initialised apart from cmd, db, argc and argv.
+    client *getWriteClient(int itemNum, const char *value) {
+        int db = getDbFromItemNum(itemNum);
+
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+
+        c->cmd = lookupCommandByCString("set");
+        c->db = server.db[db];
+
+        c->argc = 3;
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+        c->argv[0] = createStringObjectFromCString(c->cmd->fullname);
+        c->argv[1] = createStringObjectFromCString(keyStr(itemNum));
+        c->argv[2] = createStringObjectFromCString(value);
+
+        return c;
+    }
+
+
+    // Create a client with a write command that touches multiple keys
+    // argv is laid out as: <cmdName> <dst key> <src key>... All items must live in the same
+    // DB (a mismatch is a testcase error and trips an assert).
+    client *getWriteMultiKeysClient(
+            const char * cmdName,
+            int dstItemNum,
+            const std::vector<int> & srcItemsNum) {
+
+        assert(!srcItemsNum.empty());
+
+        const int db = getDbFromItemNum(dstItemNum);
+        std::for_each(srcItemsNum.cbegin(), srcItemsNum.cend(), [&db, this](int srcItemNum) {
+            assert(db == getDbFromItemNum(srcItemNum));
+        });
+
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+
+        c->cmd = lookupCommandByCString(cmdName);
+        assert(c->cmd != nullptr);
+        c->db = server.db[db];
+
+        // Explicit cast: argc is an int while size() is unsigned.
+        c->argc = 2 + static_cast<int>(srcItemsNum.size());
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+        c->argv[0] = createStringObjectFromCString(c->cmd->fullname);
+        c->argv[1] = createStringObjectFromCString(keyStr(dstItemNum));
+        for (unsigned int i = 0; i < srcItemsNum.size(); i++) {
+            c->argv[2 + i] = createStringObjectFromCString(keyStr(srcItemsNum[i]));
+        }
+
+        return c;
+    }
+
+
+    // Convenience wrapper: write command with one destination key and one source key.
+    client *getWrite2KeysClient(const char * cmdName, int dstItemNum, int srcItemNum) {
+        return getWriteMultiKeysClient(cmdName, dstItemNum, {srcItemNum});
+    }
+
+
+    // Convenience wrapper: write command with one destination key and two source keys.
+    client *getWrite3KeysClient(
+            const char * cmdName, int dstItemNum, int src1ItemNum, int src2ItemNum) {
+        return getWriteMultiKeysClient(cmdName, dstItemNum, {src1ItemNum, src2ItemNum});
+    }
+
+
+    // Create a client with a MULTI/EXEC block.
+    // This parses a series of commands separated by ';'
+    // Example: getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx")
+    // Each command may have at most 5 tokens (command + 4 args); longer commands trip an
+    // assert instead of overflowing argv. Empty segments (e.g. after a trailing ';') are
+    // skipped. On return the client is set up to execute EXEC.
+    client *getMultiClient(const char *commands, int dbid = 0) {
+        const int maxArgc = 5;  // command + 4 args
+        char *commandsCopy = zstrdup(commands);  // a mutable copy (strtok_r modifies its input)
+        char *commandStr, *commandStrSave;
+        char *token, *tokenSave;
+
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+        c->db = server.db[dbid];
+        initClientMultiState(c);
+        c->flag.multi = 1;
+        c->mstate->cmd_flags |= CMD_WRITE;
+
+        commandStr = strtok_r(commandsCopy, ";", &commandStrSave);
+        while (commandStr != NULL) {
+
+            token = strtok_r(commandStr, " ", &tokenSave);
+            if (token == NULL) {
+                // Blank segment: nothing to queue, and a NULL name must not reach the lookup.
+                commandStr = strtok_r(NULL, ";", &commandStrSave);
+                continue;
+            }
+            c->cmd = lookupCommandByCString(token);
+
+            c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * maxArgc));
+
+            for (int i = 0; token != NULL; i++) {
+                assert(i < maxArgc);  // argv has a fixed capacity
+                c->argv[i] = createStringObjectFromCString(token);
+                c->argc = i+1;
+                token = strtok_r(NULL, " ", &tokenSave);
+            }
+
+            queueMultiCommand(c, 0);
+            freeClientArgv(c);
+
+            commandStr = strtok_r(NULL, ";", &commandStrSave);
+        }
+
+        c->cmd = lookupCommandByCString("exec");
+        c->argc = 1;
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+        c->argv[0] = createStringObjectFromCString("EXEC");
+
+        zfree(commandsCopy);
+        return c;
+    }
+
+
+    // Initially, a MULTI client is set up to execute the EXEC command (which examines the
+    // contents of the multi/exec block). This function advances the client to begin executing
+    // the individual commands within the multi/exec block.
+    // Note that argv is aliased from the queued command, not copied.
+    void advanceMultiClientToCommand(client *c, int cmdNum) {
+        assert(cmdNum >= 0 && cmdNum < c->mstate->count);
+        c->argc = c->mstate->commands[cmdNum].argc;
+        c->argv = c->mstate->commands[cmdNum].argv;
+        c->argv_len = c->mstate->commands[cmdNum].argv_len;
+        c->cmd = c->realcmd = c->mstate->commands[cmdNum].cmd;
+    }
+
+
+    // A client with a fictional command:
+    //      SETGET <key1> <value1> <key2>
+    //  - writes a value to the first key (making this CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY)
+    //  - reads a second key
+    // The command struct is heap-allocated here.
+    client *getSetGetClient(int itemNum1, const char *value1, int itemNum2) {
+        // Fictional command which writes to 1st key and reads the 2nd
+        int db = getDbFromItemNum(itemNum1);
+        assert(db == getDbFromItemNum(itemNum2));   // (this would be a testcase error)
+
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+        struct serverCommand *cmd
+                = static_cast<struct serverCommand *>(zcalloc(sizeof(struct serverCommand)));
+
+        cmd->fullname = const_cast<char *>("SETGET");
+        cmd->arity = 4;
+        cmd->flags = CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY;
+
+        cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INDEX;
+        cmd->legacy_range_key_spec.bs.index.pos = 1;            // firstkey
+        cmd->legacy_range_key_spec.fk.range.lastkey = -1;
+        cmd->legacy_range_key_spec.fk.range.keystep = 2;
+
+        c->cmd = cmd;
+        c->db = server.db[db];
+
+        c->argc = 4;
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+        c->argv[0] = createStringObjectFromCString(cmd->fullname);
+        c->argv[1] = createStringObjectFromCString(keyStr(itemNum1));
+        c->argv[2] = createStringObjectFromCString(value1);
+        c->argv[3] = createStringObjectFromCString(keyStr(itemNum2));
+
+        return c;
+    }
+
+
+    // Client with a fictional write command with no keys specified
+    client *getNoKeysWriteClient() {
+        // Fictional command which is marked WRITE, but has no keys.
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+        struct serverCommand *cmd
+                = static_cast<struct serverCommand *>(zcalloc(sizeof(struct serverCommand)));
+
+        cmd->fullname = const_cast<char *>("NOKEYSWRITE");
+        cmd->arity = 1;
+        cmd->flags = CMD_WRITE;
+
+        cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INVALID; // No keys
+
+        c->cmd = cmd;
+        c->db = server.db[0];
+
+        c->argc = 1;
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+        c->argv[0] = createStringObjectFromCString(cmd->fullname);
+
+        return c;
+    }
+
+
+    // Releases every argument object and the argv array, leaving the client with no args.
+    void freeClientArgv(client *c) {
+        for (int i = 0; i < c->argc; i++) decrRefCount(c->argv[i]);
+        zfree(c->argv);
+        c->argv = NULL;
+        c->argc = 0;
+    }
+
+
+    // During testing, we create some fake commands.  This checks if the command is real or fake.
+    // A fake command is dynamically allocated and can be freed.  Real commands are static.
+    // The fake commands built above only set `fullname`, so their declared_name is NULL
+    // (zcalloc); a NULL command or NULL declared_name is reported as "not real" without
+    // being passed to the lookup.
+    bool isRealValkeyCommand(struct serverCommand *cmd) {
+        if (cmd == nullptr || cmd->declared_name == nullptr) return false;
+        return lookupCommandByCString(cmd->declared_name) != nullptr;
+    }
+
+
+    // Frees a client created by one of the get*Client() helpers, including its multi state,
+    // its arguments and (for fake commands only) the command struct itself.
+    void freeTestClient(client *c) {
+        freeClientMultiState(c);
+        freeClientArgv(c);
+
+        if (!isRealValkeyCommand(c->cmd)) zfree(c->cmd);
+
+        zfree(c);
+    }
+
+
+    // Simulate what happens when a write command is blocked
+    // Expects exactly one call to blockClientInUseOnKeys() with the given number of keys.
+    void simulateBlockedWrite(client *c, int expectedNumberBlockedKeys = 1) {
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c,expectedNumberBlockedKeys,_)).Times(1);
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_TRUE(blocked);
+    }
+
+
+    // Simulate what happens when a write command isn't blocked
+    // Expects that blockClientInUseOnKeys() is never called for this client.
+    void simulateUnblockedWrite(client *c) {
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0);
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_FALSE(blocked);
+    }
+
+
+    // Simulate what happens when a write command is NOT blocked, because the key can be cloned
+    // and expedited.
+    // This requires a scenario where we would normally need to block the
+    // client so that bgIteration can process the item.
+    void simulateClonedWrite(bgIterator *it, client *c) {
+        bgIteratorStatus status;
+        bgIteratorGetStatus(it, &status);
+        unsigned long initialClones = status.dbentry_clones_queued;
+
+        // Client should not get blocked
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0);
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_FALSE(blocked);
+
+        // Ensure that cloning took place
+        bgIteratorGetStatus(it, &status);
+        EXPECT_EQ(status.dbentry_clones_queued, (initialClones + 1));
+
+        // Ensure that the real item isn't inuse (because we cloned it instead)
+        dbEntry *de = dbFind(c->db, static_cast<sds>(objectGetVal(c->argv[1])));
+        ASSERT_NE(de, nullptr);
+        ASSERT_FALSE(bgIteration_isEntryInuse(de));
+    }
+
+
+    // Simulates what happens when a write command (SET) actually executes.  This requires a
+    // scenario where we would NOT be blocked on the write.  It actually alters the value of
+    // the key and updates the metadata.
+    void simulateUnblockedWriteWithModification(client *c) {
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0);
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_FALSE(blocked);
+
+        // Fake execution of the command - touch the iterator_epoch counter and swap the value
+        // We need to duplicate the value because setKey() can reallocate it.
+        robj *value = dupStringObject(c->argv[2]);
+        setKey(c, c->db, c->argv[1], &value, SETKEY_ADD_OR_UPDATE);
+
+        // Let's make sure that setKey updated the iteration epoch (as it should have)
+        dbEntry *de = dbFind(c->db, static_cast<sds>(objectGetVal(c->argv[1])));
+        ASSERT_NE(de, nullptr);  // the key was just written, so it must be found
+        bgIterationEntryMetadata *md = static_cast<bgIterationEntryMetadata *>(objectGetMetadata(de));
+        bgIterationEntryMetadata md_after_setkey = *md;
+        // Now update the md again, and it should still match
+        bgIteration_dbEntryModified(de);
+        EXPECT_EQ(md, objectGetMetadata(de)); // the md location shouldn't have changed
+        EXPECT_EQ(md_after_setkey, *md); // the md value should still be the same
+
+        bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv);
+    }
+
+
+    // Simulate execution of a MULTI/EXEC transaction for a client `c` without blocking.
+    // It replays all queued commands and ensures replication matches a real transaction.
+    // The command replication flag is revalidated when the EXEC command is processed.
+    // This requires a scenario where we don't expect the client to be blocked.
+    void simulateUnblockedMultiExec(client *c) {
+
+        // simulate EXEC command of the multi/exec client
+        simulateUnblockedWrite(c);
+        server.in_exec = 1;
+
+        // For each queued command, call both blockClientIfRequired and handleCommandReplication.
+        for (int i = 0; i < c->mstate->count; i++) {
+            advanceMultiClientToCommand(c, i);
+            simulateUnblockedWrite(c);
+
+            // Replicate MULTI if this is the first instruction inside MULTI/EXEC
+            if (i == 0) {
+                robj *multiArgv[1];
+                multiArgv[0] = createStringObjectFromCString("multi");
+                bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("multi"), 1, multiArgv);
+                decrRefCount(multiArgv[0]);
+            }
+            bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv);
+        }
+
+        // Call handleCommandReplication for EXEC
+        robj *execArgv[1];
+        execArgv[0] = createStringObjectFromCString("EXEC");
+        bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("exec"), 1, execArgv);
+        server.in_exec = 0;
+        decrRefCount(execArgv[0]);
+    }
+
+
+    // Simulate the expiration (active expiration) of a key.  This is independent of command execution.
+    void simulateExpiration(int itemNum) {
+        ASSERT_NE(getItem(itemNum), nullptr);   // Should be there before expire
+
+        // NOTE: This seems weird, but Valkey propagates the delete before actually expiring the
+        //       key.  BgIterator expects this behavior and expects the key to exist when the
+        //       DEL is received for propagation.
+
+        // Send bgIteration the DEL
+        int db = getDbFromItemNum(itemNum);
+        sds sdsKey = sdsnew(keyStr(itemNum));
+        robj *argv[2];
+        argv[0] = createStringObjectFromCString("DEL");
+        argv[1] = createStringObjectFromCString(sdsKey);
+        serverCommand *cmd = lookupCommandByCString("DEL");
+        bgIteration_handleCommandReplication(db, cmd, 2, argv);
+        decrRefCount(argv[0]);
+        decrRefCount(argv[1]);
+
+        bgIteration_keyDelete(db, sdsKey);
+        simpleDelItem(itemNum);     // Simulate the actual del
+
+        EXPECT_EQ(getItem(itemNum), nullptr);
+        sdsfree(sdsKey);
+    }
+
+
+    // Simulates an expiration, but validates behavior for an item inuse by bgIteration.
+    void simulateExpirationOfInuse(int itemNum) {
+        // An inuse item will have a refcount > 1.  BgIteration should have incremented the
+        // refcount while it is inuse.
+        dbEntry *de = getItem(itemNum);
+        ASSERT_NE(de, nullptr);   // Should be there before expire
+        EXPECT_TRUE(bgIteration_isEntryInuse(de));
+        EXPECT_EQ(de->refcount, 2u);
+
+        simulateExpiration(itemNum);
+
+        // At this point, the item is removed from the DB, but still exists, and the refcount
+        // has been reduced to 1.  This allows a background thread to continue using the item.
+        EXPECT_EQ(de->refcount, 1u);
+    }
+
+
+    // Simulates an expiration, but the item is a future item which will be expedited.
+    void simulateExpirationWithExpedite(int itemNum) {
+        // A future item has not been picked up by bgIteration yet: it is not inuse and its
+        // refcount is still 1.
+        dbEntry *de = getItem(itemNum);
+        ASSERT_NE(de, nullptr);   // Should be there before expire
+        EXPECT_FALSE(bgIteration_isEntryInuse(de)); // Not yet inuse
+        EXPECT_EQ(de->refcount, 1u);
+
+        simulateExpiration(itemNum);
+
+        // At this point, the item has been expedited: it is removed from the DB but still
+        // exists, is now marked inuse, and its refcount is still 1.
+        EXPECT_TRUE(bgIteration_isEntryInuse(de)); // It's inuse now
+        EXPECT_EQ(getItem(itemNum), nullptr);      // but it's not in the DB anymore
+        EXPECT_EQ(de->refcount, 1u);
+    }
+
+
+    // Simulate execution of a SWAPDB command
+    // The db ids are rendered as single-character strings, so both must be in [0, 9].
+    void simulateSwapDB(int dbid0, int dbid1) {
+        assert(dbid0 >= 0 && dbid0 <= 9);
+        assert(dbid1 >= 0 && dbid1 <= 9);
+        char dbStr[2] = {0};
+
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+
+        c->cmd = lookupCommandByCString("swapdb");
+        c->db = server.db[0];
+
+        c->argc = 3;
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+        c->argv[0] = createStringObjectFromCString(c->cmd->fullname);
+        dbStr[0] = '0' + dbid0;
+        c->argv[1] = createStringObjectFromCString(dbStr);
+        dbStr[0] = '0' + dbid1;
+        c->argv[2] = createStringObjectFromCString(dbStr);
+
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_FALSE(blocked);  // SWAPDB should never block
+
+        // The real SWAP does more than this, but this is enough for unit tests
+        serverDb *aux = server.db[dbid0];
+        server.db[dbid0] = server.db[dbid1];
+        server.db[dbid1] = aux;
+
+        bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv);
+
+        freeTestClient(c);
+    }
+
+
+    // Simulate execution of a FLUSHDB or FLUSHALL command
+    // db == -1 simulates FLUSHALL; any other value simulates FLUSHDB on that DB.
+    // `anInUseItem` must name an item that is currently inuse (refcount 2); the helper checks
+    // that the flush drops its refcount to 1.
+    void simulateFlushDB(int db, int anInUseItem) {
+        // Look the item up first so a missing item fails before anything is allocated.
+        dbEntry *de_in_use = getItem(anInUseItem);
+        ASSERT_NE(de_in_use, nullptr);
+
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+
+        if (db == -1) {
+            c->cmd = lookupCommandByCString("flushall");
+            c->db = server.db[0];
+        } else {
+            c->cmd = lookupCommandByCString("flushdb");
+            c->db = server.db[db];
+        }
+
+        c->argc = 1;
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj *) * c->argc));
+        c->argv[0] = createStringObjectFromCString(c->cmd->fullname);
+
+        EXPECT_EQ(de_in_use->refcount, 2u);
+
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_FALSE(blocked);  // FLUSHDB should never block
+
+        // The real FLUSH does more than this, but this is enough for unit tests
+
+        // Now flush the items
+        for (int d = 0; d < server.dbnum; d++) {
+            if (db == -1 || db == d) {
+                kvstoreRelease(server.db[d]->keys);
+                server.db[d]->keys = NULL;
+            }
+        }
+
+        EXPECT_EQ(de_in_use->refcount, 1u);
+
+        // and replicate
+
+        bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv);
+
+        freeTestClient(c);
+    }
+};
+
+
+TEST_F(BgIterationTest, dbIsOK) {
+    // Just run the setup/teardown code to make sure the DB is OK.
+}
+
+
+/////////////////////////////////////////////////////
+// Simple Full-scan iterator tests
+/////////////////////////////////////////////////////
+
+// A simple full scan that just checks basic flow.
+TEST_F(BgIterationTest, createAndCleanup) {
+    bgIterator *it = bgIteratorCreateFullScanIter("simple",
+            BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA);
+    EXPECT_EQ(bgIteratorFind("simple"), it);
+    EXPECT_STREQ(bgIteratorName(it), "simple");
+
+    bgIteratorStatus status;
+    bgIteratorGetStatus(it, &status);
+
+    // Nothing has been fed or read yet, so every counter starts at zero.
+    EXPECT_EQ(status.dbentries_queued, 0u);
+    EXPECT_EQ(status.dbentries_processed, 0u);
+    EXPECT_EQ(status.replication_queued, 0u);
+    EXPECT_EQ(status.replication_processed, 0u);
+    EXPECT_EQ(status.swapdb_queued, 0u);
+    EXPECT_EQ(status.swapdb_processed, 0u);
+    EXPECT_EQ(status.flushdb_queued, 0u);
+    EXPECT_EQ(status.flushdb_processed, 0u);
+
+    EXPECT_EQ(status.queue_length, 0u);
+    EXPECT_GT(status.queue_length_target, 0u);
+
+    EXPECT_LT(status.runtime_ms, 5u);
+    EXPECT_EQ(status.current_item_ms, 0u);
+
+    expectAnythingCleanup(it);
+
+    EXPECT_EQ(bgIteratorFind("simple"), nullptr);
+}
+
+
+// Close client before reading anything
+TEST_F(BgIterationTest, testClientCloseBeforeRead) {
+    bgIterator *it = bgIteratorCreateFullScanIter("simple",
+            BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA);
+    bgIteration_feedIterators();
+
+    bgIteratorClose(it);    // Immediately close before reading
+
+    bgIteration_feedIterators();    // Recognize the closed iterator
+
+    // Check that the cleanup callback was executed properly
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_TRUE(cleanupTerminated);
+}
+
+
+// Test that the full scan hits each item in the expected
+// sequence.
+TEST_F(BgIterationTest, orderedIteration) {
+    bgIterator *it = bgIteratorCreateFullScanIter("simple",
+            BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA);
+
+    expectReadKeySequence(it, 0, LAST_ITEM);
+
+    // Quick status check.  At this point, the final item hasn't been returned yet.
+    // NOTE(review): the cast target was lost in this copy of the patch; unsigned long is
+    // assumed because status counters are read into unsigned long elsewhere - confirm.
+    bgIteratorStatus status;
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentries_queued, static_cast<unsigned long>(TOTAL_ITEMS));
+    EXPECT_EQ(status.dbentries_processed, static_cast<unsigned long>(TOTAL_ITEMS) - 1);
+
+    expectReadComplete(it); // Returns the final item, and reads the completion item
+
+    // Check that the cleanup callback was executed properly
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_FALSE(cleanupTerminated);
+}
+
+
+// Test that two simultaneous iterations work properly.
+TEST_F(BgIterationTest, twoOrderedIterations) {
+    bgIterator *it1 = bgIteratorCreateFullScanIter("simple1",
+            BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA);
+    bgIterator *it2 = bgIteratorCreateFullScanIter("simple2",
+            BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA);
+    EXPECT_EQ(bgIteratorFind("simple1"), it1);
+    EXPECT_EQ(bgIteratorFind("simple2"), it2);
+
+    // Interleave the two iterators until each has returned every item in order.
+    int it1Count = 0;
+    int it2Count = 0;
+    while (it1Count < TOTAL_ITEMS || it2Count < TOTAL_ITEMS) {
+        // Randomly read from either iterator
+        if ((rand() % 2) == 0) {
+            if (it1Count < TOTAL_ITEMS) expectReadKey(it1, it1Count++);
+        } else {
+            if (it2Count < TOTAL_ITEMS) expectReadKey(it2, it2Count++);
+        }
+    }
+
+    // Nothing left but to read the final completions
+    expectReadComplete(it1);
+    EXPECT_EQ(cleanupCount, 1);
+    EXPECT_FALSE(cleanupTerminated);
+    expectReadComplete(it2);
+    EXPECT_EQ(cleanupCount, 2);
+    EXPECT_FALSE(cleanupTerminated);
+}
+
+
+/////////////////////////////////////////////////////
+// MODIFY A FUTURE ITEM
+// The next tests validate the basic pattern when a key, not yet iterated, is modified.
+// Each variation of iteration flags is tested.
+// Note that these tests execute without cloning (cloning is tested elsewhere).
+/////////////////////////////////////////////////////
+
+// Modify a future item, without replication or consistency.
+// Our expectation for this case is that the modification should proceed without blocking, the item
+// shouldn't be expedited, and we will see the modified item once the iterator reaches it.
+TEST_F(BgIterationTest, modFutureItem) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    c = getWriteClient(6, "xxx");
+
+    // We DON'T expect the client to be blocked - not consistent
+    simulateUnblockedWriteWithModification(c);
+
+    // Now continue reading, 1, 2, 3, 4, 5
+    expectReadKeySequence(it, 1, 5);
+
+    // Let's validate that key 6 shows the new value
+    expectReadKey(it, 6, "xxx");
+
+    // Continue...
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a future item, without replication but with consistency.  (Like a SAVE operation)
+// Our expectation for this case is that the modification SHOULD be blocked, as we have to save
+// the item in its state before the modification.  To reduce blocking time, the item should be
+// moved to the head of the queue - there's no replication in this case, so out-of-order processing
+// isn't a concern.
+TEST_F(BgIterationTest, modFutureItem_start) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    c = getWriteClient(6, "xxx");
+    // Since this is consistent, we will block the client, disallowing the write.
+    simulateBlockedWrite(c);
+
+    // On a consistent iterator, the item is expedited in front of items already in queue!
+    // Read key 6 out of order.
+    expectReadKey(it, 6);
+
+    // Now, when we read key 1, key 6 is released back to Valkey, and the client will be unblocked.
+    expectReadKeyWithUnblock(it, 1, 6);
+    simulateUnblockedWriteWithModification(c);  // Now the write can proceed
+
+    // Continue...
+    expectReadKeySequence(it, 2, 5);
+    // 6 has already been processed
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a future item, with replication but without consistency.  (Like a Threadsave Full Sync operation)
+// Our expectation for this case is that the modification should proceed without blocking, as the
+// mode is inconsistent.  We don't expect replication, as we haven't reached the item yet.  We'll
+// see the modified item later.
+TEST_F(BgIterationTest, modFutureItem_eventual) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    c = getWriteClient(6, "xxx");
+
+    // We DON'T expect the client to be blocked - not consistent
+    simulateUnblockedWriteWithModification(c);
+
+    // NOTE: Since we haven't reached this item yet, and consistency is not required, there's no
+    //       need to replicate this command.  So everything should wrap up just fine - we will see
+    //       the new value when we get to it.
+
+    // Now continue reading, 1, 2, 3, 4, 5
+    expectReadKeySequence(it, 1, 5);
+
+    // Let's validate that key 6 shows the new value
+    expectReadKey(it, 6, "xxx");
+
+    // Continue...
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/////////////////////////////////////////////////////
+// MODIFY A CURRENT ITEM
+// The next tests validate the basic pattern when a key, currently in use, is modified.
+// Each variation of iteration flags is tested.
+// Note that these tests execute without cloning (cloning is tested elsewhere).
+/////////////////////////////////////////////////////
+
+// Modify a current item, without replication or consistency.
+// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't
+// be expedited (it's already in use).
+TEST_F(BgIterationTest, modCurrentItem) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    c = getWriteClient(2, "xxx");
+
+    // Must be blocked since key is queued
+    simulateBlockedWrite(c);
+
+    // Now continue reading.  The client is unblocked on key 2 during the read of key 3.
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKeyWithUnblock(it, 3, 2);
+    simulateUnblockedWriteWithModification(c);  // the actual write won't affect anything (past key, no replication)
+
+    // Continue...
+    expectReadKeySequence(it, 4, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a current item, without replication but with consistency.  (Like a SAVE operation)
+// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't
+// be expedited (it's already in use).
+TEST_F(BgIterationTest, modCurrentItem_start) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    c = getWriteClient(2, "xxx");
+
+    // Must be blocked since key is queued
+    simulateBlockedWrite(c);
+
+    // Now continue reading.  The client is unblocked on key 2 during the read of key 3.
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKeyWithUnblock(it, 3, 2);
+    simulateUnblockedWriteWithModification(c);  // the actual write won't affect anything (past key, no replication)
+
+    // Continue...
+    expectReadKeySequence(it, 4, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a current item, with replication but without consistency.  (Like a Threadsave Full Sync operation)
+// Our expectation for this case is that the modification SHOULD be blocked.  After the key is processed,
+// the write will proceed, and the replication will be sent.
+TEST_F(BgIterationTest, modCurrentItem_eventual) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    c = getWriteClient(2, "xxx");
+
+    // Must be blocked since key is queued
+    simulateBlockedWrite(c);
+
+    // Now continue reading.  The client is unblocked on key 2 during the read of key 3.
+    expectReadKey(it, 1);
+    expectReadKey(it, 2);
+    expectReadKeyWithUnblock(it, 3, 2);
+    simulateUnblockedWriteWithModification(c);  // the actual write will cause replication
+
+    expectReadKey(it, 4);   // 4 got put in queue when 3 was read
+
+    expectReadReplication(it, c);
+
+    // Continue...
+    expectReadKeySequence(it, 5, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/////////////////////////////////////////////////////
+// MODIFY A PAST ITEM
+// The next tests validate the basic pattern when a key, already iterated over, is modified.
+// Each variation of iteration flags is tested.
+// Note that these tests execute without cloning (cloning is tested elsewhere).
+/////////////////////////////////////////////////////
+
+// Modify a past item, without replication or consistency.
+// Our expectation for this case is that the modification should proceed without blocking.
+// No replication is generated and keys are processed similar to no modification.
+TEST_F(BgIterationTest, modPastItem) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // This read returns key 0 (making it a past item)
+    expectReadKey(it, 1);
+
+    // At this point, key 0 is returned.
+    c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // Continue...
+    expectReadKeySequence(it, 2, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a past item, without replication but with consistency.  (Like a SAVE operation)
+// Our expectation for this case is that the modification should proceed without blocking.
+// No replication is generated and keys are processed similar to no modification.
+TEST_F(BgIterationTest, modPastItem_start) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // This read returns key 0 (making it a past item)
+    expectReadKey(it, 1);
+
+    // At this point, key 0 is returned.
+    c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // Continue...
+    expectReadKeySequence(it, 2, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Modify a past item, with replication but without consistency.  (Like a Threadsave Full Sync operation)
+// Our expectation for this case is that the modification should proceed without blocking.
+// Replication will be sent.
+TEST_F(BgIterationTest, modPastItem_eventual) {
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // This read returns key 0 (making it a past item)
+    expectReadKey(it, 1);
+
+    // At this point, key 0 is returned.
+    c = getWriteClient(0, "xxx");
+    simulateUnblockedWriteWithModification(c);
+
+    // Key 2 was already in queue (same bucket as key 1).  The replication will follow.
+    expectReadKey(it, 2);
+    expectReadReplication(it, c);
+
+    // Continue...
+    expectReadKeySequence(it, 3, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+/////////////////////////////////////////////////////
+// TESTS FOR ITEM CLONING
+/////////////////////////////////////////////////////
+
+// In a consistent iteration, verify that a simple string is properly cloned, and that a write can
+// occur without blocking.  Validate the cloned item and metadata.
+TEST_F(BgIterationTest, modFutureItem_start_CloneExpeditedItem) {
+    // Initialize cloning configurations.
+    bgIteration_unitTestEnableCloning(50, 100);
+
+    bgIteratorStatus status;
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    c = getWriteClient(6, "xxx");
+
+    // Quick status check.  At this point, no clones exist yet.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 0u);
+
+    // Since item 6 should be cloned, it will not block the client, allowing the write.
+    void *de6_md = cloneMetadata(getItem(6));
+    simulateClonedWrite(it, c); // This wouldn't block, and queues the cloned value
+    simulateUnblockedWriteWithModification(c); // This modifies the real entry in the de (touching metadata)
+
+    // At this point, one clone is in the queue.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 1u);
+
+    // On a consistent iterator, the item is expedited in front of items already in queue!
+    // Read key 6 (which is cloned) out of order.  The value will still match the key.
+    expectReadClonedKey(it, 6, de6_md); // Also validates and frees the metadata
+
+    // Quick status check.  At this point, cloned items have not been marked as processed yet.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 0u);
+
+    // Reading key 1 will release key 6, and the clone will finish processing.
+    expectReadKey(it, 1);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 1u);
+
+    // Reading key 2 should not have an impact on the number of processed clones.
+    expectReadKey(it, 2);
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 1u);
+
+    // Continue...
+    expectReadKeySequence(it, 3, 5);
+    // 6 has already been processed
+    expectReadKeySequence(it, 7, LAST_ITEM);
+    expectReadComplete(it);
+}
+
+
+// Check that cloning for simple strings is respecting the size limits and pool size.  On a
+// consistent iteration, we expect to block or clone on all future keys.  We validate that we can
+// clone if the item is small enough and the cloning pool has more space left.
+TEST_F(BgIterationTest, modFutureItem_start_LargeItemOrClonePoolFull) {
+    // Initialize cloning configurations to test the clone pool functionality first.
+    bgIteration_unitTestEnableCloning(50, 50);
+
+    bgIteratorStatus status;
+    bgIterator *it = bgIteratorCreateFullScanIter("iter",
+            BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA);
+
+    // Read the 1st key - let's get the party started
+    expectReadKey(it, 0);
+
+    // At this point, key 0 is read.  Keys 1 & 2 are queued (they are all in the same bucket).
+    // Fake a modification to a later key so that we can see if it gets processed out of order.
+    client *c6 = getWriteClient(6, "xxx");
+    client *c7 = getWriteClient(7, "xxx");
+    client *c8 = getWriteClient(8, "xxx");
+
+    // Quick status check.  At this point, no clones exist yet.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 0u);
+
+    // Since item 6 should be cloned, it will not block the client, allowing the write.
+    void *de6_md = cloneMetadata(getItem(6));
+    simulateClonedWrite(it, c6);
+    simulateUnblockedWriteWithModification(c6);
+
+    // At this point, one clone is in the queue.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 1u);
+
+    // Now that cloning pool is full, item 7 will not be cloned and the client will be blocked.
+    simulateBlockedWrite(c7);
+    ASSERT_TRUE(bgIteration_isEntryInuse(getItem(7)));
+
+    // There is still only one cloned item in the queue.
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_queued, 1u);
+
+    // Now change cloning configurations to test that large items will not be cloned.  We adjust
+    // the clone pool size to allow two items, but set the maximum item size to be smaller than
+    // the size of item 8.  The clone pool size must be larger than the total size of the existing
+    // clones plus the maximum item clone size.
+    bgIteration_unitTestEnableCloning(1, 101);
+
+    // This write will pass the clone pool check but fail the item size check, blocking the client.
+    simulateBlockedWrite(c8);
+    ASSERT_TRUE(bgIteration_isEntryInuse(getItem(8)));
+
+    // On a consistent iterator, the expedited item is placed in front of items already in queue!
+    // Read key 6 out of order.
+    expectReadClonedKey(it, 6, de6_md);
+
+    // Now, when we expect to read key 7, which was expedited, key 6 will be released back to Valkey
+    // and the clone will be deallocated here.
+    expectReadKey(it, 7);
+
+    // Now, when we read key 8, which was expedited, key 7 is released back to Valkey, and the client
+    // will be unblocked.
+    // (actually, unblock is called after every key [just in case] - but functionally we only care
+    // about this one)
+    expectReadKeyWithUnblock(it, 8, 7);
+    simulateUnblockedWriteWithModification(c7);
+
+    // Now, when we read key 1, key 8 is released back to Valkey, and the client will be unblocked.
+    expectReadKeyWithUnblock(it, 1, 8);
+    simulateUnblockedWriteWithModification(c8);
+
+    // Since only one item was cloned, there should be one clone processed
+    bgIteratorGetStatus(it, &status);
+    EXPECT_EQ(status.dbentry_clones_processed, 1u);
+
+    // Continue...
+    expectReadKeySequence(it, 2, 5);
+    // 6, 7, and 8 have already been processed
+    expectReadKeySequence(it, 9, LAST_ITEM);
+    expectReadComplete(it);
+    // This test created its own clients instead of using the fixture's `c`, so free them here.
+    freeTestClient(c6);
+    freeTestClient(c7);
+    freeTestClient(c8);
+}
+
+
+/////////////////////////////////////////////////////
+// TESTS RELATED TO MODIFICATION OF TWO ITEMS
+// When 2 keys are modified, we need to ensure that both keys have been sent before we can send
+// replication.  This means that if replication is present, we may have to block/expedite for
+// future keys, even in the inconsistent scenario.
+/////////////////////////////////////////////////////
+
+// Replication enabled, but NOT consistent.  In this case, if ANY of the keys have been iterated,
+// ALL of the keys must be replicated so that the command can be processed properly on the replica.
+TEST_F(BgIterationTest, modPastFutureItem_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Even though key 12 is for READ in this command, it must be expedited so that it exists before + // the associated replication is sent. + c = getSetGetClient(8, "xxx", 12); + simulateBlockedWrite(c); + + // Key 12 will be expedited, but not in front of existing items in queue (can only do that for + // consistent iterators) + + expectReadKey(it, 10); + expectReadKey(it, 12); // expedited + expectReadKeyWithUnblock(it, 11, 12); // 13 is now in queue + + simulateUnblockedWriteWithModification(c); + + // Continue... + expectReadKey(it, 13); + expectReadReplication(it, c); + + expectReadKeySequence(it, 14, LAST_ITEM); + expectReadComplete(it); +} + + +// Replication NOT enabled. A read-only key doesn't need to be expedited, even if other keys have +// been processed already. (This should work identically for both consistent/non-consistent. +TEST_F(BgIterationTest, modPastFutureItem_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter1", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Since there's no replication, we don't have to worry about expediting 12. The write will + // proceed without blocking. 
+ c = getSetGetClient(8, "xxx", 12); + simulateUnblockedWriteWithModification(c); + + // Key 12 will not be expedited. Remaining keys should be received in normal order. + expectReadKeySequence(it, 10, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, modPastFutureItem) { + bgIterator *it = bgIteratorCreateFullScanIter("iter2", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Since there's no replication, we don't have to worry about expediting 12. The write will + // proceed without blocking. + c = getSetGetClient(8, "xxx", 12); + simulateUnblockedWriteWithModification(c); + + // Key 12 will not be expedited. Remaining keys should be received in normal order. + expectReadKeySequence(it, 10, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO MISSING ITEMS +// Missing items are tricky. A missing item might be logically located in the past or future, in +// relation to the current iteration position. The command may (or may not) create the "missing" +// key. Some general considerations: +// * In a consistent iteration, a missing key didn't exist at the time of consistency, or it was +// already processed (saved) at the time of the deletion. If the missing key gets created, we +// must be sure to skip it if we later iterate over it. +// * In a non-consistent iteration with replication: +// * If the key location is already passed, the replication is sent, allowing the key to be +// created (or not) based on the replication. +// * If the key location is in the future, we can allow the command to proceed, without +// replication. 
If the key is created, we will process it when the iterator gets to it. +// +// We expect: +// no-repl, no-consist: past items are ignored - future items are processed when iterated +// no-repl, yes-consist: past items are ignored - future items are ignored +// yes-repl, no-consist: past item skipped, but replicated - future items are created by replication and skipped later +// yes-repl, yes-consist: past item skipped, but replicated - future items are processed when iterated +///////////////////////////////////////////////////// + +// no-repl, no-consist: creation of PAST item has no impact +TEST_F(BgIterationTest, missingPastItem) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); +} + + +// no-repl, yes-consist: creation of PAST item has no impact +TEST_F(BgIterationTest, missingPastItem_start) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); +} + + +// yes-repl, no-consist: creation of a PAST item will be replicated +TEST_F(BgIterationTest, missingPastItem_eventual) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + + c = getWriteClient(0, "xxx"); + 
simulateUnblockedWriteWithModification(c); // replication will be added after item 4 (3,4 in same bucket) + + expectReadKey(it, 4); + + expectReadReplication(it, c); + + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +// no-repl, no-consist: creation of FUTURE item is seen when reached by the iteration. +TEST_F(BgIterationTest, missingFutureItem) { + // Using DB1 so we have lots of buckets + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + const char * newValue = "xxx"; + c = getWriteClient(14, newValue); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 13); + + // We expect to see item 14. + // Note that for an inconsistent DB view, it is logically undefined if this value is seen (or not). + // But as implemented, we should see it and the test is helpful to understand if/when the + // functionality changes. + expectReadKey(it, 14, newValue); + + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +// no-repl, yes-consist: creation of FUTURE item is ignored by consistent iteration. +TEST_F(BgIterationTest, missingFutureItem_start) { + // Using DB1 so we have lots of buckets + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + c = getWriteClient(14, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 13); + // Key 14 is missing - it didn't exist at start of consistent iteration + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +// yes-repl, no-consist: creation of FUTURE item is handled by the replication, and then the key is +// later skipped (treated like an early iteration case). 
+TEST_F(BgIterationTest, missingFutureItem_eventual) { + // Using DB1 so we have lots of buckets + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Items 1 & 2 are in queue (same bucket) + + c = getWriteClient(14, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 2); + + expectReadReplication(it, c); // Here's the replication creating item 14 + + expectReadKeySequence(it, 3, 13); + // We expect item 14 to be skipped, because it was created by the earlier replication + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO EXPIRATION +// Expiration can be tricky. When pre-evaluating a command with bgIteration_blockClientIfRequired, +// a key might exist, but be ready for expiration. Then, as the command executes, the key expires +// and gets deleted before the write operation. Consider SET K V. +// In the unexpired case, this appears to bgIteration as a single SET command (which replaces the value). +// In the expired case, bgIteration will receive a DEL followed by a SET. +// +// Another case is a READ command. A read command won't cause the client to be blocked. However, +// if the key is expired, this will cause a DEL. For consistent processing, this key might need to +// be expedited so that it can be processed before it gets deleted. In this case, the key is +// unlinked from the main Valkey dictionary, but the actual deletion is deferred. +///////////////////////////////////////////////////// + +TEST_F(BgIterationTest, expireKeys) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. 
+ + simulateExpiration(0); // Past - we no longer care + simulateExpirationOfInuse(2); // Current - it's inuse + simulateExpiration(5); // Future - we don't care (non-consistent) + + expectReadKeySequence(it, 2, 4); + // key 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, expireKeys_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we expect replication + simulateExpirationOfInuse(2); // Current - it's inuse, but we expect replication + simulateExpiration(5); // Future - we don't care (non-consistent) + + expectReadKey(it, 2); // this was already queued + expectReadReplicationDel(it, 0); // Past item should replicate + expectReadReplicationDel(it, 2); // Current item should replicate + // Item 5 is a future item and doesn't need to replicate + + expectReadKeySequence(it, 3, 4); + // Item 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, expireKeys_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we no longer care + simulateExpirationOfInuse(2); // Current - we must defer + simulateExpirationWithExpedite(5); // Future - will become inuse and expedited for consistency + + expectReadKey(it, 5); // Expedited to front + + expectReadKeySequence(it, 2, 4); + // Item 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +// Special case during a non-consistent iteration with replication and expiration. +// 1. 
A future key is created (and processed by its replication) - considered early iterated +// 2. Later the key is expired and deleted during command processing (causes DEL to be sent) - no longer early iterated +// 3. The key is recreated as part of the command processing (and this command was replicated) - again early iterated +// 4. Finally, when we iterate to the key, it shouldn't be sent, because it was replicated in step 3. +TEST_F(BgIterationTest, expireKeys_eventual_FutureKeyCreatedThenExpiredDuringSet) { + simpleDelItem(8); // Start with a missing future item + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Get the iterator started + + c = getWriteClient(8, "xxx"); + simulateUnblockedWriteWithModification(c); // Not blocked because this is a future key (but we expect repl) + + // Now do it again, but break out the steps so that we can simulate an expiration + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // Shouldn't be blocked because this is a future key + + // Now, as the SET command tries to execute, simulate that the key is expired. Expiration + // processing sends the replication FIRST! + robj *argv[2]; + argv[0] = createStringObjectFromCString("DEL"); + argv[1] = c->argv[1]; + serverCommand *cmd = lookupCommandByCString("DEL"); + bgIteration_handleCommandReplication(getDbFromItemNum(8), cmd, 2, argv); + decrRefCount(argv[0]); + + // Now the call to keyDelete happens (after the replication). + bgIteration_keyDelete(getDbFromItemNum(8), static_cast(objectGetVal(c->argv[1]))); + simpleDelItem(8); // Simulate the actual del + + // Now the SET will run, re-creating the item (which is still a future item) + // We need to duplicate the value because setKey() can reallocate it. 
+ robj *value = dupStringObject(c->argv[2]); + setKey(c, c->db, c->argv[1], &(value), SETKEY_ADD_OR_UPDATE); + + // Finally, replication will be sent because this is creating a new key + bgIteration_handleCommandReplication(getDbFromItemNum(8), c->cmd, c->argc, c->argv); + + // Test that everything comes as expected + expectReadKeySequence(it, 1, 2); // All one bucket - queued after key 0 read + + expectReadReplication(it, c); // Repl from the first SET command + expectReadReplicationDel(it, 8); // This is the expected replication of the DEL from expire + expectReadReplication(it, c); // Repl from the second SET command (recreating deleted key) + + expectReadKeySequence(it, 3, 7); // continue with normal iteration + // KEY 8 SHOULD BE OMITTED - This was already replicated + expectReadKeySequence(it, 9, LAST_ITEM); + + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// THE REMAINING TESTS ARE GENERAL / UNCATEGORIZED +///////////////////////////////////////////////////// + +// Iteration can be terminated from the main thread or from the child client. +// This tests termination driven from the main thread. +TEST_F(BgIterationTest, earlyTerminationFromMain) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + + // At this point, keys 1 & 2 are in queue. A termination should release those keys. + bool blocked1 = true; + bool blocked2 = true; + // We expect no general unblocks, we account for each specific unblock below. + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); + // We should expect to see unblock called for items 1 & 2, as they are released from the queue. 
+ EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) + .WillOnce(Assign(&blocked1, false)); + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) + .WillOnce(Assign(&blocked2, false)); + bgIteratorTerminate(it); // queues the items for release + EXPECT_TRUE(bgIteratorIsTerminating(it)); + bgIteration_feedIterators(); // actually performs the release + EXPECT_FALSE(blocked1); + EXPECT_FALSE(blocked2); + + bool blocked0 = true; + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) + .WillOnce(Assign(&blocked0, false)); + bgIteratorItem *item = bgIteratorRead(it); + EXPECT_FALSE(blocked0); + EXPECT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + EXPECT_EQ(cleanupCount, 0); + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Iteration can be terminated from the main thread or from the child client. +// This tests termination driven from the child client (the background thread). +TEST_F(BgIterationTest, earlyTerminationFromChild) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + // At this point, keys 1 & 2 are in queue. A termination should release those keys. 
+ bgIteratorClose(it); // background thread initiates the termination + EXPECT_TRUE(bgIteratorIsTerminating(it)); + + bool blocked0 = true; + bool blocked1 = true; + bool blocked2 = true; + // Expecting no extra unblocks + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); + // We expect item 0 (the in progress item) to be released + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) + .WillOnce(Assign(&blocked0, false)); + // We expect items 1 & 2 (the queued items) to be released + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) + .WillOnce(Assign(&blocked1, false)); + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) + .WillOnce(Assign(&blocked2, false)); + bgIteration_feedIterators(); + EXPECT_FALSE(blocked0); + EXPECT_FALSE(blocked1); + EXPECT_FALSE(blocked2); + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Edge case. Executing a command (like SUNIONSTORE) which REPLACES the first key and reads the +// second key. In this case, bgIteration will get notified of the key deletion during execution of +// SUNIONSTORE. Given that both keys are in the future (not iterated yet), we'll allow the +// command to execute, unblocked. We won't replicate as we'll pick up the key when we get to it. +TEST_F(BgIterationTest, writeWith2Keys_eventual_keyDeletedDuringSetReplace) { + // Using DB1 so we have lots of buckets + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 8); // 9 is in queue + + // Write command that has 2 keys. 1 existing key that we write to and 1 dependant future key. 
+ c = getWrite2KeysClient("sunionstore", 12, 13); + + simulateUnblockedWrite(c); + + // Now the call to keyDelete happens + bgIteration_keyDelete(getDbFromItemNum(12), keyStr(12)); + simpleDelItem(12); // So simulate the actual del + + // Now the write will run, re-creating the item (which is still a future item) + const char * const newValueStr = "new value"; + robj *newValueRobj = createStringObjectFromCString(newValueStr); + setKey(c, c->db, c->argv[1], &newValueRobj, SETKEY_ADD_OR_UPDATE); + + // Finally, we are letting bgIteration know that the write command was executed + bgIteration_handleCommandReplication(getDbFromItemNum(12), c->cmd, c->argc, c->argv); + + // Since the write command was not replicated, we expect all the keys to be read in the normal + // order from the dictionary. + expectReadKeySequence(it, 9, 11); + expectReadKey(it, 12, newValueStr); + expectReadKeySequence(it, 13, LAST_ITEM); + + expectReadComplete(it); +} + + +// Edge case. When we have a new key which is created by a command, AND replication is enabled, we +// expect that we will replicate the command rather than serializing the key/value later. As an +// example, consider SUNIONSTORE A B. We want to create A by replicating the command. We don't +// want to have to process A as a key later on. But in this case, we can't run the command until +// B has been sent. We expect the command to be blocked while we send B. +TEST_F(BgIterationTest, writeWith2Keys_eventual_setNewKey_DependantFuture) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 8); // 9 is in queue + + // Write command that has 2 keys. 1 new key and 1 dependant future key. + c = getWrite2KeysClient("sunionstore", 12, 13); + + // We are simulating a new key in the dict. 
This command should block on the dependant key. + // This adds key 13 in the queue since the command depends on it. + simulateBlockedWrite(c); + + // Key 9 was already in the queue + expectReadKey(it, 9); + + // Key 13 is processed out of order since the write depends on it + expectReadKey(it, 13); + + // Reading key 10 will unblock key 13, allowing us to write. + expectReadKey(it, 10); + + // Now that key 13 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Key 11 was queued when we read key 10 + expectReadKey(it, 11); + + // The replication of the write command was enqueued after key 11 + expectReadReplication(it, c); + + // We shouldn't see key 12 - as that was processed via replication. + // We shouldn't see key 13 - as that was expedited earlier + + // Now resuming processing of dict entries + expectReadKeySequence(it, 14, LAST_ITEM); + + expectReadComplete(it); +} + + +// A new key is being created, but is dependent on another key which has already been processed. +// In this case, the command shouldn't be blocked. +TEST_F(BgIterationTest, writeWith2Keys_eventual_setNewKey_DependantPast) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // 10 is in queue, done with 8 + + // Write command that has 2 keys. 1 new key and 1 dependant past key. + c = getWrite2KeysClient("sunionstore", 12, 8); + + // We are simulating a new key in the dict. + // This command should not block since the dependant key has already been processed. 
+ simulateUnblockedWriteWithModification(c); + + // Key 10 was put in the queue before the write + expectReadKey(it, 10); + + expectReadReplication(it, c); + + expectReadKey(it, 11); + + // Key 12 should be missing - it was processed by replication + + expectReadKeySequence(it, 13, LAST_ITEM); + expectReadComplete(it); +} + + +// A new key is being created, and has dependencies on 2 other keys - one already processed, one not. +// In this case, the command should be blocked so that the future key can be sent first. +TEST_F(BgIterationTest, writeWith3Keys_eventual_setNewKey_1DependantPast1DependantFuture) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // 8 has been returned, 9 is active, 10 is in queue + + // Write command that has 1 new key and 2 dependencies (past/future) + c = getWrite3KeysClient("sunionstore", 12, 8, 13); + + // The write should be blocked, so that item 13 can be processed. + simulateBlockedWrite(c); + + expectReadKey(it, 10); // 10 was already in queue + expectReadKey(it, 13); // 13 was expedited since the write depends on it + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(13)))).Times(1); + expectReadKey(it, 11); // Releases 13 so the command can execute + + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 14); // was queued when reading 11 (12 is missing, 13 was expedited) + + expectReadReplication(it, c); + + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +// Test an edge case with the same (future) key being repeated in the command, like: +// SUNIONSTORE A B B +// In this test, A is a previously handled key, and B is a future key. We expect the future key B to +// be expedited (once). 
+TEST_F(BgIterationTest, writeWith3Keys_eventual_repeatedKey_1DependantPast1RepeatedFuture) { + // Using DB1 so we have lots of buckets + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // We're done with 8, and 10 is in queue + + // Write command that has 3 keys. 1 past key and 1 repeated key in the future. + c = getWrite3KeysClient("sunionstore", 8, 12, 12); + + // This command should block because 12 needs to be expedited. + simulateBlockedWrite(c); + + expectReadKey(it, 10); // was already in queue + expectReadKey(it, 12); // expedited + expectReadKey(it, 11); // releases 12 (unblocking the command) + + // Now that key 12 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 13); // queued when we read 11 + + expectReadReplication(it, c); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 14, LAST_ITEM); + expectReadComplete(it); +} + + +/* Tests the replication of a write command that creates a new key and depends on a + * future key which is duplicated in the command. */ +TEST_F(BgIterationTest, writeWith3Keys_eventual_repeatedKey_1newKey1RepeatedFuture) { + simpleDelItem(3); // Deleting key 3 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // At this point, keys 1 & 2 are in queue. + + // Write command that has 3 keys. 1 new key and 1 repeated key in the future. + c = getWrite3KeysClient("sunionstore", 3, 5, 5); + + // This command should block on key 5. + // This adds key 5 in the queue because: + // - the command depends on key 5 which hasn't been processed yet + // - the command creates a new key (key 3). 
+ simulateBlockedWrite(c); + + expectReadKeySequence(it, 1, 2); // These were already in queue + + // Key 5 is processed out of order since the write depends on it + expectReadKey(it, 5); + + // Key 4 is next in the queue, and releases the expedited key 5 + expectReadKey(it, 4); + + // Now that key 5 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Keys 6 & 7 are next, having been queued after reading key 4. + expectReadKeySequence(it, 6, 7); + + // The replication of the write command was enqueued after 5 was released (unblocking the command) + expectReadReplication(it, c); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 8, LAST_ITEM); + expectReadComplete(it); +} + + +/* A command modifying an in-progress key, but dependent on a future (repeated) key. */ +TEST_F(BgIterationTest, writeWith3Keys_start_repeatedKey_1DependantPast1RepeatedFuture) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // At this point, keys 1 & 2 are in queue. + + // Write command that has 3 keys. 0 is in progress. 4 is still future. + // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a + // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). + c = getWriteMultiKeysClient("blpop", 0, {4, 4, 0}); + + // This command should block on 2 keys (0 and 4), since: + // - key 0 is in use by the iterator (still in the queue since it has not been processed by the consumer yet) + // - key 4 is in the future + // This adds key 4 in the queue since the command depends on it and it hasn't been processed yet. + simulateBlockedWrite(c, 2); + + // Key 4 is processed out of order since the write depends on it. 
+ // Key 4 is processed before key 1 even though key 1 was already in the queue + // because key 4 was enqueued as a priority item with a no-replication iterator. + // Reading key 4 will release key 0 - releasing that lock on the command + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))).Times(1); + expectReadKey(it, 4); // This unblocks key 0 + + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(4)))).Times(1); + expectReadKey(it, 1); // this was already in queue (releases key 4) + + // Now that keys 4 and 0 were processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 2, 3); + + // 4 is skipped because it was already expedited + + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +/* Test that creates a new key, repeating the future key in the command. */ +TEST_F(BgIterationTest, writeWith3Keys_repeatedKey_1repeatedNewKey) { + simpleDelItem(6); // Deleting key 6 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + // Getting started + expectReadKeySequence(it, 0, 3); + // Now, 0,1,2 are in the past. 3 is being processed, and 4 is in queue. + + // Write command that has 3 keys. 1 new repeated key and 1 key in the past. + // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a + // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). + c = getWriteMultiKeysClient("blpop", 6, {0, 6, 0}); + + // The write command is not blocked since key 0 & 6 are not in use, and no consistency requirements + simulateUnblockedWriteWithModification(c); + + // Keys 4 & 5 are next (key 4 was already in the queue when the write was issued). 
+ expectReadKeySequence(it, 4, 5); + + // There are no consistency requirements - so the new key should just be iterated. + // Key 6 is now in the dict with the value of key 0. + expectReadKey(it, 6, keyStr(0)); + + // Processing the rest of the dict entries. + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +/* In this test, the COPY command is copying from one DB to another. We will create the + * same key in both DBs. We make sure that the proper key is created via replication, and + * the proper key is created by iteration. */ +TEST_F(BgIterationTest, copyHandlesProperDb_eventual) { + + // NOTE: Adding H0 to dict 1. Now there is an H0 in both dict 0 and dict 1. + addKeyToDb(1, "H0", "H0"); + + // The test: + // We will simulate (with DB0 selected): COPY B1 H0 DB 1 REPLACE + // This will overwrite DB1:H0 that was created above. + // Since DB0:B1 is already in queue, we need to expedite the target (DB1:H0) as well + // After DB1:H0 is "overwritten", it should be marked early iterate. + // We expect DB0:H0 to NOT be marked early iterate, and should get processed normally. + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // B0 + // At this point, keys 1(B1) & 2(B2) are in queue. + + // COPY B1 H0 DB 1 REPLACE + c = static_cast<client *>(zcalloc(sizeof(client))); + c->cmd = lookupCommandByCString("copy"); + c->db = server.db[0]; + c->argc = 6; + c->argv = static_cast<robj **>(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + c->argv[1] = createStringObjectFromCString("B1"); + c->argv[2] = createStringObjectFromCString("H0"); + c->argv[3] = createStringObjectFromCString("DB"); + c->argv[4] = createStringObjectFromCString("1"); + c->argv[5] = createStringObjectFromCString("REPLACE"); + + // This should block on 2 keys. DB0:B1 is in queue. DB1:H0 needs to be expedited. 
+ simulateBlockedWrite(c, 2); + + // These 2 keys were already in queue + expectReadKey(it, 1); // DB0:B1 + expectReadKey(it, 2); // DB0:B2 + + // And now we expect to see the expedited DB1:H0 + expectReadDbKeyValue(it, 1, "H0", "H0"); + + expectReadKey(it, 3); // releases DB1:H0 + + // Now key 4 is still in the queue + + simulateUnblockedWrite(c); // We shouldn't be blocked this time + + // Now, we'll simulate the actual activity of the COPY. DB1:H0 will be deleted in order to + // be overwritten. + bgIteration_keyDelete(1, sdsnew("H0")); // bgIteration would be signaled about the deletion + // At this point the key would actually be deleted and recreated by COPY (no need to actually do this) + + // And finally the replication (this should queue replication) + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + + // Now let's read everything... + expectReadKey(it, 4); // (this was previously in queue) + expectReadReplication(it, c); // This is the new replication (creating DB1:H0) + + // The rest should be normal. We shouldn't see DB1:H0 as it was recreated by replication + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +// Check that termination with replication in queue works OK. 
+TEST_F(BgIterationTest, terminateWithReplication) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); // makes sure we are done with key 0 (don't want to block) + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Should replicate + + bgIteratorTerminate(it); + + bgIteratorItem *item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// SWAPDB tests - Get ready for the mind-bend... + +/* In the non-consistent iterator (without replication), items are identified with the DBID at + * the time they are placed into the queue. The SWAPDB event signals the change to the + * iterating process - and this is properly sequenced with the DB info for each item. */ +TEST_F(BgIterationTest, swapDB) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + simulateSwapDB(0, 1); // The swap event will be queued after item 2 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 0u); + + expectReadKey(it, 1); // These were already in queue, + expectReadKey(it, 2); // ... and the iteration client hasn't seen the swap yet + + expectReadSwapDB(it, 0, 1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 0u); // still processing it... 
+ + // Since we've seen the swap event, items now have the new DBID + + expectReadDbKeyValue(it, 1, keyStr(3), keyStr(3)); // item 3 should show in DB1 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 1u); // done processing the swapdb + + // Key 4 is in the queue - let's swap back! + simulateSwapDB(1, 0); // The swap event will be queued after item 4 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); // 2nd one queued + EXPECT_EQ(status.swapdb_processed, 1u); + + expectReadDbKeyValue(it, 1, keyStr(4), keyStr(4)); // item 4 should still show in DB1 + + expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); + EXPECT_EQ(status.swapdb_processed, 1u); // still processing it... + + // Since we've seen the second swap, items should now show with their original DB + + expectReadKey(it, 5); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); + EXPECT_EQ(status.swapdb_processed, 2u); // done processing all swaps + + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +/* In the consistent iterator (without replication) all items are presented to the iterating + * process using the DBID at the time of the iterator creation. No changes are evident. + * Swap events are not presented to the iteration client. */ +TEST_F(BgIterationTest, swapDB_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + simulateSwapDB(0, 1); // The swap occurs, but the iterator sees no change + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + + // Heck, let's go crazy with those swaps... 
+ for (int itemNum = 4; itemNum <= LAST_ITEM; itemNum++) { + simulateSwapDB(0, 1); + expectReadKey(it, itemNum); + } + + expectReadComplete(it); +} + + +/* In the non-consistent iterator WITH replication, items are identified with the DBID at the + * time they are placed into the queue. The SWAPDB event signals the change to the iterating + * process - and this is properly sequenced with the DB info for each item. */ +TEST_F(BgIterationTest, swapDB_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + simulateSwapDB(0, 1); // The swap event will be queued after item 2 + + expectReadKey(it, 1); // These were already in queue, + expectReadKey(it, 2); // ... and the iteration client hasn't seen the swap yet + + expectReadSwapDB(it, 0, 1); // We should see a SWAPDB event + bgIteratorItem *item = bgIteratorRead(it); // followed by the associated replication + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + bgIteration_feedIterators(); + + // Since we've seen the swap event, items now have the new DBID + expectReadDbKeyValue(it, 1, keyStr(3), keyStr(3)); // item 3 is now in DB1 + + // Key 4 is in the queue - let's swap back! + simulateSwapDB(1, 0); // The swap event will be queued after item 4 + + expectReadDbKeyValue(it, 1, keyStr(4), keyStr(4)); // Still appears as DB1 + + expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap + item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + bgIteration_feedIterators(); + + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + +// There is no test for swapDB_YesReplication_YesConsistent because this configuration is not +// permitted with multiple DBs (not permitted with swaps). 
+ + +// FLUSHDB & FLUSHALL Tests + +TEST_F(BgIterationTest, flushDB_flushAll) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // key 1 is active in the iterator - this key won't be deallocated because of the refcount. + // key 2 is in queue - but will be returned to Valkey before the flush. It is yanked + // back by Valkey and will not be seen by iterator. + simulateFlushDB(-1, 1); + + bgIteratorItem *item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + +TEST_F(BgIterationTest, flushDB_flushOne) { + bgIterator *it1 = bgIteratorCreateFullScanIter("iter1", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + bgIterator *it2 = bgIteratorCreateFullScanIter("iter2", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + // The test flushes DB0. This is half the data. Since <= half, a non-consistent iterator is + // allowed to proceed. But the consistent iterator will be terminated. + + expectReadKey(it1, 0); + expectReadKey(it2, 0); + expectReadKey(it1, 1); + expectReadKey(it2, 1); + + // key 1 is active in the iterator - this key won't be deallocated because of the refcount. + // key 2 is in queue - but will be returned to Valkey before the flush. It is yanked + // back by Valkey and will not be seen by iterator. + simulateFlushDB(0, 1); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 0u); + + // Testing the non-consistent one continues... + // Everything already on the iterator queue should be preserved (deleted from the DB). 
+ // Key 2 is already queued (and preserved). + expectReadKey(it1, 2); + + // Read the flushdb item on iterator 1. + bgIteratorItem *item = bgIteratorRead(it1); + ASSERT_EQ(item->type, BGITERATOR_ITEM_FLUSHDB); + ASSERT_EQ(item->dbid, 0); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 0u); // still processing it + + // And iterator 1 keeps processing with the 2nd DB + expectReadKey(it1, ITEMS_PER_DB); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 1u); // done with all flushdb's + + expectReadKeySequence(it1, ITEMS_PER_DB + 1, LAST_ITEM); + expectReadComplete(it1); + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); + + // But the consistent iterator should be terminated + item = bgIteratorRead(it2); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + bgIteratorClose(it2); // background thread completes the termination + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 2); + EXPECT_TRUE(cleanupTerminated); +} + + +/* A multi with one future and one past key must expedite and replicate. */ +TEST_F(BgIterationTest, multiTwoKeysFirstFuture) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, + NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Causes keys 1 & 2 to be queued (same bucket) + expectReadKey(it, 1); // Causes key 0 to be released + + // Now, B0(0) is in the past. H0(5) is in the future. R0(11) [in DB1] is also future. + + /* For a non-consistent iteration, with replication... + * Normally, H0 (future) wouldn't need to expedite - we'd just modify it in place (without + * replication) and iterate on it later. But, in this case, since it's wrapped in a multi, with + * B0 (past) - we need to expedite H0 so that the multi can all be handled in the same way. 
+ * Key R0(11) [DB1] just makes things a little trickier. */ + c = getMultiClient("SET B0 xxx; SET H0 xxx; SELECT 1; SET R0 xxx"); + + // The EXEC should block on 2 keys, because H0(5) & R0(11) should be expedited + simulateBlockedWrite(c, 2); + + expectReadKey(it, 2); // (was already in queue) + + // Note - it would be logically OK if these 2 were reversed, but this is how the current algorithm works. + expectReadKey(it, 5); // Key 5 (H0) was expedited + expectReadKey(it, 11); // Key 11 (R0) was expedited + + // We don't need to actually simulate the multi. Just checking that the keys were expedited. + + // and clean up the rest... + expectReadKeySequence(it, 3, 4); + // Key 5 was already read above (expedited) + expectReadKeySequence(it, 6, 10); + // Key 11 was already read above (expedited) + expectReadKeySequence(it, 12, LAST_ITEM); + expectReadComplete(it); +} + +// Multi blocking on future items. Consistent. +TEST_F(BgIterationTest, multiBlocksOnFutureKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + // Since there's no replication, an expedited key will be moved to the front of the queue. + // Let's fake a modification to key 6 (H1) + // Dummy up a MULTI... + c = getMultiClient("SET H1 xxx"); + + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c); + + // H1 (key 6) will be expedited to the front of the queue (because no replication) + expectReadKey(it, 6); + + // Now that we've read key 6, key 0 (B0) is passed and should not block + freeTestClient(c); + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + + // and clean up the rest... 
+ expectReadKeySequence(it, 1, 5); + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} diff --git a/src/unit/wrappers.h b/src/unit/wrappers.h index 0f80919d6f7..5bfc117fab2 100644 --- a/src/unit/wrappers.h +++ b/src/unit/wrappers.h @@ -67,6 +67,9 @@ void __wrap_unblockClientsInUseOnKey(robj *key); int __wrap_ACLCheckAllUserCommandPerm(user *u, struct serverCommand *cmd, robj **argv, int argc, int dbid, int *idxptr); +size_t __wrap_hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); +bool __wrap_hashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor); + #undef protected #undef _Bool #undef typename From 97d3bb67de755f407e531b1bcd71ec53ae253973 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Mon, 1 Jun 2026 15:15:34 +0000 Subject: [PATCH 24/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 3 ++- src/unit/test_bgiteration.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index 30d21ea307f..b8f809c11b8 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -2289,7 +2289,8 @@ bgIteratorItem * bgIteratorRead(bgIterator *it) { // To support unit tests. Normal clients call bgIteratorRead from an alternate thread. // Without this, a unit test could get stuck waiting on the completion event because // feed won't get invoked. For production, this is called regularly from the main thread. - if (onValkeyMainThread()) bgIteration_feedIterators_task(NULL, 0, NULL); + // Note - this is checking that the exact same thread is used and shouldn't count modules. 
+ if (pthread_equal(server.main_thread_id, pthread_self()) != 0) bgIteration_feedIterators_task(NULL, 0, NULL); } else { it->client_is_active = true; } diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index c2caca75e25..4bdea0569a7 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -4,7 +4,7 @@ * SPDX-License-Identifier: BSD 3-Clause */ - // Just for the moment, until https://github.com/valkey-io/valkey/issues/3450 is resolved +// Just for the moment, until https://github.com/valkey-io/valkey/issues/3450 is resolved // clang-format off #include "generated_wrappers.hpp" #include From bc72ca4f4cadeae1a92e25ccaada3df3f1991e02 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Mon, 1 Jun 2026 15:27:54 +0000 Subject: [PATCH 25/40] Forkless Save Signed-off-by: Jim Brunner --- src/unit/test_bgiteration.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index 4bdea0569a7..0f268bb5a2d 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -46,15 +46,6 @@ static void iteratorCleanupFn(bool terminated, void *privdata) { } -// A bgIteration repldone function used for testing. -static int repldoneCount; -static bool iteratorRepldoneFn(void *privdata) { - EXPECT_EQ(privdata, PRIVDATA); - repldoneCount++; - return true; -} - - /* This mock for hashtableScan will return the items in lexical order. It assumes that the entries * are robjs containing an sds string for the key. The key is expected to begin with a capital * letter [A-Z]. The caller passes 0 as the cursor to start the iteration. 
The returned cursor @@ -275,7 +266,6 @@ class BgIterationTest : public ::testing::Test { bgIteration_init(); cleanupCount = 0; - repldoneCount = 0; // By default, do nothing for these EXPECT_CALL(mock, blockClientInUseOnKeys(_,_,_)).WillRepeatedly(Return()); From bdea400c45fa1fd1b44268d156494baa5714ee29 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Mon, 1 Jun 2026 17:17:22 +0000 Subject: [PATCH 26/40] Forkless Save Signed-off-by: Jim Brunner --- src/unit/test_bgiteration.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index 0f268bb5a2d..5d88b21108a 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -884,20 +884,18 @@ class BgIterationTest : public ::testing::Test { // Send bgIteration the DEL int db = getDbFromItemNum(itemNum); - sds sdsKey = sdsnew(keyStr(itemNum)); robj *argv[2]; argv[0] = createStringObjectFromCString("DEL"); - argv[1] = createStringObjectFromCString(sdsKey); + argv[1] = createStringObjectFromCString(keyStr(itemNum)); serverCommand *cmd = lookupCommandByCString("DEL"); bgIteration_handleCommandReplication(db, cmd, 2, argv); + bgIteration_keyDelete(db, static_cast(objectGetVal(argv[1]))); decrRefCount(argv[0]); decrRefCount(argv[1]); - bgIteration_keyDelete(db, sdsKey); simpleDelItem(itemNum); // Simulate the actual del EXPECT_EQ(getItem(itemNum), nullptr); - sdsfree(sdsKey); } @@ -2004,7 +2002,9 @@ TEST_F(BgIterationTest, writeWith2Keys_eventual_keyDeletedDuringSetReplace) { simulateUnblockedWrite(c); // Now the call to keyDelete happens - bgIteration_keyDelete(getDbFromItemNum(12), keyStr(12)); + sds sdskey = sdsnew(keyStr(12)); + bgIteration_keyDelete(getDbFromItemNum(12), sdskey); + sdsfree(sdskey); simpleDelItem(12); // So simulate the actual del // Now the write will run, re-creating the item (which is still a future item) @@ -2336,7 +2336,9 @@ TEST_F(BgIterationTest, copyHandlesProperDb_eventual) { // Now, 
we'll simulate the actual activity of the COPY. DB1:H0 will be deleted in order to // be overwritten. - bgIteration_keyDelete(1, sdsnew("H0")); // bgIteration would be signaled about the deletion + sds sdskey = sdsnew("H0"); + bgIteration_keyDelete(1, sdskey); // bgIteration would be signaled about the deletion + sdsfree(sdskey); // At this point the key would actually be deleted and recreated by COPY (no need to actually do this) // And finally the replication (this should queue replication) From 8a8de8dac91a381d448de13291202b851968f132 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Mon, 1 Jun 2026 20:18:44 +0000 Subject: [PATCH 27/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index b8f809c11b8..319d02c1e55 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -1940,6 +1940,9 @@ static void handleFlushdb(int dbid) { while ((node = listNext(&li)) != NULL) { bgIterator *it = listNodeValue(node); + // Let the low-level iterator know the DB is being flushed + it->keyset_iter->flushDb(it->keyset_iter, dbid); + if (should_abort_iterators || it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { terminateIteratorForFlush(it, dbid); } else { @@ -1951,7 +1954,6 @@ static void handleFlushdb(int dbid) { // very rare condition, development is not justified to save off the DB for deferred // delete. This would add a lot of complexity as well as memory implications. 
preserveIteratorItemsForFlush(it, dbid); - it->keyset_iter->flushDb(it->keyset_iter, dbid); // Send a flushdb event to notify the client if (BGITERATION_DEBUG) { From bc437ac209f18a6d6cb4816d2ba2256f0767c762 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Mon, 1 Jun 2026 20:44:17 +0000 Subject: [PATCH 28/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index 319d02c1e55..9137615acd5 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -589,7 +589,7 @@ static void fullScanIteratorSwapDb(genericIterator *genIt, int db1, int db2) { static void fullScanIteratorFlushDb(genericIterator *genIt, int cur_dbid) { struct fullScanIterator *it = (struct fullScanIterator *)genIt; - int orig_db = it->cur_to_orig_db[cur_dbid]; + int orig_db = (cur_dbid == -1) ? it->iter_db : it->cur_to_orig_db[cur_dbid]; if (orig_db == it->iter_db) { // We are currently iterating on the DB that's being flushed. it->kvs = NULL; From 47f01ff524d4dd7562a1506c9e994f8f8be11160 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 2 Jun 2026 16:08:28 +0000 Subject: [PATCH 29/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 4 - src/kvstore.c | 10 - src/unit/test_bgiteration.cpp | 437 ++++++++++++++++++++++++++++++++++ 3 files changed, 437 insertions(+), 14 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index 9137615acd5..d323b6001e1 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -23,10 +23,6 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid); robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds key, long long expire); // in object.c -// Non-public hashtable/kvstore functions... 
-hashtableIterator *kvstoreInternalIteratorGetCurrentHashtableIterator(kvstoreIterator *kvs_it); - - static bool receiveItemsBackFromOneIterator(bgIterator *it); // in bgiteration.c - forward declaration // ################ TEMP COMPILE HACKS ########################### diff --git a/src/kvstore.c b/src/kvstore.c index 1ac72a01dc2..86078cfc1ab 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -689,16 +689,6 @@ int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it) { return kvs_it->didx; } -/* This is an internal function - not part of the standard API. It must be explicitly declared - * where used. It shouldn't be included in any .h (API) file. Use of this interface is discouraged - * as it depends on the internal structure, which may change. - * - * Return the current hashtableIterator from within the kvstoreIterator. - */ -hashtableIterator *kvstoreInternalIteratorGetCurrentHashtableIterator(kvstoreIterator *kvs_it) { - return &kvs_it->di; -} - /* Fetches the next element and returns true. Returns false if there are no more elements. */ bool kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next) { if (kvs_it->didx != KVSTORE_INDEX_NOT_FOUND && hashtableNext(&kvs_it->di, next)) { diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index 5d88b21108a..04ee1c68f17 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -26,6 +26,7 @@ extern "C" { void bgIteration_unitTestDisableCloning(void); void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes); static size_t mockHashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); + size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid); } @@ -45,6 +46,27 @@ static void iteratorCleanupFn(bool terminated, void *privdata) { cleanupTerminated = terminated; } +// A bgIteration repldone function used for testing. 
+static int replDoneConfirmed; +static bool iteratorRepldoneFn(void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + replDoneConfirmed++; + return true; +} + +// A more complicated repldone function that can delay the replication done condition. +static int replDoneRejected; +static bool iteratorRepldoneFnNotBeingReadyInitially(void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + // This is to test the behavior when Repl Done function is not ready to be executed. + if (replDoneRejected == 0) { + replDoneRejected++; + return false; + } + replDoneConfirmed++; + return true; +} + + /* This mock for hashtableScan will return the items in lexical order. It assumes that the entries * are robjs containing an sds string for the key. The key is expected to begin with a capital @@ -266,6 +288,8 @@ class BgIterationTest : public ::testing::Test { bgIteration_init(); cleanupCount = 0; + replDoneConfirmed = 0; + replDoneRejected = 0; // By default, do nothing for these EXPECT_CALL(mock, blockClientInUseOnKeys(_,_,_)).WillRepeatedly(Return()); @@ -2651,3 +2675,416 @@ TEST_F(BgIterationTest, multiBlocksOnFutureKey) { expectReadKeySequence(it, 7, LAST_ITEM); expectReadComplete(it); } + + +// Scenario. We have a multi that doesn't need to be replicated because all of the keys exist +// but are all future keys. Note that missing keys are considered already-iterated, so all +// must exist for this test. Then: +// - we delete a key +// - we re-create the deleted (future) key - normally this would be replicated +// - we access another (future) key - we don't expect to get blocked! +TEST_F(BgIterationTest, multiNotReplicatedButDelRecreateAccess) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + c = getMultiClient("DEL H1; SET H1 xxx; SET H2 yyy"); + // Now let's process the multi. 
Since H1 & H2 are both future (existing) items, we shouldn't + // block or replicate. + simulateUnblockedWrite(c); // the EXEC + + // Simulate the DEL H1 + server.in_exec = 1; // Simulate actual execution of the MULTI/EXEC + advanceMultiClientToCommand(c, 0); // DEL H1 + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + simpleDelItem(6); // H1 + sds delKey = sdsnew(keyStr(6)); + bgIteration_keyDelete(0, delKey); + sdsfree(delKey); + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); // shouldn't replicate + + // Simulate SET H1 - the key doesn't exist, and would normally replicate and mark early iterate, + // but this is in a transaction, and we are not replicating this transaction. + advanceMultiClientToCommand(c, 1); // SET H1 xxx + simulateUnblockedWriteWithModification(c); + + // Now write to another existing future key - this should work if we weren't confused by the DEL + advanceMultiClientToCommand(c, 2); // SET H2 yyy + simulateUnblockedWriteWithModification(c); + server.in_exec = 0; + + // Now we can continue iterating, and we should pick up keys 1... (and no replication!) + expectReadKeySequence(it, 1, 5); + expectReadKey(it, 6, "xxx"); + expectReadKey(it, 7, "yyy"); + expectReadKeySequence(it, 8, LAST_ITEM); + expectReadComplete(it); +} + + +// For this test, B0 is added into DB1 - so it exists in both DB 0 and 1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track SELECT properly. +TEST_F(BgIterationTest, multiHandlesSelectProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // These cases should NOT block... 
(they access B0 in DB0) + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SELECT 0; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx; SELECT 0"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateBlockedWrite(c); + + expectAnythingCleanup(it); +} + +// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track select properly - WHEN WE HAVE NO +// PERMISSION TO EXECUTE SELECT! +TEST_F(BgIterationTest, multiHandlesSelectNoPermissionProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // No permission for any commands (specifically select/swapdb) + EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_,_,_,_,_,_)) + .Times(AtLeast(1)).WillRepeatedly(Return(ACL_DENIED_CMD)); + + // These cases should NOT block... (they access B0 in DB0) + // The SELECTs below are inconsequential - with/without select, same result. 
+ c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SELECT 0; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block IF SELECT IS WORKING... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; // already starting on DB1 + simulateBlockedWrite(c); // will block, no select + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx; SELECT 0"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + c = getMultiClient("SELECT 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + + expectAnythingCleanup(it); +} + +// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track SWAPDB properly. +TEST_F(BgIterationTest, multiHandlesSwapdbProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // These cases should NOT block... 
(they access B0 in DB0) + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 0 1; SELECT 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SET B0 xxx; SWAPDB 0 1"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SELECT 0; SET B0 xxx; SWAPDB 0 1"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateBlockedWrite(c); + + expectAnythingCleanup(it); +} + +// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track SWAPDB properly - WHEN WE HAVE NO +// PERMISSION TO EXECUTE SWAPDB! +TEST_F(BgIterationTest, multiHandlesSwapdbNoPermissionProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // No permission for any commands (specifically select/swapdb) + EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_,_,_,_,_,_)) + .Times(AtLeast(1)).WillRepeatedly(Return(ACL_DENIED_CMD)); + + // These cases should NOT block... (they access B0 in DB0) + // The SELECTs & SWAPDBs below are inconsequential - with/without select/swapdb, same result. 
+ c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 0 1; SELECT 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block IF SELECT/SWAPDB IS WORKING... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb fails) + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SELECT 0; SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) + + expectAnythingCleanup(it); +} + + +static void *pthreadWait200msAndReadTwoKeys(void *arg) { + bgIterator *it = static_cast(arg); + + usleep(200000); + bgIteratorRead(it); + bgIteratorRead(it); + return nullptr; +} + +static void asyncWait200msAndReadTwoKeys(bgIterator *it) { + int rc; + pthread_attr_t attr; + pthread_t thread; + + rc = pthread_attr_init(&attr); + assert(rc == 0); + rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + assert(rc == 0); + + rc = pthread_create(&thread, &attr, pthreadWait200msAndReadTwoKeys, it); + assert(rc == 0); + + rc = pthread_attr_destroy(&attr); + assert(rc == 0); +} + +TEST_F(BgIterationTest, testLuaWithUndeclaredKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + 
expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // If we fake a modification to key 3, we won't know if it's handled out of order. + // So we fake a modification to key 4 + c = getWriteClient(4, "xxx"); + c->flag.script = 1; + + // Now for a LUA script, we have already blocked (on the eval/evalsha) for any declared keys + // But here, we're about to modify an undeclared key. We can't actually block in the middle + // of the LUA script. So this will behave as unblocked, but incur a synchronous wait. + + // Key 4 will get expedited when we simulate the write. After reading key 4, key 1 will need + // to be read to return key 4 to Valkey, unblocking the synchronous wait. + asyncWait200msAndReadTwoKeys(it); + + monotime blockTimer; + elapsedStart(&blockTimer); + simulateUnblockedWrite(c); // Not blocked, but delays internally + // Must have delayed at least 150ms (some time may have passed before timer start) + EXPECT_GT(elapsedMs(blockTimer), 150u); + + // Continue... + expectReadKeySequence(it, 2, 3); + // 4 has already been processed + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +// Make sure that replication received while processing the last key is sent +TEST_F(BgIterationTest, replicationReceivedWhileProcessingLastKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, LAST_ITEM); + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + expectReadReplication(it, c); // Replication happened while processing the last item, should be here. 
+ + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing + expectReadComplete(it); // We expect to see the completion instead +} + +TEST_F(BgIterationTest, repldoneFunctionCalled) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, LAST_ITEM); + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + + // Since in testing, we are only feeding one item at a time, and synchronously, we won't call + // the repldone function until after we release the last item. + EXPECT_EQ(replDoneConfirmed, 0); + expectReadReplication(it, c); // Replication happened while processing the last item, should be here. + EXPECT_EQ(replDoneConfirmed, 1); // Last key released, now done feeding replication + + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing + expectReadComplete(it); // We expect to see the completion instead +} + +TEST_F(BgIterationTest, repldoneFunctionCalledTwice) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, iteratorRepldoneFnNotBeingReadyInitially, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, LAST_ITEM); + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + + // Won't signal replDone until we've released the final item (which happens when reading the replication) + EXPECT_EQ(replDoneRejected, 0); + EXPECT_EQ(replDoneConfirmed, 0); + expectReadReplication(it, c); // Releases the final item + EXPECT_EQ(replDoneRejected, 1); // replDone called once (and rejected by client) + EXPECT_EQ(replDoneConfirmed, 0); + simulateUnblockedWriteWithModification(c); // This will replicate (because replDone returned false) + expectReadReplication(it, c); // ReplDone gets 
called again (and accepted this time) + EXPECT_EQ(replDoneConfirmed, 1); + + simulateUnblockedWriteWithModification(c); // This won't replicate because replication is done + expectReadComplete(it); // We expect to see the completion instead +} + +// Check that the memory reported for replication is correct +TEST_F(BgIterationTest, checkReplicationByteCount) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); + + c = getWriteClient(0, "xxx"); + int expectedReplicationSize = sizeof(bgIteratorItem); + for (int i = 0; i < c->argc; i++) { + expectedReplicationSize += objectComputeSize(NULL, c->argv[i], 0, 0); + } + + expectReadKey(it, 0); + expectReadKey(it, 1); // Releases and unblocks 0 + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); + + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); + simulateUnblockedWriteWithModification(c); // and write again (2nd replication) + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); + + expectReadKey(it, 2); // Keys 0..2 all in same bucket + + expectReadReplication(it, c); + // After reading the 1st replication, it hasn't been returned yet (it's the active item) + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); + expectReadReplication(it, c); + // After reading the 2nd replication, the 1st has been returned + EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); + + expectReadKey(it, 3); + // Now all replication has been returned/freed + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); + + expectReadKeySequence(it, 4, LAST_ITEM); + expectReadComplete(it); +} + +// Test that for an arbitrary write command having no keys, replication should occur. 
+TEST_F(BgIterationTest, checkNoKeysWriteIsReplicated) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + c = getNoKeysWriteClient(); + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + + expectReadKeySequence(it, 1, 2); // These were already in queue + + expectReadReplication(it, c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); +} From bd0ef8257f926999aa824a819a1e97d2e2aaca2f Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 2 Jun 2026 16:16:05 +0000 Subject: [PATCH 30/40] Forkless Save Signed-off-by: Jim Brunner --- src/unit/test_bgiteration.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index 04ee1c68f17..a91ac25fb78 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -3037,7 +3037,7 @@ TEST_F(BgIterationTest, checkReplicationByteCount) { BGITERATOR_CONSISTENCY_EVENTUAL, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); c = getWriteClient(0, "xxx"); - int expectedReplicationSize = sizeof(bgIteratorItem); + size_t expectedReplicationSize = sizeof(bgIteratorItem); for (int i = 0; i < c->argc; i++) { expectedReplicationSize += objectComputeSize(NULL, c->argv[i], 0, 0); } From 2b59edd22e7bb0554c91c7333df77d9f6cccafca Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 2 Jun 2026 17:19:27 +0000 Subject: [PATCH 31/40] Forkless Save Signed-off-by: Jim Brunner --- src/unit/test_bgiteration.cpp | 44 ++++++++--------------------------- 1 file changed, 10 insertions(+), 34 deletions(-) diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index a91ac25fb78..f1f7b67395f 100644 --- a/src/unit/test_bgiteration.cpp +++ 
b/src/unit/test_bgiteration.cpp @@ -790,6 +790,16 @@ class BgIterationTest : public ::testing::Test { void freeTestClient(client *c) { + // If the current command references one of the multi commands, + // null it out so we don't get a double-free. + if (c->mstate != NULL) { + for (int i = 0; i < c->mstate->count; i++) { + if (c->argv == c->mstate->commands[i].argv) { + c->argv = NULL; + c->argc = 0; + } + } + } freeClientMultiState(c); freeClientArgv(c); @@ -864,40 +874,6 @@ class BgIterationTest : public ::testing::Test { } - // Simulate execution of a MULTI/EXEC transaction for a client `c` without blocking. - // It replays all queued commands and ensures replication matches a real transaction. - // command replication flag is revalidated when exec command is processed. - // This requires a scenario where we don't expect the client to be blocked. - void simulateUnblockedMultiExec(client *c) { - - // simulate EXEC command of the multi/exec client - simulateUnblockedWrite(c); - server.in_exec = 1; - - // If there are other commands, call both blockClientIfRequired and handleCommandReplication for each of the command. - for (int i = 0; i < c->mstate->count; i++) { - advanceMultiClientToCommand(c, i); - simulateUnblockedWrite(c); - - // Replicate MULTI if this is the first instruction inside MULTI/EXEC - if (i == 0) { - robj *argv[1]; - argv[0] = createStringObjectFromCString("multi"); - bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("multi"), 1, argv); - decrRefCount(argv[0]); - } - bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); - } - - // Call handleCommandReplication for EXEC - robj *argv[1]; - argv[0] = createStringObjectFromCString("EXEC"); - bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("exec"), 1, argv); - server.in_exec = 0; - decrRefCount(argv[0]); - } - - // Simulate the expiration (active expiration) of a key. This is independent of command execution. 
void simulateExpiration(int itemNum) { ASSERT_NE(getItem(itemNum), nullptr); // Should be there before expire From 3e4692cfa07ca16d90c9f8eeadde04e3dc8eb7f0 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 2 Jun 2026 17:48:09 +0000 Subject: [PATCH 32/40] Forkless Save Signed-off-by: Jim Brunner --- src/unit/test_bgiteration.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp index f1f7b67395f..5c6b38ce115 100644 --- a/src/unit/test_bgiteration.cpp +++ b/src/unit/test_bgiteration.cpp @@ -171,6 +171,9 @@ class BgIterationTest : public ::testing::Test { MockValkey mock; RealValkey real; client *c = nullptr; // for general use in the tests (with common cleanup) + robj **orig_argv = nullptr; // Used when simulating multi + int orig_argc = 0; // Used when simulating multi + struct serverCommand dummy_cmd = {0}; @@ -707,6 +710,11 @@ class BgIterationTest : public ::testing::Test { // the individual commands within the multi/exec block. void advanceMultiClientToCommand(client *c, int cmdNum) { assert(cmdNum >= 0 && cmdNum < c->mstate->count); + if (cmdNum == 0) { + // Save off the EXEC + orig_argc = c->argc; + orig_argv = c->argv; + } c->argc = c->mstate->commands[cmdNum].argc; c->argv = c->mstate->commands[cmdNum].argv; c->argv_len = c->mstate->commands[cmdNum].argv_len; @@ -790,13 +798,15 @@ class BgIterationTest : public ::testing::Test { void freeTestClient(client *c) { - // If the current command references one of the multi commands, - // null it out so we don't get a double-free. 
+ // If the current command references one of the multi commands, set it back to the EXEC if (c->mstate != NULL) { for (int i = 0; i < c->mstate->count; i++) { if (c->argv == c->mstate->commands[i].argv) { - c->argv = NULL; - c->argc = 0; + c->argc = orig_argc; + c->argv = orig_argv; + orig_argc = 0; + orig_argv = nullptr; + break; } } } From 7a79901d5dadf1707e4ec4a4454242565fb446fc Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 4 Jun 2026 17:56:33 +0000 Subject: [PATCH 33/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index d323b6001e1..8e5e55bf331 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -13,9 +13,6 @@ #include "mutexqueue.h" #include "server.h" -// Just for the moment, until https://github.com/valkey-io/valkey/issues/3450 is resolved -// clang-format off - int getFlushCommandFlags(client *c, int *flags); // in db.c uint64_t dictObjHash(const void *key); // in server.c int dictObjKeyCompare(const void *key1, const void *key2); // in server.c @@ -23,22 +20,15 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid); robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds key, long long expire); // in object.c -static bool receiveItemsBackFromOneIterator(bgIterator *it); // in bgiteration.c - forward declaration - -// ################ TEMP COMPILE HACKS ########################### -// Issue found. server.db has changed from an array of db to an array of pointers to db (change all refs to server.db) -// Issue: iterators (kvstore/hashtable) are not safe across event loop invocations. Hashtable (kvstore?) needs to track and maintain safe iterators. +static bool receiveItemsBackFromOneIterator(bgIterator *it); -// Don't think there's any current need for this... 
+// Future extendability static bool ignoreKeyForSave(const_sds key) { UNUSED(key); return false; } -//------- END OF COMPILE HACKS ------------------- - - // Returns true if the cmd is a script command that may replicate. static bool isScriptCallWriteCmd(struct serverCommand *cmd) { return ((cmd->proc == fcallCommand) || (cmd->proc == evalCommand) || (cmd->proc == evalShaCommand)); @@ -56,6 +46,9 @@ static bool isDeleteCmd(struct serverCommand *cmd) { } +/* This utility utilizes the main thread and background threads for processing. The API is split, + * with some of the functions intended for the main thread and others intended for the background + * clients. This sanity check ensures that we maintain thread safety, calling the API as intended. */ static bool onValkeyMainThread(void) { // Modules interact with the main thread using a mutex. If a module owns the mutex, consider // that equivalent to being on the main thread. @@ -63,6 +56,7 @@ static bool onValkeyMainThread(void) { return (inModule || pthread_equal(server.main_thread_id, pthread_self()) != 0); } + /* Parse a parameters robj, extracting a valid DBID. * Returns FALSE if DBID isn't valid. */ @@ -173,6 +167,7 @@ static void resumeReshahForKvsHashtable(kvstore *kvs, int didx) { if (ht != NULL) hashtableResumeRehashing(ht); } + /* DictType for SDS->ptr. The SDS is referenced, no destructor. */ static dictType sdsrefToPtrDictType = { .entryGetKey = dictEntryGetKey, @@ -240,6 +235,7 @@ typedef enum { BGITERATION_TYPE_CLUSTERSLOT } bgIterationType; + /* Flag indicates that a consistent iteration is required. This is used to create a point-in-time * iteration. The iteration client will see all keys AS THEY EXISTED at the time when the iterator * was created. @@ -254,6 +250,7 @@ typedef enum { * NOTE: Replication events will be provided ordered and synchronized with any SWAPDB events. */ #define BGITERATOR_FLAG_REPLICATION (1 << 1) + /* Extensions to bgIteratorItemType. 
These enumerations are used internally, and are not part of * the published interface. These allow for extensibility in the internal information-passing * between the Valkey main thread and the iteration client thread. */ @@ -270,6 +267,7 @@ typedef struct { bgIterator *iter; } bgIteratorItemExtClose; + /* Used for dictEntryPtrDictType. This dict grows and shrinks constantly during the iteration. * There is no point to rehash it all the time. */ static int neverShrink(size_t moreMem, double usedRatio) { @@ -280,9 +278,9 @@ static int neverShrink(size_t moreMem, double usedRatio) { // A dictionary with a pointer (itself) as a key (the address pointed to is NOT referenced). // Nothing is duplicated, this is a very fast dictionary, but potentially unsafe if the original // items are deleted or moved. -// WARNING: Can't have active defrag running! It might reallocate memory blocks, swapping their -// pointer values! A check must be made in active defrag to ensure that no iteration is -// active. +// WARNING: This needs to maintain safety with things that may move the object. +// * In db.c, if the object is reallocated, bgIteration_updateDbEntryPtr() is called. 
+// * In defrag.c, we don't defrag if there are multiple references to an object (and we incr the refcount) // Thomas Wang's 64-bit mix static uint64_t pointerHash(const void *key) { @@ -2393,7 +2391,7 @@ void bgIteration_keyDelete(int dbid, const_sds key) { if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { if (!it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) - && !(dictFind(it->early_iterate_entries, de) != NULL)) { + && (dictFind(it->early_iterate_entries, de) == NULL)) { addEarlyIterationKey(it, de, dbid); // (may also add to inUseEntries) } } From 7a150698d5518507dfb38ef8808a4e51fc4be22a Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 4 Jun 2026 22:31:59 +0000 Subject: [PATCH 34/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 746 ++++++++++++++++++++++------------------------ 1 file changed, 357 insertions(+), 389 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index 8e5e55bf331..c050891939c 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -23,7 +23,7 @@ robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds static bool receiveItemsBackFromOneIterator(bgIterator *it); -// Future extendability +// Future extendability static bool ignoreKeyForSave(const_sds key) { UNUSED(key); return false; @@ -76,7 +76,7 @@ static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid *target_dbid = selected_dbid; - for (int i = COPY_COMMAND_OPTIONAL_ARG_START_INDEX; i < argc; i++) { + for (int i = COPY_COMMAND_OPTIONAL_ARG_START_INDEX; i < argc; i++) { if (!strcasecmp((char *)objectGetVal(argv[i]), "replace")) { continue; } else if (!strcasecmp((char *)objectGetVal(argv[i]), "db") && (i + 1 < argc)) { @@ -122,7 +122,7 @@ static bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, if (getLongLongFromObject(argv[2], &dbid2) != C_OK) return 
false; if (dbid1 < 0 || dbid1 >= server.dbnum) return false; if (dbid2 < 0 || dbid2 >= server.dbnum) return false; - if (dbid1 == dbid2) return false; // Valid, but doesn't do anything + if (dbid1 == dbid2) return false; // Valid, but doesn't do anything *id1_p = (int)dbid1; *id2_p = (int)dbid2; @@ -173,8 +173,7 @@ static dictType sdsrefToPtrDictType = { .entryGetKey = dictEntryGetKey, .hashFunction = dictSdsHash, .keyCompare = dictSdsKeyCompare, - .entryDestructor = zfree -}; + .entryDestructor = zfree}; /* Wrap decrRefCount() so that it can be used as a callback requiring void. */ @@ -195,18 +194,18 @@ static sds createSdsFromClientArgv(int argc, robj **argv) { } -//########################################################################### +// ########################################################################### /* bgIteration internal (compile time) configuration values */ enum { - BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE = 16384, // Prevent initial rehashing - BGITER_MAX_CLONE_ITEM_BYTES = 512, // Max size item to clone - BGITER_MAX_CLONE_POOL_BYTES = (1 * 1024 * 1024), // Total limit for all cloned items - BGITER_QUEUE_INCREASE_INCR = 100, // Step size when increasing queue target - BGITER_CYCLE_DELAY_MS = 2, // Delay between calls on bgIteration timer - BGITER_CYCLE_BUDGET_MS = 1, // Normal time limit for timer processing - BGITER_CYCLE_BUDGET_MAX_MS = 10 // Maximum time limit when starvation seen + BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE = 16384, // Prevent initial rehashing + BGITER_MAX_CLONE_ITEM_BYTES = 512, // Max size item to clone + BGITER_MAX_CLONE_POOL_BYTES = (1 * 1024 * 1024), // Total limit for all cloned items + BGITER_QUEUE_INCREASE_INCR = 100, // Step size when increasing queue target + BGITER_CYCLE_DELAY_MS = 2, // Delay between calls on bgIteration timer + BGITER_CYCLE_BUDGET_MS = 1, // Normal time limit for timer processing + BGITER_CYCLE_BUDGET_MAX_MS = 10 // Maximum time limit when starvation seen }; // dbEntry metadata @@ -299,79 
+298,77 @@ static int pointerCompare(const void *key1, const void *key2) { return key1 == key2; } -static dictType dictEntryPtrDictType = { - .entryGetKey = dictEntryGetKey, - .hashFunction = pointerHash, - .keyCompare = pointerCompare, - .resizeAllowed = neverShrink, - .entryDestructor = zfree -}; - -// A TEMP set of robj's (of type sds). This is only for temporary sets as the robj's are not -// ref-counted at insertion/deletion. -static hashtableType tempKeysetHashtableType = { - .hashFunction = dictObjHash, - .keyCompare = dictObjKeyCompare -}; - -typedef struct genericIterator genericIterator; -typedef void (*iteratorReleaseFunc) (genericIterator *genIt); -typedef fifo * (*iteratorGetEntriesFunc) (genericIterator *genIt, int *orig_dbid, int *cur_dbid); -typedef void (*iteratorSwapDbFunc) (genericIterator *genIt, int db1, int db2); -typedef void (*iteratorFlushDbFunc) (genericIterator *genIt, int cur_dbid); -typedef bool (*iteratorHasPassedItemFunc) (genericIterator *genIt, const_sds key, int cur_dbid); -typedef int (*iteratorOriginalDbFunc) (genericIterator *genIt, int cur_dbid); -typedef bool (*iteratorIsKeyInScopeFunc) (genericIterator *genIt, const_sds key); - -// Function pointers supporting polymorphic iterator implementation -struct genericIterator { - iteratorReleaseFunc release; - iteratorGetEntriesFunc getEntries; - iteratorSwapDbFunc swapDb; - iteratorFlushDbFunc flushDb; - iteratorHasPassedItemFunc hasPassedItem; - iteratorOriginalDbFunc originalDb; - iteratorIsKeyInScopeFunc isKeyInScope; -}; - +// A free list for bgIteratorItem's - avoids churning zmalloc calls typedef struct itemListNode { struct itemListNode *next; } itemListNode; static itemListNode *freeItemStackHead = NULL; -static void itemFreeList_returnItemBackToFreeList(bgIteratorItem* item) { - itemListNode *freedNode = (itemListNode*)item; +static void itemFreeList_returnItemBackToFreeList(bgIteratorItem *item) { + itemListNode *freedNode = (itemListNode *)item; freedNode->next = 
freeItemStackHead; freeItemStackHead = freedNode; } +// Pop a free node from the free list or allocate if none free static bgIteratorItem *itemFreeList_getElementOrAllocate(void) { - bgIteratorItem *item; - // Pop a free node from the free list or allocate if none free if (freeItemStackHead) { - item = (bgIteratorItem*)freeItemStackHead; + item = (bgIteratorItem *)freeItemStackHead; freeItemStackHead = freeItemStackHead->next; - if (freeItemStackHead) { - valkey_prefetch(freeItemStackHead); - } - } - else { + if (freeItemStackHead) valkey_prefetch(freeItemStackHead); + } else { // Create new listNode and item item = zmalloc(sizeof(bgIteratorItem)); } return item; } - -static void itemFreeList_release(void) { - while(freeItemStackHead) { + +static void itemFreeList_release(void) { + while (freeItemStackHead) { itemListNode *node = freeItemStackHead; freeItemStackHead = node->next; - zfree((bgIteratorItem*)node); + zfree(node); } } + +static dictType dictEntryPtrDictType = { + .entryGetKey = dictEntryGetKey, + .hashFunction = pointerHash, + .keyCompare = pointerCompare, + .resizeAllowed = neverShrink, + .entryDestructor = zfree}; + +// A TEMP set of robj's (of type sds). This is only for temporary sets as the robj's are not +// ref-counted at insertion/deletion. 
+static hashtableType tempKeysetHashtableType = { + .hashFunction = dictObjHash, + .keyCompare = dictObjKeyCompare}; + + +typedef struct genericIterator genericIterator; +typedef void (*iteratorReleaseFunc)(genericIterator *genIt); +typedef fifo *(*iteratorGetEntriesFunc)(genericIterator *genIt, int *orig_dbid, int *cur_dbid); +typedef void (*iteratorSwapDbFunc)(genericIterator *genIt, int db1, int db2); +typedef void (*iteratorFlushDbFunc)(genericIterator *genIt, int cur_dbid); +typedef bool (*iteratorHasPassedItemFunc)(genericIterator *genIt, const_sds key, int cur_dbid); +typedef int (*iteratorOriginalDbFunc)(genericIterator *genIt, int cur_dbid); +typedef bool (*iteratorIsKeyInScopeFunc)(genericIterator *genIt, const_sds key); + +// Function pointers supporting polymorphic iterator implementation +struct genericIterator { + iteratorReleaseFunc release; + iteratorGetEntriesFunc getEntries; + iteratorSwapDbFunc swapDb; + iteratorFlushDbFunc flushDb; + iteratorHasPassedItemFunc hasPassedItem; + iteratorOriginalDbFunc originalDb; + iteratorIsKeyInScopeFunc isKeyInScope; +}; + + // This struct is used across threads. Unless otherwise noted, the fields are initialized at // iterator creation (within the main thread) and are read-only by the client thread. struct bgIterator { @@ -387,7 +384,7 @@ struct bgIterator { genericIterator *keyset_iter; // Low-level iterator (polymorphic) dict *early_iterate_entries; // Used to keep track of what items have already been iterated - // over by out-of-order expedited process, ensuring a bgIterator + // over by out-of-order expedited process, ensuring a bgIterator // does not try to reprocess items. // Used only by main thread. 
// dictEntry -> NULL @@ -395,7 +392,7 @@ struct bgIterator { mutexQueue *items_for_iterator; // Created/Destroyed in main thread, used in both (threadsafe) mutexQueue *return_to_valkey; // Queue of items to be returned to the Valkey main thread (threadsafe) - + unsigned int item_count_target; // Used only by main thread bgIteratorItem *volatile current_item; // current_item is normally only used in the iteration client. @@ -482,7 +479,6 @@ static const bool BGITERATION_DEBUG = false; // DO NOT SUBMIT WITH THIS SYMBOL S static sds debugBuffer; - //============================================================================================= // Full Scan Iterator //============================================================================================= @@ -592,7 +588,7 @@ static void fullScanIteratorFlushDb(genericIterator *genIt, int cur_dbid) { } static bool fullScanIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) { - struct fullScanIterator *it = (struct fullScanIterator *) genIt; + struct fullScanIterator *it = (struct fullScanIterator *)genIt; int orig_dbid = it->cur_to_orig_db[cur_dbid]; if (orig_dbid < it->iter_db) return true; // Entire DB has already been processed @@ -630,7 +626,7 @@ static bool fullScanIteratorIsKeyInScope(genericIterator *genIt, const_sds key) return true; // All keys are in scope } -static genericIterator * fullScanIteratorCreate(void) { +static genericIterator *fullScanIteratorCreate(void) { struct fullScanIterator *it = zmalloc(sizeof(struct fullScanIterator)); it->orig_to_cur_db = zmalloc(sizeof(int) * server.dbnum); it->cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum); @@ -653,7 +649,6 @@ static genericIterator * fullScanIteratorCreate(void) { } - //============================================================================================= // Cluster Slot Iterator //============================================================================================= @@ -669,7 +664,7 @@ static void 
clusterSlotIteratorRelease(genericIterator *genIt) { serverAssert(false); // Not yet implemented } -static fifo * clusterSlotIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) { +static fifo *clusterSlotIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) { UNUSED(genIt); UNUSED(orig_dbid); UNUSED(cur_dbid); @@ -711,7 +706,7 @@ static bool clusterSlotIteratorIsKeyInScope(genericIterator *genIt, const_sds ke serverAssert(false); // Not yet implemented } -static genericIterator * clusterSlotIteratorCreate(const int *slots, size_t slots_count) { +static genericIterator *clusterSlotIteratorCreate(const int *slots, size_t slots_count) { struct clusterSlotIterator *it = zmalloc(sizeof(struct clusterSlotIterator)); it->callbacks.release = clusterSlotIteratorRelease; it->callbacks.getEntries = clusterSlotIteratorGetEntries; @@ -729,7 +724,6 @@ static genericIterator * clusterSlotIteratorCreate(const int *slots, size_t slot } - //============================================================================================= // General iteration support (across all iterators) //============================================================================================= @@ -738,35 +732,35 @@ static genericIterator * clusterSlotIteratorCreate(const int *slots, size_t slot // rehashing by the main thread. Returns true if rehashing was paused. 
static bool pauseRehashing(dbEntry *de) { switch (de->encoding) { - case OBJ_ENCODING_HASHTABLE: { // SET or HASH - hashtable *ht = objectGetVal(de); - hashtablePauseRehashing(ht); - return true; - } - case OBJ_ENCODING_SKIPLIST: { // SORTED SET - zset *zs = objectGetVal(de); - hashtablePauseRehashing(zs->ht); - return true; - } - default: - return false; + case OBJ_ENCODING_HASHTABLE: { // SET or HASH + hashtable *ht = objectGetVal(de); + hashtablePauseRehashing(ht); + return true; + } + case OBJ_ENCODING_SKIPLIST: { // SORTED SET + zset *zs = objectGetVal(de); + hashtablePauseRehashing(zs->ht); + return true; + } + default: + return false; } } static void resumeRehashing(dbEntry *de) { switch (de->encoding) { - case OBJ_ENCODING_HASHTABLE: { // SET or HASH - hashtable *ht = objectGetVal(de); - hashtableResumeRehashing(ht); - break; - } - case OBJ_ENCODING_SKIPLIST: { // SORTED SET - zset *zs = objectGetVal(de); - hashtableResumeRehashing(zs->ht); - break; - } - default: - break; + case OBJ_ENCODING_HASHTABLE: { // SET or HASH + hashtable *ht = objectGetVal(de); + hashtableResumeRehashing(ht); + break; + } + case OBJ_ENCODING_SKIPLIST: { // SORTED SET + zset *zs = objectGetVal(de); + hashtableResumeRehashing(zs->ht); + break; + } + default: + break; } } @@ -813,10 +807,8 @@ static ssize_t computeStringDbEntrySize(dbEntry *de) { static dbEntry *tryCloneDbEntry(dbEntry *de) { - if (bgiteration_current_clone_memory_pool_size + bgiter_max_clone_item_bytes - > bgiter_max_clone_pool_bytes) { - return NULL; - } + if (bgiteration_current_clone_memory_pool_size + bgiter_max_clone_item_bytes > + bgiter_max_clone_pool_bytes) return NULL; // Future optimization: Incorporate small ziplists, sorted sets, etc. // OBJ_ENCODING_INT is omitted only because there isn't a good API for cloning it yet. 
@@ -826,8 +818,8 @@ static dbEntry *tryCloneDbEntry(dbEntry *de) { if (itemSize <= bgiter_max_clone_item_bytes) { bgiteration_current_clone_memory_pool_size += itemSize; dbEntry *clone = createStringObjectWithKeyAndExpire((char *)objectGetVal(de), sdslen(objectGetVal(de)), objectGetKey(de), objectGetExpire(de)); - ((bgIterationEntryMetadata *)objectGetMetadata(clone))->iterator_epoch - = ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch; + ((bgIterationEntryMetadata *)objectGetMetadata(clone))->iterator_epoch = + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch; return clone; } } @@ -845,7 +837,7 @@ static void freeClonedDictEntry(dbEntry *clonedEntry) { decrRefCount(clonedEntry); } -static bgIteratorItem * makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) { +static bgIteratorItem *makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) { if (!isCloned) incrementEntryInuse(de); bgIteratorItem *item = itemFreeList_getElementOrAllocate(); @@ -858,8 +850,8 @@ static bgIteratorItem * makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) { return item; } -static robj ** cloneRobjArray(int argc, robj **argv) { - robj **newarray = zmalloc(sizeof(robj*) * argc); +static robj **cloneRobjArray(int argc, robj **argv) { + robj **newarray = zmalloc(sizeof(robj *) * argc); for (int i = 0; i < argc; i++) { newarray[i] = argv[i]; incrRefCount(argv[i]); @@ -882,32 +874,32 @@ static void returnCurrentItemToValkey(bgIterator *it) { if (item == NULL) return; switch (item->type) { - case BGITERATOR_ITEM_DBENTRY: - it->dbentries_processed++; - if (item->u.dbe.is_cloned) it->dbentry_clones_processed++; - mutexQueueAdd(it->return_to_valkey, item); - break; - case BGITERATOR_ITEM_REPLICATION: - it->replication_processed++; - mutexQueueAdd(it->return_to_valkey, item); - break; - case BGITERATOR_ITEM_SWAPDB: - it->swapdb_processed++; - mutexQueueAdd(it->return_to_valkey, item); - break; - case BGITERATOR_ITEM_FLUSHDB: - it->flushdb_processed++; - 
mutexQueueAdd(it->return_to_valkey, item); - break; - - case BGITERATOR_ITEM_COMPLETE: - case BGITERATOR_ITEM_TERMINATED: - // These are static and just used to wake the iterator - they should never be returned. - serverAssert(false); - break; + case BGITERATOR_ITEM_DBENTRY: + it->dbentries_processed++; + if (item->u.dbe.is_cloned) it->dbentry_clones_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_REPLICATION: + it->replication_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_SWAPDB: + it->swapdb_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_FLUSHDB: + it->flushdb_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + + case BGITERATOR_ITEM_COMPLETE: + case BGITERATOR_ITEM_TERMINATED: + // These are static and just used to wake the iterator - they should never be returned. + serverAssert(false); + break; - default: - serverAssert(false); + default: + serverAssert(false); } // Do this AFTER placing into return_to_valkey. 
This is volatile and snooped when there is a @@ -916,7 +908,6 @@ static void returnCurrentItemToValkey(bgIterator *it) { } - //============================================================================================= // Background Iterator (private) //============================================================================================= @@ -948,9 +939,9 @@ static void bgIteratorRelease(bgIterator *it) { static bool shouldFeedIteratorMore(bgIterator *it) { - return (!it->completed - && !it->terminated - && mutexQueueLength(it->items_for_iterator) < it->item_count_target); + return (!it->completed && + !it->terminated && + mutexQueueLength(it->items_for_iterator) < it->item_count_target); } @@ -990,10 +981,10 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { if (dbEntryFifo == NULL) { // Iteration of items is complete for this iterator - serverAssert(it->dbentries_queued >= it->dbentries_processed); - serverAssert(it->replication_queued >= it->replication_processed); - serverAssert(it->swapdb_queued >= it->swapdb_processed); - serverAssert(it->flushdb_queued >= it->flushdb_processed); + serverAssert(it->dbentries_queued >= it->dbentries_processed); + serverAssert(it->replication_queued >= it->replication_processed); + serverAssert(it->swapdb_queued >= it->swapdb_processed); + serverAssert(it->flushdb_queued >= it->flushdb_processed); serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); // Snapshot queue size to seed next iterator when terminated @@ -1013,7 +1004,7 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { } } bgIteratorItem *completionItem = itemFreeList_getElementOrAllocate(); - *completionItem = (bgIteratorItem){ .type = BGITERATOR_ITEM_COMPLETE }; + *completionItem = (bgIteratorItem){.type = BGITERATOR_ITEM_COMPLETE}; if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { rdbSaveInfo rsi; completionItem->dbid = (rdbPopulateSaveInfo(&rsi)) ? 
rsi.repl_stream_db : 0; @@ -1040,8 +1031,8 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { fifoPop(dbEntryFifo, (void **)&de); // Remove new/modified items during consistent iteration. - if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT - && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch > it->consistent_modification_id) { + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT && + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch > it->consistent_modification_id) { continue; } @@ -1090,10 +1081,10 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_dbid) { int rc = dictAdd(it->early_iterate_entries, earlyEntry, NULL); serverAssert(rc == DICT_OK); - + int dbid = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) - ? it->keyset_iter->originalDb(it->keyset_iter, cur_dbid) - : cur_dbid; + ? it->keyset_iter->originalDb(it->keyset_iter, cur_dbid) + : cur_dbid; dbEntry *cloneEntry = tryCloneDbEntry(earlyEntry); bool isClonedEntry = (cloneEntry != NULL); @@ -1124,12 +1115,10 @@ static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_db // This expedites a single key and doesn't attempt to avoid expediting through optimization. 
-static bool expediteSingleKeyWithoutOptimization( - bgIterator *it, - int dbid, - robj *oKey, - hashtable *waitingOnKeys) { - +static bool expediteSingleKeyWithoutOptimization(bgIterator *it, + int dbid, + robj *oKey, + hashtable *waitingOnKeys) { bool mustBlock = false; bool iterComplete = it->completed || it->terminated; @@ -1137,11 +1126,11 @@ static bool expediteSingleKeyWithoutOptimization( sds key = objectGetVal(oKey); dbEntry *de = dbFind(server.db[dbid], key); if (de != NULL) { - if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) - && (dictFind(it->early_iterate_entries, de) == NULL)) { + if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) && + (dictFind(it->early_iterate_entries, de) == NULL)) { if (addEarlyIterationKey(it, de, dbid)) { mustBlock = true; - hashtableAdd(waitingOnKeys, oKey); + hashtableAdd(waitingOnKeys, oKey); } } else { if (isEntryInuseByAnyIterator(de)) { @@ -1157,12 +1146,11 @@ static bool expediteSingleKeyWithoutOptimization( // MOVE/COPY are unfortunate special commands. They work on 2 DBs at once. const int MOVE_COMMAND_DBID_ARG_INDEX = 2; -static bool expediteKeysForMove( - bgIterator *it, - int dbid, - int argc, - robj **argv, - hashtable *waitingOnKeys) { +static bool expediteKeysForMove(bgIterator *it, + int dbid, + int argc, + robj **argv, + hashtable *waitingOnKeys) { if (argc <= MOVE_COMMAND_DBID_ARG_INDEX) return false; int destDbid; @@ -1184,13 +1172,11 @@ static bool expediteKeysForMove( // MOVE/COPY are unfortunate special commands. They work on 2 DBs at once. 
-static bool expediteKeysForCopy( - bgIterator *it, - int dbid, - int argc, - robj **argv, - hashtable *waitingOnKeys) { - +static bool expediteKeysForCopy(bgIterator *it, + int dbid, + int argc, + robj **argv, + hashtable *waitingOnKeys) { int destDbid; if (!getTargetDbIdForCopyCommand(argc, argv, dbid, &destDbid)) return false; @@ -1240,15 +1226,14 @@ static bool expediteKeysForCopy( * - Block if any write-key is in use by an the iterator * - Block and immediately queue any key (read or write) that has not already been iterated */ -static bool expediteKeysForWrite( - bgIterator *it, - int dbid, - struct serverCommand *cmd, - int argc, - robj **argv, - keyReference *keyrefs, - int numKeys, - hashtable *waitingOnKeys) { +static bool expediteKeysForWrite(bgIterator *it, + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv, + keyReference *keyrefs, + int numKeys, + hashtable *waitingOnKeys) { serverAssert(numKeys > 0); bool mustBlock = false; @@ -1263,8 +1248,8 @@ static bool expediteKeysForWrite( // Note: performance optimization for commands which only modify the first key. If this flag // is not available, we can safely remove this `if` statement. - if ((cmd->flags & CMD_WRITE_FIRSTKEY_ONLY) - && !(it->iteration_flags & BGITERATOR_FLAG_REPLICATION)) { + if ((cmd->flags & CMD_WRITE_FIRSTKEY_ONLY) && + !(it->iteration_flags & BGITERATOR_FLAG_REPLICATION)) { // If this write command only modifies the 1st key, we don't need to expedite others // unless replication enabled. 
numKeys = 1; @@ -1289,12 +1274,12 @@ static bool expediteKeysForWrite( sds key = objectGetVal(oKey); dbEntry *de = dbFind(server.db[dbid], key); if (de == NULL) continue; // New key, no need to expedite - if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) - && dictFind(it->early_iterate_entries, de) == NULL - && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { + if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) && + dictFind(it->early_iterate_entries, de) == NULL && + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { if (addEarlyIterationKey(it, de, dbid)) { mustBlock = true; - hashtableAdd(waitingOnKeys, oKey); + hashtableAdd(waitingOnKeys, oKey); } } else { if (isEntryInuseByAnyIterator(de)) { @@ -1327,9 +1312,9 @@ static bool expediteKeysForWrite( } continue; } - if (iterComplete - || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) - || (dictFind(it->early_iterate_entries, de) != NULL)) { + if (iterComplete || + it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) || + (dictFind(it->early_iterate_entries, de) != NULL)) { someIterated = true; } else { dictAdd(notIteratedKeys, de, oKey); @@ -1373,7 +1358,7 @@ static bool expediteKeysForWrite( if (addEarlyIterationKey(it, notIteratedEntry, dbid)) { mustBlock = true; - hashtableAdd(waitingOnKeys, oKey); + hashtableAdd(waitingOnKeys, oKey); } } dictReleaseIterator(di); @@ -1418,35 +1403,35 @@ static void returnAllItemsToValkey(bgIterator *it) { bgIteratorItem *item; fifoPop(poppedFifo, (void **)&item); switch (item->type) { - // back out the "queued" statistic - case BGITERATOR_ITEM_DBENTRY: - it->dbentries_queued--; - if (item->u.dbe.is_cloned) it->dbentry_clones_queued--; - break; - case BGITERATOR_ITEM_REPLICATION: - it->replication_queued--; - break; - case BGITERATOR_ITEM_SWAPDB: - it->swapdb_queued--; - break; - case 
BGITERATOR_ITEM_FLUSHDB: - it->flushdb_queued--; - break; - - case BGITERATOR_ITEM_COMPLETE: - // This can only happen if the completion item has been enqueued and - // the iterator is terminated before reaching the completion item. - itemFreeList_returnItemBackToFreeList(item); - continue; // Skip pushing this onto itemsToReturn - - case BGITERATOR_ITEM_TERMINATED: - // This can only happen if there is a race when terminating between - // the iteration client and main thread. - itemFreeList_returnItemBackToFreeList(item); - continue; // Skip pushing this onto itemsToReturn - - default: - serverAssert(false); + // back out the "queued" statistic + case BGITERATOR_ITEM_DBENTRY: + it->dbentries_queued--; + if (item->u.dbe.is_cloned) it->dbentry_clones_queued--; + break; + case BGITERATOR_ITEM_REPLICATION: + it->replication_queued--; + break; + case BGITERATOR_ITEM_SWAPDB: + it->swapdb_queued--; + break; + case BGITERATOR_ITEM_FLUSHDB: + it->flushdb_queued--; + break; + + case BGITERATOR_ITEM_COMPLETE: + // This can only happen if the completion item has been enqueued and + // the iterator is terminated before reaching the completion item. + itemFreeList_returnItemBackToFreeList(item); + continue; // Skip pushing this onto itemsToReturn + + case BGITERATOR_ITEM_TERMINATED: + // This can only happen if there is a race when terminating between + // the iteration client and main thread. 
+ itemFreeList_returnItemBackToFreeList(item); + continue; // Skip pushing this onto itemsToReturn + + default: + serverAssert(false); } fifoPush(itemsToReturn, item); @@ -1461,7 +1446,6 @@ static void returnAllItemsToValkey(bgIterator *it) { } - //============================================================================================= // Foreground support functions (private) //============================================================================================= @@ -1478,94 +1462,90 @@ static size_t replicationItemSize(bgIteratorItem *item) { static void processReturnOfItemToValkey(bgIteratorItem *item, bgIterator *iter) { serverAssert(onValkeyMainThread()); switch ((int)item->type) { - case BGITERATOR_ITEM_REPLICATION: - bufferedReplicationBytes -= replicationItemSize(item); - freeRobjArray(item->u.repl.argc, item->u.repl.argv); - break; - - case BGITERATOR_ITEM_DBENTRY: - { - if (item->u.dbe.is_cloned) { - freeClonedDictEntry(item->u.dbe.de); - } else { - if (isEntryInuseBySingleIterator(item->u.dbe.de)) { - // This blocking mechanism isn't the best. Written for slot-migration, - // it assumes a single DB so if the same key appears in multiple DBs, - // commands might get unblocked only to get blocked again. (This would - // happen only rarely, and with minimal impact.) 
- robj *key = createStringObjectFromSds(objectGetKey(item->u.dbe.de)); - unblockClientsInUseOnKey(key); - decrRefCount(key); - } - // resumeRehashing must be called before decrementEntryInuse, since decrementEntryInuse can free - if (item->u.dbe.is_rehashing_paused) resumeRehashing(item->u.dbe.de); - decrementEntryInuse(item->u.dbe.de); - } + case BGITERATOR_ITEM_REPLICATION: + bufferedReplicationBytes -= replicationItemSize(item); + freeRobjArray(item->u.repl.argc, item->u.repl.argv); + break; + + case BGITERATOR_ITEM_DBENTRY: + if (item->u.dbe.is_cloned) { + freeClonedDictEntry(item->u.dbe.de); + } else { + if (isEntryInuseBySingleIterator(item->u.dbe.de)) { + // This blocking mechanism isn't the best. Written for slot-migration, + // it assumes a single DB so if the same key appears in multiple DBs, + // commands might get unblocked only to get blocked again. (This would + // happen only rarely, and with minimal impact.) + robj *key = createStringObjectFromSds(objectGetKey(item->u.dbe.de)); + unblockClientsInUseOnKey(key); + decrRefCount(key); } - break; - - case BGITERATOR_ITEM_SWAPDB: - case BGITERATOR_ITEM_FLUSHDB: - break; - - case BGITERATOR_ITEMEXT_ITER_CLOSED: - { - bgIterator *it = ((bgIteratorItemExtClose*)item)->iter; - serverAssert(it == iter); - if (it->terminated) { - // Abnormal termination - // Normally the item is TERMINATED, but might be COMPLETE in race - serverAssert(it->current_item->type == BGITERATOR_ITEM_TERMINATED - || it->current_item->type == BGITERATOR_ITEM_COMPLETE); - // Release any items stranded on the iterator after early termination - returnAllItemsToValkey(it); - receiveItemsBackFromOneIterator(it); - } else { - // Normal completion - serverAssert(it->current_item->type == BGITERATOR_ITEM_COMPLETE); - } - serverAssert(mutexQueueLength(it->items_for_iterator) == 0); - serverAssert(it->dbentries_queued == it->dbentries_processed); - serverAssert(it->replication_queued == it->replication_processed); - serverAssert(it->swapdb_queued 
== it->swapdb_processed); - serverAssert(it->flushdb_queued == it->flushdb_processed); - serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); + // resumeRehashing must be called before decrementEntryInuse, since decrementEntryInuse can free + if (item->u.dbe.is_rehashing_paused) resumeRehashing(item->u.dbe.de); + decrementEntryInuse(item->u.dbe.de); + } + break; + + case BGITERATOR_ITEM_SWAPDB: + case BGITERATOR_ITEM_FLUSHDB: + break; + + case BGITERATOR_ITEMEXT_ITER_CLOSED: { + bgIterator *it = ((bgIteratorItemExtClose *)item)->iter; + serverAssert(it == iter); + if (it->terminated) { + // Abnormal termination + // Normally the item is TERMINATED, but might be COMPLETE in race + serverAssert(it->current_item->type == BGITERATOR_ITEM_TERMINATED || + it->current_item->type == BGITERATOR_ITEM_COMPLETE); + // Release any items stranded on the iterator after early termination + returnAllItemsToValkey(it); + receiveItemsBackFromOneIterator(it); + } else { + // Normal completion + serverAssert(it->current_item->type == BGITERATOR_ITEM_COMPLETE); + } + serverAssert(mutexQueueLength(it->items_for_iterator) == 0); + serverAssert(it->dbentries_queued == it->dbentries_processed); + serverAssert(it->replication_queued == it->replication_processed); + serverAssert(it->swapdb_queued == it->swapdb_processed); + serverAssert(it->flushdb_queued == it->flushdb_processed); + serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); - listEmpty(curCmdMissingKeys); // Just in case any remain + listEmpty(curCmdMissingKeys); // Just in case any remain - itemFreeList_returnItemBackToFreeList(it->current_item); - it->current_item = NULL; + itemFreeList_returnItemBackToFreeList(it->current_item); + it->current_item = NULL; - bool terminated = it->terminated; - void *privdata = it->privdata; - bgIteratorCleanupFunc cleanup = it->cleanup; - bgIteratorRelease(it); // Fully release the iterator before calling cleanup + bool terminated = it->terminated; + void 
*privdata = it->privdata; + bgIteratorCleanupFunc cleanup = it->cleanup; + bgIteratorRelease(it); // Fully release the iterator before calling cleanup - if (BGITERATION_DEBUG) { - if (cleanup) debugBuffer = sdscatprintf(debugBuffer, "CLEANUP FN (%s)\n", - (terminated) ? "terminated" : "success"); + if (BGITERATION_DEBUG) { + if (cleanup) debugBuffer = sdscatprintf(debugBuffer, "CLEANUP FN (%s)\n", + (terminated) ? "terminated" : "success"); - sds filename = sdscatprintf(sdsempty(), "bgiteration_debug.%d", getpid()); - FILE *f = fopen(filename, "w"); - sdsfree(filename); + sds filename = sdscatprintf(sdsempty(), "bgiteration_debug.%d", getpid()); + FILE *f = fopen(filename, "w"); + sdsfree(filename); - fputs(debugBuffer, f); + fputs(debugBuffer, f); - fclose(f); - sdsfree(debugBuffer); - debugBuffer = sdsempty(); - } + fclose(f); + sdsfree(debugBuffer); + debugBuffer = sdsempty(); + } - if (cleanup) cleanup(terminated, privdata); - } - break; + if (cleanup) cleanup(terminated, privdata); + } break; - default: - serverAssert(false); // Not expecting any other type of item! + default: + serverAssert(false); // Not expecting any other type of item! 
} // We don't allocate extension items from the pool so we manually free them - if((int)item->type == BGITERATOR_ITEMEXT_ITER_CLOSED) { + if ((int)item->type == BGITERATOR_ITEMEXT_ITER_CLOSED) { zfree(item); } else { itemFreeList_returnItemBackToFreeList(item); @@ -1573,26 +1553,22 @@ static void processReturnOfItemToValkey(bgIteratorItem *item, bgIterator *iter) } static void prepareAndProcessReturnedItems(int n, bgIteratorItem **items, bgIterator *iter) { - int i = 0; - for (i = 0; i < n; i++) valkey_prefetch(items[i]); - for (i = 0; i < n; i++) { + for (int i = 0; i < n; i++) valkey_prefetch(items[i]); + for (int i = 0; i < n; i++) { if (items[i]->type != BGITERATOR_ITEM_DBENTRY) continue; - // Prefetch can have a significant perf hit on NULL - // but we never expect items[i]->u.dbe.de to be NULL valkey_prefetch(items[i]->u.dbe.de); } - for (i = 0; i < n; i++) { + for (int i = 0; i < n; i++) { if (items[i]->type != BGITERATOR_ITEM_DBENTRY) continue; - // Same as above, assume key is never NULL valkey_prefetch(objectGetKey(items[i]->u.dbe.de)); } - for (i = 0; i < n; i++) processReturnOfItemToValkey(items[i], iter); + for (int i = 0; i < n; i++) processReturnOfItemToValkey(items[i], iter); } #define PREFETCH_BATCH_SIZE 16 static bool receiveItemsBackFromOneIterator(bgIterator *it) { - bgIteratorItem* batchPool[PREFETCH_BATCH_SIZE]; + bgIteratorItem *batchPool[PREFETCH_BATCH_SIZE]; int n = 0; // Returns true if we process at least one item from // a given iterator's return_to_valkey queue, false otherwise. 
@@ -1628,15 +1604,14 @@ static void receiveItemsBackFromIterators(bool blocking) { bgIterator *it = listNodeValue(node); processedItems |= receiveItemsBackFromOneIterator(it); } - if (blocking) usleep(100); // Sleep for 1ms and re-try processing iterators + if (blocking && !processedItems) usleep(100); // Short sleep before retry } while (blocking && !processedItems); } -static long long bgIteration_feedIterators_task( - struct aeEventLoop *eventLoop, - long long id, - void *clientData) { +static long long bgIteration_feedIterators_task(struct aeEventLoop *eventLoop, + long long id, + void *clientData) { UNUSED(eventLoop); UNUSED(id); UNUSED(clientData); @@ -1668,8 +1643,8 @@ static long long bgIteration_feedIterators_task( // the duty cycle to compensate (up to a limit). long starvationUs = (startTime - lastFeedEndTime) - BGITER_CYCLE_DELAY_MS * 1000; if (starvationUs > 0) { - long starvationCompensationUs = starvationUs * BGITER_CYCLE_BUDGET_MS - / (BGITER_CYCLE_BUDGET_MS + BGITER_CYCLE_DELAY_MS); + long starvationCompensationUs = starvationUs * BGITER_CYCLE_BUDGET_MS / + (BGITER_CYCLE_BUDGET_MS + BGITER_CYCLE_DELAY_MS); dutyTimeUs += starvationCompensationUs; dutyTimeUs = MIN(dutyTimeUs, BGITER_CYCLE_BUDGET_MAX_MS * 1000); } @@ -1883,14 +1858,14 @@ static void preserveIteratorItemsForFlush(bgIterator *it, int dbid) { fifo *poppedFifo = mutexQueuePopAll(it->items_for_iterator, false); if (poppedFifo != NULL) { fifo *readdFifo = fifoCreate(); - while(fifoLength(poppedFifo) > 0) { + while (fifoLength(poppedFifo) > 0) { bgIteratorItem *item; fifoPop(poppedFifo, (void **)&item); if (item->type == BGITERATOR_ITEM_DBENTRY) { dbEntry *de = item->u.dbe.de; if (dbFind(server.db[dbid], objectGetKey(de)) == de) { - // Found the entry in the DB about to be flushed - removePtrFromEarlyIterate(de); + // Found the entry in the DB about to be flushed + removePtrFromEarlyIterate(de); } } fifoPush(readdFifo, item); @@ -1908,8 +1883,8 @@ static void 
preserveIteratorItemsForFlush(bgIterator *it, int dbid) { if (item && item->type == BGITERATOR_ITEM_DBENTRY) { dbEntry *de = item->u.dbe.de; if (dbFind(server.db[dbid], objectGetKey(de)) == de) { - // Found the entry in the DB about to be flushed - removePtrFromEarlyIterate(de); + // Found the entry in the DB about to be flushed + removePtrFromEarlyIterate(de); } } } @@ -1965,14 +1940,13 @@ static void handleFlushdb(int dbid) { } -static bool expediteKeysForWriteOnAllIterators( - int dbid, - struct serverCommand *cmd, - int argc, - robj **argv, - keyReference *keyrefs, - int numKeys, - hashtable *waitingOnKeys) { +static bool expediteKeysForWriteOnAllIterators(int dbid, + struct serverCommand *cmd, + int argc, + robj **argv, + keyReference *keyrefs, + int numKeys, + hashtable *waitingOnKeys) { bool mustBlock = false; listIter li; @@ -2095,23 +2069,22 @@ static bool expediteKeysForMultiExec(client *c, hashtable *waitingOnKeys) { return mustBlock; } -static bgIterator * bgIteratorCreate( - const char *name, - bgIteratorConsistency consistency, - bgIteratorReplDoneFunc repldone, - bgIteratorCleanupFunc cleanup, - void *privdata, - bgIterationType iter_type, - genericIterator *keyset_iter) { +static bgIterator *bgIteratorCreate(const char *name, + bgIteratorConsistency consistency, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata, + bgIterationType iter_type, + genericIterator *keyset_iter) { serverAssert(onValkeyMainThread()); serverAssert(server.cluster_enabled || iter_type == BGITERATION_TYPE_FULLSCAN); int flags; switch (consistency) { - case BGITERATOR_CONSISTENCY_NONE: flags = 0; break; - case BGITERATOR_CONSISTENCY_START: flags = BGITERATOR_FLAG_CONSISTENT; break; - case BGITERATOR_CONSISTENCY_EVENTUAL: flags = BGITERATOR_FLAG_REPLICATION; break; - default: serverAssert(false); + case BGITERATOR_CONSISTENCY_NONE: flags = 0; break; + case BGITERATOR_CONSISTENCY_START: flags = BGITERATOR_FLAG_CONSISTENT; break; + case 
BGITERATOR_CONSISTENCY_EVENTUAL: flags = BGITERATOR_FLAG_REPLICATION; break; + default: serverAssert(false); } // Consistent, with replication - doesn't make sense. serverAssert(!((flags & BGITERATOR_FLAG_CONSISTENT) && (flags & BGITERATOR_FLAG_REPLICATION))); @@ -2162,7 +2135,7 @@ static bgIterator * bgIteratorCreate( serverAssert(bgIterator_timeproc_id != AE_ERR); } - if (dictAdd(nameToIterator, (void*)it->name, it) != DICT_OK) { + if (dictAdd(nameToIterator, it->name, it) != DICT_OK) { // Can't have 2 iterators with the same name! serverAssert(false); } @@ -2175,37 +2148,34 @@ static bgIterator * bgIteratorCreate( } - //============================================================================================= // PUBLIC INTERFACE: Iterator creation and use //============================================================================================= // PUBLIC API -bgIterator *bgIteratorCreateFullScanIter( - const char *name, - bgIteratorConsistency consistency, - bgIteratorReplDoneFunc repldone, - bgIteratorCleanupFunc cleanup, - void *privdata) { +bgIterator *bgIteratorCreateFullScanIter(const char *name, + bgIteratorConsistency consistency, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata) { return bgIteratorCreate(name, consistency, repldone, cleanup, privdata, BGITERATION_TYPE_FULLSCAN, fullScanIteratorCreate()); } // PUBLIC API -bgIterator *bgIteratorCreateSlotsIter( - const char *name, - bgIteratorConsistency consistency, - const int *slots, - int slots_count, - bgIteratorReplDoneFunc repldone, - bgIteratorCleanupFunc cleanup, - void *privdata) { +bgIterator *bgIteratorCreateSlotsIter(const char *name, + bgIteratorConsistency consistency, + const int *slots, + int slots_count, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata) { return bgIteratorCreate(name, consistency, repldone, cleanup, privdata, BGITERATION_TYPE_CLUSTERSLOT, clusterSlotIteratorCreate(slots, slots_count)); } // 
PUBLIC API -bgIterator * bgIteratorFind(const char *name) { +bgIterator *bgIteratorFind(const char *name) { serverAssert(onValkeyMainThread()); sds sdsname = sdsnew(name); @@ -2224,14 +2194,14 @@ const char *bgIteratorName(bgIterator *it) { // PUBLIC API void bgIteratorGetStatus(bgIterator *it, bgIteratorStatus *status) { - status->dbentries_queued = it->dbentries_queued; - status->dbentries_processed = it->dbentries_processed; - status->replication_queued = it->replication_queued; + status->dbentries_queued = it->dbentries_queued; + status->dbentries_processed = it->dbentries_processed; + status->replication_queued = it->replication_queued; status->replication_processed = it->replication_processed; - status->swapdb_queued = it->swapdb_queued; - status->swapdb_processed = it->swapdb_processed; - status->flushdb_queued = it->flushdb_queued; - status->flushdb_processed = it->flushdb_processed; + status->swapdb_queued = it->swapdb_queued; + status->swapdb_processed = it->swapdb_processed; + status->flushdb_queued = it->flushdb_queued; + status->flushdb_processed = it->flushdb_processed; status->dbentry_clones_queued = it->dbentry_clones_queued; status->dbentry_clones_processed = it->dbentry_clones_processed; @@ -2241,8 +2211,9 @@ void bgIteratorGetStatus(bgIterator *it, bgIteratorStatus *status) { status->runtime_ms = elapsedMs(it->monotonic_start_time); monotime nonvolatile_item_start_time = it->monotonic_item_start_time; - status->current_item_ms = - (nonvolatile_item_start_time == 0) ? 0 : elapsedMs(nonvolatile_item_start_time); + status->current_item_ms = (nonvolatile_item_start_time == 0) + ? 
0 + : elapsedMs(nonvolatile_item_start_time); } @@ -2259,7 +2230,7 @@ void bgIteratorTerminate(bgIterator *it) { } bgIteratorItem *terminationItem = itemFreeList_getElementOrAllocate(); - *terminationItem = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + *terminationItem = (bgIteratorItem){.type = BGITERATOR_ITEM_TERMINATED}; mutexQueueAdd(it->items_for_iterator, terminationItem); it->terminated = true; @@ -2273,10 +2244,10 @@ bool bgIteratorIsTerminating(bgIterator *it) { // PUBLIC API -bgIteratorItem * bgIteratorRead(bgIterator *it) { - serverAssert(it->current_item == NULL - || (it->current_item->type != BGITERATOR_ITEM_COMPLETE - && it->current_item->type != BGITERATOR_ITEM_TERMINATED)); +bgIteratorItem *bgIteratorRead(bgIterator *it) { + serverAssert(it->current_item == NULL || + (it->current_item->type != BGITERATOR_ITEM_COMPLETE && + it->current_item->type != BGITERATOR_ITEM_TERMINATED)); // First, clean up the previous item read if (it->current_item != NULL) { @@ -2302,8 +2273,8 @@ bgIteratorItem * bgIteratorRead(bgIterator *it) { // PUBLIC API void bgIteratorClose(bgIterator *it) { if (it->current_item != NULL) { - if (it->current_item->type == BGITERATOR_ITEM_COMPLETE - || it->current_item->type == BGITERATOR_ITEM_TERMINATED) { + if (it->current_item->type == BGITERATOR_ITEM_COMPLETE || + it->current_item->type == BGITERATOR_ITEM_TERMINATED) { // Normal confirmation of background completion } else { // Client is initiating the termination @@ -2311,13 +2282,13 @@ void bgIteratorClose(bgIterator *it) { returnCurrentItemToValkey(it); it->current_item = itemFreeList_getElementOrAllocate(); - *(it->current_item) = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + *(it->current_item) = (bgIteratorItem){.type = BGITERATOR_ITEM_TERMINATED}; } } else { // terminated before first item read it->terminated = true; it->current_item = itemFreeList_getElementOrAllocate(); - *(it->current_item) = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + 
*(it->current_item) = (bgIteratorItem){.type = BGITERATOR_ITEM_TERMINATED}; } // We don't allocate extension items from the free list @@ -2328,7 +2299,6 @@ void bgIteratorClose(bgIterator *it) { } - //============================================================================================= // PUBLIC INTERFACE: Valkey main-thread support hooks //============================================================================================= @@ -2388,10 +2358,10 @@ void bgIteration_keyDelete(int dbid, const_sds key) { bgIterator *it = listNodeValue(node); if (it->completed || it->terminated || !it->keyset_iter->isKeyInScope(it->keyset_iter, key)) continue; - if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT - && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { - if (!it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) - && (dictFind(it->early_iterate_entries, de) == NULL)) { + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT && + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { + if (!it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) && + (dictFind(it->early_iterate_entries, de) == NULL)) { addEarlyIterationKey(it, de, dbid); // (may also add to inUseEntries) } } @@ -2423,7 +2393,7 @@ bool bgIteration_blockClientIfRequired(client *c) { if (BGITERATION_DEBUG) { debugBuffer = sdscatprintf(debugBuffer, "BLCK?: (%d)%s\n", c->db->id, - createSdsFromClientArgv(c->argc, c->argv)); + createSdsFromClientArgv(c->argc, c->argv)); } // Before executing a command or atomic transaction, the replication flag is cleared for each @@ -2453,7 +2423,7 @@ bool bgIteration_blockClientIfRequired(client *c) { keyReference *keyrefs = result.keys; if (numkeys > 0) { mustBlock = expediteKeysForWriteOnAllIterators( - c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); + c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); 
serverAssert(!(mustBlock && (c->flag.multi) && !(c->flag.script))); if (mustBlock && (c->flag.script)) { @@ -2467,7 +2437,7 @@ bool bgIteration_blockClientIfRequired(client *c) { receiveItemsBackFromIterators(true); // Blocking hashtableEmpty(waitOnKeys, NULL); mustBlock = expediteKeysForWriteOnAllIterators( - c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); + c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); } } getKeysFreeResult(&result); @@ -2485,7 +2455,7 @@ bool bgIteration_blockClientIfRequired(client *c) { if (mustBlock) { serverAssert(hashtableSize(waitOnKeys) > 0); - robj **waitKeysArgv = zmalloc(sizeof(robj*) * hashtableSize(waitOnKeys)); + robj **waitKeysArgv = zmalloc(sizeof(robj *) * hashtableSize(waitOnKeys)); robj *key; hashtableIterator hi; @@ -2513,16 +2483,15 @@ bool bgIteration_blockClientIfRequired(client *c) { // PUBLIC API -void bgIteration_handleCommandReplication( - int dbid, - struct serverCommand *cmd, - int argc, - robj **argv) { +void bgIteration_handleCommandReplication(int dbid, + struct serverCommand *cmd, + int argc, + robj **argv) { if (BGITERATION_DEBUG) { // DEBUG - enable this to capture replication not queued because iteration is inactive if (0 && !bgIteration_iterationActive() && (isWriteCmd(cmd) || cmd->proc == multiCommand)) { debugBuffer = sdscatprintf(debugBuffer, "REPL? 
INACT: (%d)%s\n", dbid, - createSdsFromClientArgv(argc, argv)); + createSdsFromClientArgv(argc, argv)); } } @@ -2535,7 +2504,7 @@ void bgIteration_handleCommandReplication( if (BGITERATION_DEBUG) { debugBuffer = sdscatprintf(debugBuffer, "REPL?: (%d)%s\n", dbid, - createSdsFromClientArgv(argc, argv)); + createSdsFromClientArgv(argc, argv)); } if (cmd->proc == swapdbCommand) { @@ -2596,8 +2565,8 @@ void bgIteration_handleCommandReplication( if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) { // Handle the special case of a key moved to a different DB if (special_dbEntry != NULL) { - if (it->cur_cmd_may_replicate - && !it->keyset_iter->hasPassedItem(it->keyset_iter, special_key, special_dbid)) { + if (it->cur_cmd_may_replicate && + !it->keyset_iter->hasPassedItem(it->keyset_iter, special_key, special_dbid)) { dictAdd(it->early_iterate_entries, special_dbEntry, NULL); if (BGITERATION_DEBUG) { sds entryString = createEntryString(special_dbid, special_dbEntry); @@ -2621,8 +2590,8 @@ void bgIteration_handleCommandReplication( dbEntry *de = dbFind(server.db[dbid], (sds)key); if (de != NULL) { // It exists now! - if (it->cur_cmd_may_replicate - && !it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) { + if (it->cur_cmd_may_replicate && + !it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) { // If the current command is allowed to replicate, and there is a new // key which we haven't yet reached in iteration, it needs to be added // to the set of early iterate entries. (We know that it's not already @@ -2690,8 +2659,8 @@ void bgIteration_handleCommandReplication( // NOTE: It's weird, but helpful, for both EXPIRE and EVICT the propagation happens // BEFORE the actual delete. So if the dbEntry still exists, we are doing // an expire/evict which is not preceded by blockClientIfRequired(). 
- if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) - || (dictFind(it->early_iterate_entries, de) != NULL)) { + if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) || + (dictFind(it->early_iterate_entries, de) != NULL)) { shouldReplicateDelCommand = true; } } else { @@ -2703,8 +2672,7 @@ void bgIteration_handleCommandReplication( } bool replicate = (it->iteration_flags & BGITERATOR_FLAG_REPLICATION && - ((!isDelCommand && it->cur_cmd_may_replicate) - || shouldReplicateDelCommand)); + ((!isDelCommand && it->cur_cmd_may_replicate) || shouldReplicateDelCommand)); if (replicate) { /* We will replicate the command in these cases: From 412207abd7bd178873b1e20bdc7a0cb9104f6125 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Fri, 5 Jun 2026 17:17:40 +0000 Subject: [PATCH 35/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 2 +- src/server.c | 95 +++++++++++++++++++++++++++++------------------ 2 files changed, 59 insertions(+), 38 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index c050891939c..289fe3e3969 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -46,7 +46,7 @@ static bool isDeleteCmd(struct serverCommand *cmd) { } -/* This utility utilizes the main thread and backgound threads for processing. The API is split, +/* This utility utilizes the main thread and background threads for processing. The API is split, * with some of the functions intended for the main thread and others intended for the background * clients. This sanity check ensures that we maintain thread safety, calling the API as intended. */ static bool onValkeyMainThread(void) { diff --git a/src/server.c b/src/server.c index ec529182b09..3e9eb5d1fa2 100644 --- a/src/server.c +++ b/src/server.c @@ -3023,7 +3023,7 @@ void initServer(void) { /* Set object metadata size before creating any database key objects */ if (server.forkless_options_supported) { - /* NOTE: At this time, there is only one reason for dbEntry metadata. bgIteration. 
However, + /* NOTE: At this time, there is only one reason for dbEntry metadata: bgIteration. However, * if/when new metadata options are added, we will need to compute the size of a variable * size metadata, and provide appropriate accessors to access the specific portion of the * metadata (each of which may/may not exist, based on immutable startup parameters). */ @@ -3711,10 +3711,57 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot) if (propagate_to_slot_migration) clusterFeedSlotExportJobs(dbid, argv, argc, slot); } -// If true, a MULTI has been sent to bgIterator. -// Remember to send the matching EXEC in propagatePendingCommands(). -static bool sentMultiToBgIterator = false; -static int lastDbidSentToBgIterator; +/* BgIteration requires that replication is sent after each command, however the + * alsoPropagate mechanism queues replication until the end of the transaction + * (when propagatePendingCommands is invoked). Also, the propagation mechanism + * strips out multi/exec, adding them back during propagatePendingCommands (if + * necessary). This function ensures that replication, including multi/exec are + * sequenced with the commands for bgIteration. + * + * Called from alsoPropagate with regular params. + * Called from propagatePendingCommands with dbid = -1 (to close multi/exec). */ +void propagateToBgIteration(int dbid, int argc, robj **argv, int target) { + /* STATIC indicates that we have sent the MULTI, and need to match it with + * an EXEC during propagatePendingCommands. */ + static bool sentMultiToBgIterator = false; + /* STATIC indicates that last DBID that was sent, so that we can use the + * same DBID when sending a generated EXEC.
*/ + static int lastDbidSentToBgIterator; + + if (dbid >= 0) { + // Called from alsoPropagate() to replicate a command + if (target & PROPAGATE_REPL && bgIteration_iterationActive()) { + if (!sentMultiToBgIterator && (scriptIsRunning() || server.in_exec)) { + /* For a script or multi/exec, we should be sending the MULTI at + * the beginning of the execution unit. There shouldn't be any + * commands in the propagation queue yet. */ + serverAssert(server.also_propagate.numops == 0); + /* If this is the first propagated command of a script or multi, + * make it a transaction. It may turn out that there is only 1 + * command in the MULTI block, but we can't know that now. + * Unlike regular replication, we can't defer all of the + * replication until we know for sure. We must call bgIteration + * after each command. */ + static struct serverCommand *cmd_multi = NULL; // STATIC + if (cmd_multi == NULL) cmd_multi = lookupCommandOrOriginal(&shared.multi, 1); + bgIteration_handleCommandReplication(dbid, cmd_multi, 1, &shared.multi); + sentMultiToBgIterator = true; + } + struct serverCommand *cmd = lookupCommandOrOriginal(argv, argc); + bgIteration_handleCommandReplication(dbid, cmd, argc, argv); + lastDbidSentToBgIterator = dbid; + } + } else { + // Called from propagatePendingCommands() to finalize a transaction + if (sentMultiToBgIterator) { + // If a MULTI was sent to bgIterator via alsoPropagate(), then send the matching EXEC. + static struct serverCommand *cmd_exec = NULL; // STATIC + if (cmd_exec == NULL) cmd_exec = lookupCommandOrOriginal(&shared.exec, 1); + bgIteration_handleCommandReplication(lastDbidSentToBgIterator, cmd_exec, 1, &shared.exec); + sentMultiToBgIterator = false; + } + } +} /* Used inside commands to schedule the propagation of additional commands * after the current command is propagated to AOF / Replication. @@ -3728,28 +3775,7 @@ static int lastDbidSentToBgIterator; * stack allocated). 
The function automatically increments ref count of * passed objects, so the caller does not need to. */ void alsoPropagate(int dbid, robj **argv, int argc, int target, int slot) { - if (target & PROPAGATE_REPL && bgIteration_iterationActive()) { - // Note that bgIterator must be invoked immediately after each command. This is required - // by the bgIterator state machine. It's NOT ok to call bgIterator from propagateNow as - // that handles all of the commands for a transaction at the end. - // THIS FUNCTION (alsoPropagate) is called after each command. - if (!sentMultiToBgIterator && (scriptIsRunning() || server.in_exec)) { - // For a script or multi/exec, we should be sending the MULTI at the beginning of the - // execution unit. There shouldn't be any commands in the propagation queue yet. - serverAssert(server.also_propagate.numops == 0); - // If this is the first propagated command of a script or multi, make it a transaction. - // It may turn out that there is only 1 command in the MULTI block, but we can't know - // that now. Unlike regular replication, we can't defer all of the replication until - // we know for sure. We must call bgIterator after each command. - static struct serverCommand *cmd_multi = NULL; // STATIC to avoid repeated lookups - if (cmd_multi == NULL) cmd_multi = lookupCommandOrOriginal(&shared.multi, 1); - bgIteration_handleCommandReplication(dbid, cmd_multi, 1, &shared.multi); - sentMultiToBgIterator = true; - } - struct serverCommand *cmd = lookupCommandOrOriginal(argv, argc); - bgIteration_handleCommandReplication(dbid, cmd, argc, argv); - lastDbidSentToBgIterator = dbid; - } + propagateToBgIteration(dbid, argc, argv, target); robj **argvcopy; int j; @@ -3817,16 +3843,11 @@ void updateCommandLatencyHistogram(struct hdr_histogram **latency_histogram, int * multiple separated commands. Note that alsoPropagate() is not affected * by CLIENT_PREVENT_PROP flag. 
*/ static void propagatePendingCommands(void) { - // Note: This is done before the check on server.also_propagate.numops. Numops might be zero - // if there is no replica but we might be running bgIteration for something other than - // replication. If we sent the multi (to bgIteration), we need to send the matching exec. - if (sentMultiToBgIterator) { - // If a MULTI was sent to bgIterator via alsoPropagate(), then send the matching EXEC. - static struct serverCommand *cmd_exec = NULL; // STATIC to avoid repeated lookups - if (cmd_exec == NULL) cmd_exec = lookupCommandOrOriginal(&shared.exec, 1); - bgIteration_handleCommandReplication(lastDbidSentToBgIterator, cmd_exec, 1, &shared.exec); - sentMultiToBgIterator = false; - } + /* This is done before the check on server.also_propagate.numops. Numops + * might be zero if there is no replica but we might be running bgIteration + * for something other than replication. If we sent the multi (to + * bgIteration), we need to send the matching exec. */ + propagateToBgIteration(-1, 0, NULL, 0); if (server.also_propagate.numops == 0) return; From a2c8e56c4b1306fb96fe98563dd424c1d9c866cf Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Fri, 5 Jun 2026 21:48:10 +0000 Subject: [PATCH 36/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 41 ++++++++++++++--------------------------- src/server.c | 2 +- 2 files changed, 15 insertions(+), 28 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index 289fe3e3969..226c434549e 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -50,16 +50,15 @@ static bool isDeleteCmd(struct serverCommand *cmd) { * with some of the functions intended for the main thread and others intended for the background * clients. This sanity check ensures that we maintain thread safety, calling the API as intended. */ static bool onValkeyMainThread(void) { - // Modules interact with the main thread using a mutex. 
If a module owns the mutex, consider - // that equivalent to being on the main thread. + /* Modules interact with the main thread using a mutex. If a module owns the mutex, consider + * that equivalent to being on the main thread. */ bool inModule = (atomic_load_explicit(&server.module_gil_acquired, memory_order_relaxed) == 0); return (inModule || pthread_equal(server.main_thread_id, pthread_self()) != 0); } /* Parse a parameters robj, extracting a valid DBID. - * Returns FALSE if DBID isn't valid. - */ + * Returns FALSE if DBID isn't valid. */ static bool getDbIdFromRobj(robj *obj, int *db_id) { long long value; if (getLongLongFromObject(obj, &value) != C_OK) return false; @@ -69,8 +68,7 @@ static bool getDbIdFromRobj(robj *obj, int *db_id) { } /* Parse the parameters of the COPY command, extracting the target DBID. - * Returns FALSE if the command would not run. - */ + * Returns FALSE if the command would not run. */ static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid, int *target_dbid) { const int COPY_COMMAND_OPTIONAL_ARG_START_INDEX = 3; @@ -83,8 +81,7 @@ static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid /* Note the parsing here needs to perfectly match what we have in Valkey OSS for COPY. * The following command is considered OK by Valkey 8.1 so we can't return here, but * must continue to parse till the last db which is the one that's effectively used. - * COPY key1 key2 db 1 db 2 db 3 // (This will use db 3) - */ + * COPY key1 key2 db 1 db 2 db 3 (This will use db 3) */ if (!getDbIdFromRobj(argv[i + 1], target_dbid)) { return false; // parse failure } @@ -98,8 +95,7 @@ static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid /* Get parameters for the SWAPDB command. * The optional permission_client allows for checking of a client's permission for swapdb. - * Returns true if command would be executed. - */ + * Returns true if command would be executed. 
*/ static bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, int *id1_p, int *id2_p) { static struct serverCommand *swapdb_cmd = NULL; @@ -131,8 +127,7 @@ static bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, /* Get parameters for the SELECT command. * The optional permission_client allows for checking of a client's permission for select. - * Returns true if command would be executed. - */ + * Returns true if command would be executed. */ static bool getParamsForSelect(int argc, robj **argv, client *permission_client, int *dbid_p) { static struct serverCommand *select_cmd = NULL; @@ -485,8 +480,7 @@ static sds debugBuffer; /* The full scan iterator performs the actual iteration over the Valkey keyset. The iterator is * only used from within the Valkey main thread. Iteration proceeds one DB at a time, based on * the DB ordering at the time of iterator creation. Each time the iterator returns items, all - * of the dictionary entries from a single hash bucket are returned. - */ + * of the dictionary entries from a single hash bucket are returned. */ struct fullScanIterator { genericIterator callbacks; // (must be first item) @@ -653,8 +647,7 @@ static genericIterator *fullScanIteratorCreate(void) { // Cluster Slot Iterator //============================================================================================= /* The cluster slot iterator performs iteration over one cluster slot of the Valkey keyset. The - * iterator is only used from within the Valkey main thread. - */ + * iterator is only used from within the Valkey main thread. 
*/ struct clusterSlotIterator { genericIterator callbacks; // (must be first item) }; @@ -1224,8 +1217,7 @@ static bool expediteKeysForCopy(bgIterator *it, * Iterator: CONSISTENT = YES, REPLICATION = YES * (Combination only valid in cluster mode - no SWAPDB possible) * - Block if any write-key is in use by an the iterator - * - Block and immediately queue any key (read or write) that has not already been iterated - */ + * - Block and immediately queue any key (read or write) that has not already been iterated */ static bool expediteKeysForWrite(bgIterator *it, int dbid, struct serverCommand *cmd, @@ -1980,8 +1972,7 @@ static bool expediteKeysForMultiExec(client *c, hashtable *waitingOnKeys) { /* For MULTI/EXEC, Valkey buffers all of the commands until hitting the EXEC. * At this point, the client holds all of the commands to be executed. This function searches * for all of the keys used by any of the buffered write commands. In addition, if SWAPDB or - * SELECT is used, this tracks the DBIDs through various swap/select operations. - */ + * SELECT is used, this tracks the DBIDs through various swap/select operations. */ /* There's a special concern for a NON-consistent iteration with replication. If the keys are * all "future" keys (which haven't been processed by the iterator yet), then we don't expedite @@ -2005,8 +1996,7 @@ static bool expediteKeysForMultiExec(client *c, hashtable *waitingOnKeys) { * these will be caught on the 2nd time around. * * Checking replication status before/after ensures that there can only be a single recursive - * call. - */ + * call. */ bool initiallyAnIteratorWillReplicate = anIteratorWillReplicateForThisCommand(); bool mustBlock = false; @@ -2647,8 +2637,7 @@ void bgIteration_handleCommandReplication(int dbid, * In the case of a client driven DEL command, the key will have already been deleted when * we hit this routine. In the case of EXPIRE/EVICT, they propagate happens before the key * is deleted. 
So if the key is missing, we can use the cached replication decision. But - * if the key still exists (indicating EXPIRE/EVICT) we evaluate it specially. - */ + * if the key still exists (indicating EXPIRE/EVICT) we evaluate it specially. */ bool shouldReplicateDelCommand = false; bool isDelCommand = isDeleteCmd(cmd); if (isDelCommand) { @@ -2679,9 +2668,7 @@ void bgIteration_handleCommandReplication(int dbid, * 1) For consistent iteration - it->cur_cmd_may_replicate is always true * 2) For non-consistent, if any of the keys have been processed, expediteKeysForWrite * will ensure that ALL of the keys have been expedited - and we should replicate - * 3) For non-consistent, if NONE of the keys have been processed, no need to replicate - */ - + * 3) For non-consistent, if NONE of the keys have been processed, no need to replicate */ if (BGITERATION_DEBUG) { debugBuffer = sdscat(debugBuffer, " (queued)\n"); } diff --git a/src/server.c b/src/server.c index 3e9eb5d1fa2..14e32beb6a2 100644 --- a/src/server.c +++ b/src/server.c @@ -3720,7 +3720,7 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot) * * Called from alsoPropagate with regular params. * Called from propagatePendingCommands with dbid = -1 (to close multi/exec). */ -void propagateToBgIteration(int dbid, int argc, robj **argv, int target) { +static void propagateToBgIteration(int dbid, int argc, robj **argv, int target) { /* STATIC indicates that we have sent the MULTI, and need to match it with * an EXEC during propagatePendingCommands. 
*/ static bool sentMultiToBgIterator = false; From a55f7054c62c00009c70c7e74604be1aa3fc7f1e Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Fri, 5 Jun 2026 22:42:29 +0000 Subject: [PATCH 37/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 50 ++++++++++++++++++++++++++-------------------- src/bgiteration.h | 51 ++++++++++++++++------------------------------- 2 files changed, 45 insertions(+), 56 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index 226c434549e..7c79a48a261 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -336,6 +336,12 @@ static dictType dictEntryPtrDictType = { .resizeAllowed = neverShrink, .entryDestructor = zfree}; +static hashtableType dbEntryPtrHashtableType = { + .hashFunction = pointerHash, + .keyCompare = pointerCompare, + .resizeAllowed = neverShrink}; + + // A TEMP set of robj's (of type sds). This is only for temporary sets as the robj's are not // ref-counted at insertion/deletion. static hashtableType tempKeysetHashtableType = { @@ -378,11 +384,10 @@ struct bgIterator { genericIterator *keyset_iter; // Low-level iterator (polymorphic) - dict *early_iterate_entries; // Used to keep track of what items have already been iterated - // over by out-of-order expedited process, ensuring a bgIterator - // does not try to reprocess items. - // Used only by main thread. - // dictEntry -> NULL + hashtable *early_iterate_entries; /* A set of dbEntry, compared by pointer. Used to track items + * which have already been iterated over by out-of-order + * expedited processing. Ensures a bgIterator does not try to + * reprocess items. Used only by main thread. 
*/ mutexQueue *items_for_iterator; // Created/Destroyed in main thread, used in both (threadsafe) @@ -923,7 +928,7 @@ static void bgIteratorRelease(bgIterator *it) { it->keyset_iter->release(it->keyset_iter); it->keyset_iter = NULL; - dictRelease(it->early_iterate_entries); + hashtableRelease(it->early_iterate_entries); it->early_iterate_entries = NULL; sdsfree(it->name); @@ -1030,8 +1035,7 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { } // Remove any items which have been processed early - if (dictFind(it->early_iterate_entries, de) != NULL) { - dictDelete(it->early_iterate_entries, de); + if (hashtableDelete(it->early_iterate_entries, de)) { if (BGITERATION_DEBUG) { sds entryString = createEntryString(dbid, de); debugBuffer = sdscatprintf(debugBuffer, "SKIPPING ITEM(early iterate): %s\n", entryString); @@ -1072,8 +1076,8 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_dbid) { - int rc = dictAdd(it->early_iterate_entries, earlyEntry, NULL); - serverAssert(rc == DICT_OK); + bool wasAdded = hashtableAdd(it->early_iterate_entries, earlyEntry); + serverAssert(wasAdded); int dbid = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) ? 
it->keyset_iter->originalDb(it->keyset_iter, cur_dbid) @@ -1120,7 +1124,7 @@ static bool expediteSingleKeyWithoutOptimization(bgIterator *it, dbEntry *de = dbFind(server.db[dbid], key); if (de != NULL) { if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) && - (dictFind(it->early_iterate_entries, de) == NULL)) { + !hashtableFind(it->early_iterate_entries, de, NULL)) { if (addEarlyIterationKey(it, de, dbid)) { mustBlock = true; hashtableAdd(waitingOnKeys, oKey); @@ -1267,7 +1271,7 @@ static bool expediteKeysForWrite(bgIterator *it, dbEntry *de = dbFind(server.db[dbid], key); if (de == NULL) continue; // New key, no need to expedite if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) && - dictFind(it->early_iterate_entries, de) == NULL && + !hashtableFind(it->early_iterate_entries, de, NULL) && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { if (addEarlyIterationKey(it, de, dbid)) { mustBlock = true; @@ -1306,7 +1310,7 @@ static bool expediteKeysForWrite(bgIterator *it, } if (iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) || - (dictFind(it->early_iterate_entries, de) != NULL)) { + hashtableFind(it->early_iterate_entries, de, NULL)) { someIterated = true; } else { dictAdd(notIteratedKeys, de, oKey); @@ -1799,7 +1803,7 @@ static void removePtrFromEarlyIterate(dbEntry *de) { listRewind(allIterators, &li); while ((node = listNext(&li)) != NULL) { bgIterator *it = listNodeValue(node); - dictDelete(it->early_iterate_entries, de); // just try delete (might not be here) + hashtableDelete(it->early_iterate_entries, de); // just try delete (might not be here) } } @@ -2096,8 +2100,8 @@ static bgIterator *bgIteratorCreate(const char *name, it->iteration_type = iter_type; it->consistent_modification_id = bgIteration_epoch++; it->keyset_iter = keyset_iter; - it->early_iterate_entries = dictCreate(&dictEntryPtrDictType); - 
dictExpand(it->early_iterate_entries, BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE); + it->early_iterate_entries = hashtableCreate(&dbEntryPtrHashtableType); + hashtableExpand(it->early_iterate_entries, BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE); it->current_item = NULL; it->client_is_active = false; it->completed = false; @@ -2351,7 +2355,7 @@ void bgIteration_keyDelete(int dbid, const_sds key) { if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { if (!it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) && - (dictFind(it->early_iterate_entries, de) == NULL)) { + !hashtableFind(it->early_iterate_entries, de, NULL)) { addEarlyIterationKey(it, de, dbid); // (may also add to inUseEntries) } } @@ -2557,7 +2561,7 @@ void bgIteration_handleCommandReplication(int dbid, if (special_dbEntry != NULL) { if (it->cur_cmd_may_replicate && !it->keyset_iter->hasPassedItem(it->keyset_iter, special_key, special_dbid)) { - dictAdd(it->early_iterate_entries, special_dbEntry, NULL); + hashtableAdd(it->early_iterate_entries, special_dbEntry); if (BGITERATION_DEBUG) { sds entryString = createEntryString(special_dbid, special_dbEntry); debugBuffer = sdscatprintf(debugBuffer, "EARLY(special): %s\n", entryString); @@ -2586,7 +2590,8 @@ void bgIteration_handleCommandReplication(int dbid, // key which we haven't yet reached in iteration, it needs to be added // to the set of early iterate entries. (We know that it's not already // in that set because it's a newly created key!) - dictAdd(it->early_iterate_entries, de, NULL); + bool wasAdded = hashtableAdd(it->early_iterate_entries, de); + serverAssert(wasAdded); if (BGITERATION_DEBUG) { sds entryString = createEntryString(dbid, de); debugBuffer = sdscatprintf(debugBuffer, "EARLY(NEW): %s\n", entryString); @@ -2649,7 +2654,7 @@ void bgIteration_handleCommandReplication(int dbid, // BEFORE the actual delete. 
So if the dbEntry still exists, we are doing // an expire/evict which is not preceded by blockClientIfRequired(). if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) || - (dictFind(it->early_iterate_entries, de) != NULL)) { + hashtableFind(it->early_iterate_entries, de, NULL)) { shouldReplicateDelCommand = true; } } else { @@ -2729,11 +2734,12 @@ void bgIteration_updateDbEntryPtr(dbEntry *old, dbEntry *new) { listRewind(allIterators, &li); while ((node = listNext(&li)) != NULL) { bgIterator *it = listNodeValue(node); - if (dictDelete(it->early_iterate_entries, old) == DICT_OK) { + if (hashtableDelete(it->early_iterate_entries, old)) { if (BGITERATION_DEBUG) { debugBuffer = sdscatprintf(debugBuffer, "EARLY LIST UPDATE %p -> %p\n", (void *)old, (void *)new); } - dictAdd(it->early_iterate_entries, new, NULL); + bool wasAdded = hashtableAdd(it->early_iterate_entries, new); + serverAssert(wasAdded); } } } diff --git a/src/bgiteration.h b/src/bgiteration.h index 8d247fa7bbf..78643311fe0 100644 --- a/src/bgiteration.h +++ b/src/bgiteration.h @@ -21,8 +21,7 @@ * implements the logic of the iteration client. * * Iteration clients are expected to read through the keyspace until the iteration is complete or - * terminated. An iteration client may not perform modifications on a key. - */ + * terminated. An iteration client may not perform modifications on a key. */ /* Avoids dependency on server.h */ typedef struct serverObject dbEntry; // An object with key/value inserted into main dictionary @@ -69,8 +68,7 @@ typedef enum { * Returns true when an iterator stops accepting any replication item into the queue for the client. * If false is returned, replication will continue, and bgiteration will periodically call the callback * until true is returned. In this context, returning false indicates that the client is not ready to - * stop receiving replication, it is requesting that replication be continued. 
- */ + * stop receiving replication, it is requesting that replication be continued. */ typedef bool (*bgIteratorReplDoneFunc)(void *privdata); @@ -80,8 +78,7 @@ typedef bool (*bgIteratorReplDoneFunc)(void *privdata); * TERMINATED: will be passed as TRUE if the iteration process was terminated early (either by * the main thread calling bgIteratorTerminate() or the iteration client calling * bgIteratorClose()). - * PRIVDATA: this pointer is for data private to the iteration client. - */ + * PRIVDATA: this pointer is for data private to the iteration client. */ typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata); @@ -99,8 +96,7 @@ typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata); * to implement the iteration client which will read from the returned bgIterator. * * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the - * last item is read. - */ + * last item is read. */ bgIterator *bgIteratorCreateFullScanIter( const char *name, bgIteratorConsistency consistency, @@ -128,8 +124,7 @@ bgIterator *bgIteratorCreateFullScanIter( * just copy its data and leave the array untouched. * * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the - * last item is read. - */ + * last item is read. */ bgIterator *bgIteratorCreateSlotsIter( const char *name, bgIteratorConsistency consistency, @@ -141,8 +136,7 @@ bgIterator *bgIteratorCreateSlotsIter( /* Find an existing bgIterator by name. - * Returns NULL if the iterator does not exist (or has completed). - */ + * Returns NULL if the iterator does not exist (or has completed). */ bgIterator *bgIteratorFind(const char *name); @@ -171,8 +165,7 @@ typedef struct { /* Get the status of a background iteration. * - * The caller-provided bgIteratorStatus will be populated. - */ + * The caller-provided bgIteratorStatus will be populated. 
*/ void bgIteratorGetStatus(bgIterator *iter, bgIteratorStatus *status); @@ -181,8 +174,7 @@ void bgIteratorGetStatus(bgIterator *iter, bgIteratorStatus *status); * An iteration is terminated by the Valkey main thread. It is expected that the iteration client * will continue to read, receiving BGITERATOR_ITEM_TERMINATED or BGITERATOR_ITEM_COMPLETE to * complete the iteration. (This is necessary to ensure proper cleanup.) - * NOTE: If the iteration client wants to terminate iteration, it may call bgIteratorClose(). - */ + * NOTE: If the iteration client wants to terminate iteration, it may call bgIteratorClose(). */ void bgIteratorTerminate(bgIterator *iter); @@ -191,8 +183,7 @@ void bgIteratorTerminate(bgIterator *iter); * This checks if the iterator is in the process of terminating. For the Valkey main thread, this * can be used to determine if a call has already been made to bgIteratorTerminate. For an * iteration client, it normally learns about terminate by reading the next item, this allows - * out-of-band detection of termination which can be useful when processing a large key. - */ + * out-of-band detection of termination which can be useful when processing a large key. */ bool bgIteratorIsTerminating(bgIterator *iter); @@ -263,8 +254,7 @@ typedef struct { * NOTE: Reading an item returns previously read items to Valkey. It is unsafe to reference an item * previously read. * - * (All memory management is the responsibility of the bgIterator - not the reader.) - */ + * (All memory management is the responsibility of the bgIterator - not the reader.) */ bgIteratorItem *bgIteratorRead(bgIterator *iter); @@ -276,8 +266,7 @@ bgIteratorItem *bgIteratorRead(bgIterator *iter); * BGITERATOR_ITEM_TERMINATED and signals that the background activity is complete. * * This may also be called by the iteration client to force terminate an iteration early. The - * bgIterator will be marked as terminated. - */ + * bgIterator will be marked as terminated. 
*/ void bgIteratorClose(bgIterator *iter); @@ -298,15 +287,13 @@ bool bgIteration_iterationActive(void); /* Notify bgIteration that a key is being deleted. In Valkey, key deletion can occur in a READ * command if the key is expired. Note that this notification is more about status than memory. * Since the dbEntry is a reference counted object, the dbEntry can't be physically deleted if - * bgIteration is still actively using it. - */ + * bgIteration is still actively using it. */ void bgIteration_keyDelete(int dbid, const_sds key); /* Iteration needs to know if a FLUSHALL is being performed. For normal clients, this comes through * the standard "blockClientIfRequired" interface. This interface is for cases where Valkey - * performs the FLUSHALL operation independently of clients (e.g. when syncing with master). - */ + * performs the FLUSHALL operation independently of clients (e.g. when syncing with master). */ void bgIteration_flushall(void); @@ -317,8 +304,7 @@ void bgIteration_flushall(void); * * We can't update the dbEntry if the entry is actually in use (bgIteration_isEntryInuse)! * - * To simplify calling code, this function does nothing if old_entry == new_entry. - */ + * To simplify calling code, this function does nothing if old_entry == new_entry. */ void bgIteration_updateDbEntryPtr(dbEntry *old_entry, dbEntry *new_entry); @@ -338,16 +324,14 @@ void bgIteration_updateDbEntryPtr(dbEntry *old_entry, dbEntry *new_entry); * performs SWAPDB, a synchronous block may be performed (returning false) on * individual commands within the script. * - * Note: this function should be called for all commands (not just writes). - */ + * Note: this function should be called for all commands (not just writes). */ bool bgIteration_blockClientIfRequired(client *c); /* After execution of a write command, the Valkey main thread must provide the command to iterators * which are interested in the replication feed. 
It is required that all commands have been passed * through bgIteration_blockClientIfRequired(), however, it is permitted that the command can be - * re-written for propagation. - */ + * re-written for propagation. */ void bgIteration_handleCommandReplication( int dbid, struct serverCommand *cmd, @@ -357,8 +341,7 @@ void bgIteration_handleCommandReplication( /* The memory that bgIteration uses while temporarily buffering replication data is not included in * the maxmemory computation used for eviction. This function provides insight into the current - * amount of memory used for buffered replication data. - */ + * amount of memory used for buffered replication data. */ size_t bgIteration_memoryInuseForReplication(void); From 3628aaed1203bf886f8d3b4e78b0568343cb99ad Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Mon, 8 Jun 2026 19:53:36 +0000 Subject: [PATCH 38/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 662 +++++++++++++++++++++------------------------- 1 file changed, 302 insertions(+), 360 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index 7c79a48a261..f65fbf30e06 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -34,8 +34,8 @@ static bool isScriptCallWriteCmd(struct serverCommand *cmd) { return ((cmd->proc == fcallCommand) || (cmd->proc == evalCommand) || (cmd->proc == evalShaCommand)); } -// The PFCOUNT command (which does NOT have the CMD_WRITE flag) modifies the underlying string and -// is replicated as a write. So it needs to be detected and handled specially. +/* The PFCOUNT command (which does NOT have the CMD_WRITE flag) modifies the underlying string and + * is replicated as a write. So it needs to be detected and handled specially. 
*/ static bool isWriteCmd(struct serverCommand *cmd) { return ((cmd->flags & CMD_WRITE) || (cmd->proc == pfcountCommand) || (cmd->proc == execCommand) || (isScriptCallWriteCmd(cmd))); } @@ -262,19 +262,12 @@ typedef struct { } bgIteratorItemExtClose; -/* Used for dictEntryPtrDictType. This dict grows and shrinks constantly during the iteration. - * There is no point to rehash it all the time. */ -static int neverShrink(size_t moreMem, double usedRatio) { - UNUSED(moreMem); - return (usedRatio > 0.5); // Return true only if expanding -} - -// A dictionary with a pointer (itself) as a key (the address pointed to is NOT referenced). -// Nothing is duplicated, this is a very fast dictionary, but potentially unsafe if the original -// items are deleted or moved. -// WARNING: This needs to maintain safety with things that may move the object. -// * In db.c, if the object is reallocatd, bgIteration_updateDbEntryPtr() is called. -// * In defrag.c, we don't defrag if there are multiple references to an object (and we incr the refcount) +/* A dictionary with a pointer (itself) as a key (the address pointed to is NOT referenced). + * Nothing is duplicated, this is a very fast dictionary, but potentially unsafe if the original + * items are deleted or moved. + * WARNING: This needs to maintain safety with things that may move the object. + * + In db.c, if the object is reallocated, bgIteration_updateDbEntryPtr() is called. + * + In defrag.c, we don't defrag if there are multiple references (and we incr the refcount). */ // Thomas Wang's 64-bit mix static uint64_t pointerHash(const void *key) { @@ -293,17 +286,43 @@ static int pointerCompare(const void *key1, const void *key2) { return key1 == key2; } +// This dict grows and shrinks constantly during the iteration. Avoid constant rehashing.
+static int neverShrink(size_t moreMem, double usedRatio) { + UNUSED(moreMem); + return (usedRatio > 0.5); // Return true only if expanding +} + +static dictType dictEntryPtrDictType = { + .entryGetKey = dictEntryGetKey, + .hashFunction = pointerHash, + .keyCompare = pointerCompare, + .resizeAllowed = neverShrink, + .entryDestructor = zfree}; + +static hashtableType dbEntryPtrHashtableType = { + .hashFunction = pointerHash, + .keyCompare = pointerCompare, + .resizeAllowed = neverShrink}; + + // A free list for bgIteratorItem's - avoids churning zmalloc calls typedef struct itemListNode { struct itemListNode *next; } itemListNode; +static const int FREE_ITEM_MAX = 500; static itemListNode *freeItemStackHead = NULL; +static int freeItemStackCount = 0; static void itemFreeList_returnItemBackToFreeList(bgIteratorItem *item) { itemListNode *freedNode = (itemListNode *)item; - freedNode->next = freeItemStackHead; - freeItemStackHead = freedNode; + if (freeItemStackCount < FREE_ITEM_MAX) { + freedNode->next = freeItemStackHead; + freeItemStackHead = freedNode; + freeItemStackCount++; + } else { + zfree(freedNode); + } } // Pop a free node from the free list or allocate if none free @@ -312,8 +331,10 @@ static bgIteratorItem *itemFreeList_getElementOrAllocate(void) { if (freeItemStackHead) { item = (bgIteratorItem *)freeItemStackHead; freeItemStackHead = freeItemStackHead->next; + freeItemStackCount--; if (freeItemStackHead) valkey_prefetch(freeItemStackHead); } else { + serverAssert(freeItemStackCount == 0); // Create new listNode and item item = zmalloc(sizeof(bgIteratorItem)); } @@ -324,26 +345,15 @@ static void itemFreeList_release(void) { while (freeItemStackHead) { itemListNode *node = freeItemStackHead; freeItemStackHead = node->next; + freeItemStackCount--; zfree(node); } + serverAssert(freeItemStackCount == 0); } -static dictType dictEntryPtrDictType = { - .entryGetKey = dictEntryGetKey, - .hashFunction = pointerHash, - .keyCompare = pointerCompare, - 
.resizeAllowed = neverShrink, - .entryDestructor = zfree}; - -static hashtableType dbEntryPtrHashtableType = { - .hashFunction = pointerHash, - .keyCompare = pointerCompare, - .resizeAllowed = neverShrink}; - - -// A TEMP set of robj's (of type sds). This is only for temporary sets as the robj's are not -// ref-counted at insertion/deletion. +/* A TEMPORARY set of robj's (of type sds). This is only for temporary sets as the robj's are not + * ref-counted at insertion/deletion. */ static hashtableType tempKeysetHashtableType = { .hashFunction = dictObjHash, .keyCompare = dictObjKeyCompare}; @@ -370,8 +380,8 @@ struct genericIterator { }; -// This struct is used across threads. Unless otherwise noted, the fields are initialized at -// iterator creation (within the main thread) and are read-only by the client thread. +/* This struct is used across threads. Unless otherwise noted, the fields are initialized at + * iterator creation (within the main thread) and are read-only by the client thread. */ struct bgIterator { sds name; // Iterator name bgIteratorReplDoneFunc repldone; // Optional repldone function to be run on the main thread @@ -384,10 +394,10 @@ struct bgIterator { genericIterator *keyset_iter; // Low-level iterator (polymorphic) - hashtable *early_iterate_entries; /* A set of dbEntry, compared by pointer. Used to track items - * which have already been iterated over by out-of-order - * expedited processing. Ensures a bgIterator does not try to - * reprocess items. Used only by main thread. */ + /* A set of dbEntry, compared by pointer. Used to track items which have already been iterated + * over by out-of-order expedited processing. Ensures a bgIterator does not try to reprocess + * items. Used only by main thread. 
*/ + hashtable *early_iterate_entries; mutexQueue *items_for_iterator; // Created/Destroyed in main thread, used in both (threadsafe) @@ -395,23 +405,22 @@ struct bgIterator { unsigned int item_count_target; // Used only by main thread - bgIteratorItem *volatile current_item; // current_item is normally only used in the iteration client. - // It's marked volatile here only to support snooping from the - // main thread when handling a FLUSHDB command. This prevents - // the compiler from generating code which might read the - // pointer multiple times (when it's coded to read only once). - // Also - this syntax is for a volatile POINTER to a - // non-volatile item. "volatile" at the beginning of the - // declaration, would indicate a (non-volatile) pointer to a - // volatile item. + /* current_item is normally only used in the iteration client. It's marked volatile here only + * to support snooping from the main thread when handling a FLUSHDB command. This prevents the + * compiler from generating code which might read the pointer multiple times (when it's coded to + * read only once). + * (A volatile POINTER to a non-volatile item.) */ + bgIteratorItem *volatile current_item; bool client_is_active; // Set to true when client performs 1st read - bool completed; // Set to true in main thread when last item from iteration has - // been queued to the client. No additional items will be - // enqueued to the client after this has been set. - volatile bool terminated; // Set to true in main thread when iteration is to be killed - // Set to true in iteration client when it decides to end early + /* Set to true in main thread when last item from iteration has been queued to the client. No + * additional items will be enqueued to the client after this has been set. */ + bool completed; + + /* Set to true in main thread when iteration is to be killed. + * Set to true in iteration client when it decides to end early. 
*/ + volatile bool terminated; bool cur_cmd_may_replicate; // Used only in main thread during command processing @@ -428,10 +437,10 @@ struct bgIterator { unsigned long dbentry_clones_processed; // Updated by client thread monotime monotonic_start_time; // Time iteration started - volatile monotime monotonic_item_start_time; // The item start time is set in the iteration client. It is - // marked volatile as it can be read from the main thread by - // bgIteratorGetStatus. If 0, this indicates that the - // iteration client is waiting for an item to process. + /* The item start time is set in the iteration client. It is marked volatile as it can be read + * from the main thread by bgIteratorGetStatus. If 0, this indicates that the iteration client + * is waiting for an item to process. */ + volatile monotime monotonic_item_start_time; }; @@ -443,19 +452,19 @@ static dict *nameToIterator; // bgIterator->name -> bgIterator // Global, across all iterators, dict contains a dbEntry pointer -> ref count static dict *inUseEntries; // dbEntry -> ref count -// Key values in the current command which don't exist in the DB yet. Needed for determination of -// replication for NON-consistent iterations. +/* Key values in the current command which don't exist in the DB yet. Needed for determination of + * replication for NON-consistent iterations. */ static list *curCmdMissingKeys; // list of robj -// A counter of the total amount of memory used for buffered replication data. -// This amount is excluded when computing the need for evictions. +/* A counter of the total amount of memory used for buffered replication data. This amount is + * excluded when computing the need for evictions. 
*/ static ssize_t bufferedReplicationBytes; // Memory pool to track current allocated memory of cloned items (in bytes) static ssize_t bgiteration_current_clone_memory_pool_size; -// Snapshot of the last queue size to seed the next queue -// We assume all bgIterators consume items at the same rate +/* Snapshot of the last queue size to seed the next queue. We assume all bgIterators consume items + * at roughly the same rate. */ static int last_item_count_target; // Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) @@ -465,24 +474,26 @@ static long long bgIterator_timeproc_id; static uint32_t bgIteration_epoch = 1; -// BgIteration debug captures BgIteration activity to a large sds buffer. When an iterator is -// completed, the entire buffer is written to a file in the current working directory. Note that -// memory must be available for the ENTIRE debug in memory. This isn't captured incrementally to -// a file as the file I/O is more likely to affect timing. -// Future implementation: the current design is most useful for a single iterator. When items are -// queued to an iterator, the iterator name is not recorded (to save space). -// Developer note: using a CONST value here allows the compiler to completely remove all of the -// debugging code at compile time. There is no run-time performance overhead when set to FALSE. -// This is essentially like an IFDEF, however, it's better as it forces the compiler to validate -// syntax. +/* BgIteration debug captures BgIteration activity to a large sds buffer. When an iterator is + * completed, the entire buffer is written to a file in the current working directory. Note that + * memory must be available for the ENTIRE debug in memory. This isn't captured incrementally to + * a file as the file I/O is more likely to affect timing. + * + * Future implementation: the current design is most useful for a single iterator. When items are + * queued to an iterator, the iterator name is not recorded (to save space). 
+ * + * Developer note: using a CONST value here allows the compiler to completely remove all of the + * debugging code at compile time. There is no run-time performance overhead when set to FALSE. + * This is essentially like an IFDEF, however, it's better as it forces the compiler to validate + * syntax. */ static const bool BGITERATION_DEBUG = false; // DO NOT SUBMIT WITH THIS SYMBOL SET TO TRUE! static sds debugBuffer; -//============================================================================================= -// Full Scan Iterator -//============================================================================================= -/* The full scan iterator performs the actual iteration over the Valkey keyset. The iterator is +/* ============================================================================================= + * Full Scan Iterator + * ============================================================================================= + * The full scan iterator performs the actual iteration over the Valkey keyset. The iterator is * only used from within the Valkey main thread. Iteration proceeds one DB at a time, based on * the DB ordering at the time of iterator creation. Each time the iterator returns items, all * of the dictionary entries from a single hash bucket are returned. */ @@ -490,17 +501,17 @@ static sds debugBuffer; struct fullScanIterator { genericIterator callbacks; // (must be first item) - // Array of mapping from original DB ID (at the time of iteration start) to that DB's - // current index. So, if the DB which was DB-0 is now at index 6, orig_to_cur_db[0]==6. + /* Array of mapping from original DB ID (at the time of iteration start) to that DB's current + * index. So, if the DB which was DB-0 is now at index 6, orig_to_cur_db[0]==6. */ int *orig_to_cur_db; - // The reverse of the above array. This maps a current DB index to its original index - // (at the time of iteration start). + /* The reverse of the above array. 
This maps a current DB index to its original index (at the + * time of iteration start). */ int *cur_to_orig_db; - // This is the DB we are currently iterating over. This is relative to the ORIGINAL - // DB ordering, at the time of iterator creation. Iteration proceeds from 0..N based on - // the original ordering. + /* This is the DB we are currently iterating over. This is relative to the ORIGINAL DB + * ordering, at the time of iterator creation. Iteration proceeds from 0..N based on the + * original ordering. */ int iter_db; // Iterator for the DB orig_to_cur_db[iter_db] @@ -596,8 +607,8 @@ static bool fullScanIteratorHasPassedItem(genericIterator *genIt, const_sds key, if (it->kvs == NULL) return true; // just finished this DB - // We're in the middle of processing a DB. In cluster-mode, the DB is divided into 1 hashtable - // per slot. In cluster-mode-disabled, we treat all keys as in slot 0. + /* We're in the middle of processing a DB. In cluster-mode, the DB is divided into 1 hashtable + * per slot. In cluster-mode-disabled, we treat all keys as in slot 0. */ int keySlot = server.cluster_enabled ? getKeySlot((sds)key) : 0; if (keySlot < it->kvs_didx) return true; if (keySlot > it->kvs_didx) return false; @@ -648,10 +659,10 @@ static genericIterator *fullScanIteratorCreate(void) { } -//============================================================================================= -// Cluster Slot Iterator -//============================================================================================= -/* The cluster slot iterator performs iteration over one cluster slot of the Valkey keyset. The +/* ============================================================================================= + * Cluster Slot Iterator + * ============================================================================================= + * The cluster slot iterator performs iteration over one cluster slot of the Valkey keyset. 
The * iterator is only used from within the Valkey main thread. */ struct clusterSlotIterator { genericIterator callbacks; // (must be first item) @@ -722,12 +733,12 @@ static genericIterator *clusterSlotIteratorCreate(const int *slots, size_t slots } -//============================================================================================= -// General iteration support (across all iterators) -//============================================================================================= +/* ============================================================================================= + * General iteration support (across all iterators) + * ============================================================================================= */ -// While an item is potentially in use by a background thread, we can't have -// rehashing by the main thread. Returns true if rehashing was paused. +/* While an item is potentially in use by a background thread, we can't have rehashing by the main + * thread. Returns true if rehashing was paused. */ static bool pauseRehashing(dbEntry *de) { switch (de->encoding) { case OBJ_ENCODING_HASHTABLE: { // SET or HASH @@ -808,14 +819,17 @@ static dbEntry *tryCloneDbEntry(dbEntry *de) { if (bgiteration_current_clone_memory_pool_size + bgiter_max_clone_item_bytes > bgiter_max_clone_pool_bytes) return NULL; - // Future optimization: Incorporate small ziplists, sorted sets, etc. - // OBJ_ENCODING_INT is omitted only because there isn't a good API for cloning it yet. + /* Future optimization: Incorporate small ziplists, sorted sets, etc. + * OBJ_ENCODING_INT is omitted only because there isn't a good API for cloning it yet. 
*/ if (de->type == OBJ_STRING && de->encoding != OBJ_ENCODING_INT) { ssize_t itemSize = computeStringDbEntrySize(de); if (itemSize <= bgiter_max_clone_item_bytes) { bgiteration_current_clone_memory_pool_size += itemSize; - dbEntry *clone = createStringObjectWithKeyAndExpire((char *)objectGetVal(de), sdslen(objectGetVal(de)), objectGetKey(de), objectGetExpire(de)); + dbEntry *clone = createStringObjectWithKeyAndExpire((char *)objectGetVal(de), + sdslen(objectGetVal(de)), + objectGetKey(de), + objectGetExpire(de)); ((bgIterationEntryMetadata *)objectGetMetadata(clone))->iterator_epoch = ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch; return clone; @@ -825,16 +839,15 @@ static dbEntry *tryCloneDbEntry(dbEntry *de) { return NULL; } - static void freeClonedDictEntry(dbEntry *clonedEntry) { serverAssert(clonedEntry->type == OBJ_STRING); - // Add back to memory pool bgiteration_current_clone_memory_pool_size -= computeStringDbEntrySize(clonedEntry); decrRefCount(clonedEntry); } + static bgIteratorItem *makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) { if (!isCloned) incrementEntryInuse(de); @@ -900,15 +913,15 @@ static void returnCurrentItemToValkey(bgIterator *it) { serverAssert(false); } - // Do this AFTER placing into return_to_valkey. This is volatile and snooped when there is a - // flushall event. Don't want an item to be missed. + /* Do this AFTER placing into return_to_valkey. This is volatile and snooped when there is a + * flushall event. Don't want an item to be missed. 
*/ it->current_item = NULL; } -//============================================================================================= -// Background Iterator (private) -//============================================================================================= +/* ============================================================================================= + * Background Iterator (private) + * ============================================================================================= */ static void bgIteratorRelease(bgIterator *it) { serverAssert(onValkeyMainThread()); @@ -990,10 +1003,10 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { if (!it->client_is_active || (it->dbentries_queued > it->dbentries_processed)) { - // We are done feeding dict entries to the iterator, but before ending the - // replication processing make sure that the iterator has become active (has - // started reading) and make sure that all of the dict entries have been processed - // by the client. + /* We are done feeding dict entries to the iterator, but before ending the + * replication processing make sure that the iterator has become active (has + * started reading) and make sure that all of the dict entries have been + * processed by the client. */ break; } if (it->repldone) { @@ -1091,8 +1104,8 @@ static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_db if (isClonedEntry) it->dbentry_clones_queued++; if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { // JHB - can we optimize here in cluster mode (no swap) - // On consistent iteration, SWAPDB events are not provided. So there is no requirement to - // keep items in order or synchronized with SWAPDB. + /* On consistent iteration, SWAPDB events are not provided. So there is no requirement to + * keep items in order or synchronized with SWAPDB. 
*/ if (BGITERATION_DEBUG) { sds entryString = createEntryString(dbid, item->u.dbe.de); debugBuffer = sdscatprintf(debugBuffer, "EARLY_1: %s\n", entryString); @@ -1156,10 +1169,10 @@ static bool expediteKeysForMove(bgIterator *it, bool mustBlock = false; robj *key = argv[1]; - // Not looking for special cases to optimize here. Just try to expedite both src and dest - // keys. Note that the dest key might exist (and need iteration) but could be expired and - // could be overwritten by MOVE. In this case, a DEL would replicate due to the expiry. So - // even if the target is expired, we need to replicate it before executing the command. + /* Not looking for special cases to optimize here. Just try to expedite both src and dest + * keys. Note that the dest key might exist (and need iteration) but could be expired and + * could be overwritten by MOVE. In this case, a DEL would replicate due to the expiry. So + * even if the target is expired, we need to replicate it before executing the command. */ if (expediteSingleKeyWithoutOptimization(it, dbid, key, waitingOnKeys)) mustBlock = true; if (expediteSingleKeyWithoutOptimization(it, destDbid, key, waitingOnKeys)) mustBlock = true; @@ -1181,8 +1194,8 @@ static bool expediteKeysForCopy(bgIterator *it, robj *srcKey = argv[1]; robj *destKey = argv[2]; - // Not trying to optimize COPY. Just expedite source and destination (if it exists). We - // don't really care if the value is overwritten or not (so no need to parse REPLACE option). + /* Not trying to optimize COPY. Just expedite source and destination (if it exists). We + * don't really care if the value is overwritten or not (so no need to parse REPLACE option). 
*/ if (expediteSingleKeyWithoutOptimization(it, dbid, srcKey, waitingOnKeys)) mustBlock = true; if (expediteSingleKeyWithoutOptimization(it, destDbid, destKey, waitingOnKeys)) mustBlock = true; @@ -1234,25 +1247,25 @@ static bool expediteKeysForWrite(bgIterator *it, bool mustBlock = false; - // All keys of the command should either be in scope or not since in cluster mode enabled they - // should all be in the same slot. So we just check the first key. + /* All keys of the command should either be in scope or not since in cluster mode enabled they + * should all be in the same slot. So we just check the first key. */ robj *oKey = argv[keyrefs[0].pos]; sds key = objectGetVal(oKey); - // If it's not in the iteration scope for the current iterator, then we don't need to do - // anything with this command. + /* If it's not in the iteration scope for the current iterator, then we don't need to do + * anything with this command. */ if (!it->keyset_iter->isKeyInScope(it->keyset_iter, key)) return false; - // Note: performance optimization for commands which only modify the first key. If this flag - // is not available, we can safely remove this `if` statement. + /* Note: performance optimization for commands which only modify the first key. If this flag + * is not available, we can safely remove this `if` statement. */ if ((cmd->flags & CMD_WRITE_FIRSTKEY_ONLY) && !(it->iteration_flags & BGITERATOR_FLAG_REPLICATION)) { - // If this write command only modifies the 1st key, we don't need to expedite others - // unless replication enabled. + /* If this write command only modifies the 1st key, we don't need to expedite others + * unless replication enabled. 
*/ numKeys = 1; } if (cmd->proc == moveCommand) { - // Unfortunate special case for MOVE + // Special case for MOVE return expediteKeysForMove(it, dbid, argc, argv, waitingOnKeys); } @@ -1286,15 +1299,15 @@ static bool expediteKeysForWrite(bgIterator *it, } it->cur_cmd_may_replicate = true; // Will replicate only if replication enabled } else { - // Identification of missing keys is only needed for non-consistent iteration. This only - // needs to be collected once (on the 1st non-consistent iteration) + /* Identification of missing keys is only needed for non-consistent iteration. This only + * needs to be collected once (on the 1st non-consistent iteration). */ bool collectMissing = (listLength(curCmdMissingKeys) == 0); if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { // CONSISTENT = NO, REPLICATION = YES bool someIterated = false; - // dict containing the keys that have not been iterated yet. - // Using a dict dedupes the keys in case the command contains duplicated keys. + /* dict containing the keys that have not been iterated yet. + * Using a dict dedupes the keys in case the command contains duplicated keys. */ dict *notIteratedKeys = dictCreate(&dictEntryPtrDictType); // dict of dbEntry* -> robj* for (int i = 0; i < numKeys; i++) { @@ -1321,26 +1334,26 @@ static bool expediteKeysForWrite(bgIterator *it, } } - // Since missing keys are considered as already iterated, if there are any missing keys - // we must consider that some keys have been iterated, and make sure all other keys - // will be expedited if needed. + /* Since missing keys are considered as already iterated, if there are any missing keys + * we must consider that some keys have been iterated, and make sure all other keys + * will be expedited if needed. */ if (listLength(curCmdMissingKeys) > 0) someIterated = true; - // This command may be executing as part of a larger transaction. 
If some parts of the - // transaction have already been identified to replicate, we must wait on all keys and - // replicate here as well. (Take care not to set cur_cmd_may_replicate to false.) + /* This command may be executing as part of a larger transaction. If some parts of the + * transaction have already been identified to replicate, we must wait on all keys and + * replicate here as well. (Take care not to set cur_cmd_may_replicate to false.) */ if (someIterated) { if (server.in_exec) { - // We are now executing the commands in a multi-exec block. - // - // Regarding MULTI/EXEC: Remember that this code is executed twice for commands - // within a MULTI/EXEC block. First, we parse all the commands when deciding - // if the EXEC should be blocked. Then, as each command is executed, it's - // re-parsed so that we can maintain the early iterated list as the commands - // execute. In this second pass, as each command is executed, we can't change - // the replication decision which was made earlier (when the EXEC was processed). - // We don't want to get tricked (by a key being removed and recreated) into - // into starting to replicate in the middle of a MULTI/EXEC block. + /* We are now executing the commands in a multi-exec block. + * + * Regarding MULTI/EXEC: Remember that this code is executed twice for commands + * within a MULTI/EXEC block. First, we parse all the commands when deciding + * if the EXEC should be blocked. Then, as each command is executed, it's + * re-parsed so that we can maintain the early iterated list as the commands + * execute. In this second pass, as each command is executed, we can't change + * the replication decision which was made earlier (when the EXEC was processed). + * We don't want to get tricked (by a key being removed and recreated) into + * starting to replicate in the middle of a MULTI/EXEC block. 
*/ } else { it->cur_cmd_may_replicate = true; } @@ -1385,8 +1398,8 @@ static bool expediteKeysForWrite(bgIterator *it, } -// Called when an iterator is terminated. Pulls everything out of the queue -// and returns the items to Valkey (before they hit the iterator). +/* Called when an iterator is terminated. Pulls everything out of the queue + * and returns the items to Valkey (before they hit the iterator). */ static void returnAllItemsToValkey(bgIterator *it) { serverAssert(onValkeyMainThread()); @@ -1415,14 +1428,14 @@ static void returnAllItemsToValkey(bgIterator *it) { break; case BGITERATOR_ITEM_COMPLETE: - // This can only happen if the completion item has been enqueued and - // the iterator is terminated before reaching the completion item. + /* This can only happen if the completion item has been enqueued and + * the iterator is terminated before reaching the completion item. */ itemFreeList_returnItemBackToFreeList(item); continue; // Skip pushing this onto itemsToReturn case BGITERATOR_ITEM_TERMINATED: - // This can only happen if there is a race when terminating between - // the iteration client and main thread. + /* This can only happen if there is a race when terminating between + * the iteration client and main thread. 
*/ itemFreeList_returnItemBackToFreeList(item); continue; // Skip pushing this onto itemsToReturn @@ -1442,9 +1455,9 @@ static void returnAllItemsToValkey(bgIterator *it) { } -//============================================================================================= -// Foreground support functions (private) -//============================================================================================= +/* ============================================================================================= + * Foreground support functions (private) + * ============================================================================================= */ static size_t replicationItemSize(bgIteratorItem *item) { serverAssert(item->type == BGITERATOR_ITEM_REPLICATION); @@ -1468,10 +1481,9 @@ static void processReturnOfItemToValkey(bgIteratorItem *item, bgIterator *iter) freeClonedDictEntry(item->u.dbe.de); } else { if (isEntryInuseBySingleIterator(item->u.dbe.de)) { - // This blocking mechanism isn't the best. Written for slot-migration, - // it assumes a single DB so if the same key appears in multiple DBs, - // commands might get unblocked only to get blocked again. (This would - // happen only rarely, and with minimal impact.) + /* This blocking mechanism assumes a single DB so if the same key appears in + * multiple DBs, commands might get unblocked only to get blocked again. (This + * would happen only rarely, and with minimal impact.) 
*/ robj *key = createStringObjectFromSds(objectGetKey(item->u.dbe.de)); unblockClientsInUseOnKey(key); decrRefCount(key); @@ -1490,8 +1502,8 @@ static void processReturnOfItemToValkey(bgIteratorItem *item, bgIterator *iter) bgIterator *it = ((bgIteratorItemExtClose *)item)->iter; serverAssert(it == iter); if (it->terminated) { - // Abnormal termination - // Normally the item is TERMINATED, but might be COMPLETE in race + /* Abnormal termination + * Normally the item is TERMINATED, but might be COMPLETE in race */ serverAssert(it->current_item->type == BGITERATOR_ITEM_TERMINATED || it->current_item->type == BGITERATOR_ITEM_COMPLETE); // Release any items stranded on the iterator after early termination @@ -1563,11 +1575,10 @@ static void prepareAndProcessReturnedItems(int n, bgIteratorItem **items, bgIter #define PREFETCH_BATCH_SIZE 16 +// Returns true if we process at least one item from a given iterator's return_to_valkey queue. static bool receiveItemsBackFromOneIterator(bgIterator *it) { bgIteratorItem *batchPool[PREFETCH_BATCH_SIZE]; int n = 0; - // Returns true if we process at least one item from - // a given iterator's return_to_valkey queue, false otherwise. fifo *poppedFifo = mutexQueuePopAll(it->return_to_valkey, false); if (poppedFifo != NULL) { while (fifoLength(poppedFifo) > 0) { @@ -1586,10 +1597,9 @@ static bool receiveItemsBackFromOneIterator(bgIterator *it) { return false; } +/* Process each iterator's return_to_valkey queue + * If `blocking` is true, continue reading until at least one queue was not empty. */ static void receiveItemsBackFromIterators(bool blocking) { - // Process each iterator's return_to_valkey queue - // If `blocking` is true, continue reading until - // at least one queue was not empty. 
serverAssert(onValkeyMainThread()); listIter li; listNode *node; @@ -1635,8 +1645,8 @@ static long long bgIteration_feedIterators_task(struct aeEventLoop *eventLoop, long dutyTimeUs = BGITER_CYCLE_BUDGET_MS * 1000; if (lastFeedEndTime > 0) { - // If the timer was delayed, compute the proportional time we should have had, and increase - // the duty cycle to compensate (up to a limit). + /* If the timer was delayed, compute the proportional time we should have had, and increase + * the duty cycle to compensate (up to a limit). */ long starvationUs = (startTime - lastFeedEndTime) - BGITER_CYCLE_DELAY_MS * 1000; if (starvationUs > 0) { long starvationCompensationUs = starvationUs * BGITER_CYCLE_BUDGET_MS / @@ -1668,8 +1678,8 @@ static long long bgIteration_feedIterators_task(struct aeEventLoop *eventLoop, // Not static, but not API. Intended for unit tests where the event loop may not be active. void bgIteration_feedIterators(void) { - // For unit testing, force the item_count_target to 1 in each call. This ensures that we only - // feed a minimal amount to the iterators rather than a non-deterministic amount. + /* For unit testing, force the item_count_target to 1 in each call. This ensures that we only + * feed a minimal amount to the iterators rather than a non-deterministic amount. */ listIter li; listNode *node; listRewind(allIterators, &li); @@ -1684,64 +1694,75 @@ void bgIteration_feedIterators(void) { static void resetReplicationFlagForIterators(client *c) { - // For any given command, the command may or may not need to be replicated based on the status - // and flags of each iterator. Furthermore, if a command does need to be replicated, this - // replication must occur for an entire atomic unit; we can't replicate only part of a script - // or multi/exec. - // This function is the only place where the replication flag is cleared. + /* For any given command, the command may or may not need to be replicated based on the status + * and flags of each iterator. 
Furthermore, if a command does need to be replicated, this + * replication must occur for an entire atomic unit; we can't replicate only part of a script + * or multi/exec. + * This function is the only place where the replication flag is cleared. */ if (c->flag.multi || c->flag.script) { - // REGARDING MULTI/EXEC - // -------------------- - // When processing a MULTI/EXEC, blockClientIfRequired is called first for the MULTI. Then, - // all of the commands are queued up in server.c:processCommand(). It's only when EXEC is - // encountered, that server.c:call() is fired to begin execution. - // AFTER the EXEC is processed by call(), then each of the commands in the MULTI/EXEC block - // will be processed through call(). - // If write commands are present, MULTI & EXEC will be passed to the replication stream - // before/after the transaction commands. Note that MULTI & EXEC are not actually - // "executed" at the time when their replication is passed to the replication stream. - // - // Example: MULTI; SET A B; EXEC - // 1. blockClientIfRequired() called for MULTI. MULTI flag IS NOT set. (Won't block.) - // 2. blockClientIfRequired() called for EXEC. MULTI flag IS set. (Might block.) - // 3. blockClientIfRequired() called for SET. MULTI flag IS set. (Won't block.) - // 4. handleCommandReplication() is called for MULTI. - // 5. handleCommandReplication() is called for SET. - // 6. handleCommandReplication() is called for EXEC. - // - // SO - if the MULTI flag is set, we DON'T clear the flag. It should only be cleared at the - // start of the transaction, when MULTI is received - and the flag isn't set yet. - - // REGARDING SCRIPTS - // ----------------- - // When processing a script, blockClientIfRequired is called first for the EVAL/EVALSHA/FCALL. - // Then, all of the commands are processed using a special script client. The script - // client has the CLIENT_SCRIPT flag set. 
For scripts, the replication flag is set when - // processing the EVAL/EVALSHA/FCALL and should not be cleared when executing individual - // commands in the script. - - // If it's the EXEC command, we fall through and clear the flag below. But for all other - // commands within the transaction, we don't clear the flag. + /* REGARDING MULTI/EXEC + * -------------------- + * When processing a MULTI/EXEC, blockClientIfRequired is called first for the MULTI. Then, + * all of the commands are queued up in server.c:processCommand(). It's only when EXEC is + * encountered, that server.c:call() is fired to begin execution. + * + * AFTER the EXEC is processed by call(), then each of the commands in the MULTI/EXEC block + * will be processed through call(). + * + * If write commands are present, MULTI & EXEC will be passed to the replication stream + * before/after the transaction commands. Note that MULTI & EXEC are not actually + * "executed" at the time when their replication is passed to the replication stream. + * + * Example: MULTI; SET A B; EXEC + * 1. blockClientIfRequired() called for MULTI. MULTI flag IS NOT set. (Won't block.) + * 2. blockClientIfRequired() called for EXEC. MULTI flag IS set. (Might block.) + * 3. blockClientIfRequired() called for SET. MULTI flag IS set. (Won't block.) + * 4. handleCommandReplication() is called for MULTI. + * 5. handleCommandReplication() is called for SET. + * 6. handleCommandReplication() is called for EXEC. + * + * SO - if the MULTI flag is set, we DON'T clear the flag. It should only be cleared at the + * start of the transaction, when MULTI is received - and the flag isn't set yet. */ + + /* REGARDING SCRIPTS + * ----------------- + * When processing a script, blockClientIfRequired is called first for the EVAL/EVALSHA/FCALL. + * Then, all of the commands are processed using a special script client. The script + * client has the CLIENT_SCRIPT flag set. 
For scripts, the replication flag is set when + * processing the EVAL/EVALSHA/FCALL and should not be cleared when executing individual + * commands in the script. */ + + /* If it's the EXEC command, we fall through and clear the flag below. But for all other + * commands within the transaction, we don't clear the flag. */ if (c->cmd->proc != execCommand) return; } - // For most commands, the replication flag is cleared and we determine if replication is needed - // based on the keys being used and their state in each iterator. If a modified key hasn't been - // processed yet, there's no need to expedite the key or send the replication. The key will be - // sent later, when reached by the iterator. - // However, for scripts, it is not possible to perform this optimization. There is no way to - // know if an undeclared key might be modified. Since the entire script needs to be replicated - // (or not replicated) atomically, we can't take the chance that an undeclared key might be - // hit which requires replication. + /* For most commands, the replication flag is cleared and we determine if replication is needed + * based on the keys being used and their state in each iterator. If a modified key hasn't been + * processed yet, there's no need to expedite the key or send the replication. The key will be + * sent later, when reached by the iterator. + * + * However, for scripts, it is not possible to perform this optimization. There is no way to + * know if an undeclared key might be modified. Since the entire script needs to be replicated + * (or not replicated) atomically, we can't take the chance that an undeclared key might be + * hit which requires replication. */ bool isScript = isScriptCallWriteCmd(c->cmd); - getKeysResult result; - initGetKeysResult(&result); - getKeysFromCommand(c->cmd, c->argv, c->argc, &result); - - // [sm-bgiterator] TODO: ELMO-108525, This assumes all keys are in the same slot, should consider cross-slot script case. 
- sds check_key = (result.numkeys > 0) ? objectGetVal(c->argv[result.keys[0].pos]) : NULL; + sds firstScriptKey = NULL; + if (isScript) { + /* If it's a script, we will normally replicate. But if the keys are out of scope for the + * iteration, we shouldn't. The use-case for this is with slot iteration, when the script + * is acting on keys from a different slot. Here, we just check the first declared key, and + * if it's out of scope for the iteration, we won't replicate it. This might cause issues + * for cross-slot scripts (anti-pattern), but the alternative is replicating all scripts, + * regardless of slot. */ + getKeysResult result; + initGetKeysResult(&result); + getKeysFromCommand(c->cmd, c->argv, c->argc, &result); + if (result.numkeys > 0) firstScriptKey = objectGetVal(c->argv[result.keys[0].pos]); + getKeysFreeResult(&result); + } listIter li; listNode *node; @@ -1751,14 +1772,16 @@ static void resetReplicationFlagForIterators(client *c) { if (it->completed || it->terminated) { it->cur_cmd_may_replicate = false; } else { - // Set initial state of the replication flag for this transaction - // For full scan iterators, write commands within scripts must always be replicated. - // For cluster slot iterators, replication of script write commands depends on whether - // the key is in scope of the current iterator. - it->cur_cmd_may_replicate = isScript && it->keyset_iter->isKeyInScope(it->keyset_iter, check_key); + /* For normal commands, the flag is initialized to false (not to replicate). For these + * commands, we decide later based on the actual commands. + * + * However, for scripts, we don't know what commands will be executed. So IF it's a + * script, and the keys are in scope (on the right slot) we initialize the replication + * flag to true. 
*/ + it->cur_cmd_may_replicate = isScript && firstScriptKey && + it->keyset_iter->isKeyInScope(it->keyset_iter, firstScriptKey); } } - getKeysFreeResult(&result); } @@ -1795,9 +1818,8 @@ static void handleSwapdb(int db1, int db2) { static void removePtrFromEarlyIterate(dbEntry *de) { - // If the item is being released, let's get the pointer out of our early_iterate_entries. - // Note that this is not strictly necessary, but it frees some memory and keeps the - // dictionary small. + /* If the item is being released, let's get the pointer out of our early_iterate_entries. + * This is not strictly necessary, but it frees some memory and keeps the dictionary small. */ listIter li; listNode *node; listRewind(allIterators, &li); @@ -1808,84 +1830,6 @@ static void removePtrFromEarlyIterate(dbEntry *de) { } -static int findDbForEntry(dbEntry *de) { - for (int i = 0; i < server.dbnum; i++) { - if (server.db[i] && dbFind(server.db[i], objectGetKey(de)) == de) return i; - } - serverAssert(false); // the entry MUST be in one of the DBs -} - - -static void terminateIteratorForFlush(bgIterator *it, int dbid) { - if (!it->terminated) bgIteratorTerminate(it); - - // Snoop on the iterator. There might be 1 item still being processed. If that item is in the - // DB being flushed, the item is removed from the dict and held for deferred deletion. This - // allows the iterator to complete processing on the current item without the item being - // deleted unexpectedly. - // Since this is running in parallel with a background thread, the results are volatile. This - // is OK as when the iterator completes processing the item, it still won't have been accepted - // back to Valkey yet, meaning the item will still be in inUseEntries. 
- bgIteratorItem *item = it->current_item; - if (item && item->type == BGITERATOR_ITEM_DBENTRY) { - dbEntry *de = item->u.dbe.de; - int deDb = findDbForEntry(de); - if (dbid == -1 || dbid == deDb) { - removePtrFromEarlyIterate(de); - } - } -} - - -static void preserveIteratorItemsForFlush(bgIterator *it, int dbid) { - serverAssert(onValkeyMainThread()); - serverAssert(!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)); - serverAssert(dbid >= 0); - // Since this is not a consistent iteration, it's OK if the early_iterate_entries contains - // pointers to items being deleted. The item is not actually accessed from the pointer. And - // if the pointer gets reused for a new item, there's no guarantee that we would iterate it - // anyway. If replication is enabled, both new items and early_iterate_entries are treated the - // same (replication is processed). So this is safe in all cases. - // Given this, we will just worry about preserving items in the iterator's processing queue. - // Because of commands like SWAPDB and MOVE, there's no attempt to remove unnecessary items - // from the queue. This is also safer to future Valkey extensions. - - // Temporarily yank all items from the iterator's queue - fifo *poppedFifo = mutexQueuePopAll(it->items_for_iterator, false); - if (poppedFifo != NULL) { - fifo *readdFifo = fifoCreate(); - while (fifoLength(poppedFifo) > 0) { - bgIteratorItem *item; - fifoPop(poppedFifo, (void **)&item); - if (item->type == BGITERATOR_ITEM_DBENTRY) { - dbEntry *de = item->u.dbe.de; - if (dbFind(server.db[dbid], objectGetKey(de)) == de) { - // Found the entry in the DB about to be flushed - removePtrFromEarlyIterate(de); - } - } - fifoPush(readdFifo, item); - } - fifoRelease(poppedFifo); - - // Now give the list back to the iterator - mutexQueueAddMultiple(it->items_for_iterator, readdFifo); - fifoRelease(readdFifo); - } - - // And snoop on the active item. 
Even if the background task finishes with this item as we look - // at it, the item can't have been returned to Valkey yet. - bgIteratorItem *item = it->current_item; - if (item && item->type == BGITERATOR_ITEM_DBENTRY) { - dbEntry *de = item->u.dbe.de; - if (dbFind(server.db[dbid], objectGetKey(de)) == de) { - // Found the entry in the DB about to be flushed - removePtrFromEarlyIterate(de); - } - } -} - - static bool isDbSignificant(int dbid) { unsigned long long totalKeys = 0; for (int i = 0; i < server.dbnum; i++) { @@ -1909,22 +1853,18 @@ static void handleFlushdb(int dbid) { it->keyset_iter->flushDb(it->keyset_iter, dbid); if (should_abort_iterators || it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { - terminateIteratorForFlush(it, dbid); + if (!it->terminated) bgIteratorTerminate(it); } else { - // In this (limited) case, we're only flushing a single DB that contains < half the - // keys. We don't want to kill a full-sync replication. We will just continue with - // iteration, knowing that a replication client will also receive the FLUSHDB on the - // replication stream. - // It would be nice to do this with consistent snapshot also, but given that this is a - // very rare condition, development is not justified to save off the DB for deferred - // delete. This would add a lot of complexity as well as memory implications. - preserveIteratorItemsForFlush(it, dbid); + /* In this (limited) case, we're only flushing a single DB that contains < half the + * keys. We don't want to kill a full-sync replication. We will just continue with + * iteration, knowing that a replication client will also receive the FLUSHDB on the + * replication stream. There's no need to worry about the items themselves. Since + * we've incremented the refcount, the items still in queue won't be physically deleted. 
*/ // Send a flushdb event to notify the client if (BGITERATION_DEBUG) { debugBuffer = sdscatprintf(debugBuffer, "FLUSH: %d\n", dbid); } - bgIteratorItem *item = itemFreeList_getElementOrAllocate(); item->type = BGITERATOR_ITEM_FLUSHDB; item->dbid = dbid; @@ -2053,16 +1993,17 @@ static bool expediteKeysForMultiExec(client *c, hashtable *waitingOnKeys) { zfree(cur_to_orig_db); if (!initiallyAnIteratorWillReplicate && anIteratorWillReplicateForThisCommand()) { - // We've decided to replicate. Re-process the MULTI/EXEC just once more to make sure that - // we didn't miss any keys at the beginning. This can't continue to recurse because - // `initiallyAnIteratorWillReplicate` will be TRUE in the recursive call. Note that the - // recursive call may add additional entries to `waitingOnKeys`. + /* We've decided to replicate. Re-process the MULTI/EXEC just once more to make sure that + * we didn't miss any keys at the beginning. This can't continue to recurse because + * `initiallyAnIteratorWillReplicate` will be TRUE in the recursive call. Note that the + * recursive call may add additional entries to `waitingOnKeys`. 
*/ if (expediteKeysForMultiExec(c, waitingOnKeys)) mustBlock = true; } return mustBlock; } + static bgIterator *bgIteratorCreate(const char *name, bgIteratorConsistency consistency, bgIteratorReplDoneFunc repldone, @@ -2142,9 +2083,9 @@ static bgIterator *bgIteratorCreate(const char *name, } -//============================================================================================= -// PUBLIC INTERFACE: Iterator creation and use -//============================================================================================= +/* ============================================================================================= + * PUBLIC INTERFACE: Iterator creation and use + * ============================================================================================= */ // PUBLIC API bgIterator *bgIteratorCreateFullScanIter(const char *name, @@ -2247,10 +2188,10 @@ bgIteratorItem *bgIteratorRead(bgIterator *it) { if (it->current_item != NULL) { returnCurrentItemToValkey(it); - // To support unit tests. Normal clients call bgIteratorRead from an alternate thread. - // Without this, a unit test could get stuck waiting on the completion event because - // feed won't get invoked. For production, this is called regularly from the main thread. - // Note - this is checking that the exact same thread is used and shouldn't count modules. + /* To support unit tests. Normal clients call bgIteratorRead from an alternate thread. + * Without this, a unit test could get stuck waiting on the completion event because + * feed won't get invoked. For production, feed is called regularly from the main thread. + * Note - this is checking that the exact same thread is used and shouldn't count modules. 
*/ if (pthread_equal(server.main_thread_id, pthread_self()) != 0) bgIteration_feedIterators_task(NULL, 0, NULL); } else { it->client_is_active = true; @@ -2293,9 +2234,9 @@ void bgIteratorClose(bgIterator *it) { } -//============================================================================================= -// PUBLIC INTERFACE: Valkey main-thread support hooks -//============================================================================================= +/* ============================================================================================= + * PUBLIC INTERFACE: Valkey main-thread support hooks + * ============================================================================================= */ // PUBLIC API void bgIteration_init(void) { @@ -2363,17 +2304,16 @@ void bgIteration_keyDelete(int dbid, const_sds key) { removePtrFromEarlyIterate(de); - // We might be within the context of a command execution. This happens if the key is found to - // be expired when attempting to execute the command. In this case, we should treat the key as - // missing. If the key exists after the command executes, we can treat it like a new key. - // (If not in command execution, this is ok - it's reset at the beginning of command execution.) + /* We might be within the context of a command execution. This happens if the key is found to + * be expired when attempting to execute the command. In this case, we should treat the key as + * missing. If the key exists after the command executes, we can treat it like a new key. + * (If not in command execution, this is ok - it's reset at the beginning of command execution.) */ robj *oKey = createObject(OBJ_STRING, sdsdup(key)); listAddNodeHead(curCmdMissingKeys, oKey); } // PUBLIC API -// Notify bgIteration that a FLUSHALL is being performed outside of the normal client interface. 
void bgIteration_flushall(void) { handleFlushdb(-1); } @@ -2390,9 +2330,9 @@ bool bgIteration_blockClientIfRequired(client *c) { createSdsFromClientArgv(c->argc, c->argv)); } - // Before executing a command or atomic transaction, the replication flag is cleared for each - // iterator. If it's determined that the command should replicate, the flag will be set - // as the command and keys are examined for expedite. + /* Before executing a command or atomic transaction, the replication flag is cleared for each + * iterator. If it's determined that the command should replicate, the flag will be set + * as the command and keys are examined for expedite. */ resetReplicationFlagForIterators(c); if (c->cmd->proc == flushdbCommand || c->cmd->proc == flushallCommand) { @@ -2492,8 +2432,8 @@ void bgIteration_handleCommandReplication(int dbid, if (!bgIteration_iterationActive()) return; serverAssert(onValkeyMainThread()); - // Some commands are replicated which are not writes (like publish) these can be ignored. - // Be careful with MULTI which is not a write command, but must be replicated. + /* Some commands are replicated which are not writes (like publish); these can be ignored. + * Be careful with MULTI which is not a write command, but must be replicated. */ if (!isWriteCmd(cmd) && cmd->proc != multiCommand) return; if (BGITERATION_DEBUG) { @@ -2509,16 +2449,16 @@ void bgIteration_handleCommandReplication(int dbid, handleSwapdb(id1, id2); } - // In the case that a key is touched in a different DB (COPY/MOVE) the key is recorded as - // a "special" key and than handled below. + /* In the case that a key is touched in a different DB (COPY/MOVE) the key is recorded as + * a "special" key and then handled below. */ int special_dbid = 0; sds special_key = NULL; dbEntry *special_dbEntry = NULL; if (cmd->proc == moveCommand) { - // The MOVE command succeeded. However MOVE requires special handling as it creates a new - // key in a different database.
We need to make sure that we don't later try to iterate - // on the key as it would be a duplicate key at that point. So, instead, we will mark the - // newly created key as "early iterated". + /* The MOVE command succeeded. However MOVE requires special handling as it creates a new + * key in a different database. We need to make sure that we don't later try to iterate + * on the key as it would be a duplicate key at that point. So, instead, we will mark the + * newly created key as "early iterated". */ bool success = getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &special_dbid); serverAssert(success); // the command already succeeded, so this should work! @@ -2552,10 +2492,11 @@ void bgIteration_handleCommandReplication(int dbid, bgIterator *it = listNodeValue(node); if (it->completed || it->terminated) continue; - // For consistent iteration, we only iterate values based on version. But for - // non-consistent iteration, we don't need to explicitly iterate any values newly created - // during the iteration. So we mark them as expedited. We know we have a new key if it - // was missing before the command, and exists now. + /* For consistent iteration, we only iterate values based on version. But for + * non-consistent iteration, we don't need to explicitly iterate any values newly created + * during the iteration. So we mark them as expedited. We know we have a new key if it + * was missing before the command, and exists now. */ + if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) { // Handle the special case of a key moved to a different DB if (special_dbEntry != NULL) { @@ -2569,11 +2510,12 @@ void bgIteration_handleCommandReplication(int dbid, } } - // Note: In the cases where there's a special command, we are copying or moving an - // item to a different DB. In these limited cases, we can only possibly be - // creating a single key. And if we've handled it here, we don't need to - // handle it as a "missing key" below. 
If we were to try to handle it as a - // standard "missing key", we would get the DBID incorrect. + /* Note: In the cases where there's a special command, we are copying or moving an + * item to a different DB. In these limited cases, we can only possibly be + * creating a single key. And if we've handled it here, we don't need to + * handle it as a "missing key" below. If we were to try to handle it as a + * standard "missing key", we would get the DBID incorrect. */ + } else if (listLength(curCmdMissingKeys) > 0) { listIter missingIt; listNode *missingNode; @@ -2586,10 +2528,10 @@ void bgIteration_handleCommandReplication(int dbid, // It exists now! if (it->cur_cmd_may_replicate && !it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) { - // If the current command is allowed to replicate, and there is a new - // key which we haven't yet reached in iteration, it needs to be added - // to the set of early iterate entries. (We know that it's not already - // in that set because it's a newly created key!) + /* If the current command is allowed to replicate, and there is a new + * key which we haven't yet reached in iteration, it needs to be added + * to the set of early iterate entries. (We know that it's not already + * in that set because it's a newly created key!) 
*/ bool wasAdded = hashtableAdd(it->early_iterate_entries, de); serverAssert(wasAdded); if (BGITERATION_DEBUG) { From fd0d263f7e037b9b709cbbf4ec647853d0f3cdd1 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 9 Jun 2026 18:13:15 +0000 Subject: [PATCH 39/40] Forkless Save Signed-off-by: Jim Brunner --- src/bgiteration.c | 9 +++++---- src/defrag.c | 8 ++++++++ src/expire.c | 9 +++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/bgiteration.c b/src/bgiteration.c index f65fbf30e06..8d635dbff0f 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -2480,10 +2480,10 @@ void bgIteration_handleCommandReplication(int dbid, } /* Implementation note regarding LUA and MULTI: LUA scripts and MULTI-EXEC blocks must be - * treated atomically. We need to ensure that either ALL of the replication (or none of the - * replication) for the atomic operation is processed by the iterator(s). This is handled - * naturally as we can only "complete" the iteration during the feeding process - and feeding - * is only performed when handling timer events (after the LUA/MULTI has completed). */ + * treated atomically. We need to ensure that either ALL of the replication (or none of the + * replication) for the atomic operation is processed by the iterator(s). This is handled + * naturally as we can only "complete" the iteration during the feeding process - and feeding + * is only performed when handling timer events (after the LUA/MULTI has completed). 
*/ listIter li; listNode *node; @@ -2643,6 +2643,7 @@ size_t bgIteration_memoryInuseForReplication(void) { // PUBLIC API bool bgIteration_isEntryInuse(dbEntry *de) { serverAssert(onValkeyMainThread()); + if (!bgIteration_iterationActive()) return false; return isEntryInuseByAnyIterator(de); } diff --git a/src/defrag.c b/src/defrag.c index 670f83bee73..3e6d8cef6e6 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -43,6 +43,7 @@ #include "eval.h" #include "script.h" #include "module.h" +#include "bgiteration.h" #include #include @@ -708,6 +709,8 @@ static void defragKey(defragKeysCtx *ctx, robj **elemref) { unsigned char *newzl; ob = *elemref; + if (bgIteration_isEntryInuse(ob)) return; + /* Try to defrag robj and/or string value. */ if ((newob = activeDefragStringOb(ob))) { *elemref = newob; @@ -815,6 +818,11 @@ static void defragPubsubScanCallback(void *privdata, void *elemref) { * and 1 if time is up and more work is needed. */ static int defragLaterItem(robj *ob, unsigned long *cursor, monotime endtime, int dbid) { if (ob) { + if (bgIteration_isEntryInuse(ob)) { + *cursor = 0; + return 0; + } + if (ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST) { return scanLaterList(ob, cursor, endtime); } else if (ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HASHTABLE) { diff --git a/src/expire.c b/src/expire.c index b31f57465cc..40b1410f1b7 100644 --- a/src/expire.c +++ b/src/expire.c @@ -39,6 +39,7 @@ #include "cluster.h" #include "cluster_migrateslots.h" #include "util.h" +#include "bgiteration.h" /*----------------------------------------------------------------------------- * Incremental collection of expired keys. 
@@ -167,6 +168,14 @@ void fieldExpireScanCallback(void *privdata, void *volaKey, int didx) { robj *o = volaKey; serverAssert(o); serverAssert(hashTypeHasVolatileFields(o)); + + data->has_more_expired_entries = false; + + if (bgIteration_isEntryInuse(o)) { + data->sampled++; + return; + } + mstime_t now = server.mstime; size_t expired_fields = dbReclaimExpiredFields(o, data->db, now, data->max_entries, didx); if (expired_fields) { From a4d5e28c9fa3b84f925f099ff20e5d6196a280d9 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 9 Jun 2026 18:28:15 +0000 Subject: [PATCH 40/40] Forkless Save Signed-off-by: Jim Brunner --- src/defrag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/defrag.c b/src/defrag.c index 3e6d8cef6e6..e6ecad0227e 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -822,7 +822,7 @@ static int defragLaterItem(robj *ob, unsigned long *cursor, monotime endtime, in *cursor = 0; return 0; } - + if (ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST) { return scanLaterList(ob, cursor, endtime); } else if (ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HASHTABLE) {