diff --git a/.config/typos.toml b/.config/typos.toml index 10103279c57..da8175e4ff9 100644 --- a/.config/typos.toml +++ b/.config/typos.toml @@ -15,11 +15,15 @@ optin = "optin" smove = "smove" Parth = "Parth" # seems like the spellchecker does not like it is similar to "Path" nd = "nd" +threadsave = "threadsave" + +[default.extend-identifiers] +dbe = "dbe" [default] extend-ignore-re = [ - "SELECTed", - "WATCHed", + "[A-Z]{2,}ed", # SELECTed, WATCHed, etc. + "[A-Z]{2,}s", # SELECTs, etc. ] [type.c] @@ -65,6 +69,9 @@ pn = "pn" seeked = "seeked" tre = "tre" +[type.cpp.extend-words] +fo = "fo" + [type.systemd.extend-words] # systemd = .conf ake = "ake" @@ -72,6 +79,3 @@ ake = "ake" [type.tcl.extend-words] fo = "fo" tre = "tre" - -[type.cpp.extend-words] -fo = "fo" diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake index e2cd375bcc7..54964d079cd 100644 --- a/cmake/Modules/SourceFiles.cmake +++ b/cmake/Modules/SourceFiles.cmake @@ -36,6 +36,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/t_hash.c ${CMAKE_SOURCE_DIR}/src/config.c ${CMAKE_SOURCE_DIR}/src/aof.c + ${CMAKE_SOURCE_DIR}/src/bgiteration.c ${CMAKE_SOURCE_DIR}/src/pubsub.c ${CMAKE_SOURCE_DIR}/src/multi.c ${CMAKE_SOURCE_DIR}/src/debug.c diff --git a/src/Makefile b/src/Makefile index 2c78f95986e..98f49108e46 100644 --- a/src/Makefile +++ b/src/Makefile @@ -457,6 +457,7 @@ ENGINE_SERVER_OBJ = \ allocator_defrag.o \ anet.o \ aof.o \ + bgiteration.o \ bio.o \ bitops.o \ blocked.o \ diff --git a/src/bgiteration.c b/src/bgiteration.c new file mode 100644 index 00000000000..3756ba0a60b --- /dev/null +++ b/src/bgiteration.c @@ -0,0 +1,2698 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + +#include "fmacros.h" +#include "bgiteration.h" +#include "dict.h" +#include "fifo.h" +#include "kvstore.h" +#include "monotonic.h" +#include "mutexqueue.h" +#include "server.h" + +int getFlushCommandFlags(client *c, int *flags); // in db.c +uint64_t dictObjHash(const void *key); // in server.c +int dictObjKeyCompare(const void *key1, const void *key2); // in server.c +size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid); // in object.c +robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds key, long long expire); // in object.c + + +static bool receiveItemsBackFromOneIterator(bgIterator *it); + + +// Future extendability +static bool ignoreKeyForSave(const_sds key) { + UNUSED(key); + return false; +} + +// Returns true if the cmd is a script command that may replicate. +static bool isScriptCallWriteCmd(struct serverCommand *cmd) { + return ((cmd->proc == fcallCommand) || (cmd->proc == evalCommand) || (cmd->proc == evalShaCommand)); +} + +/* The PFCOUNT command (which does NOT have the CMD_WRITE flag) modifies the underlying string and + * is replicated as a write. So it needs to be detected and handled specially. */ +static bool isWriteCmd(struct serverCommand *cmd) { + return ((cmd->flags & CMD_WRITE) || (cmd->proc == pfcountCommand) || (cmd->proc == execCommand) || (isScriptCallWriteCmd(cmd))); +} + +// Returns true if the command is a deletion based command (DEL or UNLINK) +static bool isDeleteCmd(struct serverCommand *cmd) { + return ((cmd->proc == delCommand) || (cmd->proc == unlinkCommand)); +} + + +/* This utility utilizes the main thread and background threads for processing. The API is split, + * with some of the functions intended for the main thread and others intended for the background + * clients. This sanity check ensures that we maintain thread safety, calling the API as intended. */ +static bool onValkeyMainThread(void) { + /* Modules interact with the main thread using a mutex. If a module owns the mutex, consider + * that equivalent to being on the main thread. */ + bool mightBeInModule = (atomic_load_explicit(&server.module_gil_acquired, memory_order_relaxed) == 0); + return (mightBeInModule || pthread_equal(server.main_thread_id, pthread_self()) != 0); +} + + +/* Parse a parameters robj, extracting a valid DBID. + * Returns FALSE if DBID isn't valid. */ +static bool getDbIdFromRobj(robj *obj, int *db_id) { + long long value; + if (getLongLongFromObject(obj, &value) != C_OK) return false; + if ((value < 0) || (value >= server.dbnum)) return false; + *db_id = (int)value; + return true; +} + +/* Parse the parameters of the COPY command, extracting the target DBID. + * Returns FALSE if the command would not run. */ +static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid, int *target_dbid) { + const int COPY_COMMAND_OPTIONAL_ARG_START_INDEX = 3; + + *target_dbid = selected_dbid; + + for (int i = COPY_COMMAND_OPTIONAL_ARG_START_INDEX; i < argc; i++) { + if (!strcasecmp((char *)objectGetVal(argv[i]), "replace")) { + continue; + } else if (!strcasecmp((char *)objectGetVal(argv[i]), "db") && (i + 1 < argc)) { + /* Note the parsing here needs to perfectly match what we have in Valkey OSS for COPY. + * The following command is considered OK by Valkey 8.1 so we can't return here, but + * must continue to parse till the last db which is the one that's effectively used. + * COPY key1 key2 db 1 db 2 db 3 (This will use db 3) */ + if (!getDbIdFromRobj(argv[i + 1], target_dbid)) { + return false; // parse failure + } + i++; // Consume additional argument + } else { + return false; // parse failure + } + } + return true; +} + +/* Get parameters for the SWAPDB command. + * The optional permission_client allows for checking of a client's permission for swapdb. + * Returns true if command would be executed. */ +static bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, int *id1_p, int *id2_p) { + static struct serverCommand *swapdb_cmd = NULL; + + // We don't need to check permissions in the replication phase + if (permission_client != NULL) { + if (swapdb_cmd == NULL) { + swapdb_cmd = lookupCommandByCString("swapdb"); + serverAssert(swapdb_cmd != NULL); + } + + int idxptr; + if (ACLCheckAllUserCommandPerm(permission_client->user, swapdb_cmd, argv, argc, + permission_client->db->id, &idxptr) != ACL_OK) return false; + } + + long long dbid1, dbid2; + if (argc != 3) return false; + if (server.cluster_enabled) return false; + if (getLongLongFromObject(argv[1], &dbid1) != C_OK) return false; + if (getLongLongFromObject(argv[2], &dbid2) != C_OK) return false; + if (dbid1 < 0 || dbid1 >= server.dbnum) return false; + if (dbid2 < 0 || dbid2 >= server.dbnum) return false; + if (dbid1 == dbid2) return false; // Valid, but doesn't do anything + + *id1_p = (int)dbid1; + *id2_p = (int)dbid2; + return true; +} + +/* Get parameters for the SELECT command. + * The optional permission_client allows for checking of a client's permission for select. + * Returns true if command would be executed. */ +static bool getParamsForSelect(int argc, robj **argv, client *permission_client, int *dbid_p) { + static struct serverCommand *select_cmd = NULL; + + // We don't need to check permissions in the replication phase + if (permission_client != NULL) { + if (select_cmd == NULL) { + select_cmd = lookupCommandByCString("select"); + serverAssert(select_cmd != NULL); + } + + int idxptr; + if (ACLCheckAllUserCommandPerm(permission_client->user, select_cmd, argv, argc, + permission_client->db->id, &idxptr) != ACL_OK) return false; + } + + long long dbid; + if (argc != 2) return false; + if (getLongLongFromObject(argv[1], &dbid) != C_OK) return false; + if (dbid < 0 || dbid >= server.dbnum) return false; + + *dbid_p = (int)dbid; + return true; +} + +static void pauseRehashForKvsHashtable(kvstore *kvs, int didx) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (ht != NULL) hashtablePauseRehashing(ht); +} + +static void resumeRehashForKvsHashtable(kvstore *kvs, int didx) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (ht != NULL) hashtableResumeRehashing(ht); +} + + +/* DictType for SDS->ptr. The SDS is referenced, no destructor. */ +static dictType sdsrefToPtrDictType = { + .entryGetKey = dictEntryGetKey, + .hashFunction = dictSdsHash, + .keyCompare = dictSdsKeyCompare, + .entryDestructor = zfree}; + + +/* Wrap decrRefCount() so that it can be used as a callback requiring void. */ +static void decrRefCountVoid(void *o) { + decrRefCount(o); +} + + +/* Concatenate argc/argv into a command string for debugging. */ +static sds createSdsFromClientArgv(int argc, robj **argv) { + sds cmd = sdsempty(); + for (int i = 0; i < argc; i++) { + robj *arg = getDecodedObject(argv[i]); // some objects are int encoded + cmd = sdscatprintf(cmd, "'%s' ", (char *)objectGetVal(arg)); + decrRefCount(arg); + } + return cmd; +} + + +// ########################################################################### + + +/* bgIteration internal (compile time) configuration values */ +enum { + BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE = 16384, // Prevent initial rehashing + BGITER_MAX_CLONE_ITEM_BYTES = 512, // Max size item to clone + BGITER_MAX_CLONE_POOL_BYTES = (1 * 1024 * 1024), // Total limit for all cloned items + BGITER_QUEUE_INCREASE_INCR = 100, // Step size when increasing queue target + BGITER_CYCLE_DELAY_MS = 2, // Delay between calls on bgIteration timer + BGITER_CYCLE_BUDGET_MS = 1, // Normal time limit for timer processing + BGITER_CYCLE_BUDGET_MAX_MS = 10 // Maximum time limit when starvation seen +}; + +// dbEntry metadata +typedef struct { + uint32_t iterator_epoch; // iterator epoch of last modification +} bgIterationEntryMetadata; +static_assert(sizeof(bgIterationEntryMetadata) == BGITERATION_ENTRY_METADATA_SIZE, ""); + + +// These can be tweaked by unit tests +static int bgiter_max_clone_item_bytes = BGITER_MAX_CLONE_ITEM_BYTES; +static int bgiter_max_clone_pool_bytes = BGITER_MAX_CLONE_POOL_BYTES; + +void bgIteration_unitTestDisableCloning(void) { + bgiter_max_clone_item_bytes = 0; + bgiter_max_clone_pool_bytes = 0; +} +void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes) { + bgiter_max_clone_item_bytes = item_bytes; + bgiter_max_clone_pool_bytes = pool_bytes; +} + +typedef enum { + BGITERATION_TYPE_NONE, + BGITERATION_TYPE_FULLSCAN, + BGITERATION_TYPE_CLUSTERSLOT +} bgIterationType; + + +/* Flag indicates that a consistent iteration is required. This is used to create a point-in-time + * iteration. The iteration client will see all keys AS THEY EXISTED at the time when the iterator + * was created. + * Note: The DBID provided with the DICTENTRY events is the original DBID (at the time of iteration + * start). SWAPDB events are NOT provided during a consistent iteration. */ +#define BGITERATOR_FLAG_CONSISTENT (1 << 0) + +/* Flag indicating that the replication stream for keys which have already been processed should be + * forwarded to the iteration client. Used for non-consistent iteration to track changes + * to keys already processed. By tracking changes, this allows an non-consistent iteration client + * to achieve a consistent view at the END of the iteration. + * NOTE: Replication events will be provided ordered and synchronized with any SWAPDB events. */ +#define BGITERATOR_FLAG_REPLICATION (1 << 1) + + +/* Extensions to bgIteratorItemType. These enumerations are used internally, and are not part of + * the published interface. These allow for extensibility in the internal information-passing + * between the Valkey main thread and the iteration client thread. */ +typedef enum { + /* Indicates that the iteration client has completed use of the bgIterator and that the + * bgIterator should be cleaned up and freed by the Valkey main thread. */ + BGITERATOR_ITEMEXT_ITER_CLOSED = 10 +} bgIteratorItemTypeExtended; + +/* Item for bgIteratorItemTypeExtended.BGITERATOR_ITEMEXT_ITER_CLOSED. Used to pass a bgIterator + * back to the Valkey main thread for cleanup/release. */ +typedef struct { + bgIteratorItemTypeExtended type; + bgIterator *iter; +} bgIteratorItemExtClose; + + +/* A dictionary with a pointer (itself) as a key (the address pointed to is NOT referenced). + * Nothing is duplicated, this is a very fast dictionary, but potentially unsafe if the original + * items are deleted or moved. + * WARNING: This needs to maintain safety with things that may move the object. + * + In db.c, if the object is reallocatd, bgIteration_updateDbEntryPtr() is called. + * + In defrag.c, we don't defrag if there are multiple references (and we incr the refcount). */ + +// Thomas Wang's 64-bit mix +static uint64_t pointerHash(const void *key) { + uint64_t h = (uint64_t)(uintptr_t)key; + h = (~h) + (h << 21); // h = (h << 21) - h - 1; + h = h ^ (h >> 24); + h = (h + (h << 3)) + (h << 8); // h * 265 + h = h ^ (h >> 14); + h = (h + (h << 2)) + (h << 4); // h * 21 + h = h ^ (h >> 28); + h = h + (h << 31); + return h; +} + +static int pointerCompare(const void *key1, const void *key2) { + return key1 == key2; +} + +// This dict grows and shrinks constantly during the iteration. Avoid constant rehashing. +static int onlyAllowExpansion(size_t moreMem, double usedRatio) { + UNUSED(moreMem); + return (usedRatio > 0.5); // Return true only if expanding +} + +static dictType dictEntryPtrDictType = { + .entryGetKey = dictEntryGetKey, + .hashFunction = pointerHash, + .keyCompare = pointerCompare, + .resizeAllowed = onlyAllowExpansion, + .entryDestructor = zfree}; + +static hashtableType dbEntryPtrHashtableType = { + .hashFunction = pointerHash, + .keyCompare = pointerCompare, + .resizeAllowed = onlyAllowExpansion}; + + +// A free list for bgIteratorItem's - avoids churning zmalloc calls +typedef struct itemListNode { + struct itemListNode *next; +} itemListNode; + +static const int FREE_ITEM_MAX = 500; +static itemListNode *freeItemStackHead = NULL; +static int freeItemStackCount = 0; + +static void itemFreeList_returnItemBackToFreeList(bgIteratorItem *item) { + itemListNode *freedNode = (itemListNode *)item; + if (freeItemStackCount < FREE_ITEM_MAX) { + freedNode->next = freeItemStackHead; + freeItemStackHead = freedNode; + freeItemStackCount++; + } else { + zfree(freedNode); + } +} + +// Pop a free node from the free list or allocate if none free +static bgIteratorItem *itemFreeList_getElementOrAllocate(void) { + bgIteratorItem *item; + if (freeItemStackHead) { + item = (bgIteratorItem *)freeItemStackHead; + freeItemStackHead = freeItemStackHead->next; + freeItemStackCount--; + if (freeItemStackHead) valkey_prefetch(freeItemStackHead); + } else { + serverAssert(freeItemStackCount == 0); + // Create new listNode and item + item = zmalloc(sizeof(bgIteratorItem)); + } + return item; +} + +static void itemFreeList_release(void) { + while (freeItemStackHead) { + itemListNode *node = freeItemStackHead; + freeItemStackHead = node->next; + freeItemStackCount--; + zfree(node); + } + serverAssert(freeItemStackCount == 0); +} + + +/* A TEMPORARY set of robj's (of type sds). This is only for temporary sets as the robj's are not + * ref-counted at insertion/deletion. */ +static hashtableType tempKeysetHashtableType = { + .hashFunction = dictObjHash, + .keyCompare = dictObjKeyCompare}; + + +typedef struct genericIterator genericIterator; +typedef void (*iteratorReleaseFunc)(genericIterator *genIt); +typedef fifo *(*iteratorGetEntriesFunc)(genericIterator *genIt, int *orig_dbid, int *cur_dbid); +typedef void (*iteratorSwapDbFunc)(genericIterator *genIt, int db1, int db2); +typedef void (*iteratorFlushDbFunc)(genericIterator *genIt, int cur_dbid); +typedef bool (*iteratorHasPassedItemFunc)(genericIterator *genIt, const_sds key, int cur_dbid); +typedef int (*iteratorOriginalDbFunc)(genericIterator *genIt, int cur_dbid); +typedef bool (*iteratorIsKeyInScopeFunc)(genericIterator *genIt, const_sds key); + +// Function pointers supporting polymorphic iterator implementation +struct genericIterator { + iteratorReleaseFunc release; + iteratorGetEntriesFunc getEntries; + iteratorSwapDbFunc swapDb; + iteratorFlushDbFunc flushDb; + iteratorHasPassedItemFunc hasPassedItem; + iteratorOriginalDbFunc originalDb; + iteratorIsKeyInScopeFunc isKeyInScope; +}; + + +/* This struct is used across threads. Unless otherwise noted, the fields are initialized at + * iterator creation (within the main thread) and are read-only by the client thread. */ +struct bgIterator { + sds name; // Iterator name + bgIteratorReplDoneFunc repldone; // Optional repldone function to be run on the main thread + bgIteratorCleanupFunc cleanup; // Optional cleanup function to be run on main thread + void *privdata; // Client's private data to be passed to cleanup function + + int iteration_flags; // Consistent and/or Replication + int iteration_type; // Full scan or cluster slot + uint32_t consistent_modification_id; // iterator epoch at time of iterator creation + + genericIterator *keyset_iter; // Low-level iterator (polymorphic) + + /* A set of dbEntry, compared by pointer. Used to track items which have already been iterated + * over by out-of-order expedited processing. Ensures a bgIterator does not try to reprocess + * items. Used only by main thread. */ + hashtable *early_iterate_entries; + + mutexQueue *items_for_iterator; // Created/Destroyed in main thread, used in both (threadsafe) + + mutexQueue *return_to_valkey; // Queue of items to be returned to the Valkey main thread (threadsafe) + + unsigned int item_count_target; // Used only by main thread + + /* current_item is normally only used in the iteration client. It's marked volatile here only + * to support snooping from the main thread when handling a FLUSHDB command. This prevents the + * compiler from generating code which might read the pointer multiple times (when it's coded to + * read only once). + * (A volatile POINTER to a non-volatile item.) */ + bgIteratorItem *volatile current_item; + + bool client_is_active; // Set to true when client performs 1st read + + /* Set to true in main thread when last item from iteration has been queued to the client. No + * additional items will be enqueued to the client after this has been set. */ + bool completed; + + /* Set to true in main thread when iteration is to be killed. + * Set to true in iteration client when it decides to end early. */ + volatile bool terminated; + + bool cur_cmd_may_replicate; // Used only in main thread during command processing + + // Variables maintaining runtime statistics + unsigned long dbentries_queued; // Updated by main thread + unsigned long dbentries_processed; // Updated by client thread + unsigned long replication_queued; // Updated by main thread + unsigned long replication_processed; // Updated by client thread + unsigned long swapdb_queued; // Updated by main thread + unsigned long swapdb_processed; // Updated by client thread + unsigned long flushdb_queued; // Updated by main thread + unsigned long flushdb_processed; // Updated by client thread + unsigned long dbentry_clones_queued; // Updated by main thread + unsigned long dbentry_clones_processed; // Updated by client thread + monotime monotonic_start_time; // Time iteration started + + /* The item start time is set in the iteration client. It is marked volatile as it can be read + * from the main thread by bgIteratorGetStatus. If 0, this indicates that the iteration client + * is waiting for an item to process. */ + volatile monotime monotonic_item_start_time; +}; + + +// These static values are only accessed from the main Valkey thread. + +static list *allIterators; // list of bgIterator +static dict *nameToIterator; // bgIterator->name -> bgIterator + +// Global, across all iterators, dict contains a dbEntry pointer -> ref count +static dict *inUseEntries; // dbEntry -> ref count + +/* Key values in the current command which don't exist in the DB yet. Needed for determination of + * replication for NON-consistent iterations. */ +static list *curCmdMissingKeys; // list of robj + +/* A counter of the total amount of memory used for buffered replication data. This amount is + * excluded when computing the need for evictions. */ +static ssize_t bufferedReplicationBytes; + +// Memory pool to track current allocated memory of cloned items (in bytes) +static ssize_t bgiteration_current_clone_memory_pool_size; + +/* Snapshot of the last queue size to seed the next queue. We assume all bgIterators consume items + * at roughly the same rate. */ +static int last_item_count_target; + +// Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) +static long long bgIterator_timeproc_id; + +// Incremented on each new iteration, this is updated in dbEntry metadata whenever an entry is modified. +static uint32_t bgIteration_epoch = 1; + +/* If true, the iterators' cur_cmd_may_replicate flag was determined in the last call to + * blockClientIfRequired. Otherwise, we skipped over computing this flag (maybe because it was a + * READ command). + * If this is true, AND we are in the context of executing a command inside of call(), then we + * should respect the setting of cur_cmd_may_replicate. */ +static bool iteratorReplicationFlagsWereUpdated; + + +/* BgIteration debug captures BgIteration activity to a large sds buffer. When an iterator is + * completed, the entire buffer is written to a file in the current working directory. Note that + * memory must be available for the ENTIRE debug in memory. This isn't captured incrementally to + * a file as the file I/O is more likely to affect timing. + * + * Future implementation: the current design is most useful for a single iterator. When items are + * queued to an iterator, the iterator name is not recorded (to save space). + * + * Developer note: using a CONST value here allows the compiler to completely remove all of the + * debugging code at compile time. There is no run-time performance overhead when set to FALSE. + * This is essentially like an IFDEF, however, it's better as it forces the compiler to validate + * syntax. */ +static const bool BGITERATION_DEBUG = false; // DO NOT SUBMIT WITH THIS SYMBOL SET TO TRUE! +static sds debugBuffer; + + +/* ============================================================================================= + * Full Scan Iterator + * ============================================================================================= + * The full scan iterator performs the actual iteration over the Valkey keyset. The iterator is + * only used from within the Valkey main thread. Iteration proceeds one DB at a time, based on + * the DB ordering at the time of iterator creation. Each time the iterator returns items, all + * of the dictionary entries from a single hash bucket are returned. */ + +struct fullScanIterator { + genericIterator callbacks; // (must be first item) + + /* Array of mapping from original DB ID (at the time of iteration start) to that DB's current + * index. So, if the DB which was DB-0 is now at index 6, orig_to_cur_db[0]==6. */ + int *orig_to_cur_db; + + /* The reverse of the above array. This maps a current DB index to its original index (at the + * time of iteration start). */ + int *cur_to_orig_db; + + /* This is the DB we are currently iterating over. This is relative to the ORIGINAL DB + * ordering, at the time of iterator creation. Iteration proceeds from 0..N based on the + * original ordering. */ + int iter_db; + + // Iterator for the DB orig_to_cur_db[iter_db] + kvstore *kvs; // keep track of kvs associated with iter_dbi + int kvs_didx; // hashtable index within the kvstore + size_t ht_cursor; // cursor for scanning hashtable +}; + +static void fullScanIteratorRelease(genericIterator *genIt) { + struct fullScanIterator *it = (struct fullScanIterator *)genIt; + if (it->kvs) resumeRehashForKvsHashtable(it->kvs, it->kvs_didx); + zfree(it->orig_to_cur_db); + zfree(it->cur_to_orig_db); + zfree(it); +} + +/* Scan callback used by fullScanIteratorGetEntries2 to collect entries into a fifo. */ +static void fullScanIteratorScanCallback(void *privdata, void *entry) { + fifo *dbEntryFifo = (fifo *)privdata; + dbEntry *de = (dbEntry *)entry; + if (ignoreKeyForSave(objectGetKey(de))) return; // slot migration: keys being purged + fifoPush(dbEntryFifo, de); +} + +static fifo *fullScanIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) { + struct fullScanIterator *it = (struct fullScanIterator *)genIt; + if (it->iter_db >= server.dbnum) return NULL; // Finished scanning + + fifo *dbEntryFifo = fifoCreate(); + while (fifoLength(dbEntryFifo) == 0) { + while (it->kvs == NULL) { + if (++it->iter_db >= server.dbnum) { + fifoRelease(dbEntryFifo); + return NULL; // Iteration complete + } + serverDb *db = server.db[it->orig_to_cur_db[it->iter_db]]; + if (db != NULL) { + it->kvs = db->keys; + it->kvs_didx = kvstoreGetFirstNonEmptyHashtableIndex(it->kvs); + it->ht_cursor = 0; + if (it->kvs_didx == KVSTORE_INDEX_NOT_FOUND) it->kvs = NULL; + if (it->kvs != NULL) pauseRehashForKvsHashtable(it->kvs, it->kvs_didx); + } + } + + hashtable *ht = kvstoreGetHashtable(it->kvs, it->kvs_didx); + if (ht) { + it->ht_cursor = hashtableScan(ht, it->ht_cursor, fullScanIteratorScanCallback, dbEntryFifo); + } else { + it->ht_cursor = 0; + } + + if (it->ht_cursor == 0) { + /* Done with this hashtable, move to next. */ + resumeRehashForKvsHashtable(it->kvs, it->kvs_didx); + it->kvs_didx = kvstoreGetNextNonEmptyHashtableIndex(it->kvs, it->kvs_didx); + if (it->kvs_didx == KVSTORE_INDEX_NOT_FOUND) it->kvs = NULL; + if (it->kvs != NULL) pauseRehashForKvsHashtable(it->kvs, it->kvs_didx); + } + } + *orig_dbid = it->iter_db; + *cur_dbid = it->orig_to_cur_db[*orig_dbid]; + return dbEntryFifo; +} + +static void fullScanIteratorSwapDb(genericIterator *genIt, int db1, int db2) { + struct fullScanIterator *it = (struct fullScanIterator *)genIt; + int temp = it->cur_to_orig_db[db1]; + it->cur_to_orig_db[db1] = it->cur_to_orig_db[db2]; + it->cur_to_orig_db[db2] = temp; + + it->orig_to_cur_db[it->cur_to_orig_db[db1]] = db1; + it->orig_to_cur_db[it->cur_to_orig_db[db2]] = db2; +} + +static void fullScanIteratorFlushDb(genericIterator *genIt, int cur_dbid) { + struct fullScanIterator *it = (struct fullScanIterator *)genIt; + int orig_db = (cur_dbid == -1) ? it->iter_db : it->cur_to_orig_db[cur_dbid]; + if (orig_db == it->iter_db) { + // We are currently iterating on the DB that's being flushed. + it->kvs = NULL; + // Iteration will continue with the next DB. + } +} + +static bool fullScanIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) { + struct fullScanIterator *it = (struct fullScanIterator *)genIt; + int orig_dbid = it->cur_to_orig_db[cur_dbid]; + + if (orig_dbid < it->iter_db) return true; // Entire DB has already been processed + if (orig_dbid > it->iter_db) return false; // Haven't started this DB yet + // Now, orig_dbid == it->iter_db + + if (it->kvs == NULL) return true; // just finished this DB + + /* We're in the middle of processing a DB. In cluster-mode, the DB is divided into 1 hashtable + * per slot. In cluster-mode-disabled, we treat all keys as in slot 0. */ + int keySlot = server.cluster_enabled ? getKeySlot((sds)key) : 0; + if (keySlot < it->kvs_didx) return true; + if (keySlot > it->kvs_didx) return false; + + // At this point, we're down to a specific hashtable. + + hashtable *ht = kvstoreGetHashtable(it->kvs, keySlot); + if (hashtableScanHasPassedKey(ht, key, it->ht_cursor)) return true; + + if (ignoreKeyForSave(key)) return true; // if slot being purged, pretend we have passed it + return false; +} + +static int fullScanIteratorOriginalDb(genericIterator *genIt, int cur_dbid) { + struct fullScanIterator *it = (struct fullScanIterator *)genIt; + return it->cur_to_orig_db[cur_dbid]; +} + +static bool fullScanIteratorIsKeyInScope(genericIterator *genIt, const_sds key) { + UNUSED(genIt); + UNUSED(key); + return true; // All keys are in scope +} + +static genericIterator *fullScanIteratorCreate(void) { + struct fullScanIterator *it = zmalloc(sizeof(struct fullScanIterator)); + it->orig_to_cur_db = zmalloc(sizeof(int) * server.dbnum); + it->cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum); + for (int i = 0; i < server.dbnum; i++) { + it->orig_to_cur_db[i] = i; + it->cur_to_orig_db[i] = i; + } + it->iter_db = -1; + it->kvs = NULL; + + it->callbacks.release = fullScanIteratorRelease; + it->callbacks.getEntries = fullScanIteratorGetEntries; + it->callbacks.swapDb = fullScanIteratorSwapDb; + it->callbacks.flushDb = fullScanIteratorFlushDb; + it->callbacks.hasPassedItem = fullScanIteratorHasPassedItem; + it->callbacks.originalDb = fullScanIteratorOriginalDb; + it->callbacks.isKeyInScope = fullScanIteratorIsKeyInScope; + + return (genericIterator *)it; +} + + +/* ============================================================================================= + * Cluster Slot Iterator + * ============================================================================================= + * The cluster slot iterator performs iteration over one cluster slot of the Valkey keyset. The + * iterator is only used from within the Valkey main thread. */ +struct clusterSlotIterator { + genericIterator callbacks; // (must be first item) +}; + +static void clusterSlotIteratorRelease(genericIterator *genIt) { + UNUSED(genIt); + serverAssert(false); // Not yet implemented +} + +static fifo *clusterSlotIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) { + UNUSED(genIt); + UNUSED(orig_dbid); + UNUSED(cur_dbid); + serverAssert(false); // Not yet implemented +} + +static void clusterSlotIteratorSwapDb(genericIterator *genIt, int db1, int db2) { + UNUSED(genIt); + UNUSED(db1); + UNUSED(db2); + serverAssert(false); // swap not valid in cluster mode +} + +static void clusterSlotIteratorFlushDb(genericIterator *genIt, int cur_dbid) { + UNUSED(genIt); + UNUSED(cur_dbid); + serverAssert(false); // Not yet implemented +} + +static bool clusterSlotIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) { + UNUSED(genIt); + UNUSED(key); + UNUSED(cur_dbid); + serverAssert(false); // Not yet implemented +} + +static int clusterSlotIteratorOriginalDb(genericIterator *genIt, int cur_dbid) { + UNUSED(genIt); + UNUSED(cur_dbid); + return cur_dbid; // swap not supported in cluster mode +} + +/* When checking if a command is in scope for this iterator, all of its keys should be either in + * scope or not. In cluster mode enabled a command cannot reference keys from different slots, so + * this assumption will always be true. */ +static bool clusterSlotIteratorIsKeyInScope(genericIterator *genIt, const_sds key) { + UNUSED(genIt); + UNUSED(key); + serverAssert(false); // Not yet implemented +} + +static genericIterator *clusterSlotIteratorCreate(const int *slots, size_t slots_count) { + struct clusterSlotIterator *it = zmalloc(sizeof(struct clusterSlotIterator)); + it->callbacks.release = clusterSlotIteratorRelease; + it->callbacks.getEntries = clusterSlotIteratorGetEntries; + it->callbacks.swapDb = clusterSlotIteratorSwapDb; + it->callbacks.flushDb = clusterSlotIteratorFlushDb; + it->callbacks.hasPassedItem = clusterSlotIteratorHasPassedItem; + it->callbacks.originalDb = clusterSlotIteratorOriginalDb; + it->callbacks.isKeyInScope = clusterSlotIteratorIsKeyInScope; + + UNUSED(slots); + UNUSED(slots_count); + serverAssert(false); // Not yet implemented + + return (genericIterator *)it; +} + + +/* ============================================================================================= + * General iteration support (across all iterators) + * ============================================================================================= */ + +/* While an item is potentially in use by a background thread, we can't have rehashing by the main + * thread. Returns true if rehashing was paused. */ +static bool pauseRehashing(dbEntry *de) { + switch (de->encoding) { + case OBJ_ENCODING_HASHTABLE: { // SET or HASH + hashtable *ht = objectGetVal(de); + hashtablePauseRehashing(ht); + return true; + } + case OBJ_ENCODING_SKIPLIST: { // SORTED SET + zset *zs = objectGetVal(de); + hashtablePauseRehashing(zs->ht); + return true; + } + default: + return false; + } +} + +static void resumeRehashing(dbEntry *de) { + switch (de->encoding) { + case OBJ_ENCODING_HASHTABLE: { // SET or HASH + hashtable *ht = objectGetVal(de); + hashtableResumeRehashing(ht); + break; + } + case OBJ_ENCODING_SKIPLIST: { // SORTED SET + zset *zs = objectGetVal(de); + hashtableResumeRehashing(zs->ht); + break; + } + default: + break; + } +} + +// Maintain a list of entries which are currently in-use. These items should not be modified. +static void incrementEntryInuse(dbEntry *de) { + dictEntry *existingEntry; + dictEntry *newEntry = dictAddRaw(inUseEntries, de, &existingEntry); + if (newEntry) { + incrRefCount(de); + dictSetSignedIntegerVal(newEntry, 1); + } else { + dictSetSignedIntegerVal(existingEntry, dictGetSignedIntegerVal(existingEntry) + 1); + } +} + + +static void decrementEntryInuse(dbEntry *de) { + dictEntry *entry = dictFind(inUseEntries, de); + if (dictGetSignedIntegerVal(entry) == 1) { + dictDelete(inUseEntries, de); + decrRefCount(de); + } else { + serverAssert(dictGetSignedIntegerVal(entry) > 1); + dictSetSignedIntegerVal(entry, dictGetSignedIntegerVal(entry) - 1); + } +} + +static bool isEntryInuseBySingleIterator(dbEntry *de) { + dictEntry *entry = dictFind(inUseEntries, de); + return dictGetSignedIntegerVal(entry) == 1; +} + +static bool isEntryInuseByAnyIterator(dbEntry *de) { + return (dictFind(inUseEntries, de) != NULL); +} + + +static ssize_t computeStringDbEntrySize(dbEntry *de) { + sds key = objectGetKey(de); + size_t valueSize = stringObjectLen(de); + + return sdslen(key) + valueSize; // ignore the rest of the overhead, it's minor & transient +} + + +static dbEntry *tryCloneDbEntry(dbEntry *de) { + if (bgiteration_current_clone_memory_pool_size + bgiter_max_clone_item_bytes > + bgiter_max_clone_pool_bytes) return NULL; + + /* Future optimization: Incorporate small ziplists, sorted sets, etc. + * OBJ_ENCODING_INT is omitted only because there isn't a good API for cloning it yet. */ + if (de->type == OBJ_STRING && de->encoding != OBJ_ENCODING_INT) { + ssize_t itemSize = computeStringDbEntrySize(de); + + if (itemSize <= bgiter_max_clone_item_bytes) { + bgiteration_current_clone_memory_pool_size += itemSize; + dbEntry *clone = createStringObjectWithKeyAndExpire((char *)objectGetVal(de), + sdslen(objectGetVal(de)), + objectGetKey(de), + objectGetExpire(de)); + ((bgIterationEntryMetadata *)objectGetMetadata(clone))->iterator_epoch = + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch; + return clone; + } + } + + return NULL; +} + +static void freeClonedDictEntry(dbEntry *clonedEntry) { + serverAssert(clonedEntry->type == OBJ_STRING); + + bgiteration_current_clone_memory_pool_size -= computeStringDbEntrySize(clonedEntry); + + decrRefCount(clonedEntry); +} + + +static bgIteratorItem *makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) { + if (!isCloned) incrementEntryInuse(de); + + bgIteratorItem *item = itemFreeList_getElementOrAllocate(); + item->type = BGITERATOR_ITEM_DBENTRY; + item->dbid = dbid; + item->u.dbe.de = de; + item->u.dbe.is_cloned = isCloned; + item->u.dbe.is_rehashing_paused = pauseRehashing(de); + + return item; +} + +static robj **cloneRobjArray(int argc, robj **argv) { + robj **newarray = zmalloc(sizeof(robj *) * argc); + for (int i = 0; i < argc; i++) { + newarray[i] = argv[i]; + incrRefCount(argv[i]); + } + return newarray; +} + + +static void freeRobjArray(int argc, robj **argv) { + for (int i = 0; i < argc; i++) { + decrRefCount(argv[i]); + } + zfree(argv); +} + + +// Called by iterator thread to release an item. +static void returnCurrentItemToValkey(bgIterator *it) { + bgIteratorItem *item = it->current_item; + if (item == NULL) return; + + switch (item->type) { + case BGITERATOR_ITEM_DBENTRY: + it->dbentries_processed++; + if (item->u.dbe.is_cloned) it->dbentry_clones_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_REPLICATION: + it->replication_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_SWAPDB: + it->swapdb_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_FLUSHDB: + it->flushdb_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + + case BGITERATOR_ITEM_COMPLETE: + case BGITERATOR_ITEM_TERMINATED: + // These are static and just used to wake the iterator - they should never be returned. + serverAssert(false); + break; + + default: + serverAssert(false); + } + + /* Do this AFTER placing into return_to_valkey. This is volatile and snooped when there is a + * flushall event. Don't want an item to be missed. */ + it->current_item = NULL; +} + + +/* ============================================================================================= + * Background Iterator (private) + * ============================================================================================= */ + +static void bgIteratorRelease(bgIterator *it) { + serverAssert(onValkeyMainThread()); + serverAssert(it->current_item == NULL); + serverAssert(mutexQueueLength(it->items_for_iterator) == 0); + serverAssert(mutexQueueLength(it->return_to_valkey) == 0); + + dictDelete(nameToIterator, it->name); + listDelNode(allIterators, listSearchKey(allIterators, it)); + + mutexQueueRelease(it->items_for_iterator); + it->items_for_iterator = NULL; + + mutexQueueRelease(it->return_to_valkey); + it->return_to_valkey = NULL; + + it->keyset_iter->release(it->keyset_iter); + it->keyset_iter = NULL; + + hashtableRelease(it->early_iterate_entries); + it->early_iterate_entries = NULL; + + sdsfree(it->name); + zfree(it); +} + + +static bool shouldFeedIteratorMore(bgIterator *it) { + return (!it->completed && + !it->terminated && + mutexQueueLength(it->items_for_iterator) < it->item_count_target); +} + + +// Debugging routine +static sds createEntryString(int dbid, dbEntry *de) { + sds key = objectGetKey(de); + + sds entrySds = sdsempty(); + entrySds = sdscatprintf(entrySds, "(%d)'%s'", dbid, key); + if (de->type == OBJ_STRING) { + robj *o = getDecodedObject(de); // might be encoded as int + const unsigned valuePrintLen = 20; + entrySds = sdscatprintf(entrySds, " : '%.*s'", valuePrintLen, (char *)objectGetVal(o)); + if (sdslen((sds)objectGetVal(o)) > valuePrintLen) entrySds = sdscat(entrySds, "..."); + decrRefCount(o); + } else { + entrySds = sdscatprintf(entrySds, " : type(%d)", de->type); + } + return entrySds; +} + + +static void feedIterator(bgIterator *it, monotime end_time_us) { + // Smart logic to dynamically adjust the size of the queue + unsigned int initial_queue_len = mutexQueueLength(it->items_for_iterator); + + if (initial_queue_len > 2 && it->item_count_target >= initial_queue_len) { + it->item_count_target -= initial_queue_len / 2; + } + + // Now do some feeding + bool have_time = (getMonotonicUs() < end_time_us); + int timeCheckCounter = 0; + while (shouldFeedIteratorMore(it) && have_time) { + int orig_dbid, cur_dbid; + fifo *dbEntryFifo = it->keyset_iter->getEntries(it->keyset_iter, &orig_dbid, &cur_dbid); + + if (dbEntryFifo == NULL) { + // Iteration of items is complete for this iterator + serverAssert(it->dbentries_queued >= it->dbentries_processed); + serverAssert(it->replication_queued >= it->replication_processed); + serverAssert(it->swapdb_queued >= it->swapdb_processed); + serverAssert(it->flushdb_queued >= it->flushdb_processed); + serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); + + // Snapshot queue size to seed next iterator when terminated + last_item_count_target = it->item_count_target; + + if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { + if (!it->client_is_active || (it->dbentries_queued > it->dbentries_processed)) { + /* We are done feeding dict entries to the iterator, but before ending the + * replication processing make sure that the iterator has become active (has + * started reading) and make sure that all of the dict entries have been + * processed by the client. */ + break; + } + if (it->repldone) { + bool clientWantsMoreReplication = (!it->repldone(it->privdata)); + if (clientWantsMoreReplication) break; + } + } + bgIteratorItem *completionItem = itemFreeList_getElementOrAllocate(); + *completionItem = (bgIteratorItem){.type = BGITERATOR_ITEM_COMPLETE}; + if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { + rdbSaveInfo rsi; + completionItem->dbid = (rdbPopulateSaveInfo(&rsi)) ? rsi.repl_stream_db : 0; + completionItem->u.master_repl_offset = server.primary_repl_offset; + if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, "REPLDONE FN\n"); + } + } + + if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, "SENDING COMPLETE\n"); + } + + mutexQueueAdd(it->items_for_iterator, completionItem); + it->completed = true; + break; + } + + int dbid = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) ? orig_dbid : cur_dbid; + + fifo *itemsToAdd = fifoCreate(); + while (fifoLength(dbEntryFifo) > 0) { + dbEntry *de; + fifoPop(dbEntryFifo, (void **)&de); + + // Remove new/modified items during consistent iteration. + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT && + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch > it->consistent_modification_id) { + continue; + } + + // Remove any items which have been processed early + if (hashtableDelete(it->early_iterate_entries, de)) { + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, de); + debugBuffer = sdscatprintf(debugBuffer, "SKIPPING ITEM(early iterate): %s\n", entryString); + sdsfree(entryString); + } + continue; + } + + // For items which are left, convert them from dbEntry to iteratorItem + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, de); + debugBuffer = sdscatprintf(debugBuffer, "ITEM: %s\n", entryString); + sdsfree(entryString); + } + + bgIteratorItem *item = makeDbEntryItem(de, dbid, false); + fifoPush(itemsToAdd, item); + } + fifoRelease(dbEntryFifo); + + if (fifoLength(itemsToAdd) > 0) { + it->dbentries_queued += fifoLength(itemsToAdd); + mutexQueueAddMultiple(it->items_for_iterator, itemsToAdd); + } + fifoRelease(itemsToAdd); + + // This is a predictably fast loop. We don't need to check the time on every pass. + if (++timeCheckCounter % 32 == 0) { + have_time = (getMonotonicUs() < end_time_us); + } + } + + // Smart logic to dynamically adjust the size of the queue + if (initial_queue_len == 0 && have_time) { + it->item_count_target += BGITER_QUEUE_INCREASE_INCR; + } +} + + +static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_dbid) { + bool wasAdded = hashtableAdd(it->early_iterate_entries, earlyEntry); + serverAssert(wasAdded); + + int dbid = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) + ? it->keyset_iter->originalDb(it->keyset_iter, cur_dbid) + : cur_dbid; + + dbEntry *cloneEntry = tryCloneDbEntry(earlyEntry); + bool isClonedEntry = (cloneEntry != NULL); + bgIteratorItem *item = makeDbEntryItem(isClonedEntry ? cloneEntry : earlyEntry, dbid, isClonedEntry); + + it->dbentries_queued++; + if (isClonedEntry) it->dbentry_clones_queued++; + + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT || server.cluster_enabled) { + /* On consistent iteration, SWAPDB events are not provided. So there is no requirement to + * keep items in order or synchronized with SWAPDB. In cluster mode, SWAPDB isn't supported. */ + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, item->u.dbe.de); + debugBuffer = sdscatprintf(debugBuffer, "EARLY_1: %s\n", entryString); + sdsfree(entryString); + } + mutexQueuePushPriority(it->items_for_iterator, item); + } else { + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, item->u.dbe.de); + debugBuffer = sdscatprintf(debugBuffer, "EARLY: %s\n", entryString); + sdsfree(entryString); + } + mutexQueueAdd(it->items_for_iterator, item); + } + return !isClonedEntry; // Block if the entry will be used by the background thread +} + + +// This expedites a single key and doesn't attempt to avoid expediting through optimization. +static bool expediteSingleKeyWithoutOptimization(bgIterator *it, + int dbid, + robj *oKey, + hashtable *waitingOnKeys) { + bool mustBlock = false; + + bool iterComplete = it->completed || it->terminated; + + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de != NULL) { + if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) && + !hashtableFind(it->early_iterate_entries, de, NULL)) { + if (addEarlyIterationKey(it, de, dbid)) { + mustBlock = true; + hashtableAdd(waitingOnKeys, oKey); + } + } else { + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + hashtableAdd(waitingOnKeys, oKey); + } + } + } + + return mustBlock; +} + + +// MOVE/COPY are unfortunate special commands. They work on 2 DBs at once. +const int MOVE_COMMAND_DBID_ARG_INDEX = 2; +static bool expediteKeysForMove(bgIterator *it, + int dbid, + int argc, + robj **argv, + hashtable *waitingOnKeys) { + if (argc <= MOVE_COMMAND_DBID_ARG_INDEX) return false; + + int destDbid; + if (!getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &destDbid)) return false; + + bool mustBlock = false; + robj *key = argv[1]; + + /* Not looking for special cases to optimize here. Just try to expedite both src and dest + * keys. Note that the dest key might exist (and need iteration) but could be expired and + * could be overwritten by MOVE. In this case, a DEL would replicate due to the expiry. So + * even if the target is expired, we need to replicate it before executing the command. */ + if (expediteSingleKeyWithoutOptimization(it, dbid, key, waitingOnKeys)) mustBlock = true; + if (expediteSingleKeyWithoutOptimization(it, destDbid, key, waitingOnKeys)) mustBlock = true; + + it->cur_cmd_may_replicate = true; + return mustBlock; +} + + +// MOVE/COPY are unfortunate special commands. They work on 2 DBs at once. +static bool expediteKeysForCopy(bgIterator *it, + int dbid, + int argc, + robj **argv, + hashtable *waitingOnKeys) { + int destDbid; + if (!getTargetDbIdForCopyCommand(argc, argv, dbid, &destDbid)) return false; + + bool mustBlock = false; + robj *srcKey = argv[1]; + robj *destKey = argv[2]; + + /* Not trying to optimize COPY. Just expedite source and destination (if it exists). We + * don't really care if the value is overwritten or not (so no need to parse REPLACE option). */ + if (expediteSingleKeyWithoutOptimization(it, dbid, srcKey, waitingOnKeys)) mustBlock = true; + if (expediteSingleKeyWithoutOptimization(it, destDbid, destKey, waitingOnKeys)) mustBlock = true; + + it->cur_cmd_may_replicate = true; + return mustBlock; +} + + +/* There are several cases where a client must be blocked on write operations. (Clients never need + * to be blocked for read operations.) + * + * Note: An Amazon extension to the Valkey command structure allows us to identify commands where + * the first key is for write and the rest are for read. This allows us to make the + * following optimizations: + * - for keys which are read only, there's no need to block if the key is in-use by an iterator + * - without replication, there's no need to immediately queue read keys on a consistent iteration + * + * Iterator: CONSISTENT = NO, REPLICATION = NO + * - Block if any write-key is in use by an the iterator + * + * Iterator: CONSISTENT = NO, REPLICATION = YES + * - Block if any write-key is in use by an the iterator + * - If ANY key has already been iterated (but some keys have not), then + * - Block and immediately queue any key (read or write) that has not + * already been iterated + * Example: SDIFFSTORE KEY_A KEY_B KEY_C + * In this case, KEY_A is written, KEY_B and KEY_C are read. If KEY_A has already been + * iterated over, the replication stream will contain this command. The receiver of this + * replication will need KEY_B and KEY_C in order to process the replication stream. So + * these need to be iterated and the client blocked. + * + * Iterator: CONSISTENT = YES, REPLICATION = NO + * - Block if any write-key is in use by an the iterator + * - Block and immediately queue any WRITE-key that has not already been iterated + * + * Iterator: CONSISTENT = YES, REPLICATION = YES + * (Combination only valid in cluster mode - no SWAPDB possible) + * - Block if any write-key is in use by an the iterator + * - Block and immediately queue any key (read or write) that has not already been iterated */ +static bool expediteKeysForWrite(bgIterator *it, + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv, + keyReference *keyrefs, + int numKeys, + hashtable *waitingOnKeys) { + serverAssert(numKeys > 0); + + bool mustBlock = false; + + /* All keys of the command should either be in scope or not since in cluster mode enabled they + * should all be in the same slot. So we just check the first key. */ + robj *oKey = argv[keyrefs[0].pos]; + sds key = objectGetVal(oKey); + /* If it's not in the iteration scope for the current iterator, then we don't need to do + * anything with this command. */ + if (!it->keyset_iter->isKeyInScope(it->keyset_iter, key)) return false; + + /* Note: performance optimization for commands which only modify the first key. If this flag + * is not available, we can safely remove this `if` statement. */ + if ((cmd->flags & CMD_WRITE_FIRSTKEY_ONLY) && + !(it->iteration_flags & BGITERATOR_FLAG_REPLICATION)) { + /* If this write command only modifies the 1st key, we don't need to expedite others + * unless replication enabled. */ + numKeys = 1; + } + + if (cmd->proc == moveCommand) { + // Special case for MOVE + return expediteKeysForMove(it, dbid, argc, argv, waitingOnKeys); + } + + if (cmd->proc == copyCommand) { + // Similar special case for COPY + return expediteKeysForCopy(it, dbid, argc, argv, waitingOnKeys); + } + + bool iterComplete = it->completed || it->terminated; + + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { + // CONSISTENT = YES, REPLICATION = YES / NO + for (int i = 0; i < numKeys; i++) { + robj *oKey = argv[keyrefs[i].pos]; + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de == NULL) continue; // New key, no need to expedite + if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) && + !hashtableFind(it->early_iterate_entries, de, NULL) && + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { + if (addEarlyIterationKey(it, de, dbid)) { + mustBlock = true; + hashtableAdd(waitingOnKeys, oKey); + } + } else { + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + hashtableAdd(waitingOnKeys, oKey); + } + } + } + it->cur_cmd_may_replicate = true; // Will replicate only if replication enabled + } else { + /* Identification of missing keys is only needed for non-consistent iteration. This only + * needs to be collected once (on the 1st non-consistent iteration). */ + bool collectMissing = (listLength(curCmdMissingKeys) == 0); + + if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { + // CONSISTENT = NO, REPLICATION = YES + bool someIterated = false; + /* dict containing the keys that have not been iterated yet. + * Using a dict dedupes the keys in case the command contains duplicated keys. */ + dict *notIteratedKeys = dictCreate(&dictEntryPtrDictType); // dict of dbEntry* -> robj* + + for (int i = 0; i < numKeys; i++) { + robj *oKey = argv[keyrefs[i].pos]; + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de == NULL) { + if (collectMissing) { + incrRefCount(oKey); + listAddNodeHead(curCmdMissingKeys, oKey); + } + continue; + } + if (iterComplete || + it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) || + hashtableFind(it->early_iterate_entries, de, NULL)) { + someIterated = true; + } else { + dictAdd(notIteratedKeys, de, oKey); + } + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + hashtableAdd(waitingOnKeys, oKey); + } + } + + /* Since missing keys are considered as already iterated, if there are any missing keys + * we must consider that some keys have been iterated, and make sure all other keys + * will be expedited if needed. */ + if (listLength(curCmdMissingKeys) > 0) someIterated = true; + + /* This command may be executing as part of a larger transaction. If some parts of the + * transaction have already been identified to replicate, we must wait on all keys and + * replicate here as well. (Take care not to set cur_cmd_may_replicate to false.) */ + if (someIterated) { + if (server.in_exec) { + /* We are now executing the commands in a multi-exec block. + * + * Regarding MULTI/EXEC: Remember that this code is executed twice for commands + * within a MULTI/EXEC block. First, we parse all the commands when deciding + * if the EXEC should be blocked. Then, as each command is executed, it's + * re-parsed so that we can maintain the early iterated list as the commands + * execute. In this second pass, as each command is executed, we can't change + * the replication decision which was made earlier (when the EXEC was processed). + * We don't want to get tricked (by a key being removed and recreated) into + * starting to replicate in the middle of a MULTI/EXEC block. */ + } else { + it->cur_cmd_may_replicate = true; + } + } + if (it->cur_cmd_may_replicate) { + dictEntry *de; + dictIterator *di = dictGetIterator(notIteratedKeys); + while ((de = dictNext(di)) != NULL) { + dbEntry *notIteratedEntry = dictGetKey(de); + robj *oKey = dictGetVal(de); + + if (addEarlyIterationKey(it, notIteratedEntry, dbid)) { + mustBlock = true; + hashtableAdd(waitingOnKeys, oKey); + } + } + dictReleaseIterator(di); + } + dictRelease(notIteratedKeys); + } else { + // CONSISTENT = NO, REPLICATION = NO + for (int i = 0; i < numKeys; i++) { + robj *oKey = argv[keyrefs[i].pos]; + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de == NULL) { + if (collectMissing) { + incrRefCount(oKey); + listAddNodeHead(curCmdMissingKeys, oKey); + } + continue; + } + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + hashtableAdd(waitingOnKeys, oKey); + } + } + } + } + + return mustBlock; +} + + +/* Called when an iterator is terminated. Pulls everything out of the queue + * and returns the items to Valkey (before they hit the iterator). */ +static void returnAllItemsToValkey(bgIterator *it) { + serverAssert(onValkeyMainThread()); + + fifo *poppedFifo = mutexQueuePopAll(it->items_for_iterator, false); + if (poppedFifo == NULL) return; // Nothing to return + + // Release non-dictentry items first... + fifo *itemsToReturn = fifoCreate(); + while (fifoLength(poppedFifo) > 0) { + bgIteratorItem *item; + fifoPop(poppedFifo, (void **)&item); + switch (item->type) { + // back out the "queued" statistic + case BGITERATOR_ITEM_DBENTRY: + it->dbentries_queued--; + if (item->u.dbe.is_cloned) it->dbentry_clones_queued--; + break; + case BGITERATOR_ITEM_REPLICATION: + it->replication_queued--; + break; + case BGITERATOR_ITEM_SWAPDB: + it->swapdb_queued--; + break; + case BGITERATOR_ITEM_FLUSHDB: + it->flushdb_queued--; + break; + + case BGITERATOR_ITEM_COMPLETE: + /* This can only happen if the completion item has been enqueued and + * the iterator is terminated before reaching the completion item. */ + itemFreeList_returnItemBackToFreeList(item); + continue; // Skip pushing this onto itemsToReturn + + case BGITERATOR_ITEM_TERMINATED: + /* This can only happen if there is a race when terminating between + * the iteration client and main thread. */ + itemFreeList_returnItemBackToFreeList(item); + continue; // Skip pushing this onto itemsToReturn + + default: + serverAssert(false); + } + + fifoPush(itemsToReturn, item); + } + fifoRelease(poppedFifo); + + // Now release items all at once... + if (fifoLength(itemsToReturn) > 0) { + mutexQueueAddMultiple(it->return_to_valkey, itemsToReturn); + } + fifoRelease(itemsToReturn); +} + + +/* ============================================================================================= + * Foreground support functions (private) + * ============================================================================================= */ + +static size_t replicationItemSize(bgIteratorItem *item) { + serverAssert(item->type == BGITERATOR_ITEM_REPLICATION); + size_t itemSize = sizeof(bgIteratorItem); + for (int i = 0; i < item->u.repl.argc; i++) { + itemSize += objectComputeSize(NULL, item->u.repl.argv[i], 0, 0); + } + return itemSize; +} + +static void processReturnOfItemToValkey(bgIteratorItem *item, bgIterator *iter) { + serverAssert(onValkeyMainThread()); + switch ((int)item->type) { + case BGITERATOR_ITEM_REPLICATION: + bufferedReplicationBytes -= replicationItemSize(item); + freeRobjArray(item->u.repl.argc, item->u.repl.argv); + break; + + case BGITERATOR_ITEM_DBENTRY: + if (item->u.dbe.is_cloned) { + freeClonedDictEntry(item->u.dbe.de); + } else { + if (isEntryInuseBySingleIterator(item->u.dbe.de)) { + /* This blocking mechanism assumes a single DB so if the same key appears in + * multiple DBs, commands might get unblocked only to get blocked again. (This + * would happen only rarely, and with minimal impact.) */ + robj *key = createStringObjectFromSds(objectGetKey(item->u.dbe.de)); + unblockClientsInUseOnKey(key); + decrRefCount(key); + } + // resumeRehashing must be called before decrementEntryInuse, since decrementEntryInuse can free + if (item->u.dbe.is_rehashing_paused) resumeRehashing(item->u.dbe.de); + decrementEntryInuse(item->u.dbe.de); + } + break; + + case BGITERATOR_ITEM_SWAPDB: + case BGITERATOR_ITEM_FLUSHDB: + break; + + case BGITERATOR_ITEMEXT_ITER_CLOSED: { + bgIterator *it = ((bgIteratorItemExtClose *)item)->iter; + serverAssert(it == iter); + if (it->terminated) { + /* Abnormal termination + * Normally the item is TERMINATED, but might be COMPLETE in race */ + serverAssert(it->current_item->type == BGITERATOR_ITEM_TERMINATED || + it->current_item->type == BGITERATOR_ITEM_COMPLETE); + // Release any items stranded on the iterator after early termination + returnAllItemsToValkey(it); + receiveItemsBackFromOneIterator(it); + } else { + // Normal completion + serverAssert(it->current_item->type == BGITERATOR_ITEM_COMPLETE); + } + serverAssert(mutexQueueLength(it->items_for_iterator) == 0); + serverAssert(it->dbentries_queued == it->dbentries_processed); + serverAssert(it->replication_queued == it->replication_processed); + serverAssert(it->swapdb_queued == it->swapdb_processed); + serverAssert(it->flushdb_queued == it->flushdb_processed); + serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); + + listEmpty(curCmdMissingKeys); // Just in case any remain + + itemFreeList_returnItemBackToFreeList(it->current_item); + it->current_item = NULL; + + bool terminated = it->terminated; + void *privdata = it->privdata; + bgIteratorCleanupFunc cleanup = it->cleanup; + bgIteratorRelease(it); // Fully release the iterator before calling cleanup + + if (BGITERATION_DEBUG) { + if (cleanup) debugBuffer = sdscatprintf(debugBuffer, "CLEANUP FN (%s)\n", + (terminated) ? "terminated" : "success"); + + sds filename = sdscatprintf(sdsempty(), "bgiteration_debug.%d", getpid()); + FILE *f = fopen(filename, "w"); + sdsfree(filename); + + fputs(debugBuffer, f); + + fclose(f); + sdsfree(debugBuffer); + debugBuffer = sdsempty(); + } + + if (cleanup) cleanup(terminated, privdata); + } break; + + default: + serverAssert(false); // Not expecting any other type of item! + } + + // We don't allocate extension items from the pool so we manually free them + if ((int)item->type == BGITERATOR_ITEMEXT_ITER_CLOSED) { + zfree(item); + } else { + itemFreeList_returnItemBackToFreeList(item); + } +} + +static void prepareAndProcessReturnedItems(int n, bgIteratorItem **items, bgIterator *iter) { + for (int i = 0; i < n; i++) valkey_prefetch(items[i]); + for (int i = 0; i < n; i++) { + if (items[i]->type != BGITERATOR_ITEM_DBENTRY) continue; + valkey_prefetch(items[i]->u.dbe.de); + } + for (int i = 0; i < n; i++) { + if (items[i]->type != BGITERATOR_ITEM_DBENTRY) continue; + valkey_prefetch(objectGetKey(items[i]->u.dbe.de)); + } + for (int i = 0; i < n; i++) processReturnOfItemToValkey(items[i], iter); +} + +#define PREFETCH_BATCH_SIZE 16 + +// Returns true if we process at least one item from a given iterator's return_to_valkey queue. +static bool receiveItemsBackFromOneIterator(bgIterator *it) { + bgIteratorItem *batchPool[PREFETCH_BATCH_SIZE]; + int n = 0; + fifo *poppedFifo = mutexQueuePopAll(it->return_to_valkey, false); + if (poppedFifo != NULL) { + while (fifoLength(poppedFifo) > 0) { + fifoPop(poppedFifo, (void **)&batchPool[n++]); + if (n == PREFETCH_BATCH_SIZE) { + prepareAndProcessReturnedItems(n, batchPool, it); + n = 0; + } + } + if (n > 0) { + prepareAndProcessReturnedItems(n, batchPool, it); + } + fifoRelease(poppedFifo); + return true; + } + return false; +} + +/* Process each iterator's return_to_valkey queue + * If `blocking` is true, continue reading until at least one queue was not empty. */ +static void receiveItemsBackFromIterators(bool blocking) { + serverAssert(onValkeyMainThread()); + listIter li; + listNode *node; + bool processedItems = false; + do { + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + processedItems |= receiveItemsBackFromOneIterator(it); + } + if (blocking && !processedItems) usleep(100); // Short sleep before retry + } while (blocking && !processedItems); +} + + +static long long bgIteration_feedIterators_task(struct aeEventLoop *eventLoop, + long long id, + void *clientData) { + UNUSED(eventLoop); + UNUSED(id); + UNUSED(clientData); + serverAssert(onValkeyMainThread()); + + static monotime lastFeedEndTime; // STATIC: Persists For checking starvation + monotime startTime = getMonotonicUs(); + + if (!bgIteration_iterationActive()) { + // No more iterators exist. Self-check, and terminate the "feed" task. + serverAssert(dictSize(nameToIterator) == 0); + serverAssert(dictSize(inUseEntries) == 0); + serverAssert(bufferedReplicationBytes == 0); + + // Shrink dict back to zero (doesn't normally shrink) + dictRelease(inUseEntries); + inUseEntries = dictCreate(&dictEntryPtrDictType); + + itemFreeList_release(); + + bgIterator_timeproc_id = AE_DELETED_EVENT_ID; + lastFeedEndTime = 0; + return AE_NOMORE; + } + + long dutyTimeUs = BGITER_CYCLE_BUDGET_MS * 1000; + if (lastFeedEndTime > 0) { + /* If the timer was delayed, compute the proportional time we should have had, and increase + * the duty cycle to compensate (up to a limit). */ + long starvationUs = (startTime - lastFeedEndTime) - BGITER_CYCLE_DELAY_MS * 1000; + if (starvationUs > 0) { + long starvationCompensationUs = starvationUs * BGITER_CYCLE_BUDGET_MS / + (BGITER_CYCLE_BUDGET_MS + BGITER_CYCLE_DELAY_MS); + dutyTimeUs += starvationCompensationUs; + dutyTimeUs = MIN(dutyTimeUs, BGITER_CYCLE_BUDGET_MAX_MS * 1000); + } + } + monotime endTime = startTime + dutyTimeUs; + + // Run this part regardless of time limit... + receiveItemsBackFromIterators(false); + + // Feeding iterators (below) respects endTime. The stuff above always runs to completion. + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL && getMonotonicUs() < endTime) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) continue; + feedIterator(it, endTime); + } + + lastFeedEndTime = getMonotonicUs(); + return BGITER_CYCLE_DELAY_MS; +} + + +// Not static, but not API. Intended for unit tests where the event loop may not be active. +void bgIteration_feedIterators(void) { + /* For unit testing, force the item_count_target to 1 in each call. This ensures that we only + * feed a minimal amount to the iterators rather than a non-deterministic amount. */ + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + it->item_count_target = 1; + } + + // Invoke the feeding task (normally invoked by timer). + bgIteration_feedIterators_task(NULL, 0, NULL); +} + + +static void resetReplicationFlagForIterators(client *c) { + /* For any given command, the command may or may not need to be replicated based on the status + * and flags of each iterator. Furthermore, if a command does need to be replicated, this + * replication must occur for an entire atomic unit; we can't replicate only part of a script + * or multi/exec. + * This function is the only place where the replication flag is cleared. */ + + if (c->flag.multi || c->flag.script) { + /* REGARDING MULTI/EXEC + * -------------------- + * When processing a MULTI/EXEC, blockClientIfRequired is called first for the MULTI. Then, + * all of the commands are queued up in server.c:processCommand(). It's only when EXEC is + * encountered, that server.c:call() is fired to begin execution. + * + * AFTER the EXEC is processed by call(), then each of the commands in the MULTI/EXEC block + * will be processed through call(). + * + * If write commands are present, MULTI & EXEC will be passed to the replication stream + * before/after the transaction commands. Note that MULTI & EXEC are not actually + * "executed" at the time when their replication is passed to the replication stream. + * + * Example: MULTI; SET A B; EXEC + * 1. blockClientIfRequired() called for MULTI. MULTI flag IS NOT set. (Won't block.) + * 2. blockClientIfRequired() called for EXEC. MULTI flag IS set. (Might block.) + * 3. blockClientIfRequired() called for SET. MULTI flag IS set. (Won't block.) + * 4. handleCommandReplication() is called for MULTI. + * 5. handleCommandReplication() is called for SET. + * 6. handleCommandReplication() is called for EXEC. + * + * SO - if the MULTI flag is set, we DON'T clear the flag. It should only be cleared at the + * start of the transaction, when MULTI is received - and the flag isn't set yet. */ + + /* REGARDING SCRIPTS + * ----------------- + * When processing a script, blockClientIfRequired is called first for the EVAL/EVALSHA/FCALL. + * Then, all of the commands are processed using a special script client. The script + * client has the CLIENT_SCRIPT flag set. For scripts, the replication flag is set when + * processing the EVAL/EVALSHA/FCALL and should not be cleared when executing individual + * commands in the script. */ + + /* If it's the EXEC command, we fall through and clear the flag below. But for all other + * commands within the transaction, we don't clear the flag. */ + if (c->cmd->proc != execCommand) return; + } + + /* For most commands, the replication flag is cleared and we determine if replication is needed + * based on the keys being used and their state in each iterator. If a modified key hasn't been + * processed yet, there's no need to expedite the key or send the replication. The key will be + * sent later, when reached by the iterator. + * + * However, for scripts, it is not possible to perform this optimization. There is no way to + * know if an undeclared key might be modified. Since the entire script needs to be replicated + * (or not replicated) atomically, we can't take the chance that an undeclared key might be + * hit which requires replication. */ + bool isScript = isScriptCallWriteCmd(c->cmd); + + sds firstScriptKey = NULL; + if (isScript) { + /* If it's a script, we will normally replicate. But if the keys are out of scope for the + * iteration, we shouldn't. The use-case for this is with slot iteration, when the script + * is acting on keys from a different slot. Here, we just check the first declared key, and + * if it's out of scope for the iteration, we won't replicate it. This might cause issues + * for cross-slot scripts (anti-pattern), but the alternative is replicating all scripts, + * regardless of slot. */ + getKeysResult result; + initGetKeysResult(&result); + getKeysFromCommand(c->cmd, c->argv, c->argc, &result); + if (result.numkeys > 0) firstScriptKey = objectGetVal(c->argv[result.keys[0].pos]); + getKeysFreeResult(&result); + } + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) { + it->cur_cmd_may_replicate = false; + } else { + /* For normal commands, the flag is initialized to false (not to replicate). For these + * commands, we decide later based on the actual commands. + * + * However, for scripts, we don't know what commands will be executed. So IF it's a + * script, and the keys are in scope (on the right slot) we initialize the replication + * flag to true. */ + it->cur_cmd_may_replicate = isScript && firstScriptKey && + it->keyset_iter->isKeyInScope(it->keyset_iter, firstScriptKey); + } + } +} + + +static void handleSwapdb(int db1, int db2) { + serverAssert(onValkeyMainThread()); + serverAssert(bgIteration_iterationActive()); + serverAssert(!server.cluster_enabled); + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) continue; + + // Let the iterator internal mechanism know + it->keyset_iter->swapDb(it->keyset_iter, db1, db2); + + // Let the background client know + if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) { + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "SWAP: %d %d\n", db1, db2); + } + + bgIteratorItem *item = itemFreeList_getElementOrAllocate(); + item->type = BGITERATOR_ITEM_SWAPDB; + item->dbid = db1; + item->u.dbid2 = db2; + it->swapdb_queued++; + mutexQueueAdd(it->items_for_iterator, item); + } + } +} + + +static void removePtrFromEarlyIterate(dbEntry *de) { + /* If the item is being released, let's get the pointer out of our early_iterate_entries. + * This is not strictly necessary, but it frees some memory and keeps the dictionary small. */ + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + hashtableDelete(it->early_iterate_entries, de); // just try delete (might not be here) + } +} + + +static bool isDbSignificant(int dbid) { + unsigned long long totalKeys = 0; + for (int i = 0; i < server.dbnum; i++) { + totalKeys += (server.db[i]) ? dbSize(server.db[i]) : 0; + } + return (server.db[dbid]) ? (dbSize(server.db[dbid]) > totalKeys / 2) : false; +} + + +static void handleFlushdb(int dbid) { + // Invoked BEFORE the actual flush. -1 indicates FLUSHALL. + bool should_abort_iterators = (dbid == -1 || isDbSignificant(dbid)); + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + + // Let the low-level iterator know the DB is being flushed + it->keyset_iter->flushDb(it->keyset_iter, dbid); + + if (should_abort_iterators || it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { + if (!it->terminated) bgIteratorTerminate(it); + } else { + /* In this (limited) case, we're only flushing a single DB that contains < half the + * keys. We don't want to kill a full-sync replication. We will just continue with + * iteration, knowing that a replication client will also receive the FLUSHDB on the + * replication stream. There's no need to worry about the items themselves. Since + * we've incremented the refcount, the items still in queue won't be physically deleted. */ + + // Send a flushdb event to notify the client + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "FLUSH: %d\n", dbid); + } + bgIteratorItem *item = itemFreeList_getElementOrAllocate(); + item->type = BGITERATOR_ITEM_FLUSHDB; + item->dbid = dbid; + it->flushdb_queued++; + mutexQueueAdd(it->items_for_iterator, item); + } + } + receiveItemsBackFromIterators(false); // Receive items back before flushing the items +} + + +static bool expediteKeysForWriteOnAllIterators(int dbid, + struct serverCommand *cmd, + int argc, + robj **argv, + keyReference *keyrefs, + int numKeys, + hashtable *waitingOnKeys) { + bool mustBlock = false; + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (expediteKeysForWrite(it, dbid, cmd, argc, argv, keyrefs, numKeys, waitingOnKeys)) + mustBlock = true; + } + + return mustBlock; +} + + +static bool anIteratorWillReplicateForThisCommand(void) { + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->cur_cmd_may_replicate) return true; + } + return false; +} + + +static bool expediteKeysForMultiExec(client *c, hashtable *waitingOnKeys) { + serverAssert(c->cmd->proc == execCommand); + + /* For MULTI/EXEC, Valkey buffers all of the commands until hitting the EXEC. + * At this point, the client holds all of the commands to be executed. This function searches + * for all of the keys used by any of the buffered write commands. In addition, if SWAPDB or + * SELECT is used, this tracks the DBIDs through various swap/select operations. */ + + /* There's a special concern for a NON-consistent iteration with replication. If the keys are + * all "future" keys (which haven't been processed by the iterator yet), then we don't expedite + * the keys or replicate. However, if some keys have already been processed, we need to + * expedite the remaining keys and replicate everything. + * + * When processing a single command, this is all handled. But in this function, for MULTI/EXEC, + * we process 1 command at a time. There's an issue if the first command modifies a "future" + * key, we don't know (without reading ahead) if a later command will modify a prior key. This + * would require the future key to be expedited. + * + * This COULD be addressed by collecting all of the keys into a single structure and then + * analyzing them all at once. However, this won't share code well with the single commands. + * Also, building this structure is a little complex/time-consuming as we need to track both + * key AND dictID. One way to do this might be with a dict of dicts, where the first dict maps + * a dictID to a dict of keys. + * + * ALTERNATIVELY (and it's the simpler approach that's taken here) we can just check if the + * MULTI will be replicated. If so, we re-process the MULTI, just in case there were commands + * prior to deciding that replication was required that might have missed expediting. If so, + * these will be caught on the 2nd time around. + * + * Checking replication status before/after ensures that there can only be a single recursive + * call. */ + bool initiallyAnIteratorWillReplicate = anIteratorWillReplicateForThisCommand(); + + bool mustBlock = false; + int *cur_to_orig_db = NULL; + + int curDb = c->db->id; + for (int cmdNum = 0; cmdNum < c->mstate->count; cmdNum++) { + struct serverCommand *cmd = c->mstate->commands[cmdNum].cmd; + robj **argv = c->mstate->commands[cmdNum].argv; + int argc = c->mstate->commands[cmdNum].argc; + + if (cmd->proc == swapdbCommand) { + int id1, id2; + if (getParamsForSwapdb(argc, argv, c, &id1, &id2)) { + if (cur_to_orig_db == NULL) { + cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum); + for (int i = 0; i < server.dbnum; i++) cur_to_orig_db[i] = i; + } + int temp = cur_to_orig_db[id1]; + cur_to_orig_db[id1] = cur_to_orig_db[id2]; + cur_to_orig_db[id2] = temp; + } + continue; + } + + if (cmd->proc == selectCommand) { + int id; + if (getParamsForSelect(argc, argv, c, &id)) { + curDb = id; + } + continue; + } + + if (!isWriteCmd(cmd)) continue; + + getKeysResult result; + initGetKeysResult(&result); + int numkeys = getKeysFromCommand(cmd, argv, argc, &result); + keyReference *keyrefs = result.keys; + if (numkeys == 0) { + getKeysFreeResult(&result); + continue; // Write command with no keys - like FLUSHDB + } + + if (expediteKeysForWriteOnAllIterators( + cur_to_orig_db ? cur_to_orig_db[curDb] : curDb, + cmd, argc, argv, keyrefs, numkeys, waitingOnKeys)) { + mustBlock = true; + } + getKeysFreeResult(&result); + } + + zfree(cur_to_orig_db); + + if (!initiallyAnIteratorWillReplicate && anIteratorWillReplicateForThisCommand()) { + /* We've decided to replicate. Re-process the MULTI/EXEC just once more to make sure that + * we didn't miss any keys at the beginning. This can't continue to recurse because + * `initiallyAnIteratorWillReplicate` will be TRUE in the recursive call. Note that the + * recursive call may add additional entries to `waitingOnKeys`. */ + if (expediteKeysForMultiExec(c, waitingOnKeys)) mustBlock = true; + } + + return mustBlock; +} + + +static bgIterator *bgIteratorCreate(const char *name, + bgIteratorConsistency consistency, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata, + bgIterationType iter_type, + genericIterator *keyset_iter) { + serverAssert(onValkeyMainThread()); + serverAssert(server.cluster_enabled || iter_type == BGITERATION_TYPE_FULLSCAN); + + int flags; + switch (consistency) { + case BGITERATOR_CONSISTENCY_NONE: flags = 0; break; + case BGITERATOR_CONSISTENCY_START: flags = BGITERATOR_FLAG_CONSISTENT; break; + case BGITERATOR_CONSISTENCY_EVENTUAL: flags = BGITERATOR_FLAG_REPLICATION; break; + default: serverAssert(false); + } + // Consistent, with replication - doesn't make sense. + serverAssert(!((flags & BGITERATOR_FLAG_CONSISTENT) && (flags & BGITERATOR_FLAG_REPLICATION))); + + bgIterator *it = zmalloc(sizeof(bgIterator)); + it->name = sdsnew(name); + it->repldone = repldone; + it->cleanup = cleanup; + it->privdata = privdata; + it->items_for_iterator = mutexQueueCreate(); + it->return_to_valkey = mutexQueueCreate(); + + // Floor queue size to bgiteration_queue_increase_incr or use last queue size value + if (last_item_count_target < BGITER_QUEUE_INCREASE_INCR) { + last_item_count_target = BGITER_QUEUE_INCREASE_INCR; + } + it->item_count_target = last_item_count_target; + it->iteration_flags = flags; + it->iteration_type = iter_type; + it->consistent_modification_id = bgIteration_epoch++; + it->keyset_iter = keyset_iter; + it->early_iterate_entries = hashtableCreate(&dbEntryPtrHashtableType); + hashtableExpand(it->early_iterate_entries, BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE); + it->current_item = NULL; + it->client_is_active = false; + it->completed = false; + it->terminated = false; + it->cur_cmd_may_replicate = false; + + it->dbentries_queued = 0; + it->dbentries_processed = 0; + it->replication_queued = 0; + it->replication_processed = 0; + it->swapdb_queued = 0; + it->swapdb_processed = 0; + it->flushdb_queued = 0; + it->flushdb_processed = 0; + it->dbentry_clones_queued = 0; + it->dbentry_clones_processed = 0; + + elapsedStart(&it->monotonic_start_time); + it->monotonic_item_start_time = 0; + + + if (bgIterator_timeproc_id <= 0) { + // If iteration is not currently active, start the feeding task. (Runs in main thread.) + bgIterator_timeproc_id = aeCreateTimeEvent(server.el, 1, bgIteration_feedIterators_task, NULL, NULL); + serverAssert(bgIterator_timeproc_id != AE_ERR); + } + + if (dictAdd(nameToIterator, it->name, it) != DICT_OK) { + // Can't have 2 iterators with the same name! + serverAssert(false); + } + + listAddNodeTail(allIterators, it); + + dictExpand(inUseEntries, listLength(allIterators) * it->item_count_target); + + return it; +} + + +/* ============================================================================================= + * PUBLIC INTERFACE: Iterator creation and use + * ============================================================================================= */ + +// PUBLIC API +bgIterator *bgIteratorCreateFullScanIter(const char *name, + bgIteratorConsistency consistency, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata) { + return bgIteratorCreate(name, consistency, repldone, cleanup, privdata, + BGITERATION_TYPE_FULLSCAN, fullScanIteratorCreate()); +} + +// PUBLIC API +bgIterator *bgIteratorCreateSlotsIter(const char *name, + bgIteratorConsistency consistency, + const int *slots, + int slots_count, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata) { + return bgIteratorCreate(name, consistency, repldone, cleanup, privdata, + BGITERATION_TYPE_CLUSTERSLOT, clusterSlotIteratorCreate(slots, slots_count)); +} + +// PUBLIC API +bgIterator *bgIteratorFind(const char *name) { + serverAssert(onValkeyMainThread()); + + sds sdsname = sdsnew(name); + bgIterator *it = dictFetchValue(nameToIterator, sdsname); + sdsfree(sdsname); + + return it; +} + + +// PUBLIC API +const char *bgIteratorName(bgIterator *it) { + return it->name; +} + + +// PUBLIC API +void bgIteratorGetStatus(bgIterator *it, bgIteratorStatus *status) { + status->dbentries_queued = it->dbentries_queued; + status->dbentries_processed = it->dbentries_processed; + status->replication_queued = it->replication_queued; + status->replication_processed = it->replication_processed; + status->swapdb_queued = it->swapdb_queued; + status->swapdb_processed = it->swapdb_processed; + status->flushdb_queued = it->flushdb_queued; + status->flushdb_processed = it->flushdb_processed; + status->dbentry_clones_queued = it->dbentry_clones_queued; + status->dbentry_clones_processed = it->dbentry_clones_processed; + + status->queue_length = mutexQueueLength(it->items_for_iterator); + status->queue_length_target = it->item_count_target; + + status->runtime_ms = elapsedMs(it->monotonic_start_time); + + monotime nonvolatile_item_start_time = it->monotonic_item_start_time; + status->current_item_ms = (nonvolatile_item_start_time == 0) + ? 0 + : elapsedMs(nonvolatile_item_start_time); +} + + +// PUBLIC API +void bgIteratorTerminate(bgIterator *it) { + serverAssert(onValkeyMainThread()); + + // Remove any items in the queue, but doesn't affect the 1 item that's being processed. + returnAllItemsToValkey(it); + + // We have to add an item, just in case the READER is waiting on the mutex. + if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, "SENDING TERMINATE\n"); + } + + bgIteratorItem *terminationItem = itemFreeList_getElementOrAllocate(); + *terminationItem = (bgIteratorItem){.type = BGITERATOR_ITEM_TERMINATED}; + mutexQueueAdd(it->items_for_iterator, terminationItem); + + it->terminated = true; +} + + +// PUBLIC API +bool bgIteratorIsTerminating(bgIterator *it) { + return it->terminated; +} + + +// PUBLIC API +bgIteratorItem *bgIteratorRead(bgIterator *it) { + serverAssert(it->current_item == NULL || + (it->current_item->type != BGITERATOR_ITEM_COMPLETE && + it->current_item->type != BGITERATOR_ITEM_TERMINATED)); + + // First, clean up the previous item read + if (it->current_item != NULL) { + returnCurrentItemToValkey(it); + + /* To support unit tests. Normal clients call bgIteratorRead from an alternate thread. + * Without this, a unit test could get stuck waiting on the completion event because + * feed won't get invoked. For production, feed is called regularly from the main thread. + * Note - this is checking that the exact same thread is used and shouldn't count modules. */ + if (pthread_equal(server.main_thread_id, pthread_self()) != 0) bgIteration_feedIterators_task(NULL, 0, NULL); + } else { + it->client_is_active = true; + } + + it->monotonic_item_start_time = 0; // idle until blocking pop returns + it->current_item = mutexQueuePop(it->items_for_iterator, true); + it->monotonic_item_start_time = getMonotonicUs(); + + return it->current_item; +} + + +// PUBLIC API +void bgIteratorClose(bgIterator *it) { + if (it->current_item != NULL) { + if (it->current_item->type == BGITERATOR_ITEM_COMPLETE || + it->current_item->type == BGITERATOR_ITEM_TERMINATED) { + // Normal confirmation of background completion + } else { + // Client is initiating the termination + it->terminated = true; + returnCurrentItemToValkey(it); + + it->current_item = itemFreeList_getElementOrAllocate(); + *(it->current_item) = (bgIteratorItem){.type = BGITERATOR_ITEM_TERMINATED}; + } + } else { + // terminated before first item read + it->terminated = true; + it->current_item = itemFreeList_getElementOrAllocate(); + *(it->current_item) = (bgIteratorItem){.type = BGITERATOR_ITEM_TERMINATED}; + } + + // We don't allocate extension items from the free list + bgIteratorItemExtClose *itemClose = zmalloc(sizeof(bgIteratorItemExtClose)); + itemClose->type = BGITERATOR_ITEMEXT_ITER_CLOSED; + itemClose->iter = it; + mutexQueueAdd(it->return_to_valkey, itemClose); +} + + +/* ============================================================================================= + * PUBLIC INTERFACE: Valkey main-thread support hooks + * ============================================================================================= */ + +// PUBLIC API +void bgIteration_init(void) { + serverAssert(onValkeyMainThread()); + + /* This should be called once and only once from the Valkey main thread. However to support + * unit tests, this is not validated, and multiple invocations are ignored. */ + if (nameToIterator) return; // If already initialized, ignore (unit tests) + + nameToIterator = dictCreate(&sdsrefToPtrDictType); + serverAssert(nameToIterator != NULL); + + allIterators = listCreate(); + serverAssert(allIterators != NULL); + + inUseEntries = dictCreate(&dictEntryPtrDictType); + serverAssert(inUseEntries != NULL); + + curCmdMissingKeys = listCreate(); + serverAssert(curCmdMissingKeys != NULL); + listSetFreeMethod(curCmdMissingKeys, decrRefCountVoid); + + bufferedReplicationBytes = 0; + + if (BGITERATION_DEBUG) { + debugBuffer = sdsMakeRoomFor(sdsempty(), SDS_MAX_PREALLOC); + } +} + + +// PUBLIC API +bool bgIteration_iterationActive(void) { + return (allIterators != NULL && listLength(allIterators) > 0); +} + + +// PUBLIC API +void bgIteration_keyDelete(int dbid, const_sds key) { + if (!bgIteration_iterationActive()) return; + serverAssert(onValkeyMainThread()); + + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "KEYDEL: (%d)%s\n", dbid, key); + } + + dbEntry *de = dbFind(server.db[dbid], (sds)key); + if (de == NULL) return; + + // For consistent iterators, we need to make sure the item gets written before delete + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated || !it->keyset_iter->isKeyInScope(it->keyset_iter, key)) continue; + + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT && + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { + if (!it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) && + !hashtableFind(it->early_iterate_entries, de, NULL)) { + addEarlyIterationKey(it, de, dbid); // (may also add to inUseEntries) + } + } + } + + removePtrFromEarlyIterate(de); + + /* We might be within the context of a command execution. This happens if the key is found to + * be expired when attempting to execute the command. In this case, we should treat the key as + * missing. If the key exists after the command executes, we can treat it like a new key. + * (If not in command execution, this is ok - it's reset at the beginning of command execution.) */ + robj *oKey = createObject(OBJ_STRING, sdsdup(key)); + listAddNodeHead(curCmdMissingKeys, oKey); +} + + +// PUBLIC API +void bgIteration_flushall(void) { + handleFlushdb(-1); +} + + +// PUBLIC API +bool bgIteration_blockClientIfRequired(client *c) { + serverAssert(onValkeyMainThread()); + iteratorReplicationFlagsWereUpdated = false; + if (!bgIteration_iterationActive()) return false; + if (!isWriteCmd(c->cmd)) return false; + + if (BGITERATION_DEBUG) { + sds sdsArgv = createSdsFromClientArgv(c->argc, c->argv); + debugBuffer = sdscatprintf(debugBuffer, "BLCK?: (%d)%s\n", c->db->id, sdsArgv); + sdsfree(sdsArgv); + } + + /* Before executing a command or atomic transaction, the replication flag is cleared for each + * iterator. If it's determined that the command should replicate, the flag will be set + * as the command and keys are examined for expedite. */ + resetReplicationFlagForIterators(c); + iteratorReplicationFlagsWereUpdated = true; + + if (c->cmd->proc == flushdbCommand || c->cmd->proc == flushallCommand) { + // Handle flush commands prior to execution + int flags; + if (getFlushCommandFlags(c, &flags) == C_OK) { + // The command parsed ok - we WILL flush + handleFlushdb((c->cmd->proc == flushdbCommand) ? c->db->id : -1); + } + } + + bool mustBlock = false; + hashtable *waitOnKeys = hashtableCreate(&tempKeysetHashtableType); // set of robj(sds) + listEmpty(curCmdMissingKeys); + + if (c->cmd->proc == execCommand) { + mustBlock = expediteKeysForMultiExec(c, waitOnKeys); + } else { + getKeysResult result; + initGetKeysResult(&result); + int numkeys = getKeysFromCommand(c->cmd, c->argv, c->argc, &result); + keyReference *keyrefs = result.keys; + if (numkeys > 0) { + mustBlock = expediteKeysForWriteOnAllIterators( + c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); + serverAssert(!(mustBlock && (c->flag.multi) && !(c->flag.script))); + + if (mustBlock && (c->flag.script)) { + /* For scripts, we will block for keys declared in EVAL/EVALSHA/FCALL. + * However, scripts are NOT required to declare keys. Even if it declares keys, + * it's not declaring the DB for the key. After a SELECT or SWAPDB, we might be on + * a key we haven't blocked for. In this case, there is no option but to execute a + * synchronous block and wait for the iterator(s) to be done with the key(s). + * (Yuck.) */ + while (mustBlock) { + receiveItemsBackFromIterators(true); // Blocking + hashtableEmpty(waitOnKeys, NULL); + mustBlock = expediteKeysForWriteOnAllIterators( + c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); + } + } + } else { + // WRITE commands with no keys should always be replicated. SWAPDB, FLUSH, FUNCTION, etc. + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + it->cur_cmd_may_replicate = true; + } + } + getKeysFreeResult(&result); + } + + if (mustBlock) { + serverAssert(hashtableSize(waitOnKeys) > 0); + robj **waitKeysArgv = zmalloc(sizeof(robj *) * hashtableSize(waitOnKeys)); + + robj *key; + hashtableIterator hi; + hashtableInitIterator(&hi, waitOnKeys, 0); + unsigned long argvCount = 0; + while (hashtableNext(&hi, (void **)&key)) { + waitKeysArgv[argvCount++] = key; + } + hashtableCleanupIterator(&hi); + serverAssert(argvCount == hashtableSize(waitOnKeys)); + + blockClientInUseOnKeys(c, argvCount, waitKeysArgv); + + zfree(waitKeysArgv); + } + + hashtableRelease(waitOnKeys); + + if (BGITERATION_DEBUG) { + if (mustBlock) debugBuffer = sdscat(debugBuffer, " (blocked)\n"); + } + + return mustBlock; +} + + +// PUBLIC API +void bgIteration_handleCommandReplication(int dbid, + struct serverCommand *cmd, + int argc, + robj **argv) { + if (BGITERATION_DEBUG) { + // DEBUG - enable this to capture replication not queued because iteration is inactive + if (0 && !bgIteration_iterationActive() && (isWriteCmd(cmd) || cmd->proc == multiCommand)) { + sds sdsArgv = createSdsFromClientArgv(argc, argv); + debugBuffer = sdscatprintf(debugBuffer, "REPL? INACT: (%d)%s\n", dbid, sdsArgv); + sdsfree(sdsArgv); + } + } + + if (!bgIteration_iterationActive()) return; + serverAssert(onValkeyMainThread()); + + /* Some commands are replicated which are not writes (like publish) these can be ignored. + * Be careful with MULTI which is not a write command, but must be replicated. */ + if (!isWriteCmd(cmd) && cmd->proc != multiCommand) return; + + if (BGITERATION_DEBUG) { + sds sdsArgv = createSdsFromClientArgv(argc, argv); + debugBuffer = sdscatprintf(debugBuffer, "REPL?: (%d)%s\n", dbid, sdsArgv); + sdsfree(sdsArgv); + } + + if (cmd->proc == swapdbCommand) { + // All iterators and clients must be informed of swapdb + int id1, id2; + // command has been processed, but Valkey allows "swapdb 0 0" (which can be ignored) + if (getParamsForSwapdb(argc, argv, NULL, &id1, &id2)) + handleSwapdb(id1, id2); + } + + /* In the case that a key is touched in a different DB (COPY/MOVE) the key is recorded as + * a "special" key and than handled below. */ + int special_dbid = 0; + sds special_key = NULL; + dbEntry *special_dbEntry = NULL; + if (cmd->proc == moveCommand) { + /* The MOVE command succeeded. However MOVE requires special handling as it creates a new + * key in a different database. We need to make sure that we don't later try to iterate + * on the key as it would be a duplicate key at that point. So, instead, we will mark the + * newly created key as "early iterated". */ + bool success = getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &special_dbid); + serverAssert(success); // the command already succeeded, so this should work! + + robj *oKey = argv[1]; + special_key = (sds)objectGetVal(oKey); + + special_dbEntry = dbFind(server.db[special_dbid], special_key); + } + if (cmd->proc == copyCommand) { + // The COPY command succeeded. However COPY requires special handling (like MOVE). + bool success = getTargetDbIdForCopyCommand(argc, argv, dbid, &special_dbid); + serverAssert(success); // the command already succeeded, so this should work! + + // Find the newly created entry. + robj *oKey = argv[2]; + special_key = (sds)objectGetVal(oKey); + + special_dbEntry = dbFind(server.db[special_dbid], special_key); + } + + /* Implementation note regarding LUA and MULTI: LUA scripts and MULTI-EXEC blocks must be + * treated atomically. We need to ensure that either ALL of the replication (or none of the + * replication) for the atomic operation is processed by the iterator(s). This is handled + * naturally as we can only "complete" the iteration during the feeding process - and feeding + * is only performed when handling timer events (after the LUA/MULTI has completed). */ + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) continue; + + /* For consistent iteration, we only iterate values based on version. But for + * non-consistent iteration, we don't need to explicitly iterate any values newly created + * during the iteration. So we mark them as expedited. We know we have a new key if it + * was missing before the command, and exists now. */ + + if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) { + // Handle the special case of a key moved to a different DB + if (special_dbEntry != NULL) { + if (it->cur_cmd_may_replicate && + !it->keyset_iter->hasPassedItem(it->keyset_iter, special_key, special_dbid)) { + hashtableAdd(it->early_iterate_entries, special_dbEntry); + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(special_dbid, special_dbEntry); + debugBuffer = sdscatprintf(debugBuffer, "EARLY(special): %s\n", entryString); + sdsfree(entryString); + } + } + + /* Note: In the cases where there's a special command, we are copying or moving an + * item to a different DB. In these limited cases, we can only possibly be + * creating a single key. And if we've handled it here, we don't need to + * handle it as a "missing key" below. If we were to try to handle it as a + * standard "missing key", we would get the DBID incorrect. */ + + } else if (listLength(curCmdMissingKeys) > 0) { + listIter missingIt; + listNode *missingNode; + listRewind(curCmdMissingKeys, &missingIt); + while ((missingNode = listNext(&missingIt)) != NULL) { + robj *oKey = listNodeValue(missingNode); + const_sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], (sds)key); + if (de != NULL) { + // It exists now! + if (it->cur_cmd_may_replicate && + !it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) { + /* If the current command is allowed to replicate, and there is a new + * key which we haven't yet reached in iteration, it needs to be added + * to the set of early iterate entries. (We know that it's not already + * in that set because it's a newly created key!) */ + bool wasAdded = hashtableAdd(it->early_iterate_entries, de); + serverAssert(wasAdded); + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, de); + debugBuffer = sdscatprintf(debugBuffer, "EARLY(NEW): %s\n", entryString); + sdsfree(entryString); + } + } + } + } + } + } + + /* Deletes (and unlinks) are special. + * Developer context: For most commands, we call bgIteration_blockClientIfRequired before + * the command and then call bgIteration_handleCommandReplication after the command. While + * the "before" logic is determining the need to block, it can also determine (mostly) the + * need for replication (on each iterator). Doing this all in one place saves us from + * performing some of the same logic twice. When we get to this point in the code, we just + * use the previously determined information regarding replication. This works because + * Valkey is single-threaded and only processes one command at a time. + * + * But deletes (and unlinks) happen multiple ways - and occur outside the normal + * before/after logic for commands. These situations must be handled: + * - A normal (client-driven) DEL/UNLINK command will use the standard before/after + * logic. If the key is in use by bgIteration, the command will be blocked. + * - An EVICTION generates a DEL/UNLINK which happens outside of the context of a client + * issued command. The replication flags on the iterators are stale and relate to the + * prior command executed. + * - An EXPIRATION in the context of a client-driven WRITE command occurs when the client + * command attempts to access a key and it is found to be expired. In this case, the + * client-command has already gone through the blocking process, so it should be OK to + * use it->cmd_may_replicate. + * - An EXPIRATION in the context of a client-driven READ command occurs when the client + * command attempts to access a key and it is found to be expired. In this case, the + * client-command has NOT gone through the blocking process. The replication flags on + * the iterators are stale and relate to the prior (write) command executed. + * - An EXPIRATION outside of a client-driven command occurs due to active expiry. In + * this case, the replication flags on the iterator are stale and relate to the prior + * command executed. + * + * In the case of EXPIRE/EVICT occurring outside the context of a write command, this is + * handled. If the key is in-use by bgIterator, increment of robj's refcount prevents the + * key from deletion. In this case the key will be removed from the main dictionary, but + * held inside bgIteration until no longer needed. + * Even though the entry is not physically deleted yet, it is logically deleted and it is + * safe to replicate the DEL/UNLINK. Since iterators process items FIFO, the replication + * for DEL/UNLINK won't actually get processed until other queued replication is processed. + * + * In the case of a client driven DEL command, the key will have already been deleted when + * we hit this routine. In the case of EXPIRE/EVICT, they propagate happens before the key + * is deleted. So if the key is missing, we can use the cached replication decision. But + * if the key still exists (indicating EXPIRE/EVICT) we evaluate it specially. */ + bool shouldReplicateDelCommand = false; + bool isDelCommand = isDeleteCmd(cmd); + if (isDelCommand) { + sds key = objectGetVal(argv[1]); + if (it->keyset_iter->isKeyInScope(it->keyset_iter, key)) { + bool blockClientIfRequiredWasCalled = (server.in_call > 0); + if (blockClientIfRequiredWasCalled && iteratorReplicationFlagsWereUpdated) { + // Here we know that the DEL is related to the running command + shouldReplicateDelCommand = it->cur_cmd_may_replicate; + } else { + // Otherwise, it's something like active expiration or eviction (unrelated) + dbEntry *de = dbFind(server.db[dbid], key); + if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) || + (de && hashtableFind(it->early_iterate_entries, de, NULL))) { + shouldReplicateDelCommand = true; + } + } + } + } + + bool replicate = (it->iteration_flags & BGITERATOR_FLAG_REPLICATION && + ((!isDelCommand && it->cur_cmd_may_replicate) || shouldReplicateDelCommand)); + + if (replicate) { + /* We will replicate the command in these cases: + * 1) For consistent iteration - it->cur_cmd_may_replicate is always true + * 2) For non-consistent, if any of the keys have been processed, expediteKeysForWrite + * will ensure that ALL of the keys have been expedited - and we should replicate + * 3) For non-consistent, if NONE of the keys have been processed, no need to replicate */ + if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, " (queued)\n"); + } + + bgIteratorItem *item = itemFreeList_getElementOrAllocate(); + item->type = BGITERATOR_ITEM_REPLICATION; + item->dbid = dbid; + item->u.repl.cmd = cmd; + item->u.repl.argv = cloneRobjArray(argc, argv); + item->u.repl.argc = argc; + bufferedReplicationBytes += replicationItemSize(item); + it->replication_queued++; + mutexQueueAdd(it->items_for_iterator, item); + } + } // allIterators loop +} + + +// PUBLIC API +size_t bgIteration_memoryInuseForReplication(void) { + return bufferedReplicationBytes; +} + + +// PUBLIC API +bool bgIteration_isEntryInuse(dbEntry *de) { + serverAssert(onValkeyMainThread()); + if (!bgIteration_iterationActive()) return false; + return isEntryInuseByAnyIterator(de); +} + + +// PUBLIC API +void bgIteration_dbEntryModified(dbEntry *de) { + if (bgIteration_iterationActive()) { + bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(de); + if (md) md->iterator_epoch = bgIteration_epoch; + } +} + + +// PUBLIC API +void bgIteration_keyModified(int dbid, const_sds key) { + if (bgIteration_iterationActive()) { + dbEntry *de = dbFind(server.db[dbid], (sds)key); + if (de) bgIteration_dbEntryModified(de); + } +} + + +// PUBLIC API +void bgIteration_updateDbEntryPtr(dbEntry *old, dbEntry *new) { + if (!bgIteration_iterationActive() || old == new) return; + serverAssert(onValkeyMainThread()); + serverAssert(!isEntryInuseByAnyIterator(old)); + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (hashtableDelete(it->early_iterate_entries, old)) { + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "EARLY LIST UPDATE %p -> %p\n", (void *)old, (void *)new); + } + bool wasAdded = hashtableAdd(it->early_iterate_entries, new); + serverAssert(wasAdded); + } + } +} diff --git a/src/bgiteration.h b/src/bgiteration.h new file mode 100644 index 00000000000..78643311fe0 --- /dev/null +++ b/src/bgiteration.h @@ -0,0 +1,358 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + +#ifndef __BGITERATION_H +#define __BGITERATION_H + +#include +#include "sds.h" + +/* A mechanism for creating iteration clients which iterate over the main dictionary in a + * background thread. + * + * This mechanism passes keys to the iteration client, while blocking the keys from write by the + * Valkey main thread. Once an iteration client is done with a key, it is returned to the Valkey + * main thread and any pending writers are unblocked. + * + * A bgIterator must be created on the main Valkey thread, and then passed to another thread which + * implements the logic of the iteration client. + * + * Iteration clients are expected to read through the keyspace until the iteration is complete or + * terminated. An iteration client may not perform modifications on a key. */ + +/* Avoids dependency on server.h */ +typedef struct serverObject dbEntry; // An object with key/value inserted into main dictionary +typedef struct serverObject robj; // An object with a value used for command parameters +typedef struct client client; + +/* The bgIterator is an opaque structure. */ +typedef struct bgIterator bgIterator; + + +/* Consistency type for iteration. */ +typedef enum { + /* With no consistency requirements, dbEntries are provided to the iteration client as they + * appear at the time of iteration. No replication is provided. The only guarantee is that + * dbEntries which existed at the start of iteration, and remained through the duration of + * iteration, will be provided to the iteration client once (and only once). If a dbEntry is + * modified during iteration, either the old or the new value may be provided. */ + BGITERATOR_CONSISTENCY_NONE = 0, + + /* With consistency at the start of iteration, a point-in-time iteration is performed. The + * iteration client will see all keys AS THEY EXISTED at the time when the iterator was created. + * Note: The DBID provided with the DICTENTRY events is the original DBID (at the time of iteration + * start). SWAPDB events will not be provided. */ + BGITERATOR_CONSISTENCY_START = 1, + + /* With an eventually consistent iteration, dbEntries will be followed by relevant replication. + * This will allow a client to achieve a consistent state at the END of the iteration. Once a + * dbEntry has been provided to the iteration client, any replication related to that entry will + * also be forwarded to the iteration client. With eventual consistency, keys are provided as + * they are at the time of iteration. This mode requires that the iteration client be aware of + * SWAPDB events. If a SWAPDB is performed, the client will receive a SWAPDB event. + * Replication events will be provided ordered and synchronized with any SWAPDB events. */ + BGITERATOR_CONSISTENCY_EVENTUAL = 2 +} bgIteratorConsistency; + + +/* When running an iterator with replication, a replication-done function (callback) may be + * provided. This function will be executed after the last replication item has been fed into the + * queue for the client. This function will be run on the Valkey main thread, and allows a client + * to recognize the point where no additional replication data will be sent for processing. + * + * PRIVDATA: this pointer is for data private to the iteration client. + * + * Returns true when an iterator stops accepting any replication item into the queue for the client. + * If false is returned, replication will continue, and bgiteration will periodically call the callback + * until true is returned. In this context, returning false indicates that the client is not ready to + * stop receiving replication, it is requesting that replication be continued. */ +typedef bool (*bgIteratorReplDoneFunc)(void *privdata); + + +/* When creating a bgIterator, a cleanup function (callback) may be provided. This function will be + * executed once iteration has completed and this will run on the Valkey main thread. + * + * TERMINATED: will be passed as TRUE if the iteration process was terminated early (either by + * the main thread calling bgIteratorTerminate() or the iteration client calling + * bgIteratorClose()). + * PRIVDATA: this pointer is for data private to the iteration client. */ +typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata); + + +/* Create a background full-scan iterator (bgIterator). + * This bgIterator will iterate through the entire keyspace (across all DBs). + * + * NAME: a human readable name for the iterator (must be unique) + * FLAGS: creation flags indicate iteration options + * REPLDONE: if provided, called after the last replication item has been queued (on the Valkey main thread) + * CLEANUP: if provided, called at the end of iteration (on the Valkey main thread) + * PRIVDATA: passed to cleanup function + * + * This method creates and initializes the bgIterator. It does not perform any thread management. + * It is expected that the main Valkey thread will call this method, and then start a new thread to + * to implement the iteration client which will read from the returned bgIterator. + * + * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the + * last item is read. */ +bgIterator *bgIteratorCreateFullScanIter( + const char *name, + bgIteratorConsistency consistency, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata); + + +/* Create a background slots iterator (bgIterator). + * This bgIterator will iterate through the keys belonging to a set of cluster slots. + * + * NAME: a human readable name for the iterator (must be unique) + * FLAGS: creation flags indicate iteration options + * SLOTS: array of cluster slots to iterate over + * SLOTS_COUNT: size of the array of slots + * REPLDONE: if provided, called after the last replication item has been queued (on the Valkey main thread) + * CLEANUP: if provided, called at the end of iteration (on the Valkey main thread) + * PRIVDATA: passed to cleanup function + * + * This method creates and initializes the bgIterator. It does not perform any thread management. + * It is expected that the main Valkey thread will call this method, and then start a new thread to + * to implement the iteration client which will read from the returned bgIterator. + * + * The caller of this function has the ownership of the `slots` array's memory. This function will + * just copy its data and leave the array untouched. + * + * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the + * last item is read. */ +bgIterator *bgIteratorCreateSlotsIter( + const char *name, + bgIteratorConsistency consistency, + const int *slots, + int slots_count, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata); + + +/* Find an existing bgIterator by name. + * Returns NULL if the iterator does not exist (or has completed). */ +bgIterator *bgIteratorFind(const char *name); + + +/* Get the name of an existing iterator. */ +const char *bgIteratorName(bgIterator *iter); + + +/* Struct to retrieve status information for an active iteration client. */ +typedef struct { + unsigned long dbentries_queued; // Cumulative BGITERATOR_ITEM_DBENTRY queued + unsigned long dbentries_processed; // Cumulative BGITERATOR_ITEM_DBENTRY processed + unsigned long replication_queued; // Cumulative BGITERATOR_ITEM_REPLICATION queued + unsigned long replication_processed; // Cumulative BGITERATOR_ITEM_REPLICATION processed + unsigned long swapdb_queued; // Cumulative BGITERATOR_ITEM_SWAPDB queued + unsigned long swapdb_processed; // Cumulative BGITERATOR_ITEM_SWAPDB processed + unsigned long flushdb_queued; // Cumulative BGITERATOR_ITEM_FLUSHDB queued + unsigned long flushdb_processed; // Cumulative BGITERATOR_ITEM_FLUSHDB processed + unsigned long dbentry_clones_queued; // A subset of dbentries_queued for cloned entries + unsigned long dbentry_clones_processed; // A subset of dbentries_processed for cloned entries + unsigned long queue_length; // Current length of queue to iteration client + unsigned long queue_length_target; // Dynamic target length for queue to iteration client + unsigned long runtime_ms; // Time, in milliseconds, that iterator has been running + unsigned long current_item_ms; // Time, in milliseconds, spent processing current item +} bgIteratorStatus; + + +/* Get the status of a background iteration. + * + * The caller-provided bgIteratorStatus will be populated. */ +void bgIteratorGetStatus(bgIterator *iter, bgIteratorStatus *status); + + +/* Terminate a background iteration. + * + * An iteration is terminated by the Valkey main thread. It is expected that the iteration client + * will continue to read, receiving BGITERATOR_ITEM_TERMINATED or BGITERATOR_ITEM_COMPLETE to + * complete the iteration. (This is necessary to ensure proper cleanup.) + * NOTE: If the iteration client wants to terminate iteration, it may call bgIteratorClose(). */ +void bgIteratorTerminate(bgIterator *iter); + + +/* Check if an iterator is being terminated. + * + * This checks if the iterator is in the process of terminating. For the Valkey main thread, this + * can be used to determine if a call has already been made to bgIteratorTerminate. For an + * iteration client, it normally learns about terminate by reading the next item, this allows + * out-of-band detection of termination which can be useful when processing a large key. */ +bool bgIteratorIsTerminating(bgIterator *iter); + + +typedef enum { + /* Indicates that the iteration has completed normally. No more items to read. + * If replication is enabled, on completion, the final replication offset is recorded in + * 'u.master_repl_offset' and 'dbid' is set to the selected replication db. The iteration + * client will have received all *applicable* replication data to this point. */ + BGITERATOR_ITEM_COMPLETE = 1, + + /* Indicates that the iteration has been terminated before completion. No more items to read.*/ + BGITERATOR_ITEM_TERMINATED, + + /* A dbEntry for DB=dbid. + * NOTE: The dbEntry MAY be expired. It is up to the client to decide how to handle + * expired entries. */ + BGITERATOR_ITEM_DBENTRY, + + /* A replication command for DB=dbid. cmd, argv, & argc provided. + * NOTE: The command may have been re-written before replication. */ + BGITERATOR_ITEM_REPLICATION, + + /* A SWAPDB event. dbid swapped with dbid2. + * Note that SWAPDB events are not provided during consistent iteration. */ + BGITERATOR_ITEM_SWAPDB, + + /* A FLUSHDB event. In most cases, iteration will be terminated, and this event will NOT be + * sent. However, in the case of a single minor DB being flushed, non-consistent iteration is + * permitted to continue. */ + BGITERATOR_ITEM_FLUSHDB +} bgIteratorItemType; + + +typedef struct { + dbEntry *de; + bool is_cloned; + bool is_rehashing_paused; +} dbEntryData; + +typedef struct { + struct serverCommand *cmd; + robj **argv; + int argc; +} replicationData; + +typedef struct { + bgIteratorItemType type; + int dbid; // orig DB ID for CONSISTENT, queue-time DB ID for !CONSISTENT. + union { + dbEntryData dbe; // for BGITERATOR_ITEM_DBENTRY + replicationData repl; // for BGITERATOR_ITEM_REPLICATION + long long master_repl_offset; // for BGITERATOR_ITEM_COMPLETE + int dbid2; // for BGITERATOR_ITEM_SWAPDB + } u; +} bgIteratorItem; + + +/* Read the next bgIteratorItem from the bgIterator. + * + * The iteration client is expected to call this function in a loop. After reading + * BGITERATOR_ITEM_COMPLETE or BGITERATOR_ITEM_TERMINATED, the iteration client must call + * bgIteratorClose to finalize the iteration process. + * + * This is a blocking call. If the main Valkey thread has been too busy to send items to the + * iterator, the iteration client's queue may run dry and this call will block until data is + * available. + * + * NOTE: Reading an item returns previously read items to Valkey. It is unsafe to reference an item + * previously read. + * + * (All memory management is the responsibility of the bgIterator - not the reader.) */ +bgIteratorItem *bgIteratorRead(bgIterator *iter); + + +/* Close the bgIterator, allowing the bgIterator to be deallocated. + * + * This must be called by an iteration client to release the bgIterator. + * + * It is required that this is called after receiving BGITERATOR_ITEM_COMPLETE or + * BGITERATOR_ITEM_TERMINATED and signals that the background activity is complete. + * + * This may also be called by the iteration client to force terminate an iteration early. The + * bgIterator will be marked as terminated. */ +void bgIteratorClose(bgIterator *iter); + + +/******************************************************************************************** + * BGITERATION HOOKS REQUIRED TO SUPPORT ITERATION - CALLS INSERTED INTO MAIN VALKEY CODE + ********************************************************************************************/ + +#define BGITERATION_ENTRY_METADATA_SIZE 4 + +/* Must be called once (and only once) at server startup. */ +void bgIteration_init(void); + + +/* Returns true if any iterators are currently active. */ +bool bgIteration_iterationActive(void); + + +/* Notify bgIteration that a key is being deleted. In Valkey, key deletion can occur in a READ + * command if the key is expired. Note that this notification is more about status than memory. + * Since the dbEntry is a reference counted object, the dbEntry can't be physically deleted if + * bgIteration is still actively using it. */ +void bgIteration_keyDelete(int dbid, const_sds key); + + +/* Iteration needs to know if a FLUSHALL is being performed. For normal clients, this comes through + * the standard "blockClientIfRequired" interface. This interface is for cases where Valkey + * performs the FLUSHALL operation independently of clients (e.g. when syncing with master). */ +void bgIteration_flushall(void); + + +/* Updating value or expiration of an existing key may lead to reallocation of the dbEntry (robj). + * BgIteration keeps track of expedited keys (by pointer) to avoid repeated iteration. BgIteration + * must be notified when dbEntries are reallocated. BgIteration will not dereference the pointers; + * it is safe to have deallocated the old dbEntry before calling this function. + * + * We can't update the dbEntry if the entry is actually in use (bgIteration_isEntryInuse)! + * + * To simplify calling code, this function does nothing if old_entry == new_entry. */ +void bgIteration_updateDbEntryPtr(dbEntry *old_entry, dbEntry *new_entry); + + +/* Before executing any command, the Valkey main thread must call this function. If the key(s) are + * blocked for writes by an iterator, the function returns true and the client is blocked. A + * blocked client will be unblocked once the key becomes available for write. + * + * This should be called for all commands - even commands which are executed as part of a MULTI/EXEC + * or LUA script. + * + * For MULTI/EXEC - This function is called when hitting the EXEC - after all of the commands + * have been queued. This may block the EXEC, but will NOT block individual + * commands as they are executed in the MULTI/EXEC block. + * + * For LUA script - This function is first called for EVAL/EVALSHA. It may block the script while + * waiting on declared keys. However, if the script accesses undeclared keys or + * performs SWAPDB, a synchronous block may be performed (returning false) on + * individual commands within the script. + * + * Note: this function should be called for all commands (not just writes). */ +bool bgIteration_blockClientIfRequired(client *c); + + +/* After execution of a write command, the Valkey main thread must provide the command to iterators + * which are interested in the replication feed. It is required that all commands have been passed + * through bgIteration_blockClientIfRequired(), however, it is permitted that the command can be + * re-written for propagation. */ +void bgIteration_handleCommandReplication( + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv); + + +/* The memory that bgIteration uses while temporarily buffering replication data is not included in + * the maxmemory computation used for eviction. This function provides insight into the current + * amount of memory used for buffered replication data. */ +size_t bgIteration_memoryInuseForReplication(void); + + +/* Check if a dbEntry is currently in-use/locked by bgIteration. */ +bool bgIteration_isEntryInuse(dbEntry *de); + + +/* Notify bgIteration that a dbEntry has been added/modified. + * - If caller has a dbEntry*, dbEntryModified is more efficient + * - If caller has a dbid/key, a lookup is performed to find the dbEntry */ +void bgIteration_dbEntryModified(dbEntry *de); +void bgIteration_keyModified(int dbid, const_sds key); + +#endif diff --git a/src/db.c b/src/db.c index ed906f22c4e..4b6f3c11a39 100644 --- a/src/db.c +++ b/src/db.c @@ -37,6 +37,7 @@ #include "module.h" #include "vector.h" #include "expire.h" +#include "bgiteration.h" /*----------------------------------------------------------------------------- * C-level DB API @@ -361,6 +362,7 @@ static void dbSetValue(serverDb *db, robj *key, robj **valref, int overwrite, vo val->lru = old->lru; long long expire = objectGetExpire(old); new = objectSetKeyAndExpire(val, objectGetVal(key), expire); + bgIteration_updateDbEntryPtr(old, new); *oldref = new; /* Replace the old value at its location in the expire space. */ if (expire >= 0) { @@ -430,6 +432,7 @@ void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags) { } else { dbSetValue(db, key, valref, 1, NULL); } + bgIteration_dbEntryModified(*valref); if (!(flags & SETKEY_KEEPTTL)) removeExpire(db, key); if (!(flags & SETKEY_NO_SIGNAL)) signalModifiedKey(c, db, key); } @@ -475,6 +478,8 @@ int dbGenericDeleteWithDictIndex(serverDb *db, robj *key, int async, int flags, hashtablePosition pos; void **ref = kvstoreHashtableTwoPhasePopFindRef(db->keys, dict_index, objectGetVal(key), &pos); if (ref != NULL) { + bgIteration_keyDelete(db->id, (sds)objectGetVal(key)); + robj *val = *ref; /* VM_StringDMA may call dbUnshareStringValue which may free val, so we * need to incr to retain val */ @@ -662,6 +667,9 @@ long long emptyData(int dbnum, int flags, void(callback)(hashtable *)) { return -1; } + /* bgIteration must be notified for flushall. */ + if (dbnum == -1) bgIteration_flushall(); + /* Fire the flushdb modules event. */ moduleFireServerEvent(VALKEYMODULE_EVENT_FLUSHDB, VALKEYMODULE_SUBEVENT_FLUSHDB_START, &fi); @@ -753,6 +761,7 @@ long long dbTotalServerKeyCount(void) { void signalModifiedKey(client *c, serverDb *db, robj *key) { touchWatchedKey(db, key); trackingInvalidateKey(c, key, 1); + bgIteration_keyModified(db->id, objectGetVal(key)); } void signalFlushedDb(int dbid, int async) { @@ -2257,7 +2266,7 @@ robj *dbFindExpires(serverDb *db, sds key) { } unsigned long long dbSize(serverDb *db) { - return kvstoreSize(db->keys); + return (db->keys) ? kvstoreSize(db->keys) : 0; } unsigned long long dbScan(serverDb *db, unsigned long long cursor, kvstoreScanFunction scan_cb, void *privdata) { diff --git a/src/defrag.c b/src/defrag.c index 670f83bee73..e6ecad0227e 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -43,6 +43,7 @@ #include "eval.h" #include "script.h" #include "module.h" +#include "bgiteration.h" #include #include @@ -708,6 +709,8 @@ static void defragKey(defragKeysCtx *ctx, robj **elemref) { unsigned char *newzl; ob = *elemref; + if (bgIteration_isEntryInuse(ob)) return; + /* Try to defrag robj and/or string value. */ if ((newob = activeDefragStringOb(ob))) { *elemref = newob; @@ -815,6 +818,11 @@ static void defragPubsubScanCallback(void *privdata, void *elemref) { * and 1 if time is up and more work is needed. */ static int defragLaterItem(robj *ob, unsigned long *cursor, monotime endtime, int dbid) { if (ob) { + if (bgIteration_isEntryInuse(ob)) { + *cursor = 0; + return 0; + } + if (ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST) { return scanLaterList(ob, cursor, endtime); } else if (ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HASHTABLE) { diff --git a/src/expire.c b/src/expire.c index b31f57465cc..efa027dad8c 100644 --- a/src/expire.c +++ b/src/expire.c @@ -39,6 +39,7 @@ #include "cluster.h" #include "cluster_migrateslots.h" #include "util.h" +#include "bgiteration.h" /*----------------------------------------------------------------------------- * Incremental collection of expired keys. @@ -167,13 +168,18 @@ void fieldExpireScanCallback(void *privdata, void *volaKey, int didx) { robj *o = volaKey; serverAssert(o); serverAssert(hashTypeHasVolatileFields(o)); + + data->has_more_expired_entries = false; + data->sampled++; + + if (bgIteration_isEntryInuse(o)) return; + mstime_t now = server.mstime; size_t expired_fields = dbReclaimExpiredFields(o, data->db, now, data->max_entries, didx); if (expired_fields) { data->has_more_expired_entries = (expired_fields == data->max_entries); data->expired++; } - data->sampled++; } static int expireShouldSkipTableForSamplingCb(hashtable *ht) { diff --git a/src/hashtable.c b/src/hashtable.c index dcae6dfa014..940adcc8c01 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -344,7 +344,7 @@ typedef struct { } position; static_assert(sizeof(hashtablePosition) >= sizeof(position), - "Opaque iterator size"); + "Opaque position size"); /* State for incremental find. */ typedef struct { @@ -1377,13 +1377,13 @@ void hashtableResumeAutoShrink(hashtable *ht) { * spaces, "holes", in the bucket chains, which wastes memory. Additionally, we * pause auto shrink when rehashing is paused, meaning the hashtable will not * shrink the bucket count. */ -static void hashtablePauseRehashing(hashtable *ht) { +void hashtablePauseRehashing(hashtable *ht) { ht->pause_rehash++; hashtablePauseAutoShrink(ht); } /* Resumes incremental rehashing, after pausing it. */ -static void hashtableResumeRehashing(hashtable *ht) { +void hashtableResumeRehashing(hashtable *ht) { ht->pause_rehash--; assert(ht->pause_rehash >= 0); hashtableResumeAutoShrink(ht); @@ -2562,3 +2562,19 @@ int hashtableLongestBucketChain(hashtable *ht) { } return maxlen; } + +// Temporary, waiting on PR #3803 +bool hashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor) { + if (cursor == 0) return false; + if (hashtableSize(ht) == 0) return true; + + /* The scan visits buckets in reverse-binary order based on the smallest + * table. During rehashing, a small-table bucket and its corresponding + * large-table buckets are processed together, so the small-table mask + * determines ordering in both cases. */ + int exp = ht->bucket_exp[0]; + if (hashtableIsRehashing(ht) && ht->bucket_exp[1] < exp) exp = ht->bucket_exp[1]; + size_t mask = expToMask(exp); + size_t bucket_idx = hashKey(ht, key) & mask; + return rev(bucket_idx) < rev(cursor & mask); +} diff --git a/src/hashtable.h b/src/hashtable.h index 8bbf5d8c05b..289bc183db1 100644 --- a/src/hashtable.h +++ b/src/hashtable.h @@ -129,6 +129,8 @@ size_t hashtableMemUsage(const hashtable *ht); void hashtablePauseAutoShrink(hashtable *ht); void hashtableResumeAutoShrink(hashtable *ht); bool hashtableIsRehashing(hashtable *ht); +void hashtablePauseRehashing(hashtable *ht); +void hashtableResumeRehashing(hashtable *ht); bool hashtableIsRehashingPaused(hashtable *ht); ssize_t hashtableGetRehashingIndex(hashtable *ht); void hashtableRehashingInfo(hashtable *ht, size_t *from_size, size_t *to_size); @@ -161,6 +163,7 @@ bool hashtableIncrementalFindGetResult(hashtableIncrementalFindState *state, voi /* Iteration & scan */ size_t hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata, void *(*defragfn)(void *), int flags); +bool hashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor); void hashtableInitIterator(hashtableIterator *iter, hashtable *ht, uint8_t flags); void hashtableRetargetIterator(hashtableIterator *iterator, hashtable *ht); void hashtableCleanupIterator(hashtableIterator *iter); diff --git a/src/object.c b/src/object.c index 21eb57e5cbd..f4545cf8025 100644 --- a/src/object.c +++ b/src/object.c @@ -38,6 +38,7 @@ #include "zmalloc.h" #include "sds.h" #include "module.h" +#include "bgiteration.h" #include #include @@ -340,7 +341,7 @@ robj *createStringObjectFromSds(const_sds s) { return createStringObject(s, sdslen(s)); } -static robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const_sds key, long long expire) { +robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const_sds key, long long expire) { if (shouldEmbedStringObject(len, key, expire)) { return createEmbeddedStringObjectWithKeyAndExpire(ptr, len, key, expire); } else { @@ -447,6 +448,7 @@ void objectUnembedVal(robj *o) { robj *objectSetKeyAndExpire(robj *o, const_sds key, long long expire) { if (o->type == OBJ_STRING && o->encoding == OBJ_ENCODING_EMBSTR) { robj *new = createStringObjectWithKeyAndExpire(objectGetVal(o), sdslen(objectGetVal(o)), key, expire); + bgIteration_updateDbEntryPtr(o, new); new->lru = o->lru; decrRefCount(o); return new; @@ -471,6 +473,7 @@ robj *objectSetKeyAndExpire(robj *o, const_sds key, long long expire) { serverPanic("Not implemented"); } robj *new = createUnembeddedObjectWithKeyAndExpire(o->type, ptr, key, expire); + bgIteration_updateDbEntryPtr(o, new); new->encoding = o->encoding; new->lru = o->lru; decrRefCount(o); diff --git a/src/server.c b/src/server.c index 4eb7798a924..ff854b72873 100644 --- a/src/server.c +++ b/src/server.c @@ -54,6 +54,7 @@ #include "util.h" #include "eval.h" +#include "bgiteration.h" #include "trace/trace_commands.h" @@ -2052,7 +2053,10 @@ void beforeSleep(struct aeEventLoop *eventLoop) { /* Before we are going to sleep, let the threads access the dataset by * releasing the GIL. The server main thread will not touch anything at this * time. */ - if (moduleCount()) moduleReleaseGIL(); + if (moduleCount()) { + atomic_store_explicit(&server.module_gil_acquired, 0, memory_order_relaxed); + moduleReleaseGIL(); + } /********************* WARNING ******************** * Do NOT add anything below moduleReleaseGIL !!! * ***************************** ********************/ @@ -2074,6 +2078,7 @@ void afterSleep(struct aeEventLoop *eventLoop, int numevents) { atomic_store_explicit(&server.module_gil_acquiring, 1, memory_order_relaxed); moduleAcquireGIL(); atomic_store_explicit(&server.module_gil_acquiring, 0, memory_order_relaxed); + atomic_store_explicit(&server.module_gil_acquired, 1, memory_order_relaxed); moduleFireServerEvent(VALKEYMODULE_EVENT_EVENTLOOP, VALKEYMODULE_SUBEVENT_EVENTLOOP_AFTER_SLEEP, NULL); latencyEndMonitor(latency); latencyAddSampleIfNeeded("module-acquire-GIL", latency); @@ -3018,8 +3023,11 @@ void initServer(void) { /* Set object metadata size before creating any database key objects */ if (server.forkless_options_supported) { - objectSetMetadataSize(sizeof(uint32_t)); /* This is a placeholder until Threadsave defines a metadata structure */ - /* 4 bytes for iterator_epoch for now*/ + /* NOTE: At this time, there is only one reason for dbEntry metadata: bgIteration. However, + * if/when new metadata options are added, we will need to compute the size of a variable + * size metadata, and provide appropriate accessors to access the specific portion of the + * metadata (each of which may/may not exist, based on immutable startup parameters). */ + objectSetMetadataSize(BGITERATION_ENTRY_METADATA_SIZE); } createDatabaseIfNeeded(0); /* The default database should always exist */ @@ -3036,6 +3044,7 @@ void initServer(void) { server.watching_clients = 0; server.cronloops = 0; server.in_exec = 0; + server.in_call = 0; server.busy_module_yield_flags = BUSY_MODULE_YIELD_NONE; server.busy_module_yield_reply = NULL; server.client_pause_in_transaction = 0; @@ -3141,6 +3150,7 @@ void initServer(void) { commandlogInit(); latencyMonitorInit(); initSharedQueryBuf(); + bgIteration_init(); /* Initialize ACL default password if it exists */ ACLUpdateDefaultUserPassword(server.requirepass); @@ -3702,6 +3712,58 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot) if (propagate_to_slot_migration) clusterFeedSlotExportJobs(dbid, argv, argc, slot); } +/* BgIteration requires that replicaton is sent after each command, however the + * alsoPropagate mechanism queues replication until the end of the transaction + * (when propagatePendingCommands is invoked). Also, the propagation mechanism + * strips out multi/exec, adding them back during propagatePendingCommands (if + * necessary). This function ensures that replication, including multi/exec are + * sequenced with the commands for bgIteration. + * + * Called from alsoPropagate with regular params. + * Called from propagatePendingCommands with dbid = -1 (to close multi/exec). */ +static void propagateToBgIteration(int dbid, int argc, robj **argv, int target) { + /* STATIC indicates that we have sent the MULTI, and need to match it with + * an EXEC during propagatePendingCommands. */ + static bool sentMultiToBgIterator = false; + /* STATIC indicates that last DBID that was sent, so that we can use the + * same DBID when sending a generated EXEC. */ + static int lastDbidSentToBgIterator; + + if (dbid >= 0) { + // Called from alsoPropagate() to replicate a command + if (target & PROPAGATE_REPL && bgIteration_iterationActive()) { + if (!sentMultiToBgIterator && (scriptIsRunning() || server.in_exec)) { + /* For a script or multi/exec, we should be sending the MULTI at + * the beginning of the execution unit. There shouldn't be any + * commands in the propagation queue yet. */ + serverAssert(server.also_propagate.numops == 0); + /* If this is the first propagated command of a script or multi, + * make it a transaction. It may turn out that there is only 1 + * command in the MULTI block, but we can't know that now. + * Unlike regular replication, we can't defer all of the + * replication until we know for sure. We must call bgIteration + * after each command. */ + static struct serverCommand *cmd_multi = NULL; // STATIC + if (cmd_multi == NULL) cmd_multi = lookupCommandOrOriginal(&shared.multi, 1); + bgIteration_handleCommandReplication(dbid, cmd_multi, 1, &shared.multi); + sentMultiToBgIterator = true; + } + struct serverCommand *cmd = lookupCommandOrOriginal(argv, argc); + bgIteration_handleCommandReplication(dbid, cmd, argc, argv); + lastDbidSentToBgIterator = dbid; + } + } else { + // Called from propagatePendingCommands() to finalize a transaction + if (sentMultiToBgIterator) { + // If a MULTI was sent to bgIterator via alsoPropagate(), then send the matching EXEC. + static struct serverCommand *cmd_exec = NULL; // STATIC + if (cmd_exec == NULL) cmd_exec = lookupCommandOrOriginal(&shared.exec, 1); + bgIteration_handleCommandReplication(lastDbidSentToBgIterator, cmd_exec, 1, &shared.exec); + sentMultiToBgIterator = false; + } + } +} + /* Used inside commands to schedule the propagation of additional commands * after the current command is propagated to AOF / Replication. * @@ -3714,6 +3776,8 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot) * stack allocated). The function automatically increments ref count of * passed objects, so the caller does not need to. */ void alsoPropagate(int dbid, robj **argv, int argc, int target, int slot) { + propagateToBgIteration(dbid, argc, argv, target); + robj **argvcopy; int j; @@ -3780,6 +3844,12 @@ void updateCommandLatencyHistogram(struct hdr_histogram **latency_histogram, int * multiple separated commands. Note that alsoPropagate() is not affected * by CLIENT_PREVENT_PROP flag. */ static void propagatePendingCommands(void) { + /* This is done before the check on server.also_propagate.numops. Numops + * might be zero if there is no replica but we might be running bgIteration + * for something other than replication. If we sent the multi (to + * bgIteration), we need to send the matching exec. */ + propagateToBgIteration(-1, 0, NULL, 0); + if (server.also_propagate.numops == 0) return; int j; @@ -3909,6 +3979,10 @@ int incrCommandStatsOnError(struct serverCommand *cmd, int flags) { * */ void call(client *c, int flags) { + if (bgIteration_blockClientIfRequired(c)) return; + + server.in_call++; + long long dirty; struct ClientFlags client_old_flags = c->flag; @@ -4123,6 +4197,7 @@ void call(client *c, int flags) { } server.executing_client = prev_client; + server.in_call--; } /* Used when a command that is ready for execution needs to be rejected, due to diff --git a/src/server.h b/src/server.h index 51db9a38baa..9cf59dfae0a 100644 --- a/src/server.h +++ b/src/server.h @@ -103,7 +103,19 @@ static_assert(sizeof(off_t) >= 8, "off_t must be 64-bit; ensure _FILE_OFFSET_BIT #define dismissMemory zmadvise_dontneed #define VALKEYMODULE_CORE 1 -typedef struct serverObject robj; + +/* serverObject (aka robj) is currently overloaded for 2 purposes. This is a legacy artifact. + * 1. It's carries a reference counted STRING (a keyless value) during parsing and command execution. + * 2. It's also used to carry a key/value pair which is inserted into the DB. In this form, the + * value is not limited to being a string. + * + * The typedef "dbEntry" is used to explicitly connote the latter form. It indicates a key/value + * pair which is suitable to exist in the DB. It might be active in the DB, or may be unlinked from + * the DB (but still contains a key/value). The value may be any of the Valkey data types/encodings. + */ +typedef struct serverObject robj; // A keyless string OR a key/value pair +typedef struct serverObject dbEntry; // Explicitly a key/value pair + #include "valkeymodule.h" /* Modules API defines. */ /* Following includes allow test functions to be called from main() */ @@ -1767,6 +1779,7 @@ struct valkeyServer { size_t initial_memory_usage; /* Bytes used after initialization. */ int always_show_logo; /* Show logo even for non-stdout logging. */ int in_exec; /* Are we inside EXEC? */ + int in_call; /* Nesting level within the call() function. */ int busy_module_yield_flags; /* Are we inside a busy module? (triggered by RM_Yield). see BUSY_MODULE_YIELD_ flags. */ const char *busy_module_yield_reply; /* When non-null, we are inside RM_Yield. */ char *ignore_warnings; /* Config: warnings that should be ignored. */ @@ -1785,6 +1798,7 @@ struct valkeyServer { pid_t child_pid; /* PID of current child */ int child_type; /* Type of current child */ _Atomic(int) module_gil_acquiring; /* Indicates whether the GIL is being acquiring by the main thread. */ + _Atomic(int) module_gil_acquired; /* Indicates if the main thread has the GIL acquired. */ /* Networking */ int port; /* TCP listening port */ int tls_port; /* TLS listening port */ diff --git a/src/unit/custom_matchers.hpp b/src/unit/custom_matchers.hpp index 2d9c8193d29..edc83bf33ea 100644 --- a/src/unit/custom_matchers.hpp +++ b/src/unit/custom_matchers.hpp @@ -14,7 +14,11 @@ MATCHER_P(robjEqualsStr, str, "robj string matcher") { assert(arg->type == OBJ_STRING); assert(sdsEncodedObject(arg)); - return strcmp(static_cast(objectGetVal(arg)), str) == 0; + + if (strcmp(static_cast(objectGetVal(arg)), str) == 0) return true; + + *result_listener << "robj(\"" << (char *)objectGetVal(arg) << "\") doesn't match \"" << str << "\""; + return false; } #endif // _CUSTOM_MATCHERS_HPP_ diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp new file mode 100644 index 00000000000..1faff9e7c20 --- /dev/null +++ b/src/unit/test_bgiteration.cpp @@ -0,0 +1,3051 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + +#include "generated_wrappers.hpp" +#include + +using namespace ::testing; + +extern "C" { +#include "bgiteration.h" +#include "server.h" +#include "stdlib.h" +#define using usingvar // compile hack +#include "module.h" // uses "using" keyword +#undef using +extern hashtableType commandSetType; +extern dictType keylistDictType; +void bgIteration_feedIterators(void); +void createSharedObjects(void); +void hashtableDump(hashtable *ht); +void bgIteration_unitTestDisableCloning(void); +void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes); +static size_t mockHashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); +size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid); +} + + +// The private data is a pointer to arbitrary data. This value is used just to +// test that the correct value is passed through. +#define PRIVDATA reinterpret_cast(12345) + +typedef int32_t bgIterationEntryMetadata; // opaque 4 bytes +static_assert(sizeof(bgIterationEntryMetadata) == BGITERATION_ENTRY_METADATA_SIZE); + +// A bgIteration cleanup function used for testing. +static int cleanupCount; +static bool cleanupTerminated; +static void iteratorCleanupFn(bool terminated, void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + cleanupCount++; + cleanupTerminated = terminated; +} + +// A bgIteration repldone function used for testing. +static int replDoneConfirmed; +static bool iteratorRepldoneFn(void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + replDoneConfirmed++; + return true; +} + +// A more complicated repldone function that can delay the replcation done condition. +static int replDoneRejected; +static bool iteratorRepldoneFnNotBeingReadyInitially(void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + // This is to test the behavior when Repl Done function is not ready to be executed. + if (replDoneRejected == 0) { + replDoneRejected++; + return false; + } + replDoneConfirmed++; + return true; +} + + +/* This mock for hashtableScan will return the items in lexical order. It assumes that the entries + * are robjs containing an sds string for the key. The key is expected to begin with a capital + * letter [A-Z]. The caller passes 0 as the cursor to start the iteration. The returned cursor + * value will indicate the prior letter returned (1=A, ...). After entries starting with 'Z' have + * been returned, the cursor of 0 will indicate that the scan is complete. Note that all entries + * starting with the same letter will be returned in a single call. */ +static size_t mockHashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata) { + // Just in case, if it's not one of our hashtables, use the unmocked function + bool our_ht = (server.db[0]->keys && ht == kvstoreGetHashtable(server.db[0]->keys, 0)) || + (server.db[1]->keys && ht == kvstoreGetHashtable(server.db[1]->keys, 0)); + if (!our_ht) return __real_hashtableScan(ht, cursor, fn, privdata); + + // Collect all entries from the hashtable + std::vector entries; + hashtableIterator *iter = hashtableCreateIterator(ht, 0); + dbEntry *entry; + while (hashtableNext(iter, (void **)&entry)) { + char first = objectGetKey(entry)[0]; + assert(first >= 'A' && first <= 'Z'); + entries.push_back(entry); + } + hashtableReleaseIterator(iter); + + // Sort by key lexicographically + std::sort(entries.begin(), entries.end(), [](dbEntry *a, dbEntry *b) { + return strcmp(objectGetKey(a), objectGetKey(b)) < 0; + }); + + // cursor 0 means start at 'A', otherwise start after the cursor letter + char startLetter = (char)('A' + cursor); + + // Find the first letter to emit + char emitLetter = 0; + for (dbEntry *e : entries) { + char first = objectGetKey(e)[0]; + if (first >= startLetter) { + emitLetter = first; + break; + } + } + + if (emitLetter == 0) return 0; + + // Call fn for all entries starting with emitLetter + for (dbEntry *e : entries) { + char first = objectGetKey(e)[0]; + if (first == emitLetter) fn(privdata, (void *)e); + } + + size_t nextCursor = (size_t)(emitLetter - 'A' + 1); + return (nextCursor > 25) ? 0 : nextCursor; +} + + +static bool mockHashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor) { + // Just in case, if it's not one of our hashtables, use the unmocked function + if (ht != kvstoreGetHashtable(server.db[0]->keys, 0) && + ht != kvstoreGetHashtable(server.db[1]->keys, 0)) + return __real_hashtableScanHasPassedKey(ht, key, cursor); + + return ((const char *)key)[0] < (char)('A' + cursor); +} + + +static const char *logfile = ""; + +/* Most of the bgIteration unit tests are based on a CMD instance with 2 DBs. There are 8 keys in + * each DB. The hashtableScan function is mocked to return the keys in a predictable order. + * + * There are a number of helper functions to simulate certain key modification actions within our + * test configuration. Note that this is isolated from the actual call to processCommand. + * + * Because most of bgIteration is based on an ordered processing of keys, it doesn't matter if we + * are simulating CMD or CME, full scan, or slot-based. The majority of tests are independent of + * these concerns. + * + * However, there are some tests which are are unique to these configurations and use a specialized + * derived class to handle the differences. We do not want to duplicate all of the tests for + * the different configurations, but we do want to ensure that each configuration works properly. + * - bgIterationTestCluster - handles tests unique to full scan in cluster mode + * - bgIterationTestClusterSlots - handles tests unique to cluster slot-based iteration */ +class BgIterationTest : public ::testing::Test { + protected: + static const int DB_COUNT = 2; + static const int ITEMS_PER_DB = 8; + + private: + /* With the mock hashtableScan, we get keys in a predictable order. DB0 works with buckets + * containing groups of keys (which hashtableScan returns in a single call). DB1 returns + * each key individually, as more separate buckets. Convention (for test readability) is + * that keys beginning [A-M] would be in DB0 and keys beginning [N-Z] in DB1. Letters are + * intentionally skipped to allow for possible insertions. */ + const char *keys[DB_COUNT][ITEMS_PER_DB] = {{"B0", "B1", "B2", "E0", "E1", "H0", "H1", "H2"}, + {"N0", "O0", "Q0", "R0", "T0", "U0", "W0", "Y0"}}; + + protected: + static const int TOTAL_ITEMS = DB_COUNT * ITEMS_PER_DB; + static const int LAST_ITEM = TOTAL_ITEMS - 1; + + MockValkey mock; + RealValkey real; + client *c = nullptr; // for general use in the tests (with common cleanup) + robj **orig_argv = nullptr; // Used when simulating multi + int orig_argc = 0; // Used when simulating multi + + + struct serverCommand dummy_cmd = {0}; + + // Helper functions for accessing the keys. We can access by db(0..1) and seq(0..7) + // or by item number (0..15). + // NOTE: These virtual functions can be overridden in subclasses which may have different item layout. + virtual const char *getKeyAtDbSeq(int db, int seq) { + assert(db < DB_COUNT); + assert(seq < ITEMS_PER_DB); + return keys[db][seq]; + } + + virtual int getDbFromItemNum(int itemNum) { + assert(itemNum < DB_COUNT * ITEMS_PER_DB); + return itemNum / ITEMS_PER_DB; + } + + virtual int getSeqFromItemNum(int itemNum) { + assert(itemNum < DB_COUNT * ITEMS_PER_DB); + return itemNum % ITEMS_PER_DB; + } + + const char *keyStr(int itemNum) { + return getKeyAtDbSeq(getDbFromItemNum(itemNum), getSeqFromItemNum(itemNum)); + } + + int itemNumFromKey(const char *key) { + for (int itemNum = 0; itemNum < DB_COUNT * ITEMS_PER_DB; itemNum++) { + if (strcmp(key, keyStr(itemNum)) == 0) return itemNum; + } + return -1; + } + + + // Do some general initialization before starting the suite. Normally, the tests are run in + // isolation - and this isn't much different than SetUp(). But if running the + // entire test suite together (just manually running the test executable), this gets called + // only once. + static void SetUpTestSuite() { + monotonicInit(); + + bzero(&server, sizeof(server)); + server.hz = 100; + server.logfile = const_cast(logfile); + createSharedObjects(); + + moduleInitModulesSystem(); + + server.commands = hashtableCreate(&commandSetType); + server.orig_commands = hashtableCreate(&commandSetType); + populateCommandTable(); + } + + + static void TearDownTestSuite() { + hashtableRelease(server.commands); + hashtableRelease(server.orig_commands); + } + + + void initializeServerDb(int dbid, int slot_count_bits = 0) { + server.db[dbid] = static_cast(zcalloc(sizeof(serverDb))); + server.db[dbid]->id = dbid; + server.db[dbid]->keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, 0); + server.db[dbid]->expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, 0); + server.db[dbid]->watched_keys = dictCreate(&keylistDictType); + } + + + robj *createStringObjectFromCString(const char *s) { + return createStringObject(s, strlen(s)); + } + + + void addKeyToDb(int dbid, const char *key, const char *val) { + robj *key_obj = createStringObjectFromCString(key); + robj *val_obj = createStringObjectFromCString(val); + dbAdd(server.db[dbid], key_obj, &val_obj); + decrRefCount(key_obj); + } + + + virtual void setupDatabase() { + /* For these unit tests, a standard database is constructed. But we will use our own + * mocked scan function to ensure a consistent iteration order */ + + server.dbnum = DB_COUNT; + server.cluster_enabled = false; + server.db = static_cast(zcalloc(sizeof(serverDb *) * server.dbnum)); + + for (int dbid = 0; dbid < server.dbnum; dbid++) { + initializeServerDb(dbid); + for (int keynum = 0; keynum < ITEMS_PER_DB; keynum++) { + addKeyToDb(dbid, keys[dbid][keynum], keys[dbid][keynum]); + } + } + + EXPECT_CALL(mock, hashtableScan(_, _, _, _)) + .WillRepeatedly(Invoke(mockHashtableScan)); + EXPECT_CALL(mock, hashtableScanHasPassedKey(_, _, _)) + .WillRepeatedly(Invoke(mockHashtableScanHasPassedKey)); + + if (0) debugPrintBucketInfo(); + } + + + void SetUp() override { + server.main_thread_id = pthread_self(); + server.forkless_options_supported = 1; + objectSetMetadataSize(BGITERATION_ENTRY_METADATA_SIZE); + + bgIteration_unitTestDisableCloning(); + + setupDatabase(); + + EXPECT_CALL(mock, aeCreateTimeEvent(_, _, _, _, _)).WillRepeatedly(Return(0)); + bgIteration_init(); + + cleanupCount = 0; + replDoneConfirmed = 0; + replDoneRejected = 0; + + // By default, do nothing for these + EXPECT_CALL(mock, blockClientInUseOnKeys(_, _, _)).WillRepeatedly(Return()); + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).WillRepeatedly(Return()); + + // By default, expect no permission issues + EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_, _, _, _, _, _)) + .WillRepeatedly(Return(ACL_OK)); + } + + + void TearDown() override { + bgIteration_feedIterators(); // process returning stuff before deleting DB + bgIteration_feedIterators(); // in case an iterator was closed there might be more + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i]->keys) kvstoreRelease(server.db[i]->keys); + if (server.db[i]->expires) kvstoreRelease(server.db[i]->expires); + dictRelease(server.db[i]->watched_keys); + zfree(server.db[i]); + } + zfree(server.db); + + if (c != NULL) freeTestClient(c); + } + + + // Deletes an item from the DB (often at the start of a test) - but does NOT notify + // bgIteration. bgIteration_keyDelete() should be explicitly called where needed. + void simpleDelItem(int itemNum) { + int db = getDbFromItemNum(itemNum); + + sds delKey = sdsnew(keyStr(itemNum)); + int rc = kvstoreHashtableDelete(server.db[db]->keys, 0, delKey); + ASSERT_EQ(rc, 1); + sdsfree(delKey); + } + + + // Find the actual dbEntry object by itemNum + dbEntry *getItem(int itemNum) { + int db = getDbFromItemNum(itemNum); + sds key = sdsnew(keyStr(itemNum)); + dbEntry *de = dbFind(server.db[db], key); + sdsfree(key); + return de; + } + + + // The test expects that the next item read will be BGITERATOR_ITEM_COMPLETE + void expectReadComplete(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + EXPECT_EQ(item->type, BGITERATOR_ITEM_COMPLETE); + bgIteratorClose(iter); + + int oldCleanupCount = cleanupCount; + bgIteration_feedIterators(); + EXPECT_EQ(cleanupCount, oldCleanupCount + 1); + } + + + // The test is cleaning up and isn't validating the remaining cleanup + void expectAnythingCleanup(bgIterator *iter) { + while (true) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + if ((item->type == BGITERATOR_ITEM_COMPLETE || + item->type == BGITERATOR_ITEM_TERMINATED)) { + bgIteratorClose(iter); + break; + } + } + bgIteration_feedIterators(); // Recognize the closed iterator + EXPECT_EQ(cleanupCount, 1); + } + + + void expectDictEntryMetadataMatch(dbEntry *de1, dbEntry *de2) { + bgIterationEntryMetadata *dm1 = static_cast(objectGetMetadata(de1)); + bgIterationEntryMetadata *dm2 = static_cast(objectGetMetadata(de2)); + + EXPECT_NE(dm1, nullptr); + EXPECT_NE(dm2, nullptr); + EXPECT_EQ(*dm1, *dm2); + } + + + // Useful when debugging new tests. It reads/prints all remaining items then crashes. + void cleanupIteratorDebugPrint(bgIterator *iter) { + bool done = false; + printf("[DEBUG] Printing bgIterator '%s' items:\n", bgIteratorName(iter)); + while (!done) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + switch (item->type) { + case BGITERATOR_ITEM_DBENTRY: { + auto obj = item->u.dbe.de; + const char *keyStr = objectGetKey(obj); + printf("Entry: %s -> %s [itemNum: %i]\n", + keyStr, + static_cast(objectGetVal(obj)), + itemNumFromKey(keyStr)); + break; + } + case BGITERATOR_ITEM_REPLICATION: + printf("Repl: DB=%d : ", item->dbid); + for (int i = 0; i < item->u.repl.argc; i++) + printf("%s ", static_cast(objectGetVal(item->u.repl.argv[i]))); + printf("\n"); + break; + case BGITERATOR_ITEM_COMPLETE: + case BGITERATOR_ITEM_TERMINATED: + bgIteratorClose(iter); + done = true; + break; + default: + printf("unhandled: %d\n", item->type); + } + } + bgIteration_feedIterators(); // Recognize the closed iterator + ASSERT_TRUE(false); // Halt the test here + } + + + // Make a copy of the metadata + void *cloneMetadata(dbEntry *de) { + int size = objectGetMetadataSize(de); + void *metadata = zmalloc(size); + memcpy(metadata, objectGetMetadata(de), size); + return metadata; + } + + + // Compare a previous metadata copy to an existing entry + void compareAndFreeClonedMetadata(dbEntry *de, void *metadata) { + EXPECT_EQ(memcmp(objectGetMetadata(de), metadata, objectGetMetadataSize(de)), 0); + zfree(metadata); + } + + + // The test expects the next item will be a specific key + // The item value is verified against the default unless provided as a parameter. + void expectReadKey(bgIterator *iter, int itemNum, const char *value = nullptr) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_FALSE(item->u.dbe.is_cloned); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); + if (value) { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } else { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); + } + } + + + // The test expects the next item will be a specific key amd that the item is cloned. + // Metadata is tested (to make sure the clone includes the proper metadata). + // The item value is verified against the default unless provided as a parameter. + void expectReadClonedKey(bgIterator *iter, int itemNum, void *metadata, const char *value = nullptr) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_TRUE(item->u.dbe.is_cloned); + compareAndFreeClonedMetadata(item->u.dbe.de, metadata); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); + if (value) { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } else { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); + } + } + + + // Test expects the next key, but specified by key name, not itemNum. + void expectReadDbKeyValue(bgIterator *iter, int db, const char *key, const char *value) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), key); + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } + + + // Test expect to read a sequence of key items + void expectReadKeySequence(bgIterator *iter, int startItem, int endItem) { + for (int i = startItem; i <= endItem; i++) expectReadKey(iter, i); + } + + + // Just like expectReadKey, but also tests that a previous item is becoming unblocked. + void expectReadKeyWithUnblock(bgIterator *iter, int itemNum, int unblockItem, const char *value = nullptr) { + bool blocked = true; + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(unblockItem)))) + .WillOnce(Assign(&blocked, false)); + expectReadKey(iter, itemNum, value); + EXPECT_FALSE(blocked); + } + + + // Test expects to read a replication item matching the command help by client 'c' + void expectReadReplication(bgIterator *iter, client *c) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->dbid, c->db->id); + EXPECT_EQ(item->u.repl.cmd, c->cmd); + EXPECT_EQ(item->u.repl.argc, c->argc); + for (int i = 0; i < c->argc; i++) { + EXPECT_STREQ(static_cast(objectGetVal(item->u.repl.argv[i])), + static_cast(objectGetVal(c->argv[i]))); + } + } + + + // We expect to read a MULTI command which should have been inserted. + void expectReadMultiReplication(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("multi")); + } + + + // We expect to read an EXEC command which should have been inserted. + void expectReadExecReplication(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("exec")); + } + + + // Expecting that a DEL command should have been replicated. + void expectReadReplicationDel(bgIterator *iter, int itemNum) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->dbid, db); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("DEL")); + EXPECT_EQ(item->u.repl.argc, 2); + EXPECT_THAT(item->u.repl.argv[0], robjEqualsStr("DEL")); + EXPECT_THAT(item->u.repl.argv[1], robjEqualsStr(keyStr(itemNum))); + } + + + // Expecting that a special SWAPDB item has been inserted. + void expectReadSwapDB(bgIterator *iter, int db1, int db2) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_SWAPDB); + EXPECT_EQ(item->dbid, db1); + EXPECT_EQ(item->u.dbid2, db2); + } + + + static void debugPrintBucketInfoCb(void *privdata, void *entry) { + UNUSED(privdata); + dbEntry *de = (dbEntry *)entry; + printf("--- %s\n", objectGetKey(de)); + } + + void debugPrintBucketInfo() { + printf("*******DEBUG*******\n"); + for (int db = 0; db < server.dbnum; db++) { + int num_ht = kvstoreNumHashtables(server.db[db]->keys); + for (int slot = 0; slot < num_ht; slot++) { + hashtable *ht = kvstoreGetHashtable(server.db[db]->keys, slot); + if (!ht) continue; + + printf("DB: %d, slot: %d\n", db, slot); + size_t cursor = 0; + do { + cursor = hashtableScan(ht, cursor, debugPrintBucketInfoCb, NULL); + printf("-----------\n"); + } while (cursor != 0); + } + } + ASSERT_TRUE(false); + } + + + // Creates a client with a write command (SET) for the given itemNum + client *getWriteClient(int itemNum, const char *value) { + int db = getDbFromItemNum(itemNum); + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString("set"); + c->db = server.db[db]; + + c->argc = 3; + c->argv = static_cast(zcalloc(sizeof(robj *) * c->argc)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + c->argv[1] = createStringObjectFromCString(keyStr(itemNum)); + c->argv[2] = createStringObjectFromCString(value); + + return c; + } + + + // Create a client with a write command that touches multiple keys + client *getWriteMultiKeysClient(const char *cmdName, + int dstItemNum, + const std::vector &srcItemsNum) { + assert(!srcItemsNum.empty()); + + const int db = getDbFromItemNum(dstItemNum); + std::for_each(srcItemsNum.cbegin(), srcItemsNum.cend(), [&db, this](int srcItemNum) { + assert(db == getDbFromItemNum(srcItemNum)); + }); + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString(cmdName); + assert(c->cmd != nullptr); + c->db = server.db[db]; + + c->argc = 2 + srcItemsNum.size(); + c->argv = static_cast(zcalloc(sizeof(robj *) * c->argc)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + c->argv[1] = createStringObjectFromCString(keyStr(dstItemNum)); + for (unsigned int i = 0; i < srcItemsNum.size(); i++) { + c->argv[2 + i] = createStringObjectFromCString(keyStr(srcItemsNum[i])); + } + + return c; + } + + + client *getWrite2KeysClient(const char *cmdName, int dstItemNum, int srcItemNum) { + return getWriteMultiKeysClient(cmdName, dstItemNum, {srcItemNum}); + } + + + client *getWrite3KeysClient(const char *cmdName, int dstItemNum, int src1ItemNum, int src2ItemNum) { + return getWriteMultiKeysClient(cmdName, dstItemNum, {src1ItemNum, src2ItemNum}); + } + + + // Create a client with a MULTI/EXEC block. + // This parses a series of commands separated by ';' + // Example: getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx") + client *getMultiClient(const char *commands, int dbid = 0) { + char *commandsCopy = zstrdup(commands); // a mutable copy + char *commandStr, *commandStrSave; + char *token, *tokenSave; + + client *c = static_cast(zcalloc(sizeof(client))); + c->db = server.db[dbid]; + initClientMultiState(c); + c->flag.multi = 1; + c->mstate->cmd_flags |= CMD_WRITE; + + commandStr = strtok_r(commandsCopy, ";", &commandStrSave); + while (commandStr != NULL) { + token = strtok_r(commandStr, " ", &tokenSave); + c->cmd = lookupCommandByCString(token); + + c->argv = static_cast(zcalloc(sizeof(robj *) * 5)); // command + 4 args + + for (int i = 0; token != NULL; i++) { + c->argv[i] = createStringObjectFromCString(token); + c->argc = i + 1; + token = strtok_r(NULL, " ", &tokenSave); + } + + queueMultiCommand(c, 0); + freeClientArgv(c); + + commandStr = strtok_r(NULL, ";", &commandStrSave); + } + + c->cmd = lookupCommandByCString("exec"); + c->argc = 1; + c->argv = static_cast(zcalloc(sizeof(robj *) * c->argc)); + c->argv[0] = createStringObjectFromCString("EXEC"); + + zfree(commandsCopy); + return c; + } + + + // Initially, a MULTI client is set up to execute the EXEC command (which examines the + // contents of the multi/exec block). This function advances the client to begin executing + // the individual commands within the multi/exec block. + void advanceMultiClientToCommand(client *c, int cmdNum) { + assert(cmdNum >= 0 && cmdNum < c->mstate->count); + if (cmdNum == 0) { + // Save off the EXEC + orig_argc = c->argc; + orig_argv = c->argv; + } + c->argc = c->mstate->commands[cmdNum].argc; + c->argv = c->mstate->commands[cmdNum].argv; + c->argv_len = c->mstate->commands[cmdNum].argv_len; + c->cmd = c->realcmd = c->mstate->commands[cmdNum].cmd; + } + + + // A client with a fictional command: + // SETGET + // - writes a value to the first key (making this CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY) + // - reads a second key + client *getSetGetClient(int itemNum1, const char *value1, int itemNum2) { + // Fictional command which writes to 1st key and reads the 2nd + int db = getDbFromItemNum(itemNum1); + assert(db == getDbFromItemNum(itemNum2)); // (this would be a testcase error) + + client *c = static_cast(zcalloc(sizeof(client))); + struct serverCommand *cmd = static_cast(zcalloc(sizeof(struct serverCommand))); + + cmd->fullname = sdsnew("SETGET"); + cmd->arity = 4; + cmd->flags = CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY; + + cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INDEX; + cmd->legacy_range_key_spec.bs.index.pos = 1; // firstkey + cmd->legacy_range_key_spec.fk.range.lastkey = -1; + cmd->legacy_range_key_spec.fk.range.keystep = 2; + + c->cmd = cmd; + c->db = server.db[db]; + + c->argc = 4; + c->argv = static_cast(zcalloc(sizeof(robj *) * c->argc)); + c->argv[0] = createStringObjectFromCString(cmd->fullname); + c->argv[1] = createStringObjectFromCString(keyStr(itemNum1)); + c->argv[2] = createStringObjectFromCString(value1); + c->argv[3] = createStringObjectFromCString(keyStr(itemNum2)); + + return c; + } + + + // Client with a fictional write command with no keys specified + client *getNoKeysWriteClient() { + // Fictional command which is marked WRITE, but has no keys. + client *c = static_cast(zcalloc(sizeof(client))); + struct serverCommand *cmd = static_cast(zcalloc(sizeof(struct serverCommand))); + + cmd->fullname = sdsnew("NOKEYSWRITE"); + cmd->arity = 1; + cmd->flags = CMD_WRITE; + + cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INVALID; // No keys + + c->cmd = cmd; + c->db = server.db[0]; + + c->argc = 1; + c->argv = static_cast(zcalloc(sizeof(robj *) * c->argc)); + c->argv[0] = createStringObjectFromCString(cmd->fullname); + + return c; + } + + + void freeClientArgv(client *c) { + for (int i = 0; i < c->argc; i++) decrRefCount(c->argv[i]); + zfree(c->argv); + c->argv = NULL; + c->argc = 0; + } + + + // During testing, we create some fake commands. This checks if the command is real or fake. + // A fake command is dynamically allocated and can be freed. Real commands are static. + bool isRealValkeyCommand(struct serverCommand *cmd) { + return lookupCommandByCString(cmd->declared_name); + } + + + void freeTestClient(client *c) { + // If the current command references one of the multi commands, set it back to the EXEC + if (c->mstate != NULL) { + for (int i = 0; i < c->mstate->count; i++) { + if (c->argv == c->mstate->commands[i].argv) { + c->argc = orig_argc; + c->argv = orig_argv; + orig_argc = 0; + orig_argv = nullptr; + break; + } + } + } + freeClientMultiState(c); + freeClientArgv(c); + + if (!isRealValkeyCommand(c->cmd)) { + sdsfree(c->cmd->fullname); + zfree(c->cmd); + } + + zfree(c); + } + + + // Simulate what happens when a write command is blocked + void simulateBlockedWrite(client *c, int expectedNumberBlockedKeys = 1) { + EXPECT_CALL(mock, blockClientInUseOnKeys(c, expectedNumberBlockedKeys, _)).Times(1); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_TRUE(blocked); + } + + + // Simulate what happens when a write command isn't blocked + void simulateUnblockedWrite(client *c) { + EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + } + + + // Simulate what happens when a write command is NOT blocked, because the key can be cloned + // and expedited. This requires a scenario where we would normally need to block the + // client so that bgIteration can process the item. + void simulateClonedWrite(bgIterator *it, client *c) { + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + unsigned long initialClones = status.dbentry_clones_queued; + + // Client should not get blocked + EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + + // Ensure that cloning took place + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, (initialClones + 1)); + + // Ensure that the real item isn't inuse (because we cloned it instead) + dbEntry *de = dbFind(c->db, static_cast(objectGetVal(c->argv[1]))); + ASSERT_FALSE(bgIteration_isEntryInuse(de)); + } + + + // Simulates what happens when a write command (SET) actually executes. This requires a + // scenario where we would NOT be blocked on the write. It actually alters the value of + // the key and updates the metadata. + void simulateUnblockedWriteWithModification(client *c) { + EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + + // Fake execution of the command - touch the iterator_epoch counter and swap the value + // We need to duplicate the value because setKey() can reallocate it. + robj *value = dupStringObject(c->argv[2]); + setKey(c, c->db, c->argv[1], &value, SETKEY_ADD_OR_UPDATE); + + // Let's make sure that setKey updated the iteration epoch (as it should have) + dbEntry *de = dbFind(c->db, static_cast(objectGetVal(c->argv[1]))); + bgIterationEntryMetadata *md = static_cast(objectGetMetadata(de)); + bgIterationEntryMetadata md_after_setkey = *md; + // Now update the md again, and it should still match + bgIteration_dbEntryModified(de); + EXPECT_EQ(md, objectGetMetadata(de)); // the md location shouldn't have changed + EXPECT_EQ(md_after_setkey, *md); // the md value should still be the same + + server.in_call++; + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + server.in_call--; + } + + + // Simulate the expiration (active expiration) of a key. This is independent of command execution. + void simulateExpiration(int itemNum) { + ASSERT_NE(getItem(itemNum), nullptr); // Should be there before expire + + // Send bgIteration the DEL + int db = getDbFromItemNum(itemNum); + robj *argv[2]; + argv[0] = createStringObjectFromCString("DEL"); + argv[1] = createStringObjectFromCString(keyStr(itemNum)); + serverCommand *cmd = lookupCommandByCString("DEL"); + // KeyDelete should be called before the deletion occurs + bgIteration_keyDelete(db, static_cast(objectGetVal(argv[1]))); + + simpleDelItem(itemNum); // Simulate the actual del + + // Replication happens after the deletion occurs + ASSERT_EQ(server.in_call, 0); // test sanity check + bgIteration_handleCommandReplication(db, cmd, 2, argv); + decrRefCount(argv[0]); + decrRefCount(argv[1]); + + EXPECT_EQ(getItem(itemNum), nullptr); + } + + + // Simulates an expiration, but validates behavior for an item inuse by bgIteration. + void simulateExpirationOfInuse(int itemNum) { + // An inuse item will have a refcount > 1. BgIteration should have incremented the + // refcount while it is inuse. + dbEntry *de = getItem(itemNum); + ASSERT_NE(de, nullptr); // Should be there before expire + EXPECT_TRUE(bgIteration_isEntryInuse(de)); + EXPECT_EQ(de->refcount, 2u); + + simulateExpiration(itemNum); + + // At this point, the item is removed from the DB, but still exists, and the refcount + // has been reduced to 1. This allows a background thread to continue using the item. + EXPECT_EQ(de->refcount, 1u); + } + + + // Simulates an expiration, but the item is a future item which will be expedited. + void simulateExpirationWithExpedite(int itemNum) { + // An inuse item will have a refcount > 1. BgIteration should have incremented the + // refcount while it is inuse. + dbEntry *de = getItem(itemNum); + ASSERT_NE(de, nullptr); // Should be there before expire + EXPECT_FALSE(bgIteration_isEntryInuse(de)); // Not yet inuse + EXPECT_EQ(de->refcount, 1u); + + simulateExpiration(itemNum); + + // At this point, the item is removed from the DB, but still exists, and the refcount + // has been reduced to 1. This allows a background thread to continue using the item. + EXPECT_TRUE(bgIteration_isEntryInuse(de)); // It's inuse now + EXPECT_EQ(getItem(itemNum), nullptr); // but it's not in the DB anymore + EXPECT_EQ(de->refcount, 1u); + } + + + // Simulate execution of a SWAPDB command + void simulateSwapDB(int dbid0, int dbid1) { + char dbStr[2] = {0}; + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString("swapdb"); + c->db = server.db[0]; + + c->argc = 3; + c->argv = static_cast(zcalloc(sizeof(robj *) * c->argc)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + dbStr[0] = '0' + dbid0; + c->argv[1] = createStringObjectFromCString(dbStr); + dbStr[0] = '0' + dbid1; + c->argv[2] = createStringObjectFromCString(dbStr); + + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // SWAPDB should never block + + // The real SWAP does more than this, but this is enough for unit tests + serverDb *aux = server.db[dbid0]; + server.db[dbid0] = server.db[dbid1]; + server.db[dbid1] = aux; + + bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv); + + freeTestClient(c); + } + + + // Simulate execution of a FLUSHDB or FLUSHALL command + void simulateFlushDB(int db, int anInUseItem) { + client *c = static_cast(zcalloc(sizeof(client))); + + if (db == -1) { + c->cmd = lookupCommandByCString("flushall"); + c->db = server.db[0]; + } else { + c->cmd = lookupCommandByCString("flushdb"); + c->db = server.db[db]; + } + + c->argc = 1; + c->argv = static_cast(zcalloc(sizeof(robj *) * c->argc)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + + dbEntry *de_in_use = getItem(anInUseItem); + EXPECT_EQ(de_in_use->refcount, 2u); + + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // FLUSHDB should never block + + // The real FLUSH does more than this, but this is enough for unit tests + + // Now flush the items + for (int d = 0; d < server.dbnum; d++) { + if (db == -1 || db == d) { + kvstoreRelease(server.db[d]->keys); + server.db[d]->keys = NULL; + } + } + + EXPECT_EQ(de_in_use->refcount, 1u); + + // and replicate + + bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv); + + freeTestClient(c); + } +}; + + +TEST_F(BgIterationTest, dbIsOK) { + // Just run the setup/teardown code to make sure the DB is OK. +} + + +///////////////////////////////////////////////////// +// Simple Full-scan iterator tests +///////////////////////////////////////////////////// + +// A simple full scan that just checks basic flow. +TEST_F(BgIterationTest, createAndCleanup) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + EXPECT_EQ(bgIteratorFind("simple"), it); + EXPECT_STREQ(bgIteratorName(it), "simple"); + + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + + EXPECT_EQ(status.dbentries_queued, 0u); + EXPECT_EQ(status.dbentries_processed, 0u); + EXPECT_EQ(status.replication_queued, 0u); + EXPECT_EQ(status.replication_processed, 0u); + EXPECT_EQ(status.swapdb_queued, 0u); + EXPECT_EQ(status.swapdb_processed, 0u); + EXPECT_EQ(status.flushdb_queued, 0u); + EXPECT_EQ(status.flushdb_processed, 0u); + + EXPECT_EQ(status.queue_length, 0u); + EXPECT_GT(status.queue_length_target, 0u); + + EXPECT_LT(status.runtime_ms, 5u); + EXPECT_EQ(status.current_item_ms, 0u); + + expectAnythingCleanup(it); + + EXPECT_EQ(bgIteratorFind("simple"), nullptr); +} + + +// Close client before reading anything +TEST_F(BgIterationTest, testClientCloseBeforeRead) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + bgIteration_feedIterators(); + + bgIteratorClose(it); // Immediately close before reading + + bgIteration_feedIterators(); // Recognize the closed iterator + + // Check that the cleanup callback was executed properly + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Test that the full scan hits each item in the expected sequence. +TEST_F(BgIterationTest, orderedIteration) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKeySequence(it, 0, LAST_ITEM); + + // Quick status check. At this point, the final item hasn't been returned yet. + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentries_queued, static_cast(TOTAL_ITEMS)); + EXPECT_EQ(status.dbentries_processed, static_cast(TOTAL_ITEMS) - 1); + + expectReadComplete(it); // Returns the final item, and reads the completion item + + // Check that the cleanup callback was executed properly + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); +} + + +// Test that two simultaneous iterations work properly. +TEST_F(BgIterationTest, twoOrderedIterations) { + bgIterator *it1 = bgIteratorCreateFullScanIter("simple1", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + bgIterator *it2 = bgIteratorCreateFullScanIter("simple2", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + EXPECT_EQ(bgIteratorFind("simple1"), it1); + EXPECT_EQ(bgIteratorFind("simple2"), it2); + + int it1Count = 0; + int it2Count = 0; + while (it1Count < TOTAL_ITEMS || it2Count < TOTAL_ITEMS) { + // Randomly read from either iterator + if ((rand() % 2) == 0) { + if (it1Count < TOTAL_ITEMS) expectReadKey(it1, it1Count++); + } else { + if (it2Count < TOTAL_ITEMS) expectReadKey(it2, it2Count++); + } + } + + // Nothing left but to read the final completions + expectReadComplete(it1); + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); + expectReadComplete(it2); + EXPECT_EQ(cleanupCount, 2); + EXPECT_FALSE(cleanupTerminated); +} + + +///////////////////////////////////////////////////// +// MODIFY A FUTURE ITEM +// The next tests validate the basic pattern when a key, not yet iterated, is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). +///////////////////////////////////////////////////// + +// Modify a future item, without replication or consistency. +// Our expectation for this case is that the modification should proceed without blocking, the item +// shouldn't be expedited, and we will see the modified item once the iterator reaches it. +TEST_F(BgIterationTest, modFutureItem) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + c = getWriteClient(6, "xxx"); + + // We DONT expect the client to be blocked - not consistent + simulateUnblockedWriteWithModification(c); + + // Now continue reading, 1, 2, 3, 4, 5 + expectReadKeySequence(it, 1, 5); + + // Let's validate that key 6 shows the new value + expectReadKey(it, 6, "xxx"); + + // Continue... + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a future item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification SHOULD be blocked, as we have to save the +// the item in it's state before the modification. To reduce blocking time, the item should be +// moved to the head of the queue - there's no replication in this case, so out-of-order processing +// isn't a concern. +TEST_F(BgIterationTest, modFutureItem_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + c = getWriteClient(6, "xxx"); + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c); + + // On a consistent iterator, the event is expedited in-front of items already in queue! + // Read key 6 out of order. + expectReadKey(it, 6); + + // Now, when we read key 1, key 6 is released back to Valkey, and the client will be unblocked. + expectReadKeyWithUnblock(it, 1, 6); + simulateUnblockedWriteWithModification(c); // Now the write can proceed + + // Continue... + expectReadKeySequence(it, 2, 5); + // 6 has already been processed + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a future item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification should proceed without blocking, as the +// mode is inconsistent. We don't expect replication, as we haven't reached the item yet. We'll +// see the modified item later. +TEST_F(BgIterationTest, modFutureItem_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + c = getWriteClient(6, "xxx"); + + // We DONT expect the client to be blocked - not consistent + simulateUnblockedWriteWithModification(c); + + // NOTE: Since we haven't reached this item yet, and consistency is not required, there's no + // need to replicate this command. So everything should wrap up just fine - we will see + // the new value when we get to it. + + // Now continue reading, 1, 2, 3, 4, 5 + expectReadKeySequence(it, 1, 5); + + // Let's validate that key 6 shows the new value + expectReadKey(it, 6, "xxx"); + + // Continue... + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// MODIFY A CURRENT ITEM +// The next tests validate the basic pattern when a key, currently in use, is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). +///////////////////////////////////////////////////// + +// Modify a current item, without replication or consistency. +// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't +// be expedited (it's already in use). +TEST_F(BgIterationTest, modCurrentItem) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication) + + // Continue... + expectReadKeySequence(it, 4, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a current item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't +// be expedited (it's already in use). +TEST_F(BgIterationTest, modCurrentItem_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication) + + // Continue... + expectReadKeySequence(it, 4, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a current item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification SHOULD be blocked. After the key is processed, +// the write will proceed, and the replication will be sent. +TEST_F(BgIterationTest, modCurrentItem_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write will cause replication + + expectReadKey(it, 4); // 4 got put in queue when 3 was read + + expectReadReplication(it, c); + + // Continue... + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// MODIFY A PAST ITEM +// The next tests validate the basic pattern when a key, not yet iterated on, is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). +///////////////////////////////////////////////////// + +// Modify a past item, without replication or consistency. +// Our expectation for this case is that the modification should proceed without blocking. +// No replication is generated and keys are processed similar to no modification. +TEST_F(BgIterationTest, modPastItem) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Continue... + expectReadKeySequence(it, 2, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a past item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification should proceed without blocking. +// No replication is generated and keys are processed similar to no modification. +TEST_F(BgIterationTest, modPastItem_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Continue... + expectReadKeySequence(it, 2, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a past item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification should proceed without blocking. +// Replication will be sent. +TEST_F(BgIterationTest, modPastItem_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Key 2 was already in queue (same bucket as key 1). The replication will follow. + expectReadKey(it, 2); + expectReadReplication(it, c); + + // Continue... + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// TESTS FOR ITEM CLONING +///////////////////////////////////////////////////// + +// In a consistent iteration, verify that a simple string is properly cloned, and that a write can +// occur without blocking. Validate the cloned item and metadata. +TEST_F(BgIterationTest, modFutureItem_start_CloneExpeditedItem) { + // Initialize cloning configurations. + bgIteration_unitTestEnableCloning(50, 100); + + bgIteratorStatus status; + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + c = getWriteClient(6, "xxx"); + + // Quick status check. At this point, no clones exist yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 0u); + + // Since item 6 should be cloned, it will not block the client, allowing the write. + void *de6_md = cloneMetadata(getItem(6)); + simulateClonedWrite(it, c); // This wouldn't block, and queues the cloned value + simulateUnblockedWriteWithModification(c); // This modifies the real entry in the de (touching metadata) + + // At this point, one clone is in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // On a consistent iterator, the event is expedited in-front of items already in queue! + // Read key 6 (which is cloned) out of order. The value will still match the key. + expectReadClonedKey(it, 6, de6_md); // Also validates and frees the metadata + + // Quick status check. At this point, cloned items have not been marked as processed yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 0u); + + // Reading key 1 will release key 6, and the clone will finish processing. + expectReadKey(it, 1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Now, when we read key 2 should not have an impact on number of processed clones. + expectReadKey(it, 2); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Continue... + expectReadKeySequence(it, 3, 5); + // 6 has already been processed + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +// Check that cloning for simple strings is respecting the size limits and pool size. On a +// consistent iteration, we expect to block or clone on all future keys. We validate that we can +// clone if the item is small enough and the cloning pool has more space left. +TEST_F(BgIterationTest, modFutureItem_start_LargeItemOrClonePoolFull) { + // Initialize cloning configurations to test the clone pool functionality first. + bgIteration_unitTestEnableCloning(50, 50); + + bgIteratorStatus status; + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c6 = getWriteClient(6, "xxx"); + client *c7 = getWriteClient(7, "xxx"); + client *c8 = getWriteClient(8, "xxx"); + + // Quick status check. At this point, no clones exist yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 0u); + + // Since item 6 should be cloned, it will not block the client, allowing the write. + void *de6_md = cloneMetadata(getItem(6)); + simulateClonedWrite(it, c6); + simulateUnblockedWriteWithModification(c6); + + // At this point, one clone is in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // Now that cloning pool is full, item 7 will not be cloned and the client will be blocked. + simulateBlockedWrite(c7); + ASSERT_TRUE(bgIteration_isEntryInuse(getItem(7))); + + // There is still only one cloned item in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // Now change cloning configurations to test that large items will not be cloned. We adjust + // the clone pool size to allow two items, but set the maximum item size to be smaller than + // the size of item 8. The clone pool size must be larger than the total size of the existing + // clones plus the maximum item clone size. + bgIteration_unitTestEnableCloning(1, 101); + + // This write will pass the clone pool check but fail the item size check, blocking the client. + simulateBlockedWrite(c8); + ASSERT_TRUE(bgIteration_isEntryInuse(getItem(8))); + + // On a consistent iterator, the expedited item in-front of items already in queue! + // Read key 6 out of order. + expectReadClonedKey(it, 6, de6_md); + + // Now, when we expect to read key 7, which was expedited, key 6 will be released back to Valkey + // and the clone will be deallocated here. + expectReadKey(it, 7); + + // Now, when we read key 8, which was expedited, key 7 is released back to Valkey, and the client + // will be unblocked. + // (actually, unblock is called after every key [just in case] - but functionally we only care + // about this one) + expectReadKeyWithUnblock(it, 8, 7); + simulateUnblockedWriteWithModification(c7); + + // Now, when we read key 1, key 8 is released back to Valkey, and the client will be unblocked. + expectReadKeyWithUnblock(it, 1, 8); + simulateUnblockedWriteWithModification(c8); + + // Since only one item was cloned, there should be one clone processed + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Continue... + expectReadKeySequence(it, 2, 5); + // 6, 7, and 8 have already been processed + expectReadKeySequence(it, 9, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c6); + freeTestClient(c7); + freeTestClient(c8); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO MODIFICATION OF TWO ITEMS +// When 2 keys are modified, we need to ensure that both keys have been sent before we can send +// replication. This means that if replication is present, we may have to block/expedite for +// future keys, even in the inconsistent scenario. +///////////////////////////////////////////////////// + +// Replication enabled, but NOT consistent. In this case, if ANY of the keys have been iterated, +// ALL of the keys must be replicated so that the command can be processed properly on the replica. +TEST_F(BgIterationTest, modPastFutureItem_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Even though key 12 is for READ in this command, it must be expedited so that it exists before + // the associated replication is sent. + c = getSetGetClient(8, "xxx", 12); + simulateBlockedWrite(c); + + // Key 12 will be expedited, but not in front of existing items in queue (can only do that for + // consistent iterators) + + expectReadKey(it, 10); + expectReadKey(it, 12); // expedited + + expectReadKeyWithUnblock(it, 11, 12); // 13 is now in queue + + simulateUnblockedWriteWithModification(c); + + // Continue... + expectReadKey(it, 13); + expectReadReplication(it, c); + + expectReadKeySequence(it, 14, LAST_ITEM); + expectReadComplete(it); +} + + +// Replication NOT enabled. A read-only key doesn't need to be expedited, even if other keys have +// been processed already. (This should work identically for both consistent/non-consistent. +TEST_F(BgIterationTest, modPastFutureItem_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter1", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Since there's no replication, we don't have to worry about expediting 12. The write will + // proceed without blocking. + c = getSetGetClient(8, "xxx", 12); + simulateUnblockedWriteWithModification(c); + + // Key 12 will not be expedited. Remaining keys should be received in normal order. + expectReadKeySequence(it, 10, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, modPastFutureItem) { + bgIterator *it = bgIteratorCreateFullScanIter("iter2", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Since there's no replication, we don't have to worry about expediting 12. The write will + // proceed without blocking. + c = getSetGetClient(8, "xxx", 12); + simulateUnblockedWriteWithModification(c); + + // Key 9 will not be expedited. Remaining keys should be received in normal order. + expectReadKeySequence(it, 10, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO MISSING ITEMS +// Missing items are tricky. A missing item might be logically located in the past or future, in +// relation to the current iteration position. The command may (or may not) create the "missing" +// key. Some general considerations: +// * In a consistent iteration, a missing key didn't exist at the time of consistency, or it was +// already processed (saved) at the time of the deletion. If the missing key gets created, we +// must be sure to skip it if we later iterate over it. +// * In a non-consistent iteration with replication: +// * If the key location is already passed, the replication is sent, allowing the key to be +// created (or not) based on the replication. +// * If the key location is in the future, we can allow the command to proceed, without +// replication. If the key is created, we will process it when the iterator gets to it. +// +// We expect: +// no-repl, no-consist: past items are ignored - future items are processed when iterated +// no-repl, yes-consist: past items are ignored - future items are ignored +// yes-repl, no-consist: past item skipped, but replicated - future items are created by replication and skipped later +// yes-repl, yes-consist: past item skipped, but replicated - future items are processed when iterated +///////////////////////////////////////////////////// + +// no-repl, no-consist: creation of PAST item has no impact +TEST_F(BgIterationTest, missingPastItem) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 1); + expectReadKey(it, 2); + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); +} + + +// no-repl, yes-consist: creation of PAST item has no impact +TEST_F(BgIterationTest, missingPastItem_start) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 1); + expectReadKey(it, 2); + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); +} + + +// yes-repl, no-consist: creation of a PAST item will be replicated +TEST_F(BgIterationTest, missingPastItem_eventual) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // replication will be added after item 4 (3,4 in same bucket) + + expectReadKey(it, 4); + + expectReadReplication(it, c); + + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +// no-repl, no-consist: creation of FUTURE item is seen when reached by the iteration. +TEST_F(BgIterationTest, missingFutureItem) { + // Using DB1 so we have lots of buckets + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + + const char *newValue = "xxx"; + c = getWriteClient(14, newValue); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 13); + + // We expect to see item 14. + // Note that for an inconsistent DB view, it is logically undefined if this value is seen (or not). + // But as implemented, we should see it and the test is helpful to understand if/when the + // functionality changes. + expectReadKey(it, 14, newValue); + + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +// no-repl, yes-consist: creation of FUTURE item is ignored by consistent iteration. +TEST_F(BgIterationTest, missingFutureItem_start) { + // Using DB1 so we have lots of buckets + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + + c = getWriteClient(14, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 13); + // Key 14 is missing - it didn't exist at start of consistent iteration + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +// yes-repl, no-consist: creation of FUTURE item is handled by the replication, and then the key is +// later skipped (treated like an early iteration case). +TEST_F(BgIterationTest, missingFutureItem_eventual) { + // Using DB1 so we have lots of buckets + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Items 1 & 2 are in queue (same bucket) + + c = getWriteClient(14, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 2); + + expectReadReplication(it, c); // Here's the replication creating item 14 + + expectReadKeySequence(it, 3, 13); + // We expect item 14 to be skipped, because it was created by the earlier replication + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO EXPIRATION +// Expiration can be tricky. When pre-evaluating a command with bgIteration_blockClientIfRequired, +// a key might exist, but be ready for expiration. Then, as the command executes, the key expires +// and gets deleted before the write operation. Consider SET K V. +// In the unexpired case, this appears to bgIteration as a single SET command (which replaces the value). +// In the expired case, bgIteration will receive a DEL followed by a SET. +// +// Another case is a READ command. A read command won't cause the client to be blocked. However, +// if the key is expired, this will cause a DEL. For consistent processing, this key might need to +// be expedited so that it can be processed before it gets deleted. In this case, the key is +// unlinked from the main Valkey dictionary, but the actual deletion is deferred. +///////////////////////////////////////////////////// + +TEST_F(BgIterationTest, expireKeys) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we no longer care + simulateExpirationOfInuse(2); // Current - it's inuse + simulateExpiration(5); // Future - we don't care (non-consistent) + + expectReadKeySequence(it, 2, 4); + // key 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, expireKeys_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we expect replication + simulateExpirationOfInuse(2); // Current - it's inuse, but we expect replication + simulateExpiration(5); // Future - we don't care (non-consistent) + + expectReadKey(it, 2); // this was already queued + + expectReadReplicationDel(it, 0); // Past item should replicate + expectReadReplicationDel(it, 2); // Current item should replicate + // Item 5 is a future item and doesn't need to replicate + + expectReadKeySequence(it, 3, 4); + // Item 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, expireKeys_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we no longer care + simulateExpirationOfInuse(2); // Current - we must defer + simulateExpirationWithExpedite(5); // Future - will become inuse and expedited for consistency + + expectReadKey(it, 5); // Expedited to front + + expectReadKeySequence(it, 2, 4); + // Item 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +// Special case during a non-consistent iteration with replication and expiration. +// 1. A future key is created (and processed by its replication) - considered early iterated +// 2. Later the key is expired and deleted during command processing (causes DEL to be sent) - no longer early iterated +// 3. The key is recreated as part of the command processing (and this command was replicated) - again early iterated +// 4. Finally, when we iterate to the key, it shouldn't be sent, because it was replicated in step 3. +TEST_F(BgIterationTest, expireKeys_eventual_FutureKeyCreatedThenExpiredDuringSet) { + simpleDelItem(8); // Start with a missing future item + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Get the iterator started + + c = getWriteClient(8, "xxx"); + simulateUnblockedWriteWithModification(c); // Not blocked because this is a future key (but we expect repl) + + // Now do it again, but break out the steps so that we can simulate an expiration + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // Shouldn't be blocked because this is a future key + + // Now, as the SET command tries to execute, simulate that the key is expired. Expiration + // processing sends the replication FIRST! + robj *argv[2]; + argv[0] = createStringObjectFromCString("DEL"); + argv[1] = c->argv[1]; + serverCommand *cmd = lookupCommandByCString("DEL"); + bgIteration_handleCommandReplication(getDbFromItemNum(8), cmd, 2, argv); + decrRefCount(argv[0]); + + // Now the call to keyDelete happens (after the replication). + bgIteration_keyDelete(getDbFromItemNum(8), static_cast(objectGetVal(c->argv[1]))); + simpleDelItem(8); // Simulate the actual del + + // Now the SET will run, re-creating the item (which is still a future item) + // We need to duplicate the value because setKey() can reallocate it. + robj *value = dupStringObject(c->argv[2]); + setKey(c, c->db, c->argv[1], &(value), SETKEY_ADD_OR_UPDATE); + + // Finally, replication will be sent because this is creating a new key + bgIteration_handleCommandReplication(getDbFromItemNum(8), c->cmd, c->argc, c->argv); + + // Test that everything comes as expected + expectReadKeySequence(it, 1, 2); // All one bucket - queued after key 0 read + + expectReadReplication(it, c); // Repl from the first SET command + expectReadReplicationDel(it, 8); // This is the expected replication of the DEL from expire + expectReadReplication(it, c); // Repl from the second SET command (recreating deleted key) + + expectReadKeySequence(it, 3, 7); // continue with normal iteration + // KEY 8 SHOULD BE OMITTED - This was already replicated + expectReadKeySequence(it, 9, LAST_ITEM); + + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// THE REMAINING TESTS ARE GENERAL / UNCATEGORIZED +///////////////////////////////////////////////////// + +// Iteration can be terminated from the main thread or from the child client. +// This tests termination driven from the main thread. +TEST_F(BgIterationTest, earlyTerminationFromMain) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + + // At this point, keys 1 & 2 are in queue. A termination should release those keys. + bool blocked1 = true; + bool blocked2 = true; + // We expect no general unblocks, we account for each specific unblock below. + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); + // We should expect to see unblock called for items 1 & 2, as they are released from the queue. + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) + .WillOnce(Assign(&blocked1, false)); + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) + .WillOnce(Assign(&blocked2, false)); + bgIteratorTerminate(it); // queues the items for release + EXPECT_TRUE(bgIteratorIsTerminating(it)); + bgIteration_feedIterators(); // actually performs the release + EXPECT_FALSE(blocked1); + EXPECT_FALSE(blocked2); + + bool blocked0 = true; + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) + .WillOnce(Assign(&blocked0, false)); + bgIteratorItem *item = bgIteratorRead(it); + EXPECT_FALSE(blocked0); + EXPECT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + EXPECT_EQ(cleanupCount, 0); + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Iteration can be terminated from the main thread or from the child client. +// This tests termination driven from the child client (the background thread). +TEST_F(BgIterationTest, earlyTerminationFromChild) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + + // At this point, keys 1 & 2 are in queue. A termination should release those keys. + bgIteratorClose(it); // background thread initiates the termination + EXPECT_TRUE(bgIteratorIsTerminating(it)); + + bool blocked0 = true; + bool blocked1 = true; + bool blocked2 = true; + // Expecting no extra unblocks + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); + // We expect item 0 (the in progress item) to be released + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) + .WillOnce(Assign(&blocked0, false)); + // We expect items 1-4 (the queued items) to be released + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) + .WillOnce(Assign(&blocked1, false)); + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) + .WillOnce(Assign(&blocked2, false)); + bgIteration_feedIterators(); + EXPECT_FALSE(blocked0); + EXPECT_FALSE(blocked1); + EXPECT_FALSE(blocked2); + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Edge case. Executing a command (like SUNIONSTORE) which REPLACES the first key and reads the +// second key. In this case, bgIteration will get notified of the key deletion during execution of +// SETUNIONSTORE. Given that both keys are in the future (not iterated yet), we'll allow the +// command to execute, unblocked. We won't replicate as we'll pick up the key when we get to it. +TEST_F(BgIterationTest, writeWith2Keys_eventual_keyDeletedDuringSetReplace) { + // Using DB1 so we have lots of buckets + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKeySequence(it, 0, 8); // 9 is in queue + + // Write command that has 2 keys. 1 existing key that we write to and 1 dependant future key. + c = getWrite2KeysClient("sunionstore", 12, 13); + + simulateUnblockedWrite(c); + + // Now the call to keyDelete happens + sds sdskey = sdsnew(keyStr(12)); + bgIteration_keyDelete(getDbFromItemNum(12), sdskey); + sdsfree(sdskey); + simpleDelItem(12); // So simulate the actual del + + // Now the write will run, re-creating the item (which is still a future item) + const char *const newValueStr = "new value"; + robj *newValueRobj = createStringObjectFromCString(newValueStr); + setKey(c, c->db, c->argv[1], &newValueRobj, SETKEY_ADD_OR_UPDATE); + + // Finally, we are letting bgIteration know that the write command was executed + bgIteration_handleCommandReplication(getDbFromItemNum(12), c->cmd, c->argc, c->argv); + + // Since the write command was not replicated, we expect all the keys to be read in the normal + // order from the dictionary. + expectReadKeySequence(it, 9, 11); + expectReadKey(it, 12, newValueStr); + expectReadKeySequence(it, 13, LAST_ITEM); + + expectReadComplete(it); +} + + +// Edge case. When we have a new key which is created by a command, AND replication is enabled, we +// expect that we will replicate the command rather than serializing the key/value later. As an +// example, consider SUNIONSTORE A B. We want to create A by replicating the command. We don't +// want to have to process A as a key later on. But in this case, we can't run the command until +// B has been sent. We expect the command to be blocked while we send B. +TEST_F(BgIterationTest, writeWith2Keys_eventual_setNewKey_DependantFuture) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKeySequence(it, 0, 8); // 9 is in queue + + // Write command that has 2 keys. 1 new key and 1 dependant future key. + c = getWrite2KeysClient("sunionstore", 12, 13); + + // We are simulating a new key in the dict. This command should block on the dependant key. + // This adds key 13 in the queue since the command depends on it. + simulateBlockedWrite(c); + + // Key 9 was already in the queue + expectReadKey(it, 9); + + // Key 13 is processed out of order since the write depends on it + expectReadKey(it, 13); + + // Reading key 10 will unblock key 13, allowing us to write. + expectReadKey(it, 10); + + // Now that key 13 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Key 11 was queued when we read key 10 + expectReadKey(it, 11); + + // The replication of the write command was enqueued after key 11 + expectReadReplication(it, c); + + // We shouldn't see key 12 - as that was processed via replication. + // We shouldn't see key 13 - as that was expedited earlier + + // Now resuming processing of dict entries + expectReadKeySequence(it, 14, LAST_ITEM); + + expectReadComplete(it); +} + + +// A new key is being created, but is dependent on another key which has already been processed. +// In this case, the command shouldn't be blocked. +TEST_F(BgIterationTest, writeWith2Keys_eventual_setNewKey_DependantPast) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // 10 is in queue, done with 8 + + // Write command that has 2 keys. 1 new key and 1 dependant past key. + c = getWrite2KeysClient("sunionstore", 12, 8); + + // We are simulating a new key in the dict. + // This command should not block since the dependant key has already been processed. + simulateUnblockedWriteWithModification(c); + + // Key 10 was put in the queue before the write + expectReadKey(it, 10); + + expectReadReplication(it, c); + + expectReadKey(it, 11); + + // Key 12 should be missing - it was processed by replication + + expectReadKeySequence(it, 13, LAST_ITEM); + expectReadComplete(it); +} + + +// A new key is being created, and has dependencies on 2 other keys - one already processed, one not. +// In this case, the command should be blocked so that the future key can be sent first. +TEST_F(BgIterationTest, writeWith3Keys_eventual_setNewKey_1DependantPast1DependantFuture) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // 8 has been returned, 9 is active, 10 is in queue + + // Write command that has 1 new key and 2 dependencies (past/future) + c = getWrite3KeysClient("sunionstore", 12, 8, 13); + + // The write should be blocked, so that item 13 can be processed. + simulateBlockedWrite(c); + + expectReadKey(it, 10); // 10 was already in queue + expectReadKey(it, 13); // 13 was expedited since the write depends on it + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(13)))).Times(1); + expectReadKey(it, 11); // Releases 13 so the command can execute + + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 14); // was queued when reading 11 (12 is missing, 13 was expedited) + + expectReadReplication(it, c); + + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +// Test an edge case with the same (future) key being repeated in the command, like: +// SUNIONSTORE A B B +// In this test, A is a previously handled key, and B is a future key. We expect the future key B to +// be expedited (once). +TEST_F(BgIterationTest, writeWith3Keys_eventual_repeatedKey_1DependantPast1RepeatedFuture) { + // Using DB1 so we have lots of buckets + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // We're done with 8, and 10 is in queue + + // Write command that has 3 keys. 1 past key and 1 repeated key in the future. + c = getWrite3KeysClient("sunionstore", 8, 12, 12); + + // This command should block because 12 needs to be expedited. + simulateBlockedWrite(c); + + expectReadKey(it, 10); // was already in queue + expectReadKey(it, 12); // expedited + expectReadKey(it, 11); // releases 12 (unblocking the command) + + // Now that key 12 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 13); // queued when we read 11 + + expectReadReplication(it, c); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 14, LAST_ITEM); + expectReadComplete(it); +} + + +/* Tests the replication of a write command that creates a new key and depends on a + * future key which is duplicated in the command. */ +TEST_F(BgIterationTest, writeWith3Keys_eventual_repeatedKey_1newKey1RepeatedFuture) { + simpleDelItem(3); // Deleting key 3 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + // At this point, keys 1 & 2 are in queue. + + // Write command that has 3 keys. 1 new key and 1 repeated key in the future. + c = getWrite3KeysClient("sunionstore", 3, 5, 5); + + // This command should block on key 5. + // This adds key 5 in the queue because: + // - the command depends on key 5 which hasn't been processed yet + // - the command creates a new key (key 3). + simulateBlockedWrite(c); + + expectReadKeySequence(it, 1, 2); // These were already in queue + + // Key 5 is processed out of order since the write depends on it + expectReadKey(it, 5); + + // Keys 4 is the next in queue, and releases the expedited key 5 + expectReadKey(it, 4); + + // Now that key 4 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Key 6 & 7 are next, having been queued after reading key 4. + expectReadKeySequence(it, 6, 7); + + // The replication of the write command was enqueued after 5 was released (unblocking the command) + expectReadReplication(it, c); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 8, LAST_ITEM); + expectReadComplete(it); +} + + +/* A command modifying an in-progress key, but dependent on a future (repeated) key. */ +TEST_F(BgIterationTest, writeWith3Keys_start_repeatedKey_1DependantPast1RepeatedFuture) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + // At this point, keys 1 & 2 are in queue. + + // Write command that has 3 keys. 0 is in progress. 4 is still future. + // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a + // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). + c = getWriteMultiKeysClient("blpop", 0, {4, 4, 0}); + + // This command should block on 2 keys (0 and 4), since: + // - key 0 is in use by the iterator (still in the queue since it has not been processed by the consumer yet) + // - key 4 is in the future + // This adds key 4 in the queue since the command depends on it and it hasn't been processed yet. + simulateBlockedWrite(c, 2); + + // Key 4 is processed out of order since the write depends on it. + // Key 4 is processed before key 1 even though key 1 was already in the queue + // because key 4 was enqueued as a priority item with a no-replication iterator. + // Reading key 4 will release key 0 - releasing that lock on the command + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))).Times(1); + expectReadKey(it, 4); // This unblocks key 0 + + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(4)))).Times(1); + expectReadKey(it, 1); // this was already in queue (releases key 4) + + // Now that keys 4 and 0 were processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 2, 3); + + // 4 is skipped because it was already expedited + + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +/* Test that creates a new key, repeating the future key in the command. */ +TEST_F(BgIterationTest, writeWith3Keys_repeatedKey_1repeatedNewKey) { + simpleDelItem(6); // Deleting key 6 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + // Getting started + expectReadKeySequence(it, 0, 3); + // Now, 0,1,2 are in the past. 3 is being processed, and 4 is in queue. + + // Write command that has 3 keys. 1 new repeated key and 1 key in the past. + // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a + // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). + c = getWriteMultiKeysClient("blpop", 6, {0, 6, 0}); + + // The write command is not blocked since key 0 & 6 are not in use, and no consistency requirements + simulateUnblockedWriteWithModification(c); + + // Keys 2, 3 are next in the queue (it was put in the queue at the same time as key 1). + expectReadKeySequence(it, 4, 5); + + // There are no consistency requirements - so the new key should just be iterated. + // Key 6 is now in the dict with the value of key 0. + expectReadKey(it, 6, keyStr(0)); + + // Processing the rest of the dict entries. + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +/* In this test, the COPY command is copying from one DB to another. We will create the + * same key in both DBs. We make sure that the proper key is created via replication, and + * the proper key is created by iteration. */ +TEST_F(BgIterationTest, copyHandlesProperDb_eventual) { + // NOTE: Adding E0 to dict 1. Now there is a E0 in both dict 0 and dict 1. + addKeyToDb(1, "H0", "H0"); + + // The test: + // We will simulate (with DB0 selected): COPY B1 H0 DB 1 REPLACE + // This will overwrite DB1:H0 that was created above. + // Since DB0:B1 is already in queue, we need to expedite the target (DB1:H0) as well + // After DB1:H0 is "overwritten", it should be marked early iterate. + // We expect DB0:H0 to NOT be marked early iterate, and should get processed normally. + + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); // B0 + // At this point, keys 1(B1) & 2(B2) are in queue. + + // COPY B1 H0 DB 1 REPLACE + c = static_cast(zcalloc(sizeof(client))); + c->cmd = lookupCommandByCString("copy"); + c->db = server.db[0]; + c->argc = 6; + c->argv = static_cast(zcalloc(sizeof(robj *) * c->argc)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + c->argv[1] = createStringObjectFromCString("B1"); + c->argv[2] = createStringObjectFromCString("H0"); + c->argv[3] = createStringObjectFromCString("DB"); + c->argv[4] = createStringObjectFromCString("1"); + c->argv[5] = createStringObjectFromCString("REPLACE"); + + // This should block on 2 keys. DB0:B1 is in queue. DB1:H0 needs to be expedited. + simulateBlockedWrite(c, 2); + + // These 2 keys were already in queue + expectReadKey(it, 1); // DB0:B1 + expectReadKey(it, 2); // DB0:B2 + + // And now we expect to see the expedited DB1:H0 + expectReadDbKeyValue(it, 1, "H0", "H0"); + + expectReadKey(it, 3); // releases DB1:E0 + + // Now key 4 is still in the queue + + simulateUnblockedWrite(c); // We shouldn't be blocked this time + + // Now, we'll simulate the actual activity of the COPY. DB1:H0 will be deleted in order to + // be overwritten. + sds sdskey = sdsnew("H0"); + bgIteration_keyDelete(1, sdskey); // bgIteration would be signaled about the deletion + sdsfree(sdskey); + // At this point the key would actually be deleted and recreated by COPY (no need to actually do this) + + // And finally the replication (this should queue replication) + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + + // Now let's read everything... + expectReadKey(it, 4); // (this was previously in queue) + expectReadReplication(it, c); // This is the new replication (creating DB1:H0) + + // The rest should be normal. We shouldn't see DB1:E0 as it was recreated by replication + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +// Check that termination with replication in queue works OK. +TEST_F(BgIterationTest, terminateWithReplication) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + expectReadKey(it, 1); // makes sure we are done with key 0 (don't want to block) + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Should replicate + + bgIteratorTerminate(it); + + bgIteratorItem *item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// SWAPDB tests - Get ready for the mind-bend... + +/* In the non-consistent iterator (without replication), items are identified with the DBID at + * the time they are placed into the queue. The SWAPDB event signals the change to the + * iterating process - and this is properly sequenced with the DB info for each item. */ +TEST_F(BgIterationTest, swapDB) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + simulateSwapDB(0, 1); // The swap event will be queued after item 2 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 0u); + + expectReadKey(it, 1); // These were already in queue, + expectReadKey(it, 2); // ... and the iteration client hasn't seen the swap yet + + expectReadSwapDB(it, 0, 1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 0u); // still processing it... + + // Since we've seen the swap event, items now have the new DBID + + expectReadDbKeyValue(it, 1, keyStr(3), keyStr(3)); // item 3 should show in DB1 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 1u); // done processing the swapdb + + // Keys 4 is in the queue - let's swap back! + simulateSwapDB(1, 0); // The swap event will be queued after item 4 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); // 2nd one queued + EXPECT_EQ(status.swapdb_processed, 1u); + + expectReadDbKeyValue(it, 1, keyStr(4), keyStr(4)); // item 4 should still show in DB1 + + expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); + EXPECT_EQ(status.swapdb_processed, 1u); // still processing it... + + // Since we've seen the second swap, items should now show with their original DB + + expectReadKey(it, 5); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); + EXPECT_EQ(status.swapdb_processed, 2u); // done processing all swaps + + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +/* In the consistent iterator (without replication) all items are presented to the iterating + * process using the DBID at the time of the iterator creation. No changes are evident. + * Swap events are not presented to the iteration client. */ +TEST_F(BgIterationTest, swapDB_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + simulateSwapDB(0, 1); // The swap occurs, but the iterator sees no change + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + + // Heck, let's go crazy with those swaps... + for (int itemNum = 4; itemNum <= LAST_ITEM; itemNum++) { + simulateSwapDB(0, 1); + expectReadKey(it, itemNum); + } + + expectReadComplete(it); +} + + +/* In the non-consistent iterator WITH replication, items are identified with the DBID at the + * time they are placed into the queue. The SWAPDB event signals the change to the iterating + * process - and this is properly sequenced with the DB info for each item. */ +TEST_F(BgIterationTest, swapDB_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + simulateSwapDB(0, 1); // The swap event will be queued after item 2 + + expectReadKey(it, 1); // These were already in queue, + expectReadKey(it, 2); // ... and the iteration client hasn't seen the swap yet + + expectReadSwapDB(it, 0, 1); // We should see a SWAPDB event + bgIteratorItem *item = bgIteratorRead(it); // followed by the associated replication + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + bgIteration_feedIterators(); + + // Since we've seen the swap event, items now have the new DBID + expectReadDbKeyValue(it, 1, keyStr(3), keyStr(3)); // item 3 is now in DB1 + + // Key 4 is in the queue - let's swap back! + simulateSwapDB(1, 0); // The swap event will be queued after item 4 + + expectReadDbKeyValue(it, 1, keyStr(4), keyStr(4)); // Still appears as DB1 + + expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap + item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + bgIteration_feedIterators(); + + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + +// There is no test for swapDB_YesReplication_YesConsistent because this configuration is not +// permitted with multiple DBs (not permitted with swaps). + + +// FLUSHDB & FLUSHALL Tests + +TEST_F(BgIterationTest, flushDB_flushAll) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + expectReadKey(it, 1); + + // key 1 is active in the iterator - this key won't be deallocated because of the refcount. + // keys 2 is in queue - but will be returned to Valkey before the flush. It is yanked + // back by Valkey and will not be seen by iterator. + simulateFlushDB(-1, 1); + + bgIteratorItem *item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + +TEST_F(BgIterationTest, flushDB_flushOne) { + bgIterator *it1 = bgIteratorCreateFullScanIter("iter1", BGITERATOR_CONSISTENCY_NONE, NULL, + iteratorCleanupFn, PRIVDATA); + bgIterator *it2 = bgIteratorCreateFullScanIter("iter2", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + // The test flushes DB0. This is half the data. Since <= half, a non-consistent iterator is + // allowed to proceed. But the consistent iterator will be terminated. + + expectReadKey(it1, 0); + expectReadKey(it2, 0); + expectReadKey(it1, 1); + expectReadKey(it2, 1); + + // key 1 is active in the iterator - this key won't be deallocated because of the refcount. + // keys 2 is in queue - but will be returned to Valkey before the flush. These are yanked + // back by Valkey and will not be seen by iterator. + simulateFlushDB(0, 1); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 0u); + + // Testing the non-consistent one continues... + // Everything already on the iterator queue should be preserved (deleted from the DB). + // Keys 2 is already queued (and preserved). + expectReadKey(it1, 2); + + // Read the flushdb item on iterator 1. + bgIteratorItem *item = bgIteratorRead(it1); + ASSERT_EQ(item->type, BGITERATOR_ITEM_FLUSHDB); + ASSERT_EQ(item->dbid, 0); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 0u); // still processing it + + // And iterator 1 keeps processing with the 2nd DB + expectReadKey(it1, ITEMS_PER_DB); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 1u); // done with all flushdb's + + expectReadKeySequence(it1, ITEMS_PER_DB + 1, LAST_ITEM); + expectReadComplete(it1); + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); + + // But the consistent iterator should be terminated + item = bgIteratorRead(it2); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + bgIteratorClose(it2); // background thread completes the termination + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 2); + EXPECT_TRUE(cleanupTerminated); +} + + +/* A multi with one future and one past key must expedite and replicate. */ +TEST_F(BgIterationTest, multiTwoKeysFirstFuture) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Causes keys 1 & 2 to be queued (same bucket) + expectReadKey(it, 1); // Causes key 0 to be released + + // Now, B0(0) is in the past. H0(5) is in the future. R0(11) [in DB1] is also future. + + /* For a non-consistent iteration, with replication... + * Normally, H0 (future) wouldn't need to expedite - we'd just modify it in place (without + * replication and iterate on it later. But, in this case, since it's wrapped in a multi, with + * B0 (past) - we need to expedite H0 so that the multi can all be handled in the same way. + * Key R0(11) [DB1] just makes thing a little trickier. */ + c = getMultiClient("SET B0 xxx; SET H0 xxx; SELECT 1; SET R0 xxx"); + + // The EXEC should block on 2 keys, because H0(5) & R0(11) should be expedited + simulateBlockedWrite(c, 2); + + expectReadKey(it, 2); // (was already in queue) + + // Note - it would be logically OK if these 2 were reversed, but this is how the current algorithm works. + expectReadKey(it, 5); // Key 5 (H0) was expedited + expectReadKey(it, 11); // Key 11 (R0) was expedited + + // We don't need to actually simulate the multi. Just checking that the keys were expedited. + + // and clean up the rest... + expectReadKeySequence(it, 3, 4); + // Key 5 was already read above (expedited) + expectReadKeySequence(it, 6, 10); + // Key 11 was already read above (expedited) + expectReadKeySequence(it, 12, LAST_ITEM); + expectReadComplete(it); +} + +// Multi blocking on future items. Consistent. +TEST_F(BgIterationTest, multiBlocksOnFutureKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + // Since there's no replication, an expedited key will be moved to the front of the queue. + // Let's fake a modification to key 6 (H1) + // Dummy up a MULTI... + c = getMultiClient("SET H1 xxx"); + + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c); + + // H1 (key 6) will be expedited to the front of the queue (because no replication) + expectReadKey(it, 6); + + // Now that we've read key 6, key 0 (B0) is passed and should not block + freeTestClient(c); + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + + // and clean up the rest... + expectReadKeySequence(it, 1, 5); + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +// Scenario. We have a multi that doesn't need to be replicated because all of the keys exist +// but are all future keys. Note that missing keys are considered already-iterated, so all +// must exist for this test. Then: +// - we delete a key +// - we re-create the deleted (future) key - normally this would be replicated +// - we access another (future) key - we don't expect to get blocked! +TEST_F(BgIterationTest, multiNotReplicatedButDelRecreateAccess) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + c = getMultiClient("DEL H1; SET H1 xxx; SET H2 yyy"); + // Now let's process the multi. Since H1 & H2 are both future (existing) items, we shouldn't + // block or replicate. + simulateUnblockedWrite(c); // the EXEC + + // Simulate the DEL H1 + server.in_exec = 1; // Simulate actual execution of the MULTI/EXEC + + advanceMultiClientToCommand(c, 0); // DEL H1 + EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + simpleDelItem(6); // H1 + sds delKey = sdsnew(keyStr(6)); + bgIteration_keyDelete(0, delKey); + sdsfree(delKey); + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); // shouldn't replicate + + // Simulate SET H1 - the key doesn't exist, and would normally replicate and mark early iterate, + // but this is in a transaction, and we are not replicating this transaction. + advanceMultiClientToCommand(c, 1); // SET H1 xxx + simulateUnblockedWriteWithModification(c); + + // Now write to another existing future key - this should work if we weren't confused by the DEL + advanceMultiClientToCommand(c, 2); // SET H2 yyy + simulateUnblockedWriteWithModification(c); + server.in_exec = 0; + + // Now we can continue iterating, and we should pick up keys 1... (and no replication!) + expectReadKeySequence(it, 1, 5); + expectReadKey(it, 6, "xxx"); + expectReadKey(it, 7, "yyy"); + expectReadKeySequence(it, 8, LAST_ITEM); + expectReadComplete(it); +} + + +// For this test, B0 is added into DB1 - so it exists in both DB 0 and 1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track SELECT properly. +TEST_F(BgIterationTest, multiHandlesSelectProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // These cases should NOT block... (they access B0 in DB0) + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SELECT 0; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx; SELECT 0"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateBlockedWrite(c); + + expectAnythingCleanup(it); +} + +// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track select properly - WHEN WE HAVE NO +// PERMISSION TO EXECUTE SELECT! +TEST_F(BgIterationTest, multiHandlesSelectNoPermissionProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // No permission for any commands (specifically select/swapdb) + EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_, _, _, _, _, _)) + .Times(AtLeast(1)) + .WillRepeatedly(Return(ACL_DENIED_CMD)); + + // These cases should NOT block... (they access B0 in DB0) + // The SELECTs below are inconsequential - with/without select, same result. + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SELECT 0; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block IF SELECT IS WORKING... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; // already starting on DB1 + simulateBlockedWrite(c); // will block, no select + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx; SELECT 0"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + c = getMultiClient("SELECT 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + + expectAnythingCleanup(it); +} + +// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track SWAPDB properly. +TEST_F(BgIterationTest, multiHandlesSwapdbProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // These cases should NOT block... (they access B0 in DB0) + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 0 1; SELECT 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SET B0 xxx; SWAPDB 0 1"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SELECT 0; SET B0 xxx; SWAPDB 0 1"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateBlockedWrite(c); + + expectAnythingCleanup(it); +} + +// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track select properly - WHEN WE HAVE NO +// PERMISSION TO EXECUTE SWAPDB! +TEST_F(BgIterationTest, multiHandlesSwapdbNoPermissionProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // No permission for any commands (specifically select/swapdb) + EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_, _, _, _, _, _)) + .Times(AtLeast(1)) + .WillRepeatedly(Return(ACL_DENIED_CMD)); + + // These cases should NOT block... (they access B0 in DB0) + // The SELECTs & SWAPDBs below are inconsequential - with/without select/swapdb, same result. + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 0 1; SELECT 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block IF SELECT/SWAPDB IS WORKING... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb fails) + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SELECT 0; SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) + + expectAnythingCleanup(it); +} + + +static void *pthreadWait200msAndReadTwoKeys(void *arg) { + bgIterator *it = static_cast(arg); + + usleep(200000); + bgIteratorRead(it); + bgIteratorRead(it); + return nullptr; +} + +static void asyncWait200msAndReadTwoKeys(bgIterator *it) { + int rc; + pthread_attr_t attr; + pthread_t thread; + + rc = pthread_attr_init(&attr); + assert(rc == 0); + rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + assert(rc == 0); + + rc = pthread_create(&thread, &attr, pthreadWait200msAndReadTwoKeys, it); + assert(rc == 0); + + rc = pthread_attr_destroy(&attr); + assert(rc == 0); +} + +TEST_F(BgIterationTest, testLuaWithUndeclaredKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_START, NULL, + iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // If we fake a modification to key 3, we won't know if it's handled out of order. + // So we fake a modification to key 4 + c = getWriteClient(4, "xxx"); + c->flag.script = 1; + + // Now for a LUA script, we have already blocked (on the eval/evalsha) for any declared keys + // But here, we're about to modify an undeclared key. We can't actually block in the middle + // of the LUA script. So this will behave as unblocked, but incur a synchronous wait. + + // Key 4 will get expedited when we simulate the write. After reading key 4, key 1 will need + // to be read to return key 4 to Valkey, unblocking the synchronous wait. + asyncWait200msAndReadTwoKeys(it); + + monotime blockTimer; + elapsedStart(&blockTimer); + simulateUnblockedWrite(c); // Not blocked, but delays internally + // Must have delayed at least 150ms (some time may have passed before timer start) + EXPECT_GT(elapsedMs(blockTimer), 150u); + + // Continue... + expectReadKeySequence(it, 2, 3); + // 4 has already been processed + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +// Make sure that replication received while processing the last key is sent +TEST_F(BgIterationTest, replicationReceivedWhileProcessingLastKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKeySequence(it, 0, LAST_ITEM); + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + + expectReadReplication(it, c); // Replication happened while processing the last item, should be here. + + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing + + expectReadComplete(it); // We expect to see the completion instead +} + +TEST_F(BgIterationTest, repldoneFunctionCalled) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, + iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); + expectReadKeySequence(it, 0, LAST_ITEM); + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + + // Since in testing, we are only feeding one item at a time, and synchronously, we won't call + // the repldone function until after we release the last item. + EXPECT_EQ(replDoneConfirmed, 0); + expectReadReplication(it, c); // Replication happened while processing the last item, should be here. + EXPECT_EQ(replDoneConfirmed, 1); // Last key released, now done feeding replication + + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing + + expectReadComplete(it); // We expect to see the completion instead +} + +TEST_F(BgIterationTest, repldoneFunctionCalledTwice) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, + iteratorRepldoneFnNotBeingReadyInitially, iteratorCleanupFn, PRIVDATA); + expectReadKeySequence(it, 0, LAST_ITEM); + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + + // Won't signal replDone until we've released the final item (which happens when reading the replication) + EXPECT_EQ(replDoneRejected, 0); + EXPECT_EQ(replDoneConfirmed, 0); + expectReadReplication(it, c); // Releases the final item + EXPECT_EQ(replDoneRejected, 1); // replDone called once (and rejected by client) + EXPECT_EQ(replDoneConfirmed, 0); + simulateUnblockedWriteWithModification(c); // This will replicate (because replDone returned false) + + expectReadReplication(it, c); // ReplDone gets called again (and accepted this time) + EXPECT_EQ(replDoneConfirmed, 1); + + simulateUnblockedWriteWithModification(c); // This won't replicate because replication is done + + expectReadComplete(it); // We expect to see the completion instead +} + +// Check that the memory reported for replication is correct +TEST_F(BgIterationTest, checkReplicationByteCount) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, + iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); + c = getWriteClient(0, "xxx"); + size_t expectedReplicationSize = sizeof(bgIteratorItem); + for (int i = 0; i < c->argc; i++) { + expectedReplicationSize += objectComputeSize(NULL, c->argv[i], 0, 0); + } + + expectReadKey(it, 0); + expectReadKey(it, 1); // Releases and unblocks 0 + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); + + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); + simulateUnblockedWriteWithModification(c); // and write again (2nd replication) + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); + + expectReadKey(it, 2); // Keys 0..2 all in same bucket + + expectReadReplication(it, c); + // After reading the 1st replication, it hasn't been returned yet (it's the active item) + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); + expectReadReplication(it, c); + // After reading the 2nd replication, the 1st has been returned + EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); + + expectReadKey(it, 3); + // Now all replication has been returned/freed + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); + + expectReadKeySequence(it, 4, LAST_ITEM); + expectReadComplete(it); +} + +// Test that for an arbitrary write command having no keys, replication should occur. +TEST_F(BgIterationTest, checkNoKeysWriteIsReplicated) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", BGITERATOR_CONSISTENCY_EVENTUAL, NULL, + iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + + c = getNoKeysWriteClient(); + EXPECT_CALL(mock, blockClientInUseOnKeys(c, _, _)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + + expectReadKeySequence(it, 1, 2); // These were already in queue + + expectReadReplication(it, c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); +} diff --git a/src/unit/wrappers.h b/src/unit/wrappers.h index 0f4fb388b98..5bfc117fab2 100644 --- a/src/unit/wrappers.h +++ b/src/unit/wrappers.h @@ -61,6 +61,15 @@ extern "C" { long long __wrap_aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, aeTimeProc *proc, void *clientData, aeEventFinalizerProc *finalizerProc); int __wrap_processPendingCommandAndInputBuffer(client *c); void __wrap_beforeNextClient(client *c); + +void __wrap_blockClientInUseOnKeys(client *c, int nKeys, robj **keys); +void __wrap_unblockClientsInUseOnKey(robj *key); + +int __wrap_ACLCheckAllUserCommandPerm(user *u, struct serverCommand *cmd, robj **argv, int argc, int dbid, int *idxptr); + +size_t __wrap_hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); +bool __wrap_hashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor); + #undef protected #undef _Bool #undef typename