diff --git a/src/bgiteration.c b/src/bgiteration.c index bb514daed1e..8d635dbff0f 100644 --- a/src/bgiteration.c +++ b/src/bgiteration.c @@ -13,9 +13,6 @@ #include "mutexqueue.h" #include "server.h" -// Just for the moment, until https://github.com/valkey-io/valkey/issues/3450 is resolved -// clang-format off - int getFlushCommandFlags(client *c, int *flags); // in db.c uint64_t dictObjHash(const void *key); // in server.c int dictObjKeyCompare(const void *key1, const void *key2); // in server.c @@ -23,33 +20,22 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid); robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds key, long long expire); // in object.c -// Non-public hashtable/kvstore functions... -hashtableIterator *kvstoreInternalIteratorGetCurrentHashtableIterator(kvstoreIterator *kvs_it); - - -static bool receiveItemsBackFromOneIterator(bgIterator *it); // in bgiteration.c - forward declaration - -// ################ TEMP COMPILE HACKS ########################### -// Issue found. server.db has changed from an array of db to an array of pointers to db (change all refs to server.db) -// Issue: iterators (kvstore/hashtable) are not safe across event loop invocations. Hashtable (kvstore?) needs to track and maintain safe iterators. +static bool receiveItemsBackFromOneIterator(bgIterator *it); -// Don't think there's any current need for this... +// Future extendability static bool ignoreKeyForSave(const_sds key) { UNUSED(key); return false; } -//------- END OF COMPILE HACKS ------------------- - - // Returns true if the cmd is a script command that may replicate. static bool isScriptCallWriteCmd(struct serverCommand *cmd) { return ((cmd->proc == fcallCommand) || (cmd->proc == evalCommand) || (cmd->proc == evalShaCommand)); } -// The PFCOUNT command (which does NOT have the CMD_WRITE flag) modifies the underlying string and -// is replicated as a write. So it needs to be detected and handled specially. +/* The PFCOUNT command (which does NOT have the CMD_WRITE flag) modifies the underlying string and + * is replicated as a write. So it needs to be detected and handled specially. */ static bool isWriteCmd(struct serverCommand *cmd) { return ((cmd->flags & CMD_WRITE) || (cmd->proc == pfcountCommand) || (cmd->proc == execCommand) || (isScriptCallWriteCmd(cmd))); } @@ -60,16 +46,19 @@ static bool isDeleteCmd(struct serverCommand *cmd) { } +/* This utility utilizes the main thread and background threads for processing. The API is split, + * with some of the functions intended for the main thread and others intended for the background + * clients. This sanity check ensures that we maintain thread safety, calling the API as intended. */ static bool onValkeyMainThread(void) { - // Modules interact with the main thread using a mutex. If a module owns the mutex, consider - // that equivalent to being on the main thread. + /* Modules interact with the main thread using a mutex. If a module owns the mutex, consider + * that equivalent to being on the main thread. */ bool inModule = (atomic_load_explicit(&server.module_gil_acquired, memory_order_relaxed) == 0); return (inModule || pthread_equal(server.main_thread_id, pthread_self()) != 0); } + /* Parse a parameters robj, extracting a valid DBID. - * Returns FALSE if DBID isn't valid. - */ + * Returns FALSE if DBID isn't valid. */ static bool getDbIdFromRobj(robj *obj, int *db_id) { long long value; if (getLongLongFromObject(obj, &value) != C_OK) return false; @@ -79,22 +68,20 @@ static bool getDbIdFromRobj(robj *obj, int *db_id) { } /* Parse the parameters of the COPY command, extracting the target DBID. - * Returns FALSE if the command would not run. - */ + * Returns FALSE if the command would not run. */ static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid, int *target_dbid) { const int COPY_COMMAND_OPTIONAL_ARG_START_INDEX = 3; *target_dbid = selected_dbid; - for (int i = COPY_COMMAND_OPTIONAL_ARG_START_INDEX; i < argc; i++) { + for (int i = COPY_COMMAND_OPTIONAL_ARG_START_INDEX; i < argc; i++) { if (!strcasecmp((char *)objectGetVal(argv[i]), "replace")) { continue; } else if (!strcasecmp((char *)objectGetVal(argv[i]), "db") && (i + 1 < argc)) { /* Note the parsing here needs to perfectly match what we have in Valkey OSS for COPY. * The following command is considered OK by Valkey 8.1 so we can't return here, but * must continue to parse till the last db which is the one that's effectively used. - * COPY key1 key2 db 1 db 2 db 3 // (This will use db 3) - */ + * COPY key1 key2 db 1 db 2 db 3 (This will use db 3) */ if (!getDbIdFromRobj(argv[i + 1], target_dbid)) { return false; // parse failure } @@ -108,8 +95,7 @@ static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid /* Get parameters for the SWAPDB command. * The optional permission_client allows for checking of a client's permission for swapdb. - * Returns true if command would be executed. - */ + * Returns true if command would be executed. */ static bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, int *id1_p, int *id2_p) { static struct serverCommand *swapdb_cmd = NULL; @@ -132,7 +118,7 @@ static bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, if (getLongLongFromObject(argv[2], &dbid2) != C_OK) return false; if (dbid1 < 0 || dbid1 >= server.dbnum) return false; if (dbid2 < 0 || dbid2 >= server.dbnum) return false; - if (dbid1 == dbid2) return false; // Valid, but doesn't do anything + if (dbid1 == dbid2) return false; // Valid, but doesn't do anything *id1_p = (int)dbid1; *id2_p = (int)dbid2; @@ -141,8 +127,7 @@ static bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, /* Get parameters for the SELECT command. * The optional permission_client allows for checking of a client's permission for select. - * Returns true if command would be executed. - */ + * Returns true if command would be executed. */ static bool getParamsForSelect(int argc, robj **argv, client *permission_client, int *dbid_p) { static struct serverCommand *select_cmd = NULL; @@ -177,13 +162,13 @@ static void resumeReshahForKvsHashtable(kvstore *kvs, int didx) { if (ht != NULL) hashtableResumeRehashing(ht); } + /* DictType for SDS->ptr. The SDS is referenced, no destructor. */ static dictType sdsrefToPtrDictType = { .entryGetKey = dictEntryGetKey, .hashFunction = dictSdsHash, .keyCompare = dictSdsKeyCompare, - .entryDestructor = zfree -}; + .entryDestructor = zfree}; /* Wrap decrRefCount() so that it can be used as a callback requiring void. */ @@ -204,20 +189,27 @@ static sds createSdsFromClientArgv(int argc, robj **argv) { } -//########################################################################### +// ########################################################################### /* bgIteration internal (compile time) configuration values */ enum { - BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE = 16384, // Prevent initial rehashing - BGITER_MAX_CLONE_ITEM_BYTES = 512, // Max size item to clone - BGITER_MAX_CLONE_POOL_BYTES = (1 * 1024 * 1024), // Total limit for all cloned items - BGITER_QUEUE_INCREASE_INCR = 100, // Step size when increasing queue target - BGITER_CYCLE_DELAY_MS = 2, // Delay between calls on bgIteration timer - BGITER_CYCLE_BUDGET_MS = 1, // Normal time limit for timer processing - BGITER_CYCLE_BUDGET_MAX_MS = 10 // Maximum time limit when starvation seen + BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE = 16384, // Prevent initial rehashing + BGITER_MAX_CLONE_ITEM_BYTES = 512, // Max size item to clone + BGITER_MAX_CLONE_POOL_BYTES = (1 * 1024 * 1024), // Total limit for all cloned items + BGITER_QUEUE_INCREASE_INCR = 100, // Step size when increasing queue target + BGITER_CYCLE_DELAY_MS = 2, // Delay between calls on bgIteration timer + BGITER_CYCLE_BUDGET_MS = 1, // Normal time limit for timer processing + BGITER_CYCLE_BUDGET_MAX_MS = 10 // Maximum time limit when starvation seen }; +// dbEntry metadata +typedef struct { + uint32_t iterator_epoch; // iterator epoch of last modification +} bgIterationEntryMetadata; +static_assert(sizeof(bgIterationEntryMetadata) == BGITERATION_ENTRY_METADATA_SIZE, ""); + + // These can be tweaked by unit tests static int bgiter_max_clone_item_bytes = BGITER_MAX_CLONE_ITEM_BYTES; static int bgiter_max_clone_pool_bytes = BGITER_MAX_CLONE_POOL_BYTES; @@ -237,6 +229,22 @@ typedef enum { BGITERATION_TYPE_CLUSTERSLOT } bgIterationType; + +/* Flag indicates that a consistent iteration is required. This is used to create a point-in-time + * iteration. The iteration client will see all keys AS THEY EXISTED at the time when the iterator + * was created. + * Note: The DBID provided with the DICTENTRY events is the original DBID (at the time of iteration + * start). SWAPDB events are NOT provided during a consistent iteration. */ +#define BGITERATOR_FLAG_CONSISTENT (1 << 0) + +/* Flag indicating that the replication stream for keys which have already been processed should be + * forwarded to the iteration client. Used for non-consistent iteration to track changes + * to keys already processed. By tracking changes, this allows an non-consistent iteration client + * to achieve a consistent view at the END of the iteration. + * NOTE: Replication events will be provided ordered and synchronized with any SWAPDB events. */ +#define BGITERATOR_FLAG_REPLICATION (1 << 1) + + /* Extensions to bgIteratorItemType. These enumerations are used internally, and are not part of * the published interface. These allow for extensibility in the internal information-passing * between the Valkey main thread and the iteration client thread. */ @@ -253,19 +261,13 @@ typedef struct { bgIterator *iter; } bgIteratorItemExtClose; -/* Used for dictEntryPtrDictType. This dict grows and shrinks constantly during the iteration. - * There is no point to rehash it all the time. */ -static int neverShrink(size_t moreMem, double usedRatio) { - UNUSED(moreMem); - return (usedRatio > 0.5); // Return true only if expanding -} -// A dictionary with a pointer (itself) as a key (the address pointed to is NOT referenced). -// Nothing is duplicated, this is a very fast dictionary, but potentially unsafe if the original -// items are deleted or moved. -// WARNING: Can't have active defrag running! It might reallocate memory blocks, swapping their -// pointer values! A check must be made in active defrag to ensure that no iteration is -// active. +/* A dictionary with a pointer (itself) as a key (the address pointed to is NOT referenced). + * Nothing is duplicated, this is a very fast dictionary, but potentially unsafe if the original + * items are deleted or moved. + * WARNING: This needs to maintain safety with things that may move the object. + * + In db.c, if the object is reallocatd, bgIteration_updateDbEntryPtr() is called. + * + In defrag.c, we don't defrag if there are multiple references (and we incr the refcount). */ // Thomas Wang's 64-bit mix static uint64_t pointerHash(const void *key) { @@ -284,81 +286,102 @@ static int pointerCompare(const void *key1, const void *key2) { return key1 == key2; } +// This dict grows and shrinks constantly during the iteration. Avoid constant rehashing. +static int neverShrink(size_t moreMem, double usedRatio) { + UNUSED(moreMem); + return (usedRatio > 0.5); // Return true only if expanding +} + static dictType dictEntryPtrDictType = { .entryGetKey = dictEntryGetKey, .hashFunction = pointerHash, .keyCompare = pointerCompare, .resizeAllowed = neverShrink, - .entryDestructor = zfree -}; + .entryDestructor = zfree}; -// A TEMP set of robj's (of type sds). This is only for temporary sets as the robj's are not -// ref-counted at insertion/deletion. -static hashtableType tempKeysetHashtableType = { - .hashFunction = dictObjHash, - .keyCompare = dictObjKeyCompare -}; - -typedef struct genericIterator genericIterator; -typedef void (*iteratorReleaseFunc) (genericIterator *genIt); -typedef fifo * (*iteratorGetEntriesFunc) (genericIterator *genIt, int *orig_dbid, int *cur_dbid); -typedef void (*iteratorSwapDbFunc) (genericIterator *genIt, int db1, int db2); -typedef void (*iteratorFlushDbFunc) (genericIterator *genIt, int cur_dbid); -typedef bool (*iteratorHasPassedItemFunc) (genericIterator *genIt, const_sds key, int cur_dbid); -typedef int (*iteratorOriginalDbFunc) (genericIterator *genIt, int cur_dbid); -typedef bool (*iteratorIsKeyInScopeFunc) (genericIterator *genIt, const_sds key); +static hashtableType dbEntryPtrHashtableType = { + .hashFunction = pointerHash, + .keyCompare = pointerCompare, + .resizeAllowed = neverShrink}; -// Function pointers supporting polymorphic iterator implementation -struct genericIterator { - iteratorReleaseFunc release; - iteratorGetEntriesFunc getEntries; - iteratorSwapDbFunc swapDb; - iteratorFlushDbFunc flushDb; - iteratorHasPassedItemFunc hasPassedItem; - iteratorOriginalDbFunc originalDb; - iteratorIsKeyInScopeFunc isKeyInScope; -}; +// A free list for bgIteratorItem's - avoids churning zmalloc calls typedef struct itemListNode { struct itemListNode *next; } itemListNode; +static const int FREE_ITEM_MAX = 500; static itemListNode *freeItemStackHead = NULL; - -static void itemFreeList_returnItemBackToFreeList(bgIteratorItem* item) { - itemListNode *freedNode = (itemListNode*)item; - freedNode->next = freeItemStackHead; - freeItemStackHead = freedNode; +static int freeItemStackCount = 0; + +static void itemFreeList_returnItemBackToFreeList(bgIteratorItem *item) { + itemListNode *freedNode = (itemListNode *)item; + if (freeItemStackCount < FREE_ITEM_MAX) { + freedNode->next = freeItemStackHead; + freeItemStackHead = freedNode; + freeItemStackCount++; + } else { + zfree(freedNode); + } } +// Pop a free node from the free list or allocate if none free static bgIteratorItem *itemFreeList_getElementOrAllocate(void) { - bgIteratorItem *item; - // Pop a free node from the free list or allocate if none free if (freeItemStackHead) { - item = (bgIteratorItem*)freeItemStackHead; + item = (bgIteratorItem *)freeItemStackHead; freeItemStackHead = freeItemStackHead->next; - if (freeItemStackHead) { - valkey_prefetch(freeItemStackHead); - } - } - else { + freeItemStackCount--; + if (freeItemStackHead) valkey_prefetch(freeItemStackHead); + } else { + serverAssert(freeItemStackCount == 0); // Create new listNode and item item = zmalloc(sizeof(bgIteratorItem)); } return item; } - -static void itemFreeList_release(void) { - while(freeItemStackHead) { + +static void itemFreeList_release(void) { + while (freeItemStackHead) { itemListNode *node = freeItemStackHead; freeItemStackHead = node->next; - zfree((bgIteratorItem*)node); + freeItemStackCount--; + zfree(node); } + serverAssert(freeItemStackCount == 0); } -// This struct is used across threads. Unless otherwise noted, the fields are initialized at -// iterator creation (within the main thread) and are read-only by the client thread. + +/* A TEMPORARY set of robj's (of type sds). This is only for temporary sets as the robj's are not + * ref-counted at insertion/deletion. */ +static hashtableType tempKeysetHashtableType = { + .hashFunction = dictObjHash, + .keyCompare = dictObjKeyCompare}; + + +typedef struct genericIterator genericIterator; +typedef void (*iteratorReleaseFunc)(genericIterator *genIt); +typedef fifo *(*iteratorGetEntriesFunc)(genericIterator *genIt, int *orig_dbid, int *cur_dbid); +typedef void (*iteratorSwapDbFunc)(genericIterator *genIt, int db1, int db2); +typedef void (*iteratorFlushDbFunc)(genericIterator *genIt, int cur_dbid); +typedef bool (*iteratorHasPassedItemFunc)(genericIterator *genIt, const_sds key, int cur_dbid); +typedef int (*iteratorOriginalDbFunc)(genericIterator *genIt, int cur_dbid); +typedef bool (*iteratorIsKeyInScopeFunc)(genericIterator *genIt, const_sds key); + +// Function pointers supporting polymorphic iterator implementation +struct genericIterator { + iteratorReleaseFunc release; + iteratorGetEntriesFunc getEntries; + iteratorSwapDbFunc swapDb; + iteratorFlushDbFunc flushDb; + iteratorHasPassedItemFunc hasPassedItem; + iteratorOriginalDbFunc originalDb; + iteratorIsKeyInScopeFunc isKeyInScope; +}; + + +/* This struct is used across threads. Unless otherwise noted, the fields are initialized at + * iterator creation (within the main thread) and are read-only by the client thread. */ struct bgIterator { sds name; // Iterator name bgIteratorReplDoneFunc repldone; // Optional repldone function to be run on the main thread @@ -371,35 +394,33 @@ struct bgIterator { genericIterator *keyset_iter; // Low-level iterator (polymorphic) - dict *early_iterate_entries; // Used to keep track of what items have already been iterated - // over by out-of-order expedited process, ensuring a bgIterator - // does not try to reprocess items. - // Used only by main thread. - // dictEntry -> NULL + /* A set of dbEntry, compared by pointer. Used to track items which have already been iterated + * over by out-of-order expedited processing. Ensures a bgIterator does not try to reprocess + * items. Used only by main thread. */ + hashtable *early_iterate_entries; mutexQueue *items_for_iterator; // Created/Destroyed in main thread, used in both (threadsafe) mutexQueue *return_to_valkey; // Queue of items to be returned to the Valkey main thread (threadsafe) - + unsigned int item_count_target; // Used only by main thread - bgIteratorItem *volatile current_item; // current_item is normally only used in the iteration client. - // It's marked volatile here only to support snooping from the - // main thread when handling a FLUSHDB command. This prevents - // the compiler from generating code which might read the - // pointer multiple times (when it's coded to read only once). - // Also - this syntax is for a volatile POINTER to a - // non-volatile item. "volatile" at the beginning of the - // declaration, would indicate a (non-volatile) pointer to a - // volatile item. + /* current_item is normally only used in the iteration client. It's marked volatile here only + * to support snooping from the main thread when handling a FLUSHDB command. This prevents the + * compiler from generating code which might read the pointer multiple times (when it's coded to + * read only once). + * (A volatile POINTER to a non-volatile item.) */ + bgIteratorItem *volatile current_item; bool client_is_active; // Set to true when client performs 1st read - bool completed; // Set to true in main thread when last item from iteration has - // been queued to the client. No additional items will be - // enqueued to the client after this has been set. - volatile bool terminated; // Set to true in main thread when iteration is to be killed - // Set to true in iteration client when it decides to end early + /* Set to true in main thread when last item from iteration has been queued to the client. No + * additional items will be enqueued to the client after this has been set. */ + bool completed; + + /* Set to true in main thread when iteration is to be killed. + * Set to true in iteration client when it decides to end early. */ + volatile bool terminated; bool cur_cmd_may_replicate; // Used only in main thread during command processing @@ -416,10 +437,10 @@ struct bgIterator { unsigned long dbentry_clones_processed; // Updated by client thread monotime monotonic_start_time; // Time iteration started - volatile monotime monotonic_item_start_time; // The item start time is set in the iteration client. It is - // marked volatile as it can be read from the main thread by - // bgIteratorGetStatus. If 0, this indicates that the - // iteration client is waiting for an item to process. + /* The item start time is set in the iteration client. It is marked volatile as it can be read + * from the main thread by bgIteratorGetStatus. If 0, this indicates that the iteration client + * is waiting for an item to process. */ + volatile monotime monotonic_item_start_time; }; @@ -431,19 +452,19 @@ static dict *nameToIterator; // bgIterator->name -> bgIterator // Global, across all iterators, dict contains a dbEntry pointer -> ref count static dict *inUseEntries; // dbEntry -> ref count -// Key values in the current command which don't exist in the DB yet. Needed for determination of -// replication for NON-consistent iterations. +/* Key values in the current command which don't exist in the DB yet. Needed for determination of + * replication for NON-consistent iterations. */ static list *curCmdMissingKeys; // list of robj -// A counter of the total amount of memory used for buffered replication data. -// This amount is excluded when computing the need for evictions. +/* A counter of the total amount of memory used for buffered replication data. This amount is + * excluded when computing the need for evictions. */ static ssize_t bufferedReplicationBytes; // Memory pool to track current allocated memory of cloned items (in bytes) static ssize_t bgiteration_current_clone_memory_pool_size; -// Snapshot of the last queue size to seed the next queue -// We assume all bgIterators consume items at the same rate +/* Snapshot of the last queue size to seed the next queue. We assume all bgIterators consume items + * at roughly the same rate. */ static int last_item_count_target; // Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) @@ -453,44 +474,44 @@ static long long bgIterator_timeproc_id; static uint32_t bgIteration_epoch = 1; -// BgIteration debug captures BgIteration activity to a large sds buffer. When an iterator is -// completed, the entire buffer is written to a file in the current working directory. Note that -// memory must be available for the ENTIRE debug in memory. This isn't captured incrementally to -// a file as the file I/O is more likely to affect timing. -// Future implementation: the current design is most useful for a single iterator. When items are -// queued to an iterator, the iterator name is not recorded (to save space). -// Developer note: using a CONST value here allows the compiler to completely remove all of the -// debugging code at compile time. There is no run-time performance overhead when set to FALSE. -// This is essentially like an IFDEF, however, it's better as it forces the compiler to validate -// syntax. +/* BgIteration debug captures BgIteration activity to a large sds buffer. When an iterator is + * completed, the entire buffer is written to a file in the current working directory. Note that + * memory must be available for the ENTIRE debug in memory. This isn't captured incrementally to + * a file as the file I/O is more likely to affect timing. + * + * Future implementation: the current design is most useful for a single iterator. When items are + * queued to an iterator, the iterator name is not recorded (to save space). + * + * Developer note: using a CONST value here allows the compiler to completely remove all of the + * debugging code at compile time. There is no run-time performance overhead when set to FALSE. + * This is essentially like an IFDEF, however, it's better as it forces the compiler to validate + * syntax. */ static const bool BGITERATION_DEBUG = false; // DO NOT SUBMIT WITH THIS SYMBOL SET TO TRUE! static sds debugBuffer; - -//============================================================================================= -// Full Scan Iterator -//============================================================================================= -/* The full scan iterator performs the actual iteration over the Valkey keyset. The iterator is +/* ============================================================================================= + * Full Scan Iterator + * ============================================================================================= + * The full scan iterator performs the actual iteration over the Valkey keyset. The iterator is * only used from within the Valkey main thread. Iteration proceeds one DB at a time, based on * the DB ordering at the time of iterator creation. Each time the iterator returns items, all - * of the dictionary entries from a single hash bucket are returned. - */ + * of the dictionary entries from a single hash bucket are returned. */ struct fullScanIterator { genericIterator callbacks; // (must be first item) - // Array of mapping from original DB ID (at the time of iteration start) to that DB's - // current index. So, if the DB which was DB-0 is now at index 6, orig_to_cur_db[0]==6. + /* Array of mapping from original DB ID (at the time of iteration start) to that DB's current + * index. So, if the DB which was DB-0 is now at index 6, orig_to_cur_db[0]==6. */ int *orig_to_cur_db; - // The reverse of the above array. This maps a current DB index to its original index - // (at the time of iteration start). + /* The reverse of the above array. This maps a current DB index to its original index (at the + * time of iteration start). */ int *cur_to_orig_db; - // This is the DB we are currently iterating over. This is relative to the ORIGINAL - // DB ordering, at the time of iterator creation. Iteration proceeds from 0..N based on - // the original ordering. + /* This is the DB we are currently iterating over. This is relative to the ORIGINAL DB + * ordering, at the time of iterator creation. Iteration proceeds from 0..N based on the + * original ordering. */ int iter_db; // Iterator for the DB orig_to_cur_db[iter_db] @@ -568,7 +589,7 @@ static void fullScanIteratorSwapDb(genericIterator *genIt, int db1, int db2) { static void fullScanIteratorFlushDb(genericIterator *genIt, int cur_dbid) { struct fullScanIterator *it = (struct fullScanIterator *)genIt; - int orig_db = it->cur_to_orig_db[cur_dbid]; + int orig_db = (cur_dbid == -1) ? it->iter_db : it->cur_to_orig_db[cur_dbid]; if (orig_db == it->iter_db) { // We are currently iterating on the DB that's being flushed. it->kvs = NULL; @@ -577,7 +598,7 @@ static void fullScanIteratorFlushDb(genericIterator *genIt, int cur_dbid) { } static bool fullScanIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) { - struct fullScanIterator *it = (struct fullScanIterator *) genIt; + struct fullScanIterator *it = (struct fullScanIterator *)genIt; int orig_dbid = it->cur_to_orig_db[cur_dbid]; if (orig_dbid < it->iter_db) return true; // Entire DB has already been processed @@ -586,8 +607,8 @@ static bool fullScanIteratorHasPassedItem(genericIterator *genIt, const_sds key, if (it->kvs == NULL) return true; // just finished this DB - // We're in the middle of processing a DB. In cluster-mode, the DB is divided into 1 hashtable - // per slot. In cluster-mode-disabled, we treat all keys as in slot 0. + /* We're in the middle of processing a DB. In cluster-mode, the DB is divided into 1 hashtable + * per slot. In cluster-mode-disabled, we treat all keys as in slot 0. */ int keySlot = server.cluster_enabled ? getKeySlot((sds)key) : 0; if (keySlot < it->kvs_didx) return true; if (keySlot > it->kvs_didx) return false; @@ -615,7 +636,7 @@ static bool fullScanIteratorIsKeyInScope(genericIterator *genIt, const_sds key) return true; // All keys are in scope } -static genericIterator * fullScanIteratorCreate(void) { +static genericIterator *fullScanIteratorCreate(void) { struct fullScanIterator *it = zmalloc(sizeof(struct fullScanIterator)); it->orig_to_cur_db = zmalloc(sizeof(int) * server.dbnum); it->cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum); @@ -638,13 +659,11 @@ static genericIterator * fullScanIteratorCreate(void) { } - -//============================================================================================= -// Cluster Slot Iterator -//============================================================================================= -/* The cluster slot iterator performs iteration over one cluster slot of the Valkey keyset. The - * iterator is only used from within the Valkey main thread. - */ +/* ============================================================================================= + * Cluster Slot Iterator + * ============================================================================================= + * The cluster slot iterator performs iteration over one cluster slot of the Valkey keyset. The + * iterator is only used from within the Valkey main thread. */ struct clusterSlotIterator { genericIterator callbacks; // (must be first item) }; @@ -654,7 +673,7 @@ static void clusterSlotIteratorRelease(genericIterator *genIt) { serverAssert(false); // Not yet implemented } -static fifo * clusterSlotIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) { +static fifo *clusterSlotIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) { UNUSED(genIt); UNUSED(orig_dbid); UNUSED(cur_dbid); @@ -696,7 +715,7 @@ static bool clusterSlotIteratorIsKeyInScope(genericIterator *genIt, const_sds ke serverAssert(false); // Not yet implemented } -static genericIterator * clusterSlotIteratorCreate(const int *slots, size_t slots_count) { +static genericIterator *clusterSlotIteratorCreate(const int *slots, size_t slots_count) { struct clusterSlotIterator *it = zmalloc(sizeof(struct clusterSlotIterator)); it->callbacks.release = clusterSlotIteratorRelease; it->callbacks.getEntries = clusterSlotIteratorGetEntries; @@ -714,44 +733,43 @@ static genericIterator * clusterSlotIteratorCreate(const int *slots, size_t slot } +/* ============================================================================================= + * General iteration support (across all iterators) + * ============================================================================================= */ -//============================================================================================= -// General iteration support (across all iterators) -//============================================================================================= - -// While an item is potentially in use by a background thread, we can't have -// rehashing by the main thread. Returns true if rehashing was paused. +/* While an item is potentially in use by a background thread, we can't have rehashing by the main + * thread. Returns true if rehashing was paused. */ static bool pauseRehashing(dbEntry *de) { switch (de->encoding) { - case OBJ_ENCODING_HASHTABLE: { // SET or HASH - hashtable *ht = objectGetVal(de); - hashtablePauseRehashing(ht); - return true; - } - case OBJ_ENCODING_SKIPLIST: { // SORTED SET - zset *zs = objectGetVal(de); - hashtablePauseRehashing(zs->ht); - return true; - } - default: - return false; + case OBJ_ENCODING_HASHTABLE: { // SET or HASH + hashtable *ht = objectGetVal(de); + hashtablePauseRehashing(ht); + return true; + } + case OBJ_ENCODING_SKIPLIST: { // SORTED SET + zset *zs = objectGetVal(de); + hashtablePauseRehashing(zs->ht); + return true; + } + default: + return false; } } static void resumeRehashing(dbEntry *de) { switch (de->encoding) { - case OBJ_ENCODING_HASHTABLE: { // SET or HASH - hashtable *ht = objectGetVal(de); - hashtableResumeRehashing(ht); - break; - } - case OBJ_ENCODING_SKIPLIST: { // SORTED SET - zset *zs = objectGetVal(de); - hashtableResumeRehashing(zs->ht); - break; - } - default: - break; + case OBJ_ENCODING_HASHTABLE: { // SET or HASH + hashtable *ht = objectGetVal(de); + hashtableResumeRehashing(ht); + break; + } + case OBJ_ENCODING_SKIPLIST: { // SORTED SET + zset *zs = objectGetVal(de); + hashtableResumeRehashing(zs->ht); + break; + } + default: + break; } } @@ -798,21 +816,22 @@ static ssize_t computeStringDbEntrySize(dbEntry *de) { static dbEntry *tryCloneDbEntry(dbEntry *de) { - if (bgiteration_current_clone_memory_pool_size + bgiter_max_clone_item_bytes - > bgiter_max_clone_pool_bytes) { - return NULL; - } + if (bgiteration_current_clone_memory_pool_size + bgiter_max_clone_item_bytes > + bgiter_max_clone_pool_bytes) return NULL; - // Future optimization: Incorporate small ziplists, sorted sets, etc. - // OBJ_ENCODING_INT is omitted only because there isn't a good API for cloning it yet. + /* Future optimization: Incorporate small ziplists, sorted sets, etc. + * OBJ_ENCODING_INT is omitted only because there isn't a good API for cloning it yet. */ if (de->type == OBJ_STRING && de->encoding != OBJ_ENCODING_INT) { ssize_t itemSize = computeStringDbEntrySize(de); if (itemSize <= bgiter_max_clone_item_bytes) { bgiteration_current_clone_memory_pool_size += itemSize; - dbEntry *clone = createStringObjectWithKeyAndExpire((char *)objectGetVal(de), sdslen(objectGetVal(de)), objectGetKey(de), objectGetExpire(de)); - ((bgIterationEntryMetadata *)objectGetMetadata(clone))->iterator_epoch - = ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch; + dbEntry *clone = createStringObjectWithKeyAndExpire((char *)objectGetVal(de), + sdslen(objectGetVal(de)), + objectGetKey(de), + objectGetExpire(de)); + ((bgIterationEntryMetadata *)objectGetMetadata(clone))->iterator_epoch = + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch; return clone; } } @@ -820,17 +839,16 @@ static dbEntry *tryCloneDbEntry(dbEntry *de) { return NULL; } - static void freeClonedDictEntry(dbEntry *clonedEntry) { serverAssert(clonedEntry->type == OBJ_STRING); - // Add back to memory pool bgiteration_current_clone_memory_pool_size -= computeStringDbEntrySize(clonedEntry); decrRefCount(clonedEntry); } -static bgIteratorItem * makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) { + +static bgIteratorItem *makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) { if (!isCloned) incrementEntryInuse(de); bgIteratorItem *item = itemFreeList_getElementOrAllocate(); @@ -843,8 +861,8 @@ static bgIteratorItem * makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) { return item; } -static robj ** cloneRobjArray(int argc, robj **argv) { - robj **newarray = zmalloc(sizeof(robj*) * argc); +static robj **cloneRobjArray(int argc, robj **argv) { + robj **newarray = zmalloc(sizeof(robj *) * argc); for (int i = 0; i < argc; i++) { newarray[i] = argv[i]; incrRefCount(argv[i]); @@ -867,44 +885,43 @@ static void returnCurrentItemToValkey(bgIterator *it) { if (item == NULL) return; switch (item->type) { - case BGITERATOR_ITEM_DBENTRY: - it->dbentries_processed++; - if (item->u.dbe.is_cloned) it->dbentry_clones_processed++; - mutexQueueAdd(it->return_to_valkey, item); - break; - case BGITERATOR_ITEM_REPLICATION: - it->replication_processed++; - mutexQueueAdd(it->return_to_valkey, item); - break; - case BGITERATOR_ITEM_SWAPDB: - it->swapdb_processed++; - mutexQueueAdd(it->return_to_valkey, item); - break; - case BGITERATOR_ITEM_FLUSHDB: - it->flushdb_processed++; - mutexQueueAdd(it->return_to_valkey, item); - break; - - case BGITERATOR_ITEM_COMPLETE: - case BGITERATOR_ITEM_TERMINATED: - // These are static and just used to wake the iterator - they should never be returned. - serverAssert(false); - break; + case BGITERATOR_ITEM_DBENTRY: + it->dbentries_processed++; + if (item->u.dbe.is_cloned) it->dbentry_clones_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_REPLICATION: + it->replication_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_SWAPDB: + it->swapdb_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_FLUSHDB: + it->flushdb_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + + case BGITERATOR_ITEM_COMPLETE: + case BGITERATOR_ITEM_TERMINATED: + // These are static and just used to wake the iterator - they should never be returned. + serverAssert(false); + break; - default: - serverAssert(false); + default: + serverAssert(false); } - // Do this AFTER placing into return_to_valkey. This is volatile and snooped when there is a - // flushall event. Don't want an item to be missed. + /* Do this AFTER placing into return_to_valkey. This is volatile and snooped when there is a + * flushall event. Don't want an item to be missed. */ it->current_item = NULL; } - -//============================================================================================= -// Background Iterator (private) -//============================================================================================= +/* ============================================================================================= + * Background Iterator (private) + * ============================================================================================= */ static void bgIteratorRelease(bgIterator *it) { serverAssert(onValkeyMainThread()); @@ -924,7 +941,7 @@ static void bgIteratorRelease(bgIterator *it) { it->keyset_iter->release(it->keyset_iter); it->keyset_iter = NULL; - dictRelease(it->early_iterate_entries); + hashtableRelease(it->early_iterate_entries); it->early_iterate_entries = NULL; sdsfree(it->name); @@ -933,9 +950,9 @@ static void bgIteratorRelease(bgIterator *it) { static bool shouldFeedIteratorMore(bgIterator *it) { - return (!it->completed - && !it->terminated - && mutexQueueLength(it->items_for_iterator) < it->item_count_target); + return (!it->completed && + !it->terminated && + mutexQueueLength(it->items_for_iterator) < it->item_count_target); } @@ -975,10 +992,10 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { if (dbEntryFifo == NULL) { // Iteration of items is complete for this iterator - serverAssert(it->dbentries_queued >= it->dbentries_processed); - serverAssert(it->replication_queued >= it->replication_processed); - serverAssert(it->swapdb_queued >= it->swapdb_processed); - serverAssert(it->flushdb_queued >= it->flushdb_processed); + serverAssert(it->dbentries_queued >= it->dbentries_processed); + serverAssert(it->replication_queued >= it->replication_processed); + serverAssert(it->swapdb_queued >= it->swapdb_processed); + serverAssert(it->flushdb_queued >= it->flushdb_processed); serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); // Snapshot queue size to seed next iterator when terminated @@ -986,10 +1003,10 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { if (!it->client_is_active || (it->dbentries_queued > it->dbentries_processed)) { - // We are done feeding dict entries to the iterator, but before ending the - // replication processing make sure that the iterator has become active (has - // started reading) and make sure that all of the dict entries have been processed - // by the client. + /* We are done feeding dict entries to the iterator, but before ending the + * replication processing make sure that the iterator has become active (has + * started reading) and make sure that all of the dict entries have been + * processed by the client. */ break; } if (it->repldone) { @@ -998,7 +1015,7 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { } } bgIteratorItem *completionItem = itemFreeList_getElementOrAllocate(); - *completionItem = (bgIteratorItem){ .type = BGITERATOR_ITEM_COMPLETE }; + *completionItem = (bgIteratorItem){.type = BGITERATOR_ITEM_COMPLETE}; if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { rdbSaveInfo rsi; completionItem->dbid = (rdbPopulateSaveInfo(&rsi)) ? rsi.repl_stream_db : 0; @@ -1025,14 +1042,13 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { fifoPop(dbEntryFifo, (void **)&de); // Remove new/modified items during consistent iteration. - if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT - && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch > it->consistent_modification_id) { + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT && + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch > it->consistent_modification_id) { continue; } // Remove any items which have been processed early - if (dictFind(it->early_iterate_entries, de) != NULL) { - dictDelete(it->early_iterate_entries, de); + if (hashtableDelete(it->early_iterate_entries, de)) { if (BGITERATION_DEBUG) { sds entryString = createEntryString(dbid, de); debugBuffer = sdscatprintf(debugBuffer, "SKIPPING ITEM(early iterate): %s\n", entryString); @@ -1073,12 +1089,12 @@ static void feedIterator(bgIterator *it, monotime end_time_us) { static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_dbid) { - int rc = dictAdd(it->early_iterate_entries, earlyEntry, NULL); - serverAssert(rc == DICT_OK); - + bool wasAdded = hashtableAdd(it->early_iterate_entries, earlyEntry); + serverAssert(wasAdded); + int dbid = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) - ? it->keyset_iter->originalDb(it->keyset_iter, cur_dbid) - : cur_dbid; + ? it->keyset_iter->originalDb(it->keyset_iter, cur_dbid) + : cur_dbid; dbEntry *cloneEntry = tryCloneDbEntry(earlyEntry); bool isClonedEntry = (cloneEntry != NULL); @@ -1088,8 +1104,8 @@ static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_db if (isClonedEntry) it->dbentry_clones_queued++; if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { // JHB - can we optimize here in cluster mode (no swap) - // On consistent iteration, SWAPDB events are not provided. So there is no requirement to - // keep items in order or synchronized with SWAPDB. + /* On consistent iteration, SWAPDB events are not provided. So there is no requirement to + * keep items in order or synchronized with SWAPDB. */ if (BGITERATION_DEBUG) { sds entryString = createEntryString(dbid, item->u.dbe.de); debugBuffer = sdscatprintf(debugBuffer, "EARLY_1: %s\n", entryString); @@ -1109,12 +1125,10 @@ static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_db // This expedites a single key and doesn't attempt to avoid expediting through optimization. -static bool expediteSingleKeyWithoutOptimization( - bgIterator *it, - int dbid, - robj *oKey, - hashtable *waitingOnKeys) { - +static bool expediteSingleKeyWithoutOptimization(bgIterator *it, + int dbid, + robj *oKey, + hashtable *waitingOnKeys) { bool mustBlock = false; bool iterComplete = it->completed || it->terminated; @@ -1122,11 +1136,11 @@ static bool expediteSingleKeyWithoutOptimization( sds key = objectGetVal(oKey); dbEntry *de = dbFind(server.db[dbid], key); if (de != NULL) { - if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) - && (dictFind(it->early_iterate_entries, de) == NULL)) { + if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) && + !hashtableFind(it->early_iterate_entries, de, NULL)) { if (addEarlyIterationKey(it, de, dbid)) { mustBlock = true; - hashtableAdd(waitingOnKeys, oKey); + hashtableAdd(waitingOnKeys, oKey); } } else { if (isEntryInuseByAnyIterator(de)) { @@ -1142,12 +1156,11 @@ static bool expediteSingleKeyWithoutOptimization( // MOVE/COPY are unfortunate special commands. They work on 2 DBs at once. const int MOVE_COMMAND_DBID_ARG_INDEX = 2; -static bool expediteKeysForMove( - bgIterator *it, - int dbid, - int argc, - robj **argv, - hashtable *waitingOnKeys) { +static bool expediteKeysForMove(bgIterator *it, + int dbid, + int argc, + robj **argv, + hashtable *waitingOnKeys) { if (argc <= MOVE_COMMAND_DBID_ARG_INDEX) return false; int destDbid; @@ -1156,10 +1169,10 @@ static bool expediteKeysForMove( bool mustBlock = false; robj *key = argv[1]; - // Not looking for special cases to optimize here. Just try to expedite both src and dest - // keys. Note that the dest key might exist (and need iteration) but could be expired and - // could be overwritten by MOVE. In this case, a DEL would replicate due to the expiry. So - // even if the target is expired, we need to replicate it before executing the command. + /* Not looking for special cases to optimize here. Just try to expedite both src and dest + * keys. Note that the dest key might exist (and need iteration) but could be expired and + * could be overwritten by MOVE. In this case, a DEL would replicate due to the expiry. So + * even if the target is expired, we need to replicate it before executing the command. */ if (expediteSingleKeyWithoutOptimization(it, dbid, key, waitingOnKeys)) mustBlock = true; if (expediteSingleKeyWithoutOptimization(it, destDbid, key, waitingOnKeys)) mustBlock = true; @@ -1169,13 +1182,11 @@ static bool expediteKeysForMove( // MOVE/COPY are unfortunate special commands. They work on 2 DBs at once. -static bool expediteKeysForCopy( - bgIterator *it, - int dbid, - int argc, - robj **argv, - hashtable *waitingOnKeys) { - +static bool expediteKeysForCopy(bgIterator *it, + int dbid, + int argc, + robj **argv, + hashtable *waitingOnKeys) { int destDbid; if (!getTargetDbIdForCopyCommand(argc, argv, dbid, &destDbid)) return false; @@ -1183,8 +1194,8 @@ static bool expediteKeysForCopy( robj *srcKey = argv[1]; robj *destKey = argv[2]; - // Not trying to optimize COPY. Just expedite source and destination (if it exists). We - // don't really care if the value is overwritten or not (so no need to parse REPLACE option). + /* Not trying to optimize COPY. Just expedite source and destination (if it exists). We + * don't really care if the value is overwritten or not (so no need to parse REPLACE option). */ if (expediteSingleKeyWithoutOptimization(it, dbid, srcKey, waitingOnKeys)) mustBlock = true; if (expediteSingleKeyWithoutOptimization(it, destDbid, destKey, waitingOnKeys)) mustBlock = true; @@ -1223,40 +1234,38 @@ static bool expediteKeysForCopy( * Iterator: CONSISTENT = YES, REPLICATION = YES * (Combination only valid in cluster mode - no SWAPDB possible) * - Block if any write-key is in use by an the iterator - * - Block and immediately queue any key (read or write) that has not already been iterated - */ -static bool expediteKeysForWrite( - bgIterator *it, - int dbid, - struct serverCommand *cmd, - int argc, - robj **argv, - keyReference *keyrefs, - int numKeys, - hashtable *waitingOnKeys) { + * - Block and immediately queue any key (read or write) that has not already been iterated */ +static bool expediteKeysForWrite(bgIterator *it, + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv, + keyReference *keyrefs, + int numKeys, + hashtable *waitingOnKeys) { serverAssert(numKeys > 0); bool mustBlock = false; - // All keys of the command should either be in scope or not since in cluster mode enabled they - // should all be in the same slot. So we just check the first key. + /* All keys of the command should either be in scope or not since in cluster mode enabled they + * should all be in the same slot. So we just check the first key. */ robj *oKey = argv[keyrefs[0].pos]; sds key = objectGetVal(oKey); - // If it's not in the iteration scope for the current iterator, then we don't need to do - // anything with this command. + /* If it's not in the iteration scope for the current iterator, then we don't need to do + * anything with this command. */ if (!it->keyset_iter->isKeyInScope(it->keyset_iter, key)) return false; - // Note: performance optimization for commands which only modify the first key. If this flag - // is not available, we can safely remove this `if` statement. - if ((cmd->flags & CMD_WRITE_FIRSTKEY_ONLY) - && !(it->iteration_flags & BGITERATOR_FLAG_REPLICATION)) { - // If this write command only modifies the 1st key, we don't need to expedite others - // unless replication enabled. + /* Note: performance optimization for commands which only modify the first key. If this flag + * is not available, we can safely remove this `if` statement. */ + if ((cmd->flags & CMD_WRITE_FIRSTKEY_ONLY) && + !(it->iteration_flags & BGITERATOR_FLAG_REPLICATION)) { + /* If this write command only modifies the 1st key, we don't need to expedite others + * unless replication enabled. */ numKeys = 1; } if (cmd->proc == moveCommand) { - // Unfortunate special case for MOVE + // Special case for MOVE return expediteKeysForMove(it, dbid, argc, argv, waitingOnKeys); } @@ -1274,12 +1283,12 @@ static bool expediteKeysForWrite( sds key = objectGetVal(oKey); dbEntry *de = dbFind(server.db[dbid], key); if (de == NULL) continue; // New key, no need to expedite - if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) - && dictFind(it->early_iterate_entries, de) == NULL - && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { + if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) && + !hashtableFind(it->early_iterate_entries, de, NULL) && + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { if (addEarlyIterationKey(it, de, dbid)) { mustBlock = true; - hashtableAdd(waitingOnKeys, oKey); + hashtableAdd(waitingOnKeys, oKey); } } else { if (isEntryInuseByAnyIterator(de)) { @@ -1290,15 +1299,15 @@ static bool expediteKeysForWrite( } it->cur_cmd_may_replicate = true; // Will replicate only if replication enabled } else { - // Identification of missing keys is only needed for non-consistent iteration. This only - // needs to be collected once (on the 1st non-consistent iteration) + /* Identification of missing keys is only needed for non-consistent iteration. This only + * needs to be collected once (on the 1st non-consistent iteration). */ bool collectMissing = (listLength(curCmdMissingKeys) == 0); if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { // CONSISTENT = NO, REPLICATION = YES bool someIterated = false; - // dict containing the keys that have not been iterated yet. - // Using a dict dedupes the keys in case the command contains duplicated keys. + /* dict containing the keys that have not been iterated yet. + * Using a dict dedupes the keys in case the command contains duplicated keys. */ dict *notIteratedKeys = dictCreate(&dictEntryPtrDictType); // dict of dbEntry* -> robj* for (int i = 0; i < numKeys; i++) { @@ -1312,9 +1321,9 @@ static bool expediteKeysForWrite( } continue; } - if (iterComplete - || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) - || (dictFind(it->early_iterate_entries, de) != NULL)) { + if (iterComplete || + it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) || + hashtableFind(it->early_iterate_entries, de, NULL)) { someIterated = true; } else { dictAdd(notIteratedKeys, de, oKey); @@ -1325,26 +1334,26 @@ static bool expediteKeysForWrite( } } - // Since missing keys are considered as already iterated, if there are any missing keys - // we must consider that some keys have been iterated, and make sure all other keys - // will be expedited if needed. + /* Since missing keys are considered as already iterated, if there are any missing keys + * we must consider that some keys have been iterated, and make sure all other keys + * will be expedited if needed. */ if (listLength(curCmdMissingKeys) > 0) someIterated = true; - // This command may be executing as part of a larger transaction. If some parts of the - // transaction have already been identified to replicate, we must wait on all keys and - // replicate here as well. (Take care not to set cur_cmd_may_replicate to false.) + /* This command may be executing as part of a larger transaction. If some parts of the + * transaction have already been identified to replicate, we must wait on all keys and + * replicate here as well. (Take care not to set cur_cmd_may_replicate to false.) */ if (someIterated) { if (server.in_exec) { - // We are now executing the commands in a multi-exec block. - // - // Regarding MULTI/EXEC: Remember that this code is executed twice for commands - // within a MULTI/EXEC block. First, we parse all the commands when deciding - // if the EXEC should be blocked. Then, as each command is executed, it's - // re-parsed so that we can maintain the early iterated list as the commands - // execute. In this second pass, as each command is executed, we can't change - // the replication decision which was made earlier (when the EXEC was processed). - // We don't want to get tricked (by a key being removed and recreated) into - // into starting to replicate in the middle of a MULTI/EXEC block. + /* We are now executing the commands in a multi-exec block. + * + * Regarding MULTI/EXEC: Remember that this code is executed twice for commands + * within a MULTI/EXEC block. First, we parse all the commands when deciding + * if the EXEC should be blocked. Then, as each command is executed, it's + * re-parsed so that we can maintain the early iterated list as the commands + * execute. In this second pass, as each command is executed, we can't change + * the replication decision which was made earlier (when the EXEC was processed). + * We don't want to get tricked (by a key being removed and recreated) into + * starting to replicate in the middle of a MULTI/EXEC block. */ } else { it->cur_cmd_may_replicate = true; } @@ -1358,7 +1367,7 @@ static bool expediteKeysForWrite( if (addEarlyIterationKey(it, notIteratedEntry, dbid)) { mustBlock = true; - hashtableAdd(waitingOnKeys, oKey); + hashtableAdd(waitingOnKeys, oKey); } } dictReleaseIterator(di); @@ -1389,8 +1398,8 @@ static bool expediteKeysForWrite( } -// Called when an iterator is terminated. Pulls everything out of the queue -// and returns the items to Valkey (before they hit the iterator). +/* Called when an iterator is terminated. Pulls everything out of the queue + * and returns the items to Valkey (before they hit the iterator). */ static void returnAllItemsToValkey(bgIterator *it) { serverAssert(onValkeyMainThread()); @@ -1403,35 +1412,35 @@ static void returnAllItemsToValkey(bgIterator *it) { bgIteratorItem *item; fifoPop(poppedFifo, (void **)&item); switch (item->type) { - // back out the "queued" statistic - case BGITERATOR_ITEM_DBENTRY: - it->dbentries_queued--; - if (item->u.dbe.is_cloned) it->dbentry_clones_queued--; - break; - case BGITERATOR_ITEM_REPLICATION: - it->replication_queued--; - break; - case BGITERATOR_ITEM_SWAPDB: - it->swapdb_queued--; - break; - case BGITERATOR_ITEM_FLUSHDB: - it->flushdb_queued--; - break; - - case BGITERATOR_ITEM_COMPLETE: - // This can only happen if the completion item has been enqueued and - // the iterator is terminated before reaching the completion item. - itemFreeList_returnItemBackToFreeList(item); - continue; // Skip pushing this onto itemsToReturn - - case BGITERATOR_ITEM_TERMINATED: - // This can only happen if there is a race when terminating between - // the iteration client and main thread. - itemFreeList_returnItemBackToFreeList(item); - continue; // Skip pushing this onto itemsToReturn - - default: - serverAssert(false); + // back out the "queued" statistic + case BGITERATOR_ITEM_DBENTRY: + it->dbentries_queued--; + if (item->u.dbe.is_cloned) it->dbentry_clones_queued--; + break; + case BGITERATOR_ITEM_REPLICATION: + it->replication_queued--; + break; + case BGITERATOR_ITEM_SWAPDB: + it->swapdb_queued--; + break; + case BGITERATOR_ITEM_FLUSHDB: + it->flushdb_queued--; + break; + + case BGITERATOR_ITEM_COMPLETE: + /* This can only happen if the completion item has been enqueued and + * the iterator is terminated before reaching the completion item. */ + itemFreeList_returnItemBackToFreeList(item); + continue; // Skip pushing this onto itemsToReturn + + case BGITERATOR_ITEM_TERMINATED: + /* This can only happen if there is a race when terminating between + * the iteration client and main thread. */ + itemFreeList_returnItemBackToFreeList(item); + continue; // Skip pushing this onto itemsToReturn + + default: + serverAssert(false); } fifoPush(itemsToReturn, item); @@ -1446,10 +1455,9 @@ static void returnAllItemsToValkey(bgIterator *it) { } - -//============================================================================================= -// Foreground support functions (private) -//============================================================================================= +/* ============================================================================================= + * Foreground support functions (private) + * ============================================================================================= */ static size_t replicationItemSize(bgIteratorItem *item) { serverAssert(item->type == BGITERATOR_ITEM_REPLICATION); @@ -1463,94 +1471,89 @@ static size_t replicationItemSize(bgIteratorItem *item) { static void processReturnOfItemToValkey(bgIteratorItem *item, bgIterator *iter) { serverAssert(onValkeyMainThread()); switch ((int)item->type) { - case BGITERATOR_ITEM_REPLICATION: - bufferedReplicationBytes -= replicationItemSize(item); - freeRobjArray(item->u.repl.argc, item->u.repl.argv); - break; - - case BGITERATOR_ITEM_DBENTRY: - { - if (item->u.dbe.is_cloned) { - freeClonedDictEntry(item->u.dbe.de); - } else { - if (isEntryInuseBySingleIterator(item->u.dbe.de)) { - // This blocking mechanism isn't the best. Written for slot-migration, - // it assumes a single DB so if the same key appears in multiple DBs, - // commands might get unblocked only to get blocked again. (This would - // happen only rarely, and with minimal impact.) - robj *key = createStringObjectFromSds(objectGetKey(item->u.dbe.de)); - unblockClientsInUseOnKey(key); - decrRefCount(key); - } - // resumeRehashing must be called before decrementEntryInuse, since decrementEntryInuse can free - if (item->u.dbe.is_rehashing_paused) resumeRehashing(item->u.dbe.de); - decrementEntryInuse(item->u.dbe.de); - } + case BGITERATOR_ITEM_REPLICATION: + bufferedReplicationBytes -= replicationItemSize(item); + freeRobjArray(item->u.repl.argc, item->u.repl.argv); + break; + + case BGITERATOR_ITEM_DBENTRY: + if (item->u.dbe.is_cloned) { + freeClonedDictEntry(item->u.dbe.de); + } else { + if (isEntryInuseBySingleIterator(item->u.dbe.de)) { + /* This blocking mechanism assumes a single DB so if the same key appears in + * multiple DBs, commands might get unblocked only to get blocked again. (This + * would happen only rarely, and with minimal impact.) */ + robj *key = createStringObjectFromSds(objectGetKey(item->u.dbe.de)); + unblockClientsInUseOnKey(key); + decrRefCount(key); } - break; - - case BGITERATOR_ITEM_SWAPDB: - case BGITERATOR_ITEM_FLUSHDB: - break; - - case BGITERATOR_ITEMEXT_ITER_CLOSED: - { - bgIterator *it = ((bgIteratorItemExtClose*)item)->iter; - serverAssert(it == iter); - if (it->terminated) { - // Abnormal termination - // Normally the item is TERMINATED, but might be COMPLETE in race - serverAssert(it->current_item->type == BGITERATOR_ITEM_TERMINATED - || it->current_item->type == BGITERATOR_ITEM_COMPLETE); - // Release any items stranded on the iterator after early termination - returnAllItemsToValkey(it); - receiveItemsBackFromOneIterator(it); - } else { - // Normal completion - serverAssert(it->current_item->type == BGITERATOR_ITEM_COMPLETE); - } - serverAssert(mutexQueueLength(it->items_for_iterator) == 0); - serverAssert(it->dbentries_queued == it->dbentries_processed); - serverAssert(it->replication_queued == it->replication_processed); - serverAssert(it->swapdb_queued == it->swapdb_processed); - serverAssert(it->flushdb_queued == it->flushdb_processed); - serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); + // resumeRehashing must be called before decrementEntryInuse, since decrementEntryInuse can free + if (item->u.dbe.is_rehashing_paused) resumeRehashing(item->u.dbe.de); + decrementEntryInuse(item->u.dbe.de); + } + break; + + case BGITERATOR_ITEM_SWAPDB: + case BGITERATOR_ITEM_FLUSHDB: + break; + + case BGITERATOR_ITEMEXT_ITER_CLOSED: { + bgIterator *it = ((bgIteratorItemExtClose *)item)->iter; + serverAssert(it == iter); + if (it->terminated) { + /* Abnormal termination + * Normally the item is TERMINATED, but might be COMPLETE in race */ + serverAssert(it->current_item->type == BGITERATOR_ITEM_TERMINATED || + it->current_item->type == BGITERATOR_ITEM_COMPLETE); + // Release any items stranded on the iterator after early termination + returnAllItemsToValkey(it); + receiveItemsBackFromOneIterator(it); + } else { + // Normal completion + serverAssert(it->current_item->type == BGITERATOR_ITEM_COMPLETE); + } + serverAssert(mutexQueueLength(it->items_for_iterator) == 0); + serverAssert(it->dbentries_queued == it->dbentries_processed); + serverAssert(it->replication_queued == it->replication_processed); + serverAssert(it->swapdb_queued == it->swapdb_processed); + serverAssert(it->flushdb_queued == it->flushdb_processed); + serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); - listEmpty(curCmdMissingKeys); // Just in case any remain + listEmpty(curCmdMissingKeys); // Just in case any remain - itemFreeList_returnItemBackToFreeList(it->current_item); - it->current_item = NULL; + itemFreeList_returnItemBackToFreeList(it->current_item); + it->current_item = NULL; - bool terminated = it->terminated; - void *privdata = it->privdata; - bgIteratorCleanupFunc cleanup = it->cleanup; - bgIteratorRelease(it); // Fully release the iterator before calling cleanup + bool terminated = it->terminated; + void *privdata = it->privdata; + bgIteratorCleanupFunc cleanup = it->cleanup; + bgIteratorRelease(it); // Fully release the iterator before calling cleanup - if (BGITERATION_DEBUG) { - if (cleanup) debugBuffer = sdscatprintf(debugBuffer, "CLEANUP FN (%s)\n", - (terminated) ? "terminated" : "success"); + if (BGITERATION_DEBUG) { + if (cleanup) debugBuffer = sdscatprintf(debugBuffer, "CLEANUP FN (%s)\n", + (terminated) ? "terminated" : "success"); - sds filename = sdscatprintf(sdsempty(), "bgiteration_debug.%d", getpid()); - FILE *f = fopen(filename, "w"); - sdsfree(filename); + sds filename = sdscatprintf(sdsempty(), "bgiteration_debug.%d", getpid()); + FILE *f = fopen(filename, "w"); + sdsfree(filename); - fputs(debugBuffer, f); + fputs(debugBuffer, f); - fclose(f); - sdsfree(debugBuffer); - debugBuffer = sdsempty(); - } + fclose(f); + sdsfree(debugBuffer); + debugBuffer = sdsempty(); + } - if (cleanup) cleanup(terminated, privdata); - } - break; + if (cleanup) cleanup(terminated, privdata); + } break; - default: - serverAssert(false); // Not expecting any other type of item! + default: + serverAssert(false); // Not expecting any other type of item! } // We don't allocate extension items from the pool so we manually free them - if((int)item->type == BGITERATOR_ITEMEXT_ITER_CLOSED) { + if ((int)item->type == BGITERATOR_ITEMEXT_ITER_CLOSED) { zfree(item); } else { itemFreeList_returnItemBackToFreeList(item); @@ -1558,29 +1561,24 @@ static void processReturnOfItemToValkey(bgIteratorItem *item, bgIterator *iter) } static void prepareAndProcessReturnedItems(int n, bgIteratorItem **items, bgIterator *iter) { - int i = 0; - for (i = 0; i < n; i++) valkey_prefetch(items[i]); - for (i = 0; i < n; i++) { + for (int i = 0; i < n; i++) valkey_prefetch(items[i]); + for (int i = 0; i < n; i++) { if (items[i]->type != BGITERATOR_ITEM_DBENTRY) continue; - // Prefetch can have a significant perf hit on NULL - // but we never expect items[i]->u.dbe.de to be NULL valkey_prefetch(items[i]->u.dbe.de); } - for (i = 0; i < n; i++) { + for (int i = 0; i < n; i++) { if (items[i]->type != BGITERATOR_ITEM_DBENTRY) continue; - // Same as above, assume key is never NULL valkey_prefetch(objectGetKey(items[i]->u.dbe.de)); } - for (i = 0; i < n; i++) processReturnOfItemToValkey(items[i], iter); + for (int i = 0; i < n; i++) processReturnOfItemToValkey(items[i], iter); } #define PREFETCH_BATCH_SIZE 16 +// Returns true if we process at least one item from a given iterator's return_to_valkey queue. static bool receiveItemsBackFromOneIterator(bgIterator *it) { - bgIteratorItem* batchPool[PREFETCH_BATCH_SIZE]; + bgIteratorItem *batchPool[PREFETCH_BATCH_SIZE]; int n = 0; - // Returns true if we process at least one item from - // a given iterator's return_to_valkey queue, false otherwise. fifo *poppedFifo = mutexQueuePopAll(it->return_to_valkey, false); if (poppedFifo != NULL) { while (fifoLength(poppedFifo) > 0) { @@ -1599,10 +1597,9 @@ static bool receiveItemsBackFromOneIterator(bgIterator *it) { return false; } +/* Process each iterator's return_to_valkey queue + * If `blocking` is true, continue reading until at least one queue was not empty. */ static void receiveItemsBackFromIterators(bool blocking) { - // Process each iterator's return_to_valkey queue - // If `blocking` is true, continue reading until - // at least one queue was not empty. serverAssert(onValkeyMainThread()); listIter li; listNode *node; @@ -1613,15 +1610,14 @@ static void receiveItemsBackFromIterators(bool blocking) { bgIterator *it = listNodeValue(node); processedItems |= receiveItemsBackFromOneIterator(it); } - if (blocking) usleep(100); // Sleep for 1ms and re-try processing iterators + if (blocking && !processedItems) usleep(100); // Short sleep before retry } while (blocking && !processedItems); } -static long long bgIteration_feedIterators_task( - struct aeEventLoop *eventLoop, - long long id, - void *clientData) { +static long long bgIteration_feedIterators_task(struct aeEventLoop *eventLoop, + long long id, + void *clientData) { UNUSED(eventLoop); UNUSED(id); UNUSED(clientData); @@ -1649,12 +1645,12 @@ static long long bgIteration_feedIterators_task( long dutyTimeUs = BGITER_CYCLE_BUDGET_MS * 1000; if (lastFeedEndTime > 0) { - // If the timer was delayed, compute the proportional time we should have had, and increase - // the duty cycle to compensate (up to a limit). + /* If the timer was delayed, compute the proportional time we should have had, and increase + * the duty cycle to compensate (up to a limit). */ long starvationUs = (startTime - lastFeedEndTime) - BGITER_CYCLE_DELAY_MS * 1000; if (starvationUs > 0) { - long starvationCompensationUs = starvationUs * BGITER_CYCLE_BUDGET_MS - / (BGITER_CYCLE_BUDGET_MS + BGITER_CYCLE_DELAY_MS); + long starvationCompensationUs = starvationUs * BGITER_CYCLE_BUDGET_MS / + (BGITER_CYCLE_BUDGET_MS + BGITER_CYCLE_DELAY_MS); dutyTimeUs += starvationCompensationUs; dutyTimeUs = MIN(dutyTimeUs, BGITER_CYCLE_BUDGET_MAX_MS * 1000); } @@ -1682,8 +1678,8 @@ static long long bgIteration_feedIterators_task( // Not static, but not API. Intended for unit tests where the event loop may not be active. void bgIteration_feedIterators(void) { - // For unit testing, force the item_count_target to 1 in each call. This ensures that we only - // feed a minimal amount to the iterators rather than a non-deterministic amount. + /* For unit testing, force the item_count_target to 1 in each call. This ensures that we only + * feed a minimal amount to the iterators rather than a non-deterministic amount. */ listIter li; listNode *node; listRewind(allIterators, &li); @@ -1698,64 +1694,75 @@ void bgIteration_feedIterators(void) { static void resetReplicationFlagForIterators(client *c) { - // For any given command, the command may or may not need to be replicated based on the status - // and flags of each iterator. Furthermore, if a command does need to be replicated, this - // replication must occur for an entire atomic unit; we can't replicate only part of a script - // or multi/exec. - // This function is the only place where the replication flag is cleared. + /* For any given command, the command may or may not need to be replicated based on the status + * and flags of each iterator. Furthermore, if a command does need to be replicated, this + * replication must occur for an entire atomic unit; we can't replicate only part of a script + * or multi/exec. + * This function is the only place where the replication flag is cleared. */ if (c->flag.multi || c->flag.script) { - // REGARDING MULTI/EXEC - // -------------------- - // When processing a MULTI/EXEC, blockClientIfRequired is called first for the MULTI. Then, - // all of the commands are queued up in server.c:processCommand(). It's only when EXEC is - // encountered, that server.c:call() is fired to begin execution. - // AFTER the EXEC is processed by call(), then each of the commands in the MULTI/EXEC block - // will be processed through call(). - // If write commands are present, MULTI & EXEC will be passed to the replication stream - // before/after the transaction commands. Note that MULTI & EXEC are not actually - // "executed" at the time when their replication is passed to the replication stream. - // - // Example: MULTI; SET A B; EXEC - // 1. blockClientIfRequired() called for MULTI. MULTI flag IS NOT set. (Won't block.) - // 2. blockClientIfRequired() called for EXEC. MULTI flag IS set. (Might block.) - // 3. blockClientIfRequired() called for SET. MULTI flag IS set. (Won't block.) - // 4. handleCommandReplication() is called for MULTI. - // 5. handleCommandReplication() is called for SET. - // 6. handleCommandReplication() is called for EXEC. - // - // SO - if the MULTI flag is set, we DON'T clear the flag. It should only be cleared at the - // start of the transaction, when MULTI is received - and the flag isn't set yet. - - // REGARDING SCRIPTS - // ----------------- - // When processing a script, blockClientIfRequired is called first for the EVAL/EVALSHA/FCALL. - // Then, all of the commands are processed using a special script client. The script - // client has the CLIENT_SCRIPT flag set. For scripts, the replication flag is set when - // processing the EVAL/EVALSHA/FCALL and should not be cleared when executing individual - // commands in the script. - - // If it's the EXEC command, we fall through and clear the flag below. But for all other - // commands within the transaction, we don't clear the flag. + /* REGARDING MULTI/EXEC + * -------------------- + * When processing a MULTI/EXEC, blockClientIfRequired is called first for the MULTI. Then, + * all of the commands are queued up in server.c:processCommand(). It's only when EXEC is + * encountered, that server.c:call() is fired to begin execution. + * + * AFTER the EXEC is processed by call(), then each of the commands in the MULTI/EXEC block + * will be processed through call(). + * + * If write commands are present, MULTI & EXEC will be passed to the replication stream + * before/after the transaction commands. Note that MULTI & EXEC are not actually + * "executed" at the time when their replication is passed to the replication stream. + * + * Example: MULTI; SET A B; EXEC + * 1. blockClientIfRequired() called for MULTI. MULTI flag IS NOT set. (Won't block.) + * 2. blockClientIfRequired() called for EXEC. MULTI flag IS set. (Might block.) + * 3. blockClientIfRequired() called for SET. MULTI flag IS set. (Won't block.) + * 4. handleCommandReplication() is called for MULTI. + * 5. handleCommandReplication() is called for SET. + * 6. handleCommandReplication() is called for EXEC. + * + * SO - if the MULTI flag is set, we DON'T clear the flag. It should only be cleared at the + * start of the transaction, when MULTI is received - and the flag isn't set yet. */ + + /* REGARDING SCRIPTS + * ----------------- + * When processing a script, blockClientIfRequired is called first for the EVAL/EVALSHA/FCALL. + * Then, all of the commands are processed using a special script client. The script + * client has the CLIENT_SCRIPT flag set. For scripts, the replication flag is set when + * processing the EVAL/EVALSHA/FCALL and should not be cleared when executing individual + * commands in the script. */ + + /* If it's the EXEC command, we fall through and clear the flag below. But for all other + * commands within the transaction, we don't clear the flag. */ if (c->cmd->proc != execCommand) return; } - // For most commands, the replication flag is cleared and we determine if replication is needed - // based on the keys being used and their state in each iterator. If a modified key hasn't been - // processed yet, there's no need to expedite the key or send the replication. The key will be - // sent later, when reached by the iterator. - // However, for scripts, it is not possible to perform this optimization. There is no way to - // know if an undeclared key might be modified. Since the entire script needs to be replicated - // (or not replicated) atomically, we can't take the chance that an undeclared key might be - // hit which requires replication. + /* For most commands, the replication flag is cleared and we determine if replication is needed + * based on the keys being used and their state in each iterator. If a modified key hasn't been + * processed yet, there's no need to expedite the key or send the replication. The key will be + * sent later, when reached by the iterator. + * + * However, for scripts, it is not possible to perform this optimization. There is no way to + * know if an undeclared key might be modified. Since the entire script needs to be replicated + * (or not replicated) atomically, we can't take the chance that an undeclared key might be + * hit which requires replication. */ bool isScript = isScriptCallWriteCmd(c->cmd); - getKeysResult result; - initGetKeysResult(&result); - getKeysFromCommand(c->cmd, c->argv, c->argc, &result); - - // [sm-bgiterator] TODO: ELMO-108525, This assumes all keys are in the same slot, should consider cross-slot script case. - sds check_key = (result.numkeys > 0) ? objectGetVal(c->argv[result.keys[0].pos]) : NULL; + sds firstScriptKey = NULL; + if (isScript) { + /* If it's a script, we will normally replicate. But if the keys are out of scope for the + * iteration, we shouldn't. The use-case for this is with slot iteration, when the script + * is acting on keys from a different slot. Here, we just check the first declared key, and + * if it's out of scope for the iteration, we won't replicate it. This might cause issues + * for cross-slot scripts (anti-pattern), but the alternative is replicating all scripts, + * regardless of slot. */ + getKeysResult result; + initGetKeysResult(&result); + getKeysFromCommand(c->cmd, c->argv, c->argc, &result); + if (result.numkeys > 0) firstScriptKey = objectGetVal(c->argv[result.keys[0].pos]); + getKeysFreeResult(&result); + } listIter li; listNode *node; @@ -1765,14 +1772,16 @@ static void resetReplicationFlagForIterators(client *c) { if (it->completed || it->terminated) { it->cur_cmd_may_replicate = false; } else { - // Set initial state of the replication flag for this transaction - // For full scan iterators, write commands within scripts must always be replicated. - // For cluster slot iterators, replication of script write commands depends on whether - // the key is in scope of the current iterator. - it->cur_cmd_may_replicate = isScript && it->keyset_iter->isKeyInScope(it->keyset_iter, check_key); + /* For normal commands, the flag is initialized to false (not to replicate). For these + * commands, we decide later based on the actual commands. + * + * However, for scripts, we don't know what commands will be executed. So IF it's a + * script, and the keys are in scope (on the right slot) we initialize the replication + * flag to true. */ + it->cur_cmd_may_replicate = isScript && firstScriptKey && + it->keyset_iter->isKeyInScope(it->keyset_iter, firstScriptKey); } } - getKeysFreeResult(&result); } @@ -1809,93 +1818,14 @@ static void handleSwapdb(int db1, int db2) { static void removePtrFromEarlyIterate(dbEntry *de) { - // If the item is being released, let's get the pointer out of our early_iterate_entries. - // Note that this is not strictly necessary, but it frees some memory and keeps the - // dictionary small. + /* If the item is being released, let's get the pointer out of our early_iterate_entries. + * This is not strictly necessary, but it frees some memory and keeps the dictionary small. */ listIter li; listNode *node; listRewind(allIterators, &li); while ((node = listNext(&li)) != NULL) { bgIterator *it = listNodeValue(node); - dictDelete(it->early_iterate_entries, de); // just try delete (might not be here) - } -} - - -static int findDbForEntry(dbEntry *de) { - for (int i = 0; i < server.dbnum; i++) { - if (server.db[i] && dbFind(server.db[i], objectGetKey(de)) == de) return i; - } - serverAssert(false); // the entry MUST be in one of the DBs -} - - -static void terminateIteratorForFlush(bgIterator *it, int dbid) { - if (!it->terminated) bgIteratorTerminate(it); - - // Snoop on the iterator. There might be 1 item still being processed. If that item is in the - // DB being flushed, the item is removed from the dict and held for deferred deletion. This - // allows the iterator to complete processing on the current item without the item being - // deleted unexpectedly. - // Since this is running in parallel with a background thread, the results are volatile. This - // is OK as when the iterator completes processing the item, it still won't have been accepted - // back to Valkey yet, meaning the item will still be in inUseEntries. - bgIteratorItem *item = it->current_item; - if (item && item->type == BGITERATOR_ITEM_DBENTRY) { - dbEntry *de = item->u.dbe.de; - int deDb = findDbForEntry(de); - if (dbid == -1 || dbid == deDb) { - removePtrFromEarlyIterate(de); - } - } -} - - -static void preserveIteratorItemsForFlush(bgIterator *it, int dbid) { - serverAssert(onValkeyMainThread()); - serverAssert(!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)); - serverAssert(dbid >= 0); - // Since this is not a consistent iteration, it's OK if the early_iterate_entries contains - // pointers to items being deleted. The item is not actually accessed from the pointer. And - // if the pointer gets reused for a new item, there's no guarantee that we would iterate it - // anyway. If replication is enabled, both new items and early_iterate_entries are treated the - // same (replication is processed). So this is safe in all cases. - // Given this, we will just worry about preserving items in the iterator's processing queue. - // Because of commands like SWAPDB and MOVE, there's no attempt to remove unnecessary items - // from the queue. This is also safer to future Valkey extensions. - - // Temporarily yank all items from the iterator's queue - fifo *poppedFifo = mutexQueuePopAll(it->items_for_iterator, false); - if (poppedFifo != NULL) { - fifo *readdFifo = fifoCreate(); - while(fifoLength(poppedFifo) > 0) { - bgIteratorItem *item; - fifoPop(poppedFifo, (void **)&item); - if (item->type == BGITERATOR_ITEM_DBENTRY) { - dbEntry *de = item->u.dbe.de; - if (dbFind(server.db[dbid], objectGetKey(de)) == de) { - // Found the entry in the DB about to be flushed - removePtrFromEarlyIterate(de); - } - } - fifoPush(readdFifo, item); - } - fifoRelease(poppedFifo); - - // Now give the list back to the iterator - mutexQueueAddMultiple(it->items_for_iterator, readdFifo); - fifoRelease(readdFifo); - } - - // And snoop on the active item. Even if the background task finishes with this item as we look - // at it, the item can't have been returned to Valkey yet. - bgIteratorItem *item = it->current_item; - if (item && item->type == BGITERATOR_ITEM_DBENTRY) { - dbEntry *de = item->u.dbe.de; - if (dbFind(server.db[dbid], objectGetKey(de)) == de) { - // Found the entry in the DB about to be flushed - removePtrFromEarlyIterate(de); - } + hashtableDelete(it->early_iterate_entries, de); // just try delete (might not be here) } } @@ -1919,24 +1849,22 @@ static void handleFlushdb(int dbid) { while ((node = listNext(&li)) != NULL) { bgIterator *it = listNodeValue(node); + // Let the low-level iterator know the DB is being flushed + it->keyset_iter->flushDb(it->keyset_iter, dbid); + if (should_abort_iterators || it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { - terminateIteratorForFlush(it, dbid); + if (!it->terminated) bgIteratorTerminate(it); } else { - // In this (limited) case, we're only flushing a single DB that contains < half the - // keys. We don't want to kill a full-sync replication. We will just continue with - // iteration, knowing that a replication client will also receive the FLUSHDB on the - // replication stream. - // It would be nice to do this with consistent snapshot also, but given that this is a - // very rare condition, development is not justified to save off the DB for deferred - // delete. This would add a lot of complexity as well as memory implications. - preserveIteratorItemsForFlush(it, dbid); - it->keyset_iter->flushDb(it->keyset_iter, dbid); + /* In this (limited) case, we're only flushing a single DB that contains < half the + * keys. We don't want to kill a full-sync replication. We will just continue with + * iteration, knowing that a replication client will also receive the FLUSHDB on the + * replication stream. There's no need to worry about the items themselves. Since + * we've incremented the refcount, the items still in queue won't be physically deleted. */ // Send a flushdb event to notify the client if (BGITERATION_DEBUG) { debugBuffer = sdscatprintf(debugBuffer, "FLUSH: %d\n", dbid); } - bgIteratorItem *item = itemFreeList_getElementOrAllocate(); item->type = BGITERATOR_ITEM_FLUSHDB; item->dbid = dbid; @@ -1948,14 +1876,13 @@ static void handleFlushdb(int dbid) { } -static bool expediteKeysForWriteOnAllIterators( - int dbid, - struct serverCommand *cmd, - int argc, - robj **argv, - keyReference *keyrefs, - int numKeys, - hashtable *waitingOnKeys) { +static bool expediteKeysForWriteOnAllIterators(int dbid, + struct serverCommand *cmd, + int argc, + robj **argv, + keyReference *keyrefs, + int numKeys, + hashtable *waitingOnKeys) { bool mustBlock = false; listIter li; @@ -1989,8 +1916,7 @@ static bool expediteKeysForMultiExec(client *c, hashtable *waitingOnKeys) { /* For MULTI/EXEC, Valkey buffers all of the commands until hitting the EXEC. * At this point, the client holds all of the commands to be executed. This function searches * for all of the keys used by any of the buffered write commands. In addition, if SWAPDB or - * SELECT is used, this tracks the DBIDs through various swap/select operations. - */ + * SELECT is used, this tracks the DBIDs through various swap/select operations. */ /* There's a special concern for a NON-consistent iteration with replication. If the keys are * all "future" keys (which haven't been processed by the iterator yet), then we don't expedite @@ -2014,8 +1940,7 @@ static bool expediteKeysForMultiExec(client *c, hashtable *waitingOnKeys) { * these will be caught on the 2nd time around. * * Checking replication status before/after ensures that there can only be a single recursive - * call. - */ + * call. */ bool initiallyAnIteratorWillReplicate = anIteratorWillReplicateForThisCommand(); bool mustBlock = false; @@ -2068,29 +1993,36 @@ static bool expediteKeysForMultiExec(client *c, hashtable *waitingOnKeys) { zfree(cur_to_orig_db); if (!initiallyAnIteratorWillReplicate && anIteratorWillReplicateForThisCommand()) { - // We've decided to replicate. Re-process the MULTI/EXEC just once more to make sure that - // we didn't miss any keys at the beginning. This can't continue to recurse because - // `initiallyAnIteratorWillReplicate` will be TRUE in the recursive call. Note that the - // recursive call may add additional entries to `waitingOnKeys`. + /* We've decided to replicate. Re-process the MULTI/EXEC just once more to make sure that + * we didn't miss any keys at the beginning. This can't continue to recurse because + * `initiallyAnIteratorWillReplicate` will be TRUE in the recursive call. Note that the + * recursive call may add additional entries to `waitingOnKeys`. */ if (expediteKeysForMultiExec(c, waitingOnKeys)) mustBlock = true; } return mustBlock; } -static bgIterator * bgIteratorCreate( - const char *name, - int flags, - bgIteratorReplDoneFunc repldone, - bgIteratorCleanupFunc cleanup, - void *privdata, - bgIterationType iter_type, - genericIterator *keyset_iter) { + +static bgIterator *bgIteratorCreate(const char *name, + bgIteratorConsistency consistency, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata, + bgIterationType iter_type, + genericIterator *keyset_iter) { serverAssert(onValkeyMainThread()); serverAssert(server.cluster_enabled || iter_type == BGITERATION_TYPE_FULLSCAN); - serverAssert(server.cluster_enabled // Don't allow CONSISTENT & REPLICATION - || !(flags & BGITERATOR_FLAG_CONSISTENT) // unless cluster mode (avoids - || !(flags & BGITERATOR_FLAG_REPLICATION)); // complications with SWAPDB & FLUSHDB) + + int flags; + switch (consistency) { + case BGITERATOR_CONSISTENCY_NONE: flags = 0; break; + case BGITERATOR_CONSISTENCY_START: flags = BGITERATOR_FLAG_CONSISTENT; break; + case BGITERATOR_CONSISTENCY_EVENTUAL: flags = BGITERATOR_FLAG_REPLICATION; break; + default: serverAssert(false); + } + // Consistent, with replication - doesn't make sense. + serverAssert(!((flags & BGITERATOR_FLAG_CONSISTENT) && (flags & BGITERATOR_FLAG_REPLICATION))); bgIterator *it = zmalloc(sizeof(bgIterator)); it->name = sdsnew(name); @@ -2109,8 +2041,8 @@ static bgIterator * bgIteratorCreate( it->iteration_type = iter_type; it->consistent_modification_id = bgIteration_epoch++; it->keyset_iter = keyset_iter; - it->early_iterate_entries = dictCreate(&dictEntryPtrDictType); - dictExpand(it->early_iterate_entries, BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE); + it->early_iterate_entries = hashtableCreate(&dbEntryPtrHashtableType); + hashtableExpand(it->early_iterate_entries, BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE); it->current_item = NULL; it->client_is_active = false; it->completed = false; @@ -2138,7 +2070,7 @@ static bgIterator * bgIteratorCreate( serverAssert(bgIterator_timeproc_id != AE_ERR); } - if (dictAdd(nameToIterator, (void*)it->name, it) != DICT_OK) { + if (dictAdd(nameToIterator, it->name, it) != DICT_OK) { // Can't have 2 iterators with the same name! serverAssert(false); } @@ -2151,37 +2083,34 @@ static bgIterator * bgIteratorCreate( } - -//============================================================================================= -// PUBLIC INTERFACE: Iterator creation and use -//============================================================================================= +/* ============================================================================================= + * PUBLIC INTERFACE: Iterator creation and use + * ============================================================================================= */ // PUBLIC API -bgIterator * bgIteratorCreateFullScanIter( - const char *name, - int flags, - bgIteratorReplDoneFunc repldone, - bgIteratorCleanupFunc cleanup, - void *privdata) { - return bgIteratorCreate(name, flags, repldone, cleanup, privdata, BGITERATION_TYPE_FULLSCAN, - fullScanIteratorCreate()); +bgIterator *bgIteratorCreateFullScanIter(const char *name, + bgIteratorConsistency consistency, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata) { + return bgIteratorCreate(name, consistency, repldone, cleanup, privdata, + BGITERATION_TYPE_FULLSCAN, fullScanIteratorCreate()); } // PUBLIC API -bgIterator * bgIteratorCreateSlotsIter( - const char *name, - int flags, - const int *slots, - int slots_count, - bgIteratorReplDoneFunc repldone, - bgIteratorCleanupFunc cleanup, - void *privdata) { - return bgIteratorCreate(name, flags, repldone, cleanup, privdata, BGITERATION_TYPE_CLUSTERSLOT, - clusterSlotIteratorCreate(slots, slots_count)); +bgIterator *bgIteratorCreateSlotsIter(const char *name, + bgIteratorConsistency consistency, + const int *slots, + int slots_count, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata) { + return bgIteratorCreate(name, consistency, repldone, cleanup, privdata, + BGITERATION_TYPE_CLUSTERSLOT, clusterSlotIteratorCreate(slots, slots_count)); } // PUBLIC API -bgIterator * bgIteratorFind(const char *name) { +bgIterator *bgIteratorFind(const char *name) { serverAssert(onValkeyMainThread()); sds sdsname = sdsnew(name); @@ -2200,14 +2129,14 @@ const char *bgIteratorName(bgIterator *it) { // PUBLIC API void bgIteratorGetStatus(bgIterator *it, bgIteratorStatus *status) { - status->dbentries_queued = it->dbentries_queued; - status->dbentries_processed = it->dbentries_processed; - status->replication_queued = it->replication_queued; + status->dbentries_queued = it->dbentries_queued; + status->dbentries_processed = it->dbentries_processed; + status->replication_queued = it->replication_queued; status->replication_processed = it->replication_processed; - status->swapdb_queued = it->swapdb_queued; - status->swapdb_processed = it->swapdb_processed; - status->flushdb_queued = it->flushdb_queued; - status->flushdb_processed = it->flushdb_processed; + status->swapdb_queued = it->swapdb_queued; + status->swapdb_processed = it->swapdb_processed; + status->flushdb_queued = it->flushdb_queued; + status->flushdb_processed = it->flushdb_processed; status->dbentry_clones_queued = it->dbentry_clones_queued; status->dbentry_clones_processed = it->dbentry_clones_processed; @@ -2217,8 +2146,9 @@ void bgIteratorGetStatus(bgIterator *it, bgIteratorStatus *status) { status->runtime_ms = elapsedMs(it->monotonic_start_time); monotime nonvolatile_item_start_time = it->monotonic_item_start_time; - status->current_item_ms = - (nonvolatile_item_start_time == 0) ? 0 : elapsedMs(nonvolatile_item_start_time); + status->current_item_ms = (nonvolatile_item_start_time == 0) + ? 0 + : elapsedMs(nonvolatile_item_start_time); } @@ -2235,7 +2165,7 @@ void bgIteratorTerminate(bgIterator *it) { } bgIteratorItem *terminationItem = itemFreeList_getElementOrAllocate(); - *terminationItem = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + *terminationItem = (bgIteratorItem){.type = BGITERATOR_ITEM_TERMINATED}; mutexQueueAdd(it->items_for_iterator, terminationItem); it->terminated = true; @@ -2249,19 +2179,20 @@ bool bgIteratorIsTerminating(bgIterator *it) { // PUBLIC API -bgIteratorItem * bgIteratorRead(bgIterator *it) { - serverAssert(it->current_item == NULL - || (it->current_item->type != BGITERATOR_ITEM_COMPLETE - && it->current_item->type != BGITERATOR_ITEM_TERMINATED)); +bgIteratorItem *bgIteratorRead(bgIterator *it) { + serverAssert(it->current_item == NULL || + (it->current_item->type != BGITERATOR_ITEM_COMPLETE && + it->current_item->type != BGITERATOR_ITEM_TERMINATED)); // First, clean up the previous item read if (it->current_item != NULL) { returnCurrentItemToValkey(it); - // To support unit tests. Normal clients call bgIteratorRead from an alternate thread. - // Without this, a unit test could get stuck waiting on the completion event because - // feed won't get invoked. For production, this is called regularly from the main thread. - if (onValkeyMainThread()) bgIteration_feedIterators_task(NULL, 0, NULL); + /* To support unit tests. Normal clients call bgIteratorRead from an alternate thread. + * Without this, a unit test could get stuck waiting on the completion event because + * feed won't get invoked. For production, feed is called regularly from the main thread. + * Note - this is checking that the exact same thread is used and shouldn't count modules. */ + if (pthread_equal(server.main_thread_id, pthread_self()) != 0) bgIteration_feedIterators_task(NULL, 0, NULL); } else { it->client_is_active = true; } @@ -2277,8 +2208,8 @@ bgIteratorItem * bgIteratorRead(bgIterator *it) { // PUBLIC API void bgIteratorClose(bgIterator *it) { if (it->current_item != NULL) { - if (it->current_item->type == BGITERATOR_ITEM_COMPLETE - || it->current_item->type == BGITERATOR_ITEM_TERMINATED) { + if (it->current_item->type == BGITERATOR_ITEM_COMPLETE || + it->current_item->type == BGITERATOR_ITEM_TERMINATED) { // Normal confirmation of background completion } else { // Client is initiating the termination @@ -2286,13 +2217,13 @@ void bgIteratorClose(bgIterator *it) { returnCurrentItemToValkey(it); it->current_item = itemFreeList_getElementOrAllocate(); - *(it->current_item) = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + *(it->current_item) = (bgIteratorItem){.type = BGITERATOR_ITEM_TERMINATED}; } } else { // terminated before first item read it->terminated = true; it->current_item = itemFreeList_getElementOrAllocate(); - *(it->current_item) = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + *(it->current_item) = (bgIteratorItem){.type = BGITERATOR_ITEM_TERMINATED}; } // We don't allocate extension items from the free list @@ -2303,10 +2234,9 @@ void bgIteratorClose(bgIterator *it) { } - -//============================================================================================= -// PUBLIC INTERFACE: Valkey main-thread support hooks -//============================================================================================= +/* ============================================================================================= + * PUBLIC INTERFACE: Valkey main-thread support hooks + * ============================================================================================= */ // PUBLIC API void bgIteration_init(void) { @@ -2363,10 +2293,10 @@ void bgIteration_keyDelete(int dbid, const_sds key) { bgIterator *it = listNodeValue(node); if (it->completed || it->terminated || !it->keyset_iter->isKeyInScope(it->keyset_iter, key)) continue; - if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT - && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { - if (!it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) - && !(dictFind(it->early_iterate_entries, de) != NULL)) { + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT && + ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { + if (!it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) && + !hashtableFind(it->early_iterate_entries, de, NULL)) { addEarlyIterationKey(it, de, dbid); // (may also add to inUseEntries) } } @@ -2374,17 +2304,16 @@ void bgIteration_keyDelete(int dbid, const_sds key) { removePtrFromEarlyIterate(de); - // We might be within the context of a command execution. This happens if the key is found to - // be expired when attempting to execute the command. In this case, we should treat the key as - // missing. If the key exists after the command executes, we can treat it like a new key. - // (If not in command execution, this is ok - it's reset at the beginning of command execution.) + /* We might be within the context of a command execution. This happens if the key is found to + * be expired when attempting to execute the command. In this case, we should treat the key as + * missing. If the key exists after the command executes, we can treat it like a new key. + * (If not in command execution, this is ok - it's reset at the beginning of command execution.) */ robj *oKey = createObject(OBJ_STRING, sdsdup(key)); listAddNodeHead(curCmdMissingKeys, oKey); } // PUBLIC API -// Notify bgIteration that a FLUSHALL is being performed outside of the normal client interface. void bgIteration_flushall(void) { handleFlushdb(-1); } @@ -2398,12 +2327,12 @@ bool bgIteration_blockClientIfRequired(client *c) { if (BGITERATION_DEBUG) { debugBuffer = sdscatprintf(debugBuffer, "BLCK?: (%d)%s\n", c->db->id, - createSdsFromClientArgv(c->argc, c->argv)); + createSdsFromClientArgv(c->argc, c->argv)); } - // Before executing a command or atomic transaction, the replication flag is cleared for each - // iterator. If it's determined that the command should replicate, the flag will be set - // as the command and keys are examined for expedite. + /* Before executing a command or atomic transaction, the replication flag is cleared for each + * iterator. If it's determined that the command should replicate, the flag will be set + * as the command and keys are examined for expedite. */ resetReplicationFlagForIterators(c); if (c->cmd->proc == flushdbCommand || c->cmd->proc == flushallCommand) { @@ -2428,7 +2357,7 @@ bool bgIteration_blockClientIfRequired(client *c) { keyReference *keyrefs = result.keys; if (numkeys > 0) { mustBlock = expediteKeysForWriteOnAllIterators( - c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); + c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); serverAssert(!(mustBlock && (c->flag.multi) && !(c->flag.script))); if (mustBlock && (c->flag.script)) { @@ -2442,7 +2371,7 @@ bool bgIteration_blockClientIfRequired(client *c) { receiveItemsBackFromIterators(true); // Blocking hashtableEmpty(waitOnKeys, NULL); mustBlock = expediteKeysForWriteOnAllIterators( - c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); + c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); } } getKeysFreeResult(&result); @@ -2460,7 +2389,7 @@ bool bgIteration_blockClientIfRequired(client *c) { if (mustBlock) { serverAssert(hashtableSize(waitOnKeys) > 0); - robj **waitKeysArgv = zmalloc(sizeof(robj*) * hashtableSize(waitOnKeys)); + robj **waitKeysArgv = zmalloc(sizeof(robj *) * hashtableSize(waitOnKeys)); robj *key; hashtableIterator hi; @@ -2488,29 +2417,28 @@ bool bgIteration_blockClientIfRequired(client *c) { // PUBLIC API -void bgIteration_handleCommandReplication( - int dbid, - struct serverCommand *cmd, - int argc, - robj **argv) { +void bgIteration_handleCommandReplication(int dbid, + struct serverCommand *cmd, + int argc, + robj **argv) { if (BGITERATION_DEBUG) { // DEBUG - enable this to capture replication not queued because iteration is inactive if (0 && !bgIteration_iterationActive() && (isWriteCmd(cmd) || cmd->proc == multiCommand)) { debugBuffer = sdscatprintf(debugBuffer, "REPL? INACT: (%d)%s\n", dbid, - createSdsFromClientArgv(argc, argv)); + createSdsFromClientArgv(argc, argv)); } } if (!bgIteration_iterationActive()) return; serverAssert(onValkeyMainThread()); - // Some commands are replicated which are not writes (like publish) these can be ignored. - // Be careful with MULTI which is not a write command, but must be replicated. + /* Some commands are replicated which are not writes (like publish) these can be ignored. + * Be careful with MULTI which is not a write command, but must be replicated. */ if (!isWriteCmd(cmd) && cmd->proc != multiCommand) return; if (BGITERATION_DEBUG) { debugBuffer = sdscatprintf(debugBuffer, "REPL?: (%d)%s\n", dbid, - createSdsFromClientArgv(argc, argv)); + createSdsFromClientArgv(argc, argv)); } if (cmd->proc == swapdbCommand) { @@ -2521,16 +2449,16 @@ void bgIteration_handleCommandReplication( handleSwapdb(id1, id2); } - // In the case that a key is touched in a different DB (COPY/MOVE) the key is recorded as - // a "special" key and than handled below. + /* In the case that a key is touched in a different DB (COPY/MOVE) the key is recorded as + * a "special" key and than handled below. */ int special_dbid = 0; sds special_key = NULL; dbEntry *special_dbEntry = NULL; if (cmd->proc == moveCommand) { - // The MOVE command succeeded. However MOVE requires special handling as it creates a new - // key in a different database. We need to make sure that we don't later try to iterate - // on the key as it would be a duplicate key at that point. So, instead, we will mark the - // newly created key as "early iterated". + /* The MOVE command succeeded. However MOVE requires special handling as it creates a new + * key in a different database. We need to make sure that we don't later try to iterate + * on the key as it would be a duplicate key at that point. So, instead, we will mark the + * newly created key as "early iterated". */ bool success = getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &special_dbid); serverAssert(success); // the command already succeeded, so this should work! @@ -2552,10 +2480,10 @@ void bgIteration_handleCommandReplication( } /* Implementation note regarding LUA and MULTI: LUA scripts and MULTI-EXEC blocks must be - * treated atomically. We need to ensure that either ALL of the replication (or none of the - * replication) for the atomic operation is processed by the iterator(s). This is handled - * naturally as we can only "complete" the iteration during the feeding process - and feeding - * is only performed when handling timer events (after the LUA/MULTI has completed). */ + * treated atomically. We need to ensure that either ALL of the replication (or none of the + * replication) for the atomic operation is processed by the iterator(s). This is handled + * naturally as we can only "complete" the iteration during the feeding process - and feeding + * is only performed when handling timer events (after the LUA/MULTI has completed). */ listIter li; listNode *node; @@ -2564,16 +2492,17 @@ void bgIteration_handleCommandReplication( bgIterator *it = listNodeValue(node); if (it->completed || it->terminated) continue; - // For consistent iteration, we only iterate values based on version. But for - // non-consistent iteration, we don't need to explicitly iterate any values newly created - // during the iteration. So we mark them as expedited. We know we have a new key if it - // was missing before the command, and exists now. + /* For consistent iteration, we only iterate values based on version. But for + * non-consistent iteration, we don't need to explicitly iterate any values newly created + * during the iteration. So we mark them as expedited. We know we have a new key if it + * was missing before the command, and exists now. */ + if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) { // Handle the special case of a key moved to a different DB if (special_dbEntry != NULL) { - if (it->cur_cmd_may_replicate - && !it->keyset_iter->hasPassedItem(it->keyset_iter, special_key, special_dbid)) { - dictAdd(it->early_iterate_entries, special_dbEntry, NULL); + if (it->cur_cmd_may_replicate && + !it->keyset_iter->hasPassedItem(it->keyset_iter, special_key, special_dbid)) { + hashtableAdd(it->early_iterate_entries, special_dbEntry); if (BGITERATION_DEBUG) { sds entryString = createEntryString(special_dbid, special_dbEntry); debugBuffer = sdscatprintf(debugBuffer, "EARLY(special): %s\n", entryString); @@ -2581,11 +2510,12 @@ void bgIteration_handleCommandReplication( } } - // Note: In the cases where there's a special command, we are copying or moving an - // item to a different DB. In these limited cases, we can only possibly be - // creating a single key. And if we've handled it here, we don't need to - // handle it as a "missing key" below. If we were to try to handle it as a - // standard "missing key", we would get the DBID incorrect. + /* Note: In the cases where there's a special command, we are copying or moving an + * item to a different DB. In these limited cases, we can only possibly be + * creating a single key. And if we've handled it here, we don't need to + * handle it as a "missing key" below. If we were to try to handle it as a + * standard "missing key", we would get the DBID incorrect. */ + } else if (listLength(curCmdMissingKeys) > 0) { listIter missingIt; listNode *missingNode; @@ -2596,13 +2526,14 @@ void bgIteration_handleCommandReplication( dbEntry *de = dbFind(server.db[dbid], (sds)key); if (de != NULL) { // It exists now! - if (it->cur_cmd_may_replicate - && !it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) { - // If the current command is allowed to replicate, and there is a new - // key which we haven't yet reached in iteration, it needs to be added - // to the set of early iterate entries. (We know that it's not already - // in that set because it's a newly created key!) - dictAdd(it->early_iterate_entries, de, NULL); + if (it->cur_cmd_may_replicate && + !it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) { + /* If the current command is allowed to replicate, and there is a new + * key which we haven't yet reached in iteration, it needs to be added + * to the set of early iterate entries. (We know that it's not already + * in that set because it's a newly created key!) */ + bool wasAdded = hashtableAdd(it->early_iterate_entries, de); + serverAssert(wasAdded); if (BGITERATION_DEBUG) { sds entryString = createEntryString(dbid, de); debugBuffer = sdscatprintf(debugBuffer, "EARLY(NEW): %s\n", entryString); @@ -2653,8 +2584,7 @@ void bgIteration_handleCommandReplication( * In the case of a client driven DEL command, the key will have already been deleted when * we hit this routine. In the case of EXPIRE/EVICT, they propagate happens before the key * is deleted. So if the key is missing, we can use the cached replication decision. But - * if the key still exists (indicating EXPIRE/EVICT) we evaluate it specially. - */ + * if the key still exists (indicating EXPIRE/EVICT) we evaluate it specially. */ bool shouldReplicateDelCommand = false; bool isDelCommand = isDeleteCmd(cmd); if (isDelCommand) { @@ -2665,8 +2595,8 @@ void bgIteration_handleCommandReplication( // NOTE: It's weird, but helpful, for both EXPIRE and EVICT the propagation happens // BEFORE the actual delete. So if the dbEntry still exists, we are doing // an expire/evict which is not preceded by blockClientIfRequired(). - if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) - || (dictFind(it->early_iterate_entries, de) != NULL)) { + if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) || + hashtableFind(it->early_iterate_entries, de, NULL)) { shouldReplicateDelCommand = true; } } else { @@ -2678,17 +2608,14 @@ void bgIteration_handleCommandReplication( } bool replicate = (it->iteration_flags & BGITERATOR_FLAG_REPLICATION && - ((!isDelCommand && it->cur_cmd_may_replicate) - || shouldReplicateDelCommand)); + ((!isDelCommand && it->cur_cmd_may_replicate) || shouldReplicateDelCommand)); if (replicate) { /* We will replicate the command in these cases: * 1) For consistent iteration - it->cur_cmd_may_replicate is always true * 2) For non-consistent, if any of the keys have been processed, expediteKeysForWrite * will ensure that ALL of the keys have been expedited - and we should replicate - * 3) For non-consistent, if NONE of the keys have been processed, no need to replicate - */ - + * 3) For non-consistent, if NONE of the keys have been processed, no need to replicate */ if (BGITERATION_DEBUG) { debugBuffer = sdscat(debugBuffer, " (queued)\n"); } @@ -2716,13 +2643,26 @@ size_t bgIteration_memoryInuseForReplication(void) { // PUBLIC API bool bgIteration_isEntryInuse(dbEntry *de) { serverAssert(onValkeyMainThread()); + if (!bgIteration_iterationActive()) return false; return isEntryInuseByAnyIterator(de); } // PUBLIC API -uint32_t bgIteration_getEpoch(void) { - return bgIteration_epoch; +void bgIteration_dbEntryModified(dbEntry *de) { + if (bgIteration_iterationActive()) { + bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(de); + if (md) md->iterator_epoch = bgIteration_epoch; + } +} + + +// PUBLIC API +void bgIteration_keyModified(int dbid, const_sds key) { + if (bgIteration_iterationActive()) { + dbEntry *de = dbFind(server.db[dbid], (sds)key); + if (de) bgIteration_dbEntryModified(de); + } } @@ -2737,11 +2677,12 @@ void bgIteration_updateDbEntryPtr(dbEntry *old, dbEntry *new) { listRewind(allIterators, &li); while ((node = listNext(&li)) != NULL) { bgIterator *it = listNodeValue(node); - if (dictDelete(it->early_iterate_entries, old) == DICT_OK) { + if (hashtableDelete(it->early_iterate_entries, old)) { if (BGITERATION_DEBUG) { debugBuffer = sdscatprintf(debugBuffer, "EARLY LIST UPDATE %p -> %p\n", (void *)old, (void *)new); } - dictAdd(it->early_iterate_entries, new, NULL); + bool wasAdded = hashtableAdd(it->early_iterate_entries, new); + serverAssert(wasAdded); } } } diff --git a/src/bgiteration.h b/src/bgiteration.h index dd6da71608f..78643311fe0 100644 --- a/src/bgiteration.h +++ b/src/bgiteration.h @@ -21,8 +21,7 @@ * implements the logic of the iteration client. * * Iteration clients are expected to read through the keyspace until the iteration is complete or - * terminated. An iteration client may not perform modifications on a key. - */ + * terminated. An iteration client may not perform modifications on a key. */ /* Avoids dependency on server.h */ typedef struct serverObject dbEntry; // An object with key/value inserted into main dictionary @@ -33,21 +32,30 @@ typedef struct client client; typedef struct bgIterator bgIterator; -/* Flag indicates that a consistent iteration is required. This is used to create a point-in-time - * iteration. The iteration client will see all keys AS THEY EXISTED at the time when the iterator - * was created. - * Note: The DBID provided with the DICTENTRY events is the original DBID (at the time of iteration - * start). SWAPDB events are NOT provided during a consistent iteration. */ -#define BGITERATOR_FLAG_CONSISTENT (1 << 0) - -/* Flag indicating that the replication stream for keys which have already been processed should be - * forwarded to the iteration client. Most useful for non-consistent iteration to track changes - * to keys already processed. By tracking changes, this allows an non-consistent iteration client - * to achieve a consistent view at the END of the iteration. - * NOTE: Replication events will be provided ordered and synchronized with any SWAPDB events. - * LIMITATION: Since SWAPDB events are not provided during CONSISTENT iteration, it is not - * permitted to use both CONSISTENT and REPLICATION on a non-clustermode instance. */ -#define BGITERATOR_FLAG_REPLICATION (1 << 1) +/* Consistency type for iteration. */ +typedef enum { + /* With no consistency requirements, dbEntries are provided to the iteration client as they + * appear at the time of iteration. No replication is provided. The only guarantee is that + * dbEntries which existed at the start of iteration, and remained through the duration of + * iteration, will be provided to the iteration client once (and only once). If a dbEntry is + * modified during iteration, either the old or the new value may be provided. */ + BGITERATOR_CONSISTENCY_NONE = 0, + + /* With consistency at the start of iteration, a point-in-time iteration is performed. The + * iteration client will see all keys AS THEY EXISTED at the time when the iterator was created. + * Note: The DBID provided with the DICTENTRY events is the original DBID (at the time of iteration + * start). SWAPDB events will not be provided. */ + BGITERATOR_CONSISTENCY_START = 1, + + /* With an eventually consistent iteration, dbEntries will be followed by relevant replication. + * This will allow a client to achieve a consistent state at the END of the iteration. Once a + * dbEntry has been provided to the iteration client, any replication related to that entry will + * also be forwarded to the iteration client. With eventual consistency, keys are provided as + * they are at the time of iteration. This mode requires that the iteration client be aware of + * SWAPDB events. If a SWAPDB is performed, the client will receive a SWAPDB event. + * Replication events will be provided ordered and synchronized with any SWAPDB events. */ + BGITERATOR_CONSISTENCY_EVENTUAL = 2 +} bgIteratorConsistency; /* When running an iterator with replication, a replication-done function (callback) may be @@ -60,8 +68,7 @@ typedef struct bgIterator bgIterator; * Returns true when an iterator stops accepting any replication item into the queue for the client. * If false is returned, replication will continue, and bgiteration will periodically call the callback * until true is returned. In this context, returning false indicates that the client is not ready to - * stop receiving replication, it is requesting that replication be continued. - */ + * stop receiving replication, it is requesting that replication be continued. */ typedef bool (*bgIteratorReplDoneFunc)(void *privdata); @@ -71,8 +78,7 @@ typedef bool (*bgIteratorReplDoneFunc)(void *privdata); * TERMINATED: will be passed as TRUE if the iteration process was terminated early (either by * the main thread calling bgIteratorTerminate() or the iteration client calling * bgIteratorClose()). - * PRIVDATA: this pointer is for data private to the iteration client. - */ + * PRIVDATA: this pointer is for data private to the iteration client. */ typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata); @@ -90,11 +96,10 @@ typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata); * to implement the iteration client which will read from the returned bgIterator. * * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the - * last item is read. - */ + * last item is read. */ bgIterator *bgIteratorCreateFullScanIter( const char *name, - int flags, + bgIteratorConsistency consistency, bgIteratorReplDoneFunc repldone, bgIteratorCleanupFunc cleanup, void *privdata); @@ -119,11 +124,10 @@ bgIterator *bgIteratorCreateFullScanIter( * just copy its data and leave the array untouched. * * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the - * last item is read. - */ + * last item is read. */ bgIterator *bgIteratorCreateSlotsIter( const char *name, - int flags, + bgIteratorConsistency consistency, const int *slots, int slots_count, bgIteratorReplDoneFunc repldone, @@ -132,8 +136,7 @@ bgIterator *bgIteratorCreateSlotsIter( /* Find an existing bgIterator by name. - * Returns NULL if the iterator does not exist (or has completed). - */ + * Returns NULL if the iterator does not exist (or has completed). */ bgIterator *bgIteratorFind(const char *name); @@ -162,8 +165,7 @@ typedef struct { /* Get the status of a background iteration. * - * The caller-provided bgIteratorStatus will be populated. - */ + * The caller-provided bgIteratorStatus will be populated. */ void bgIteratorGetStatus(bgIterator *iter, bgIteratorStatus *status); @@ -172,8 +174,7 @@ void bgIteratorGetStatus(bgIterator *iter, bgIteratorStatus *status); * An iteration is terminated by the Valkey main thread. It is expected that the iteration client * will continue to read, receiving BGITERATOR_ITEM_TERMINATED or BGITERATOR_ITEM_COMPLETE to * complete the iteration. (This is necessary to ensure proper cleanup.) - * NOTE: If the iteration client wants to terminate iteration, it may call bgIteratorClose(). - */ + * NOTE: If the iteration client wants to terminate iteration, it may call bgIteratorClose(). */ void bgIteratorTerminate(bgIterator *iter); @@ -182,8 +183,7 @@ void bgIteratorTerminate(bgIterator *iter); * This checks if the iterator is in the process of terminating. For the Valkey main thread, this * can be used to determine if a call has already been made to bgIteratorTerminate. For an * iteration client, it normally learns about terminate by reading the next item, this allows - * out-of-band detection of termination which can be useful when processing a large key. - */ + * out-of-band detection of termination which can be useful when processing a large key. */ bool bgIteratorIsTerminating(bgIterator *iter); @@ -254,8 +254,7 @@ typedef struct { * NOTE: Reading an item returns previously read items to Valkey. It is unsafe to reference an item * previously read. * - * (All memory management is the responsibility of the bgIterator - not the reader.) - */ + * (All memory management is the responsibility of the bgIterator - not the reader.) */ bgIteratorItem *bgIteratorRead(bgIterator *iter); @@ -267,8 +266,7 @@ bgIteratorItem *bgIteratorRead(bgIterator *iter); * BGITERATOR_ITEM_TERMINATED and signals that the background activity is complete. * * This may also be called by the iteration client to force terminate an iteration early. The - * bgIterator will be marked as terminated. - */ + * bgIterator will be marked as terminated. */ void bgIteratorClose(bgIterator *iter); @@ -276,10 +274,7 @@ void bgIteratorClose(bgIterator *iter); * BGITERATION HOOKS REQUIRED TO SUPPORT ITERATION - CALLS INSERTED INTO MAIN VALKEY CODE ********************************************************************************************/ -typedef struct { - uint32_t iterator_epoch; // iterator epoch of last modification -} bgIterationEntryMetadata; - +#define BGITERATION_ENTRY_METADATA_SIZE 4 /* Must be called once (and only once) at server startup. */ void bgIteration_init(void); @@ -292,15 +287,13 @@ bool bgIteration_iterationActive(void); /* Notify bgIteration that a key is being deleted. In Valkey, key deletion can occur in a READ * command if the key is expired. Note that this notification is more about status than memory. * Since the dbEntry is a reference counted object, the dbEntry can't be physically deleted if - * bgIteration is still actively using it. - */ + * bgIteration is still actively using it. */ void bgIteration_keyDelete(int dbid, const_sds key); /* Iteration needs to know if a FLUSHALL is being performed. For normal clients, this comes through * the standard "blockClientIfRequired" interface. This interface is for cases where Valkey - * performs the FLUSHALL operation independently of clients (e.g. when syncing with master). - */ + * performs the FLUSHALL operation independently of clients (e.g. when syncing with master). */ void bgIteration_flushall(void); @@ -311,8 +304,7 @@ void bgIteration_flushall(void); * * We can't update the dbEntry if the entry is actually in use (bgIteration_isEntryInuse)! * - * To simplify calling code, this function does nothing if old_entry == new_entry. - */ + * To simplify calling code, this function does nothing if old_entry == new_entry. */ void bgIteration_updateDbEntryPtr(dbEntry *old_entry, dbEntry *new_entry); @@ -332,16 +324,14 @@ void bgIteration_updateDbEntryPtr(dbEntry *old_entry, dbEntry *new_entry); * performs SWAPDB, a synchronous block may be performed (returning false) on * individual commands within the script. * - * Note: this function should be called for all commands (not just writes). - */ + * Note: this function should be called for all commands (not just writes). */ bool bgIteration_blockClientIfRequired(client *c); /* After execution of a write command, the Valkey main thread must provide the command to iterators * which are interested in the replication feed. It is required that all commands have been passed * through bgIteration_blockClientIfRequired(), however, it is permitted that the command can be - * re-written for propagation. - */ + * re-written for propagation. */ void bgIteration_handleCommandReplication( int dbid, struct serverCommand *cmd, @@ -351,8 +341,7 @@ void bgIteration_handleCommandReplication( /* The memory that bgIteration uses while temporarily buffering replication data is not included in * the maxmemory computation used for eviction. This function provides insight into the current - * amount of memory used for buffered replication data. - */ + * amount of memory used for buffered replication data. */ size_t bgIteration_memoryInuseForReplication(void); @@ -360,7 +349,10 @@ size_t bgIteration_memoryInuseForReplication(void); bool bgIteration_isEntryInuse(dbEntry *de); -/* Get the current iteration epoch, for tagging metadata on keys. */ -uint32_t bgIteration_getEpoch(void); +/* Notify bgIteration that a dbEntry has been added/modified. + * - If caller has a dbEntry*, dbEntryModified is more efficient + * - If caller has a dbid/key, a lookup is performed to find the dbEntry */ +void bgIteration_dbEntryModified(dbEntry *de); +void bgIteration_keyModified(int dbid, const_sds key); #endif diff --git a/src/db.c b/src/db.c index cdf41c6a8b7..4b6f3c11a39 100644 --- a/src/db.c +++ b/src/db.c @@ -432,8 +432,7 @@ void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags) { } else { dbSetValue(db, key, valref, 1, NULL); } - bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(*valref); - if (md) md->iterator_epoch = bgIteration_getEpoch(); + bgIteration_dbEntryModified(*valref); if (!(flags & SETKEY_KEEPTTL)) removeExpire(db, key); if (!(flags & SETKEY_NO_SIGNAL)) signalModifiedKey(c, db, key); } @@ -762,15 +761,7 @@ long long dbTotalServerKeyCount(void) { void signalModifiedKey(client *c, serverDb *db, robj *key) { touchWatchedKey(db, key); trackingInvalidateKey(c, key, 1); - - /* If bgIteration is running, need to maintain the iteration epoch. */ - if (bgIteration_iterationActive()) { - dbEntry *o = dbFind(db, objectGetVal(key)); - if (o) { - bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(o); - if (md) md->iterator_epoch = bgIteration_getEpoch(); - } - } + bgIteration_keyModified(db->id, objectGetVal(key)); } void signalFlushedDb(int dbid, int async) { diff --git a/src/defrag.c b/src/defrag.c index 670f83bee73..e6ecad0227e 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -43,6 +43,7 @@ #include "eval.h" #include "script.h" #include "module.h" +#include "bgiteration.h" #include #include @@ -708,6 +709,8 @@ static void defragKey(defragKeysCtx *ctx, robj **elemref) { unsigned char *newzl; ob = *elemref; + if (bgIteration_isEntryInuse(ob)) return; + /* Try to defrag robj and/or string value. */ if ((newob = activeDefragStringOb(ob))) { *elemref = newob; @@ -815,6 +818,11 @@ static void defragPubsubScanCallback(void *privdata, void *elemref) { * and 1 if time is up and more work is needed. */ static int defragLaterItem(robj *ob, unsigned long *cursor, monotime endtime, int dbid) { if (ob) { + if (bgIteration_isEntryInuse(ob)) { + *cursor = 0; + return 0; + } + if (ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST) { return scanLaterList(ob, cursor, endtime); } else if (ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HASHTABLE) { diff --git a/src/expire.c b/src/expire.c index b31f57465cc..40b1410f1b7 100644 --- a/src/expire.c +++ b/src/expire.c @@ -39,6 +39,7 @@ #include "cluster.h" #include "cluster_migrateslots.h" #include "util.h" +#include "bgiteration.h" /*----------------------------------------------------------------------------- * Incremental collection of expired keys. @@ -167,6 +168,14 @@ void fieldExpireScanCallback(void *privdata, void *volaKey, int didx) { robj *o = volaKey; serverAssert(o); serverAssert(hashTypeHasVolatileFields(o)); + + data->has_more_expired_entries = false; + + if (bgIteration_isEntryInuse(o)) { + data->sampled++; + return; + } + mstime_t now = server.mstime; size_t expired_fields = dbReclaimExpiredFields(o, data->db, now, data->max_entries, didx); if (expired_fields) { diff --git a/src/server.c b/src/server.c index 6b2942c3714..14e32beb6a2 100644 --- a/src/server.c +++ b/src/server.c @@ -3023,11 +3023,11 @@ void initServer(void) { /* Set object metadata size before creating any database key objects */ if (server.forkless_options_supported) { - /* NOTE: At this time, there is only one reason for dbEntry metadata. bgIteration. However, + /* NOTE: At this time, there is only one reason for dbEntry metadata: bgIteration. However, * if/when new metadata options are added, we will need to compute the size of a variable * size metadata, and provide appropriate accessors to access the specific portion of the * metadata (each of which may/may not exist, based on immutable startup parameters). */ - objectSetMetadataSize(sizeof(bgIterationEntryMetadata)); + objectSetMetadataSize(BGITERATION_ENTRY_METADATA_SIZE); } createDatabaseIfNeeded(0); /* The default database should always exist */ @@ -3711,10 +3711,57 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot) if (propagate_to_slot_migration) clusterFeedSlotExportJobs(dbid, argv, argc, slot); } -// If true, a MULTI has been sent to bgIterator. -// Remember to send the matching EXEC in propagatePendingCommands(). -static bool sentMultiToBgIterator = false; -static int lastDbidSentToBgIterator; +/* BgIteration requires that replicaton is sent after each command, however the + * alsoPropagate mechanism queues replication until the end of the transaction + * (when propagatePendingCommands is invoked). Also, the propagation mechanism + * strips out multi/exec, adding them back during propagatePendingCommands (if + * necessary). This function ensures that replication, including multi/exec are + * sequenced with the commands for bgIteration. + * + * Called from alsoPropagate with regular params. + * Called from propagatePendingCommands with dbid = -1 (to close multi/exec). */ +static void propagateToBgIteration(int dbid, int argc, robj **argv, int target) { + /* STATIC indicates that we have sent the MULTI, and need to match it with + * an EXEC during propagatePendingCommands. */ + static bool sentMultiToBgIterator = false; + /* STATIC indicates that last DBID that was sent, so that we can use the + * same DBID when sending a generated EXEC. */ + static int lastDbidSentToBgIterator; + + if (dbid >= 0) { + // Called from alsoPropagate() to replicate a command + if (target & PROPAGATE_REPL && bgIteration_iterationActive()) { + if (!sentMultiToBgIterator && (scriptIsRunning() || server.in_exec)) { + /* For a script or multi/exec, we should be sending the MULTI at + * the beginning of the execution unit. There shouldn't be any + * commands in the propagation queue yet. */ + serverAssert(server.also_propagate.numops == 0); + /* If this is the first propagated command of a script or multi, + * make it a transaction. It may turn out that there is only 1 + * command in the MULTI block, but we can't know that now. + * Unlike regular replication, we can't defer all of the + * replication until we know for sure. We must call bgIteration + * after each command. */ + static struct serverCommand *cmd_multi = NULL; // STATIC + if (cmd_multi == NULL) cmd_multi = lookupCommandOrOriginal(&shared.multi, 1); + bgIteration_handleCommandReplication(dbid, cmd_multi, 1, &shared.multi); + sentMultiToBgIterator = true; + } + struct serverCommand *cmd = lookupCommandOrOriginal(argv, argc); + bgIteration_handleCommandReplication(dbid, cmd, argc, argv); + lastDbidSentToBgIterator = dbid; + } + } else { + // Called from propagatePendingCommands() to finalize a transaction + if (sentMultiToBgIterator) { + // If a MULTI was sent to bgIterator via alsoPropagate(), then send the matching EXEC. + static struct serverCommand *cmd_exec = NULL; // STATIC + if (cmd_exec == NULL) cmd_exec = lookupCommandOrOriginal(&shared.exec, 1); + bgIteration_handleCommandReplication(lastDbidSentToBgIterator, cmd_exec, 1, &shared.exec); + sentMultiToBgIterator = false; + } + } +} /* Used inside commands to schedule the propagation of additional commands * after the current command is propagated to AOF / Replication. @@ -3728,28 +3775,7 @@ static int lastDbidSentToBgIterator; * stack allocated). The function automatically increments ref count of * passed objects, so the caller does not need to. */ void alsoPropagate(int dbid, robj **argv, int argc, int target, int slot) { - if (target & PROPAGATE_REPL && bgIteration_iterationActive()) { - // Note that bgIterator must be invoked immediately after each command. This is required - // by the bgIterator state machine. It's NOT ok to call bgIterator from propagateNow as - // that handles all of the commands for a transaction at the end. - // THIS FUNCTION (alsoPropagate) is called after each command. - if (!sentMultiToBgIterator && (scriptIsRunning() || server.in_exec)) { - // For a script or multi/exec, we should be sending the MULTI at the beginning of the - // execution unit. There shouldn't be any commands in the propagation queue yet. - serverAssert(server.also_propagate.numops == 0); - // If this is the first propagated command of a script or multi, make it a transaction. - // It may turn out that there is only 1 command in the MULTI block, but we can't know - // that now. Unlike regular replication, we can't defer all of the replication until - // we know for sure. We must call bgIterator after each command. - static struct serverCommand *cmd_multi = NULL; // STATIC to avoid repeated lookups - if (cmd_multi == NULL) cmd_multi = lookupCommandOrOriginal(&shared.multi, 1); - bgIteration_handleCommandReplication(dbid, cmd_multi, 1, &shared.multi); - sentMultiToBgIterator = true; - } - struct serverCommand *cmd = lookupCommandOrOriginal(argv, argc); - bgIteration_handleCommandReplication(dbid, cmd, argc, argv); - lastDbidSentToBgIterator = dbid; - } + propagateToBgIteration(dbid, argc, argv, target); robj **argvcopy; int j; @@ -3817,16 +3843,11 @@ void updateCommandLatencyHistogram(struct hdr_histogram **latency_histogram, int * multiple separated commands. Note that alsoPropagate() is not affected * by CLIENT_PREVENT_PROP flag. */ static void propagatePendingCommands(void) { - // Note: This is done before the check on server.also_propagate.numops. Numops might be zero - // if there is no replica but we might be running bgIteration for something other than - // replication. If we sent the multi (to bgIteration), we need to send the matching exec. - if (sentMultiToBgIterator) { - // If a MULTI was sent to bgIterator via alsoPropagate(), then send the matching EXEC. - static struct serverCommand *cmd_exec = NULL; // STATIC to avoid repeated lookups - if (cmd_exec == NULL) cmd_exec = lookupCommandOrOriginal(&shared.exec, 1); - bgIteration_handleCommandReplication(lastDbidSentToBgIterator, cmd_exec, 1, &shared.exec); - sentMultiToBgIterator = false; - } + /* This is done before the check on server.also_propagate.numops. Numops + * might be zero if there is no replica but we might be running bgIteration + * for something other than replication. If we sent the multi (to + * bgIteration), we need to send the matching exec. */ + propagateToBgIteration(-1, 0, NULL, 0); if (server.also_propagate.numops == 0) return; diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp new file mode 100644 index 00000000000..5c6b38ce115 --- /dev/null +++ b/src/unit/test_bgiteration.cpp @@ -0,0 +1,3076 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + +// Just for the moment, until https://github.com/valkey-io/valkey/issues/3450 is resolved +// clang-format off +#include "generated_wrappers.hpp" +#include + +using namespace ::testing; + +extern "C" { + #include "stdlib.h" + #include "bgiteration.h" + #include "server.h" + #define using usingvar // compile hack + #include "module.h" // uses "using" keyword + #undef using + extern hashtableType commandSetType; + extern dictType keylistDictType; + void bgIteration_feedIterators(void); + void createSharedObjects(void); + void hashtableDump(hashtable *ht); + void bgIteration_unitTestDisableCloning(void); + void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes); + static size_t mockHashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); + size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid); +} + + +// The private data is a pointer to arbitrary data. This value is used just to +// test that the correct value is passed through. +#define PRIVDATA reinterpret_cast(12345) + +typedef int32_t bgIterationEntryMetadata; // opaque 4 bytes +static_assert(sizeof(bgIterationEntryMetadata) == BGITERATION_ENTRY_METADATA_SIZE); + +// A bgIteration cleanup function used for testing. +static int cleanupCount; +static bool cleanupTerminated; +static void iteratorCleanupFn(bool terminated, void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + cleanupCount++; + cleanupTerminated = terminated; +} + +// A bgIteration repldone function used for testing. +static int replDoneConfirmed; +static bool iteratorRepldoneFn(void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + replDoneConfirmed++; + return true; +} + +// A more complicated repldone function that can delay the replcation done condition. +static int replDoneRejected; +static bool iteratorRepldoneFnNotBeingReadyInitially(void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + // This is to test the behavior when Repl Done function is not ready to be executed. + if (replDoneRejected == 0) { + replDoneRejected++; + return false; + } + replDoneConfirmed++; + return true; +} + + +/* This mock for hashtableScan will return the items in lexical order. It assumes that the entries + * are robjs containing an sds string for the key. The key is expected to begin with a capital + * letter [A-Z]. The caller passes 0 as the cursor to start the iteration. The returned cursor + * value will indicate the prior letter returned (1=A, ...). After entries starting with 'Z' have + * been returned, the cursor of 0 will indicate that the scan is complete. Note that all entries + * starting with the same letter will be returned in a single call. */ +static size_t mockHashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata) { + // Just in case, if it's not one of our hashtables, use the unmocked function + bool our_ht = (server.db[0]->keys && ht == kvstoreGetHashtable(server.db[0]->keys, 0)) + || (server.db[1]->keys && ht == kvstoreGetHashtable(server.db[1]->keys, 0)); + if (!our_ht) return __real_hashtableScan(ht, cursor, fn, privdata); + + // Collect all entries from the hashtable + std::vector entries; + hashtableIterator *iter = hashtableCreateIterator(ht, 0); + dbEntry *entry; + while (hashtableNext(iter, (void **)&entry)) { + char first = objectGetKey(entry)[0]; + assert(first >= 'A' && first <= 'Z'); + entries.push_back(entry); + } + hashtableReleaseIterator(iter); + + // Sort by key lexicographically + std::sort(entries.begin(), entries.end(), [](dbEntry *a, dbEntry *b) { + return strcmp(objectGetKey(a), objectGetKey(b)) < 0; + }); + + // cursor 0 means start at 'A', otherwise start after the cursor letter + char startLetter = (char)('A' + cursor); + + // Find the first letter to emit + char emitLetter = 0; + for (dbEntry *e : entries) { + char first = objectGetKey(e)[0]; + if (first >= startLetter) { + emitLetter = first; + break; + } + } + + if (emitLetter == 0) return 0; + + // Call fn for all entries starting with emitLetter + for (dbEntry *e : entries) { + char first = objectGetKey(e)[0]; + if (first == emitLetter) fn(privdata, (void *)e); + } + + size_t nextCursor = (size_t)(emitLetter - 'A' + 1); + return (nextCursor > 25) ? 0 : nextCursor; +} + + +static bool mockHashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor) { + // Just in case, if it's not one of our hashtables, use the unmocked function + if (ht != kvstoreGetHashtable(server.db[0]->keys, 0) + && ht != kvstoreGetHashtable(server.db[1]->keys, 0)) + return __real_hashtableScanHasPassedKey(ht, key, cursor); + + return ((const char *)key)[0] < (char)('A' + cursor); +} + + +static const char *logfile = ""; + +/* Most of the bgIteration unit tests are based on a CMD instance with 2 DBs. There are 8 keys in + * each DB. The hashtableScan function is mocked to return the keys in a predictable order. + * + * There are a number of helper functions to simulate certain key modification actions within our + * test configuration. Note that this is isolated from the actual call to processCommand. + * + * Because most of bgIteration is based on an ordered processing of keys, it doesn't matter if we + * are simulating CMD or CME, full scan, or slot-based. The majority of tests are independent of + * these concerns. + * + * However, there are some tests which are are unique to these configurations and use a specialized + * derived class to handle the differences. We do not want to duplicate all of the tests for + * the different configurations, but we do want to ensure that each configuration works properly. + * - bgIterationTestCluster - handles tests unique to full scan in cluster mode + * - bgIterationTestClusterSlots - handles tests unique to cluster slot-based iteration + */ +class BgIterationTest : public ::testing::Test { + protected: + static const int DB_COUNT = 2; + static const int ITEMS_PER_DB = 8; + + private: + /* With the mock hashtableScan, we get keys in a predictable order. DB0 works with buckets + * containing groups of keys (which hashtableScan returns in a single call). DB1 returns + * each key individually, as more separate buckets. Convention (for test readability) is + * that keys beginning [A-M] would be in DB0 and keys beginning [N-Z] in DB1. Letters are + * intentionally skipped to allow for possible insertions. */ + const char *keys[DB_COUNT][ITEMS_PER_DB] = {{"B0", "B1", "B2", "E0", "E1", "H0", "H1", "H2"}, + {"N0", "O0", "Q0", "R0", "T0", "U0", "W0", "Y0"}}; + + protected: + static const int TOTAL_ITEMS = DB_COUNT * ITEMS_PER_DB; + static const int LAST_ITEM = TOTAL_ITEMS - 1; + + MockValkey mock; + RealValkey real; + client *c = nullptr; // for general use in the tests (with common cleanup) + robj **orig_argv = nullptr; // Used when simulating multi + int orig_argc = 0; // Used when simulating multi + + + struct serverCommand dummy_cmd = {0}; + + // Helper functions for accessing the keys. We can access by db(0..1) and seq(0..7) + // or by item number (0..15). + // NOTE: These virtual functions can be overridden in subclasses which may have different item layout. + virtual const char * getKeyAtDbSeq(int db, int seq) { + assert(db < DB_COUNT); + assert(seq < ITEMS_PER_DB); + return keys[db][seq]; + } + + virtual int getDbFromItemNum(int itemNum) { + assert(itemNum < DB_COUNT * ITEMS_PER_DB); + return itemNum / ITEMS_PER_DB; + } + + virtual int getSeqFromItemNum(int itemNum) { + assert(itemNum < DB_COUNT * ITEMS_PER_DB); + return itemNum % ITEMS_PER_DB; + } + + const char * keyStr(int itemNum) { + return getKeyAtDbSeq(getDbFromItemNum(itemNum), getSeqFromItemNum(itemNum)); + } + + int itemNumFromKey(const char *key) { + for (int itemNum = 0; itemNum < DB_COUNT * ITEMS_PER_DB; itemNum++) { + if (strcmp(key, keyStr(itemNum)) == 0) return itemNum; + } + return -1; + } + + + // Do some general initialization before starting the suite. Normally, the tests are run in + // isolation - and this isn't much different than SetUp(). But if running the + // entire test suite together (just manually running the test executable), this gets called + // only once. + static void SetUpTestSuite() { + monotonicInit(); + + bzero(&server, sizeof(server)); + server.hz = 100; + server.logfile = const_cast(logfile); + createSharedObjects(); + + moduleInitModulesSystem(); + + server.commands = hashtableCreate(&commandSetType); + server.orig_commands = hashtableCreate(&commandSetType); + populateCommandTable(); + } + + + static void TearDownTestSuite() { + hashtableRelease(server.commands); + hashtableRelease(server.orig_commands); + } + + + void initializeServerDb(int dbid, int slot_count_bits = 0) { + server.db[dbid] = static_cast(zcalloc(sizeof(serverDb))); + server.db[dbid]->id = dbid; + server.db[dbid]->keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, 0); + server.db[dbid]->expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, 0); + server.db[dbid]->watched_keys = dictCreate(&keylistDictType); + } + + + robj *createStringObjectFromCString(const char *s) { + return createStringObject(s, strlen(s)); + } + + + void addKeyToDb(int dbid, const char *key, const char *val) { + robj *key_obj = createStringObjectFromCString(key); + robj *val_obj = createStringObjectFromCString(val); + dbAdd(server.db[dbid], key_obj, &val_obj); + decrRefCount(key_obj); + } + + + virtual void setupDatabase() { + /* For these unit tests, a standard database is constructed. But we will use our own + * mocked scan function to ensure a consistent iteration order */ + + server.dbnum = DB_COUNT; + server.cluster_enabled = false; + server.db = static_cast(zcalloc(sizeof(serverDb *) * server.dbnum)); + + for (int dbid = 0; dbid < server.dbnum; dbid++) { + initializeServerDb(dbid); + for (int keynum = 0; keynum < ITEMS_PER_DB; keynum++) { + addKeyToDb(dbid, keys[dbid][keynum], keys[dbid][keynum]); + } + } + + EXPECT_CALL(mock, hashtableScan(_,_,_,_)).WillRepeatedly(Invoke(mockHashtableScan)); + EXPECT_CALL(mock, hashtableScanHasPassedKey(_,_,_)).WillRepeatedly(Invoke(mockHashtableScanHasPassedKey)); + + if (0) debugPrintBucketInfo(); + } + + + void SetUp() override { + server.main_thread_id = pthread_self(); + server.forkless_options_supported = 1; + objectSetMetadataSize(BGITERATION_ENTRY_METADATA_SIZE); + + bgIteration_unitTestDisableCloning(); + + setupDatabase(); + + EXPECT_CALL(mock, aeCreateTimeEvent(_,_,_,_,_)).WillRepeatedly(Return(0)); + bgIteration_init(); + + cleanupCount = 0; + replDoneConfirmed = 0; + replDoneRejected = 0; + + // By default, do nothing for these + EXPECT_CALL(mock, blockClientInUseOnKeys(_,_,_)).WillRepeatedly(Return()); + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).WillRepeatedly(Return()); + + // By default, expect no permission issues + EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_,_,_,_,_,_)).WillRepeatedly(Return(ACL_OK)); + } + + + void TearDown() override { + bgIteration_feedIterators(); // process returning stuff before deleting DB + bgIteration_feedIterators(); // in case an iterator was closed there might be more + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i]->keys) kvstoreRelease(server.db[i]->keys); + if (server.db[i]->expires) kvstoreRelease(server.db[i]->expires); + dictRelease(server.db[i]->watched_keys); + zfree(server.db[i]); + } + zfree(server.db); + + if (c != NULL) freeTestClient(c); + } + + + // Deletes an item from the DB (often at the start of a test) - but does NOT notify + // bgIteration. bgIteration_keyDelete() should be explicitly called where needed. + void simpleDelItem(int itemNum) { + int db = getDbFromItemNum(itemNum); + + sds delKey = sdsnew(keyStr(itemNum)); + int rc = kvstoreHashtableDelete(server.db[db]->keys, 0, delKey); + ASSERT_EQ(rc, 1); + sdsfree(delKey); + } + + + // Find the actual dbEntry object by itemNum + dbEntry * getItem(int itemNum) { + int db = getDbFromItemNum(itemNum); + sds key = sdsnew(keyStr(itemNum)); + dbEntry *de = dbFind(server.db[db], key); + sdsfree(key); + return de; + } + + + // The test expects that the next item read will be BGITERATOR_ITEM_COMPLETE + void expectReadComplete(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + EXPECT_EQ(item->type, BGITERATOR_ITEM_COMPLETE); + bgIteratorClose(iter); + + int oldCleanupCount = cleanupCount; + bgIteration_feedIterators(); + EXPECT_EQ(cleanupCount, oldCleanupCount + 1); + } + + + // The test is cleaning up and isn't validating the remaining cleanup + void expectAnythingCleanup(bgIterator *iter) { + while (true) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + if ((item->type == BGITERATOR_ITEM_COMPLETE + || item->type == BGITERATOR_ITEM_TERMINATED)) { + bgIteratorClose(iter); + break; + } + } + bgIteration_feedIterators(); // Recognize the closed iterator + EXPECT_EQ(cleanupCount, 1); + } + + + void expectDictEntryMetadataMatch(dbEntry *de1, dbEntry *de2) { + bgIterationEntryMetadata *dm1 = static_cast(objectGetMetadata(de1)); + bgIterationEntryMetadata *dm2 = static_cast(objectGetMetadata(de2)); + + EXPECT_NE(dm1, nullptr); + EXPECT_NE(dm2, nullptr); + EXPECT_EQ(*dm1, *dm2); + } + + + // Useful when debugging new tests. It reads/prints all remaining items then crashes. + void cleanupIteratorDebugPrint(bgIterator *iter) { + bool done = false; + printf("[DEBUG] Printing bgIterator '%s' items:\n", bgIteratorName(iter)); + while (!done) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + switch (item->type) { + case BGITERATOR_ITEM_DBENTRY: + { + auto obj = item->u.dbe.de; + const char * keyStr = objectGetKey(obj); + printf("Entry: %s -> %s [itemNum: %i]\n", + keyStr, + static_cast(objectGetVal(obj)), + itemNumFromKey(keyStr)); + break; + } + case BGITERATOR_ITEM_REPLICATION: + printf("Repl: DB=%d : ", item->dbid); + for (int i = 0; i < item->u.repl.argc; i++) + printf("%s ", static_cast(objectGetVal(item->u.repl.argv[i]))); + printf("\n"); + break; + case BGITERATOR_ITEM_COMPLETE: + case BGITERATOR_ITEM_TERMINATED: + bgIteratorClose(iter); + done = true; + break; + default: + printf("unhandled: %d\n", item->type); + } + } + bgIteration_feedIterators(); // Recognize the closed iterator + ASSERT_TRUE(false); // Halt the test here + } + + + // Make a copy of the metadata + void * cloneMetadata(dbEntry *de) { + int size = objectGetMetadataSize(de); + void *metadata = zmalloc(size); + memcpy(metadata, objectGetMetadata(de), size); + return metadata; + } + + + // Compare a previous metadata copy to an existing entry + void compareAndFreeClonedMetadata(dbEntry *de, void *metadata) { + EXPECT_EQ(memcmp(objectGetMetadata(de), metadata, objectGetMetadataSize(de)), 0); + zfree(metadata); + } + + + // The test expects the next item will be a specific key + // The item value is verified against the default unless provided as a parameter. + void expectReadKey(bgIterator *iter, int itemNum, const char *value=nullptr) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_FALSE(item->u.dbe.is_cloned); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); + if (value) { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } else { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); + } + } + + + // The test expects the next item will be a specific key amd that the item is cloned. + // Metadata is tested (to make sure the clone includes the proper metadata). + // The item value is verified against the default unless provided as a parameter. + void expectReadClonedKey(bgIterator *iter, int itemNum, void *metadata, const char *value=nullptr) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_TRUE(item->u.dbe.is_cloned); + compareAndFreeClonedMetadata(item->u.dbe.de, metadata); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); + if (value) { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } else { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); + } + } + + + // Test expects the next key, but specified by key name, not itemNum. + void expectReadDbKeyValue(bgIterator *iter, int db, const char *key, const char *value) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), key); + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } + + + // Test expect to read a sequence of key items + void expectReadKeySequence(bgIterator *iter, int startItem, int endItem) { + for (int i = startItem; i <= endItem; i++) expectReadKey(iter, i); + } + + + // Just like expectReadKey, but also tests that a previous item is becoming unblocked. + void expectReadKeyWithUnblock(bgIterator *iter, int itemNum, int unblockItem, const char *value=nullptr) { + bool blocked = true; + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(unblockItem)))) + .WillOnce(Assign(&blocked, false)); + expectReadKey(iter, itemNum, value); + EXPECT_FALSE(blocked); + } + + + // Test expects to read a replication item matching the command help by client 'c' + void expectReadReplication(bgIterator *iter, client *c) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->dbid, c->db->id); + EXPECT_EQ(item->u.repl.cmd, c->cmd); + EXPECT_EQ(item->u.repl.argc, c->argc); + for (int i = 0; i < c->argc; i++) { + EXPECT_STREQ(static_cast(objectGetVal(item->u.repl.argv[i])), + static_cast(objectGetVal(c->argv[i]))); + } + } + + + // We expect to read a MULTI command which should have been inserted. + void expectReadMultiReplication(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("multi")); + } + + + // We expect to read an EXEC command which should have been inserted. + void expectReadExecReplication(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("exec")); + } + + + // Expecting that a DEL command should have been replicated. + void expectReadReplicationDel(bgIterator *iter, int itemNum) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->dbid, db); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("DEL")); + EXPECT_EQ(item->u.repl.argc, 2); + EXPECT_THAT(item->u.repl.argv[0], robjEqualsStr("DEL")); + EXPECT_THAT(item->u.repl.argv[1], robjEqualsStr(keyStr(itemNum))); + } + + + // Expecting that a special SWAPDB item has been inserted. + void expectReadSwapDB(bgIterator *iter, int db1, int db2) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_SWAPDB); + EXPECT_EQ(item->dbid, db1); + EXPECT_EQ(item->u.dbid2, db2); + } + + + static void debugPrintBucketInfoCb(void *privdata, void *entry) { + UNUSED(privdata); + dbEntry *de = (dbEntry *)entry; + printf("--- %s\n", objectGetKey(de)); + } + + void debugPrintBucketInfo() { + printf("*******DEBUG*******\n"); + for (int db = 0; db < server.dbnum; db++) { + int num_ht = kvstoreNumHashtables(server.db[db]->keys); + for (int slot = 0; slot < num_ht; slot++) { + hashtable *ht = kvstoreGetHashtable(server.db[db]->keys, slot); + if (!ht) continue; + + printf("DB: %d, slot: %d\n", db, slot); + size_t cursor = 0; + do { + cursor = hashtableScan(ht, cursor, debugPrintBucketInfoCb, NULL); + printf("-----------\n"); + } while (cursor != 0); + } + } + ASSERT_TRUE(false); + } + + + // Creates a client with a write command (SET) for the given itemNum + client *getWriteClient(int itemNum, const char *value) { + int db = getDbFromItemNum(itemNum); + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString("set"); + c->db = server.db[db]; + + c->argc = 3; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + c->argv[1] = createStringObjectFromCString(keyStr(itemNum)); + c->argv[2] = createStringObjectFromCString(value); + + return c; + } + + + // Create a client with a write command that touches multiple keys + client *getWriteMultiKeysClient( + const char * cmdName, + int dstItemNum, + const std::vector & srcItemsNum) { + + assert(!srcItemsNum.empty()); + + const int db = getDbFromItemNum(dstItemNum); + std::for_each(srcItemsNum.cbegin(), srcItemsNum.cend(), [&db, this](int srcItemNum) { + assert(db == getDbFromItemNum(srcItemNum)); + }); + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString(cmdName); + assert(c->cmd != nullptr); + c->db = server.db[db]; + + c->argc = 2 + srcItemsNum.size(); + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + c->argv[1] = createStringObjectFromCString(keyStr(dstItemNum)); + for (unsigned int i = 0; i < srcItemsNum.size(); i++) { + c->argv[2 + i] = createStringObjectFromCString(keyStr(srcItemsNum[i])); + } + + return c; + } + + + client *getWrite2KeysClient(const char * cmdName, int dstItemNum, int srcItemNum) { + return getWriteMultiKeysClient(cmdName, dstItemNum, {srcItemNum}); + } + + + client *getWrite3KeysClient( + const char * cmdName, int dstItemNum, int src1ItemNum, int src2ItemNum) { + return getWriteMultiKeysClient(cmdName, dstItemNum, {src1ItemNum, src2ItemNum}); + } + + + // Create a client with a MULTI/EXEC block. + // This parses a series of commands separated by ';' + // Example: getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx") + client *getMultiClient(const char *commands, int dbid = 0) { + char *commandsCopy = zstrdup(commands); // a mutable copy + char *commandStr, *commandStrSave; + char *token, *tokenSave; + + client *c = static_cast(zcalloc(sizeof(client))); + c->db = server.db[dbid]; + initClientMultiState(c); + c->flag.multi = 1; + c->mstate->cmd_flags |= CMD_WRITE; + + commandStr = strtok_r(commandsCopy, ";", &commandStrSave); + while (commandStr != NULL) { + + token = strtok_r(commandStr, " ", &tokenSave); + c->cmd = lookupCommandByCString(token); + + c->argv = static_cast(zcalloc(sizeof(robj*) * 5)); // command + 4 args + + for (int i = 0; token != NULL; i++) { + c->argv[i] = createStringObjectFromCString(token); + c->argc = i+1; + token = strtok_r(NULL, " ", &tokenSave); + } + + queueMultiCommand(c, 0); + freeClientArgv(c); + + commandStr = strtok_r(NULL, ";", &commandStrSave); + } + + c->cmd = lookupCommandByCString("exec"); + c->argc = 1; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromCString("EXEC"); + + zfree(commandsCopy); + return c; + } + + + // Initially, a MULTI client is set up to execute the EXEC command (which examines the + // contents of the multi/exec block). This function advances the client to begin executing + // the individual commands within the multi/exec block. + void advanceMultiClientToCommand(client *c, int cmdNum) { + assert(cmdNum >= 0 && cmdNum < c->mstate->count); + if (cmdNum == 0) { + // Save off the EXEC + orig_argc = c->argc; + orig_argv = c->argv; + } + c->argc = c->mstate->commands[cmdNum].argc; + c->argv = c->mstate->commands[cmdNum].argv; + c->argv_len = c->mstate->commands[cmdNum].argv_len; + c->cmd = c->realcmd = c->mstate->commands[cmdNum].cmd; + } + + + // A client with a fictional command: + // SETGET + // - writes a value to the first key (making this CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY) + // - reads a second key + client *getSetGetClient(int itemNum1, const char *value1, int itemNum2) { + // Fictional command which writes to 1st key and reads the 2nd + int db = getDbFromItemNum(itemNum1); + assert(db == getDbFromItemNum(itemNum2)); // (this would be a testcase error) + + client *c = static_cast(zcalloc(sizeof(client))); + struct serverCommand *cmd + = static_cast(zcalloc(sizeof(struct serverCommand))); + + cmd->fullname = const_cast("SETGET"); + cmd->arity = 4; + cmd->flags = CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY; + + cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INDEX; + cmd->legacy_range_key_spec.bs.index.pos = 1; // firstkey + cmd->legacy_range_key_spec.fk.range.lastkey = -1; + cmd->legacy_range_key_spec.fk.range.keystep = 2; + + c->cmd = cmd; + c->db = server.db[db]; + + c->argc = 4; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromCString(cmd->fullname); + c->argv[1] = createStringObjectFromCString(keyStr(itemNum1)); + c->argv[2] = createStringObjectFromCString(value1); + c->argv[3] = createStringObjectFromCString(keyStr(itemNum2)); + + return c; + } + + + // Client with a fictional write command with no keys specified + client *getNoKeysWriteClient() { + // Fictional command which is marked WRITE, but has no keys. + client *c = static_cast(zcalloc(sizeof(client))); + struct serverCommand *cmd + = static_cast(zcalloc(sizeof(struct serverCommand))); + + cmd->fullname = const_cast("NOKEYSWRITE"); + cmd->arity = 1; + cmd->flags = CMD_WRITE; + + cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INVALID; // No keys + + c->cmd = cmd; + c->db = server.db[0]; + + c->argc = 1; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromCString(cmd->fullname); + + return c; + } + + + void freeClientArgv(client *c) { + for (int i = 0; i < c->argc; i++) decrRefCount(c->argv[i]); + zfree(c->argv); + c->argv = NULL; + c->argc = 0; + } + + + // During testing, we create some fake commands. This checks if the command is real or fake. + // A fake command is dynamically allocated and can be freed. Real commands are static. + bool isRealValkeyCommand(struct serverCommand *cmd) { + return lookupCommandByCString(cmd->declared_name); + } + + + void freeTestClient(client *c) { + // If the current command references one of the multi commands, set it back to the EXEC + if (c->mstate != NULL) { + for (int i = 0; i < c->mstate->count; i++) { + if (c->argv == c->mstate->commands[i].argv) { + c->argc = orig_argc; + c->argv = orig_argv; + orig_argc = 0; + orig_argv = nullptr; + break; + } + } + } + freeClientMultiState(c); + freeClientArgv(c); + + if (!isRealValkeyCommand(c->cmd)) zfree(c->cmd); + + zfree(c); + } + + + // Simulate what happens when a write command is blocked + void simulateBlockedWrite(client *c, int expectedNumberBlockedKeys = 1) { + EXPECT_CALL(mock, blockClientInUseOnKeys(c,expectedNumberBlockedKeys,_)).Times(1); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_TRUE(blocked); + } + + + // Simulate what happens when a write command isn't blocked + void simulateUnblockedWrite(client *c) { + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + } + + + // Simulate what happens when a write command is NOT blocked, because the key can be cloned + // and expedited. This requires a scenario where we would normally need to block the + // client so that bgIteration can process the item. + void simulateClonedWrite(bgIterator *it, client *c) { + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + unsigned long initialClones = status.dbentry_clones_queued; + + // Client should not get blocked + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + + // Ensure that cloning took place + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, (initialClones + 1)); + + // Ensure that the real item isn't inuse (because we cloned it instead) + dbEntry *de = dbFind(c->db, static_cast(objectGetVal(c->argv[1]))); + ASSERT_FALSE(bgIteration_isEntryInuse(de)); + } + + + // Simulates what happens when a write command (SET) actually executes. This requires a + // scenario where we would NOT be blocked on the write. It actually alters the value of + // the key and updates the metadata. + void simulateUnblockedWriteWithModification(client *c) { + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + + // Fake execution of the command - touch the iterator_epoch counter and swap the value + // We need to duplicate the value because setKey() can reallocate it. + robj *value = dupStringObject(c->argv[2]); + setKey(c, c->db, c->argv[1], &value, SETKEY_ADD_OR_UPDATE); + + // Let's make sure that setKey updated the iteration epoch (as it should have) + dbEntry *de = dbFind(c->db, static_cast(objectGetVal(c->argv[1]))); + bgIterationEntryMetadata *md = static_cast(objectGetMetadata(de)); + bgIterationEntryMetadata md_after_setkey = *md; + // Now update the md again, and it should still match + bgIteration_dbEntryModified(de); + EXPECT_EQ(md, objectGetMetadata(de)); // the md location shouldn't have changed + EXPECT_EQ(md_after_setkey, *md); // the md value should still be the same + + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + } + + + // Simulate the expiration (active expiration) of a key. This is independent of command execution. + void simulateExpiration(int itemNum) { + ASSERT_NE(getItem(itemNum), nullptr); // Should be there before expire + + // NOTE: This seems weird, but Valkey propagates the delete before actually expiring the + // key. BgIterator expects this behavior and expects the key to exist when the + // DEL is received for propagation. + + // Send bgIteration the DEL + int db = getDbFromItemNum(itemNum); + robj *argv[2]; + argv[0] = createStringObjectFromCString("DEL"); + argv[1] = createStringObjectFromCString(keyStr(itemNum)); + serverCommand *cmd = lookupCommandByCString("DEL"); + bgIteration_handleCommandReplication(db, cmd, 2, argv); + bgIteration_keyDelete(db, static_cast(objectGetVal(argv[1]))); + decrRefCount(argv[0]); + decrRefCount(argv[1]); + + simpleDelItem(itemNum); // Simulate the actual del + + EXPECT_EQ(getItem(itemNum), nullptr); + } + + + // Simulates an expiration, but validates behavior for an item inuse by bgIteration. + void simulateExpirationOfInuse(int itemNum) { + // An inuse item will have a refcount > 1. BgIteration should have incremented the + // refcount while it is inuse. + dbEntry *de = getItem(itemNum); + ASSERT_NE(de, nullptr); // Should be there before expire + EXPECT_TRUE(bgIteration_isEntryInuse(de)); + EXPECT_EQ(de->refcount, 2u); + + simulateExpiration(itemNum); + + // At this point, the item is removed from the DB, but still exists, and the refcount + // has been reduced to 1. This allows a background thread to continue using the item. + EXPECT_EQ(de->refcount, 1u); + } + + + // Simulates an expiration, but the item is a future item which will be expedited. + void simulateExpirationWithExpedite(int itemNum) { + // An inuse item will have a refcount > 1. BgIteration should have incremented the + // refcount while it is inuse. + dbEntry *de = getItem(itemNum); + ASSERT_NE(de, nullptr); // Should be there before expire + EXPECT_FALSE(bgIteration_isEntryInuse(de)); // Not yet inuse + EXPECT_EQ(de->refcount, 1u); + + simulateExpiration(itemNum); + + // At this point, the item is removed from the DB, but still exists, and the refcount + // has been reduced to 1. This allows a background thread to continue using the item. + EXPECT_TRUE(bgIteration_isEntryInuse(de)); // It's inuse now + EXPECT_EQ(getItem(itemNum), nullptr); // but it's not in the DB anymore + EXPECT_EQ(de->refcount, 1u); + } + + + // Simulate execution of a SWAPDB command + void simulateSwapDB(int dbid0, int dbid1) { + char dbStr[2] = {0}; + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString("swapdb"); + c->db = server.db[0]; + + c->argc = 3; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + dbStr[0] = '0' + dbid0; + c->argv[1] = createStringObjectFromCString(dbStr); + dbStr[0] = '0' + dbid1; + c->argv[2] = createStringObjectFromCString(dbStr); + + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // SWAPDB should never block + + // The real SWAP does more than this, but this is enough for unit tests + serverDb *aux = server.db[dbid0]; + server.db[dbid0] = server.db[dbid1]; + server.db[dbid1] = aux; + + bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv); + + freeTestClient(c); + } + + + // Simulate execution of a FLUSHDB or FLUSHALL command + void simulateFlushDB(int db, int anInUseItem) { + client *c = static_cast(zcalloc(sizeof(client))); + + if (db == -1) { + c->cmd = lookupCommandByCString("flushall"); + c->db = server.db[0]; + } else { + c->cmd = lookupCommandByCString("flushdb"); + c->db = server.db[db]; + } + + c->argc = 1; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + + dbEntry *de_in_use = getItem(anInUseItem); + EXPECT_EQ(de_in_use->refcount, 2u); + + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // FLUSHDB should never block + + // The real FLUSH does more than this, but this is enough for unit tests + + // Now flush the items + for (int d = 0; d < server.dbnum; d++) { + if (db == -1 || db == d) { + kvstoreRelease(server.db[d]->keys); + server.db[d]->keys = NULL; + } + } + + EXPECT_EQ(de_in_use->refcount, 1u); + + // and replicate + + bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv); + + freeTestClient(c); + } +}; + + +TEST_F(BgIterationTest, dbIsOK) { + // Just run the setup/teardown code to make sure the DB is OK. +} + + +///////////////////////////////////////////////////// +// Simple Full-scan iterator tests +///////////////////////////////////////////////////// + +// A simple full scan that just checks basic flow. +TEST_F(BgIterationTest, createAndCleanup) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + EXPECT_EQ(bgIteratorFind("simple"), it); + EXPECT_STREQ(bgIteratorName(it), "simple"); + + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + + EXPECT_EQ(status.dbentries_queued, 0u); + EXPECT_EQ(status.dbentries_processed, 0u); + EXPECT_EQ(status.replication_queued, 0u); + EXPECT_EQ(status.replication_processed, 0u); + EXPECT_EQ(status.swapdb_queued, 0u); + EXPECT_EQ(status.swapdb_processed, 0u); + EXPECT_EQ(status.flushdb_queued, 0u); + EXPECT_EQ(status.flushdb_processed, 0u); + + EXPECT_EQ(status.queue_length, 0u); + EXPECT_GT(status.queue_length_target, 0u); + + EXPECT_LT(status.runtime_ms, 5u); + EXPECT_EQ(status.current_item_ms, 0u); + + expectAnythingCleanup(it); + + EXPECT_EQ(bgIteratorFind("simple"), nullptr); +} + + +// Close client before reading anything +TEST_F(BgIterationTest, testClientCloseBeforeRead) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + bgIteration_feedIterators(); + + bgIteratorClose(it); // Immediately close before reading + + bgIteration_feedIterators(); // Recognize the closed iterator + + // Check that the cleanup callback was executed properly + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Test that the full scan hits each item in the expected sequence. +TEST_F(BgIterationTest, orderedIteration) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, LAST_ITEM); + + // Quick status check. At this point, the final item hasn't been returned yet. + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentries_queued, static_cast(TOTAL_ITEMS)); + EXPECT_EQ(status.dbentries_processed, static_cast(TOTAL_ITEMS) - 1); + + expectReadComplete(it); // Returns the final item, and reads the completion item + + // Check that the cleanup callback was executed properly + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); +} + + +// Test that two simultaneous iterations work properly. +TEST_F(BgIterationTest, twoOrderedIterations) { + bgIterator *it1 = bgIteratorCreateFullScanIter("simple1", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + bgIterator *it2 = bgIteratorCreateFullScanIter("simple2", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + EXPECT_EQ(bgIteratorFind("simple1"), it1); + EXPECT_EQ(bgIteratorFind("simple2"), it2); + + int it1Count = 0; + int it2Count = 0; + while (it1Count < TOTAL_ITEMS || it2Count < TOTAL_ITEMS) { + // Randomly read from either iterator + if ((rand() % 2) == 0) { + if (it1Count < TOTAL_ITEMS) expectReadKey(it1, it1Count++); + } else { + if (it2Count < TOTAL_ITEMS) expectReadKey(it2, it2Count++); + } + } + + // Nothing left but to read the final completions + expectReadComplete(it1); + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); + expectReadComplete(it2); + EXPECT_EQ(cleanupCount, 2); + EXPECT_FALSE(cleanupTerminated); +} + + +///////////////////////////////////////////////////// +// MODIFY A FUTURE ITEM +// The next tests validate the basic pattern when a key, not yet iterated, is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). +///////////////////////////////////////////////////// + +// Modify a future item, without replication or consistency. +// Our expectation for this case is that the modification should proceed without blocking, the item +// shouldn't be expedited, and we will see the modified item once the iterator reaches it. +TEST_F(BgIterationTest, modFutureItem) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + c = getWriteClient(6, "xxx"); + + // We DONT expect the client to be blocked - not consistent + simulateUnblockedWriteWithModification(c); + + // Now continue reading, 1, 2, 3, 4, 5 + expectReadKeySequence(it, 1, 5); + + // Let's validate that key 6 shows the new value + expectReadKey(it, 6, "xxx"); + + // Continue... + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a future item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification SHOULD be blocked, as we have to save the +// the item in it's state before the modification. To reduce blocking time, the item should be +// moved to the head of the queue - there's no replication in this case, so out-of-order processing +// isn't a concern. +TEST_F(BgIterationTest, modFutureItem_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + c = getWriteClient(6, "xxx"); + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c); + + // On a consistent iterator, the event is expedited in-front of items already in queue! + // Read key 6 out of order. + expectReadKey(it, 6); + + // Now, when we read key 1, key 6 is released back to Valkey, and the client will be unblocked. + expectReadKeyWithUnblock(it, 1, 6); + simulateUnblockedWriteWithModification(c); // Now the write can proceed + + // Continue... + expectReadKeySequence(it, 2, 5); + // 6 has already been processed + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a future item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification should proceed without blocking, as the +// mode is inconsistent. We don't expect replication, as we haven't reached the item yet. We'll +// see the modified item later. +TEST_F(BgIterationTest, modFutureItem_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + c = getWriteClient(6, "xxx"); + + // We DONT expect the client to be blocked - not consistent + simulateUnblockedWriteWithModification(c); + + // NOTE: Since we haven't reached this item yet, and consistency is not required, there's no + // need to replicate this command. So everything should wrap up just fine - we will see + // the new value when we get to it. + + // Now continue reading, 1, 2, 3, 4, 5 + expectReadKeySequence(it, 1, 5); + + // Let's validate that key 6 shows the new value + expectReadKey(it, 6, "xxx"); + + // Continue... + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// MODIFY A CURRENT ITEM +// The next tests validate the basic pattern when a key, currently in use, is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). +///////////////////////////////////////////////////// + +// Modify a current item, without replication or consistency. +// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't +// be expedited (it's already in use). +TEST_F(BgIterationTest, modCurrentItem) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication) + + // Continue... + expectReadKeySequence(it, 4, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a current item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't +// be expedited (it's already in use). +TEST_F(BgIterationTest, modCurrentItem_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication) + + // Continue... + expectReadKeySequence(it, 4, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a current item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification SHOULD be blocked. After the key is processed, +// the write will proceed, and the replication will be sent. +TEST_F(BgIterationTest, modCurrentItem_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write will cause replication + + expectReadKey(it, 4); // 4 got put in queue when 3 was read + + expectReadReplication(it, c); + + // Continue... + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// MODIFY A PAST ITEM +// The next tests validate the basic pattern when a key, not yet iterated on, is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). +///////////////////////////////////////////////////// + +// Modify a past item, without replication or consistency. +// Our expectation for this case is that the modification should proceed without blocking. +// No replication is generated and keys are processed similar to no modification. +TEST_F(BgIterationTest, modPastItem) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Continue... + expectReadKeySequence(it, 2, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a past item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification should proceed without blocking. +// No replication is generated and keys are processed similar to no modification. +TEST_F(BgIterationTest, modPastItem_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Continue... + expectReadKeySequence(it, 2, LAST_ITEM); + expectReadComplete(it); +} + + +// Modify a past item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification should proceed without blocking. +// Replication will be sent. +TEST_F(BgIterationTest, modPastItem_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Key 2 was already in queue (same bucket as key 1). The replication will follow. + expectReadKey(it, 2); + expectReadReplication(it, c); + + // Continue... + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// TESTS FOR ITEM CLONING +///////////////////////////////////////////////////// + +// In a consistent iteration, verify that a simple string is properly cloned, and that a write can +// occur without blocking. Validate the cloned item and metadata. +TEST_F(BgIterationTest, modFutureItem_start_CloneExpeditedItem) { + // Initialize cloning configurations. + bgIteration_unitTestEnableCloning(50, 100); + + bgIteratorStatus status; + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + c = getWriteClient(6, "xxx"); + + // Quick status check. At this point, no clones exist yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 0u); + + // Since item 6 should be cloned, it will not block the client, allowing the write. + void *de6_md = cloneMetadata(getItem(6)); + simulateClonedWrite(it, c); // This wouldn't block, and queues the cloned value + simulateUnblockedWriteWithModification(c); // This modifies the real entry in the de (touching metadata) + + // At this point, one clone is in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // On a consistent iterator, the event is expedited in-front of items already in queue! + // Read key 6 (which is cloned) out of order. The value will still match the key. + expectReadClonedKey(it, 6, de6_md); // Also validates and frees the metadata + + // Quick status check. At this point, cloned items have not been marked as processed yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 0u); + + // Reading key 1 will release key 6, and the clone will finish processing. + expectReadKey(it, 1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Now, when we read key 2 should not have an impact on number of processed clones. + expectReadKey(it, 2); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Continue... + expectReadKeySequence(it, 3, 5); + // 6 has already been processed + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +// Check that cloning for simple strings is respecting the size limits and pool size. On a +// consistent iteration, we expect to block or clone on all future keys. We validate that we can +// clone if the item is small enough and the cloning pool has more space left. +TEST_F(BgIterationTest, modFutureItem_start_LargeItemOrClonePoolFull) { + // Initialize cloning configurations to test the clone pool functionality first. + bgIteration_unitTestEnableCloning(50, 50); + + bgIteratorStatus status; + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c6 = getWriteClient(6, "xxx"); + client *c7 = getWriteClient(7, "xxx"); + client *c8 = getWriteClient(8, "xxx"); + + // Quick status check. At this point, no clones exist yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 0u); + + // Since item 6 should be cloned, it will not block the client, allowing the write. + void *de6_md = cloneMetadata(getItem(6)); + simulateClonedWrite(it, c6); + simulateUnblockedWriteWithModification(c6); + + // At this point, one clone is in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // Now that cloning pool is full, item 7 will not be cloned and the client will be blocked. + simulateBlockedWrite(c7); + ASSERT_TRUE(bgIteration_isEntryInuse(getItem(7))); + + // There is still only one cloned item in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // Now change cloning configurations to test that large items will not be cloned. We adjust + // the clone pool size to allow two items, but set the maximum item size to be smaller than + // the size of item 8. The clone pool size must be larger than the total size of the existing + // clones plus the maximum item clone size. + bgIteration_unitTestEnableCloning(1, 101); + + // This write will pass the clone pool check but fail the item size check, blocking the client. + simulateBlockedWrite(c8); + ASSERT_TRUE(bgIteration_isEntryInuse(getItem(8))); + + // On a consistent iterator, the expedited item in-front of items already in queue! + // Read key 6 out of order. + expectReadClonedKey(it, 6, de6_md); + + // Now, when we expect to read key 7, which was expedited, key 6 will be released back to Valkey + // and the clone will be deallocated here. + expectReadKey(it, 7); + + // Now, when we read key 8, which was expedited, key 7 is released back to Valkey, and the client + // will be unblocked. + // (actually, unblock is called after every key [just in case] - but functionally we only care + // about this one) + expectReadKeyWithUnblock(it, 8, 7); + simulateUnblockedWriteWithModification(c7); + + // Now, when we read key 1, key 8 is released back to Valkey, and the client will be unblocked. + expectReadKeyWithUnblock(it, 1, 8); + simulateUnblockedWriteWithModification(c8); + + // Since only one item was cloned, there should be one clone processed + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Continue... + expectReadKeySequence(it, 2, 5); + // 6, 7, and 8 have already been processed + expectReadKeySequence(it, 9, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c6); + freeTestClient(c7); + freeTestClient(c8); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO MODIFICATION OF TWO ITEMS +// When 2 keys are modified, we need to ensure that both keys have been sent before we can send +// replication. This means that if replication is present, we may have to block/expedite for +// future keys, even in the inconsistent scenario. +///////////////////////////////////////////////////// + +// Replication enabled, but NOT consistent. In this case, if ANY of the keys have been iterated, +// ALL of the keys must be replicated so that the command can be processed properly on the replica. +TEST_F(BgIterationTest, modPastFutureItem_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Even though key 12 is for READ in this command, it must be expedited so that it exists before + // the associated replication is sent. + c = getSetGetClient(8, "xxx", 12); + simulateBlockedWrite(c); + + // Key 12 will be expedited, but not in front of existing items in queue (can only do that for + // consistent iterators) + + expectReadKey(it, 10); + expectReadKey(it, 12); // expedited + expectReadKeyWithUnblock(it, 11, 12); // 13 is now in queue + + simulateUnblockedWriteWithModification(c); + + // Continue... + expectReadKey(it, 13); + expectReadReplication(it, c); + + expectReadKeySequence(it, 14, LAST_ITEM); + expectReadComplete(it); +} + + +// Replication NOT enabled. A read-only key doesn't need to be expedited, even if other keys have +// been processed already. (This should work identically for both consistent/non-consistent. +TEST_F(BgIterationTest, modPastFutureItem_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter1", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Since there's no replication, we don't have to worry about expediting 12. The write will + // proceed without blocking. + c = getSetGetClient(8, "xxx", 12); + simulateUnblockedWriteWithModification(c); + + // Key 12 will not be expedited. Remaining keys should be received in normal order. + expectReadKeySequence(it, 10, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, modPastFutureItem) { + bgIterator *it = bgIteratorCreateFullScanIter("iter2", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Since there's no replication, we don't have to worry about expediting 12. The write will + // proceed without blocking. + c = getSetGetClient(8, "xxx", 12); + simulateUnblockedWriteWithModification(c); + + // Key 9 will not be expedited. Remaining keys should be received in normal order. + expectReadKeySequence(it, 10, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO MISSING ITEMS +// Missing items are tricky. A missing item might be logically located in the past or future, in +// relation to the current iteration position. The command may (or may not) create the "missing" +// key. Some general considerations: +// * In a consistent iteration, a missing key didn't exist at the time of consistency, or it was +// already processed (saved) at the time of the deletion. If the missing key gets created, we +// must be sure to skip it if we later iterate over it. +// * In a non-consistent iteration with replication: +// * If the key location is already passed, the replication is sent, allowing the key to be +// created (or not) based on the replication. +// * If the key location is in the future, we can allow the command to proceed, without +// replication. If the key is created, we will process it when the iterator gets to it. +// +// We expect: +// no-repl, no-consist: past items are ignored - future items are processed when iterated +// no-repl, yes-consist: past items are ignored - future items are ignored +// yes-repl, no-consist: past item skipped, but replicated - future items are created by replication and skipped later +// yes-repl, yes-consist: past item skipped, but replicated - future items are processed when iterated +///////////////////////////////////////////////////// + +// no-repl, no-consist: creation of PAST item has no impact +TEST_F(BgIterationTest, missingPastItem) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); +} + + +// no-repl, yes-consist: creation of PAST item has no impact +TEST_F(BgIterationTest, missingPastItem_start) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); +} + + +// yes-repl, no-consist: creation of a PAST item will be replicated +TEST_F(BgIterationTest, missingPastItem_eventual) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // replication will be added after item 4 (3,4 in same bucket) + + expectReadKey(it, 4); + + expectReadReplication(it, c); + + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +// no-repl, no-consist: creation of FUTURE item is seen when reached by the iteration. +TEST_F(BgIterationTest, missingFutureItem) { + // Using DB1 so we have lots of buckets + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + const char * newValue = "xxx"; + c = getWriteClient(14, newValue); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 13); + + // We expect to see item 14. + // Note that for an inconsistent DB view, it is logically undefined if this value is seen (or not). + // But as implemented, we should see it and the test is helpful to understand if/when the + // functionality changes. + expectReadKey(it, 14, newValue); + + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +// no-repl, yes-consist: creation of FUTURE item is ignored by consistent iteration. +TEST_F(BgIterationTest, missingFutureItem_start) { + // Using DB1 so we have lots of buckets + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + c = getWriteClient(14, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 13); + // Key 14 is missing - it didn't exist at start of consistent iteration + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +// yes-repl, no-consist: creation of FUTURE item is handled by the replication, and then the key is +// later skipped (treated like an early iteration case). +TEST_F(BgIterationTest, missingFutureItem_eventual) { + // Using DB1 so we have lots of buckets + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Items 1 & 2 are in queue (same bucket) + + c = getWriteClient(14, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 2); + + expectReadReplication(it, c); // Here's the replication creating item 14 + + expectReadKeySequence(it, 3, 13); + // We expect item 14 to be skipped, because it was created by the earlier replication + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO EXPIRATION +// Expiration can be tricky. When pre-evaluating a command with bgIteration_blockClientIfRequired, +// a key might exist, but be ready for expiration. Then, as the command executes, the key expires +// and gets deleted before the write operation. Consider SET K V. +// In the unexpired case, this appears to bgIteration as a single SET command (which replaces the value). +// In the expired case, bgIteration will receive a DEL followed by a SET. +// +// Another case is a READ command. A read command won't cause the client to be blocked. However, +// if the key is expired, this will cause a DEL. For consistent processing, this key might need to +// be expedited so that it can be processed before it gets deleted. In this case, the key is +// unlinked from the main Valkey dictionary, but the actual deletion is deferred. +///////////////////////////////////////////////////// + +TEST_F(BgIterationTest, expireKeys) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we no longer care + simulateExpirationOfInuse(2); // Current - it's inuse + simulateExpiration(5); // Future - we don't care (non-consistent) + + expectReadKeySequence(it, 2, 4); + // key 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, expireKeys_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we expect replication + simulateExpirationOfInuse(2); // Current - it's inuse, but we expect replication + simulateExpiration(5); // Future - we don't care (non-consistent) + + expectReadKey(it, 2); // this was already queued + expectReadReplicationDel(it, 0); // Past item should replicate + expectReadReplicationDel(it, 2); // Current item should replicate + // Item 5 is a future item and doesn't need to replicate + + expectReadKeySequence(it, 3, 4); + // Item 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, expireKeys_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we no longer care + simulateExpirationOfInuse(2); // Current - we must defer + simulateExpirationWithExpedite(5); // Future - will become inuse and expedited for consistency + + expectReadKey(it, 5); // Expedited to front + + expectReadKeySequence(it, 2, 4); + // Item 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +// Special case during a non-consistent iteration with replication and expiration. +// 1. A future key is created (and processed by its replication) - considered early iterated +// 2. Later the key is expired and deleted during command processing (causes DEL to be sent) - no longer early iterated +// 3. The key is recreated as part of the command processing (and this command was replicated) - again early iterated +// 4. Finally, when we iterate to the key, it shouldn't be sent, because it was replicated in step 3. +TEST_F(BgIterationTest, expireKeys_eventual_FutureKeyCreatedThenExpiredDuringSet) { + simpleDelItem(8); // Start with a missing future item + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Get the iterator started + + c = getWriteClient(8, "xxx"); + simulateUnblockedWriteWithModification(c); // Not blocked because this is a future key (but we expect repl) + + // Now do it again, but break out the steps so that we can simulate an expiration + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // Shouldn't be blocked because this is a future key + + // Now, as the SET command tries to execute, simulate that the key is expired. Expiration + // processing sends the replication FIRST! + robj *argv[2]; + argv[0] = createStringObjectFromCString("DEL"); + argv[1] = c->argv[1]; + serverCommand *cmd = lookupCommandByCString("DEL"); + bgIteration_handleCommandReplication(getDbFromItemNum(8), cmd, 2, argv); + decrRefCount(argv[0]); + + // Now the call to keyDelete happens (after the replication). + bgIteration_keyDelete(getDbFromItemNum(8), static_cast(objectGetVal(c->argv[1]))); + simpleDelItem(8); // Simulate the actual del + + // Now the SET will run, re-creating the item (which is still a future item) + // We need to duplicate the value because setKey() can reallocate it. + robj *value = dupStringObject(c->argv[2]); + setKey(c, c->db, c->argv[1], &(value), SETKEY_ADD_OR_UPDATE); + + // Finally, replication will be sent because this is creating a new key + bgIteration_handleCommandReplication(getDbFromItemNum(8), c->cmd, c->argc, c->argv); + + // Test that everything comes as expected + expectReadKeySequence(it, 1, 2); // All one bucket - queued after key 0 read + + expectReadReplication(it, c); // Repl from the first SET command + expectReadReplicationDel(it, 8); // This is the expected replication of the DEL from expire + expectReadReplication(it, c); // Repl from the second SET command (recreating deleted key) + + expectReadKeySequence(it, 3, 7); // continue with normal iteration + // KEY 8 SHOULD BE OMITTED - This was already replicated + expectReadKeySequence(it, 9, LAST_ITEM); + + expectReadComplete(it); +} + + +///////////////////////////////////////////////////// +// THE REMAINING TESTS ARE GENERAL / UNCATEGORIZED +///////////////////////////////////////////////////// + +// Iteration can be terminated from the main thread or from the child client. +// This tests termination driven from the main thread. +TEST_F(BgIterationTest, earlyTerminationFromMain) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + + // At this point, keys 1 & 2 are in queue. A termination should release those keys. + bool blocked1 = true; + bool blocked2 = true; + // We expect no general unblocks, we account for each specific unblock below. + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); + // We should expect to see unblock called for items 1 & 2, as they are released from the queue. + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) + .WillOnce(Assign(&blocked1, false)); + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) + .WillOnce(Assign(&blocked2, false)); + bgIteratorTerminate(it); // queues the items for release + EXPECT_TRUE(bgIteratorIsTerminating(it)); + bgIteration_feedIterators(); // actually performs the release + EXPECT_FALSE(blocked1); + EXPECT_FALSE(blocked2); + + bool blocked0 = true; + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) + .WillOnce(Assign(&blocked0, false)); + bgIteratorItem *item = bgIteratorRead(it); + EXPECT_FALSE(blocked0); + EXPECT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + EXPECT_EQ(cleanupCount, 0); + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Iteration can be terminated from the main thread or from the child client. +// This tests termination driven from the child client (the background thread). +TEST_F(BgIterationTest, earlyTerminationFromChild) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + // At this point, keys 1 & 2 are in queue. A termination should release those keys. + bgIteratorClose(it); // background thread initiates the termination + EXPECT_TRUE(bgIteratorIsTerminating(it)); + + bool blocked0 = true; + bool blocked1 = true; + bool blocked2 = true; + // Expecting no extra unblocks + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); + // We expect item 0 (the in progress item) to be released + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) + .WillOnce(Assign(&blocked0, false)); + // We expect items 1-4 (the queued items) to be released + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) + .WillOnce(Assign(&blocked1, false)); + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) + .WillOnce(Assign(&blocked2, false)); + bgIteration_feedIterators(); + EXPECT_FALSE(blocked0); + EXPECT_FALSE(blocked1); + EXPECT_FALSE(blocked2); + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Edge case. Executing a command (like SUNIONSTORE) which REPLACES the first key and reads the +// second key. In this case, bgIteration will get notified of the key deletion during execution of +// SETUNIONSTORE. Given that both keys are in the future (not iterated yet), we'll allow the +// command to execute, unblocked. We won't replicate as we'll pick up the key when we get to it. +TEST_F(BgIterationTest, writeWith2Keys_eventual_keyDeletedDuringSetReplace) { + // Using DB1 so we have lots of buckets + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 8); // 9 is in queue + + // Write command that has 2 keys. 1 existing key that we write to and 1 dependant future key. + c = getWrite2KeysClient("sunionstore", 12, 13); + + simulateUnblockedWrite(c); + + // Now the call to keyDelete happens + sds sdskey = sdsnew(keyStr(12)); + bgIteration_keyDelete(getDbFromItemNum(12), sdskey); + sdsfree(sdskey); + simpleDelItem(12); // So simulate the actual del + + // Now the write will run, re-creating the item (which is still a future item) + const char * const newValueStr = "new value"; + robj *newValueRobj = createStringObjectFromCString(newValueStr); + setKey(c, c->db, c->argv[1], &newValueRobj, SETKEY_ADD_OR_UPDATE); + + // Finally, we are letting bgIteration know that the write command was executed + bgIteration_handleCommandReplication(getDbFromItemNum(12), c->cmd, c->argc, c->argv); + + // Since the write command was not replicated, we expect all the keys to be read in the normal + // order from the dictionary. + expectReadKeySequence(it, 9, 11); + expectReadKey(it, 12, newValueStr); + expectReadKeySequence(it, 13, LAST_ITEM); + + expectReadComplete(it); +} + + +// Edge case. When we have a new key which is created by a command, AND replication is enabled, we +// expect that we will replicate the command rather than serializing the key/value later. As an +// example, consider SUNIONSTORE A B. We want to create A by replicating the command. We don't +// want to have to process A as a key later on. But in this case, we can't run the command until +// B has been sent. We expect the command to be blocked while we send B. +TEST_F(BgIterationTest, writeWith2Keys_eventual_setNewKey_DependantFuture) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 8); // 9 is in queue + + // Write command that has 2 keys. 1 new key and 1 dependant future key. + c = getWrite2KeysClient("sunionstore", 12, 13); + + // We are simulating a new key in the dict. This command should block on the dependant key. + // This adds key 13 in the queue since the command depends on it. + simulateBlockedWrite(c); + + // Key 9 was already in the queue + expectReadKey(it, 9); + + // Key 13 is processed out of order since the write depends on it + expectReadKey(it, 13); + + // Reading key 10 will unblock key 13, allowing us to write. + expectReadKey(it, 10); + + // Now that key 13 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Key 11 was queued when we read key 10 + expectReadKey(it, 11); + + // The replication of the write command was enqueued after key 11 + expectReadReplication(it, c); + + // We shouldn't see key 12 - as that was processed via replication. + // We shouldn't see key 13 - as that was expedited earlier + + // Now resuming processing of dict entries + expectReadKeySequence(it, 14, LAST_ITEM); + + expectReadComplete(it); +} + + +// A new key is being created, but is dependent on another key which has already been processed. +// In this case, the command shouldn't be blocked. +TEST_F(BgIterationTest, writeWith2Keys_eventual_setNewKey_DependantPast) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // 10 is in queue, done with 8 + + // Write command that has 2 keys. 1 new key and 1 dependant past key. + c = getWrite2KeysClient("sunionstore", 12, 8); + + // We are simulating a new key in the dict. + // This command should not block since the dependant key has already been processed. + simulateUnblockedWriteWithModification(c); + + // Key 10 was put in the queue before the write + expectReadKey(it, 10); + + expectReadReplication(it, c); + + expectReadKey(it, 11); + + // Key 12 should be missing - it was processed by replication + + expectReadKeySequence(it, 13, LAST_ITEM); + expectReadComplete(it); +} + + +// A new key is being created, and has dependencies on 2 other keys - one already processed, one not. +// In this case, the command should be blocked so that the future key can be sent first. +TEST_F(BgIterationTest, writeWith3Keys_eventual_setNewKey_1DependantPast1DependantFuture) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // 8 has been returned, 9 is active, 10 is in queue + + // Write command that has 1 new key and 2 dependencies (past/future) + c = getWrite3KeysClient("sunionstore", 12, 8, 13); + + // The write should be blocked, so that item 13 can be processed. + simulateBlockedWrite(c); + + expectReadKey(it, 10); // 10 was already in queue + expectReadKey(it, 13); // 13 was expedited since the write depends on it + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(13)))).Times(1); + expectReadKey(it, 11); // Releases 13 so the command can execute + + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 14); // was queued when reading 11 (12 is missing, 13 was expedited) + + expectReadReplication(it, c); + + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); +} + + +// Test an edge case with the same (future) key being repeated in the command, like: +// SUNIONSTORE A B B +// In this test, A is a previously handled key, and B is a future key. We expect the future key B to +// be expedited (once). +TEST_F(BgIterationTest, writeWith3Keys_eventual_repeatedKey_1DependantPast1RepeatedFuture) { + // Using DB1 so we have lots of buckets + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // We're done with 8, and 10 is in queue + + // Write command that has 3 keys. 1 past key and 1 repeated key in the future. + c = getWrite3KeysClient("sunionstore", 8, 12, 12); + + // This command should block because 12 needs to be expedited. + simulateBlockedWrite(c); + + expectReadKey(it, 10); // was already in queue + expectReadKey(it, 12); // expedited + expectReadKey(it, 11); // releases 12 (unblocking the command) + + // Now that key 12 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 13); // queued when we read 11 + + expectReadReplication(it, c); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 14, LAST_ITEM); + expectReadComplete(it); +} + + +/* Tests the replication of a write command that creates a new key and depends on a + * future key which is duplicated in the command. */ +TEST_F(BgIterationTest, writeWith3Keys_eventual_repeatedKey_1newKey1RepeatedFuture) { + simpleDelItem(3); // Deleting key 3 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // At this point, keys 1 & 2 are in queue. + + // Write command that has 3 keys. 1 new key and 1 repeated key in the future. + c = getWrite3KeysClient("sunionstore", 3, 5, 5); + + // This command should block on key 5. + // This adds key 5 in the queue because: + // - the command depends on key 5 which hasn't been processed yet + // - the command creates a new key (key 3). + simulateBlockedWrite(c); + + expectReadKeySequence(it, 1, 2); // These were already in queue + + // Key 5 is processed out of order since the write depends on it + expectReadKey(it, 5); + + // Keys 4 is the next in queue, and releases the expedited key 5 + expectReadKey(it, 4); + + // Now that key 4 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Key 6 & 7 are next, having been queued after reading key 4. + expectReadKeySequence(it, 6, 7); + + // The replication of the write command was enqueued after 5 was released (unblocking the command) + expectReadReplication(it, c); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 8, LAST_ITEM); + expectReadComplete(it); +} + + +/* A command modifying an in-progress key, but dependent on a future (repeated) key. */ +TEST_F(BgIterationTest, writeWith3Keys_start_repeatedKey_1DependantPast1RepeatedFuture) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // At this point, keys 1 & 2 are in queue. + + // Write command that has 3 keys. 0 is in progress. 4 is still future. + // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a + // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). + c = getWriteMultiKeysClient("blpop", 0, {4, 4, 0}); + + // This command should block on 2 keys (0 and 4), since: + // - key 0 is in use by the iterator (still in the queue since it has not been processed by the consumer yet) + // - key 4 is in the future + // This adds key 4 in the queue since the command depends on it and it hasn't been processed yet. + simulateBlockedWrite(c, 2); + + // Key 4 is processed out of order since the write depends on it. + // Key 4 is processed before key 1 even though key 1 was already in the queue + // because key 4 was enqueued as a priority item with a no-replication iterator. + // Reading key 4 will release key 0 - releasing that lock on the command + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))).Times(1); + expectReadKey(it, 4); // This unblocks key 0 + + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(4)))).Times(1); + expectReadKey(it, 1); // this was already in queue (releases key 4) + + // Now that keys 4 and 0 were processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 2, 3); + + // 4 is skipped because it was already expedited + + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +/* Test that creates a new key, repeating the future key in the command. */ +TEST_F(BgIterationTest, writeWith3Keys_repeatedKey_1repeatedNewKey) { + simpleDelItem(6); // Deleting key 6 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + // Getting started + expectReadKeySequence(it, 0, 3); + // Now, 0,1,2 are in the past. 3 is being processed, and 4 is in queue. + + // Write command that has 3 keys. 1 new repeated key and 1 key in the past. + // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a + // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). + c = getWriteMultiKeysClient("blpop", 6, {0, 6, 0}); + + // The write command is not blocked since key 0 & 6 are not in use, and no consistency requirements + simulateUnblockedWriteWithModification(c); + + // Keys 2, 3 are next in the queue (it was put in the queue at the same time as key 1). + expectReadKeySequence(it, 4, 5); + + // There are no consistency requirements - so the new key should just be iterated. + // Key 6 is now in the dict with the value of key 0. + expectReadKey(it, 6, keyStr(0)); + + // Processing the rest of the dict entries. + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +/* In this test, the COPY command is copying from one DB to another. We will create the + * same key in both DBs. We make sure that the proper key is created via replication, and + * the proper key is created by iteration. */ +TEST_F(BgIterationTest, copyHandlesProperDb_eventual) { + + // NOTE: Adding E0 to dict 1. Now there is a E0 in both dict 0 and dict 1. + addKeyToDb(1, "H0", "H0"); + + // The test: + // We will simulate (with DB0 selected): COPY B1 H0 DB 1 REPLACE + // This will overwrite DB1:H0 that was created above. + // Since DB0:B1 is already in queue, we need to expedite the target (DB1:H0) as well + // After DB1:H0 is "overwritten", it should be marked early iterate. + // We expect DB0:H0 to NOT be marked early iterate, and should get processed normally. + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // B0 + // At this point, keys 1(B1) & 2(B2) are in queue. + + // COPY B1 H0 DB 1 REPLACE + c = static_cast(zcalloc(sizeof(client))); + c->cmd = lookupCommandByCString("copy"); + c->db = server.db[0]; + c->argc = 6; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromCString(c->cmd->fullname); + c->argv[1] = createStringObjectFromCString("B1"); + c->argv[2] = createStringObjectFromCString("H0"); + c->argv[3] = createStringObjectFromCString("DB"); + c->argv[4] = createStringObjectFromCString("1"); + c->argv[5] = createStringObjectFromCString("REPLACE"); + + // This should block on 2 keys. DB0:B1 is in queue. DB1:H0 needs to be expedited. + simulateBlockedWrite(c, 2); + + // These 2 keys were already in queue + expectReadKey(it, 1); // DB0:B1 + expectReadKey(it, 2); // DB0:B2 + + // And now we expect to see the expedited DB1:H0 + expectReadDbKeyValue(it, 1, "H0", "H0"); + + expectReadKey(it, 3); // releases DB1:E0 + + // Now key 4 is still in the queue + + simulateUnblockedWrite(c); // We shouldn't be blocked this time + + // Now, we'll simulate the actual activity of the COPY. DB1:H0 will be deleted in order to + // be overwritten. + sds sdskey = sdsnew("H0"); + bgIteration_keyDelete(1, sdskey); // bgIteration would be signaled about the deletion + sdsfree(sdskey); + // At this point the key would actually be deleted and recreated by COPY (no need to actually do this) + + // And finally the replication (this should queue replication) + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + + // Now let's read everything... + expectReadKey(it, 4); // (this was previously in queue) + expectReadReplication(it, c); // This is the new replication (creating DB1:H0) + + // The rest should be normal. We shouldn't see DB1:E0 as it was recreated by replication + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +// Check that termination with replication in queue works OK. +TEST_F(BgIterationTest, terminateWithReplication) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); // makes sure we are done with key 0 (don't want to block) + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Should replicate + + bgIteratorTerminate(it); + + bgIteratorItem *item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// SWAPDB tests - Get ready for the mind-bend... + +/* In the non-consistent iterator (without replication), items are identified with the DBID at + * the time they are placed into the queue. The SWAPDB event signals the change to the + * iterating process - and this is properly sequenced with the DB info for each item. */ +TEST_F(BgIterationTest, swapDB) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + simulateSwapDB(0, 1); // The swap event will be queued after item 2 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 0u); + + expectReadKey(it, 1); // These were already in queue, + expectReadKey(it, 2); // ... and the iteration client hasn't seen the swap yet + + expectReadSwapDB(it, 0, 1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 0u); // still processing it... + + // Since we've seen the swap event, items now have the new DBID + + expectReadDbKeyValue(it, 1, keyStr(3), keyStr(3)); // item 3 should show in DB1 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 1u); // done processing the swapdb + + // Keys 4 is in the queue - let's swap back! + simulateSwapDB(1, 0); // The swap event will be queued after item 4 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); // 2nd one queued + EXPECT_EQ(status.swapdb_processed, 1u); + + expectReadDbKeyValue(it, 1, keyStr(4), keyStr(4)); // item 4 should still show in DB1 + + expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); + EXPECT_EQ(status.swapdb_processed, 1u); // still processing it... + + // Since we've seen the second swap, items should now show with their original DB + + expectReadKey(it, 5); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); + EXPECT_EQ(status.swapdb_processed, 2u); // done processing all swaps + + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +/* In the consistent iterator (without replication) all items are presented to the iterating + * process using the DBID at the time of the iterator creation. No changes are evident. + * Swap events are not presented to the iteration client. */ +TEST_F(BgIterationTest, swapDB_start) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + simulateSwapDB(0, 1); // The swap occurs, but the iterator sees no change + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + + // Heck, let's go crazy with those swaps... + for (int itemNum = 4; itemNum <= LAST_ITEM; itemNum++) { + simulateSwapDB(0, 1); + expectReadKey(it, itemNum); + } + + expectReadComplete(it); +} + + +/* In the non-consistent iterator WITH replication, items are identified with the DBID at the + * time they are placed into the queue. The SWAPDB event signals the change to the iterating + * process - and this is properly sequenced with the DB info for each item. */ +TEST_F(BgIterationTest, swapDB_eventual) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + simulateSwapDB(0, 1); // The swap event will be queued after item 2 + + expectReadKey(it, 1); // These were already in queue, + expectReadKey(it, 2); // ... and the iteration client hasn't seen the swap yet + + expectReadSwapDB(it, 0, 1); // We should see a SWAPDB event + bgIteratorItem *item = bgIteratorRead(it); // followed by the associated replication + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + bgIteration_feedIterators(); + + // Since we've seen the swap event, items now have the new DBID + expectReadDbKeyValue(it, 1, keyStr(3), keyStr(3)); // item 3 is now in DB1 + + // Key 4 is in the queue - let's swap back! + simulateSwapDB(1, 0); // The swap event will be queued after item 4 + + expectReadDbKeyValue(it, 1, keyStr(4), keyStr(4)); // Still appears as DB1 + + expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap + item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + bgIteration_feedIterators(); + + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + +// There is no test for swapDB_YesReplication_YesConsistent because this configuration is not +// permitted with multiple DBs (not permitted with swaps). + + +// FLUSHDB & FLUSHALL Tests + +TEST_F(BgIterationTest, flushDB_flushAll) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // key 1 is active in the iterator - this key won't be deallocated because of the refcount. + // keys 2 is in queue - but will be returned to Valkey before the flush. It is yanked + // back by Valkey and will not be seen by iterator. + simulateFlushDB(-1, 1); + + bgIteratorItem *item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + +TEST_F(BgIterationTest, flushDB_flushOne) { + bgIterator *it1 = bgIteratorCreateFullScanIter("iter1", + BGITERATOR_CONSISTENCY_NONE, NULL, iteratorCleanupFn, PRIVDATA); + bgIterator *it2 = bgIteratorCreateFullScanIter("iter2", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + // The test flushes DB0. This is half the data. Since <= half, a non-consistent iterator is + // allowed to proceed. But the consistent iterator will be terminated. + + expectReadKey(it1, 0); + expectReadKey(it2, 0); + expectReadKey(it1, 1); + expectReadKey(it2, 1); + + // key 1 is active in the iterator - this key won't be deallocated because of the refcount. + // keys 2 is in queue - but will be returned to Valkey before the flush. These are yanked + // back by Valkey and will not be seen by iterator. + simulateFlushDB(0, 1); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 0u); + + // Testing the non-consistent one continues... + // Everything already on the iterator queue should be preserved (deleted from the DB). + // Keys 2 is already queued (and preserved). + expectReadKey(it1, 2); + + // Read the flushdb item on iterator 1. + bgIteratorItem *item = bgIteratorRead(it1); + ASSERT_EQ(item->type, BGITERATOR_ITEM_FLUSHDB); + ASSERT_EQ(item->dbid, 0); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 0u); // still processing it + + // And iterator 1 keeps processing with the 2nd DB + expectReadKey(it1, ITEMS_PER_DB); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 1u); // done with all flushdb's + + expectReadKeySequence(it1, ITEMS_PER_DB + 1, LAST_ITEM); + expectReadComplete(it1); + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); + + // But the consistent iterator should be terminated + item = bgIteratorRead(it2); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + bgIteratorClose(it2); // background thread completes the termination + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 2); + EXPECT_TRUE(cleanupTerminated); +} + + +/* A multi with one future and one past key must expedite and replicate. */ +TEST_F(BgIterationTest, multiTwoKeysFirstFuture) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, + NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Causes keys 1 & 2 to be queued (same bucket) + expectReadKey(it, 1); // Causes key 0 to be released + + // Now, B0(0) is in the past. H0(5) is in the future. R0(11) [in DB1] is also future. + + /* For a non-consistent iteration, with replication... + * Normally, H0 (future) wouldn't need to expedite - we'd just modify it in place (without + * replication and iterate on it later. But, in this case, since it's wrapped in a multi, with + * B0 (past) - we need to expedite H0 so that the multi can all be handled in the same way. + * Key R0(11) [DB1] just makes thing a little trickier. */ + c = getMultiClient("SET B0 xxx; SET H0 xxx; SELECT 1; SET R0 xxx"); + + // The EXEC should block on 2 keys, because H0(5) & R0(11) should be expedited + simulateBlockedWrite(c, 2); + + expectReadKey(it, 2); // (was already in queue) + + // Note - it would be logically OK if these 2 were reversed, but this is how the current algorithm works. + expectReadKey(it, 5); // Key 5 (H0) was expedited + expectReadKey(it, 11); // Key 11 (R0) was expedited + + // We don't need to actually simulate the multi. Just checking that the keys were expedited. + + // and clean up the rest... + expectReadKeySequence(it, 3, 4); + // Key 5 was already read above (expedited) + expectReadKeySequence(it, 6, 10); + // Key 11 was already read above (expedited) + expectReadKeySequence(it, 12, LAST_ITEM); + expectReadComplete(it); +} + +// Multi blocking on future items. Consistent. +TEST_F(BgIterationTest, multiBlocksOnFutureKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + // Since there's no replication, an expedited key will be moved to the front of the queue. + // Let's fake a modification to key 6 (H1) + // Dummy up a MULTI... + c = getMultiClient("SET H1 xxx"); + + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c); + + // H1 (key 6) will be expedited to the front of the queue (because no replication) + expectReadKey(it, 6); + + // Now that we've read key 6, key 0 (B0) is passed and should not block + freeTestClient(c); + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + + // and clean up the rest... + expectReadKeySequence(it, 1, 5); + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); +} + + +// Scenario. We have a multi that doesn't need to be replicated because all of the keys exist +// but are all future keys. Note that missing keys are considered already-iterated, so all +// must exist for this test. Then: +// - we delete a key +// - we re-create the deleted (future) key - normally this would be replicated +// - we access another (future) key - we don't expect to get blocked! +TEST_F(BgIterationTest, multiNotReplicatedButDelRecreateAccess) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + // Keys 1 & 2 are in queue + + c = getMultiClient("DEL H1; SET H1 xxx; SET H2 yyy"); + // Now let's process the multi. Since H1 & H2 are both future (existing) items, we shouldn't + // block or replicate. + simulateUnblockedWrite(c); // the EXEC + + // Simulate the DEL H1 + server.in_exec = 1; // Simulate actual execution of the MULTI/EXEC + advanceMultiClientToCommand(c, 0); // DEL H1 + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + simpleDelItem(6); // H1 + sds delKey = sdsnew(keyStr(6)); + bgIteration_keyDelete(0, delKey); + sdsfree(delKey); + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); // shouldn't replicate + + // Simulate SET H1 - the key doesn't exist, and would normally replicate and mark early iterate, + // but this is in a transaction, and we are not replicating this transaction. + advanceMultiClientToCommand(c, 1); // SET H1 xxx + simulateUnblockedWriteWithModification(c); + + // Now write to another existing future key - this should work if we weren't confused by the DEL + advanceMultiClientToCommand(c, 2); // SET H2 yyy + simulateUnblockedWriteWithModification(c); + server.in_exec = 0; + + // Now we can continue iterating, and we should pick up keys 1... (and no replication!) + expectReadKeySequence(it, 1, 5); + expectReadKey(it, 6, "xxx"); + expectReadKey(it, 7, "yyy"); + expectReadKeySequence(it, 8, LAST_ITEM); + expectReadComplete(it); +} + + +// For this test, B0 is added into DB1 - so it exists in both DB 0 and 1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track SELECT properly. +TEST_F(BgIterationTest, multiHandlesSelectProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // These cases should NOT block... (they access B0 in DB0) + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SELECT 0; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx; SELECT 0"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateBlockedWrite(c); + + expectAnythingCleanup(it); +} + +// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track select properly - WHEN WE HAVE NO +// PERMISSION TO EXECUTE SELECT! +TEST_F(BgIterationTest, multiHandlesSelectNoPermissionProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // No permission for any commands (specifically select/swapdb) + EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_,_,_,_,_,_)) + .Times(AtLeast(1)).WillRepeatedly(Return(ACL_DENIED_CMD)); + + // These cases should NOT block... (they access B0 in DB0) + // The SELECTs below are inconsequential - with/without select, same result. + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SELECT 0; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block IF SELECT IS WORKING... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; // already starting on DB1 + simulateBlockedWrite(c); // will block, no select + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + c = getMultiClient("SELECT 1; SET B0 xxx; SELECT 0"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + c = getMultiClient("SELECT 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + + expectAnythingCleanup(it); +} + +// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track SWAPDB properly. +TEST_F(BgIterationTest, multiHandlesSwapdbProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // These cases should NOT block... (they access B0 in DB0) + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 0 1; SELECT 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SET B0 xxx; SWAPDB 0 1"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SELECT 0; SET B0 xxx; SWAPDB 0 1"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateBlockedWrite(c); + + expectAnythingCleanup(it); +} + +// For this test, B0 is added into DB1 - so it exists in both DB0 and DB1. We will process it +// in DB0, but it will be unprocessed in DB1. See if we track select properly - WHEN WE HAVE NO +// PERMISSION TO EXECUTE SWAPDB! +TEST_F(BgIterationTest, multiHandlesSwapdbNoPermissionProperly) { + addKeyToDb(1, "B0", "B0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - B0 in DB0. + expectReadKey(it, 0); + // Now, we are done with B0 in DB0, but not in DB1 + expectReadKey(it, 1); // Reads B1, and releases B0 in DB0 + + // No permission for any commands (specifically select/swapdb) + EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_,_,_,_,_,_)) + .Times(AtLeast(1)).WillRepeatedly(Return(ACL_DENIED_CMD)); + + // These cases should NOT block... (they access B0 in DB0) + // The SELECTs & SWAPDBs below are inconsequential - with/without select/swapdb, same result. + c = getMultiClient("SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET B0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 0 1; SELECT 1; SET B0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block IF SELECT/SWAPDB IS WORKING... (they access B0 in DB1) + c = getMultiClient("SET B0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb fails) + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SELECT 0; SET B0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET B0 xxx; SELECT 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) + + expectAnythingCleanup(it); +} + + +static void *pthreadWait200msAndReadTwoKeys(void *arg) { + bgIterator *it = static_cast(arg); + + usleep(200000); + bgIteratorRead(it); + bgIteratorRead(it); + return nullptr; +} + +static void asyncWait200msAndReadTwoKeys(bgIterator *it) { + int rc; + pthread_attr_t attr; + pthread_t thread; + + rc = pthread_attr_init(&attr); + assert(rc == 0); + rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + assert(rc == 0); + + rc = pthread_create(&thread, &attr, pthreadWait200msAndReadTwoKeys, it); + assert(rc == 0); + + rc = pthread_attr_destroy(&attr); + assert(rc == 0); +} + +TEST_F(BgIterationTest, testLuaWithUndeclaredKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_START, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // If we fake a modification to key 3, we won't know if it's handled out of order. + // So we fake a modification to key 4 + c = getWriteClient(4, "xxx"); + c->flag.script = 1; + + // Now for a LUA script, we have already blocked (on the eval/evalsha) for any declared keys + // But here, we're about to modify an undeclared key. We can't actually block in the middle + // of the LUA script. So this will behave as unblocked, but incur a synchronous wait. + + // Key 4 will get expedited when we simulate the write. After reading key 4, key 1 will need + // to be read to return key 4 to Valkey, unblocking the synchronous wait. + asyncWait200msAndReadTwoKeys(it); + + monotime blockTimer; + elapsedStart(&blockTimer); + simulateUnblockedWrite(c); // Not blocked, but delays internally + // Must have delayed at least 150ms (some time may have passed before timer start) + EXPECT_GT(elapsedMs(blockTimer), 150u); + + // Continue... + expectReadKeySequence(it, 2, 3); + // 4 has already been processed + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); +} + + +// Make sure that replication received while processing the last key is sent +TEST_F(BgIterationTest, replicationReceivedWhileProcessingLastKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, LAST_ITEM); + + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + expectReadReplication(it, c); // Replication happened while processing the last item, should be here. + + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing + expectReadComplete(it); // We expect to see the completion instead +} + +TEST_F(BgIterationTest, repldoneFunctionCalled) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, LAST_ITEM); + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + + // Since in testing, we are only feeding one item at a time, and synchronously, we won't call + // the repldone function until after we release the last item. + EXPECT_EQ(replDoneConfirmed, 0); + expectReadReplication(it, c); // Replication happened while processing the last item, should be here. + EXPECT_EQ(replDoneConfirmed, 1); // Last key released, now done feeding replication + + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing + expectReadComplete(it); // We expect to see the completion instead +} + +TEST_F(BgIterationTest, repldoneFunctionCalledTwice) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, iteratorRepldoneFnNotBeingReadyInitially, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, LAST_ITEM); + c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + + // Won't signal replDone until we've released the final item (which happens when reading the replication) + EXPECT_EQ(replDoneRejected, 0); + EXPECT_EQ(replDoneConfirmed, 0); + expectReadReplication(it, c); // Releases the final item + EXPECT_EQ(replDoneRejected, 1); // replDone called once (and rejected by client) + EXPECT_EQ(replDoneConfirmed, 0); + simulateUnblockedWriteWithModification(c); // This will replicate (because replDone returned false) + expectReadReplication(it, c); // ReplDone gets called again (and accepted this time) + EXPECT_EQ(replDoneConfirmed, 1); + + simulateUnblockedWriteWithModification(c); // This won't replicate because replication is done + expectReadComplete(it); // We expect to see the completion instead +} + +// Check that the memory reported for replication is correct +TEST_F(BgIterationTest, checkReplicationByteCount) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); + + c = getWriteClient(0, "xxx"); + size_t expectedReplicationSize = sizeof(bgIteratorItem); + for (int i = 0; i < c->argc; i++) { + expectedReplicationSize += objectComputeSize(NULL, c->argv[i], 0, 0); + } + + expectReadKey(it, 0); + expectReadKey(it, 1); // Releases and unblocks 0 + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); + + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); + simulateUnblockedWriteWithModification(c); // and write again (2nd replication) + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); + + expectReadKey(it, 2); // Keys 0..2 all in same bucket + + expectReadReplication(it, c); + // After reading the 1st replication, it hasn't been returned yet (it's the active item) + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); + expectReadReplication(it, c); + // After reading the 2nd replication, the 1st has been returned + EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); + + expectReadKey(it, 3); + // Now all replication has been returned/freed + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); + + expectReadKeySequence(it, 4, LAST_ITEM); + expectReadComplete(it); +} + +// Test that for an arbitrary write command having no keys, replication should occur. +TEST_F(BgIterationTest, checkNoKeysWriteIsReplicated) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_CONSISTENCY_EVENTUAL, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + c = getNoKeysWriteClient(); + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + + expectReadKeySequence(it, 1, 2); // These were already in queue + + expectReadReplication(it, c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); +} diff --git a/src/unit/wrappers.h b/src/unit/wrappers.h index 0f80919d6f7..5bfc117fab2 100644 --- a/src/unit/wrappers.h +++ b/src/unit/wrappers.h @@ -67,6 +67,9 @@ void __wrap_unblockClientsInUseOnKey(robj *key); int __wrap_ACLCheckAllUserCommandPerm(user *u, struct serverCommand *cmd, robj **argv, int argc, int dbid, int *idxptr); +size_t __wrap_hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); +bool __wrap_hashtableScanHasPassedKey(hashtable *ht, const void *key, size_t cursor); + #undef protected #undef _Bool #undef typename