diff --git a/.config/typos.toml b/.config/typos.toml index 10103279c57..ff90d3a679d 100644 --- a/.config/typos.toml +++ b/.config/typos.toml @@ -15,11 +15,12 @@ optin = "optin" smove = "smove" Parth = "Parth" # seems like the spellchecker does not like it is similar to "Path" nd = "nd" +threadsave = "threadsave" [default] extend-ignore-re = [ - "SELECTed", - "WATCHed", + "[A-Z]{2,}ed", # SELECTed, WATCHed, etc. + "[A-Z]{2,}s", # SELECTs, etc. ] [type.c] @@ -64,6 +65,7 @@ pathc = "pathc" pn = "pn" seeked = "seeked" tre = "tre" +dbe = "dbe" [type.systemd.extend-words] # systemd = .conf diff --git a/src/Makefile b/src/Makefile index 2c78f95986e..98f49108e46 100644 --- a/src/Makefile +++ b/src/Makefile @@ -457,6 +457,7 @@ ENGINE_SERVER_OBJ = \ allocator_defrag.o \ anet.o \ aof.o \ + bgiteration.o \ bio.o \ bitops.o \ blocked.o \ diff --git a/src/bgiteration.c b/src/bgiteration.c new file mode 100644 index 00000000000..ed6ac40bddc --- /dev/null +++ b/src/bgiteration.c @@ -0,0 +1,2728 @@ +#include "fmacros.h" +#include "bgiteration.h" +#include "dict.h" +#include "fifo.h" +#include "kvstore.h" +#include "monotonic.h" +#include "mutexqueue.h" +#include "server.h" + +int getFlushCommandFlags(client *c, int *flags); // in db.c +uint64_t dictObjHash(const void *key); // in server.c +int dictObjKeyCompare(const void *key1, const void *key2); // in server.c +size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid); // in object.c +robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds key, long long expire); // in object.c + + +// Non-public hashtable/kvstore functions... 
+bool hashtableInternalFindBucketIdx(hashtable *ht, void *key, int *table_idx, size_t *bucket_idx);
+void hashtableInternalIteratorGetBucketIdx(hashtableIterator *iterator, int *table_idx, size_t *bucket_idx);
+bool hashtableInternalIteratorIsBucketIdxComplete(hashtableIterator *iterator);
+hashtableIterator *kvstoreInternalIteratorGetCurrentHashtableIterator(kvstoreIterator *kvs_it);
+
+
+// Forward declaration; the definition lives later in bgiteration.c.
+static bool receiveItemsBackFromOneIterator(bgIterator *it);
+
+// ################ TEMP COMPILE HACKS ###########################
+// Issue found. server.db has changed from an array of db to an array of pointers to db (change all refs to server.db)
+// Issue: iterators (kvstore/hashtable) are not safe across event loop invocations. Hashtable (kvstore?) needs to track and maintain safe iterators.
+
+
+// Placeholder hook: currently no key is ever excluded, so this always reports false.
+// (Don't think there's any current need for this...)
+static bool ignoreKeyForSave(const_sds key) {
+    UNUSED(key);
+    return false;
+}
+
+//------- END OF COMPILE HACKS -------------------
+
+
+// Returns true if the cmd is one of the script-invoking commands (FCALL, EVAL, EVALSHA),
+// i.e. a script command that may replicate.
+static bool isScriptCallWriteCmd(struct serverCommand *cmd) {
+    if (cmd->proc == fcallCommand) return true;
+    if (cmd->proc == evalCommand) return true;
+    return (cmd->proc == evalShaCommand);
+}
+
+// The PFCOUNT command (which does NOT have the CMD_WRITE flag) modifies the underlying string and
+// is replicated as a write. So it needs to be detected and handled specially.
+static bool isWriteCmd(struct serverCommand *cmd) {
+    if (cmd->flags & CMD_WRITE) return true;
+    if (cmd->proc == pfcountCommand) return true;
+    if (cmd->proc == execCommand) return true;
+    return isScriptCallWriteCmd(cmd);
+}
+
+// Returns true if the command is a deletion based command (DEL or UNLINK)
+static bool isDeleteCmd(struct serverCommand *cmd) {
+    if (cmd->proc == delCommand) return true;
+    return (cmd->proc == unlinkCommand);
+}
+
+
+// Returns true when the calling thread is the thread recorded in server.main_thread_id.
+static bool onValkeyMainThread(void) {
+    pthread_t caller = pthread_self();
+    return (pthread_equal(server.main_thread_id, caller) != 0);
+}
+
+/* Parse a parameters robj, extracting a valid DBID.
+ * On success the id is stored in *db_id; on failure *db_id is left untouched.
+ * Returns FALSE if DBID isn't valid.
+ */
+static bool getDbIdFromRobj(robj *obj, int *db_id) {
+    long long parsed;
+    bool valid = (getLongLongFromObject(obj, &parsed) == C_OK) && (parsed >= 0) && (parsed < server.dbnum);
+    if (valid) *db_id = (int)parsed;
+    return valid;
+}
+
+/* Parse the parameters of the COPY command, extracting the target DBID.
+ * *target_dbid defaults to selected_dbid when no "db" option is present.
+ * Returns FALSE if the command would not run.
+ */
+static bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid, int *target_dbid) {
+    const int COPY_COMMAND_OPTIONAL_ARG_START_INDEX = 3;
+
+    *target_dbid = selected_dbid;
+
+    int i = COPY_COMMAND_OPTIONAL_ARG_START_INDEX;
+    while (i < argc) {
+        const char *option = (char *)objectGetVal(argv[i]);
+
+        if (strcasecmp(option, "replace") == 0) {
+            i++;
+            continue;
+        }
+
+        /* Note the parsing here needs to perfectly match what we have in Valkey OSS for COPY.
+         * The following command is considered OK by Valkey 8.1 so we can't return here, but
+         * must continue to parse till the last db which is the one that's effectively used.
+         *     COPY key1 key2 db 1 db 2 db 3  // (This will use db 3)
+         */
+        bool is_db_option = (strcasecmp(option, "db") == 0) && (i + 1 < argc);
+        if (!is_db_option) return false;                              // parse failure
+        if (!getDbIdFromRobj(argv[i + 1], target_dbid)) return false; // parse failure
+        i += 2; // Consume the option and its argument
+    }
+    return true;
+}
+
+/* Get parameters for the SWAPDB command.
+ * The optional permission_client allows for checking of a client's permission for swapdb.
+ * On success the two DB ids are stored in *id1_p and *id2_p.
+ * Returns true if command would be executed.
+ */
+bool getParamsForSwapdb(int argc, robj **argv, client *permission_client, int *id1_p, int *id2_p) {
+    static struct serverCommand *swapdb_cmd = NULL;
+
+    // We don't need to check permissions in the replication phase (permission_client == NULL)
+    if (permission_client != NULL) {
+        if (swapdb_cmd == NULL) {
+            swapdb_cmd = lookupCommandByCString("swapdb");
+            serverAssert(swapdb_cmd != NULL);
+        }
+
+        int idxptr;
+        int acl_result = ACLCheckAllUserCommandPerm(permission_client->user, swapdb_cmd, argv, argc,
+                                                    permission_client->db->id, &idxptr);
+        if (acl_result != ACL_OK) return false;
+    }
+
+    if (argc != 3) return false;
+    if (server.cluster_enabled) return false;
+
+    long long first, second;
+    if (getLongLongFromObject(argv[1], &first) != C_OK) return false;
+    if (getLongLongFromObject(argv[2], &second) != C_OK) return false;
+
+    bool first_in_range = (first >= 0) && (first < server.dbnum);
+    bool second_in_range = (second >= 0) && (second < server.dbnum);
+    if (!first_in_range) return false;
+    if (!second_in_range) return false;
+    if (first == second) return false; // Valid, but doesn't do anything
+
+    *id1_p = (int)first;
+    *id2_p = (int)second;
+    return true;
+}
+
+/* Get parameters for the SELECT command.
+ * The optional permission_client allows for checking of a client's permission for select.
+ * On success the DB id is stored in *dbid_p.
+ * Returns true if command would be executed.
+ */
+bool getParamsForSelect(int argc, robj **argv, client *permission_client, int *dbid_p) {
+    static struct serverCommand *select_cmd = NULL;
+
+    // We don't need to check permissions in the replication phase (permission_client == NULL)
+    if (permission_client != NULL) {
+        if (select_cmd == NULL) {
+            select_cmd = lookupCommandByCString("select");
+            serverAssert(select_cmd != NULL);
+        }
+
+        int idxptr;
+        int acl_result = ACLCheckAllUserCommandPerm(permission_client->user, select_cmd, argv, argc,
+                                                    permission_client->db->id, &idxptr);
+        if (acl_result != ACL_OK) return false;
+    }
+
+    if (argc != 2) return false;
+
+    long long requested;
+    if (getLongLongFromObject(argv[1], &requested) != C_OK) return false;
+    if (requested < 0 || requested >= server.dbnum) return false;
+
+    *dbid_p = (int)requested;
+    return true;
+}
+
+
+/* DictType for SDS->ptr. The SDS is referenced, no destructor. */
+static dictType sdsrefToPtrDictType = {
+    .entryGetKey = dictEntryGetKey,
+    .hashFunction = dictSdsHash,
+    .keyCompare = dictSdsKeyCompare
+};
+
+
+/* Wrap decrRefCount() so that it can be used as a callback requiring void. */
+static void decrRefCountVoid(void *o) {
+    decrRefCount(o);
+}
+
+
+/* Concatenate argc/argv into a command string for debugging.
+ * Every argument is emitted as 'arg' followed by a space; the caller owns the returned sds. */
+static sds createSdsFromClientArgv(int argc, robj **argv) {
+    sds joined = sdsempty();
+    for (int idx = 0; idx < argc; idx++) {
+        robj *decoded = getDecodedObject(argv[idx]); // some objects are int encoded
+        const char *text = (char *)objectGetVal(decoded);
+        joined = sdscatprintf(joined, "'%s' ", text);
+        decrRefCount(decoded);
+    }
+    return joined;
+}
+
+
+//###########################################################################
+
+
+/* bgIteration internal (compile time) configuration values */
+enum {
+    // Prevent initial rehashing
+    BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE = 16384,
+    // Max size item to clone
+    BGITER_MAX_CLONE_ITEM_BYTES = 512,
+    // Total limit for all cloned items
+    BGITER_MAX_CLONE_POOL_BYTES = (1 * 1024 * 1024),
+    // Step size when increasing queue target
+    BGITER_QUEUE_INCREASE_INCR = 100,
+    // Delay between calls on bgIteration timer
+    BGITER_CYCLE_DELAY_MS = 2,
+    // Normal time limit for timer processing
+    BGITER_CYCLE_BUDGET_MS = 1,
+    // Maximum time limit when starvation seen
+    BGITER_CYCLE_BUDGET_MAX_MS = 10
+};
+
+// These can be tweaked by unit tests
+static int bgiter_max_clone_item_bytes = BGITER_MAX_CLONE_ITEM_BYTES;
+static int bgiter_max_clone_pool_bytes = BGITER_MAX_CLONE_POOL_BYTES;
+
+// Unit-test hook: sets both clone limits to zero.
+void bgIteration_unitTestDisableCloning(void) {
+    bgiter_max_clone_item_bytes = 0;
+    bgiter_max_clone_pool_bytes = 0;
+}
+// Unit-test hook: overrides the per-item and pool-wide clone limits.
+void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes) {
+    bgiter_max_clone_item_bytes = item_bytes;
+    bgiter_max_clone_pool_bytes = pool_bytes;
+}
+
+typedef enum {
+    BGITERATION_TYPE_NONE,
+    BGITERATION_TYPE_FULLSCAN,
+    BGITERATION_TYPE_CLUSTERSLOT
+} bgIterationType;
+
+/* Extensions to bgIteratorItemType. These enumerations are used internally, and are not part of
+ * the published interface. These allow for extensibility in the internal information-passing
+ * between the Valkey main thread and the iteration client thread. */
+typedef enum {
+    /* Indicates that the iteration client has completed use of the bgIterator and that the
+     * bgIterator should be cleaned up and freed by the Valkey main thread. */
+    BGITERATOR_ITEMEXT_ITER_CLOSED = 10
+} bgIteratorItemTypeExtended;
+
+/* Item for bgIteratorItemTypeExtended.BGITERATOR_ITEMEXT_ITER_CLOSED. Used to pass a bgIterator
+ * back to the Valkey main thread for cleanup/release. */
+typedef struct {
+    bgIteratorItemTypeExtended type;
+    bgIterator *iter;
+} bgIteratorItemExtClose;
+
+/* Used for dictEntryPtrDictType. This dict grows and shrinks constantly during the iteration.
+ * There is no point to rehash it all the time. */
+static int neverShrink(size_t moreMem, double usedRatio) {
+    UNUSED(moreMem);
+    return (usedRatio > 0.5); // Return true only if expanding
+}
+
+// A dictionary with a pointer (itself) as a key (the address pointed to is NOT referenced).
+// Nothing is duplicated, this is a very fast dictionary, but potentially unsafe if the original
+// items are deleted or moved.
+// WARNING: Can't have active defrag running! It might reallocate memory blocks, swapping their
+//          pointer values! A check must be made in active defrag to ensure that no iteration is
+//          active.
+
+// Thomas Wang's 64-bit mix
+static uint64_t pointerHash(const void *key) {
+    uint64_t mixed = (uint64_t)(uintptr_t)key;
+    mixed = (~mixed) + (mixed << 21); // mixed = (mixed << 21) - mixed - 1;
+    mixed ^= (mixed >> 24);
+    mixed += (mixed << 3) + (mixed << 8); // mixed * 265
+    mixed ^= (mixed >> 14);
+    mixed += (mixed << 2) + (mixed << 4); // mixed * 21
+    mixed ^= (mixed >> 28);
+    mixed += (mixed << 31);
+    return mixed;
+}
+
+// Keys are compared by address only.
+static int pointerCompare(const void *key1, const void *key2) {
+    return key1 == key2;
+}
+
+static dictType dictEntryPtrDictType = {
+    .entryGetKey = dictEntryGetKey,
+    .hashFunction = pointerHash,
+    .keyCompare = pointerCompare,
+    .resizeAllowed = neverShrink
+};
+
+// A TEMP set of robj's (of type sds). This is only for temporary sets as the robj's are not
+// ref-counted at insertion/deletion. Used for robj->NULL.
+static dictType tempKeysetDictType = {
+    .entryGetKey = dictEntryGetKey,
+    .hashFunction = dictObjHash,
+    .keyCompare = dictObjKeyCompare
+};
+
+typedef struct genericIterator genericIterator;
+typedef void (*iteratorReleaseFunc) (genericIterator *genIt);
+typedef fifo * (*iteratorGetEntriesFunc) (genericIterator *genIt, int *orig_dbid, int *cur_dbid);
+typedef void (*iteratorSwapDbFunc) (genericIterator *genIt, int db1, int db2);
+typedef void (*iteratorFlushDbFunc) (genericIterator *genIt, int cur_dbid);
+typedef bool (*iteratorHasPassedItemFunc) (genericIterator *genIt, const_sds key, int cur_dbid);
+typedef int (*iteratorOriginalDbFunc) (genericIterator *genIt, int cur_dbid);
+typedef bool (*iteratorIsKeyInScopeFunc) (genericIterator *genIt, const_sds key);
+
+// Function pointers supporting polymorphic iterator implementation
+struct genericIterator {
+    iteratorReleaseFunc release;
+    iteratorGetEntriesFunc getEntries;
+    iteratorSwapDbFunc swapDb;
+    iteratorFlushDbFunc flushDb;
+    iteratorHasPassedItemFunc hasPassedItem;
+    iteratorOriginalDbFunc originalDb;
+    iteratorIsKeyInScopeFunc isKeyInScope;
+};
+
+// Intrusive link used to chain recycled bgIteratorItem allocations into a LIFO free list.
+typedef struct itemListNode {
+    struct itemListNode *next;
+} itemListNode;
+
+static itemListNode *freeItemStackHead = NULL;
+
+// Push a no-longer-needed item onto the free list; its memory is reused as the list node.
+static void itemFreeList_returnItemBackToFreeList(bgIteratorItem* item) {
+    itemListNode *recycled = (itemListNode *)item;
+    recycled->next = freeItemStackHead;
+    freeItemStackHead = recycled;
+}
+
+// Pop a free node from the free list or allocate if none free
+static bgIteratorItem *itemFreeList_getElementOrAllocate(void) {
+    itemListNode *top = freeItemStackHead;
+    if (top == NULL) {
+        // Nothing to recycle: create a new item
+        return zmalloc(sizeof(bgIteratorItem));
+    }
+
+    freeItemStackHead = top->next;
+    if (freeItemStackHead != NULL) {
+        valkey_prefetch(freeItemStackHead);
+    }
+    return (bgIteratorItem *)top;
+}
+
+// Free every item currently held on the free list, leaving the list empty.
+static void itemFreeList_release(void) {
+    itemListNode *cursor = freeItemStackHead;
+    while (cursor != NULL) {
+        itemListNode *following = cursor->next;
+        zfree((bgIteratorItem *)cursor);
+        cursor = following;
+    }
+    freeItemStackHead = NULL;
+}
+
+// This struct is used across threads. Unless otherwise noted, the fields are initialized at
+// iterator creation (within the main thread) and are read-only by the client thread.
+struct bgIterator {
+    // Iterator name
+    sds name;
+    // Optional repldone function to be run on the main thread
+    bgIteratorReplDoneFunc repldone;
+    // Optional cleanup function to be run on main thread
+    bgIteratorCleanupFunc cleanup;
+    // Client's private data to be passed to cleanup function
+    void *privdata;
+
+    // Consistent and/or Replication
+    int iteration_flags;
+    // Full scan or cluster slot
+    int iteration_type;
+    // iterator epoch at time of iterator creation
+    uint32_t consistent_modification_id;
+
+    // Low-level iterator (polymorphic)
+    genericIterator *keyset_iter;
+
+    // Used to keep track of what items have already been iterated over by out-of-order
+    // expedited process, ensuring a bgIterator does not try to reprocess items.
+    // Used only by main thread.
+    // dictEntry -> NULL
+    dict *early_iterate_entries;
+
+    // Created/Destroyed in main thread, used in both (threadsafe)
+    mutexQueue *items_for_iterator;
+
+    // Queue of items to be returned to the Valkey main thread (threadsafe)
+    mutexQueue *return_to_valkey;
+
+    // Used only by main thread
+    unsigned int item_count_target;
+
+    // current_item is normally only used in the iteration client. It's marked volatile here
+    // only to support snooping from the main thread when handling a FLUSHDB command. This
+    // prevents the compiler from generating code which might read the pointer multiple times
+    // (when it's coded to read only once).
+    // Also - this syntax is for a volatile POINTER to a non-volatile item. "volatile" at the
+    // beginning of the declaration, would indicate a (non-volatile) pointer to a volatile item.
+    bgIteratorItem *volatile current_item;
+
+    // Set to true when client performs 1st read
+    bool client_is_active;
+    // Set to true in main thread when last item from iteration has been queued to the client.
+    // No additional items will be enqueued to the client after this has been set.
+    bool completed;
+
+    // Set to true in main thread when iteration is to be killed
+    // Set to true in iteration client when it decides to end early
+    volatile bool terminated;
+
+    // Used only in main thread during command processing
+    bool cur_cmd_may_replicate;
+
+    // Variables maintaining runtime statistics
+    unsigned long dbentries_queued;         // Updated by main thread
+    unsigned long dbentries_processed;      // Updated by client thread
+    unsigned long replication_queued;       // Updated by main thread
+    unsigned long replication_processed;    // Updated by client thread
+    unsigned long swapdb_queued;            // Updated by main thread
+    unsigned long swapdb_processed;         // Updated by client thread
+    unsigned long flushdb_queued;           // Updated by main thread
+    unsigned long flushdb_processed;        // Updated by client thread
+    unsigned long dbentry_clones_queued;    // Updated by main thread
+    unsigned long dbentry_clones_processed; // Updated by client thread
+    // Time iteration started
+    monotime monotonic_start_time;
+
+    // The item start time is set in the iteration client. It is marked volatile as it can be
+    // read from the main thread by bgIteratorGetStatus. If 0, this indicates that the
+    // iteration client is waiting for an item to process.
+    volatile monotime monotonic_item_start_time;
+};
+
+
+// These static values are only accessed from the main Valkey thread.
+
+static list *allIterators;   // list of bgIterator
+static dict *nameToIterator; // bgIterator->name -> bgIterator
+
+// Global, across all iterators, dict contains a dbEntry pointer -> ref count
+static dict *inUseEntries; // dbEntry -> ref count
+
+// Key values in the current command which don't exist in the DB yet. Needed for determination of
+// replication for NON-consistent iterations.
+static list *curCmdMissingKeys; // list of robj
+
+// A counter of the total amount of memory used for buffered replication data.
+// This amount is excluded when computing the need for evictions.
+static ssize_t bufferedReplicationBytes;
+
+// Memory pool to track current allocated memory of cloned items (in bytes)
+static ssize_t bgiteration_current_clone_memory_pool_size;
+
+// Snapshot of the last queue size to seed the next queue
+// We assume all bgIterators consume items at the same rate
+static int last_item_count_target;
+
+// Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID)
+static long long bgIterator_timeproc_id;
+
+// Incremented on each new iteration, this is updated in dbEntry metadata whenever an entry is modified.
+static uint32_t bgIteration_epoch = 1;
+
+
+// BgIteration debug captures BgIteration activity to a large sds buffer. When an iterator is
+// completed, the entire buffer is written to a file in the current working directory. Note that
+// memory must be available for the ENTIRE debug in memory. This isn't captured incrementally to
+// a file as the file I/O is more likely to affect timing.
+// Future implementation: the current design is most useful for a single iterator. When items are
+// queued to an iterator, the iterator name is not recorded (to save space).
+// Developer note: using a CONST value here allows the compiler to completely remove all of the
+// debugging code at compile time. There is no run-time performance overhead when set to FALSE.
+// This is essentially like an IFDEF, however, it's better as it forces the compiler to validate
+// syntax.
+static const bool BGITERATION_DEBUG = false; // DO NOT SUBMIT WITH THIS SYMBOL SET TO TRUE!
+static sds debugBuffer;
+
+
+
+//=============================================================================================
+// Full Scan Iterator
+//=============================================================================================
+/* The full scan iterator performs the actual iteration over the Valkey keyset. The iterator is
+ * only used from within the Valkey main thread. Iteration proceeds one DB at a time, based on
+ * the DB ordering at the time of iterator creation. Each time the iterator returns items, all
+ * of the dictionary entries from a single hash bucket are returned.
+ */
+
+struct fullScanIterator {
+    genericIterator callbacks; // (must be first item)
+
+    // Array of mapping from original DB ID (at the time of iteration start) to that DB's
+    // current index. So, if the DB which was DB-0 is now at index 6, orig_to_cur_db[0]==6.
+    int *orig_to_cur_db;
+
+    // The reverse of the above array. This maps a current DB index to its original index
+    // (at the time of iteration start).
+    int *cur_to_orig_db;
+
+    // This is the DB we are currently iterating over. This is relative to the ORIGINAL
+    // DB ordering, at the time of iterator creation. Iteration proceeds from 0..N based on
+    // the original ordering. Starts at -1 (no DB entered yet).
+    int iter_db;
+
+    // Iterator for the DB orig_to_cur_db[iter_db]. Both are NULL whenever no kvstore iterator
+    // is open (before the first DB, between DBs, or after the current DB was flushed).
+    kvstore *kvs; // keep track of kvs associated with iter_dbi
+    kvstoreIterator *iter_dbi;
+};
+
+/* Release the full scan iterator, including any kvstore iterator that is still open. */
+static void fullScanIteratorRelease(genericIterator *genIt) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    if (it->iter_dbi) kvstoreIteratorRelease(it->iter_dbi);
+    zfree(it->orig_to_cur_db);
+    zfree(it->cur_to_orig_db);
+    zfree(it);
+}
+
+/* Return the next batch of entries (the entries of one hash bucket) as a newly created fifo.
+ * On success *orig_dbid receives the DB's original index and *cur_dbid its current index.
+ * Returns NULL once every DB has been scanned; the output parameters are not written then.
+ */
+static fifo * fullScanIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    if (it->iter_db >= server.dbnum) return NULL; // Finished scanning
+
+    fifo *dbEntryFifo = fifoCreate();
+    while (fifoLength(dbEntryFifo) == 0) {
+        // No open kvstore iterator: advance to the next DB (in original order) that exists.
+        while (it->iter_dbi == NULL) {
+            if (++it->iter_db >= server.dbnum) {
+                fifoRelease(dbEntryFifo);
+                return NULL; // Iteration complete
+            }
+            serverDb *db = server.db[it->orig_to_cur_db[it->iter_db]];
+            if (db != NULL) {
+                it->kvs = db->keys;
+                it->iter_dbi = kvstoreIteratorInit(it->kvs, HASHTABLE_ITER_SAFE);
+            }
+        }
+
+        // Collect entries until the hashtable iterator reports the current bucket as complete.
+        hashtableIterator *ht_it = NULL;
+        do {
+            dbEntry *de;
+            if (!kvstoreIteratorNext(it->iter_dbi, (void **)&de)) {
+                // This DB is exhausted; the outer loop moves on if nothing was collected.
+                kvstoreIteratorRelease(it->iter_dbi);
+                it->kvs = NULL, it->iter_dbi = NULL;
+                break;
+            }
+
+            ht_it = kvstoreInternalIteratorGetCurrentHashtableIterator(it->iter_dbi);
+            if (ignoreKeyForSave(objectGetKey(de))) continue; // slot migration: keys being purged
+            fifoPush(dbEntryFifo, de);
+        } while (!hashtableInternalIteratorIsBucketIdxComplete(ht_it));
+    }
+    *orig_dbid = it->iter_db;
+    *cur_dbid = it->orig_to_cur_db[*orig_dbid];
+    return dbEntryFifo;
+}
+
+/* Record that the DBs currently at indexes db1 and db2 have been swapped, keeping both
+ * index-mapping arrays consistent with each other. */
+static void fullScanIteratorSwapDb(genericIterator *genIt, int db1, int db2) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    int temp = it->cur_to_orig_db[db1];
+    it->cur_to_orig_db[db1] = it->cur_to_orig_db[db2];
+    it->cur_to_orig_db[db2] = temp;
+
+    it->orig_to_cur_db[it->cur_to_orig_db[db1]] = db1;
+    it->orig_to_cur_db[it->cur_to_orig_db[db2]] = db2;
+}
+
+/* Notify the iterator that the DB currently at index cur_dbid is being flushed. If that is the
+ * DB being iterated, the open kvstore iterator (if any) is dropped. */
+static void fullScanIteratorFlushDb(genericIterator *genIt, int cur_dbid) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    int orig_db = it->cur_to_orig_db[cur_dbid];
+    if (orig_db == it->iter_db) {
+        // We are currently iterating on the DB that's being flushed.
+        // iter_dbi can already be NULL here: fullScanIteratorGetEntries() closes it as soon as
+        // the DB is exhausted while iter_db still names that DB (the "just finished this DB"
+        // state that fullScanIteratorHasPassedItem() also handles). Guard the release the same
+        // way fullScanIteratorRelease() does instead of passing NULL to kvstoreIteratorRelease().
+        if (it->iter_dbi) kvstoreIteratorRelease(it->iter_dbi);
+        it->kvs = NULL, it->iter_dbi = NULL;
+        // Iteration will continue with the next DB.
+    }
+}
+
+/* Returns true if the iteration has already moved past the position of `key` in the DB
+ * currently at index cur_dbid. Keys that don't exist are reported as passed. */
+static bool fullScanIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) {
+    struct fullScanIterator *it = (struct fullScanIterator *) genIt;
+    int orig_dbid = it->cur_to_orig_db[cur_dbid];
+
+    if (orig_dbid < it->iter_db) return true;   // Entire DB has already been processed
+    if (orig_dbid > it->iter_db) return false;  // Haven't started this DB yet
+    // Now, orig_dbid == it->iter_db
+
+    if (it->iter_dbi == NULL) return true;      // just finished this DB
+
+    // We're in the middle of processing a DB. In cluster-mode, the DB is divided into 1 hashtable
+    // per slot. In cluster-mode-disabled, we treat all keys as in slot 0.
+    int keySlot = server.cluster_enabled ? getKeySlot((sds)key) : 0;
+    if (keySlot < kvstoreIteratorGetCurrentHashtableIndex(it->iter_dbi)) return true;
+    if (keySlot > kvstoreIteratorGetCurrentHashtableIndex(it->iter_dbi)) return false;
+
+    // At this point, we're down to a specific hashtable.
+
+    hashtable *iter_current_ht = kvstoreGetHashtable(it->kvs, keySlot);
+    int table;    // 0 or 1 (supporting rehashing)
+    size_t index; // bucket number within the hashtable
+    // If key doesn't exist, we consider it passed - we MIGHT have iterated over it had it existed.
+    if (!hashtableInternalFindBucketIdx(iter_current_ht, (void *)key, &table, &index)) return true;
+
+    hashtableIterator *htIter = kvstoreInternalIteratorGetCurrentHashtableIterator(it->iter_dbi);
+    int iter_table;
+    size_t iter_index;
+    hashtableInternalIteratorGetBucketIdx(htIter, &iter_table, &iter_index);
+    if (table < iter_table) return true;   // iteration in table 1, but item is in table 0
+    if (table > iter_table) return false;  // iteration in table 0, but item is in table 1
+    // if index <= iterator index, it has been passed. bgIterator
+    // processes buckets atomically. hashtableIterator points to the
+    // last returned position. It means bucket at iter_index has
+    // already been processed.
+    if (index <= iter_index) return true;
+    if (ignoreKeyForSave(key)) return true; // if slot being purged, pretend we have passed it
+    return false;
+}
+
+/* Map a current DB index back to the index that DB had when the iteration started. */
+static int fullScanIteratorOriginalDb(genericIterator *genIt, int cur_dbid) {
+    struct fullScanIterator *it = (struct fullScanIterator *)genIt;
+    return it->cur_to_orig_db[cur_dbid];
+}
+
+static bool fullScanIteratorIsKeyInScope(genericIterator *genIt, const_sds key) {
+    UNUSED(genIt);
+    UNUSED(key);
+    return true; // All keys are in scope
+}
+
+/* Create a full scan iterator with identity DB mappings and no DB entered yet. */
+static genericIterator * fullScanIteratorCreate(void) {
+    struct fullScanIterator *it = zmalloc(sizeof(struct fullScanIterator));
+    it->orig_to_cur_db = zmalloc(sizeof(int) * server.dbnum);
+    it->cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum);
+    for (int i = 0; i < server.dbnum; i++) {
+        it->orig_to_cur_db[i] = i;
+        it->cur_to_orig_db[i] = i;
+    }
+    it->iter_db = -1; // fullScanIteratorGetEntries() pre-increments before entering a DB
+    it->kvs = NULL;
+    it->iter_dbi = NULL;
+
+    it->callbacks.release = fullScanIteratorRelease;
+    it->callbacks.getEntries = fullScanIteratorGetEntries;
+    it->callbacks.swapDb = fullScanIteratorSwapDb;
+    it->callbacks.flushDb = fullScanIteratorFlushDb;
+    it->callbacks.hasPassedItem = fullScanIteratorHasPassedItem;
+    it->callbacks.originalDb = fullScanIteratorOriginalDb;
+    it->callbacks.isKeyInScope = fullScanIteratorIsKeyInScope;
+
+    return (genericIterator *)it;
+}
+
+
+
+//=============================================================================================
+// Cluster Slot Iterator
+//=============================================================================================
+/* The cluster slot iterator performs iteration over one cluster slot of the Valkey keyset. The
+ * iterator is only used from within the Valkey main thread.
+ */
+struct clusterSlotIterator {
+    genericIterator callbacks; // (must be first item)
+};
+
+static void clusterSlotIteratorRelease(genericIterator *genIt) {
+    UNUSED(genIt);
+    serverAssert(false); // Not yet implemented
+}
+
+static fifo * clusterSlotIteratorGetEntries(genericIterator *genIt, int *orig_dbid, int *cur_dbid) {
+    UNUSED(genIt);
+    UNUSED(orig_dbid);
+    UNUSED(cur_dbid);
+    serverAssert(false); // Not yet implemented
+}
+
+static void clusterSlotIteratorSwapDb(genericIterator *genIt, int db1, int db2) {
+    UNUSED(genIt);
+    UNUSED(db1);
+    UNUSED(db2);
+    serverAssert(false); // swap not valid in cluster mode
+}
+
+static void clusterSlotIteratorFlushDb(genericIterator *genIt, int cur_dbid) {
+    UNUSED(genIt);
+    UNUSED(cur_dbid);
+    serverAssert(false); // Not yet implemented
+}
+
+static bool clusterSlotIteratorHasPassedItem(genericIterator *genIt, const_sds key, int cur_dbid) {
+    UNUSED(genIt);
+    UNUSED(key);
+    UNUSED(cur_dbid);
+    serverAssert(false); // Not yet implemented
+}
+
+/* The current DB index is also the original one, because swap is not supported here. */
+static int clusterSlotIteratorOriginalDb(genericIterator *genIt, int cur_dbid) {
+    UNUSED(genIt);
+    return cur_dbid; // swap not supported in cluster mode
+}
+
+/* When checking if a command is in scope for this iterator, all of its keys should be either in
+ * scope or not. In cluster mode enabled a command cannot reference keys from different slots, so
+ * this assumption will always be true. */
+static bool clusterSlotIteratorIsKeyInScope(genericIterator *genIt, const_sds key) {
+    UNUSED(genIt);
+    UNUSED(key);
+    serverAssert(false); // Not yet implemented
+}
+
+/* Placeholder constructor: wires up the callback table, then asserts because slot iteration is
+ * not implemented yet. `slots` / `slots_count` are currently ignored. */
+static genericIterator * clusterSlotIteratorCreate(const int *slots, size_t slots_count) {
+    struct clusterSlotIterator *it = zmalloc(sizeof(struct clusterSlotIterator));
+    it->callbacks.release = clusterSlotIteratorRelease;
+    it->callbacks.getEntries = clusterSlotIteratorGetEntries;
+    it->callbacks.swapDb = clusterSlotIteratorSwapDb;
+    it->callbacks.flushDb = clusterSlotIteratorFlushDb;
+    it->callbacks.hasPassedItem = clusterSlotIteratorHasPassedItem;
+    it->callbacks.originalDb = clusterSlotIteratorOriginalDb;
+    it->callbacks.isKeyInScope = clusterSlotIteratorIsKeyInScope;
+
+    UNUSED(slots);
+    UNUSED(slots_count);
+    serverAssert(false); // Not yet implemented
+
+    return (genericIterator *)it;
+}
+
+
+
+//=============================================================================================
+// General iteration support (across all iterators)
+//=============================================================================================
+
+// While an item is potentially in use by a background thread, we can't have
+// rehashing by the main thread. Returns true if rehashing was paused.
+static bool pauseRehashing(dbEntry *de) {
+    switch (de->encoding) {
+    case OBJ_ENCODING_HASHTABLE: { // SET or HASH
+        hashtable *ht = objectGetVal(de);
+        hashtablePauseRehashing(ht);
+        return true;
+    }
+    case OBJ_ENCODING_SKIPLIST: { // SORTED SET
+        zset *zs = objectGetVal(de);
+        hashtablePauseRehashing(zs->ht);
+        return true;
+    }
+    default:
+        return false;
+    }
+}
+
+// Counterpart of pauseRehashing(): resumes rehashing for the same encodings, no-op otherwise.
+static void resumeRehashing(dbEntry *de) {
+    switch (de->encoding) {
+    case OBJ_ENCODING_HASHTABLE: { // SET or HASH
+        hashtable *ht = objectGetVal(de);
+        hashtableResumeRehashing(ht);
+        break;
+    }
+    case OBJ_ENCODING_SKIPLIST: { // SORTED SET
+        zset *zs = objectGetVal(de);
+        hashtableResumeRehashing(zs->ht);
+        break;
+    }
+    default:
+        break;
+    }
+}
+
+// Maintain a list of entries which are currently in-use. These items should not be modified.
+// The first use of an entry takes a reference on it; later uses only bump the stored count.
+static void incrementEntryInuse(dbEntry *de) {
+    dictEntry *existingEntry;
+    dictEntry *newEntry = dictAddRaw(inUseEntries, de, &existingEntry);
+    if (newEntry) {
+        incrRefCount(de);
+        dictSetSignedIntegerVal(newEntry, 1);
+    } else {
+        dictSetSignedIntegerVal(existingEntry, dictGetSignedIntegerVal(existingEntry) + 1);
+    }
+}
+
+
+// Drop one in-use count for `de`. When the last count goes away the entry is removed from
+// inUseEntries and the reference taken by incrementEntryInuse() is released.
+// `de` must currently be tracked; an unbalanced call is a logic error and trips an assertion
+// rather than dereferencing a NULL dict entry.
+static void decrementEntryInuse(dbEntry *de) {
+    dictEntry *entry = dictFind(inUseEntries, de);
+    serverAssert(entry != NULL);
+    if (dictGetSignedIntegerVal(entry) == 1) {
+        dictDelete(inUseEntries, de);
+        decrRefCount(de);
+    } else {
+        serverAssert(dictGetSignedIntegerVal(entry) > 1);
+        dictSetSignedIntegerVal(entry, dictGetSignedIntegerVal(entry) - 1);
+    }
+}
+
+// Returns true if `de` is tracked with an in-use count of exactly 1.
+// An entry that is not tracked at all is reported as false (instead of crashing on a NULL
+// lookup result).
+static bool isEntryInuseBySingleIterator(dbEntry *de) {
+    dictEntry *entry = dictFind(inUseEntries, de);
+    if (entry == NULL) return false;
+    return dictGetSignedIntegerVal(entry) == 1;
+}
+
+// Returns true if `de` is tracked in inUseEntries at all.
+static bool isEntryInuseByAnyIterator(dbEntry *de) {
+    return (dictFind(inUseEntries, de) != NULL);
+}
+
+
+// Approximate size of a string entry: key length plus value length.
+static ssize_t computeStringDbEntrySize(dbEntry *de) {
+    sds key = objectGetKey(de);
+    size_t valueSize = stringObjectLen(de);
+
+    return sdslen(key) + valueSize; // ignore the rest of the overhead, it's minor & transient
+}
+
+
+// Try to clone a small string entry so the original need not be kept in use.
+// Returns the clone (its size is charged to the clone memory pool), or NULL when the entry is
+// not eligible or the pool has no headroom for another maximum-size item.
+static dbEntry *tryCloneDbEntry(dbEntry *de) {
+    // Cloning disabled (e.g. bgIteration_unitTestDisableCloning() sets the limit to 0). Without
+    // this check a zero-byte entry (empty key and empty value) would still pass both size tests
+    // below and be cloned.
+    if (bgiter_max_clone_item_bytes <= 0) return NULL;
+
+    if (bgiteration_current_clone_memory_pool_size + bgiter_max_clone_item_bytes
+            > bgiter_max_clone_pool_bytes) {
+        return NULL;
+    }
+
+    // Future optimization: Incorporate small ziplists, sorted sets, etc.
+    // OBJ_ENCODING_INT is omitted only because there isn't a good API for cloning it yet.
+    if (de->type == OBJ_STRING && de->encoding != OBJ_ENCODING_INT) {
+        ssize_t itemSize = computeStringDbEntrySize(de);
+
+        if (itemSize <= bgiter_max_clone_item_bytes) {
+            bgiteration_current_clone_memory_pool_size += itemSize;
+            dbEntry *clone = createStringObjectWithKeyAndExpire((char *)objectGetVal(de), sdslen(objectGetVal(de)), objectGetKey(de), objectGetExpire(de));
+            // Carry over the iterator epoch stored in the entry's metadata.
+            ((bgIterationEntryMetadata *)objectGetMetadata(clone))->iterator_epoch
+                = ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch;
+            return clone;
+        }
+    }
+
+    return NULL;
+}
+
+
+// Release a clone produced by tryCloneDbEntry() and credit its size back to the pool.
+static void freeClonedDictEntry(dbEntry *clonedEntry) {
+    serverAssert(clonedEntry->type == OBJ_STRING);
+
+    // Add back to memory pool
+    bgiteration_current_clone_memory_pool_size -= computeStringDbEntrySize(clonedEntry);
+
+    decrRefCount(clonedEntry);
+}
+
+// Build a BGITERATOR_ITEM_DBENTRY item for `de`. Non-cloned entries are registered as in-use;
+// rehashing of the entry's hashtable (if it has one) is paused and recorded on the item.
+static bgIteratorItem * makeDbEntryItem(dbEntry *de, int dbid, bool isCloned) {
+    if (!isCloned) incrementEntryInuse(de);
+
+    bgIteratorItem *item = itemFreeList_getElementOrAllocate();
+    item->type = BGITERATOR_ITEM_DBENTRY;
+    item->dbid = dbid;
+    item->u.dbe.de = de;
+    item->u.dbe.is_cloned = isCloned;
+    item->u.dbe.is_rehashing_paused = pauseRehashing(de);
+
+    return item;
+}
+
+// Shallow-copy an argv array: a new array is allocated and every element's refcount is bumped.
+// Release with freeRobjArray().
+static robj ** cloneRobjArray(int argc, robj **argv) {
+    robj **newarray = zmalloc(sizeof(robj*) * argc);
+    for (int i = 0; i < argc; i++) {
+        newarray[i] = argv[i];
+        incrRefCount(argv[i]);
+    }
+    return newarray;
+}
+
+
+// Drop one reference on each element, then free the array itself.
+static void freeRobjArray(int argc, robj **argv) {
+    for (int i = 0; i < argc; i++) {
+        decrRefCount(argv[i]);
+    }
+    zfree(argv);
+}
+
+
+// Called by iterator thread to release an item.
+static void returnCurrentItemToValkey(bgIterator *it) { + bgIteratorItem *item = it->current_item; + if (item == NULL) return; + + switch (item->type) { + case BGITERATOR_ITEM_DBENTRY: + it->dbentries_processed++; + if (item->u.dbe.is_cloned) it->dbentry_clones_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_REPLICATION: + it->replication_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_SWAPDB: + it->swapdb_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + case BGITERATOR_ITEM_FLUSHDB: + it->flushdb_processed++; + mutexQueueAdd(it->return_to_valkey, item); + break; + + case BGITERATOR_ITEM_COMPLETE: + case BGITERATOR_ITEM_TERMINATED: + // These are static and just used to wake the iterator - they should never be returned. + serverAssert(false); + break; + + default: + serverAssert(false); + } + + // Do this AFTER placing into return_to_valkey. This is volatile and snooped when there is a + // flushall event. Don't want an item to be missed. 
+ it->current_item = NULL; +} + + + +//============================================================================================= +// Background Iterator (private) +//============================================================================================= + +static void bgIteratorRelease(bgIterator *it) { + serverAssert(onValkeyMainThread()); + serverAssert(it->current_item == NULL); + serverAssert(mutexQueueLength(it->items_for_iterator) == 0); + serverAssert(mutexQueueLength(it->return_to_valkey) == 0); + + dictDelete(nameToIterator, it->name); + listDelNode(allIterators, listSearchKey(allIterators, it)); + + mutexQueueRelease(it->items_for_iterator); + it->items_for_iterator = NULL; + + mutexQueueRelease(it->return_to_valkey); + it->return_to_valkey = NULL; + + it->keyset_iter->release(it->keyset_iter); + it->keyset_iter = NULL; + + dictRelease(it->early_iterate_entries); + it->early_iterate_entries = NULL; + + sdsfree(it->name); + zfree(it); +} + + +static bool shouldFeedIteratorMore(bgIterator *it) { + return (!it->completed + && !it->terminated + && mutexQueueLength(it->items_for_iterator) < it->item_count_target); +} + + +// Debugging routine +static sds createEntryString(int dbid, dbEntry *de) { + sds key = objectGetKey(de); + + sds entrySds = sdsempty(); + entrySds = sdscatprintf(entrySds, "(%d)'%s'", dbid, key); + if (de->type == OBJ_STRING) { + robj *o = getDecodedObject(de); // might be encoded as int + const unsigned valuePrintLen = 20; + entrySds = sdscatprintf(entrySds, " : '%.*s'", valuePrintLen, (char *)objectGetVal(o)); + if (sdslen((sds)objectGetVal(o)) > valuePrintLen) entrySds = sdscat(entrySds, "..."); + decrRefCount(o); + } else { + entrySds = sdscatprintf(entrySds, " : type(%d)", de->type); + } + return entrySds; +} + + +static void feedIterator(bgIterator *it, monotime end_time_us) { + // Smart logic to dynamically adjust the size of the queue + unsigned int initial_queue_len = mutexQueueLength(it->items_for_iterator); + + if 
(initial_queue_len > 2 && it->item_count_target >= initial_queue_len) { + it->item_count_target -= initial_queue_len / 2; + } + + // Now do some feeding + bool have_time = (getMonotonicUs() < end_time_us); + int timeCheckCounter = 0; + while (shouldFeedIteratorMore(it) && have_time) { + int orig_dbid, cur_dbid; + fifo *dbEntryFifo = it->keyset_iter->getEntries(it->keyset_iter, &orig_dbid, &cur_dbid); + + if (dbEntryFifo == NULL) { + // Iteration of items is complete for this iterator + serverAssert(it->dbentries_queued >= it->dbentries_processed); + serverAssert(it->replication_queued >= it->replication_processed); + serverAssert(it->swapdb_queued >= it->swapdb_processed); + serverAssert(it->flushdb_queued >= it->flushdb_processed); + serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); + + // Snapshot queue size to seed next iterator when terminated + last_item_count_target = it->item_count_target; + + if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { + if (!it->client_is_active || (it->dbentries_queued > it->dbentries_processed)) { + // We are done feeding dict entries to the iterator, but before ending the + // replication processing make sure that the iterator has become active (has + // started reading) and make sure that all of the dict entries have been processed + // by the client. + break; + } + if (it->repldone) { + bool clientWantsMoreReplication = (!it->repldone(it->privdata)); + if (clientWantsMoreReplication) break; + } + } + bgIteratorItem *completionItem = itemFreeList_getElementOrAllocate(); + *completionItem = (bgIteratorItem){ .type = BGITERATOR_ITEM_COMPLETE }; + if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { + rdbSaveInfo rsi; + completionItem->dbid = (rdbPopulateSaveInfo(&rsi)) ? 
rsi.repl_stream_db : 0; + completionItem->u.master_repl_offset = server.primary_repl_offset; + if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, "REPLDONE FN\n"); + } + } + + if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, "SENDING COMPLETE\n"); + } + + mutexQueueAdd(it->items_for_iterator, completionItem); + it->completed = true; + break; + } + + int dbid = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) ? orig_dbid : cur_dbid; + + fifo *itemsToAdd = fifoCreate(); + while (fifoLength(dbEntryFifo) > 0) { + dbEntry *de; + fifoPop(dbEntryFifo, (void **)&de); + + // Remove new/modified items during consistent iteration. + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT + && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch > it->consistent_modification_id) { + continue; + } + + // Remove any items which have been processed early + if (dictFind(it->early_iterate_entries, de) != NULL) { + dictDelete(it->early_iterate_entries, de); + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, de); + debugBuffer = sdscatprintf(debugBuffer, "SKIPPING ITEM(early iterate): %s\n", entryString); + sdsfree(entryString); + } + continue; + } + + // For items which are left, convert them from dbEntry to iteratorItem + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, de); + debugBuffer = sdscatprintf(debugBuffer, "ITEM: %s\n", entryString); + sdsfree(entryString); + } + + bgIteratorItem *item = makeDbEntryItem(de, dbid, false); + + fifoPush(itemsToAdd, item); + } + fifoRelease(dbEntryFifo); + + if (fifoLength(itemsToAdd) > 0) { + it->dbentries_queued += fifoLength(itemsToAdd); + mutexQueueAddMultiple(it->items_for_iterator, itemsToAdd); + } + fifoRelease(itemsToAdd); + + // This is a predictably fast loop. We don't need to check the time on every pass. 
+ if (++timeCheckCounter % 32 == 0) { + have_time = (getMonotonicUs() < end_time_us); + } + } + + // Smart logic to dynamically adjust the size of the queue + if (initial_queue_len == 0 && have_time) { + it->item_count_target += BGITER_QUEUE_INCREASE_INCR; + } +} + + +static bool addEarlyIterationKey(bgIterator *it, dbEntry *earlyEntry, int cur_dbid) { + int rc = dictAdd(it->early_iterate_entries, earlyEntry, NULL); + serverAssert(rc == DICT_OK); + + int dbid = (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) + ? it->keyset_iter->originalDb(it->keyset_iter, cur_dbid) + : cur_dbid; + + dbEntry *cloneEntry = tryCloneDbEntry(earlyEntry); + bool isClonedEntry = (cloneEntry != NULL); + bgIteratorItem *item = makeDbEntryItem(isClonedEntry ? cloneEntry : earlyEntry, dbid, isClonedEntry); + + it->dbentries_queued++; + if (isClonedEntry) it->dbentry_clones_queued++; + + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { // JHB - can we optimize here in cluster mode (no swap) + // On consistent iteration, SWAPDB events are not provided. So there is no requirement to + // keep items in order or synchronized with SWAPDB. + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, item->u.dbe.de); + debugBuffer = sdscatprintf(debugBuffer, "EARLY_1: %s\n", entryString); + sdsfree(entryString); + } + mutexQueuePushPriority(it->items_for_iterator, item); + } else { + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, item->u.dbe.de); + debugBuffer = sdscatprintf(debugBuffer, "EARLY: %s\n", entryString); + sdsfree(entryString); + } + mutexQueueAdd(it->items_for_iterator, item); + } + return !isClonedEntry; // Block if the entry will be used by the background thread +} + + +// This expedites a single key and doesn't attempt to avoid expediting through optimization. 
+static bool expediteSingleKeyWithoutOptimization( + bgIterator *it, + int dbid, + robj *oKey, + dict *waitingOnKeys) { + + bool mustBlock = false; + + bool iterComplete = it->completed || it->terminated; + + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de != NULL) { + if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) + && (dictFind(it->early_iterate_entries, de) == NULL)) { + if (addEarlyIterationKey(it, de, dbid)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } else { + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } + } + + return mustBlock; +} + + +// MOVE/COPY are unfortunate special commands. They work on 2 DBs at once. +const int MOVE_COMMAND_DBID_ARG_INDEX = 2; +static bool expediteKeysForMove( + bgIterator *it, + int dbid, + int argc, + robj **argv, + dict *waitingOnKeys) { + if (argc <= MOVE_COMMAND_DBID_ARG_INDEX) return false; + + int destDbid; + if (!getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &destDbid)) return false; + + bool mustBlock = false; + robj *key = argv[1]; + + // Not looking for special cases to optimize here. Just try to expedite both src and dest + // keys. Note that the dest key might exist (and need iteration) but could be expired and + // could be overwritten by MOVE. In this case, a DEL would replicate due to the expiry. So + // even if the target is expired, we need to replicate it before executing the command. + if (expediteSingleKeyWithoutOptimization(it, dbid, key, waitingOnKeys)) mustBlock = true; + if (expediteSingleKeyWithoutOptimization(it, destDbid, key, waitingOnKeys)) mustBlock = true; + + it->cur_cmd_may_replicate = true; + return mustBlock; +} + + +// MOVE/COPY are unfortunate special commands. They work on 2 DBs at once. 
+static bool expediteKeysForCopy( + bgIterator *it, + int dbid, + int argc, + robj **argv, + dict *waitingOnKeys) { + + int destDbid; + if (!getTargetDbIdForCopyCommand(argc, argv, dbid, &destDbid)) return false; + + bool mustBlock = false; + robj *srcKey = argv[1]; + robj *destKey = argv[2]; + + // Not trying to optimize COPY. Just expedite source and destination (if it exists). We + // don't really care if the value is overwritten or not (so no need to parse REPLACE option). + if (expediteSingleKeyWithoutOptimization(it, dbid, srcKey, waitingOnKeys)) mustBlock = true; + if (expediteSingleKeyWithoutOptimization(it, destDbid, destKey, waitingOnKeys)) mustBlock = true; + + it->cur_cmd_may_replicate = true; + return mustBlock; +} + + +/* There are several cases where a client must be blocked on write operations. (Clients never need + * to be blocked for read operations.) + * + * Note: An Amazon extension to the Valkey command structure allows us to identify commands where + * the first key is for write and the rest are for read. This allows us to make the + * following optimizations: + * - for keys which are read only, there's no need to block if the key is in-use by an iterator + * - without replication, there's no need to immediately queue read keys on a consistent iteration + * + * Iterator: CONSISTENT = NO, REPLICATION = NO + * - Block if any write-key is in use by an the iterator + * + * Iterator: CONSISTENT = NO, REPLICATION = YES + * - Block if any write-key is in use by an the iterator + * - If ANY key has already been iterated (but some keys have not), then + * - Block and immediately queue any key (read or write) that has not + * already been iterated + * Example: SDIFFSTORE KEY_A KEY_B KEY_C + * In this case, KEY_A is written, KEY_B and KEY_C are read. If KEY_A has already been + * iterated over, the replication stream will contain this command. The receiver of this + * replication will need KEY_B and KEY_C in order to process the replication stream. 
So + * these need to be iterated and the client blocked. + * + * Iterator: CONSISTENT = YES, REPLICATION = NO + * - Block if any write-key is in use by an the iterator + * - Block and immediately queue any WRITE-key that has not already been iterated + * + * Iterator: CONSISTENT = YES, REPLICATION = YES + * (Combination only valid in cluster mode - no SWAPDB possible) + * - Block if any write-key is in use by an the iterator + * - Block and immediately queue any key (read or write) that has not already been iterated + */ +static bool expediteKeysForWrite( + bgIterator *it, + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv, + keyReference *keyrefs, + int numKeys, + dict *waitingOnKeys) { + serverAssert(numKeys > 0); + + bool mustBlock = false; + + // All keys of the command should either be in scope or not since in cluster mode enabled they + // should all be in the same slot. So we just check the first key. + robj *oKey = argv[keyrefs[0].pos]; + sds key = objectGetVal(oKey); + // If it's not in the iteration scope for the current iterator, then we don't need to do + // anything with this command. + if (!it->keyset_iter->isKeyInScope(it->keyset_iter, key)) return false; + + // Note: performance optimization for commands which only modify the first key. If this flag + // is not available, we can safely remove this `if` statement. + if ((cmd->flags & CMD_WRITE_FIRSTKEY_ONLY) + && !(it->iteration_flags & BGITERATOR_FLAG_REPLICATION)) { + // If this write command only modifies the 1st key, we don't need to expedite others + // unless replication enabled. 
+ numKeys = 1; + } + + if (cmd->proc == moveCommand) { + // Unfortunate special case for MOVE + return expediteKeysForMove(it, dbid, argc, argv, waitingOnKeys); + } + + if (cmd->proc == copyCommand) { + // Similar special case for COPY + return expediteKeysForCopy(it, dbid, argc, argv, waitingOnKeys); + } + + bool iterComplete = it->completed || it->terminated; + + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { + // CONSISTENT = YES, REPLICATION = YES / NO + for (int i = 0; i < numKeys; i++) { + robj *oKey = argv[keyrefs[i].pos]; + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de == NULL) continue; // New key, no need to expedite + if (!(iterComplete || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) + && dictFind(it->early_iterate_entries, de) == NULL + && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { + if (addEarlyIterationKey(it, de, dbid)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } else { + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } + } + it->cur_cmd_may_replicate = true; // Will replicate only if replication enabled + } else { + // Identification of missing keys is only needed for non-consistent iteration. This only + // needs to be collected once (on the 1st non-consistent iteration) + bool collectMissing = (listLength(curCmdMissingKeys) == 0); + + if (it->iteration_flags & BGITERATOR_FLAG_REPLICATION) { + // CONSISTENT = NO, REPLICATION = YES + bool someIterated = false; + // dict containing the keys that have not been iterated yet. + // Using a dict dedupes the keys in case the command contains duplicated keys. 
+ dict *notIteratedKeys = dictCreate(&dictEntryPtrDictType); // dict of dbEntry* -> robj* + + for (int i = 0; i < numKeys; i++) { + robj *oKey = argv[keyrefs[i].pos]; + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de == NULL) { + if (collectMissing) { + incrRefCount(oKey); + listAddNodeHead(curCmdMissingKeys, oKey); + } + continue; + } + if (iterComplete + || it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) + || (dictFind(it->early_iterate_entries, de) != NULL)) { + someIterated = true; + } else { + dictAdd(notIteratedKeys, de, oKey); + } + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } + + // Since missing keys are considered as already iterated, if there are any missing keys + // we must consider that some keys have been iterated, and make sure all other keys + // will be expedited if needed. + if (listLength(curCmdMissingKeys) > 0) someIterated = true; + + // This command may be executing as part of a larger transaction. If some parts of the + // transaction have already been identified to replicate, we must wait on all keys and + // replicate here as well. (Take care not to set cur_cmd_may_replicate to false.) + if (someIterated) { + if (server.in_exec) { + // We are now executing the commands in a multi-exec block. + // + // Regarding MULTI/EXEC: Remember that this code is executed twice for commands + // within a MULTI/EXEC block. First, we parse all the commands when deciding + // if the EXEC should be blocked. Then, as each command is executed, it's + // re-parsed so that we can maintain the early iterated list as the commands + // execute. In this second pass, as each command is executed, we can't change + // the replication decision which was made earlier (when the EXEC was processed). + // We don't want to get tricked (by a key being removed and recreated) into + // into starting to replicate in the middle of a MULTI/EXEC block. 
+ } else { + it->cur_cmd_may_replicate = true; + } + } + if (it->cur_cmd_may_replicate) { + dictEntry *de; + dictIterator *di = dictGetIterator(notIteratedKeys); + while ((de = dictNext(di)) != NULL) { + dbEntry *notIteratedEntry = dictGetKey(de); + robj *oKey = dictGetVal(de); + + if (addEarlyIterationKey(it, notIteratedEntry, dbid)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } + dictReleaseIterator(di); + } + dictRelease(notIteratedKeys); + } else { + // CONSISTENT = NO, REPLICATION = NO + for (int i = 0; i < numKeys; i++) { + robj *oKey = argv[keyrefs[i].pos]; + sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], key); + if (de == NULL) { + if (collectMissing) { + incrRefCount(oKey); + listAddNodeHead(curCmdMissingKeys, oKey); + } + continue; + } + if (isEntryInuseByAnyIterator(de)) { + mustBlock = true; + dictAdd(waitingOnKeys, oKey, NULL); + } + } + } + } + + return mustBlock; +} + + +// Called when an iterator is terminated. Pulls everything out of the queue +// and returns the items to Valkey (before they hit the iterator). +static void returnAllItemsToValkey(bgIterator *it) { + serverAssert(onValkeyMainThread()); + + fifo *poppedFifo = mutexQueuePopAll(it->items_for_iterator, false); + if (poppedFifo == NULL) return; // Nothing to return + + // Release non-dictentry items first... 
+ fifo *itemsToReturn = fifoCreate(); + while (fifoLength(poppedFifo) > 0) { + bgIteratorItem *item; + fifoPop(poppedFifo, (void **)&item); + switch (item->type) { + // back out the "queued" statistic + case BGITERATOR_ITEM_DBENTRY: + it->dbentries_queued--; + if (item->u.dbe.is_cloned) it->dbentry_clones_queued--; + break; + case BGITERATOR_ITEM_REPLICATION: + it->replication_queued--; + break; + case BGITERATOR_ITEM_SWAPDB: + it->swapdb_queued--; + break; + case BGITERATOR_ITEM_FLUSHDB: + it->flushdb_queued--; + break; + + case BGITERATOR_ITEM_COMPLETE: + // This can only happen if the completion item has been enqueued and + // the iterator is terminated before reaching the completion item. + itemFreeList_returnItemBackToFreeList(item); + continue; // Skip pushing this onto itemsToReturn + + case BGITERATOR_ITEM_TERMINATED: + // This can only happen if there is a race when terminating between + // the iteration client and main thread. + itemFreeList_returnItemBackToFreeList(item); + continue; // Skip pushing this onto itemsToReturn + + default: + serverAssert(false); + } + + fifoPush(itemsToReturn, item); + } + fifoRelease(poppedFifo); + + // Now release items all at once... 
+ if (fifoLength(itemsToReturn) > 0) { + mutexQueueAddMultiple(it->return_to_valkey, itemsToReturn); + } + fifoRelease(itemsToReturn); +} + + + +//============================================================================================= +// Foreground support functions (private) +//============================================================================================= + +static size_t replicationItemSize(bgIteratorItem *item) { + serverAssert(item->type == BGITERATOR_ITEM_REPLICATION); + size_t itemSize = sizeof(bgIteratorItem); + for (int i = 0; i < item->u.repl.argc; i++) { + itemSize += objectComputeSize(NULL, item->u.repl.argv[i], 0, 0); + } + return itemSize; +} + +static void processReturnOfItemToValkey(bgIteratorItem *item, bgIterator *iter) { + serverAssert(onValkeyMainThread()); + switch ((int)item->type) { + case BGITERATOR_ITEM_REPLICATION: + bufferedReplicationBytes -= replicationItemSize(item); + freeRobjArray(item->u.repl.argc, item->u.repl.argv); + break; + + case BGITERATOR_ITEM_DBENTRY: + { + if (item->u.dbe.is_cloned) { + freeClonedDictEntry(item->u.dbe.de); + } else { + if (isEntryInuseBySingleIterator(item->u.dbe.de)) { + // This blocking mechanism isn't the best. Written for slot-migration, + // it assumes a single DB so if the same key appears in multiple DBs, + // commands might get unblocked only to get blocked again. (This would + // happen only rarely, and with minimal impact.) 
+ robj *key = createStringObjectFromSds(objectGetKey(item->u.dbe.de)); + unblockClientsInUseOnKey(key); + decrRefCount(key); + } + // resumeRehashing must be called before decrementEntryInuse, since decrementEntryInuse can free + if (item->u.dbe.is_rehashing_paused) resumeRehashing(item->u.dbe.de); + decrementEntryInuse(item->u.dbe.de); + } + } + break; + + case BGITERATOR_ITEM_SWAPDB: + case BGITERATOR_ITEM_FLUSHDB: + break; + + case BGITERATOR_ITEMEXT_ITER_CLOSED: + { + bgIterator *it = ((bgIteratorItemExtClose*)item)->iter; + serverAssert(it == iter); + if (it->terminated) { + // Abnormal termination + // Normally the item is TERMINATED, but might be COMPLETE in race + serverAssert(it->current_item->type == BGITERATOR_ITEM_TERMINATED + || it->current_item->type == BGITERATOR_ITEM_COMPLETE); + // Release any items stranded on the iterator after early termination + returnAllItemsToValkey(it); + receiveItemsBackFromOneIterator(it); + } else { + // Normal completion + serverAssert(it->current_item->type == BGITERATOR_ITEM_COMPLETE); + } + serverAssert(mutexQueueLength(it->items_for_iterator) == 0); + serverAssert(it->dbentries_queued == it->dbentries_processed); + serverAssert(it->replication_queued == it->replication_processed); + serverAssert(it->swapdb_queued == it->swapdb_processed); + serverAssert(it->flushdb_queued == it->flushdb_processed); + serverAssert(it->dbentry_clones_queued >= it->dbentry_clones_processed); + + listEmpty(curCmdMissingKeys); // Just in case any remain + + itemFreeList_returnItemBackToFreeList(it->current_item); + it->current_item = NULL; + + bool terminated = it->terminated; + void *privdata = it->privdata; + bgIteratorCleanupFunc cleanup = it->cleanup; + bgIteratorRelease(it); // Fully release the iterator before calling cleanup + + if (BGITERATION_DEBUG) { + if (cleanup) debugBuffer = sdscatprintf(debugBuffer, "CLEANUP FN (%s)\n", + (terminated) ? 
"terminated" : "success"); + + sds filename = sdscatprintf(sdsempty(), "bgiteration_debug.%d", getpid()); + FILE *f = fopen(filename, "w"); + sdsfree(filename); + + fputs(debugBuffer, f); + + fclose(f); + sdsfree(debugBuffer); + debugBuffer = sdsempty(); + } + + if (cleanup) cleanup(terminated, privdata); + } + break; + + default: + serverAssert(false); // Not expecting any other type of item! + } + + // We don't allocate extension items from the pool so we manually free them + if((int)item->type == BGITERATOR_ITEMEXT_ITER_CLOSED) { + zfree(item); + } else { + itemFreeList_returnItemBackToFreeList(item); + } +} + +static void prepareAndProcessReturnedItems(int n, bgIteratorItem **items, bgIterator *iter) { + int i = 0; + for (i = 0; i < n; i++) valkey_prefetch(items[i]); + for (i = 0; i < n; i++) { + if (items[i]->type != BGITERATOR_ITEM_DBENTRY) continue; + // Prefetch can have a significant perf hit on NULL + // but we never expect items[i]->u.dbe.de to be NULL + valkey_prefetch(items[i]->u.dbe.de); + } + for (i = 0; i < n; i++) { + if (items[i]->type != BGITERATOR_ITEM_DBENTRY) continue; + // Same as above, assume key is never NULL + valkey_prefetch(objectGetKey(items[i]->u.dbe.de)); + } + for (i = 0; i < n; i++) processReturnOfItemToValkey(items[i], iter); +} + +#define PREFETCH_BATCH_SIZE 16 + +static bool receiveItemsBackFromOneIterator(bgIterator *it) { + bgIteratorItem* batchPool[PREFETCH_BATCH_SIZE]; + int n = 0; + // Returns true if we process at least one item from + // a given iterator's return_to_valkey queue, false otherwise. 
+ fifo *poppedFifo = mutexQueuePopAll(it->return_to_valkey, false); + if (poppedFifo != NULL) { + while (fifoLength(poppedFifo) > 0) { + fifoPop(poppedFifo, (void **)&batchPool[n++]); + if (n == PREFETCH_BATCH_SIZE) { + prepareAndProcessReturnedItems(n, batchPool, it); + n = 0; + } + } + if (n > 0) { + prepareAndProcessReturnedItems(n, batchPool, it); + } + fifoRelease(poppedFifo); + return true; + } + return false; +} + +static void receiveItemsBackFromIterators(bool blocking) { + // Process each iterator's return_to_valkey queue + // If `blocking` is true, continue reading until + // at least one queue was not empty. + serverAssert(onValkeyMainThread()); + listIter li; + listNode *node; + bool processedItems = false; + do { + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + processedItems |= receiveItemsBackFromOneIterator(it); + } + if (blocking) usleep(100); // Sleep for 1ms and re-try processing iterators + } while (blocking && !processedItems); +} + + +static long long bgIteration_feedIterators_task( + struct aeEventLoop *eventLoop, + long long id, + void *clientData) { + UNUSED(eventLoop); + UNUSED(id); + UNUSED(clientData); + serverAssert(onValkeyMainThread()); + + static monotime lastFeedEndTime; // STATIC: Persists For checking starvation + monotime startTime = getMonotonicUs(); + + if (!bgIteration_iterationActive()) { + // No more iterators exist. Self-check, and terminate the "feed" task. 
+ serverAssert(dictSize(nameToIterator) == 0); + serverAssert(dictSize(inUseEntries) == 0); + serverAssert(bufferedReplicationBytes == 0); + + // Shrink dict back to zero (doesn't normally shrink) + dictRelease(inUseEntries); + inUseEntries = dictCreate(&dictEntryPtrDictType); + + itemFreeList_release(); + + bgIterator_timeproc_id = AE_DELETED_EVENT_ID; + lastFeedEndTime = 0; + return AE_NOMORE; + } + + long dutyTimeUs = BGITER_CYCLE_BUDGET_MS * 1000; + if (lastFeedEndTime > 0) { + // If the timer was delayed, compute the proportional time we should have had, and increase + // the duty cycle to compensate (up to a limit). + long starvationUs = (startTime - lastFeedEndTime) - BGITER_CYCLE_DELAY_MS * 1000; + if (starvationUs > 0) { + long starvationCompensationUs = starvationUs * BGITER_CYCLE_BUDGET_MS + / (BGITER_CYCLE_BUDGET_MS + BGITER_CYCLE_DELAY_MS); + dutyTimeUs += starvationCompensationUs; + dutyTimeUs = MIN(dutyTimeUs, BGITER_CYCLE_BUDGET_MAX_MS * 1000); + } + } + monotime endTime = startTime + dutyTimeUs; + + // Run this part regardless of time limit... + receiveItemsBackFromIterators(false); + + // Feeding iterators (below) respects endTime. The stuff above always runs to completion. + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL && getMonotonicUs() < endTime) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) continue; + feedIterator(it, endTime); + } + + lastFeedEndTime = getMonotonicUs(); + return BGITER_CYCLE_DELAY_MS; +} + + +// Not static, but not API. Intended for unit tests where the event loop may not be active. +void bgIteration_feedIterators(void) { + // For unit testing, force the item_count_target to 1 in each call. This ensures that we only + // feed a minimal amount to the iterators rather than a non-deterministic amount. 
+ listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + it->item_count_target = 1; + } + + // Invoke the feeding task (normally invoked by timer). + bgIteration_feedIterators_task(NULL, 0, NULL); +} + + +static void resetReplicationFlagForIterators(client *c) { + // For any given command, the command may or may not need to be replicated based on the status + // and flags of each iterator. Furthermore, if a command does need to be replicated, this + // replication must occur for an entire atomic unit; we can't replicate only part of a script + // or multi/exec. + // This function is the only place where the replication flag is cleared. + + if (c->flag.multi || c->flag.script) { + // REGARDING MULTI/EXEC + // -------------------- + // When processing a MULTI/EXEC, blockClientIfRequired is called first for the MULTI. Then, + // all of the commands are queued up in server.c:processCommand(). It's only when EXEC is + // encountered, that server.c:call() is fired to begin execution. + // AFTER the EXEC is processed by call(), then each of the commands in the MULTI/EXEC block + // will be processed through call(). + // If write commands are present, MULTI & EXEC will be passed to the replication stream + // before/after the transaction commands. Note that MULTI & EXEC are not actually + // "executed" at the time when their replication is passed to the replication stream. + // + // Example: MULTI; SET A B; EXEC + // 1. blockClientIfRequired() called for MULTI. MULTI flag IS NOT set. (Won't block.) + // 2. blockClientIfRequired() called for EXEC. MULTI flag IS set. (Might block.) + // 3. blockClientIfRequired() called for SET. MULTI flag IS set. (Won't block.) + // 4. handleCommandReplication() is called for MULTI. + // 5. handleCommandReplication() is called for SET. + // 6. handleCommandReplication() is called for EXEC. 
+ // + // SO - if the MULTI flag is set, we DON'T clear the flag. It should only be cleared at the + // start of the transaction, when MULTI is received - and the flag isn't set yet. + + // REGARDING SCRIPTS + // ----------------- + // When processing a script, blockClientIfRequired is called first for the EVAL/EVALSHA/FCALL. + // Then, all of the commands are processed using a special script client. The script + // client has the CLIENT_SCRIPT flag set. For scripts, the replication flag is set when + // processing the EVAL/EVALSHA/FCALL and should not be cleared when executing individual + // commands in the script. + + // If it's the EXEC command, we fall through and clear the flag below. But for all other + // commands within the transaction, we don't clear the flag. + if (c->cmd->proc != execCommand) return; + } + + // For most commands, the replication flag is cleared and we determine if replication is needed + // based on the keys being used and their state in each iterator. If a modified key hasn't been + // processed yet, there's no need to expedite the key or send the replication. The key will be + // sent later, when reached by the iterator. + // However, for scripts, it is not possible to perform this optimization. There is no way to + // know if an undeclared key might be modified. Since the entire script needs to be replicated + // (or not replicated) atomically, we can't take the chance that an undeclared key might be + // hit which requires replication. + bool isScript = isScriptCallWriteCmd(c->cmd); + + getKeysResult result; + initGetKeysResult(&result); + getKeysFromCommand(c->cmd, c->argv, c->argc, &result); + + // [sm-bgiterator] TODO: ELMO-108525, This assumes all keys are in the same slot, should consider cross-slot script case. + sds check_key = (result.numkeys > 0) ? 
objectGetVal(c->argv[result.keys[0].pos]) : NULL; + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) { + it->cur_cmd_may_replicate = false; + } else { + // Set initial state of the replication flag for this transaction + // For full scan iterators, write commands within scripts must always be replicated. + // For cluster slot iterators, replication of script write commands depends on whether + // the key is in scope of the current iterator. + it->cur_cmd_may_replicate = isScript && it->keyset_iter->isKeyInScope(it->keyset_iter, check_key); + } + } + getKeysFreeResult(&result); +} + + +static void handleSwapdb(int db1, int db2) { + serverAssert(onValkeyMainThread()); + serverAssert(bgIteration_iterationActive()); + serverAssert(!server.cluster_enabled); + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) continue; + + // Let the iterator internal mechanism know + it->keyset_iter->swapDb(it->keyset_iter, db1, db2); + + // Let the background client know + if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) { + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "SWAP: %d %d\n", db1, db2); + } + + bgIteratorItem *item = itemFreeList_getElementOrAllocate(); + item->type = BGITERATOR_ITEM_SWAPDB; + item->dbid = db1; + item->u.dbid2 = db2; + it->swapdb_queued++; + mutexQueueAdd(it->items_for_iterator, item); + } + } +} + + +static void removePtrFromEarlyIterate(dbEntry *de) { + // If the item is being released, let's get the pointer out of our early_iterate_entries. + // Note that this is not strictly necessary, but it frees some memory and keeps the + // dictionary small. 
+ listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + dictDelete(it->early_iterate_entries, de); // just try delete (might not be here) + } +} + + +static int findDbForEntry(dbEntry *de) { + for (int i = 0; i < server.dbnum; i++) { + if (dbFind(server.db[i], objectGetKey(de)) == de) return i; + } + serverAssert(false); // the entry MUST be in one of the DBs +} + + +static void terminateIteratorForFlush(bgIterator *it, int dbid) { + if (!it->terminated) bgIteratorTerminate(it); + + // Snoop on the iterator. There might be 1 item still being processed. If that item is in the + // DB being flushed, the item is removed from the dict and held for deferred deletion. This + // allows the iterator to complete processing on the current item without the item being + // deleted unexpectedly. + // Since this is running in parallel with a background thread, the results are volatile. This + // is OK as when the iterator completes processing the item, it still won't have been accepted + // back to Valkey yet, meaning the item will still be in inUseEntries. + bgIteratorItem *item = it->current_item; + if (item && item->type == BGITERATOR_ITEM_DBENTRY) { + dbEntry *de = item->u.dbe.de; + int deDb = findDbForEntry(de); + if (dbid == -1 || dbid == deDb) { + removePtrFromEarlyIterate(de); + } + } +} + + +static void preserveIteratorItemsForFlush(bgIterator *it, int dbid) { + serverAssert(onValkeyMainThread()); + serverAssert(!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)); + serverAssert(dbid >= 0); + // Since this is not a consistent iteration, it's OK if the early_iterate_entries contains + // pointers to items being deleted. The item is not actually accessed from the pointer. And + // if the pointer gets reused for a new item, there's no guarantee that we would iterate it + // anyway. 
If replication is enabled, both new items and early_iterate_entries are treated the + // same (replication is processed). So this is safe in all cases. + // Given this, we will just worry about preserving items in the iterator's processing queue. + // Because of commands like SWAPDB and MOVE, there's no attempt to remove unnecessary items + // from the queue. This is also safer to future Valkey extensions. + + // Temporarily yank all items from the iterator's queue + fifo *poppedFifo = mutexQueuePopAll(it->items_for_iterator, false); + if (poppedFifo != NULL) { + fifo *readdFifo = fifoCreate(); + while(fifoLength(poppedFifo) > 0) { + bgIteratorItem *item; + fifoPop(poppedFifo, (void **)&item); + if (item->type == BGITERATOR_ITEM_DBENTRY) { + dbEntry *de = item->u.dbe.de; + if (dbFind(server.db[dbid], objectGetKey(de)) == de) { + // Found the entry in the DB about to be flushed + removePtrFromEarlyIterate(de); + } + } + fifoPush(readdFifo, item); + } + fifoRelease(poppedFifo); + + // Now give the list back to the iterator + mutexQueueAddMultiple(it->items_for_iterator, readdFifo); + fifoRelease(readdFifo); + } + + // And snoop on the active item. Even if the background task finishes with this item as we look + // at it, the item can't have been returned to Valkey yet. + bgIteratorItem *item = it->current_item; + if (item && item->type == BGITERATOR_ITEM_DBENTRY) { + dbEntry *de = item->u.dbe.de; + if (dbFind(server.db[dbid], objectGetKey(de)) == de) { + // Found the entry in the DB about to be flushed + removePtrFromEarlyIterate(de); + } + } +} + + +static bool isDbSignificant(int dbid) { + unsigned long long totalKeys = 0; + for (int i = 0; i < server.dbnum; i++) { + totalKeys += (server.db[i]) ? dbSize(server.db[i]) : 0; + } + return (server.db[dbid]) ? (dbSize(server.db[dbid]) > totalKeys / 2) : false; +} + + +static void handleFlushdb(int dbid) { + // Invoked BEFORE the actual flush. -1 indicates FLUSHALL. 
+ bool should_abort_iterators = server.cluster_enabled || dbid == -1 || isDbSignificant(dbid); + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + + if (should_abort_iterators || it->iteration_flags & BGITERATOR_FLAG_CONSISTENT) { + terminateIteratorForFlush(it, dbid); + } else { + // In this (limited) case, we're only flushing a single DB that contains < half the + // keys. We don't want to kill a full-sync replication. We will just continue with + // iteration, knowing that a replication client will also receive the FLUSHDB on the + // replication stream. + // It would be nice to do this with consistent snapshot also, but given that this is a + // very rare condition, development is not justified to save off the DB for deferred + // delete. This would add a lot of complexity as well as memory implications. + preserveIteratorItemsForFlush(it, dbid); + it->keyset_iter->flushDb(it->keyset_iter, dbid); + + // Send a flushdb event to notify the client + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "FLUSH: %d\n", dbid); + } + + bgIteratorItem *item = itemFreeList_getElementOrAllocate(); + item->type = BGITERATOR_ITEM_FLUSHDB; + item->dbid = dbid; + it->flushdb_queued++; + mutexQueueAdd(it->items_for_iterator, item); + } + } + receiveItemsBackFromIterators(false); // Receive items back before flushing the items +} + + +static bool expediteKeysForWriteOnAllIterators( + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv, + keyReference *keyrefs, + int numKeys, + dict *waitingOnKeys) { + bool mustBlock = false; + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (expediteKeysForWrite(it, dbid, cmd, argc, argv, keyrefs, numKeys, waitingOnKeys)) + mustBlock = true; + } + + return mustBlock; +} + + +static bool 
anIteratorWillReplicateForThisCommand(void) { + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->cur_cmd_may_replicate) return true; + } + return false; +} + + +static bool expediteKeysForMultiExec(client *c, dict *waitingOnKeys) { + serverAssert(c->cmd->proc == execCommand); + + /* For MULTI/EXEC, Valkey buffers all of the commands until hitting the EXEC. + * At this point, the client holds all of the commands to be executed. This function searches + * for all of the keys used by any of the buffered write commands. In addition, if SWAPDB or + * SELECT is used, this tracks the DBIDs through various swap/select operations. + */ + + /* There's a special concern for a NON-consistent iteration with replication. If the keys are + * all "future" keys (which haven't been processed by the iterator yet), then we don't expedite + * the keys or replicate. However, if some keys have already been processed, we need to + * expedite the remaining keys and replicate everything. + * + * When processing a single command, this is all handled. But in this function, for MULTI/EXEC, + * we process 1 command at a time. There's an issue if the first command modifies a "future" + * key, we don't know (without reading ahead) if a later command will modify a prior key. This + * would require the future key to be expedited. + * + * This COULD be addressed by collecting all of the keys into a single structure and then + * analyzing them all at once. However, this won't share code well with the single commands. + * Also, building this structure is a little complex/time-consuming as we need to track both + * key AND dictID. One way to do this might be with a dict of dicts, where the first dict maps + * a dictID to a dict of keys. + * + * ALTERNATIVELY (and it's the simpler approach that's taken here) we can just check if the + * MULTI will be replicated. 
If so, we re-process the MULTI, just in case there were commands + * prior to deciding that replication was required that might have missed expediting. If so, + * these will be caught on the 2nd time around. + * + * Checking replication status before/after ensures that there can only be a single recursive + * call. + */ + bool initiallyAnIteratorWillReplicate = anIteratorWillReplicateForThisCommand(); + + bool mustBlock = false; + int *cur_to_orig_db = NULL; + + int curDb = c->db->id; + for (int cmdNum = 0; cmdNum < c->mstate->count; cmdNum++) { + struct serverCommand *cmd = c->mstate->commands[cmdNum].cmd; + robj **argv = c->mstate->commands[cmdNum].argv; + int argc = c->mstate->commands[cmdNum].argc; + + if (cmd->proc == swapdbCommand) { + int id1, id2; + if (getParamsForSwapdb(argc, argv, c, &id1, &id2)) { + if (cur_to_orig_db == NULL) { + cur_to_orig_db = zmalloc(sizeof(int) * server.dbnum); + for (int i = 0; i < server.dbnum; i++) cur_to_orig_db[i] = i; + } + int temp = cur_to_orig_db[id1]; + cur_to_orig_db[id1] = cur_to_orig_db[id2]; + cur_to_orig_db[id2] = temp; + } + continue; + } + + if (cmd->proc == selectCommand) { + int id; + if (getParamsForSelect(argc, argv, c, &id)) { + curDb = id; + } + continue; + } + + if (!isWriteCmd(cmd)) continue; + + getKeysResult result; + initGetKeysResult(&result); + int numkeys = getKeysFromCommand(cmd, argv, argc, &result); + keyReference *keyrefs = result.keys; + if (numkeys == 0) continue; // Write command with no keys - like FLUSHDB + + if (expediteKeysForWriteOnAllIterators( + cur_to_orig_db ? cur_to_orig_db[curDb] : curDb, + cmd, argc, argv, keyrefs, numkeys, waitingOnKeys)) { + mustBlock = true; + } + getKeysFreeResult(&result); + } + + zfree(cur_to_orig_db); + + if (!initiallyAnIteratorWillReplicate && anIteratorWillReplicateForThisCommand()) { + // We've decided to replicate. Re-process the MULTI/EXEC just once more to make sure that + // we didn't miss any keys at the beginning. 
This can't continue to recurse because + // `initiallyAnIteratorWillReplicate` will be TRUE in the recursive call. Note that the + // recursive call may add additional entries to `waitingOnKeys`. + if (expediteKeysForMultiExec(c, waitingOnKeys)) mustBlock = true; + } + + return mustBlock; +} + +static bgIterator * bgIteratorCreate( + const char *name, + int flags, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata, + bgIterationType iter_type, + genericIterator *keyset_iter) { + serverAssert(onValkeyMainThread()); + serverAssert(server.cluster_enabled || iter_type == BGITERATION_TYPE_FULLSCAN); + serverAssert(server.cluster_enabled // Don't allow CONSISTENT & REPLICATION + || !(flags & BGITERATOR_FLAG_CONSISTENT) // unless cluster mode (avoids + || !(flags & BGITERATOR_FLAG_REPLICATION)); // complications with SWAPDB & FLUSHDB) + + bgIterator *it = zmalloc(sizeof(bgIterator)); + it->name = sdsnew(name); + it->repldone = repldone; + it->cleanup = cleanup; + it->privdata = privdata; + it->items_for_iterator = mutexQueueCreate(); + it->return_to_valkey = mutexQueueCreate(); + + // Floor queue size to bgiteration_queue_increase_incr or use last queue size value + if (last_item_count_target < BGITER_QUEUE_INCREASE_INCR) { + last_item_count_target = BGITER_QUEUE_INCREASE_INCR; + } + it->item_count_target = last_item_count_target; + it->iteration_flags = flags; + it->iteration_type = iter_type; + it->consistent_modification_id = bgIteration_epoch++; + it->keyset_iter = keyset_iter; + it->early_iterate_entries = dictCreate(&dictEntryPtrDictType); + dictExpand(it->early_iterate_entries, BGITER_EARLY_ITERATE_DICT_INITIAL_SIZE); + it->current_item = NULL; + it->client_is_active = false; + it->completed = false; + it->terminated = false; + it->cur_cmd_may_replicate = false; + + it->dbentries_queued = 0; + it->dbentries_processed = 0; + it->replication_queued = 0; + it->replication_processed = 0; + it->swapdb_queued = 0; + it->swapdb_processed 
= 0; + it->flushdb_queued = 0; + it->flushdb_processed = 0; + it->dbentry_clones_queued = 0; + it->dbentry_clones_processed = 0; + + elapsedStart(&it->monotonic_start_time); + it->monotonic_item_start_time = 0; + + + if (bgIterator_timeproc_id <= 0) { + // If iteration is not currently active, start the feeding task. (Runs in main thread.) + bgIterator_timeproc_id = aeCreateTimeEvent(server.el, 1, bgIteration_feedIterators_task, NULL, NULL); + serverAssert(bgIterator_timeproc_id != AE_ERR); + } + + if (dictAdd(nameToIterator, (void*)it->name, it) != DICT_OK) { + // Can't have 2 iterators with the same name! + serverAssert(false); + } + + listAddNodeTail(allIterators, it); + + dictExpand(inUseEntries, listLength(allIterators) * it->item_count_target); + + return it; +} + + + +//============================================================================================= +// PUBLIC INTERFACE: Iterator creation and use +//============================================================================================= + +// PUBLIC API +bgIterator * bgIteratorCreateFullScanIter( + const char *name, + int flags, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata) { + return bgIteratorCreate(name, flags, repldone, cleanup, privdata, BGITERATION_TYPE_FULLSCAN, + fullScanIteratorCreate()); +} + +// PUBLIC API +bgIterator * bgIteratorCreateSlotsIter( + const char *name, + int flags, + const int *slots, + int slots_count, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata) { + return bgIteratorCreate(name, flags, repldone, cleanup, privdata, BGITERATION_TYPE_CLUSTERSLOT, + clusterSlotIteratorCreate(slots, slots_count)); +} + +// PUBLIC API +bgIterator * bgIteratorFind(const char *name) { + serverAssert(onValkeyMainThread()); + + sds sdsname = sdsnew(name); + bgIterator *it = dictFetchValue(nameToIterator, sdsname); + sdsfree(sdsname); + + return it; +} + + +// PUBLIC API +const char *bgIteratorName(bgIterator 
*it) { + return it->name; +} + + +// PUBLIC API +void bgIteratorGetStatus(bgIterator *it, bgIteratorStatus *status) { + status->dbentries_queued = it->dbentries_queued; + status->dbentries_processed = it->dbentries_processed; + status->replication_queued = it->replication_queued; + status->replication_processed = it->replication_processed; + status->swapdb_queued = it->swapdb_queued; + status->swapdb_processed = it->swapdb_processed; + status->flushdb_queued = it->flushdb_queued; + status->flushdb_processed = it->flushdb_processed; + status->dbentry_clones_queued = it->dbentry_clones_queued; + status->dbentry_clones_processed = it->dbentry_clones_processed; + + status->queue_length = mutexQueueLength(it->items_for_iterator); + status->queue_length_target = it->item_count_target; + + status->runtime_ms = elapsedMs(it->monotonic_start_time); + + monotime nonvolatile_item_start_time = it->monotonic_item_start_time; + status->current_item_ms = + (nonvolatile_item_start_time == 0) ? 0 : elapsedMs(nonvolatile_item_start_time); +} + + +// PUBLIC API +void bgIteratorTerminate(bgIterator *it) { + serverAssert(onValkeyMainThread()); + + // Remove any items in the queue, but doesn't affect the 1 item that's being processed. + returnAllItemsToValkey(it); + + // We have to add an item, just in case the READER is waiting on the mutex. 
+ if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, "SENDING TERMINATE\n"); + } + + bgIteratorItem *terminationItem = itemFreeList_getElementOrAllocate(); + *terminationItem = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + mutexQueueAdd(it->items_for_iterator, terminationItem); + + it->terminated = true; +} + + +// PUBLIC API +bool bgIteratorIsTerminating(bgIterator *it) { + return it->terminated; +} + + +// PUBLIC API +bgIteratorItem * bgIteratorRead(bgIterator *it) { + serverAssert(it->current_item == NULL + || (it->current_item->type != BGITERATOR_ITEM_COMPLETE + && it->current_item->type != BGITERATOR_ITEM_TERMINATED)); + + // First, clean up the previous item read + if (it->current_item != NULL) { + returnCurrentItemToValkey(it); + + // To support unit tests. Normal clients call bgIteratorRead from an alternate thread. + // Without this, a unit test could get stuck waiting on the completion event because + // feed won't get invoked. For production, this is called regularly from the main thread. 
+ if (onValkeyMainThread()) bgIteration_feedIterators_task(NULL, 0, NULL); + } else { + it->client_is_active = true; + } + + it->monotonic_item_start_time = 0; // idle until blocking pop returns + it->current_item = mutexQueuePop(it->items_for_iterator, true); + it->monotonic_item_start_time = getMonotonicUs(); + + return it->current_item; +} + + +// PUBLIC API +void bgIteratorClose(bgIterator *it) { + if (it->current_item != NULL) { + if (it->current_item->type == BGITERATOR_ITEM_COMPLETE + || it->current_item->type == BGITERATOR_ITEM_TERMINATED) { + // Normal confirmation of background completion + } else { + // Client is initiating the termination + it->terminated = true; + returnCurrentItemToValkey(it); + + it->current_item = itemFreeList_getElementOrAllocate(); + *(it->current_item) = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + } + } else { + // terminated before first item read + it->terminated = true; + it->current_item = itemFreeList_getElementOrAllocate(); + *(it->current_item) = (bgIteratorItem){ .type = BGITERATOR_ITEM_TERMINATED }; + } + + // We don't allocate extension items from the free list + bgIteratorItemExtClose *itemClose = zmalloc(sizeof(bgIteratorItemExtClose)); + itemClose->type = BGITERATOR_ITEMEXT_ITER_CLOSED; + itemClose->iter = it; + mutexQueueAdd(it->return_to_valkey, itemClose); +} + + + +//============================================================================================= +// PUBLIC INTERFACE: Valkey main-thread support hooks +//============================================================================================= + +// PUBLIC API +void bgIteration_init(void) { + serverAssert(onValkeyMainThread()); + + /* This should be called once and only once from the Valkey main thread. However to support + * unit tests, this is not validated, and multiple invocations are ignored. 
*/ + if (nameToIterator) return; // If already initialized, ignore (unit tests) + + nameToIterator = dictCreate(&sdsrefToPtrDictType); + serverAssert(nameToIterator != NULL); + + allIterators = listCreate(); + serverAssert(allIterators != NULL); + + inUseEntries = dictCreate(&dictEntryPtrDictType); + serverAssert(inUseEntries != NULL); + + curCmdMissingKeys = listCreate(); + serverAssert(curCmdMissingKeys != NULL); + listSetFreeMethod(curCmdMissingKeys, decrRefCountVoid); + + bufferedReplicationBytes = 0; + + if (BGITERATION_DEBUG) { + debugBuffer = sdsMakeRoomFor(sdsempty(), SDS_MAX_PREALLOC); + } +} + + +// PUBLIC API +bool bgIteration_iterationActive(void) { + return (allIterators != NULL && listLength(allIterators) > 0); +} + + +// PUBLIC API +void bgIteration_keyDelete(int dbid, const_sds key) { + if (!bgIteration_iterationActive()) return; + serverAssert(onValkeyMainThread()); + + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "KEYDEL: (%d)%s\n", dbid, key); + } + + dbEntry *de = dbFind(server.db[dbid], (sds)key); + if (de == NULL) return; + + // For consistent iterators, we need to make sure the item gets written before delete + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated || !it->keyset_iter->isKeyInScope(it->keyset_iter, key)) continue; + + if (it->iteration_flags & BGITERATOR_FLAG_CONSISTENT + && ((bgIterationEntryMetadata *)objectGetMetadata(de))->iterator_epoch <= it->consistent_modification_id) { + if (!it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) + && !(dictFind(it->early_iterate_entries, de) != NULL)) { + addEarlyIterationKey(it, de, dbid); // (may also add to inUseEntries) + } + } + } + + removePtrFromEarlyIterate(de); + + // We might be within the context of a command execution. This happens if the key is found to + // be expired when attempting to execute the command. 
In this case, we should treat the key as + // missing. If the key exists after the command executes, we can treat it like a new key. + // (If not in command execution, this is ok - it's reset at the beginning of command execution.) + robj *oKey = createObject(OBJ_STRING, sdsdup(key)); + listAddNodeHead(curCmdMissingKeys, oKey); +} + + +// PUBLIC API +// Notify bgIteration that a FLUSHALL is being performed outside of the normal client interface. +void bgIteration_flushall(void) { + handleFlushdb(-1); +} + + +// PUBLIC API +bool bgIteration_blockClientIfRequired(client *c) { + serverAssert(onValkeyMainThread()); + if (!bgIteration_iterationActive()) return false; + if (!isWriteCmd(c->cmd)) return false; + + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "BLCK?: (%d)%s\n", c->db->id, + createSdsFromClientArgv(c->argc, c->argv)); + } + + // Before executing a command or atomic transaction, the replication flag is cleared for each + // iterator. If it's determined that the command should replicate, the flag will be set + // as the command and keys are examined for expedite. + resetReplicationFlagForIterators(c); + + if (c->cmd->proc == flushdbCommand || c->cmd->proc == flushallCommand) { + // Handle flush commands prior to execution + int flags; + if (getFlushCommandFlags(c, &flags) == C_OK) { + // The command parsed ok - we WILL flush + handleFlushdb((c->cmd->proc == flushdbCommand) ? 
c->db->id : -1); + } + } + + bool mustBlock = false; + dict *waitOnKeys = dictCreate(&tempKeysetDictType); // dict of robj(sds)->NULL + listEmpty(curCmdMissingKeys); + + if (c->cmd->proc == execCommand) { + mustBlock = expediteKeysForMultiExec(c, waitOnKeys); + } else { + getKeysResult result; + initGetKeysResult(&result); + int numkeys = getKeysFromCommand(c->cmd, c->argv, c->argc, &result); + keyReference *keyrefs = result.keys; + if (numkeys > 0) { + mustBlock = expediteKeysForWriteOnAllIterators( + c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); + serverAssert(!(mustBlock && (c->flag.multi) && !(c->flag.script))); + + if (mustBlock && (c->flag.script)) { + /* For scripts, we will block for keys declared in EVAL/EVALSHA/FCALL. + * However, scripts are NOT required to declare keys. Even if it declares keys, + * it's not declaring the DB for the key. After a SELECT or SWAPDB, we might be on + * a key we haven't blocked for. In this case, there is no option but to execute a + * synchronous block and wait for the iterator(s) to be done with the key(s). + * (Yuck.) */ + while (mustBlock) { + receiveItemsBackFromIterators(true); // Blocking + dictEmpty(waitOnKeys, NULL); + mustBlock = expediteKeysForWriteOnAllIterators( + c->db->id, c->cmd, c->argc, c->argv, keyrefs, numkeys, waitOnKeys); + } + } + getKeysFreeResult(&result); + } else { + // WRITE commands with no keys should always be replicated. SWAPDB, FLUSH, FUNCTION, etc. 
+ listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + it->cur_cmd_may_replicate = true; + } + } + } + + if (mustBlock) { + serverAssert(dictSize(waitOnKeys) > 0); + robj **waitKeysArgv = zmalloc(sizeof(robj*) * dictSize(waitOnKeys)); + + dictEntry *de; + dictIterator *di = dictGetIterator(waitOnKeys); + unsigned long argvCount = 0; + while((de = dictNext(di)) != NULL) { + waitKeysArgv[argvCount++] = dictGetKey(de); + } + dictReleaseIterator(di); + serverAssert(argvCount == dictSize(waitOnKeys)); + + blockClientInUseOnKeys(c, argvCount, waitKeysArgv); + + zfree(waitKeysArgv); + } + + dictRelease(waitOnKeys); + + if (BGITERATION_DEBUG) { + if (mustBlock) debugBuffer = sdscat(debugBuffer, " (blocked)\n"); + } + + return mustBlock; +} + + +// PUBLIC API +void bgIteration_handleCommandReplication( + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv) { + if (BGITERATION_DEBUG) { + // DEBUG - enable this to capture replication not queued because iteration is inactive + if (0 && !bgIteration_iterationActive() && (isWriteCmd(cmd) || cmd->proc == multiCommand)) { + debugBuffer = sdscatprintf(debugBuffer, "REPL? INACT: (%d)%s\n", dbid, + createSdsFromClientArgv(argc, argv)); + } + } + + if (!bgIteration_iterationActive()) return; + serverAssert(onValkeyMainThread()); + + // Some commands are replicated which are not writes (like publish) these can be ignored. + // Be careful with MULTI which is not a write command, but must be replicated. 
+ if (!isWriteCmd(cmd) && cmd->proc != multiCommand) return; + + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "REPL?: (%d)%s\n", dbid, + createSdsFromClientArgv(argc, argv)); + } + + if (cmd->proc == swapdbCommand) { + // All iterators and clients must be informed of swapdb + int id1, id2; + // command has been processed, but Valkey allows "swapdb 0 0" (which can be ignored) + if (getParamsForSwapdb(argc, argv, NULL, &id1, &id2)) + handleSwapdb(id1, id2); + } + + // In the case that a key is touched in a different DB (COPY/MOVE) the key is recorded as + // a "special" key and than handled below. + int special_dbid = 0; + sds special_key = NULL; + dbEntry *special_dbEntry = NULL; + if (cmd->proc == moveCommand) { + // The MOVE command succeeded. However MOVE requires special handling as it creates a new + // key in a different database. We need to make sure that we don't later try to iterate + // on the key as it would be a duplicate key at that point. So, instead, we will mark the + // newly created key as "early iterated". + bool success = getDbIdFromRobj(argv[MOVE_COMMAND_DBID_ARG_INDEX], &special_dbid); + serverAssert(success); // the command already succeeded, so this should work! + + robj *oKey = argv[1]; + special_key = (sds)objectGetVal(oKey); + + special_dbEntry = dbFind(server.db[special_dbid], special_key); + } + if (cmd->proc == copyCommand) { + // The COPY command succeeded. However COPY requires special handling (like MOVE). + bool success = getTargetDbIdForCopyCommand(argc, argv, dbid, &special_dbid); + serverAssert(success); // the command already succeeded, so this should work! + + // Find the newly created entry. + robj *oKey = argv[2]; + special_key = (sds)objectGetVal(oKey); + + special_dbEntry = dbFind(server.db[special_dbid], special_key); + } + + /* Implementation note regarding LUA and MULTI: LUA scripts and MULTI-EXEC blocks must be + * treated atomically. 
We need to ensure that either ALL of the replication (or none of the + * replication) for the atomic operation is processed by the iterator(s). This is handled + * naturally as we can only "complete" the iteration during the feeding process - and feeding + * is only performed when handling timer events (after the LUA/MULTI has completed). */ + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (it->completed || it->terminated) continue; + + // For consistent iteration, we only iterate values based on version. But for + // non-consistent iteration, we don't need to explicitly iterate any values newly created + // during the iteration. So we mark them as expedited. We know we have a new key if it + // was missing before the command, and exists now. + if (!(it->iteration_flags & BGITERATOR_FLAG_CONSISTENT)) { + // Handle the special case of a key moved to a different DB + if (special_dbEntry != NULL) { + if (it->cur_cmd_may_replicate + && !it->keyset_iter->hasPassedItem(it->keyset_iter, special_key, special_dbid)) { + dictAdd(it->early_iterate_entries, special_dbEntry, NULL); + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(special_dbid, special_dbEntry); + debugBuffer = sdscatprintf(debugBuffer, "EARLY(special): %s\n", entryString); + sdsfree(entryString); + } + } + + // Note: In the cases where there's a special command, we are copying or moving an + // item to a different DB. In these limited cases, we can only possibly be + // creating a single key. And if we've handled it here, we don't need to + // handle it as a "missing key" below. If we were to try to handle it as a + // standard "missing key", we would get the DBID incorrect. 
+ } else if (listLength(curCmdMissingKeys) > 0) { + listIter missingIt; + listNode *missingNode; + listRewind(curCmdMissingKeys, &missingIt); + while ((missingNode = listNext(&missingIt)) != NULL) { + robj *oKey = listNodeValue(missingNode); + const_sds key = objectGetVal(oKey); + dbEntry *de = dbFind(server.db[dbid], (sds)key); + if (de != NULL) { + // It exists now! + if (it->cur_cmd_may_replicate + && !it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid)) { + // If the current command is allowed to replicate, and there is a new + // key which we haven't yet reached in iteration, it needs to be added + // to the set of early iterate entries. (We know that it's not already + // in that set because it's a newly created key!) + dictAdd(it->early_iterate_entries, de, NULL); + if (BGITERATION_DEBUG) { + sds entryString = createEntryString(dbid, de); + debugBuffer = sdscatprintf(debugBuffer, "EARLY(NEW): %s\n", entryString); + sdsfree(entryString); + } + } + } + } + } + } + + /* Deletes (and unlinks) are special. + * Developer context: For most commands, we call bgIteration_blockClientIfRequired before + * the command and then call bgIteration_handleCommandReplication after the command. While + * the "before" logic is determining the need to block, it can also determine (mostly) the + * need for replication (on each iterator). Doing this all in one place saves us from + * performing some of the same logic twice. When we get to this point in the code, we just + * use the previously determined information regarding replication. This works because + * Valkey is single-threaded and only processes one command at a time. + * + * But deletes (and unlinks) happen multiple ways - and occur outside the normal + * before/after logic for commands. These situations must be handled: + * - A normal (client-driven) DEL/UNLINK command will use the standard before/after + * logic. If the key is in use by bgIteration, the command will be blocked. 
+ * - An EVICTION generates a DEL/UNLINK which happens outside of the context of a client + * issued command. The replication flags on the iterators are stale and relate to the + * prior command executed. + * - An EXPIRATION in the context of a client-driven WRITE command occurs when the client + * command attempts to access a key and it is found to be expired. In this case, the + * client-command has already gone through the blocking process, so it should be OK to + * use it->cur_cmd_may_replicate. + * - An EXPIRATION in the context of a client-driven READ command occurs when the client + * command attempts to access a key and it is found to be expired. In this case, the + * client-command has NOT gone through the blocking process. The replication flags on + * the iterators are stale and relate to the prior (write) command executed. + * - An EXPIRATION outside of a client-driven command occurs due to active expiry. In + * this case, the replication flags on the iterator are stale and relate to the prior + * command executed. + * + * In the case of EXPIRE/EVICT occurring outside the context of a write command, this is + * handled. If the key is in-use by bgIterator, increment of robj's refcount prevents the + * key from deletion. In this case the key will be removed from the main dictionary, but + * held inside bgIteration until no longer needed. + * Even though the entry is not physically deleted yet, it is logically deleted and it is + * safe to replicate the DEL/UNLINK. Since iterators process items FIFO, the replication + * for DEL/UNLINK won't actually get processed until other queued replication is processed. + * + * In the case of a client driven DEL command, the key will have already been deleted when + * we hit this routine. In the case of EXPIRE/EVICT, the propagation happens before the key + * is deleted. So if the key is missing, we can use the cached replication decision. But + * if the key still exists (indicating EXPIRE/EVICT) we evaluate it specially.
+ */ + bool shouldReplicateDelCommand = false; + bool isDelCommand = isDeleteCmd(cmd); + if (isDelCommand) { + sds key = objectGetVal(argv[1]); + if (it->keyset_iter->isKeyInScope(it->keyset_iter, key)) { + dbEntry *de = dbFind(server.db[dbid], key); + if (de) { + // NOTE: It's weird, but helpful, for both EXPIRE and EVICT the propagation happens + // BEFORE the actual delete. So if the dbEntry still exists, we are doing + // an expire/evict which is not preceded by blockClientIfRequired(). + if (it->keyset_iter->hasPassedItem(it->keyset_iter, key, dbid) + || (dictFind(it->early_iterate_entries, de) != NULL)) { + shouldReplicateDelCommand = true; + } + } else { + // The dbEntry has already been deleted, this must be part of normal command + // processing. + shouldReplicateDelCommand = it->cur_cmd_may_replicate; + } + } + } + + bool replicate = (it->iteration_flags & BGITERATOR_FLAG_REPLICATION && + ((!isDelCommand && it->cur_cmd_may_replicate) + || shouldReplicateDelCommand)); + + if (replicate) { + /* We will replicate the command in these cases: + * 1) For consistent iteration - it->cur_cmd_may_replicate is always true + * 2) For non-consistent, if any of the keys have been processed, expediteKeysForWrite + * will ensure that ALL of the keys have been expedited - and we should replicate + * 3) For non-consistent, if NONE of the keys have been processed, no need to replicate + */ + + if (BGITERATION_DEBUG) { + debugBuffer = sdscat(debugBuffer, " (queued)\n"); + } + + bgIteratorItem *item = itemFreeList_getElementOrAllocate(); + item->type = BGITERATOR_ITEM_REPLICATION; + item->dbid = dbid; + item->u.repl.cmd = cmd; + item->u.repl.argv = cloneRobjArray(argc, argv); + item->u.repl.argc = argc; + bufferedReplicationBytes += replicationItemSize(item); + it->replication_queued++; + mutexQueueAdd(it->items_for_iterator, item); + } + } // allIterators loop +} + + +// PUBLIC API +size_t bgIteration_memoryInuseForReplication(void) { + return bufferedReplicationBytes; +} + 
+ +// PUBLIC API +bool bgIteration_isEntryInuse(dbEntry *de) { + serverAssert(onValkeyMainThread()); + return isEntryInuseByAnyIterator(de); +} + + +// PUBLIC API +uint32_t bgIteration_getEpoch(void) { + return bgIteration_epoch; +} + + +// PUBLIC API +void bgIteration_updateDbEntryPtr(dbEntry *old, dbEntry *new) { + if (!bgIteration_iterationActive() || old == new) return; + serverAssert(onValkeyMainThread()); + serverAssert(!isEntryInuseByAnyIterator(old)); + + listIter li; + listNode *node; + listRewind(allIterators, &li); + while ((node = listNext(&li)) != NULL) { + bgIterator *it = listNodeValue(node); + if (dictDelete(it->early_iterate_entries, old) == DICT_OK) { + if (BGITERATION_DEBUG) { + debugBuffer = sdscatprintf(debugBuffer, "EARLY LIST UPDATE %p -> %p\n", (void *)old, (void *)new); + } + dictAdd(it->early_iterate_entries, new, NULL); + } + } +} diff --git a/src/bgiteration.h b/src/bgiteration.h new file mode 100644 index 00000000000..35a4b988857 --- /dev/null +++ b/src/bgiteration.h @@ -0,0 +1,363 @@ +#ifndef __BGITERATION_H +#define __BGITERATION_H + +#include +#include "sds.h" + +/* A mechanism for creating iteration clients which iterate over the main dictionary in a + * background thread. + * + * This mechanism passes keys to the iteration client, while blocking the keys from write by the + * Valkey main thread. Once an iteration client is done with a key, it is returned to the Valkey + * main thread and any pending writers are unblocked. + * + * A bgIterator must be created on the main Valkey thread, and then passed to another thread which + * implements the logic of the iteration client. + * + * Iteration clients are expected to read through the keyspace until the iteration is complete or + * terminated. An iteration client may not perform modifications on a key. + * + * Future enhancement: Certain types of modifications may be passed back to the Valkey main thread. + * Use case: A background compression thread wants to compress a string value. 
+ */ + +/* Avoids dependency on server.h */ +typedef struct serverObject dbEntry; // An object with key/value inserted into main dictionary +typedef struct serverObject robj; // An object with a value used for command parameters +typedef struct client client; + +/* The bgIterator is an opaque structure. */ +typedef struct bgIterator bgIterator; + + +/* Flag indicates that a consistent iteration is required. This is used to create a point-in-time + * iteration. The iteration client will see all keys AS THEY EXISTED at the time when the iterator + * was created. + * Note: The DBID provided with the DBENTRY events is the original DBID (at the time of iteration + * start). SWAPDB events are NOT provided during a consistent iteration. */ +#define BGITERATOR_FLAG_CONSISTENT (1 << 0) + +/* Flag indicating that the replication stream for keys which have already been processed should be + * forwarded to the iteration client. Most useful for non-consistent iteration to track changes + * to keys already processed. By tracking changes, this allows a non-consistent iteration client + * to achieve a consistent view at the END of the iteration. + * NOTE: Replication events will be provided ordered and synchronized with any SWAPDB events. + * LIMITATION: Since SWAPDB events are not provided during CONSISTENT iteration, it is not + * permitted to use both CONSISTENT and REPLICATION on a non-clustermode instance. */ +#define BGITERATOR_FLAG_REPLICATION (1 << 1) + + +/* When running an iterator with replication, a replication-done function (callback) may be + * provided. This function will be executed after the last replication item has been fed into the + * queue for the client. This function will be run on the Valkey main thread, and allows a client + * to recognize the point where no additional replication data will be sent for processing. + * + * PRIVDATA: this pointer is for data private to the iteration client.
+ * + * The callback returns true to let the iterator stop accepting replication items into the queue for the client. + * If false is returned, replication will continue, and bgiteration will periodically call the callback + * until true is returned. In this context, returning false indicates that the client is not ready to + * stop receiving replication; it is requesting that replication be continued. + */ +typedef bool (*bgIteratorReplDoneFunc)(void *privdata); + + +/* When creating a bgIterator, a cleanup function (callback) may be provided. This function will be + * executed once iteration has completed and this will run on the Valkey main thread. + * + * TERMINATED: will be passed as TRUE if the iteration process was terminated early (either by + * the main thread calling bgIteratorTerminate() or the iteration client calling + * bgIteratorClose()). + * PRIVDATA: this pointer is for data private to the iteration client. + */ +typedef void (*bgIteratorCleanupFunc)(bool terminated, void *privdata); + + +/* Create a background full-scan iterator (bgIterator). + * This bgIterator will iterate through the entire keyspace (across all DBs). + * + * NAME: a human readable name for the iterator (must be unique) + * FLAGS: creation flags indicate iteration options + * REPLDONE: if provided, called after the last replication item has been queued (on the Valkey main thread) + * CLEANUP: if provided, called at the end of iteration (on the Valkey main thread) + * PRIVDATA: passed to cleanup function + * + * This method creates and initializes the bgIterator. It does not perform any thread management. + * It is expected that the main Valkey thread will call this method, and then start a new thread + * to implement the iteration client which will read from the returned bgIterator. + * + * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the + * last item is read.
+ */ +bgIterator * bgIteratorCreateFullScanIter( + const char *name, + int flags, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata); + + +/* Create a background slots iterator (bgIterator). + * This bgIterator will iterate through the keys belonging to a set of cluster slots. + * + * NAME: a human readable name for the iterator (must be unique) + * FLAGS: creation flags indicate iteration options + * SLOTS: array of cluster slots to iterate over + * SLOTS_COUNT: size of the array of slots + * REPLDONE: if provided, called after the last replication item has been queued (on the Valkey main thread) + * CLEANUP: if provided, called at the end of iteration (on the Valkey main thread) + * PRIVDATA: passed to cleanup function + * + * This method creates and initializes the bgIterator. It does not perform any thread management. + * It is expected that the main Valkey thread will call this method, and then start a new thread + * to implement the iteration client which will read from the returned bgIterator. + * + * The caller of this function retains ownership of the `slots` array's memory. This function will + * just copy its data and leave the array untouched. + * + * There is no need to delete/destroy a bgIterator. It will automatically be cleaned up after the + * last item is read. + */ +bgIterator * bgIteratorCreateSlotsIter( + const char *name, + int flags, + const int *slots, + int slots_count, + bgIteratorReplDoneFunc repldone, + bgIteratorCleanupFunc cleanup, + void *privdata); + + +/* Find an existing bgIterator by name. + * Returns NULL if the iterator does not exist (or has completed). + */ +bgIterator * bgIteratorFind(const char *name); + + +/* Get the name of an existing iterator. */ +const char * bgIteratorName(bgIterator *iter); + + +/* Struct to retrieve status information for an active iteration client.
*/ +typedef struct { + unsigned long dbentries_queued; // Cumulative BGITERATOR_ITEM_DBENTRY queued + unsigned long dbentries_processed; // Cumulative BGITERATOR_ITEM_DBENTRY processed + unsigned long replication_queued; // Cumulative BGITERATOR_ITEM_REPLICATION queued + unsigned long replication_processed; // Cumulative BGITERATOR_ITEM_REPLICATION processed + unsigned long swapdb_queued; // Cumulative BGITERATOR_ITEM_SWAPDB queued + unsigned long swapdb_processed; // Cumulative BGITERATOR_ITEM_SWAPDB processed + unsigned long flushdb_queued; // Cumulative BGITERATOR_ITEM_FLUSHDB queued + unsigned long flushdb_processed; // Cumulative BGITERATOR_ITEM_FLUSHDB processed + unsigned long dbentry_clones_queued; // A subset of dbentries_queued for cloned entries + unsigned long dbentry_clones_processed; // A subset of dbentries_processed for cloned entries + unsigned long queue_length; // Current length of queue to iteration client + unsigned long queue_length_target; // Dynamic target length for queue to iteration client + unsigned long runtime_ms; // Time, in milliseconds, that iterator has been running + unsigned long current_item_ms; // Time, in milliseconds, spent processing current item +} bgIteratorStatus; + + +/* Get the status of a background iteration. + * + * The caller-provided bgIteratorStatus will be populated. + */ +void bgIteratorGetStatus(bgIterator *iter, bgIteratorStatus *status); + + +/* Terminate a background iteration. + * + * An iteration is terminated by the Valkey main thread. It is expected that the iteration client + * will continue to read, receiving BGITERATOR_ITEM_TERMINATED or BGITERATOR_ITEM_COMPLETE to + * complete the iteration. (This is necessary to ensure proper cleanup.) + * NOTE: If the iteration client wants to terminate iteration, it may call bgIteratorClose(). + */ +void bgIteratorTerminate(bgIterator *iter); + + +/* Check if an iterator is being terminated. + * + * This checks if the iterator is in the process of terminating. 
For the Valkey main thread, this + * can be used to determine if a call has already been made to bgIteratorTerminate. For an + * iteration client, it normally learns about terminate by reading the next item, this allows + * out-of-band detection of termination which can be useful when processing a large key. + */ +bool bgIteratorIsTerminating(bgIterator *iter); + + +typedef enum { + /* Indicates that the iteration has completed normally. No more items to read. + * If replication is enabled, on completion, the final replication offset is recorded in + * 'u.master_repl_offset' and 'dbid' is set to the selected replication db. The iteration + * client will have received all *applicable* replication data to this point. */ + BGITERATOR_ITEM_COMPLETE = 1, + + /* Indicates that the iteration has been terminated before completion. No more items to read.*/ + BGITERATOR_ITEM_TERMINATED, + + /* A dbEntry for DB=dbid. + * NOTE: The dbEntry MAY be expired. It is up to the client to decide how to handle + * expired entries. */ + BGITERATOR_ITEM_DBENTRY, + + /* A replication command for DB=dbid. cmd, argv, & argc provided. + * NOTE: The command may have been re-written before replication. */ + BGITERATOR_ITEM_REPLICATION, + + /* A SWAPDB event. dbid swapped with dbid2. + * Note that SWAPDB events are not provided during consistent iteration. */ + BGITERATOR_ITEM_SWAPDB, + + /* A FLUSHDB event. In most cases, iteration will be terminated, and this event will NOT be + * sent. However, in the case of a single minor DB being flushed, non-consistent iteration is + * permitted to continue. */ + BGITERATOR_ITEM_FLUSHDB +} bgIteratorItemType; + + +typedef struct { + dbEntry *de; + bool is_cloned; + bool is_rehashing_paused; +} dbEntryData; + +typedef struct { + struct serverCommand *cmd; + robj **argv; + int argc; +} replicationData; + +typedef struct { + bgIteratorItemType type; + int dbid; /* orig DB ID for CONSISTENT, queue-time DB ID for !CONSISTENT. 
*/ + union { + dbEntryData dbe; // for BGITERATOR_ITEM_DBENTRY + replicationData repl; // for BGITERATOR_ITEM_REPLICATION + long long master_repl_offset; // for BGITERATOR_ITEM_COMPLETE + int dbid2; // for BGITERATOR_ITEM_SWAPDB + } u; +} bgIteratorItem; + + +/* Read the next bgIteratorItem from the bgIterator. + * + * The iteration client is expected to call this function in a loop. After reading + * BGITERATOR_ITEM_COMPLETE or BGITERATOR_ITEM_TERMINATED, the iteration client must call + * bgIteratorClose to finalize the iteration process. + * + * This is a blocking call. If the main Valkey thread has been too busy to send items to the + * iterator, the iteration client's queue may run dry and this call will block until data is + * available. + * + * NOTE: Reading an item returns previously read items to Valkey. It is unsafe to reference an item + * previously read. + * + * (All memory management is the responsibility of the bgIterator - not the reader.) + */ +bgIteratorItem * bgIteratorRead(bgIterator *iter); + + +/* Close the bgIterator, allowing the bgIterator to be deallocated. + * + * This must be called by an iteration client to release the bgIterator. + * + * It is required that this is called after receiving BGITERATOR_ITEM_COMPLETE or + * BGITERATOR_ITEM_TERMINATED and signals that the background activity is complete. + * + * This may also be called by the iteration client to force terminate an iteration early. The + * bgIterator will be marked as terminated. 
+ */ +void bgIteratorClose(bgIterator *iter); + + +/******************************************************************************************** + * BGITERATION HOOKS REQUIRED TO SUPPORT ITERATION - CALLS INSERTED INTO MAIN VALKEY CODE + ********************************************************************************************/ + +typedef struct { + uint32_t iterator_epoch; // iterator epoch of last modification +} bgIterationEntryMetadata; + + +/* Must be called once (and only once) at server startup. */ +void bgIteration_init(void); + + +/* Returns true if any iterators are currently active. */ +bool bgIteration_iterationActive(void); + + +/* Notify bgIteration that a key is being deleted. In Valkey, key deletion can occur in a READ + * command if the key is expired. Note that this notification is more about status than memory. + * Since the dbEntry is a reference counted object, the dbEntry can't be physically deleted if + * bgIteration is still actively using it. + */ +void bgIteration_keyDelete(int dbid, const_sds key); + + +/* Iteration needs to know if a FLUSHALL is being performed. For normal clients, this comes through + * the standard "blockClientIfRequired" interface. This interface is for cases where Valkey + * performs the FLUSHALL operation independently of clients (e.g. when syncing with master). + */ +void bgIteration_flushall(void); + + +/* Updating value or expiration of an existing key may lead to reallocation of the dbEntry (robj). + * BgIteration keeps track of expedited keys (by pointer) to avoid repeated iteration. BgIteration + * must be notified when dbEntries are reallocated. BgIteration will not dereference the pointers; + * it is safe to have deallocated the old dbEntry before calling this function. + * + * We can't update the dbEntry if the entry is actually in use (bgIteration_isEntryInuse)! + * + * To simplify calling code, this function does nothing if old_entry == new_entry. 
+ */ +void bgIteration_updateDbEntryPtr(dbEntry *old_entry, dbEntry *new_entry); + + +/* Before executing any command, the Valkey main thread must call this function. If the key(s) are + * blocked for writes by an iterator, the function returns true and the client is blocked. A + * blocked client will be unblocked once the key becomes available for write. + * + * This should be called for all commands - even commands which are executed as part of a MULTI/EXEC + * or LUA script. + * + * For MULTI/EXEC - This function is called when hitting the EXEC - after all of the commands + * have been queued. This may block the EXEC, but will NOT block individual + * commands as they are executed in the MULTI/EXEC block. + * + * For LUA script - This function is first called for EVAL/EVALSHA. It may block the script while + * waiting on declared keys. However, if the script accesses undeclared keys or + * performs SWAPDB, a synchronous block may be performed (returning false) on + * individual commands within the script. + * + * Note: this function should be called for all commands (not just writes). + */ +bool bgIteration_blockClientIfRequired(client *c); + + +/* After execution of a write command, the Valkey main thread must provide the command to iterators + * which are interested in the replication feed. It is required that all commands have been passed + * through bgIteration_blockClientIfRequired(), however, it is permitted that the command can be + * re-written for propagation. + */ +void bgIteration_handleCommandReplication( + int dbid, + struct serverCommand *cmd, + int argc, + robj **argv); + + +/* The memory that bgIteration uses while temporarily buffering replication data is not included in + * the maxmemory computation used for eviction. This function provides insight into the current + * amount of memory used for buffered replication data. 
+ */ +size_t bgIteration_memoryInuseForReplication(void); + + +/* Check if a dbEntry is currently in-use/locked by bgIteration. */ +bool bgIteration_isEntryInuse(dbEntry *de); + + +/* Get the current iteration epoch, for tagging metadata on keys. */ +uint32_t bgIteration_getEpoch(void); + +#endif diff --git a/src/db.c b/src/db.c index ba9d25c2fa6..d48bc4b935a 100644 --- a/src/db.c +++ b/src/db.c @@ -37,6 +37,7 @@ #include "module.h" #include "vector.h" #include "expire.h" +#include "bgiteration.h" /*----------------------------------------------------------------------------- * C-level DB API @@ -361,6 +362,7 @@ static void dbSetValue(serverDb *db, robj *key, robj **valref, int overwrite, vo val->lru = old->lru; long long expire = objectGetExpire(old); new = objectSetKeyAndExpire(val, objectGetVal(key), expire); + bgIteration_updateDbEntryPtr(old, new); *oldref = new; /* Replace the old value at its location in the expire space. */ if (expire >= 0) { @@ -430,6 +432,8 @@ void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags) { } else { dbSetValue(db, key, valref, 1, NULL); } + bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(*valref); + if (md) md->iterator_epoch = bgIteration_getEpoch(); if (!(flags & SETKEY_KEEPTTL)) removeExpire(db, key); if (!(flags & SETKEY_NO_SIGNAL)) signalModifiedKey(c, db, key); } @@ -475,6 +479,8 @@ int dbGenericDeleteWithDictIndex(serverDb *db, robj *key, int async, int flags, hashtablePosition pos; void **ref = kvstoreHashtableTwoPhasePopFindRef(db->keys, dict_index, objectGetVal(key), &pos); if (ref != NULL) { + bgIteration_keyDelete(db->id, (sds)objectGetVal(key)); + robj *val = *ref; /* VM_StringDMA may call dbUnshareStringValue which may free val, so we * need to incr to retain val */ @@ -753,6 +759,15 @@ long long dbTotalServerKeyCount(void) { void signalModifiedKey(client *c, serverDb *db, robj *key) { touchWatchedKey(db, key); trackingInvalidateKey(c, key, 1); + + /* If bgIteration is 
running, need to maintain the iteration epoch. */ + if (bgIteration_iterationActive()) { + dbEntry *o = dbFind(db, objectGetVal(key)); + if (o) { + bgIterationEntryMetadata *md = (bgIterationEntryMetadata *)objectGetMetadata(o); + if (md) md->iterator_epoch = bgIteration_getEpoch(); + } + } } void signalFlushedDb(int dbid, int async) { @@ -2255,7 +2270,7 @@ robj *dbFindExpires(serverDb *db, sds key) { } unsigned long long dbSize(serverDb *db) { - return kvstoreSize(db->keys); + return (db->keys) ? kvstoreSize(db->keys) : 0; } unsigned long long dbScan(serverDb *db, unsigned long long cursor, kvstoreScanFunction scan_cb, void *privdata) { diff --git a/src/hashtable.c b/src/hashtable.c index dcae6dfa014..1dcb8038030 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -214,6 +214,8 @@ static_assert(100 * BUCKET_DIVISOR / BUCKET_FACTOR / ENTRIES_PER_BUCKET <= MAX_F "Expand must result in a fill below the soft max fill factor"); static_assert(MAX_FILL_PERCENT_SOFT <= MAX_FILL_PERCENT_HARD, "Soft vs hard fill factor"); +#define ITERATOR_DONE_WITH_BUCKET_IDX (ENTRIES_PER_BUCKET + 1) + /* --- Random entry --- */ #define FAIR_RANDOM_SAMPLE_SIZE (ENTRIES_PER_BUCKET * 10) @@ -344,7 +346,7 @@ typedef struct { } position; static_assert(sizeof(hashtablePosition) >= sizeof(position), - "Opaque iterator size"); + "Opaque position size"); /* State for incremental find. */ typedef struct { @@ -612,7 +614,8 @@ static bucket *fetchEntriesForExpand(bucket *b, void *buf[], int *size, int max_ /* Processes one bucket chain during incremental table expansion. * Uses batch processing to optimize memory access patterns. 
*/ -static void rehashStepExpand(hashtable *ht) { +// Not API, but not static - used in unit testing +void rehashStepExpand(hashtable *ht) { void *entry_buf[FETCH_ENTRY_BUFFER_SIZE_WHEN_EXPAND]; const void *key_buf[FETCH_ENTRY_BUFFER_SIZE_WHEN_EXPAND]; size_t idx = ht->rehash_idx; @@ -1377,13 +1380,13 @@ void hashtableResumeAutoShrink(hashtable *ht) { * spaces, "holes", in the bucket chains, which wastes memory. Additionally, we * pause auto shrink when rehashing is paused, meaning the hashtable will not * shrink the bucket count. */ -static void hashtablePauseRehashing(hashtable *ht) { +void hashtablePauseRehashing(hashtable *ht) { ht->pause_rehash++; hashtablePauseAutoShrink(ht); } /* Resumes incremental rehashing, after pausing it. */ -static void hashtableResumeRehashing(hashtable *ht) { +void hashtableResumeRehashing(hashtable *ht) { ht->pause_rehash--; assert(ht->pause_rehash >= 0); hashtableResumeAutoShrink(ht); @@ -2268,7 +2271,9 @@ bool hashtableNext(hashtableIterator *iterator, void **elemptr) { * child bucket in a chain, or to the next bucket index, or to the * next table. */ iter->pos_in_bucket++; - if (iter->bucket->chained && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1) { + if (iter->bucket->chained + && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1 + && iter->pos_in_bucket != ITERATOR_DONE_WITH_BUCKET_IDX) { iter->pos_in_bucket = 0; iter->bucket = getChildBucket(iter->bucket); } else if (iter->pos_in_bucket >= ENTRIES_PER_BUCKET) { @@ -2562,3 +2567,68 @@ int hashtableLongestBucketChain(hashtable *ht) { } return maxlen; } + +/* This is an internal function - not part of the standard API. It must be explicitly declared + * where used. It shouldn't be included in any .h (API) file. Use of this interface is discouraged + * as it depends on the internal structure, which may change. 
+ * + * For a given key, return: + * table_idx - the index of the internal table (0 or 1) + * bucket_idx - the bucket index within the table (0..n) + * + * Returns TRUE if the key exists in the table. + * Returns FALSE if the key doesn't exist (and table/index are undefined) + */ +bool hashtableInternalFindBucketIdx(hashtable *ht, void *key, int *table_idx, size_t *bucket_idx) { + uint64_t hash = hashKey(ht, key); + int pos_in_bucket; + int table; + bucket *b = findBucket(ht, hash, key, &pos_in_bucket, &table); + if (!b) return false; + + *table_idx = table; + *bucket_idx = hash & expToMask(ht->bucket_exp[table]); + return true; +} + +/* This is an internal function - not part of the standard API. It must be explicitly declared + * where used. It shouldn't be included in any .h (API) file. Use of this interface is discouraged + * as it depends on the internal structure, which may change. + * + * For a given iterator, return: + * table_idx - the index of the internal table (0 or 1) + * bucket_idx - the bucket index within the table (0..n) + * + * NOTE: hashtableIterator position is based on the LAST item returned. + */ +void hashtableInternalIteratorGetBucketIdx(hashtableIterator *iterator, int *table_idx, size_t *bucket_idx) { + iter *it = iteratorFromOpaque(iterator); + *table_idx = it->table; + *bucket_idx = it->index; +} + +/* This is an internal function - not part of the standard API. It must be explicitly declared + * where used. It shouldn't be included in any .h (API) file. Use of this interface is discouraged + * as it depends on the internal structure, which may change. + * + * Returns TRUE if the iterator is ready to move to the next bucket index (if it has completed the + * current bucket index). Note: hashtableIterator bucket_idx is the bucket index of the last item + * returned by hashtableNext.
+ * + * Note: If this function returns true, the iterator commits to moving on to the next bucket index, + * even if something new is added to the end of the current bucket before hashtableNext is called. + */ +bool hashtableInternalIteratorIsBucketIdxComplete(hashtableIterator *iterator) { + iter *it = iteratorFromOpaque(iterator); + + if (it->bucket->chained) return false; + + if (!(it->bucket->presence >> (it->pos_in_bucket + 1))) { + /* There's CURRENTLY nothing else to return at this bucket index. Mark pos_in_bucket + * so that hashtableNext will move to the next bucket index, regardless of items which may + * be added in the future. */ + it->pos_in_bucket = ITERATOR_DONE_WITH_BUCKET_IDX; + return true; + } + return false; +} diff --git a/src/hashtable.h b/src/hashtable.h index 8bbf5d8c05b..97ecab68518 100644 --- a/src/hashtable.h +++ b/src/hashtable.h @@ -129,6 +129,8 @@ size_t hashtableMemUsage(const hashtable *ht); void hashtablePauseAutoShrink(hashtable *ht); void hashtableResumeAutoShrink(hashtable *ht); bool hashtableIsRehashing(hashtable *ht); +void hashtablePauseRehashing(hashtable *ht); +void hashtableResumeRehashing(hashtable *ht); bool hashtableIsRehashingPaused(hashtable *ht); ssize_t hashtableGetRehashingIndex(hashtable *ht); void hashtableRehashingInfo(hashtable *ht, size_t *from_size, size_t *to_size); diff --git a/src/kvstore.c b/src/kvstore.c index 86078cfc1ab..1ac72a01dc2 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -689,6 +689,16 @@ int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it) { return kvs_it->didx; } +/* This is an internal function - not part of the standard API. It must be explicitly declared + * where used. It shouldn't be included in any .h (API) file. Use of this interface is discouraged + * as it depends on the internal structure, which may change. + * + * Return the current hashtableIterator from within the kvstoreIterator.
+ */ +hashtableIterator *kvstoreInternalIteratorGetCurrentHashtableIterator(kvstoreIterator *kvs_it) { + return &kvs_it->di; +} + /* Fetches the next element and returns true. Returns false if there are no more elements. */ bool kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next) { if (kvs_it->didx != KVSTORE_INDEX_NOT_FOUND && hashtableNext(&kvs_it->di, next)) { diff --git a/src/module.c b/src/module.c index c2511dbb54e..3bcfa2d3aae 100644 --- a/src/module.c +++ b/src/module.c @@ -70,6 +70,7 @@ #include "io_threads.h" #include "scripting_engine.h" #include "cluster_migrateslots.h" +#include "bgiteration.h" #include #include #include @@ -4464,6 +4465,7 @@ int VM_SetAbsExpire(ValkeyModuleKey *key, mstime_t expire) { * When async is set to true, db contents will be freed by a background thread. */ void VM_ResetDataset(int restart_aof, int async) { if (restart_aof && server.aof_state != AOF_OFF) stopAppendOnly(); + bgIteration_flushall(); flushAllDataAndResetRDB((async ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS) | EMPTYDB_NOFUNCTIONS); if (server.aof_enabled && restart_aof) restartAOFAfterSYNC(); } diff --git a/src/object.c b/src/object.c index 21eb57e5cbd..f4545cf8025 100644 --- a/src/object.c +++ b/src/object.c @@ -38,6 +38,7 @@ #include "zmalloc.h" #include "sds.h" #include "module.h" +#include "bgiteration.h" #include #include @@ -340,7 +341,7 @@ robj *createStringObjectFromSds(const_sds s) { return createStringObject(s, sdslen(s)); } -static robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const_sds key, long long expire) { +robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const_sds key, long long expire) { if (shouldEmbedStringObject(len, key, expire)) { return createEmbeddedStringObjectWithKeyAndExpire(ptr, len, key, expire); } else { @@ -447,6 +448,7 @@ void objectUnembedVal(robj *o) { robj *objectSetKeyAndExpire(robj *o, const_sds key, long long expire) { if (o->type == OBJ_STRING && o->encoding == OBJ_ENCODING_EMBSTR) 
{ robj *new = createStringObjectWithKeyAndExpire(objectGetVal(o), sdslen(objectGetVal(o)), key, expire); + bgIteration_updateDbEntryPtr(o, new); new->lru = o->lru; decrRefCount(o); return new; @@ -471,6 +473,7 @@ robj *objectSetKeyAndExpire(robj *o, const_sds key, long long expire) { serverPanic("Not implemented"); } robj *new = createUnembeddedObjectWithKeyAndExpire(o->type, ptr, key, expire); + bgIteration_updateDbEntryPtr(o, new); new->encoding = o->encoding; new->lru = o->lru; decrRefCount(o); diff --git a/src/rdb.c b/src/rdb.c index e4e006a16ec..ae16f62bd26 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -46,6 +46,7 @@ #include "module.h" #include "cluster.h" #include "cluster_migrateslots.h" +#include "bgiteration.h" #include #include @@ -3171,6 +3172,7 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin if (rdbflags & RDBFLAGS_EMPTY_DATA) { int empty_db_flags = server.repl_replica_lazy_flush ? EMPTYDB_ASYNC : EMPTYDB_NO_FLAGS; serverLog(LL_NOTICE, "RDB signature and version check passed. Flushing old data"); + bgIteration_flushall(); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); /* functionsLibCtx is cleared when we call emptyData, reinitialize here. */ diff --git a/src/replication.c b/src/replication.c index 9c8c56d44d2..9f1e00087e6 100644 --- a/src/replication.c +++ b/src/replication.c @@ -41,6 +41,7 @@ #include "connection.h" #include "module.h" #include "cluster_migrateslots.h" +#include "bgiteration.h" #include #include @@ -2482,6 +2483,7 @@ int replicaLoadPrimaryRDBFromSocket(connection *conn, char *buf, char *eofmark, } else { /* Remove the half-loaded data in case the load failed for other reasons. 
*/ serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); + bgIteration_flushall(); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); } } @@ -2585,6 +2587,7 @@ int replicaLoadPrimaryRDBFromDisk(rdbSaveInfo *rsi) { } else { /* If disk-based RDB loading fails, remove the half-loaded dataset. */ serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding the half-loaded data"); + bgIteration_flushall(); emptyData(-1, empty_db_flags, replicationEmptyDbCallback); } diff --git a/src/server.c b/src/server.c index 4eb7798a924..ecbae40c2f5 100644 --- a/src/server.c +++ b/src/server.c @@ -54,6 +54,7 @@ #include "util.h" #include "eval.h" +#include "bgiteration.h" #include "trace/trace_commands.h" @@ -3018,8 +3019,11 @@ void initServer(void) { /* Set object metadata size before creating any database key objects */ if (server.forkless_options_supported) { - objectSetMetadataSize(sizeof(uint32_t)); /* This is a placeholder until Threadsave defines a metadata structure */ - /* 4 bytes for iterator_epoch for now*/ + /* NOTE: At this time, there is only one reason for dbEntry metadata. bgIteration. However, + * if/when new metadata options are added, we will need to compute the size of a variable + * size metadata, and provide appropriate accessors to access the specific portion of the + * metadata (each of which may/may not exist, based on immutable startup parameters). 
*/ + objectSetMetadataSize(sizeof(bgIterationEntryMetadata)); } createDatabaseIfNeeded(0); /* The default database should always exist */ @@ -3141,6 +3145,7 @@ void initServer(void) { commandlogInit(); latencyMonitorInit(); initSharedQueryBuf(); + bgIteration_init(); /* Initialize ACL default password if it exists */ ACLUpdateDefaultUserPassword(server.requirepass); @@ -3702,6 +3707,11 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot) if (propagate_to_slot_migration) clusterFeedSlotExportJobs(dbid, argv, argc, slot); } +// If true, a MULTI has been sent to bgIterator. +// Remember to send the matching EXEC in propagatePendingCommands(). +static bool sentMultiToBgIterator = false; +static int lastDbidSentToBgIterator; + /* Used inside commands to schedule the propagation of additional commands * after the current command is propagated to AOF / Replication. * @@ -3714,6 +3724,29 @@ static void propagateNow(int dbid, robj **argv, int argc, int target, int slot) * stack allocated). The function automatically increments ref count of * passed objects, so the caller does not need to. */ void alsoPropagate(int dbid, robj **argv, int argc, int target, int slot) { + if (target & PROPAGATE_REPL && bgIteration_iterationActive()) { + // Note that bgIterator must be invoked immediately after each command. This is required + // for proper processing in the bgIterator state machine. It's NOT ok to call bgIterator + // from propagateNow as that handles all of the commands for a transaction at the end. + // THIS FUNCTION (alsoPropagate) is called after each command. + if (!sentMultiToBgIterator && (scriptIsRunning() || server.in_exec)) { + // For a script or multi/exec, we should be sending the MULTI at the beginning of the + // execution unit. There shouldn't be any commands in the propagation queue yet. + serverAssert(server.also_propagate.numops == 0); + // If this is the first propagated command of a script or multi, make it a transaction. 
+ // It may turn out that there is only 1 command in the MULTI block, but we can't know + // that now. Unlike regular replication, we can't defer all of the replication until + // we know for sure. We must call bgIterator after each command. + static struct serverCommand* cmd_multi = NULL; // STATIC to avoid repeated lookups + if (cmd_multi == NULL) cmd_multi = lookupCommandOrOriginal(&shared.multi, 1); + bgIteration_handleCommandReplication(dbid, cmd_multi, 1, &shared.multi); + sentMultiToBgIterator = true; + } + struct serverCommand* cmd = lookupCommandOrOriginal(argv, argc); + bgIteration_handleCommandReplication(dbid, cmd, argc, argv); + lastDbidSentToBgIterator = dbid; + } + robj **argvcopy; int j; @@ -3780,6 +3813,17 @@ void updateCommandLatencyHistogram(struct hdr_histogram **latency_histogram, int * multiple separated commands. Note that alsoPropagate() is not affected * by CLIENT_PREVENT_PROP flag. */ static void propagatePendingCommands(void) { + // Note: This is done before the check on server.also_propagate.numops. Numops might be zero + // if there is no replica but we might be running bgIteration for something other than + // replication. If we sent the multi (to bgIteration), we need to send the matching exec. + if (sentMultiToBgIterator) { + // If a MULTI was sent to bgIterator via alsoPropagate(), then send the matching EXEC. 
+ static struct serverCommand* cmd_exec = NULL; // STATIC to avoid repeated lookups + if (cmd_exec == NULL) cmd_exec = lookupCommandOrOriginal(&shared.exec, 1); + bgIteration_handleCommandReplication(lastDbidSentToBgIterator, cmd_exec, 1, &shared.exec); + sentMultiToBgIterator = false; + } + if (server.also_propagate.numops == 0) return; int j; @@ -3909,6 +3953,8 @@ int incrCommandStatsOnError(struct serverCommand *cmd, int flags) { * */ void call(client *c, int flags) { + if (bgIteration_blockClientIfRequired(c)) return; + long long dirty; struct ClientFlags client_old_flags = c->flag; diff --git a/src/server.h b/src/server.h index 51db9a38baa..c68dd524592 100644 --- a/src/server.h +++ b/src/server.h @@ -103,7 +103,19 @@ static_assert(sizeof(off_t) >= 8, "off_t must be 64-bit; ensure _FILE_OFFSET_BIT #define dismissMemory zmadvise_dontneed #define VALKEYMODULE_CORE 1 -typedef struct serverObject robj; + +/* serverObject (aka robj) is currently overloaded for 2 purposes. This is a legacy artifact. + * 1. It carries a reference counted STRING (a keyless value) during parsing and command execution. + * 2. It's also used to carry a key/value pair which is inserted into the DB. In this form, the + * value is not limited to being a string. + * + * The typedef "dbEntry" is used to explicitly connote the latter form. It indicates a key/value + * pair which is suitable to exist in the DB. It might be active in the DB, or may be unlinked from + * the DB (but still contains a key/value). The value may be any of the Valkey data types/encodings. + */ +typedef struct serverObject robj; // A keyless string OR a key/value pair +typedef struct serverObject dbEntry; // Explicitly a key/value pair + #include "valkeymodule.h" /* Modules API defines. 
*/ /* Following includes allow test functions to be called from main() */ diff --git a/src/unit/test_bgiteration.cpp b/src/unit/test_bgiteration.cpp new file mode 100644 index 00000000000..7499e53ca52 --- /dev/null +++ b/src/unit/test_bgiteration.cpp @@ -0,0 +1,3747 @@ +//#include +#include "generated_wrappers.hpp" +#include +//#include "amz_assert.h" + +// +// +// ## +// ######: ## +// #######: ## +// ## :## +// ## ## ##.#### .####: ##: :## #### .####: ## ## +// ## :## ####### .######: ## ## #### .######: ##. .## +// #######: ###. ##: :## :## ##: ## ##: :## #: ## :# +// ######: ## ######## ##..## ## ######## :#:.##.:#: +// ## ## ######## ##::## ## ######## # :##:## +// ## ## ## :####: ## ## ## ## ## +// ## ## ###. :# #### ## ###. :# ###::## +// ## ## .####### #### ######## .####### :##..##: +// ## ## .#####: :##: ######## .#####: .## ## +// +// +// +// +// +// +// +// .####. #### +// ###### #### +// :## ##: ## +// ##: :## ##.#### ## ## ## +// ## ## ####### ## :## ## +// ## ## ### :## ## ##: ##. +// ## ## ## ## ## ###:## +// ## ## ## ## ## .## # +// ##: :## ## ## ## ####. +// :## ##: ## ## ##: :### +// ###### ## ## ##### ## +// .####. ## ## .#### ##. +// :## +// ###: +// ### +// +// +// +// +// ### ## ## +// ### ## ## ## +// ###: ## ## ## +// #### ## .####. ####### ##.#### .####: :#### :###.## ## ## +// ##:#: ## .######. ####### ####### .######: ###### :####### :## ## +// ## ## ## ### ### ## ###. ##: :## #: :## ### ### ##: ##. +// ## ## ## ##. .## ## ## ######## :##### ##. .## ###:## +// ## :#:## ## ## ## ## ######## .####### ## ## .## # +// ## #### ##. .## ## ## ## ## . ## ##. .## ####. +// ## :### ### ### ##. ## ###. :# ##: ### ### ### :### +// ## ### .######. ##### ## .####### ######## :####### ## +// ## ### .####. .#### ## .#####: ###.## :###.## ##. +// :## +// ###: +// ### +// +// +// +// ## +// :#### ## +// ##### ## +// ## +// ####### .####. ##.#### ##.#### .####: ##: :## #### .####: ## ## +// ####### .######. ####### ####### .######: ## ## #### .######: ##. 
.## +// ## ### ### ###. ###. ##: :## :## ##: ## ##: :## #: ## :# +// ## ##. .## ## ## ######## ##..## ## ######## :#:.##.:#: +// ## ## ## ## ## ######## ##::## ## ######## # :##:## +// ## ##. .## ## ## ## :####: ## ## ## ## ## +// ## ### ### ## ## ###. :# #### ## ###. :# ###::## +// ## .######. ## ## .####### #### ######## .####### :##..##: +// ## .####. ## ## .#####: :##: ######## .#####: .## ## +// +// +// +// + + + +using namespace ::testing; + +extern "C" { + #include "stdlib.h" + #include "bgiteration.h" + #include "server.h" + //#include "serverassert.h" + #define using usingvar // compile hack + #include "module.h" + #undef using + extern hashtableType commandSetType; + extern dictType keylistDictType; + bool iteratorRepldoneFn(void *privdata); + void iteratorCleanupFn(bool terminated, void *privdata); + void bgIteration_feedIterators(void); + void createSharedObjects(void); + void hashtableDump(hashtable *ht); + void rehashStepExpand(hashtable *ht); // in hashtable.c (non-API) + void bgIteration_unitTestDisableCloning(void); + void bgIteration_unitTestEnableCloning(int item_bytes, int pool_bytes); +} + + +// The private data is a pointer to arbitrary data. This value is used just to +// test that the correct value is passed through. +#define PRIVDATA reinterpret_cast(12345) + +// A bgIteration cleanup function used for testing. +int cleanupCount; +bool cleanupTerminated; +void iteratorCleanupFn(bool terminated, void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + cleanupCount++; + cleanupTerminated = terminated; +} + + +// A bgIteration repldone function used for testing. +int repldoneCount; +bool iteratorRepldoneFn(void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + repldoneCount++; + return true; +} + + +// A more complicated repldone function that can delay the replication done condition. 
+bool isReplDoneReady; +bool iteratorRepldoneFnNotBeingReadyInitially(void *privdata) { + EXPECT_EQ(privdata, PRIVDATA); + // Tests the behavior when the Repl Done function is not ready to be executed: while isReplDoneReady is false, set it and return false; later calls count and return true. + if (!isReplDoneReady) { + isReplDoneReady = true; + return false; + } + repldoneCount++; + return true; +} + + +static const char *logfile = ""; + +/* Most of the bgIteration unit tests are based on a CMD instance with 2 DBs. There are 8 keys in + * each DB. The keys are named A0, B0, C0, D0, E0, F0, G0, H0 for DB-0 and A1, B1, C1, D1, E1, F1, + * G1, H1 for DB-1. There are a number of helper functions to simulate certain key modification + * actions within our test configuration. Note that this is isolated from the actual call to + * processCommand. + * + * Because most of bgIteration is based on an ordered processing of keys, it doesn't matter if we + * are simulating CMD or CME, full scan, or slot-based. The majority of tests are independent of + * these concerns. + * + * However, there are some tests which are unique to these configurations and use a specialized + * derived class to handle the differences. We do not want to duplicate all of the tests for + * the different configurations, but we do want to ensure that each configuration works properly. 
+ * - bgIterationTestCluster - handles tests unique to full scan in cluster mode + * - bgIterationTestClusterSlots - handles tests unique to cluster slot-based iteration + */ +class BgIterationTest : public ::testing::Test { + private: + static const int DB_COUNT = 2; + static const int ITEMS_PER_DB = 8; + + // This is the expected order of the keys when hashed + const char *keys[DB_COUNT][ITEMS_PER_DB] = {{"D0", "G0", "H0", "C0", "F0", "A0", "B0", "E0"}, + {"B1", "C1", "F1", "G1", "E1", "D1", "A1", "H1"}}; + + protected: + static const int TOTAL_ITEMS = DB_COUNT * ITEMS_PER_DB; + static const int LAST_ITEM = TOTAL_ITEMS - 1; + + MockValkey mock; + RealValkey real; + + struct serverCommand dummy_cmd = {0}; + + // Helper functions for accessing the keys. We can access by db(0..1) and seq(0..7) + // or by item number (0..15). + // NOTE: These virtual functions can be overridden in subclasses which may have different item layout. + virtual const char * getKeyAtDbSeq(int db, int seq) { + assert(db < DB_COUNT); + assert(seq < ITEMS_PER_DB); + return keys[db][seq]; + } + + virtual int getDbFromItemNum(int itemNum) { + assert(itemNum < DB_COUNT * ITEMS_PER_DB); + return itemNum / ITEMS_PER_DB; + } + + virtual int getSeqFromItemNum(int itemNum) { + assert(itemNum < DB_COUNT * ITEMS_PER_DB); + return itemNum % ITEMS_PER_DB; + } + + const char * keyStr(int itemNum) { + return getKeyAtDbSeq(getDbFromItemNum(itemNum), getSeqFromItemNum(itemNum)); + } + + int itemNumFromKey(const char * key) { + for (int itemNum = 0; itemNum < DB_COUNT * ITEMS_PER_DB; itemNum++) { + if (strcmp(key, keyStr(itemNum)) == 0) return itemNum; + } + return -1; + } + + + // Do some general initialization before starting the suite. Normally, the tests are run in + // isolation - and this isn't much different than SetUp(). But if running the + // entire test suite together (just manually running the test executable), this gets called + // only once. 
+ static void SetUpTestSuite() { + monotonicInit(); + + bzero(&server, sizeof(server)); + server.hz = 100; + server.logfile = const_cast(logfile); + createSharedObjects(); + + moduleInitModulesSystem(); + + server.commands = hashtableCreate(&commandSetType); + server.orig_commands = hashtableCreate(&commandSetType); + populateCommandTable(); + } + + + static void TearDownTestSuite() { + hashtableRelease(server.commands); + hashtableRelease(server.orig_commands); + } + + + void initializeServerDb(int dbid, int slot_count_bits = 0) { + server.db[dbid] = static_cast(zcalloc(sizeof(serverDb))); + server.db[dbid]->id = dbid; + server.db[dbid]->keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, 0); + server.db[dbid]->expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, 0); + server.db[dbid]->watched_keys = dictCreate(&keylistDictType); + kvstoreExpand(server.db[dbid]->keys, 8, 0, NULL); + } + + + void addKeyAndValObjsToDb(int dbid, sds key, sds val) { + robj *key_obj = createStringObjectFromSds(key); + robj *val_obj = createStringObjectFromSds(val); + dbAdd(server.db[dbid], key_obj, &val_obj); + decrRefCount(key_obj); + } + + + void addKeyToDb(int dbid, const char *key, const char *val) { + addKeyAndValObjsToDb(dbid, sdsnew(key), sdsnew(val)); + } + + + virtual void setupDatabase() { + // For these unit tests, a standard database is constructed. The order of items in the + // hash table is important, and this is validated here. If the hash table + // implementation changes, we will find out quickly at this point. All other tests + // will become invalid! + + server.dbnum = 2; + server.cluster_enabled = false; + server.db = static_cast(zcalloc(sizeof(serverDb *) * server.dbnum)); + + for (int dbid = 0; dbid < server.dbnum; dbid++) { + initializeServerDb(dbid); + } + + // With hashtable, it can be difficult to get our keys spread across different buckets. 
+ // Here we play with hashtable size and rehashing to get comfortable scenarios for testing. + // NOTE: If the hashtable bucketization changes, we'll need to evaluate the tests for + // changes. Since bgIteration processes a bucket at a time, we need to evaluate + // all the tests when bucketization changes. + // As an alternative, we could mock all of the hashtable activity, but it's better if we + // can use the real functionality as much as possible. + + kvstoreExpand(server.db[0]->keys, 16, 0, NULL); + addKeyToDb(0, "A0", "A0"); + addKeyToDb(0, "B0", "B0"); + addKeyToDb(0, "C0", "C0"); + addKeyToDb(0, "D0", "D0"); + addKeyToDb(0, "E0", "E0"); + addKeyToDb(0, "F0", "F0"); + addKeyToDb(0, "G0", "G0"); + addKeyToDb(0, "H0", "H0"); + hashtable *ht = kvstoreGetHashtable(server.db[1]->keys, 0); + hashtablePauseRehashing(ht); + + kvstoreExpand(server.db[1]->keys, 16, 0, NULL); + addKeyToDb(1, "A1", "A1"); + addKeyToDb(1, "B1", "B1"); + addKeyToDb(1, "C1", "C1"); + addKeyToDb(1, "D1", "D1"); + addKeyToDb(1, "E1", "E1"); + addKeyToDb(1, "F1", "F1"); + addKeyToDb(1, "G1", "G1"); + addKeyToDb(1, "H1", "H1"); + // Now, let's increase the size and start a rehash on the 2nd DB. This ensures that + // iteration is working even if a hashtable is in the middle of rehashing. We choose + // a 128 size so that rehashed keys all get unique buckets. + kvstoreExpand(server.db[1]->keys, 128, 0, NULL); + ht = kvstoreGetHashtable(server.db[1]->keys, 0); + rehashStepExpand(ht); // in hashtable.c (non-API) + rehashStepExpand(ht); // and rehash the 2nd bucket also + hashtablePauseRehashing(ht); + + // The bucketization should look like this. Remember that DB-1 is in + // the middle of a rehash, so it has 2 tables. 
+ // + // DB: 0 SLOT: 0 + // Table 0, used 8, exp 2, top-level buckets 4, child buckets 0 + // Bucket 0:1 level:0 + // 0 h2 63, key "D0" + // 1 h2 a5, key "G0" + // 2 h2 ca, key "H0" + // Bucket 0:2 level:0 + // 0 h2 91, key "C0" + // 1 h2 88, key "F0" + // Bucket 0:3 level:0 + // 0 h2 b8, key "A0" + // 1 h2 f5, key "B0" + // 2 h2 13, key "E0" + // Table 1, used 0, exp -1, top-level buckets 0, child buckets 0 + // + // DB: 1 SLOT: 0 + // Table 0, used 3, exp 2, top-level buckets 4, child buckets 0 + // Bucket 0:0 level:0 <- rehashed into table 1 + // Bucket 0:1 level:0 <- rehashed into table 1 + // Bucket 0:2 level:0 + // 0 h2 18, key "B1" + // 1 h2 fd, key "C1" + // Bucket 0:3 level:0 + // 0 h2 6f, key "F1" + // Table 1, used 5, exp 5, top-level buckets 32, child buckets 0 + // Bucket 1:1 level:0 + // 0 h2 ad, key "G1" + // Bucket 1:5 level:0 + // 0 h2 0c, key "E1" + // Bucket 1:12 level:0 + // 0 h2 e9, key "D1" + // Bucket 1:17 level:0 + // 0 h2 36, key "A1" + // Bucket 1:29 level:0 + // 0 h2 9e, key "H1" + // Bucket 1:30 level:0 + + + // In case we need to debug... + // Used to generate comment above, showing bucketization. 
+ if (0) debugPrintBucketInfo(); + + // Validate that the iteration order matches the expected order + for (int db = 0; db < server.dbnum; db++) { + ht = kvstoreGetHashtable(server.db[db]->keys, 0); + hashtableIterator *it = hashtableCreateIterator(ht, 0); + robj *next; + int i = 0; + while (hashtableNext(it, reinterpret_cast(&next))) { + ASSERT_THAT(next, robjEqualsStr(getKeyAtDbSeq(db, i++))); + } + hashtableReleaseIterator(it); + } + } + + + void SetUp() override { + server.main_thread_id = pthread_self(); + server.forkless_options_supported = 1; + objectSetMetadataSize(sizeof(bgIterationEntryMetadata)); + + bgIteration_unitTestDisableCloning(); + + setupDatabase(); + + EXPECT_CALL(mock, aeCreateTimeEvent(_,_,_,_,_)).WillRepeatedly(Return(0)); + bgIteration_init(); + + cleanupCount = 0; + repldoneCount = 0; + isReplDoneReady = false; + + // By default, in tests, we treat items as not having an expiration + //JHB EXPECT_CALL(mock, getExpire(_,_)).WillRepeatedly(Return(-1)); + + // By default, do nothing for these + EXPECT_CALL(mock, blockClientInUseOnKeys(_,_,_)).WillRepeatedly(Return()); + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).WillRepeatedly(Return()); + + // By default, expect no permission issues + EXPECT_CALL(mock, ACLCheckAllUserCommandPerm(_,_,_,_,_,_)).WillRepeatedly(Return(ACL_OK)); + + //JHB EXPECT_CALL(mock, lookupCommandOrOriginal(_)).WillRepeatedly(Return(&dummy_cmd)); + } + + + void TearDown() override { + bgIteration_feedIterators(); // process returning stuff before deleting DB + bgIteration_feedIterators(); // in case an iterator was closed there might be more + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i]->keys) kvstoreRelease(server.db[i]->keys); + if (server.db[i]->expires) kvstoreRelease(server.db[i]->expires); + dictRelease(server.db[i]->watched_keys); + zfree(server.db[i]); + } + zfree(server.db); + } + + + // void update_keys(const char **new_keys, int db, int len) { + // memcpy(keys[db], new_keys, len * sizeof(const 
char *)); + // } + + + + + + + // Deletes an item from the DB (often at the start of a test) - but does NOT notify + // bgIteration. bgIteration_keyDelete() should be explicitly called where needed. + void simpleDelItem(int itemNum) { + int db = getDbFromItemNum(itemNum); + + sds delKey = sdsnew(keyStr(itemNum)); + int rc = kvstoreHashtableDelete(server.db[db]->keys, 0, delKey); + ASSERT_EQ(rc, 1); + sdsfree(delKey); + } + + + // Find the actual dbEntry object by itemNum + dbEntry * getItem(int itemNum) { + int db = getDbFromItemNum(itemNum); + sds key = sdsnew(keyStr(itemNum)); + dbEntry *de = dbFind(server.db[db], key); + sdsfree(key); + return de; + } + + + // The test expects that the next item read will be BGITERATOR_ITEM_COMPLETE + void expectReadComplete(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + EXPECT_EQ(item->type, BGITERATOR_ITEM_COMPLETE); + bgIteratorClose(iter); + + int oldCleanupCount = cleanupCount; + bgIteration_feedIterators(); + EXPECT_EQ(cleanupCount, oldCleanupCount + 1); + } + + + // The test is cleaning up and isn't validating the remaining cleanup + void expectAnythingCleanup(bgIterator *iter) { + while (true) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + if ((item->type == BGITERATOR_ITEM_COMPLETE + || item->type == BGITERATOR_ITEM_TERMINATED)) { + bgIteratorClose(iter); + break; + } + } + bgIteration_feedIterators(); // Recognize the closed iterator + EXPECT_EQ(cleanupCount, 1); + } + + + void expectDictEntryMetadataMatch(dbEntry *de1, dbEntry *de2) { + bgIterationEntryMetadata *dm1 = static_cast(objectGetMetadata(de1)); + bgIterationEntryMetadata *dm2 = static_cast(objectGetMetadata(de2)); + + EXPECT_NE(dm1, nullptr); + EXPECT_NE(dm2, nullptr); + EXPECT_EQ(dm1->iterator_epoch, dm2->iterator_epoch); + } + + + // Useful when debugging new tests. It reads/prints all remaining items then crashes. 
+ void cleanupIteratorDebugPrint(bgIterator *iter) { + bool done = false; + printf("[DEBUG] Printing bgIterator '%s' items:\n", bgIteratorName(iter)); + while (!done) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + switch (item->type) { + case BGITERATOR_ITEM_DBENTRY: + { + auto obj = item->u.dbe.de; + const char * keyStr = objectGetKey(obj); + printf("Entry: %s -> %s [itemNum: %i]\n", + keyStr, + static_cast(objectGetVal(obj)), + itemNumFromKey(keyStr)); + break; + } + case BGITERATOR_ITEM_REPLICATION: + printf("Repl: DB=%d : ", item->dbid); + for (int i = 0; i < item->u.repl.argc; i++) + printf("%s ", static_cast(objectGetVal(item->u.repl.argv[i]))); + printf("\n"); + break; + case BGITERATOR_ITEM_COMPLETE: + case BGITERATOR_ITEM_TERMINATED: + bgIteratorClose(iter); + done = true; + break; + default: + printf("unhandled: %d\n", item->type); + } + } + bgIteration_feedIterators(); // Recognize the closed iterator + ASSERT_TRUE(false); // Halt the test here + } + + + // Make a copy of the metadata + void * cloneMetadata(dbEntry *de) { + int size = objectGetMetadataSize(de); + void *metadata = zmalloc(size); + memcpy(metadata, objectGetMetadata(de), size); + return metadata; + } + + + // Compare a previous metadata copy to an existing entry + void compareAndFreeClonedMetadata(dbEntry *de, void *metadata) { + EXPECT_EQ(memcmp(objectGetMetadata(de), metadata, objectGetMetadataSize(de)), 0); + zfree(metadata); + } + + + // The test expects the next item will be a specific key + // The item value is verified against the default unless provided as a parameter. 
+ void expectReadKey(bgIterator *iter, int itemNum, const char *value=nullptr) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_FALSE(item->u.dbe.is_cloned); + // if (item->u.dbe.is_cloned) { // JHB - wrong place to check this. + // // If the entry is cloned, make sure we copied the metadata + // dbEntry *cloned_dbEntry = item->u.dbe.de; + // dbEntry *original_dbEntry = getItem(itemNum); + // expectDictEntryMetadataMatch(original_dbEntry, cloned_dbEntry); + // } + EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); + if (value) { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } else { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); + } + } + + + // The test expects the next item will be a specific key and that the item is cloned. + // Metadata is tested (to make sure the clone includes the proper metadata). + // The 'metadata' copy passed in is compared against the cloned entry and then freed. + // The item value is verified against the default unless provided as a parameter. + void expectReadClonedKey(bgIterator *iter, int itemNum, void *metadata, const char *value=nullptr) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_TRUE(item->u.dbe.is_cloned); + compareAndFreeClonedMetadata(item->u.dbe.de, metadata); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), keyStr(itemNum)); + if (value) { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } else { + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(keyStr(itemNum))); + } + } + + + // Test expects the next key, but specified by key name, not itemNum. 
+ void expectReadDbKeyValue(bgIterator *iter, int db, const char *key, const char *value) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_DBENTRY); + EXPECT_EQ(item->dbid, db); + EXPECT_STREQ(objectGetKey(item->u.dbe.de), key); + EXPECT_THAT(item->u.dbe.de, robjEqualsStr(value)); + } + + + // Test expects to read a sequence of key items (startItem through endItem, inclusive) + void expectReadKeySequence(bgIterator *iter, int startItem, int endItem) { + for (int i = startItem; i <= endItem; i++) expectReadKey(iter, i); + } + + + // Just like expectReadKey, but also tests that a previous item is becoming unblocked. + void expectReadKeyWithUnblock(bgIterator *iter, int itemNum, int unblockItem, const char *value=nullptr) { + bool blocked = true; + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(unblockItem)))) + .WillOnce(Assign(&blocked, false)); + expectReadKey(iter, itemNum, value); + EXPECT_FALSE(blocked); + } + + + // Test expects to read a replication item matching the command held by client 'c' + void expectReadReplication(bgIterator *iter, client *c) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->dbid, c->db->id); + EXPECT_EQ(item->u.repl.cmd, c->cmd); + EXPECT_EQ(item->u.repl.argc, c->argc); + for (int i = 0; i < c->argc; i++) { + EXPECT_STREQ(static_cast(objectGetVal(item->u.repl.argv[i])), + static_cast(objectGetVal(c->argv[i]))); + } + } + + + // We expect to read a MULTI command which should have been inserted. 
+ void expectReadMultiReplication(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("multi")); + } + + + // We expect to read an EXEC command which should have been inserted. + void expectReadExecReplication(bgIterator *iter) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("exec")); + } + + + // Expecting that a DEL command should have been replicated. + void expectReadReplicationDel(bgIterator *iter, int itemNum) { + int db = getDbFromItemNum(itemNum); + + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + EXPECT_EQ(item->dbid, db); + EXPECT_EQ(item->u.repl.cmd, lookupCommandByCString("DEL")); + EXPECT_EQ(item->u.repl.argc, 2); + EXPECT_THAT(item->u.repl.argv[0], robjEqualsStr("DEL")); + EXPECT_THAT(item->u.repl.argv[1], robjEqualsStr(keyStr(itemNum))); + } + + + // Expecting that a special SWAPDB item has been inserted. + void expectReadSwapDB(bgIterator *iter, int db1, int db2) { + bgIteration_feedIterators(); + bgIteratorItem *item = bgIteratorRead(iter); + bgIteration_feedIterators(); + + ASSERT_EQ(item->type, BGITERATOR_ITEM_SWAPDB); + EXPECT_EQ(item->dbid, db1); + EXPECT_EQ(item->u.dbid2, db2); + } + + + // Used to examine the physical bucket layout in the hash table. Generated the comment + // above which shows each item in each bucket. Necessary if hash table layout changes. + void debugPrintBucketInfo(int num_slots = -1) { + for (int db = 0; db < server.dbnum; db++) { + int n = (num_slots == -1) ? 
kvstoreNumHashtables(server.db[db]->keys) : num_slots; + for (int slot = 0; slot < n; slot++) { + hashtable *ht = kvstoreGetHashtable(server.db[db]->keys, slot); + printf("DB: %d SLOT: %d\n", db, slot); + hashtableDump(ht); + } + } + } + + + // Creates a client with a write command (SET) for the given itemNum + client * getWriteClient(int itemNum, const char *value) { + int db = getDbFromItemNum(itemNum); + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString("set"); + c->db = server.db[db]; + + c->argc = 3; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + c->argv[1] = createStringObjectFromSds(sdsnew(keyStr(itemNum))); + c->argv[2] = createStringObjectFromSds(sdsnew(value)); + + return c; + } + + + // Create a client with a write command that touches multiple keys + client * getWriteMultiKeysClient( + const char * cmdName, + int dstItemNum, + const std::vector & srcItemsNum) { + + assert(!srcItemsNum.empty()); + + const int db = getDbFromItemNum(dstItemNum); + std::for_each(srcItemsNum.cbegin(), srcItemsNum.cend(), [&db, this](int srcItemNum) { + assert(db == getDbFromItemNum(srcItemNum)); + }); + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString(cmdName); + assert(c->cmd != nullptr); + c->db = server.db[db]; + + c->argc = 2 + srcItemsNum.size(); + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + c->argv[1] = createStringObjectFromSds(sdsnew(keyStr(dstItemNum))); + for (unsigned int i = 0; i < srcItemsNum.size(); i++) { + c->argv[2 + i] = createStringObjectFromSds(sdsnew(keyStr(srcItemsNum[i]))); + } + + return c; + } + + + client * getWrite2KeysClient(const char * cmdName, int dstItemNum, int srcItemNum) { + return getWriteMultiKeysClient(cmdName, dstItemNum, {srcItemNum}); + } + + + client * getWrite3KeysClient( + const char * cmdName, 
int dstItemNum, int src1ItemNum, int src2ItemNum) { + return getWriteMultiKeysClient(cmdName, dstItemNum, {src1ItemNum, src2ItemNum}); + } + + + // Create a client with a MULTI/EXEC block. + // This parses a series of commands separated by ';' + // Example: getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx") + client * getMultiClient(const char *commands, int dbid = 0) { + char *commandsCopy = zstrdup(commands); // a mutable copy + char *commandStr, *commandStrSave; + char *token, *tokenSave; + + client *c = static_cast(zcalloc(sizeof(client))); + c->db = server.db[dbid]; + initClientMultiState(c); + c->flag.multi = 1; + c->mstate->cmd_flags |= CMD_WRITE; + + commandStr = strtok_r(commandsCopy, ";", &commandStrSave); + while (commandStr != NULL) { + + token = strtok_r(commandStr, " ", &tokenSave); + c->cmd = lookupCommandByCString(token); + + c->argv = static_cast(zcalloc(sizeof(robj*) * 5)); // command + 4 args + + for (int i = 0; token != NULL; i++) { + c->argv[i] = createStringObject(token, strlen(token)); + c->argc = i+1; + token = strtok_r(NULL, " ", &tokenSave); + } + + queueMultiCommand(c, 0); + freeClientArgv(c); + + commandStr = strtok_r(NULL, ";", &commandStrSave); + } + + c->cmd = lookupCommandByCString("exec"); + c->argc = 1; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew("EXEC")); + + zfree(commandsCopy); + return c; + } + + + // Initially, a MULTI client is set up to execute the EXEC command (which examines the + // contents of the multi/exec block). This function advances the client to begin executing + // the individual commands within the multi/exec block. 
+    void advanceMultiClientToCommand(client *c, int cmdNum) {
+        // cmdNum indexes the commands queued in the client's MULTI state.
+        assert(cmdNum >= 0 && cmdNum < c->mstate->count);
+
+        // NOTE(review): argv is borrowed from the queued command (pointer copy, not a deep
+        // copy) and the client's previous argv is not released here - confirm that
+        // freeTestClient() is safe to call on a client that has been advanced.
+        c->argc = c->mstate->commands[cmdNum].argc;
+        c->argv = c->mstate->commands[cmdNum].argv;
+        c->argv_len = c->mstate->commands[cmdNum].argv_len;
+        c->cmd = c->realcmd = c->mstate->commands[cmdNum].cmd;
+    }
+
+
+    // A client with a fictional command:
+    //      SETGET <key1> <value1> <key2>
+    //      - writes a value to the first key (making this CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY)
+    //      - reads a second key
+    // Both items must map to the same DB (asserted). The command struct is heap allocated
+    // here; freeTestClient() releases it together with the client.
+    client * getSetGetClient(int itemNum1, const char *value1, int itemNum2) {
+        // Fictional command which writes to 1st key and reads the 2nd
+        int db = getDbFromItemNum(itemNum1);
+        assert(db == getDbFromItemNum(itemNum2)); // (this would be a testcase error)
+
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+        struct serverCommand *cmd
+            = static_cast<struct serverCommand *>(zcalloc(sizeof(struct serverCommand)));
+
+        cmd->fullname = const_cast<char *>("SETGET");
+        cmd->arity = 4;
+        cmd->flags = CMD_WRITE | CMD_WRITE_FIRSTKEY_ONLY;
+
+        cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INDEX;
+        cmd->legacy_range_key_spec.bs.index.pos = 1; // firstkey
+        cmd->legacy_range_key_spec.fk.range.lastkey = -1;
+        cmd->legacy_range_key_spec.fk.range.keystep = 2;
+
+        c->cmd = cmd;
+        c->db = server.db[db];
+
+        // argv layout: SETGET key1 value1 key2
+        c->argc = 4;
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj*) * c->argc));
+        c->argv[0] = createStringObjectFromSds(sdsnew(cmd->fullname));
+        c->argv[1] = createStringObjectFromSds(sdsnew(keyStr(itemNum1)));
+        c->argv[2] = createStringObjectFromSds(sdsnew(value1));
+        c->argv[3] = createStringObjectFromSds(sdsnew(keyStr(itemNum2)));
+
+        return c;
+    }
+
+
+    // Client with a fictional write command with no keys specified.
+    // The client is bound to DB 0; the command struct is heap allocated and is released by
+    // freeTestClient().
+    client * getNoKeysWriteClient() {
+        // Fictional command which is marked WRITE, but has no keys.
+        client *c = static_cast<client *>(zcalloc(sizeof(client)));
+        struct serverCommand *cmd
+            = static_cast<struct serverCommand *>(zcalloc(sizeof(struct serverCommand)));
+
+        cmd->fullname = const_cast<char *>("NOKEYSWRITE");
+        cmd->arity = 1;
+        cmd->flags = CMD_WRITE;
+
+        cmd->legacy_range_key_spec.begin_search_type = KSPEC_BS_INVALID; // No keys
+
+        c->cmd = cmd;
+        c->db = server.db[0];
+
+        c->argc = 1;
+        c->argv = static_cast<robj **>(zcalloc(sizeof(robj*) * c->argc));
+        c->argv[0] = createStringObjectFromSds(sdsnew(cmd->fullname));
+
+        return c;
+    }
+
+
+    // Drops one reference on each argv object, frees the argv array and resets argv/argc so
+    // the client can be given a fresh argument vector.
+    void freeClientArgv(client *c) {
+        for (int i = 0; i < c->argc; i++) decrRefCount(c->argv[i]);
+        zfree(c->argv);
+        c->argv = NULL;
+        c->argc = 0;
+    }
+
+
+    // During testing, we create some fake commands. This checks if the command is real or fake.
+    // A fake command is dynamically allocated and can be freed. Real commands are static.
+    //
+    // A command is reported as real when its declared_name resolves through
+    // lookupCommandByCString(). The fake commands built by this fixture are zero-allocated and
+    // only set fullname, so a NULL command or a NULL declared_name is reported as fake without
+    // handing a NULL string to the lookup.
+    bool isRealValkeyCommand(struct serverCommand *cmd) {
+        if (cmd == nullptr || cmd->declared_name == nullptr) return false;
+        return lookupCommandByCString(cmd->declared_name) != nullptr;
+    }
+
+
+    // Releases a client created by one of the get*Client() helpers: its MULTI state, its
+    // argv, its command struct (only when the command is one of the fake, heap-allocated
+    // ones) and finally the client itself.
+    void freeTestClient(client *c) {
+        freeClientMultiState(c);
+        freeClientArgv(c);
+
+        // Real commands must never be freed; zfree(NULL) is harmless if no command was set.
+        if (!isRealValkeyCommand(c->cmd)) zfree(c->cmd);
+
+        zfree(c);
+    }
+
+
+    // Simulate what happens when a write command is blocked.
+    // Expects exactly one blockClientInUseOnKeys() call for `c` carrying
+    // `expectedNumberBlockedKeys` keys, and expects bgIteration to report the client as blocked.
+    void simulateBlockedWrite(client *c, int expectedNumberBlockedKeys = 1) {
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c,expectedNumberBlockedKeys,_)).Times(1);
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_TRUE(blocked);
+    }
+
+
+    // Simulate what happens when a write command isn't blocked.
+    // Expects no blockClientInUseOnKeys() call for `c` and an unblocked result.
+    void simulateUnblockedWrite(client *c) {
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0);
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_FALSE(blocked);
+    }
+
+
+    // Simulate what happens when a write command is NOT blocked, because the key can be cloned
+    // and expedited. This requires a scenario where we would normally need to block the
+    // client so that bgIteration can process the item.
+    void simulateClonedWrite(bgIterator *it, client *c) {
+        // Remember how many clones were queued before the write is attempted.
+        bgIteratorStatus status;
+        bgIteratorGetStatus(it, &status);
+        unsigned long initialClones = status.dbentry_clones_queued;
+
+        // Client should not get blocked
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0);
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_FALSE(blocked);
+
+        // Ensure that cloning took place (exactly one more clone queued)
+        bgIteratorGetStatus(it, &status);
+        EXPECT_EQ(status.dbentry_clones_queued, (initialClones + 1));
+
+        // Ensure that the real item isn't inuse (because we cloned it instead).
+        // argv[1] is the key of the write command.
+        dbEntry *de = dbFind(c->db, static_cast<sds>(objectGetVal(c->argv[1])));
+        ASSERT_FALSE(bgIteration_isEntryInuse(de));
+    }
+
+
+    // Simulates what happens when a write command (SET) actually executes.  This requires a
+    // scenario where we would NOT be blocked on the write.  It actually alters the value of
+    // the key and updates the metadata.
+    // The client must carry the key in argv[1] and the new value in argv[2] (as built by
+    // getWriteClient()). The command is finally handed to bgIteration for replication.
+    void simulateUnblockedWriteWithModification(client *c) {
+        EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0);
+        bool blocked = bgIteration_blockClientIfRequired(c);
+        EXPECT_FALSE(blocked);
+
+        // Fake execution of the command - touch the iterator_epoch counter and swap the value
+        // We need to duplicate the value because setKey() can reallocate it.
+        robj *value = dupStringObject(c->argv[2]);
+        setKey(c, c->db, c->argv[1], &value, SETKEY_ADD_OR_UPDATE);
+
+        // Let's make sure that setKey updated the iteration epoch (as it should have).
+        // Fail the test cleanly instead of dereferencing NULL if the key or its metadata is
+        // unexpectedly missing after the write.
+        dbEntry *de = dbFind(c->db, static_cast<sds>(objectGetVal(c->argv[1])));
+        ASSERT_NE(de, nullptr);
+        bgIterationEntryMetadata *md = static_cast<bgIterationEntryMetadata *>(objectGetMetadata(de));
+        ASSERT_NE(md, nullptr);
+        EXPECT_EQ(md->iterator_epoch, bgIteration_getEpoch());
+
+        bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv);
+    }
+
+
+    // Simulate execution of a MULTI/EXEC transaction for a client `c` without blocking.
+    // It replays all queued commands and ensures replication matches a real transaction.
+ // command replication flag is revalidated when exec command is processed. + // This requires a scenario where we don't expect the client to be blocked. + void simulateUnblockedMultiExec(client *c) { + + // simulate EXEC command of the multi/exec client + simulateUnblockedWrite(c); + server.in_exec = 1; + + // If there are other commands, call both blockClientIfRequired and handleCommandReplication for each of the command. + for (int i = 0; i < c->mstate->count; i++) { + advanceMultiClientToCommand(c, i); + simulateUnblockedWrite(c); + + // Replicate MULTI if this is the first instruction inside MULTI/EXEC + if (i == 0) { + robj *argv[1]; + argv[0] = createStringObjectFromSds(sdsnew("multi")); + bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("multi"), 1, argv); + decrRefCount(argv[0]); + } + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + } + + // Call handleCommandReplication for EXEC + robj *argv[1]; + argv[0] = createStringObjectFromSds(sdsnew("EXEC")); + bgIteration_handleCommandReplication(c->db->id, lookupCommandByCString("exec"), 1, argv); + server.in_exec = 0; + decrRefCount(argv[0]); + } + + + // Simulate the expiration (active expiration) of a key. This is independent of command execution. + void simulateExpiration(int itemNum) { + ASSERT_NE(getItem(itemNum), nullptr); // Should be there before expire + + // NOTE: This seems weird, but Valkey propagates the delete before actually expiring the + // key. BgIterator expects this behavior and expects the key to exist when the + // DEL is received for propagation. 
+ + // Send bgIteration the DEL + int db = getDbFromItemNum(itemNum); + sds sdsKey = sdsnew(keyStr(itemNum)); + robj *argv[2]; + argv[0] = createStringObjectFromSds(sdsnew("DEL")); + argv[1] = createStringObjectFromSds(sdsdup(sdsKey)); + serverCommand *cmd = lookupCommandByCString("DEL"); + bgIteration_handleCommandReplication(db, cmd, 2, argv); + decrRefCount(argv[0]); + decrRefCount(argv[1]); + + bgIteration_keyDelete(db, sdsKey); + simpleDelItem(itemNum); // Simulate the actual del + + EXPECT_EQ(getItem(itemNum), nullptr); + sdsfree(sdsKey); + } + + + // Simulates an expiration, but validates behavior for an item inuse by bgIteration. + void simulateExpirationOfInuse(int itemNum) { + // An inuse item will have a refcount > 1. BgIteration should have incremented the + // refcount while it is inuse. + dbEntry *de = getItem(itemNum); + ASSERT_NE(de, nullptr); // Should be there before expire + EXPECT_TRUE(bgIteration_isEntryInuse(de)); + EXPECT_EQ(de->refcount, 2u); + + simulateExpiration(itemNum); + + // At this point, the item is removed from the DB, but still exists, and the refcount + // has been reduced to 1. This allows a background thread to continue using the item. + EXPECT_EQ(de->refcount, 1u); + } + + + // Simulates an expiration, but the item is a future item which will be expedited. + void simulateExpirationWithExpedite(int itemNum) { + // An inuse item will have a refcount > 1. BgIteration should have incremented the + // refcount while it is inuse. + dbEntry *de = getItem(itemNum); + ASSERT_NE(de, nullptr); // Should be there before expire + EXPECT_FALSE(bgIteration_isEntryInuse(de)); // Not yet inuse + EXPECT_EQ(de->refcount, 1u); + + simulateExpiration(itemNum); + + // At this point, the item is removed from the DB, but still exists, and the refcount + // has been reduced to 1. This allows a background thread to continue using the item. 
+ EXPECT_TRUE(bgIteration_isEntryInuse(de)); // It's inuse now + EXPECT_EQ(getItem(itemNum), nullptr); // but it's not in the DB anymore + EXPECT_EQ(de->refcount, 1u); + } + + + // Simulate execution of a SWAPDB command + void simulateSwapDB(int dbid0, int dbid1) { + char dbStr[2] = {0}; + + client *c = static_cast(zcalloc(sizeof(client))); + + c->cmd = lookupCommandByCString("swapdb"); + c->db = server.db[0]; + + c->argc = 3; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + dbStr[0] = '0' + dbid0; + c->argv[1] = createStringObjectFromSds(sdsnew(dbStr)); + dbStr[0] = '0' + dbid1; + c->argv[2] = createStringObjectFromSds(sdsnew(dbStr)); + + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // SWAPDB should never block + + // The real SWAP does more than this, but this is enough for unit tests + serverDb *aux = server.db[dbid0]; + server.db[dbid0] = server.db[dbid1]; + server.db[dbid1] = aux; + + bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv); + + freeTestClient(c); + } + + + // Simulate execution of a FLUSHDB or FLUSHALL command + void simulateFlushDB(int db, int anInUseItem) { + client *c = static_cast(zcalloc(sizeof(client))); + + if (db == -1) { + c->cmd = lookupCommandByCString("flushall"); + c->db = server.db[0]; + } else { + c->cmd = lookupCommandByCString("flushdb"); + c->db = server.db[db]; + } + + c->argc = 1; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + + dbEntry *de_in_use = getItem(anInUseItem); + EXPECT_EQ(de_in_use->refcount, 2u); + + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // FLUSHDB should never block + + // The real FLUSH does more than this, but this is enough for unit tests + + // Now flush the items + for (int d = 0; d < server.dbnum; d++) { + if (db == -1 || db == d) { + 
kvstoreRelease(server.db[d]->keys); + server.db[d]->keys = NULL; + } + } + + EXPECT_EQ(de_in_use->refcount, 1u); + + // and replicate + + bgIteration_handleCommandReplication(0, c->cmd, c->argc, c->argv); + + freeTestClient(c); + } +}; + +using BgIterationDeathTest = BgIterationTest; + + +TEST_F(BgIterationTest, dbIsOK) { + // Just run the setup/teardown code to make sure the DB is OK. +} + + +///////////////////////////////////////////////////// +// Simple Full-scan iterator tests +///////////////////////////////////////////////////// + +// A simple full scan that just checks basic flow. +TEST_F(BgIterationTest, createAndCleanup) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + 0, NULL, iteratorCleanupFn, PRIVDATA); + EXPECT_EQ(bgIteratorFind("simple"), it); + EXPECT_STREQ(bgIteratorName(it), "simple"); + + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + + EXPECT_EQ(status.dbentries_queued, 0u); + EXPECT_EQ(status.dbentries_processed, 0u); + EXPECT_EQ(status.replication_queued, 0u); + EXPECT_EQ(status.replication_processed, 0u); + EXPECT_EQ(status.swapdb_queued, 0u); + EXPECT_EQ(status.swapdb_processed, 0u); + EXPECT_EQ(status.flushdb_queued, 0u); + EXPECT_EQ(status.flushdb_processed, 0u); + + EXPECT_EQ(status.queue_length, 0u); + EXPECT_GT(status.queue_length_target, 0u); + + EXPECT_LT(status.runtime_ms, 5u); + EXPECT_EQ(status.current_item_ms, 0u); + + expectAnythingCleanup(it); + + EXPECT_EQ(bgIteratorFind("simple"), nullptr); +} + + +// Close client before reading anything +TEST_F(BgIterationTest, testClientCloseBeforeRead) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + 0, NULL, iteratorCleanupFn, PRIVDATA); + bgIteration_feedIterators(); + + bgIteratorClose(it); // Immediately close before reading + + bgIteration_feedIterators(); // Recognize the closed iterator + + // Check that the cleanup callback was executed properly + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Test that the full 
scan hits each item in the expected sequence. +TEST_F(BgIterationTest, orderedIteration) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, LAST_ITEM); + + // Quick status check. At this point, item #9 hasn't been returned yet. + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentries_queued, static_cast(TOTAL_ITEMS)); + EXPECT_EQ(status.dbentries_processed, static_cast(TOTAL_ITEMS) - 1); + + expectReadComplete(it); // Returns item #9, and reads the completion item + + // Check that the cleanup callback was executed properly + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); +} + + +// Test that two simultaneous iterations work properly. +TEST_F(BgIterationTest, twoOrderedIterations) { + bgIterator *it1 = bgIteratorCreateFullScanIter("simple1", + 0, NULL, iteratorCleanupFn, PRIVDATA); + bgIterator *it2 = bgIteratorCreateFullScanIter("simple2", + 0, NULL, iteratorCleanupFn, PRIVDATA); + EXPECT_EQ(bgIteratorFind("simple1"), it1); + EXPECT_EQ(bgIteratorFind("simple2"), it2); + + int it1Count = 0; + int it2Count = 0; + while (it1Count < TOTAL_ITEMS || it2Count < TOTAL_ITEMS) { + // Randomly read from either iterator + if ((rand() % 2) == 0) { + if (it1Count < TOTAL_ITEMS) expectReadKey(it1, it1Count++); + } else { + if (it2Count < TOTAL_ITEMS) expectReadKey(it2, it2Count++); + } + } + + // Nothing left but to read the final completions + expectReadComplete(it1); + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); + expectReadComplete(it2); + EXPECT_EQ(cleanupCount, 2); + EXPECT_FALSE(cleanupTerminated); +} + + +///////////////////////////////////////////////////// +// MODIFY A FUTURE ITEM +// The next tests validate the basic pattern when a key, not yet iterated, is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). 
+///////////////////////////////////////////////////// + +// Modify a future item, without replication or consistency. +// Our expectation for this case is that the modification should proceed without blocking, the item +// shouldn't be expedited, and we will see the modified item once the iterator reaches it. +TEST_F(BgIterationTest, modFutureItem_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c = getWriteClient(6, "xxx"); + + // We DON'T expect the client to be blocked - not consistent + simulateUnblockedWriteWithModification(c); + + // Now continue reading, 1, 2, 3, 4, 5 + expectReadKeySequence(it, 1, 5); + + // Let's validate that key 6 shows the new value + expectReadKey(it, 6, "xxx"); + + // Continue... + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a future item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification SHOULD be blocked, as we have to save +// the item in its state before the modification. To reduce blocking time, the item should be +// moved to the head of the queue - there's no replication in this case, so out-of-order processing +// isn't a concern. +TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). 
+ // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c = getWriteClient(6, "xxx"); + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c); + + // On a consistent iterator, the event is expedited in front of items already in queue! + // Read key 6 out of order. + expectReadKey(it, 6); + + // Now, when we read key 1, key 6 is released back to Valkey, and the client will be unblocked. + expectReadKeyWithUnblock(it, 1, 6); + simulateUnblockedWriteWithModification(c); // Now the write can proceed + + // Continue... + expectReadKeySequence(it, 2, 5); + // 6 has already been processed + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a future item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification should proceed without blocking, as the +// mode is inconsistent. We don't expect replication, as we haven't reached the item yet. We'll +// see the modified item later. +TEST_F(BgIterationTest, modFutureItem_YesReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c = getWriteClient(6, "xxx"); + + // We DON'T expect the client to be blocked - not consistent + simulateUnblockedWriteWithModification(c); + + // NOTE: Since we haven't reached this item yet, and consistency is not required, there's no + // need to replicate this command. So everything should wrap up just fine - we will see + // the new value when we get to it. 
+ + // Now continue reading, 1, 2, 3, 4, 5 + expectReadKeySequence(it, 1, 5); + + // Let's validate that key 6 shows the new value + expectReadKey(it, 6, "xxx"); + + // Continue... + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// There's no current use case for CONSISTENT with REPLICATION. It's included for completeness +// and to clarify the functionality of the design. However, if this combination were to be used, +// it would be invalid in the presence of SWAPDB. +TEST_F(BgIterationDeathTest, modFutureItem_YesReplication_YesConsistent_fail) { + // Note: This configuration (CONSISTENT with REPLICATION) is invalid unless in cluster mode. + // The issue is that having multiple databases with SWAPDB support creates a problem. How is it + // possible to maintain a CONSISTENT view with a SWAPDB impacting the values seen in the + // replication stream? (Cluster mode doesn't support SWAPDB, so no issue there.) + EXPECT_DEATH(bgIteratorCreateFullScanIter("iter", BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, NULL, NULL), ""); +} + + +///////////////////////////////////////////////////// +// MODIFY A CURRENT ITEM +// The next tests validate the basic pattern when a key, currently in use, is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). +///////////////////////////////////////////////////// + +// Modify a current item, without replication or consistency. +// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't +// be expedited (it's already in use). +TEST_F(BgIterationTest, modCurrentItem_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. 
Keys 1 & 2 are queued (they are all in the same bucket). + client *c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication) + + // Continue... + expectReadKeySequence(it, 4, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a current item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification SHOULD be blocked, the item shouldn't +// be expedited (it's already in use). +TEST_F(BgIterationTest, modCurrentItem_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + client *c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write won't affect anything (past key, no replication) + + // Continue... + expectReadKeySequence(it, 4, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a current item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification SHOULD be blocked. After the key is processed, +// the write will proceed, and the replication will be sent. 
+TEST_F(BgIterationTest, modCurrentItem_YesReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + client *c = getWriteClient(2, "xxx"); + + // Must be blocked since key is queued + simulateBlockedWrite(c); + + // Now continue reading + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKeyWithUnblock(it, 3, 2); + simulateUnblockedWriteWithModification(c); // the actual write will cause replication + + expectReadKey(it, 4); // 4 got put in queue when 3 was read + + expectReadReplication(it, c); + + // Continue... + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +#ifdef CODE_NOT_READY_YET +TEST_F(BgIterationTestCluster, modCurrentItem_YesReplication_YesConsistent_cluster) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. All other keys are queued. + client *c = getWriteClient(1, "xxx"); + + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c); + + // Not expedited because item is already in queue + expectReadKey(it, 1); + expectReadKeyWithUnblock(it, 2, nullptr, 1); // reading original/unmodified item + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 3); // 2, 3 & 4 are in the same bucket, so the replication comes after + expectReadKey(it, 4); + expectReadReplication(it, c); + + // Continue... 
+ expectReadComplete(it); + freeTestClient(c); +} +#endif + + +///////////////////////////////////////////////////// +// MODIFY A PAST ITEM +// The next tests validate the basic pattern when a key that has already been iterated (returned to the reader) is modified. +// Each variation of iteration flags is tested. +// Note that these tests execute without cloning (cloning is tested elsewhere). +///////////////////////////////////////////////////// + +// Modify a past item, without replication or consistency. +// Our expectation for this case is that the modification should proceed without blocking. +// No replication is generated and keys are processed similar to no modification. +TEST_F(BgIterationTest, modPastItem_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Continue... + expectReadKeySequence(it, 2, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a past item, without replication but with consistency. (Like a SAVE operation) +// Our expectation for this case is that the modification should proceed without blocking. +// No replication is generated and keys are processed similar to no modification. +TEST_F(BgIterationTest, modPastItem_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. 
+ client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Continue... 
+ expectReadKeySequence(it, 2, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Modify a past item, with replication but without consistency. (Like a Threadsave Full Sync operation) +// Our expectation for this case is that the modification should proceed without blocking. +// Replication will be sent. +TEST_F(BgIterationTest, modPastItem_YesReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Key 2 was already in queue (same bucket as key 1). The replication will follow. + expectReadKey(it, 2); + expectReadReplication(it, c); + + // Continue... + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +#ifdef CODE_NOT_READY_YET +TEST_F(BgIterationTestCluster, modPastItem_YesReplication_YesConsistent_cluster) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // This read returns key 0 (making it a past item) + expectReadKey(it, 1); + + // At this point, key 0 is returned. + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + // Keys 2, 3, and 4 were already in queue. The replication will follow. 
+ expectReadKey(it, 2); + expectReadKey(it, 3); + expectReadKey(it, 4); + expectReadReplication(it, c); + + expectReadComplete(it); + freeTestClient(c); +} +#endif + + +///////////////////////////////////////////////////// +// TESTS FOR ITEM CLONING +///////////////////////////////////////////////////// + +// In a consistent iteration, verify that a simple string is properly cloned, and that a write can +// occur without blocking. Validate the cloned item and metadata. +TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent_CloneExpeditedItem) { + // Initialize cloning configurations. + bgIteration_unitTestEnableCloning(50, 100); + + bgIteratorStatus status; + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c = getWriteClient(6, "xxx"); + + // Quick status check. At this point, no clones exist yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 0u); + + // Since item 6 should be cloned, it will not block the client, allowing the write. + void *de6_md = cloneMetadata(getItem(6)); + simulateClonedWrite(it, c); // This wouldn't block, and queues the cloned value + simulateUnblockedWriteWithModification(c); // This modifies the real entry in the de (touching metadata) + + // At this point, one clone is in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // On a consistent iterator, the event is expedited in-front of items already in queue! + // Read key 6 (which is cloned) out of order. The value will still match the key. + expectReadClonedKey(it, 6, de6_md); // Also validates and frees the metadata + + // Quick status check. 
At this point, cloned items have not been marked as processed yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 0u); + + // Reading key 1 will release key 6, and the clone will finish processing. + expectReadKey(it, 1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Now, when we read key 2 should not have an impact on number of processed clones. + expectReadKey(it, 2); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Continue... + expectReadKeySequence(it, 3, 5); + // 6 has already been processed + expectReadKeySequence(it, 7, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Check that cloning for simple strings is respecting the size limits and pool size. On a +// consistent iteration, we expect to block or clone on all future keys. We validate that we can +// clone if the item is small enough and the cloning pool has more space left. +TEST_F(BgIterationTest, modFutureItem_NoReplication_YesConsistent_LargeItemOrClonePoolFull) { + // Initialize cloning configurations to test the clone pool functionality first. + bgIteration_unitTestEnableCloning(50, 50); + + bgIteratorStatus status; + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1 & 2 are queued (they are all in the same bucket). + // Fake a modification to a later key so that we can see if it gets processed out of order. + client *c6 = getWriteClient(6, "xxx"); + client *c7 = getWriteClient(7, "xxx"); + client *c8 = getWriteClient(8, "xxx"); + + // Quick status check. At this point, no clones exist yet. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 0u); + + // Since item 6 should be cloned, it will not block the client, allowing the write. 
+ void *de6_md = cloneMetadata(getItem(6)); + simulateClonedWrite(it, c6); + simulateUnblockedWriteWithModification(c6); + + // At this point, one clone is in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // Now that cloning pool is full, item 7 will not be cloned and the client will be blocked. + simulateBlockedWrite(c7); + ASSERT_TRUE(bgIteration_isEntryInuse(getItem(7))); + + // There is still only one cloned item in the queue. + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_queued, 1u); + + // Now change cloning configurations to test that large items will not be cloned. We adjust + // the clone pool size to allow two items, but set the maximum item size to be smaller than + // the size of item 8. The clone pool size must be larger than the total size of the existing + // clones plus the maximum item clone size. + bgIteration_unitTestEnableCloning(1, 101); + + // This write will pass the clone pool check but fail the item size check, blocking the client. + simulateBlockedWrite(c8); + ASSERT_TRUE(bgIteration_isEntryInuse(getItem(8))); + + // On a consistent iterator, the expedited item is placed in front of items already in queue! + // Read key 6 out of order. + expectReadClonedKey(it, 6, de6_md); + + // Now, when we expect to read key 7, which was expedited, key 6 will be released back to Valkey + // and the clone will be deallocated here. + expectReadKey(it, 7); + + // Now, when we read key 8, which was expedited, key 7 is released back to Valkey, and the client + // will be unblocked. + // (actually, unblock is called after every key [just in case] - but functionally we only care + // about this one) + expectReadKeyWithUnblock(it, 8, 7); + simulateUnblockedWriteWithModification(c7); + + // Now, when we read key 1, key 8 is released back to Valkey, and the client will be unblocked. 
+ expectReadKeyWithUnblock(it, 1, 8); + simulateUnblockedWriteWithModification(c8); + + // Since only one item was cloned, there should be one clone processed + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentry_clones_processed, 1u); + + // Continue... + expectReadKeySequence(it, 2, 5); + // 6, 7, and 8 have already been processed + expectReadKeySequence(it, 9, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c6); + freeTestClient(c7); + freeTestClient(c8); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO MODIFICATION OF TWO ITEMS +// When 2 keys are modified, we need to ensure that both keys have been sent before we can send +// replication. This means that if replication is present, we may have to block/expedite for +// future keys, even in the inconsistent scenario. +///////////////////////////////////////////////////// + +// Replication enabled, but NOT consistent. In this case, if ANY of the keys have been iterated, +// ALL of the keys must be replicated so that the command can be processed properly on the replica. +TEST_F(BgIterationTest, modPastFutureItem_YesReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Even though key 12 is for READ in this command, it must be expedited so that it exists before + // the associated replication is sent. + client *c = getSetGetClient(8, "xxx", 12); + simulateBlockedWrite(c); + + // Key 12 will be expedited, but not in front of existing items in queue (can only do that for + // consistent iterators) - JHB How about cluster mode? 
+ + expectReadKey(it, 10); + expectReadKey(it, 12); // expedited + expectReadKeyWithUnblock(it, 11, 12); // 13 is now in queue + + simulateUnblockedWriteWithModification(c); + + // Continue... + expectReadKey(it, 13); + expectReadReplication(it, c); + + expectReadKeySequence(it, 14, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// Replication NOT enabled. A read-only key doesn't need to be expedited, even if other keys have +// been processed already. (This should work identically for both consistent/non-consistent.) +TEST_F(BgIterationTest, modPastFutureItem_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter1", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. + expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Since there's no replication, we don't have to worry about expediting 12. The write will + // proceed without blocking. + client *c = getSetGetClient(8, "xxx", 12); + simulateUnblockedWriteWithModification(c); + + // Key 12 will not be expedited. Remaining keys should be received in normal order. + expectReadKeySequence(it, 10, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + +TEST_F(BgIterationTest, modPastFutureItem_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter2", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + // In this test, we need a past and future key IN THE SAME DB (they're used in the same command). + // DB1 has lots of buckets. After reading item 9, + // 8 will be past, 10 will be in queue, 11-15 will be future. 
+ expectReadKeySequence(it, 0, 9); + + // We're going to write to key 8 (past) and read from key 12 (future) + // Since there's no replication, we don't have to worry about expediting 12. The write will + // proceed without blocking. + client *c = getSetGetClient(8, "xxx", 12); + simulateUnblockedWriteWithModification(c); + + // Key 12 will not be expedited. Remaining keys should be received in normal order. + expectReadKeySequence(it, 10, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +///////////////////////////////////////////////////// +// TESTS RELATED TO MISSING ITEMS +// Missing items are tricky. A missing item might be logically located in the past or future, in +// relation to the current iteration position. The command may (or may not) create the "missing" +// key. Some general considerations: +// * In a consistent iteration, a missing key didn't exist at the time of consistency, or it was +// already processed (saved) at the time of the deletion. If the missing key gets created, we +// must be sure to skip it if we later iterate over it. +// * In a non-consistent iteration with replication: +// * If the key location is already passed, the replication is sent, allowing the key to be +// created (or not) based on the replication. +// * If the key location is in the future, we can allow the command to proceed, without +// replication. If the key is created, we will process it when the iterator gets to it. 
+// +// We expect: +// no-repl, no-consist: past items are ignored - future items are processed when iterated +// no-repl, yes-consist: past items are ignored - future items are ignored +// yes-repl, no-consist: past item skipped, but replicated - future items are created by replication and skipped later +// yes-repl, yes-consist: past item skipped, but replicated - future items are processed when iterated +///////////////////////////////////////////////////// + +// no-repl, no-consist: creation of PAST item has no impact +TEST_F(BgIterationTest, missingPastItem_NoReplication_NoConsistent) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// no-repl, yes-consist: creation of PAST item has no impact +TEST_F(BgIterationTest, missingPastItem_NoReplication_YesConsistent) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 3, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// yes-repl, no-consist: creation of a PAST item will be replicated +TEST_F(BgIterationTest, missingPastItem_YesReplication_NoConsistent) { + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + + client *c = getWriteClient(0, "xxx"); + 
simulateUnblockedWriteWithModification(c); // replication will be added after item 4 (3,4 in same bucket) + + expectReadKey(it, 4); + + expectReadReplication(it, c); + + expectReadKeySequence(it, 5, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +#ifdef CODE_NOT_READY_YET +// yes-repl, yes-consist: creation of a PAST item will be replicated +TEST_F(BgIterationTestCluster, missingPastItem_YesReplication_YesConsistent) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + simpleDelItem(0); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 1); + expectReadKey(it, 2); + + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // replication will be added after item 4 (2, 3, and 4 in same bucket) + + expectReadKey(it, 3); + expectReadKey(it, 4); + expectReadReplication(it, c); + + expectReadComplete(it); + freeTestClient(c); +} +#endif + + +// no-repl, no-consist: creation of FUTURE item is seen when reached by the iteration. +TEST_F(BgIterationTest, missingFutureItem_NoReplication_NoConsistent) { + // Using DB1 so we have lots of buckets + // Note: Choosing item 14 because it's in the portion of DB1 that's already rehashed. So we + // know that the item won't be moving when we re-add it. + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + const char * newValue = "xxx"; + client *c = getWriteClient(14, newValue); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 13); + + // We expect to see item 14. + // Note that for an inconsistent DB view, it is logically undefined if this value is seen (or not). 
+ // But as implemented, we should see it and the test is helpful to understand if/when the + // functionality changes. + expectReadKey(it, 14, newValue); + + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// no-repl, yes-consist: creation of FUTURE item is ignored by consistent iteration. +TEST_F(BgIterationTest, missingFutureItem_NoReplication_YesConsistent) { + // Using DB1 so we have lots of buckets + // Note: Choosing item 14 because it's in the portion of DB1 that's already rehashed. So we + // know that the item won't be moving when we re-add it. + simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + client *c = getWriteClient(14, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 13); + // Key 14 is missing - it didn't exist at start of consistent iteration + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// yes-repl, no-consist: creation of FUTURE item is handled by the replication, and then the key is +// later skipped (treated like an early iteration case). +TEST_F(BgIterationTest, missingFutureItem_YesReplication_NoConsistent) { + // Using DB1 so we have lots of buckets + // Note: Choosing item 14 because it's in the portion of DB1 that's already rehashed. So we + // know that the item won't be moving when we re-add it. 
+ simpleDelItem(14); // Delete the item before iterator creation + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Items 1 & 2 are in queue (same bucket) + + client *c = getWriteClient(14, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKeySequence(it, 1, 2); + + expectReadReplication(it, c); // Here's the replication creating item 14 + + expectReadKeySequence(it, 3, 13); + // We expect item 14 to be skipped, because it was created by the earlier replication + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + +#ifdef CODE_NOT_READY_YET +TEST_F(BgIterationTestCluster, missingFutureItem_YesReplication_YesConsistent) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + simpleDelItem(4); + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + + bgIteration_feedIterators(); // Make sure we get key 0 and 1 into the queue + + client *c = getWriteClient(4, "xxx"); + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + expectReadReplication(it, c); + + expectReadKey(it, 2); + expectReadKey(it, 3); + + // The replication was read - we don't want to see the key now - #4 should be skipped + + expectReadComplete(it); + freeTestClient(c); +} +#endif + + +///////////////////////////////////////////////////// +// TESTS RELATED TO EXPIRATION +// Expiration can be tricky. When pre-evaluating a command with bgIteration_blockClientIfRequired, +// a key might exist, but be ready for expiration. Then, as the command executes, the key expires +// and gets deleted before the write operation. Consider SET K V. +// In the unexpired case, this appears to bgIteration as a single SET command (which replaces the value). 
+// In the expired case, bgIteration will receive a DEL followed by a SET. +// +// Another case is a READ command. A read command won't cause the client to be blocked. However, +// if the key is expired, this will cause a DEL. For consistent processing, this key might need to +// be expedited so that it can be processed before it gets deleted. In this case, the key is +// unlinked from the main Valkey dictionary, but the actual deletion is deferred. +///////////////////////////////////////////////////// + +TEST_F(BgIterationTest, expireKeys_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we no longer care + simulateExpirationOfInuse(2); // Current - it's inuse + simulateExpiration(5); // Future - we don't care (non-consistent) + + expectReadKeySequence(it, 2, 4); + // key 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. 
+ + simulateExpiration(0); // Past - we expect replication + simulateExpirationOfInuse(2); // Current - it's inuse, but we expect replication + simulateExpiration(5); // Future - we don't care (non-consistent) + + expectReadKey(it, 2); // this was already queued + expectReadReplicationDel(it, 0); // Past item should replicate + expectReadReplicationDel(it, 2); // Current item should replicate + // Item 5 is a future item and doesn't need to replicate + + expectReadKeySequence(it, 3, 4); + // Item 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +TEST_F(BgIterationTest, expireKeys_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // At this point, key 1 is active, key 2 is in queue. + + simulateExpiration(0); // Past - we no longer care + simulateExpirationOfInuse(2); // Current - we must defer + simulateExpirationWithExpedite(5); // Future - will become inuse and expedited for consistency + + expectReadKey(it, 5); // Expedited to front + + expectReadKeySequence(it, 2, 4); + // Item 5 has been deleted + expectReadKeySequence(it, 6, LAST_ITEM); + expectReadComplete(it); +} + + +// Special case during a non-consistent iteration with replication and expiration. +// 1. A future key is created (and processed by its replication) - considered early iterated +// 2. Later the key is expired and deleted during command processing (causes DEL to be sent) - no longer early iterated +// 3. The key is recreated as part of the command processing (and this command was replicated) - again early iterated +// 4. Finally, when we iterate to the key, it shouldn't be sent, because it was replicated in step 3. 
+TEST_F(BgIterationTest, expireKeys_Replication_NoConsistent_FutureKeyCreatedThenExipredDuringSet) { + simpleDelItem(8); // Start with a missing future item + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); // Get the iterator started + + client *c = getWriteClient(8, "xxx"); + simulateUnblockedWriteWithModification(c); // Not blocked because this is a future key (but we expect repl) + + // Now do it again, but break out the steps so that we can simulate an expiration + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); // Shouldn't be blocked because this is a future key + + // Now, as the SET command tries to execute, simulate that the key is expired. Expiration + // processing sends the replication FIRST! + robj *argv[2]; + argv[0] = createStringObjectFromSds(sdsnew("DEL")); + argv[1] = c->argv[1]; + serverCommand *cmd = lookupCommandByCString("DEL"); + bgIteration_handleCommandReplication(getDbFromItemNum(8), cmd, 2, argv); + decrRefCount(argv[0]); + + // Now the call to keyDelete happens (after the replication). + bgIteration_keyDelete(getDbFromItemNum(8), static_cast(objectGetVal(c->argv[1]))); + simpleDelItem(8); // Simulate the actual del + + // Now the SET will run, re-creating the item (which is still a future item) + // We need to duplicate the value because setKey() can reallocate it. 
+ robj *value = dupStringObject(c->argv[2]); + setKey(c, c->db, c->argv[1], &(value), SETKEY_ADD_OR_UPDATE); + + // Finally, replication will be sent because this is creating a new key + bgIteration_handleCommandReplication(getDbFromItemNum(8), c->cmd, c->argc, c->argv); + + // Test that everything comes as expected + expectReadKeySequence(it, 1, 2); // All one bucket - queued after key 0 read + + expectReadReplication(it, c); // Repl from the first SET command + expectReadReplicationDel(it, 8); // This is the expected replication of the DEL from expire + expectReadReplication(it, c); // Repl from the second SET command (recreating deleted key) + + expectReadKeySequence(it, 3, 7); // continue with normal iteration + // KEY 8 SHOULD BE OMITTED - This was already replicated + expectReadKeySequence(it, 9, LAST_ITEM); + + expectReadComplete(it); + freeTestClient(c); +} + + +#ifdef CODE_NOT_READY_YET +///////////////////////////////////////////////////// +// THE REMAINING TESTS ARE GENERAL / UNCATEGORIZED +///////////////////////////////////////////////////// + +// Iteration can be terminated from the main thread or from the child client. +// This tests termination driven from the main thread. +TEST_F(BgIterationTest, earlyTerminationFromMain) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + expectReadKey(it, 0); + + // At this point, keys 1 & 2 are in queue. A termination should release those keys. + bool blocked1 = true; + bool blocked2 = true; + // We expect no general unblocks, we account for each specific unblock below. + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); + // We should expect to see unblock called for items 1 & 2, as they are released from the queue. 
+ EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) + .WillOnce(Assign(&blocked1, false)); + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) + .WillOnce(Assign(&blocked2, false)); + bgIteratorTerminate(it); // queues the items for release + EXPECT_TRUE(bgIteratorIsTerminating(it)); + bgIteration_feedIterators(); // actually performs the release + EXPECT_FALSE(blocked1); + EXPECT_FALSE(blocked2); + + bool blocked0 = true; + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) + .WillOnce(Assign(&blocked0, false)); + bgIteratorItem *item = bgIteratorRead(it); + EXPECT_FALSE(blocked0); + EXPECT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + EXPECT_EQ(cleanupCount, 0); + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Iteration can be terminated from the main thread or from the child client. +// This tests termination driven from the child client (the background thread). +TEST_F(BgIterationTest, earlyTerminationFromChild) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + // At this point, keys 1 & 2 are in queue. A termination should release those keys. 
+ bgIteratorClose(it); // background thread initiates the termination + EXPECT_TRUE(bgIteratorIsTerminating(it)); + + bool blocked0 = true; + bool blocked1 = true; + bool blocked2 = true; + // Expecting no extra unblocks + EXPECT_CALL(mock, unblockClientsInUseOnKey(_)).Times(0); + // We expect item 0 (the in progress item) to be released + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(0)))) + .WillOnce(Assign(&blocked0, false)); + // We expect items 1 & 2 (the queued items) to be released + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(1)))) + .WillOnce(Assign(&blocked1, false)); + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(2)))) + .WillOnce(Assign(&blocked2, false)); + bgIteration_feedIterators(); + EXPECT_FALSE(blocked0); + EXPECT_FALSE(blocked1); + EXPECT_FALSE(blocked2); + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// Edge case. Executing a command (like SUNIONSTORE) which REPLACES the first key and reads the +// second key. In this case, bgIteration will get notified of the key deletion during execution of +// SUNIONSTORE. Given that both keys are in the future (not iterated yet), we'll allow the +// command to execute, unblocked. We won't replicate as we'll pick up the key when we get to it. +TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_keyDeletedDuringSetReplace) { + // Using DB1 so we have lots of buckets + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 8); // 9 is in queue + + // Write command that has 2 keys. 1 existing key that we write to and 1 dependant future key. 
+ client *c = getWrite2KeysClient("sunionstore", 12, 13); + + simulateUnblockedWrite(c); + + // Now the call to keyDelete happens + bgIteration_keyDelete(getDbFromItemNum(12), keyStr(12)); + simpleDelItem(12); // So simulate the actual del + + // Now the write will run, re-creating the item (which is still a future item) + const char * const newValueStr = "new value"; + robj *newValueRobj = createStringObjectFromSds(sdsnew(newValueStr)); + setKey(c, c->db, c->argv[1], &newValueRobj, SETKEY_ADD_OR_UPDATE); + + // Finally, we are letting bgIteration know that the write command was executed + bgIteration_handleCommandReplication(getDbFromItemNum(12), c->cmd, c->argc, c->argv); + + // Since the write command was not replicated, we expect all the keys to be read in the normal + // order from the dictionary. + expectReadKeySequence(it, 9, 11); + expectReadKey(it, 12, newValueStr); + expectReadKeySequence(it, 13, LAST_ITEM); + + expectReadComplete(it); + freeTestClient(c); +} + + +// Edge case. When we have a new key which is created by a command, AND replication is enabled, we +// expect that we will replicate the command rather than serializing the key/value later. As an +// example, consider SUNIONSTORE A B. We want to create A by replicating the command. We don't +// want to have to process A as a key later on. But in this case, we can't run the command until +// B has been sent. We expect the command to be blocked while we send B. +TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_setNewKey_DependantFuture) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 8); // 9 is in queue + + // Write command that has 2 keys. 1 new key and 1 dependant future key. 
+ client *c = getWrite2KeysClient("sunionstore", 12, 13); + + // We are simulating a new key in the dict. This command should block on the dependant key. + // This adds key 13 in the queue since the command depends on it. + simulateBlockedWrite(c); + + // Key 9 was already in the queue + expectReadKey(it, 9); + + // Key 13 is processed out of order since the write depends on it + expectReadKey(it, 13); + + // Reading key 10 will unblock key 13, allowing us to write. + expectReadKey(it, 10); + + // Now that key 13 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Key 11 was queued when we read key 10 + expectReadKey(it, 11); + + // The replication of the write command was enqueued after key 11 + expectReadReplication(it, c); + + // We shouldn't see key 12 - as that was processed via replication. + // We shouldn't see key 13 - as that was expedited earlier + + // Now resuming processing of dict entries + expectReadKeySequence(it, 14, LAST_ITEM); + + expectReadComplete(it); + freeTestClient(c); +} + + +// A new key is being created, but is dependent on another key which has already been processed. +// In this case, the command shouldn't be blocked. +TEST_F(BgIterationTest, writeWith2Keys_Replication_NoConsistent_setNewKey_DependantPast) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // 10 is in queue, done with 8 + + // Write command that has 2 keys. 1 new key and 1 dependant past key. + client *c = getWrite2KeysClient("sunionstore", 12, 8); + + // We are simulating a new key in the dict. + // This command should not block since the dependant key has already been processed. 
+ simulateUnblockedWriteWithModification(c); + + // Key 10 was put in the queue before the write + expectReadKey(it, 10); + + expectReadReplication(it, c); + + expectReadKey(it, 11); + + // Key 12 should be missing - it was processed by replication + + expectReadKeySequence(it, 13, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + + +// A new key is being created, and has dependencies on 2 other keys - one already processed, one not. +// In this case, the command should be blocked so that the future key can be sent first. +TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_setNewKey_1DependantPast1DependantFuture) { + // Using DB1 so we have lots of buckets + simpleDelItem(12); // Deleting key 12 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // 8 has been returned, 9 is active, 10 is in queue + + // Write command that has 1 new key and 2 dependencies (past/future) + client *c = getWrite3KeysClient("sunionstore", 12, 8, 13); + + // The write should be blocked, so that item 13 can be processed. + simulateBlockedWrite(c); + + expectReadKey(it, 10); // 10 was already in queue + expectReadKey(it, 13); // 13 was expedited since the write depends on it + EXPECT_CALL(mock, unblockClientsInUseOnKey(robjEqualsStr(keyStr(13)))).Times(1); + expectReadKey(it, 11); // Releases 13 so the command can execute + + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 14); // was queued when reading 11 (12 is missing, 13 was expedited) + + expectReadReplication(it, c); + + expectReadKey(it, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} + +// Test an edge case with the same (future) key being repeated in the command, like: +// SUNIONSTORE A B B +// In this test, A is a previously handled key, and B is a future key. We expect the future key B to +// be expedited (once). 
+TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_repeatedKey_1DependantPast1RepeatedFuture) { + // Using DB1 so we have lots of buckets + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKeySequence(it, 0, 9); // We're done with 8, and 10 is in queue + + // Write command that has 3 keys. 1 past key and 1 repeated key in the future. + client *c = getWrite3KeysClient("sunionstore", 8, 12, 12); + + // This command should block because 12 needs to be expedited. + simulateBlockedWrite(c); + + expectReadKey(it, 10); // was already in queue + expectReadKey(it, 12); // expedited + expectReadKey(it, 11); // releases 12 (unblocking the command) + + // Now that key 12 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + expectReadKey(it, 13); // queued when we read 11 + + expectReadReplication(it, c); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 14, LAST_ITEM); + expectReadComplete(it); + freeTestClient(c); +} +#endif +#ifdef CODE_NOT_READY_YET + + +TEST_F(BgIterationTest, writeWith3Keys_Replication_NoConsistent_repeatedKey_1newKey1RepeatedFuture) { + // This tests the replication of a write command that creates a new key and depends on 1 other + // key which is repeated in the command. The repeated key is in the future. 
+ // This test is meant to replicate this bug: https://issues.amazon.com/ELMO-46572 + + // Expected sequence of event for this test: + // ITEM: (0)'D0' : 'D0' + // BLCK?: (0)'sunionstore' 'E0' 'C0' 'C0' + // EARLY: (0)'C0' : 'C0' + // (blocked) + // ITEM: (0)'B0' : 'B0' + // ITEM: (0)'A0' : 'A0' + // BLCK?: (0)'sunionstore' 'E0' 'C0' 'C0' + // REPL?: (0)'sunionstore' 'E0' 'C0' 'C0' + // (queued) + // SKIPPING ITEM(early iterate): (0)'C0' : 'C0' + // ITEM: (1)'E1' : 'E1' + // ITEM: (1)'C1' : 'C1' + // ITEM: (1)'B1' : 'B1' + // ITEM: (1)'A1' : 'A1' + // ITEM: (1)'D1' : 'D1' + // SENDING COMPLETE + // CLEANUP FN (success) + + simpleDelItem(1); // Deleting key 1 to then create it with a write command + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Start with this to load 0 into the queue - but don't read 0 as that would load 1,2,3 into the queue! + bgIteration_feedIterators(); + + // Write command that has 3 keys. 1 new key and 1 repeated key in the future. + client *c = getWrite3KeysClient(1, 4, 4); + + // This command should block on key 4. + // This adds key 4 in the queue because: + // - the command depends on key 4 which hasn't been processed yet + // - the command depends on a new key (key 1). + simulateBlockedWrite(c); + + // Key 0 was already enqueued. + expectReadKey(it, 0); + + // Key 4 is processed out of order since the write depends on it + expectReadKey(it, 4); + + // Keys 2,3 are next in the queue (they are all in the same bucket). + // Only reading key 2 for now to release key 4 from the iterator. + expectReadKey(it, 2); + + // Now that key 4 was processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Key 3 is next in the queue (it was put in the queue at the same time as key 2). + expectReadKey(it, 3); + + // The replication of the write command was enqueued after keys 1,2,3. 
+ expectReadReplication(it, c); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 5, 9); + + expectReadComplete(it); + freeTestClient(c); +} + + +TEST_F(BgIterationTest, writeWith3Keys_NoReplication_Consistent_repeatedKey_1DependantPast1RepeatedFuture) { + // This tests the replication of a write command that updates multiple keys and depends on a key + // which is repeated in the command. The repeated key is in the future and the other key is in + // the past. + + // Expected sequence of event for this test: + // ITEM: (0)'D0' : 'D0' + // BLCK?: (0)'blpop' 'D0' 'C0' 'C0' 'D0' + // EARLY_1: (0)'C0' : 'C0' + // (blocked) + // ITEM: (0)'E0' : 'E0' + // ITEM: (0)'B0' : 'B0' + // ITEM: (0)'A0' : 'A0' + // BLCK?: (0)'blpop' 'D0' 'C0' 'C0' 'D0' + // REPL?: (0)'blpop' 'D0' 'C0' 'C0' 'D0' + // SKIPPING ITEM(early iterate): (0)'C0' : 'C0' + // ITEM: (1)'E1' : 'E1' + // ITEM: (1)'C1' : 'C1' + // ITEM: (1)'B1' : 'B1' + // ITEM: (1)'A1' : 'A1' + // ITEM: (1)'D1' : 'D1' + // SENDING COMPLETE + // CLEANUP FN (success) + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Start with this to load 0 into the queue - but don't read 0 as that would load 1,2,3 into the queue! + bgIteration_feedIterators(); + + // Write command that has 3 keys. 1 past key and 1 repeated key in the future. + // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a + // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). + client *c = getWriteMultiKeysClient(0, {4, 4, 0}, "blpop"); + + // This command should block on 2 keys (0 and 4), since: + // - key 0 is in use by the iterator (still in the queue since it has not been processed by the consumer yet) + // - key 4 is in the future + // This adds key 4 in the queue since the command depends on it and it hasn't been processed yet. 
+ simulateBlockedWrite(c, 2); + + // Key 4 is processed out of order since the write depends on it. + // Key 4 is processed before key 0 even though key 0 was already in the queue + // because key 4 was enqueued as a priority item. + expectReadKey(it, 4); + + // Key 0 was already enqueued. + // Reading key 0 releases key 4 from the iterator. + expectReadKey(it, 0); + + // Keys 1,2,3 are next in the queue (they are all in the same bucket). + // Only reading key 1 for now to release key 0 from the iterator. + expectReadKey(it, 1); + + // Now that keys 4 and 0 were processed and released by the iterator, the write command can be executed. + simulateUnblockedWriteWithModification(c); + + // Keys 2, 3 are next in the queue (it was put in the queue at the same time as key 1). + expectReadKeySequence(it, 2, 3); + + // Now resuming processing of dict entries. + expectReadKeySequence(it, 5, 9); + + expectReadComplete(it); + freeTestClient(c); +} + + +TEST_F(BgIterationTest, writeWith3Keys_NoReplication_NoConsistent_repeatedKey_1repeatedNewKey) { + // This tests a write command that creates a new key where the new key is repeated in the + // command. The repeated key is in the future. 
+ + // Expected sequence of event for this test: + // ITEM: (0)'D0' : 'D0' + // ITEM: (0)'A0' : 'A0' + // ITEM: (0)'B0' : 'B0' + // ITEM: (0)'E0' : 'E0' + // BLCK?: (0)'blpop' 'C0' 'D0' 'C0' 'D0' + // REPL?: (0)'blpop' 'C0' 'D0' 'C0' 'D0' + // ITEM: (0)'C0' : 'D0' + // ITEM: (1)'B1' : 'B1' + // ITEM: (1)'C1' : 'C1' + // ITEM: (1)'D1' : 'D1' + // ITEM: (1)'A1' : 'A1' + // ITEM: (1)'E1' : 'E1' + // SENDING COMPLETE + // CLEANUP FN (success) + + server.db[0]->keys->dtype->resizeAllowed = NULL; + kvstoreExpand(server.db[0]->keys, 32, 0, NULL); + hashtableRehash(server.db[0]->keys->hashtables[0], 32); + + // The table looks this way now: + // Table 0, used 5, exp 3, top-level buckets 8, child buckets 0 + // Bucket 0:0 level:0 + // 0 (empty) + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:1 level:0 + // 0 h2 63, key "D0" + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:2 level:0 + // 0 (empty) + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:3 level:0 + // 0 h2 b8, key "A0" + // 1 h2 f5, key "B0" + // 2 h2 13, key "E0" + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:4 level:0 + // 0 (empty) + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:5 level:0 + // 0 (empty) + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:6 level:0 + // 0 h2 91, key "C0" + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:7 level:0 + // 0 (empty) + // 1 (empty) + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + + const char *new_keys[5] = {"D0", "A0", "B0", "E0", "C0"}; + update_keys(new_keys, 0, 5); + + simpleDelItem(4); // Deleting key 4 to then create it with a write command + bgIterator *it = 
bgIteratorCreateFullScanIter("iter", + 0, NULL, iteratorCleanupFn, PRIVDATA); + + // Getting started + // The first bucket is empty + bgIteration_feedIterators(); + expectReadKey(it, 0); + + // Key 1 is the next in the queue. + // Reading key 1 to release key 0 from the iterator. + expectReadKey(it, 1); + + // Write command that has 3 keys. 1 new repeated key and 1 key in the past. + // How BLPOP works exactly is not relevant to bgIterator, we just chose BLPOP because it's a + // multi-key command that (potentially) modifies all of its keys (ie is not CMD_WRITE_FIRSTKEY_ONLY). + client *c = getWriteMultiKeysClient(4, {0, 4, 0}, "blpop"); + + // The write command is not blocked since key 0 is not in use by the iterator + simulateUnblockedWriteWithModification(c); + + // Keys 2, 3 are next in the queue (it was put in the queue at the same time as key 1). + expectReadKeySequence(it, 2, 3); + + // Key 4 is now in the dict with the value of key 0. + expectReadKey(it, 4, keyStr(0)); + + // Processing the rest of the dict entries. + expectReadKeySequence(it, 5, 9); + + expectReadComplete(it); + freeTestClient(c); +} + +TEST_F(BgIterationTest, copyHandlesProperDb_Replication_NoConsistent) { + // In this test, the COPY command is copying from one DB to another. We will create the + // same key in both DBs. We make sure that the proper key is created via replication, and + // the proper key is created by iteration. + + // NOTE: Adding E0 to dict 1. Now there is a E0 in both dict 0 and dict 1. + addKeyToDb(1, "E0", "E0"); + + // The test: + // We will simulate (with DB0 selected): COPY D0 C0 DB 1 REPLACE + // This will overwrite DB1:C0 that was created above. + // Since DB0:D0 is the first iterated key we expect that DB1:C0 will be expedited. + // After DB1:C0 is "overwritten", it should be marked early iterate. + // We expect DB0:C0 to NOT be marked early iterate, and should get processed normally. 
+ + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Start with this to load 0 (C0) into the queue - but don't read 0 as that would load 1,2,3 into the queue! + bgIteration_feedIterators(); + + // COPY C0 E0 DB 1 REPLACE + client *c = static_cast(zcalloc(sizeof(client))); + c->cmd = lookupCommandByCString("copy"); + c->db = server.db[0]; + c->argc = 6; + c->argv = static_cast(zcalloc(sizeof(robj*) * c->argc)); + c->argv[0] = createStringObjectFromSds(sdsnew(c->cmd->fullname)); + c->argv[1] = createStringObjectFromSds(sdsnew("C0")); + c->argv[2] = createStringObjectFromSds(sdsnew("E0")); + c->argv[3] = createStringObjectFromSds(sdsnew("DB")); + c->argv[4] = createStringObjectFromSds(sdsnew("1")); + c->argv[5] = createStringObjectFromSds(sdsnew("REPLACE")); + + // This should block on 2 keys. DB0:C0 is in queue. DB1:E0 needs to be expedited. + simulateBlockedWrite(c, 2); + expectReadKey(it, 0); // DB0:C0 + expectReadDbKeyValue(it, 1, "E0", "E0"); // DB1:E0 is expedited + expectReadKey(it, 1); // (to release DB1:E0) + // Now keys 2 & 3 & 4 are in the queue + + simulateUnblockedWrite(c); // We shouldn't be blocked this time + + // Now, we'll simulate the actual activity of the COPY. DB1:C0 will be deleted in order to + // be overwritten. + bgIteration_keyDelete(1, sdsnew("E0")); + // At this point the key would actually be deleted and recreated by COPY (no need to actually do this) + + // And finally the replication (this should queue replication) + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + + // Now let's read everything... + expectReadKeySequence(it, 2, 4); // These were in queue already + expectReadReplication(it, c); // This is the new replication (creating DB1:C0) + + expectReadKeySequence(it, 5, 9); // These are all normal + + expectReadComplete(it); // At this point, we should be done. We should NOT see DB1:C0. 
+ freeTestClient(c); +} + + +// Just check that termination with replication in queue works OK. +TEST_F(BgIterationTest, terminateWithReplication) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); // makes sure we are done with key 0 (don't want to block) + + client *c = getWriteClient(0, "xxx"); + simulateUnblockedWriteWithModification(c); // Should replicate + freeTestClient(c); + + bgIteratorTerminate(it); + + bgIteratorItem *item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + + +// SWAPDB tests - Get ready for the mind-bend... + +TEST_F(BgIterationTest, swapDB_NoReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + 0, NULL, iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + // In the non-consistent iterator (without replication), items are identified with the DBID at + // the time they are placed into the queue. The SWAPDB event signals the change to the + // iterating process - and this is properly sequenced with the DB info for each item. + + expectReadKey(it, 0); + + // Keys 1,2,3, and 4 are in queue + simulateSwapDB(0, 1); // The swap event will be queued after item 3 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 0u); + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + expectReadKey(it, 4); + + expectReadSwapDB(it, 0, 1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 0u); // still processing it... 
+ + // Since we've seen the swap event, items now have the new DBID + expectReadDbKeyValue(it, 0, keyStr(5), keyStr(5)); // item 5 is in DB0 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 1u); + EXPECT_EQ(status.swapdb_processed, 1u); // done processing the swapdb + + // Keys 6 & 7 are in the queue - let's swap back! + simulateSwapDB(1, 0); // The swap event will be queued after item 7 + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); // 2nd one queued + EXPECT_EQ(status.swapdb_processed, 1u); + + expectReadDbKeyValue(it, 0, keyStr(6), keyStr(6)); // Still appears as DB0 + expectReadDbKeyValue(it, 0, keyStr(7), keyStr(7)); // Still appears as DB0 + + expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); + EXPECT_EQ(status.swapdb_processed, 1u); // still processing it... + + expectReadKey(it, 8); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.swapdb_queued, 2u); + EXPECT_EQ(status.swapdb_processed, 2u); // done processing all swaps + + expectReadKey(it, 9); + expectReadComplete(it); +} + +TEST_F(BgIterationTest, swapDB_NoReplication_YesConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // In the consistent iterator (without replication) all items are presented to the iterating + // process using the DBID at the time of the iterator creation. No changes are evident. + + expectReadKey(it, 0); + + // Keys 1,2,3,4 are in queue + simulateSwapDB(0, 1); // The swap occurs, but the iterator sees no change + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + expectReadKey(it, 4); + + // Heck, let's go crazy with those swaps... 
+ for (int itemNum = 5; itemNum <= 9; itemNum++) { + simulateSwapDB(0, 1); + expectReadKey(it, itemNum); + } + + expectReadComplete(it); +} + +TEST_F(BgIterationTest, swapDB_YesReplication_NoConsistent) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // In the non-consistent iterator WITH replication, items are identified with the DBID at the + // time they are placed into the queue. The SWAPDB event signals the change to the iterating + // process - and this is properly sequenced with the DB info for each item. + + expectReadKey(it, 0); + + // Keys 1,2,3,4 are in queue + simulateSwapDB(0, 1); // The swap event will be queued after item 3 + + expectReadKey(it, 1); + expectReadKey(it, 2); + expectReadKey(it, 3); + expectReadKey(it, 4); + + expectReadSwapDB(it, 0, 1); // We should see a SWAPDB event + bgIteratorItem *item = bgIteratorRead(it); // followed by the associated replication + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + bgIteration_feedIterators(); + + // Since we've seen the swap event, items now have the new DBID + expectReadDbKeyValue(it, 0, keyStr(5), keyStr(5)); // item 5 is in DB0 + + // Keys 6 & 7 are in the queue - let's swap back! + simulateSwapDB(1, 0); // The swap event will be queued after item 7 + + expectReadDbKeyValue(it, 0, keyStr(6), keyStr(6)); // Still appears as DB0 + expectReadDbKeyValue(it, 0, keyStr(7), keyStr(7)); // Still appears as DB0 + + expectReadSwapDB(it, 1, 0); // Now the iterator knows about the 2nd swap + item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_REPLICATION); + bgIteration_feedIterators(); + + expectReadKey(it, 8); + expectReadKey(it, 9); + expectReadComplete(it); +} + +// There is no test for swapDB_YesReplication_YesConsistent because this configuration is not +// permitted with multiple DBs (not permitted with swaps). 
+ + +// FLUSHDB & FLUSHALL Tests +TEST_F(BgIterationTest, flushDB_flushAll) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", 0, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + expectReadKey(it, 1); + + // key 1 is active in the iterator - this key will be removed from the DB before flush. + // keys 2 & 3 & 4 are in queue - but will be returned to Valkey before the flush. These are yanked + // back by Valkey and will not be seen by iterator. + simulateFlushDB(-1, 1); + + bgIteratorItem *item = bgIteratorRead(it); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + + bgIteratorClose(it); // background thread completes the termination + + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 1); + EXPECT_TRUE(cleanupTerminated); +} + +TEST_F(BgIterationTest, flushDB_flushOne) { + bgIterator *it1 = bgIteratorCreateFullScanIter("iter1", + 0, NULL, iteratorCleanupFn, PRIVDATA); + bgIterator *it2 = bgIteratorCreateFullScanIter("iter2", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + // The test flushes DB0. This is half the data. Since <= half, a non-consistent iterator is + // allowed to proceed. But the consistent iterator will be terminated. + + expectReadKey(it1, 0); + expectReadKey(it2, 0); + expectReadKey(it1, 1); + expectReadKey(it2, 1); + + // key 1 is active in the iterator - this key will be removed from the DB before flush. + // keys 2 & 3 & 4 are in queue - but will be returned to Valkey before the flush. These are yanked + // back by Valkey and will not be seen by iterator. + simulateFlushDB(0, 1); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 0u); + + // Testing the non-consistent one continues... + // Everything already on the iterator queue should be preserved (deleted from the DB). + // Keys 2 & 3 & 4 are already queued (and preserved). 
+ expectReadKey(it1, 2); + expectReadKey(it1, 3); + expectReadKey(it1, 4); + + bgIteratorItem *item = bgIteratorRead(it1); + ASSERT_EQ(item->type, BGITERATOR_ITEM_FLUSHDB); + ASSERT_EQ(item->dbid, 0); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 0u); // still processing it + + expectReadKey(it1, 5); + bgIteratorGetStatus(it1, &status); + EXPECT_EQ(status.flushdb_queued, 1u); + EXPECT_EQ(status.flushdb_processed, 1u); // done with all flushdb's + expectReadKey(it1, 6); + expectReadKey(it1, 7); + expectReadKey(it1, 8); + expectReadKey(it1, 9); + expectReadComplete(it1); + EXPECT_EQ(cleanupCount, 1); + EXPECT_FALSE(cleanupTerminated); + + // But the consistent iterator should be terminated + item = bgIteratorRead(it2); + ASSERT_EQ(item->type, BGITERATOR_ITEM_TERMINATED); + bgIteratorClose(it2); // background thread completes the termination + bgIteration_feedIterators(); // main thread, cleans up iterator and calls cleanup function + EXPECT_EQ(cleanupCount, 2); + EXPECT_TRUE(cleanupTerminated); +} + +// Cluster mode, 2 iterators, CONSISTENT+REPLICATION and NONCONSISTENT+REPLICATION +// Modify a missing key. +TEST_F(BgIterationTestCluster, modMissingKey_2iter_cluster) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + // For this test, we only have 5 keys since not using DB[1]. Remove the last one. 
+ simpleDelItem(4); + + bgIterator *it1 = bgIteratorCreateFullScanIter("iter1", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + bgIterator *it2 = bgIteratorCreateFullScanIter("iter2", + BGITERATOR_FLAG_REPLICATION, + NULL, iteratorCleanupFn, PRIVDATA); + + client *c = getWriteClient(4, "xxx"); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked since key doesn't exist + + bgIteration_feedIterators(); // Prime the feed - key 0 and 1 are now enqueued + + // Process the consistent iteration + expectReadReplication(it1, c); // replication happened before feeding (should be 1st) + expectReadKeySequence(it1, 0, 3); + expectReadComplete(it1); + + // Process the non-consistent iteration + expectReadReplication(it2, c); // replication happened before feeding (should be 1st) + expectReadKeySequence(it2, 0, 3); + expectReadComplete(it2); + + freeTestClient(c); +} + +TEST_F(BgIterationTest, twoKeys_firstFuture) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, + NULL, iteratorCleanupFn, PRIVDATA); + + bgIteration_feedIterators(); // Prime the feed - key 0 + expectReadKey(it, 0); // Causes keys 1, 2, 3, 4 to be queued (same bucket) + expectReadKey(it, 1); // Causes key 0 to be released + + // This must replicate, because A0 is in the past. B1 (future) wouldn't need replication except + // for the modification to B1. We try to trip up bgIterator by giving a key that doesn't need + // replication except for the later command that does. Make this a little trickier by adding + // the set for A1 - unnecessary, but more clearly shows the expediting in progress. 
+ client *c = getMultiClient("SET A0 xxx; SELECT 1; SET A1 xxx; SET B1 xxx", 1); + + // The EXEC should block on 2 keys, because B1(5) & A1(8) should be expedited + simulateBlockedWrite(c, 2); + + expectReadKeySequence(it, 2, 4); // These were already in queue + + // Note - it would be OK if these 2 were reversed, but this is how the current algorithm works. + expectReadKey(it, 8); // Key 8 (A1) was expedited + expectReadKey(it, 5); // Key 5 (B1) was expedited + + // and clean up the rest... + expectReadKeySequence(it, 6, 7); + // Key 8 was already read above (expedited) + expectReadKey(it, 9); + expectReadComplete(it); +} + +TEST_F(BgIterationTest, multiBlocksOnFutureKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1,2,3,4 are queued (they are all in the same bucket). + // If we fake a modification to key 5, we won't know if it's handled out of order. + // So we fake a modification to key 6 + // Dummy up a MULTI... + client *c = getMultiClient("SET C1 xxx", 1); + + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c); + freeTestClient(c); + + // C1 (key 6) will be expedited to the front of the list + expectReadKey(it, 6); + + // Now that we've read key 5, key 0 (C0) is passed and should not block + client *c2 = getMultiClient("SET C0 xxx"); + simulateUnblockedWrite(c2); + freeTestClient(c2); + + + expectReadKeySequence(it, 1, 5); + expectReadKeySequence(it, 7, 9); + expectReadComplete(it); +} + +TEST_F(BgIterationTest, multiNotReplicatedButDelRecreateAccess) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + // Scenario. We have a multi that doesn't need to be replicated because all of the keys exist + // but are all future keys. 
Note that missing keys are considered already-iterated, so all + // must exist for this test. Then: + // - we delete a key + // - we re-create the deleted (future) key - normally this would be replicated + // - we access another (future) key - we don't expect to get blocked! + + // We use DB 1 only because the hash table buckets are better broken up there. + client *c = getMultiClient("DEL A1; SET A1 xxx; SET E1 yyy", 1); + + // For DB[1]: + // Bucket 0:0 level:0 + // 0 h2 18, key "B1" + // 1 h2 fd, key "C1" + // 2 h2 e9, key "D1" + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:1 level:0 + // 0 h2 36, key "A1" + // 1 h2 0c, key "E1" + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + + // Read through DB 0 and into DB 1 + expectReadKeySequence(it, 0, 5); // D0, E0, B0, A0, C0, B1 + // Now, C1 and D1 are in the queue (in use) and A1 & E1 are future + + // Now let's process the multi. Since A1 & E1 are both future (existing) items, we shouldn't + // block or replicate. + simulateUnblockedWrite(c); // the EXEC + + // Simulate the DEL A1 + server.in_exec = 1; // Simulate actual execution of the MULTI/EXEC + advanceMultiClientToCommand(c, 0); // DEL A1 + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + simpleDelItem(8); + sds delKey = sdsnew(keyStr(8)); + bgIteration_keyDelete(1, delKey); + sdsfree(delKey); + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); // shouldn't replicate + + // Simulate SET A1 - the key doesn't exist, and would normally replicate and mark early iterate, + // but this is in a transaction, and we are not replicating this transaction.
+ advanceMultiClientToCommand(c, 1); // SET A1 xxx + simulateUnblockedWriteWithModification(c); + + // Now write to another existing future key - this should work if we weren't confused by the DEL + advanceMultiClientToCommand(c, 2); // SET E1 yyy + simulateUnblockedWriteWithModification(c); + server.in_exec = 0; + + // Now we can continue iterating, and we should pick up keys 6-9. (and no replication!) + expectReadKeySequence(it, 6, 7); + expectReadKey(it, 8, "xxx"); + expectReadKey(it, 9, "yyy"); + expectReadComplete(it); +} + +TEST_F(BgIterationTest, multiHandlesSelectProperly) { + // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it + // in DB0, but it will be unprocessed in DB1. See if we track select properly. + addKeyToDb(1, "C0", "C0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - C0 in DB 0. + expectReadKey(it, 0); + + // Now, we are done with C0 in DB0, but not in DB1 + expectReadKey(it, 1); + + // These cases should NOT block... (they access C0 in DB0) + client *c; + c = getMultiClient("SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SELECT 0; SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block... 
(they access C0 in DB1) + c = getMultiClient("SET C0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SET C0 xxx"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SET C0 xxx; SELECT 0"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SELECT 1; SET C0 xxx; SELECT 1"); + simulateBlockedWrite(c); + freeTestClient(c); + + expectAnythingCleanup(it); +} + + +TEST_F(BgIterationTest, multiHandlesSelectNoPermissionProperly) { + // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it + // in DB0, but it will be unprocessed in DB1. See if we track select properly. + addKeyToDb(1, "C0", "C0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - C0 in DB 0. + expectReadKey(it, 0); + + // Now, we are done with C0 in DB0, but not in DB1 + expectReadKey(it, 1); + + // No permission for any commands (specifically select/swapdb) + EXPECT_CALL(mock, amzCanClientExecuteCommand(_,_,_,_)) + .Times(AtLeast(1)).WillRepeatedly(Return(false)); + + // These cases should NOT block... (they access C0 in DB0) + // The SELECTs below are inconsequential - with/without select, same result. + client *c; + c = getMultiClient("SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 0; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SELECT 1; SELECT 0; SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block IF SELECT IS WORKING...
(they access C0 in DB1) + c = getMultiClient("SET C0 xxx"); + c->db = server.db[1]; // already starting on DB1 + simulateBlockedWrite(c); // will block, no select + freeTestClient(c); + c = getMultiClient("SELECT 1; SET C0 xxx"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + c = getMultiClient("SELECT 1; SET C0 xxx; SELECT 0"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + c = getMultiClient("SELECT 0; SELECT 1; SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (select fails) + freeTestClient(c); + + expectAnythingCleanup(it); +} + + +TEST_F(BgIterationTest, multiHandlesSwapdbProperly) { + // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it + // in DB0, but it will be unprocessed in DB1. See if we track select properly. + addKeyToDb(1, "C0", "C0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - C0 in DB 0. + expectReadKey(it, 0); + + // Now, we are done with C0 in DB0, but not in DB1 + expectReadKey(it, 1); + + // These cases should NOT block... (they access C0 in DB0) + client *c; + c = getMultiClient("SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 0 1; SELECT 1; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block... 
(they access C0 in DB1) + c = getMultiClient("SET C0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SET C0 xxx; SWAPDB 0 1"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SELECT 0; SET C0 xxx; SWAPDB 0 1"); + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET C0 xxx; SELECT 1"); + simulateBlockedWrite(c); + freeTestClient(c); + + expectAnythingCleanup(it); +} + + +TEST_F(BgIterationTest, multiHandlesSwapdbNoPermissionProperly) { + // For this test, C0 is added into DB1 - so it exists in both DB 0 and 1. We will process it + // in DB0, but it will be unprocessed in DB1. See if we track select properly. + addKeyToDb(1, "C0", "C0"); + + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - C0 in DB 0. + expectReadKey(it, 0); + + // Now, we are done with C0 in DB0, but not in DB1 + expectReadKey(it, 1); + + // No permission for any commands (specifically select/swapdb) + EXPECT_CALL(mock, amzCanClientExecuteCommand(_,_,_,_)) + .Times(AtLeast(1)).WillRepeatedly(Return(false)); + + // These cases should NOT block... (they access C0 in DB0) + // The SELECTs & SWAPDBs below are inconsequential - with/without select/swapdb, same result. + client *c; + c = getMultiClient("SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SET C0 xxx; SWAPDB 0 1; SWAPDB 0 1; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 0 1; SELECT 1; SET C0 xxx"); + simulateUnblockedWrite(c); + freeTestClient(c); + + // These cases SHOULD block IF SELECT/SWAPDB IS WORKING... 
(they access C0 in DB1) + c = getMultiClient("SET C0 xxx"); + c->db = server.db[1]; + simulateBlockedWrite(c); + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SET C0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb fails) + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SELECT 0; SET C0 xxx; SWAPDB 0 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) + freeTestClient(c); + c = getMultiClient("SWAPDB 1 0; SWAPDB 1 0; SELECT 1; SET C0 xxx; SELECT 1"); + simulateUnblockedWrite(c); // will not block because accessing DB0 (swapdb/select fails) + freeTestClient(c); + + expectAnythingCleanup(it); +} + +void * pthreadWait200msAndReadTwoKeys(void *arg) { + bgIterator *it = static_cast<bgIterator *>(arg); + + usleep(200000); + bgIteratorRead(it); + bgIteratorRead(it); + return nullptr; +} + +void asyncWait200msAndReadTwoKeys(bgIterator *it) { + int rc; + pthread_attr_t attr; + pthread_t thread; + + rc = pthread_attr_init(&attr); + assert(rc == 0); + rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + assert(rc == 0); + + rc = pthread_create(&thread, &attr, pthreadWait200msAndReadTwoKeys, it); + assert(rc == 0); + + rc = pthread_attr_destroy(&attr); + assert(rc == 0); +} + + +TEST_F(BgIterationTest, testLuaWithUndeclaredKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_CONSISTENT, NULL, iteratorCleanupFn, PRIVDATA); + + // Read the 1st key - let's get the party started + expectReadKey(it, 0); + + // At this point, key 0 is read. Keys 1,2,3 are queued (they are all in the same bucket). + // If we fake a modification to key 4, we won't know if it's handled out of order. + // So we fake a modification to key 5 + client *c = getWriteClient(5, "xxx"); + c->flag.script = 1; + + // Now for a LUA script, we have already blocked (on the eval/evalsha) for any declared keys + // But here, we're about to modify an undeclared key. 
We can't actually block in the middle + // of the LUA script. So this will behave as unblocked, but incur a synchronous wait. + + // Key 5 will get expedited when we simulate the write. After reading key 5, key 1 will need + // to be read to return key 5 to Valkey, unblocking the synchronous wait. + asyncWait200msAndReadTwoKeys(it); + + monotime blockTimer; + elapsedStart(&blockTimer); + simulateUnblockedWrite(c); + // Must have delayed at least 150ms (some time may have passed before timer start) + EXPECT_GT(elapsedMs(blockTimer), 150u); + + // Continue... + expectReadKeySequence(it, 2, 4); + // 5 has already been processed + expectReadKeySequence(it, 6, 9); + expectReadComplete(it); + freeTestClient(c); +} + + +TEST_F(BgIterationTest, replicationReceivedWhileProcessingLastKey) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + client *c = getWriteClient(0, "xxx"); + + expectReadKeySequence(it, 0, 9); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + expectReadReplication(it, c); // Replication happened while processing key 9, should be here. + + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing key 9 + expectReadComplete(it); // We expect to see the completion instead + + freeTestClient(c); +} + + +TEST_F(BgIterationTest, repldoneFunctionCalled) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); + + client *c = getWriteClient(0, "xxx"); + + expectReadKeySequence(it, 0, 9); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + expectReadReplication(it, c); // Replication happened while processing key 9, should be here. 
+ EXPECT_EQ(repldoneCount, 1); // Last key released, now done feeding replication + + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing key 9 + expectReadComplete(it); // We expect to see the completion instead + + freeTestClient(c); +} + + +TEST_F(BgIterationTest, repldoneFunctionCalledTwice) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFnNotBeingReadyInitially, iteratorCleanupFn, PRIVDATA); + + client *c = getWriteClient(0, "xxx"); + + expectReadKeySequence(it, 0, 9); + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + expectReadReplication(it, c); // Replication happened while processing key 9, should be here. + EXPECT_EQ(repldoneCount, 0); // Last key released, but repldone is not counted yet (the callback returns false in its first call) + EXPECT_EQ(isReplDoneReady, 1); + bgIteration_feedIterators(); // Need to call it as RepldoneFnNotBeingReadyInitially returns false in first call + EXPECT_EQ(repldoneCount, 1); + + simulateUnblockedWriteWithModification(c); // This won't replicate because we are done processing key 9 + expectReadComplete(it); // We expect to see the completion instead + + freeTestClient(c); +} + + +TEST_F(BgIterationTest, queuingitemFunctionCalled) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + 0, NULL, iteratorCleanupFn, iteratorBeforeAndAfterQueuingItemFn, PRIVDATA); + EXPECT_EQ(beforeQueuingItemCount, 0); + EXPECT_EQ(afterQueuingItemCount, 0); + expectReadKeySequence(it, 0, 9); + expectReadComplete(it); + // Callback is invoked when item is fed to and returned from an iterator + EXPECT_EQ(beforeQueuingItemCount, 10); + EXPECT_EQ(afterQueuingItemCount, 10); +} + +TEST_F(BgIterationTest, checkReplicationByteCount) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, iteratorRepldoneFn, iteratorCleanupFn, PRIVDATA); + + client *c = getWriteClient(0, "xxx"); + int expectedReplicationSize = 
sizeof(bgIteratorItem); + for (int i = 0; i < c->argc; i++) { + expectedReplicationSize += objectComputeSize(NULL, c->argv[i], 0, 0); + } + + expectReadKey(it, 0); + expectReadKey(it, 1); // Releases and unblocks 0 + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); + + simulateUnblockedWriteWithModification(c); // Wouldn't be blocked because done with key 0 + EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); + simulateUnblockedWriteWithModification(c); // and write again (2nd replication) + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); + + expectReadKeySequence(it, 2, 4); // Keys 1..4 all in same bucket + + expectReadReplication(it, c); + // After reading the 1st replication, it hasn't been returned yet (it's the active item) + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 2 * expectedReplicationSize); + expectReadReplication(it, c); + // After reading the 2nd replication, the 1st has been returned + EXPECT_EQ(bgIteration_memoryInuseForReplication(), expectedReplicationSize); + + expectReadKey(it, 5); + // Now all replication has been returned/freed + EXPECT_EQ(bgIteration_memoryInuseForReplication(), 0u); + + expectReadKeySequence(it, 6, 9); + expectReadComplete(it); + + freeTestClient(c); +} + +// Test that for an arbitrary write command having no keys, replication should occur. 
+TEST_F(BgIterationTest, checkNoKeysWriteIsReplicated) { + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION, NULL, iteratorCleanupFn, PRIVDATA); + + expectReadKey(it, 0); + + client *c = getNoKeysWriteClient(); + EXPECT_CALL(mock, blockClientInUseOnKeys(c,_,_)).Times(0); + bool blocked = bgIteration_blockClientIfRequired(c); + EXPECT_FALSE(blocked); + bgIteration_handleCommandReplication(c->db->id, c->cmd, c->argc, c->argv); + + expectReadKeySequence(it, 1, 4); // These were already in queue + + expectReadReplication(it, c); + + expectReadKeySequence(it, 5, 9); + expectReadComplete(it); + freeTestClient(c); +} +TEST_F(BgIterationTestClusterSlots, testAmzKeyIsLogicallyDeletedInOrderedIteration3Slots) { + bgIterator *it = bgIteratorCreateSlotsIter("simple", + 0, slots_to_iterate, slots_to_iterate_size, NULL, iteratorCleanupFn, PRIVDATA); + EXPECT_CALL(mock, amzKeyIsLogicallyDeleted(_)).WillOnce(Return(true)).WillRepeatedly(Return(false)); + expectReadKeySequence(it, 1, n_keys_to_read - 1); + + // Quick status check. At this point, the last item hasn't been returned yet. + bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentries_queued, n_keys_to_read - 1); // The first item should be skipped from the queue + EXPECT_EQ(status.dbentries_processed, n_keys_to_read - 2); + + expectReadComplete(it); + EXPECT_FALSE(cleanupTerminated); +} + +TEST_F(BgIterationTest, testAmzKeyIsLogicallyDeletedInOrderedFullScanIteration) { + bgIterator *it = bgIteratorCreateFullScanIter("simple", + 0, NULL, iteratorCleanupFn, PRIVDATA); + EXPECT_CALL(mock, amzKeyIsLogicallyDeleted(_)).WillOnce(Return(true)).WillRepeatedly(Return(false)); + expectReadKeySequence(it, 1, 9); + + // Quick status check. At this point, item #9 hasn't been returned yet. 
+ bgIteratorStatus status; + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.dbentries_queued, 9u); // The first item should be skipped from the queue + EXPECT_EQ(status.dbentries_processed, 8u); + + expectReadComplete(it); + EXPECT_FALSE(cleanupTerminated); +} +#endif + +#ifdef CODE_NOT_READY_YET +class BgIterationTestCluster : public BgIterationTest { + private: + // This is the expected order of the keys when hashed into a single dict at slot 0 having size 8. + // The "{06S}" prefix ensures use of only slot 0. + const char *keys[1][5] = {{"{06S}C0", "{06S}D0", "{06S}A0", "{06S}B0", "{06S}E0"}}; + + protected: + // Furthermore, the bucketization will look like this: + // db 0 slot 0 + // Table 0, used 5, exp 1, top-level buckets 2, child buckets 0 + // Bucket 0:0 level:0 + // 0 h2 1a, key "{06S}C0" + // 1 h2 7b, key "{06S}D0" + // 2 (empty) + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + // Bucket 0:1 level:0 + // 0 h2 5c, key "{06S}A0" + // 1 h2 bf, key "{06S}B0" + // 2 h2 57, key "{06S}E0" + // 3 (empty) + // 4 (empty) + // 5 (empty) + // 6 (empty) + + virtual const char * getKeyAtDbSeq(int db, int seq) override { + assert(db == 0); + return keys[db][seq]; + } + + + virtual void setupDatabase() override { + // For these unit tests, a standard database is constructed. The order of items in the + // hash table is important, and this is validated here. If the hash table + // implementation changes, we will find out quickly at this point. All other tests + // will become invalid! + + // Note that the cluster_enabled tests are designed for the purpose of testing + // CONSISTENT iteration WITH REPLICATION. This type of iteration is not supported + // in non-cluster-mode. At the time of writing, there is no-known use-case for this + // combination. But it is tested for completeness and to ensure future availability. + + // Note also that the cluster_enabled tests are not designed to address issues specific + // to per-slot-dictionaries. 
The tests are simplified by ensuring that all keys are + // mapped to slot-0. It is assumed that iteration would progress in slot order, and + // failure in this regard will be caught in integration tests (amztests). + + server.dbnum = 1; // cluster-mode means 1 DB + server.cluster_enabled = true; + server.db = static_cast<serverDb **>(zcalloc(sizeof(serverDb *) * server.dbnum)); + + // Yes, it's cluster mode, but we're mapping all keys to slot 0 - so we cheat and create only 1 dict (just like CMD). + initializeServerDb(0, CLUSTER_SLOT_MASK_BITS); + + // Note "06S" is a prefix that maps to slot 0. We're not testing slots here. + + addKeyToDb(0, "{06S}A0", "{06S}A0"); + addKeyToDb(0, "{06S}B0", "{06S}B0"); + addKeyToDb(0, "{06S}C0", "{06S}C0"); + addKeyToDb(0, "{06S}D0", "{06S}D0"); + addKeyToDb(0, "{06S}E0", "{06S}E0"); + + // In case we need to debug... + if (0) debugPrintBucketInfo(); + + // Validate that the iteration order matches the expected order + hashtableIterator *it = hashtableCreateIterator(server.db[0]->keys->hashtables[0], 0); + for (int i = 0; i < 5; i++) { + void *nextEntry; + hashtableNext(it, &nextEntry); + dbEntry *de = static_cast<dbEntry *>(nextEntry); + ASSERT_STREQ(static_cast<const char *>(objectGetKey(de)), getKeyAtDbSeq(0, i)); + } + hashtableReleaseIterator(it); + } +}; +#endif + +#ifdef CODE_NOT_READY_YET +TEST_F(BgIterationTestCluster, dictIsOK) { + // Just run the setup/teardown code to make sure the dict is OK. +} + + +TEST_F(BgIterationTestCluster, modFutureItem_YesReplication_YesConsistent_cluster) { + // Cluster test. REPLICATION + CONSISTENT only supported in cluster mode + bgIterator *it = bgIteratorCreateFullScanIter("iter", + BGITERATOR_FLAG_REPLICATION | BGITERATOR_FLAG_CONSISTENT, + NULL, iteratorCleanupFn, PRIVDATA); + bgIteratorStatus status; + + // For this test, don't read the 1st key - we only have 5 keys since not using DB[1] + bgIteration_feedIterators(); // Prime the feed - key 0 and 1 are now enqueued + + // At this point, key 0, and 1 are queued. 
Fake a modification to keys 2 & 4 - two keys to ensure + // that replication is ordered + client *c1 = getWriteClient(2, "xxx"); + client *c2 = getWriteClient(4, "yyy"); + + // Since this is consistent, we will block the client, disallowing the write. + simulateBlockedWrite(c1); + simulateBlockedWrite(c2); + + // On a consistent iterator, the event is expedited in front of items already in queue! + // Read keys 2&4 out of order. + expectReadKey(it, 2); // reading original/unmodified item + + // This call is expected to unblock the client waiting on #2 + expectReadKeyWithUnblock(it, 4, nullptr, 2); // reading original/unmodified item + simulateUnblockedWriteWithModification(c1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.replication_queued, 1u); + EXPECT_EQ(status.replication_processed, 0u); + + // Now read items 0 and 1 - these were actually already queued before keys 2 & 4 were expedited. + // This call is expected to unblock the client waiting on #4 + expectReadKeyWithUnblock(it, 0, nullptr, 4); + simulateUnblockedWriteWithModification(c2); + expectReadKey(it, 1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.replication_queued, 2u); + EXPECT_EQ(status.replication_processed, 0u); + + // And now the 2 replications are queued + expectReadReplication(it, c1); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.replication_queued, 2u); // 1st replication still being processed + EXPECT_EQ(status.replication_processed, 0u); // (no change in these metrics yet) + + expectReadReplication(it, c2); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.replication_queued, 2u); + EXPECT_EQ(status.replication_processed, 1u); // Done with 1st, processing 2nd + + // Continue... 
+ expectReadKey(it, 3); + bgIteratorGetStatus(it, &status); + EXPECT_EQ(status.replication_queued, 2u); + EXPECT_EQ(status.replication_processed, 2u); // Done processing both repl items + expectReadComplete(it); + freeTestClient(c1); + freeTestClient(c2); +} +#endif + + + +// JHB - need test that hashing is paused when an entry is in use. diff --git a/src/unit/wrappers.h b/src/unit/wrappers.h index 0f4fb388b98..0f80919d6f7 100644 --- a/src/unit/wrappers.h +++ b/src/unit/wrappers.h @@ -61,6 +61,12 @@ extern "C" { long long __wrap_aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds, aeTimeProc *proc, void *clientData, aeEventFinalizerProc *finalizerProc); int __wrap_processPendingCommandAndInputBuffer(client *c); void __wrap_beforeNextClient(client *c); + +void __wrap_blockClientInUseOnKeys(client *c, int nKeys, robj **keys); +void __wrap_unblockClientsInUseOnKey(robj *key); + +int __wrap_ACLCheckAllUserCommandPerm(user *u, struct serverCommand *cmd, robj **argv, int argc, int dbid, int *idxptr); + #undef protected #undef _Bool #undef typename