From 1583d60cd62954338ca90f16f4d13afd96fcbf07 Mon Sep 17 00:00:00 2001
From: Yves LeBras <ylb415+github@gmail.com>
Date: Wed, 12 Feb 2025 16:42:38 -0800
Subject: [PATCH 01/24] Missing --memkeys and --keystats for some options in
 redis-cli help text (#13794)

Help text modified for -i, --count, --pattern.
---
 src/redis-cli.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/src/redis-cli.c b/src/redis-cli.c
index 36f3d24ff9f..3e447d6dcab 100644
--- a/src/redis-cli.c
+++ b/src/redis-cli.c
@@ -3118,7 +3118,7 @@ static void usage(int err) {
 "  -i <interval>      When -r is used, waits <interval> seconds per command.\n"
 "                     It is possible to specify sub-second times like -i 0.1.\n"
 "                     This interval is also used in --scan and --stat per cycle.\n"
-"                     and in --bigkeys, --memkeys, and --hotkeys per 100 cycles.\n"
+"                     and in --bigkeys, --memkeys, --keystats, and --hotkeys per 100 cycles.\n"
 "  -n <db>            Database number.\n"
 "  -2                 Start session in RESP2 protocol mode.\n"
 "  -3                 Start session in RESP3 protocol mode.\n"
@@ -3181,9 +3181,10 @@ version,tls_usage);
 "  --hotkeys          Sample Redis keys looking for hot keys.\n"
 "                     only works when maxmemory-policy is *lfu.\n"
 "  --scan             List all keys using the SCAN command.\n"
-"  --pattern <pat>    Keys pattern when using the --scan, --bigkeys or --hotkeys\n"
-"                     options (default: *).\n"
-"  --count <count>    Count option when using the --scan, --bigkeys or --hotkeys (default: 10).\n"
+"  --pattern <pat>    Keys pattern when using the --scan, --bigkeys, --memkeys,\n"
+"                     --keystats or --hotkeys options (default: *).\n"
+"  --count <count>    Count option when using the --scan, --bigkeys, --memkeys,\n"
+"                     --keystats or --hotkeys (default: 10).\n"
 "  --quoted-pattern <pat> Same as --pattern, but the specified string can be\n"
 "                         quoted, in order to pass an otherwise non binary-safe string.\n"
 "  --intrinsic-latency <sec> Run a test to measure intrinsic system latency.\n"

From 87124a38b65e8f7c867e4dcb1f471355d50f7f74 Mon Sep 17 00:00:00 2001
From: Yuan Wang <yuan.wang@redis.com>
Date: Thu, 13 Feb 2025 10:48:29 +0800
Subject: [PATCH 02/24] Fix wrongly updating fsynced_reploff_pending when
 appendfsync=everysecond (#13793)

```
if (server.aof_fsync == AOF_FSYNC_EVERYSEC &&
    server.aof_last_incr_fsync_offset != server.aof_last_incr_size &&
    server.mstime - server.aof_last_fsync >= 1000 &&
    !(sync_in_progress = aofFsyncInProgress())) {
    goto try_fsync;
```
In https://github.com/redis/redis/pull/12622, when when
appendfsync=everysecond, if redis has written some data to AOF but not
`fsync`, and less than 1 second has passed since the last `fsync `,
redis will won't fsync AOF, but we will update `
fsynced_reploff_pending`, so it cause the `WAITAOF` to return
prematurely.

this bug is introduced in https://github.com/redis/redis/pull/12622,
from 7.4

The bug fix
https://github.com/redis/redis/pull/13793/commits/1bd6688bcae4add51dc829d96776359dfa39b100
is just as follows:
```diff
diff --git a/src/aof.c b/src/aof.c
index 8ccd8d8f8..521b30449 100644
--- a/src/aof.c
+++ b/src/aof.c
@@ -1096,8 +1096,11 @@ void flushAppendOnlyFile(int force) {
              * in which case master_repl_offset will increase but fsynced_reploff_pending won't be updated
              * (because there's no reason, from the AOF POV, to call fsync) and then WAITAOF may wait on
              * the higher offset (which contains data that was only propagated to replicas, and not to AOF) */
-            if (!sync_in_progress && server.aof_fsync != AOF_FSYNC_NO)
+            if (server.aof_last_incr_fsync_offset == server.aof_last_incr_size &&
+                !(sync_in_progress = aofFsyncInProgress()))
+            {
                 atomicSet(server.fsynced_reploff_pending, server.master_repl_offset);
+            }
             return;
```
Additionally, we slightly refactored fsync AOF to make it simpler, as
https://github.com/redis/redis/pull/13793/commits/584f008d1c39b90ae1d4862a313e04e3b426b136
---
 src/aof.c | 51 +++++++++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/src/aof.c b/src/aof.c
index 8ccd8d8f887..89443e4bdba 100644
--- a/src/aof.c
+++ b/src/aof.c
@@ -1071,35 +1071,34 @@ void flushAppendOnlyFile(int force) {
     mstime_t latency;
 
     if (sdslen(server.aof_buf) == 0) {
-        /* Check if we need to do fsync even the aof buffer is empty,
-         * because previously in AOF_FSYNC_EVERYSEC mode, fsync is
-         * called only when aof buffer is not empty, so if users
-         * stop write commands before fsync called in one second,
-         * the data in page cache cannot be flushed in time. */
-        if (server.aof_fsync == AOF_FSYNC_EVERYSEC &&
-            server.aof_last_incr_fsync_offset != server.aof_last_incr_size &&
-            server.mstime - server.aof_last_fsync >= 1000 &&
-            !(sync_in_progress = aofFsyncInProgress())) {
-            goto try_fsync;
-
-        /* Check if we need to do fsync even the aof buffer is empty,
-         * the reason is described in the previous AOF_FSYNC_EVERYSEC block,
-         * and AOF_FSYNC_ALWAYS is also checked here to handle a case where
-         * aof_fsync is changed from everysec to always. */
-        } else if (server.aof_fsync == AOF_FSYNC_ALWAYS &&
-                   server.aof_last_incr_fsync_offset != server.aof_last_incr_size)
-        {
-            goto try_fsync;
-        } else {
+        if (server.aof_last_incr_fsync_offset == server.aof_last_incr_size) {
             /* All data is fsync'd already: Update fsynced_reploff_pending just in case.
-             * This is needed to avoid a WAITAOF hang in case a module used RM_Call with the NO_AOF flag,
-             * in which case master_repl_offset will increase but fsynced_reploff_pending won't be updated
-             * (because there's no reason, from the AOF POV, to call fsync) and then WAITAOF may wait on
-             * the higher offset (which contains data that was only propagated to replicas, and not to AOF) */
-            if (!sync_in_progress && server.aof_fsync != AOF_FSYNC_NO)
+             * This is needed to avoid a WAITAOF hang in case a module used RM_Call
+             * with the NO_AOF flag, in which case master_repl_offset will increase but
+             * fsynced_reploff_pending won't be updated (because there's no reason, from
+             * the AOF POV, to call fsync) and then WAITAOF may wait on the higher offset
+             * (which contains data that was only propagated to replicas, and not to AOF) */
+            if (!aofFsyncInProgress())
                 atomicSet(server.fsynced_reploff_pending, server.master_repl_offset);
-            return;
+        } else {
+            /* Check if we need to do fsync even the aof buffer is empty,
+             * because previously in AOF_FSYNC_EVERYSEC mode, fsync is
+             * called only when aof buffer is not empty, so if users
+             * stop write commands before fsync called in one second,
+             * the data in page cache cannot be flushed in time. */
+            if (server.aof_fsync == AOF_FSYNC_EVERYSEC &&
+                server.mstime - server.aof_last_fsync >= 1000 &&
+                !(sync_in_progress = aofFsyncInProgress()))
+                goto try_fsync;
+
+            /* Check if we need to do fsync even the aof buffer is empty,
+             * the reason is described in the previous AOF_FSYNC_EVERYSEC block,
+             * and AOF_FSYNC_ALWAYS is also checked here to handle a case where
+             * aof_fsync is changed from everysec to always. */
+            if (server.aof_fsync == AOF_FSYNC_ALWAYS)
+                goto try_fsync;
         }
+        return;
     }
 
     if (server.aof_fsync == AOF_FSYNC_EVERYSEC)

From 662cb2fe75d747734d3ca4bfd062044315fb9dc9 Mon Sep 17 00:00:00 2001
From: Yuan Wang <yuan.wang@redis.com>
Date: Thu, 13 Feb 2025 10:52:19 +0800
Subject: [PATCH 03/24] Don't send unnecessary PING to replicas (#13790)

The reason why master sends PING is to keep the connection with replica
active, so master need not send PING to replicas if already sent
replication stream in the past `repl_ping_slave_period` time.

Now master only sends PINGs and increases `master_repl_offset` if there
is no traffic, so this PR also can reduce the impact of issue in
https://github.com/redis/redis/pull/13773, of course, does not resolve
it completely.
> Fix ping causing inconsistency between AOF size and replication offset
in the future PR. Because we increment the replication offset when
sending PING/REPLCONF to the replica but do not write data to the AOF
file, this might cause the starting offset of the AOF file plus its size
to be inconsistent with the actual replication offset.
---
 src/replication.c | 15 +++++++++------
 src/server.c      |  1 +
 src/server.h      |  1 +
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/replication.c b/src/replication.c
index c2a49951a3c..1c2351c0f76 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -519,6 +519,9 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
     /* We can't have slaves attached and no backlog. */
     serverAssert(!(listLength(slaves) != 0 && server.repl_backlog == NULL));
 
+    /* Update the time of sending replication stream to replicas. */
+    server.repl_stream_lastio = server.unixtime;
+
     /* Must install write handler for all replicas first before feeding
      * replication stream. */
     prepareReplicasToWrite();
@@ -4479,8 +4482,6 @@ long long replicationGetSlaveOffset(void) {
 
 /* Replication cron function, called 1 time per second. */
 void replicationCron(void) {
-    static long long replication_cron_loops = 0;
-
     /* Check failover status first, to see if we need to start
      * handling the failover. */
     updateFailoverStatus();
@@ -4533,9 +4534,12 @@ void replicationCron(void) {
     listNode *ln;
     robj *ping_argv[1];
 
-    /* First, send PING according to ping_slave_period. */
-    if ((replication_cron_loops % server.repl_ping_slave_period) == 0 &&
-        listLength(server.slaves))
+    /* First, send PING according to ping_slave_period. The reason why master
+     * sends PING is to keep the connection with replica active, so master need
+     * not send PING to replicas if already sent replication stream in the past
+     * repl_ping_slave_period time. */
+    if (server.masterhost == NULL && listLength(server.slaves) &&
+        server.unixtime >= server.repl_stream_lastio + server.repl_ping_slave_period)
     {
         /* Note that we don't send the PING if the clients are paused during
          * a Redis Cluster manual failover: the PING we send will otherwise
@@ -4672,7 +4676,6 @@ void replicationCron(void) {
 
     /* Refresh the number of slaves with lag <= min-slaves-max-lag. */
     refreshGoodSlavesCount();
-    replication_cron_loops++; /* Incremented with frequency 1 HZ. */
 }
 
 int shouldStartChildReplication(int *mincapa_out, int *req_out) {
diff --git a/src/server.c b/src/server.c
index 5f20900e053..b76b3526ccf 100644
--- a/src/server.c
+++ b/src/server.c
@@ -2192,6 +2192,7 @@ void initServerConfig(void) {
     server.repl_down_since = 0; /* Never connected, repl is down since EVER. */
     server.master_repl_offset = 0;
     server.fsynced_reploff_pending = 0;
+    server.repl_stream_lastio = server.unixtime;
 
     /* Replication partial resync backlog */
     server.repl_backlog = NULL;
diff --git a/src/server.h b/src/server.h
index 72a6f779dbd..e0634d77a46 100644
--- a/src/server.h
+++ b/src/server.h
@@ -2012,6 +2012,7 @@ struct redisServer {
     size_t repl_buffer_mem;         /* The memory of replication buffer. */
     list *repl_buffer_blocks;       /* Replication buffers blocks list
                                      * (serving replica clients and repl backlog) */
+    time_t repl_stream_lastio;      /* Unix time of the latest sending replication stream. */
     /* Replication (slave) */
     char *masteruser;               /* AUTH with this user and masterauth with master */
     sds masterauth;                 /* AUTH with this password with master */

From d29f04a63b74067866283b5205f21566e447e0eb Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Thu, 13 Feb 2025 15:49:46 +0800
Subject: [PATCH 04/24] Add support for module to defrag incremental

---
 src/defrag.c               | 40 +++++++++++++++++++++++++++++++++++---
 src/module.c               | 20 -------------------
 src/server.h               | 11 ++++++++++-
 tests/modules/defragtest.c | 27 ++++++++++++++-----------
 4 files changed, 63 insertions(+), 35 deletions(-)

diff --git a/src/defrag.c b/src/defrag.c
index a7c77296d68..a6a04c21095 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -118,6 +118,12 @@ typedef struct {
 } defragPubSubCtx;
 static_assert(offsetof(defragPubSubCtx, kvstate) == 0, "defragStageKvstoreHelper requires this");
 
+typedef struct {
+    RedisModule *module;
+    RedisModuleDefragCtx *module_ctx;
+    unsigned long cursor;
+} defragModuleCtx;
+
 /* When scanning a main kvstore, large elements are queued for later handling rather than
  * causing a large latency spike while processing a hash table bucket. This list is only used
  * for stage: "defragStageDbKeys". It will only contain values for the current kvstore being
@@ -1238,11 +1244,25 @@ static doneStatus defragLuaScripts(monotime endtime, void *target, void *privdat
     return DEFRAG_DONE;
 }
 
+/* Handles defragmentation of module global data. This is a stage function
+ * that gets called periodically during the active defragmentation process. */
 static doneStatus defragModuleGlobals(monotime endtime, void *target, void *privdata) {
     UNUSED(target);
-    UNUSED(privdata);
+    defragModuleCtx *ctx = privdata;
     if (endtime == 0) return DEFRAG_NOT_DONE; /* required initialization */
-    moduleDefragGlobals();
+
+    /* Set up context for the module's defrag callback. */
+    ctx->module_ctx->endtime = endtime;
+    ctx->module_ctx->cursor = &ctx->cursor;
+
+    /* Call the module's defrag callback function and check if more work remains. */
+    ctx->module->defrag_cb(ctx->module_ctx);
+    if (ctx->cursor != 0)
+        return DEFRAG_NOT_DONE;
+
+    /* Clean up when module defragmentation is complete. */
+    zfree(ctx->module_ctx);
+    zfree(ctx);
     return DEFRAG_DONE;
 }
 
@@ -1508,7 +1528,21 @@ static void beginDefragCycle(void) {
     addDefragStage(defragStagePubsubKvstore, server.pubsubshard_channels, &getClientPubSubShardChannelsFn);
 
     addDefragStage(defragLuaScripts, NULL, NULL);
-    addDefragStage(defragModuleGlobals, NULL, NULL);
+
+    /* Add stages for modules. */
+    dictIterator *di = dictGetIterator(modules);
+    dictEntry *de;
+    while ((de = dictNext(di)) != NULL) {
+        struct RedisModule *module = dictGetVal(de);
+        if (module->defrag_cb) {
+            defragModuleCtx *ctx = zmalloc(sizeof(defragModuleCtx));
+            ctx->module = module;
+            ctx->module_ctx = zcalloc(sizeof(RedisModuleDefragCtx));
+            ctx->cursor = 0;
+            addDefragStage(defragModuleGlobals, NULL, ctx);
+        }
+    }
+    dictReleaseIterator(di);
 
     defrag.current_stage = NULL;
     defrag.start_cycle = getMonotonicUs();
diff --git a/src/module.c b/src/module.c
index bb5f68daf1d..c4001608727 100644
--- a/src/module.c
+++ b/src/module.c
@@ -13778,16 +13778,6 @@ const char *RM_GetCurrentCommandName(RedisModuleCtx *ctx) {
  * ## Defrag API
  * -------------------------------------------------------------------------- */
 
-/* The defrag context, used to manage state during calls to the data type
- * defrag callback.
- */
-struct RedisModuleDefragCtx {
-    monotime endtime;
-    unsigned long *cursor;
-    struct redisObject *key; /* Optional name of key processed, NULL when unknown. */
-    int dbid;                /* The dbid of the key being processed, -1 when unknown. */
-};
-
 /* Register a defrag callback for global data, i.e. anything that the module
  * may allocate that is not tied to a specific data type.
  */
@@ -13987,16 +13977,6 @@ int moduleDefragValue(robj *key, robj *value, int dbid) {
     return 1;
 }
 
-/* Call registered module API defrag functions */
-void moduleDefragGlobals(void) {
-    dictForEach(modules, struct RedisModule, module, 
-        if (module->defrag_cb) {
-            RedisModuleDefragCtx defrag_ctx = { 0, NULL, NULL, -1};
-            module->defrag_cb(&defrag_ctx);
-        }
-    );
-}
-
 /* Call registered module API defrag start functions */
 void moduleDefragStart(void) {
     dictForEach(modules, struct RedisModule, module, 
diff --git a/src/server.h b/src/server.h
index d749c995153..8c10f6a4e6f 100644
--- a/src/server.h
+++ b/src/server.h
@@ -900,6 +900,16 @@ struct RedisModule {
 };
 typedef struct RedisModule RedisModule;
 
+/* The defrag context, used to manage state during calls to the data type
+ * defrag callback.
+ */
+struct RedisModuleDefragCtx {
+    monotime endtime;
+    unsigned long *cursor;
+    struct redisObject *key; /* Optional name of key processed, NULL when unknown. */
+    int dbid;                /* The dbid of the key being processed, -1 when unknown. */
+};
+
 /* This is a wrapper for the 'rio' streams used inside rdb.c in Redis, so that
  * the user does not have to take the total count of the written bytes nor
  * to care about error conditions. */
@@ -2673,7 +2683,6 @@ size_t moduleGetMemUsage(robj *key, robj *val, size_t sample_size, int dbid);
 robj *moduleTypeDupOrReply(client *c, robj *fromkey, robj *tokey, int todb, robj *value);
 int moduleDefragValue(robj *key, robj *obj, int dbid);
 int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid);
-void moduleDefragGlobals(void);
 void moduleDefragStart(void);
 void moduleDefragEnd(void);
 void *moduleGetHandleByName(char *modulename);
diff --git a/tests/modules/defragtest.c b/tests/modules/defragtest.c
index 597b5aa79f3..5715643d02d 100644
--- a/tests/modules/defragtest.c
+++ b/tests/modules/defragtest.c
@@ -26,29 +26,34 @@ unsigned long int defrag_started = 0;
 unsigned long int defrag_ended = 0;
 unsigned long int global_defragged = 0;
 
-int global_strings_len = 0;
+unsigned long global_strings_len = 0;
 RedisModuleString **global_strings = NULL;
 
-static void createGlobalStrings(RedisModuleCtx *ctx, int count)
+static void createGlobalStrings(RedisModuleCtx *ctx, unsigned long count)
 {
     global_strings_len = count;
     global_strings = RedisModule_Alloc(sizeof(RedisModuleString *) * count);
 
-    for (int i = 0; i < count; i++) {
+    for (unsigned long i = 0; i < count; i++) {
         global_strings[i] = RedisModule_CreateStringFromLongLong(ctx, i);
     }
 }
 
-static void defragGlobalStrings(RedisModuleDefragCtx *ctx)
+static int defragGlobalStrings(RedisModuleDefragCtx *ctx)
 {
-    for (int i = 0; i < global_strings_len; i++) {
-        RedisModuleString *new = RedisModule_DefragRedisModuleString(ctx, global_strings[i]);
-        global_attempts++;
-        if (new != NULL) {
-            global_strings[i] = new;
-            global_defragged++;
-        }
+    unsigned long cursor = 0;
+    RedisModule_DefragCursorGet(ctx, &cursor);
+
+    RedisModule_Assert(cursor < global_strings_len);
+    RedisModuleString *new = RedisModule_DefragRedisModuleString(ctx, global_strings[cursor]);
+    global_attempts++;
+    if (new != NULL) {
+        global_strings[cursor] = new;
+        global_defragged++;
     }
+
+    cursor++;
+    RedisModule_DefragCursorSet(ctx, cursor == global_strings_len ? 0 : cursor);
 }
 
 static void defragStart(RedisModuleDefragCtx *ctx) {

From 53c1d9931c0c4db22d139f6331d49e0a7e03dd8f Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Thu, 13 Feb 2025 16:02:18 +0800
Subject: [PATCH 05/24] Fix complaint

---
 tests/modules/defragtest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/modules/defragtest.c b/tests/modules/defragtest.c
index 5715643d02d..5b4ef27cded 100644
--- a/tests/modules/defragtest.c
+++ b/tests/modules/defragtest.c
@@ -39,7 +39,7 @@ static void createGlobalStrings(RedisModuleCtx *ctx, unsigned long count)
     }
 }
 
-static int defragGlobalStrings(RedisModuleDefragCtx *ctx)
+static void defragGlobalStrings(RedisModuleDefragCtx *ctx)
 {
     unsigned long cursor = 0;
     RedisModule_DefragCursorGet(ctx, &cursor);

From 7f5f5882322d19ebba3ccbf933e1252c03116d55 Mon Sep 17 00:00:00 2001
From: Yuan Wang <yuan.wang@redis.com>
Date: Thu, 13 Feb 2025 17:31:40 +0800
Subject: [PATCH 06/24] AOF offset info (#13773)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Background
AOF is often used as an effective data recovery method, but now if we
have two AOFs from different nodes, it is hard to learn which one has
latest data. Generally, we determine whose data is more up-to-date by
reading the latest modification time of the AOF file, but because of
replication delay, even if both master and replica write to the AOF at
the same time, the data in the master is more up-to-date (there are
commands that didn't arrive at the replica yet, or a large number of
commands have accumulated on replica side ), so we may make wrong
decision.

### Solution
The replication offset always increments when AOF is enabled even if
there is no replica, we think replication offset is better method to
determine which one has more up-to-date data, whoever has a larger
offset will have newer data, so we add the start replication offset info
for AOF, as bellow.
```
file appendonly.aof.2.base.rdb seq 2 type b
file appendonly.aof.2.incr.aof seq 2 type i startoffset 224
```
And if we close gracefully the AOF file, not a crash, such as
`shutdown`, `kill signal 15` or `config set appendonly no`, we will add
the end replication offset, as bellow.
```
file appendonly.aof.2.base.rdb seq 2 type b
file appendonly.aof.2.incr.aof seq 2 type i startoffset 224 endoffset 532
```

#### Things to pay attention to
- For BASE AOF, we do not add `startoffset` and `endoffset` info, since
we could not know the start replication replication of data, and it is
useless to help us to determine which one has more up-to-date data.
- For AOFs from old version, we also don't add `startoffset` and
`endoffset` info, since we also don't know start replication replication
of them. If we add the start offset from 0, we might make the judgment
even less accurate. For example, if the master has just rewritten the
AOF, its INCR AOF will inevitably be very small. However, if the replica
has not rewritten AOF for a long time, its INCR AOF might be much
larger. By applying the following method, we might make incorrect
decisions, so we still just check timestamp instead of adding offset
info
- If the last INCR AOF has `startoffset` or `endoffset`, we need to
restore `server.master_repl_offset` according to them to avoid the
rollback of the `startoffset` of next INCR AOF. If it has `endoffset`,
we just use this value as `server.master_repl_offset`, and a very
important thing is to remove this information from the manifest file to
avoid the next time we load the manifest file with wrong `endoffset`. If
it only has `startoffset`, we calculate `server.master_repl_offset` by
the `startoffset` plus the file size.

### How to determine which one has more up-to-date data
If AOF has a larger replication offset, it will have more up-to-date
data. The following is how to get AOF offset:

Read the AOF manifest file to obtain information about **the last INCR
AOF**
1. If the last INCR AOF has `endoffset` field, we can directly use the
`endoffset` to present the replication offset of AOF
2. If there is no `endoffset`(such as redis crashes abnormally), but
there is `startoffset` filed of the last INCR AOF, we can get the
replication offset of AOF by `startoffset` plus the file size
3. Finally, if the AOF doesn’t have both `startoffset` and `endoffset`,
maybe from old version, and new version redis has not rewritten AOF yet,
we still need to check the modification timestamp of the last INCR AOF

### TODO
Fix ping causing inconsistency between AOF size and replication
offset in the future PR. Because we increment the replication offset
when sending PING/REPLCONF to the replica but do not write data to the
AOF file, this might cause the starting offset of the AOF file plus its
size to be inconsistent with the actual replication offset.
---
 src/aof.c                            |  86 ++++++++++-
 src/server.c                         |   4 +
 src/server.h                         |   4 +
 tests/integration/aof-multi-part.tcl | 206 +++++++++++++++++++++++++++
 tests/support/aofmanifest.tcl        |   2 +-
 5 files changed, 295 insertions(+), 7 deletions(-)

diff --git a/src/aof.c b/src/aof.c
index 89443e4bdba..f3f6d782fc4 100644
--- a/src/aof.c
+++ b/src/aof.c
@@ -30,6 +30,13 @@ aofManifest *aofLoadManifestFromFile(sds am_filepath);
 void aofManifestFreeAndUpdate(aofManifest *am);
 void aof_background_fsync_and_close(int fd);
 
+/* When we call 'startAppendOnly', we will create a temp INCR AOF, and rename
+ * it to the real INCR AOF name when the AOFRW is done, so if want to know the
+ * accurate start offset of the INCR AOF, we need to record it when we create
+ * the temp INCR AOF. This variable is used to record the start offset, and
+ * set the start offset of the real INCR AOF when the AOFRW is done. */
+static long long tempIncAofStartReplOffset = 0;
+
 /* ----------------------------------------------------------------------------
  * AOF Manifest file implementation.
  *
@@ -73,10 +80,15 @@ void aof_background_fsync_and_close(int fd);
 #define AOF_MANIFEST_KEY_FILE_NAME   "file"
 #define AOF_MANIFEST_KEY_FILE_SEQ    "seq"
 #define AOF_MANIFEST_KEY_FILE_TYPE   "type"
+#define AOF_MANIFEST_KEY_FILE_STARTOFFSET "startoffset"
+#define AOF_MANIFEST_KEY_FILE_ENDOFFSET   "endoffset"
 
 /* Create an empty aofInfo. */
 aofInfo *aofInfoCreate(void) {
-    return zcalloc(sizeof(aofInfo));
+    aofInfo *ai = zcalloc(sizeof(aofInfo));
+    ai->start_offset = -1;
+    ai->end_offset = -1;
+    return ai;
 }
 
 /* Free the aofInfo structure (pointed to by ai) and its embedded file_name. */
@@ -93,6 +105,8 @@ aofInfo *aofInfoDup(aofInfo *orig) {
     ai->file_name = sdsdup(orig->file_name);
     ai->file_seq = orig->file_seq;
     ai->file_type = orig->file_type;
+    ai->start_offset = orig->start_offset;
+    ai->end_offset = orig->end_offset;
     return ai;
 }
 
@@ -105,10 +119,19 @@ sds aofInfoFormat(sds buf, aofInfo *ai) {
     if (sdsneedsrepr(ai->file_name))
         filename_repr = sdscatrepr(sdsempty(), ai->file_name, sdslen(ai->file_name));
 
-    sds ret = sdscatprintf(buf, "%s %s %s %lld %s %c\n",
+    sds ret = sdscatprintf(buf, "%s %s %s %lld %s %c",
         AOF_MANIFEST_KEY_FILE_NAME, filename_repr ? filename_repr : ai->file_name,
         AOF_MANIFEST_KEY_FILE_SEQ, ai->file_seq,
         AOF_MANIFEST_KEY_FILE_TYPE, ai->file_type);
+
+    if (ai->start_offset != -1) {
+        ret = sdscatprintf(ret, " %s %lld", AOF_MANIFEST_KEY_FILE_STARTOFFSET, ai->start_offset);
+        if (ai->end_offset != -1) {
+            ret = sdscatprintf(ret, " %s %lld", AOF_MANIFEST_KEY_FILE_ENDOFFSET, ai->end_offset);
+        }
+    }
+
+    ret = sdscatlen(ret, "\n", 1);
     sdsfree(filename_repr);
 
     return ret;
@@ -304,6 +327,10 @@ aofManifest *aofLoadManifestFromFile(sds am_filepath) {
                 ai->file_seq = atoll(argv[i+1]);
             } else if (!strcasecmp(argv[i], AOF_MANIFEST_KEY_FILE_TYPE)) {
                 ai->file_type = (argv[i+1])[0];
+            } else if (!strcasecmp(argv[i], AOF_MANIFEST_KEY_FILE_STARTOFFSET)) {
+                ai->start_offset = atoll(argv[i+1]);
+            } else if (!strcasecmp(argv[i], AOF_MANIFEST_KEY_FILE_ENDOFFSET)) {
+                ai->end_offset = atoll(argv[i+1]);
             }
             /* else if (!strcasecmp(argv[i], AOF_MANIFEST_KEY_OTHER)) {} */
         }
@@ -433,12 +460,13 @@ sds getNewBaseFileNameAndMarkPreAsHistory(aofManifest *am) {
  * for example:
  *  appendonly.aof.1.incr.aof
  */
-sds getNewIncrAofName(aofManifest *am) {
+sds getNewIncrAofName(aofManifest *am, long long start_reploff) {
     aofInfo *ai = aofInfoCreate();
     ai->file_type = AOF_FILE_TYPE_INCR;
     ai->file_name = sdscatprintf(sdsempty(), "%s.%lld%s%s", server.aof_filename,
                         ++am->curr_incr_file_seq, INCR_FILE_SUFFIX, AOF_FORMAT_SUFFIX);
     ai->file_seq = am->curr_incr_file_seq;
+    ai->start_offset = start_reploff;
     listAddNodeTail(am->incr_aof_list, ai);
     am->dirty = 1;
     return ai->file_name;
@@ -456,7 +484,7 @@ sds getLastIncrAofName(aofManifest *am) {
 
     /* If 'incr_aof_list' is empty, just create a new one. */
     if (!listLength(am->incr_aof_list)) {
-        return getNewIncrAofName(am);
+        return getNewIncrAofName(am, server.master_repl_offset);
     }
 
     /* Or return the last one. */
@@ -781,10 +809,11 @@ int openNewIncrAofForAppend(void) {
     if (server.aof_state == AOF_WAIT_REWRITE) {
         /* Use a temporary INCR AOF file to accumulate data during AOF_WAIT_REWRITE. */
         new_aof_name = getTempIncrAofName();
+        tempIncAofStartReplOffset = server.master_repl_offset;
     } else {
         /* Dup a temp aof_manifest to modify. */
         temp_am = aofManifestDup(server.aof_manifest);
-        new_aof_name = sdsdup(getNewIncrAofName(temp_am));
+        new_aof_name = sdsdup(getNewIncrAofName(temp_am, server.master_repl_offset));
     }
     sds new_aof_filepath = makePath(server.aof_dirname, new_aof_name);
     newfd = open(new_aof_filepath, O_WRONLY|O_TRUNC|O_CREAT, 0644);
@@ -833,6 +862,50 @@ int openNewIncrAofForAppend(void) {
     return C_ERR;
 }
 
+/* When we close gracefully the AOF file, we have the chance to persist the
+ * end replication offset of current INCR AOF. */
+void updateCurIncrAofEndOffset(void) {
+    if (server.aof_state != AOF_ON) return;
+    serverAssert(server.aof_manifest != NULL);
+
+    if (listLength(server.aof_manifest->incr_aof_list) == 0) return;
+    aofInfo *ai = listNodeValue(listLast(server.aof_manifest->incr_aof_list));
+    ai->end_offset = server.master_repl_offset;
+    server.aof_manifest->dirty = 1;
+    /* It doesn't matter if the persistence fails since this information is not
+     * critical, we can get an approximate value by start offset plus file size. */
+    persistAofManifest(server.aof_manifest);
+}
+
+/* After loading AOF data, we need to update the `server.master_repl_offset`
+ * based on the information of the last INCR AOF, to avoid the rollback of
+ * the start offset of new INCR AOF. */
+void updateReplOffsetAndResetEndOffset(void) {
+    if (server.aof_state != AOF_ON) return;
+    serverAssert(server.aof_manifest != NULL);
+
+    /* If the INCR file has an end offset, we directly use it, and clear it
+     * to avoid the next time we load the manifest file, we will use the same
+     * offset, but the real offset may have advanced. */
+    if (listLength(server.aof_manifest->incr_aof_list) == 0) return;
+    aofInfo *ai = listNodeValue(listLast(server.aof_manifest->incr_aof_list));
+    if (ai->end_offset != -1) {
+        server.master_repl_offset = ai->end_offset;
+        ai->end_offset = -1;
+        server.aof_manifest->dirty = 1;
+        /* We must update the end offset of INCR file correctly, otherwise we
+         * may keep wrong information in the manifest file, since we continue
+         * to append data to the same INCR file. */
+        if (persistAofManifest(server.aof_manifest) != AOF_OK)
+            exit(1);
+    } else {
+        /* If the INCR file doesn't have an end offset, we need to calculate
+         * the replication offset by the start offset plus the file size. */
+        server.master_repl_offset = (ai->start_offset == -1 ? 0 : ai->start_offset) +
+                                    getAppendOnlyFileSize(ai->file_name, NULL);
+    }
+}
+
 /* Whether to limit the execution of Background AOF rewrite.
  *
  * At present, if AOFRW fails, redis will automatically retry. If it continues
@@ -938,6 +1011,7 @@ void stopAppendOnly(void) {
         server.aof_last_fsync = server.mstime;
     }
     close(server.aof_fd);
+    updateCurIncrAofEndOffset();
 
     server.aof_fd = -1;
     server.aof_selected_db = -1;
@@ -2664,7 +2738,7 @@ void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
             sds temp_incr_aof_name = getTempIncrAofName();
             sds temp_incr_filepath = makePath(server.aof_dirname, temp_incr_aof_name);
             /* Get next new incr aof name. */
-            sds new_incr_filename = getNewIncrAofName(temp_am);
+            sds new_incr_filename = getNewIncrAofName(temp_am, tempIncAofStartReplOffset);
             new_incr_filepath = makePath(server.aof_dirname, new_incr_filename);
             latencyStartMonitor(latency);
             if (rename(temp_incr_filepath, new_incr_filepath) == -1) {
diff --git a/src/server.c b/src/server.c
index b76b3526ccf..0e056c8be4c 100644
--- a/src/server.c
+++ b/src/server.c
@@ -4600,6 +4600,9 @@ int finishShutdown(void) {
         }
     }
 
+    /* Update the end offset of current INCR AOF if possible. */
+    updateCurIncrAofEndOffset();
+
     /* Free the AOF manifest. */
     if (server.aof_manifest) aofManifestFree(server.aof_manifest);
 
@@ -6864,6 +6867,7 @@ void loadDataFromDisk(void) {
             exit(1);
         if (ret != AOF_NOT_EXIST)
             serverLog(LL_NOTICE, "DB loaded from append only file: %.3f seconds", (float)(ustime()-start)/1000000);
+        updateReplOffsetAndResetEndOffset();
     } else {
         rdbSaveInfo rsi = RDB_SAVE_INFO_INIT;
         int rsi_is_valid = 0;
diff --git a/src/server.h b/src/server.h
index e0634d77a46..e04035c28e2 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1620,6 +1620,8 @@ typedef struct {
     sds           file_name;  /* file name */
     long long     file_seq;   /* file sequence */
     aof_file_type file_type;  /* file type */
+    long long     start_offset;  /* the start replication offset of the file */
+    long long     end_offset;    /* the end replication offset of the file */
 } aofInfo;
 
 typedef struct {
@@ -3059,6 +3061,8 @@ void aofOpenIfNeededOnServerStart(void);
 void aofManifestFree(aofManifest *am);
 int aofDelHistoryFiles(void);
 int aofRewriteLimited(void);
+void updateCurIncrAofEndOffset(void);
+void updateReplOffsetAndResetEndOffset(void);
 
 /* Child info */
 void openChildInfoPipe(void);
diff --git a/tests/integration/aof-multi-part.tcl b/tests/integration/aof-multi-part.tcl
index bdd03823398..5a0025070a5 100644
--- a/tests/integration/aof-multi-part.tcl
+++ b/tests/integration/aof-multi-part.tcl
@@ -1329,4 +1329,210 @@ tags {"external:skip"} {
             }
         }
     }
+
+    # Test Part 3
+    #
+    # Test if INCR AOF offset information is as expected
+    test {Multi Part AOF writes start offset in the manifest} {
+        set aof_dirpath "$server_path/$aof_dirname"
+        set aof_manifest_file "$server_path/$aof_dirname/${aof_basename}$::manifest_suffix"
+
+        start_server_aof [list dir $server_path] {
+            set client [redis [srv host] [srv port] 0 $::tls]
+            wait_done_loading $client
+
+            # The manifest file has startoffset now
+            assert_aof_manifest_content $aof_manifest_file {
+                {file appendonly.aof.1.base.rdb seq 1 type b}
+                {file appendonly.aof.1.incr.aof seq 1 type i startoffset 0}
+            }
+        }
+
+        clean_aof_persistence $aof_dirpath
+    }
+
+    test {Multi Part AOF won't add the offset of incr AOF from old version} {
+        create_aof $aof_dirpath $aof_base1_file {
+            append_to_aof [formatCommand set k1 v1]
+        }
+
+        create_aof $aof_dirpath $aof_incr1_file {
+            append_to_aof [formatCommand set k2 v2]
+        }
+
+        create_aof_manifest $aof_dirpath $aof_manifest_file {
+            append_to_manifest "file appendonly.aof.1.base.aof seq 1 type b\n"
+            append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i\n"
+        }
+
+        start_server_aof [list dir $server_path] {
+            assert_equal 1 [is_alive [srv pid]]
+            set client [redis [srv host] [srv port] 0 $::tls]
+            wait_done_loading $client
+
+            assert_equal v1 [$client get k1]
+            assert_equal v2 [$client get k2]
+
+            $client set k3 v3
+            catch {$client shutdown}
+
+            # Should not add offset to the manifest since we also don't know the right
+            # starting replication of them.
+            set fp [open $aof_manifest_file r]
+            set content [read $fp]
+            close $fp
+            assert ![regexp {startoffset} $content]
+
+            # The manifest file still have information from the old version
+            assert_aof_manifest_content $aof_manifest_file  {
+                {file appendonly.aof.1.base.aof seq 1 type b}
+                {file appendonly.aof.1.incr.aof seq 1 type i}
+            }
+        }
+
+        clean_aof_persistence $aof_dirpath
+    }
+
+    test {Multi Part AOF can update master_repl_offset with only startoffset info} {
+        create_aof $aof_dirpath $aof_base1_file {
+            append_to_aof [formatCommand set k1 v1]
+        }
+
+        create_aof $aof_dirpath $aof_incr1_file {
+            append_to_aof [formatCommand set k2 v2]
+        }
+
+        create_aof_manifest $aof_dirpath $aof_manifest_file {
+            append_to_manifest "file appendonly.aof.1.base.aof seq 1 type b\n"
+            append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i startoffset 100\n"
+        }
+
+        start_server [list overrides [list dir $server_path appendonly yes ]] {
+            wait_done_loading r
+            r select 0
+            assert_equal v1 [r get k1]
+            assert_equal v2 [r get k2]
+
+            # After loading AOF, redis will update the replication offset based on
+            # the information of the last INCR AOF, to avoid the rollback of the
+            # start offset of new INCR AOF. If the INCR file doesn't have an end offset
+            # info, redis will calculate the replication offset by the start offset
+            # plus the file size.
+            set file_size [file size $aof_incr1_file]
+            set offset [expr $file_size + 100]
+            assert_equal $offset [s master_repl_offset]
+        }
+
+        clean_aof_persistence $aof_dirpath
+    }
+
+    test {Multi Part AOF can update master_repl_offset with endoffset info} {
+        create_aof $aof_dirpath $aof_base1_file {
+            append_to_aof [formatCommand set k1 v1]
+        }
+
+        create_aof $aof_dirpath $aof_incr1_file {
+            append_to_aof [formatCommand set k2 v2]
+        }
+
+        create_aof_manifest $aof_dirpath $aof_manifest_file {
+            append_to_manifest "file appendonly.aof.1.base.aof seq 1 type b\n"
+            append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i startoffset 100 endoffset 200\n"
+        }
+
+        start_server [list overrides [list dir $server_path appendonly yes ]] {
+            wait_done_loading r
+            r select 0
+            assert_equal v1 [r get k1]
+            assert_equal v2 [r get k2]
+
+            # If the INCR file has an end offset, redis directly uses it as replication offset
+            assert_equal 200 [s master_repl_offset]
+
+            # We should reset endoffset in manifest file
+            set fp [open $aof_manifest_file r]
+            set content [read $fp]
+            close $fp
+            assert ![regexp {endoffset} $content]
+        }
+
+        clean_aof_persistence $aof_dirpath
+    }
+
+    test {Multi Part AOF will add the end offset if we close gracefully the AOF} {
+        start_server_aof [list dir $server_path] {
+            set client [redis [srv host] [srv port] 0 $::tls]
+            wait_done_loading $client
+
+            assert_aof_manifest_content $aof_manifest_file {
+                {file appendonly.aof.1.base.rdb seq 1 type b}
+                {file appendonly.aof.1.incr.aof seq 1 type i startoffset 0}
+            }
+
+            $client set k1 v1
+            $client set k2 v2
+            # Close AOF gracefully when stopping appendonly, we should add endoffset
+            # in the manifest file, 'endoffset' should be 2 since writing 2 commands
+            r config set appendonly no
+            assert_aof_manifest_content $aof_manifest_file {
+                {file appendonly.aof.1.base.rdb seq 1 type b}
+                {file appendonly.aof.1.incr.aof seq 1 type i startoffset 0 endoffset 2}
+            }
+            r config set appendonly yes
+            waitForBgrewriteaof $client
+
+            $client set k3 v3
+            # Close AOF gracefully when shutting down server, we should add endoffset
+            # in the manifest file, 'endoffset' should be 3 since writing 3 commands
+            catch {$client shutdown}
+            assert_aof_manifest_content $aof_manifest_file {
+                {file appendonly.aof.2.base.rdb seq 2 type b}
+                {file appendonly.aof.2.incr.aof seq 2 type i startoffset 2 endoffset 3}
+            }
+        }
+
+        clean_aof_persistence $aof_dirpath
+    }
+
+    test {INCR AOF has accurate start offset when AOFRW} {
+        start_server [list overrides [list dir $server_path appendonly yes ]] {
+            r config set auto-aof-rewrite-percentage 0
+
+            # Start write load to let the master_repl_offset continue increasing
+            # since appendonly is enabled
+            set load_handle0 [start_write_load [srv 0 host] [srv 0 port] 10]
+            wait_for_condition 50 100 {
+                [r dbsize] > 0
+            } else {
+                fail "No write load detected."
+            }
+
+            # We obtain the master_repl_offset at the time of bgrewriteaof by pausing
+            # the redis process, sending pipeline commands, and then resuming the process
+            set rd [redis_deferring_client]
+            pause_process [srv 0 pid]
+            set buf "info replication\r\n"
+            append buf "bgrewriteaof\r\n"
+            $rd write $buf
+            $rd flush
+            resume_process [srv 0 pid]
+            # Read the replication offset and the start of the bgrewriteaof
+            regexp {master_repl_offset:(\d+)} [$rd read] -> offset1
+            assert_match {*rewriting started*} [$rd read]
+            $rd close
+
+            # Get the start offset from the manifest file after bgrewriteaof
+            waitForBgrewriteaof r
+            set fp [open $aof_manifest_file r]
+            set content [read $fp]
+            close $fp
+            set offset2 [lindex [regexp -inline {startoffset (\d+)} $content] 1]
+
+            # The start offset of INCR AOF should be the same as master_repl_offset
+            # when we trigger bgrewriteaof
+            assert {$offset1 == $offset2}
+            stop_write_load $load_handle0
+            wait_load_handlers_disconnected
+        }
+    }
 }
diff --git a/tests/support/aofmanifest.tcl b/tests/support/aofmanifest.tcl
index 151626294fd..68eed037b5a 100644
--- a/tests/support/aofmanifest.tcl
+++ b/tests/support/aofmanifest.tcl
@@ -122,7 +122,7 @@ proc assert_aof_manifest_content {manifest_path content} {
     assert_equal [llength $lines] [llength $content]
 
     for { set i 0 } { $i < [llength $lines] } {incr i} {
-        assert_equal [lindex $lines $i] [lindex $content $i]
+        assert {[string first [lindex $content $i] [lindex $lines $i]] != -1}
     }
 }
 

From 57807cd33806116d2d65c2e037ffa35cdadfd801 Mon Sep 17 00:00:00 2001
From: Ofir Luzon <ofirluzon@gmail.com>
Date: Thu, 13 Feb 2025 17:18:47 -0800
Subject: [PATCH 07/24] Memory Usage command LIST accuracy fix (#13783)

MEMORY USAGE on a List samples quicklist entries, but does not account
to how many elements are in each sampled node. This can skew the
calculation when the sampled nodes are not balanced.
The fix calculate the average element size in the sampled nodes instead
of the average node size.
---
 src/object.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/object.c b/src/object.c
index 6a03f26a741..45acd2e8360 100644
--- a/src/object.c
+++ b/src/object.c
@@ -988,7 +988,7 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) {
     dict *d;
     dictIterator *di;
     struct dictEntry *de;
-    size_t asize = 0, elesize = 0, samples = 0;
+    size_t asize = 0, elesize = 0, elecount = 0, samples = 0;
 
     if (o->type == OBJ_STRING) {
         if(o->encoding == OBJ_ENCODING_INT) {
@@ -1007,9 +1007,10 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) {
             asize = sizeof(*o)+sizeof(quicklist);
             do {
                 elesize += sizeof(quicklistNode)+zmalloc_size(node->entry);
+                elecount += node->count;
                 samples++;
             } while ((node = node->next) && samples < sample_size);
-            asize += (double)elesize/samples*ql->len;
+            asize += (double)elesize/elecount*ql->count;
         } else if (o->encoding == OBJ_ENCODING_LISTPACK) {
             asize = sizeof(*o)+zmalloc_size(o->ptr);
         } else {

From e2608478b6a46998bb3196b9b3ef4a24cbbae8a7 Mon Sep 17 00:00:00 2001
From: Ozan Tezcan <ozantezcan@gmail.com>
Date: Fri, 14 Feb 2025 17:13:35 +0300
Subject: [PATCH 08/24] Add HGETDEL, HGETEX and HSETEX hash commands (#13798)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR adds three new hash commands: HGETDEL, HGETEX and HSETEX. These
commands enable user to do multiple operations in one step atomically
e.g. set a hash field and update its TTL with a single command.
Previously, it was only possible to do it by calling hset and hexpire
commands subsequently.

- **HGETDEL command**

  ```
  HGETDEL <key> FIELDS <numfields> field [field ...]
  ```

  **Description**
  Get and delete the value of one or more fields of a given hash key

  **Reply**
Array reply: list of the value associated with each field or nil if the
field doesn’t exist.

- **HGETEX command**

  ```
   HGETEX <key>
[EX seconds | PX milliseconds | EXAT unix-time-seconds | PXAT
unix-time-milliseconds | PERSIST]
     FIELDS <numfields> field [field ...]
  ```

  **Description**
Get the value of one or more fields of a given hash key, and optionally
set their expiration

  **Options:**
  EX seconds: Set the specified expiration time, in seconds.
  PX milliseconds: Set the specified expiration time, in milliseconds.
EXAT timestamp-seconds: Set the specified Unix time at which the field
will expire, in seconds.
PXAT timestamp-milliseconds: Set the specified Unix time at which the
field will expire, in milliseconds.
  PERSIST: Remove the time to live associated with the field.

  **Reply**
Array reply: list of the value associated with each field or nil if the
field doesn’t exist.

- **HSETEX command**

  ```
  HSETEX <key>
     [FNX | FXX]
[EX seconds | PX milliseconds | EXAT unix-time-seconds | PXAT
unix-time-milliseconds | KEEPTTL]
     FIELDS <numfields> field value [field value...]
  ```
  **Description**
Set the value of one or more fields of a given hash key, and optionally
set their expiration

  **Options:**
  FNX: Only set the fields if all do not already exist.
  FXX: Only set the fields if all already exist.

  EX seconds: Set the specified expiration time, in seconds.
  PX milliseconds: Set the specified expiration time, in milliseconds.
EXAT timestamp-seconds: Set the specified Unix time at which the field
will expire, in seconds.
PXAT timestamp-milliseconds: Set the specified Unix time at which the
field will expire, in milliseconds.
  KEEPTTL: Retain the time to live associated with the field.


Note: If no option is provided, any associated expiration time will be
discarded similar to how SET command behaves.

  **Reply**
  Integer reply: 0 if no fields were set
  Integer reply: 1 if all the fields were set
---
 src/commands.def                      | 129 +++++
 src/commands/hgetdel.json             |  78 +++
 src/commands/hgetex.json              | 111 ++++
 src/commands/hsetex.json              | 132 +++++
 src/server.c                          |   1 +
 src/server.h                          |   9 +-
 src/t_hash.c                          | 778 +++++++++++++++++++++-----
 tests/unit/info-keysizes.tcl          |  25 +
 tests/unit/pubsub.tcl                 |  52 ++
 tests/unit/type/hash-field-expire.tcl | 653 +++++++++++++++++++++
 tests/unit/type/hash.tcl              |  84 +++
 11 files changed, 1924 insertions(+), 128 deletions(-)
 create mode 100644 src/commands/hgetdel.json
 create mode 100644 src/commands/hgetex.json
 create mode 100644 src/commands/hsetex.json

diff --git a/src/commands.def b/src/commands.def
index d8f49634217..dd55c816216 100644
--- a/src/commands.def
+++ b/src/commands.def
@@ -3472,6 +3472,78 @@ struct COMMAND_ARG HGETALL_Args[] = {
 {MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)},
 };
 
+/********** HGETDEL ********************/
+
+#ifndef SKIP_CMD_HISTORY_TABLE
+/* HGETDEL history */
+#define HGETDEL_History NULL
+#endif
+
+#ifndef SKIP_CMD_TIPS_TABLE
+/* HGETDEL tips */
+#define HGETDEL_Tips NULL
+#endif
+
+#ifndef SKIP_CMD_KEY_SPECS_TABLE
+/* HGETDEL key specs */
+keySpec HGETDEL_Keyspecs[1] = {
+{NULL,CMD_KEY_RW|CMD_KEY_ACCESS|CMD_KEY_DELETE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}}
+};
+#endif
+
+/* HGETDEL fields argument table */
+struct COMMAND_ARG HGETDEL_fields_Subargs[] = {
+{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)},
+};
+
+/* HGETDEL argument table */
+struct COMMAND_ARG HGETDEL_Args[] = {
+{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HGETDEL_fields_Subargs},
+};
+
+/********** HGETEX ********************/
+
+#ifndef SKIP_CMD_HISTORY_TABLE
+/* HGETEX history */
+#define HGETEX_History NULL
+#endif
+
+#ifndef SKIP_CMD_TIPS_TABLE
+/* HGETEX tips */
+#define HGETEX_Tips NULL
+#endif
+
+#ifndef SKIP_CMD_KEY_SPECS_TABLE
+/* HGETEX key specs */
+keySpec HGETEX_Keyspecs[1] = {
+{"RW and UPDATE because it changes the TTL",CMD_KEY_RW|CMD_KEY_ACCESS|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}}
+};
+#endif
+
+/* HGETEX expiration argument table */
+struct COMMAND_ARG HGETEX_expiration_Subargs[] = {
+{MAKE_ARG("seconds",ARG_TYPE_INTEGER,-1,"EX",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("milliseconds",ARG_TYPE_INTEGER,-1,"PX",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("unix-time-seconds",ARG_TYPE_UNIX_TIME,-1,"EXAT",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("unix-time-milliseconds",ARG_TYPE_UNIX_TIME,-1,"PXAT",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("persist",ARG_TYPE_PURE_TOKEN,-1,"PERSIST",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+};
+
+/* HGETEX fields argument table */
+struct COMMAND_ARG HGETEX_fields_Subargs[] = {
+{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,0,NULL)},
+};
+
+/* HGETEX argument table */
+struct COMMAND_ARG HGETEX_Args[] = {
+{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("expiration",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,5,NULL),.subargs=HGETEX_expiration_Subargs},
+{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HGETEX_fields_Subargs},
+};
+
 /********** HINCRBY ********************/
 
 #ifndef SKIP_CMD_HISTORY_TABLE
@@ -3903,6 +3975,60 @@ struct COMMAND_ARG HSET_Args[] = {
 {MAKE_ARG("data",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,2,NULL),.subargs=HSET_data_Subargs},
 };
 
+/********** HSETEX ********************/
+
+#ifndef SKIP_CMD_HISTORY_TABLE
+/* HSETEX history */
+#define HSETEX_History NULL
+#endif
+
+#ifndef SKIP_CMD_TIPS_TABLE
+/* HSETEX tips */
+#define HSETEX_Tips NULL
+#endif
+
+#ifndef SKIP_CMD_KEY_SPECS_TABLE
+/* HSETEX key specs */
+keySpec HSETEX_Keyspecs[1] = {
+{NULL,CMD_KEY_RW|CMD_KEY_UPDATE,KSPEC_BS_INDEX,.bs.index={1},KSPEC_FK_RANGE,.fk.range={0,1,0}}
+};
+#endif
+
+/* HSETEX condition argument table */
+struct COMMAND_ARG HSETEX_condition_Subargs[] = {
+{MAKE_ARG("fnx",ARG_TYPE_PURE_TOKEN,-1,"FNX",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("fxx",ARG_TYPE_PURE_TOKEN,-1,"FXX",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+};
+
+/* HSETEX expiration argument table */
+struct COMMAND_ARG HSETEX_expiration_Subargs[] = {
+{MAKE_ARG("seconds",ARG_TYPE_INTEGER,-1,"EX",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("milliseconds",ARG_TYPE_INTEGER,-1,"PX",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("unix-time-seconds",ARG_TYPE_UNIX_TIME,-1,"EXAT",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("unix-time-milliseconds",ARG_TYPE_UNIX_TIME,-1,"PXAT",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("keepttl",ARG_TYPE_PURE_TOKEN,-1,"KEEPTTL",NULL,NULL,CMD_ARG_NONE,0,NULL)},
+};
+
+/* HSETEX fields data argument table */
+struct COMMAND_ARG HSETEX_fields_data_Subargs[] = {
+{MAKE_ARG("field",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("value",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)},
+};
+
+/* HSETEX fields argument table */
+struct COMMAND_ARG HSETEX_fields_Subargs[] = {
+{MAKE_ARG("numfields",ARG_TYPE_INTEGER,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("data",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_MULTIPLE,2,NULL),.subargs=HSETEX_fields_data_Subargs},
+};
+
+/* HSETEX argument table */
+struct COMMAND_ARG HSETEX_Args[] = {
+{MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)},
+{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,2,NULL),.subargs=HSETEX_condition_Subargs},
+{MAKE_ARG("expiration",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,5,NULL),.subargs=HSETEX_expiration_Subargs},
+{MAKE_ARG("fields",ARG_TYPE_BLOCK,-1,"FIELDS",NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=HSETEX_fields_Subargs},
+};
+
 /********** HSETNX ********************/
 
 #ifndef SKIP_CMD_HISTORY_TABLE
@@ -11043,6 +11169,8 @@ struct COMMAND_STRUCT redisCommandTable[] = {
 {MAKE_CMD("hexpiretime","Returns the expiration time of a hash field as a Unix timestamp, in seconds.","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIRETIME_History,0,HEXPIRETIME_Tips,0,hexpiretimeCommand,-5,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HEXPIRETIME_Keyspecs,1,NULL,2),.args=HEXPIRETIME_Args},
 {MAKE_CMD("hget","Returns the value of a field in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGET_History,0,HGET_Tips,0,hgetCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HGET_Keyspecs,1,NULL,2),.args=HGET_Args},
 {MAKE_CMD("hgetall","Returns all fields and values in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGETALL_History,0,HGETALL_Tips,1,hgetallCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HGETALL_Keyspecs,1,NULL,1),.args=HGETALL_Args},
+{MAKE_CMD("hgetdel","Returns the value of a field and deletes it from the hash.","O(N) where N is the number of specified fields","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGETDEL_History,0,HGETDEL_Tips,0,hgetdelCommand,-5,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HGETDEL_Keyspecs,1,NULL,2),.args=HGETDEL_Args},
+{MAKE_CMD("hgetex","Get the value of one or more fields of a given hash key, and optionally set their expiration.","O(N) where N is the number of specified fields","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGETEX_History,0,HGETEX_Tips,0,hgetexCommand,-5,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HGETEX_Keyspecs,1,NULL,3),.args=HGETEX_Args},
 {MAKE_CMD("hincrby","Increments the integer value of a field in a hash by a number. Uses 0 as initial value if the field doesn't exist.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HINCRBY_History,0,HINCRBY_Tips,0,hincrbyCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HINCRBY_Keyspecs,1,NULL,3),.args=HINCRBY_Args},
 {MAKE_CMD("hincrbyfloat","Increments the floating point value of a field by a number. Uses 0 as initial value if the field doesn't exist.","O(1)","2.6.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HINCRBYFLOAT_History,0,HINCRBYFLOAT_Tips,0,hincrbyfloatCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HINCRBYFLOAT_Keyspecs,1,NULL,3),.args=HINCRBYFLOAT_Args},
 {MAKE_CMD("hkeys","Returns all fields in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HKEYS_History,0,HKEYS_Tips,1,hkeysCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HKEYS_Keyspecs,1,NULL,1),.args=HKEYS_Args},
@@ -11057,6 +11185,7 @@ struct COMMAND_STRUCT redisCommandTable[] = {
 {MAKE_CMD("hrandfield","Returns one or more random fields from a hash.","O(N) where N is the number of fields returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HRANDFIELD_History,0,HRANDFIELD_Tips,1,hrandfieldCommand,-2,CMD_READONLY,ACL_CATEGORY_HASH,HRANDFIELD_Keyspecs,1,NULL,2),.args=HRANDFIELD_Args},
 {MAKE_CMD("hscan","Iterates over fields and values of a hash.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSCAN_History,0,HSCAN_Tips,1,hscanCommand,-3,CMD_READONLY,ACL_CATEGORY_HASH,HSCAN_Keyspecs,1,NULL,5),.args=HSCAN_Args},
 {MAKE_CMD("hset","Creates or modifies the value of a field in a hash.","O(1) for each field/value pair added, so O(N) to add N field/value pairs when the command is called with multiple field/value pairs.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSET_History,1,HSET_Tips,0,hsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HSET_Keyspecs,1,NULL,2),.args=HSET_Args},
+{MAKE_CMD("hsetex","Set the value of one or more fields of a given hash key, and optionally set their expiration.","O(N) where N is the number of fields being set.","8.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSETEX_History,0,HSETEX_Tips,0,hsetexCommand,-6,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HSETEX_Keyspecs,1,NULL,4),.args=HSETEX_Args},
 {MAKE_CMD("hsetnx","Sets the value of a field in a hash only when the field doesn't exist.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSETNX_History,0,HSETNX_Tips,0,hsetnxCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HSETNX_Keyspecs,1,NULL,3),.args=HSETNX_Args},
 {MAKE_CMD("hstrlen","Returns the length of the value of a field.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSTRLEN_History,0,HSTRLEN_Tips,0,hstrlenCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HSTRLEN_Keyspecs,1,NULL,2),.args=HSTRLEN_Args},
 {MAKE_CMD("httl","Returns the TTL in seconds of a hash field.","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HTTL_History,0,HTTL_Tips,1,httlCommand,-5,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HTTL_Keyspecs,1,NULL,2),.args=HTTL_Args},
diff --git a/src/commands/hgetdel.json b/src/commands/hgetdel.json
new file mode 100644
index 00000000000..af748fb52a9
--- /dev/null
+++ b/src/commands/hgetdel.json
@@ -0,0 +1,78 @@
+{
+    "HGETDEL": {
+        "summary": "Returns the value of a field and deletes it from the hash.",
+        "complexity": "O(N) where N is the number of specified fields",
+        "group": "hash",
+        "since": "8.0.0",
+        "arity": -5,
+        "function": "hgetdelCommand",
+        "history": [],
+        "command_flags": [
+            "WRITE",
+            "FAST"
+        ],
+        "acl_categories": [
+            "HASH"
+        ],
+        "key_specs": [
+            {
+                "flags": [
+                    "RW",
+                    "ACCESS",
+                    "DELETE"
+                ],
+                "begin_search": {
+                    "index": {
+                        "pos": 1
+                    }
+                },
+                "find_keys": {
+                    "range": {
+                        "lastkey": 0,
+                        "step": 1,
+                        "limit": 0
+                    }
+                }
+            }
+        ],
+        "reply_schema": {
+            "description": "List of values associated with the given fields, in the same order as they are requested.",
+            "type": "array",
+            "minItems": 1,
+            "items": {
+                "oneOf": [
+                    {
+                        "type": "string"
+                    },
+                    {
+                        "type": "null"
+                    }
+                ]
+            }
+        },
+        "arguments": [
+            {
+                "name": "key",
+                "type": "key",
+                "key_spec_index": 0
+            },
+            {
+                "name": "fields",
+                "token": "FIELDS",
+                "type": "block",
+                "arguments": [
+                    {
+                        "name": "numfields",
+                        "type": "integer"
+                    },
+                    {
+                        "name": "field",
+                        "type": "string",
+                        "multiple": true
+                    }
+                ]
+            }
+        ]
+    }
+}
+
diff --git a/src/commands/hgetex.json b/src/commands/hgetex.json
new file mode 100644
index 00000000000..02889ca8392
--- /dev/null
+++ b/src/commands/hgetex.json
@@ -0,0 +1,111 @@
+{
+    "HGETEX": {
+        "summary": "Get the value of one or more fields of a given hash key, and optionally set their expiration.",
+        "complexity": "O(N) where N is the number of specified fields",
+        "group": "hash",
+        "since": "8.0.0",
+        "arity": -5,
+        "function": "hgetexCommand",
+        "history": [],
+        "command_flags": [
+            "WRITE",
+            "FAST"
+        ],
+        "acl_categories": [
+            "HASH"
+        ],
+        "key_specs": [
+            {
+                "notes": "RW and UPDATE because it changes the TTL",
+                "flags": [
+                    "RW",
+                    "ACCESS",
+                    "UPDATE"
+                ],
+                "begin_search": {
+                    "index": {
+                        "pos": 1
+                    }
+                },
+                "find_keys": {
+                    "range": {
+                        "lastkey": 0,
+                        "step": 1,
+                        "limit": 0
+                    }
+                }
+            }
+        ],
+        "reply_schema": {
+            "description": "List of values associated with the given fields, in the same order as they are requested.",
+            "type": "array",
+            "minItems": 1,
+            "items": {
+                "oneOf": [
+                    {
+                        "type": "string"
+                    },
+                    {
+                        "type": "null"
+                    }
+                ]
+            }
+        },
+        "arguments": [
+            {
+                "name": "key",
+                "type": "key",
+                "key_spec_index": 0
+            },
+            {
+                "name": "expiration",
+                "type": "oneof",
+                "optional": true,
+                "arguments": [
+                    {
+                        "name": "seconds",
+                        "type": "integer",
+                        "token": "EX"
+                    },
+                    {
+                        "name": "milliseconds",
+                        "type": "integer",
+                        "token": "PX"
+                    },
+                    {
+                        "name": "unix-time-seconds",
+                        "type": "unix-time",
+                        "token": "EXAT"
+                    },
+                    {
+                        "name": "unix-time-milliseconds",
+                        "type": "unix-time",
+                        "token": "PXAT"
+                    },
+                    {
+                        "name": "persist",
+                        "type": "pure-token",
+                        "token": "PERSIST"
+                    }
+                ]
+            },
+            {
+                "name": "fields",
+                "token": "FIELDS",
+                "type": "block",
+                "arguments": [
+                    {
+                        "name": "numfields",
+                        "type": "integer"
+                    },
+                    {
+                        "name": "field",
+                        "type": "string",
+                        "multiple": true
+                    }
+                ]
+            }
+        ]
+    }
+}
+
diff --git a/src/commands/hsetex.json b/src/commands/hsetex.json
new file mode 100644
index 00000000000..6f6a6c60079
--- /dev/null
+++ b/src/commands/hsetex.json
@@ -0,0 +1,132 @@
+{
+    "HSETEX": {
+        "summary": "Set the value of one or more fields of a given hash key, and optionally set their expiration.",
+        "complexity": "O(N) where N is the number of fields being set.",
+        "group": "hash",
+        "since": "8.0.0",
+        "arity": -6,
+        "function": "hsetexCommand",
+        "command_flags": [
+            "WRITE",
+            "DENYOOM",
+            "FAST"
+        ],
+        "acl_categories": [
+            "HASH"
+        ],
+        "key_specs": [
+            {
+                "flags": [
+                    "RW",
+                    "UPDATE"
+                ],
+                "begin_search": {
+                    "index": {
+                        "pos": 1
+                    }
+                },
+                "find_keys": {
+                    "range": {
+                        "lastkey": 0,
+                        "step": 1,
+                        "limit": 0
+                    }
+                }
+            }
+        ],
+        "reply_schema": {
+            "oneOf": [
+                {
+                    "description": "No field was set (due to FXX or FNX flags).",
+                    "const": 0
+                },
+                {
+                    "description": "All the fields were set.",
+                    "const": 1
+                }
+            ]
+        },
+        "arguments": [
+            {
+                "name": "key",
+                "type": "key",
+                "key_spec_index": 0
+            },
+            {
+                "name": "condition",
+                "type": "oneof",
+                "optional": true,
+                "arguments": [
+                    {
+                        "name": "fnx",
+                        "type": "pure-token",
+                        "token": "FNX"
+                    },
+                    {
+                        "name": "fxx",
+                        "type": "pure-token",
+                        "token": "FXX"
+                    }
+                ]
+            },
+            {
+                "name": "expiration",
+                "type": "oneof",
+                "optional": true,
+                "arguments": [
+                    {
+                        "name": "seconds",
+                        "type": "integer",
+                        "token": "EX"
+                    },
+                    {
+                        "name": "milliseconds",
+                        "type": "integer",
+                        "token": "PX"
+                    },
+                    {
+                        "name": "unix-time-seconds",
+                        "type": "unix-time",
+                        "token": "EXAT"
+                    },
+                    {
+                        "name": "unix-time-milliseconds",
+                        "type": "unix-time",
+                        "token": "PXAT"
+                    },
+                    {
+                        "name": "keepttl",
+                        "type": "pure-token",
+                        "token": "KEEPTTL"
+                    }
+                ]
+            },
+            {
+                "name": "fields",
+                "token": "FIELDS",
+                "type": "block",
+                "arguments": [
+                    {
+                        "name": "numfields",
+                        "type": "integer"
+                    },
+                    {
+                        "name": "data",
+                        "type": "block",
+                        "multiple": true,
+                        "arguments": [
+                            {
+                                "name": "field",
+                                "type": "string"
+                            },
+                            {
+                                "name": "value",
+                                "type": "string"
+                            }
+                        ]
+                    }
+                ]
+            }
+        ]
+    }
+}
diff --git a/src/server.c b/src/server.c
index 0e056c8be4c..48936ac53e1 100644
--- a/src/server.c
+++ b/src/server.c
@@ -2034,6 +2034,7 @@ void createSharedObjects(void) {
     shared.set = createStringObject("SET",3);
     shared.eval = createStringObject("EVAL",4);
     shared.hpexpireat = createStringObject("HPEXPIREAT",10);
+    shared.hpersist = createStringObject("HPERSIST",8);
     shared.hdel = createStringObject("HDEL",4);
 
     /* Shared command argument */
diff --git a/src/server.h b/src/server.h
index e04035c28e2..d65392b8cc6 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1434,7 +1434,7 @@ struct sharedObjectsStruct {
     *rpop, *lpop, *lpush, *rpoplpush, *lmove, *blmove, *zpopmin, *zpopmax,
     *emptyscan, *multi, *exec, *left, *right, *hset, *srem, *xgroup, *xclaim,
     *script, *replconf, *eval, *persist, *set, *pexpireat, *pexpire,
-    *hdel, *hpexpireat,
+    *hdel, *hpexpireat, *hpersist,
     *time, *pxat, *absttl, *retrycount, *force, *justid, *entriesread,
     *lastid, *ping, *setid, *keepttl, *load, *createconsumer,
     *getack, *special_asterick, *special_equals, *default_username, *redacted,
@@ -3361,7 +3361,9 @@ typedef struct dictExpireMetadata {
 #define HFE_LAZY_AVOID_HASH_DEL   (1<<1) /* Avoid deleting hash if the field is the last one */
 #define HFE_LAZY_NO_NOTIFICATION  (1<<2) /* Do not send notification, used when multiple fields
                                           * may expire and only one notification is desired. */
-#define HFE_LAZY_ACCESS_EXPIRED   (1<<3) /* Avoid lazy expire and allow access to expired fields */
+#define HFE_LAZY_NO_SIGNAL        (1<<3) /* Do not send signal, used when multiple fields
+                                          * may expire and only one signal is desired. */
+#define HFE_LAZY_ACCESS_EXPIRED   (1<<4) /* Avoid lazy expire and allow access to expired fields */
 
 void hashTypeConvert(robj *o, int enc, ebuckets *hexpires);
 void hashTypeTryConversion(redisDb *db, robj *subject, robj **argv, int start, int end);
@@ -3881,6 +3883,7 @@ void strlenCommand(client *c);
 void zrankCommand(client *c);
 void zrevrankCommand(client *c);
 void hsetCommand(client *c);
+void hsetexCommand(client *c);
 void hpexpireCommand(client *c);
 void hexpireCommand(client *c);
 void hpexpireatCommand(client *c);
@@ -3893,6 +3896,8 @@ void hpersistCommand(client *c);
 void hsetnxCommand(client *c);
 void hgetCommand(client *c);
 void hmgetCommand(client *c);
+void hgetexCommand(client *c);
+void hgetdelCommand(client *c);
 void hdelCommand(client *c);
 void hlenCommand(client *c);
 void hstrlenCommand(client *c);
diff --git a/src/t_hash.c b/src/t_hash.c
index c6e48b77abc..b4d31bd0e62 100644
--- a/src/t_hash.c
+++ b/src/t_hash.c
@@ -48,7 +48,7 @@ typedef listpackEntry CommonEntry; /* extend usage beyond lp */
 static ExpireAction onFieldExpire(eItem item, void *ctx);
 static ExpireMeta* hfieldGetExpireMeta(const eItem field);
 static ExpireMeta *hashGetExpireMeta(const eItem hash);
-static void hexpireGenericCommand(client *c, const char *cmd, long long basetime, int unit);
+static void hexpireGenericCommand(client *c, long long basetime, int unit);
 static ExpireAction hashTypeActiveExpire(eItem hashObj, void *ctx);
 static uint64_t hashTypeExpire(robj *o, ExpireCtx *expireCtx, int updateGlobalHFE);
 static void hfieldPersist(robj *hashObj, hfield field);
@@ -214,15 +214,13 @@ typedef struct HashTypeSetEx {
                                          * minimum expiration time. If minimum recorded
                                          * is above minExpire of the hash, then we don't
                                          * have to update global HFE DS */
-    int fieldDeleted;                   /* Number of fields deleted */
-    int fieldUpdated;                   /* Number of fields updated */
 
     /* Optionally provide client for notification */
     client *c;
     const char *cmd;
 } HashTypeSetEx;
 
-int hashTypeSetExInit(robj *key, robj *o, client *c, redisDb *db, const char *cmd,
+int hashTypeSetExInit(robj *key, robj *o, client *c, redisDb *db,
                       ExpireSetCond expireSetCond, HashTypeSetEx *ex);
 
 SetExRes hashTypeSetEx(robj *o, sds field, uint64_t expireAt, HashTypeSetEx *exInfo);
@@ -531,6 +529,15 @@ SetExRes hashTypeSetExpiryListpack(HashTypeSetEx *ex, sds field,
         prevExpire = (uint64_t) expireTime;
     }
 
+    /* Special value of EXPIRE_TIME_INVALID indicates field should be persisted.*/
+    if (expireAt == EB_EXPIRE_TIME_INVALID) {
+        /* Return error if already there is no ttl. */
+        if (prevExpire == EB_EXPIRE_TIME_INVALID)
+            return HSETEX_NO_CONDITION_MET;
+        listpackExUpdateExpiry(ex->hashObj, field, fptr, vptr, HASH_LP_NO_TTL);
+        return HSETEX_OK;
+    }
+
     if (prevExpire == EB_EXPIRE_TIME_INVALID) {
         /* For fields without expiry, LT condition is considered valid */
         if (ex->expireSetCond & (HFE_XX | HFE_GT))
@@ -551,13 +558,7 @@ SetExRes hashTypeSetExpiryListpack(HashTypeSetEx *ex, sds field,
     if (unlikely(checkAlreadyExpired(expireAt))) {
         propagateHashFieldDeletion(ex->db, ex->key->ptr, field, sdslen(field));
         hashTypeDelete(ex->hashObj, field, 1);
-        
-        /* get listpack length */
-        listpackEx *lpt = ((listpackEx *) ex->hashObj->ptr);
-        unsigned long length = lpLength(lpt->lp) / 3;
-        updateKeysizesHist(ex->db, getKeySlot(ex->key->ptr), OBJ_HASH, length+1, length); 
         server.stat_expired_subkeys++;
-        ex->fieldDeleted++;
         return HSETEX_DELETED;
     }
 
@@ -565,7 +566,6 @@ SetExRes hashTypeSetExpiryListpack(HashTypeSetEx *ex, sds field,
         ex->minExpireFields = expireAt;
 
     listpackExUpdateExpiry(ex->hashObj, field, fptr, vptr, expireAt);
-    ex->fieldUpdated++;
     return HSETEX_OK;
 }
 
@@ -788,7 +788,8 @@ GetFieldRes hashTypeGetValue(redisDb *db, robj *o, sds field, unsigned char **vs
         dbDelete(db,keyObj);
         res = GETF_EXPIRED_HASH;
     }
-    signalModifiedKey(NULL, db, keyObj);
+    if (!(hfeFlags & HFE_LAZY_NO_SIGNAL))
+        signalModifiedKey(NULL, db, keyObj);
     decrRefCount(keyObj);
     return res;
 }
@@ -1010,34 +1011,33 @@ int hashTypeSet(redisDb *db, robj *o, sds field, sds value, int flags) {
 SetExRes hashTypeSetExpiryHT(HashTypeSetEx *exInfo, sds field, uint64_t expireAt) {
     dict *ht = exInfo->hashObj->ptr;
     dictEntry *existingEntry = NULL;
+    hfield hfNew = NULL;
 
-    /* New field with expiration metadata */
-    hfield hfNew = hfieldNew(field, sdslen(field), 1 /*withExpireMeta*/);
-
-    if ((existingEntry = dictFind(ht, field)) == NULL) {
-        hfieldFree(hfNew);
+    if ((existingEntry = dictFind(ht, field)) == NULL)
         return HSETEX_NO_FIELD;
-    }
 
     hfield hfOld = dictGetKey(existingEntry);
+    /* Special value of EXPIRE_TIME_INVALID indicates field should be persisted.*/
+    if (expireAt == EB_EXPIRE_TIME_INVALID) {
+        /* Return error if already there is no ttl. */
+        if (hfieldGetExpireTime(hfOld) == EB_EXPIRE_TIME_INVALID)
+            return HSETEX_NO_CONDITION_MET;
+
+        hfieldPersist(exInfo->hashObj, hfOld);
+        return HSETEX_OK;
+    }
 
     /* If field doesn't have expiry metadata attached */
     if (!hfieldIsExpireAttached(hfOld)) {
-
         /* For fields without expiry, LT condition is considered valid */
-        if (exInfo->expireSetCond & (HFE_XX | HFE_GT)) {
-            hfieldFree(hfNew);
+        if (exInfo->expireSetCond & (HFE_XX | HFE_GT))
             return HSETEX_NO_CONDITION_MET;
-        }
 
         /* Delete old field. Below goanna dictSetKey(..,hfNew) */
         hfieldFree(hfOld);
-
+        /* New field with expiration metadata */
+        hfNew = hfieldNew(field, sdslen(field), 1);
     } else { /* field has ExpireMeta struct attached */
-
-        /* No need for hfNew (Just modify expire-time of existing field) */
-        hfieldFree(hfNew);
-
         uint64_t prevExpire = hfieldGetExpireTime(hfOld);
 
         /* If field has valid expiration time, then check GT|LT|NX */
@@ -1073,13 +1073,10 @@ SetExRes hashTypeSetExpiryHT(HashTypeSetEx *exInfo, sds field, uint64_t expireAt
     /* If expired, then delete the field and propagate the deletion.
      * If replica, continue like the field is valid */
     if (unlikely(checkAlreadyExpired(expireAt))) {
-        unsigned long length = dictSize(ht); 
-        updateKeysizesHist(exInfo->db, getKeySlot(exInfo->key->ptr), OBJ_HASH, length, length-1);
         /* replicas should not initiate deletion of fields */
         propagateHashFieldDeletion(exInfo->db, exInfo->key->ptr, field, sdslen(field));
         hashTypeDelete(exInfo->hashObj, field, 1);
         server.stat_expired_subkeys++;
-        exInfo->fieldDeleted++;
         return HSETEX_DELETED;
     }
 
@@ -1088,7 +1085,6 @@ SetExRes hashTypeSetExpiryHT(HashTypeSetEx *exInfo, sds field, uint64_t expireAt
 
     dictExpireMetadata *dm = (dictExpireMetadata *) dictMetadata(ht);
     ebAdd(&dm->hfe, &hashFieldExpireBucketsType, hfNew, expireAt);
-    exInfo->fieldUpdated++;
     return HSETEX_OK;
 }
 
@@ -1097,20 +1093,18 @@ SetExRes hashTypeSetExpiryHT(HashTypeSetEx *exInfo, sds field, uint64_t expireAt
  *
  * Take care to call first hashTypeSetExInit() and then call this function.
  * Finally, call hashTypeSetExDone() to notify and update global HFE DS.
+ *
+ * Special value of EB_EXPIRE_TIME_INVALID for 'expireAt' argument will persist
+ * the field.
  */
-SetExRes hashTypeSetEx(robj *o, sds field, uint64_t expireAt, HashTypeSetEx *exInfo)
-{
-    if (o->encoding == OBJ_ENCODING_LISTPACK_EX)
-    {
+SetExRes hashTypeSetEx(robj *o, sds field, uint64_t expireAt, HashTypeSetEx *exInfo) {
+    if (o->encoding == OBJ_ENCODING_LISTPACK_EX) {
         unsigned char *fptr = NULL, *vptr = NULL, *tptr = NULL;
-
         listpackEx *lpt = o->ptr;
-        long long expireTime = HASH_LP_NO_TTL;
-
-        if ((fptr = lpFirst(lpt->lp)) == NULL)
-            return HSETEX_NO_FIELD;
 
-        fptr = lpFind(lpt->lp, fptr, (unsigned char*)field, sdslen(field), 2);
+        fptr = lpFirst(lpt->lp);
+        if (fptr)
+            fptr = lpFind(lpt->lp, fptr, (unsigned char*)field, sdslen(field), 2);
 
         if (!fptr)
             return HSETEX_NO_FIELD;
@@ -1120,7 +1114,7 @@ SetExRes hashTypeSetEx(robj *o, sds field, uint64_t expireAt, HashTypeSetEx *exI
         serverAssert(vptr != NULL);
 
         tptr = lpNext(lpt->lp, vptr);
-        serverAssert(tptr && lpGetIntegerValue(tptr, &expireTime));
+        serverAssert(tptr);
 
         /* update TTL */
         return hashTypeSetExpiryListpack(exInfo, field, fptr, vptr, tptr, expireAt);
@@ -1144,19 +1138,16 @@ void initDictExpireMetadata(sds key, robj *o) {
 }
 
 /* Init HashTypeSetEx struct before calling hashTypeSetEx() */
-int hashTypeSetExInit(robj *key, robj *o, client *c, redisDb *db, const char *cmd,
+int hashTypeSetExInit(robj *key, robj *o, client *c, redisDb *db,
                       ExpireSetCond expireSetCond, HashTypeSetEx *ex)
 {
     dict *ht = o->ptr;
     ex->expireSetCond = expireSetCond;
     ex->minExpire = EB_EXPIRE_TIME_INVALID;
     ex->c = c;
-    ex->cmd = cmd;
     ex->db = db;
     ex->key = key;
     ex->hashObj = o;
-    ex->fieldDeleted = 0;
-    ex->fieldUpdated = 0;
     ex->minExpireFields = EB_EXPIRE_TIME_INVALID;
 
     /* Take care that HASH support expiration */
@@ -1220,50 +1211,38 @@ int hashTypeSetExInit(robj *key, robj *o, client *c, redisDb *db, const char *cm
 
 /*
  * After calling hashTypeSetEx() for setting fields or their expiry, call this
- * function to notify and update global HFE DS.
+ * function to update global HFE DS.
  */
 void hashTypeSetExDone(HashTypeSetEx *ex) {
-    /* Notify keyspace event, update dirty count and update global HFE DS */
-    if (ex->fieldDeleted + ex->fieldUpdated > 0) {
-
-        server.dirty += ex->fieldDeleted + ex->fieldUpdated;
-        if (ex->fieldDeleted && hashTypeLength(ex->hashObj, 0) == 0) {
-            dbDelete(ex->db,ex->key);
-            signalModifiedKey(ex->c, ex->db, ex->key);
-            notifyKeyspaceEvent(NOTIFY_HASH, "hdel", ex->key, ex->db->id);
-            notifyKeyspaceEvent(NOTIFY_GENERIC,"del",ex->key, ex->db->id);
-        } else {
-            signalModifiedKey(ex->c, ex->db, ex->key);
-            notifyKeyspaceEvent(NOTIFY_HASH, ex->fieldDeleted ? "hdel" : "hexpire",
-                                ex->key, ex->db->id);
-
-            /* If minimum HFE of the hash is smaller than expiration time of the
-             * specified fields in the command as well as it is smaller or equal
-             * than expiration time provided in the command, then the minimum
-             * HFE of the hash won't change following this command. */
-            if ((ex->minExpire < ex->minExpireFields))
-                return;
 
-            /* Retrieve new expired time. It might have changed. */
-            uint64_t newMinExpire = hashTypeGetMinExpire(ex->hashObj, 1 /*accurate*/);
-
-            /* Calculate the diff between old minExpire and newMinExpire. If it is
-             * only few seconds, then don't have to update global HFE DS. At the worst
-             * case fields of hash will be active-expired up to few seconds later.
-             *
-             * In any case, active-expire operation will know to update global
-             * HFE DS more efficiently than here for a single item.
-             */
-            uint64_t diff = (ex->minExpire > newMinExpire) ?
-                                (ex->minExpire - newMinExpire) : (newMinExpire - ex->minExpire);
-            if (diff < HASH_NEW_EXPIRE_DIFF_THRESHOLD) return;
-
-            if (ex->minExpire != EB_EXPIRE_TIME_INVALID)
-                ebRemove(&ex->db->hexpires, &hashExpireBucketsType, ex->hashObj);
-            if (newMinExpire != EB_EXPIRE_TIME_INVALID)
-                ebAdd(&ex->db->hexpires, &hashExpireBucketsType, ex->hashObj, newMinExpire);
-        }
-    }
+    if (hashTypeLength(ex->hashObj, 0) == 0)
+        return;
+
+    /* If minimum HFE of the hash is smaller than expiration time of the
+     * specified fields in the command as well as it is smaller or equal
+     * than expiration time provided in the command, then the minimum
+     * HFE of the hash won't change following this command. */
+    if ((ex->minExpire < ex->minExpireFields))
+        return;
+
+    /* Retrieve new expired time. It might have changed. */
+    uint64_t newMinExpire = hashTypeGetMinExpire(ex->hashObj, 1 /*accurate*/);
+
+    /* Calculate the diff between old minExpire and newMinExpire. If it is
+     * only few seconds, then don't have to update global HFE DS. At the worst
+     * case fields of hash will be active-expired up to few seconds later.
+     *
+     * In any case, active-expire operation will know to update global
+     * HFE DS more efficiently than here for a single item.
+     */
+    uint64_t diff = (ex->minExpire > newMinExpire) ?
+                    (ex->minExpire - newMinExpire) : (newMinExpire - ex->minExpire);
+    if (diff < HASH_NEW_EXPIRE_DIFF_THRESHOLD) return;
+
+    if (ex->minExpire != EB_EXPIRE_TIME_INVALID)
+        ebRemove(&ex->db->hexpires, &hashExpireBucketsType, ex->hashObj);
+    if (newMinExpire != EB_EXPIRE_TIME_INVALID)
+        ebAdd(&ex->db->hexpires, &hashExpireBucketsType, ex->hashObj, newMinExpire);
 }
 
 /* Delete an element from a hash.
@@ -2222,6 +2201,303 @@ void hsetCommand(client *c) {
     server.dirty += (c->argc - 2)/2;
 }
 
+/* Parse expire time from argument and do boundary checks. */
+static int parseExpireTime(client *c, robj *o, int unit, long long basetime,
+                           long long *expire)
+{
+    long long val;
+
+    /* Read the expiry time from command */
+    if (getLongLongFromObjectOrReply(c, o, &val, NULL) != C_OK)
+        return C_ERR;
+
+    if (val < 0) {
+        addReplyError(c,"invalid expire time, must be >= 0");
+        return C_ERR;
+    }
+
+    if (unit == UNIT_SECONDS) {
+        if (val > (long long) HFE_MAX_ABS_TIME_MSEC / 1000) {
+            addReplyErrorExpireTime(c);
+            return C_ERR;
+        }
+        val *= 1000;
+    }
+
+    if (val > (long long) HFE_MAX_ABS_TIME_MSEC - basetime) {
+        addReplyErrorExpireTime(c);
+        return C_ERR;
+    }
+    val += basetime;
+    *expire = val;
+    return C_OK;
+}
+
+/* Flags that are used as part of HGETEX and HSETEX commands. */
+#define HFE_EX       (1<<0) /* Expiration time in seconds */
+#define HFE_PX       (1<<1) /* Expiration time in milliseconds */
+#define HFE_EXAT     (1<<2) /* Expiration time in unix seconds */
+#define HFE_PXAT     (1<<3) /* Expiration time in unix milliseconds */
+#define HFE_PERSIST  (1<<4) /* Persist fields */
+#define HFE_KEEPTTL  (1<<5) /* Do not discard field ttl on set op */
+#define HFE_FXX      (1<<6) /* Set fields if all the fields already exist */
+#define HFE_FNX      (1<<7) /* Set fields if none of the fields exist */
+
+/* Parse hsetex command arguments.
+ * HSETEX <key>
+ *  [FNX|FXX]
+ *  [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]
+ *  FIELDS <numfields> field value [field value ...]
+*/
+static int hsetexParseArgs(client *c, int *flags,
+                           long long *expire_time, int *expire_time_pos,
+                           int *first_field_pos, int *field_count) {
+    *flags = 0;
+    *first_field_pos = -1;
+    *field_count = -1;
+    *expire_time_pos = -1;
+
+    for (int i = 2; i < c->argc; i++) {
+        if (!strcasecmp(c->argv[i]->ptr, "fields")) {
+            long val;
+
+            if (i >= c->argc - 3) {
+                addReplyErrorArity(c);
+                return C_ERR;
+            }
+
+            if (getRangeLongFromObjectOrReply(c, c->argv[i + 1], 1, INT_MAX, &val,
+                                              "invalid number of fields") != C_OK)
+                return C_ERR;
+
+            int remaining = (c->argc - i  - 2);
+            if (remaining % 2 != 0 || val != remaining / 2) {
+                addReplyErrorArity(c);
+                return C_ERR;
+            }
+
+            *first_field_pos = i + 2;
+            *field_count = (int) val;
+            return C_OK;
+        } else if (!strcasecmp(c->argv[i]->ptr, "EX")) {
+            if (*flags & (HFE_EX | HFE_EXAT | HFE_PX | HFE_PXAT | HFE_KEEPTTL))
+                goto err_expiration;
+
+            if (i >= c->argc - 1)
+                goto err_missing_expire;
+
+            *flags |= HFE_EX;
+            i++;
+
+            if (parseExpireTime(c, c->argv[i], UNIT_SECONDS,
+                                commandTimeSnapshot(), expire_time) != C_OK)
+                return C_ERR;
+
+            *expire_time_pos = i;
+        } else if (!strcasecmp(c->argv[i]->ptr, "PX")) {
+            if (*flags & (HFE_EX | HFE_EXAT | HFE_PX | HFE_PXAT | HFE_KEEPTTL))
+                goto err_expiration;
+
+            if (i >= c->argc - 1)
+                goto err_missing_expire;
+
+            *flags |= HFE_PX;
+            i++;
+            if (parseExpireTime(c, c->argv[i], UNIT_MILLISECONDS,
+                                commandTimeSnapshot(), expire_time) != C_OK)
+                return C_ERR;
+
+            *expire_time_pos = i;
+        } else if (!strcasecmp(c->argv[i]->ptr, "EXAT")) {
+            if (*flags & (HFE_EX | HFE_EXAT | HFE_PX | HFE_PXAT | HFE_KEEPTTL))
+                goto err_expiration;
+
+            if (i >= c->argc - 1)
+                goto err_missing_expire;
+
+            *flags |= HFE_EXAT;
+            i++;
+            if (parseExpireTime(c, c->argv[i], UNIT_SECONDS, 0, expire_time) != C_OK)
+                return C_ERR;
+
+            *expire_time_pos = i;
+        } else if (!strcasecmp(c->argv[i]->ptr, "PXAT")) {
+            if (*flags & (HFE_EX | HFE_EXAT | HFE_PX | HFE_PXAT | HFE_KEEPTTL))
+                goto err_expiration;
+
+            if (i >= c->argc - 1)
+                goto err_missing_expire;
+
+            *flags |= HFE_PXAT;
+            i++;
+            if (parseExpireTime(c, c->argv[i], UNIT_MILLISECONDS, 0,
+                                expire_time) != C_OK)
+                return C_ERR;
+
+            *expire_time_pos = i;
+        } else if (!strcasecmp(c->argv[i]->ptr, "KEEPTTL")) {
+            if (*flags & (HFE_EX | HFE_EXAT | HFE_PX | HFE_PXAT | HFE_KEEPTTL))
+                goto err_expiration;
+            *flags |= HFE_KEEPTTL;
+        } else if (!strcasecmp(c->argv[i]->ptr, "FXX")) {
+            if (*flags & (HFE_FXX | HFE_FNX))
+                goto err_condition;
+            *flags |= HFE_FXX;
+        } else if (!strcasecmp(c->argv[i]->ptr, "FNX")) {
+            if (*flags & (HFE_FXX | HFE_FNX))
+                goto err_condition;
+            *flags |= HFE_FNX;
+        } else {
+            addReplyErrorFormat(c, "unknown argument: %s", (char*) c->argv[i]->ptr);
+            return C_ERR;
+        }
+    }
+
+    serverAssert(0);
+
+err_missing_expire:
+    addReplyError(c, "missing expire time");
+    return C_ERR;
+err_condition:
+    addReplyError(c, "Only one of FXX or FNX arguments can be specified");
+    return C_ERR;
+err_expiration:
+    addReplyError(c, "Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments can be specified");
+    return C_ERR;
+}
+
+/* Set the value of one or more fields of a given hash key, and optionally set
+ * their expiration.
+ *
+ * HSETEX key
+ *  [FNX | FXX]
+ *  [EX seconds | PX milliseconds | EXAT unix-time-seconds | PXAT unix-time-milliseconds | KEEPTTL]
+ *  FIELDS <numfields> field value [field value...]
+ *
+ * Reply:
+ *   Integer reply: 0 if no fields were set (due to FXX/FNX args)
+ *   Integer reply: 1 if all the fields were set
+ */
+void hsetexCommand(client *c) {
+    int flags = 0, first_field_pos = 0, field_count = 0, expire_time_pos = -1;
+    int updated = 0, deleted = 0, set_expiry;
+    long long expire_time = EB_EXPIRE_TIME_INVALID;
+    unsigned long oldlen, newlen;
+    robj *o;
+    HashTypeSetEx setex;
+
+    if (hsetexParseArgs(c, &flags, &expire_time, &expire_time_pos,
+                        &first_field_pos, &field_count) != C_OK)
+        return;
+
+    o = lookupKeyWrite(c->db, c->argv[1]);
+    if (checkType(c, o, OBJ_HASH))
+        return;
+
+    if (!o) {
+        if (flags & HFE_FXX) {
+            addReplyLongLong(c, 0);
+            return;
+        }
+        o = createHashObject();
+        dbAdd(c->db, c->argv[1], o);
+    }
+    oldlen = hashTypeLength(o, 0);
+
+    if (flags & (HFE_FXX | HFE_FNX)) {
+        int found = 0;
+        for (int i = 0; i < field_count; i++) {
+            sds field = c->argv[first_field_pos + (i * 2)]->ptr;
+            const int opt = HFE_LAZY_NO_NOTIFICATION |
+                            HFE_LAZY_NO_SIGNAL |
+                            HFE_LAZY_AVOID_HASH_DEL;
+            int exists = hashTypeExists(c->db, o, field, opt, NULL);
+            found += (exists != 0);
+
+            /* Check for early exit if the condition is already invalid. */
+            if (((flags & HFE_FXX) && !exists) ||
+                ((flags & HFE_FNX) && exists))
+                break;
+        }
+
+        int all_exists = (found == field_count);
+        int non_exists = (found == 0);
+
+        if (((flags & HFE_FNX) && !non_exists) ||
+            ((flags & HFE_FXX) && !all_exists))
+        {
+            addReplyLongLong(c, 0);
+            goto out;
+        }
+    }
+    hashTypeTryConversion(c->db, o,c->argv, first_field_pos, c->argc - 1);
+
+    /* Check if we will set the expiration time. */
+    set_expiry = flags & (HFE_EX | HFE_PX | HFE_EXAT | HFE_PXAT);
+    if (set_expiry)
+        hashTypeSetExInit(c->argv[1], o, c, c->db, 0, &setex);
+
+
+    for (int i = 0; i < field_count; i++) {
+        sds field = c->argv[first_field_pos + (i * 2)]->ptr;
+        sds value = c->argv[first_field_pos + (i * 2) + 1]->ptr;
+
+        int opt = HASH_SET_COPY;
+        /* If we are going to set the expiration time later, no need to discard
+         * it as part of set operation now. */
+        if (flags & (HFE_EX | HFE_PX | HFE_EXAT | HFE_PXAT | HFE_KEEPTTL))
+            opt |= HASH_SET_KEEP_TTL;
+
+        hashTypeSet(c->db, o, field, value, opt);
+
+        /* Update the expiration time. */
+        if (set_expiry) {
+            int ret = hashTypeSetEx(o, field, expire_time, &setex);
+            updated += (ret == HSETEX_OK);
+            deleted += (ret == HSETEX_DELETED);
+        }
+    }
+
+    if (set_expiry)
+        hashTypeSetExDone(&setex);
+
+    server.dirty += field_count;
+    signalModifiedKey(c, c->db, c->argv[1]);
+    notifyKeyspaceEvent(NOTIFY_HASH, "hset", c->argv[1], c->db->id);
+    if (deleted || updated)
+        notifyKeyspaceEvent(NOTIFY_HASH, deleted ? "hdel": "hexpire",
+                            c->argv[1], c->db->id);
+
+    if (deleted) {
+        /* If fields are deleted due to timestamp is being in the past, hdel's
+         * are already propagated. No need to propagate the command itself. */
+        preventCommandPropagation(c);
+    } else if (set_expiry && !(flags & HFE_PXAT)) {
+        /* Propagate as 'HSETEX <key> PXAT ..' if there is EX/EXAT/PX flag*/
+
+        /* Replace EX/EXAT/PX with PXAT */
+        rewriteClientCommandArgument(c, expire_time_pos - 1, shared.pxat);
+        /* Replace timestamp with unix timestamp milliseconds. */
+        robj *expire = createStringObjectFromLongLong(expire_time);
+        rewriteClientCommandArgument(c, expire_time_pos, expire);
+        decrRefCount(expire);
+    }
+
+    addReplyLongLong(c, 1);
+
+out:
+    /* Key may become empty due to lazy expiry in hashTypeExists()
+     * or the new expiration time is in the past.*/
+    newlen = hashTypeLength(o, 0);
+    if (newlen == 0) {
+        dbDelete(c->db, c->argv[1]);
+        notifyKeyspaceEvent(NOTIFY_GENERIC, "del", c->argv[1], c->db->id);
+    }
+    if (oldlen != newlen)
+        updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_HASH,
+                           oldlen, newlen);
+}
+
 void hincrbyCommand(client *c) {
     long long value, incr, oldvalue;
     robj *o;
@@ -2393,6 +2669,254 @@ void hmgetCommand(client *c) {
     }
 }
 
+/* Get and delete the value of one or more fields of a given hash key.
+ * HGETDEL <key> FIELDS <numfields> field1 field2 ...
+ * Reply: list of the value associated with each field or nil if the field
+ *        doesn’t exist.
+ */
+void hgetdelCommand(client *c) {
+    int res = 0, hfe = 0, deleted = 0, expired = 0;
+    unsigned long oldlen = 0, newlen= 0;
+    long num_fields = 0;
+    robj *o;
+
+    o = lookupKeyWrite(c->db, c->argv[1]);
+    if (checkType(c, o, OBJ_HASH))
+        return;
+
+    if (strcasecmp(c->argv[2]->ptr, "FIELDS") != 0) {
+        addReplyError(c, "Mandatory argument FIELDS is missing or not at the right position");
+        return;
+    }
+
+    /* Read number of fields */
+    if (getRangeLongFromObjectOrReply(c, c->argv[3], 1, LONG_MAX, &num_fields,
+                                      "Number of fields must be a positive integer") != C_OK)
+        return;
+
+    /* Verify `numFields` is consistent with number of arguments */
+    if (num_fields != c->argc - 4) {
+        addReplyError(c, "The `numfields` parameter must match the number of arguments");
+        return;
+    }
+
+    /* Hash field expiration is optimized to avoid frequent update global HFE DS
+     * for each field deletion. Eventually active-expiration will run and update
+     * or remove the hash from global HFE DS gracefully. Nevertheless, statistic
+     * "subexpiry" might reflect wrong number of hashes with HFE to the user if
+     * it is the last field with expiration. The following logic checks if this
+     * is the last field with expiration and removes it from global HFE DS. */
+    if (o) {
+        hfe = hashTypeIsFieldsWithExpire(o);
+        oldlen = hashTypeLength(o, 0);
+    }
+
+    addReplyArrayLen(c, num_fields);
+    for (int i = 4; i < c->argc; i++) {
+        const int flags = HFE_LAZY_NO_NOTIFICATION |
+                          HFE_LAZY_NO_SIGNAL |
+                          HFE_LAZY_AVOID_HASH_DEL;
+        res = addHashFieldToReply(c, o, c->argv[i]->ptr, flags);
+        expired += (res == GETF_EXPIRED);
+        /* Try to delete only if it's found and not expired lazily. */
+        if (res == GETF_OK) {
+            deleted++;
+            serverAssert(hashTypeDelete(o, c->argv[i]->ptr, 1) == 1);
+        }
+    }
+
+    /* Return if no modification has been made. */
+    if (expired == 0 && deleted == 0)
+        return;
+
+    signalModifiedKey(c, c->db, c->argv[1]);
+
+    if (expired)
+        notifyKeyspaceEvent(NOTIFY_HASH, "hexpired", c->argv[1], c->db->id);
+    if (deleted) {
+        notifyKeyspaceEvent(NOTIFY_HASH, "hdel", c->argv[1], c->db->id);
+        server.dirty += deleted;
+
+        /* Propagate as HDEL command.
+         * Orig: HGETDEL <key> FIELDS <numfields> field1 field2 ...
+         * Repl: HDEL <key> field1 field2 ... */
+        rewriteClientCommandArgument(c, 0, shared.hdel);
+        rewriteClientCommandArgument(c, 2, NULL);  /* Delete FIELDS arg */
+        rewriteClientCommandArgument(c, 2, NULL);  /* Delete <numfields> arg */
+    }
+
+    /* Key may have become empty because of deleting fields or lazy expire. */
+    newlen = hashTypeLength(o, 0);
+    if (newlen == 0) {
+        dbDelete(c->db, c->argv[1]);
+        notifyKeyspaceEvent(NOTIFY_GENERIC, "del", c->argv[1], c->db->id);
+    } else if (hfe && (hashTypeIsFieldsWithExpire(o) == 0)) { /*is it last HFE*/
+        ebRemove(&c->db->hexpires, &hashExpireBucketsType, o);
+    }
+
+    if (oldlen != newlen)
+        updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_HASH,
+                           oldlen, newlen);
+}
+
+/* Get and delete the value of one or more fields of a given hash key.
+ *
+ * HGETEX <key>
+ *   [EX seconds | PX milliseconds | EXAT unix-time-seconds | PXAT unix-time-milliseconds | PERSIST]
+ *   FIELDS <numfields> field1 field2 ...
+ *
+ * Reply: list of the value associated with each field or nil if the field
+ *        doesn’t exist.
+ */
+void hgetexCommand(client *c) {
+    int expired = 0, deleted = 0, updated = 0;
+    int num_fields_pos = 3, cond = 0;
+    long num_fields;
+    unsigned long oldlen = 0, newlen = 0;
+    long long expire_time = 0;
+    robj *o;
+    HashTypeSetEx setex;
+
+    o = lookupKeyWrite(c->db, c->argv[1]);
+    if (checkType(c, o, OBJ_HASH))
+        return;
+
+    /* Read optional arg */
+    if (!strcasecmp(c->argv[2]->ptr, "ex"))
+        cond = HFE_EX;
+    else if (!strcasecmp(c->argv[2]->ptr, "px"))
+        cond = HFE_PX;
+    else if (!strcasecmp(c->argv[2]->ptr, "exat"))
+        cond = HFE_EXAT;
+    else if (!strcasecmp(c->argv[2]->ptr, "pxat"))
+        cond = HFE_PXAT;
+    else if (!strcasecmp(c->argv[2]->ptr, "persist"))
+        cond = HFE_PERSIST;
+
+    /* Parse expiration time */
+    if (cond & (HFE_EX | HFE_PX | HFE_EXAT | HFE_PXAT)) {
+        num_fields_pos += 2;
+        int unit = (cond & (HFE_EX | HFE_EXAT)) ? UNIT_SECONDS : UNIT_MILLISECONDS;
+        long long basetime = cond & (HFE_EX | HFE_PX) ? commandTimeSnapshot() : 0;
+        if (parseExpireTime(c, c->argv[3], unit, basetime, &expire_time) != C_OK)
+            return;
+    } else if (cond & HFE_PERSIST) {
+        num_fields_pos += 1;
+    }
+
+    if (strcasecmp(c->argv[num_fields_pos - 1]->ptr, "FIELDS") != 0) {
+        addReplyError(c, "Mandatory argument FIELDS is missing or not at the right position");
+        return;
+    }
+
+    /* Read number of fields */
+    if (getRangeLongFromObjectOrReply(c, c->argv[num_fields_pos], 1, LONG_MAX, &num_fields,
+                                      "Number of fields must be a positive integer") != C_OK)
+        return;
+
+    /* Check number of fields is consistent with number of arguments */
+    if (num_fields != c->argc - num_fields_pos - 1) {
+        addReplyError(c, "The `numfields` parameter must match the number of arguments");
+        return;
+    }
+
+    /* Non-existing keys and empty hashes are the same thing. Reply null if the
+     * key does not exist.*/
+    if (!o) {
+        addReplyArrayLen(c, num_fields);
+        for (int i = 0; i < num_fields; i++)
+            addReplyNull(c);
+        return;
+    }
+
+    oldlen = hashTypeLength(o, 0);
+    if (cond)
+        hashTypeSetExInit(c->argv[1], o, c, c->db, 0, &setex);
+
+    addReplyArrayLen(c, num_fields);
+    for (int i = num_fields_pos + 1; i < c->argc; i++) {
+        const int flags = HFE_LAZY_NO_NOTIFICATION |
+                          HFE_LAZY_NO_SIGNAL |
+                          HFE_LAZY_AVOID_HASH_DEL;
+        sds field = c->argv[i]->ptr;
+        int res = addHashFieldToReply(c, o, field, flags);
+        expired += (res == GETF_EXPIRED);
+
+        /* Set expiration only if the field exists and not expired lazily. */
+        if (res == GETF_OK && cond) {
+            if (cond & HFE_PERSIST)
+                expire_time = EB_EXPIRE_TIME_INVALID;
+
+            res = hashTypeSetEx(o, field, expire_time, &setex);
+            deleted += (res == HSETEX_DELETED);
+            updated += (res == HSETEX_OK);
+        }
+    }
+
+    if (cond)
+        hashTypeSetExDone(&setex);
+
+    /* Exit early if no modification has been made. */
+    if (expired == 0 && deleted == 0 && updated == 0)
+        return;
+
+    server.dirty += deleted + updated;
+    signalModifiedKey(c, c->db, c->argv[1]);
+
+    /* Key may become empty due to lazy expiry in addHashFieldToReply()
+     * or the new expiration time is in the past.*/
+    newlen = hashTypeLength(o, 0);
+    if (newlen == 0) {
+        dbDelete(c->db, c->argv[1]);
+        notifyKeyspaceEvent(NOTIFY_GENERIC, "del", c->argv[1], c->db->id);
+    }
+    if (oldlen != newlen)
+        updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_HASH,
+                           oldlen, newlen);
+
+    /* This command will never be propagated as it is. It will be propagated as
+     * HDELs when fields are lazily expired or deleted, if the new timestamp is
+     * in the past. HDEL's will be emitted as part of addHashFieldToReply()
+     * or hashTypeSetEx() in this case.
+     *
+     * If PERSIST flags is used, it will be propagated as HPERSIST command.
+     * IF EX/EXAT/PX/PXAT flags are used, it will be replicated as HPEXPRITEAT.
+     */
+    if (expired)
+        notifyKeyspaceEvent(NOTIFY_HASH, "hexpired", c->argv[1], c->db->id);
+    if (updated) {
+        if (cond & HFE_PERSIST) {
+            notifyKeyspaceEvent(NOTIFY_HASH, "hpersist", c->argv[1], c->db->id);
+
+            /* Propagate as HPERSIST command.
+             * Orig: HGETEX <key> PERSIST FIELDS <numfields> field1 field2 ...
+             * Repl: HPERSIST <key> FIELDS <numfields> field1 field2 ... */
+            rewriteClientCommandArgument(c, 0, shared.hpersist);
+            rewriteClientCommandArgument(c, 2, NULL); /* Delete PERSIST arg */
+        } else {
+            notifyKeyspaceEvent(NOTIFY_HASH, "hexpire", c->argv[1], c->db->id);
+
+            /* Propagate as HPEXPIREAT command.
+             * Orig: HGETEX <key> [EX|PX|EXAT|PXAT] ttl FIELDS <numfields> field1 field2 ...
+             * Repl: HPEXPIREAT <key> ttl FIELDS <numfields> field1 field2 ... */
+            rewriteClientCommandArgument(c, 0, shared.hpexpireat);
+            rewriteClientCommandArgument(c, 2, NULL); /* Del [EX|PX|EXAT|PXAT]*/
+
+            /* Rewrite TTL if it is not unix time milliseconds already. */
+            if (!(cond & HFE_PXAT)) {
+                robj *expire = createStringObjectFromLongLong(expire_time);
+                rewriteClientCommandArgument(c, 2, expire);
+                decrRefCount(expire);
+            }
+        }
+    } else if (deleted) {
+        /* If we are here, fields are deleted because new timestamp was in the
+         * past. HDELs are already propagated as part of hashTypeSetEx(). */
+        notifyKeyspaceEvent(NOTIFY_HASH, "hdel", c->argv[1], c->db->id);
+        preventCommandPropagation(c);
+    }
+}
+
 void hdelCommand(client *c) {
     robj *o;
     int j, deleted = 0, keyremoved = 0;
@@ -3174,10 +3698,11 @@ static void httlGenericCommand(client *c, const char *cmd, long long basetime, i
  *   not met, then command will be rejected. Otherwise, EXPIRE command will be
  *   propagated for given key.
  */
-static void hexpireGenericCommand(client *c, const char *cmd, long long basetime, int unit) {
+static void hexpireGenericCommand(client *c, long long basetime, int unit) {
     long numFields = 0, numFieldsAt = 4;
     long long expire; /* unix time in msec */
-    int fieldAt, fieldsNotSet = 0, expireSetCond = 0;
+    int fieldAt, fieldsNotSet = 0, expireSetCond = 0, updated = 0, deleted = 0;
+    unsigned long oldlen, newlen;
     robj *hashObj, *keyArg = c->argv[1], *expireArg = c->argv[2];
 
     /* Read the hash object */
@@ -3186,28 +3711,8 @@ static void hexpireGenericCommand(client *c, const char *cmd, long long basetime
         return;
 
     /* Read the expiry time from command */
-    if (getLongLongFromObjectOrReply(c, expireArg, &expire, NULL) != C_OK)
-        return;
-
-    if (expire < 0) {
-        addReplyError(c,"invalid expire time, must be >= 0");
-        return;
-    }
-
-    if (unit == UNIT_SECONDS) {
-        if (expire > (long long) HFE_MAX_ABS_TIME_MSEC / 1000) {
-            addReplyErrorExpireTime(c);
-            return;
-        }
-        expire *= 1000;
-    }
-
-    /* Ensure that the final absolute Unix timestamp does not exceed EB_EXPIRE_TIME_MAX. */
-    if (expire > (long long) HFE_MAX_ABS_TIME_MSEC - basetime) {
-        addReplyErrorExpireTime(c);
+    if (parseExpireTime(c, expireArg, unit, basetime, &expire) != C_OK)
         return;
-    }
-    expire += basetime;
 
     /* Read optional expireSetCond [NX|XX|GT|LT] */
     char *optArg = c->argv[3]->ptr;
@@ -3247,14 +3752,18 @@ static void hexpireGenericCommand(client *c, const char *cmd, long long basetime
         return;
     }
 
+    oldlen = hashTypeLength(hashObj, 0);
+
     HashTypeSetEx exCtx;
-    hashTypeSetExInit(keyArg, hashObj, c, c->db, cmd, expireSetCond, &exCtx);
+    hashTypeSetExInit(keyArg, hashObj, c, c->db, expireSetCond, &exCtx);
     addReplyArrayLen(c, numFields);
 
     fieldAt = numFieldsAt + 1;
     while (fieldAt < c->argc) {
         sds field = c->argv[fieldAt]->ptr;
         SetExRes res = hashTypeSetEx(hashObj, field, expire, &exCtx);
+        updated += (res == HSETEX_OK);
+        deleted += (res == HSETEX_DELETED);
 
         if (unlikely(res != HSETEX_OK)) {
             /* If the field was not set, prevent field propagation */
@@ -3269,17 +3778,34 @@ static void hexpireGenericCommand(client *c, const char *cmd, long long basetime
 
     hashTypeSetExDone(&exCtx);
 
+    if (deleted + updated > 0) {
+        server.dirty += deleted + updated;
+        signalModifiedKey(c, c->db, keyArg);
+        notifyKeyspaceEvent(NOTIFY_HASH, deleted ? "hdel" : "hexpire",
+                            keyArg, c->db->id);
+    }
+
+    newlen = hashTypeLength(hashObj, 0);
+    if (newlen == 0) {
+        dbDelete(c->db, keyArg);
+        notifyKeyspaceEvent(NOTIFY_GENERIC, "del", keyArg, c->db->id);
+    }
+
+    if (oldlen != newlen)
+        updateKeysizesHist(c->db, getKeySlot(c->argv[1]->ptr), OBJ_HASH,
+                           oldlen, newlen);
+
     /* Avoid propagating command if not even one field was updated (Either because
      * the time is in the past, and corresponding HDELs were sent, or conditions
      * not met) then it is useless and invalid to propagate command with no fields */
-    if (exCtx.fieldUpdated == 0) {
+    if (updated == 0) {
         preventCommandPropagation(c);
         return;
     }
 
     /* If some fields were dropped, rewrite the number of fields */
     if (fieldsNotSet) {
-        robj *numFieldsObj = createStringObjectFromLongLong(exCtx.fieldUpdated);
+        robj *numFieldsObj = createStringObjectFromLongLong(updated);
         rewriteClientCommandArgument(c, numFieldsAt, numFieldsObj);
         decrRefCount(numFieldsObj);
     }
@@ -3297,48 +3823,48 @@ static void hexpireGenericCommand(client *c, const char *cmd, long long basetime
     }
 }
 
-/* HPEXPIRE key milliseconds [ NX | XX | GT | LT] numfields <field [field ...]> */
+/* HPEXPIRE key milliseconds [ NX | XX | GT | LT] FIELDS numfields <field [field ...]> */
 void hpexpireCommand(client *c) {
-    hexpireGenericCommand(c,"hpexpire", commandTimeSnapshot(),UNIT_MILLISECONDS);
+    hexpireGenericCommand(c,commandTimeSnapshot(),UNIT_MILLISECONDS);
 }
 
-/* HEXPIRE key seconds [NX | XX | GT | LT] numfields <field [field ...]> */
+/* HEXPIRE key seconds [NX | XX | GT | LT] FIELDS numfields <field [field ...]> */
 void hexpireCommand(client *c) {
-    hexpireGenericCommand(c,"hexpire", commandTimeSnapshot(),UNIT_SECONDS);
+    hexpireGenericCommand(c,commandTimeSnapshot(),UNIT_SECONDS);
 }
 
-/* HEXPIREAT key unix-time-seconds [NX | XX | GT | LT] numfields <field [field ...]> */
+/* HEXPIREAT key unix-time-seconds [NX | XX | GT | LT] FIELDS numfields <field [field ...]> */
 void hexpireatCommand(client *c) {
-    hexpireGenericCommand(c,"hexpireat", 0,UNIT_SECONDS);
+    hexpireGenericCommand(c,0,UNIT_SECONDS);
 }
 
-/* HPEXPIREAT key unix-time-milliseconds [NX | XX | GT | LT] numfields <field [field ...]> */
+/* HPEXPIREAT key unix-time-milliseconds [NX | XX | GT | LT] FIELDS numfields <field [field ...]> */
 void hpexpireatCommand(client *c) {
-    hexpireGenericCommand(c,"hpexpireat", 0,UNIT_MILLISECONDS);
+    hexpireGenericCommand(c,0,UNIT_MILLISECONDS);
 }
 
 /* for each specified field: get the remaining time to live in seconds*/
-/* HTTL key numfields <field [field ...]> */
+/* HTTL key FIELDS numfields <field [field ...]> */
 void httlCommand(client *c) {
     httlGenericCommand(c, "httl", commandTimeSnapshot(), UNIT_SECONDS);
 }
 
-/* HPTTL key numfields <field [field ...]> */
+/* HPTTL key FIELDS numfields <field [field ...]> */
 void hpttlCommand(client *c) {
     httlGenericCommand(c, "hpttl", commandTimeSnapshot(), UNIT_MILLISECONDS);
 }
 
-/* HEXPIRETIME key numFields <field [field ...]> */
+/* HEXPIRETIME key FIELDS numfields <field [field ...]> */
 void hexpiretimeCommand(client *c) {
     httlGenericCommand(c, "hexpiretime", 0, UNIT_SECONDS);
 }
 
-/* HPEXPIRETIME key numFields <field [field ...]> */
+/* HPEXPIRETIME key FIELDS numfields <field [field ...]> */
 void hpexpiretimeCommand(client *c) {
     httlGenericCommand(c, "hexpiretime", 0, UNIT_MILLISECONDS);
 }
 
-/* HPERSIST key <FIELDS count field [field ...]> */
+/* HPERSIST key FIELDS numfields <field [field ...]> */
 void hpersistCommand(client *c) {
     robj *hashObj;
     long numFields = 0, numFieldsAt = 3;
diff --git a/tests/unit/info-keysizes.tcl b/tests/unit/info-keysizes.tcl
index 98d6d4e6fb0..a866efcfd67 100644
--- a/tests/unit/info-keysizes.tcl
+++ b/tests/unit/info-keysizes.tcl
@@ -334,6 +334,31 @@ proc test_all_keysizes { {replMode 0} } {
             run_cmd_verify_hist {$server HSET h2 2 2} {db0_HASH:2=1}
             run_cmd_verify_hist {$server HDEL h2 1}   {db0_HASH:1=1}
             run_cmd_verify_hist {$server HDEL h2 2}   {}
+            # HGETDEL
+            run_cmd_verify_hist {$server FLUSHALL} {}
+            run_cmd_verify_hist {$server HSETEX h2 FIELDS 1 1 1} {db0_HASH:1=1}
+            run_cmd_verify_hist {$server HSETEX h2 FIELDS 1 2 2} {db0_HASH:2=1}
+            run_cmd_verify_hist {$server HGETDEL h2 FIELDS 1 1}  {db0_HASH:1=1}
+            run_cmd_verify_hist {$server HGETDEL h2 FIELDS 1 3}  {db0_HASH:1=1}
+            run_cmd_verify_hist {$server HGETDEL h2 FIELDS 1 2}  {}
+            # HGETEX
+            run_cmd_verify_hist {$server FLUSHALL} {}
+            run_cmd_verify_hist {$server HSETEX h1 FIELDS 2 f1 1 f2 1} {db0_HASH:2=1}
+            run_cmd_verify_hist {$server HGETEX h1 PXAT 1 FIELDS 1 f1} {db0_HASH:1=1}
+            run_cmd_verify_hist {$server HSETEX h1 FIELDS 1 f3 1} {db0_HASH:2=1}
+            run_cmd_verify_hist {$server HGETEX h1 PX 50 FIELDS 1 f2} {db0_HASH:2=1}
+            run_cmd_verify_hist {} {db0_HASH:1=1} 1
+            run_cmd_verify_hist {$server HGETEX h1 PX 50 FIELDS 1 f3} {db0_HASH:1=1}
+            run_cmd_verify_hist {} {} 1
+            # HSETEX
+            run_cmd_verify_hist {$server FLUSHALL} {}
+            run_cmd_verify_hist {$server HSETEX h1 FIELDS 2 f1 1 f2 1} {db0_HASH:2=1}
+            run_cmd_verify_hist {$server HSETEX h1 PXAT 1 FIELDS 1 f1 v1} {db0_HASH:1=1}
+            run_cmd_verify_hist {$server HSETEX h1 FIELDS 1 f3 1} {db0_HASH:2=1}
+            run_cmd_verify_hist {$server HSETEX h1 PX 50 FIELDS 1 f2 v2} {db0_HASH:2=1}
+            run_cmd_verify_hist {} {db0_HASH:1=1} 1
+            run_cmd_verify_hist {$server HSETEX h1 PX 50 FIELDS 1 f3 v3} {db0_HASH:1=1}
+            run_cmd_verify_hist {} {} 1
             # HMSET
             run_cmd_verify_hist {$server FLUSHALL} {}
             run_cmd_verify_hist {$server HMSET h1 1 1 2 2 3 3} {db0_HASH:2=1}
diff --git a/tests/unit/pubsub.tcl b/tests/unit/pubsub.tcl
index 9a4f1196b90..def27190810 100644
--- a/tests/unit/pubsub.tcl
+++ b/tests/unit/pubsub.tcl
@@ -414,6 +414,58 @@ start_server {tags {"pubsub network"}} {
         assert_equal "pmessage * __keyspace@${db}__:myhash del" [$rd1 read]
         r debug set-active-expire 1
 
+
+        # Test HSETEX, HGETEX and HGETDEL notifications
+        r hsetex myhash FIELDS 3 f4 v4 f5 v5 f6 v6
+        assert_equal "pmessage * __keyspace@${db}__:myhash hset" [$rd1 read]
+
+        # hgetex sets ttl in past
+        r hgetex myhash PX 0 FIELDS 1 f4
+        assert_equal "pmessage * __keyspace@${db}__:myhash hdel" [$rd1 read]
+
+        # hgetex sets ttl
+        r hgetex myhash EXAT [expr {[clock seconds] + 999999}] FIELDS 1 f5
+        assert_equal "pmessage * __keyspace@${db}__:myhash hexpire" [$rd1 read]
+
+        # hgetex persists field
+        r hgetex myhash PERSIST FIELDS 1 f5
+        assert_equal "pmessage * __keyspace@${db}__:myhash hpersist" [$rd1 read]
+
+        # hgetdel deletes a field
+        r hgetdel myhash FIELDS 1 f5
+        assert_equal "pmessage * __keyspace@${db}__:myhash hdel" [$rd1 read]
+
+        # hsetex sets field and expiry time
+        r hsetex myhash EXAT [expr {[clock seconds] + 999999}] FIELDS 1 f6 v6
+        assert_equal "pmessage * __keyspace@${db}__:myhash hset" [$rd1 read]
+        assert_equal "pmessage * __keyspace@${db}__:myhash hexpire" [$rd1 read]
+
+        # hsetex sets field and ttl in the past
+        r hsetex myhash PX 0 FIELDS 1 f6 v6
+        assert_equal "pmessage * __keyspace@${db}__:myhash hset" [$rd1 read]
+        assert_equal "pmessage * __keyspace@${db}__:myhash hdel" [$rd1 read]
+        assert_equal "pmessage * __keyspace@${db}__:myhash del" [$rd1 read]
+
+        # Test that we will get `hexpired` notification when a hash field is
+        # removed by lazy expire using hgetdel command
+        r debug set-active-expire 0
+        r hsetex myhash PX 10 FIELDS 1 f1 v1
+        assert_equal "pmessage * __keyspace@${db}__:myhash hset" [$rd1 read]
+        assert_equal "pmessage * __keyspace@${db}__:myhash hexpire" [$rd1 read]
+
+        # Set another field
+        r hsetex myhash FIELDS 1 f2 v2
+        assert_equal "pmessage * __keyspace@${db}__:myhash hset" [$rd1 read]
+        # Wait until field expires
+        after 20
+        r hgetdel myhash FIELDS 1 f1
+        assert_equal "pmessage * __keyspace@${db}__:myhash hexpired" [$rd1 read]
+        # Get and delete the only field
+        r hgetdel myhash FIELDS 1 f2
+        assert_equal "pmessage * __keyspace@${db}__:myhash hdel" [$rd1 read]
+        assert_equal "pmessage * __keyspace@${db}__:myhash del" [$rd1 read]
+        r debug set-active-expire 1
+
         $rd1 close
     } {0} {needs:debug}
     } ;# foreach
diff --git a/tests/unit/type/hash-field-expire.tcl b/tests/unit/type/hash-field-expire.tcl
index 0d1840091c6..02cac6008b0 100644
--- a/tests/unit/type/hash-field-expire.tcl
+++ b/tests/unit/type/hash-field-expire.tcl
@@ -855,6 +855,430 @@ start_server {tags {"external:skip needs:debug"}} {
             assert_equal [r HINCRBYFLOAT h1 f1 2.5] 12.5
             assert_range [r HPTTL h1 FIELDS 1 f1] 1 20
         }
+
+        test "HGETDEL - delete field with ttl ($type)" {
+            r debug set-active-expire 0
+            r del h1
+
+            # Test deleting only field in a hash. Due to lazy expiry,
+            # reply will be null and the field and the key will be deleted.
+            r hsetex h1 PX 5 FIELDS 1 f1 10
+            after 15
+            assert_equal [r hgetdel h1 fields 1 f1] "{}"
+            assert_equal [r exists h1]  0
+
+            # Test deleting one field among many. f2 will lazily expire
+            r hsetex h1 FIELDS 3 f1 10 f2 20 f3 value3
+            r hpexpire h1 5 FIELDS 1 f2
+            after 15
+            assert_equal [r hgetdel h1 fields 2 f2 f3] "{} value3"
+            assert_equal [lsort [r hgetall h1]] [lsort "f1 10"]
+
+            # Try to delete the last field, along with non-existing fields
+            assert_equal [r hgetdel h1 fields 4 f1 f2 f3 f4] "10 {} {} {}"
+            r debug set-active-expire 1
+        }
+
+        test "HGETEX - input validation ($type)" {
+            r del h1
+            assert_error "*wrong number of arguments*" {r HGETEX}
+            assert_error "*wrong number of arguments*" {r HGETEX h1}
+            assert_error "*wrong number of arguments*" {r HGETEX h1 FIELDS}
+            assert_error "*wrong number of arguments*" {r HGETEX h1 FIELDS 0}
+            assert_error "*wrong number of arguments*" {r HGETEX h1 FIELDS 1}
+            assert_error "*argument FIELDS is missing*" {r HGETEX h1 XFIELDX 1 a}
+            assert_error "*argument FIELDS is missing*" {r HGETEX h1 PXAT 1 1}
+            assert_error "*argument FIELDS is missing*" {r HGETEX h1 PERSIST 1 FIELDS 1 a}
+            assert_error "*must match the number of arguments*" {r HGETEX h1 FIELDS 2 a}
+            assert_error "*Number of fields must be a positive integer*" {r HGETEX h1 FIELDS 0 a}
+            assert_error "*Number of fields must be a positive integer*" {r HGETEX h1 FIELDS -1 a}
+            assert_error "*Number of fields must be a positive integer*" {r HGETEX h1 FIELDS 9223372036854775808 a}
+        }
+
+        test "HGETEX - input validation (expire time) ($type)" {
+            assert_error "*value is not an integer or out of range*" {r HGETEX h1 EX bla FIELDS 1 a}
+            assert_error "*value is not an integer or out of range*" {r HGETEX h1 EX 9223372036854775808 FIELDS 1 a}
+            assert_error "*value is not an integer or out of range*" {r HGETEX h1 EXAT 9223372036854775808 FIELDS 1 a}
+            assert_error "*invalid expire time, must be >= 0*" {r HGETEX h1 PX -1 FIELDS 1 a}
+            assert_error "*invalid expire time, must be >= 0*" {r HGETEX h1 PXAT -1 FIELDS 1 a}
+            assert_error "*invalid expire time*" {r HGETEX h1 EX -1 FIELDS 1 a}
+            assert_error "*invalid expire time*" {r HGETEX h1 EX [expr (1<<48)] FIELDS 1 a}
+            assert_error "*invalid expire time*" {r HGETEX h1 EX [expr (1<<46) - [clock seconds] + 100 ] FIELDS 1 a}
+            assert_error "*invalid expire time*" {r HGETEX h1 EXAT [expr (1<<46) + 100 ] FIELDS 1 a}
+            assert_error "*invalid expire time*" {r HGETEX h1 PX [expr (1<<46) - [clock milliseconds] + 100 ] FIELDS 1 a}
+            assert_error "*invalid expire time*" {r HGETEX h1 PXAT [expr (1<<46) + 100 ] FIELDS 1 a}
+        }
+
+        test "HGETEX - get without setting ttl ($type)" {
+            r del h1
+            r hset h1 a 1 b 2 c strval
+            assert_equal [r hgetex h1 fields 1 a] "1"
+            assert_equal [r hgetex h1 fields 2 a b] "1 2"
+            assert_equal [r hgetex h1 fields 3 a b c] "1 2 strval"
+            assert_equal [r HTTL h1 FIELDS 3 a b c] "$T_NO_EXPIRY $T_NO_EXPIRY $T_NO_EXPIRY"
+        }
+
+        test "HGETEX - get and set the ttl ($type)" {
+            r del h1
+            r hset h1 a 1 b 2 c strval
+            assert_equal [r hgetex h1 EX 10000 fields 1 a] "1"
+            assert_range [r HTTL h1 FIELDS 1 a] 9000 10000
+            assert_equal [r hgetex h1 EX 10000 fields 1 c] "strval"
+            assert_range [r HTTL h1 FIELDS 1 c] 9000 10000
+        }
+
+        test "HGETEX - Test 'EX' flag ($type)" {
+            r del myhash
+            r hset myhash field1 value1 field2 value2 field3 value3
+            assert_equal [r hgetex myhash EX 1000 FIELDS 1 field1] [list "value1"]
+            assert_range [r httl myhash FIELDS 1 field1] 1 1000
+        }
+
+        test "HGETEX - Test 'EXAT' flag ($type)" {
+            r del myhash
+            r hset myhash field1 value1 field2 value2 field3 value3
+            assert_equal [r hgetex myhash EXAT 4000000000 FIELDS 1 field2] [list "value2"]
+            assert_range [expr [r httl myhash FIELDS 1 field2] + [clock seconds]] 3900000000 4000000000
+        }
+
+        test "HGETEX - Test 'PX' flag ($type)" {
+            r del myhash
+            r hset myhash field1 value1 field2 value2 field3 value3
+            assert_equal [r hgetex myhash PX 1000000 FIELDS 1 field3] [list "value3"]
+            assert_range [r httl myhash FIELDS 1 field3] 900 1000
+        }
+
+        test "HGETEX - Test 'PXAT' flag ($type)" {
+            r del myhash
+            r hset myhash field1 value1 field2 value2 field3 value3
+            assert_equal [r hgetex myhash PXAT 4000000000000 FIELDS 1 field3] [list "value3"]
+            assert_range [expr [r httl myhash FIELDS 1 field3] + [clock seconds]] 3900000000 4000000000
+        }
+
+        test "HGETEX - Test 'PERSIST' flag ($type)" {
+            r del myhash
+            r debug set-active-expire 0
+
+            r hsetex myhash PX 5000 FIELDS 3 f1 v1 f2 v2 f3 v3
+            assert_not_equal [r httl myhash FIELDS 1 f1] "$T_NO_EXPIRY"
+            assert_not_equal [r httl myhash FIELDS 1 f2] "$T_NO_EXPIRY"
+            assert_not_equal [r httl myhash FIELDS 1 f3] "$T_NO_EXPIRY"
+
+            # Persist f1 and verify it does not have TTL anymore
+            assert_equal [r hgetex myhash PERSIST FIELDS 1 f1] "v1"
+            assert_equal [r httl myhash FIELDS 1 f1] "$T_NO_EXPIRY"
+
+            # Persist rest of the fields
+            assert_equal [r hgetex myhash PERSIST FIELDS 2 f2 f3] "v2 v3"
+            assert_equal [r httl myhash FIELDS 2 f2 f3]  "$T_NO_EXPIRY $T_NO_EXPIRY"
+
+            # Redo the operation. It should be noop as fields are persisted already.
+            assert_equal [r hgetex myhash PERSIST FIELDS 2 f2 f3] "v2 v3"
+            assert_equal [r httl myhash FIELDS 2 f2 f3]  "$T_NO_EXPIRY $T_NO_EXPIRY"
+
+            # Final sanity, fields exist and have no attached ttl.
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v1 f2 v2 f3 v3"]
+            assert_equal [r httl myhash FIELDS 3 f1 f2 f3]  "$T_NO_EXPIRY $T_NO_EXPIRY $T_NO_EXPIRY"
+            r debug set-active-expire 1
+        }
+
+        test "HGETEX - Test setting ttl in the past will delete the key ($type)" {
+            r del myhash
+            r hset myhash f1 v1 f2 v2 f3 v3
+
+            # hgetex without setting ttl
+            assert_equal [lsort [r hgetex myhash fields 3 f1 f2 f3]] [lsort "v1 v2 v3"]
+            assert_equal [r httl myhash FIELDS 3 f1 f2 f3] "$T_NO_EXPIRY $T_NO_EXPIRY $T_NO_EXPIRY"
+
+            # set an expired ttl and verify the key is deleted
+            r hgetex myhash PXAT 1 fields 3 f1 f2 f3
+            assert_equal [r exists myhash] 0
+        }
+
+        test "HGETEX - Test active expiry ($type)" {
+            r del myhash
+            r debug set-active-expire 0
+
+            r hset myhash f1 v1 f2 v2 f3 v3 f4 v4 f5 v5
+            assert_equal [lsort [r hgetex myhash PXAT 1 FIELDS 5 f1 f2 f3 f4 f5]] [lsort "v1 v2 v3 v4 v5"]
+
+            r debug set-active-expire 1
+            wait_for_condition 50 20 { [r EXISTS myhash] == 0 } else { fail "'myhash' should be expired" }
+        }
+
+        test "HGETEX - A field with TTL overridden with another value (TTL discarded) ($type)" {
+            r del myhash
+            r hset myhash f1 v1 f2 v2 f3 v3
+            r hgetex myhash PX 10000 FIELDS 1 f1
+            r hgetex myhash EX 100 FIELDS 1 f2
+
+            # f2 ttl will be discarded
+            r hset myhash f2 v22
+            assert_equal [r hget myhash f2] "v22"
+            assert_equal [r httl myhash FIELDS 2 f2 f3] "$T_NO_EXPIRY $T_NO_EXPIRY"
+
+            # Other field is not affected (still has TTL)
+            assert_not_equal [r httl myhash FIELDS 1 f1] "$T_NO_EXPIRY"
+        }
+
+        test "HGETEX - Test with lazy expiry ($type)" {
+            r del myhash
+            r debug set-active-expire 0
+
+            r hsetex myhash PX 1 FIELDS 2 f1 v1 f2 v2
+            after 5
+            assert_equal [r hgetex myhash FIELDS 2 f1 f2] "{} {}"
+            assert_equal [r exists myhash] 0
+
+            r debug set-active-expire 1
+        }
+
+        test "HSETEX - input validation ($type)" {
+            assert_error {*wrong number of arguments*} {r hsetex myhash}
+            assert_error {*wrong number of arguments*} {r hsetex myhash fields}
+            assert_error {*wrong number of arguments*} {r hsetex myhash fields 1}
+            assert_error {*wrong number of arguments*} {r hsetex myhash fields 2 a b}
+            assert_error {*wrong number of arguments*} {r hsetex myhash fields 2 a b c}
+            assert_error {*wrong number of arguments*} {r hsetex myhash fields 2 a b c d e}
+            assert_error {*wrong number of arguments*} {r hsetex myhash fields 3 a b c d}
+            assert_error {*wrong number of arguments*} {r hsetex myhash fields 3 a b c d e}
+            assert_error {*wrong number of arguments*} {r hsetex myhash fields 3 a b c d e f g}
+            assert_error {*wrong number of arguments*} {r hsetex myhash fields 3 a b}
+            assert_error {*wrong number of arguments*} {r hsetex myhash fields 1 a b unknown}
+            assert_error {*unknown argument*} {r hsetex myhash nx fields 1 a b}
+            assert_error {*unknown argument*} {r hsetex myhash 1 fields 1 a b}
+
+            # Only one of FNX or FXX
+            assert_error {*Only one of FXX or FNX arguments *} {r hsetex myhash fxx fxx EX 100 fields 1 a b}
+            assert_error {*Only one of FXX or FNX arguments *} {r hsetex myhash fxx fnx EX 100 fields 1 a b}
+            assert_error {*Only one of FXX or FNX arguments *} {r hsetex myhash fnx fxx EX 100 fields 1 a b}
+            assert_error {*Only one of FXX or FNX arguments *} {r hsetex myhash fnx fnx EX 100 fields 1 a b}
+            assert_error {*Only one of FXX or FNX arguments *} {r hsetex myhash fxx fnx fxx EX 100 fields 1 a b}
+            assert_error {*Only one of FXX or FNX arguments *} {r hsetex myhash fnx fxx fnx EX 100 fields 1 a b}
+
+            # Only one of EX, PX, EXAT, PXAT or KEEPTTL can be specified
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash EX 100 PX 1000 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash EX 100 EXAT 100 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash EXAT 100 EX 1000 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash EXAT 100 PX 1000 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash PX 100 EXAT 100 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash PX 100 PXAT 100 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash PXAT 100 EX 100 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash PXAT 100 EXAT 100 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash EX 100 KEEPTTL fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash KEEPTTL EX 100 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash EX 100 EX 100 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash EXAT 100 EXAT 100 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash PX 10 PX 10 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash PXAT 10 PXAT 10 fields 1 a b}
+            assert_error {*Only one of EX, PX, EXAT, PXAT or KEEPTTL arguments*} {r hsetex myhash KEEPTTL KEEPTTL fields 1 a b}
+
+            # missing expire time
+            assert_error {*not an integer or out of range*} {r hsetex myhash ex fields 1 a b}
+            assert_error {*not an integer or out of range*} {r hsetex myhash px fields 1 a b}
+            assert_error {*not an integer or out of range*} {r hsetex myhash exat fields 1 a b}
+            assert_error {*not an integer or out of range*} {r hsetex myhash pxat fields 1 a b}
+
+            # expire time more than 2 ^ 48
+            assert_error {*invalid expire time*} {r hsetex myhash EXAT [expr (1<<48)] 1 a b}
+            assert_error {*invalid expire time*} {r hsetex myhash PXAT [expr (1<<48)] 1 a b}
+            assert_error {*invalid expire time*} {r hsetex myhash EX [expr (1<<48) - [clock seconds] + 1000 ] 1 a b}
+            assert_error {*invalid expire time*} {r hsetex myhash PX [expr (1<<48) - [clock milliseconds] + 1000 ] 1 a b}
+
+            # invalid expire time
+            assert_error {*invalid expire time*} {r hsetex myhash EXAT -1 1 a b}
+            assert_error {*not an integer or out of range*} {r hsetex myhash EXAT 9223372036854775808 1 a b}
+            assert_error {*not an integer or out of range*} {r hsetex myhash EXAT x 1 a b}
+
+            # invalid numfields arg
+            assert_error {*invalid number of fields*} {r hsetex myhash fields x a b}
+            assert_error {*invalid number of fields*} {r hsetex myhash fields 9223372036854775808 a b}
+            assert_error {*invalid number of fields*} {r hsetex myhash fields 0 a b}
+            assert_error {*invalid number of fields*} {r hsetex myhash fields -1 a b}
+        }
+        
+        test "HSETEX - Basic test ($type)" {
+            r del myhash
+
+            # set field
+            assert_equal [r hsetex myhash FIELDS 1 f1 v1] 1
+            assert_equal [r hget myhash f1] "v1"
+
+            # override
+            assert_equal [r hsetex myhash FIELDS 1 f1 v11] 1
+            assert_equal [r hget myhash f1] "v11"
+
+            # set multiple
+            assert_equal [r hsetex myhash FIELDS 2 f1 v1 f2 v2] 1
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v1 f2 v2"]
+            assert_equal [r hsetex myhash FIELDS 3 f1 v111 f2 v222 f3 v333] 1
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v111 f2 v222 f3 v333"]
+        }
+
+        test "HSETEX - Test FXX flag ($type)" {
+            r del myhash
+
+            # Key is empty, command fails due to FXX
+            assert_equal [r hsetex myhash FXX FIELDS 2 f1 v1 f2 v2] 0
+            # Verify it did not leave the key empty
+            assert_equal [r exists myhash] 0
+
+            # Command fails and no change on fields
+            r hset myhash f1 v1
+            assert_equal [r hsetex myhash FXX FIELDS 2 f1 v1 f2 v2] 0
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v1"]
+
+            # Command executed successfully
+            assert_equal [r hsetex myhash FXX FIELDS 1 f1 v11] 1
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v11"]
+
+            # Try with multiple fields
+            r hset myhash f2 v2
+            assert_equal [r hsetex myhash FXX FIELDS 2 f1 v111 f2 v222] 1
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v111 f2 v222"]
+
+            # Try with expiry
+            assert_equal [r hsetex myhash FXX EX 100 FIELDS 2 f1 v1 f2 v2] 1
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v1 f2 v2"]
+            assert_range [r httl myhash FIELDS 1 f1] 80 100
+            assert_range [r httl myhash FIELDS 1 f2] 80 100
+
+            # Try with expiry, FXX arg comes after TTL
+            assert_equal [r hsetex myhash PX 5000 FXX FIELDS 2 f1 v1 f2 v2] 1
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v1 f2 v2"]
+            assert_range [r hpttl myhash FIELDS 1 f1] 4500 5000
+            assert_range [r hpttl myhash FIELDS 1 f2] 4500 5000
+        }
+
+        test "HSETEX - Test FXX flag with lazy expire ($type)" {
+            r del myhash
+            r debug set-active-expire 0
+
+            r hsetex myhash PX 10 FIELDS 1 f1 v1
+            after 15
+            assert_equal [r hsetex myhash FXX FIELDS 1 f1 v11] 0
+            assert_equal [r exists myhash] 0
+            r debug set-active-expire 1
+        }
+
+        test "HSETEX - Test FNX flag ($type)" {
+            r del myhash
+
+            # Command successful on an empty key
+            assert_equal [r hsetex myhash FNX FIELDS 1 f1 v1] 1
+
+            # Command fails and no change on fields
+            assert_equal [r hsetex myhash FNX FIELDS 2 f1 v1 f2 v2] 0
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v1"]
+
+            # Command executed successfully
+            assert_equal [r hsetex myhash FNX FIELDS 2 f2 v2 f3 v3] 1
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v1 f2 v2 f3 v3"]
+            assert_equal [r hsetex myhash FXX FIELDS 3 f1 v11 f2 v22 f3 v33] 1
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v11 f2 v22 f3 v33"]
+
+            # Try with expiry
+            r del myhash
+            assert_equal [r hsetex myhash FNX EX 100 FIELDS 2 f1 v1 f2 v2] 1
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v1 f2 v2"]
+            assert_range [r httl myhash FIELDS 1 f1] 80 100
+            assert_range [r httl myhash FIELDS 1 f2] 80 100
+
+            # Try with expiry, FNX arg comes after TTL
+            assert_equal [r hsetex myhash PX 5000 FNX FIELDS 1 f3 v3] 1
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v1 f2 v2 f3 v3"]
+            assert_range [r hpttl myhash FIELDS 1 f3] 4500 5000
+        }
+
+        test "HSETEX - Test 'EX' flag ($type)" {
+            r del myhash
+            r hset myhash f1 v1 f2 v2
+            assert_equal [r hsetex myhash EX 1000 FIELDS 1 f3 v3 ] 1
+            assert_range [r httl myhash FIELDS 1 f3] 900 1000
+        }
+
+        test "HSETEX - Test 'EXAT' flag ($type)" {
+            r del myhash
+            r hset myhash f1 v1 f2 v2
+            assert_equal [r hsetex myhash EXAT 4000000000 FIELDS 1 f3 v3] 1
+            assert_range [expr [r httl myhash FIELDS 1 f3] + [clock seconds]] 3900000000 4000000000
+        }
+
+        test "HSETEX - Test 'PX' flag ($type)" {
+            r del myhash
+            assert_equal [r hsetex myhash PX 1000000 FIELDS 1 f3 v3] 1
+            assert_range [r httl myhash FIELDS 1 f3] 990 1000
+        }
+
+        test "HSETEX - Test 'PXAT' flag ($type)" {
+            r del myhash
+            r hset myhash f1 v2 f2 v2 f3 v3
+            assert_equal [r hsetex myhash PXAT 4000000000000 FIELDS 1 f2 v2] 1
+            assert_range [expr [r httl myhash FIELDS 1 f2] + [clock seconds]] 3900000000 4000000000
+        }
+
+        test "HSETEX - Test 'KEEPTTL' flag ($type)" {
+            r del myhash
+
+            r hsetex myhash FIELDS 2 f1 v1 f2 v2
+            r hsetex myhash PX 20000 FIELDS 1 f2 v2
+
+            # f1 does not have ttl
+            assert_equal [r httl myhash FIELDS 1 f1] "$T_NO_EXPIRY"
+
+            # f2 has ttl
+            assert_not_equal [r httl myhash FIELDS 1 f2] "$T_NO_EXPIRY"
+
+            # Validate KEEPTTL preserves the TTL
+            assert_equal [r hsetex myhash KEEPTTL FIELDS 1 f2 v22] 1
+            assert_equal [r hget myhash f2] "v22"
+            assert_not_equal [r httl myhash FIELDS 1 f2] "$T_NO_EXPIRY"
+
+            # Try with multiple fields. First, set fields and TTL
+            r hsetex myhash EX 10000 FIELDS 3 f1 v1 f2 v2 f3 v3
+
+            # Update fields with KEEPTTL flag
+            r hsetex myhash KEEPTTL FIELDS 3 f1 v111 f2 v222 f3 v333
+
+            # Verify values are set, ttls are untouched
+            assert_equal [lsort [r hgetall myhash]] [lsort "f1 v111 f2 v222 f3 v333"]
+            assert_range [r httl myhash FIELDS 1 f1] 9000 10000
+            assert_range [r httl myhash FIELDS 1 f2] 9000 10000
+            assert_range [r httl myhash FIELDS 1 f3] 9000 10000
+        }
+
+        test "HSETEX - Test no expiry flag discards TTL ($type)" {
+            r del myhash
+
+            r hsetex myhash FIELDS 1 f1 v1
+            r hsetex myhash PX 100000 FIELDS 1 f2 v2
+            assert_range [r hpttl myhash FIELDS 1 f2] 1 100000
+
+            assert_equal [r hsetex myhash FIELDS 2 f1 v1 f2 v2] 1
+            assert_equal [r httl myhash FIELDS 2 f1 f2] "$T_NO_EXPIRY $T_NO_EXPIRY"
+        }
+
+        test "HSETEX - Test with active expiry" {
+            r del myhash
+            r debug set-active-expire 0
+
+            r hsetex myhash PX 10 FIELDS 5 f1 v1 f2 v2 f3 v3 f4 v4 f5 v5
+            r debug set-active-expire 1
+            wait_for_condition 50 20 { [r EXISTS myhash] == 0 } else { fail "'myhash' should be expired" }
+        }
+
+        test "HSETEX - Set time in the past ($type)" {
+            r del myhash
+
+            # Try on an empty key
+            assert_equal [r hsetex myhash EXAT [expr {[clock seconds] - 1}] FIELDS 2 f1 v1 f2 v2] 1
+            assert_equal [r hexists myhash field1] 0
+
+            # Try with existing fields
+            r hset myhash fields 2 f1 v1 f2 v2
+            assert_equal [r hsetex myhash EXAT [expr {[clock seconds] - 1}] FIELDS 2 f1 v1 f2 v2] 1
+            assert_equal [r hexists myhash field1] 0
+        }
     }
 
     test "Statistics - Hashes with HFEs ($type)" {
@@ -879,6 +1303,13 @@ start_server {tags {"external:skip needs:debug"}} {
         r hdel myhash3 f2
         assert_match  [get_stat_subexpiry r] 2
 
+        # hash4: 2 fields, 1 with TTL. HGETDEL field with TTL. subexpiry decr -1
+        r hset myhash4 f1 v1 f2 v2
+        r hpexpire myhash4 100 FIELDS 1 f2
+        assert_match [get_stat_subexpiry r] 3
+        r hgetdel myhash4 FIELDS 1 f2
+        assert_match [get_stat_subexpiry r] 2
+
         # Expired fields of hash1 and hash2. subexpiry decr -2
         wait_for_condition 50 50 {
                 [get_stat_subexpiry r] == 0
@@ -887,6 +1318,21 @@ start_server {tags {"external:skip needs:debug"}} {
         }
     }
 
+    test "HFE commands against wrong type" {
+        r set wrongtype somevalue
+        assert_error "WRONGTYPE Operation against a key*" {r hexpire wrongtype 10 fields 1 f1}
+        assert_error "WRONGTYPE Operation against a key*" {r hexpireat wrongtype 10 fields 1 f1}
+        assert_error "WRONGTYPE Operation against a key*" {r hpexpire wrongtype 10 fields 1 f1}
+        assert_error "WRONGTYPE Operation against a key*" {r hpexpireat wrongtype 10 fields 1 f1}
+        assert_error "WRONGTYPE Operation against a key*" {r hexpiretime wrongtype fields 1 f1}
+        assert_error "WRONGTYPE Operation against a key*" {r hpexpiretime wrongtype fields 1 f1}
+        assert_error "WRONGTYPE Operation against a key*" {r httl wrongtype fields 1 f1}
+        assert_error "WRONGTYPE Operation against a key*" {r hpttl wrongtype fields 1 f1}
+        assert_error "WRONGTYPE Operation against a key*" {r hpersist wrongtype fields 1 f1}
+        assert_error "WRONGTYPE Operation against a key*" {r hgetex wrongtype fields 1 f1}
+        assert_error "WRONGTYPE Operation against a key*" {r hsetex wrongtype fields 1 f1 v1}
+    }
+
     r config set hash-max-listpack-entries 512
 }
 
@@ -1048,6 +1494,54 @@ start_server {tags {"external:skip needs:debug"}} {
                     fail "Field f2 of hash h2 wasn't deleted"
                 }
 
+                # HSETEX
+                r hsetex h3 FIELDS 1 f1 v1
+                r hsetex h3 FXX FIELDS 1 f1 v11
+                r hsetex h3 FNX FIELDS 1 f2 v22
+                r hsetex h3 KEEPTTL FIELDS 1 f2 v22
+
+                # Next one will fail due to FNX arg and it won't be replicated
+                r hsetex h3 FNX FIELDS 2 f1 v1 f2 v2
+
+                # Commands with EX/PX/PXAT/EXAT will be replicated as PXAT
+                r hsetex h3 EX 10000 FIELDS 1 f1 v111
+                r hsetex h3 PX 10000 FIELDS 1 f1 v111
+                r hsetex h3 PXAT [expr [clock milliseconds]+100000] FIELDS 1 f1 v111
+                r hsetex h3 EXAT [expr [clock seconds]+100000] FIELDS 1 f1 v111
+
+                # Following commands will set and then delete the fields because
+                # of TTL in the past. HDELs will be propagated.
+                r hsetex h3 PX 0 FIELDS 1 f1 v111
+                r hsetex h3 PX 0 FIELDS 3 f1 v2 f2 v2 f3 v3
+
+                # HGETEX
+                r hsetex h4 FIELDS 3 f1 v1 f2 v2 f3 v3
+                # No change on expiry, it won't be replicated.
+                r hgetex h4 FIELDS 1 f1
+
+                # Commands with EX/PX/PXAT/EXAT will be replicated as
+                # HPEXPIREAT command.
+                r hgetex h4 EX 10000 FIELDS 1 f1
+                r hgetex h4 PX 10000 FIELDS 1 f1
+                r hgetex h4 PXAT [expr [clock milliseconds]+100000] FIELDS 1 f1
+                r hgetex h4 EXAT [expr [clock seconds]+100000] FIELDS 1 f1
+
+                # Following commands will delete the fields because of TTL in
+                # the past. HDELs will be propagated.
+                r hgetex h4 PX 0 FIELDS 1 f1
+                # HDELs will be propagated for f2 and f3 as only those exist.
+                r hgetex h4 PX 0 FIELDS 3 f1 f2 f3
+
+                # HGETEX with PERSIST flag will be replicated as HPERSIST
+                r hsetex h4 EX 1000 FIELDS 1 f4 v4
+                r hgetex h4 PERSIST FIELDS 1 f4
+
+                # Nothing will be replicated as f4 is persisted already.
+                r hgetex h4 PERSIST FIELDS 1 f4
+
+                # Replicated as hdel
+                r hgetdel h4 FIELDS 1 f4
+
                 # Assert that each TTL-related command are persisted with absolute timestamps in AOF
                 assert_aof_content $aof {
                     {select *}
@@ -1068,6 +1562,33 @@ start_server {tags {"external:skip needs:debug"}} {
                     {hdel h1 f2}
                     {hdel h2 f1}
                     {hdel h2 f2}
+                    {hsetex h3 FIELDS 1 f1 v1}
+                    {hsetex h3 FXX FIELDS 1 f1 v11}
+                    {hsetex h3 FNX FIELDS 1 f2 v22}
+                    {hsetex h3 KEEPTTL FIELDS 1 f2 v22}
+                    {hsetex h3 PXAT * 1 f1 v111}
+                    {hsetex h3 PXAT * 1 f1 v111}
+                    {hsetex h3 PXAT * 1 f1 v111}
+                    {hsetex h3 PXAT * 1 f1 v111}
+                    {hdel h3 f1}
+                    {multi}
+                        {hdel h3 f1}
+                        {hdel h3 f2}
+                        {hdel h3 f3}
+                    {exec}
+                    {hsetex h4 FIELDS 3 f1 v1 f2 v2 f3 v3}
+                    {hpexpireat h4 * FIELDS 1 f1}
+                    {hpexpireat h4 * FIELDS 1 f1}
+                    {hpexpireat h4 * FIELDS 1 f1}
+                    {hpexpireat h4 * FIELDS 1 f1}
+                    {hdel h4 f1}
+                    {multi}
+                        {hdel h4 f2}
+                        {hdel h4 f3}
+                    {exec}
+                    {hsetex h4 PXAT * FIELDS 1 f4 v4}
+                    {hpersist h4 FIELDS 1 f4}
+                    {hdel h4 f4}
                 }
             }
         } {} {needs:debug}
@@ -1135,6 +1656,16 @@ start_server {tags {"external:skip needs:debug"}} {
                 r hpexpire h2 1 FIELDS 2 f1 f2
                 after 200
 
+                r hsetex h3 EX 100000 FIELDS 2 f1 v1 f2 v2
+                r hsetex h3 EXAT [expr [clock seconds] + 1000] FIELDS 2 f1 v1 f2 v2
+                r hsetex h3 PX 100000 FIELDS 2 f1 v1 f2 v2
+                r hsetex h3 PXAT [expr [clock milliseconds]+100000] FIELDS 2 f1 v1 f2 v2
+
+                r hgetex h3 EX 100000 FIELDS 2 f1 f2
+                r hgetex h3 EXAT [expr [clock seconds] + 1000] FIELDS 2 f1 f2
+                r hgetex h3 PX 100000 FIELDS 2 f1 f2
+                r hgetex h3 PXAT [expr [clock milliseconds]+100000] FIELDS 2 f1 f2
+
                 assert_aof_content $aof {
                     {select *}
                     {hset h1 f1 v1 f2 v2 f3 v3 f4 v4 f5 v5 f6 v6}
@@ -1146,6 +1677,14 @@ start_server {tags {"external:skip needs:debug"}} {
                     {hpexpireat h2 * FIELDS 2 f1 f2}
                     {hdel h2 *}
                     {hdel h2 *}
+                    {hsetex h3 PXAT * FIELDS 2 f1 v1 f2 v2}
+                    {hsetex h3 PXAT * FIELDS 2 f1 v1 f2 v2}
+                    {hsetex h3 PXAT * FIELDS 2 f1 v1 f2 v2}
+                    {hsetex h3 PXAT * FIELDS 2 f1 v1 f2 v2}
+                    {hpexpireat h3 * FIELDS 2 f1 f2}
+                    {hpexpireat h3 * FIELDS 2 f1 f2}
+                    {hpexpireat h3 * FIELDS 2 f1 f2}
+                    {hpexpireat h3 * FIELDS 2 f1 f2}
                 }
 
                 array set keyAndFields1 [dumpAllHashes r]
@@ -1265,6 +1804,23 @@ start_server {tags {"external:skip needs:debug"}} {
                 $primary hpexpireat h5 [expr [clock milliseconds]-100000] FIELDS 1 f
                 $primary hset h9 f v
 
+                $primary hsetex h10 EX 100000 FIELDS 1 f v
+                $primary hsetex h11 EXAT [expr [clock seconds] + 1000] FIELDS 1 f v
+                $primary hsetex h12 PX 100000 FIELDS 1 f v
+                $primary hsetex h13 PXAT [expr [clock milliseconds]+100000] FIELDS 1 f v
+                $primary hsetex h14 PXAT 1 FIELDS 1 f v
+
+                $primary hsetex h15 FIELDS 1 f v
+                $primary hgetex h15 EX 100000 FIELDS 1 f
+                $primary hsetex h16 FIELDS 1 f v
+                $primary hgetex h16 EXAT [expr [clock seconds] + 1000] FIELDS 1 f
+                $primary hsetex h17 FIELDS 1 f v
+                $primary hgetex h17 PX 100000 FIELDS 1 f
+                $primary hsetex h18 FIELDS 1 f v
+                $primary hgetex h18 PXAT [expr [clock milliseconds]+100000] FIELDS 1 f
+                $primary hsetex h19 FIELDS 1 f v
+                $primary hgetex h19 PXAT 1 FIELDS 1 f
+
                 # Wait for replica to get the keys and TTLs
                 assert {[$primary wait 1 0] == 1}
 
@@ -1273,5 +1829,102 @@ start_server {tags {"external:skip needs:debug"}} {
                 assert_equal [dumpAllHashes $primary] [dumpAllHashes $replica]
             }
         }
+
+        test "Test HSETEX command replication" {
+            r flushall
+            set repl [attach_to_replication_stream]
+
+            # Create a field and delete it in a single command due to timestamp
+            # being in the past. It will be propagated as HDEL.
+            r hsetex h1 PXAT 1 FIELDS 1 f1 v1
+
+            # Following ones will be propagated with PXAT arg
+            r hsetex h1 EX 100000 FIELDS 1 f1 v1
+            r hsetex h1 EXAT [expr [clock seconds] + 1000] FIELDS 1 f1 v1
+            r hsetex h1 PX 100000 FIELDS 1 f1 v1
+            r hsetex h1 PXAT [expr [clock milliseconds]+100000] FIELDS 1 f1 v1
+
+            # Propagate with KEEPTTL flag
+            r hsetex h1 KEEPTTL FIELDS 1 f1 v1
+
+            # Following commands will fail and won't be propagated
+            r hsetex h1 FNX FIELDS 1 f1 v11
+            r hsetex h1 FXX FIELDS 1 f2 v2
+
+            # Propagate with FNX and FXX flags
+            r hsetex h1 FNX FIELDS 1 f2 v2
+            r hsetex h1 FXX FIELDS 1 f2 v22
+
+            assert_replication_stream $repl {
+                {select *}
+                {hdel h1 f1}
+                {hsetex h1 PXAT * FIELDS 1 f1 v1}
+                {hsetex h1 PXAT * FIELDS 1 f1 v1}
+                {hsetex h1 PXAT * FIELDS 1 f1 v1}
+                {hsetex h1 PXAT * FIELDS 1 f1 v1}
+                {hsetex h1 KEEPTTL FIELDS 1 f1 v1}
+                {hsetex h1 FNX FIELDS 1 f2 v2}
+                {hsetex h1 FXX FIELDS 1 f2 v22}
+            }
+            close_replication_stream $repl
+        } {} {needs:repl}
+
+        test "Test HGETEX command replication" {
+            r flushall
+            r debug set-active-expire 0
+            set repl [attach_to_replication_stream]
+
+            # If no fields are found, command won't be replicated
+            r hgetex h1 EX 10000 FIELDS 1 f0
+            r hgetex h1 PERSIST FIELDS 1 f0
+
+            # Get without setting expiry will not be replicated
+            r hsetex h1 FIELDS 1 f0 v0
+            r hgetex h1 FIELDS 1 f0
+
+            # Lazy expired field will be replicated as HDEL
+            r hsetex h1 PX 10 FIELDS 1 f1 v1
+            after 15
+            r hgetex h1 EX 1000 FIELDS 1 f1
+
+            # If new TTL is in the past, it will be replicated as HDEL
+            r hsetex h1 EX 10000 FIELDS 1 f2 v2
+            r hgetex h1 EXAT 1 FIELDS 1 f2
+
+            # A field will expire lazily and other field will be deleted due to
+            # TTL is being in the past. It'll be propagated as two HDEL's.
+            r hsetex h1 PX 10 FIELDS 1 f3 v3
+            after 15
+            r hsetex h1 FIELDS 1 f4 v4
+            r hgetex h1 EXAT 1 FIELDS 2 f3 f4
+
+            # TTL update, it will be replicated as HPEXPIREAT
+            r hsetex h1 FIELDS 1 f5 v5
+            r hgetex h1 EX 10000 FIELDS 1 f5
+
+            # If PERSIST flag is used, it will be replicated as HPERSIST
+            r hsetex h1 EX 10000 FIELDS 1 f6 v6
+            r hgetex h1 PERSIST FIELDS 1 f6
+
+            assert_replication_stream $repl {
+                {select *}
+                {hsetex h1 FIELDS 1 f0 v0}
+                {hsetex h1 PXAT * FIELDS 1 f1 v1}
+                {hdel h1 f1}
+                {hsetex h1 PXAT * FIELDS 1 f2 v2}
+                {hdel h1 f2}
+                {hsetex h1 PXAT * FIELDS 1 f3 v3}
+                {hsetex h1 FIELDS 1 f4 v4}
+                {multi}
+                    {hdel h1 f3}
+                    {hdel h1 f4}
+                {exec}
+                {hsetex h1 FIELDS 1 f5 v5}
+                {hpexpireat h1 * FIELDS 1 f5}
+                {hsetex h1 PXAT * FIELDS 1 f6 v6}
+                {hpersist h1 FIELDS 1 f6}
+            }
+            close_replication_stream $repl
+        } {} {needs:repl}
     }
 }
diff --git a/tests/unit/type/hash.tcl b/tests/unit/type/hash.tcl
index 1cb42245515..a3d6867f865 100644
--- a/tests/unit/type/hash.tcl
+++ b/tests/unit/type/hash.tcl
@@ -371,6 +371,7 @@ start_server {tags {"hash"}} {
         assert_error "WRONGTYPE Operation against a key*" {r hsetnx wrongtype field1 val1}
         assert_error "WRONGTYPE Operation against a key*" {r hlen wrongtype}
         assert_error "WRONGTYPE Operation against a key*" {r hscan wrongtype 0}
+        assert_error "WRONGTYPE Operation against a key*" {r hgetdel wrongtype fields 1 a}
     }
 
     test {HMGET - small hash} {
@@ -710,6 +711,89 @@ start_server {tags {"hash"}} {
         r config set hash-max-listpack-value $original_max_value
     }
 
+    test {HGETDEL input validation} {
+        r del key1
+        assert_error "*wrong number of arguments*" {r hgetdel}
+        assert_error "*wrong number of arguments*" {r hgetdel key1}
+        assert_error "*wrong number of arguments*" {r hgetdel key1 FIELDS}
+        assert_error "*wrong number of arguments*" {r hgetdel key1 FIELDS 0}
+        assert_error "*wrong number of arguments*" {r hgetdel key1 FIELDX}
+        assert_error "*argument FIELDS is missing*" {r hgetdel key1 XFIELDX 1 a}
+        assert_error "*numfields*parameter*must match*number of arguments*" {r hgetdel key1 FIELDS 2 a}
+        assert_error "*numfields*parameter*must match*number of arguments*" {r hgetdel key1 FIELDS 2 a b c}
+        assert_error "*Number of fields must be a positive integer*" {r hgetdel key1 FIELDS 0 a}
+        assert_error "*Number of fields must be a positive integer*" {r hgetdel key1 FIELDS -1 a}
+        assert_error "*Number of fields must be a positive integer*" {r hgetdel key1 FIELDS b a}
+        assert_error "*Number of fields must be a positive integer*" {r hgetdel key1 FIELDS 9223372036854775808 a}
+    }
+
+    foreach type {listpack ht} {
+        set orig_config [lindex [r config get hash-max-listpack-entries] 1]
+        r del key1
+
+        if {$type == "listpack"} {
+            r config set hash-max-listpack-entries $orig_config
+            r hset key1 f1 1 f2 2 f3 3 strfield strval
+            assert_encoding listpack key1
+        } else {
+            r config set hash-max-listpack-entries 0
+            r hset key1 f1 1 f2 2 f3 3 strfield strval
+            assert_encoding hashtable key1
+        }
+
+        test {HGETDEL basic test} {
+            r del key1
+            r hset key1 f1 1 f2 2 f3 3 strfield strval
+            assert_equal [r hgetdel key1 fields 1 f2] 2
+            assert_equal [r hlen key1] 3
+            assert_equal [r hget key1 f1] 1
+            assert_equal [r hget key1 f2] ""
+            assert_equal [r hget key1 f3] 3
+            assert_equal [r hget key1 strfield] strval
+
+            assert_equal [r hgetdel key1 fields 1 f1] 1
+            assert_equal [lsort [r hgetall key1]] [lsort "f3 3 strfield strval"]
+            assert_equal [r hgetdel key1 fields 1 f3] 3
+            assert_equal [r hgetdel key1 fields 1 strfield] strval
+            assert_equal [r hgetall key1] ""
+            assert_equal [r exists key1] 0
+        }
+
+        test {HGETDEL test with non existing fields} {
+             r del key1
+             r hset key1 f1 1 f2 2 f3 3
+             assert_equal [r hgetdel key1 fields 4 x1 x2 x3 x4] "{} {} {} {}"
+             assert_equal [r hgetdel key1 fields 4 x1 x2 f3 x4] "{} {} 3 {}"
+             assert_equal [lsort [r hgetall key1]] [lsort "f1 1 f2 2"]
+             assert_equal [r hgetdel key1 fields 3 f1 f2 f3] "1 2 {}"
+             assert_equal [r hgetdel key1 fields 3 f1 f2 f3] "{} {} {}"
+        }
+
+        r config set hash-max-listpack-entries $orig_config
+    }
+
+    test {HGETDEL propagated as HDEL command to replica} {
+        set repl [attach_to_replication_stream]
+        r hset key1 f1 v1 f2 v2 f3 v3 f4 v4 f5 v5
+        r hgetdel key1 fields 1 f1
+        r hgetdel key1 fields 2 f2 f3
+
+        # make sure non-existing fields are not replicated
+        r hgetdel key1 fields 2 f7 f8
+
+        # delete more
+        r hgetdel key1 fields 3 f4 f5 f6
+
+        assert_replication_stream $repl {
+            {select *}
+            {hset key1 f1 v1 f2 v2 f3 v3 f4 v4 f5 v5}
+            {hdel key1 f1}
+            {hdel key1 f2 f3}
+            {hdel key1 f4 f5 f6}
+        }
+        close_replication_stream $repl
+    } {} {needs:repl}
+
     test {Hash ziplist regression test for large keys} {
         r hset hash kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk a
         r hset hash kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk b

From 6c202f495c6057fb571da1e3ab133773f4851e9d Mon Sep 17 00:00:00 2001
From: Ozan Tezcan <ozantezcan@gmail.com>
Date: Sun, 16 Feb 2025 20:07:29 +0300
Subject: [PATCH 09/24] Remove DENYOOM flag from hexpire command (#13800)

Remove DENYOOM flag from hexpire / hexpireat / hpexpire / hpexpireat
commands.

h(p)expire(at) commands may allocate some memory but it is not that big.
Similary, we don't have DENYOOM flag for EXPIRE command. This change
will align EXPIRE and HEXPIRE commands in this manner.
---
 src/commands.def             | 8 ++++----
 src/commands/hexpire.json    | 1 -
 src/commands/hexpireat.json  | 1 -
 src/commands/hpexpire.json   | 1 -
 src/commands/hpexpireat.json | 1 -
 5 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/commands.def b/src/commands.def
index dd55c816216..9d70326e6a6 100644
--- a/src/commands.def
+++ b/src/commands.def
@@ -11164,8 +11164,8 @@ struct COMMAND_STRUCT redisCommandTable[] = {
 /* hash */
 {MAKE_CMD("hdel","Deletes one or more fields and their values from a hash. Deletes the hash if no fields remain.","O(N) where N is the number of fields to be removed.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HDEL_History,1,HDEL_Tips,0,hdelCommand,-3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HDEL_Keyspecs,1,NULL,2),.args=HDEL_Args},
 {MAKE_CMD("hexists","Determines whether a field exists in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXISTS_History,0,HEXISTS_Tips,0,hexistsCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HEXISTS_Keyspecs,1,NULL,2),.args=HEXISTS_Args},
-{MAKE_CMD("hexpire","Set expiry for hash field using relative time to expire (seconds)","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIRE_History,0,HEXPIRE_Tips,0,hexpireCommand,-6,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HEXPIRE_Keyspecs,1,NULL,4),.args=HEXPIRE_Args},
-{MAKE_CMD("hexpireat","Set expiry for hash field using an absolute Unix timestamp (seconds)","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIREAT_History,0,HEXPIREAT_Tips,0,hexpireatCommand,-6,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HEXPIREAT_Keyspecs,1,NULL,4),.args=HEXPIREAT_Args},
+{MAKE_CMD("hexpire","Set expiry for hash field using relative time to expire (seconds)","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIRE_History,0,HEXPIRE_Tips,0,hexpireCommand,-6,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HEXPIRE_Keyspecs,1,NULL,4),.args=HEXPIRE_Args},
+{MAKE_CMD("hexpireat","Set expiry for hash field using an absolute Unix timestamp (seconds)","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIREAT_History,0,HEXPIREAT_Tips,0,hexpireatCommand,-6,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HEXPIREAT_Keyspecs,1,NULL,4),.args=HEXPIREAT_Args},
 {MAKE_CMD("hexpiretime","Returns the expiration time of a hash field as a Unix timestamp, in seconds.","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXPIRETIME_History,0,HEXPIRETIME_Tips,0,hexpiretimeCommand,-5,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HEXPIRETIME_Keyspecs,1,NULL,2),.args=HEXPIRETIME_Args},
 {MAKE_CMD("hget","Returns the value of a field in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGET_History,0,HGET_Tips,0,hgetCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HGET_Keyspecs,1,NULL,2),.args=HGET_Args},
 {MAKE_CMD("hgetall","Returns all fields and values in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGETALL_History,0,HGETALL_Tips,1,hgetallCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HGETALL_Keyspecs,1,NULL,1),.args=HGETALL_Args},
@@ -11178,8 +11178,8 @@ struct COMMAND_STRUCT redisCommandTable[] = {
 {MAKE_CMD("hmget","Returns the values of all fields in a hash.","O(N) where N is the number of fields being requested.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HMGET_History,0,HMGET_Tips,0,hmgetCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HMGET_Keyspecs,1,NULL,2),.args=HMGET_Args},
 {MAKE_CMD("hmset","Sets the values of multiple fields.","O(N) where N is the number of fields being set.","2.0.0",CMD_DOC_DEPRECATED,"`HSET` with multiple field-value pairs","4.0.0","hash",COMMAND_GROUP_HASH,HMSET_History,0,HMSET_Tips,0,hsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HMSET_Keyspecs,1,NULL,2),.args=HMSET_Args},
 {MAKE_CMD("hpersist","Removes the expiration time for each specified field","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPERSIST_History,0,HPERSIST_Tips,0,hpersistCommand,-5,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HPERSIST_Keyspecs,1,NULL,2),.args=HPERSIST_Args},
-{MAKE_CMD("hpexpire","Set expiry for hash field using relative time to expire (milliseconds)","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPEXPIRE_History,0,HPEXPIRE_Tips,0,hpexpireCommand,-6,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HPEXPIRE_Keyspecs,1,NULL,4),.args=HPEXPIRE_Args},
-{MAKE_CMD("hpexpireat","Set expiry for hash field using an absolute Unix timestamp (milliseconds)","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPEXPIREAT_History,0,HPEXPIREAT_Tips,0,hpexpireatCommand,-6,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HPEXPIREAT_Keyspecs,1,NULL,4),.args=HPEXPIREAT_Args},
+{MAKE_CMD("hpexpire","Set expiry for hash field using relative time to expire (milliseconds)","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPEXPIRE_History,0,HPEXPIRE_Tips,0,hpexpireCommand,-6,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HPEXPIRE_Keyspecs,1,NULL,4),.args=HPEXPIRE_Args},
+{MAKE_CMD("hpexpireat","Set expiry for hash field using an absolute Unix timestamp (milliseconds)","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPEXPIREAT_History,0,HPEXPIREAT_Tips,0,hpexpireatCommand,-6,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HPEXPIREAT_Keyspecs,1,NULL,4),.args=HPEXPIREAT_Args},
 {MAKE_CMD("hpexpiretime","Returns the expiration time of a hash field as a Unix timestamp, in msec.","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPEXPIRETIME_History,0,HPEXPIRETIME_Tips,0,hpexpiretimeCommand,-5,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HPEXPIRETIME_Keyspecs,1,NULL,2),.args=HPEXPIRETIME_Args},
 {MAKE_CMD("hpttl","Returns the TTL in milliseconds of a hash field.","O(N) where N is the number of specified fields","7.4.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HPTTL_History,0,HPTTL_Tips,1,hpttlCommand,-5,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HPTTL_Keyspecs,1,NULL,2),.args=HPTTL_Args},
 {MAKE_CMD("hrandfield","Returns one or more random fields from a hash.","O(N) where N is the number of fields returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HRANDFIELD_History,0,HRANDFIELD_Tips,1,hrandfieldCommand,-2,CMD_READONLY,ACL_CATEGORY_HASH,HRANDFIELD_Keyspecs,1,NULL,2),.args=HRANDFIELD_Args},
diff --git a/src/commands/hexpire.json b/src/commands/hexpire.json
index 832c182aea2..a08a1d1f5dc 100644
--- a/src/commands/hexpire.json
+++ b/src/commands/hexpire.json
@@ -9,7 +9,6 @@
         "history": [],
         "command_flags": [
             "WRITE",
-            "DENYOOM",
             "FAST"
         ],
         "acl_categories": [
diff --git a/src/commands/hexpireat.json b/src/commands/hexpireat.json
index 4a7c0c71886..7e8ea33cad9 100644
--- a/src/commands/hexpireat.json
+++ b/src/commands/hexpireat.json
@@ -9,7 +9,6 @@
         "history": [],
         "command_flags": [
             "WRITE",
-            "DENYOOM",
             "FAST"
         ],
         "acl_categories": [
diff --git a/src/commands/hpexpire.json b/src/commands/hpexpire.json
index 02c68e61634..269eb4eace0 100644
--- a/src/commands/hpexpire.json
+++ b/src/commands/hpexpire.json
@@ -9,7 +9,6 @@
         "history": [],
         "command_flags": [
             "WRITE",
-            "DENYOOM",
             "FAST"
         ],
         "acl_categories": [
diff --git a/src/commands/hpexpireat.json b/src/commands/hpexpireat.json
index 58e5555fb5f..f5f1319270b 100644
--- a/src/commands/hpexpireat.json
+++ b/src/commands/hpexpireat.json
@@ -9,7 +9,6 @@
         "history": [],
         "command_flags": [
             "WRITE",
-            "DENYOOM",
             "FAST"
         ],
         "acl_categories": [

From 61551984c7c5f771cd4319691a8d0a5a18ffe9f4 Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Mon, 17 Feb 2025 15:42:07 +0800
Subject: [PATCH 10/24] Unify RedisModuleDefragFunc and
 RedisModuleTypeDefragFunc

---
 src/defrag.c               |  3 +--
 src/redismodule.h          |  5 +++--
 tests/modules/defragtest.c | 23 ++++++++++++++---------
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/defrag.c b/src/defrag.c
index 2df5a9394ad..4aec7df04cd 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -1225,8 +1225,7 @@ static doneStatus defragModuleGlobals(monotime endtime, void *ctx) {
     defrag_module_ctx->module_ctx.cursor = &defrag_module_ctx->cursor;
 
     /* Call the module's defrag callback function and check if more work remains. */
-    defrag_module_ctx->module->defrag_cb(&defrag_module_ctx->module_ctx);
-    if (defrag_module_ctx->cursor != 0)
+    if (defrag_module_ctx->module->defrag_cb(&defrag_module_ctx->module_ctx) != 0)
         return DEFRAG_NOT_DONE;
 
     return DEFRAG_DONE;
diff --git a/src/redismodule.h b/src/redismodule.h
index 54f778315a1..1e4873da5a3 100644
--- a/src/redismodule.h
+++ b/src/redismodule.h
@@ -839,7 +839,8 @@ typedef struct RedisModuleDefragCtx RedisModuleDefragCtx;
 /* Function pointers needed by both the core and modules, these needs to be
  * exposed since you can't cast a function pointer to (void *). */
 typedef void (*RedisModuleInfoFunc)(RedisModuleInfoCtx *ctx, int for_crash_report);
-typedef void (*RedisModuleDefragFunc)(RedisModuleDefragCtx *ctx);
+typedef int (*RedisModuleDefragFunc)(RedisModuleDefragCtx *ctx);
+typedef void (*RedisModuleDefragEventFunc)(RedisModuleDefragCtx *ctx);
 typedef void (*RedisModuleUserChangedFunc) (uint64_t client_id, void *privdata);
 
 /* ------------------------- End of common defines ------------------------ */
@@ -1305,7 +1306,7 @@ REDISMODULE_API int *(*RedisModule_GetCommandKeys)(RedisModuleCtx *ctx, RedisMod
 REDISMODULE_API int *(*RedisModule_GetCommandKeysWithFlags)(RedisModuleCtx *ctx, RedisModuleString **argv, int argc, int *num_keys, int **out_flags) REDISMODULE_ATTR;
 REDISMODULE_API const char *(*RedisModule_GetCurrentCommandName)(RedisModuleCtx *ctx) REDISMODULE_ATTR;
 REDISMODULE_API int (*RedisModule_RegisterDefragFunc)(RedisModuleCtx *ctx, RedisModuleDefragFunc func) REDISMODULE_ATTR;
-REDISMODULE_API int (*RedisModule_RegisterDefragCallbacks)(RedisModuleCtx *ctx, RedisModuleDefragFunc start, RedisModuleDefragFunc end) REDISMODULE_ATTR;
+REDISMODULE_API int (*RedisModule_RegisterDefragCallbacks)(RedisModuleCtx *ctx, RedisModuleDefragEventFunc start, RedisModuleDefragEventFunc end) REDISMODULE_ATTR;
 REDISMODULE_API void *(*RedisModule_DefragAlloc)(RedisModuleDefragCtx *ctx, void *ptr) REDISMODULE_ATTR;
 REDISMODULE_API void *(*RedisModule_DefragAllocRaw)(RedisModuleDefragCtx *ctx, size_t size) REDISMODULE_ATTR;
 REDISMODULE_API void (*RedisModule_DefragFreeRaw)(RedisModuleDefragCtx *ctx, void *ptr) REDISMODULE_ATTR;
diff --git a/tests/modules/defragtest.c b/tests/modules/defragtest.c
index 5b4ef27cded..7df149d3d5c 100644
--- a/tests/modules/defragtest.c
+++ b/tests/modules/defragtest.c
@@ -39,21 +39,26 @@ static void createGlobalStrings(RedisModuleCtx *ctx, unsigned long count)
     }
 }
 
-static void defragGlobalStrings(RedisModuleDefragCtx *ctx)
+static int defragGlobalStrings(RedisModuleDefragCtx *ctx)
 {
     unsigned long cursor = 0;
     RedisModule_DefragCursorGet(ctx, &cursor);
 
     RedisModule_Assert(cursor < global_strings_len);
-    RedisModuleString *new = RedisModule_DefragRedisModuleString(ctx, global_strings[cursor]);
-    global_attempts++;
-    if (new != NULL) {
-        global_strings[cursor] = new;
-        global_defragged++;
-    }
+    for (; cursor < global_strings_len; cursor++) {
+        RedisModuleString *new = RedisModule_DefragRedisModuleString(ctx, global_strings[cursor]);
+        global_attempts++;
+        if (new != NULL) {
+            global_strings[cursor] = new;
+            global_defragged++;
+        }
 
-    cursor++;
-    RedisModule_DefragCursorSet(ctx, cursor == global_strings_len ? 0 : cursor);
+        if (cursor % 16 == 0 && RedisModule_DefragShouldStop(ctx)) {
+            RedisModule_DefragCursorSet(ctx, cursor);
+            return 1;
+        }
+    }
+    return 0;
 }
 
 static void defragStart(RedisModuleDefragCtx *ctx) {

From e85ebc4573ec7166bd039b3d2cb80a0259d81fad Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Mon, 17 Feb 2025 23:09:15 +0800
Subject: [PATCH 11/24] Sync branch DefragRedisModuleDictDict

---
 tests/modules/defragtest.c      | 19 +++++++++----------
 tests/unit/moduleapi/defrag.tcl |  2 +-
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/tests/modules/defragtest.c b/tests/modules/defragtest.c
index 7df149d3d5c..cc4a4d28f73 100644
--- a/tests/modules/defragtest.c
+++ b/tests/modules/defragtest.c
@@ -21,10 +21,10 @@ unsigned long int datatype_defragged = 0;
 unsigned long int datatype_raw_defragged = 0;
 unsigned long int datatype_resumes = 0;
 unsigned long int datatype_wrong_cursor = 0;
-unsigned long int global_attempts = 0;
+unsigned long int global_strings_attempts = 0;
 unsigned long int defrag_started = 0;
 unsigned long int defrag_ended = 0;
-unsigned long int global_defragged = 0;
+unsigned long int global_strings_defragged = 0;
 
 unsigned long global_strings_len = 0;
 RedisModuleString **global_strings = NULL;
@@ -43,17 +43,16 @@ static int defragGlobalStrings(RedisModuleDefragCtx *ctx)
 {
     unsigned long cursor = 0;
     RedisModule_DefragCursorGet(ctx, &cursor);
-
     RedisModule_Assert(cursor < global_strings_len);
     for (; cursor < global_strings_len; cursor++) {
         RedisModuleString *new = RedisModule_DefragRedisModuleString(ctx, global_strings[cursor]);
-        global_attempts++;
+        global_strings_attempts++;
         if (new != NULL) {
             global_strings[cursor] = new;
-            global_defragged++;
+            global_strings_defragged++;
         }
 
-        if (cursor % 16 == 0 && RedisModule_DefragShouldStop(ctx)) {
+        if (RedisModule_DefragShouldStop(ctx)) {
             RedisModule_DefragCursorSet(ctx, cursor);
             return 1;
         }
@@ -80,8 +79,8 @@ static void FragInfo(RedisModuleInfoCtx *ctx, int for_crash_report) {
     RedisModule_InfoAddFieldLongLong(ctx, "datatype_raw_defragged", datatype_raw_defragged);
     RedisModule_InfoAddFieldLongLong(ctx, "datatype_resumes", datatype_resumes);
     RedisModule_InfoAddFieldLongLong(ctx, "datatype_wrong_cursor", datatype_wrong_cursor);
-    RedisModule_InfoAddFieldLongLong(ctx, "global_attempts", global_attempts);
-    RedisModule_InfoAddFieldLongLong(ctx, "global_defragged", global_defragged);
+    RedisModule_InfoAddFieldLongLong(ctx, "global_strings_attempts", global_strings_attempts);
+    RedisModule_InfoAddFieldLongLong(ctx, "global_strings_defragged", global_strings_defragged);
     RedisModule_InfoAddFieldLongLong(ctx, "defrag_started", defrag_started);
     RedisModule_InfoAddFieldLongLong(ctx, "defrag_ended", defrag_ended);
 }
@@ -109,8 +108,8 @@ static int fragResetStatsCommand(RedisModuleCtx *ctx, RedisModuleString **argv,
     datatype_raw_defragged = 0;
     datatype_resumes = 0;
     datatype_wrong_cursor = 0;
-    global_attempts = 0;
-    global_defragged = 0;
+    global_strings_attempts = 0;
+    global_strings_defragged = 0;
     defrag_started = 0;
     defrag_ended = 0;
 
diff --git a/tests/unit/moduleapi/defrag.tcl b/tests/unit/moduleapi/defrag.tcl
index 3f36ee1912b..e840305182a 100644
--- a/tests/unit/moduleapi/defrag.tcl
+++ b/tests/unit/moduleapi/defrag.tcl
@@ -46,7 +46,7 @@ start_server {tags {"modules"} overrides {{save ""}}} {
 
             after 2000
             set info [r info defragtest_stats]
-            assert {[getInfoProperty $info defragtest_global_attempts] > 0}
+            assert {[getInfoProperty $info defragtest_global_strings_attempts] > 0}
             assert_morethan [getInfoProperty $info defragtest_defrag_started] 0
             assert_morethan [getInfoProperty $info defragtest_defrag_ended] 0
         }

From 98e13b3f6d821ba61de1cf549399918493c33122 Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Tue, 18 Feb 2025 00:22:34 +0800
Subject: [PATCH 12/24] Add RedisModule_RegisterDefragFunc2 module api

Co-authored-by: oranagra <oran@redislabs.com>
---
 src/defrag.c               |  4 ++--
 src/module.c               | 10 ++++++++++
 src/redismodule.h          |  7 +++++--
 src/server.h               |  1 +
 tests/modules/defragtest.c |  2 +-
 5 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/defrag.c b/src/defrag.c
index 4aec7df04cd..570df9e5d16 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -1225,7 +1225,7 @@ static doneStatus defragModuleGlobals(monotime endtime, void *ctx) {
     defrag_module_ctx->module_ctx.cursor = &defrag_module_ctx->cursor;
 
     /* Call the module's defrag callback function and check if more work remains. */
-    if (defrag_module_ctx->module->defrag_cb(&defrag_module_ctx->module_ctx) != 0)
+    if (defrag_module_ctx->module->defrag_cb_2(&defrag_module_ctx->module_ctx) != 0)
         return DEFRAG_NOT_DONE;
 
     return DEFRAG_DONE;
@@ -1519,7 +1519,7 @@ static void beginDefragCycle(void) {
     dictEntry *de;
     while ((de = dictNext(di)) != NULL) {
         struct RedisModule *module = dictGetVal(de);
-        if (module->defrag_cb) {
+        if (module->defrag_cb_2) {
             defragModuleCtx *ctx = zmalloc(sizeof(defragModuleCtx));
             ctx->module = module;
             ctx->cursor = 0;
diff --git a/src/module.c b/src/module.c
index ef52d91cfd1..34c4fbecd12 100644
--- a/src/module.c
+++ b/src/module.c
@@ -2309,6 +2309,7 @@ void RM_SetModuleAttribs(RedisModuleCtx *ctx, const char *name, int ver, int api
     module->options = 0;
     module->info_cb = 0;
     module->defrag_cb = 0;
+    module->defrag_cb_2 = 0;
     module->defrag_start_cb = 0;
     module->defrag_end_cb = 0;
     module->loadmod = NULL;
@@ -13791,6 +13792,14 @@ int RM_RegisterDefragFunc(RedisModuleCtx *ctx, RedisModuleDefragFunc cb) {
     return REDISMODULE_OK;
 }
 
+/* Register a defrag callback for global data, i.e. anything that the module
+ * may allocate that is not tied to a specific data type.
+ */
+int RM_RegisterDefragFunc2(RedisModuleCtx *ctx, RedisModuleDefragFunc2 cb) {
+    ctx->module->defrag_cb_2 = cb;
+    return REDISMODULE_OK;
+}
+
 /* Register a defrag callbacks that will be called when defrag operation starts and ends.
  *
  * The callbacks are the same as `RM_RegisterDefragFunc` but the user
@@ -14361,6 +14370,7 @@ void moduleRegisterCoreAPI(void) {
     REGISTER_API(GetCurrentCommandName);
     REGISTER_API(GetTypeMethodVersion);
     REGISTER_API(RegisterDefragFunc);
+    REGISTER_API(RegisterDefragFunc2);
     REGISTER_API(RegisterDefragCallbacks);
     REGISTER_API(DefragAlloc);
     REGISTER_API(DefragAllocRaw);
diff --git a/src/redismodule.h b/src/redismodule.h
index 1e4873da5a3..775bf3b184c 100644
--- a/src/redismodule.h
+++ b/src/redismodule.h
@@ -839,7 +839,8 @@ typedef struct RedisModuleDefragCtx RedisModuleDefragCtx;
 /* Function pointers needed by both the core and modules, these needs to be
  * exposed since you can't cast a function pointer to (void *). */
 typedef void (*RedisModuleInfoFunc)(RedisModuleInfoCtx *ctx, int for_crash_report);
-typedef int (*RedisModuleDefragFunc)(RedisModuleDefragCtx *ctx);
+typedef void (*RedisModuleDefragFunc)(RedisModuleDefragCtx *ctx);
+typedef int (*RedisModuleDefragFunc2)(RedisModuleDefragCtx *ctx);
 typedef void (*RedisModuleDefragEventFunc)(RedisModuleDefragCtx *ctx);
 typedef void (*RedisModuleUserChangedFunc) (uint64_t client_id, void *privdata);
 
@@ -1306,7 +1307,8 @@ REDISMODULE_API int *(*RedisModule_GetCommandKeys)(RedisModuleCtx *ctx, RedisMod
 REDISMODULE_API int *(*RedisModule_GetCommandKeysWithFlags)(RedisModuleCtx *ctx, RedisModuleString **argv, int argc, int *num_keys, int **out_flags) REDISMODULE_ATTR;
 REDISMODULE_API const char *(*RedisModule_GetCurrentCommandName)(RedisModuleCtx *ctx) REDISMODULE_ATTR;
 REDISMODULE_API int (*RedisModule_RegisterDefragFunc)(RedisModuleCtx *ctx, RedisModuleDefragFunc func) REDISMODULE_ATTR;
-REDISMODULE_API int (*RedisModule_RegisterDefragCallbacks)(RedisModuleCtx *ctx, RedisModuleDefragEventFunc start, RedisModuleDefragEventFunc end) REDISMODULE_ATTR;
+REDISMODULE_API int (*RedisModule_RegisterDefragFunc2)(RedisModuleCtx *ctx, RedisModuleDefragFunc2 func) REDISMODULE_ATTR;
+REDISMODULE_API int (*RedisModule_RegisterDefragCallbacks)(RedisModuleCtx *ctx, RedisModuleDefragFunc start, RedisModuleDefragFunc end) REDISMODULE_ATTR;
 REDISMODULE_API void *(*RedisModule_DefragAlloc)(RedisModuleDefragCtx *ctx, void *ptr) REDISMODULE_ATTR;
 REDISMODULE_API void *(*RedisModule_DefragAllocRaw)(RedisModuleDefragCtx *ctx, size_t size) REDISMODULE_ATTR;
 REDISMODULE_API void (*RedisModule_DefragFreeRaw)(RedisModuleDefragCtx *ctx, void *ptr) REDISMODULE_ATTR;
@@ -1679,6 +1681,7 @@ static int RedisModule_Init(RedisModuleCtx *ctx, const char *name, int ver, int
     REDISMODULE_GET_API(GetCommandKeysWithFlags);
     REDISMODULE_GET_API(GetCurrentCommandName);
     REDISMODULE_GET_API(RegisterDefragFunc);
+    REDISMODULE_GET_API(RegisterDefragFunc2);
     REDISMODULE_GET_API(RegisterDefragCallbacks);
     REDISMODULE_GET_API(DefragAlloc);
     REDISMODULE_GET_API(DefragAllocRaw);
diff --git a/src/server.h b/src/server.h
index 8c10f6a4e6f..ffc1850a8cb 100644
--- a/src/server.h
+++ b/src/server.h
@@ -891,6 +891,7 @@ struct RedisModule {
     int blocked_clients;         /* Count of RedisModuleBlockedClient in this module. */
     RedisModuleInfoFunc info_cb; /* Callback for module to add INFO fields. */
     RedisModuleDefragFunc defrag_cb;    /* Callback for global data defrag. */
+    RedisModuleDefragFunc2 defrag_cb_2; /* Callback for global data defrag. */
     RedisModuleDefragFunc defrag_start_cb;    /* Callback indicating defrag started. */
     RedisModuleDefragFunc defrag_end_cb;      /* Callback indicating defrag ended. */
     struct moduleLoadQueueEntry *loadmod; /* Module load arguments for config rewrite. */
diff --git a/tests/modules/defragtest.c b/tests/modules/defragtest.c
index cc4a4d28f73..dc81caa4132 100644
--- a/tests/modules/defragtest.c
+++ b/tests/modules/defragtest.c
@@ -267,7 +267,7 @@ int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc)
         return REDISMODULE_ERR;
 
     RedisModule_RegisterInfoFunc(ctx, FragInfo);
-    RedisModule_RegisterDefragFunc(ctx, defragGlobalStrings);
+    RedisModule_RegisterDefragFunc2(ctx, defragGlobalStrings);
     RedisModule_RegisterDefragCallbacks(ctx, defragStart, defragEnd);
 
     return REDISMODULE_OK;

From 4f0f7a71a055ec94df78b734067efb39dff24ab5 Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Tue, 18 Feb 2025 00:28:00 +0800
Subject: [PATCH 13/24] Cleanup

---
 src/redismodule.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/redismodule.h b/src/redismodule.h
index 775bf3b184c..f7d8cc76af5 100644
--- a/src/redismodule.h
+++ b/src/redismodule.h
@@ -841,7 +841,6 @@ typedef struct RedisModuleDefragCtx RedisModuleDefragCtx;
 typedef void (*RedisModuleInfoFunc)(RedisModuleInfoCtx *ctx, int for_crash_report);
 typedef void (*RedisModuleDefragFunc)(RedisModuleDefragCtx *ctx);
 typedef int (*RedisModuleDefragFunc2)(RedisModuleDefragCtx *ctx);
-typedef void (*RedisModuleDefragEventFunc)(RedisModuleDefragCtx *ctx);
 typedef void (*RedisModuleUserChangedFunc) (uint64_t client_id, void *privdata);
 
 /* ------------------------- End of common defines ------------------------ */

From 0f41e1957a6c73402772757af10495a5fbe07c53 Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Tue, 18 Feb 2025 00:28:30 +0800
Subject: [PATCH 14/24] Add defragtest_global_strings_pauses to check string
 pauses

---
 tests/modules/defragtest.c      | 6 +++++-
 tests/unit/moduleapi/defrag.tcl | 3 ++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/modules/defragtest.c b/tests/modules/defragtest.c
index dc81caa4132..d436e56b07f 100644
--- a/tests/modules/defragtest.c
+++ b/tests/modules/defragtest.c
@@ -21,10 +21,11 @@ unsigned long int datatype_defragged = 0;
 unsigned long int datatype_raw_defragged = 0;
 unsigned long int datatype_resumes = 0;
 unsigned long int datatype_wrong_cursor = 0;
-unsigned long int global_strings_attempts = 0;
 unsigned long int defrag_started = 0;
 unsigned long int defrag_ended = 0;
+unsigned long int global_strings_attempts = 0;
 unsigned long int global_strings_defragged = 0;
+unsigned long int global_strings_pauses = 0;
 
 unsigned long global_strings_len = 0;
 RedisModuleString **global_strings = NULL;
@@ -53,6 +54,7 @@ static int defragGlobalStrings(RedisModuleDefragCtx *ctx)
         }
 
         if (RedisModule_DefragShouldStop(ctx)) {
+            global_strings_pauses++;
             RedisModule_DefragCursorSet(ctx, cursor);
             return 1;
         }
@@ -81,6 +83,7 @@ static void FragInfo(RedisModuleInfoCtx *ctx, int for_crash_report) {
     RedisModule_InfoAddFieldLongLong(ctx, "datatype_wrong_cursor", datatype_wrong_cursor);
     RedisModule_InfoAddFieldLongLong(ctx, "global_strings_attempts", global_strings_attempts);
     RedisModule_InfoAddFieldLongLong(ctx, "global_strings_defragged", global_strings_defragged);
+    RedisModule_InfoAddFieldLongLong(ctx, "global_strings_pauses", global_strings_pauses);
     RedisModule_InfoAddFieldLongLong(ctx, "defrag_started", defrag_started);
     RedisModule_InfoAddFieldLongLong(ctx, "defrag_ended", defrag_ended);
 }
@@ -110,6 +113,7 @@ static int fragResetStatsCommand(RedisModuleCtx *ctx, RedisModuleString **argv,
     datatype_wrong_cursor = 0;
     global_strings_attempts = 0;
     global_strings_defragged = 0;
+    global_strings_pauses = 0;
     defrag_started = 0;
     defrag_ended = 0;
 
diff --git a/tests/unit/moduleapi/defrag.tcl b/tests/unit/moduleapi/defrag.tcl
index e840305182a..9e7efb673cb 100644
--- a/tests/unit/moduleapi/defrag.tcl
+++ b/tests/unit/moduleapi/defrag.tcl
@@ -1,7 +1,7 @@
 set testmodule [file normalize tests/modules/defragtest.so]
 
 start_server {tags {"modules"} overrides {{save ""}}} {
-    r module load $testmodule 10000
+    r module load $testmodule 50000
     r config set hz 100
     r config set active-defrag-ignore-bytes 1
     r config set active-defrag-threshold-lower 0
@@ -47,6 +47,7 @@ start_server {tags {"modules"} overrides {{save ""}}} {
             after 2000
             set info [r info defragtest_stats]
             assert {[getInfoProperty $info defragtest_global_strings_attempts] > 0}
+            assert {[getInfoProperty $info defragtest_global_strings_pauses] > 0}
             assert_morethan [getInfoProperty $info defragtest_defrag_started] 0
             assert_morethan [getInfoProperty $info defragtest_defrag_ended] 0
         }

From 69879fa9c7acfb9067e8f903dc7d18914b214a0c Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Tue, 18 Feb 2025 00:29:35 +0800
Subject: [PATCH 15/24] Cleanup

---
 src/defrag.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/defrag.c b/src/defrag.c
index 570df9e5d16..42a69389ed1 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -1217,7 +1217,6 @@ static doneStatus defragLuaScripts(monotime endtime, void *ctx) {
 /* Handles defragmentation of module global data. This is a stage function
  * that gets called periodically during the active defragmentation process. */
 static doneStatus defragModuleGlobals(monotime endtime, void *ctx) {
-    UNUSED(endtime);
     defragModuleCtx *defrag_module_ctx = ctx;
 
     /* Set up context for the module's defrag callback. */

From 2e0ff3328a8c14f119facb8ebe430c0aedccb315 Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Tue, 18 Feb 2025 01:07:26 +0800
Subject: [PATCH 16/24] defragModuleCtx no longer stores module, which may be
 unlaoded midway

---
 src/defrag.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/defrag.c b/src/defrag.c
index 42a69389ed1..ab1f1db2945 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -112,9 +112,9 @@ typedef struct {
 static_assert(offsetof(defragPubSubCtx, kvstate) == 0, "defragStageKvstoreHelper requires this");
 
 typedef struct {
-    RedisModule *module;
     RedisModuleDefragCtx module_ctx;
     unsigned long cursor;
+    char module_name[];
 } defragModuleCtx;
 
 /* When scanning a main kvstore, large elements are queued for later handling rather than
@@ -1219,12 +1219,20 @@ static doneStatus defragLuaScripts(monotime endtime, void *ctx) {
 static doneStatus defragModuleGlobals(monotime endtime, void *ctx) {
     defragModuleCtx *defrag_module_ctx = ctx;
 
+    sds module_name = sdsnew(defrag_module_ctx->module_name);
+    RedisModule *module = moduleGetHandleByName(module_name);
+    sdsfree(module_name);
+    if (!module) {
+        /* Module has been unloaded, nothing to defrag. */
+        return DEFRAG_DONE;
+    }
+
     /* Set up context for the module's defrag callback. */
     defrag_module_ctx->module_ctx.endtime = endtime;
     defrag_module_ctx->module_ctx.cursor = &defrag_module_ctx->cursor;
 
     /* Call the module's defrag callback function and check if more work remains. */
-    if (defrag_module_ctx->module->defrag_cb_2(&defrag_module_ctx->module_ctx) != 0)
+    if (module->defrag_cb_2(&defrag_module_ctx->module_ctx) != 0)
         return DEFRAG_NOT_DONE;
 
     return DEFRAG_DONE;
@@ -1519,9 +1527,9 @@ static void beginDefragCycle(void) {
     while ((de = dictNext(di)) != NULL) {
         struct RedisModule *module = dictGetVal(de);
         if (module->defrag_cb_2) {
-            defragModuleCtx *ctx = zmalloc(sizeof(defragModuleCtx));
-            ctx->module = module;
+            defragModuleCtx *ctx = zmalloc(sizeof(defragModuleCtx) + strlen(module->name) + 1);
             ctx->cursor = 0;
+            redis_strlcpy(ctx->module_name,module->name,strlen(module->name)+1);
             addDefragStage(defragModuleGlobals, ctx);
         }
     }

From 9dfcdbbcd93fd293ef5c75eadeacf93772d6c311 Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Tue, 18 Feb 2025 02:13:07 +0800
Subject: [PATCH 17/24] Add support for both v1 and v2

---
 src/defrag.c | 19 +++++++++++++------
 src/server.h |  2 +-
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/defrag.c b/src/defrag.c
index ab1f1db2945..04b9e82acd7 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -1231,11 +1231,18 @@ static doneStatus defragModuleGlobals(monotime endtime, void *ctx) {
     defrag_module_ctx->module_ctx.endtime = endtime;
     defrag_module_ctx->module_ctx.cursor = &defrag_module_ctx->cursor;
 
-    /* Call the module's defrag callback function and check if more work remains. */
-    if (module->defrag_cb_2(&defrag_module_ctx->module_ctx) != 0)
-        return DEFRAG_NOT_DONE;
-
-    return DEFRAG_DONE;
+    /* Call appropriate version of module's defrag callback:
+     * 1. Version 2 (defrag_cb_2): Supports incremental defrag and returns whether more work is needed
+     * 2. Version 1 (defrag_cb): Legacy version, performs all work in one call.
+     *    Note: V1 doesn't support incremental defragmentation, may block for longer periods. */
+    if (module->defrag_cb_2) {
+        return module->defrag_cb_2(&defrag_module_ctx->module_ctx) ? DEFRAG_NOT_DONE : DEFRAG_DONE;
+    } else if (module->defrag_cb) {
+        module->defrag_cb(&defrag_module_ctx->module_ctx);
+        return DEFRAG_DONE;
+    } else {
+        redis_unreachable();
+    }
 }
 
 static void addDefragStage(defragStageFn stage_fn, void *ctx) {
@@ -1526,7 +1533,7 @@ static void beginDefragCycle(void) {
     dictEntry *de;
     while ((de = dictNext(di)) != NULL) {
         struct RedisModule *module = dictGetVal(de);
-        if (module->defrag_cb_2) {
+        if (module->defrag_cb || module->defrag_cb_2) {
             defragModuleCtx *ctx = zmalloc(sizeof(defragModuleCtx) + strlen(module->name) + 1);
             ctx->cursor = 0;
             redis_strlcpy(ctx->module_name,module->name,strlen(module->name)+1);
diff --git a/src/server.h b/src/server.h
index ffc1850a8cb..a52f70db41f 100644
--- a/src/server.h
+++ b/src/server.h
@@ -891,7 +891,7 @@ struct RedisModule {
     int blocked_clients;         /* Count of RedisModuleBlockedClient in this module. */
     RedisModuleInfoFunc info_cb; /* Callback for module to add INFO fields. */
     RedisModuleDefragFunc defrag_cb;    /* Callback for global data defrag. */
-    RedisModuleDefragFunc2 defrag_cb_2; /* Callback for global data defrag. */
+    RedisModuleDefragFunc2 defrag_cb_2; /* Version 2 callback for global data defrag. */
     RedisModuleDefragFunc defrag_start_cb;    /* Callback indicating defrag started. */
     RedisModuleDefragFunc defrag_end_cb;      /* Callback indicating defrag ended. */
     struct moduleLoadQueueEntry *loadmod; /* Module load arguments for config rewrite. */

From a202a999ae0b7975a15299b663641272cec128ce Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Tue, 18 Feb 2025 13:11:24 +0800
Subject: [PATCH 18/24] Style

---
 src/defrag.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/defrag.c b/src/defrag.c
index 8fd4fa51618..36471b4de0b 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -1244,6 +1244,14 @@ static doneStatus defragModuleGlobals(monotime endtime, void *ctx) {
     }
 }
 
+static void freeDefragKeysContext(void *ctx) {
+    defragKeysCtx *defrag_keys_ctx = ctx;
+    if (defrag_keys_ctx->defrag_later) {
+        listRelease(defrag_keys_ctx->defrag_later);
+    }
+    zfree(defrag_keys_ctx);
+}
+
 static void freeDefragModelContext(void *ctx) {
     defragModuleCtx *defrag_model_ctx = ctx;
     sdsfree(defrag_model_ctx->module_name);
@@ -1258,14 +1266,6 @@ static void freeDefragStage(void *ptr) {
     zfree(stage);
 }
 
-static void freeDefragKeysContext(void *ctx) {
-    defragKeysCtx *defrag_keys_ctx = ctx;
-    if (defrag_keys_ctx->defrag_later) {
-        listRelease(defrag_keys_ctx->defrag_later);
-    }
-    zfree(defrag_keys_ctx);
-}
-
 static void addDefragStage(defragStageFn stage_fn, defragStageContextFreeFn ctx_free_fn, void *ctx) {
     StageDescriptor *stage = zmalloc(sizeof(StageDescriptor));
     stage->stage_fn = stage_fn;

From 4cbe954960b2336453ee32eb27e0eb3147c20cd9 Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Tue, 18 Feb 2025 14:24:50 +0800
Subject: [PATCH 19/24] Fix RedisModuleDefragCtx calloc

---
 src/defrag.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/defrag.c b/src/defrag.c
index 36471b4de0b..647b2ed2f82 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -1546,7 +1546,7 @@ static void beginDefragCycle(void) {
             defragModuleCtx *ctx = zmalloc(sizeof(defragModuleCtx));
             ctx->cursor = 0;
             ctx->module_name = sdsnew(module->name);
-            ctx->module_ctx = zcalloc(sizeof(defragModuleCtx));
+            ctx->module_ctx = zcalloc(sizeof(RedisModuleDefragCtx));
             addDefragStage(defragModuleGlobals, freeDefragModelContext, ctx);
         }
     }

From 051705fcec276c342af51d16ffff0239a8dc3451 Mon Sep 17 00:00:00 2001
From: "debing.sun" <sundbcn@gmail.com>
Date: Tue, 18 Feb 2025 15:26:28 +0800
Subject: [PATCH 20/24] Update src/module.c

Co-authored-by: Oran Agra <oran@redislabs.com>
---
 src/module.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/module.c b/src/module.c
index 34c4fbecd12..f19f853baa5 100644
--- a/src/module.c
+++ b/src/module.c
@@ -13794,6 +13794,9 @@ int RM_RegisterDefragFunc(RedisModuleCtx *ctx, RedisModuleDefragFunc cb) {
 
 /* Register a defrag callback for global data, i.e. anything that the module
  * may allocate that is not tied to a specific data type.
+ * This is a more advanced version of RM_RegisterDefragFunc, in that it takes
+ * a callbacks that has a return value, and can use RM_DefragShouldStop
+ * in and indicate that it should be called again later, or is it done (returned 0).
  */
 int RM_RegisterDefragFunc2(RedisModuleCtx *ctx, RedisModuleDefragFunc2 cb) {
     ctx->module->defrag_cb_2 = cb;

From c5f91abaf70c4af035ec4cce13db86e3a584c28f Mon Sep 17 00:00:00 2001
From: Yunxiao Du <joker@jackdu.cn>
Date: Wed, 19 Feb 2025 10:58:14 +0800
Subject: [PATCH 21/24] Fix syntax issue in comments of src/module.c (#13802)

closes https://github.com/redis/redis/issues/13797, just fix syntax
issue in comments instead of real code.
---
 src/module.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/module.c b/src/module.c
index 12c7788da3c..59ae99e70e8 100644
--- a/src/module.c
+++ b/src/module.c
@@ -11258,7 +11258,7 @@ static void moduleScanKeyCallback(void *privdata, const dictEntry *de) {
  * The way it should be used:
  *
  *      RedisModuleScanCursor *c = RedisModule_ScanCursorCreate();
- *      RedisModuleKey *key = RedisModule_OpenKey(...)
+ *      RedisModuleKey *key = RedisModule_OpenKey(...);
  *      while(RedisModule_ScanKey(key, c, callback, privateData));
  *      RedisModule_CloseKey(key);
  *      RedisModule_ScanCursorDestroy(c);
@@ -11268,13 +11268,13 @@ static void moduleScanKeyCallback(void *privdata, const dictEntry *de) {
  *
  *      RedisModuleScanCursor *c = RedisModule_ScanCursorCreate();
  *      RedisModule_ThreadSafeContextLock(ctx);
- *      RedisModuleKey *key = RedisModule_OpenKey(...)
+ *      RedisModuleKey *key = RedisModule_OpenKey(...);
  *      while(RedisModule_ScanKey(ctx, c, callback, privateData)){
  *          RedisModule_CloseKey(key);
  *          RedisModule_ThreadSafeContextUnlock(ctx);
  *          // do some background job
  *          RedisModule_ThreadSafeContextLock(ctx);
- *          RedisModuleKey *key = RedisModule_OpenKey(...)
+ *          key = RedisModule_OpenKey(...);
  *      }
  *      RedisModule_CloseKey(key);
  *      RedisModule_ScanCursorDestroy(c);

From b045fe4e174eb90e110d0cb1c34e0fdc61ed9cbf Mon Sep 17 00:00:00 2001
From: luozongle01 <luozongle@hotmail.com>
Date: Wed, 19 Feb 2025 11:01:15 +0800
Subject: [PATCH 22/24] Fix overflow on 32-bit systems when calculating idle
 time for eviction (#13804)

the `dictGetSignedIntegerVal` function should be used here,
because in some cases (especially on 32-bit systems) long may
be 4 bytes, and the ttl time saved in expires is a unix timestamp
(millisecond value), which is more than 4 bytes. In this case, we may
not be able to get the correct idle time, which may cause eviction
disorder, in other words, keys that should be evicted later may be
evicted earlier.
---
 src/evict.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evict.c b/src/evict.c
index 059e82fe372..fe19ca17791 100644
--- a/src/evict.c
+++ b/src/evict.c
@@ -162,7 +162,7 @@ int evictionPoolPopulate(redisDb *db, kvstore *samplekvs, struct evictionPoolEnt
             idle = 255-LFUDecrAndReturn(o);
         } else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) {
             /* In this case the sooner the expire the better. */
-            idle = ULLONG_MAX - (long)dictGetVal(de);
+            idle = ULLONG_MAX - dictGetSignedIntegerVal(de);
         } else {
             serverPanic("Unknown eviction policy in evictionPoolPopulate()");
         }

From 66df58f9616618fb519d6d74b7b643442051d43e Mon Sep 17 00:00:00 2001
From: guybe7 <guy.benoish@redislabs.com>
Date: Wed, 19 Feb 2025 15:04:28 +0700
Subject: [PATCH 23/24] Do not send NL if replica client is already closed
 (#13813)

In case a replica connection was closed mid-RDB, we should not send a \n
to that replica, otherwise, it may reach the replica BEFORE it realizes
that the RDB transfer failed, causing it to treat the \n as if it was
read from the RDB stream
---
 src/replication.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/replication.c b/src/replication.c
index 1c2351c0f76..611913c9215 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -4581,7 +4581,7 @@ void replicationCron(void) {
             (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END &&
              server.rdb_child_type != RDB_CHILD_TYPE_SOCKET));
 
-        if (is_presync) {
+        if (is_presync && !(slave->flags & CLIENT_CLOSE_ASAP)) {
             connWrite(slave->conn, "\n", 1);
         }
     }

From 725cd268e652cd3c9f0047c3cfe23a911e565d8a Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Thu, 20 Feb 2025 00:05:24 +0800
Subject: [PATCH 24/24] Refactor of ActiveDefrag to reduce latencies (#13814)

This PR is based on: https://github.com/valkey-io/valkey/pull/1462

## Issue/Problems

Duty Cycle: Active Defrag has configuration values which determine the
intended percentage of CPU to be used based on a gradient of the
fragmentation percentage. However, Active Defrag performs its work on
the 100ms serverCron timer. It then computes a duty cycle and performs a
single long cycle. For example, if the intended CPU is computed to be
10%, Active Defrag will perform 10ms of work on this 100ms timer cron.

* This type of cycle introduces large latencies on the client (up to
25ms with default configurations)
* This mechanism is subject to starvation when slow commands delay the
serverCron

Maintainability: The current Active Defrag code is difficult to read &
maintain. Refactoring of the high level control mechanisms and functions
will allow us to more seamlessly adapt to new defragmentation needs.
Specific examples include:

* A single function (activeDefragCycle) includes the logic to
start/stop/modify the defragmentation as well as performing one "step"
of the defragmentation. This should be separated out, so that the actual
defrag activity can be performed on an independent timer (see duty cycle
above).
* The code is focused on kvstores, with other actions just thrown in at
the end (defragOtherGlobals). There's no mechanism to break this up to
reduce latencies.
* For the main dictionary (only), there is a mechanism to set aside
large keys to be processed in a later step. However this code creates a
separate list in each kvstore (main dict or not), bleeding/exposing
internal defrag logic. We only need 1 list - inside defrag. This logic
should be more contained for the main key store.
* The structure is not well suited towards other non-main-dictionary
items. For example, pub-sub and pub-sub-shard was added, but it's added
in such a way that in CMD mode, with multiple DBs, we will defrag
pub-sub repeatedly after each DB.

## Description of the feature

Primarily, this feature will split activeDefragCycle into 2 functions.

1. One function will be called from serverCron to determine if a defrag
cycle (a complete scan) needs to be started. It will also determine if
the CPU expenditure needs to be adjusted.
2. The 2nd function will be a timer proc dedicated to performing defrag.
This will be invoked independently from serverCron.

Once the functions are split, there is more control over the latency
created by the defrag process. A new configuration will be used to
determine the running time for the defrag timer proc. The default for
this will be 500us (one-half of the current minimum time). Then the
timer will be adjusted to achieve the desired CPU. As an example, 5% of
CPU will run the defrag process for 500us every 10ms. This is much
better than running for 5ms every 100ms.

The timer function will also adjust to compensate for starvation. If a
slow command delays the timer, the process will run proportionately
longer to ensure that the configured CPU is achieved. Given the presence
of slow commands, the proportional extra time is insignificant to
latency. This also addresses the overload case. At 100% CPU, if the
event loop slows, defrag will run proportionately longer to achieve the
configured CPU utilization.

Optionally, in low CPU situations, there would be little impact in
utilizing more than the configured CPU. We could optionally allow the
timer to pop more often (even with a 0ms delay) and the (tail) latency
impact would not change.

And we add a time limit for the defrag duty cycle to prevent excessive
latency. When latency is already high (indicated by a long time between
calls), we don't want to make it worse by running defrag for too long.

Addressing maintainability:

* The basic code structure can more clearly be organized around a
"cycle".
* Have clear begin/end functions and a set of "stages" to be executed.
* Rather than stages being limited to "kvstore" type data, a cycle
should be more flexible, incorporating the ability to incrementally
perform arbitrary work. This will likely be necessary in the future for
certain module types. It can be used today to address oddballs like
defragOtherGlobals.
* We reduced some of the globals, and reduce some of the coupling.
defrag_later should be removed from serverDb.
* Each stage should begin on a fresh cycle. So if there are
non-time-bounded operations like kvstoreDictLUTDefrag, these would be
less likely to introduce additional latency.


Signed-off-by: Jim Brunner
[brunnerj@amazon.com](mailto:brunnerj@amazon.com)
Signed-off-by: Madelyn Olson
[madelyneolson@gmail.com](mailto:madelyneolson@gmail.com)
Co-authored-by: Madelyn Olson
[madelyneolson@gmail.com](mailto:madelyneolson@gmail.com)

---------

Signed-off-by: Jim Brunner brunnerj@amazon.com
Signed-off-by: Madelyn Olson madelyneolson@gmail.com
Co-authored-by: Madelyn Olson madelyneolson@gmail.com
Co-authored-by: ShooterIT <wangyuancode@163.com>
---
 src/defrag.c                 | 947 ++++++++++++++++++++++-------------
 src/kvstore.c                |  23 +-
 src/kvstore.h                |  15 +-
 src/module.c                 |  11 +-
 src/server.c                 |  22 +-
 src/server.h                 |   4 +-
 tests/unit/memefficiency.tcl |  65 ++-
 7 files changed, 678 insertions(+), 409 deletions(-)

diff --git a/src/defrag.c b/src/defrag.c
index f25e102d51d..4766e16370b 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -8,8 +8,13 @@
  * Copyright (c) 2020-Present, Redis Ltd.
  * All rights reserved.
  *
+ * Copyright (c) 2024-present, Valkey contributors.
+ * All rights reserved.
+ *
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2) or the Server Side Public License v1 (SSPLv1).
+ *
+ * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information.
  */
 
 #include "server.h"
@@ -18,15 +23,106 @@
 
 #ifdef HAVE_DEFRAG
 
-typedef struct defragCtx {
-    void *privdata;
+#define DEFRAG_CYCLE_US 500 /* Standard duration of defrag cycle (in microseconds) */
+
+typedef enum { DEFRAG_NOT_DONE = 0,
+               DEFRAG_DONE = 1 } doneStatus;
+
+/*
+ * Defragmentation is performed in stages. Each stage is serviced by a stage function
+ * (defragStageFn). The stage function is passed a context (void*) to defrag. The contents of that
+ * context are unique to the particular stage - and may even be NULL for some stage functions. The
+ * same stage function can be used multiple times (for different stages) each having a different
+ * context.
+ *
+ * Parameters:
+ *  endtime     - This is the monotonic time that the function should end and return. This ensures
+ *                a bounded latency due to defrag.
+ *  ctx         - A pointer to context which is unique to the stage function.
+ *
+ * Returns:
+ *  - DEFRAG_DONE if the stage is complete
+ *  - DEFRAG_NOT_DONE if there is more work to do
+ */
+typedef doneStatus (*defragStageFn)(void *ctx, monotime endtime);
+
+/* Function pointer type for freeing context in defragmentation stages. */
+typedef void (*defragStageContextFreeFn)(void *ctx);
+typedef struct {
+    defragStageFn stage_fn; /* The function to be invoked for the stage */
+    defragStageContextFreeFn ctx_free_fn; /* Function to free the context */
+    void *ctx; /* Context, unique to the stage function */
+} StageDescriptor;
+
+/* Globals needed for the main defrag processing logic.
+ * Doesn't include variables specific to a stage or type of data. */
+struct DefragContext {
+    monotime start_cycle;           /* Time of beginning of defrag cycle */
+    long long start_defrag_hits;    /* server.stat_active_defrag_hits captured at beginning of cycle */
+    long long start_defrag_misses;  /* server.stat_active_defrag_misses captured at beginning of cycle */
+    float start_frag_pct;           /* Fragmention percent of beginning of defrag cycle */
+    float decay_rate;               /* Defrag speed decay rate */
+
+    list *remaining_stages;         /* List of stages which remain to be processed */
+    listNode *current_stage;        /* The list node of stage that's currently being processed */
+
+    long long timeproc_id;      /* Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) */
+    monotime timeproc_end_time; /* Ending time of previous timerproc execution */
+    long timeproc_overage_us;   /* A correction value if over target CPU percent */
+};
+static struct DefragContext defrag = {0, 0, 0, 0, 1.0f};
+
+/* There are a number of stages which process a kvstore. To simplify this, a stage helper function
+ * `defragStageKvstoreHelper()` is defined. This function aids in iterating over the kvstore. It
+ * uses these definitions.
+ */
+/* State of the kvstore helper. The context passed to the kvstore helper MUST BEGIN
+ * with a kvstoreIterState (or be passed as NULL). */
+#define KVS_SLOT_DEFRAG_LUT -2
+#define KVS_SLOT_UNASSIGNED -1
+typedef struct {
+    kvstore *kvs;
     int slot;
-} defragCtx;
+    unsigned long cursor;
+} kvstoreIterState;
+#define INIT_KVSTORE_STATE(kvs) ((kvstoreIterState){(kvs), KVS_SLOT_DEFRAG_LUT, 0})
+
+/* The kvstore helper uses this function to perform tasks before continuing the iteration. For the
+ * main dictionary, large items are set aside and processed by this function before continuing with
+ * iteration over the kvstore.
+ *  endtime     - This is the monotonic time that the function should end and return.
+ *  ctx         - Context for functions invoked by the helper. If provided in the call to
+ *                `defragStageKvstoreHelper()`, the `kvstoreIterState` portion (at the beginning)
+ *                will be updated with the current kvstore iteration status.
+ *
+ * Returns:
+ *  - DEFRAG_DONE if the pre-continue work is complete
+ *  - DEFRAG_NOT_DONE if there is more work to do
+ */
+typedef doneStatus (*kvstoreHelperPreContinueFn)(void *ctx, monotime endtime);
 
-typedef struct defragPubSubCtx {
-    kvstore *pubsub_channels;
-    dict *(*clientPubSubChannels)(client*);
+typedef struct {
+    kvstoreIterState kvstate;
+    int dbid;
+
+    /* When scanning a main kvstore, large elements are queued for later handling rather than
+     * causing a large latency spike while processing a hash table bucket. This list is only used
+     * for stage: "defragStageDbKeys". It will only contain values for the current kvstore being
+     * defragged.
+     * Note that this is a list of key names. It's possible that the key may be deleted or modified
+     * before "later" and we will search by key name to find the entry when we defrag the item later. */
+    list *defrag_later;
+    unsigned long defrag_later_cursor;
+} defragKeysCtx;
+static_assert(offsetof(defragKeysCtx, kvstate) == 0, "defragStageKvstoreHelper requires this");
+
+/* Context for pubsub kvstores */
+typedef dict *(*getClientChannelsFn)(client *);
+typedef struct {
+    kvstoreIterState kvstate;
+    getClientChannelsFn getPubSubChannels;
 } defragPubSubCtx;
+static_assert(offsetof(defragPubSubCtx, kvstate) == 0, "defragStageKvstoreHelper requires this");
 
 /* this method was added to jemalloc in order to help us understand which
  * pointers are worthwhile moving and which aren't */
@@ -336,36 +432,6 @@ void activeDefragHfieldDict(dict *d) {
 }
 
 /* Defrag a list of ptr, sds or robj string values */
-void activeDefragList(list *l, int val_type) {
-    listNode *ln, *newln;
-    for (ln = l->head; ln; ln = ln->next) {
-        if ((newln = activeDefragAlloc(ln))) {
-            if (newln->prev)
-                newln->prev->next = newln;
-            else
-                l->head = newln;
-            if (newln->next)
-                newln->next->prev = newln;
-            else
-                l->tail = newln;
-            ln = newln;
-        }
-        if (val_type == DEFRAG_SDS_DICT_VAL_IS_SDS) {
-            sds newsds, sdsele = ln->value;
-            if ((newsds = activeDefragSds(sdsele)))
-                ln->value = newsds;
-        } else if (val_type == DEFRAG_SDS_DICT_VAL_IS_STROB) {
-            robj *newele, *ele = ln->value;
-            if ((newele = activeDefragStringOb(ele)))
-                ln->value = newele;
-        } else if (val_type == DEFRAG_SDS_DICT_VAL_VOID_PTR) {
-            void *newptr, *ptr = ln->value;
-            if ((newptr = activeDefragAlloc(ptr)))
-                ln->value = newptr;
-        }
-    }
-}
-
 void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) {
     quicklistNode *newnode, *node = *node_ref;
     unsigned char *newzl;
@@ -395,13 +461,18 @@ void activeDefragQuickListNodes(quicklist *ql) {
 /* when the value has lots of elements, we want to handle it later and not as
  * part of the main dictionary scan. this is needed in order to prevent latency
  * spikes when handling large items */
-void defragLater(redisDb *db, dictEntry *kde) {
+void defragLater(defragKeysCtx *ctx, dictEntry *kde) {
+    if (!ctx->defrag_later) {
+        ctx->defrag_later = listCreate();
+        listSetFreeMethod(ctx->defrag_later, sdsfreegeneric);
+        ctx->defrag_later_cursor = 0;
+    }
     sds key = sdsdup(dictGetKey(kde));
-    listAddNodeTail(db->defrag_later, key);
+    listAddNodeTail(ctx->defrag_later, key);
 }
 
 /* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */
-long scanLaterList(robj *ob, unsigned long *cursor, long long endtime) {
+long scanLaterList(robj *ob, unsigned long *cursor, monotime endtime) {
     quicklist *ql = ob->ptr;
     quicklistNode *node;
     long iterations = 0;
@@ -427,7 +498,7 @@ long scanLaterList(robj *ob, unsigned long *cursor, long long endtime) {
         activeDefragQuickListNode(ql, &node);
         server.stat_active_defrag_scanned++;
         if (++iterations > 128 && !bookmark_failed) {
-            if (ustime() > endtime) {
+            if (getMonotonicUs() > endtime) {
                 if (!quicklistBookmarkCreate(&ql, "_AD", node)) {
                     bookmark_failed = 1;
                 } else {
@@ -495,19 +566,19 @@ void scanLaterHash(robj *ob, unsigned long *cursor) {
     *cursor = dictScanDefrag(d, *cursor, activeDefragHfieldDictCallback, &defragfns, d);
 }
 
-void defragQuicklist(redisDb *db, dictEntry *kde) {
+void defragQuicklist(defragKeysCtx *ctx, dictEntry *kde) {
     robj *ob = dictGetVal(kde);
     quicklist *ql = ob->ptr, *newql;
     serverAssert(ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST);
     if ((newql = activeDefragAlloc(ql)))
         ob->ptr = ql = newql;
     if (ql->len > server.active_defrag_max_scan_fields)
-        defragLater(db, kde);
+        defragLater(ctx, kde);
     else
         activeDefragQuickListNodes(ql);
 }
 
-void defragZsetSkiplist(redisDb *db, dictEntry *kde) {
+void defragZsetSkiplist(defragKeysCtx *ctx, dictEntry *kde) {
     robj *ob = dictGetVal(kde);
     zset *zs = (zset*)ob->ptr;
     zset *newzs;
@@ -523,7 +594,7 @@ void defragZsetSkiplist(redisDb *db, dictEntry *kde) {
     if ((newheader = activeDefragAlloc(zs->zsl->header)))
         zs->zsl->header = newheader;
     if (dictSize(zs->dict) > server.active_defrag_max_scan_fields)
-        defragLater(db, kde);
+        defragLater(ctx, kde);
     else {
         dictIterator *di = dictGetIterator(zs->dict);
         while((de = dictNext(di)) != NULL) {
@@ -536,13 +607,13 @@ void defragZsetSkiplist(redisDb *db, dictEntry *kde) {
         zs->dict = newdict;
 }
 
-void defragHash(redisDb *db, dictEntry *kde) {
+void defragHash(defragKeysCtx *ctx, dictEntry *kde) {
     robj *ob = dictGetVal(kde);
     dict *d, *newd;
     serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HT);
     d = ob->ptr;
     if (dictSize(d) > server.active_defrag_max_scan_fields)
-        defragLater(db, kde);
+        defragLater(ctx, kde);
     else
         activeDefragHfieldDict(d);
     /* defrag the dict struct and tables */
@@ -550,13 +621,13 @@ void defragHash(redisDb *db, dictEntry *kde) {
         ob->ptr = newd;
 }
 
-void defragSet(redisDb *db, dictEntry *kde) {
+void defragSet(defragKeysCtx *ctx, dictEntry *kde) {
     robj *ob = dictGetVal(kde);
     dict *d, *newd;
     serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT);
     d = ob->ptr;
     if (dictSize(d) > server.active_defrag_max_scan_fields)
-        defragLater(db, kde);
+        defragLater(ctx, kde);
     else
         activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL);
     /* defrag the dict struct and tables */
@@ -576,7 +647,7 @@ int defragRaxNode(raxNode **noderef) {
 }
 
 /* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */
-int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, long long endtime) {
+int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, monotime endtime) {
     static unsigned char last[sizeof(streamID)];
     raxIterator ri;
     long iterations = 0;
@@ -613,7 +684,7 @@ int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, long long endtime)
             raxSetData(ri.node, ri.data=newdata);
         server.stat_active_defrag_scanned++;
         if (++iterations > 128) {
-            if (ustime() > endtime) {
+            if (getMonotonicUs() > endtime) {
                 serverAssert(ri.key_len==sizeof(last));
                 memcpy(last,ri.key,ri.key_len);
                 raxStop(&ri);
@@ -703,7 +774,7 @@ void* defragStreamConsumerGroup(raxIterator *ri, void *privdata) {
     return NULL;
 }
 
-void defragStream(redisDb *db, dictEntry *kde) {
+void defragStream(defragKeysCtx *ctx, dictEntry *kde) {
     robj *ob = dictGetVal(kde);
     serverAssert(ob->type == OBJ_STREAM && ob->encoding == OBJ_ENCODING_STREAM);
     stream *s = ob->ptr, *news;
@@ -716,7 +787,7 @@ void defragStream(redisDb *db, dictEntry *kde) {
         rax *newrax = activeDefragAlloc(s->rax);
         if (newrax)
             s->rax = newrax;
-        defragLater(db, kde);
+        defragLater(ctx, kde);
     } else
         defragRadixTree(&s->rax, 1, NULL, NULL);
 
@@ -727,24 +798,25 @@ void defragStream(redisDb *db, dictEntry *kde) {
 /* Defrag a module key. This is either done immediately or scheduled
  * for later. Returns then number of pointers defragged.
  */
-void defragModule(redisDb *db, dictEntry *kde) {
+void defragModule(defragKeysCtx *ctx, redisDb *db, dictEntry *kde) {
     robj *obj = dictGetVal(kde);
     serverAssert(obj->type == OBJ_MODULE);
     robj keyobj;
     initStaticStringObject(keyobj, dictGetKey(kde));
     if (!moduleDefragValue(&keyobj, obj, db->id))
-        defragLater(db, kde);
+        defragLater(ctx, kde);
 }
 
 /* for each key we scan in the main dict, this function will attempt to defrag
  * all the various pointers it has. */
-void defragKey(defragCtx *ctx, dictEntry *de) {
+void defragKey(defragKeysCtx *ctx, dictEntry *de) {
+    redisDb *db = &server.db[ctx->dbid];
+    int slot = ctx->kvstate.slot;
     sds keysds = dictGetKey(de);
     robj *newob, *ob = dictGetVal(de);
     unsigned char *newzl;
     sds newsds;
-    redisDb *db = ctx->privdata;
-    int slot = ctx->slot;
+
     /* Try to defrag the key name. */
     newsds = activeDefragSds(keysds);
     if (newsds) {
@@ -781,7 +853,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) {
         /* Already handled in activeDefragStringOb. */
     } else if (ob->type == OBJ_LIST) {
         if (ob->encoding == OBJ_ENCODING_QUICKLIST) {
-            defragQuicklist(db, de);
+            defragQuicklist(ctx, de);
         } else if (ob->encoding == OBJ_ENCODING_LISTPACK) {
             if ((newzl = activeDefragAlloc(ob->ptr)))
                 ob->ptr = newzl;
@@ -790,7 +862,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) {
         }
     } else if (ob->type == OBJ_SET) {
         if (ob->encoding == OBJ_ENCODING_HT) {
-            defragSet(db, de);
+            defragSet(ctx, de);
         } else if (ob->encoding == OBJ_ENCODING_INTSET ||
                    ob->encoding == OBJ_ENCODING_LISTPACK)
         {
@@ -805,7 +877,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) {
             if ((newzl = activeDefragAlloc(ob->ptr)))
                 ob->ptr = newzl;
         } else if (ob->encoding == OBJ_ENCODING_SKIPLIST) {
-            defragZsetSkiplist(db, de);
+            defragZsetSkiplist(ctx, de);
         } else {
             serverPanic("Unknown sorted set encoding");
         }
@@ -820,23 +892,23 @@ void defragKey(defragCtx *ctx, dictEntry *de) {
             if ((newzl = activeDefragAlloc(lpt->lp)))
                 lpt->lp = newzl;
         } else if (ob->encoding == OBJ_ENCODING_HT) {
-            defragHash(db, de);
+            defragHash(ctx, de);
         } else {
             serverPanic("Unknown hash encoding");
         }
     } else if (ob->type == OBJ_STREAM) {
-        defragStream(db, de);
+        defragStream(ctx, de);
     } else if (ob->type == OBJ_MODULE) {
-        defragModule(db, de);
+        defragModule(ctx,db, de);
     } else {
         serverPanic("Unknown object type");
     }
 }
 
 /* Defrag scan callback for the main db dictionary. */
-void defragScanCallback(void *privdata, const dictEntry *de) {
+static void dbKeysScanCallback(void *privdata, const dictEntry *de) {
     long long hits_before = server.stat_active_defrag_hits;
-    defragKey((defragCtx*)privdata, (dictEntry*)de);
+    defragKey((defragKeysCtx *)privdata, (dictEntry *)de);
     if (server.stat_active_defrag_hits != hits_before)
         server.stat_active_defrag_key_hits++;
     else
@@ -880,9 +952,8 @@ float getAllocatorFragmentation(size_t *out_frag_bytes) {
 
 /* Defrag scan callback for the pubsub dictionary. */
 void defragPubsubScanCallback(void *privdata, const dictEntry *de) {
-    defragCtx *ctx = privdata;
-    defragPubSubCtx *pubsub_ctx = ctx->privdata;
-    kvstore *pubsub_channels = pubsub_ctx->pubsub_channels;
+    defragPubSubCtx *ctx = privdata;
+    kvstore *pubsub_channels = ctx->kvstate.kvs;
     robj *newchannel, *channel = dictGetKey(de);
     dict *newclients, *clients = dictGetVal(de);
 
@@ -890,7 +961,7 @@ void defragPubsubScanCallback(void *privdata, const dictEntry *de) {
     serverAssert(channel->refcount == (int)dictSize(clients) + 1);
     newchannel = activeDefragStringObEx(channel, dictSize(clients) + 1);
     if (newchannel) {
-        kvstoreDictSetKey(pubsub_channels, ctx->slot, (dictEntry*)de, newchannel);
+        kvstoreDictSetKey(pubsub_channels, ctx->kvstate.slot, (dictEntry*)de, newchannel);
 
         /* The channel name is shared by the client's pubsub(shard) and server's
          * pubsub(shard), after defraging the channel name, we need to update
@@ -899,36 +970,24 @@ void defragPubsubScanCallback(void *privdata, const dictEntry *de) {
         dictEntry *clientde;
         while((clientde = dictNext(di)) != NULL) {
             client *c = dictGetKey(clientde);
-            dictEntry *pubsub_channel = dictFind(pubsub_ctx->clientPubSubChannels(c), newchannel);
+            dict *client_channels = ctx->getPubSubChannels(c);
+            dictEntry *pubsub_channel = dictFind(client_channels, newchannel);
             serverAssert(pubsub_channel);
-            dictSetKey(pubsub_ctx->clientPubSubChannels(c), pubsub_channel, newchannel);
+            dictSetKey(ctx->getPubSubChannels(c), pubsub_channel, newchannel);
         }
         dictReleaseIterator(di);
     }
 
     /* Try to defrag the dictionary of clients that is stored as the value part. */
     if ((newclients = dictDefragTables(clients)))
-        kvstoreDictSetVal(pubsub_channels, ctx->slot, (dictEntry*)de, newclients);
+        kvstoreDictSetVal(pubsub_channels, ctx->kvstate.slot, (dictEntry *)de, newclients);
 
     server.stat_active_defrag_scanned++;
 }
 
-/* We may need to defrag other globals, one small allocation can hold a full allocator run.
- * so although small, it is still important to defrag these */
-void defragOtherGlobals(void) {
-
-    /* there are many more pointers to defrag (e.g. client argv, output / aof buffers, etc.
-     * but we assume most of these are short lived, we only need to defrag allocations
-     * that remain static for a long time */
-    activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT);
-    moduleDefragGlobals();
-    kvstoreDictLUTDefrag(server.pubsub_channels, dictDefragTables);
-    kvstoreDictLUTDefrag(server.pubsubshard_channels, dictDefragTables);
-}
-
 /* returns 0 more work may or may not be needed (see non-zero cursor),
  * and 1 if time is up and more work is needed. */
-int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int dbid) {
+int defragLaterItem(dictEntry *de, unsigned long *cursor, monotime endtime, int dbid) {
     if (de) {
         robj *ob = dictGetVal(de);
         if (ob->type == OBJ_LIST) {
@@ -942,9 +1001,10 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int
         } else if (ob->type == OBJ_STREAM) {
             return scanLaterStreamListpacks(ob, cursor, endtime);
         } else if (ob->type == OBJ_MODULE) {
+            long long endtimeWallClock = ustime() + (endtime - getMonotonicUs());
             robj keyobj;
             initStaticStringObject(keyobj, dictGetKey(de));
-            return moduleLateDefrag(&keyobj, ob, cursor, endtime, dbid);
+            return moduleLateDefrag(&keyobj, ob, cursor, endtimeWallClock, dbid);
         } else {
             *cursor = 0; /* object type may have changed since we schedule it for later */
         }
@@ -954,78 +1014,55 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int
     return 0;
 }
 
-/* static variables serving defragLaterStep to continue scanning a key from were we stopped last time. */
-static sds defrag_later_current_key = NULL;
-static unsigned long defrag_later_cursor = 0;
+static int defragIsRunning(void) {
+    return (defrag.timeproc_id > 0);
+}
+
+/* A kvstoreHelperPreContinueFn */
+static doneStatus defragLaterStep(void *ctx, monotime endtime) {
+    defragKeysCtx *defrag_keys_ctx = ctx;
 
-/* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */
-int defragLaterStep(redisDb *db, int slot, long long endtime) {
     unsigned int iterations = 0;
     unsigned long long prev_defragged = server.stat_active_defrag_hits;
     unsigned long long prev_scanned = server.stat_active_defrag_scanned;
-    long long key_defragged;
 
-    do {
-        /* if we're not continuing a scan from the last call or loop, start a new one */
-        if (!defrag_later_cursor) {
-            listNode *head = listFirst(db->defrag_later);
-
-            /* Move on to next key */
-            if (defrag_later_current_key) {
-                serverAssert(defrag_later_current_key == head->value);
-                listDelNode(db->defrag_later, head);
-                defrag_later_cursor = 0;
-                defrag_later_current_key = NULL;
-            }
+    while (defrag_keys_ctx->defrag_later && listLength(defrag_keys_ctx->defrag_later) > 0) {
+        listNode *head = listFirst(defrag_keys_ctx->defrag_later);
+        sds key = head->value;
+        dictEntry *de = kvstoreDictFind(defrag_keys_ctx->kvstate.kvs, defrag_keys_ctx->kvstate.slot, key);
+
+        long long key_defragged = server.stat_active_defrag_hits;
+        int timeout = (defragLaterItem(de, &defrag_keys_ctx->defrag_later_cursor, endtime, defrag_keys_ctx->dbid) == 1);
+        if (key_defragged != server.stat_active_defrag_hits) {
+            server.stat_active_defrag_key_hits++;
+        } else {
+            server.stat_active_defrag_key_misses++;
+        }
 
-            /* stop if we reached the last one. */
-            head = listFirst(db->defrag_later);
-            if (!head)
-                return 0;
+        if (timeout) break;
 
-            /* start a new key */
-            defrag_later_current_key = head->value;
-            defrag_later_cursor = 0;
+        if (defrag_keys_ctx->defrag_later_cursor == 0) {
+            /* the item is finished, move on */
+            listDelNode(defrag_keys_ctx->defrag_later, head);
         }
 
-        /* each time we enter this function we need to fetch the key from the dict again (if it still exists) */
-        dictEntry *de = kvstoreDictFind(db->keys, slot, defrag_later_current_key);
-        key_defragged = server.stat_active_defrag_hits;
-        do {
-            int quit = 0;
-            if (defragLaterItem(de, &defrag_later_cursor, endtime,db->id))
-                quit = 1; /* time is up, we didn't finish all the work */
-
-            /* Once in 16 scan iterations, 512 pointer reallocations, or 64 fields
-             * (if we have a lot of pointers in one hash bucket, or rehashing),
-             * check if we reached the time limit. */
-            if (quit || (++iterations > 16 ||
-                            server.stat_active_defrag_hits - prev_defragged > 512 ||
-                            server.stat_active_defrag_scanned - prev_scanned > 64)) {
-                if (quit || ustime() > endtime) {
-                    if(key_defragged != server.stat_active_defrag_hits)
-                        server.stat_active_defrag_key_hits++;
-                    else
-                        server.stat_active_defrag_key_misses++;
-                    return 1;
-                }
-                iterations = 0;
-                prev_defragged = server.stat_active_defrag_hits;
-                prev_scanned = server.stat_active_defrag_scanned;
-            }
-        } while(defrag_later_cursor);
-        if(key_defragged != server.stat_active_defrag_hits)
-            server.stat_active_defrag_key_hits++;
-        else
-            server.stat_active_defrag_key_misses++;
-    } while(1);
+        if (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 ||
+            server.stat_active_defrag_scanned - prev_scanned > 64) {
+            if (getMonotonicUs() > endtime) break;
+            iterations = 0;
+            prev_defragged = server.stat_active_defrag_hits;
+            prev_scanned = server.stat_active_defrag_scanned;
+        }
+    }
+
+    return (!defrag_keys_ctx->defrag_later || listLength(defrag_keys_ctx->defrag_later) == 0) ? DEFRAG_DONE : DEFRAG_NOT_DONE;
 }
 
 #define INTERPOLATE(x, x1, x2, y1, y2) ( (y1) + ((x)-(x1)) * ((y2)-(y1)) / ((x2)-(x1)) )
 #define LIMIT(y, min, max) ((y)<(min)? min: ((y)>(max)? max: (y)))
 
 /* decide if defrag is needed, and at what CPU effort to invest in it */
-void computeDefragCycles(float decay_rate) {
+void computeDefragCycles(void) {
     size_t frag_bytes;
     float frag_pct = getAllocatorFragmentation(&frag_bytes);
     /* If we're not already running, and below the threshold, exit. */
@@ -1041,7 +1078,7 @@ void computeDefragCycles(float decay_rate) {
             server.active_defrag_threshold_upper,
             server.active_defrag_cycle_min,
             server.active_defrag_cycle_max);
-    cpu_pct *= decay_rate;
+    cpu_pct *= defrag.decay_rate;
     cpu_pct = LIMIT(cpu_pct,
             server.active_defrag_cycle_min,
             server.active_defrag_cycle_max);
@@ -1052,246 +1089,451 @@ void computeDefragCycles(float decay_rate) {
     if (cpu_pct > server.active_defrag_running ||
         server.active_defrag_configuration_changed)
     {
-        server.active_defrag_running = cpu_pct;
         server.active_defrag_configuration_changed = 0;
-        serverLog(LL_VERBOSE,
-            "Starting active defrag, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%",
-            frag_pct, frag_bytes, cpu_pct);
+        if (defragIsRunning()) {
+            serverLog(LL_VERBOSE, "Changing active defrag CPU, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%",
+                      frag_pct, frag_bytes, cpu_pct);
+        } else {
+            serverLog(LL_VERBOSE,
+                "Starting active defrag, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%",
+                frag_pct, frag_bytes, cpu_pct);
+        }
+        server.active_defrag_running = cpu_pct;
     }
 }
 
-/* Perform incremental defragmentation work from the serverCron.
- * This works in a similar way to activeExpireCycle, in the sense that
- * we do incremental work across calls. */
-void activeDefragCycle(void) {
-    static int slot = -1;
-    static int current_db = -1;
-    static int defrag_later_item_in_progress = 0;
-    static int defrag_stage = 0;
-    static unsigned long defrag_cursor = 0;
-    static redisDb *db = NULL;
-    static long long start_scan, start_hits, start_misses;
-    static float start_frag_pct;
-    static float decay_rate = 1.0f;
+/* This helper function handles most of the work for iterating over a kvstore. 'privdata', if
+ * provided, MUST begin with 'kvstoreIterState' and this part is automatically updated by this
+ * function during the iteration. */
+static doneStatus defragStageKvstoreHelper(monotime endtime,
+                                           void *ctx,
+                                           dictScanFunction scan_fn,
+                                           kvstoreHelperPreContinueFn precontinue_fn,
+                                           dictDefragFunctions *defragfns)
+{
     unsigned int iterations = 0;
     unsigned long long prev_defragged = server.stat_active_defrag_hits;
     unsigned long long prev_scanned = server.stat_active_defrag_scanned;
-    long long start, timelimit, endtime;
-    mstime_t latency;
-    int all_stages_finished = 0;
-    int quit = 0;
+    kvstoreIterState *state = (kvstoreIterState*)ctx;
 
-    if (!server.active_defrag_enabled) {
-        if (server.active_defrag_running) {
-            /* if active defrag was disabled mid-run, start from fresh next time. */
-            server.active_defrag_running = 0;
-            server.active_defrag_configuration_changed = 0;
-            if (db)
-                listEmpty(db->defrag_later);
-            defrag_later_current_key = NULL;
-            defrag_later_cursor = 0;
-            current_db = -1;
-            defrag_stage = 0;
-            defrag_cursor = 0;
-            slot = -1;
-            defrag_later_item_in_progress = 0;
-            db = NULL;
-            moduleDefragEnd();
-            goto update_metrics;
+    if (state->slot == KVS_SLOT_DEFRAG_LUT) {
+        /* Before we start scanning the kvstore, handle the main structures */
+        do {
+            state->cursor = kvstoreDictLUTDefrag(state->kvs, state->cursor, dictDefragTables);
+            if (getMonotonicUs() >= endtime) return DEFRAG_NOT_DONE;
+        } while (state->cursor != 0);
+        state->slot = KVS_SLOT_UNASSIGNED;
+    }
+
+    while (1) {
+        if (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || server.stat_active_defrag_scanned - prev_scanned > 64) {
+            if (getMonotonicUs() >= endtime) break;
+            iterations = 0;
+            prev_defragged = server.stat_active_defrag_hits;
+            prev_scanned = server.stat_active_defrag_scanned;
         }
-        return;
+
+        if (precontinue_fn) {
+            if (precontinue_fn(ctx, endtime) == DEFRAG_NOT_DONE) return DEFRAG_NOT_DONE;
+        }
+
+        if (!state->cursor) {
+            /* If there's no cursor, we're ready to begin a new kvstore slot. */
+            if (state->slot == KVS_SLOT_UNASSIGNED) {
+                state->slot = kvstoreGetFirstNonEmptyDictIndex(state->kvs);
+            } else {
+                state->slot = kvstoreGetNextNonEmptyDictIndex(state->kvs, state->slot);
+            }
+
+            if (state->slot == KVS_SLOT_UNASSIGNED) return DEFRAG_DONE;
+        }
+
+        /* Whatever privdata's actual type, this function requires that it begins with kvstoreIterState. */
+        state->cursor = kvstoreDictScanDefrag(state->kvs, state->slot, state->cursor,
+                                             scan_fn, defragfns, ctx);
     }
 
-    if (hasActiveChildProcess())
-        return; /* Defragging memory while there's a fork will just do damage. */
+    return DEFRAG_NOT_DONE;
+}
 
-    /* Once a second, check if the fragmentation justfies starting a scan
-     * or making it more aggressive. */
-    run_with_period(1000) {
-        computeDefragCycles(decay_rate);
+static doneStatus defragStageDbKeys(void *ctx, monotime endtime) {
+    defragKeysCtx *defrag_keys_ctx = ctx;
+    redisDb *db = &server.db[defrag_keys_ctx->dbid];
+    if (db->keys != defrag_keys_ctx->kvstate.kvs) {
+        /* There has been a change of the kvs (flushdb, swapdb, etc.). Just complete the stage. */
+        return DEFRAG_DONE;
     }
 
-    /* Normally it is checked once a second, but when there is a configuration
-     * change, we want to check it as soon as possible. */
-    if (server.active_defrag_configuration_changed) {
-        computeDefragCycles(decay_rate);
-        server.active_defrag_configuration_changed = 0;
+    /* Note: for DB keys, we use the start/finish callback to fix an expires table entry if
+     * the main DB entry has been moved. */
+    static dictDefragFunctions defragfns = {
+        .defragAlloc = activeDefragAlloc,
+        .defragKey = NULL, /* Handled by dbKeysScanCallback */
+        .defragVal = NULL, /* Handled by dbKeysScanCallback */
+    };
+
+    return defragStageKvstoreHelper(endtime, ctx,
+        dbKeysScanCallback, defragLaterStep, &defragfns);
+}
+
+static doneStatus defragStageExpiresKvstore(void *ctx, monotime endtime) {
+    defragKeysCtx *defrag_keys_ctx = ctx;
+    redisDb *db = &server.db[defrag_keys_ctx->dbid];
+    if (db->keys != defrag_keys_ctx->kvstate.kvs) {
+        /* There has been a change of the kvs (flushdb, swapdb, etc.). Just complete the stage. */
+        return DEFRAG_DONE;
     }
 
-    if (!server.active_defrag_running)
-        return;
+    static dictDefragFunctions defragfns = {
+        .defragAlloc = activeDefragAlloc,
+        .defragKey = NULL, /* Not needed for expires (just a ref) */
+        .defragVal = NULL, /* Not needed for expires (no value) */
+    };
+    return defragStageKvstoreHelper(endtime, ctx,
+        scanCallbackCountScanned, NULL, &defragfns);
+}
 
-    /* See activeExpireCycle for how timelimit is handled. */
-    start = ustime();
-    timelimit = 1000000*server.active_defrag_running/server.hz/100;
-    if (timelimit <= 0) timelimit = 1;
-    endtime = start + timelimit;
-    latencyStartMonitor(latency);
+static doneStatus defragStagePubsubKvstore(void *ctx, monotime endtime) {
+    static dictDefragFunctions defragfns = {
+        .defragAlloc = activeDefragAlloc,
+        .defragKey = NULL, /* Handled by defragPubsubScanCallback */
+        .defragVal = NULL, /* Not needed for expires (no value) */
+    };
 
-    dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc};
-    do {
-        /* if we're not continuing a scan from the last call or loop, start a new one */
-        if (!defrag_stage && !defrag_cursor && (slot < 0)) {
-            /* finish any leftovers from previous db before moving to the next one */
-            if (db && defragLaterStep(db, slot, endtime)) {
-                quit = 1; /* time is up, we didn't finish all the work */
-                break; /* this will exit the function and we'll continue on the next cycle */
-            }
+    return defragStageKvstoreHelper(endtime, ctx,
+        defragPubsubScanCallback, NULL, &defragfns);
+}
 
-            if (current_db == -1) {
-                moduleDefragStart();
-            }
+static doneStatus defragLuaScripts(void *ctx, monotime endtime) {
+    UNUSED(endtime);
+    UNUSED(ctx);
+    activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT);
+    return DEFRAG_DONE;
+}
 
-            /* Move on to next database, and stop if we reached the last one. */
-            if (++current_db >= server.dbnum) {
-                /* defrag other items not part of the db / keys */
-                defragOtherGlobals();
-
-                long long now = ustime();
-                size_t frag_bytes;
-                float frag_pct = getAllocatorFragmentation(&frag_bytes);
-                serverLog(LL_VERBOSE,
-                    "Active defrag done in %dms, reallocated=%d, frag=%.0f%%, frag_bytes=%zu",
-                    (int)((now - start_scan)/1000), (int)(server.stat_active_defrag_hits - start_hits), frag_pct, frag_bytes);
-
-                start_scan = now;
-                current_db = -1;
-                defrag_stage = 0;
-                defrag_cursor = 0;
-                slot = -1;
-                defrag_later_item_in_progress = 0;
-                db = NULL;
-                server.active_defrag_running = 0;
-
-                long long last_hits = server.stat_active_defrag_hits - start_hits;
-                long long last_misses = server.stat_active_defrag_misses - start_misses;
-                float last_frag_pct_change = start_frag_pct - frag_pct;
-                /* When defragmentation efficiency is low, we gradually reduce the
-                 * speed for the next cycle to avoid CPU waste. However, in the
-                 * following two cases, we keep the normal speed:
-                 * 1) If the fragmentation percentage has increased or decreased by more than 2%.
-                 * 2) If the fragmentation percentage decrease is small, but hits are above 1%,
-                 *    we still keep the normal speed. */
-                if (fabs(last_frag_pct_change) > 2 ||
-                    (last_frag_pct_change < 0 && last_hits >= (last_hits + last_misses) * 0.01))
-                {
-                    decay_rate = 1.0f;
-                } else {
-                    decay_rate *= 0.9;
-                }
+static doneStatus defragModuleGlobals(void *ctx, monotime endtime) {
+    UNUSED(endtime);
+    UNUSED(ctx);
+    moduleDefragGlobals();
+    return DEFRAG_DONE;
+}
 
-                moduleDefragEnd();
+static void freeDefragKeysContext(void *ctx) {
+    defragKeysCtx *defrag_keys_ctx = ctx;
+    if (defrag_keys_ctx->defrag_later) {
+        listRelease(defrag_keys_ctx->defrag_later);
+    }
+    zfree(defrag_keys_ctx);
+}
 
-                computeDefragCycles(decay_rate); /* if another scan is needed, start it right away */
-                if (server.active_defrag_running != 0 && ustime() < endtime)
-                    continue;
-                break;
-            }
-            else if (current_db==0) {
-                /* Start a scan from the first database. */
-                start_scan = ustime();
-                start_hits = server.stat_active_defrag_hits;
-                start_misses = server.stat_active_defrag_misses;
-                start_frag_pct = getAllocatorFragmentation(NULL);
-            }
+static void freeDefragContext(void *ptr) {
+    StageDescriptor *stage = ptr;
+    if (stage->ctx_free_fn)
+        stage->ctx_free_fn(stage->ctx);
+    zfree(stage);
+}
+
+static void addDefragStage(defragStageFn stage_fn, defragStageContextFreeFn ctx_free_fn, void *ctx) {
+    StageDescriptor *stage = zmalloc(sizeof(StageDescriptor));
+    stage->stage_fn = stage_fn;
+    stage->ctx_free_fn = ctx_free_fn;
+    stage->ctx = ctx;
+    listAddNodeTail(defrag.remaining_stages, stage);
+}
+
+/* Updates the defrag decay rate based on the observed effectiveness of the defrag process.
+ * The decay rate is used to gradually slow down defrag when it's not being effective. */
+static void updateDefragDecayRate(float frag_pct) {
+    long long last_hits = server.stat_active_defrag_hits - defrag.start_defrag_hits;
+    long long last_misses = server.stat_active_defrag_misses - defrag.start_defrag_misses;
+    float last_frag_pct_change = defrag.start_frag_pct - frag_pct;
+    /* When defragmentation efficiency is low, we gradually reduce the
+     * speed for the next cycle to avoid CPU waste. However, in the
+     * following two cases, we keep the normal speed:
+     * 1) If the fragmentation percentage has increased or decreased by more than 2%.
+     * 2) If the fragmentation percentage decrease is small, but hits are above 1%,
+     *    we still keep the normal speed. */
+    if (fabs(last_frag_pct_change) > 2 ||
+        (last_frag_pct_change < 0 && last_hits >= (last_hits + last_misses) * 0.01))
+    {
+        defrag.decay_rate = 1.0f;
+    } else {
+        defrag.decay_rate *= 0.9;
+    }
+}
+
+/* Called at the end of a complete defrag cycle, or when defrag is terminated */
+static void endDefragCycle(int normal_termination) {
+    if (normal_termination) {
+        /* For normal termination, we expect... */
+        serverAssert(!defrag.current_stage);
+        serverAssert(listLength(defrag.remaining_stages) == 0);
+    } else {
+        /* Defrag is being terminated abnormally */
+        aeDeleteTimeEvent(server.el, defrag.timeproc_id);
 
-            db = &server.db[current_db];
-            kvstoreDictLUTDefrag(db->keys, dictDefragTables);
-            kvstoreDictLUTDefrag(db->expires, dictDefragTables);
-            defrag_stage = 0;
-            defrag_cursor = 0;
-            slot = -1;
-            defrag_later_item_in_progress = 0;
+        if (defrag.current_stage) {
+            listDelNode(defrag.remaining_stages, defrag.current_stage);
+            defrag.current_stage = NULL;
         }
+    }
+    defrag.timeproc_id = AE_DELETED_EVENT_ID;
 
-        /* This array of structures holds the parameters for all defragmentation stages. */
-        typedef struct defragStage {
-            kvstore *kvs;
-            dictScanFunction *scanfn;
-            void *privdata;
-        } defragStage;
-        defragStage defrag_stages[] = {
-            {db->keys, defragScanCallback, db},
-            {db->expires, scanCallbackCountScanned, NULL},
-            {server.pubsub_channels, defragPubsubScanCallback,
-                &(defragPubSubCtx){server.pubsub_channels, getClientPubSubChannels}},
-            {server.pubsubshard_channels, defragPubsubScanCallback,
-                &(defragPubSubCtx){server.pubsubshard_channels, getClientPubSubShardChannels}},
-        };
-        do {
-            int num_stages = sizeof(defrag_stages) / sizeof(defrag_stages[0]);
-            serverAssert(defrag_stage < num_stages);
-            defragStage *current_stage = &defrag_stages[defrag_stage];
-
-            /* before scanning the next bucket, see if we have big keys left from the previous bucket to scan */
-            if (defragLaterStep(db, slot, endtime)) {
-                quit = 1; /* time is up, we didn't finish all the work */
-                break; /* this will exit the function and we'll continue on the next cycle */
-            }
+    listRelease(defrag.remaining_stages);
+    defrag.remaining_stages = NULL;
 
-            if (!defrag_later_item_in_progress) {
-                /* Continue defragmentation from the previous stage.
-                 * If slot is -1, it means this stage starts from the first non-empty slot. */
-                if (slot == -1) slot = kvstoreGetFirstNonEmptyDictIndex(current_stage->kvs);
-                defrag_cursor = kvstoreDictScanDefrag(current_stage->kvs, slot, defrag_cursor,
-                    current_stage->scanfn, &defragfns, &(defragCtx){current_stage->privdata, slot});
-            }
+    size_t frag_bytes;
+    float frag_pct = getAllocatorFragmentation(&frag_bytes);
+    serverLog(LL_VERBOSE, "Active defrag done in %dms, reallocated=%d, frag=%.0f%%, frag_bytes=%zu",
+              (int)elapsedMs(defrag.start_cycle), (int)(server.stat_active_defrag_hits - defrag.start_defrag_hits),
+              frag_pct, frag_bytes);
 
-            if (!defrag_cursor) {
-                /* Move to the next slot only if regular and large item scanning has been completed. */
-                if (listLength(db->defrag_later) > 0) {
-                    defrag_later_item_in_progress = 1;
-                    continue;
-                }
+    server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time);
+    server.stat_last_active_defrag_time = 0;
+    server.active_defrag_running = 0;
 
-                /* Move to the next slot in the current stage. If we've reached the end, move to the next stage. */
-                if ((slot = kvstoreGetNextNonEmptyDictIndex(current_stage->kvs, slot)) == -1)
-                    defrag_stage++;
-                defrag_later_item_in_progress = 0;
-            }
+    updateDefragDecayRate(frag_pct);
+    moduleDefragEnd();
 
-            /* Check if all defragmentation stages have been processed.
-             * If so, mark as finished and reset the stage counter to move on to next database. */
-            if (defrag_stage == num_stages) {
-                all_stages_finished = 1;
-                defrag_stage = 0;
-            }
-    
-            /* Once in 16 scan iterations, 512 pointer reallocations. or 64 keys
-             * (if we have a lot of pointers in one hash bucket or rehashing),
-             * check if we reached the time limit.
-             * But regardless, don't start a new db in this loop, this is because after
-             * the last db we call defragOtherGlobals, which must be done in one cycle */
-            if (all_stages_finished ||
-                ++iterations > 16 ||
-                server.stat_active_defrag_hits - prev_defragged > 512 ||
-                server.stat_active_defrag_scanned - prev_scanned > 64)
-            {
-                /* Quit if all stages were finished or timeout. */
-                if (all_stages_finished || ustime() > endtime) {
-                    quit = 1;
-                    break;
-                }
-                iterations = 0;
-                prev_defragged = server.stat_active_defrag_hits;
-                prev_scanned = server.stat_active_defrag_scanned;
-            }
-        } while(!all_stages_finished && !quit);
-    } while(!quit);
+    /* Immediately check to see if we should start another defrag cycle. */
+    activeDefragCycle();
+}
+
+/* Must be called at the start of the timeProc as it measures the delay from the end of the previous
+ * timeProc invocation when performing the computation. */
+static int computeDefragCycleUs(void) {
+    long dutyCycleUs;
+
+    int targetCpuPercent = server.active_defrag_running;
+    serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100);
+
+    static int prevCpuPercent = 0; /* STATIC - this persists */
+    if (targetCpuPercent != prevCpuPercent) {
+        /* If the targetCpuPercent changes, the value might be different from when the last wait
+         * time was computed. In this case, don't consider wait time. (This is really only an
+         * issue in crazy tests that dramatically increase CPU while defrag is running.) */
+        defrag.timeproc_end_time = 0;
+        prevCpuPercent = targetCpuPercent;
+    }
+
+    /* Given when the last duty cycle ended, compute time needed to achieve the desired percentage. */
+    if (defrag.timeproc_end_time == 0) {
+        /* Either the first call to the timeProc, or we were paused for some reason. */
+        defrag.timeproc_overage_us = 0;
+        dutyCycleUs = DEFRAG_CYCLE_US;
+    } else {
+        long waitedUs = getMonotonicUs() - defrag.timeproc_end_time;
+        /* Given the elapsed wait time between calls, compute the necessary duty time needed to
+         * achieve the desired CPU percentage.
+         * With:  D = duty time, W = wait time, P = percent
+         * Solve:    D          P
+         *         -----   =  -----
+         *         D + W       100
+         * Solving for D:
+         *     D = P * W / (100 - P)
+         *
+         * Note that dutyCycleUs addresses starvation. If the wait time was long, we will compensate
+         * with a proportionately long duty-cycle. This won't significantly affect perceived
+         * latency, because clients are already being impacted by the long cycle time which caused
+         * the starvation of the timer. */
+        dutyCycleUs = targetCpuPercent * waitedUs / (100 - targetCpuPercent);
+
+        /* Also adjust for any accumulated overage. */
+        dutyCycleUs -= defrag.timeproc_overage_us;
+        defrag.timeproc_overage_us = 0;
+
+        if (dutyCycleUs < DEFRAG_CYCLE_US) {
+            /* We never reduce our cycle time, that would increase overhead. Instead, we track this
+             * as part of the overage, and increase wait time between cycles. */
+            defrag.timeproc_overage_us = DEFRAG_CYCLE_US - dutyCycleUs;
+            dutyCycleUs = DEFRAG_CYCLE_US;
+        } else if (dutyCycleUs > DEFRAG_CYCLE_US * 10) {
+            /* Add a time limit for the defrag duty cycle to prevent excessive latency.
+             * When latency is already high (indicated by a long time between calls),
+             * we don't want to make it worse by running defrag for too long. */
+            dutyCycleUs = DEFRAG_CYCLE_US * 10;
+        }
+    }
+    return dutyCycleUs;
+}
+
+/* Must be called at the end of the timeProc as it records the timeproc_end_time for use in the next
+ * computeDefragCycleUs computation. */
+static int computeDelayMs(monotime intendedEndtime) {
+    defrag.timeproc_end_time = getMonotonicUs();
+    long overage = defrag.timeproc_end_time - intendedEndtime;
+    defrag.timeproc_overage_us += overage; /* track over/under desired CPU */
+    /* Allow negative overage (underage) to count against existing overage, but don't allow
+     * underage (from short stages) to be accumulated. */
+    if (defrag.timeproc_overage_us < 0) defrag.timeproc_overage_us = 0;
+
+    int targetCpuPercent = server.active_defrag_running;
+    serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100);
+
+    /* Given the desired duty cycle, what inter-cycle delay do we need to achieve that? */
+    /* We want to achieve a specific CPU percent. To do that, we can't use a skewed computation. */
+    /* Example, if we run for 1ms and delay 10ms, that's NOT 10%, because the total cycle time is 11ms. */
+    /* Instead, if we rum for 1ms, our total time should be 10ms. So the delay is only 9ms. */
+    long totalCycleTimeUs = DEFRAG_CYCLE_US * 100 / targetCpuPercent;
+    long delayUs = totalCycleTimeUs - DEFRAG_CYCLE_US;
+    /* Only increase delay by the fraction of the overage that would be non-duty-cycle */
+    delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100;
+    if (delayUs < 0) delayUs = 0;
+    long delayMs = delayUs / 1000; /* round down */
+    return delayMs;
+}
+
+/* An independent time proc for defrag. While defrag is running, this is called much more often
+ * than the server cron. Frequent short calls provides low latency impact. */
+static int activeDefragTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData) {
+    UNUSED(eventLoop);
+    UNUSED(id);
+    UNUSED(clientData);
+
+    /* This timer shouldn't be registered unless there's work to do. */
+    serverAssert(defrag.current_stage || listLength(defrag.remaining_stages) > 0);
+
+    if (!server.active_defrag_enabled) {
+        /* Defrag has been disabled while running */
+        endDefragCycle(0);
+        return AE_NOMORE;
+    }
+
+    if (hasActiveChildProcess()) {
+        /* If there's a child process, pause the defrag, polling until the child completes. */
+        defrag.timeproc_end_time = 0; /* prevent starvation recovery */
+        return 100;
+    }
+
+    monotime starttime = getMonotonicUs();
+    int dutyCycleUs = computeDefragCycleUs();
+    monotime endtime = starttime + dutyCycleUs;
+    int haveMoreWork = 1;
+
+    /* Increment server.cronloops so that run_with_period works. */
+    long hz_ms = 1000 / server.hz;
+    int cronloops = (server.mstime - server.blocked_last_cron + (hz_ms - 1)) / hz_ms; /* rounding up */
+    server.blocked_last_cron += cronloops * hz_ms;
+    server.cronloops += cronloops;
+
+    mstime_t latency;
+    latencyStartMonitor(latency);
+
+    do {
+        if (!defrag.current_stage) {
+            defrag.current_stage = listFirst(defrag.remaining_stages);
+        }
+
+        StageDescriptor *stage = listNodeValue(defrag.current_stage);
+        doneStatus status = stage->stage_fn(stage->ctx, endtime);
+        if (status == DEFRAG_DONE) {
+            listDelNode(defrag.remaining_stages, defrag.current_stage);
+            defrag.current_stage = NULL;
+        }
+
+        haveMoreWork = (defrag.current_stage || listLength(defrag.remaining_stages) > 0);
+        /* If we've completed a stage early, and still have a standard time allotment remaining,
+         * we'll start another stage. This can happen when defrag is running infrequently, and
+         * starvation protection has increased the duty-cycle. */
+    } while (haveMoreWork && getMonotonicUs() <= endtime - DEFRAG_CYCLE_US);
 
     latencyEndMonitor(latency);
-    latencyAddSampleIfNeeded("active-defrag-cycle",latency);
-
-update_metrics:
-    if (server.active_defrag_running > 0) {
-        if (server.stat_last_active_defrag_time == 0)
-            elapsedStart(&server.stat_last_active_defrag_time);
-    } else if (server.stat_last_active_defrag_time != 0) {
-        server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time);
-        server.stat_last_active_defrag_time = 0;
+    latencyAddSampleIfNeeded("active-defrag-cycle", latency);
+
+    if (haveMoreWork) {
+        return computeDelayMs(endtime);
+    } else {
+        endDefragCycle(1);
+        return AE_NOMORE; /* Ends the timer proc */
     }
 }
 
+/* During long running scripts, or while loading, there is a periodic function for handling other
+ * actions. This interface allows defrag to continue running, avoiding a single long defrag step
+ * after the long operation completes. */
+void defragWhileBlocked(void) {
+    /* This is called infrequently, while timers are not active. We might need to start defrag. */
+    if (!defragIsRunning()) activeDefragCycle();
+
+    if (!defragIsRunning()) return;
+
+    /* Save off the timeproc_id. If we have a normal termination, it will be cleared. */
+    long long timeproc_id = defrag.timeproc_id;
+
+    /* Simulate a single call of the timer proc */
+    long long reschedule_delay = activeDefragTimeProc(NULL, 0, NULL);
+    if (reschedule_delay == AE_NOMORE) {
+        /* If it's done, deregister the timer */
+        aeDeleteTimeEvent(server.el, timeproc_id);
+    }
+    /* Otherwise, just ignore the reschedule_delay, the timer will pop the next time that the
+     * event loop can process timers again. */
+}
+
+static void beginDefragCycle(void) {
+    serverAssert(!defragIsRunning());
+
+    moduleDefragStart();
+
+    serverAssert(defrag.remaining_stages == NULL);
+    defrag.remaining_stages = listCreate();
+    listSetFreeMethod(defrag.remaining_stages, freeDefragContext);
+
+    for (int dbid = 0; dbid < server.dbnum; dbid++) {
+        redisDb *db = &server.db[dbid];
+
+        /* Add stage for keys. */
+        defragKeysCtx *defrag_keys_ctx = zcalloc(sizeof(defragKeysCtx));
+        defrag_keys_ctx->kvstate = INIT_KVSTORE_STATE(db->keys);
+        defrag_keys_ctx->dbid = dbid;
+        addDefragStage(defragStageDbKeys, freeDefragKeysContext, defrag_keys_ctx);
+
+        /* Add stage for expires. */
+        defragKeysCtx *defrag_expires_ctx = zcalloc(sizeof(defragKeysCtx));
+        defrag_expires_ctx->kvstate = INIT_KVSTORE_STATE(db->expires);
+        defrag_expires_ctx->dbid = dbid;
+        addDefragStage(defragStageExpiresKvstore, freeDefragKeysContext, defrag_expires_ctx);
+    }
+
+    /* Add stage for pubsub channels. */
+    defragPubSubCtx *defrag_pubsub_ctx = zmalloc(sizeof(defragPubSubCtx));
+    defrag_pubsub_ctx->kvstate = INIT_KVSTORE_STATE(server.pubsub_channels);
+    defrag_pubsub_ctx->getPubSubChannels = getClientPubSubChannels;
+    addDefragStage(defragStagePubsubKvstore, zfree, defrag_pubsub_ctx);
+
+    /* Add stage for pubsubshard channels. */
+    defragPubSubCtx *defrag_pubsubshard_ctx = zmalloc(sizeof(defragPubSubCtx));
+    defrag_pubsubshard_ctx->kvstate = INIT_KVSTORE_STATE(server.pubsubshard_channels);
+    defrag_pubsubshard_ctx->getPubSubChannels = getClientPubSubShardChannels;
+    addDefragStage(defragStagePubsubKvstore, zfree, defrag_pubsubshard_ctx);
+
+    addDefragStage(defragLuaScripts, NULL, NULL);
+    addDefragStage(defragModuleGlobals, NULL, NULL);
+
+    defrag.current_stage = NULL;
+    defrag.start_cycle = getMonotonicUs();
+    defrag.start_defrag_hits = server.stat_active_defrag_hits;
+    defrag.start_defrag_misses = server.stat_active_defrag_misses;
+    defrag.start_frag_pct = getAllocatorFragmentation(NULL);
+    defrag.timeproc_end_time = 0;
+    defrag.timeproc_overage_us = 0;
+    defrag.timeproc_id = aeCreateTimeEvent(server.el, 0, activeDefragTimeProc, NULL, NULL);
+
+    elapsedStart(&server.stat_last_active_defrag_time);
+}
+
+void activeDefragCycle(void) {
+    if (!server.active_defrag_enabled) return;
+
+    /* Defrag gets paused while a child process is active. So there's no point in starting a new
+     * cycle or adjusting the CPU percentage for an existing cycle. */
+    if (hasActiveChildProcess()) return;
+
+    computeDefragCycles();
+
+    if (server.active_defrag_running > 0 && !defragIsRunning()) beginDefragCycle();
+}
+
 #else /* HAVE_DEFRAG */
 
 void activeDefragCycle(void) {
@@ -1318,4 +1560,7 @@ robj *activeDefragStringOb(robj *ob) {
     return NULL;
 }
 
+void defragWhileBlocked(void) {
+}
+
 #endif
diff --git a/src/kvstore.c b/src/kvstore.c
index 6a4d123ad1c..fdb9b61a683 100644
--- a/src/kvstore.c
+++ b/src/kvstore.c
@@ -12,9 +12,15 @@
  * Copyright (c) 2011-Present, Redis Ltd. and contributors.
  * All rights reserved.
  *
+ * Copyright (c) 2024-present, Valkey contributors.
+ * All rights reserved.
+ *
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2) or the Server Side Public License v1 (SSPLv1).
+ *
+ * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information.
  */
+
 #include "fmacros.h"
 
 #include <string.h>
@@ -802,10 +808,14 @@ unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dic
  * within dict, it only reallocates the memory used by the dict structure itself using 
  * the provided allocation function. This feature was added for the active defrag feature.
  *
- * The 'defragfn' callback is called with a reference to the dict
- * that callback can reallocate. */
-void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn) {
-    for (int didx = 0; didx < kvs->num_dicts; didx++) {
+ * With 16k dictionaries for cluster mode with 1 shard, this operation may require substantial time
+ * to execute.  A "cursor" is used to perform the operation iteratively.  When first called, a
+ * cursor value of 0 should be provided.  The return value is an updated cursor which should be
+ * provided on the next iteration.  The operation is complete when 0 is returned.
+ *
+ * The 'defragfn' callback is called with a reference to the dict that callback can reallocate. */
+unsigned long kvstoreDictLUTDefrag(kvstore *kvs, unsigned long cursor, kvstoreDictLUTDefragFunction *defragfn) {
+    for (int didx = cursor; didx < kvs->num_dicts; didx++) {
         dict **d = kvstoreGetDictRef(kvs, didx), *newd;
         if (!*d)
             continue;
@@ -818,7 +828,9 @@ void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn)
             if (metadata->rehashing_node)
                 metadata->rehashing_node->value = *d;
         }
+        return (didx + 1);
     }
+    return 0;
 }
 
 uint64_t kvstoreGetHash(kvstore *kvs, const void *key)
@@ -1059,13 +1071,14 @@ int kvstoreTest(int argc, char **argv, int flags) {
     }
 
     TEST("Verify that a rehashing dict's node in the rehashing list is correctly updated after defragmentation") {
+        unsigned long cursor = 0;
         kvstore *kvs = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND);
         for (i = 0; i < 256; i++) {
             de = kvstoreDictAddRaw(kvs, 0, stringFromInt(i), NULL);
             if (listLength(kvs->rehashing)) break;
         }
         assert(listLength(kvs->rehashing));
-        kvstoreDictLUTDefrag(kvs, defragLUTTestCallback);
+        while ((cursor = kvstoreDictLUTDefrag(kvs, cursor, defragLUTTestCallback)) != 0) {}
         while (kvstoreIncrementallyRehash(kvs, 1000)) {}
         kvstoreRelease(kvs);
     }
diff --git a/src/kvstore.h b/src/kvstore.h
index 9e2e4afe0d2..8b9fd7348f8 100644
--- a/src/kvstore.h
+++ b/src/kvstore.h
@@ -1,3 +1,16 @@
+/*
+ * Copyright (c) 2009-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Copyright (c) 2024-present, Valkey contributors.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2) or the Server Side Public License v1 (SSPLv1).
+ *
+ * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information.
+ */
+
 #ifndef DICTARRAY_H_
 #define DICTARRAY_H_
 
@@ -78,7 +91,7 @@ unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, uns
 int kvstoreDictExpand(kvstore *kvs, int didx, unsigned long size);
 unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata);
 typedef dict *(kvstoreDictLUTDefragFunction)(dict *d);
-void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn);
+unsigned long kvstoreDictLUTDefrag(kvstore *kvs, unsigned long cursor, kvstoreDictLUTDefragFunction *defragfn);
 void *kvstoreDictFetchValue(kvstore *kvs, int didx, const void *key);
 dictEntry *kvstoreDictFind(kvstore *kvs, int didx, void *key);
 dictEntry *kvstoreDictAddRaw(kvstore *kvs, int didx, void *key, dictEntry **existing);
diff --git a/src/module.c b/src/module.c
index 59ae99e70e8..8ecd23862ba 100644
--- a/src/module.c
+++ b/src/module.c
@@ -2,8 +2,13 @@
  * Copyright (c) 2016-Present, Redis Ltd.
  * All rights reserved.
  *
+ * Copyright (c) 2024-present, Valkey contributors.
+ * All rights reserved.
+ *
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2) or the Server Side Public License v1 (SSPLv1).
+ *
+ * Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information.
  */
 
 /* --------------------------------------------------------------------------
@@ -13782,7 +13787,7 @@ const char *RM_GetCurrentCommandName(RedisModuleCtx *ctx) {
  * defrag callback.
  */
 struct RedisModuleDefragCtx {
-    long long int endtime;
+    monotime endtime;
     unsigned long *cursor;
     struct redisObject *key; /* Optional name of key processed, NULL when unknown. */
     int dbid;                /* The dbid of the key being processed, -1 when unknown. */
@@ -13821,7 +13826,7 @@ int RM_RegisterDefragCallbacks(RedisModuleCtx *ctx, RedisModuleDefragFunc start,
  * so it generally makes sense to do small batches of work in between calls.
  */
 int RM_DefragShouldStop(RedisModuleDefragCtx *ctx) {
-    return (ctx->endtime != 0 && ctx->endtime < ustime());
+    return (ctx->endtime != 0 && ctx->endtime <= getMonotonicUs());
 }
 
 /* Store an arbitrary cursor value for future re-use.
@@ -13929,7 +13934,7 @@ RedisModuleString *RM_DefragRedisModuleString(RedisModuleDefragCtx *ctx, RedisMo
  * Returns a zero value (and initializes the cursor) if no more needs to be done,
  * or a non-zero value otherwise.
  */
-int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, long long endtime, int dbid) {
+int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid) {
     moduleValue *mv = value->ptr;
     moduleType *mt = mv->type;
 
diff --git a/src/server.c b/src/server.c
index 48936ac53e1..9e1250ca04b 100644
--- a/src/server.c
+++ b/src/server.c
@@ -1637,25 +1637,7 @@ void whileBlockedCron(void) {
     mstime_t latency;
     latencyStartMonitor(latency);
 
-    /* In some cases we may be called with big intervals, so we may need to do
-     * extra work here. This is because some of the functions in serverCron rely
-     * on the fact that it is performed every 10 ms or so. For instance, if
-     * activeDefragCycle needs to utilize 25% cpu, it will utilize 2.5ms, so we
-     * need to call it multiple times. */
-    long hz_ms = 1000/server.hz;
-    while (server.blocked_last_cron < server.mstime) {
-
-        /* Defrag keys gradually. */
-        activeDefragCycle();
-
-        server.blocked_last_cron += hz_ms;
-
-        /* Increment cronloop so that run_with_period works. */
-        server.cronloops++;
-    }
-
-    /* Other cron jobs do not need to be done in a loop. No need to check
-     * server.blocked_last_cron since we have an early exit at the top. */
+    defragWhileBlocked();
 
     /* Update memory stats during loading (excluding blocked scripts) */
     if (server.loading) cronUpdateMemoryStats();
@@ -2758,8 +2740,6 @@ void initServer(void) {
         server.db[j].watched_keys = dictCreate(&keylistDictType);
         server.db[j].id = j;
         server.db[j].avg_ttl = 0;
-        server.db[j].defrag_later = listCreate();
-        listSetFreeMethod(server.db[j].defrag_later, sdsfreegeneric);
     }
     evictionPoolAlloc(); /* Initialize the LRU keys pool. */
     /* Note that server.pubsub_channels was chosen to be a kvstore (with only one dict, which
diff --git a/src/server.h b/src/server.h
index d65392b8cc6..a98f1aa8a26 100644
--- a/src/server.h
+++ b/src/server.h
@@ -1051,7 +1051,6 @@ typedef struct redisDb {
     int id;                     /* Database ID */
     long long avg_ttl;          /* Average TTL, just for stats */
     unsigned long expires_cursor; /* Cursor of the active expire cycle. */
-    list *defrag_later;         /* List of key names to attempt to defrag one by one, gradually. */
 } redisDb;
 
 /* forward declaration for functions ctx */
@@ -2675,7 +2674,7 @@ size_t moduleGetFreeEffort(robj *key, robj *val, int dbid);
 size_t moduleGetMemUsage(robj *key, robj *val, size_t sample_size, int dbid);
 robj *moduleTypeDupOrReply(client *c, robj *fromkey, robj *tokey, int todb, robj *value);
 int moduleDefragValue(robj *key, robj *obj, int dbid);
-int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, long long endtime, int dbid);
+int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid);
 void moduleDefragGlobals(void);
 void moduleDefragStart(void);
 void moduleDefragEnd(void);
@@ -3269,6 +3268,7 @@ void enterExecutionUnit(int update_cached_time, long long us);
 void exitExecutionUnit(void);
 void resetServerStats(void);
 void activeDefragCycle(void);
+void defragWhileBlocked(void);
 unsigned int getLRUClock(void);
 unsigned int LRU_CLOCK(void);
 const char *evictPolicyToString(void);
diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
index 92c1f572cfd..15b00e767e4 100644
--- a/tests/unit/memefficiency.tcl
+++ b/tests/unit/memefficiency.tcl
@@ -1,3 +1,16 @@
+#
+# Copyright (c) 2009-Present, Redis Ltd.
+# All rights reserved.
+#
+# Copyright (c) 2024-present, Valkey contributors.
+# All rights reserved.
+#
+# Licensed under your choice of the Redis Source Available License 2.0
+# (RSALv2) or the Server Side Public License v1 (SSPLv1).
+#
+# Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information.
+#
+
 proc test_memory_efficiency {range} {
     r flushall
     set rd [redis_deferring_client]
@@ -37,15 +50,19 @@ start_server {tags {"memefficiency external:skip"}} {
 }
 
 run_solo {defrag} {
-    proc wait_for_defrag_stop {maxtries delay} {
+    proc wait_for_defrag_stop {maxtries delay {expect_frag 0}} {
         wait_for_condition $maxtries $delay {
-            [s active_defrag_running] eq 0
+            [s active_defrag_running] eq 0 && ($expect_frag == 0 || [s allocator_frag_ratio] <= $expect_frag)
         } else {
             after 120 ;# serverCron only updates the info once in 100ms
             puts [r info memory]
             puts [r info stats]
             puts [r memory malloc-stats]
-            fail "defrag didn't stop."
+            if {$expect_frag != 0} {
+                fail "defrag didn't stop or failed to achieve expected frag ratio ([s allocator_frag_ratio] > $expect_frag)"
+            } else {
+                fail "defrag didn't stop."
+            }
         }
     }
 
@@ -102,7 +119,7 @@ run_solo {defrag} {
                 r config set active-defrag-cycle-max 75
 
                 # Wait for the active defrag to stop working.
-                wait_for_defrag_stop 2000 100
+                wait_for_defrag_stop 2000 100 1.1
 
                 # Test the fragmentation is lower.
                 after 120 ;# serverCron only updates the info once in 100ms
@@ -124,7 +141,6 @@ run_solo {defrag} {
                     puts [r latency latest]
                     puts [r latency history active-defrag-cycle]
                 }
-                assert {$frag < 1.1}
                 # due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
                 # we expect max latency to be not much higher than 7.5ms but due to rare slowness threshold is set higher
                 if {!$::no_latency} {
@@ -142,6 +158,11 @@ run_solo {defrag} {
                 # reset stats and load the AOF file
                 r config resetstat
                 r config set key-load-delay -25 ;# sleep on average 1/25 usec
+                # Note: This test is checking if defrag is working DURING AOF loading (while
+                #       timers are not active).  So we don't give any extra time, and we deactivate
+                #       defrag immediately after the AOF loading is complete.  During loading,
+                #       defrag will get invoked less often, causing starvation prevention.  We
+                #       should expect longer latency measurements.
                 r debug loadaof
                 r config set activedefrag no
                 # measure hits and misses right after aof loading
@@ -246,7 +267,7 @@ run_solo {defrag} {
                 }
 
                 # wait for the active defrag to stop working
-                wait_for_defrag_stop 500 100
+                wait_for_defrag_stop 500 100 1.05
 
                 # test the fragmentation is lower
                 after 120 ;# serverCron only updates the info once in 100ms
@@ -256,7 +277,6 @@ run_solo {defrag} {
                     puts "frag [s allocator_frag_ratio]"
                     puts "frag_bytes [s allocator_frag_bytes]"
                 }
-                assert_lessthan_equal [s allocator_frag_ratio] 1.05
             }
             # Flush all script to make sure we don't crash after defragging them
             r script flush sync
@@ -362,7 +382,7 @@ run_solo {defrag} {
                 }
 
                 # wait for the active defrag to stop working
-                wait_for_defrag_stop 500 100
+                wait_for_defrag_stop 500 100 1.1
 
                 # test the fragmentation is lower
                 after 120 ;# serverCron only updates the info once in 100ms
@@ -384,7 +404,6 @@ run_solo {defrag} {
                     puts [r latency latest]
                     puts [r latency history active-defrag-cycle]
                 }
-                assert {$frag < 1.1}
                 # due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
                 # we expect max latency to be not much higher than 7.5ms but due to rare slowness threshold is set higher
                 if {!$::no_latency} {
@@ -464,7 +483,7 @@ run_solo {defrag} {
                 }
 
                 # wait for the active defrag to stop working
-                wait_for_defrag_stop 500 100
+                wait_for_defrag_stop 500 100 1.05
 
                 # test the fragmentation is lower
                 after 120 ;# serverCron only updates the info once in 100ms
@@ -474,7 +493,6 @@ run_solo {defrag} {
                     puts "frag [s allocator_frag_ratio]"
                     puts "frag_bytes [s allocator_frag_bytes]"
                 }
-                assert_lessthan_equal [s allocator_frag_ratio] 1.05
             }
 
             # Publishes some message to all the pubsub clients to make sure that
@@ -572,7 +590,7 @@ run_solo {defrag} {
                 }
 
                 # wait for the active defrag to stop working
-                wait_for_defrag_stop 500 100
+                wait_for_defrag_stop 500 100 1.5
 
                 # test the fragmentation is lower
                 after 120 ;# serverCron only updates the info once in 100ms
@@ -582,7 +600,6 @@ run_solo {defrag} {
                     puts "frag [s allocator_frag_ratio]"
                     puts "frag_bytes [s allocator_frag_bytes]"
                 }
-                assert_lessthan_equal [s allocator_frag_ratio] 1.5
             }
         }
 
@@ -682,7 +699,13 @@ run_solo {defrag} {
                 }
 
                 # wait for the active defrag to stop working
-                wait_for_defrag_stop 500 100
+                if {$io_threads == 1} {
+                    wait_for_defrag_stop 500 100 1.05
+                } else {
+                    # TODO: When multithreading is enabled, argv may be created in the io thread
+                    # and kept in the main thread, which can cause fragmentation to become worse.
+                    wait_for_defrag_stop 500 100 1.1
+                }
 
                 # test the fragmentation is lower
                 after 120 ;# serverCron only updates the info once in 100ms
@@ -692,14 +715,6 @@ run_solo {defrag} {
                     puts "frag [s allocator_frag_ratio]"
                     puts "frag_bytes [s allocator_frag_bytes]"
                 }
-
-                if {$io_threads == 1} {
-                    assert_lessthan_equal [s allocator_frag_ratio] 1.05
-                } else {
-                    # TODO: When multithreading is enabled, argv may be created in the io thread
-                    # and kept in the main thread, which can cause fragmentation to become worse.
-                    assert_lessthan_equal [s allocator_frag_ratio] 1.1
-                }
             }
         }
 
@@ -763,7 +778,7 @@ run_solo {defrag} {
                 }
 
                 # wait for the active defrag to stop working
-                wait_for_defrag_stop 500 100
+                wait_for_defrag_stop 500 100 1.1
 
                 # test the fragmentation is lower
                 after 120 ;# serverCron only updates the info once in 100ms
@@ -789,7 +804,6 @@ run_solo {defrag} {
                     puts [r latency history active-defrag-cycle]
                     puts [r memory malloc-stats]
                 }
-                assert {$frag < 1.1}
                 # due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
                 # we expect max latency to be not much higher than 7.5ms but due to rare slowness threshold is set higher
                 if {!$::no_latency} {
@@ -884,7 +898,7 @@ run_solo {defrag} {
                     }
 
                     # wait for the active defrag to stop working
-                    wait_for_defrag_stop 500 100
+                    wait_for_defrag_stop 500 100 1.1
 
                     # test the fragmentation is lower
                     after 120 ;# serverCron only updates the info once in 100ms
@@ -896,7 +910,6 @@ run_solo {defrag} {
                         puts "hits: $hits"
                         puts "misses: $misses"
                     }
-                    assert {$frag < 1.1}
                     assert {$misses < 10000000} ;# when defrag doesn't stop, we have some 30m misses, when it does, we have 2m misses
                 }