From 81c03bc2f36fe6acfc19b76c8051a19885940a93 Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Fri, 8 May 2026 16:25:37 +0800
Subject: [PATCH] Harden clone3, vfork, exec, and mmap edges

Several independent fixes surfaced while exercising real multi-threaded
fork/exec patterns. Each one was a real-world regression rather than
defensive cleanup.

clone3 stack lifetime

Worker threads spawned via clone3(stack, stack_size) now record their
guest stack range. sys_munmap() walks active threads under thread_lock
in two passes (validate + commit), defers the overlapping portion of any
live stack into a per-thread queue, and unmaps the remaining gaps
immediately. The collect/commit/finish/rollback transaction marks each
affected thread busy; concurrent collects on the same thread cond_wait
until the in-flight transaction releases. On thread exit,
mem_cleanup_deferred_stack_unmaps() waits for busy=0, clears the live
stack range, snapshots the queue, drains entries one by one, and drops
successfully unmapped ones; failed unmaps stay in the queue and log at
error level rather than silently leaking. The drain runs before the
CLONE_CHILD_CLEARTID futex wake so a joiner cannot reuse the freed VA
before the host page tables release it. Legacy clone() recovers the
range from the containing region via guest_region_find().

CLONE_VFORK with CLONE_VM

This combination previously took the in-process VM-clone path, which
would have reset the parent's guest_t on child execve. Now
CLONE_VM|CLONE_VFORK falls through to the posix_spawn helper-process
path, which spawns a child elfuse process and suspends the parent on a
notify pipe (--vfork-notify-fd) until the child execve()s or exits.
This matches Linux vfork semantics rather than blocking on host child
exit.

execve sysroot resolution

Open and ELF-load go through path_resolve_sysroot_path() into a
separate path_host buffer; proc_set_elf_path() still publishes the
guest-visible path so /proc/self/exe stays stable across re-exec under
--sysroot. PT_INTERP is resolved the same way. test-sysroot-procfs-exec
exercises the full path.

Low-address mmap hints

Non-fixed mmap with a hint in [ELF_DEFAULT_BASE, MMAP_BASE) probes the
low arena directly via find_free_gap_inner before falling back to the
high RW arena. box64 and other static-x86 toolchains reserve their
ET_EXEC image window at 0x400000 with a non-fixed hint and dereference
the address afterwards; forcing it into the high arena silently broke
them. The cached gap_hint is intentionally bypassed for the low probe
so unrelated allocations stay sequential up high.

brk page granularity

sys_brk now extends and updates page-table perms at GUEST_PAGE_SIZE
granularity rather than in 2MiB blocks. Because finalize_block_perms()
leaves non-covered pages of a split block invalid, brk-driven growth
must call guest_update_perms on the materialized range so heap pages
inside an already-split block become accessible.

fork IPC SCM_RIGHTS chunking

sendmsg/recvmsg fd transfers are chunked at FORK_IPC_FD_CHUNK=120 to
avoid the macOS per-cmsg fd limit; the receiver allocates its own
scratch buffer per chunk instead of borrowing CMSG_DATA. The backing-fd
send goes through the same helper and detects stale fds via
fcntl(F_GETFD) before attempting transfer.

Stable synthetic procfs identity

/proc/* and /dev/shm stat fills now report a constant PROC_SYNTH_DEV
and a 64-bit FNV-1a hash of the path as st_ino, plus st_blksize=4096.
Without this, directory walkers collapsed multiple synthetic paths onto
the same (dev, ino) pair and reported false filesystem loops.
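A reader-side sketch (not part of the diff): the synthetic-identity
scheme is small enough to show standalone. This mirrors the
PROC_SYNTH_DEV/proc_synth_ino() pair added in src/runtime/procemu.c
below; the seed and prime constants are copied from the patch, and the
mask keeps st_ino positive and nonzero the way Linux-visible inodes
are expected to be:

    #include <stdint.h>
    #include <sys/stat.h>

    #define PROC_SYNTH_DEV ((dev_t) 0x504f)

    /* Hash a synthetic path to a stable, positive, nonzero inode. */
    static ino_t proc_synth_ino(const char *path)
    {
        uint64_t h = 1469598103934665603ULL;    /* seed, as in the patch */
        for (const unsigned char *p = (const unsigned char *) path; *p; ++p) {
            h ^= (uint64_t) *p;
            h *= 1099511628211ULL;              /* 64-bit FNV prime */
        }
        h &= 0x7fffffffffffffffULL;             /* keep st_ino positive */
        return (h == 0) ? (ino_t) 1 : (ino_t) h;
    }

Distinct paths such as "/proc/self/stat" and "/proc/self/status" now
stat to the same st_dev but different st_ino, which is the property
directory walkers rely on for loop detection.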
getcpu(168)

Synthetic CPU=0, node=0; obsolete cache pointer ignored. Required for
glibc and file(1) to start on workloads that probe topology.

--timeout 0 disables the vCPU watchdog

parse_int_arg lower bound dropped from 1 to 0; timeout=0 lets the vCPU
run loop iterate without alarm() preemption, which CPU-bound guests
need.
---
 docs/usage.md            |   3 +-
 mk/tests.mk              |  15 ++
 src/main.c               |  17 +-
 src/runtime/fork-state.c | 201 +++++++++++------------
 src/runtime/forkipc.c    | 190 ++++++++++++++++++----
 src/runtime/forkipc.h    |   9 +-
 src/runtime/procemu.c    |  55 +++++--
 src/runtime/thread.c     | 284 ++++++++++++++++++++++++++++++++-
 src/runtime/thread.h     |  85 +++++++++-
 src/syscall/abi.h        |   1 +
 src/syscall/dispatch.tbl |   1 +
 src/syscall/exec.c       |  41 ++++-
 src/syscall/mem.c        | 202 +++++++++++++++++------
 src/syscall/mem.h        |   3 +
 src/syscall/sys.c        |  19 +++
 src/syscall/sys.h        |   4 +
 src/syscall/syscall.c    |   4 +-
 tests/manifest.txt       |   1 +
 tests/test-clone3.c      | 336 ++++++++++++++++++++++++++++++++++++++-
 tests/test-mmap-hint.c   |  77 +++++++++
 tests/test-thread.c      |  72 +++++++++
 tests/test-tier-b.c      |  58 +++++++
 22 files changed, 1463 insertions(+), 215 deletions(-)
 create mode 100644 tests/test-mmap-hint.c

diff --git a/docs/usage.md b/docs/usage.md
index ae79366..4cf6946 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -16,7 +16,7 @@ Supported user-facing options:
 | `-h`, `--help` | Print built-in usage help |
 | `-V`, `--version` | Print the build version and exit |
 | `-v`, `--verbose` | Enable syscall-level and loader diagnostics |
-| `-t`, `--timeout N` | Per-iteration vCPU watchdog, in seconds (default `10`) |
+| `-t`, `--timeout N` | Per-iteration vCPU watchdog, in seconds (default `10`, `0` disables) |
 | `--sysroot PATH` | Resolve guest absolute paths under `PATH` first |
 | `--gdb PORT` | Listen for a GDB RSP client on `PORT` |
 | `--gdb-stop-on-entry` | Stop before the first guest instruction |
@@ -25,6 +25,7 @@
 `--timeout` is a run-loop watchdog. It does not cap total process runtime.
 It only bounds a single `hv_vcpu_run()` iteration before the host regains
 control, which is what allows host-side timers and signals to be observed
 promptly.
+Setting `--timeout 0` disables this watchdog for long-running CPU-bound guests.
 
 ## Common Launch Patterns
diff --git a/mk/tests.mk b/mk/tests.mk
index df17af1..bddd0e8 100644
--- a/mk/tests.mk
+++ b/mk/tests.mk
@@ -6,6 +6,7 @@
 	test-glibc-coreutils test-perf \
 	test-matrix test-matrix-elfuse-aarch64 test-matrix-qemu-aarch64 \
 	test-full test-multi-vcpu test-rwx test-sysroot-rename \
+	test-sysroot-procfs-exec test-timeout-disable \
 	test-sysroot-nofollow perf
 
 ## Build and run the assembly hello world test
@@ -18,6 +19,10 @@ check: $(ELFUSE_BIN) $(TEST_DEPS)
 	@bash tests/driver.sh -e $(ELFUSE_BIN) -d $(TEST_DIR) -v
 	@printf "\n$(BLUE)━━━ busybox applet validation ━━━$(RESET)\n"
 	@$(MAKE) --no-print-directory test-busybox
+	@printf "\n$(BLUE)━━━ sysroot procfs exec validation ━━━$(RESET)\n"
+	@$(MAKE) --no-print-directory test-sysroot-procfs-exec
+	@printf "\n$(BLUE)━━━ timeout=0 validation ━━━$(RESET)\n"
+	@$(MAKE) --no-print-directory test-timeout-disable
 
 test-sysroot-rename: $(ELFUSE_BIN) $(BUILD_DIR)/test-sysroot-rename
 	@tmpdir=$$(mktemp -d); \
@@ -42,6 +47,16 @@ test-sysroot-nofollow: $(ELFUSE_BIN) $(BUILD_DIR)/test-sysroot-nofollow
 	ln -sf /outside-target "$$tmpdir/tmp/elfuse-sysroot-nofollow-link"; \
 	$(ELFUSE_BIN) --sysroot "$$tmpdir" $(BUILD_DIR)/test-sysroot-nofollow
 
+test-sysroot-procfs-exec: $(ELFUSE_BIN) $(BUILD_DIR)/test-procfs-exec
+	@tmpdir=$$(mktemp -d); \
+	trap 'rm -rf "$$tmpdir"' EXIT; \
+	mkdir -p "$$tmpdir/bin"; \
+	cp $(BUILD_DIR)/test-procfs-exec "$$tmpdir/bin/test-procfs-exec"; \
+	$(ELFUSE_BIN) --sysroot "$$tmpdir" "$$tmpdir/bin/test-procfs-exec"
+
+test-timeout-disable: $(ELFUSE_BIN) $(TEST_HELLO_DEP)
+	@$(ELFUSE_BIN) --timeout 0 $(TEST_DIR)/test-hello > /dev/null
+
 ## Run GDB stub integration tests (LLDB <-> elfuse gdbstub)
 test-gdbstub: $(ELFUSE_BIN) $(TEST_DIR)/test-hello
 	@bash tests/test-gdbstub.sh -e $(ELFUSE_BIN) -v
diff --git a/src/main.c b/src/main.c
index fa7ce52..a4966ff 100644
--- a/src/main.c
+++ b/src/main.c
@@ -80,7 +80,7 @@ int main(int argc, char **argv)
     log_init();
 
     bool verbose = false;
-    int timeout_sec = 10, fork_child_fd = -1;
+    int timeout_sec = 10, fork_child_fd = -1, vfork_notify_fd = -1;
     const char *sysroot = NULL;
     int gdb_port = 0;
     bool gdb_stop_on_entry = false;
@@ -103,7 +103,7 @@ int main(int argc, char **argv)
             " -V, --version    Show version and exit\n"
             " -v, --verbose    Trace each guest syscall\n"
             " --timeout N      Per-iteration vCPU run timeout "
-            "(seconds, default 10)\n"
+            "(seconds, default 10; 0 disables)\n"
            " --sysroot PATH   Resolve absolute guest paths under "
             "PATH first\n"
             " --gdb PORT       Listen for GDB Remote Serial "
@@ -124,7 +124,7 @@ int main(int argc, char **argv)
         } else if ((!strcmp(argv[arg_start], "--timeout") ||
                     !strcmp(argv[arg_start], "-t")) &&
                    arg_start + 1 < argc) {
-            if (parse_int_arg(argv[arg_start + 1], 1, INT_MAX, &timeout_sec) <
+            if (parse_int_arg(argv[arg_start + 1], 0, INT_MAX, &timeout_sec) <
                 0)
                 timeout_sec = 10;
             arg_start += 2;
@@ -136,6 +136,14 @@ int main(int argc, char **argv)
                 return 1;
             }
             arg_start += 2;
+        } else if (!strcmp(argv[arg_start], "--vfork-notify-fd") &&
+                   arg_start + 1 < argc) {
+            if (parse_int_arg(argv[arg_start + 1], 0, INT_MAX,
+                              &vfork_notify_fd) < 0) {
+                log_error("invalid vfork notify fd: %s", argv[arg_start + 1]);
+                return 1;
+            }
+            arg_start += 2;
         } else if (!strcmp(argv[arg_start], "--sysroot") &&
                    arg_start + 1 < argc) {
             sysroot = argv[arg_start + 1];
@@ -166,7 +174,8 @@ int main(int argc, char **argv)
 
     /* Fork-child mode: receive VM state over IPC and run */
     if (fork_child_fd >= 0)
-        return fork_child_main(fork_child_fd, verbose, timeout_sec);
+        return fork_child_main(fork_child_fd, vfork_notify_fd, verbose,
+                               timeout_sec);
 
     if (arg_start >= argc) {
         log_error(
diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c
index 5fd7b08..1d56c38 100644
--- a/src/runtime/fork-state.c
+++ b/src/runtime/fork-state.c
@@ -56,72 +56,95 @@ int fork_ipc_read_all(int fd, void *buf, size_t len)
     return 0;
 }
 
+/* macOS rejects overly large SCM_RIGHTS payloads with EINVAL. Keep each control
+ * message comfortably below that limit and stream large fd sets in multiple
+ * chunks.
+ */
+#define FORK_IPC_FD_CHUNK 120
+
 int fork_ipc_send_fds(int sock, const int *fds, int count)
 {
     if (count <= 0)
         return 0;
 
-    char dummy = 'F';
-    struct iovec iov = {.iov_base = &dummy, .iov_len = 1};
-    size_t cmsg_size = CMSG_SPACE(count * sizeof(int));
-    uint8_t *cmsg_buf = calloc(1, cmsg_size);
-    if (!cmsg_buf)
-        return -1;
+    int sent = 0;
+    while (sent < count) {
+        int chunk = count - sent;
+        if (chunk > FORK_IPC_FD_CHUNK)
+            chunk = FORK_IPC_FD_CHUNK;
+
+        char dummy = 'F';
+        struct iovec iov = {.iov_base = &dummy, .iov_len = 1};
+        size_t cmsg_size = CMSG_SPACE((size_t) chunk * sizeof(int));
+        uint8_t *cmsg_buf = calloc(1, cmsg_size);
+        if (!cmsg_buf)
+            return -1;
 
-    struct msghdr msg = {0};
-    msg.msg_iov = &iov;
-    msg.msg_iovlen = 1;
-    msg.msg_control = cmsg_buf;
-    msg.msg_controllen = cmsg_size;
+        struct msghdr msg = {0};
+        msg.msg_iov = &iov;
+        msg.msg_iovlen = 1;
+        msg.msg_control = cmsg_buf;
+        msg.msg_controllen = cmsg_size;
 
-    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
-    cmsg->cmsg_level = SOL_SOCKET;
-    cmsg->cmsg_type = SCM_RIGHTS;
-    cmsg->cmsg_len = CMSG_LEN(count * sizeof(int));
-    memcpy(CMSG_DATA(cmsg), fds, count * sizeof(int));
+        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+        cmsg->cmsg_level = SOL_SOCKET;
+        cmsg->cmsg_type = SCM_RIGHTS;
+        cmsg->cmsg_len = CMSG_LEN((size_t) chunk * sizeof(int));
+        memcpy(CMSG_DATA(cmsg), fds + sent, (size_t) chunk * sizeof(int));
 
-    ssize_t ret = sendmsg(sock, &msg, 0);
-    free(cmsg_buf);
-    return ret < 0 ? -1 : 0;
+        ssize_t ret = sendmsg(sock, &msg, 0);
+        free(cmsg_buf);
+        if (ret < 0)
+            return -1;
+        sent += chunk;
+    }
+    return 0;
 }
 
 int fork_ipc_recv_fds(int sock, int *fds, int max_count, int *out_count)
 {
-    char dummy;
-    struct iovec iov = {.iov_base = &dummy, .iov_len = 1};
-    size_t cmsg_size = CMSG_SPACE(max_count * sizeof(int));
-    uint8_t *cmsg_buf = calloc(1, cmsg_size);
-    if (!cmsg_buf)
-        return -1;
+    *out_count = 0;
+    while (*out_count < max_count) {
+        int chunk_max = max_count - *out_count;
+        if (chunk_max > FORK_IPC_FD_CHUNK)
+            chunk_max = FORK_IPC_FD_CHUNK;
+
+        char dummy;
+        struct iovec iov = {.iov_base = &dummy, .iov_len = 1};
+        size_t cmsg_size = CMSG_SPACE((size_t) chunk_max * sizeof(int));
+        uint8_t *cmsg_buf = calloc(1, cmsg_size);
+        if (!cmsg_buf)
+            return -1;
 
-    struct msghdr msg = {0};
-    msg.msg_iov = &iov;
-    msg.msg_iovlen = 1;
-    msg.msg_control = cmsg_buf;
-    msg.msg_controllen = cmsg_size;
+        struct msghdr msg = {0};
+        msg.msg_iov = &iov;
+        msg.msg_iovlen = 1;
+        msg.msg_control = cmsg_buf;
+        msg.msg_controllen = cmsg_size;
 
-    ssize_t ret = recvmsg(sock, &msg, 0);
-    if (ret < 0) {
-        free(cmsg_buf);
-        return -1;
-    }
-
-    *out_count = 0;
-    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
-    if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
-        cmsg->cmsg_type == SCM_RIGHTS) {
-        if (cmsg->cmsg_len < CMSG_LEN(0)) {
+        ssize_t ret = recvmsg(sock, &msg, 0);
+        if (ret < 0) {
             free(cmsg_buf);
             return -1;
         }
+        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+        if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
+            cmsg->cmsg_type != SCM_RIGHTS || cmsg->cmsg_len < CMSG_LEN(0) ||
+            (msg.msg_flags & MSG_CTRUNC)) {
+            free(cmsg_buf);
+            return -1;
+        }
+
         int n = (int) ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int));
-        if (n > max_count)
-            n = max_count;
-        memcpy(fds, CMSG_DATA(cmsg), n * sizeof(int));
-        *out_count = n;
-    }
+        if (n <= 0 || n > chunk_max) {
+            free(cmsg_buf);
+            return -1;
+        }
 
-    free(cmsg_buf);
+        memcpy(fds + *out_count, CMSG_DATA(cmsg), (size_t) n * sizeof(int));
+        *out_count += n;
+        free(cmsg_buf);
+    }
     return 0;
 }
 
@@ -379,8 +402,14 @@ static int fork_ipc_send_backing_fds(int ipc_sock,
     uint32_t nbacking = 0;
 
     for (uint32_t i = 0; i < num_guest_regions; i++) {
-        if (regions_snapshot[i].backing_fd >= 0)
+        if (regions_snapshot[i].backing_fd >= 0) {
+            if (fcntl(regions_snapshot[i].backing_fd, F_GETFD) < 0) {
+                log_error("clone: region %u carries stale backing_fd=%d: %s", i,
+                          regions_snapshot[i].backing_fd, strerror(errno));
+                return -1;
+            }
             backing_fds[nbacking++] = regions_snapshot[i].backing_fd;
+        }
     }
 
     if (fork_ipc_write_all(ipc_sock, &nbacking, sizeof(nbacking)) < 0)
@@ -388,27 +417,13 @@ static int fork_ipc_send_backing_fds(int ipc_sock,
     if (nbacking == 0)
         return 0;
 
-    char dummy = 'B';
-    struct iovec iov = {.iov_base = &dummy, .iov_len = 1};
-    size_t cmsg_sz = CMSG_SPACE(nbacking * sizeof(int));
-    uint8_t *cmsg_buf = calloc(1, cmsg_sz);
-    if (!cmsg_buf)
-        return -1;
-
-    struct msghdr msg = {
-        .msg_iov = &iov,
-        .msg_iovlen = 1,
-        .msg_control = cmsg_buf,
-        .msg_controllen = cmsg_sz,
-    };
-    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
-    cmsg->cmsg_level = SOL_SOCKET;
-    cmsg->cmsg_type = SCM_RIGHTS;
-    cmsg->cmsg_len = CMSG_LEN(nbacking * sizeof(int));
-    memcpy(CMSG_DATA(cmsg), backing_fds, nbacking * sizeof(int));
-    int ret = sendmsg(ipc_sock, &msg, 0);
-    free(cmsg_buf);
-    return ret < 0 ? -1 : 0;
+    log_debug("clone: sending %u backing fds for %u regions", nbacking,
+              num_guest_regions);
+    if (fork_ipc_send_fds(ipc_sock, backing_fds, (int) nbacking) < 0) {
+        log_error("clone: send backing fds failed: %s", strerror(errno));
+        return -1;
+    }
+    return 0;
 }
 
 int fork_ipc_send_process_state(int ipc_sock,
@@ -507,45 +522,17 @@ static int fork_ipc_recv_backing_fds(int ipc_fd,
     if (nbacking == 0 || nbacking > GUEST_MAX_REGIONS)
         return 0;
 
-    char dummy;
-    struct iovec iov = {.iov_base = &dummy, .iov_len = 1};
-    size_t cmsg_sz = CMSG_SPACE(nbacking * sizeof(int));
-    uint8_t *cmsg_buf = calloc(1, cmsg_sz);
-    if (!cmsg_buf)
-        return -1;
-
-    struct msghdr msg = {
-        .msg_iov = &iov,
-        .msg_iovlen = 1,
-        .msg_control = cmsg_buf,
-        .msg_controllen = cmsg_sz,
-    };
-    ssize_t nr = recvmsg(ipc_fd, &msg, 0);
-    if (nr <= 0) {
-        free(cmsg_buf);
+    int *region_fds = calloc(nbacking, sizeof(int));
+    if (!region_fds)
         return -1;
-    }
-
-    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
-    if (msg.msg_flags & MSG_CTRUNC) {
-        log_error("fork-child: backing fd SCM_RIGHTS payload truncated");
-        free(cmsg_buf);
-        return -1;
-    }
-    if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
-        cmsg->cmsg_type != SCM_RIGHTS) {
-        log_error("fork-child: missing backing fd SCM_RIGHTS payload");
-        free(cmsg_buf);
-        return -1;
-    }
-    if (cmsg->cmsg_len < CMSG_LEN(0)) {
-        free(cmsg_buf);
+    int received_count = 0;
+    if (fork_ipc_recv_fds(ipc_fd, region_fds, (int) nbacking, &received_count) <
+        0) {
+        log_error("fork-child: failed to receive backing fds");
+        free(region_fds);
         return -1;
     }
-
-    int *region_fds = (int *) CMSG_DATA(cmsg);
-    uint32_t nreceived =
-        (uint32_t) ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int));
+    uint32_t nreceived = (uint32_t) received_count;
 
     uint32_t fi = 0;
     /* Sender (fork_ipc_send_backing_fds) iterates regions and sends one fd per
@@ -572,10 +559,10 @@ static int fork_ipc_recv_backing_fds(int ipc_fd,
     if (nreceived != nbacking) {
         log_error("fork-child: expected %u backing fds but received %u",
                   nbacking, nreceived);
-        free(cmsg_buf);
+        free(region_fds);
         return -1;
     }
-    free(cmsg_buf);
+    free(region_fds);
 
     return 0;
 }
diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c
index c12a760..0803503 100644
--- a/src/runtime/forkipc.c
+++ b/src/runtime/forkipc.c
@@ -46,7 +46,26 @@
 
 /* fork_child_main. */
 
-int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
+static int fork_child_vfork_notify_fd = -1;
+
+void fork_notify_vfork_exec(void)
+{
+    if (fork_child_vfork_notify_fd < 0)
+        return;
+
+    char byte = 'X';
+    ssize_t n;
+    do {
+        n = write(fork_child_vfork_notify_fd, &byte, 1);
+    } while (n < 0 && errno == EINTR);
+    close(fork_child_vfork_notify_fd);
+    fork_child_vfork_notify_fd = -1;
+}
+
+int fork_child_main(int ipc_fd,
+                    int vfork_notify_fd,
+                    bool verbose,
+                    int timeout_sec)
 {
     /* Reinitialize logging after posix_spawn (mutex state is undefined). */
     log_init();
@@ -57,6 +76,7 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
      * snapshot so the incoming metadata survives child restore.
      */
     proc_init();
+    fork_child_vfork_notify_fd = vfork_notify_fd;
 
    /* The header fixes the IPC protocol version and the guest identity before
     * any variable-length state is trusted.
@@ -163,17 +183,20 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
     }
 
     if (fork_ipc_recv_memory_regions(ipc_fd, &g) < 0) {
+        log_error("fork-child: failed to receive memory regions");
         guest_destroy(&g);
         return 1;
     }
 
     if (fork_ipc_recv_fd_table(ipc_fd, &g) < 0) {
+        log_error("fork-child: failed to receive fd table");
         guest_destroy(&g);
         return 1;
     }
 
     signal_state_t sig;
     if (fork_ipc_recv_process_state(ipc_fd, &g, &sig) < 0) {
+        log_error("fork-child: failed to receive process state");
         guest_destroy(&g);
         return 1;
     }
@@ -298,6 +321,32 @@ typedef struct {
     vcpu_simd_state_t simd_state;
 } thread_create_args_t;
 
+static void resolve_clone_stack_range(const guest_t *g,
+                                      uint64_t child_stack,
+                                      uint64_t *start_out,
+                                      uint64_t *end_out)
+{
+    if (start_out)
+        *start_out = 0;
+    if (end_out)
+        *end_out = 0;
+    if (!g || !child_stack || child_stack <= g->ipa_base)
+        return;
+
+    uint64_t sp_off = child_stack - g->ipa_base;
+    if (sp_off == 0 || sp_off > g->guest_size)
+        return;
+
+    const guest_region_t *r = guest_region_find(g, sp_off - 1);
+    if (!r)
+        return;
+
+    if (start_out)
+        *start_out = r->start;
+    if (end_out)
+        *end_out = r->end;
+}
+
 /* Forward declaration: worker entry runs after sys_clone_thread */
 static void *thread_create_and_run(void *arg);
 
@@ -305,6 +354,8 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
                                 guest_t *g,
                                 uint64_t flags,
                                 uint64_t child_stack,
+                                uint64_t stack_map_start,
+                                uint64_t stack_map_end,
                                 uint64_t ptid_gva,
                                 uint64_t tls,
                                 uint64_t ctid_gva,
@@ -314,7 +365,11 @@ static int64_t sys_clone_thread(hv_vcpu_t parent_vcpu,
     int64_t child_tid = proc_alloc_pid();
 
     /* Allocate thread table slot */
-    thread_entry_t *t = thread_alloc(child_tid);
+    if (stack_map_start >= stack_map_end)
+        resolve_clone_stack_range(g, child_stack, &stack_map_start,
+                                  &stack_map_end);
+
+    thread_entry_t *t = thread_alloc(child_tid, stack_map_start, stack_map_end);
     if (!t) {
         log_error("clone_thread: thread table full");
         return -LINUX_EAGAIN;
@@ -513,12 +568,17 @@ static void *thread_create_and_run(void *arg)
     /* CLONE_CHILD_CLEARTID: write 0 to the address and wake one waiter.
      * This is how pthread_join works in musl: the joining thread does
      * FUTEX_WAIT on this address until it becomes 0.
+     *
+     * Drain any deferred munmap of this thread's stack before waking the
+     * joiner: the parent may reuse the freed VA as soon as it returns from
+     * pthread_join, and reuse must not race with the deferred unmap.
      */
+    bool wake_ctid = false;
     if (t->clear_child_tid != 0) {
         uint32_t zero = 0;
         if (guest_write_small(g, t->clear_child_tid, &zero, sizeof(zero)) ==
             0) {
-            futex_wake_one(g, t->clear_child_tid);
+            wake_ctid = true;
         } else {
             log_warn(
                 "thread tid=%lld clear_child_tid "
@@ -527,6 +587,9 @@ static void *thread_create_and_run(void *arg)
                 (unsigned long long) t->clear_child_tid);
         }
     }
+    mem_cleanup_deferred_stack_unmaps(g, t);
+    if (wake_ctid)
+        futex_wake_one(g, t->clear_child_tid);
 
     log_debug("thread tid=%lld exiting", (long long) t->guest_tid);
 
@@ -561,6 +624,8 @@ static int64_t sys_clone_vm(hv_vcpu_t parent_vcpu,
                             guest_t *g,
                             uint64_t flags,
                             uint64_t child_stack,
+                            uint64_t stack_map_start,
+                            uint64_t stack_map_end,
                             uint64_t ptid_gva,
                             uint64_t tls,
                             uint64_t ctid_gva,
@@ -570,7 +635,11 @@ static int64_t sys_clone_vm(hv_vcpu_t parent_vcpu,
     int64_t child_tid = proc_alloc_pid();
 
     /* Allocate thread table slot */
-    thread_entry_t *t = thread_alloc(child_tid);
+    if (stack_map_start >= stack_map_end)
+        resolve_clone_stack_range(g, child_stack, &stack_map_start,
+                                  &stack_map_end);
+
+    thread_entry_t *t = thread_alloc(child_tid, stack_map_start, stack_map_end);
     if (!t) {
         log_error("clone_vm: thread table full");
         return -LINUX_EAGAIN;
@@ -752,14 +821,27 @@ static void *vm_clone_thread_run(void *arg)
 
     int exit_code = vcpu_run_loop(vcpu, vexit, g, verbose, 0);
 
-    /* CLONE_CHILD_CLEARTID cleanup */
+    /* CLONE_CHILD_CLEARTID cleanup. Same ordering as thread_entry: drain
+     * deferred stack munmaps before waking the joiner so the parent does
+     * not reuse the VA before it is released.
+     */
+    bool wake_ctid = false;
     if (t->clear_child_tid != 0) {
         uint32_t zero = 0;
         if (guest_write_small(g, t->clear_child_tid, &zero, sizeof(zero)) ==
             0) {
-            futex_wake_one(g, t->clear_child_tid);
+            wake_ctid = true;
+        } else {
+            log_warn(
+                "vm_clone tid=%lld clear_child_tid "
+                "write failed (gva=0x%llx)",
+                (long long) t->guest_tid,
+                (unsigned long long) t->clear_child_tid);
         }
     }
+    mem_cleanup_deferred_stack_unmaps(g, t);
+    if (wake_ctid)
+        futex_wake_one(g, t->clear_child_tid);
 
     /* Mark exit status for parent's wait4 to collect.
      * vm_exit_status uses wait-format: (exit_code << 8) for normal exit.
@@ -806,6 +888,8 @@ int64_t sys_clone(hv_vcpu_t vcpu,
                   guest_t *g,
                   uint64_t flags,
                   uint64_t child_stack,
+                  uint64_t stack_map_start,
+                  uint64_t stack_map_end,
                   uint64_t ptid_gva,
                   uint64_t tls,
                   uint64_t ctid_gva,
@@ -813,17 +897,9 @@ int64_t sys_clone(hv_vcpu_t vcpu,
 {
     /* CLONE_THREAD: create a new thread in the same VM (not a new process) */
     if (flags & LINUX_CLONE_THREAD) {
-        return sys_clone_thread(vcpu, g, flags, child_stack, ptid_gva, tls,
-                                ctid_gva, verbose);
-    }
-
-    /* CLONE_VM without CLONE_THREAD: create an in-process VM-clone child.
-     * The child shares guest memory but has a separate TID and is waitable
-     * via wait4/ptrace.
-     */
-    if ((flags & LINUX_CLONE_VM) && !(flags & LINUX_CLONE_THREAD)) {
-        return sys_clone_vm(vcpu, g, flags, child_stack, ptid_gva, tls,
-                            ctid_gva, verbose);
+        return sys_clone_thread(vcpu, g, flags, child_stack, stack_map_start,
+                                stack_map_end, ptid_gva, tls, ctid_gva,
+                                verbose);
     }
 
     /* elfuse only supports fork-like clone (SIGCHLD) and posix_spawn-like
@@ -831,6 +907,18 @@ int64_t sys_clone(hv_vcpu_t vcpu,
      */
     bool is_vfork = (flags & LINUX_CLONE_VFORK) != 0;
 
+    /* CLONE_VM without CLONE_THREAD usually creates an in-process VM-clone
+     * child that shares guest memory and is waitable via wait4/ptrace.
+     * However CLONE_VFORK must go through the helper-process path below so the
+     * child's later execve replaces only the child image rather than resetting
+     * the parent's shared guest_t.
+     */
+    if ((flags & LINUX_CLONE_VM) && !(flags & LINUX_CLONE_THREAD) &&
+        !is_vfork) {
+        return sys_clone_vm(vcpu, g, flags, child_stack, stack_map_start,
+                            stack_map_end, ptid_gva, tls, ctid_gva, verbose);
+    }
+
     log_debug("clone(flags=0x%llx, vfork=%d)", (unsigned long long) flags,
               is_vfork);
 
@@ -838,10 +926,17 @@ int64_t sys_clone(hv_vcpu_t vcpu,
      * and SCM_RIGHTS file descriptors to the fork-child process.
      */
     int sock_fds[2];
+    int vfork_notify_fds[2] = {-1, -1};
     if (socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds) < 0) {
         log_error("clone: socketpair failed: %s", strerror(errno));
         return -LINUX_ENOMEM;
     }
+    if (is_vfork && pipe(vfork_notify_fds) < 0) {
+        log_error("clone: vfork notify pipe failed: %s", strerror(errno));
+        close(sock_fds[0]);
+        close(sock_fds[1]);
+        return -LINUX_ENOMEM;
+    }
 
     /* Spawn the same elfuse binary so the child has the same entitlement and
      * build as the parent.
@@ -862,13 +957,20 @@ int64_t sys_clone(hv_vcpu_t vcpu,
     snprintf(fd_str, sizeof(fd_str), "%d", sock_fds[1]);
 
     /* argv is intentionally minimal; guest argv is restored later from IPC. */
-    char *child_argv[6];
+    char notify_fd_str[32];
+    char *child_argv[8];
     int ci = 0;
     child_argv[ci++] = self_path;
     if (verbose)
         child_argv[ci++] = "--verbose";
     child_argv[ci++] = "--fork-child";
     child_argv[ci++] = fd_str;
+    if (is_vfork) {
+        snprintf(notify_fd_str, sizeof(notify_fd_str), "%d",
+                 vfork_notify_fds[1]);
+        child_argv[ci++] = "--vfork-notify-fd";
+        child_argv[ci++] = notify_fd_str;
+    }
     child_argv[ci] = NULL;
 
     /* Set up spawn attributes: close all inherited FDs by default.
@@ -890,6 +992,9 @@ int64_t sys_clone(hv_vcpu_t vcpu,
     posix_spawn_file_actions_addinherit_np(&file_actions, STDOUT_FILENO);
     posix_spawn_file_actions_addinherit_np(&file_actions, STDERR_FILENO);
     posix_spawn_file_actions_addinherit_np(&file_actions, sock_fds[1]);
+    if (is_vfork)
+        posix_spawn_file_actions_addinherit_np(&file_actions,
+                                               vfork_notify_fds[1]);
 
     extern char **environ;
     pid_t child_host_pid;
@@ -902,11 +1007,17 @@ int64_t sys_clone(hv_vcpu_t vcpu,
         log_error("clone: posix_spawn failed: %s", strerror(spawn_ret));
         close(sock_fds[0]);
         close(sock_fds[1]);
+        if (vfork_notify_fds[0] >= 0)
+            close(vfork_notify_fds[0]);
+        if (vfork_notify_fds[1] >= 0)
+            close(vfork_notify_fds[1]);
         return -LINUX_ENOMEM;
     }
 
     /* The parent keeps only its end of the control channel. */
     close(sock_fds[1]);
+    if (vfork_notify_fds[1] >= 0)
+        close(vfork_notify_fds[1]);
     int ipc_sock = sock_fds[0];
 
     /* Allocate guest PID before serialization so the child header carries its
@@ -1049,6 +1160,8 @@ int64_t sys_clone(hv_vcpu_t vcpu,
     ipc_registers_t regs = {0};
     regs.elr_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_ELR_EL1);
     regs.sp_el0 = vcpu_get_sysreg(vcpu, HV_SYS_REG_SP_EL0);
+    if (child_stack)
+        regs.sp_el0 = child_stack;
     regs.spsr_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_SPSR_EL1);
     regs.vbar_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_VBAR_EL1);
     regs.ttbr0_el1 = vcpu_get_sysreg(vcpu, HV_SYS_REG_TTBR0_EL1);
@@ -1066,8 +1179,10 @@ int64_t sys_clone(hv_vcpu_t vcpu,
         log_error("clone: failed to send registers");
         goto fail_snapshot;
     }
-    if (fork_ipc_send_memory_regions(ipc_sock, g, use_shm) < 0)
+    if (fork_ipc_send_memory_regions(ipc_sock, g, use_shm) < 0) {
+        log_error("clone: failed to send memory regions");
         goto fail_snapshot;
+    }
 
     /* Snapshot the semantic region array before resuming siblings.
      * Siblings may mmap/munmap/mprotect after resume, so the code needs a
@@ -1085,13 +1200,17 @@
         memcpy(regions_snapshot, g->regions, snap_sz);
     }
 
-    if (fork_ipc_send_fd_table(ipc_sock) < 0)
+    if (fork_ipc_send_fd_table(ipc_sock) < 0) {
+        log_error("clone: failed to send fd table");
         goto fail_snapshot;
+    }
 
     uint32_t num_guest_regions = (uint32_t) nregions_snapshot;
     if (fork_ipc_send_process_state(ipc_sock, regions_snapshot,
-                                    num_guest_regions) < 0)
+                                    num_guest_regions) < 0) {
+        log_error("clone: failed to send process state");
         goto fail_snapshot;
+    }
 
     /* The process-state payload includes the SCM_RIGHTS handoff for region
      * backing fds. Keep siblings quiesced until that send completes so a
@@ -1115,11 +1234,23 @@ int64_t sys_clone(hv_vcpu_t vcpu,
      * so it waits for the helper to exit.
      */
     if (is_vfork) {
-        int status;
-        waitpid(child_host_pid, &status, 0);
-
-        /* Publish the vfork child status for later wait calls. */
-        proc_mark_child_exited(child_host_pid, status);
+        char byte;
+        ssize_t nr;
+        do {
+            nr = read(vfork_notify_fds[0], &byte, 1);
+        } while (nr < 0 && errno == EINTR);
+        close(vfork_notify_fds[0]);
+
+        if (nr <= 0) {
+            int status;
+            waitpid(child_host_pid, &status, 0);
+            proc_mark_child_exited(child_host_pid, status);
+        } else {
+            int status;
+            pid_t waited = waitpid(child_host_pid, &status, WNOHANG);
+            if (waited == child_host_pid)
+                proc_mark_child_exited(child_host_pid, status);
+        }
     }
 
     log_debug("clone: child pid=%lld (host=%d)", (long long) child_guest_pid,
@@ -1145,6 +1276,10 @@ int64_t sys_clone(hv_vcpu_t vcpu,
               abort_rc);
     thread_resume_siblings();
     close(ipc_sock);
+    if (vfork_notify_fds[0] >= 0)
+        close(vfork_notify_fds[0]);
+    if (vfork_notify_fds[1] >= 0)
+        close(vfork_notify_fds[1]);
     return -LINUX_ENOMEM;
 }
 
@@ -1275,7 +1410,8 @@ int64_t sys_clone3(hv_vcpu_t vcpu,
               (unsigned long long) ca.stack, (unsigned long long) ca.stack_size,
               (unsigned long long) ca.tls, (unsigned long long) cl_args_size);
 
-    int64_t ret = sys_clone(vcpu, g, flags, child_stack, ca.parent_tid, ca.tls,
+    int64_t ret = sys_clone(vcpu, g, flags, child_stack, ca.stack,
+                            ca.stack + ca.stack_size, ca.parent_tid, ca.tls,
                             ca.child_tid, verbose);
 
     /* If clone succeeded and CLONE_PIDFD was requested, create a pidfd
diff --git a/src/runtime/forkipc.h b/src/runtime/forkipc.h
index 46d9214..4eb1d7a 100644
--- a/src/runtime/forkipc.h
+++ b/src/runtime/forkipc.h
@@ -21,7 +21,12 @@
  * vCPU run loop. Called from main.c when --fork-child is specified.
  * Returns the process exit code.
  */
-int fork_child_main(int ipc_fd, bool verbose, int timeout_sec);
+int fork_child_main(int ipc_fd,
+                    int vfork_notify_fd,
+                    bool verbose,
+                    int timeout_sec);
+
+void fork_notify_vfork_exec(void);
 
 /* Clone syscall: spawn a new host elfuse process with IPC state transfer.
  * Returns child guest PID to parent, or negative Linux errno.
@@ -30,6 +35,8 @@ int64_t sys_clone(hv_vcpu_t vcpu,
                   guest_t *g,
                   uint64_t flags,
                   uint64_t child_stack,
+                  uint64_t stack_map_start,
+                  uint64_t stack_map_end,
                   uint64_t ptid_gva,
                   uint64_t tls,
                   uint64_t ctid_gva,
diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c
index f35b163..7219323 100644
--- a/src/runtime/procemu.c
+++ b/src/runtime/procemu.c
@@ -578,12 +578,37 @@ static int dev_shm_resolve_path(const char *guest_suffix,
     return 0;
 }
 
+/* Give synthetic procfs nodes stable identities so directory walkers do not
+ * collapse distinct paths into one inode and falsely report filesystem loops.
+ */
+#define PROC_SYNTH_DEV ((dev_t) 0x504f)
+
+static ino_t proc_synth_ino(const char *path)
+{
+    /* 64-bit FNV-1a with Linux-looking nonzero output. */
+    uint64_t h = 1469598103934665603ULL;
+    for (const unsigned char *p = (const unsigned char *) path; *p; ++p) {
+        h ^= (uint64_t) *p;
+        h *= 1099511628211ULL;
+    }
+    h &= 0x7fffffffffffffffULL;
+    if (h == 0)
+        h = 1;
+    return (ino_t) h;
+}
+
 /* Populate *st for a synthetic /proc directory entry. */
-static void stat_fill_proc_dir(struct stat *st, mode_t mode, nlink_t nlink)
+static void stat_fill_proc_dir(struct stat *st,
+                               mode_t mode,
+                               nlink_t nlink,
+                               const char *path)
 {
     memset(st, 0, sizeof(*st));
     st->st_mode = S_IFDIR | mode;
     st->st_nlink = nlink;
+    st->st_dev = PROC_SYNTH_DEV;
+    st->st_ino = proc_synth_ino(path);
+    st->st_blksize = 4096;
 }
 
 /* Resolve a /dev/fd/ or /proc/self/fd/ path to a fresh dup() of the
@@ -642,11 +667,13 @@ static int proc_alias_self(const char *path, char *alias, size_t alias_sz)
  * st_size = 0 for proc nodes; mirroring that forces readers to drain to EOF
  * instead of pre-sizing buffers from a stale value.
  */
-static void stat_fill_proc_file(struct stat *st, mode_t mode)
+static void stat_fill_proc_file(struct stat *st, mode_t mode, const char *path)
 {
     memset(st, 0, sizeof(*st));
     st->st_mode = S_IFREG | mode;
     st->st_nlink = 1;
+    st->st_dev = PROC_SYNTH_DEV;
+    st->st_ino = proc_synth_ino(path);
     st->st_size = 0;
     st->st_blksize = 4096;
     st->st_blocks = 0;
@@ -2000,7 +2027,8 @@ int proc_intercept_stat(const char *path, struct stat *st)
      */
     /* /dev/shm is a directory */
     if (!strcmp(path, "/dev/shm") || !strcmp(path, "/dev/shm/")) {
-        stat_fill_proc_dir(st, 01777, 2); /* sticky bit, like real /dev/shm */
+        stat_fill_proc_dir(st, 01777, 2,
+                           path); /* sticky bit, like real /dev/shm */
         return 0;
     }
     /* /dev/shm/ files: check the host temp dir */
@@ -2013,7 +2041,7 @@ int proc_intercept_stat(const char *path, struct stat *st)
 
     /* /proc and /proc/ are directories */
     if (!strcmp(path, "/proc") || !strcmp(path, "/proc/")) {
-        stat_fill_proc_dir(st, 0555, 3);
+        stat_fill_proc_dir(st, 0555, 3, path);
         return 0;
     }
     {
@@ -2024,12 +2052,12 @@ int proc_intercept_stat(const char *path, struct stat *st)
                  (long long) proc_get_pid());
         if (!strcmp(path, pidbuf) || !strcmp(path, pidslash) ||
             !strcmp(path, "/proc/self") || !strcmp(path, "/proc/self/")) {
-            stat_fill_proc_dir(st, 0555, 3);
+            stat_fill_proc_dir(st, 0555, 3, path);
             return 0;
         }
     }
     if (!strcmp(path, "/proc/net") || !strcmp(path, "/proc/net/")) {
-        stat_fill_proc_dir(st, 0555, 2);
+        stat_fill_proc_dir(st, 0555, 2, path);
         return 0;
     }
 
@@ -2045,7 +2073,7 @@ int proc_intercept_stat(const char *path, struct stat *st)
 
     /* /proc/self/task and /proc/self/task/ are directories */
     if (!strcmp(path, "/proc/self/task") || !strcmp(path, "/proc/self/task/")) {
-        stat_fill_proc_dir(st, 0555, 2 + (nlink_t) thread_active_count());
+        stat_fill_proc_dir(st, 0555, 2 + (nlink_t) thread_active_count(), path);
         return 0;
     }
     if (!strncmp(path, "/proc/self/task/", 16)) {
@@ -2057,11 +2085,11 @@ int proc_intercept_stat(const char *path, struct stat *st)
             return -1;
         }
         if (*endp == '\0' || !strcmp(endp, "/")) {
-            stat_fill_proc_dir(st, 0555, 2);
+            stat_fill_proc_dir(st, 0555, 2, path);
             return 0;
         }
         if (!strcmp(endp, "/stat") || !strcmp(endp, "/status")) {
-            stat_fill_proc_file(st, 0444);
+            stat_fill_proc_file(st, 0444, path);
             return 0;
         }
     }
@@ -2070,7 +2098,8 @@ int proc_intercept_stat(const char *path, struct stat *st)
     {
         int kind = proc_oom_path_kind(path);
         if (kind != OOM_PATH_NONE) {
-            stat_fill_proc_file(st, (kind == OOM_PATH_SCORE) ? 0444 : 0644);
+            stat_fill_proc_file(st, (kind == OOM_PATH_SCORE) ? 0444 : 0644,
+                                path);
             return 0;
         }
     }
@@ -2078,7 +2107,7 @@ int proc_intercept_stat(const char *path, struct stat *st)
     if (!strcmp(path, "/proc/self/fdinfo") ||
         !strcmp(path, "/proc/self/fdinfo/") ||
         !strcmp(path, "/proc/self/fd") || !strcmp(path, "/proc/self/fd/")) {
-        stat_fill_proc_dir(st, 0555, 2);
+        stat_fill_proc_dir(st, 0555, 2, path);
         return 0;
     }
 
@@ -2091,7 +2120,7 @@ int proc_intercept_stat(const char *path, struct stat *st)
             errno = ENOENT;
             return -1;
         }
-        stat_fill_proc_file(st, 0444);
+        stat_fill_proc_file(st, 0444, path);
         return 0;
     }
 
@@ -2127,7 +2156,7 @@ int proc_intercept_stat(const char *path, struct stat *st)
 
     for (const char **p = known_proc_files; *p; p++) {
         if (!strcmp(path, *p)) {
-            stat_fill_proc_file(st, 0444);
+            stat_fill_proc_file(st, 0444, path);
             return 0;
         }
     }
diff --git a/src/runtime/thread.c b/src/runtime/thread.c
index aedddce..36d6a2d 100644
--- a/src/runtime/thread.c
+++ b/src/runtime/thread.c
@@ -31,6 +31,12 @@
 #define LINUX_SS_DISABLE 2
 
 static void thread_ptrace_init(thread_entry_t *t);
+static int thread_add_deferred_unmap_locked(thread_entry_t *t,
+                                            uint64_t start,
+                                            uint64_t end);
+static int thread_can_add_deferred_unmap_locked(thread_entry_t *t,
+                                                uint64_t start,
+                                                uint64_t end);
 
 /* Top of the EL1 exception stack region (one 4KiB slot per thread) */
 #define SP_EL1_TOP (GUEST_IPA_BASE + SHIM_DATA_BASE + BLOCK_2MIB)
@@ -112,7 +118,9 @@ void thread_register_main(hv_vcpu_t vcpu,
     current_thread = t;
 }
 
-thread_entry_t *thread_alloc(int64_t tid)
+thread_entry_t *thread_alloc(int64_t tid,
+                             uint64_t stack_start,
+                             uint64_t stack_end)
 {
     thread_entry_t *result = NULL;
 
@@ -131,6 +139,10 @@ thread_entry_t *thread_alloc(int64_t tid)
     }
     memset(t, 0, sizeof(*t));
     t->guest_tid = tid;
+    if (stack_start < stack_end) {
+        t->stack_map_start = stack_start;
+        t->stack_map_end = stack_end;
+    }
     t->active = 1;
     t->altstack_flags = LINUX_SS_DISABLE;
     thread_ptrace_init(t);
@@ -400,6 +412,7 @@ static int fork_quiesced_count = 0; /* Siblings blocked on barrier */
 static int fork_target_count = 0;   /* Number of siblings to quiesce */
 static pthread_cond_t fork_cond = PTHREAD_COND_INITIALIZER;
 static pthread_cond_t fork_all_quiesced_cond = PTHREAD_COND_INITIALIZER;
+static pthread_cond_t deferred_stack_unmap_cond = PTHREAD_COND_INITIALIZER;
 
 void thread_quiesce_siblings(void)
 {
@@ -487,6 +500,275 @@ pthread_mutex_t *thread_get_lock(void)
     return &thread_lock;
 }
 
+int thread_collect_and_defer_stack_ranges(
+    uint64_t start,
+    uint64_t end,
+    thread_deferred_stack_unmap_txn_t *txns,
+    int max_ranges)
+{
+    int nranges = 0;
+
+    if (start >= end || !txns || max_ranges <= 0)
+        return 0;
+
+    pthread_mutex_lock(&thread_lock);
+retry:
+    nranges = 0;
+
+    /* Pass 1: enumerate every thread whose live stack overlaps [start, end)
+     * and verify each one can record a new deferred-unmap entry. If the
+     * caller-provided buffer is too small or any thread is at its
+     * deferred-unmap cap, refuse the whole operation so pass 2 never has
+     * to handle a partial commit.
+     */
+    THREAD_FOR_EACH_ACTIVE (t) {
+        uint64_t rs = t->stack_map_start;
+        uint64_t re = t->stack_map_end;
+
+        if (rs >= re || re <= start || rs >= end)
+            continue;
+        if (t->deferred_stack_unmap_busy > 0) {
+            pthread_cond_wait(&deferred_stack_unmap_cond, &thread_lock);
+            goto retry;
+        }
+        if (nranges >= max_ranges) {
+            pthread_mutex_unlock(&thread_lock);
+            return -1;
+        }
+        uint64_t ds = (rs > start) ? rs : start;
+        uint64_t de = (re < end) ? re : end;
+        if (thread_can_add_deferred_unmap_locked(t, ds, de) < 0) {
+            pthread_mutex_unlock(&thread_lock);
+            return -1;
+        }
+
+        txns[nranges].thread = t;
+        txns[nranges].guest_tid = t->guest_tid;
+        txns[nranges].start = ds;
+        txns[nranges].end = de;
+        txns[nranges].deferred_count = t->deferred_stack_unmap_count;
+        for (int j = 0; j < t->deferred_stack_unmap_count; j++) {
+            txns[nranges].deferred_starts[j] =
+                t->deferred_stack_unmap_starts[j];
+            txns[nranges].deferred_ends[j] = t->deferred_stack_unmap_ends[j];
+        }
+        nranges++;
+    }
+
+    /* Pass 2: commit. Both passes iterate the table in the same order
+     * under the same lock, so the active set seen here matches pass 1.
+     */
+    for (int i = 0; i < nranges; i++) {
+        (void) thread_add_deferred_unmap_locked(txns[i].thread, txns[i].start,
+                                                txns[i].end);
+        txns[i].thread->deferred_stack_unmap_busy++;
+    }
+    pthread_mutex_unlock(&thread_lock);
+
+    return nranges;
+}
+
+void thread_finish_deferred_stack_ranges(
+    const thread_deferred_stack_unmap_txn_t *txns,
+    int nranges)
+{
+    bool wake = false;
+
+    if (!txns || nranges <= 0)
+        return;
+
+    pthread_mutex_lock(&thread_lock);
+    for (int i = 0; i < nranges; i++) {
+        thread_entry_t *t = txns[i].thread;
+
+        if (!t || !t->active || t->guest_tid != txns[i].guest_tid ||
+            t->deferred_stack_unmap_busy <= 0)
+            continue;
+        t->deferred_stack_unmap_busy--;
+        wake = true;
+    }
+    if (wake)
+        pthread_cond_broadcast(&deferred_stack_unmap_cond);
+    pthread_mutex_unlock(&thread_lock);
+}
+
+void thread_rollback_deferred_stack_ranges(
+    const thread_deferred_stack_unmap_txn_t *txns,
+    int nranges)
+{
+    bool wake = false;
+
+    if (!txns || nranges <= 0)
+        return;
+
+    pthread_mutex_lock(&thread_lock);
+    for (int i = 0; i < nranges; i++) {
+        thread_entry_t *t = txns[i].thread;
+
+        if (!t || !t->active || t->guest_tid != txns[i].guest_tid)
+            continue;
+        t->deferred_stack_unmap_count = txns[i].deferred_count;
+        for (int j = 0; j < txns[i].deferred_count; j++) {
+            t->deferred_stack_unmap_starts[j] = txns[i].deferred_starts[j];
+            t->deferred_stack_unmap_ends[j] = txns[i].deferred_ends[j];
+        }
+        if (t->deferred_stack_unmap_busy > 0) {
+            t->deferred_stack_unmap_busy--;
+            wake = true;
+        }
+    }
+    if (wake)
+        pthread_cond_broadcast(&deferred_stack_unmap_cond);
+    pthread_mutex_unlock(&thread_lock);
+}
+
+int thread_prepare_deferred_stack_unmaps_for_cleanup(thread_entry_t *t,
+                                                     uint64_t *starts,
+                                                     uint64_t *ends,
+                                                     int max_ranges)
+{
+    int nranges = 0;
+
+    if (!t || !starts || !ends || max_ranges <= 0)
+        return 0;
+
+    pthread_mutex_lock(&thread_lock);
+    while (t->deferred_stack_unmap_busy > 0)
+        pthread_cond_wait(&deferred_stack_unmap_cond, &thread_lock);
+    t->stack_map_start = 0;
+    t->stack_map_end = 0;
+    nranges = t->deferred_stack_unmap_count;
+    if (nranges > max_ranges)
+        nranges = max_ranges;
+    for (int i = 0; i < nranges; i++) {
+        starts[i] = t->deferred_stack_unmap_starts[i];
+        ends[i] = t->deferred_stack_unmap_ends[i];
+    }
+    pthread_mutex_unlock(&thread_lock);
+
+    return nranges;
+}
+
+int thread_peek_deferred_stack_unmaps(thread_entry_t *t,
+                                      uint64_t *starts,
+                                      uint64_t *ends,
+                                      int max_ranges)
+{
+    int nranges = 0;
+
+    if (!t || !starts || !ends || max_ranges <= 0)
+        return 0;
+
+    pthread_mutex_lock(&thread_lock);
+    nranges = t->deferred_stack_unmap_count;
+    if (nranges > max_ranges)
+        nranges = max_ranges;
+    for (int i = 0; i < nranges; i++) {
+        starts[i] = t->deferred_stack_unmap_starts[i];
+        ends[i] = t->deferred_stack_unmap_ends[i];
+    }
+    pthread_mutex_unlock(&thread_lock);
+
+    return nranges;
+}
+
+int thread_drop_deferred_stack_unmap(thread_entry_t *t,
+                                     uint64_t start,
+                                     uint64_t end)
+{
+    int removed = 0;
+
+    if (!t || start >= end)
+        return 0;
+
+    pthread_mutex_lock(&thread_lock);
+    int n = t->deferred_stack_unmap_count;
+    for (int i = 0; i < n; i++) {
+        if (t->deferred_stack_unmap_starts[i] != start ||
+            t->deferred_stack_unmap_ends[i] != end)
+            continue;
+        n--;
+        t->deferred_stack_unmap_starts[i] = t->deferred_stack_unmap_starts[n];
+        t->deferred_stack_unmap_ends[i] = t->deferred_stack_unmap_ends[n];
+        t->deferred_stack_unmap_count = n;
+        removed = 1;
+        break;
+    }
+    pthread_mutex_unlock(&thread_lock);
+
+    return removed;
+}
+
+void thread_clear_stack_map(thread_entry_t *t)
+{
+    if (!t)
+        return;
+
+    pthread_mutex_lock(&thread_lock);
+    t->stack_map_start = 0;
+    t->stack_map_end = 0;
+    pthread_mutex_unlock(&thread_lock);
+}
+
+static int thread_add_deferred_unmap_locked(thread_entry_t *t,
+                                            uint64_t start,
+                                            uint64_t end)
+{
+    if (!t || start >= end)
+        return 0;
+
+    /* Absorb every existing slot that overlaps or is adjacent to [start,
+     * end), expanding the candidate as needed. Compact the array in place
+     * by pulling the live tail into each absorbed slot.
+     */
+    int n = t->deferred_stack_unmap_count;
+    int i = 0;
+    while (i < n) {
+        uint64_t rs = t->deferred_stack_unmap_starts[i];
+        uint64_t re = t->deferred_stack_unmap_ends[i];
+
+        if (end < rs || start > re) {
+            i++;
+            continue;
+        }
+        if (rs < start)
+            start = rs;
+        if (re > end)
+            end = re;
+        n--;
+        t->deferred_stack_unmap_starts[i] = t->deferred_stack_unmap_starts[n];
+        t->deferred_stack_unmap_ends[i] = t->deferred_stack_unmap_ends[n];
+    }
+
+    if (n >= MAX_DEFERRED_STACK_UNMAPS) {
+        t->deferred_stack_unmap_count = n;
+        return -1;
+    }
+
+    t->deferred_stack_unmap_starts[n] = start;
+    t->deferred_stack_unmap_ends[n] = end;
+    t->deferred_stack_unmap_count = n + 1;
+    return 0;
+}
+
+static int thread_can_add_deferred_unmap_locked(thread_entry_t *t,
+                                                uint64_t start,
+                                                uint64_t end)
+{
+    if (!t || start >= end)
+        return 0;
+
+    for (int i = 0; i < t->deferred_stack_unmap_count; i++) {
+        uint64_t rs = t->deferred_stack_unmap_starts[i];
+        uint64_t re = t->deferred_stack_unmap_ends[i];
+
+        if (end < rs || start > re)
+            continue;
+        return 0;
+    }
+
+    return (t->deferred_stack_unmap_count < MAX_DEFERRED_STACK_UNMAPS) ? 0 : -1;
+}
+
 static void thread_ptrace_init(thread_entry_t *t)
 {
     t->ptraced = false;
diff --git a/src/runtime/thread.h b/src/runtime/thread.h
index 4304eaa..25ab5ff 100644
--- a/src/runtime/thread.h
+++ b/src/runtime/thread.h
@@ -24,6 +24,7 @@
 
 /* Maximum number of concurrent guest threads in one VM. */
 #define MAX_THREADS 64
+#define MAX_DEFERRED_STACK_UNMAPS 8
 
 /* Per-thread state. One entry per guest thread (main + workers). */
 typedef struct {
@@ -112,8 +113,30 @@ typedef struct {
     int exit_signal;    /* Signal on exit (usually SIGCHLD) */
     bool vm_exited;     /* Child has exited */
     int vm_exit_status; /* Wait-format exit status */
+
+    /* Guest stack range supplied by clone3(stack, stack_size).
+     * elfuse uses this to avoid tearing down a still-active child stack when
+     * another thread munmaps the backing range before the child is done with
+     * its bootstrap stack.
+     */
+    uint64_t stack_map_start;
+    uint64_t stack_map_end;
+    uint64_t deferred_stack_unmap_starts[MAX_DEFERRED_STACK_UNMAPS];
+    uint64_t deferred_stack_unmap_ends[MAX_DEFERRED_STACK_UNMAPS];
+    int deferred_stack_unmap_count;
+    int deferred_stack_unmap_busy;
 } thread_entry_t;
 
+typedef struct {
+    thread_entry_t *thread;
+    int64_t guest_tid;
+    uint64_t start;
+    uint64_t end;
+    uint64_t deferred_starts[MAX_DEFERRED_STACK_UNMAPS];
+    uint64_t deferred_ends[MAX_DEFERRED_STACK_UNMAPS];
+    int deferred_count;
+} thread_deferred_stack_unmap_txn_t;
+
 /* Current thread pointer, set once per host pthread at thread start.
  * All syscall handlers can access per-thread state through this.
  */
@@ -134,7 +157,9 @@ void thread_register_main(hv_vcpu_t vcpu,
  * Returns a pointer to the entry, or NULL if the table is full.
  * The caller must fill in vcpu, vexit, host_thread, sp_el1.
  */
-thread_entry_t *thread_alloc(int64_t tid);
+thread_entry_t *thread_alloc(int64_t tid,
+                             uint64_t stack_start,
+                             uint64_t stack_end);
 
 /* Mark a thread as inactive and release its table slot. */
 void thread_deactivate(thread_entry_t *t);
@@ -242,3 +267,61 @@ int64_t thread_ptrace_wait(int64_t tracer_tid,
 
 /* Get the thread table mutex (needed for ptrace wait blocking). */
 pthread_mutex_t *thread_get_lock(void);
+
+/* Snapshot every active guest stack range overlapping [start, end), then
+ * record a deferred-unmap entry on each one. While the transaction is live,
+ * cleanup of the affected thread's deferred stack entries will block so a
+ * later rollback cannot race with thread exit.
+ * On success, txns[0..nranges) contains both the overlapping ranges and the
+ * pre-update deferred-unmap state needed for rollback.
+ * Returns the number of overlapping stack ranges, or -1 if the caller's
+ * buffer is too small or any thread's deferred-unmap budget is exhausted.
+ */
+int thread_collect_and_defer_stack_ranges(
+    uint64_t start,
+    uint64_t end,
+    thread_deferred_stack_unmap_txn_t *txns,
+    int max_ranges);
+
+/* Release the in-flight marker set by thread_collect_and_defer_stack_ranges()
+ * after the caller has successfully completed the non-deferred munmap work.
+ */
+void thread_finish_deferred_stack_ranges(
+    const thread_deferred_stack_unmap_txn_t *txns,
+    int nranges);
+
+/* Restore the deferred-unmap state previously captured by
+ * thread_collect_and_defer_stack_ranges(), then release the in-flight marker.
+ */
+void thread_rollback_deferred_stack_ranges(
+    const thread_deferred_stack_unmap_txn_t *txns,
+    int nranges);
+
+/* For thread exit cleanup: wait for any in-flight deferred-stack munmap
+ * transaction affecting this thread to finish, then clear the live stack map
+ * and snapshot the current deferred unmaps. Returns the number of entries
+ * copied (capped at max_ranges).
+ */
+int thread_prepare_deferred_stack_unmaps_for_cleanup(thread_entry_t *t,
+                                                     uint64_t *starts,
+                                                     uint64_t *ends,
+                                                     int max_ranges);
+
+/* Snapshot the deferred unmap entries without modifying the thread record.
+ * Returns the number of entries copied (capped at max_ranges).
+ */
+int thread_peek_deferred_stack_unmaps(thread_entry_t *t,
+                                      uint64_t *starts,
+                                      uint64_t *ends,
+                                      int max_ranges);
+
+/* Drop a single completed deferred unmap entry by exact [start, end) match.
+ * Returns 1 if removed, 0 if no matching entry was found.
+ */
+int thread_drop_deferred_stack_unmap(thread_entry_t *t,
+                                     uint64_t start,
+                                     uint64_t end);
+
+/* Forget the thread's stack range so future munmap calls do not enqueue new
+ * deferred entries against this slot. Safe to call once the thread is dead.
+ */
+void thread_clear_stack_map(thread_entry_t *t);
diff --git a/src/syscall/abi.h b/src/syscall/abi.h
index c6bfb10..12e3a25 100644
--- a/src/syscall/abi.h
+++ b/src/syscall/abi.h
@@ -188,6 +188,7 @@
 #define SYS_pwritev2 287
 /* misc */
 #define SYS_sethostname 161
+#define SYS_getcpu 168
 #define SYS_memfd_create 279
 #define SYS_membarrier 283
 #define SYS_mlock 228
diff --git a/src/syscall/dispatch.tbl b/src/syscall/dispatch.tbl
index 2925ca1..7ccb457 100644
--- a/src/syscall/dispatch.tbl
+++ b/src/syscall/dispatch.tbl
@@ -175,6 +175,7 @@
 # System info
 SYS_uname        sc_uname        0
 SYS_sethostname  sc_sethostname  0
+SYS_getcpu       sc_getcpu       0
 SYS_sysinfo      sc_sysinfo      1
 SYS_getrlimit    sc_getrlimit    1
 SYS_setrlimit    sc_setrlimit    1
diff --git a/src/syscall/exec.c b/src/syscall/exec.c
index 3ec2870..b52366c 100644
--- a/src/syscall/exec.c
+++ b/src/syscall/exec.c
@@ -26,11 +26,13 @@
 #include "core/stack.h"
 #include "core/vdso.h"
 
+#include "runtime/forkipc.h"
 #include "runtime/futex.h"
 
 #include "syscall/abi.h"
 #include "syscall/exec.h"
 #include "syscall/internal.h"
+#include "syscall/path.h"
 #include "syscall/proc.h"
 #include "syscall/signal.h"
 
@@ -114,6 +116,9 @@ int64_t sys_execve(hv_vcpu_t vcpu,
 
     log_debug("execve(\"%s\")", path);
 
+    char path_host_buf[LINUX_PATH_MAX];
+    const char *path_host = path;
+
 #define MAX_ARGS 256
 #define MAX_ENVS 4096
 #define STR_BUF_SIZE ((size_t) 256 * 1024)
@@ -161,16 +166,24 @@ int64_t sys_execve(hv_vcpu_t vcpu,
         log_debug("execve resolved to \"%s\"", path);
     }
 
+    if (!host_path && path[0] == '/')
+        path_host = path_resolve_sysroot_path(path, path_host_buf,
+                                              sizeof(path_host_buf));
+    if (!path_host) {
+        err = -LINUX_ENAMETOOLONG;
+        goto fail;
+    }
+
     /* Try loading as ELF; if that fails, emulate Linux binfmt_script for
      * shebang files.
      * Linux kernel handles shebangs transparently in binfmt_script.
      */
     elf_info_t elf_info;
-    if (elf_load(path, &elf_info) < 0) {
+    if (elf_load(path_host, &elf_info) < 0) {
         /* Not a valid ELF. Check if it's a script with a shebang line.
          * Read the first 256 bytes and look for "#!" at the start.
          */
-        int script_fd = open(path, O_RDONLY);
+        int script_fd = open(path_host, O_RDONLY);
         if (script_fd < 0) {
             err = -LINUX_ENOENT;
             goto fail;
@@ -279,8 +292,16 @@ int64_t sys_execve(hv_vcpu_t vcpu,
 
         /* Continue the same exec transaction using the interpreter image. */
         str_copy_trunc(path, interp_start, sizeof(path));
+        path_host = path;
+        if (path[0] == '/')
+            path_host = path_resolve_sysroot_path(path, path_host_buf,
+                                                  sizeof(path_host_buf));
+        if (!path_host) {
+            err = -LINUX_ENAMETOOLONG;
+            goto fail;
+        }
 
-        if (elf_load(path, &elf_info) < 0) {
+        if (elf_load(path_host, &elf_info) < 0) {
             err = -LINUX_ENOENT;
             goto fail;
         }
@@ -446,6 +467,7 @@ int64_t sys_execve(hv_vcpu_t vcpu,
     /* Past this point the old image is gone; later failures are fatal like a
      * kernel exec failure after its point of no return.
      */
+    fork_notify_vfork_exec();
     guest_reset(g);
 
     /* The replacement image must not inherit process-wide shutdown requests
@@ -470,12 +492,12 @@ int64_t sys_execve(hv_vcpu_t vcpu,
     }
 
     /* Load the executable image that was validated before guest_reset().
      */
-    if (elf_map_segments(&elf_info, path, g->host_base, g->guest_size,
+    if (elf_map_segments(&elf_info, path_host, g->host_base, g->guest_size,
                          elf_load_base) < 0) {
         log_fatal(
             "execve failed after point of no return: "
             "failed to map ELF segments for %s",
-            path);
+            path_host);
         exit(128);
     }
 
@@ -656,7 +678,8 @@ int64_t sys_execve(hv_vcpu_t vcpu,
                           elf_info.segments[i].gpa + elf_info.segments[i].memsz +
                               elf_load_base,
                           elf_pf_to_prot(elf_info.segments[i].flags),
-                          LINUX_MAP_PRIVATE, elf_info.segments[i].offset, path);
+                          LINUX_MAP_PRIVATE, elf_info.segments[i].offset,
+                          path_host);
     }
     /* interp_resolved was computed before guest_reset so no filesystem lookup
      * is needed after the point of no return.
@@ -704,7 +727,9 @@ int64_t sys_execve(hv_vcpu_t vcpu,
     entry_point = (interp_base != 0) ? (interp_info.entry + interp_base)
                                      : (elf_info.entry + elf_load_base);
 
-    /* Publish the new identity only after stack construction succeeds. */
+    /* Publish the guest-visible path so /proc/self/exe remains stable
+     * across sysroot translation and can be re-exec'd by the guest.
+     */
     proc_set_elf_path(path);
     proc_set_cmdline(argc, argv_const);
     proc_set_environ(envp_const);
@@ -759,7 +784,7 @@ int64_t sys_execve(hv_vcpu_t vcpu,
         (void) _sync;
     }
 
-    log_debug("execve: loaded %s, entry=0x%llx sp=0x%llx", path,
+    log_debug("execve: loaded %s, entry=0x%llx sp=0x%llx", path_host,
               (unsigned long long) entry_ipa, (unsigned long long) sp_ipa);
 
     free(argv_buf);
diff --git a/src/syscall/mem.c b/src/syscall/mem.c
index fc57998..4787745 100644
--- a/src/syscall/mem.c
+++ b/src/syscall/mem.c
@@ -21,6 +21,7 @@
 #include "debug/log.h"
 #include "utils.h"
 
+#include "runtime/thread.h"
 #include "syscall/abi.h"
 #include "syscall/internal.h"
 #include "syscall/mem.h"
@@ -1020,6 +1021,7 @@ int64_t sys_brk(guest_t *g, uint64_t addr)
     /* brk addresses as seen by the guest are IPA-based */
     uint64_t ipa_brk = guest_ipa(g, g->brk_current);
     uint64_t ipa_base = guest_ipa(g, g->brk_base);
+    uint64_t old_brk = g->brk_current;
 
     if (addr == 0) {
         return (int64_t) ipa_brk;
@@ -1035,16 +1037,18 @@ int64_t sys_brk(guest_t *g, uint64_t addr)
         return (int64_t) ipa_brk;
     }
 
-    /* Extend page tables if brk grows beyond currently-mapped region.
-     * The brk region is initially mapped up to MMAP_RX_BASE; if it grows
-     * past that, the mmap allocator needs to extend dynamically.
+    /* Materialize any newly exposed heap pages. This must handle both:
+     * 1. growth into brand-new 2 MiB blocks, and
+     * 2. growth within an already-split block where finalize_block_perms()
+     *    intentionally left non-covered pages invalid until brk exposes them.
      */
-    uint64_t brk_pt_end = ALIGN_UP(g->brk_current, BLOCK_2MIB);
-    if (brk_pt_end < MMAP_RX_BASE)
-        brk_pt_end = MMAP_RX_BASE;
-    if (new_off > brk_pt_end) {
-        uint64_t new_end = ALIGN_UP(new_off, BLOCK_2MIB);
-        if (guest_extend_page_tables(g, brk_pt_end, new_end, MEM_PERM_RW) < 0)
+    if (new_off > old_brk) {
+        uint64_t grow_start = ALIGN_DOWN(old_brk, GUEST_PAGE_SIZE);
+        uint64_t grow_end = PAGE_ALIGN_UP(new_off);
+
+        if (guest_extend_page_tables(g, grow_start, grow_end, MEM_PERM_RW) < 0)
+            return (int64_t) ipa_brk;
+        if (guest_update_perms(g, grow_start, grow_end, MEM_PERM_RW) < 0)
             return (int64_t) ipa_brk;
     }
 
@@ -1054,7 +1058,6 @@ int64_t sys_brk(guest_t *g, uint64_t addr)
                new_off - g->brk_current);
     }
 
-    uint64_t old_brk = g->brk_current;
     g->brk_current = new_off;
 
     /* Update "[heap]" region tracking atomically.
@@ -1443,10 +1446,25 @@ int64_t sys_mmap(guest_t *g,
             result_off = UINT64_MAX;
             if (addr != 0) {
                 uint64_t hint_off = addr - g->ipa_base;
-                if (hint_off >= MMAP_BASE && hint_off <= g->mmap_limit &&
-                    length <= g->mmap_limit - hint_off)
+                if (hint_off >= ELF_DEFAULT_BASE && hint_off <= g->mmap_limit &&
+                    length <= g->mmap_limit - hint_off) {
+                    /* Real Linux treats non-fixed mmap(addr!=0) as a strong
+                     * hint, including low canonical addresses such as the
+                     * traditional x86-64 ET_EXEC base at 0x400000. box64 uses
+                     * this pattern when reserving address space for static
+                     * ET_EXEC binaries; forcing every hint below MMAP_BASE
+                     * into the high RW arena breaks that expectation and the
+                     * guest later still dereferences the low address.
+                     *
+                     * Probe the hinted range first. Keep low-hint searches
+                     * below MMAP_BASE so an unresolved low hint does not
+                     * silently spill into the high arena on this fast path.
+                     */
+                    uint64_t hint_max =
+                        (hint_off < MMAP_BASE) ? MMAP_BASE : g->mmap_limit;
                     result_off =
-                        find_free_gap(g, length, hint_off, g->mmap_limit);
+                        find_free_gap_inner(g, length, hint_off, hint_max);
+                }
             }
             if (result_off == UINT64_MAX)
                 result_off = find_free_gap(g, length, MMAP_BASE, g->mmap_limit);
@@ -2381,6 +2399,90 @@ int64_t sys_mmap_anon(guest_t *g, uint64_t addr, uint64_t length, int prot)
                     LINUX_MAP_PRIVATE | LINUX_MAP_ANONYMOUS, -1, 0);
 }
 
+static int compare_range_pair(const void *a, const void *b)
+{
+    const uint64_t *ra = a;
+    const uint64_t *rb = b;
+
+    if (ra[0] < rb[0])
+        return -1;
+    if (ra[0] > rb[0])
+        return 1;
+    return 0;
+}
+
+static int munmap_guest_range(guest_t *g, uint64_t unmap_off, uint64_t end)
+{
+    /* Reject munmap targeting VM infrastructure regions. */
+    if (unmap_off < ELF_DEFAULT_BASE && end > PT_POOL_BASE)
+        return -LINUX_EINVAL;
+
+    /* Restore slab backing under any active MAP_SHARED file overlay before
+     * zeroing the host VA. Without this, the memset below would write zeros
+     * directly into the file.
+     */
+    int cleanup_err = cleanup_overlays_in_range(g, unmap_off, end);
+    if (cleanup_err < 0)
+        return cleanup_err;
+
+    /* Invalidate PTEs first. This may need to split a 2MiB block which can
+     * fail if the page table pool is exhausted. Failing before region removal
+     * keeps metadata consistent.
+     */
+    if (guest_invalidate_ptes(g, unmap_off, end) < 0)
+        return -LINUX_ENOMEM;
+    g->need_tlbi = true;
+
+    for (int i = 0; i < g->nregions; i++) {
+        guest_region_t *r = &g->regions[i];
+        if (r->start >= end)
+            break;
+        if (r->end <= unmap_off)
+            continue;
+        if (r->prot == LINUX_PROT_NONE)
+            continue;
+        uint64_t zstart = (r->start > unmap_off) ? r->start : unmap_off;
+        uint64_t zend = (r->end < end) ? r->end : end;
+        memset((uint8_t *) g->host_base + zstart, 0, zend - zstart);
+    }
+    guest_region_remove(g, unmap_off, end);
+    if (unmap_off < g->mmap_rw_gap_hint)
+        g->mmap_rw_gap_hint = unmap_off;
+    if (unmap_off < g->mmap_rx_gap_hint)
+        g->mmap_rx_gap_hint = unmap_off;
+
+    return 0;
+}
+
+void mem_cleanup_deferred_stack_unmaps(guest_t *g, thread_entry_t *t)
+{
+    uint64_t starts[MAX_DEFERRED_STACK_UNMAPS];
+    uint64_t ends[MAX_DEFERRED_STACK_UNMAPS];
+    int nranges;
+
+    if (!g || !t)
+        return;
+
+    nranges = thread_prepare_deferred_stack_unmaps_for_cleanup(
+        t, starts, ends, (int) ARRAY_SIZE(starts));
+    if (nranges <= 0)
+        return;
+
+    pthread_mutex_lock(&mmap_lock);
+    for (int i = 0; i < nranges; i++) {
+        int rc = munmap_guest_range(g, starts[i], ends[i]);
+        if (rc < 0) {
+            log_error(
+                "deferred stack munmap for tid=%lld leaked: "
+                "[0x%llx-0x%llx) rc=%d (region tracking inconsistent)",
+                (long long) t->guest_tid, (unsigned long long) starts[i],
+                (unsigned long long) ends[i], rc);
+            continue;
+        }
+        thread_drop_deferred_stack_unmap(t, starts[i], ends[i]);
+    }
+    pthread_mutex_unlock(&mmap_lock);
+}
+
 /* sys_munmap. */
 
 int64_t sys_munmap(guest_t *g, uint64_t addr, uint64_t length)
@@ -2397,45 +2499,45 @@ int64_t sys_munmap(guest_t *g, uint64_t addr, uint64_t length)
     uint64_t unmap_off = addr - g->ipa_base;
     if (unmap_off <= g->guest_size && length <= g->guest_size - unmap_off) {
         uint64_t end = unmap_off + length;
-
-        /* Reject munmap targeting VM infrastructure regions. */
-        if (unmap_off < ELF_DEFAULT_BASE && end > PT_POOL_BASE)
-            return -LINUX_EINVAL;
-
-        /* Restore slab backing under any active MAP_SHARED file overlay
-         * before zeroing the host VA. Without this, the memset below
-         * would write zeros directly into the file. The cleanup walker
-         * reads live region metadata so it must run before
-         * guest_region_remove.
-         */
-        int cleanup_err = cleanup_overlays_in_range(g, unmap_off, end);
-        if (cleanup_err < 0)
-            return cleanup_err;
-
-        /* Invalidate PTEs first. This may need to split a 2MiB block
-         * which can fail if the page table pool is exhausted. Failing
-         * before region removal keeps metadata consistent.
-         */
-        if (guest_invalidate_ptes(g, unmap_off, end) < 0)
+        thread_deferred_stack_unmap_txn_t txns[MAX_THREADS];
+        uint64_t ranges[MAX_THREADS][2];
+        int nranges = thread_collect_and_defer_stack_ranges(
+            unmap_off, end, txns, (int) ARRAY_SIZE(txns));
+        if (nranges < 0)
             return -LINUX_ENOMEM;
-        g->need_tlbi = true;
-        for (int i = 0; i < g->nregions; i++) {
-            guest_region_t *r = &g->regions[i];
-            if (r->start >= end)
-                break;
-            if (r->end <= unmap_off)
-                continue;
-            if (r->prot == LINUX_PROT_NONE)
-                continue;
-            uint64_t zstart = (r->start > unmap_off) ? r->start : unmap_off;
-            uint64_t zend = (r->end < end) ? r->end : end;
-            memset((uint8_t *) g->host_base + zstart, 0, zend - zstart);
+
+        for (int i = 0; i < nranges; i++) {
+            ranges[i][0] = txns[i].start;
+            ranges[i][1] = txns[i].end;
+        }
+        if (nranges > 1)
+            qsort(ranges, (size_t) nranges, sizeof(ranges[0]),
+                  compare_range_pair);
+
+        uint64_t cursor = unmap_off;
+        for (int i = 0; i < nranges && cursor < end; i++) {
+            uint64_t keep_start = ranges[i][0];
+            uint64_t keep_end = ranges[i][1];
+
+            if (keep_start > cursor) {
+                int rc = munmap_guest_range(
+                    g, cursor, keep_start < end ? keep_start : end);
+                if (rc < 0) {
+                    thread_rollback_deferred_stack_ranges(txns, nranges);
+                    return rc;
+                }
+            }
+            if (keep_end > cursor)
+                cursor = keep_end;
+        }
+        if (cursor < end) {
+            int rc = munmap_guest_range(g, cursor, end);
+            if (rc < 0) {
+                thread_rollback_deferred_stack_ranges(txns, nranges);
+                return rc;
+            }
         }
-        guest_region_remove(g, unmap_off, end);
-        if (unmap_off < g->mmap_rw_gap_hint)
-            g->mmap_rw_gap_hint = unmap_off;
-        if (unmap_off < g->mmap_rx_gap_hint)
-            g->mmap_rx_gap_hint = unmap_off;
+        thread_finish_deferred_stack_ranges(txns, nranges);
     }
     return 0;
 }
diff --git a/src/syscall/mem.h b/src/syscall/mem.h
index fed3730..f5d4877 100644
--- a/src/syscall/mem.h
+++ b/src/syscall/mem.h
@@ -47,6 +47,9 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice);
 /* msync: synchronize file-backed mappings to disk */
 int64_t sys_msync(guest_t *g, uint64_t addr, uint64_t length, int flags);
 
+/* Apply deferred guest-stack munmap work for an exiting thread. */
+void mem_cleanup_deferred_stack_unmaps(guest_t *g, thread_entry_t *t);
+
 /* Fork preparation: convert MAP_SHARED|MAP_ANONYMOUS regions that have
  * no backing fd into memfd-backed overlay regions. Each converted region
  * gets a private mkstemp+unlink temp file seeded from the current host
diff --git a/src/syscall/sys.c b/src/syscall/sys.c
index 80a007f..fef7595 100644
--- a/src/syscall/sys.c
+++ b/src/syscall/sys.c
@@ -222,6 +222,25 @@ int64_t sys_getcwd(guest_t *g, uint64_t buf_gva, uint64_t size)
     return (int64_t) write_len;
 }
 
+int64_t sys_getcpu(guest_t *g,
+                   uint64_t cpu_gva,
+                   uint64_t node_gva,
+                   uint64_t cache_gva)
+{
+    (void) cache_gva;
+
+    /* elfuse models one online CPU and one NUMA node. glibc and tools such as
+     * file(1) only need a successful query here; the kernel cache pointer is
+     * obsolete and may be ignored.
+ */ + uint32_t zero = 0; + if (cpu_gva && guest_write_small(g, cpu_gva, &zero, sizeof(zero)) < 0) + return -LINUX_EFAULT; + if (node_gva && guest_write_small(g, node_gva, &zero, sizeof(zero)) < 0) + return -LINUX_EFAULT; + return 0; +} + int64_t sys_sched_getaffinity(guest_t *g, int pid, uint64_t size, diff --git a/src/syscall/sys.h b/src/syscall/sys.h index 7c830ce..8025d2b 100644 --- a/src/syscall/sys.h +++ b/src/syscall/sys.h @@ -22,6 +22,10 @@ int64_t sys_getrandom(guest_t *g, uint64_t buflen, unsigned int flags); int64_t sys_getcwd(guest_t *g, uint64_t buf_gva, uint64_t size); +int64_t sys_getcpu(guest_t *g, + uint64_t cpu_gva, + uint64_t node_gva, + uint64_t cache_gva); int64_t sys_sched_getaffinity(guest_t *g, int pid, uint64_t size, diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index c3abb46..bb8ef14 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -302,6 +302,7 @@ SC_FORWARD(sc_rt_sigpending, signal_rt_sigpending(g, x0, x1)) /* System info */ SC_FORWARD(sc_uname, sys_uname(g, x0)) SC_FORWARD(sc_getrandom, sys_getrandom(g, x0, x1, (unsigned int) x2)) +SC_FORWARD(sc_getcpu, sys_getcpu(g, x0, x1, x2)) SC_FORWARD(sc_sysinfo, sys_sysinfo(g, x0)) SC_FORWARD(sc_prlimit64, sys_prlimit64(g, (int) x0, (int) x1, x2, x3)) SC_FORWARD(sc_getrlimit, sys_prlimit64(g, 0, (int) x0, 0, x1)) @@ -1518,7 +1519,8 @@ static int64_t sc_clone(guest_t *g, (unsigned long long) x0, (unsigned long long) x1, (unsigned long long) x2, (unsigned long long) x3, (unsigned long long) x4); - return sys_clone(current_thread->vcpu, g, x0, x1, x2, x3, x4, verbose); + return sys_clone(current_thread->vcpu, g, x0, x1, 0, 0, x2, x3, x4, + verbose); } static int64_t sc_clone3(guest_t *g, diff --git a/tests/manifest.txt b/tests/manifest.txt index 57ca6f2..71eb420 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -91,6 +91,7 @@ test-opath [section] Guard page / mmap edge cases test-guard-page +test-mmap-hint [section] mremap tests test-mremap diff --git a/tests/test-clone3.c b/tests/test-clone3.c index 4e9da36..dc679b9 100644 --- a/tests/test-clone3.c +++ b/tests/test-clone3.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,8 @@ #include "test-util.h" int passes = 0, fails = 0; +extern char **environ; +static const char *self_path = NULL; static void check(int cond, const char *fmt, ...) { @@ -47,6 +50,7 @@ static void check(int cond, const char *fmt, ...) 
#define CLONE3_THREAD 0x00010000 #define CLONE3_VM 0x00000100 +#define CLONE3_VFORK 0x00004000 #define CLONE3_SIGHAND 0x00000800 #define CLONE3_FILES 0x00000400 #define CLONE3_FS 0x00000200 @@ -66,6 +70,8 @@ struct clone_args { static volatile int thread_done = 0; static volatile int thread_result = 0; static volatile int thread_tid = 0; +static volatile int vfork_exec_guard = 0x13579bdf; +static volatile int parked_thread_state = 0; /* Thread entry: sets thread_result and signals done via futex */ static int thread_fn(void) @@ -78,6 +84,28 @@ static int thread_fn(void) test_unreachable(); } +static int parked_thread_fn(void) +{ + __atomic_store_n(&parked_thread_state, 1, __ATOMIC_SEQ_CST); + raw_syscall6(__NR_futex, (long) &parked_thread_state, + FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1, 0, 0, 0); + + while (__atomic_load_n(&parked_thread_state, __ATOMIC_SEQ_CST) == 1) { + raw_syscall6(__NR_futex, (long) &parked_thread_state, + FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 1, 0, 0, 0); + } + + raw_exit(0); + test_unreachable(); +} + +static uint64_t read_sp(void) +{ + uint64_t sp; + __asm__ volatile("mov %0, sp" : "=r"(sp)); + return sp; +} + /* Test 1: clone3 basic fork */ static void test_fork(void) { @@ -339,8 +367,309 @@ static void test_stack_overflow(void) ret); } -int main(void) +/* Test 11: CLONE_VM|CLONE_VFORK must not reuse the in-process VM-clone path. + * The child execs this same binary with a marker argument and exits 23. + */ +static void test_vfork_exec(void) +{ + struct clone_args ca; + memset(&ca, 0, sizeof(ca)); + ca.flags = CLONE3_VM | CLONE3_VFORK; + ca.exit_signal = 17; /* SIGCHLD */ + + vfork_exec_guard = 0x13579bdf; + + long ret = raw_clone3(&ca, CLONE_ARGS_SIZE_VER0); + if (ret < 0) { + CHECK(0, "clone3 vfork failed with %ld", ret); + return; + } + + if (ret == 0) { + char *child_argv[] = {(char *) self_path, + (char *) "--clone3-vfork-child", NULL}; + execve(self_path, child_argv, environ); + _exit(127); + } + + int status = 0; + pid_t waited = waitpid((pid_t) ret, &status, 0); + CHECK(waited == (pid_t) ret, "vfork waitpid returned %d, expected %ld", + waited, ret); + CHECK(WIFEXITED(status) && WEXITSTATUS(status) == 23, + "vfork+exec child exit status: 0x%x (expected exit 23)", status); + CHECK(vfork_exec_guard == 0x13579bdf, + "vfork guard changed to 0x%x after child exec", + (unsigned int) vfork_exec_guard); +} + +/* Test 12: CLONE_VM|CLONE_VFORK with an explicit child stack must resume the + * child on that stack before it executes guest code. + */ +static void test_vfork_child_stack(void) +{ + size_t stack_size = 65536; + void *stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + CHECK(stack != MAP_FAILED, "mmap for vfork child stack test failed"); + if (stack == MAP_FAILED) + return; + + struct clone_args ca; + memset(&ca, 0, sizeof(ca)); + ca.flags = CLONE3_VM | CLONE3_VFORK; + ca.exit_signal = 17; /* SIGCHLD */ + ca.stack = (uint64_t) stack; + ca.stack_size = stack_size; + + long ret = raw_clone3(&ca, CLONE_ARGS_SIZE_VER0); + if (ret < 0) { + CHECK(0, "clone3 vfork with child stack failed with %ld", ret); + munmap(stack, stack_size); + return; + } + + if (ret == 0) { + uint64_t sp = read_sp(); + _exit((sp >= (uint64_t) stack && sp <= (uint64_t) stack + stack_size) + ? 
24 + : 125); + } + + int status = 0; + pid_t waited = waitpid((pid_t) ret, &status, 0); + CHECK(waited == (pid_t) ret, + "vfork child-stack waitpid returned %d, expected %ld", waited, ret); + CHECK(WIFEXITED(status) && WEXITSTATUS(status) == 24, + "vfork child stack exit status: 0x%x (expected exit 24)", status); + munmap(stack, stack_size); +} + +static void test_vfork_exec_unblocks_parent(void) { + struct clone_args ca; + memset(&ca, 0, sizeof(ca)); + ca.flags = CLONE3_VM | CLONE3_VFORK; + ca.exit_signal = 17; /* SIGCHLD */ + + struct timespec start, end; + CHECK(clock_gettime(CLOCK_MONOTONIC, &start) == 0, + "clock_gettime(start) failed"); + + long ret = raw_clone3(&ca, CLONE_ARGS_SIZE_VER0); + if (ret < 0) { + CHECK(0, "clone3 vfork unblock test failed with %ld", ret); + return; + } + + if (ret == 0) { + char *child_argv[] = {(char *) self_path, + (char *) "--clone3-vfork-sleep-child", NULL}; + execve(self_path, child_argv, environ); + _exit(127); + } + + CHECK(clock_gettime(CLOCK_MONOTONIC, &end) == 0, + "clock_gettime(end) failed"); + long elapsed_ms = (long) (end.tv_sec - start.tv_sec) * 1000L + + (long) (end.tv_nsec - start.tv_nsec) / 1000000L; + CHECK(elapsed_ms < 500, + "vfork parent resumed after %ld ms (expected < 500 ms)", elapsed_ms); + + int status = 0; + pid_t waited = waitpid((pid_t) ret, &status, 0); + CHECK(waited == (pid_t) ret, + "vfork unblock waitpid returned %d, expected %ld", waited, ret); + CHECK(WIFEXITED(status) && WEXITSTATUS(status) == 25, + "vfork unblock child exit status: 0x%x (expected exit 25)", status); +} + +/* Test 13: munmap overlapping an active clone3 stack must be cleaned up after + * the thread exits so the same VA can be reused. + */ +static void test_deferred_stack_munmap(void) +{ + size_t stack_size = 65536; + void *stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + CHECK(stack != MAP_FAILED, "mmap for deferred stack test failed"); + if (stack == MAP_FAILED) + return; + + parked_thread_state = 0; + thread_tid = 0; + + struct clone_args ca; + memset(&ca, 0, sizeof(ca)); + ca.flags = CLONE3_THREAD | CLONE3_VM | CLONE3_SIGHAND | CLONE3_FILES | + CLONE3_FS | CLONE3_CHILD_CLEARTID | CLONE3_CHILD_SETTID; + ca.exit_signal = 0; + ca.stack = (uint64_t) stack; + ca.stack_size = stack_size; + ca.child_tid = (uint64_t) &thread_tid; + + long ret = raw_clone3(&ca, CLONE_ARGS_SIZE_VER0); + if (ret == 0) { + parked_thread_fn(); + __builtin_unreachable(); + } + + CHECK(ret > 0, "clone3 parked thread returned %ld", ret); + if (ret < 0) { + munmap(stack, stack_size); + return; + } + + while (__atomic_load_n(&parked_thread_state, __ATOMIC_SEQ_CST) == 0) { + raw_syscall6(__NR_futex, (long) &parked_thread_state, + FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, 0, 0, 0); + } + + CHECK(munmap(stack, stack_size) == 0, + "munmap of live child stack failed unexpectedly"); + + __atomic_store_n(&parked_thread_state, 2, __ATOMIC_SEQ_CST); + raw_syscall6(__NR_futex, (long) &parked_thread_state, + FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1, 0, 0, 0); + + while (__atomic_load_n(&thread_tid, __ATOMIC_SEQ_CST) != 0) { + int tid = __atomic_load_n(&thread_tid, __ATOMIC_SEQ_CST); + raw_syscall6(__NR_futex, (long) &thread_tid, + FUTEX_WAIT | FUTEX_PRIVATE_FLAG, tid, 0, 0, 0); + } + + void *reuse = + mmap(stack, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + CHECK(reuse == stack, "stack reuse mmap returned %p (expected %p)", reuse, + stack); + if (reuse != MAP_FAILED && reuse == stack) + munmap(reuse, 
stack_size); +} + +/* Test 14: partial munmap overlap with a live clone3 stack must still unmap + * the non-overlapping portion immediately, then release the deferred slice once + * the thread exits. + */ +static void test_partial_deferred_stack_munmap(void) +{ + size_t stack_size = 65536; + size_t span_size = stack_size * 2; + void *span = + mmap(NULL, span_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + CHECK(span != MAP_FAILED, + "mmap reserve for partial deferred stack test failed"); + if (span == MAP_FAILED) + return; + CHECK(munmap(span, span_size) == 0, + "munmap reserve for partial deferred stack test failed"); + + void *other = + mmap(span, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + CHECK(other == span, "mmap non-stack half returned %p (expected %p)", other, + span); + if (other != span) { + if (other != MAP_FAILED) + munmap(other, stack_size); + return; + } + + void *stack = + mmap((char *) span + stack_size, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + CHECK(stack == (char *) span + stack_size, + "mmap stack half returned %p (expected %p)", stack, + (char *) span + stack_size); + if (stack != (char *) span + stack_size) { + if (stack != MAP_FAILED) + munmap(stack, stack_size); + munmap(other, stack_size); + return; + } + + parked_thread_state = 0; + thread_tid = 0; + + struct clone_args ca; + memset(&ca, 0, sizeof(ca)); + ca.flags = CLONE3_THREAD | CLONE3_VM | CLONE3_SIGHAND | CLONE3_FILES | + CLONE3_FS | CLONE3_CHILD_CLEARTID | CLONE3_CHILD_SETTID; + ca.exit_signal = 0; + ca.stack = (uint64_t) stack; + ca.stack_size = stack_size; + ca.child_tid = (uint64_t) &thread_tid; + + long ret = raw_clone3(&ca, CLONE_ARGS_SIZE_VER0); + if (ret == 0) { + parked_thread_fn(); + __builtin_unreachable(); + } + + CHECK(ret > 0, "clone3 parked thread for partial unmap returned %ld", ret); + if (ret < 0) { + munmap(other, stack_size); + munmap(stack, stack_size); + return; + } + + while (__atomic_load_n(&parked_thread_state, __ATOMIC_SEQ_CST) == 0) { + raw_syscall6(__NR_futex, (long) &parked_thread_state, + FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, 0, 0, 0); + } + + CHECK(munmap(span, span_size) == 0, + "partial munmap spanning live child stack failed unexpectedly"); + + void *reuse_other = + mmap(span, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + CHECK(reuse_other == span, + "non-overlapping half reuse mmap returned %p (expected %p)", + reuse_other, span); + if (reuse_other == span) + munmap(reuse_other, stack_size); + + void *reuse_stack = + mmap((char *) span + stack_size, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + CHECK(reuse_stack == MAP_FAILED, + "live overlapping stack slice unexpectedly became reusable"); + if (reuse_stack != MAP_FAILED) + munmap(reuse_stack, stack_size); + + __atomic_store_n(&parked_thread_state, 2, __ATOMIC_SEQ_CST); + raw_syscall6(__NR_futex, (long) &parked_thread_state, + FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1, 0, 0, 0); + + while (__atomic_load_n(&thread_tid, __ATOMIC_SEQ_CST) != 0) { + int tid = __atomic_load_n(&thread_tid, __ATOMIC_SEQ_CST); + raw_syscall6(__NR_futex, (long) &thread_tid, + FUTEX_WAIT | FUTEX_PRIVATE_FLAG, tid, 0, 0, 0); + } + + reuse_stack = + mmap((char *) span + stack_size, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + CHECK(reuse_stack == (char *) span + stack_size, + "deferred 
overlapping half reuse mmap returned %p (expected %p)",
+          reuse_stack, (char *) span + stack_size);
+    if (reuse_stack == (char *) span + stack_size)
+        munmap(reuse_stack, stack_size);
+}
+
+int main(int argc, char **argv)
+{
+    if (argc > 1 && !strcmp(argv[1], "--clone3-vfork-child"))
+        return 23;
+    if (argc > 1 && !strcmp(argv[1], "--clone3-vfork-sleep-child")) {
+        struct timespec ts = {.tv_sec = 1, .tv_nsec = 0};
+        nanosleep(&ts, NULL);
+        return 25;
+    }
+
+    self_path = argv[0];
+
     printf("test-clone3: starting\n");
 
     test_fork();
@@ -353,6 +682,11 @@ int main(void)
     test_thread_with_signal();
     test_stack_mismatch();
     test_stack_overflow();
+    test_vfork_exec();
+    test_vfork_child_stack();
+    test_vfork_exec_unblocks_parent();
+    test_deferred_stack_munmap();
+    test_partial_deferred_stack_munmap();
 
     SUMMARY("test-clone3");
     return fails > 0 ? 1 : 0;
diff --git a/tests/test-mmap-hint.c b/tests/test-mmap-hint.c
new file mode 100644
index 0000000..4165119
--- /dev/null
+++ b/tests/test-mmap-hint.c
@@ -0,0 +1,77 @@
+/* Low-address mmap hint regression test
+ *
+ * Copyright 2026 elfuse contributors
+ * Copyright 2025 Moritz Angermann, zw3rk pte. ltd.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Verifies that non-fixed anonymous mmap() honors a free low address hint
+ * such as 0x400000. box64 uses this pattern to reserve the ET_EXEC image
+ * window for static x86-64 binaries.
+ */
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/mman.h>
+
+#include "test-harness.h"
+
+#ifndef MAP_FIXED_NOREPLACE
+#define MAP_FIXED_NOREPLACE 0x100000
+#endif
+
+int passes = 0, fails = 0;
+
+static void *reserve_free_low_hint(size_t len)
+{
+    static const uintptr_t candidates[] = {
+        0x00400000ULL, 0x00800000ULL, 0x01000000ULL,
+        0x02000000ULL, 0x04000000ULL, 0x06000000ULL,
+    };
+
+    for (size_t i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++) {
+        void *hint = (void *) candidates[i];
+        void *p =
+            mmap(hint, len, PROT_NONE,
+                 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
+        if (p == MAP_FAILED) {
+            if (errno == EEXIST || errno == EINVAL)
+                continue;
+            return MAP_FAILED;
+        }
+        return p;
+    }
+
+    errno = ENOMEM;
+    return MAP_FAILED;
+}
+
+static void test_low_hint_exact(void)
+{
+    TEST("mmap low hint preserves ET_EXEC-style address");
+
+    size_t len = 0x21000;
+    void *hint = reserve_free_low_hint(len);
+    if (hint == MAP_FAILED) {
+        FAIL("no free low hint candidate");
+        return;
+    }
+    munmap(hint, len);
+
+    void *p = mmap(hint, len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (p == MAP_FAILED) {
+        FAIL("mmap failed");
+        return;
+    }
+
+    EXPECT_TRUE((uintptr_t) p == (uintptr_t) hint,
+                "low mmap hint should be honored when range is free");
+    munmap(p, len);
+}
+
+int main(void)
+{
+    test_low_hint_exact();
+    SUMMARY("test-mmap-hint");
+    return fails ?
1 : 0; +} diff --git a/tests/test-thread.c b/tests/test-thread.c index da427df..89a4924 100644 --- a/tests/test-thread.c +++ b/tests/test-thread.c @@ -20,6 +20,10 @@ int passes = 0, fails = 0; +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif + /* Shared state */ /* Shared variable written by child thread, read by parent */ @@ -32,6 +36,7 @@ static volatile int child_tid = 0; /* Synchronization flag: child sets to 1 when done, parent waits */ static volatile int done_flag = 0; +static volatile int parked_state = 0; /* Child thread function */ @@ -55,6 +60,17 @@ static void child_work(void) raw_exit(0); } +static void parked_child_work(void) +{ + parked_state = 1; + raw_futex_wake((int *) &parked_state, 1); + + while (parked_state == 1) + raw_futex_wait((int *) &parked_state, 1); + + raw_exit(0); +} + /* Tests */ /* Stack for child thread (8KiB, 16-byte aligned) */ @@ -200,6 +216,61 @@ static void test_multi_thread(void) #undef N_THREADS +static void test_clone_stack_unmap_reuse(void) +{ + TEST("clone stack munmap reuse"); + + size_t stack_size = 65536; + void *stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (stack == MAP_FAILED) { + FAIL("mmap failed"); + return; + } + + parked_state = 0; + child_tid = 0; + + unsigned long flags = 0x7d0f00; + void *stack_top = (char *) stack + stack_size; + long ret = + raw_clone(flags, stack_top, (int *) &child_tid, 0, (int *) &child_tid); + + if (ret == 0) { + parked_child_work(); + __builtin_unreachable(); + } + if (ret < 0) { + munmap(stack, stack_size); + FAIL("clone returned error"); + return; + } + + while (parked_state == 0) + raw_futex_wait((int *) &parked_state, 0); + + if (munmap(stack, stack_size) != 0) { + FAIL("munmap failed"); + return; + } + + parked_state = 2; + raw_futex_wake((int *) &parked_state, 1); + + while (child_tid != 0) + raw_futex_wait((int *) &child_tid, child_tid); + + void *reuse = + mmap(stack, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + if (reuse == stack) { + munmap(reuse, stack_size); + PASS(); + } else { + FAIL("stack VA was not reusable"); + } +} + /* Main */ int main(void) @@ -209,6 +280,7 @@ int main(void) test_clone_thread(); test_parent_settid(); test_multi_thread(); + test_clone_stack_unmap_reuse(); SUMMARY("test-thread"); return fails > 0 ? 1 : 0; diff --git a/tests/test-tier-b.c b/tests/test-tier-b.c index 8594524..90668b5 100644 --- a/tests/test-tier-b.c +++ b/tests/test-tier-b.c @@ -31,6 +31,10 @@ #include "test-harness.h" +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif + int passes = 0, fails = 0; /* fchmodat2 (SYS 452). 
*/ @@ -39,6 +43,10 @@ int passes = 0, fails = 0; #define SYS_fchmodat2 452 #endif +#ifndef SYS_getcpu +#define SYS_getcpu 168 +#endif + static void test_fchmodat2_basic(void) { TEST("fchmodat2 basic"); @@ -62,6 +70,19 @@ static void test_fchmodat2_basic(void) EXPECT_TRUE((st.st_mode & 0777) == 0644, "mode mismatch"); } +static void test_getcpu_basic(void) +{ + TEST("getcpu basic"); + unsigned cpu = 99, node = 99; + long rc = syscall(SYS_getcpu, &cpu, &node, 0); + if (rc < 0) { + FAIL("getcpu"); + return; + } + EXPECT_TRUE(cpu == 0, "cpu should be 0"); + EXPECT_TRUE(node == 0, "node should be 0"); +} + static void test_fchmodat2_symlink_nofollow(void) { TEST("fchmodat2 AT_SYMLINK_NOFOLLOW"); @@ -1378,6 +1399,41 @@ static void test_proc_cpuinfo_all_cpus(void) } } +static void test_mmap_low_hint_exact(void) +{ + TEST("mmap low hint preserves ET_EXEC-style address"); + size_t len = 0x21000; + static const uintptr_t candidates[] = { + 0x00400000ULL, 0x00800000ULL, 0x01000000ULL, + 0x02000000ULL, 0x04000000ULL, 0x06000000ULL, + }; + void *hint = MAP_FAILED; + for (size_t i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++) { + hint = mmap((void *) candidates[i], len, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + if (hint != MAP_FAILED) + break; + if (errno != EEXIST && errno != EINVAL) { + FAIL("probe mmap"); + return; + } + } + if (hint == MAP_FAILED) { + FAIL("no free low hint candidate"); + return; + } + munmap(hint, len); + + void *p = mmap(hint, len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap"); + return; + } + EXPECT_TRUE((uintptr_t) p == (uintptr_t) hint, + "low mmap hint should be honored when range is free"); + munmap(p, len); +} + int main(void) { printf("Tier B correctness tests:\n"); @@ -1385,6 +1441,7 @@ int main(void) /* fchmodat2 */ test_fchmodat2_basic(); test_fchmodat2_symlink_nofollow(); + test_getcpu_basic(); /* openat2 RESOLVE_* */ test_openat2_basic(); @@ -1428,6 +1485,7 @@ int main(void) test_proc_net_tcp_sl_dense(); test_proc_net_dirfd_openat_uses_virtual_entries(); test_proc_cpuinfo_all_cpus(); + test_mmap_low_hint_exact(); /* signalfd */ test_signalfd_efault_preserves_pending();
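(Aside, illustrative only — not part of the diff: glibc's sched_getcpu() is typically backed by getcpu(2), so once getcpu(168) answers, a guest program can observe the synthetic topology directly. A minimal sketch:)

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        int cpu = sched_getcpu();   /* returns -1 with errno set on failure */
        printf("running on cpu %d\n", cpu);
        return cpu == 0 ? 0 : 1;    /* elfuse reports a single online CPU 0 */
    }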