From 1140b1345de590400a2c4b76efcbde64be4c0ebb Mon Sep 17 00:00:00 2001
From: Jim Huang <jserv@ccns.ncku.edu.tw>
Date: Thu, 7 May 2026 18:23:18 +0800
Subject: [PATCH] Honor MAP_SHARED coherence across fork

Both fork paths (CoW shm and legacy IPC byte-copy) silently broke
MAP_SHARED visibility across fork: the child mapped the slab MAP_PRIVATE
or got a fresh byte copy, so writes from either side stayed local and
never reached the kernel page cache the parent shared with the file.
MAP_SHARED|MAP_ANONYMOUS, the standard parent-child IPC primitive used
by Postgres and other multi-process daemons, was equally broken.

Three pieces close the gap:
1. Parent-side conversion (mmap_fork_prepare_anon_shared, with
   commit/abort wrappers). While siblings are quiesced the fork
   thread walks live regions, promotes each MAP_SHARED|MAP_ANONYMOUS
   region without a backing fd into a memfd-style overlay
   (mkstemp+unlink+ftruncate, pwrite-seed from host_base, host
   MAP_FIXED|MAP_SHARED via the new hvf_apply_file_overlay_quiesced
   helper, mark_overlay_metadata_range), and pre-stages per-region
   dup() fds so a transient EMFILE rolls back cleanly. The candidate
   filter skips regions whose host-page-rounded tail would alias a
   neighbor mapping. The transactional commit/abort wrappers let the
   fork-IPC failure path roll back the in-place conversion (overlay
   teardown plus region metadata restore) before resuming siblings;
   abort validates every captured snapshot before tearing down so a
   sibling-drift past the quiesce timeout does not leave host VA out
   of sync with semantic state. forkipc.c logs a warning when abort
   returns a partial failure so the parent's stale state is visible
   in post-mortem.
2. Child-side restoration (mmap_fork_restore_overlays). The recv
   path now snapshots parent overlay_active/start/end (and a new
   parent_had_fd[] mirror) before clearing inherited state, then
   re-runs hvf_apply_file_overlay against the saved overlay span
   once SCM_RIGHTS delivers the backing fds. The inner quiesce is a
   no-op since no worker vCPUs exist yet.
3. Pre-existing fork-IPC fd-to-region misassignment bug. The old
   recv_backing_fds filter (!MAP_ANONYMOUS && offset != -1) also matched
   the shim region (LINUX_MAP_PRIVATE, offset 0) and ELF text segments,
   which silently stole incoming SCM_RIGHTS fds and left the actual
   file-backed regions with backing_fd=-1. The receiver now uses
   parent_had_fd[] as the filter so its iteration order matches the
   sender's "backing_fd >= 0" filter exactly. Unassigned fds are closed
   instead of leaked.

hvf_apply_file_overlay and hvf_remove_file_overlay are each split into
an outer wrapper that performs thread_quiesce_siblings /
thread_resume_siblings and a _quiesced inner variant that the parent
fork-prep / abort paths call without a nested barrier.

Locked in by tests/test-cross-fork-mapshared.c (3 cases: file-backed
mkstemp, MAP_SHARED|MAP_ANONYMOUS, /dev/shm via shm_open). Each case
verifies pre-fork seed visibility, child-write-visible-to-parent,
parent-write-visible-to-child, and on-disk reconciliation. All three
pass against Linux ground truth via tests/qemu-runner.sh.
---
 src/runtime/fork-state.c          | 137 +++++--
 src/runtime/forkipc.c             |  80 ++--
 src/syscall/mem.c                 | 613 +++++++++++++++++++++++++-----
 src/syscall/mem.h                 |  48 +++
 tests/manifest.txt                |   3 +
 tests/test-cross-fork-mapshared.c | 458 ++++++++++++++++++++++
 6 files changed, 1202 insertions(+), 137 deletions(-)
 create mode 100644 tests/test-cross-fork-mapshared.c

diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c
index 1cb99f2..5fd7b08 100644
--- a/src/runtime/fork-state.c
+++ b/src/runtime/fork-state.c
@@ -21,6 +21,7 @@
 #include "debug/log.h"
 #include "syscall/abi.h"
 #include "syscall/internal.h"
+#include "syscall/mem.h"
 #include "syscall/proc.h"
 
 int fork_ipc_write_all(int fd, const void *buf, size_t len)
@@ -494,7 +495,9 @@ static int fork_ipc_drain_bytes(int ipc_fd, uint32_t len)
     return 0;
 }
 
-static int fork_ipc_recv_backing_fds(int ipc_fd, guest_t *g)
+static int fork_ipc_recv_backing_fds(int ipc_fd,
+                                     guest_t *g,
+                                     const bool *parent_had_fd)
 {
     uint32_t nbacking;
     if (fork_ipc_read_all(ipc_fd, &nbacking, sizeof(nbacking)) < 0) {
@@ -518,19 +521,59 @@ static int fork_ipc_recv_backing_fds(int ipc_fd, guest_t *g)
         .msg_controllen = cmsg_sz,
     };
     ssize_t nr = recvmsg(ipc_fd, &msg, 0);
-    if (nr > 0) {
-        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
-        if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
-            cmsg->cmsg_type == SCM_RIGHTS) {
-            int *region_fds = (int *) CMSG_DATA(cmsg);
-            uint32_t fi = 0;
-            for (int i = 0; i < g->nregions && fi < nbacking; i++) {
-                if (!(g->regions[i].flags & LINUX_MAP_ANONYMOUS) &&
-                    g->regions[i].offset != (uint64_t) -1) {
-                    g->regions[i].backing_fd = region_fds[fi++];
-                }
-            }
-        }
+    if (nr <= 0) {
+        free(cmsg_buf);
+        return -1;
+    }
+
+    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+    if (msg.msg_flags & MSG_CTRUNC) {
+        log_error("fork-child: backing fd SCM_RIGHTS payload truncated");
+        free(cmsg_buf);
+        return -1;
+    }
+    if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
+        cmsg->cmsg_type != SCM_RIGHTS) {
+        log_error("fork-child: missing backing fd SCM_RIGHTS payload");
+        free(cmsg_buf);
+        return -1;
+    }
+    if (cmsg->cmsg_len < CMSG_LEN(0)) {
+        free(cmsg_buf);
+        return -1;
+    }
+
+    int *region_fds = (int *) CMSG_DATA(cmsg);
+    uint32_t nreceived =
+        (uint32_t) ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int));
+    uint32_t fi = 0;
+
+    /* Sender (fork_ipc_send_backing_fds) iterates regions and sends one fd per
+     * region with backing_fd >= 0. The receiver must iterate in the same order
+     * over regions that had backing_fd in the parent. parent_had_fd[i] is
+     * captured by the caller before backing_fd is cleared.
+     *
+     * The original filter (!MAP_ANONYMOUS && offset != -1) matched extra
+     * regions like the shim and ELF text, so the first received fd was
+     * misassigned and the actual file-backed region was left without
+     * backing_fd.
+     */
+    for (int i = 0; i < g->nregions && fi < nreceived; i++) {
+        if (parent_had_fd && parent_had_fd[i])
+            g->regions[i].backing_fd = region_fds[fi++];
+    }
+
+    /* Close any received fds that did not get assigned: avoids leaking host fds
+     * into the child's process table when a mismatch occurs.
+     */
+    while (fi < nreceived)
+        close(region_fds[fi++]);
+
+    if (nreceived != nbacking) {
+        log_error("fork-child: expected %u backing fds but received %u",
+                  nbacking, nreceived);
+        free(cmsg_buf);
+        return -1;
     }
     free(cmsg_buf);
     return 0;
@@ -618,23 +661,73 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig)
         return -1;
     }
     g->nregions = (int) num_guest_regions;
+
+    /* Capture parent state before clearing the inherited overlay/backing fd
+     * fields. parent_had_fd lets recv_backing_fds iterate in the same order the
+     * sender used (regions with backing_fd >= 0); the parent_ovl_* arrays let
+     * mmap_fork_restore_overlays know which regions to re-install, with what
+     * overlay span. Heap-allocated to avoid pushing hundreds of KiB onto the
+     * recv stack frame.
+     */
+    bool *parent_had_fd = NULL;
+    bool *parent_active = NULL;
+    uint64_t *parent_ovl_start = NULL;
+    uint64_t *parent_ovl_end = NULL;
+    if (g->nregions > 0) {
+        parent_had_fd = calloc((size_t) g->nregions, sizeof(*parent_had_fd));
+        parent_active = calloc((size_t) g->nregions, sizeof(*parent_active));
+        parent_ovl_start =
+            calloc((size_t) g->nregions, sizeof(*parent_ovl_start));
+        parent_ovl_end = calloc((size_t) g->nregions, sizeof(*parent_ovl_end));
+        if (!parent_had_fd || !parent_active || !parent_ovl_start ||
+            !parent_ovl_end) {
+            log_error("fork-child: parent overlay buffer alloc failed");
+            free(parent_had_fd);
+            free(parent_active);
+            free(parent_ovl_start);
+            free(parent_ovl_end);
+            return -1;
+        }
+        for (int i = 0; i < g->nregions; i++) {
+            parent_had_fd[i] = (g->regions[i].backing_fd >= 0);
+            parent_active[i] = g->regions[i].overlay_active;
+            parent_ovl_start[i] = g->regions[i].overlay_start;
+            parent_ovl_end[i] = g->regions[i].overlay_end;
+        }
+    }
+
     for (int i = 0; i < g->nregions; i++) {
         g->regions[i].backing_fd = -1;
-        /* Demote inherited overlays: the child does not yet re-establish
-         * host MAP_FIXED|MAP_SHARED mappings from the parent's overlay
-         * fds, so msync, MADV_DONTNEED and friends must use the
-         * snapshot-style emulation. The CoW path's pre-fork sync of
-         * overlay bytes into shm_fd already gave the child snapshot the
-         * correct content at fork time. Live cross-fork MAP_SHARED
-         * coherence is the next P1 TODO item.
+        /* Drop inherited overlay metadata; the host MAP_FIXED|MAP_SHARED
+         * mapping does not exist yet in the child. Re-establishment runs after
+         * fork_ipc_recv_backing_fds populates backing_fd from the
+         * parent-supplied SCM_RIGHTS bundle.
          */
         g->regions[i].overlay_active = false;
         g->regions[i].overlay_start = 0;
         g->regions[i].overlay_end = 0;
     }
 
-    if (fork_ipc_recv_backing_fds(ipc_fd, g) < 0)
+    if (fork_ipc_recv_backing_fds(ipc_fd, g, parent_had_fd) < 0) {
+        free(parent_had_fd);
+        free(parent_active);
+        free(parent_ovl_start);
+        free(parent_ovl_end);
         return -1;
+    }
+
+    /* Re-install MAP_SHARED overlays for every region the parent had as
+     * overlay_active and that now carries a backing fd. Failures here fall back
+     * to snapshot semantics for the affected region; the child still boots and
+     * can run.
+     */
+    if (g->nregions > 0)
+        (void) mmap_fork_restore_overlays(g, parent_active, parent_ovl_start,
+                                          parent_ovl_end);
+    free(parent_had_fd);
+    free(parent_active);
+    free(parent_ovl_start);
+    free(parent_ovl_end);
 
     if (fork_ipc_read_all(ipc_fd, sig, sizeof(*sig)) < 0) {
         log_error("fork-child: failed to read signal state");
diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c
index 03a7683..c12a760 100644
--- a/src/runtime/forkipc.c
+++ b/src/runtime/forkipc.c
@@ -35,6 +35,7 @@
 
 #include "syscall/abi.h"
 #include "syscall/internal.h"
+#include "syscall/mem.h"
 #include "syscall/net.h"  /* absock namespace IPC state */
 #include "syscall/poll.h" /* wakeup_pipe_signal */
 #include "syscall/proc.h"
@@ -89,8 +90,8 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
     absock_set_namespace_id(hdr.absock_namespace_id);
     proc_set_session(hdr.sid, hdr.pgid);
 
-    /* Create guest memory before receiving state so all incoming offsets can
-     * be bounds-checked against the negotiated guest size.
+    /* Create guest memory before receiving state so all incoming offsets can be
+     * bounds-checked against the negotiated guest size.
      */
     guest_t g;
 
@@ -176,6 +177,7 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
         guest_destroy(&g);
         return 1;
     }
+
     /* POSIX: "Signals pending to the parent shall not be pending to the child."
      * Clear pending bitmask and RT queue before applying state.
      * signal_set_state() is deferred until after thread_register_main()
@@ -218,17 +220,17 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, regs.tpidr_el0));
 
     /* Enable MMU directly (page tables already in guest memory from IPC).
-     * SCTLR must include MMU-enable (M), caches (C, I), RES1 bits,
-     * and EL0 cache maintenance access (UCI, UCT) for JIT translators.
+     * SCTLR must include MMU-enable (M), caches (C, I), RES1 bits, and EL0
+     * cache maintenance access (UCI, UCT) for JIT translators.
      */
     uint64_t sctlr_with_mmu = SCTLR_RES1 | SCTLR_M | SCTLR_C | SCTLR_I |
                               SCTLR_DZE | SCTLR_UCT | SCTLR_UCI;
     HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, sctlr_with_mmu));
 
-    /* Restore all 31 GPRs from parent state, then override X0=0 (child
-     * clone return value). This preserves X1-X30 exactly as they were when
-     * the parent called clone(), which is required by the Linux syscall ABI
-     * (especially callee-saved X19-X28, FP=X29, LR=X30).
+    /* Restore all 31 GPRs from parent state, then override X0=0 (child clone
+     * return value). This preserves X1-X30 exactly as they were when the parent
+     * called clone(), which is required by the Linux syscall ABI (especially
+     * callee-saved X19-X28, FP=X29, LR=X30).
      */
     vcpu_restore_gprs(vcpu, regs.x);
     vcpu_set_gpr(vcpu, 0, 0); /* Child gets 0 from clone */
@@ -246,14 +248,14 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
 
     /* Register the fork child's main thread in the thread table.
      * Without this, current_thread is NULL and any syscall handler that
-     * accesses per-thread state (signal masks, ptrace, CLONE_THREAD)
-     * will dereference NULL.
+     * accesses per-thread state (signal masks, ptrace, CLONE_THREAD) will
+     * dereference NULL.
      */
     thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1);
 
     /* Now that current_thread is set, apply signal state. This must happen
-     * after thread_register_main() so the per-thread blocked mask and
-     * altstack are properly restored to the thread entry.
+     * after thread_register_main() so the per-thread blocked mask and altstack
+     * are properly restored to the thread entry.
      */
     signal_set_state(&sig);
 
@@ -921,6 +923,22 @@ int64_t sys_clone(hv_vcpu_t vcpu,
      */
     thread_quiesce_siblings();
 
+    mmap_fork_anon_shared_txn_t *anon_shared_txn = NULL;
+    guest_region_t *regions_snapshot = NULL;
+
+    /* Convert MAP_SHARED|MAP_ANONYMOUS regions that have no backing fd
+     * into memfd-style overlay regions backed by a private, unlinked temp
+     * file. The conversion seeds that file with the current bytes and
+     * installs a host MAP_SHARED|MAP_FIXED overlay on the parent. The child
+     * receives the fd via SCM_RIGHTS and re-installs its own overlay so
+     * subsequent writes from either side flow through the kernel page cache
+     * and reach the other. File-backed MAP_SHARED regions already carry a
+     * backing fd and are unaffected. Misaligned shared regions
+     * (snapshot-style) remain incoherent across fork by design.
+     */
+    if (mmap_fork_prepare_anon_shared(g, &anon_shared_txn) < 0)
+        goto fail_snapshot;
+
     /* Determine if elfuse can use the CoW (shm) fast path.
      * If shm_fd >= 0, elfuse freezes a snapshot via MAP_PRIVATE and sends the
      * shm fd to the child. Otherwise fall back to region-by-region copy.
@@ -947,8 +965,6 @@ int64_t sys_clone(hv_vcpu_t vcpu,
      * but before sibling vCPUs resume. Declared up front so all goto paths to
      * fail_snapshot can free it unconditionally.
      */
-    guest_region_t *regions_snapshot = NULL;
-
     /* Header */
     ipc_header_t hdr = {
         .magic = IPC_MAGIC_HEADER,
@@ -1064,9 +1080,7 @@ int64_t sys_clone(hv_vcpu_t vcpu,
     if (nregions_snapshot > 0) {
         regions_snapshot = malloc(snap_sz);
         if (!regions_snapshot) {
-            thread_resume_siblings();
-            close(ipc_sock);
-            return -LINUX_ENOMEM;
+            goto fail_snapshot;
         }
         memcpy(regions_snapshot, g->regions, snap_sz);
     }
@@ -1074,15 +1088,17 @@ int64_t sys_clone(hv_vcpu_t vcpu,
     if (fork_ipc_send_fd_table(ipc_sock) < 0)
         goto fail_snapshot;
 
-    /* Resume sibling vCPUs now that the memory snapshot, semantic region
-     * snapshot, and FD snapshot have been serialized.
-     */
-    thread_resume_siblings();
-
     uint32_t num_guest_regions = (uint32_t) nregions_snapshot;
     if (fork_ipc_send_process_state(ipc_sock, regions_snapshot,
                                     num_guest_regions) < 0)
-        goto fail_ipc;
+        goto fail_snapshot;
+
+    /* The process-state payload includes the SCM_RIGHTS handoff for region
+     * backing fds. Keep siblings quiesced until that send completes so a
+     * concurrent munmap/remap cannot close or recycle the captured fd numbers.
+     */
+    thread_resume_siblings();
+    mmap_fork_commit_anon_shared(&anon_shared_txn);
 
     close(ipc_sock);
 
@@ -1112,13 +1128,21 @@ int64_t sys_clone(hv_vcpu_t vcpu,
     free(regions_snapshot);
     return child_guest_pid;
 
-fail_ipc:
-    free(regions_snapshot);
-    close(ipc_sock);
-    return -LINUX_ENOMEM;
-
 fail_snapshot:
     free(regions_snapshot);
+    /* Roll back the in-place anon-shared overlay conversion while
+     * siblings are still parked. A partial rollback failure (e.g.,
+     * region drift past the quiesce timeout) leaves the parent in a
+     * mixed state: the originating fork-IPC error is the user-visible
+     * one, but log abort failures so post-mortem can spot the
+     * lingering overlay without grepping for behavioral symptoms.
+     */
+    int abort_rc = mmap_fork_abort_anon_shared(g, &anon_shared_txn);
+    if (abort_rc < 0)
+        log_warn(
+            "clone: anon-shared rollback partial failure (%d); parent "
+            "may have stale memfd-backed regions",
+            abort_rc);
     thread_resume_siblings();
     close(ipc_sock);
     return -LINUX_ENOMEM;
diff --git a/src/syscall/mem.c b/src/syscall/mem.c
index 9cfdcd5..fc57998 100644
--- a/src/syscall/mem.c
+++ b/src/syscall/mem.c
@@ -25,17 +25,17 @@
 #include "syscall/internal.h"
 #include "syscall/mem.h"
 
-/* Protects mmap/brk bump allocators and page table extension. Multiple
- * threads may call mmap/brk concurrently; without this lock they could
- * get overlapping allocations or corrupt page table structures.
+/* Protects mmap/brk bump allocators and page table extension. Multiple threads
+ * may call mmap/brk concurrently; without this lock they could get overlapping
+ * allocations or corrupt page table structures.
  */
 pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER; /* Lock order: 1 */
 
-/* Host kernel page size (16 KiB on Apple Silicon, typically 4 KiB on
- * Intel macOS). MAP_FIXED requires addr/length/offset multiples of this,
- * so an overlay onto a guest 4 KiB-aligned IPA is only applicable when the
- * IPA happens to land on a host page boundary; otherwise sys_mmap falls
- * back to the pread snapshot path.
+/* Host kernel page size (16 KiB on Apple Silicon, typically 4 KiB on Intel
+ * macOS). MAP_FIXED requires addr/length/offset multiples of this, so an
+ * overlay onto a guest 4 KiB-aligned IPA is only applicable when the IPA
+ * happens to land on a host page boundary; otherwise sys_mmap falls back to the
+ * pread snapshot path.
  */
 static size_t host_page_size_cached(void)
 {
@@ -232,13 +232,12 @@ static uint64_t find_free_gap_inner(const guest_t *g,
                                     uint64_t min_addr,
                                     uint64_t max_addr)
 {
-    /* Round the search start up to the next host-page boundary so an
-     * unaligned addr hint cannot return a result that lands inside a host
-     * page already covered by a preceding region's overlay tail (the
-     * overlay extends to ALIGN_UP(r->end, hps)). Apple Silicon enforces
-     * 16 KiB host pages; aligning to the guest 4 KiB page is not enough.
-     * Advance past each walked region to the same boundary for the same
-     * reason.
+    /* Round the search start up to the next host-page boundary so an unaligned
+     * addr hint cannot return a result that lands inside a host page already
+     * covered by a preceding region's overlay tail (the overlay extends to
+     * ALIGN_UP(r->end, hps)). Apple Silicon enforces 16 KiB host pages;
+     * aligning to the guest 4 KiB page is not enough. Advance past each walked
+     * region to the same boundary for the same reason.
      */
     size_t hps = host_page_size_cached();
     uint64_t gap_start = ALIGN_UP(min_addr, hps);
@@ -250,8 +249,8 @@ static uint64_t find_free_gap_inner(const guest_t *g,
 
         /* If this region starts far enough after gap_start, the allocator found
          * a gap. Must also verify the gap is within max_addr; regions[] may
-         * contain entries beyond max_addr that could push gap_start past
-         * the valid range.
+         * contain entries beyond max_addr that could push gap_start past the
+         * valid range.
          */
         if (gap_start <= max_addr && length <= max_addr - gap_start &&
             g->regions[i].start >= gap_start + length)
@@ -267,11 +266,11 @@ static uint64_t find_free_gap_inner(const guest_t *g,
     return UINT64_MAX; /* No suitable gap found */
 }
 
-/* Find a free gap, probing the cached post-allocation hint before a full
- * scan. The hint tracks the first address after the last successful mapping
- * in each region, which avoids rescanning the same prefix on sequential
- * mmap activity. A miss falls back to the region base so holes reopened by
- * munmap are still reusable.
+/* Find a free gap, probing the cached post-allocation hint before a full scan.
+ * The hint tracks the first address after the last successful mapping in each
+ * region, which avoids rescanning the same prefix on sequential mmap activity.
+ * A miss falls back to the region base so holes reopened by munmap are still
+ * reusable.
  */
 static uint64_t find_free_gap(guest_t *g,
                               uint64_t length,
@@ -281,12 +280,12 @@ static uint64_t find_free_gap(guest_t *g,
     /* RX and RW mappings advance independently, so keep separate hints. */
     uint64_t *hint =
         (min_addr < MMAP_BASE) ? &g->mmap_rx_gap_hint : &g->mmap_rw_gap_hint;
+
     /* Advance the hint to the next host-page boundary so the following
-     * sequential allocation lands on an address that the kernel accepts
-     * for mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). The
-     * tradeoff is up to host_page-1 bytes of address-space waste per small
-     * allocation; physical pages are still demand-paged, so RAM cost is
-     * unchanged.
+     * sequential allocation lands on an address that the kernel accepts for
+     * mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). The tradeoff
+     * is up to host_page-1 bytes of address-space waste per small allocation;
+     * physical pages are still demand-paged, so RAM cost is unchanged.
      */
     size_t hps = host_page_size_cached();
 
@@ -345,6 +344,11 @@ static int hvf_apply_file_overlay(guest_t *g,
                                   uint64_t len,
                                   int fd,
                                   off_t file_off);
+static int hvf_apply_file_overlay_quiesced(guest_t *g,
+                                           uint64_t ipa,
+                                           uint64_t len,
+                                           int fd,
+                                           off_t file_off);
 static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len);
 
 static int read_file_range_to_guest(guest_t *g,
@@ -402,6 +406,20 @@ typedef struct {
     char name[sizeof(((guest_region_t *) 0)->name)];
 } region_snapshot_t;
 
+typedef struct {
+    uint64_t overlay_start;
+    uint64_t overlay_len;
+    int snap_base;
+    int nsnaps;
+} fork_overlay_snapshot_t;
+
+struct mmap_fork_anon_shared_txn {
+    int nsnaps;
+    region_snapshot_t snaps[GUEST_MAX_REGIONS];
+    int noverlays;
+    fork_overlay_snapshot_t overlays[GUEST_MAX_REGIONS];
+};
+
 static void close_region_snapshots(region_snapshot_t *snaps, int n)
 {
     for (int i = 0; i < n; i++) {
@@ -412,9 +430,9 @@ static void close_region_snapshots(region_snapshot_t *snaps, int n)
     }
 }
 
-/* Close any open dup'd backing fds in *snaps_ptr, free the heap buffer,
- * and zero out the caller's pointer/count so a follow-on call is a no-op.
- * Used for buffers allocated via malloc by sys_mmap and sys_mremap; the
+/* Close any open dup'd backing fds in *snaps_ptr, free the heap buffer, and
+ * zero out the caller's pointer/count so a follow-on call is a no-op. Used
+ * for buffers allocated via malloc by sys_mmap and sys_mremap; the
  * stack-allocated callers in capture_region_snapshots itself keep using
  * close_region_snapshots directly.
  */
@@ -546,8 +564,8 @@ static int restore_snapshot_page_tables(guest_t *g,
         guest_update_perms(g, snap->start, snap->end, page_perms);
     }
 
-    /* guest_extend_page_tables() repopulates whole 2 MiB blocks, so clear
-     * holes and deferred mappings again after all snapshot ranges are back.
+    /* guest_extend_page_tables() repopulates whole 2 MiB blocks, so clear holes
+     * and deferred mappings again after all snapshot ranges are back.
      */
     uint64_t cursor = start;
     for (int i = 0; i < n; i++) {
@@ -651,29 +669,28 @@ static int rollback_fresh_mmap_allocation(guest_t *g,
 
 /* HVF stage-2 segment management.
  *
- * The slab is mapped to HVF in 2 MiB-aligned segments tracked by
- * g->segments[]. Initially the slab is one segment (set up by guest_init).
- * MAP_SHARED file-backed mmap may need to overlay a sub-range of the slab
- * with a real host mmap MAP_FIXED|MAP_SHARED of the file fd. HVF caches
- * the host VA->PA mapping at hv_vm_map time and a plain MAP_FIXED overlay
- * does not refresh it (see comment in src/runtime/forkipc.c near line 940
- * for the empirical evidence). To force HVF to re-walk the host page
- * tables after the overlay, the affected segment is hv_vm_unmap'd, the
- * file is mmap'd MAP_FIXED|MAP_SHARED into its host VA, and the segment
- * is hv_vm_map'd again.
+ * The slab is mapped to HVF in 2 MiB-aligned segments tracked by g->segments[].
+ * Initially the slab is one segment (set up by guest_init). MAP_SHARED
+ * file-backed mmap may need to overlay a sub-range of the slab with a real host
+ * mmap MAP_FIXED|MAP_SHARED of the file fd. HVF caches the host VA->PA mapping
+ * at hv_vm_map time and a plain MAP_FIXED overlay does not refresh it (see
+ * comment in src/runtime/forkipc.c for the empirical evidence). To force HVF to
+ * re-walk the host page tables after the overlay, the affected segment is
+ * hv_vm_unmap'd, the file is mmap'd MAP_FIXED|MAP_SHARED into its host VA, and
+ * the segment is hv_vm_map'd again.
  *
  * HVF rejects sub-range hv_vm_unmap of a larger map (HV_BAD_ARGUMENT).
- * Therefore, before applying the first overlay inside a large segment,
- * the segment is split into 2 MiB-aligned pieces around the affected
- * range so each piece is independently unmappable.
+ * Therefore, before applying the first overlay inside a large segment, the
+ * segment is split into 2 MiB-aligned pieces around the affected range so each
+ * piece is independently unmappable.
  */
 
-/* HVF flags applied to slab segments. The slab is mapped RWX so guest
- * stage-1 page tables retain full control over per-page permissions
- * (W^X is enforced by the guest's L2/L3 entries, not stage-2). File
- * overlay segments use the same RWX flags so PROT_EXEC mmaps still
- * work; the host file mmap is created PROT_READ|PROT_WRITE so HVF
- * never asks the host kernel for execute permission on the file pages.
+/* HVF flags applied to slab segments. The slab is mapped RWX so guest stage-1
+ * page tables retain full control over per-page permissions (W^X is enforced by
+ * the guest's L2/L3 entries, not stage-2). File overlay segments use the same
+ * RWX flags so PROT_EXEC mmaps still work; the host file mmap is created
+ * PROT_READ|PROT_WRITE so HVF never asks the host kernel for execute permission
+ * on the file pages.
  */
 #define HVF_SEGMENT_FLAGS (HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC)
 
@@ -803,39 +820,31 @@ static int hvf_segment_split(guest_t *g,
 /* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd,
  * file_off). The IPA range may be sub-2 MiB; the containing 2 MiB
  * segment is split out first if it is not already isolated. Caller
- * holds mmap_lock and has not quiesced siblings yet. The function
- * quiesces siblings around the unmap+remap window so concurrent vCPUs
- * cannot fault on the temporarily-unmapped IPA range.
+ * holds mmap_lock and has already quiesced sibling vCPUs (or has none).
+ * The fork pre-snapshot path quiesces siblings before calling this so
+ * the overlay install does not trigger a nested quiesce.
  */
-static int hvf_apply_file_overlay(guest_t *g,
-                                  uint64_t ipa,
-                                  uint64_t len,
-                                  int fd,
-                                  off_t file_off)
+static int hvf_apply_file_overlay_quiesced(guest_t *g,
+                                           uint64_t ipa,
+                                           uint64_t len,
+                                           int fd,
+                                           off_t file_off)
 {
     uint64_t aligned_start = ALIGN_2MIB_DOWN(ipa);
     uint64_t aligned_end = ALIGN_2MIB_UP(ipa + len);
 
-    thread_quiesce_siblings();
-
     int err = hvf_segment_split(g, aligned_start, aligned_end);
-    if (err < 0) {
-        thread_resume_siblings();
+    if (err < 0)
         return err;
-    }
 
     int idx = hvf_segment_find(g, aligned_start);
     if (idx < 0 || g->segments[idx].ipa != aligned_start ||
-        g->segments[idx].len != aligned_end - aligned_start) {
-        thread_resume_siblings();
+        g->segments[idx].len != aligned_end - aligned_start)
         return -LINUX_EFAULT;
-    }
     hvf_segment_t seg = g->segments[idx];
 
-    if (hv_vm_unmap(seg.ipa, seg.len) != HV_SUCCESS) {
-        thread_resume_siblings();
+    if (hv_vm_unmap(seg.ipa, seg.len) != HV_SUCCESS)
         return -LINUX_EIO;
-    }
 
     void *target = (uint8_t *) g->host_base + ipa;
     void *p = mmap(target, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED,
@@ -848,7 +857,6 @@ static int hvf_apply_file_overlay(guest_t *g,
          */
         hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len,
                   HVF_SEGMENT_FLAGS);
-        thread_resume_siblings();
         return saved < 0 ? saved : -saved;
     }
 
@@ -864,31 +872,42 @@ static int hvf_apply_file_overlay(guest_t *g,
         hvf_restore_slab_backing(g, ipa, len);
         hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len,
                   HVF_SEGMENT_FLAGS);
-        thread_resume_siblings();
         return -LINUX_EIO;
     }
 
-    thread_resume_siblings();
     return 0;
 }
 
-/* Undo a file overlay at [ipa, ipa+len) by restoring the slab backing
- * and refreshing the containing HVF segment. Caller holds mmap_lock.
- * Sibling vCPUs are quiesced around the brief unmap window.
+/* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd,
+ * file_off). The IPA range may be sub-2 MiB; the containing 2 MiB
+ * segment is split out first if it is not already isolated. Caller
+ * holds mmap_lock and has not quiesced siblings yet. The function
+ * quiesces siblings around the unmap+remap window so concurrent vCPUs
+ * cannot fault on the temporarily-unmapped IPA range.
  */
-static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len)
+static int hvf_apply_file_overlay(guest_t *g,
+                                  uint64_t ipa,
+                                  uint64_t len,
+                                  int fd,
+                                  off_t file_off)
+{
+    thread_quiesce_siblings();
+    int err = hvf_apply_file_overlay_quiesced(g, ipa, len, fd, file_off);
+    thread_resume_siblings();
+    return err;
+}
+
+static int hvf_remove_file_overlay_quiesced(guest_t *g,
+                                            uint64_t ipa,
+                                            uint64_t len)
 {
     int idx = hvf_segment_find(g, ipa);
     if (idx < 0)
         return -LINUX_EFAULT;
     hvf_segment_t seg = g->segments[idx];
 
-    thread_quiesce_siblings();
-
-    if (hv_vm_unmap(seg.ipa, seg.len) != HV_SUCCESS) {
-        thread_resume_siblings();
+    if (hv_vm_unmap(seg.ipa, seg.len) != HV_SUCCESS)
         return -LINUX_EIO;
-    }
 
     int err = hvf_restore_slab_backing(g, ipa, len);
     if (err < 0) {
@@ -898,20 +917,28 @@ static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len)
          */
         hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len,
                   HVF_SEGMENT_FLAGS);
-        thread_resume_siblings();
         return err;
     }
 
     if (hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len,
-                  HVF_SEGMENT_FLAGS) != HV_SUCCESS) {
-        thread_resume_siblings();
+                  HVF_SEGMENT_FLAGS) != HV_SUCCESS)
         return -LINUX_EIO;
-    }
 
-    thread_resume_siblings();
     return 0;
 }
 
+/* Undo a file overlay at [ipa, ipa+len) by restoring the slab backing
+ * and refreshing the containing HVF segment. Caller holds mmap_lock.
+ * Sibling vCPUs are quiesced around the brief unmap window.
+ */
+static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len)
+{
+    thread_quiesce_siblings();
+    int err = hvf_remove_file_overlay_quiesced(g, ipa, len);
+    thread_resume_siblings();
+    return err;
+}
+
 /* Walk semantic regions in [start, end) and undo any active MAP_SHARED file
  * overlays on the underlying host VA. Used before sys_mmap MAP_FIXED replaces
  * a previously-overlaid range with a new mapping (anonymous or different
@@ -2676,3 +2703,415 @@ int64_t sys_msync(guest_t *g, uint64_t addr, uint64_t length, int flags)
 
     return 0;
 }
+
+/* Dispose of a prepare txn: close its snapshots, free it, NULL *txn_ptr. */
+static void mmap_fork_dispose_anon_shared_txn(
+    mmap_fork_anon_shared_txn_t **txn_ptr)
+{
+    if (!txn_ptr || !*txn_ptr)
+        return;
+
+    mmap_fork_anon_shared_txn_t *txn = *txn_ptr;
+    close_region_snapshots(txn->snaps, txn->nsnaps);
+    free(txn);
+    *txn_ptr = NULL;
+}
+
+/* See mem.h. Walk regions, convert each MAP_SHARED|MAP_ANONYMOUS region
+ * without backing fd into a temp-file-backed overlay so fork can hand the
+ * fd to the child for live coherence. Caller has quiesced sibling vCPUs. */
+int mmap_fork_prepare_anon_shared(guest_t *g,
+                                  mmap_fork_anon_shared_txn_t **txn_out)
+{
+    if (txn_out)
+        *txn_out = NULL;
+
+    mmap_fork_anon_shared_txn_t *txn = calloc(1, sizeof(*txn));
+    if (!txn)
+        return -LINUX_ENOMEM;
+
+    pthread_mutex_lock(&mmap_lock);
+
+    size_t hps = host_page_size_cached();
+
+    /* Snapshot candidate ranges first; conversion mutates the region
+     * table via hvf_segment_split / mark_overlay_metadata_range and
+     * would invalidate the walk indices.
+     */
+    struct {
+        uint64_t start;
+        uint64_t end;
+    } cands[GUEST_MAX_REGIONS];
+    int n_cands = 0;
+    for (int i = 0; i < g->nregions && n_cands < GUEST_MAX_REGIONS; i++) {
+        const guest_region_t *r = &g->regions[i];
+        if (r->backing_fd >= 0)
+            continue;
+        if (!r->shared)
+            continue;
+        if (!(r->flags & LINUX_MAP_ANONYMOUS))
+            continue;
+        if ((r->start % hps) != 0)
+            continue; /* misaligned start: snapshot fallback */
+        /* If r->end is not host-page-aligned, the host
+         * MAP_FIXED|MAP_SHARED mmap rounds the overlay up to
+         * ALIGN_UP(r->end, hps) and would also replace the backing of
+         * whatever else lives in that final host page. Skip the region
+         * when any later region starts inside [r->end, aligned_end).
+         * Only later regions are checked: r->start is host-page-aligned,
+         * and the early break below relies on the region table being
+         * sorted by start address.
+         */
+        uint64_t aligned_end = ALIGN_UP(r->end, hps);
+        if (aligned_end > r->end) {
+            bool tail_clear = true;
+            for (int j = i + 1; j < g->nregions; j++) {
+                if (g->regions[j].start >= aligned_end)
+                    break;
+                if (g->regions[j].end > r->end) {
+                    tail_clear = false;
+                    break;
+                }
+            }
+            if (!tail_clear)
+                continue;
+        }
+        cands[n_cands].start = r->start;
+        cands[n_cands].end = r->end;
+        n_cands++;
+    }
+
+    for (int i = 0; i < n_cands; i++) {
+        uint64_t start = cands[i].start;
+        uint64_t end = cands[i].end;
+        if (end <= start)
+            continue;
+        uint64_t len = end - start;
+        uint64_t aligned_len = ALIGN_UP(len, hps);
+
+        char tmpl[] = "/tmp/elfuse-anonsh-XXXXXX";
+        int fd = mkstemp(tmpl);
+        if (fd < 0) {
+            log_warn("fork-prep: mkstemp for anon-shared region: %s",
+                     strerror(errno));
+            continue;
+        }
+        unlink(tmpl);
+        if (ftruncate(fd, (off_t) aligned_len) < 0) {
+            log_warn("fork-prep: ftruncate(%llu) failed: %s",
+                     (unsigned long long) aligned_len, strerror(errno));
+            close(fd);
+            continue;
+        }
+
+        /* Seed the temp file with the parent's current bytes so the
+         * child sees pre-fork content after re-installation. Chunks are
+         * capped at INT_MAX: macOS pwrite rejects larger nbyte (EINVAL).
+         */
+        const uint8_t *src = (const uint8_t *) g->host_base + start;
+        uint64_t remain = len;
+        off_t off = 0;
+        bool seed_ok = true;
+        while (remain > 0) {
+            size_t chunk = remain > (uint64_t) INT_MAX ? (size_t) INT_MAX
+                                                       : (size_t) remain;
+            ssize_t nw = pwrite(fd, src, chunk, off);
+            if (nw < 0) {
+                if (errno == EINTR)
+                    continue;
+                seed_ok = false;
+                break;
+            }
+            if (nw == 0) {
+                seed_ok = false;
+                break;
+            }
+            src += nw;
+            off += nw;
+            remain -= (uint64_t) nw;
+        }
+        if (!seed_ok) {
+            log_warn("fork-prep: seed pwrite failed for anon-shared");
+            close(fd);
+            continue;
+        }
+
+        /* Pre-stage the per-region backing_fd dups before installing
+         * the overlay. A post-install dup failure would otherwise leave
+         * the parent live on the temp file but with regions stuck at
+         * backing_fd=-1, which the SCM_RIGHTS sender silently skips.
+         * Reserving fds up front and aborting on failure preserves the
+         * snapshot fallback when the host runs out of fds.
+         */
+        int region_idxs[GUEST_MAX_REGIONS];
+        int dup_fds[GUEST_MAX_REGIONS];
+        int n_regions = 0;
+        for (int j = 0; j < g->nregions; j++) {
+            const guest_region_t *r = &g->regions[j];
+            if (r->start >= end)
+                break;
+            if (r->end <= start)
+                continue;
+            if (r->backing_fd >= 0)
+                continue;
+            region_idxs[n_regions++] = j;
+        }
+        bool dup_ok = true;
+        int dups_done = 0;
+        for (int k = 0; k < n_regions; k++) {
+            int dup_fd = dup(fd);
+            if (dup_fd < 0) {
+                log_warn("fork-prep: dup failed: %s", strerror(errno));
+                dup_ok = false;
+                break;
+            }
+            dup_fds[dups_done++] = dup_fd;
+        }
+        if (!dup_ok) {
+            for (int k = 0; k < dups_done; k++)
+                close(dup_fds[k]);
+            close(fd);
+            continue;
+        }
+
+        if (txn->noverlays >= GUEST_MAX_REGIONS ||
+            txn->nsnaps >= GUEST_MAX_REGIONS) {
+            for (int k = 0; k < n_regions; k++)
+                close(dup_fds[k]);
+            close(fd);
+            pthread_mutex_unlock(&mmap_lock);
+            if (txn_out)
+                *txn_out = txn;
+            return -LINUX_ENOMEM;
+        }
+
+        int snap_base = txn->nsnaps;
+        int nsnaps =
+            capture_region_snapshots(g, start, end, &txn->snaps[txn->nsnaps],
+                                     GUEST_MAX_REGIONS - txn->nsnaps);
+        if (nsnaps < 0) {
+            for (int k = 0; k < n_regions; k++)
+                close(dup_fds[k]);
+            close(fd);
+            pthread_mutex_unlock(&mmap_lock);
+            if (txn_out)
+                *txn_out = txn;
+            return nsnaps;
+        }
+
+        int err = hvf_apply_file_overlay_quiesced(g, start, aligned_len, fd, 0);
+        if (err < 0) {
+            log_warn("fork-prep: overlay install [0x%llx, 0x%llx) failed: %d",
+                     (unsigned long long) start,
+                     (unsigned long long) (start + aligned_len), err);
+            close_region_snapshots(&txn->snaps[snap_base], nsnaps);
+            for (int k = 0; k < n_regions; k++)
+                close(dup_fds[k]);
+            close(fd);
+            continue;
+        }
+
+        txn->nsnaps += nsnaps;
+        txn->overlays[txn->noverlays++] = (fork_overlay_snapshot_t) {
+            .overlay_start = start,
+            .overlay_len = aligned_len,
+            .snap_base = snap_base,
+            .nsnaps = nsnaps,
+        };
+
+        /* Mark every region in [start, end) with overlay span
+         * [start, start+aligned_len). The candidate filter guarantees
+         * the host-page tail is empty of other tracked regions, so the
+         * extended overlay span never aliases a neighbor's backing.
+         * Assign the pre-staged dups in lockstep with the iteration
+         * order used to size n_regions above.
+         */
+        mark_overlay_metadata_range(g, start, end, start, start + aligned_len);
+        for (int k = 0; k < n_regions; k++) {
+            guest_region_t *r = &g->regions[region_idxs[k]];
+            r->backing_fd = dup_fds[k];
+            r->offset = r->start - start;
+        }
+        close(fd);
+    }
+
+    pthread_mutex_unlock(&mmap_lock);
+    if (txn_out)
+        *txn_out = txn;
+    return 0;
+}
+
+void mmap_fork_commit_anon_shared(mmap_fork_anon_shared_txn_t **txn_ptr)
+{
+    mmap_fork_dispose_anon_shared_txn(txn_ptr);
+}
+
+int mmap_fork_abort_anon_shared(guest_t *g,
+                                mmap_fork_anon_shared_txn_t **txn_ptr)
+{
+    if (!txn_ptr || !*txn_ptr)
+        return 0;
+
+    mmap_fork_anon_shared_txn_t *txn = *txn_ptr;
+    int rc = 0;
+
+    pthread_mutex_lock(&mmap_lock);
+
+    for (int i = txn->noverlays - 1; i >= 0; i--) {
+        const fork_overlay_snapshot_t *ovl = &txn->overlays[i];
+
+        /* Validate every captured region snapshot for this overlay
+         * BEFORE tearing down the host MAP_SHARED|MAP_FIXED mapping.
+         * Removing the overlay first and then discovering the region
+         * shape has drifted (e.g., a sibling vCPU that returned from a
+         * long host syscall after the quiesce timeout ran mmap or
+         * munmap during the prepare/abort window) leaves the host VA
+         * restored to slab while the region metadata still claims the
+         * temp-file overlay -- a silent desync. By verifying first the
+         * function leaves the overlay live and surfaces -EFAULT so the
+         * caller can decide what to do (still better than a partial
+         * teardown).
+         */
+        bool drifted = false;
+        for (int j = 0; j < ovl->nsnaps; j++) {
+            const region_snapshot_t *snap = &txn->snaps[ovl->snap_base + j];
+            const guest_region_t *found = guest_region_find(g, snap->start);
+            if (!found || found->start != snap->start ||
+                found->end != snap->end) {
+                drifted = true;
+                break;
+            }
+        }
+        if (drifted) {
+            if (rc == 0)
+                rc = -LINUX_EFAULT;
+            continue;
+        }
+
+        int err = hvf_remove_file_overlay_quiesced(g, ovl->overlay_start,
+                                                   ovl->overlay_len);
+        if (err < 0) {
+            if (rc == 0)
+                rc = err;
+            continue;
+        }
+
+        for (int j = 0; j < ovl->nsnaps; j++) {
+            region_snapshot_t *snap = &txn->snaps[ovl->snap_base + j];
+            const guest_region_t *found = guest_region_find(g, snap->start);
+            guest_region_t *r = (guest_region_t *) found;
+            /* Validation above ensured r exists with matching bounds.
+             * Re-check defensively in case hvf_remove_file_overlay_quiesced
+             * itself mutated the region table on its failure paths.
+             */
+            if (!r || r->start != snap->start || r->end != snap->end) {
+                if (rc == 0)
+                    rc = -LINUX_EFAULT;
+                continue;
+            }
+            if (r->backing_fd >= 0) {
+                close(r->backing_fd);
+                r->backing_fd = -1;
+            }
+            r->prot = snap->prot;
+            r->flags = snap->flags;
+            r->offset = snap->offset;
+            r->backing_fd = snap->backing_fd;
+            snap->backing_fd = -1;
+            r->overlay_active = snap->overlay_active;
+            r->overlay_start = snap->overlay_start;
+            r->overlay_end = snap->overlay_end;
+            str_copy_trunc(r->name, snap->name, sizeof(r->name));
+        }
+    }
+
+    pthread_mutex_unlock(&mmap_lock);
+    mmap_fork_dispose_anon_shared_txn(txn_ptr);
+    return rc;
+}
+
+/* See mem.h. Re-install host MAP_SHARED|MAP_FIXED overlays on the child
+ * after IPC restore using parent-side overlay metadata captured before
+ * the recv path cleared the inherited overlay flags.
+ */
+int mmap_fork_restore_overlays(guest_t *g,
+                               const bool *parent_active,
+                               const uint64_t *parent_ovl_start,
+                               const uint64_t *parent_ovl_end)
+{
+    pthread_mutex_lock(&mmap_lock);
+    int rc = 0;
+
+    for (int i = 0; i < g->nregions; i++) {
+        if (!parent_active[i])
+            continue;
+        guest_region_t *r = &g->regions[i];
+        if (r->backing_fd < 0)
+            continue;
+        if (r->overlay_active)
+            continue; /* already re-installed via a sibling region */
+
+        uint64_t ovl_s = parent_ovl_start[i];
+        uint64_t ovl_e = parent_ovl_end[i];
+        if (ovl_e <= ovl_s)
+            continue;
+
+        /* file_off corresponding to ovl_s. The standard install path
+         * keeps ovl_s == r->start (host-page-aligned guest start), so
+         * file_off == r->offset. Handle the defensive clip-extends-low
+         * case by shifting r->offset down by the missing bytes; if that
+         * would underflow, skip the region (cannot honestly recreate).
+         */
+        uint64_t file_off;
+        if (ovl_s >= r->start) {
+            uint64_t delta = ovl_s - r->start;
+            if (r->offset > UINT64_MAX - delta) {
+                log_warn(
+                    "fork-child: file_off overflow for region [0x%llx, "
+                    "0x%llx)",
+                    (unsigned long long) r->start, (unsigned long long) r->end);
+                continue;
+            }
+            file_off = r->offset + delta;
+        } else {
+            uint64_t delta = r->start - ovl_s;
+            if (delta > r->offset) {
+                log_warn(
+                    "fork-child: file_off underflow for region [0x%llx, "
+                    "0x%llx)",
+                    (unsigned long long) r->start, (unsigned long long) r->end);
+                continue;
+            }
+            file_off = r->offset - delta;
+        }
+
+        int err = hvf_apply_file_overlay(g, ovl_s, ovl_e - ovl_s, r->backing_fd,
+                                         (off_t) file_off);
+        if (err < 0) {
+            log_warn(
+                "fork-child: overlay re-install [0x%llx, 0x%llx) failed: %d",
+                (unsigned long long) ovl_s, (unsigned long long) ovl_e, err);
+            rc = err;
+            continue;
+        }
+
+        /* Mark each region that the parent had attached to this same
+         * overlay span. Calling mark_overlay_metadata_range with the
+         * region's own [start, end) bounds marks only that region (the
+         * region table is sorted and non-overlapping). The outer loop
+         * later sees overlay_active=true for sibling regions and skips
+         * the redundant install.
+         */
+        for (int j = 0; j < g->nregions; j++) {
+            if (!parent_active[j])
+                continue;
+            if (parent_ovl_start[j] != ovl_s || parent_ovl_end[j] != ovl_e)
+                continue;
+            mark_overlay_metadata_range(g, g->regions[j].start,
+                                        g->regions[j].end, ovl_s, ovl_e);
+        }
+    }
+
+    pthread_mutex_unlock(&mmap_lock);
+    return rc;
+}
diff --git a/src/syscall/mem.h b/src/syscall/mem.h
index 114f704..fed3730 100644
--- a/src/syscall/mem.h
+++ b/src/syscall/mem.h
@@ -13,6 +13,8 @@
 #include <stdint.h>
 #include "core/guest.h"
 
+typedef struct mmap_fork_anon_shared_txn mmap_fork_anon_shared_txn_t;
+
 /* brk: set/query program break */
 int64_t sys_brk(guest_t *g, uint64_t addr);
 
@@ -44,3 +46,49 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice);
 
 /* msync: synchronize file-backed mappings to disk */
 int64_t sys_msync(guest_t *g, uint64_t addr, uint64_t length, int flags);
+
+/* Fork preparation: convert MAP_SHARED|MAP_ANONYMOUS regions that have
+ * no backing fd into memfd-backed overlay regions. Each converted region
+ * gets a private mkstemp+unlink temp file seeded from the current host
+ * bytes; a host MAP_SHARED|MAP_FIXED overlay is then installed so the
+ * parent's subsequent writes flow through the kernel page cache. The
+ * region's backing_fd is set to a dup of the temp file so the regular
+ * SCM_RIGHTS handover feeds the child a coherent fd.
+ *
+ * Caller must already hold sibling vCPUs quiesced. mmap_lock is acquired
+ * internally. Per-region failures are logged and skipped (snapshot
+ * fallback persists for those regions); structural failure returns
+ * -errno. Regions whose start address is not host-page-aligned are
+ * skipped (overlay-eligibility requirement). A non-NULL *txn_out owns
+ * rollback metadata and must be committed or aborted, even on error.
+ */
+int mmap_fork_prepare_anon_shared(guest_t *g,
+                                  mmap_fork_anon_shared_txn_t **txn_out);
+
+/* Finalize or roll back mmap_fork_prepare_anon_shared(). Callers must
+ * still have sibling vCPUs quiesced when aborting so the host overlay
+ * removal cannot race guest accesses.
+ */
+void mmap_fork_commit_anon_shared(mmap_fork_anon_shared_txn_t **txn_ptr);
+int mmap_fork_abort_anon_shared(guest_t *g,
+                                mmap_fork_anon_shared_txn_t **txn_ptr);
+
+/* Fork restore: re-install host MAP_SHARED|MAP_FIXED overlays on the
+ * child after IPC restore. parent_active[i] / parent_ovl_start[i] /
+ * parent_ovl_end[i] capture each region's parent-side overlay metadata,
+ * sampled before fork_ipc_recv_process_state cleared the inherited
+ * overlay flags. For each region that was overlay-active in the parent
+ * and now has a valid backing_fd (received via SCM_RIGHTS), the function
+ * calls hv_vm_unmap + mmap MAP_FIXED|MAP_SHARED + hv_vm_map to bind the
+ * host VA to the same backing file so the child observes parent writes
+ * (and vice-versa). Caller must hold no locks; the child has not yet
+ * created worker vCPUs so no quiesce is needed.
+ *
+ * Per-region failures are logged and skipped. Returns 0 on full success
+ * or the last error encountered (best-effort: a partial failure leaves
+ * snapshot semantics intact for the failed regions).
+ */
+int mmap_fork_restore_overlays(guest_t *g,
+                               const bool *parent_active,
+                               const uint64_t *parent_ovl_start,
+                               const uint64_t *parent_ovl_end);
diff --git a/tests/manifest.txt b/tests/manifest.txt
index 22b75f6..57ca6f2 100644
--- a/tests/manifest.txt
+++ b/tests/manifest.txt
@@ -98,6 +98,9 @@ test-mremap
 [section] msync MAP_SHARED tests
 test-msync
 
+[section] Cross-fork MAP_SHARED coherence tests
+test-cross-fork-mapshared          # diff=skip
+
 [section] madvise MADV_DONTNEED tests
 test-madvise
 
diff --git a/tests/test-cross-fork-mapshared.c b/tests/test-cross-fork-mapshared.c
new file mode 100644
index 0000000..7df2b11
--- /dev/null
+++ b/tests/test-cross-fork-mapshared.c
@@ -0,0 +1,459 @@
+/* Cross-fork MAP_SHARED coherence tests
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Verifies live MAP_SHARED visibility across fork(): both file-backed
+ * and anonymous shared mappings must continue to propagate writes
+ * between parent and child after the IPC handoff. Without the
+ * overlay re-establishment in fork-state, the child sees a stale
+ * snapshot of the parent's pre-fork contents and writes from each
+ * side stay private.
+ *
+ * Three scenarios:
+ *   1. Regular file: shared mmap of a tmp file; parent writes appear
+ *      in child mapping AND on disk; child writes appear in parent
+ *      mapping AND on disk.
+ *   2. shm/dev/shm file: same coherence over an unlinked shm file.
+ *   3. MAP_SHARED|MAP_ANONYMOUS: parent and child both see each
+ *      other's writes through the kernel-managed memfd that elfuse
+ *      installs at fork time.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "test-harness.h"
+
+int passes = 0, fails = 0;
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS 0x20
+#endif
+
+/* glibc 2.28 static: shm_open is broken; emulate via /dev/shm. */
+static int my_shm_open(const char *name, int oflag, int mode)
+{
+    char path[128];
+    snprintf(path, sizeof(path), "/dev/shm%s", name);
+    return open(path, oflag, mode);
+}
+
+static int my_shm_unlink(const char *name)
+{
+    char path[128];
+    snprintf(path, sizeof(path), "/dev/shm%s", name);
+    return unlink(path);
+}
+
+/* IPC primitives between parent and child: a single byte over a
+ * pipe pair signals "go ahead" each direction. Replaces a sleep-based
+ * synchronization that the test matrix would race on slow runners.
+ */
+typedef struct {
+    int parent_to_child[2];
+    int child_to_parent[2];
+} sync_t;
+
+static int sync_init(sync_t *s)
+{
+    if (pipe(s->parent_to_child) != 0)
+        return -1;
+    if (pipe(s->child_to_parent) != 0) {
+        close(s->parent_to_child[0]);
+        close(s->parent_to_child[1]);
+        return -1;
+    }
+    return 0;
+}
+
+static void close_fd(int *fd)
+{
+    if (*fd >= 0) {
+        close(*fd);
+        *fd = -1;
+    }
+}
+
+static void sync_fini(sync_t *s)
+{
+    close_fd(&s->parent_to_child[0]);
+    close_fd(&s->parent_to_child[1]);
+    close_fd(&s->child_to_parent[0]);
+    close_fd(&s->child_to_parent[1]);
+}
+
+static void parent_close_child_ends(sync_t *s)
+{
+    close_fd(&s->parent_to_child[0]);
+    close_fd(&s->child_to_parent[1]);
+}
+
+static void child_close_parent_ends(sync_t *s)
+{
+    close_fd(&s->parent_to_child[1]);
+    close_fd(&s->child_to_parent[0]);
+}
+
+/* Drop the parent's write end so a child blocked in wait_byte() on the
+ * parent_to_child read end observes EOF and exits instead of deadlocking
+ * with the parent in waitpid(). Must be called on every parent-side
+ * failure path that bypasses send_byte(parent_to_child[1]).
+ */
+static void parent_release_writer(sync_t *s)
+{
+    close_fd(&s->parent_to_child[1]);
+}
+
+/* Wait for a single byte from the peer; returns true on success. */
+static bool wait_byte(int fd)
+{
+    char b;
+    ssize_t n;
+    do {
+        n = read(fd, &b, 1);
+    } while (n < 0 && errno == EINTR);
+    return n == 1;
+}
+
+static bool send_byte(int fd)
+{
+    char b = 'x';
+    ssize_t n;
+    do {
+        n = write(fd, &b, 1);
+    } while (n < 0 && errno == EINTR);
+    return n == 1;
+}
+
+/* Test 1: File-backed MAP_SHARED — parent and child see each other's
+ * writes through the same disk file without msync.
+ */
+static void test_file_backed_cross_fork(void)
+{
+    TEST("MAP_SHARED file: cross-fork live coherence");
+
+    char tmpl[] = "/tmp/elfuse-cf-mapshared-XXXXXX";
+    int fd = mkstemp(tmpl);
+    if (fd < 0) {
+        FAIL("mkstemp");
+        return;
+    }
+    /* The path is not needed after this point: the child inherits fd
+     * across fork() and both sides use the mapping created from it.
+     * unlink() only removes the directory entry, so the file stays
+     * alive (but nameless) while either side holds an open reference.
+     */
+    unlink(tmpl);
+
+    if (ftruncate(fd, 4096) != 0) {
+        FAIL("ftruncate");
+        close(fd);
+        return;
+    }
+
+    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (p == MAP_FAILED) {
+        FAIL("parent mmap");
+        close(fd);
+        return;
+    }
+
+    /* Parent seeds with 'P' before fork. Child should observe it. */
+    p[0] = 'P';
+
+    sync_t s;
+    if (sync_init(&s) < 0) {
+        FAIL("sync_init");
+        munmap(p, 4096);
+        close(fd);
+        return;
+    }
+
+    pid_t pid = fork();
+    if (pid < 0) {
+        FAIL("fork");
+        sync_fini(&s);
+        munmap(p, 4096);
+        close(fd);
+        return;
+    }
+
+    if (pid == 0) {
+        child_close_parent_ends(&s);
+        /* Step 1: child observes parent's pre-fork seed 'P'. */
+        if (p[0] != 'P')
+            _exit(10);
+        /* Step 2: child writes 'C', signals parent. */
+        p[1] = 'C';
+        if (!send_byte(s.child_to_parent[1]))
+            _exit(11);
+        /* Step 3: child waits for parent's mid-run write 'M'. */
+        if (!wait_byte(s.parent_to_child[0]))
+            _exit(12);
+        if (p[2] != 'M')
+            _exit(13);
+        _exit(0);
+    }
+
+    parent_close_child_ends(&s);
+    bool failed = false;
+    /* Step 2: wait for child's write of 'C'. */
+    if (!wait_byte(s.child_to_parent[0])) {
+        FAIL("child sync recv");
+        failed = true;
+    } else if (p[1] != 'C') {
+        FAIL("parent did not see child write");
+        failed = true;
+    } else {
+        /* Step 3: parent writes 'M' for child to verify. */
+        p[2] = 'M';
+        if (!send_byte(s.parent_to_child[1])) {
+            FAIL("parent sync send");
+            failed = true;
+        }
+    }
+
+    /* Drop the writer so a child blocked in wait_byte() sees EOF when the
+     * parent took an early failure exit; without this both processes deadlock
+     * and the test driver kills the parent on timeout.
+     */
+    parent_release_writer(&s);
+    int status = 0;
+    if (waitpid(pid, &status, 0) < 0) {
+        FAIL("waitpid");
+    } else if (!failed) {
+        if (!WIFEXITED(status)) {
+            FAIL("child terminated abnormally");
+        } else if (WEXITSTATUS(status) != 0) {
+            char buf[64];
+            snprintf(buf, sizeof(buf), "child failed at step %d",
+                     WEXITSTATUS(status));
+            FAIL(buf);
+        } else {
+            /* Verify file content reflects both writes. */
+            char disk[3] = {0};
+            if (pread(fd, disk, 3, 0) != 3)
+                FAIL("pread");
+            else if (disk[0] == 'P' && disk[1] == 'C' && disk[2] == 'M')
+                PASS();
+            else
+                FAIL("file content does not reflect both sides");
+        }
+    }
+
+    sync_fini(&s);
+    munmap(p, 4096);
+    close(fd);
+}
+
+/* Test 2: Anonymous MAP_SHARED — typical parent-child IPC pattern
+ * (Postgres, multi-process daemons). elfuse must convert the region
+ * to memfd-backed at fork time so both sides observe writes.
+ */
+static void test_anon_shared_cross_fork(void)
+{
+    TEST("MAP_SHARED|MAP_ANONYMOUS: cross-fork live coherence");
+
+    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+                   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+    if (p == MAP_FAILED) {
+        FAIL("mmap MAP_SHARED|MAP_ANONYMOUS");
+        return;
+    }
+
+    p[0] = 'P';
+
+    sync_t s;
+    if (sync_init(&s) < 0) {
+        FAIL("sync_init");
+        munmap(p, 4096);
+        return;
+    }
+
+    pid_t pid = fork();
+    if (pid < 0) {
+        FAIL("fork");
+        sync_fini(&s);
+        munmap(p, 4096);
+        return;
+    }
+
+    if (pid == 0) {
+        child_close_parent_ends(&s);
+        if (p[0] != 'P')
+            _exit(20);
+        p[1] = 'C';
+        if (!send_byte(s.child_to_parent[1]))
+            _exit(21);
+        if (!wait_byte(s.parent_to_child[0]))
+            _exit(22);
+        if (p[2] != 'M')
+            _exit(23);
+        _exit(0);
+    }
+
+    parent_close_child_ends(&s);
+    bool failed = false;
+    if (!wait_byte(s.child_to_parent[0])) {
+        FAIL("child sync recv");
+        failed = true;
+    } else if (p[1] != 'C') {
+        FAIL("parent did not see child write");
+        failed = true;
+    } else {
+        p[2] = 'M';
+        if (!send_byte(s.parent_to_child[1])) {
+            FAIL("parent sync send");
+            failed = true;
+        }
+    }
+
+    /* See the file-backed test: drop the writer before waitpid so any
+     * failure above does not deadlock both ends of the pipe.
+     */
+    parent_release_writer(&s);
+    int status = 0;
+    if (waitpid(pid, &status, 0) < 0) {
+        FAIL("waitpid");
+    } else if (!failed) {
+        if (!WIFEXITED(status)) {
+            FAIL("child terminated abnormally");
+        } else if (WEXITSTATUS(status) != 0) {
+            char buf[64];
+            snprintf(buf, sizeof(buf), "child failed at step %d",
+                     WEXITSTATUS(status));
+            FAIL(buf);
+        } else {
+            PASS();
+        }
+    }
+
+    sync_fini(&s);
+    munmap(p, 4096);
+}
+
+/* Test 3: shm-backed MAP_SHARED via /dev/shm — same as test 1 but
+ * exercises the shm path (musl/glibc shm_open emulation in elfuse).
+ */
+static void test_shm_cross_fork(void)
+{
+    char name[64];
+    char *map;
+    sync_t ch;
+    pid_t child;
+    bool ok = false;
+    int status = 0;
+
+    TEST("MAP_SHARED shm: cross-fork live coherence");
+
+    /* Name the object after our pid and unlink it right after creation;
+     * from here on the test only uses the descriptor that was returned.
+     */
+    snprintf(name, sizeof(name), "/elfuse-cf-shm-%ld", (long) getpid());
+    int fd = my_shm_open(name, O_CREAT | O_EXCL | O_RDWR, 0600);
+    if (fd < 0) {
+        FAIL("shm_open");
+        return;
+    }
+    my_shm_unlink(name);
+
+    if (ftruncate(fd, 4096) != 0) {
+        FAIL("ftruncate");
+        goto out_close;
+    }
+
+    map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (map == MAP_FAILED) {
+        FAIL("mmap");
+        goto out_close;
+    }
+    map[0] = 'P';
+
+    if (sync_init(&ch) < 0) {
+        FAIL("sync_init");
+        goto out_unmap;
+    }
+
+    child = fork();
+    if (child < 0) {
+        FAIL("fork");
+        goto out_sync;
+    }
+
+    if (child == 0) {
+        /* Child: each non-zero exit code names the step that failed. */
+        child_close_parent_ends(&ch);
+        if (map[0] != 'P')
+            _exit(30);
+        map[1] = 'C';
+        if (!send_byte(ch.child_to_parent[1]))
+            _exit(31);
+        if (!wait_byte(ch.parent_to_child[0]))
+            _exit(32);
+        _exit(map[2] == 'M' ? 0 : 33);
+    }
+
+    /* Parent: wait for the child's byte, check that its store is visible,
+     * then publish a store of our own and signal the child to verify it.
+     */
+    parent_close_child_ends(&ch);
+    if (!wait_byte(ch.child_to_parent[0])) {
+        FAIL("child sync recv");
+    } else if (map[1] != 'C') {
+        FAIL("parent did not see child write");
+    } else {
+        map[2] = 'M';
+        ok = send_byte(ch.parent_to_child[1]);
+        if (!ok) {
+            FAIL("parent sync send");
+        }
+    }
+
+    /* As in the file-backed test: drop the parent's writer before waitpid
+     * so a failure above cannot deadlock both ends of the pipe.
+     */
+    parent_release_writer(&ch);
+    if (waitpid(child, &status, 0) < 0) {
+        FAIL("waitpid");
+    } else if (ok && !WIFEXITED(status)) {
+        FAIL("child terminated abnormally");
+    } else if (ok && WEXITSTATUS(status) != 0) {
+        char msg[64];
+        snprintf(msg, sizeof(msg), "child failed at step %d",
+                 WEXITSTATUS(status));
+        FAIL(msg);
+    } else if (ok) {
+        PASS();
+    }
+
+out_sync:
+    sync_fini(&ch);
+out_unmap:
+    munmap(map, 4096);
+out_close:
+    close(fd);
+}
+
+int main(void)
+{
+    /* Cases run in the listed order; stdout is flushed after each one. */
+    void (*const cases[])(void) = {test_file_backed_cross_fork,
+                                   test_anon_shared_cross_fork,
+                                   test_shm_cross_fork};
+    printf("test-cross-fork-mapshared: cross-fork MAP_SHARED tests\n\n");
+    fflush(stdout);
+    for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+        cases[i]();
+        fflush(stdout);
+    }
+    SUMMARY("test-cross-fork-mapshared");
+    return fails ? 1 : 0;
+}