From 1140b1345de590400a2c4b76efcbde64be4c0ebb Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Thu, 7 May 2026 18:23:18 +0800 Subject: [PATCH] Honor MAP_SHARED coherence across fork Both fork paths (CoW shm and legacy IPC byte-copy) silently broke MAP_SHARED visibility across fork: the child mapped the slab MAP_PRIVATE or got a fresh byte copy, so writes from either side stayed local and never reached the kernel page cache the parent shared with the file. MAP_SHARED|MAP_ANONYMOUS, the standard parent-child IPC primitive used by Postgres and other multi-process daemons, was equally broken. Three pieces close the gap: 1. Parent-side conversion (mmap_fork_prepare_anon_shared, with commit/abort wrappers). While siblings are quiesced the fork thread walks live regions, promotes each MAP_SHARED|MAP_ANONYMOUS region without a backing fd into a memfd-style overlay (mkstemp+unlink+ftruncate, pwrite-seed from host_base, host MAP_FIXED|MAP_SHARED via the new hvf_apply_file_overlay_quiesced helper, mark_overlay_metadata_range), and pre-stages per-region dup() fds so a transient EMFILE rolls back cleanly. The candidate filter skips regions whose host-page-rounded tail would alias a neighbor mapping. The transactional commit/abort wrappers let the fork-IPC failure path roll back the in-place conversion (overlay teardown plus region metadata restore) before resuming siblings; abort validates every captured snapshot before tearing down so a sibling-drift past the quiesce timeout does not leave host VA out of sync with semantic state. forkipc.c logs a warning when abort returns a partial failure so the parent's stale state is visible in post-mortem. 2. Child-side restoration (mmap_fork_restore_overlays). The recv path now snapshots parent overlay_active/start/end (and a new parent_had_fd[] mirror) before clearing inherited state, then re-runs hvf_apply_file_overlay against the saved overlay span once SCM_RIGHTS delivers the backing fds. 
The inner quiesce is a no-op since no worker vCPUs exist yet. 3. Pre-existing fork-IPC alignment bug. The old recv_backing_fds filter (!MAP_ANONYMOUS && offset != -1) matched the shim region (LINUX_MAP_PRIVATE, offset 0) and ELF text segments and silently stole incoming SCM_RIGHTS fds, leaving the actual file-backed regions with backing_fd=-1. The receiver now uses parent_had_fd[] as the filter so its iteration order matches the sender's "backing_fd >= 0" filter exactly. Unassigned fds are closed instead of leaked. hvf_apply_file_overlay and hvf_remove_file_overlay are each split into a static outer wrapper that handles thread_quiesce_siblings/thread_resume_siblings and a _quiesced inner that the parent fork-prep / abort paths call without a nested barrier. Locked in by tests/test-cross-fork-mapshared.c (3 cases: file-backed mkstemp, MAP_SHARED|MAP_ANONYMOUS, /dev/shm via shm_open). Each case verifies pre-fork seed visibility, child-write-visible-to-parent, parent-write-visible-to-child, and on-disk reconciliation. All three pass against Linux ground truth via tests/qemu-runner.sh. 
--- src/runtime/fork-state.c | 137 +++++-- src/runtime/forkipc.c | 80 ++-- src/syscall/mem.c | 613 +++++++++++++++++++++++++----- src/syscall/mem.h | 48 +++ tests/manifest.txt | 3 + tests/test-cross-fork-mapshared.c | 458 ++++++++++++++++++++++ 6 files changed, 1202 insertions(+), 137 deletions(-) create mode 100644 tests/test-cross-fork-mapshared.c diff --git a/src/runtime/fork-state.c b/src/runtime/fork-state.c index 1cb99f2..5fd7b08 100644 --- a/src/runtime/fork-state.c +++ b/src/runtime/fork-state.c @@ -21,6 +21,7 @@ #include "debug/log.h" #include "syscall/abi.h" #include "syscall/internal.h" +#include "syscall/mem.h" #include "syscall/proc.h" int fork_ipc_write_all(int fd, const void *buf, size_t len) @@ -494,7 +495,9 @@ static int fork_ipc_drain_bytes(int ipc_fd, uint32_t len) return 0; } -static int fork_ipc_recv_backing_fds(int ipc_fd, guest_t *g) +static int fork_ipc_recv_backing_fds(int ipc_fd, + guest_t *g, + const bool *parent_had_fd) { uint32_t nbacking; if (fork_ipc_read_all(ipc_fd, &nbacking, sizeof(nbacking)) < 0) { @@ -518,19 +521,59 @@ static int fork_ipc_recv_backing_fds(int ipc_fd, guest_t *g) .msg_controllen = cmsg_sz, }; ssize_t nr = recvmsg(ipc_fd, &msg, 0); - if (nr > 0) { - struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); - if (cmsg && cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_RIGHTS) { - int *region_fds = (int *) CMSG_DATA(cmsg); - uint32_t fi = 0; - for (int i = 0; i < g->nregions && fi < nbacking; i++) { - if (!(g->regions[i].flags & LINUX_MAP_ANONYMOUS) && - g->regions[i].offset != (uint64_t) -1) { - g->regions[i].backing_fd = region_fds[fi++]; - } - } - } + if (nr <= 0) { + free(cmsg_buf); + return -1; + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (msg.msg_flags & MSG_CTRUNC) { + log_error("fork-child: backing fd SCM_RIGHTS payload truncated"); + free(cmsg_buf); + return -1; + } + if (!cmsg || cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) { + log_error("fork-child: missing backing fd 
SCM_RIGHTS payload"); + free(cmsg_buf); + return -1; + } + if (cmsg->cmsg_len < CMSG_LEN(0)) { + free(cmsg_buf); + return -1; + } + + int *region_fds = (int *) CMSG_DATA(cmsg); + uint32_t nreceived = + (uint32_t) ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int)); + uint32_t fi = 0; + + /* Sender (fork_ipc_send_backing_fds) iterates regions and sends one fd per + * region with backing_fd >= 0. The receiver must iterate in the same order + * over regions that had backing_fd in the parent. parent_had_fd[i] is + * captured by the caller before backing_fd is cleared. + * + * The original filter (!MAP_ANONYMOUS && offset != -1) matched extra + * regions like the shim and ELF text, so the first received fd was + * misassigned and the actual file-backed region was left without + * backing_fd. + */ + for (int i = 0; i < g->nregions && fi < nreceived; i++) { + if (parent_had_fd && parent_had_fd[i]) + g->regions[i].backing_fd = region_fds[fi++]; + } + + /* Close any received fds that did not get assigned: avoids leaking host fds + * into the child's process table when a mismatch occurs. + */ + while (fi < nreceived) + close(region_fds[fi++]); + + if (nreceived != nbacking) { + log_error("fork-child: expected %u backing fds but received %u", + nbacking, nreceived); + free(cmsg_buf); + return -1; } free(cmsg_buf); return 0; @@ -618,23 +661,73 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig) return -1; } g->nregions = (int) num_guest_regions; + + /* Capture parent state before clearing the inherited overlay/backing fd + * fields. parent_had_fd lets recv_backing_fds iterate in the same order the + * sender used (regions with backing_fd >= 0); the parent_ovl_* arrays let + * mmap_fork_restore_overlays know which regions to re-install, with what + * overlay span. Heap-allocated to avoid pushing hundreds of KiB onto the + * recv stack frame. 
+ */ + bool *parent_had_fd = NULL; + bool *parent_active = NULL; + uint64_t *parent_ovl_start = NULL; + uint64_t *parent_ovl_end = NULL; + if (g->nregions > 0) { + parent_had_fd = calloc((size_t) g->nregions, sizeof(*parent_had_fd)); + parent_active = calloc((size_t) g->nregions, sizeof(*parent_active)); + parent_ovl_start = + calloc((size_t) g->nregions, sizeof(*parent_ovl_start)); + parent_ovl_end = calloc((size_t) g->nregions, sizeof(*parent_ovl_end)); + if (!parent_had_fd || !parent_active || !parent_ovl_start || + !parent_ovl_end) { + log_error("fork-child: parent overlay buffer alloc failed"); + free(parent_had_fd); + free(parent_active); + free(parent_ovl_start); + free(parent_ovl_end); + return -1; + } + for (int i = 0; i < g->nregions; i++) { + parent_had_fd[i] = (g->regions[i].backing_fd >= 0); + parent_active[i] = g->regions[i].overlay_active; + parent_ovl_start[i] = g->regions[i].overlay_start; + parent_ovl_end[i] = g->regions[i].overlay_end; + } + } + for (int i = 0; i < g->nregions; i++) { g->regions[i].backing_fd = -1; - /* Demote inherited overlays: the child does not yet re-establish - * host MAP_FIXED|MAP_SHARED mappings from the parent's overlay - * fds, so msync, MADV_DONTNEED and friends must use the - * snapshot-style emulation. The CoW path's pre-fork sync of - * overlay bytes into shm_fd already gave the child snapshot the - * correct content at fork time. Live cross-fork MAP_SHARED - * coherence is the next P1 TODO item. + /* Drop inherited overlay metadata; the host MAP_FIXED|MAP_SHARED + * mapping does not exist yet in the child. Re-establishment runs after + * fork_ipc_recv_backing_fds populates backing_fd from the + * parent-supplied SCM_RIGHTS bundle. 
*/ g->regions[i].overlay_active = false; g->regions[i].overlay_start = 0; g->regions[i].overlay_end = 0; } - if (fork_ipc_recv_backing_fds(ipc_fd, g) < 0) + if (fork_ipc_recv_backing_fds(ipc_fd, g, parent_had_fd) < 0) { + free(parent_had_fd); + free(parent_active); + free(parent_ovl_start); + free(parent_ovl_end); return -1; + } + + /* Re-install MAP_SHARED overlays for every region the parent had as + * overlay_active and that now carries a backing fd. Failures here fall back + * to snapshot semantics for the affected region; the child still boots and + * can run. + */ + if (g->nregions > 0) + (void) mmap_fork_restore_overlays(g, parent_active, parent_ovl_start, + parent_ovl_end); + free(parent_had_fd); + free(parent_active); + free(parent_ovl_start); + free(parent_ovl_end); if (fork_ipc_read_all(ipc_fd, sig, sizeof(*sig)) < 0) { log_error("fork-child: failed to read signal state"); diff --git a/src/runtime/forkipc.c b/src/runtime/forkipc.c index 03a7683..c12a760 100644 --- a/src/runtime/forkipc.c +++ b/src/runtime/forkipc.c @@ -35,6 +35,7 @@ #include "syscall/abi.h" #include "syscall/internal.h" +#include "syscall/mem.h" #include "syscall/net.h" /* absock namespace IPC state */ #include "syscall/poll.h" /* wakeup_pipe_signal */ #include "syscall/proc.h" @@ -89,8 +90,8 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec) absock_set_namespace_id(hdr.absock_namespace_id); proc_set_session(hdr.sid, hdr.pgid); - /* Create guest memory before receiving state so all incoming offsets can - * be bounds-checked against the negotiated guest size. + /* Create guest memory before receiving state so all incoming offsets can be + * bounds-checked against the negotiated guest size. */ guest_t g; @@ -176,6 +177,7 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec) guest_destroy(&g); return 1; } + /* POSIX: "Signals pending to the parent shall not be pending to the child." * Clear pending bitmask and RT queue before applying state. 
* signal_set_state() is deferred until after thread_register_main() @@ -218,17 +220,17 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec) HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, regs.tpidr_el0)); /* Enable MMU directly (page tables already in guest memory from IPC). - * SCTLR must include MMU-enable (M), caches (C, I), RES1 bits, - * and EL0 cache maintenance access (UCI, UCT) for JIT translators. + * SCTLR must include MMU-enable (M), caches (C, I), RES1 bits, and EL0 + * cache maintenance access (UCI, UCT) for JIT translators. */ uint64_t sctlr_with_mmu = SCTLR_RES1 | SCTLR_M | SCTLR_C | SCTLR_I | SCTLR_DZE | SCTLR_UCT | SCTLR_UCI; HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, sctlr_with_mmu)); - /* Restore all 31 GPRs from parent state, then override X0=0 (child - * clone return value). This preserves X1-X30 exactly as they were when - * the parent called clone(), which is required by the Linux syscall ABI - * (especially callee-saved X19-X28, FP=X29, LR=X30). + /* Restore all 31 GPRs from parent state, then override X0=0 (child clone + * return value). This preserves X1-X30 exactly as they were when the parent + * called clone(), which is required by the Linux syscall ABI (especially + * callee-saved X19-X28, FP=X29, LR=X30). */ vcpu_restore_gprs(vcpu, regs.x); vcpu_set_gpr(vcpu, 0, 0); /* Child gets 0 from clone */ @@ -246,14 +248,14 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec) /* Register the fork child's main thread in the thread table. * Without this, current_thread is NULL and any syscall handler that - * accesses per-thread state (signal masks, ptrace, CLONE_THREAD) - * will dereference NULL. + * accesses per-thread state (signal masks, ptrace, CLONE_THREAD) will + * dereference NULL. */ thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1); /* Now that current_thread is set, apply signal state. 
This must happen - * after thread_register_main() so the per-thread blocked mask and - * altstack are properly restored to the thread entry. + * after thread_register_main() so the per-thread blocked mask and altstack + * are properly restored to the thread entry. */ signal_set_state(&sig); @@ -921,6 +923,22 @@ int64_t sys_clone(hv_vcpu_t vcpu, */ thread_quiesce_siblings(); + mmap_fork_anon_shared_txn_t *anon_shared_txn = NULL; + guest_region_t *regions_snapshot = NULL; + + /* Convert MAP_SHARED|MAP_ANONYMOUS regions that have no backing fd + * into memfd-backed overlay regions. The conversion seeds a private + * temp file with the current bytes and installs a host + * MAP_SHARED|MAP_FIXED overlay on the parent. The child receives the + * fd via SCM_RIGHTS and re-installs its own overlay so subsequent + * writes from either side flow through the kernel page cache and + * reach the other. File-backed MAP_SHARED regions already carry a + * backing fd and are unaffected. Misaligned shared regions + * (snapshot-style) remain incoherent across fork by design. + */ + if (mmap_fork_prepare_anon_shared(g, &anon_shared_txn) < 0) + goto fail_snapshot; + /* Determine if elfuse can use the CoW (shm) fast path. * If shm_fd >= 0, elfuse freezes a snapshot via MAP_PRIVATE and sends the * shm fd to the child. Otherwise fall back to region-by-region copy. @@ -947,8 +965,6 @@ int64_t sys_clone(hv_vcpu_t vcpu, * but before sibling vCPUs resume. Declared up front so all goto paths to * fail_snapshot can free it unconditionally. 
*/ - guest_region_t *regions_snapshot = NULL; - /* Header */ ipc_header_t hdr = { .magic = IPC_MAGIC_HEADER, @@ -1064,9 +1080,7 @@ int64_t sys_clone(hv_vcpu_t vcpu, if (nregions_snapshot > 0) { regions_snapshot = malloc(snap_sz); if (!regions_snapshot) { - thread_resume_siblings(); - close(ipc_sock); - return -LINUX_ENOMEM; + goto fail_snapshot; } memcpy(regions_snapshot, g->regions, snap_sz); } @@ -1074,15 +1088,17 @@ int64_t sys_clone(hv_vcpu_t vcpu, if (fork_ipc_send_fd_table(ipc_sock) < 0) goto fail_snapshot; - /* Resume sibling vCPUs now that the memory snapshot, semantic region - * snapshot, and FD snapshot have been serialized. - */ - thread_resume_siblings(); - uint32_t num_guest_regions = (uint32_t) nregions_snapshot; if (fork_ipc_send_process_state(ipc_sock, regions_snapshot, num_guest_regions) < 0) - goto fail_ipc; + goto fail_snapshot; + + /* The process-state payload includes the SCM_RIGHTS handoff for region + * backing fds. Keep siblings quiesced until that send completes so a + * concurrent munmap/remap cannot close or recycle the captured fd numbers. + */ + thread_resume_siblings(); + mmap_fork_commit_anon_shared(&anon_shared_txn); close(ipc_sock); @@ -1112,13 +1128,21 @@ int64_t sys_clone(hv_vcpu_t vcpu, free(regions_snapshot); return child_guest_pid; -fail_ipc: - free(regions_snapshot); - close(ipc_sock); - return -LINUX_ENOMEM; - fail_snapshot: free(regions_snapshot); + /* Roll back the in-place anon-shared overlay conversion while + * siblings are still parked. A partial rollback failure (e.g., + * region drift past the quiesce timeout) leaves the parent in a + * mixed state: the originating fork-IPC error is the user-visible + * one, but log abort failures so post-mortem can spot the + * lingering overlay without grepping for behavioral symptoms. 
+ */ + int abort_rc = mmap_fork_abort_anon_shared(g, &anon_shared_txn); + if (abort_rc < 0) + log_warn( + "clone: anon-shared rollback partial failure (%d); parent " + "may have stale memfd-backed regions", + abort_rc); thread_resume_siblings(); close(ipc_sock); return -LINUX_ENOMEM; diff --git a/src/syscall/mem.c b/src/syscall/mem.c index 9cfdcd5..fc57998 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -25,17 +25,17 @@ #include "syscall/internal.h" #include "syscall/mem.h" -/* Protects mmap/brk bump allocators and page table extension. Multiple - * threads may call mmap/brk concurrently; without this lock they could - * get overlapping allocations or corrupt page table structures. +/* Protects mmap/brk bump allocators and page table extension. Multiple threads + * may call mmap/brk concurrently; without this lock they could get overlapping + * allocations or corrupt page table structures. */ pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER; /* Lock order: 1 */ -/* Host kernel page size (16 KiB on Apple Silicon, typically 4 KiB on - * Intel macOS). MAP_FIXED requires addr/length/offset multiples of this, - * so an overlay onto a guest 4 KiB-aligned IPA is only applicable when the - * IPA happens to land on a host page boundary; otherwise sys_mmap falls - * back to the pread snapshot path. +/* Host kernel page size (16 KiB on Apple Silicon, typically 4 KiB on Intel + * macOS). MAP_FIXED requires addr/length/offset multiples of this, so an + * overlay onto a guest 4 KiB-aligned IPA is only applicable when the IPA + * happens to land on a host page boundary; otherwise sys_mmap falls back to the + * pread snapshot path. 
*/ static size_t host_page_size_cached(void) { @@ -232,13 +232,12 @@ static uint64_t find_free_gap_inner(const guest_t *g, uint64_t min_addr, uint64_t max_addr) { - /* Round the search start up to the next host-page boundary so an - * unaligned addr hint cannot return a result that lands inside a host - * page already covered by a preceding region's overlay tail (the - * overlay extends to ALIGN_UP(r->end, hps)). Apple Silicon enforces - * 16 KiB host pages; aligning to the guest 4 KiB page is not enough. - * Advance past each walked region to the same boundary for the same - * reason. + /* Round the search start up to the next host-page boundary so an unaligned + * addr hint cannot return a result that lands inside a host page already + * covered by a preceding region's overlay tail (the overlay extends to + * ALIGN_UP(r->end, hps)). Apple Silicon enforces 16 KiB host pages; + * aligning to the guest 4 KiB page is not enough. Advance past each walked + * region to the same boundary for the same reason. */ size_t hps = host_page_size_cached(); uint64_t gap_start = ALIGN_UP(min_addr, hps); @@ -250,8 +249,8 @@ static uint64_t find_free_gap_inner(const guest_t *g, /* If this region starts far enough after gap_start, the allocator found * a gap. Must also verify the gap is within max_addr; regions[] may - * contain entries beyond max_addr that could push gap_start past - * the valid range. + * contain entries beyond max_addr that could push gap_start past the + * valid range. */ if (gap_start <= max_addr && length <= max_addr - gap_start && g->regions[i].start >= gap_start + length) @@ -267,11 +266,11 @@ static uint64_t find_free_gap_inner(const guest_t *g, return UINT64_MAX; /* No suitable gap found */ } -/* Find a free gap, probing the cached post-allocation hint before a full - * scan. The hint tracks the first address after the last successful mapping - * in each region, which avoids rescanning the same prefix on sequential - * mmap activity. 
A miss falls back to the region base so holes reopened by - * munmap are still reusable. +/* Find a free gap, probing the cached post-allocation hint before a full scan. + * The hint tracks the first address after the last successful mapping in each + * region, which avoids rescanning the same prefix on sequential mmap activity. + * A miss falls back to the region base so holes reopened by munmap are still + * reusable. */ static uint64_t find_free_gap(guest_t *g, uint64_t length, @@ -281,12 +280,12 @@ static uint64_t find_free_gap(guest_t *g, /* RX and RW mappings advance independently, so keep separate hints. */ uint64_t *hint = (min_addr < MMAP_BASE) ? &g->mmap_rx_gap_hint : &g->mmap_rw_gap_hint; + /* Advance the hint to the next host-page boundary so the following - * sequential allocation lands on an address that the kernel accepts - * for mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). The - * tradeoff is up to host_page-1 bytes of address-space waste per small - * allocation; physical pages are still demand-paged, so RAM cost is - * unchanged. + * sequential allocation lands on an address that the kernel accepts for + * mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). The tradeoff + * is up to host_page-1 bytes of address-space waste per small allocation; + * physical pages are still demand-paged, so RAM cost is unchanged. 
*/ size_t hps = host_page_size_cached(); @@ -345,6 +344,11 @@ static int hvf_apply_file_overlay(guest_t *g, uint64_t len, int fd, off_t file_off); +static int hvf_apply_file_overlay_quiesced(guest_t *g, + uint64_t ipa, + uint64_t len, + int fd, + off_t file_off); static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len); static int read_file_range_to_guest(guest_t *g, @@ -402,6 +406,20 @@ typedef struct { char name[sizeof(((guest_region_t *) 0)->name)]; } region_snapshot_t; +typedef struct { + uint64_t overlay_start; + uint64_t overlay_len; + int snap_base; + int nsnaps; +} fork_overlay_snapshot_t; + +struct mmap_fork_anon_shared_txn { + int nsnaps; + region_snapshot_t snaps[GUEST_MAX_REGIONS]; + int noverlays; + fork_overlay_snapshot_t overlays[GUEST_MAX_REGIONS]; +}; + static void close_region_snapshots(region_snapshot_t *snaps, int n) { for (int i = 0; i < n; i++) { @@ -412,9 +430,9 @@ static void close_region_snapshots(region_snapshot_t *snaps, int n) } } -/* Close any open dup'd backing fds in *snaps_ptr, free the heap buffer, - * and zero out the caller's pointer/count so a follow-on call is a no-op. - * Used for buffers allocated via malloc by sys_mmap and sys_mremap; the +/* Close any open dup'd backing fds in *snaps_ptr, free the heap buffer, and + * zero out the caller's pointer/count so a follow-on call is a no-op. Used + * for buffers allocated via malloc by sys_mmap and sys_mremap; the * stack-allocated callers in capture_region_snapshots itself keep using * close_region_snapshots directly. */ @@ -546,8 +564,8 @@ static int restore_snapshot_page_tables(guest_t *g, guest_update_perms(g, snap->start, snap->end, page_perms); } - /* guest_extend_page_tables() repopulates whole 2 MiB blocks, so clear - * holes and deferred mappings again after all snapshot ranges are back. + /* guest_extend_page_tables() repopulates whole 2 MiB blocks, so clear holes + * and deferred mappings again after all snapshot ranges are back. 
*/ uint64_t cursor = start; for (int i = 0; i < n; i++) { @@ -651,29 +669,28 @@ static int rollback_fresh_mmap_allocation(guest_t *g, /* HVF stage-2 segment management. * - * The slab is mapped to HVF in 2 MiB-aligned segments tracked by - * g->segments[]. Initially the slab is one segment (set up by guest_init). - * MAP_SHARED file-backed mmap may need to overlay a sub-range of the slab - * with a real host mmap MAP_FIXED|MAP_SHARED of the file fd. HVF caches - * the host VA->PA mapping at hv_vm_map time and a plain MAP_FIXED overlay - * does not refresh it (see comment in src/runtime/forkipc.c near line 940 - * for the empirical evidence). To force HVF to re-walk the host page - * tables after the overlay, the affected segment is hv_vm_unmap'd, the - * file is mmap'd MAP_FIXED|MAP_SHARED into its host VA, and the segment - * is hv_vm_map'd again. + * The slab is mapped to HVF in 2 MiB-aligned segments tracked by g->segments[]. + * Initially the slab is one segment (set up by guest_init). MAP_SHARED + * file-backed mmap may need to overlay a sub-range of the slab with a real host + * mmap MAP_FIXED|MAP_SHARED of the file fd. HVF caches the host VA->PA mapping + * at hv_vm_map time and a plain MAP_FIXED overlay does not refresh it (see + * comment in src/runtime/forkipc.c for the empirical evidence). To force HVF to + * re-walk the host page tables after the overlay, the affected segment is + * hv_vm_unmap'd, the file is mmap'd MAP_FIXED|MAP_SHARED into its host VA, and + * the segment is hv_vm_map'd again. * * HVF rejects sub-range hv_vm_unmap of a larger map (HV_BAD_ARGUMENT). - * Therefore, before applying the first overlay inside a large segment, - * the segment is split into 2 MiB-aligned pieces around the affected - * range so each piece is independently unmappable. + * Therefore, before applying the first overlay inside a large segment, the + * segment is split into 2 MiB-aligned pieces around the affected range so each + * piece is independently unmappable. 
*/ -/* HVF flags applied to slab segments. The slab is mapped RWX so guest - * stage-1 page tables retain full control over per-page permissions - * (W^X is enforced by the guest's L2/L3 entries, not stage-2). File - * overlay segments use the same RWX flags so PROT_EXEC mmaps still - * work; the host file mmap is created PROT_READ|PROT_WRITE so HVF - * never asks the host kernel for execute permission on the file pages. +/* HVF flags applied to slab segments. The slab is mapped RWX so guest stage-1 + * page tables retain full control over per-page permissions (W^X is enforced by + * the guest's L2/L3 entries, not stage-2). File overlay segments use the same + * RWX flags so PROT_EXEC mmaps still work; the host file mmap is created + * PROT_READ|PROT_WRITE so HVF never asks the host kernel for execute permission + * on the file pages. */ #define HVF_SEGMENT_FLAGS (HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC) @@ -803,39 +820,31 @@ static int hvf_segment_split(guest_t *g, /* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd, * file_off). The IPA range may be sub-2 MiB; the containing 2 MiB * segment is split out first if it is not already isolated. Caller - * holds mmap_lock and has not quiesced siblings yet. The function - * quiesces siblings around the unmap+remap window so concurrent vCPUs - * cannot fault on the temporarily-unmapped IPA range. + * holds mmap_lock and has already quiesced sibling vCPUs (or has none). + * The fork pre-snapshot path quiesces siblings before calling this so + * the overlay install does not trigger a nested quiesce. 
*/ -static int hvf_apply_file_overlay(guest_t *g, - uint64_t ipa, - uint64_t len, - int fd, - off_t file_off) +static int hvf_apply_file_overlay_quiesced(guest_t *g, + uint64_t ipa, + uint64_t len, + int fd, + off_t file_off) { uint64_t aligned_start = ALIGN_2MIB_DOWN(ipa); uint64_t aligned_end = ALIGN_2MIB_UP(ipa + len); - thread_quiesce_siblings(); - int err = hvf_segment_split(g, aligned_start, aligned_end); - if (err < 0) { - thread_resume_siblings(); + if (err < 0) return err; - } int idx = hvf_segment_find(g, aligned_start); if (idx < 0 || g->segments[idx].ipa != aligned_start || - g->segments[idx].len != aligned_end - aligned_start) { - thread_resume_siblings(); + g->segments[idx].len != aligned_end - aligned_start) return -LINUX_EFAULT; - } hvf_segment_t seg = g->segments[idx]; - if (hv_vm_unmap(seg.ipa, seg.len) != HV_SUCCESS) { - thread_resume_siblings(); + if (hv_vm_unmap(seg.ipa, seg.len) != HV_SUCCESS) return -LINUX_EIO; - } void *target = (uint8_t *) g->host_base + ipa; void *p = mmap(target, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, @@ -848,7 +857,6 @@ static int hvf_apply_file_overlay(guest_t *g, */ hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len, HVF_SEGMENT_FLAGS); - thread_resume_siblings(); return saved < 0 ? saved : -saved; } @@ -864,31 +872,42 @@ static int hvf_apply_file_overlay(guest_t *g, hvf_restore_slab_backing(g, ipa, len); hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len, HVF_SEGMENT_FLAGS); - thread_resume_siblings(); return -LINUX_EIO; } - thread_resume_siblings(); return 0; } -/* Undo a file overlay at [ipa, ipa+len) by restoring the slab backing - * and refreshing the containing HVF segment. Caller holds mmap_lock. - * Sibling vCPUs are quiesced around the brief unmap window. +/* Apply a real MAP_SHARED file overlay at [ipa, ipa+len) backed by [fd, + * file_off). The IPA range may be sub-2 MiB; the containing 2 MiB + * segment is split out first if it is not already isolated. 
Caller + * holds mmap_lock and has not quiesced siblings yet. The function + * quiesces siblings around the unmap+remap window so concurrent vCPUs + * cannot fault on the temporarily-unmapped IPA range. */ -static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len) +static int hvf_apply_file_overlay(guest_t *g, + uint64_t ipa, + uint64_t len, + int fd, + off_t file_off) +{ + thread_quiesce_siblings(); + int err = hvf_apply_file_overlay_quiesced(g, ipa, len, fd, file_off); + thread_resume_siblings(); + return err; +} + +static int hvf_remove_file_overlay_quiesced(guest_t *g, + uint64_t ipa, + uint64_t len) { int idx = hvf_segment_find(g, ipa); if (idx < 0) return -LINUX_EFAULT; hvf_segment_t seg = g->segments[idx]; - thread_quiesce_siblings(); - - if (hv_vm_unmap(seg.ipa, seg.len) != HV_SUCCESS) { - thread_resume_siblings(); + if (hv_vm_unmap(seg.ipa, seg.len) != HV_SUCCESS) return -LINUX_EIO; - } int err = hvf_restore_slab_backing(g, ipa, len); if (err < 0) { @@ -898,20 +917,28 @@ static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len) */ hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len, HVF_SEGMENT_FLAGS); - thread_resume_siblings(); return err; } if (hv_vm_map((uint8_t *) g->host_base + seg.ipa, seg.ipa, seg.len, - HVF_SEGMENT_FLAGS) != HV_SUCCESS) { - thread_resume_siblings(); + HVF_SEGMENT_FLAGS) != HV_SUCCESS) return -LINUX_EIO; - } - thread_resume_siblings(); return 0; } +/* Undo a file overlay at [ipa, ipa+len) by restoring the slab backing + * and refreshing the containing HVF segment. Caller holds mmap_lock. + * Sibling vCPUs are quiesced around the brief unmap window. + */ +static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len) +{ + thread_quiesce_siblings(); + int err = hvf_remove_file_overlay_quiesced(g, ipa, len); + thread_resume_siblings(); + return err; +} + /* Walk semantic regions in [start, end) and undo any active MAP_SHARED file * overlays on the underlying host VA. 
Used before sys_mmap MAP_FIXED replaces * a previously-overlaid range with a new mapping (anonymous or different @@ -2676,3 +2703,415 @@ int64_t sys_msync(guest_t *g, uint64_t addr, uint64_t length, int flags) return 0; } + +/* See mem.h. Walk regions, convert each MAP_SHARED|MAP_ANONYMOUS region + * without backing fd into a memfd-backed overlay so fork can hand the fd + * to the child for live coherence. Caller has quiesced sibling vCPUs. + */ +static void mmap_fork_dispose_anon_shared_txn( + mmap_fork_anon_shared_txn_t **txn_ptr) +{ + if (!txn_ptr || !*txn_ptr) + return; + + mmap_fork_anon_shared_txn_t *txn = *txn_ptr; + close_region_snapshots(txn->snaps, txn->nsnaps); + free(txn); + *txn_ptr = NULL; +} + +int mmap_fork_prepare_anon_shared(guest_t *g, + mmap_fork_anon_shared_txn_t **txn_out) +{ + if (txn_out) + *txn_out = NULL; + + mmap_fork_anon_shared_txn_t *txn = calloc(1, sizeof(*txn)); + if (!txn) + return -LINUX_ENOMEM; + + pthread_mutex_lock(&mmap_lock); + + size_t hps = host_page_size_cached(); + + /* Snapshot candidate ranges first; conversion mutates the region + * table via hvf_segment_split / mark_overlay_metadata_range and + * would invalidate the walk indices. + */ + struct { + uint64_t start; + uint64_t end; + } cands[GUEST_MAX_REGIONS]; + int n_cands = 0; + for (int i = 0; i < g->nregions && n_cands < GUEST_MAX_REGIONS; i++) { + const guest_region_t *r = &g->regions[i]; + if (r->backing_fd >= 0) + continue; + if (!r->shared) + continue; + if (!(r->flags & LINUX_MAP_ANONYMOUS)) + continue; + if ((r->start % hps) != 0) + continue; /* misaligned start: snapshot fallback */ + /* If the region is shorter than a host page, the host + * MAP_FIXED|MAP_SHARED mmap rounds up to ALIGN_UP(len, hps) and + * may alias the next region's host page. Codex flagged this + * tail-aliasing hazard. Skip when any subsequent region's tail + * crosses r->end into the same host page. 
The leading region + * is always the one we convert, so backing_fd is naturally -1 + * for it; sibling regions in the host-page tail will each be + * inspected on their own iteration. + */ + uint64_t aligned_end = ALIGN_UP(r->end, hps); + if (aligned_end > r->end) { + bool tail_clear = true; + for (int j = i + 1; j < g->nregions; j++) { + if (g->regions[j].start >= aligned_end) + break; + if (g->regions[j].end > r->end) { + tail_clear = false; + break; + } + } + if (!tail_clear) + continue; + } + cands[n_cands].start = r->start; + cands[n_cands].end = r->end; + n_cands++; + } + + for (int i = 0; i < n_cands; i++) { + uint64_t start = cands[i].start; + uint64_t end = cands[i].end; + if (end <= start) + continue; + uint64_t len = end - start; + uint64_t aligned_len = ALIGN_UP(len, hps); + + char tmpl[] = "/tmp/elfuse-anonsh-XXXXXX"; + int fd = mkstemp(tmpl); + if (fd < 0) { + log_warn("fork-prep: mkstemp for anon-shared region: %s", + strerror(errno)); + continue; + } + unlink(tmpl); + if (ftruncate(fd, (off_t) aligned_len) < 0) { + log_warn("fork-prep: ftruncate(%llu) failed: %s", + (unsigned long long) aligned_len, strerror(errno)); + close(fd); + continue; + } + + /* Seed the temp file with the parent's current bytes so the + * child sees pre-fork content through the kernel page cache + * after re-installation. + */ + const uint8_t *src = (const uint8_t *) g->host_base + start; + uint64_t remain = len; + off_t off = 0; + bool seed_ok = true; + while (remain > 0) { + size_t chunk = remain > (uint64_t) SSIZE_MAX ? 
(size_t) SSIZE_MAX + : (size_t) remain; + ssize_t nw = pwrite(fd, src, chunk, off); + if (nw < 0) { + if (errno == EINTR) + continue; + seed_ok = false; + break; + } + if (nw == 0) { + seed_ok = false; + break; + } + src += nw; + off += nw; + remain -= (uint64_t) nw; + } + if (!seed_ok) { + log_warn("fork-prep: seed pwrite failed for anon-shared"); + close(fd); + continue; + } + + /* Pre-stage the per-region backing_fd dups before installing + * the overlay. A post-install dup failure would otherwise leave + * the parent live on the temp file but with regions stuck at + * backing_fd=-1, which the SCM_RIGHTS sender silently skips. + * Reserving fds up front and aborting on failure preserves the + * snapshot fallback when the host runs out of fds. + */ + int region_idxs[GUEST_MAX_REGIONS]; + int dup_fds[GUEST_MAX_REGIONS]; + int n_regions = 0; + for (int j = 0; j < g->nregions; j++) { + const guest_region_t *r = &g->regions[j]; + if (r->start >= end) + break; + if (r->end <= start) + continue; + if (r->backing_fd >= 0) + continue; + region_idxs[n_regions++] = j; + } + bool dup_ok = true; + int dups_done = 0; + for (int k = 0; k < n_regions; k++) { + int dup_fd = dup(fd); + if (dup_fd < 0) { + log_warn("fork-prep: dup failed: %s", strerror(errno)); + dup_ok = false; + break; + } + dup_fds[dups_done++] = dup_fd; + } + if (!dup_ok) { + for (int k = 0; k < dups_done; k++) + close(dup_fds[k]); + close(fd); + continue; + } + + if (txn->noverlays >= GUEST_MAX_REGIONS || + txn->nsnaps >= GUEST_MAX_REGIONS) { + for (int k = 0; k < n_regions; k++) + close(dup_fds[k]); + close(fd); + pthread_mutex_unlock(&mmap_lock); + if (txn_out) + *txn_out = txn; + return -LINUX_ENOMEM; + } + + int snap_base = txn->nsnaps; + int nsnaps = + capture_region_snapshots(g, start, end, &txn->snaps[txn->nsnaps], + GUEST_MAX_REGIONS - txn->nsnaps); + if (nsnaps < 0) { + for (int k = 0; k < n_regions; k++) + close(dup_fds[k]); + close(fd); + pthread_mutex_unlock(&mmap_lock); + if (txn_out) + *txn_out 
= txn; + return nsnaps; + } + + int err = hvf_apply_file_overlay_quiesced(g, start, aligned_len, fd, 0); + if (err < 0) { + log_warn("fork-prep: overlay install [0x%llx, 0x%llx) failed: %d", + (unsigned long long) start, + (unsigned long long) (start + aligned_len), err); + close_region_snapshots(&txn->snaps[snap_base], nsnaps); + for (int k = 0; k < n_regions; k++) + close(dup_fds[k]); + close(fd); + continue; + } + + txn->nsnaps += nsnaps; + txn->overlays[txn->noverlays++] = (fork_overlay_snapshot_t) { + .overlay_start = start, + .overlay_len = aligned_len, + .snap_base = snap_base, + .nsnaps = nsnaps, + }; + + /* Mark every region in [start, end) with overlay span + * [start, start+aligned_len). The candidate filter guarantees + * the host-page tail is empty of other tracked regions, so the + * extended overlay span never aliases a neighbor's backing. + * Assign the pre-staged dups in lockstep with the iteration + * order used to size n_regions above. + */ + mark_overlay_metadata_range(g, start, end, start, start + aligned_len); + for (int k = 0; k < n_regions; k++) { + guest_region_t *r = &g->regions[region_idxs[k]]; + r->backing_fd = dup_fds[k]; + r->offset = r->start - start; + } + close(fd); + } + + pthread_mutex_unlock(&mmap_lock); + if (txn_out) + *txn_out = txn; + return 0; +} + +void mmap_fork_commit_anon_shared(mmap_fork_anon_shared_txn_t **txn_ptr) +{ + mmap_fork_dispose_anon_shared_txn(txn_ptr); +} + +int mmap_fork_abort_anon_shared(guest_t *g, + mmap_fork_anon_shared_txn_t **txn_ptr) +{ + if (!txn_ptr || !*txn_ptr) + return 0; + + mmap_fork_anon_shared_txn_t *txn = *txn_ptr; + int rc = 0; + + pthread_mutex_lock(&mmap_lock); + + for (int i = txn->noverlays - 1; i >= 0; i--) { + const fork_overlay_snapshot_t *ovl = &txn->overlays[i]; + + /* Validate every captured region snapshot for this overlay + * BEFORE tearing down the host MAP_SHARED|MAP_FIXED mapping. 
+ * Removing the overlay first and then discovering the region + * shape has drifted (e.g., a sibling vCPU that returned from a + * long host syscall after the quiesce timeout ran mmap or + * munmap during the prepare/abort window) leaves the host VA + * restored to slab while the region metadata still claims the + * temp-file overlay -- a silent desync. By verifying first the + * function leaves the overlay live and surfaces -EFAULT so the + * caller can decide what to do (still better than a partial + * teardown). + */ + bool drifted = false; + for (int j = 0; j < ovl->nsnaps; j++) { + const region_snapshot_t *snap = &txn->snaps[ovl->snap_base + j]; + const guest_region_t *found = guest_region_find(g, snap->start); + if (!found || found->start != snap->start || + found->end != snap->end) { + drifted = true; + break; + } + } + if (drifted) { + if (rc == 0) + rc = -LINUX_EFAULT; + continue; + } + + int err = hvf_remove_file_overlay_quiesced(g, ovl->overlay_start, + ovl->overlay_len); + if (err < 0) { + if (rc == 0) + rc = err; + continue; + } + + for (int j = 0; j < ovl->nsnaps; j++) { + region_snapshot_t *snap = &txn->snaps[ovl->snap_base + j]; + const guest_region_t *found = guest_region_find(g, snap->start); + guest_region_t *r = (guest_region_t *) found; + /* Validation above ensured r exists with matching bounds. + * Re-check defensively in case hvf_remove_file_overlay_quiesced + * itself mutated the region table on its failure paths. 
+ */ + if (!r || r->start != snap->start || r->end != snap->end) { + if (rc == 0) + rc = -LINUX_EFAULT; + continue; + } + if (r->backing_fd >= 0) { + close(r->backing_fd); + r->backing_fd = -1; + } + r->prot = snap->prot; + r->flags = snap->flags; + r->offset = snap->offset; + r->backing_fd = snap->backing_fd; + snap->backing_fd = -1; + r->overlay_active = snap->overlay_active; + r->overlay_start = snap->overlay_start; + r->overlay_end = snap->overlay_end; + str_copy_trunc(r->name, snap->name, sizeof(r->name)); + } + } + + pthread_mutex_unlock(&mmap_lock); + mmap_fork_dispose_anon_shared_txn(txn_ptr); + return rc; +} + +/* See mem.h. Re-install host MAP_SHARED|MAP_FIXED overlays on the child + * after IPC restore using parent-side overlay metadata captured before + * the recv path cleared the inherited overlay flags. + */ +int mmap_fork_restore_overlays(guest_t *g, + const bool *parent_active, + const uint64_t *parent_ovl_start, + const uint64_t *parent_ovl_end) +{ + pthread_mutex_lock(&mmap_lock); + int rc = 0; + + for (int i = 0; i < g->nregions; i++) { + if (!parent_active[i]) + continue; + guest_region_t *r = &g->regions[i]; + if (r->backing_fd < 0) + continue; + if (r->overlay_active) + continue; /* already re-installed via a sibling region */ + + uint64_t ovl_s = parent_ovl_start[i]; + uint64_t ovl_e = parent_ovl_end[i]; + if (ovl_e <= ovl_s) + continue; + + /* file_off corresponding to ovl_s. The standard install path + * keeps ovl_s == r->start (host-page-aligned guest start), so + * file_off == r->offset. Handle the defensive clip-extends-low + * case by shifting r->offset down by the missing bytes; if that + * would underflow, skip the region (cannot honestly recreate). 
+ */ + uint64_t file_off; + if (ovl_s >= r->start) { + uint64_t delta = ovl_s - r->start; + if (r->offset > UINT64_MAX - delta) { + log_warn( + "fork-child: file_off overflow for region [0x%llx, " + "0x%llx)", + (unsigned long long) r->start, (unsigned long long) r->end); + continue; + } + file_off = r->offset + delta; + } else { + uint64_t delta = r->start - ovl_s; + if (delta > r->offset) { + log_warn( + "fork-child: file_off underflow for region [0x%llx, " + "0x%llx)", + (unsigned long long) r->start, (unsigned long long) r->end); + continue; + } + file_off = r->offset - delta; + } + + int err = hvf_apply_file_overlay(g, ovl_s, ovl_e - ovl_s, r->backing_fd, + (off_t) file_off); + if (err < 0) { + log_warn( + "fork-child: overlay re-install [0x%llx, 0x%llx) failed: %d", + (unsigned long long) ovl_s, (unsigned long long) ovl_e, err); + rc = err; + continue; + } + + /* Mark each region that the parent had attached to this same + * overlay span. Calling mark_overlay_metadata_range with the + * region's own [start, end) bounds marks only that region (the + * region table is sorted and non-overlapping). The outer loop + * later sees overlay_active=true for sibling regions and skips + * the redundant install. 
+ */ + for (int j = 0; j < g->nregions; j++) { + if (!parent_active[j]) + continue; + if (parent_ovl_start[j] != ovl_s || parent_ovl_end[j] != ovl_e) + continue; + mark_overlay_metadata_range(g, g->regions[j].start, + g->regions[j].end, ovl_s, ovl_e); + } + } + + pthread_mutex_unlock(&mmap_lock); + return rc; +} diff --git a/src/syscall/mem.h b/src/syscall/mem.h index 114f704..fed3730 100644 --- a/src/syscall/mem.h +++ b/src/syscall/mem.h @@ -13,6 +13,8 @@ #include #include "core/guest.h" +typedef struct mmap_fork_anon_shared_txn mmap_fork_anon_shared_txn_t; + /* brk: set/query program break */ int64_t sys_brk(guest_t *g, uint64_t addr); @@ -44,3 +46,49 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice); /* msync: synchronize file-backed mappings to disk */ int64_t sys_msync(guest_t *g, uint64_t addr, uint64_t length, int flags); + +/* Fork preparation: convert MAP_SHARED|MAP_ANONYMOUS regions that have + * no backing fd into temp-file-backed overlay regions. Each converted region + * gets a private mkstemp+unlink temp file seeded from the current host + * bytes; a host MAP_SHARED|MAP_FIXED overlay is then installed so the + * parent's subsequent writes flow through the kernel page cache. The + * region's backing_fd is set to a dup of the temp file so the regular + * SCM_RIGHTS handover feeds the child a coherent fd. + * + * Caller must already hold sibling vCPUs quiesced. mmap_lock is acquired + * internally. Per-region failures are logged and skipped (snapshot + * fallback persists for those regions); structural failure returns + * -errno. Regions with a non-host-page-aligned start are skipped + * (overlay-eligibility requirement). On success or post-allocation failure, + * *txn_out owns rollback metadata that must later be committed or aborted. + */ +int mmap_fork_prepare_anon_shared(guest_t *g, + mmap_fork_anon_shared_txn_t **txn_out); + +/* Finalize or roll back mmap_fork_prepare_anon_shared().
Callers must + * still have sibling vCPUs quiesced when aborting so the host overlay + * removal cannot race guest accesses. + */ +void mmap_fork_commit_anon_shared(mmap_fork_anon_shared_txn_t **txn_ptr); +int mmap_fork_abort_anon_shared(guest_t *g, + mmap_fork_anon_shared_txn_t **txn_ptr); + +/* Fork restore: re-install host MAP_SHARED|MAP_FIXED overlays on the + * child after IPC restore. parent_active[i] / parent_ovl_start[i] / + * parent_ovl_end[i] capture each region's parent-side overlay metadata, + * sampled before fork_ipc_recv_process_state cleared the inherited + * overlay flags. For each region that was overlay-active in the parent + * and now has a valid backing_fd (received via SCM_RIGHTS), the function + * calls hv_vm_unmap + mmap MAP_FIXED|MAP_SHARED + hv_vm_map to bind the + * host VA to the same backing file so the child observes parent writes + * (and vice-versa). Caller must hold no locks; the child has not yet + * created worker vCPUs so no quiesce is needed. + * + * Per-region failures are logged and skipped. Returns 0 on full success + * or the last error encountered (best-effort: a partial failure leaves + * snapshot semantics intact for the failed regions). 
+ */ +int mmap_fork_restore_overlays(guest_t *g, + const bool *parent_active, + const uint64_t *parent_ovl_start, + const uint64_t *parent_ovl_end); diff --git a/tests/manifest.txt b/tests/manifest.txt index 22b75f6..57ca6f2 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -98,6 +98,9 @@ test-mremap [section] msync MAP_SHARED tests test-msync +[section] Cross-fork MAP_SHARED coherence tests +test-cross-fork-mapshared # diff=skip + [section] madvise MADV_DONTNEED tests test-madvise diff --git a/tests/test-cross-fork-mapshared.c b/tests/test-cross-fork-mapshared.c new file mode 100644 index 0000000..7df2b11 --- /dev/null +++ b/tests/test-cross-fork-mapshared.c @@ -0,0 +1,458 @@ +/* Cross-fork MAP_SHARED coherence tests + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Verifies live MAP_SHARED visibility across fork(): both file-backed + * and anonymous shared mappings must continue to propagate writes + * between parent and child after the IPC handoff. Without the + * overlay re-establishment in fork-state, the child sees a stale + * snapshot of the parent's pre-fork contents and writes from each + * side stay private. + * + * Three scenarios: + * 1. Regular file: shared mmap of a tmp file; parent writes appear + * in child mapping AND on disk; child writes appear in parent + * mapping AND on disk. + * 2. MAP_SHARED|MAP_ANONYMOUS: parent and child both see each + * other's writes through the unlinked temp-file overlay that + * elfuse installs at fork time. + * 3. /dev/shm file: same coherence over an unlinked shm file. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" + +int passes = 0, fails = 0; + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS 0x20 +#endif + +/* glibc 2.28 static: shm_open is broken; emulate via /dev/shm.
*/ +static int my_shm_open(const char *name, int oflag, int mode) +{ + char path[128]; + snprintf(path, sizeof(path), "/dev/shm%s", name); + return open(path, oflag, mode); +} + +static int my_shm_unlink(const char *name) +{ + char path[128]; + snprintf(path, sizeof(path), "/dev/shm%s", name); + return unlink(path); +} + +/* IPC primitives between parent and child: a single byte over a + * pipe pair signals "go ahead" each direction. Replaces a sleep-based + * synchronization that the test matrix would race on slow runners. + */ +typedef struct { + int parent_to_child[2]; + int child_to_parent[2]; +} sync_t; + +static int sync_init(sync_t *s) +{ + if (pipe(s->parent_to_child) != 0) + return -1; + if (pipe(s->child_to_parent) != 0) { + close(s->parent_to_child[0]); + close(s->parent_to_child[1]); + return -1; + } + return 0; +} + +static void close_fd(int *fd) +{ + if (*fd >= 0) { + close(*fd); + *fd = -1; + } +} + +static void sync_fini(sync_t *s) +{ + close_fd(&s->parent_to_child[0]); + close_fd(&s->parent_to_child[1]); + close_fd(&s->child_to_parent[0]); + close_fd(&s->child_to_parent[1]); +} + +static void parent_close_child_ends(sync_t *s) +{ + close_fd(&s->parent_to_child[0]); + close_fd(&s->child_to_parent[1]); +} + +static void child_close_parent_ends(sync_t *s) +{ + close_fd(&s->parent_to_child[1]); + close_fd(&s->child_to_parent[0]); +} + +/* Drop the parent's write end so a child blocked in wait_byte() on the + * parent_to_child read end observes EOF and exits instead of deadlocking + * with the parent in waitpid(). Must be called on every parent-side + * failure path that bypasses send_byte(parent_to_child[1]). + */ +static void parent_release_writer(sync_t *s) +{ + close_fd(&s->parent_to_child[1]); +} + +/* Wait for a single byte from the peer; returns true on success. 
*/ +static bool wait_byte(int fd) +{ + char b; + ssize_t n; + do { + n = read(fd, &b, 1); + } while (n < 0 && errno == EINTR); + return n == 1; +} + +static bool send_byte(int fd) +{ + char b = 'x'; + ssize_t n; + do { + n = write(fd, &b, 1); + } while (n < 0 && errno == EINTR); + return n == 1; +} + +/* Test 1: File-backed MAP_SHARED — parent and child see each other's + * writes through the same disk file without msync. + */ +static void test_file_backed_cross_fork(void) +{ + TEST("MAP_SHARED file: cross-fork live coherence"); + + char tmpl[] = "/tmp/elfuse-cf-mapshared-XXXXXX"; + int fd = mkstemp(tmpl); + if (fd < 0) { + FAIL("mkstemp"); + return; + } + /* Keeping the path so the child could re-open it (/proc semantics) is + * unnecessary -- the child inherits the fd (CLOEXEC=off) and uses the + * mapping made through it. Unlink keeps the file on the FS but invisible + * via path; both sides hold open references. + */ + unlink(tmpl); + + if (ftruncate(fd, 4096) != 0) { + FAIL("ftruncate"); + close(fd); + return; + } + + char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + FAIL("parent mmap"); + close(fd); + return; + } + + /* Parent seeds with 'P' before fork. Child should observe it. */ + p[0] = 'P'; + + sync_t s; + if (sync_init(&s) < 0) { + FAIL("sync_init"); + munmap(p, 4096); + close(fd); + return; + } + + pid_t pid = fork(); + if (pid < 0) { + FAIL("fork"); + sync_fini(&s); + munmap(p, 4096); + close(fd); + return; + } + + if (pid == 0) { + child_close_parent_ends(&s); + /* Step 1: child observes parent's pre-fork seed 'P'. */ + if (p[0] != 'P') + _exit(10); + /* Step 2: child writes 'C', signals parent. */ + p[1] = 'C'; + if (!send_byte(s.child_to_parent[1])) + _exit(11); + /* Step 3: child waits for parent's mid-run write 'M'.
*/ + if (!wait_byte(s.parent_to_child[0])) + _exit(12); + if (p[2] != 'M') + _exit(13); + _exit(0); + } + + parent_close_child_ends(&s); + bool failed = false; + /* Step 2: wait for child's write of 'C'. */ + if (!wait_byte(s.child_to_parent[0])) { + FAIL("child sync recv"); + failed = true; + } else if (p[1] != 'C') { + FAIL("parent did not see child write"); + failed = true; + } else { + /* Step 3: parent writes 'M' for child to verify. */ + p[2] = 'M'; + if (!send_byte(s.parent_to_child[1])) { + FAIL("parent sync send"); + failed = true; + } + } + + /* Drop the writer so a child blocked in wait_byte() sees EOF when the + * parent took an early failure exit; without this both processes deadlock + * and the test driver kills the parent on timeout. + */ + parent_release_writer(&s); + int status = 0; + if (waitpid(pid, &status, 0) < 0) { + FAIL("waitpid"); + } else if (!failed) { + if (!WIFEXITED(status)) { + FAIL("child terminated abnormally"); + } else if (WEXITSTATUS(status) != 0) { + char buf[64]; + snprintf(buf, sizeof(buf), "child failed at step %d", + WEXITSTATUS(status)); + FAIL(buf); + } else { + /* Verify file content reflects both writes. */ + char disk[3] = {0}; + if (pread(fd, disk, 3, 0) != 3) + FAIL("pread"); + else if (disk[0] == 'P' && disk[1] == 'C' && disk[2] == 'M') + PASS(); + else + FAIL("file content does not reflect both sides"); + } + } + + sync_fini(&s); + munmap(p, 4096); + close(fd); +} + +/* Test 2: Anonymous MAP_SHARED — typical parent-child IPC pattern + * (Postgres, multi-process daemons). elfuse must convert the region + * to memfd-backed at fork time so both sides observe writes. 
+ */ +static void test_anon_shared_cross_fork(void) +{ + TEST("MAP_SHARED|MAP_ANONYMOUS: cross-fork live coherence"); + + char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + FAIL("mmap MAP_SHARED|MAP_ANONYMOUS"); + return; + } + + p[0] = 'P'; + + sync_t s; + if (sync_init(&s) < 0) { + FAIL("sync_init"); + munmap(p, 4096); + return; + } + + pid_t pid = fork(); + if (pid < 0) { + FAIL("fork"); + sync_fini(&s); + munmap(p, 4096); + return; + } + + if (pid == 0) { + child_close_parent_ends(&s); + if (p[0] != 'P') + _exit(20); + p[1] = 'C'; + if (!send_byte(s.child_to_parent[1])) + _exit(21); + if (!wait_byte(s.parent_to_child[0])) + _exit(22); + if (p[2] != 'M') + _exit(23); + _exit(0); + } + + parent_close_child_ends(&s); + bool failed = false; + if (!wait_byte(s.child_to_parent[0])) { + FAIL("child sync recv"); + failed = true; + } else if (p[1] != 'C') { + FAIL("parent did not see child write"); + failed = true; + } else { + p[2] = 'M'; + if (!send_byte(s.parent_to_child[1])) { + FAIL("parent sync send"); + failed = true; + } + } + + /* See the file-backed test: drop the writer before waitpid so any + * failure above does not deadlock both ends of the pipe. + */ + parent_release_writer(&s); + int status = 0; + if (waitpid(pid, &status, 0) < 0) { + FAIL("waitpid"); + } else if (!failed) { + if (!WIFEXITED(status)) { + FAIL("child terminated abnormally"); + } else if (WEXITSTATUS(status) != 0) { + char buf[64]; + snprintf(buf, sizeof(buf), "child failed at step %d", + WEXITSTATUS(status)); + FAIL(buf); + } else { + PASS(); + } + } + + sync_fini(&s); + munmap(p, 4096); +} + +/* Test 3: shm-backed MAP_SHARED via /dev/shm — same as test 1 but + * exercises the shm path (musl/glibc shm_open emulation in elfuse). 
+ */ +static void test_shm_cross_fork(void) +{ + TEST("MAP_SHARED shm: cross-fork live coherence"); + + char name[64]; + snprintf(name, sizeof(name), "/elfuse-cf-shm-%ld", (long) getpid()); + int fd = my_shm_open(name, O_CREAT | O_EXCL | O_RDWR, 0600); + if (fd < 0) { + FAIL("shm_open"); + return; + } + my_shm_unlink(name); + + if (ftruncate(fd, 4096) != 0) { + FAIL("ftruncate"); + close(fd); + return; + } + + char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + FAIL("mmap"); + close(fd); + return; + } + p[0] = 'P'; + + sync_t s; + if (sync_init(&s) < 0) { + FAIL("sync_init"); + munmap(p, 4096); + close(fd); + return; + } + + pid_t pid = fork(); + if (pid < 0) { + FAIL("fork"); + sync_fini(&s); + munmap(p, 4096); + close(fd); + return; + } + + if (pid == 0) { + child_close_parent_ends(&s); + if (p[0] != 'P') + _exit(30); + p[1] = 'C'; + if (!send_byte(s.child_to_parent[1])) + _exit(31); + if (!wait_byte(s.parent_to_child[0])) + _exit(32); + if (p[2] != 'M') + _exit(33); + _exit(0); + } + + parent_close_child_ends(&s); + bool failed = false; + if (!wait_byte(s.child_to_parent[0])) { + FAIL("child sync recv"); + failed = true; + } else if (p[1] != 'C') { + FAIL("parent did not see child write"); + failed = true; + } else { + p[2] = 'M'; + if (!send_byte(s.parent_to_child[1])) { + FAIL("parent sync send"); + failed = true; + } + } + + /* See the file-backed test: drop the writer before waitpid so any + * failure above does not deadlock both ends of the pipe. 
+ */ + parent_release_writer(&s); + int status = 0; + if (waitpid(pid, &status, 0) < 0) { + FAIL("waitpid"); + } else if (!failed) { + if (!WIFEXITED(status)) { + FAIL("child terminated abnormally"); + } else if (WEXITSTATUS(status) != 0) { + char buf[64]; + snprintf(buf, sizeof(buf), "child failed at step %d", + WEXITSTATUS(status)); + FAIL(buf); + } else { + PASS(); + } + } + + sync_fini(&s); + munmap(p, 4096); + close(fd); +} + +int main(void) +{ + printf("test-cross-fork-mapshared: cross-fork MAP_SHARED tests\n\n"); + fflush(stdout); + + test_file_backed_cross_fork(); + fflush(stdout); + test_anon_shared_cross_fork(); + fflush(stdout); + test_shm_cross_fork(); + fflush(stdout); + + SUMMARY("test-cross-fork-mapshared"); + return fails ? 1 : 0; +}