From 2dcaca52f9ef9d5a14aaf40c283ac13f13c86bdf Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 16 May 2026 16:03:54 -0700 Subject: [PATCH 1/2] core: drain seccomp notifications via AsyncFd instead of a blocking thread Signed-off-by: Cong Wang --- crates/sandlock-core/src/sandbox.rs | 18 ++++- crates/sandlock-core/src/seccomp/notif.rs | 96 +++++++++++++++++------ 2 files changed, 89 insertions(+), 25 deletions(-) diff --git a/crates/sandlock-core/src/sandbox.rs b/crates/sandlock-core/src/sandbox.rs index 54c464e..263b9ab 100644 --- a/crates/sandlock-core/src/sandbox.rs +++ b/crates/sandlock-core/src/sandbox.rs @@ -1785,9 +1785,25 @@ impl Sandbox { }); let extra_handlers = std::mem::take(&mut self.rt_mut().extra_handlers); + let (startup_tx, startup_rx) = tokio::sync::oneshot::channel(); self.rt_mut().notif_handle = Some(tokio::spawn( - notif::supervisor(notif_fd, ctx, extra_handlers), + notif::supervisor(notif_fd, ctx, extra_handlers, startup_tx), )); + // Wait for the supervisor to register the notif fd with the IO + // driver before we release the child to execve. Otherwise an + // early traced syscall would queue a notification on a fd no + // one is polling, and the child would block until the next + // `block_on` re-enters the runtime. Critical for current-thread + // runtimes, harmless overhead for multi-thread. + match startup_rx.await { + Ok(Ok(())) => {} + Ok(Err(e)) => return Err(SandboxRuntimeError::Io(e).into()), + Err(_) => { + return Err(SandboxRuntimeError::Child( + "seccomp supervisor exited during startup".into(), + ).into()); + } + } let la_resource = Arc::clone(&res_state); self.rt_mut().loadavg_handle = Some(tokio::spawn(async move { diff --git a/crates/sandlock-core/src/seccomp/notif.rs b/crates/sandlock-core/src/seccomp/notif.rs index 300f1cc..200a944 100644 --- a/crates/sandlock-core/src/seccomp/notif.rs +++ b/crates/sandlock-core/src/seccomp/notif.rs @@ -1108,8 +1108,21 @@ pub async fn supervisor( notif_fd: OwnedFd, ctx: Arc, pending_handlers: Vec<(i64, std::sync::Arc)>, + startup: tokio::sync::oneshot::Sender>, ) { - let fd = notif_fd.as_raw_fd(); + // Register the notif fd with the Tokio IO driver so we can wait for + // readiness via epoll instead of a dedicated blocking thread. + let async_fd = match tokio::io::unix::AsyncFd::with_interest( + notif_fd, + tokio::io::Interest::READABLE, + ) { + Ok(fd) => fd, + Err(err) => { + let _ = startup.send(Err(err)); + return; + } + }; + let fd = async_fd.get_ref().as_raw_fd(); // Build the dispatch table once at startup. let dispatch_table = Arc::new(super::dispatch::build_dispatch_table( @@ -1122,38 +1135,73 @@ pub async fn supervisor( // Try to enable sync wakeup (Linux 6.7+, ignore error on older kernels). try_set_sync_wakeup(fd); - // SECCOMP_IOCTL_NOTIF_RECV blocks regardless of O_NONBLOCK, so we - // receive notifications in a blocking thread and send them to the - // async handler via a channel. This guarantees we never miss a - // notification — the thread is always blocked in recv_notif ready - // for the next one. + // The IO driver has the fd registered; subsequent block_on cycles + // can resume this task and pick up readiness events. Tell the + // caller it is safe to release the child. + let _ = startup.send(Ok(())); + + // Periodic sweep as a defensive backstop in case pidfd-based + // lifecycle cleanup misses an entry (e.g. pidfd_open failed for a + // child on an old kernel, or its watcher panicked). At 5 minutes + // this is cheap enough to leave on; the primary cleanup path is + // still per-child pidfd readiness in `spawn_pid_watcher`. + let gc = tokio::spawn(process_index_gc(Arc::clone(&ctx.processes))); + + // SECCOMP_IOCTL_NOTIF_RECV is a blocking ioctl that ignores + // O_NONBLOCK on the notif fd (see kernel/seccomp.c + // `seccomp_notify_recv`, which calls `wait_event_interruptible` + // unconditionally). So we cannot call it speculatively. The + // notif fd does implement `seccomp_notify_poll` and fires + // EPOLLIN whenever an INIT-state notification is queued, so we: + // + // 1. Wait for readiness via AsyncFd (one EPOLLET edge per + // wake_up_poll batch). + // 2. Re-arm immediately so any new arrivals during drain queue + // a fresh edge. + // 3. Drain the queue using `poll(timeout=0)` as a non-blocking + // probe before each recv. This is required because tokio's + // AsyncFd is edge-triggered: a burst of arrivals between + // our `clear_ready` and the next `readable().await` would + // coalesce into a single event, so the recv side must drain + // to empty before re-awaiting. // // Notifications are processed sequentially (not spawned) to avoid // mutex contention between concurrent handlers. - let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::(); + 'outer: loop { + let mut ready = match async_fd.readable().await { + Ok(r) => r, + Err(_) => break 'outer, + }; + ready.clear_ready(); + drop(ready); - std::thread::spawn(move || { loop { - match recv_notif(fd) { - Ok(notif) => { - if tx.send(notif).is_err() { - break; // receiver dropped — supervisor shutting down + let mut pfd = libc::pollfd { + fd, + events: libc::POLLIN, + revents: 0, + }; + let r = unsafe { libc::poll(&mut pfd, 1, 0) }; + if r > 0 && (pfd.revents & libc::POLLIN) != 0 { + match recv_notif(fd) { + Ok(notif) => { + handle_notification(notif, &ctx, &dispatch_table, fd).await; + continue; } + Err(err) if err.raw_os_error() == Some(libc::EINTR) => continue, + Err(_) => break 'outer, } - Err(_) => break, // fd closed — child exited } + // No POLLIN. POLLHUP/POLLERR/POLLNVAL on the notif fd are + // terminal (filter released or fd invalid); leaving them + // unhandled would busy-spin because tokio keeps reporting + // the fd ready. Otherwise the queue is just empty for now, + // so go back to awaiting an edge. + if (pfd.revents & (libc::POLLHUP | libc::POLLERR | libc::POLLNVAL)) != 0 { + break 'outer; + } + break; } - }); - - // Periodic sweep as a defensive backstop in case pidfd-based - // lifecycle cleanup misses an entry (e.g. pidfd_open failed for a - // child on an old kernel, or its watcher panicked). At 5 minutes - // this is cheap enough to leave on; the primary cleanup path is - // still per-child pidfd readiness in `spawn_pid_watcher`. - let gc = tokio::spawn(process_index_gc(Arc::clone(&ctx.processes))); - - while let Some(notif) = rx.recv().await { - handle_notification(notif, &ctx, &dispatch_table, fd).await; } gc.abort(); From 5e3d0476b00e336a2abff75961dbee1a36b04740 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 16 May 2026 16:18:28 -0700 Subject: [PATCH 2/2] core: clarify supervisor drain via NotifFdState enum and probe helper Signed-off-by: Cong Wang --- crates/sandlock-core/src/seccomp/notif.rs | 97 +++++++++++++---------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/crates/sandlock-core/src/seccomp/notif.rs b/crates/sandlock-core/src/seccomp/notif.rs index 200a944..2d2affd 100644 --- a/crates/sandlock-core/src/seccomp/notif.rs +++ b/crates/sandlock-core/src/seccomp/notif.rs @@ -257,6 +257,47 @@ fn recv_notif(fd: RawFd) -> io::Result { } } +/// Result of a non-blocking probe on the seccomp notif fd. +enum NotifFdState { + /// At least one INIT-state notification is queued. `recv_notif` + /// will return without blocking. + Pending, + /// No notifications and no terminal flags. Wait for the next + /// epoll edge before probing again. + Empty, + /// `POLLHUP`/`POLLERR`/`POLLNVAL` set, or `poll(2)` itself failed: + /// filter has been released or the fd is invalid. The supervisor + /// should exit; subsequent waits would busy-spin because epoll + /// keeps reporting the fd ready. + Terminal, +} + +/// Non-blocking probe of the seccomp notif fd. +/// +/// `SECCOMP_IOCTL_NOTIF_RECV` ignores `O_NONBLOCK` and calls +/// `wait_event_interruptible` unconditionally (kernel/seccomp.c +/// `seccomp_notify_recv`). So `recv_notif` cannot be invoked +/// speculatively to detect an empty queue. This helper uses +/// `poll(timeout=0)` as a non-blocking predictor: if POLLIN is set +/// the kernel will hand us a notification without blocking; if a +/// terminal flag is set the fd will keep waking AsyncFd until the +/// supervisor exits. +fn probe_notif_fd(fd: RawFd) -> NotifFdState { + let mut pfd = libc::pollfd { + fd, + events: libc::POLLIN, + revents: 0, + }; + let r = unsafe { libc::poll(&mut pfd, 1, 0) }; + if r > 0 && (pfd.revents & libc::POLLIN) != 0 { + return NotifFdState::Pending; + } + if r < 0 || (pfd.revents & (libc::POLLHUP | libc::POLLERR | libc::POLLNVAL)) != 0 { + return NotifFdState::Terminal; + } + NotifFdState::Empty +} + /// Send a response with SECCOMP_USER_NOTIF_FLAG_CONTINUE. fn respond_continue(fd: RawFd, id: u64) -> io::Result<()> { let resp = SeccompNotifResp { @@ -1147,23 +1188,12 @@ pub async fn supervisor( // still per-child pidfd readiness in `spawn_pid_watcher`. let gc = tokio::spawn(process_index_gc(Arc::clone(&ctx.processes))); - // SECCOMP_IOCTL_NOTIF_RECV is a blocking ioctl that ignores - // O_NONBLOCK on the notif fd (see kernel/seccomp.c - // `seccomp_notify_recv`, which calls `wait_event_interruptible` - // unconditionally). So we cannot call it speculatively. The - // notif fd does implement `seccomp_notify_poll` and fires - // EPOLLIN whenever an INIT-state notification is queued, so we: - // - // 1. Wait for readiness via AsyncFd (one EPOLLET edge per - // wake_up_poll batch). - // 2. Re-arm immediately so any new arrivals during drain queue - // a fresh edge. - // 3. Drain the queue using `poll(timeout=0)` as a non-blocking - // probe before each recv. This is required because tokio's - // AsyncFd is edge-triggered: a burst of arrivals between - // our `clear_ready` and the next `readable().await` would - // coalesce into a single event, so the recv side must drain - // to empty before re-awaiting. + // Edge-triggered drain: each `readable().await` returns once per + // epoll edge, then we drain the kernel queue via `probe_notif_fd` + // until empty. The drain is necessary because tokio's AsyncFd is + // edge-triggered and `recv_notif` does not signal "would block", + // so a burst of arrivals between two `readable().await` calls + // would coalesce into a single wake event. // // Notifications are processed sequentially (not spawned) to avoid // mutex contention between concurrent handlers. @@ -1176,31 +1206,18 @@ pub async fn supervisor( drop(ready); loop { - let mut pfd = libc::pollfd { - fd, - events: libc::POLLIN, - revents: 0, - }; - let r = unsafe { libc::poll(&mut pfd, 1, 0) }; - if r > 0 && (pfd.revents & libc::POLLIN) != 0 { - match recv_notif(fd) { - Ok(notif) => { - handle_notification(notif, &ctx, &dispatch_table, fd).await; - continue; - } - Err(err) if err.raw_os_error() == Some(libc::EINTR) => continue, - Err(_) => break 'outer, + match probe_notif_fd(fd) { + NotifFdState::Pending => { + let notif = match recv_notif(fd) { + Ok(n) => n, + Err(e) if e.raw_os_error() == Some(libc::EINTR) => continue, + Err(_) => break 'outer, + }; + handle_notification(notif, &ctx, &dispatch_table, fd).await; } + NotifFdState::Empty => break, + NotifFdState::Terminal => break 'outer, } - // No POLLIN. POLLHUP/POLLERR/POLLNVAL on the notif fd are - // terminal (filter released or fd invalid); leaving them - // unhandled would busy-spin because tokio keeps reporting - // the fd ready. Otherwise the queue is just empty for now, - // so go back to awaiting an edge. - if (pfd.revents & (libc::POLLHUP | libc::POLLERR | libc::POLLNVAL)) != 0 { - break 'outer; - } - break; } }