From 7dcd9fa3442dd05609cd0dad3dcb17ca15311419 Mon Sep 17 00:00:00 2001
From: Jim Huang <jserv@ccns.ncku.edu.tw>
Date: Thu, 30 Apr 2026 10:26:56 +0800
Subject: [PATCH] Add opt-in seccomp BPF allowlist behind --seccomp

A memory-corruption RCE in device emulation could otherwise pivot to
arbitrary host syscalls (open, execve, socket, unlink). The new filter
reduces blast radius to the syscalls every device worker, the vcpu
loop, and pthread internals actually need.

src/seccomp.c builds an arch-aware BPF allowlist for x86_64 and
aarch64. x86_64 explicitly rejects the x32 ABI: x32 shares
AUDIT_ARCH_X86_64 but tags syscall numbers with __X32_SYSCALL_BIT
(0x40000000), so a naive allowlist that copies an Internet example
without this guard would let an attacker pivot to x32 syscall numbers
that alias different kernel handlers. Default action is
SECCOMP_RET_KILL_PROCESS so a worker thread that takes a denied
syscall aborts the whole VMM rather than leaving the device in an
unrecoverable state.

Install uses seccomp(2) directly with SECCOMP_FILTER_FLAG_TSYNC. The
serial worker thread is already running by the time seccomp_apply()
is called (spawned in vm_arch_init_platform_device during vm_init);
plain prctl(PR_SET_SECCOMP) installs only on the calling thread,
leaving an attacker a path through the pre-existing worker. TSYNC's
return is three-way: 0 success, -1 errno error, positive TID for
partial-sync failure -- a naive < 0 check would silently treat
partial-sync as success and leave the process unfiltered, so any
non-zero return is reported as failure and surfaces the offending
TID.

The flag is opt-in via --seccomp so existing test and development
workflows are unaffected. CI gains a second "boot test (seccomp)"
step on host-x64 that reuses .ci/autorun.sh with
KVM_HOST_FLAGS=--seccomp; reaching the "Linux version " banner
exercises prctl(PR_SET_NO_NEW_PRIVS), seccomp(2)+TSYNC over the
already-running serial worker, and the early KVM_RUN dispatch under
the filter, so a regression that drops a steady-state syscall from
the allowlist surfaces here as a SIGSYS before the banner.

Boot-tested on x86_64 and aarch64: --seccomp boots Linux to the
busybox console prompt with virtio-blk mounting ext4 r/w (exercises
pread/pwrite/fdatasync) and virtio-net probed; the lazy
virtio-blk/virtio-net worker spawn paths inside vm_run rely on
clone/clone3 + set_robust_list + rseq + sigaltstack being
allowlisted.
---
 .github/workflows/main.yml |  13 ++
 Makefile                   |   9 +-
 README.md                  |  19 ++-
 src/main.c                 |  30 +++-
 src/seccomp.c              | 278 +++++++++++++++++++++++++++++++++++++
 src/seccomp.h              |  10 ++
 6 files changed, 352 insertions(+), 7 deletions(-)
 create mode 100644 src/seccomp.c
 create mode 100644 src/seccomp.h

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 175439d..c21b663 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -185,6 +185,19 @@ jobs:
       - name: boot test
         run: .ci/autorun.sh
         timeout-minutes: 5
+      # Smoke-test the --seccomp opt-in path on the same nested-KVM runner.
+      # The smoke test only waits for the "Linux version " banner before
+      # sending Ctrl-A x, but reaching the banner already exercises
+      # prctl(PR_SET_NO_NEW_PRIVS), seccomp(2)+TSYNC install over the
+      # already-running serial worker, and the early KVM_RUN dispatch
+      # under the filter. A regression that drops a steady-state syscall
+      # from src/seccomp.c's allowlist surfaces here as a SIGSYS before
+      # the banner appears.
+      - name: boot test (seccomp)
+        run: .ci/autorun.sh
+        env:
+          KVM_HOST_FLAGS: --seccomp
+        timeout-minutes: 5
 
   # arm64 host build: configs/linux-arm64.config has no prebuilt path and
   # the GitHub-hosted ubuntu-24.04-arm runners (Cobalt 100) do NOT expose
diff --git a/Makefile b/Makefile
index 09d5917..e9181ae 100644
--- a/Makefile
+++ b/Makefile
@@ -32,6 +32,7 @@ OBJS := \
 	virtio-blk.o \
 	virtio-net.o \
 	diskimg.o \
+	seccomp.o \
 	main.o
 
 ifeq ($(ARCH), x86_64)
@@ -66,9 +67,15 @@ $(OUT)/ext4.img:
 	$(Q)dd if=/dev/zero of=$@ bs=4k count=600
 	$(Q)mkfs.ext4 -F $@
 
+# KVM_HOST_FLAGS forwards extra flags to the binary so CI and developers
+# can opt into --seccomp without duplicating the recipe. Empty by default
+# to keep `make check` matching the documented invocation.
+KVM_HOST_FLAGS ?=
+
 check: $(BIN) $(LINUX_IMG) $(ROOTFS_IMG) $(OUT)/ext4.img
 	$(VECHO) "\nOnce the message 'Kernel panic' appears, press Ctrl-C to exit\n\n"
-	$(Q)sudo $(BIN) -k $(LINUX_IMG) -i $(ROOTFS_IMG) -d $(OUT)/ext4.img
+	$(Q)sudo $(BIN) -k $(LINUX_IMG) -i $(ROOTFS_IMG) -d $(OUT)/ext4.img \
+	    $(KVM_HOST_FLAGS)
 
 clean:
 	$(VECHO) "Cleaning...\n"
diff --git a/README.md b/README.md
index 133acf9..4d016d3 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ make check
 ### Start Emulator
 
 ```
-$ build/kvm-host -k bzImage [-i initrd] [-d disk-image]
+$ build/kvm-host -k bzImage [-i initrd] [-d disk-image] [--seccomp]
 ```
 
 `bzImage` is the path to linux kernel bzImage. The bzImage file is in a specific format,
@@ -51,6 +51,23 @@ containing concatenated `bootsect.o + setup.o + misc.o + piggy.o`. `initrd` is t
 initial RAM disk image, which is an optional argument.
 `disk-image` is the path to disk image which can be mounted as a block device via virtio. For the reference Linux guest, ext4 filesystem is used for disk image.
 
+`--seccomp` is an opt-in defense-in-depth flag that installs a seccomp BPF
+allowlist over the steady-state KVM_RUN loop. Once active, only the
+syscalls that the vcpu, virtio-blk, virtio-net, and serial workers need
+are permitted; anything else (including a memory-corruption RCE in
+device emulation pivoting to `execve`, `open`, or `socket`) terminates
+the process with `SIGSYS`. The filter is applied via `seccomp(2)` with
+`SECCOMP_FILTER_FLAG_TSYNC` so already-running worker threads inherit
+it. The flag is off by default so existing test and development
+workflows are unaffected. CI exercises both paths
+(`.github/workflows/main.yml`).
+
+To run `make check` with the filter enabled:
+
+```shell
+$ make KVM_HOST_FLAGS=--seccomp check
+```
+
 ### Exit Emulator
 
 To exit kvm-host, press "Ctrl-A", release both keys, and then press "x".
diff --git a/src/main.c b/src/main.c
index a4e0245..f219535 100644
--- a/src/main.c
+++ b/src/main.c
@@ -4,9 +4,18 @@
 #include <unistd.h>
 
 #include "err.h"
+#include "seccomp.h"
 #include "vm.h"
 
 static char *kernel_file = NULL, *initrd_file = NULL, *diskimg_file = NULL;
+static int enable_seccomp = 0;
+
+/* Long-only option ids start above the ASCII range so they can never collide
+ * with a short-option char in the getopt_long return.
+ */
+enum {
+    OPT_SECCOMP = 256,
+};
 
 #define print_option(args, help_msg) printf("  %-30s%s", args, help_msg)
 
@@ -19,6 +28,8 @@ static void usage(const char *execpath)
     print_option("-i, --initrd initrd", "Initial RAM disk image\n");
     print_option("-d, --disk disk-image",
                  "Disk image for virtio-blk devices\n");
+    print_option("--seccomp",
+                 "Install a seccomp BPF allowlist before vm_run.\n");
 }
 
 static struct termios saved_attributes;
@@ -50,9 +61,8 @@ int main(int argc, char *argv[])
 {
     int option_index = 0;
     struct option opts[] = {
-        {"kernel", 1, NULL, 'k'},
-        {"initrd", 1, NULL, 'i'},
-        {"disk", 1, NULL, 'd'},
+        {"kernel", 1, NULL, 'k'}, {"initrd", 1, NULL, 'i'},
+        {"disk", 1, NULL, 'd'},   {"seccomp", 0, NULL, OPT_SECCOMP},
         {"help", 0, NULL, 'h'},
     };
 
@@ -69,6 +79,9 @@ int main(int argc, char *argv[])
         case 'd':
             diskimg_file = optarg;
             break;
+        case OPT_SECCOMP:
+            enable_seccomp = 1;
+            break;
         case 'h':
             usage(argv[0]);
             exit(0);
@@ -97,8 +110,15 @@ int main(int argc, char *argv[])
     if (vm_late_init(&vm) < 0)
         return -1;
 
-    /* Switch the terminal to raw mode only once setup has succeeded so that
-     * any error from the load/init paths above is rendered on a normal tty.
+    /* Lock down the syscall surface before raw-mode and vm_run, so a
+     * memory-corruption RCE in device emulation cannot escape to arbitrary host
+     * syscalls. Off by default — opt in via --seccomp.
+     */
+    if (enable_seccomp && seccomp_apply() < 0)
+        return -1;
+
+    /* Switch the terminal to raw mode only once setup has succeeded so that any
+     * error from the load/init paths above is rendered on a normal tty.
      */
     set_input_mode();
 
diff --git a/src/seccomp.c b/src/seccomp.c
new file mode 100644
index 0000000..14b8c9b
--- /dev/null
+++ b/src/seccomp.c
@@ -0,0 +1,278 @@
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "err.h"
+#include "seccomp.h"
+
+#if defined(__x86_64__)
+#define SECCOMP_AUDIT_ARCH AUDIT_ARCH_X86_64
+#elif defined(__aarch64__)
+#define SECCOMP_AUDIT_ARCH AUDIT_ARCH_AARCH64
+#else
+#error "Unsupported architecture for seccomp filter"
+#endif
+
+#ifndef __X32_SYSCALL_BIT
+#define __X32_SYSCALL_BIT 0x40000000
+#endif
+
+/* SECCOMP_RET_KILL_PROCESS landed in kernel 4.14 (2017). Older kernels
+ * accept the filter but treat the unknown action as SECCOMP_RET_KILL_THREAD,
+ * so a denied syscall there kills only the offending thread, not the VMM.
+ */
+#ifndef SECCOMP_RET_KILL_PROCESS
+#define SECCOMP_RET_KILL_PROCESS 0x80000000U
+#endif
+
+/* Steady-state syscall set the VMM needs *after* vm_late_init returns:
+ *
+ *   - I/O fast paths in serial / virtio-blk / virtio-net workers
+ *   - vm_run's KVM_RUN dispatch and KVM_IRQ_LINE assertions
+ *   - on-demand virtio worker spawning when the guest enables a virtqueue
+ *     (these threads are created lazily inside vm_run, so clone/clone3
+ *     must remain reachable here)
+ *   - vm_exit teardown
+ *
+ * Capture the actual set on your target host with `sudo strace -c -f
+ * build/kvm-host -k ...` before relaxing or tightening this list — glibc
+ * internals shift across versions and a missing entry is a SIGSYS in a
+ * device worker, not a graceful failure.
+ */
+static const long allowed_syscalls[] = {
+    /* KVM ioctls (KVM_RUN, KVM_IRQ_LINE) and TUN/TTY ioctls. */
+    SYS_ioctl,
+
+    /* Eventfd reads/writes (irqfd, ioeventfd, stopfd), serial stdin
+     * reads, and the per-segment legacy I/O paths land in read/write.
+     */
+    SYS_read,
+    SYS_write,
+
+    /* virtio-net's RX/TX paths use scatter-gather across packed-ring
+     * descriptor chains.
+     */
+    SYS_readv,
+    SYS_writev,
+
+    /* virtio-blk uses pread/pwrite to keep concurrent virtq workers from
+     * racing on a shared file pointer; FLUSH dispatches to fdatasync.
+     */
+    SYS_pread64,
+    SYS_pwrite64,
+    SYS_fdatasync,
+
+/* aarch64 lacks SYS_poll; glibc's poll(3) maps to ppoll there.
+ * Allow both so the same source compiles on either arch.
+ */
+#ifdef __NR_poll
+    SYS_poll,
+#endif
+#ifdef __NR_ppoll
+    SYS_ppoll,
+#endif
+
+    /* vm_run mmaps the kvm_run shared region; vm_exit munmaps RAM and
+     * the kvm_run mapping. pthread_create reaches mmap/mprotect to
+     * allocate stacks with guard pages.
+     */
+    SYS_mmap,
+    SYS_munmap,
+    SYS_mprotect,
+
+    /* vm_exit closes the kvm/vm/vcpu fds. */
+    SYS_close,
+
+    /* pthread synchronization — every mutex contention or cv signal
+     * lands here.
+     */
+    SYS_futex,
+
+    /* Lazy worker creation in virtio-blk / virtio-net enable_vq paths.
+     * Modern glibc tries clone3 first and falls back to clone, so allow
+     * both for portability across distributions.
+     */
+    SYS_clone,
+#ifdef __NR_clone3
+    SYS_clone3,
+#endif
+    /* pthread thread-creation chain touches these. */
+    SYS_set_robust_list,
+#ifdef __NR_rseq
+    SYS_rseq,
+#endif
+
+    /* Process teardown. */
+    SYS_exit,
+    SYS_exit_group,
+
+    /* Signal trampolines and EINTR restart of long-running syscalls.
+     * pthread cancellation paths block signals around cleanup handlers,
+     * so rt_sigprocmask is reachable even though our own code never
+     * touches signals at runtime.
+     */
+    SYS_rt_sigreturn,
+    SYS_rt_sigprocmask,
+    SYS_rt_sigaction,
+    SYS_restart_syscall,
+
+    /* Timing primitives the C library may reach for in cv waits and
+     * mutex back-off paths.
+     */
+    SYS_clock_gettime,
+    SYS_clock_nanosleep,
+    SYS_nanosleep,
+
+    /* glibc allocator hands pages back to the kernel via madvise. */
+    SYS_madvise,
+
+/* stdio's first write to a stream calls __fstat / statx via
+ * _IO_file_doallocate to size its block buffer; the first printf in
+ * vm_run (the "shutdown\n" path on KVM_EXIT_SHUTDOWN) and any
+ * fprintf(stderr) along an error path both reach this. Older glibc
+ * uses SYS_fstat, newer libc on aarch64 prefers newfstatat/statx —
+ * allow the union so the same binary survives a libc upgrade.
+ */
+#ifdef __NR_fstat
+    SYS_fstat,
+#endif
+#ifdef __NR_newfstatat
+    SYS_newfstatat,
+#endif
+#ifdef __NR_statx
+    SYS_statx,
+#endif
+
+    /* glibc malloc grows via brk(2) for small allocations; thread-local
+     * arenas may grow it too on the first allocation in a worker.
+     */
+    SYS_brk,
+
+    /* sigaltstack is set per-thread by glibc nptl during clone(); strip
+     * it and pthread_create kills the new worker before our entry runs.
+     */
+    SYS_sigaltstack,
+};
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+static struct sock_filter bpf_stmt(uint16_t code, uint32_t k)
+{
+    return (struct sock_filter) {code, 0, 0, k};
+}
+
+static struct sock_filter bpf_jump(uint16_t code,
+                                   uint32_t k,
+                                   uint8_t jt,
+                                   uint8_t jf)
+{
+    return (struct sock_filter) {code, jt, jf, k};
+}
+
+int seccomp_apply(void)
+{
+    const size_t n = ARRAY_SIZE(allowed_syscalls);
+
+    /* jt is a u8, so the longest forward jump from a JEQ to the trailing
+     * RET ALLOW is bounded by 255. Our list is well under that, but enforce
+     * the bound at compile time so a future contributor who grows the list
+     * past it gets a build error rather than a (uint8_t) cast silently
+     * truncating the jump offset and landing on the wrong instruction.
+     */
+    _Static_assert(ARRAY_SIZE(allowed_syscalls) < 255,
+                   "allowlist exceeds BPF jt range");
+
+    /* Layout:
+     *   0: LD arch
+     *   1: JEQ AUDIT_ARCH (jt=1 skip kill, jf=0 fall through)
+     *   2: RET KILL              <- arch mismatch
+     *   3: LD nr
+     * x86_64 only:
+     *   4: JGE __X32_SYSCALL_BIT (jt=0 fall through, jf=1 skip kill)
+     *   5: RET KILL              <- x32 ABI
+     *   6..6+n-1: JEQ allowed[i] (jt = n-i to RET ALLOW)
+     *   6+n: RET KILL            <- default deny
+     *   6+n+1: RET ALLOW
+     */
+    struct sock_filter filter[8 + ARRAY_SIZE(allowed_syscalls)];
+    size_t i = 0;
+
+    /* 1. Reject any ABI other than the host's. */
+    filter[i++] =
+        bpf_stmt(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, arch));
+    filter[i++] = bpf_jump(BPF_JMP | BPF_JEQ | BPF_K, SECCOMP_AUDIT_ARCH, 1, 0);
+    filter[i++] = bpf_stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS);
+
+    /* 2. Load syscall number for the rest of the program. */
+    filter[i++] =
+        bpf_stmt(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr));
+
+#if defined(__x86_64__)
+    /* 3. Reject the x32 ABI: x32 shares AUDIT_ARCH_X86_64 but tags every
+     * syscall number with bit 30 (__X32_SYSCALL_BIT). A naive allowlist
+     * that copies an Internet example without this guard lets a guest
+     * pivot to x32 syscall numbers (which alias different kernel handlers
+     * than the x86_64 ones with the same low bits) and bypass the filter.
+     */
+    filter[i++] = bpf_jump(BPF_JMP | BPF_JGE | BPF_K, __X32_SYSCALL_BIT, 0, 1);
+    filter[i++] = bpf_stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS);
+#endif
+
+    /* 4. Walk allowlist; jt skips past every remaining JEQ plus the
+     * default-deny RET KILL to land on RET ALLOW.
+     */
+    for (size_t j = 0; j < n; j++) {
+        uint8_t jt = (uint8_t) (n - j);
+        filter[i++] = bpf_jump(BPF_JMP | BPF_JEQ | BPF_K,
+                               (uint32_t) allowed_syscalls[j], jt, 0);
+    }
+
+    /* 5. Default deny. SECCOMP_RET_KILL_PROCESS aborts the whole VMM
+     * rather than just the offending thread — a worker thread killed
+     * mid-virtq leaves the device in an unrecoverable state, and silent
+     * partial failure is worse than a clean SIGSYS at the host.
+     */
+    filter[i++] = bpf_stmt(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS);
+    /* 6. Allow target. */
+    filter[i++] = bpf_stmt(BPF_RET | BPF_K, SECCOMP_RET_ALLOW);
+
+    struct sock_fprog prog = {
+        .len = (unsigned short) i,
+        .filter = filter,
+    };
+
+    /* PR_SET_NO_NEW_PRIVS is a precondition for unprivileged seccomp
+     * install and harmless under root; it also blocks suid escalation
+     * if the VMM ever execs.
+     */
+    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+        return throw_err("Failed to set PR_SET_NO_NEW_PRIVS");
+
+    /* Use seccomp(2) directly with TSYNC so every already-running worker
+     * thread (the serial worker spawned in vm_arch_init_platform_device)
+     * ends up under the filter. A plain prctl(PR_SET_SECCOMP) installs
+     * only on the calling thread, leaving an attacker a path through any
+     * pre-existing worker that survived a memory-corruption RCE in
+     * device emulation.
+     *
+     * TSYNC's return contract is three-way: 0 success, -1 errno error,
+     * positive TID meaning "could not synchronize this thread" (e.g. it
+     * already had a conflicting filter). A naive `< 0` check would treat
+     * the positive-TID partial-sync failure as success and leave the
+     * process unfiltered, defeating the opt-in hardening. Reject any
+     * non-zero return.
+     */
+    long ret = syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER,
+                       SECCOMP_FILTER_FLAG_TSYNC, &prog);
+    if (ret < 0)
+        return throw_err("Failed to install seccomp filter");
+    if (ret > 0)
+        return throw_err("Failed to TSYNC seccomp filter to thread %ld", ret);
+
+    return 0;
+}
diff --git a/src/seccomp.h b/src/seccomp.h
new file mode 100644
index 0000000..e1e8b89
--- /dev/null
+++ b/src/seccomp.h
@@ -0,0 +1,10 @@
+#pragma once
+
+/* Install a seccomp BPF allowlist filter on the calling thread and
+ * (via TSYNC) on every other thread already running in the process.
+ * Subsequently spawned threads inherit the filter via clone(2).
+ * Returns 0 on success, -1 on failure (message logged via throw_err()).
+ * errno is set only when prctl(2)/seccomp(2) itself failed; a TSYNC
+ * partial-sync failure (positive TID return) leaves errno stale.
+ */
+int seccomp_apply(void);