From 33fc8009df3029cc654807b8505e97f5ea2f2323 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Wed, 6 May 2026 06:21:24 +0800 Subject: [PATCH] Harden /proc/self oom and fdinfo nodes procfs emulation now treats the OOM trio (oom_score_adj, legacy oom_adj, read-only oom_score) as one process-wide adjustment with per-path read and write semantics: legacy oom_adj scales to oom_score_adj on writes (special-casing OOM_DISABLE -> SCORE_ADJ_MIN and OOM_ADJUST_MAX -> SCORE_ADJ_MAX so the boundary intent survives the lossy multiply) and back-clamps to [-17, 15] on reads; oom_score is read-only with a stub zero. The OOM write path serializes the truncate+pwrite+lseek under a new oom_write_lock and publishes the global atomic only after the backing rewrite succeeds, so a partial-rewrite failure no longer leaves the process-wide value diverged from a returned -1. Zero-length writes short-circuit to success (matches Linux for proc nodes; sys_writev previously hit -EINVAL in the parser). Stat reports st_size 0 for every synthetic /proc file so callers that pre-size buffers from stat cannot truncate (a 256-byte cap had silently chopped /proc/cpuinfo on hosts with many CPUs; a 2-byte cap had reduced -1000 to -1 on oom_score_adj). A new read-intercept path mirrors the write side. proc_intercept_read and proc_intercept_readv let read/pread/readv/preadv on the OOM nodes return the live atomic value rather than the per-open temp file content, and sendfile/copy_file_range route through the same hook so proc-source byte counts stay consistent with the value an immediately following open would observe. /proc/self/fdinfo gains type-specific lines for the special fd classes elfuse implements: eventfd-count (16-char hex matching fs/eventfd.c), sigmask (16-char hex), and timerfd clockid/ticks/it_value/it_interval. 
The accessors live in src/syscall/fd.c (eventfd_fdinfo_snapshot, signalfd_fdinfo_snapshot, timerfd_fdinfo_snapshot) and read state under sfd_lock to prevent tearing across concurrent read/write/settime. The per-fd lseek probe now uses fd_to_host_dup so a concurrent close+reopen on another vCPU cannot redirect the probe to an unrelated host fd, and errno is saved/restored across the ESPIPE-prone lseek so non-seekable fds (sockets, pipes) do not pollute the caller's state. /proc/self/fdinfo and /proc/self/fd no longer share one static backing directory across opens. The previous design let a second open unlink and recreate entries while a sibling thread iterated its dirfd; both nodes now go through proc_open_fd_scratch, which mkdtemps a private directory per open, populates it from a fresh fd-table snapshot, and tracks the path in proc_scratch_dirs[] for atexit cleanup so the previously-leaked backing dirs are reaped at process exit. The unix-net visitor's buffer-tail margin grew from 128 to 256 bytes to fit the longest possible row (54 fixed + 108 sun_path + newline); the previous margin let the snprintf truncate the path and drop the trailing newline. Eight explicit /proc/<pid>/X cases collapsed into one general alias-and-recurse, so /proc/<pid>/maps, /oom_score_adj, /limits, etc. now route through the matching /proc/self handler. Locked in by tests/test-tier-b.c (35 cases including oom write persistence, out-of-range -EINVAL, oom_adj=15 -> 1000 scaling, oom_score read-only and write-rejected, zero-length writev, stat-size-zero, fdinfo eventfd-count hex, fdinfo sigmask, fdinfo timerfd next expiry for periodic timers, concurrent fdinfo enumeration, and a /proc/net/tcp sl-density regression that opens non-TCP sockets before TCP listeners so the iterator visits rejected sockets first; the post-fix dense sl=0,1,... output matches qemu Linux ground truth, and a manual bug reintroduction confirms the test catches the sparse-slot regression with sl=4 expected=0). 
tests/test-io-opt.c adds sendfile and copy_file_range coverage for the read-intercept path. --- src/runtime/procemu.c | 1666 ++++++++++++++++++++++++++--------------- src/runtime/procemu.h | 19 + src/syscall/fd.c | 150 +++- src/syscall/fd.h | 12 + src/syscall/fs.c | 18 +- src/syscall/io.c | 124 ++- src/syscall/syscall.c | 7 +- tests/test-io-opt.c | 103 +++ tests/test-netstat.c | 31 + tests/test-proc.c | 16 + tests/test-tier-b.c | 758 +++++++++++++++++++ 11 files changed, 2258 insertions(+), 646 deletions(-) diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index 04eb698..f35b163 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -19,6 +19,7 @@ */ #define MAPS_NAME_COLUMN 73 +#include #include #include #include @@ -47,17 +48,18 @@ #include "runtime/thread.h" #include "syscall/abi.h" +#include "syscall/fd.h" #include "syscall/internal.h" #include "syscall/proc.h" #include "syscall/sys.h" /* Return the shared /dev/shm emulation directory, creating it on first call. - * Linux POSIX shm names live in one namespace, so this must not be keyed by - * the host process id. + * Linux POSIX shm names live in one namespace, so this must not be keyed by the + * host process id. * - * Uses a mutex for thread-safe lazy initialization while still allowing - * retries after transient failures. The mkdir+lstat sequence has an inherent - * TOCTOU window, but the lstat ownership check limits the impact to directories + * Uses a mutex for thread-safe lazy initialization while still allowing retries + * after transient failures. The mkdir+lstat sequence has an inherent TOCTOU + * window, but the lstat ownership check limits the impact to directories * already owned by this UID. 
*/ static char shm_dir[128]; @@ -74,8 +76,273 @@ static pthread_mutex_t shm_dir_lock = PTHREAD_MUTEX_INITIALIZER; static char proc_tmpdir[128]; static bool proc_tmpdir_ok; static pthread_mutex_t proc_tmpdir_lock = PTHREAD_MUTEX_INITIALIZER; + +/* OOM range constants from Linux include/uapi/linux/oom.h. */ +#define LINUX_OOM_SCORE_ADJ_MIN (-1000) +#define LINUX_OOM_SCORE_ADJ_MAX 1000 +#define LINUX_OOM_DISABLE (-17) +#define LINUX_OOM_ADJUST_MAX 15 + +/* Process-wide stub for the OOM score adjustment. The legacy oom_adj interface, + * the modern oom_score_adj interface, and the read-only oom_score node all + * derive their displayed values from this single state. + */ static _Atomic int oom_score_adj_value = 0; +/* Serializes backing-fd rewrites so concurrent writers do not race the + * truncate+pwrite sequence that publishes the new value to a same-fd reader. + * The atomic store happens last so a failed rewrite leaves the global state + * unchanged. + */ +static pthread_mutex_t oom_write_lock = PTHREAD_MUTEX_INITIALIZER; + +enum { + OOM_PATH_NONE = 0, + OOM_PATH_SCORE_ADJ, /* /proc/self/oom_score_adj: writable, [-1000, 1000] */ + OOM_PATH_ADJ, /* /proc/self/oom_adj: legacy, writable, [-17, 15] */ + OOM_PATH_SCORE, /* /proc/self/oom_score: read-only computed score */ +}; + +static int proc_oom_path_kind(const char *path) +{ + if (!strcmp(path, "/proc/self/oom_score_adj")) + return OOM_PATH_SCORE_ADJ; + if (!strcmp(path, "/proc/self/oom_adj")) + return OOM_PATH_ADJ; + if (!strcmp(path, "/proc/self/oom_score")) + return OOM_PATH_SCORE; + return OOM_PATH_NONE; +} + +/* Linux fs/proc/base.c oom_adj_write: a write to oom_adj is scaled into the + * [-1000, 1000] oom_score_adj domain. The kernel special-cases both boundary + * values so the "disable" and "max" semantics survive the lossy multiply that + * would otherwise round 15*1000/17 to 882 and lose the "kill me first" intent. 
+ */ +static int oom_adj_to_score_adj(int v) +{ + if (v == LINUX_OOM_DISABLE) + return LINUX_OOM_SCORE_ADJ_MIN; + if (v == LINUX_OOM_ADJUST_MAX) + return LINUX_OOM_SCORE_ADJ_MAX; + return v * LINUX_OOM_SCORE_ADJ_MAX / -LINUX_OOM_DISABLE; +} + +/* Inverse of oom_adj_to_score_adj for legacy oom_adj reads. Clamp to the legacy + * [-17, 15] range so values outside the representable space (e.g. a guest that + * wrote -1000 to oom_score_adj) do not surprise readers. + */ +static int oom_score_adj_to_adj(int v) +{ + int s = v * -LINUX_OOM_DISABLE / LINUX_OOM_SCORE_ADJ_MAX; + if (s < LINUX_OOM_DISABLE) + s = LINUX_OOM_DISABLE; + if (s > LINUX_OOM_ADJUST_MAX) + s = LINUX_OOM_ADJUST_MAX; + return s; +} + +static int proc_oom_format_value(int kind, char *buf, size_t bufsz) +{ + int score_adj = atomic_load(&oom_score_adj_value); + int val = 0; + if (kind == OOM_PATH_SCORE_ADJ) + val = score_adj; + else if (kind == OOM_PATH_ADJ) + val = oom_score_adj_to_adj(score_adj); + return snprintf(buf, bufsz, "%d\n", val); +} + +static int proc_oom_copy_slice(char *dst, + size_t count, + int64_t offset, + const char *src, + size_t src_len, + ssize_t *read_out) +{ + if (offset < 0) { + errno = EINVAL; + return -1; + } + if ((uint64_t) offset >= src_len) { + *read_out = 0; + return 1; + } + + size_t avail = src_len - (size_t) offset; + size_t n = count < avail ? count : avail; + memcpy(dst, src + offset, n); + *read_out = (ssize_t) n; + return 1; +} + +typedef struct { + int fd; + int kind; +} proc_oom_live_fd_t; + +/* OOM proc nodes are opened on per-open temp files so lseek/pread semantics + * work naturally. After any successful write, republish the current formatted + * value into every still-open OOM fd so a later seek+read on another fd does + * not observe the stale snapshot that was materialized at open time. 
+ */ +static void proc_oom_refresh_live_fds_locked(void) +{ + proc_oom_live_fd_t live[FD_TABLE_SIZE]; + int nlive = 0; + + pthread_mutex_lock(&fd_lock); + for (int i = 0; i < FD_TABLE_SIZE; i++) { + int kind = proc_oom_path_kind(fd_table[i].proc_path); + if (kind == OOM_PATH_NONE || fd_table[i].type == FD_CLOSED) + continue; + + int dup_fd = dup(fd_table[i].host_fd); + if (dup_fd < 0) + continue; + + live[nlive].fd = dup_fd; + live[nlive].kind = kind; + nlive++; + } + pthread_mutex_unlock(&fd_lock); + + for (int i = 0; i < nlive; i++) { + char text[32]; + int len = proc_oom_format_value(live[i].kind, text, sizeof(text)); + if (len > 0 && (size_t) len < sizeof(text)) { + /* Rewrite the backing temp file as defense in depth for any code + * path that might bypass proc_intercept_read and fall through to + * host read(). The dup'd fd shares the open file description with + * the guest's fd, so a paired lseek to "restore" the offset would + * clobber a concurrent reader's position; skip the offset dance and + * let proc_intercept_read (which always pulls from the atomic) be + * the source of truth for offset-aware reads. + */ + if (ftruncate(live[i].fd, 0) == 0) + pwrite(live[i].fd, text, (size_t) len, 0); + } + close(live[i].fd); + } +} + +static int proc_open_dir_fd(const char *path, int linux_flags); +static int proc_lazy_mkdtemp(char *buf, size_t buf_size, const char *template); +static int append_proc_net_row(char *buf, + size_t bufsz, + int off, + bool want_tcp, + int sl, + const char laddr[33], + uint16_t lport, + const char raddr[33], + uint16_t rport, + int st); +static void format_proc_net_addr(char out[33], + const struct in_sockinfo *ini, + int local, + int v6); + +/* Per-open scratch dirs for /proc/self/fd and /proc/self/fdinfo. + * + * The previous design shared one host directory across every open, which meant + * a second open could unlink/recreate entries while the first opener was + * mid-getdents on its dirfd. 
Each open now allocates its own mkdtemp dir, so + * concurrent enumerations cannot mutate one another. + * + * The tracker keeps the paths so an atexit hook can rmdir them at process exit. + * The capacity is a soft cap: callers that exceed it leak the dir to /tmp + * (cleared on host reboot or by tmp janitors). + */ +#define PROC_SCRATCH_DIRS_MAX 128 +static char proc_scratch_dirs[PROC_SCRATCH_DIRS_MAX][80]; +static int proc_scratch_dirs_count; +static pthread_mutex_t proc_scratch_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_once_t proc_scratch_atexit_once = PTHREAD_ONCE_INIT; + +static void proc_scratch_remove_one(const char *dir) +{ + DIR *d = opendir(dir); + if (d) { + struct dirent *ent; + char path[160]; + while ((ent = readdir(d))) { + if (ent->d_name[0] == '.' && + (ent->d_name[1] == '\0' || + (ent->d_name[1] == '.' && ent->d_name[2] == '\0'))) + continue; + int n = snprintf(path, sizeof(path), "%s/%s", dir, ent->d_name); + if (n > 0 && (size_t) n < sizeof(path)) + unlink(path); + } + closedir(d); + } + rmdir(dir); +} + +static void proc_scratch_cleanup_atexit(void) +{ + pthread_mutex_lock(&proc_scratch_lock); + for (int i = 0; i < proc_scratch_dirs_count; i++) + proc_scratch_remove_one(proc_scratch_dirs[i]); + proc_scratch_dirs_count = 0; + pthread_mutex_unlock(&proc_scratch_lock); +} + +static void proc_scratch_register_atexit(void) +{ + atexit(proc_scratch_cleanup_atexit); +} + +/* Open a per-call scratch directory populated with one empty file per live + * guest fd. Returns a host dirfd on success, -1 on failure with errno set. + * + * The dirfd is the standard backing for getdents on this synthetic listing. + * Two concurrent openers get two independent dirs, so neither mutates the + * other's enumeration. 
+ */ +static int proc_open_fd_scratch(const char *prefix, int linux_flags) +{ + char dir[80]; + int n = snprintf(dir, sizeof(dir), "/tmp/%s-XXXXXX", prefix); + if (n < 0 || (size_t) n >= sizeof(dir)) { + errno = ENAMETOOLONG; + return -1; + } + if (!mkdtemp(dir)) + return -1; + + for (int i = 0; i < FD_TABLE_SIZE; i++) { + fd_entry_t snap; + if (!fd_snapshot(i, &snap)) + continue; + char entry[160]; + int en = snprintf(entry, sizeof(entry), "%s/%d", dir, i); + if (en <= 0 || (size_t) en >= sizeof(entry)) + continue; + int tfd = open(entry, O_CREAT | O_WRONLY, 0444); + if (tfd >= 0) + close(tfd); + } + + pthread_once(&proc_scratch_atexit_once, proc_scratch_register_atexit); + + pthread_mutex_lock(&proc_scratch_lock); + if (proc_scratch_dirs_count < PROC_SCRATCH_DIRS_MAX) { + str_copy_trunc(proc_scratch_dirs[proc_scratch_dirs_count++], dir, + sizeof(proc_scratch_dirs[0])); + } + pthread_mutex_unlock(&proc_scratch_lock); + + int fd = proc_open_dir_fd(dir, linux_flags); + if (fd < 0) { + int saved = errno; + proc_scratch_remove_one(dir); + errno = saved; + } + return fd; +} + /* atexit cleanup: remove snapshot files and the temp directory tree. */ static void proc_tmpdir_cleanup(void) { @@ -190,12 +457,12 @@ static int proc_synthetic_fd(const void *data, size_t len) return fd; } -/* Lazy mkdtemp into a caller-provided buffer. Returns 0 on success (buf - * holds the path), or -1 on failure (buf[0] reset to '\0'). +/* Lazy mkdtemp into a caller-provided buffer. Returns 0 on success (buf holds + * the path), or -1 on failure (buf[0] reset to '\0'). * - * Caller must hold the lock that protects buf, since the helper runs the - * "is buf empty?" check and mkdtemp non-atomically. The created directory - * is reused across calls until process exit. + * Caller must hold the lock that protects buf, since the helper runs the "is + * buf empty?" check and mkdtemp non-atomically. The created directory is reused + * across calls until process exit. 
*/ static int proc_lazy_mkdtemp(char *buf, size_t buf_size, const char *template) { @@ -222,6 +489,356 @@ static int proc_synthetic_fd_str(const char *buf, int snprintf_ret, size_t cap) return proc_synthetic_fd(buf, (size_t) snprintf_ret); } +/* Format a string into a stack buffer and return the synthetic fd in one + * step. Collapses the recurring three-line pattern: + * char buf[N]; + * int len = snprintf(buf, sizeof(buf), fmt, ...); + * return proc_synthetic_fd_str(buf, len, sizeof(buf)); + * 4096-byte cap is the largest formatted /proc payload elfuse emits via this + * helper (the few handlers that exceed it -- /proc/self/maps, /proc/net/tcp + * -- build their output incrementally and call proc_synthetic_fd directly). + */ +__attribute__((format(printf, 1, 2))) static int proc_emit_fmt(const char *fmt, + ...) +{ + char buf[4096]; + va_list ap; + va_start(ap, fmt); + int n = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + return proc_synthetic_fd_str(buf, n, sizeof(buf)); +} + +/* Emit a fixed string literal as a synthetic fd. Used for the handlers that + * return identical content every time (mountinfo, filesystems, /proc/sys + * constants); avoids allocating a stack buffer when there is nothing to format. + */ +static int proc_emit_literal(const char *s) +{ + return proc_synthetic_fd(s, strlen(s)); +} + +/* Return the basename of the loaded ELF binary, falling back to "elfuse" when + * the path is unavailable. Matches the comm-name semantic Linux uses for + * /proc//comm and the second field of /proc//stat. Storage is owned + * by proc_get_elf_path() (stable for process lifetime) or the literal fallback; + * caller must not free. + */ +static const char *proc_comm_name(void) +{ + const char *exe = proc_get_elf_path(); + if (!exe) + return "elfuse"; + const char *slash = strrchr(exe, '/'); + return slash ? slash + 1 : exe; +} + +/* Parse the numeric tail of a /proc/.../ or /dev/fd/ path. 
+ * prefix_len is the length of the leading literal that the caller already + * matched with strncmp. Returns the parsed fd on success, or -1 with errno set + * to errno_on_invalid for any malformed input or out-of-range index. + */ +static int proc_parse_fd_index(const char *path, + size_t prefix_len, + int errno_on_invalid) +{ + char *endp; + long n = strtol(path + prefix_len, &endp, 10); + if (endp == path + prefix_len || *endp != '\0' || n < 0 || + n >= FD_TABLE_SIZE) { + errno = errno_on_invalid; + return -1; + } + return (int) n; +} + +/* Resolve a /dev/shm/ guest path to a host path inside the per-UID shm + * dir. Rejects empty, traversing, or compound suffixes with EACCES; reports + * ENAMETOOLONG when the host path overflows. The same validation runs in + * proc_intercept_open and proc_intercept_stat, so the helper is one source of + * truth for the security gate. + */ +static int dev_shm_resolve_path(const char *guest_suffix, + char *host_path, + size_t host_path_sz) +{ + const char *shm = shm_dir_path(); + if (!shm) + return -1; + if (strstr(guest_suffix, "..") || strchr(guest_suffix, '/') || + guest_suffix[0] == '\0') { + errno = EACCES; + return -1; + } + int n = snprintf(host_path, host_path_sz, "%s/%s", shm, guest_suffix); + if (n < 0 || (size_t) n >= host_path_sz) { + errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +/* Populate *st for a synthetic /proc directory entry. */ +static void stat_fill_proc_dir(struct stat *st, mode_t mode, nlink_t nlink) +{ + memset(st, 0, sizeof(*st)); + st->st_mode = S_IFDIR | mode; + st->st_nlink = nlink; +} + +/* Resolve a /dev/fd/ or /proc/self/fd/ path to a fresh dup() of the + * underlying host fd. prefix_len is the length of the matched literal (8 for + * "/dev/fd/", 14 for "/proc/self/fd/"). Returns the dup or -1 with errno=EBADF + * for malformed indices or closed slots. 
+ * + * fd_to_host_dup duplicates the host fd atomically under fd_lock so a + * concurrent close+reopen on another vCPU cannot redirect the dup to an + * unrelated host object that took the freed slot. + */ +static int dev_fd_dup(const char *path, size_t prefix_len) +{ + int n = proc_parse_fd_index(path, prefix_len, EBADF); + if (n < 0) + return -1; + int dup_fd = fd_to_host_dup(n); + if (dup_fd < 0) { + errno = EBADF; + return -1; + } + return dup_fd; +} + +/* If path matches /proc/[/...], rewrite into alias as /proc/self[...] + * Used by both proc_intercept_open and proc_intercept_stat so the explicit-pid + * form aliases through the same /proc/self handlers (Linux treats them + * equivalent for the calling process). The trailing-character constraint + * admits the bare /proc/ directory and /proc//X files alike. + * + * Returns 1 when alias was rewritten (caller should recurse on alias), 0 when + * path is not a self-alias (caller continues with other handlers), or -1 with + * errno=ENAMETOOLONG when the rewrite would overflow alias_sz (matches Linux + * semantics for paths > PATH_MAX rather than letting the intercept fall through + * to a host syscall that would silently fail). + */ +static int proc_alias_self(const char *path, char *alias, size_t alias_sz) +{ + if (strncmp(path, "/proc/", 6) != 0) + return 0; + char *endp; + long pid = strtol(path + 6, &endp, 10); + if (endp == path + 6 || pid != (long) proc_get_pid()) + return 0; + if (*endp != '\0' && *endp != '/') + return 0; + int n = snprintf(alias, alias_sz, "/proc/self%s", endp); + if (n < 0 || (size_t) n >= alias_sz) { + errno = ENAMETOOLONG; + return -1; + } + return 1; +} + +/* Populate *st for a synthetic /proc regular-file entry. Linux reports + * st_size = 0 for proc nodes; mirroring that forces readers to drain to EOF + * instead of pre-sizing buffers from a stale value. 
+ */ +static void stat_fill_proc_file(struct stat *st, mode_t mode) +{ + memset(st, 0, sizeof(*st)); + st->st_mode = S_IFREG | mode; + st->st_nlink = 1; + st->st_size = 0; + st->st_blksize = 4096; + st->st_blocks = 0; +} + +/* Visitor signature for proc_net_for_each_socket below. Returning false stops + * the iteration (used when the caller's output buffer is full). + * sinfo: kernel socket info for the current fd + * pid: pid that owns the fd (self or a fork child) + * fd_index: index within that pid's fdinfo list (used by /proc/net/unix + * to synthesize a fake-but-stable inode number) + * + * /proc/net/tcp's "sl" column must be dense, counting only emitted rows (not + * inspected sockets), so the iterator deliberately omits a global serial + * counter. Visitors that need one track it inside their own ctx and increment + * it only after a successful emit. + */ +typedef bool (*proc_net_socket_visitor)(const struct socket_fdinfo *sinfo, + pid_t pid, + int fd_index, + void *ctx); + +/* Walk every socket fd across self plus active fork children, invoking visit + * once per socket. Centralizes the proc_pidinfo + proc_pidfdinfo scaffolding + * shared by /proc/net/{tcp,udp,raw}{,6} and /proc/net/unix. 
+ */ +static void proc_net_for_each_socket(proc_net_socket_visitor visit, void *ctx) +{ + pid_t pids[PROC_TABLE_SIZE + 1]; + pids[0] = getpid(); + int npids = 1 + proc_get_child_pids(pids + 1, PROC_TABLE_SIZE); + + for (int p = 0; p < npids; p++) { + struct proc_fdinfo fdinfo[512]; + int fdsz = + proc_pidinfo(pids[p], PROC_PIDLISTFDS, 0, fdinfo, sizeof(fdinfo)); + if (fdsz <= 0) + continue; + int nfds = fdsz / (int) PROC_PIDLISTFD_SIZE; + for (int fi = 0; fi < nfds; fi++) { + if (fdinfo[fi].proc_fdtype != PROX_FDTYPE_SOCKET) + continue; + struct socket_fdinfo sinfo; + int sz = + proc_pidfdinfo(pids[p], fdinfo[fi].proc_fd, + PROC_PIDFDSOCKETINFO, &sinfo, sizeof(sinfo)); + if (sz < (int) sizeof(sinfo)) + continue; + if (!visit(&sinfo, pids[p], fi, ctx)) + return; + } + } +} + +/* Visitor context + callback for /proc/net/{tcp,udp,raw}{,6}. + * sl counts only emitted rows so the "sl" column stays dense even when the + * iterator visits other-family sockets that the visitor filters out. + */ +struct proc_net_inet_ctx { + char *buf; + size_t bufsz; + int off; + int sl; + int want_af; + int want_stype; + bool want_tcp; + bool want_v6; +}; + +/* Map macOS TSI_S_* socket states (returned in tcp_connection_info.state) + * to the 1-based hex values Linux /proc/net/tcp uses (ESTABLISHED=01, + * LISTEN=0A, etc.). Indexed by macOS state ordinal. + */ +static int proc_net_tcp_state_linux(int kstate) +{ + static const int state_map[] = { + 0x07, /* 0: CLOSED */ + 0x0A, /* 1: LISTEN */ + 0x02, /* 2: SYN_SENT */ + 0x03, /* 3: SYN_RECEIVED */ + 0x01, /* 4: ESTABLISHED */ + 0x08, /* 5: CLOSE_WAIT */ + 0x04, /* 6: FIN_WAIT_1 */ + 0x06, /* 7: CLOSING */ + 0x09, /* 8: LAST_ACK */ + 0x05, /* 9: FIN_WAIT_2 */ + 0x0B, /* 10: TIME_WAIT */ + }; + return RANGE_CHECK(kstate, 0, 11) ? 
state_map[kstate] : 0x07; +} + +static bool proc_net_inet_visit(const struct socket_fdinfo *sinfo, + pid_t pid, + int fd_index, + void *ctx_v) +{ + (void) pid; + (void) fd_index; + struct proc_net_inet_ctx *c = ctx_v; + if (c->off >= (int) c->bufsz - 256) + return false; + if (sinfo->psi.soi_family != c->want_af || + sinfo->psi.soi_type != c->want_stype) + return true; + + const struct in_sockinfo *ini = + c->want_tcp ? &sinfo->psi.soi_proto.pri_tcp.tcpsi_ini + : &sinfo->psi.soi_proto.pri_in; + char laddr[33], raddr[33]; + format_proc_net_addr(laddr, ini, 1, c->want_v6); + format_proc_net_addr(raddr, ini, 0, c->want_v6); + int st = + c->want_tcp + ? proc_net_tcp_state_linux(sinfo->psi.soi_proto.pri_tcp.tcpsi_state) + : 0x07; + c->off = append_proc_net_row(c->buf, c->bufsz, c->off, c->want_tcp, c->sl, + laddr, ntohs(ini->insi_lport), raddr, + ntohs(ini->insi_fport), st); + c->sl++; + return true; +} + +/* Visitor context + callback for /proc/net/unix. */ +struct proc_net_unix_ctx { + char *buf; + size_t bufsz; + int off; +}; + +/* Lock-protected handle to a persistent /tmp directory used to back synthetic + * /proc subdirectories whose contents must repopulate per open (e.g. + * /proc/self/task with its dynamic TID set). The static buffer + lazy mkdtemp + * pattern is shared by multiple handlers so the helper keeps one source of + * truth for the locking and creation order. + */ +typedef struct { + char path[128]; + pthread_mutex_t lock; + const char *template; +} proc_persistent_dir_t; + +#define PROC_PERSISTENT_DIR(prefix) \ + {.path = {0}, .lock = PTHREAD_MUTEX_INITIALIZER, .template = prefix} + +/* Acquire the persistent dir's lock and ensure the dir exists. Caller owns the + * lock until proc_persistent_dir_release(). Returns the directory path or NULL + * on failure (lock released, errno set). 
+ */ +static const char *proc_persistent_dir_acquire(proc_persistent_dir_t *d) +{ + pthread_mutex_lock(&d->lock); + if (proc_lazy_mkdtemp(d->path, sizeof(d->path), d->template) < 0) { + pthread_mutex_unlock(&d->lock); + return NULL; + } + return d->path; +} + +static void proc_persistent_dir_release(proc_persistent_dir_t *d) +{ + pthread_mutex_unlock(&d->lock); +} + +static bool proc_net_unix_visit(const struct socket_fdinfo *sinfo, + pid_t pid, + int fd_index, + void *ctx_v) +{ + (void) pid; + struct proc_net_unix_ctx *c = ctx_v; + /* A unix row is up to 56 bytes of fixed format plus a sun_path of + * up to 108 bytes plus the trailing newline -- ~165 bytes worst + * case. The 128-byte margin previously inherited from the inline + * loop could leave a half-formatted row at the buffer tail; 256 + * matches the inet visitor and covers the longest possible path. + */ + if (c->off >= (int) c->bufsz - 256) + return false; + if (sinfo->psi.soi_family != AF_UNIX) + return true; + int stype = sinfo->psi.soi_type; + int lt = (stype == SOCK_STREAM) ? 1 + : (stype == SOCK_DGRAM) ? 2 + : (stype == SOCK_SEQPACKET) ? 5 + : 1; + const char *spath = sinfo->psi.soi_proto.pri_un.unsi_addr.ua_sun.sun_path; + c->off += snprintf(c->buf + c->off, c->bufsz - (size_t) c->off, + "%016X: %08X %08X %08X %04X %02X %5d %s\n", 0, 3, 0, 0, + lt, 3, 10000 + fd_index, spath[0] ? 
spath : ""); + return true; +} + static int append_proc_net_row(char *buf, size_t bufsz, int off, @@ -299,17 +916,6 @@ static int proc_open_numbered_dir(const char *dir, int64_t id, int linux_flags) return proc_open_dir_fd(path, linux_flags); } -static int proc_is_oom_writable(const char *path) -{ - return !strcmp(path, "/proc/self/oom_score_adj") || - !strcmp(path, "/proc/self/oom_adj"); -} - -static int proc_is_oom_path(const char *path) -{ - return proc_is_oom_writable(path) || !strcmp(path, "/proc/self/oom_score"); -} - static int copy_fd_to_path(int src_fd, const char *path) { int out = open(path, O_CREAT | O_TRUNC | O_WRONLY, 0444); @@ -366,6 +972,17 @@ static void populate_proc_snapshot(const guest_t *g, close(fd); } +static void populate_proc_placeholder(const char *dir, const char *name) +{ + char path[LINUX_PATH_MAX]; + if (snprintf(path, sizeof(path), "%s/%s", dir, name) >= (int) sizeof(path)) + return; + + int fd = open(path, O_CREAT | O_TRUNC | O_WRONLY, 0444); + if (fd >= 0) + close(fd); +} + static void format_proc_net_addr(char out[33], const struct in_sockinfo *ini, int local, @@ -427,6 +1044,16 @@ static const char *ensure_proc_tmpdir(const guest_t *g) snprintf(taskdir, sizeof(taskdir), "%s/task", piddir); mkdir(taskdir, 0755); + char netdir[128]; + snprintf(netdir, sizeof(netdir), "%s/net", proc_tmpdir); + if (mkdir(netdir, 0755) == 0 || errno == EEXIST) { + static const char *net_files[] = { + "tcp", "tcp6", "udp", "udp6", "raw", "raw6", "unix", NULL, + }; + for (const char **name = net_files; *name; name++) + populate_proc_placeholder(netdir, *name); + } + char exepath[128]; snprintf(exepath, sizeof(exepath), "%s/exe", piddir); const char *exe = proc_get_elf_path(); @@ -455,21 +1082,6 @@ static void proc_task_collect_cb(thread_entry_t *t, void *arg) c->tids[c->ntids++] = t->guest_tid; } -static char fddir[128]; -static pthread_mutex_t fddir_lock = PTHREAD_MUTEX_INITIALIZER; - -static void cleanup_fddir(void) -{ - if (fddir[0] != '\0') { - for 
(int i = 0; i < FD_TABLE_SIZE; i++) { - char entry[192]; - snprintf(entry, sizeof(entry), "%s/%d", fddir, i); - unlink(entry); - } - rmdir(fddir); - } -} - int proc_intercept_open(const guest_t *g, const char *path, int linux_flags, @@ -504,8 +1116,7 @@ int proc_intercept_open(const guest_t *g, */ int oflags = host_accmode | (translate_open_flags(linux_flags) & (O_NONBLOCK | O_CLOEXEC)); - int fd = open(host_dev, oflags); - return fd >= 0 ? fd : -1; + return open(host_dev, oflags); } /* /dev/shm -> tmpfs-backed host temp directory. @@ -513,32 +1124,19 @@ int proc_intercept_open(const guest_t *g, * Redirect to one shared host namespace so named shm works across elfuse * processes and fork children. */ - if (!strcmp(path, "/dev/shm") || !strncmp(path, "/dev/shm/", 9)) { + if (!strcmp(path, "/dev/shm")) { const char *shm = shm_dir_path(); - if (!shm) - return -1; - if (!strcmp(path, "/dev/shm")) - return proc_open_dir_fd(shm, linux_flags); - /* /dev/shm/name -> /tmp/elfuse-shm-UID/name - * Reject any path component traversal: "..", "/", or leading "/" - */ - const char *suffix = path + 9; - if (strstr(suffix, "..") || strchr(suffix, '/') || suffix[0] == '\0') { - errno = EACCES; - return -1; - } + return shm ? proc_open_dir_fd(shm, linux_flags) : -1; + } + if (!strncmp(path, "/dev/shm/", 9)) { char host_path[512]; - int n = snprintf(host_path, sizeof(host_path), "%s/%s", shm, suffix); - if (n < 0 || (size_t) n >= sizeof(host_path)) { - errno = ENAMETOOLONG; + if (dev_shm_resolve_path(path + 9, host_path, sizeof(host_path)) < 0) return -1; - } int oflags = translate_open_flags(linux_flags); /* O_NOFOLLOW: do not follow symlinks created by the guest inside the * shm directory (prevents symlink-based escape). */ - int fd = open(host_path, oflags | O_NOFOLLOW, mode); - return fd >= 0 ? 
fd : -1; + return open(host_path, oflags | O_NOFOLLOW, mode); } /* /dev/stdin -> dup(0), /dev/stdout -> dup(1), /dev/stderr -> dup(2) */ @@ -550,21 +1148,8 @@ int proc_intercept_open(const guest_t *g, return dup(STDERR_FILENO); /* /dev/fd/N -> dup(N) */ - if (!strncmp(path, "/dev/fd/", 8)) { - char *endptr; - long n = strtol(path + 8, &endptr, 10); - if (endptr == path + 8 || *endptr != '\0' || n < 0 || - n >= FD_TABLE_SIZE) { - errno = EBADF; - return -1; - } - int host_fd = fd_to_host((int) n); - if (host_fd < 0) { - errno = EBADF; - return -1; - } - return dup(host_fd); - } + if (!strncmp(path, "/dev/fd/", 8)) + return dev_fd_dup(path, 8); /* /proc -> synthetic directory with PID entries for busybox ps, top, etc. * Creates a temp dir once (cached for the process lifetime) with entries @@ -576,8 +1161,7 @@ int proc_intercept_open(const guest_t *g, const char *dir = ensure_proc_tmpdir(g); if (!dir) return -1; - int fd = proc_open_dir_fd(dir, linux_flags); - return fd >= 0 ? fd : -1; + return proc_open_dir_fd(dir, linux_flags); } /* /proc/self -> directory fd for the PID subdirectory */ @@ -585,87 +1169,53 @@ int proc_intercept_open(const guest_t *g, const char *dir = ensure_proc_tmpdir(g); if (!dir) return -1; - int fd = proc_open_numbered_dir(dir, proc_get_pid(), linux_flags); - return fd >= 0 ? fd : -1; + return proc_open_numbered_dir(dir, proc_get_pid(), linux_flags); } /* /proc/self/fd -> directory listing of guest-visible file descriptors. - * Use a persistent temp directory because macOS getdents-backed callers - * need real directory entries for fchdir/readdir to work. + * Each open gets its own scratch dir so concurrent enumerations cannot + * mutate one another (see proc_open_fd_scratch). 
*/ if (!strcmp(path, "/proc/self/fd") || !strcmp(path, "/proc/self/fd/")) { - pthread_mutex_lock(&fddir_lock); - if (fddir[0] == '\0') { - if (proc_lazy_mkdtemp(fddir, sizeof(fddir), - "/tmp/elfuse-fd-XXXXXX") < 0) { - pthread_mutex_unlock(&fddir_lock); - return -1; - } - atexit(cleanup_fddir); - } + return proc_open_fd_scratch("elfuse-fd", linux_flags); + } - for (int i = 0; i < FD_TABLE_SIZE; i++) { - char entry[192]; - snprintf(entry, sizeof(entry), "%s/%d", fddir, i); - fd_entry_t snap; - if (fd_snapshot(i, &snap)) { - int tfd = open(entry, O_CREAT | O_WRONLY, 0444); - if (tfd >= 0) - close(tfd); - } else { - unlink(entry); - } + if (!strcmp(path, "/proc/net") || !strcmp(path, "/proc/net/")) { + const char *dir = ensure_proc_tmpdir(g); + if (!dir) + return -1; + char netdir[LINUX_PATH_MAX]; + if (snprintf(netdir, sizeof(netdir), "%s/net", dir) >= + (int) sizeof(netdir)) { + errno = ENAMETOOLONG; + return -1; } + return proc_open_dir_fd(netdir, linux_flags); + } - int fd = proc_open_dir_fd(fddir, linux_flags); - pthread_mutex_unlock(&fddir_lock); - return fd >= 0 ? fd : -1; + /* /proc/[/...] -> /proc/self[...]. Returns -1 on + * ENAMETOOLONG so the guest sees the same error a real Linux kernel + * would produce instead of falling through to a host syscall. 
+ */ + { + char alias[LINUX_PATH_MAX]; + int aliased = proc_alias_self(path, alias, sizeof(alias)); + if (aliased < 0) + return -1; + if (aliased > 0) + return proc_intercept_open(g, alias, linux_flags, mode); } - /* /proc//stat -> redirect to /proc/self/stat for the current PID */ - if (!strncmp(path, "/proc/", 6)) { - char *endp; - long pid = strtol(path + 6, &endp, 10); - if (endp != path + 6 && pid == (long) proc_get_pid()) { - /* Rewrite /proc//X to /proc/self/X and recurse */ - if (!strncmp(endp, "/stat", 5) && endp[5] == '\0') - return proc_intercept_open(g, "/proc/self/stat", linux_flags, - mode); - if (!strncmp(endp, "/status", 7) && endp[7] == '\0') - return proc_intercept_open(g, "/proc/self/status", linux_flags, - mode); - if (!strncmp(endp, "/cmdline", 8) && endp[8] == '\0') - return proc_intercept_open(g, "/proc/self/cmdline", linux_flags, - mode); - if (!strncmp(endp, "/exe", 4) && endp[4] == '\0') - return proc_intercept_open(g, "/proc/self/exe", linux_flags, - mode); - if (!strncmp(endp, "/environ", 8) && endp[8] == '\0') - return proc_intercept_open(g, "/proc/self/environ", linux_flags, - mode); - if (!strncmp(endp, "/auxv", 5) && endp[5] == '\0') - return proc_intercept_open(g, "/proc/self/auxv", linux_flags, - mode); - if (!strncmp(endp, "/task", 5) && - (endp[5] == '\0' || endp[5] == '/')) { - char redir[128]; - snprintf(redir, sizeof(redir), "/proc/self/task%s", endp + 5); - return proc_intercept_open(g, redir, linux_flags, mode); - } - if (!strncmp(endp, "/fd", 3) && - (endp[3] == '\0' || endp[3] == '/')) { - char redir[128]; - snprintf(redir, sizeof(redir), "/proc/self/fd%s", endp + 3); - return proc_intercept_open(g, redir, linux_flags, mode); - } - if (!strcmp(endp, "") || !strcmp(endp, "/")) { - const char *dir = ensure_proc_tmpdir(g); - if (!dir) - return -1; - int fd = - proc_open_numbered_dir(dir, proc_get_pid(), linux_flags); - return fd >= 0 ? 
fd : -1; - } + int oom_kind = proc_oom_path_kind(path); + if (oom_kind == OOM_PATH_SCORE) { + /* Mirror the non-root Linux open contract for the 0444 proc node: + * reject writable opens immediately instead of letting the write path + * fail later against a synthetic temp file. + */ + int oom_accmode = translate_open_flags(linux_flags) & O_ACCMODE; + if (oom_accmode != O_RDONLY) { + errno = EACCES; + return -1; } } @@ -679,8 +1229,7 @@ int proc_intercept_open(const guest_t *g, errno = ENOENT; return -1; } - int fd = open(exe, O_RDONLY); - return fd >= 0 ? fd : -1; + return open(exe, O_RDONLY); } /* /proc/cpuinfo -> synthetic file with CPU count. @@ -734,20 +1283,10 @@ int proc_intercept_open(const guest_t *g, } vm_rss_kb /= 1024; - /* Extract basename from ELF path for the Name field (Linux uses the - * comm name, which is basename truncated to 15 chars) - */ - const char *exe = proc_get_elf_path(); - const char *name = "elfuse"; - if (exe) { - const char *slash = strrchr(exe, '/'); - name = slash ? slash + 1 : exe; - } - + /* Linux uses the comm name (basename truncated to 15 chars). */ + const char *name = proc_comm_name(); int threads = thread_active_count(); - char buf[2048]; - int len = snprintf( - buf, sizeof(buf), + return proc_emit_fmt( "Name:\t%.15s\n" "State:\tR (running)\n" "Tgid:\t%lld\n" @@ -764,7 +1303,6 @@ int proc_intercept_open(const guest_t *g, GUEST_UID, GUEST_GID, GUEST_GID, GUEST_GID, GUEST_GID, (unsigned long long) vm_size_kb, (unsigned long long) vm_size_kb, (unsigned long long) vm_rss_kb, threads); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); } /* /proc/self/limits -> resource limits from prlimit64 cache */ @@ -812,31 +1350,25 @@ int proc_intercept_open(const guest_t *g, * dirs returns empty. Uses a static path cleaned up at exit. 
*/ if (!strcmp(path, "/proc/self/task") || !strcmp(path, "/proc/self/task/")) { - static char taskdir[128]; - static pthread_mutex_t taskdir_lock = PTHREAD_MUTEX_INITIALIZER; - - pthread_mutex_lock(&taskdir_lock); - if (proc_lazy_mkdtemp(taskdir, sizeof(taskdir), - "/tmp/elfuse-task-XXXXXX") < 0) { - pthread_mutex_unlock(&taskdir_lock); + static proc_persistent_dir_t taskdir = + PROC_PERSISTENT_DIR("/tmp/elfuse-task-XXXXXX"); + const char *dir = proc_persistent_dir_acquire(&taskdir); + if (!dir) return -1; - } int64_t tids[MAX_THREADS]; proc_task_collect_ctx_t ctx = {tids, 0}; thread_for_each(proc_task_collect_cb, &ctx); - for (int i = 0; i < ctx.ntids; i++) { char tidpath[128]; - snprintf(tidpath, sizeof(tidpath), "%s/%lld", taskdir, + snprintf(tidpath, sizeof(tidpath), "%s/%lld", dir, (long long) tids[i]); mkdir(tidpath, 0755); } - int fd = proc_open_dir_fd(taskdir, linux_flags); - pthread_mutex_unlock(&taskdir_lock); - - return fd >= 0 ? fd : -1; + int fd = proc_open_dir_fd(dir, linux_flags); + proc_persistent_dir_release(&taskdir); + return fd; } /* /proc/self/task//stat -> per-thread stat line */ @@ -853,75 +1385,51 @@ int proc_intercept_open(const guest_t *g, } if (!strcmp(endp, "/stat")) { - const char *exe = proc_get_elf_path(); - const char *name = "elfuse"; - if (exe) { - const char *slash = strrchr(exe, '/'); - name = slash ? 
slash + 1 : exe; - } - char buf[512]; - int len = - snprintf(buf, sizeof(buf), - "%ld (%.15s) R %lld %lld %lld 0 0 0 0 0 0 0 0 0 0 0 " - "20 0 %d 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " - "0 0 0 0 0 0 0 0\n", - tid, name, (long long) proc_get_ppid(), - (long long) proc_get_pid(), /* pgid */ - (long long) proc_get_sid(), thread_active_count()); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); + return proc_emit_fmt( + "%ld (%.15s) R %lld %lld %lld 0 0 0 0 0 0 0 0 0 0 0 " + "20 0 %d 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " + "0 0 0 0 0 0 0 0\n", + tid, proc_comm_name(), (long long) proc_get_ppid(), + (long long) proc_get_pid(), /* pgid */ + (long long) proc_get_sid(), thread_active_count()); } if (!strcmp(endp, "/status")) { - const char *exe = proc_get_elf_path(); - const char *name = "elfuse"; - if (exe) { - const char *slash = strrchr(exe, '/'); - name = slash ? slash + 1 : exe; - } - char buf[1024]; - int len = - snprintf(buf, sizeof(buf), - "Name:\t%.15s\n" - "State:\tR (running)\n" - "Tgid:\t%lld\n" - "Pid:\t%ld\n" - "PPid:\t%lld\n" - "Uid:\t%d\t%d\t%d\t%d\n" - "Gid:\t%d\t%d\t%d\t%d\n" - "Threads:\t%d\n", - name, (long long) proc_get_pid(), tid, - (long long) proc_get_ppid(), GUEST_UID, GUEST_UID, - GUEST_UID, GUEST_UID, GUEST_GID, GUEST_GID, GUEST_GID, - GUEST_GID, thread_active_count()); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); + return proc_emit_fmt( + "Name:\t%.15s\n" + "State:\tR (running)\n" + "Tgid:\t%lld\n" + "Pid:\t%ld\n" + "PPid:\t%lld\n" + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n" + "Threads:\t%d\n", + proc_comm_name(), (long long) proc_get_pid(), tid, + (long long) proc_get_ppid(), GUEST_UID, GUEST_UID, GUEST_UID, + GUEST_UID, GUEST_GID, GUEST_GID, GUEST_GID, GUEST_GID, + thread_active_count()); } - /* /proc/self/task/ directory itself */ + /* /proc/self/task/ directory itself: synthesize a dir with + * stat/status placeholder entries. 
Persistent so getdents sees + * the entries on macOS (which cannot enumerate unlinked dirs). + */ if (*endp == '\0' || !strcmp(endp, "/")) { - /* Return a synthetic directory with stat/status placeholder - * entries. Uses a persistent temp dir (not cleaned until process - * exit) so getdents sees entries on macOS. - */ - static char tiddir_base[128]; - static pthread_mutex_t tiddir_lock = PTHREAD_MUTEX_INITIALIZER; - - pthread_mutex_lock(&tiddir_lock); - if (proc_lazy_mkdtemp(tiddir_base, sizeof(tiddir_base), - "/tmp/elfuse-tid-XXXXXX") < 0) { - pthread_mutex_unlock(&tiddir_lock); + static proc_persistent_dir_t tiddir = + PROC_PERSISTENT_DIR("/tmp/elfuse-tid-XXXXXX"); + const char *dir = proc_persistent_dir_acquire(&tiddir); + if (!dir) return -1; - } char p[160]; - snprintf(p, sizeof(p), "%s/stat", tiddir_base); + snprintf(p, sizeof(p), "%s/stat", dir); close(open(p, O_CREAT | O_WRONLY, 0444)); - snprintf(p, sizeof(p), "%s/status", tiddir_base); + snprintf(p, sizeof(p), "%s/status", dir); close(open(p, O_CREAT | O_WRONLY, 0444)); - int fd = proc_open_dir_fd(tiddir_base, linux_flags); - pthread_mutex_unlock(&tiddir_lock); - - return fd >= 0 ? fd : -1; + int fd = proc_open_dir_fd(dir, linux_flags); + proc_persistent_dir_release(&tiddir); + return fd; } return -2; /* unknown /proc/self/task//XXX */ @@ -1045,9 +1553,7 @@ int proc_intercept_open(const guest_t *g, gettimeofday(&now, NULL); double uptime = (double) (now.tv_sec - boottime.tv_sec) + (double) (now.tv_usec - boottime.tv_usec) / 1e6; - char buf[128]; - int len = snprintf(buf, sizeof(buf), "%.2f 0.00\n", uptime); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); + return proc_emit_fmt("%.2f 0.00\n", uptime); } /* /proc/loadavg -> synthetic load averages. 
@@ -1056,11 +1562,9 @@ int proc_intercept_open(const guest_t *g, if (!strcmp(path, "/proc/loadavg")) { double loadavg[3] = {0}; getloadavg(loadavg, 3); - char buf[128]; - int len = - snprintf(buf, sizeof(buf), "%.2f %.2f %.2f 1/1 %lld\n", loadavg[0], - loadavg[1], loadavg[2], (long long) proc_get_pid()); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); + return proc_emit_fmt("%.2f %.2f %.2f 1/1 %lld\n", loadavg[0], + loadavg[1], loadavg[2], + (long long) proc_get_pid()); } /* /var/run/utmp, /run/utmp -> synthetic utmp with current user. @@ -1097,171 +1601,70 @@ int proc_intercept_open(const guest_t *g, !strcmp(path, "/proc/net/raw") || !strcmp(path, "/proc/net/raw6")) { bool want_tcp = !!strstr(path, "tcp"), want_udp = !!strstr(path, "udp"); bool want_v6 = (path[strlen(path) - 1] == '6'); - int want_af = want_v6 ? AF_INET6 : AF_INET; - int want_stype = want_tcp ? SOCK_STREAM - : want_udp ? SOCK_DGRAM - : SOCK_RAW; - const char *header_fmt = + struct proc_net_inet_ctx ctx = { + .buf = NULL, /* set below */ + .bufsz = 16384, + .off = 0, + .sl = 0, + .want_af = want_v6 ? AF_INET6 : AF_INET, + .want_stype = want_tcp ? SOCK_STREAM + : want_udp ? SOCK_DGRAM + : SOCK_RAW, + .want_tcp = want_tcp, + .want_v6 = want_v6, + }; + char buf[16384]; + ctx.buf = buf; + ctx.off = snprintf( + buf, sizeof(buf), "%s", want_tcp ? 
" sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout inode\n" : " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout inode" - " ref pointer drops\n"; - char buf[16384]; - int off = snprintf(buf, sizeof(buf), "%s", header_fmt); - - /* Collect PIDs to scan: self + active children */ - pid_t pids[PROC_TABLE_SIZE + 1]; - pids[0] = getpid(); - int npids = 1 + proc_get_child_pids(pids + 1, PROC_TABLE_SIZE); - - int sl = 0; - for (int p = 0; p < npids && off < (int) sizeof(buf) - 256; p++) { - struct proc_fdinfo fdinfo[512]; - int fdsz = proc_pidinfo(pids[p], PROC_PIDLISTFDS, 0, fdinfo, - sizeof(fdinfo)); - if (fdsz <= 0) - continue; - int nfds = fdsz / (int) PROC_PIDLISTFD_SIZE; - - for (int fi = 0; fi < nfds && off < (int) sizeof(buf) - 256; fi++) { - if (fdinfo[fi].proc_fdtype != PROX_FDTYPE_SOCKET) - continue; - - struct socket_fdinfo sinfo; - int sz = - proc_pidfdinfo(pids[p], fdinfo[fi].proc_fd, - PROC_PIDFDSOCKETINFO, &sinfo, sizeof(sinfo)); - if (sz < (int) sizeof(sinfo)) - continue; - - int saf = sinfo.psi.soi_family, stype = sinfo.psi.soi_type; - if (saf != want_af || stype != want_stype) - continue; - - uint16_t lport = 0, rport = 0; - char laddr[33], raddr[33]; - const struct in_sockinfo *ini = - want_tcp ? &sinfo.psi.soi_proto.pri_tcp.tcpsi_ini - : &sinfo.psi.soi_proto.pri_in; - - format_proc_net_addr(laddr, ini, 1, want_v6); - lport = ntohs(ini->insi_lport); - format_proc_net_addr(raddr, ini, 0, want_v6); - rport = ntohs(ini->insi_fport); - - /* TCP state from the kernel's tcp_connection_info */ - int st = 0x07; /* TCP_CLOSE default */ - if (want_tcp) { - int kstate = sinfo.psi.soi_proto.pri_tcp.tcpsi_state; - /* macOS TSI_S_* matches Linux TCP state encoding: - * 0=CLOSED, 1=LISTEN, 2=SYN_SENT, etc. 
But Linux - * /proc/net uses 1-based: 01=ESTABLISHED, 0A=LISTEN - */ - static const int state_map[] = { - 0x07, /* 0: CLOSED */ - 0x0A, /* 1: LISTEN */ - 0x02, /* 2: SYN_SENT */ - 0x03, /* 3: SYN_RECEIVED */ - 0x01, /* 4: ESTABLISHED */ - 0x08, /* 5: CLOSE_WAIT */ - 0x04, /* 6: FIN_WAIT_1 */ - 0x06, /* 7: CLOSING */ - 0x09, /* 8: LAST_ACK */ - 0x05, /* 9: FIN_WAIT_2 */ - 0x0B, /* 10: TIME_WAIT */ - }; - if (RANGE_CHECK(kstate, 0, 11)) - st = state_map[kstate]; - } - - off = append_proc_net_row(buf, sizeof(buf), off, want_tcp, sl, - laddr, lport, raddr, rport, st); - sl++; - } - } - return proc_synthetic_fd_str(buf, off, sizeof(buf)); + " ref pointer drops\n"); + proc_net_for_each_socket(proc_net_inet_visit, &ctx); + return proc_synthetic_fd_str(buf, ctx.off, sizeof(buf)); } if (!strcmp(path, "/proc/net/unix")) { char buf[8192]; - int off = snprintf(buf, sizeof(buf), - "Num RefCount Protocol Flags Type St " - "Inode Path\n"); - - pid_t pids[PROC_TABLE_SIZE + 1]; - pids[0] = getpid(); - int npids = 1 + proc_get_child_pids(pids + 1, PROC_TABLE_SIZE); - - for (int p = 0; p < npids && off < (int) sizeof(buf) - 128; p++) { - struct proc_fdinfo fdinfo[512]; - int fdsz = proc_pidinfo(pids[p], PROC_PIDLISTFDS, 0, fdinfo, - sizeof(fdinfo)); - if (fdsz <= 0) - continue; - int nfds = fdsz / (int) PROC_PIDLISTFD_SIZE; - - for (int fi = 0; fi < nfds && off < (int) sizeof(buf) - 128; fi++) { - if (fdinfo[fi].proc_fdtype != PROX_FDTYPE_SOCKET) - continue; - struct socket_fdinfo sinfo; - int sz = - proc_pidfdinfo(pids[p], fdinfo[fi].proc_fd, - PROC_PIDFDSOCKETINFO, &sinfo, sizeof(sinfo)); - if (sz < (int) sizeof(sinfo)) - continue; - if (sinfo.psi.soi_family != AF_UNIX) - continue; - int stype = sinfo.psi.soi_type; - int lt = (stype == SOCK_STREAM) ? 1 - : (stype == SOCK_DGRAM) ? 2 - : (stype == SOCK_SEQPACKET) ? 
5 - : 1; - /* Unix socket path from soi_proto.pri_un.unsi_addr */ - const char *spath = - sinfo.psi.soi_proto.pri_un.unsi_addr.ua_sun.sun_path; - off += - snprintf(buf + off, sizeof(buf) - off, - "%016X: %08X %08X %08X %04X %02X %5d %s\n", 0, 3, - 0, 0, lt, 3, 10000 + fi, spath[0] ? spath : ""); - } - } - return proc_synthetic_fd_str(buf, off, sizeof(buf)); + struct proc_net_unix_ctx ctx = { + .buf = buf, + .bufsz = sizeof(buf), + .off = snprintf(buf, sizeof(buf), + "Num RefCount Protocol Flags Type St " + "Inode Path\n"), + }; + proc_net_for_each_socket(proc_net_unix_visit, &ctx); + return proc_synthetic_fd_str(buf, ctx.off, sizeof(buf)); } /* /proc/sys/vm/mmap_min_addr -> synthetic mmap minimum address. */ - if (!strcmp(path, "/proc/sys/vm/mmap_min_addr")) { - const char *data = "32768\n"; - return proc_synthetic_fd(data, strlen(data)); - } + if (!strcmp(path, "/proc/sys/vm/mmap_min_addr")) + return proc_emit_literal("32768\n"); /* /proc/sys/kernel/randomize_va_space -> ASLR enabled (full). 
*/ - if (!strcmp(path, "/proc/sys/kernel/randomize_va_space")) { - const char *data = "2\n"; - return proc_synthetic_fd(data, strlen(data)); - } + if (!strcmp(path, "/proc/sys/kernel/randomize_va_space")) + return proc_emit_literal("2\n"); /* /proc/version -> synthetic kernel version string */ if (!strcmp(path, "/proc/version")) { - char buf[256]; - int len = snprintf(buf, sizeof(buf), - "Linux version 6.17.0-20-generic " - "(buildd@bos03-arm64-051) " - "(aarch64-linux-gnu-gcc (Ubuntu 15.2.0-4ubuntu4) " - "15.2.0, GNU ld (GNU Binutils for Ubuntu) 2.45) " - "#20-Ubuntu SMP PREEMPT_DYNAMIC\n"); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); + return proc_emit_literal( + "Linux version 6.17.0-20-generic " + "(buildd@bos03-arm64-051) " + "(aarch64-linux-gnu-gcc (Ubuntu 15.2.0-4ubuntu4) " + "15.2.0, GNU ld (GNU Binutils for Ubuntu) 2.45) " + "#20-Ubuntu SMP PREEMPT_DYNAMIC\n"); } /* /proc/filesystems -> supported filesystem types */ if (!strcmp(path, "/proc/filesystems")) { - const char *data = + return proc_emit_literal( "\tmpfs\n" "\tproc\n" "\tsysfs\n" "\tdevtmpfs\n" "\text4\n" - "\tvfat\n"; - return proc_synthetic_fd(data, strlen(data)); + "\tvfat\n"); } /* /proc/self/mountinfo -> Linux mountinfo format (different from @@ -1269,121 +1672,133 @@ int proc_intercept_open(const guest_t *g, * - type source super_options */ if (!strcmp(path, "/proc/self/mountinfo")) { - char buf[1024]; - int len = - snprintf(buf, sizeof(buf), - "1 0 0:1 / / rw,relatime - ext4 /dev/root rw\n" - "2 1 0:2 / /proc rw,nosuid,nodev,noexec - proc proc rw\n" - "3 1 0:3 / /tmp rw,nosuid,nodev - tmpfs tmpfs rw\n" - "4 1 0:4 / /dev rw,nosuid - devtmpfs devtmpfs rw\n" - "5 4 0:5 / /dev/shm rw,nosuid,nodev - tmpfs tmpfs rw\n"); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); + return proc_emit_literal( + "1 0 0:1 / / rw,relatime - ext4 /dev/root rw\n" + "2 1 0:2 / /proc rw,nosuid,nodev,noexec - proc proc rw\n" + "3 1 0:3 / /tmp rw,nosuid,nodev - tmpfs tmpfs rw\n" + "4 1 0:4 / /dev 
rw,nosuid - devtmpfs devtmpfs rw\n" + "5 4 0:5 / /dev/shm rw,nosuid,nodev - tmpfs tmpfs rw\n"); } /* /proc/mounts, /etc/mtab -> synthetic mount table */ if (!strcmp(path, "/proc/mounts") || !strcmp(path, "/proc/self/mounts") || !strcmp(path, "/etc/mtab")) { - char buf[512]; - int len = snprintf(buf, sizeof(buf), - "/ / ext4 rw,relatime 0 0\n" - "proc /proc proc rw,nosuid,nodev,noexec 0 0\n" - "tmpfs /tmp tmpfs rw,nosuid,nodev 0 0\n" - "devtmpfs /dev devtmpfs rw,nosuid 0 0\n" - "tmpfs /dev/shm tmpfs rw,nosuid,nodev 0 0\n"); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); - } - - /* /proc/self/oom_score_adj -> writable stub. - * Containers and systemd write this; accept writes and return - * last-written value (default 0). + return proc_emit_literal( + "/ / ext4 rw,relatime 0 0\n" + "proc /proc proc rw,nosuid,nodev,noexec 0 0\n" + "tmpfs /tmp tmpfs rw,nosuid,nodev 0 0\n" + "devtmpfs /dev devtmpfs rw,nosuid 0 0\n" + "tmpfs /dev/shm tmpfs rw,nosuid,nodev 0 0\n"); + } + + /* OOM nodes share one stored adjustment. + * oom_score_adj: returns the raw adjustment in [-1000, 1000]. + * oom_adj: legacy view, scaled into [-17, 15] for compatibility. + * oom_score: stub computed score, currently a fixed 0. */ - if (proc_is_oom_path(path)) { - int val = atomic_load(&oom_score_adj_value); + if (oom_kind != OOM_PATH_NONE) { char buf[32]; - int len = snprintf(buf, sizeof(buf), "%d\n", val); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); + int len = proc_oom_format_value(oom_kind, buf, sizeof(buf)); + return proc_synthetic_fd(buf, (size_t) len); } - /* /proc/self/fdinfo/ -> per-fd flags/pos/mnt_id */ + /* /proc/self/fdinfo/ -> per-fd flags/pos/mnt_id plus type-specific + * fields for fds where Linux exposes additional state (eventfd counter, + * signalfd mask, timerfd settings). 
+ */ if (!strncmp(path, "/proc/self/fdinfo/", 18)) { - char *endptr; - long n = strtol(path + 18, &endptr, 10); - if (endptr == path + 18 || *endptr != '\0' || n < 0 || - n >= FD_TABLE_SIZE) { - errno = ENOENT; + int n = proc_parse_fd_index(path, 18, ENOENT); + if (n < 0) return -1; - } fd_entry_t snap; - if (!fd_snapshot((int) n, &snap)) { + if (!fd_snapshot(n, &snap)) { errno = ENOENT; return -1; } - off_t pos = 0; - int host_fd = fd_to_host((int) n); - if (host_fd >= 0) - pos = lseek(host_fd, 0, SEEK_CUR); - if (pos < 0) - pos = 0; - int flags = snap.linux_flags; - char buf[256]; - int len = snprintf(buf, sizeof(buf), - "pos:\t%lld\n" - "flags:\t0%o\n" - "mnt_id:\t0\n", - (long long) pos, flags); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); - } - - /* /proc/self/fdinfo -> directory listing via persistent temp dir (macOS - * getdents needs real directory entries). - */ - if (!strcmp(path, "/proc/self/fdinfo") || - !strcmp(path, "/proc/self/fdinfo/")) { - static char fdinfodir[128]; - static pthread_mutex_t fdinfodir_lock = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&fdinfodir_lock); - if (proc_lazy_mkdtemp(fdinfodir, sizeof(fdinfodir), - "/tmp/elfuse-fdinfo-XXXXXX") < 0) { - pthread_mutex_unlock(&fdinfodir_lock); - return -1; + /* fd_to_host_dup atomically duplicates under fd_lock so a concurrent + * close+reopen on another vCPU cannot redirect the lseek to an + * unrelated host fd that took the freed slot. The probe pollutes + * errno with ESPIPE on non-seekable fds (sockets, pipes), so save + * and restore around the call to keep the caller's view clean. 
+ */ + off_t pos = 0; + int dup_fd = fd_to_host_dup(n); + if (dup_fd >= 0) { + int saved_errno = errno; + off_t probe = lseek(dup_fd, 0, SEEK_CUR); + if (probe >= 0) + pos = probe; + errno = saved_errno; + close(dup_fd); } - for (int i = 0; i < FD_TABLE_SIZE; i++) { - char entry[192]; - snprintf(entry, sizeof(entry), "%s/%d", fdinfodir, i); - fd_entry_t snap; - if (fd_snapshot(i, &snap)) { - int tfd = open(entry, O_CREAT | O_WRONLY, 0444); - if (tfd >= 0) - close(tfd); - } else { - unlink(entry); + char extra[160]; + extra[0] = '\0'; + if (snap.type == FD_EVENTFD) { + uint64_t count; + /* fs/eventfd.c uses a single space after the colon, matching + * the timerfd convention (and unlike pos:/flags:/mnt_id: in + * fs/proc/fd.c which use tabs). */ + if (eventfd_fdinfo_snapshot(n, &count)) + snprintf(extra, sizeof(extra), "eventfd-count: %16llx\n", + (unsigned long long) count); + } else if (snap.type == FD_SIGNALFD) { + uint64_t mask; + /* fs/signalfd.c uses a tab after the colon (matching the + * pos:/flags:/mnt_id: convention in fs/proc/fd.c, not the + * single-space style of eventfd/timerfd). Verified against a + * real Linux 6.x /proc/self/fdinfo dump. */ + if (signalfd_fdinfo_snapshot(n, &mask)) + snprintf(extra, sizeof(extra), "sigmask:\t%016llx\n", + (unsigned long long) mask); + } else if (snap.type == FD_TIMERFD) { + int clockid; + uint64_t ticks; + int64_t value_ns, interval_ns; + if (timerfd_fdinfo_snapshot(n, &clockid, &ticks, &value_ns, + &interval_ns)) { + /* Linux fs/timerfd.c emits these fields with single + * spaces after the colon, not tabs (unlike pos:/flags:/ + * mnt_id: in fs/proc/fd.c, which do use tabs). Match the + * upstream format so guest readers parsing fdinfo via a + * "it_value: (" prefix find the field. 
*/ + snprintf(extra, sizeof(extra), + "clockid: %d\n" + "ticks: %llu\n" + "settime flags: 0\n" + "it_value: (%lld, %lld)\n" + "it_interval: (%lld, %lld)\n", + clockid, (unsigned long long) ticks, + (long long) (value_ns / 1000000000LL), + (long long) (value_ns % 1000000000LL), + (long long) (interval_ns / 1000000000LL), + (long long) (interval_ns % 1000000000LL)); } } - int fd = proc_open_dir_fd(fdinfodir, linux_flags); - pthread_mutex_unlock(&fdinfodir_lock); - return fd >= 0 ? fd : -1; + return proc_emit_fmt( + "pos:\t%lld\n" + "flags:\t0%o\n" + "mnt_id:\t0\n" + "%s", + (long long) pos, snap.linux_flags, extra); } - /* /proc/self/fd/N -> open the target of the fd (readlink-style) */ - if (!strncmp(path, "/proc/self/fd/", 14)) { - char *endptr; - long n = strtol(path + 14, &endptr, 10); - if (endptr == path + 14 || *endptr != '\0' || n < 0 || - n >= FD_TABLE_SIZE) { - errno = EBADF; - return -1; - } - int host_fd = fd_to_host((int) n); - if (host_fd < 0) { - errno = EBADF; - return -1; - } - return dup(host_fd); + /* /proc/self/fdinfo -> directory listing. Each open gets its own scratch + * dir so concurrent getdents on independent dirfds cannot interfere + * (the previous shared-dir design unlinked entries under a sibling + * enumerator). The dirs are tracked for atexit cleanup. 
+ */ + if (!strcmp(path, "/proc/self/fdinfo") || + !strcmp(path, "/proc/self/fdinfo/")) { + return proc_open_fd_scratch("elfuse-fdinfo", linux_flags); } + /* /proc/self/fd/N -> open the target of the fd (readlink-style) */ + if (!strncmp(path, "/proc/self/fd/", 14)) + return dev_fd_dup(path, 14); + /* /proc/meminfo -> synthetic memory info from host vm_statistics */ if (!strcmp(path, "/proc/meminfo")) { int64_t physmem = 0; @@ -1420,9 +1835,7 @@ int proc_intercept_open(const guest_t *g, buffers_kb = total_kb / 20; cached_kb = total_kb / 4; } - char buf[2048]; - int len = snprintf( - buf, sizeof(buf), + return proc_emit_fmt( "MemTotal: %llu kB\n" "MemFree: %llu kB\n" "MemAvailable: %llu kB\n" @@ -1456,7 +1869,6 @@ int proc_intercept_open(const guest_t *g, (unsigned long long) (total_kb - free_kb - cached_kb - buffers_kb), (unsigned long long) (cached_kb / 2), (unsigned long long) (total_kb / 2)); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); } /* /proc/self/io -> synthetic I/O counters. @@ -1465,15 +1877,14 @@ int proc_intercept_open(const guest_t *g, * it does not track per-guest I/O. */ if (!strcmp(path, "/proc/self/io")) { - static const char data[] = + return proc_emit_literal( "rchar: 0\n" "wchar: 0\n" "syscr: 0\n" "syscw: 0\n" "read_bytes: 0\n" "write_bytes: 0\n" - "cancelled_write_bytes: 0\n"; - return proc_synthetic_fd(data, sizeof(data) - 1); + "cancelled_write_bytes: 0\n"); } /* /proc/self/stat -> single-line process stat (man 5 proc). @@ -1505,33 +1916,24 @@ int proc_intercept_open(const guest_t *g, rss_pages += sz / (uint64_t) page_size; } - const char *exe = proc_get_elf_path(); - const char *comm = "elfuse"; - if (exe) { - const char *slash = strrchr(exe, '/'); - comm = slash ? 
slash + 1 : exe; - } - - char buf[1024]; /* Fields: pid(1) (comm)(2) state(3) ppid(4) pgrp(5) session(6) * tty_nr(7) tpgid(8) flags(9) minflt(10) cminflt(11) majflt(12) * cmajflt(13) utime(14) stime(15) cutime(16) cstime(17) * priority(18) nice(19) num_threads(20) itrealvalue(21) * starttime(22) vsize(23) rss(24) rsslim(25) ... (52 fields total) */ - int len = snprintf( - buf, sizeof(buf), + return proc_emit_fmt( "%lld (%.15s) R %lld %lld %lld 0 -1 0 " /* 1-9 */ "0 0 0 0 %ld %ld 0 0 " /* 10-17 */ "20 0 %d 0 0 %llu %llu " /* 18-24 */ "18446744073709551615 0 0 0 0 0 0 " /* 25-31 */ "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", /* 32-52 */ - (long long) proc_get_pid(), comm, (long long) proc_get_ppid(), + (long long) proc_get_pid(), proc_comm_name(), + (long long) proc_get_ppid(), (long long) proc_get_pid(), /* pgrp = pid */ (long long) proc_get_pid(), /* session = pid */ utime_ticks, stime_ticks, thread_active_count(), (unsigned long long) vsize, (unsigned long long) rss_pages); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); } /* /proc/stat -> synthetic CPU statistics */ @@ -1569,21 +1971,17 @@ int proc_intercept_open(const guest_t *g, /* /etc/passwd -> synthetic passwd with root + current user */ if (!strcmp(path, "/etc/passwd")) { - char buf[512]; - int len = snprintf(buf, sizeof(buf), - "root:x:0:0:root:/root:/bin/sh\n" - "user:x:1000:1000:user:/home/user:/bin/sh\n"); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); + return proc_emit_literal( + "root:x:0:0:root:/root:/bin/sh\n" + "user:x:1000:1000:user:/home/user:/bin/sh\n"); } /* /etc/group -> synthetic group file */ if (!strcmp(path, "/etc/group")) { - char buf[512]; - int len = snprintf(buf, sizeof(buf), - "root:x:0:\n" - "staff:x:20:\n" - "user:x:1000:\n"); - return proc_synthetic_fd_str(buf, len, sizeof(buf)); + return proc_emit_literal( + "root:x:0:\n" + "staff:x:20:\n" + "user:x:1000:\n"); } return PROC_NOT_INTERCEPTED; @@ -1602,35 +2000,20 @@ int proc_intercept_stat(const char *path, struct 
stat *st) */ /* /dev/shm is a directory */ if (!strcmp(path, "/dev/shm") || !strcmp(path, "/dev/shm/")) { - memset(st, 0, sizeof(*st)); - st->st_mode = S_IFDIR | 01777; /* sticky bit, like real /dev/shm */ - st->st_nlink = 2; + stat_fill_proc_dir(st, 01777, 2); /* sticky bit, like real /dev/shm */ return 0; } /* /dev/shm/ files: check the host temp dir */ if (!strncmp(path, "/dev/shm/", 9)) { - const char *shm = shm_dir_path(); - if (!shm) - return -1; - const char *suffix = path + 9; - if (strstr(suffix, "..") || strchr(suffix, '/') || suffix[0] == '\0') { - errno = EACCES; - return -1; - } char host_path[512]; - int n = snprintf(host_path, sizeof(host_path), "%s/%s", shm, suffix); - if (n < 0 || (size_t) n >= sizeof(host_path)) { - errno = ENAMETOOLONG; + if (dev_shm_resolve_path(path + 9, host_path, sizeof(host_path)) < 0) return -1; - } return stat(host_path, st); } /* /proc and /proc/ are directories */ if (!strcmp(path, "/proc") || !strcmp(path, "/proc/")) { - memset(st, 0, sizeof(*st)); - st->st_mode = S_IFDIR | 0555; - st->st_nlink = 3; + stat_fill_proc_dir(st, 0555, 3); return 0; } { @@ -1641,29 +2024,28 @@ int proc_intercept_stat(const char *path, struct stat *st) (long long) proc_get_pid()); if (!strcmp(path, pidbuf) || !strcmp(path, pidslash) || !strcmp(path, "/proc/self") || !strcmp(path, "/proc/self/")) { - memset(st, 0, sizeof(*st)); - st->st_mode = S_IFDIR | 0555; - st->st_nlink = 3; + stat_fill_proc_dir(st, 0555, 3); return 0; } } + if (!strcmp(path, "/proc/net") || !strcmp(path, "/proc/net/")) { + stat_fill_proc_dir(st, 0555, 2); + return 0; + } - /* /proc// -> treat as /proc/self/ */ - if (!strncmp(path, "/proc/", 6)) { - char *endp; - long pid = strtol(path + 6, &endp, 10); - if (endp != path + 6 && pid == (long) proc_get_pid() && *endp == '/') { - char alias[LINUX_PATH_MAX]; - snprintf(alias, sizeof(alias), "/proc/self%s", endp); + /* /proc/[/...] -> /proc/self[...]. 
*/ + { + char alias[LINUX_PATH_MAX]; + int aliased = proc_alias_self(path, alias, sizeof(alias)); + if (aliased < 0) + return -1; + if (aliased > 0) return proc_intercept_stat(alias, st); - } } /* /proc/self/task and /proc/self/task/ are directories */ if (!strcmp(path, "/proc/self/task") || !strcmp(path, "/proc/self/task/")) { - memset(st, 0, sizeof(*st)); - st->st_mode = S_IFDIR | 0555; - st->st_nlink = 2 + thread_active_count(); + stat_fill_proc_dir(st, 0555, 2 + (nlink_t) thread_active_count()); return 0; } if (!strncmp(path, "/proc/self/task/", 16)) { @@ -1675,68 +2057,41 @@ int proc_intercept_stat(const char *path, struct stat *st) return -1; } if (*endp == '\0' || !strcmp(endp, "/")) { - /* /proc/self/task/ directory */ - memset(st, 0, sizeof(*st)); - st->st_mode = S_IFDIR | 0555; - st->st_nlink = 2; + stat_fill_proc_dir(st, 0555, 2); return 0; } if (!strcmp(endp, "/stat") || !strcmp(endp, "/status")) { - memset(st, 0, sizeof(*st)); - st->st_mode = S_IFREG | 0444; - st->st_nlink = 1; - st->st_size = 256; - st->st_blksize = 4096; - st->st_blocks = 1; + stat_fill_proc_file(st, 0444); return 0; } } } - if (proc_is_oom_path(path)) { - memset(st, 0, sizeof(*st)); - st->st_mode = S_IFREG | 0644; - st->st_nlink = 1; - st->st_size = 2; - st->st_blksize = 4096; - st->st_blocks = 1; - return 0; + { + int kind = proc_oom_path_kind(path); + if (kind != OOM_PATH_NONE) { + stat_fill_proc_file(st, (kind == OOM_PATH_SCORE) ? 
0444 : 0644); + return 0; + } } if (!strcmp(path, "/proc/self/fdinfo") || - !strcmp(path, "/proc/self/fdinfo/")) { - memset(st, 0, sizeof(*st)); - st->st_mode = S_IFDIR | 0555; - st->st_nlink = 2; - return 0; - } - - if (!strcmp(path, "/proc/self/fd") || !strcmp(path, "/proc/self/fd/")) { - memset(st, 0, sizeof(*st)); - st->st_mode = S_IFDIR | 0555; - st->st_nlink = 2; + !strcmp(path, "/proc/self/fdinfo/") || !strcmp(path, "/proc/self/fd") || + !strcmp(path, "/proc/self/fd/")) { + stat_fill_proc_dir(st, 0555, 2); return 0; } if (!strncmp(path, "/proc/self/fdinfo/", 18)) { - char *endp; - long fd = strtol(path + 18, &endp, 10); - if (endp == path + 18 || *endp != '\0' || fd < 0 || - fd >= FD_TABLE_SIZE) { - errno = ENOENT; + int fd = proc_parse_fd_index(path, 18, ENOENT); + if (fd < 0) return -1; - } fd_entry_t snap; - if (!fd_snapshot((int) fd, &snap)) { + if (!fd_snapshot(fd, &snap)) { errno = ENOENT; return -1; } - memset(st, 0, sizeof(*st)); - st->st_mode = S_IFREG | 0444; - st->st_nlink = 1; - st->st_size = 32; - st->st_blksize = 4096; - st->st_blocks = 1; + stat_fill_proc_file(st, 0444); return 0; } @@ -1772,26 +2127,17 @@ int proc_intercept_stat(const char *path, struct stat *st) for (const char **p = known_proc_files; *p; p++) { if (!strcmp(path, *p)) { - memset(st, 0, sizeof(*st)); - st->st_mode = S_IFREG | 0444; /* Regular file, read-only */ - st->st_nlink = 1; - st->st_size = 256; /* Approximate; exact value not critical */ - st->st_blksize = 4096; - st->st_blocks = 1; + stat_fill_proc_file(st, 0444); return 0; } } /* /proc/self/fd/N: stat the underlying host fd */ if (!strncmp(path, "/proc/self/fd/", 14)) { - char *endptr; - long n = strtol(path + 14, &endptr, 10); - if (endptr == path + 14 || *endptr != '\0' || n < 0 || - n >= FD_TABLE_SIZE) { - errno = EBADF; + int n = proc_parse_fd_index(path, 14, EBADF); + if (n < 0) return -1; - } - int host_fd = fd_to_host((int) n); + int host_fd = fd_to_host(n); if (host_fd < 0) { errno = EBADF; return -1; @@ 
-1806,6 +2152,15 @@ int proc_intercept_stat(const char *path, struct stat *st) int proc_intercept_readlink(const char *path, char *buf, size_t bufsiz) { + { + char alias[LINUX_PATH_MAX]; + int aliased = proc_alias_self(path, alias, sizeof(alias)); + if (aliased < 0) + return -1; + if (aliased > 0) + return proc_intercept_readlink(alias, buf, bufsiz); + } + /* /proc/self/exe -> path of current ELF binary */ if (!strcmp(path, "/proc/self/exe")) { const char *exe = proc_get_elf_path(); @@ -1863,6 +2218,72 @@ int proc_intercept_readlink(const char *path, char *buf, size_t bufsiz) return PROC_NOT_INTERCEPTED; } +int proc_intercept_read(int guest_fd, + void *buf, + size_t count, + int64_t offset, + ssize_t *read_out) +{ + fd_entry_t snap; + if (!fd_snapshot(guest_fd, &snap)) + return 0; + + int kind = proc_oom_path_kind(snap.proc_path); + if (kind == OOM_PATH_NONE) + return 0; + + /* Recompute from the shared atomic on every read so lseek(0)+read on an + * already-open fd sees updates written through oom_score_adj or oom_adj. 
+ */ + char text[32]; + int len = proc_oom_format_value(kind, text, sizeof(text)); + return proc_oom_copy_slice(buf, count, offset, text, (size_t) len, + read_out); +} + +int proc_intercept_readv(int guest_fd, + const struct iovec *iov, + int iovcnt, + int64_t offset, + ssize_t *read_out) +{ + fd_entry_t snap; + if (!fd_snapshot(guest_fd, &snap)) + return 0; + + int kind = proc_oom_path_kind(snap.proc_path); + if (kind == OOM_PATH_NONE) + return 0; + if (offset < 0) { + errno = EINVAL; + return -1; + } + + char text[32]; + int len = proc_oom_format_value(kind, text, sizeof(text)); + size_t src_len = (size_t) len; + if ((uint64_t) offset >= src_len) { + *read_out = 0; + return 1; + } + + size_t src_off = (size_t) offset; + ssize_t total = 0; + for (int i = 0; i < iovcnt && src_off < src_len; i++) { + size_t n = iov[i].iov_len; + if (n > src_len - src_off) + n = src_len - src_off; + if (n == 0) + continue; + memcpy(iov[i].iov_base, text + src_off, n); + src_off += n; + total += (ssize_t) n; + } + + *read_out = total; + return 1; +} + int proc_intercept_write(int guest_fd, int host_fd, const void *buf, @@ -1874,33 +2295,72 @@ int proc_intercept_write(int guest_fd, fd_entry_t snap; if (!fd_snapshot(guest_fd, &snap)) return 0; - if (!proc_is_oom_writable(snap.proc_path)) + int kind = proc_oom_path_kind(snap.proc_path); + if (kind == OOM_PATH_SCORE) { + /* Linux: oom_score has no write handler. proc_reg_write returns + * -EIO when the underlying proc_dir_entry exposes no write op, + * not -EINVAL. Match that so guests probing the error code see + * the same value as on a real kernel. */ + errno = EIO; + return -1; + } + if (kind != OOM_PATH_SCORE_ADJ && kind != OOM_PATH_ADJ) return 0; + /* Linux: zero-byte writes to proc nodes succeed without side effects. + * Without this short-circuit, sys_writev would funnel a zero-length + * vector through proc_parse_int_write and get -EINVAL. 
+ */ + if (count == 0) { + *written_out = 0; + return 1; + } + int val; if (proc_parse_int_write(buf, count, &val) < 0) return -1; - if (val < -1000 || val > 1000) { - errno = EINVAL; - return -1; - } - atomic_store(&oom_score_adj_value, val); + int score_adj; + if (kind == OOM_PATH_ADJ) { + if (val < LINUX_OOM_DISABLE || val > LINUX_OOM_ADJUST_MAX) { + errno = EINVAL; + return -1; + } + score_adj = oom_adj_to_score_adj(val); + } else { + if (val < LINUX_OOM_SCORE_ADJ_MIN || val > LINUX_OOM_SCORE_ADJ_MAX) { + errno = EINVAL; + return -1; + } + score_adj = val; + } + /* Both interfaces persist the value the writer supplied: oom_adj keeps the + * legacy [-17,15] number, oom_score_adj keeps the [-1000,1000] number. + * proc_oom_refresh_live_fds_locked re-renders each open fd's backing file + * through proc_oom_format_value, so the kind-specific view stays correct + * across reads. + */ char text[32]; int len = snprintf(text, sizeof(text), "%d\n", val); - if (len < 0) { - errno = EINVAL; - return -1; - } + /* Serialize the backing-fd rewrite so concurrent writers cannot race the + * truncate+pwrite sequence. Publish to the global atomic last so a + * partial-rewrite failure leaves the process-wide value unchanged. 
+ */ + pthread_mutex_lock(&oom_write_lock); + int rc = -1; if (ftruncate(host_fd, 0) < 0) - return -1; + goto unlock; if (pwrite(host_fd, text, (size_t) len, 0) != len) - return -1; + goto unlock; if (!use_pwrite && lseek(host_fd, offset + (int64_t) count, SEEK_SET) < 0) - return -1; - + goto unlock; + atomic_store(&oom_score_adj_value, score_adj); + proc_oom_refresh_live_fds_locked(); *written_out = (ssize_t) count; - return 1; + rc = 1; +unlock: + pthread_mutex_unlock(&oom_write_lock); + return rc; } diff --git a/src/runtime/procemu.h b/src/runtime/procemu.h index de5058a..ae52de2 100644 --- a/src/runtime/procemu.h +++ b/src/runtime/procemu.h @@ -12,6 +12,7 @@ #include #include +#include #include "core/guest.h" /* Sentinel return value: path was not intercepted, caller should fall through @@ -53,6 +54,24 @@ int proc_intercept_write(int guest_fd, int use_pwrite, ssize_t *written_out); +/* Intercept reads from synthetic proc files that must reflect shared state on + * every read rather than the per-open temp-file snapshot. + * Returns 1 if handled (with *read_out set), 0 if not intercepted, or -1 on + * error with errno set. + */ +int proc_intercept_read(int guest_fd, + void *buf, + size_t count, + int64_t offset, + ssize_t *read_out); + +/* Vector form of proc_intercept_read for readv/preadv. */ +int proc_intercept_readv(int guest_fd, + const struct iovec *iov, + int iovcnt, + int64_t offset, + ssize_t *read_out); + /* Get the /dev/shm emulation directory path (creating it on first call). * Used by sys_unlinkat to rewrite /dev/shm/ paths. */ diff --git a/src/syscall/fd.c b/src/syscall/fd.c index ebc2d95..3903e9b 100644 --- a/src/syscall/fd.c +++ b/src/syscall/fd.c @@ -116,6 +116,53 @@ static int timerfd_alloc(void) return sfd_alloc_slot(timerfd_state, TIMERFD_MAX, sizeof(timerfd_state[0])); } +/* Called with sfd_lock held. Drain any kevent expirations sitting on the + * timer's kqueue and fold them into the slot's accumulator. 
Used by + * timerfd_read before consuming the counter and by timerfd_fdinfo_snapshot + * before reporting it; without this drain, fdinfo would lag the actual + * fire count by however many ticks were pending in the kqueue. + */ +static void timerfd_drain_pending_locked(int slot) +{ + int kq = timerfd_state[slot].kq_fd; + struct kevent kev; + struct timespec ts_zero = {0, 0}; + int nev = kevent(kq, NULL, 0, &kev, 1, &ts_zero); + if (nev > 0) { + uint64_t fires = (uint64_t) kev.data; + if (fires == 0) + fires = 1; /* At least one expiration */ + timerfd_state[slot].expirations += fires; + } +} + +/* Called with sfd_lock held. Returns nanoseconds until the next expiration, + * or 0 when the timer is disarmed or a one-shot timer has already expired. + */ +static int64_t timerfd_remaining_ns_locked(int slot, int64_t now_ns) +{ + if (!timerfd_state[slot].armed) + return 0; + + int64_t elapsed = now_ns - timerfd_state[slot].arm_time_ns; + if (elapsed < 0) + elapsed = 0; + + if (timerfd_state[slot].interval_ns > 0) { + int64_t total = timerfd_state[slot].initial_ns; + if (elapsed >= total) { + int64_t since_first = elapsed - total; + int64_t interval = timerfd_state[slot].interval_ns; + int64_t remaining = interval - (since_first % interval); + return remaining == 0 ? interval : remaining; + } + return total - elapsed; + } + + int64_t remaining = timerfd_state[slot].initial_ns - elapsed; + return remaining > 0 ? 
remaining : 0; +} + int64_t sys_timerfd_create(int clockid, int flags) { if (clockid != LINUX_CLOCK_REALTIME && clockid != LINUX_CLOCK_MONOTONIC) @@ -203,8 +250,7 @@ int64_t sys_timerfd_settime(guest_t *g, struct timespec now; clock_gettime(CLOCK_MONOTONIC, &now); int64_t now_ns = now.tv_sec * NS_PER_SEC + now.tv_nsec; - int64_t elapsed = now_ns - timerfd_state[slot].arm_time_ns; - int64_t remaining = timerfd_state[slot].initial_ns - elapsed; + int64_t remaining = timerfd_remaining_ns_locked(slot, now_ns); if (remaining > 0) { old.it_value_sec = remaining / NS_PER_SEC; old.it_value_nsec = remaining % NS_PER_SEC; @@ -319,27 +365,10 @@ int64_t sys_timerfd_gettime(guest_t *g, int fd, uint64_t curr_value_gva) its.it_interval_sec = timerfd_state[slot].interval_ns / NS_PER_SEC; its.it_interval_nsec = timerfd_state[slot].interval_ns % NS_PER_SEC; - /* Compute actual remaining time from arm time + initial value */ struct timespec now; clock_gettime(CLOCK_MONOTONIC, &now); int64_t now_ns = now.tv_sec * NS_PER_SEC + now.tv_nsec; - int64_t elapsed = now_ns - timerfd_state[slot].arm_time_ns; - int64_t remaining; - - if (timerfd_state[slot].interval_ns > 0) { - /* Repeating timer: remaining = interval - (elapsed % interval) */ - int64_t total = timerfd_state[slot].initial_ns; - if (elapsed >= total) { - int64_t since_first = elapsed - total; - remaining = timerfd_state[slot].interval_ns - - (since_first % timerfd_state[slot].interval_ns); - } else { - remaining = total - elapsed; - } - } else { - /* One-shot: remaining = initial - elapsed */ - remaining = timerfd_state[slot].initial_ns - elapsed; - } + int64_t remaining = timerfd_remaining_ns_locked(slot, now_ns); if (remaining <= 0) { /* Timer already expired (one-shot) */ @@ -374,18 +403,8 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) int kq = timerfd_state[slot].kq_fd; - /* Collect pending timer events via kevent(). 
The data field contains - * the number of times the timer fired since the last kevent() call. - */ - struct kevent kev; - struct timespec ts_zero = {0, 0}; - int nev = kevent(kq, NULL, 0, &kev, 1, &ts_zero); - if (nev > 0) { - uint64_t fires = (uint64_t) kev.data; - if (fires == 0) - fires = 1; /* At least one expiration */ - timerfd_state[slot].expirations += fires; - } + /* Collect pending timer events into the slot's accumulator. */ + timerfd_drain_pending_locked(slot); if (timerfd_state[slot].expirations == 0) { /* No events yet; check if non-blocking */ @@ -408,8 +427,9 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) * kevent() returns EBADF in that case, and the code re-validates the * slot. */ + struct kevent kev; pthread_mutex_unlock(&sfd_lock); - nev = kevent(kq, NULL, 0, &kev, 1, NULL); + int nev = kevent(kq, NULL, 0, &kev, 1, NULL); pthread_mutex_lock(&sfd_lock); /* Re-validate: slot may have been freed by timerfd_close() */ if (timerfd_state[slot].guest_fd != guest_fd) { @@ -1073,3 +1093,67 @@ void signalfd_notify(int signum) } pthread_mutex_unlock(&sfd_lock); } + +/* /proc/self/fdinfo type-specific snapshots. Each takes sfd_lock to prevent + * tearing across concurrent read/write/settime; lock order is fd_lock(3) + * -> sfd_lock(5a), and these accessors take only sfd_lock so the procemu + * caller is free to drop fd_lock between fd_snapshot and the lookup here. 
+ */ + +bool eventfd_fdinfo_snapshot(int guest_fd, uint64_t *count_out) +{ + pthread_mutex_lock(&sfd_lock); + int slot = eventfd_find(guest_fd); + if (slot < 0) { + pthread_mutex_unlock(&sfd_lock); + return false; + } + *count_out = eventfd_state[slot].counter; + pthread_mutex_unlock(&sfd_lock); + return true; +} + +bool signalfd_fdinfo_snapshot(int guest_fd, uint64_t *mask_out) +{ + pthread_mutex_lock(&sfd_lock); + int slot = signalfd_find(guest_fd); + if (slot < 0) { + pthread_mutex_unlock(&sfd_lock); + return false; + } + *mask_out = signalfd_state[slot].mask; + pthread_mutex_unlock(&sfd_lock); + return true; +} + +bool timerfd_fdinfo_snapshot(int guest_fd, + int *clockid_out, + uint64_t *ticks_out, + int64_t *value_ns_out, + int64_t *interval_ns_out) +{ + pthread_mutex_lock(&sfd_lock); + int slot = timerfd_find(guest_fd); + if (slot < 0) { + pthread_mutex_unlock(&sfd_lock); + return false; + } + /* Fold any pending kqueue fires into expirations before exporting, + * matching what timerfd_read does. Without this, fdinfo lags by + * however many ticks were sitting on the kqueue. + */ + timerfd_drain_pending_locked(slot); + *clockid_out = timerfd_state[slot].clockid; + *ticks_out = timerfd_state[slot].expirations; + *interval_ns_out = timerfd_state[slot].interval_ns; + int64_t value_ns = 0; + if (timerfd_state[slot].armed) { + struct timespec now; + clock_gettime(CLOCK_MONOTONIC, &now); + int64_t now_ns = (int64_t) now.tv_sec * NS_PER_SEC + now.tv_nsec; + value_ns = timerfd_remaining_ns_locked(slot, now_ns); + } + *value_ns_out = value_ns; + pthread_mutex_unlock(&sfd_lock); + return true; +} diff --git a/src/syscall/fd.h b/src/syscall/fd.h index 60bad04..e087ed4 100644 --- a/src/syscall/fd.h +++ b/src/syscall/fd.h @@ -66,3 +66,15 @@ int64_t timerfd_read(int guest_fd, * writes a byte to make poll/epoll see readability. */ void signalfd_notify(int signum); + +/* Snapshot per-fd state for /proc/self/fdinfo. 
Each accessor returns true when + * the guest_fd refers to a live instance of that special-fd type. The values + * are read under sfd_lock so concurrent read/write/settime cannot tear them. + */ +bool eventfd_fdinfo_snapshot(int guest_fd, uint64_t *count_out); +bool signalfd_fdinfo_snapshot(int guest_fd, uint64_t *mask_out); +bool timerfd_fdinfo_snapshot(int guest_fd, + int *clockid_out, + uint64_t *ticks_out, + int64_t *value_ns_out, + int64_t *interval_ns_out); diff --git a/src/syscall/fs.c b/src/syscall/fs.c index f7860f5..90a3c97 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -66,17 +66,15 @@ static const char *proc_virtual_dir_path(const char *path, static const char *proc_stateful_file_path(const char *path) { - if (!path || strncmp(path, "/proc", 5) != 0) + if (!path || strncmp(path, "/proc/", 6) != 0) return NULL; if (!strcmp(path, "/proc/self/oom_score_adj") || - !strcmp(path, "/proc/self/oom_adj")) { + !strcmp(path, "/proc/self/oom_adj") || + !strcmp(path, "/proc/self/oom_score")) { return path; } - if (strncmp(path, "/proc/", 6) != 0) - return NULL; - char *endp; long pid = strtol(path + 6, &endp, 10); if (endp == path + 6 || pid != (long) proc_get_pid()) @@ -86,6 +84,8 @@ static const char *proc_stateful_file_path(const char *path) return "/proc/self/oom_score_adj"; if (!strcmp(endp, "/oom_adj")) return "/proc/self/oom_adj"; + if (!strcmp(endp, "/oom_score")) + return "/proc/self/oom_score"; return NULL; } @@ -117,9 +117,14 @@ static const char *proc_virtual_dir_path(const char *path, virt = "/proc"; } else if (!strcmp(path, "/proc/self") || !strcmp(path, "/proc/self/")) { virt = "/proc/self"; + } else if (!strcmp(path, "/proc/net") || !strcmp(path, "/proc/net/")) { + virt = "/proc/net"; } else if (!strcmp(path, "/proc/self/fd") || !strcmp(path, "/proc/self/fd/")) { virt = "/proc/self/fd"; + } else if (!strcmp(path, "/proc/self/fdinfo") || + !strcmp(path, "/proc/self/fdinfo/")) { + virt = "/proc/self/fdinfo"; } else if (!strcmp(path, 
"/proc/self/task") || !strcmp(path, "/proc/self/task/")) { virt = "/proc/self/task"; @@ -137,6 +142,9 @@ static const char *proc_virtual_dir_path(const char *path, if (endp != path + 6 && pid == (long) proc_get_pid() && (*endp == '\0' || !strcmp(endp, "/"))) { virt = "/proc/self"; + } else if (endp != path + 6 && pid == (long) proc_get_pid() && + (!strcmp(endp, "/fdinfo") || !strcmp(endp, "/fdinfo/"))) { + virt = "/proc/self/fdinfo"; } else if (endp != path + 6 && pid == (long) proc_get_pid() && !strcmp(endp, "/fd")) { virt = "/proc/self/fd"; diff --git a/src/syscall/io.c b/src/syscall/io.c index b938f42..3fa5b13 100644 --- a/src/syscall/io.c +++ b/src/syscall/io.c @@ -479,6 +479,66 @@ static int64_t host_fd_ref_open_regular_io(int guest_fd, host_fd_ref_t *ref) return host_fd_ref_open_io(guest_fd, ref); } +static int64_t proc_try_read_intercept(int fd, + int host_fd, + void *buf, + size_t count, + int64_t offset, + int use_pread) +{ + ssize_t intercepted = 0; + int handled = proc_intercept_read(fd, buf, count, offset, &intercepted); + if (handled < 0) + return linux_errno(); + if (handled > 0) { + if (!use_pread && + lseek(host_fd, offset + (int64_t) intercepted, SEEK_SET) < 0) + return linux_errno(); + return intercepted; + } + return INT64_MIN; +} + +static int64_t proc_try_readv_intercept(int fd, + int host_fd, + const struct iovec *iov, + int iovcnt, + int64_t offset, + int use_pread) +{ + ssize_t intercepted = 0; + int handled = proc_intercept_readv(fd, iov, iovcnt, offset, &intercepted); + if (handled < 0) + return linux_errno(); + if (handled > 0) { + if (!use_pread && + lseek(host_fd, offset + (int64_t) intercepted, SEEK_SET) < 0) + return linux_errno(); + return intercepted; + } + return INT64_MIN; +} + +/* Sendfile/copy_file_range chunk read: route the chunk through proc_intercept + * when the source fd is a synthetic /proc node, otherwise fall through + * (INT64_MIN). 
For the streaming (use_pread=0) variant the input offset is + * irrelevant; the helper queries the live host fd cursor. + */ +static int64_t proc_try_chunk_read_intercept(int fd, + int host_fd, + void *buf, + size_t count, + int64_t offset, + int use_pread) +{ + if (!use_pread) { + offset = lseek(host_fd, 0, SEEK_CUR); + if (offset < 0) + return INT64_MIN; + } + return proc_try_read_intercept(fd, host_fd, buf, count, offset, use_pread); +} + static int64_t proc_try_writev_intercept(int fd, int host_fd, const struct iovec *iov, @@ -613,6 +673,16 @@ int64_t sys_read(guest_t *g, int fd, uint64_t buf_gva, uint64_t count) if (count > avail) count = avail; + off_t offset = lseek(host_ref.fd, 0, SEEK_CUR); + if (offset >= 0) { + int64_t intercepted = + proc_try_read_intercept(fd, host_ref.fd, buf, count, offset, 0); + if (intercepted != INT64_MIN) { + host_fd_ref_close(&host_ref); + return intercepted; + } + } + ssize_t ret = read(host_ref.fd, buf, count); host_fd_ref_close(&host_ref); return ret < 0 ? linux_errno() : ret; @@ -642,6 +712,13 @@ int64_t sys_pread64(guest_t *g, if (count > avail) count = avail; + int64_t intercepted = + proc_try_read_intercept(fd, host_ref.fd, buf, count, offset, 1); + if (intercepted != INT64_MIN) { + host_fd_ref_close(&host_ref); + return intercepted; + } + ssize_t ret = pread(host_ref.fd, buf, count, offset); host_fd_ref_close(&host_ref); return ret < 0 ? linux_errno() : ret; @@ -832,6 +909,17 @@ int64_t sys_readv(guest_t *g, int fd, uint64_t iov_gva, int iovcnt) return err; } + off_t offset = lseek(host_ref.fd, 0, SEEK_CUR); + if (offset >= 0) { + int64_t intercepted = proc_try_readv_intercept( + fd, host_ref.fd, host_iov.iov, iovcnt, offset, 0); + if (intercepted != INT64_MIN) { + host_iov_free(&host_iov); + host_fd_ref_close(&host_ref); + return intercepted; + } + } + ssize_t ret = readv(host_ref.fd, host_iov.iov, iovcnt); int64_t result = ret < 0 ? 
linux_errno() : ret; host_iov_free(&host_iov); @@ -919,6 +1007,14 @@ int64_t sys_preadv(guest_t *g, return err; } + int64_t intercepted = proc_try_readv_intercept( + fd, host_ref.fd, host_iov.iov, iovcnt, offset, 1); + if (intercepted != INT64_MIN) { + host_iov_free(&host_iov); + host_fd_ref_close(&host_ref); + return intercepted; + } + ssize_t ret = preadv(host_ref.fd, host_iov.iov, iovcnt, offset); int64_t result = ret < 0 ? linux_errno() : ret; host_iov_free(&host_iov); @@ -1354,9 +1450,19 @@ int64_t sys_sendfile(guest_t *g, size_t chunk = remaining > sizeof(buf) ? sizeof(buf) : remaining; ssize_t nr; if (offset >= 0) { - nr = pread(in_ref.fd, buf, chunk, offset); + int64_t intercepted = proc_try_chunk_read_intercept( + in_fd, in_ref.fd, buf, chunk, offset, 1); + if (intercepted != INT64_MIN) + nr = intercepted; + else + nr = pread(in_ref.fd, buf, chunk, offset); } else { - nr = read(in_ref.fd, buf, chunk); + int64_t intercepted = proc_try_chunk_read_intercept( + in_fd, in_ref.fd, buf, chunk, 0, 0); + if (intercepted != INT64_MIN) + nr = intercepted; + else + nr = read(in_ref.fd, buf, chunk); } if (nr < 0) { if (total > 0) @@ -1443,9 +1549,19 @@ int64_t sys_copy_file_range(guest_t *g, size_t chunk = remaining > sizeof(buf) ? 
sizeof(buf) : remaining; ssize_t nr; if (off_in >= 0) { - nr = pread(in_ref.fd, buf, chunk, off_in); + int64_t intercepted = proc_try_chunk_read_intercept( + fd_in, in_ref.fd, buf, chunk, off_in, 1); + if (intercepted != INT64_MIN) + nr = intercepted; + else + nr = pread(in_ref.fd, buf, chunk, off_in); } else { - nr = read(in_ref.fd, buf, chunk); + int64_t intercepted = proc_try_chunk_read_intercept( + fd_in, in_ref.fd, buf, chunk, 0, 0); + if (intercepted != INT64_MIN) + nr = intercepted; + else + nr = read(in_ref.fd, buf, chunk); } if (nr < 0) { if (total > 0) diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index 11794eb..36f8c61 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -1583,7 +1583,12 @@ int syscall_dispatch(hv_vcpu_t vcpu, guest_t *g, int *exit_code, bool verbose) if (tp != FD_REGULAR && tp != FD_STDIO && tp != FD_PIPE && tp != FD_SOCKET) goto slow_path; - if (nr == SYS_write && fd_table[fd].proc_path[0] != '\0') + /* Proc-backed fds may need synthetic read/write handling (for + * example, oom_* rereads recompute content on each read and proc + * dirfds steer relative *at() resolution). Keep them on the slow + * path so the proc interceptors run. 
+ */ + if (fd_table[fd].proc_path[0] != '\0') goto slow_path; host_fd_ref_t host_ref; diff --git a/tests/test-io-opt.c b/tests/test-io-opt.c index 263c732..e1691c1 100644 --- a/tests/test-io-opt.c +++ b/tests/test-io-opt.c @@ -16,6 +16,15 @@ #include "test-harness.h" +static void reset_oom_score_adj(void) +{ + int fd = open("/proc/self/oom_score_adj", O_RDWR); + if (fd >= 0) { + write(fd, "0\n", 2); + close(fd); + } +} + int main(void) { int passes = 0, fails = 0; @@ -79,6 +88,52 @@ int main(void) } } + TEST("sendfile rereads synthetic oom proc source"); + { + const char *proc_dst = "/tmp/elfuse-test-proc-sendfile.txt"; + unlink(proc_dst); + reset_oom_score_adj(); + + int in_fd = open("/proc/self/oom_adj", O_RDONLY); + int score_fd = open("/proc/self/oom_score_adj", O_RDWR); + int out_fd = open(proc_dst, O_CREAT | O_WRONLY | O_TRUNC, 0644); + if (in_fd >= 0 && score_fd >= 0 && out_fd >= 0) { + char buf[32] = {0}; + off_t offset = 0; + ssize_t wrote = write(score_fd, "1000\n", 5); + ssize_t copied = + wrote == 5 ? 
sendfile(out_fd, in_fd, &offset, 32) : -1; + close(out_fd); + close(score_fd); + close(in_fd); + + int verify_fd = open(proc_dst, O_RDONLY); + if (copied >= 0 && verify_fd >= 0) { + ssize_t n = read(verify_fd, buf, sizeof(buf) - 1); + close(verify_fd); + if (copied == 3 && offset == 3 && n == 3 && + memcmp(buf, "15\n", 3) == 0) + PASS(); + else + FAIL("unexpected sendfile proc content"); + } else { + if (verify_fd >= 0) + close(verify_fd); + FAIL("proc sendfile setup failed"); + } + } else { + if (in_fd >= 0) + close(in_fd); + if (score_fd >= 0) + close(score_fd); + if (out_fd >= 0) + close(out_fd); + PASS(); + } + reset_oom_score_adj(); + unlink(proc_dst); + } + /* Test fsync */ TEST("fsync"); { @@ -151,6 +206,54 @@ int main(void) unlink(cfr_dst); } + TEST("copy_file_range rereads synthetic oom proc source"); + { + const char *proc_dst = "/tmp/elfuse-test-proc-cfr.txt"; + unlink(proc_dst); + reset_oom_score_adj(); + + int in_fd = open("/proc/self/oom_adj", O_RDONLY); + int score_fd = open("/proc/self/oom_score_adj", O_RDWR); + int out_fd = open(proc_dst, O_CREAT | O_WRONLY | O_TRUNC, 0644); + if (in_fd >= 0 && score_fd >= 0 && out_fd >= 0) { + char buf[32] = {0}; + off_t off_in = 0, off_out = 0; + ssize_t wrote = write(score_fd, "1000\n", 5); + ssize_t copied = + wrote == 5 + ? 
copy_file_range(in_fd, &off_in, out_fd, &off_out, 32, 0) + : -1; + close(out_fd); + close(score_fd); + close(in_fd); + + int verify_fd = open(proc_dst, O_RDONLY); + if (copied >= 0 && verify_fd >= 0) { + ssize_t n = read(verify_fd, buf, sizeof(buf) - 1); + close(verify_fd); + if (copied == 3 && off_in == 3 && off_out == 3 && n == 3 && + memcmp(buf, "15\n", 3) == 0) + PASS(); + else + FAIL("unexpected copy_file_range proc content"); + } else { + if (verify_fd >= 0) + close(verify_fd); + FAIL("proc copy_file_range setup failed"); + } + } else { + if (in_fd >= 0) + close(in_fd); + if (score_fd >= 0) + close(score_fd); + if (out_fd >= 0) + close(out_fd); + PASS(); + } + reset_oom_score_adj(); + unlink(proc_dst); + } + /* Cleanup */ unlink(src_path); unlink(dst_path); diff --git a/tests/test-netstat.c b/tests/test-netstat.c index f275a65..822159c 100644 --- a/tests/test-netstat.c +++ b/tests/test-netstat.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include static int read_proc_file(const char *path, char *buf, size_t bufsz) { @@ -41,6 +43,35 @@ int main(void) int pass = 0, fail = 0; char buf[8192]; + /* 0. Verify /proc/net exists as a directory with expected children. */ + struct stat st; + if (stat("/proc/net", &st) == 0 && S_ISDIR(st.st_mode)) { + DIR *dir = opendir("/proc/net"); + if (dir) { + int found_tcp = 0, found_udp = 0, found_unix = 0; + struct dirent *de; + while ((de = readdir(dir))) { + found_tcp |= !strcmp(de->d_name, "tcp"); + found_udp |= !strcmp(de->d_name, "udp"); + found_unix |= !strcmp(de->d_name, "unix"); + } + closedir(dir); + if (found_tcp && found_udp && found_unix) { + printf("PASS: /proc/net enumerates synthetic socket tables\n"); + pass++; + } else { + printf("FAIL: /proc/net missing expected entries\n"); + fail++; + } + } else { + printf("FAIL: cannot open /proc/net: %s\n", strerror(errno)); + fail++; + } + } else { + printf("FAIL: /proc/net is not a directory: %s\n", strerror(errno)); + fail++; + } + /* 1. 
TCP listener on 127.0.0.1:7777 */ int tcp_fd = socket(AF_INET, SOCK_STREAM, 0); if (tcp_fd < 0) { diff --git a/tests/test-proc.c b/tests/test-proc.c index 754b0bc..c2735a5 100644 --- a/tests/test-proc.c +++ b/tests/test-proc.c @@ -144,6 +144,22 @@ int main(void) FAIL("readlink failed"); } + TEST("readlink /proc//exe aliases /proc/self/exe"); + { + char path[64]; + char self_buf[4096], pid_buf[4096]; + snprintf(path, sizeof(path), "/proc/%d/exe", getpid()); + ssize_t self_n = + readlink("/proc/self/exe", self_buf, sizeof(self_buf) - 1); + ssize_t pid_n = readlink(path, pid_buf, sizeof(pid_buf) - 1); + if (self_n > 0 && pid_n > 0) { + self_buf[self_n] = '\0'; + pid_buf[pid_n] = '\0'; + EXPECT_TRUE(!strcmp(self_buf, pid_buf), "exe targets differ"); + } else + FAIL("readlink failed"); + } + /* openat(procfd, "/stat"): proc walkers keep /proc as a dirfd. */ TEST("openat /proc//stat"); { diff --git a/tests/test-tier-b.c b/tests/test-tier-b.c index 0ffee12..8594524 100644 --- a/tests/test-tier-b.c +++ b/tests/test-tier-b.c @@ -7,6 +7,7 @@ * parity, /proc/self/oom_score_adj, /proc/self/fdinfo, cpuinfo scaling. */ +#include #include #include #include @@ -14,11 +15,18 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include #include +#include #include +#include #include #include "test-harness.h" @@ -604,6 +612,738 @@ static void test_proc_fdinfo(void) } } +static void test_proc_oom_score_adj_rejects_out_of_range(void) +{ + TEST("/proc/self/oom_score_adj rejects out-of-range writes"); + int fd = open("/proc/self/oom_score_adj", O_RDWR); + if (fd < 0) { + FAIL("open"); + return; + } + /* Linux validates the input domain on the writer side; the kernel + * returns EINVAL for any value outside [-1000, 1000]. 
*/ + const char too_high[] = "1001\n"; + ssize_t rc = write(fd, too_high, sizeof(too_high) - 1); + int saved = errno; + close(fd); + if (rc < 0 && saved == EINVAL) + PASS(); + else + FAIL("expected -EINVAL"); +} + +static void test_proc_oom_adj_scaling(void) +{ + TEST("/proc/self/oom_adj scales to oom_score_adj"); + /* Reset to a known starting value so test ordering does not matter. */ + int z = open("/proc/self/oom_score_adj", O_RDWR); + if (z >= 0) { + write(z, "0\n", 2); + close(z); + } + + int fd = open("/proc/self/oom_adj", O_RDWR); + if (fd < 0) { + /* Some Linux configs deprecate oom_adj; treat absence as OK. */ + PASS(); + return; + } + /* Linux fs/proc/base.c oom_adj_write special-cases OOM_ADJUST_MAX so + * 15 maps directly to OOM_SCORE_ADJ_MAX (1000), not 15*1000/17 = 882. */ + if (write(fd, "15\n", 3) != 3) { + close(fd); + FAIL("write"); + return; + } + close(fd); + + int sa = open("/proc/self/oom_score_adj", O_RDONLY); + if (sa < 0) { + FAIL("reopen oom_score_adj"); + return; + } + char buf[32] = {0}; + ssize_t n = read(sa, buf, sizeof(buf) - 1); + close(sa); + if (n <= 0) { + FAIL("read"); + return; + } + int score = atoi(buf); + EXPECT_TRUE(score == 1000, "oom_adj=15 should map to oom_score_adj=1000"); +} + +static void test_proc_oom_adj_same_fd_roundtrip(void) +{ + TEST("/proc/self/oom_adj same-fd readback stays legacy"); + + int reset = open("/proc/self/oom_score_adj", O_RDWR); + if (reset >= 0) { + write(reset, "0\n", 2); + close(reset); + } + + int fd = open("/proc/self/oom_adj", O_RDWR); + if (fd < 0) { + PASS(); + return; + } + if (write(fd, "15\n", 3) != 3) { + close(fd); + FAIL("write"); + return; + } + if (lseek(fd, 0, SEEK_SET) < 0) { + close(fd); + FAIL("lseek"); + return; + } + + char buf[32] = {0}; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + close(fd); + reset = open("/proc/self/oom_score_adj", O_RDWR); + if (reset >= 0) { + write(reset, "0\n", 2); + close(reset); + } + if (n <= 0) { + FAIL("read"); + return; + } + 
EXPECT_TRUE(atoi(buf) == 15, "same-fd readback should preserve oom_adj"); +} + +static void test_proc_oom_score_no_write(void) +{ + TEST("/proc/self/oom_score writes are rejected"); + /* Linux: open succeeds (root bypasses the 0444 check, non-root sees + * EACCES from the permission gate); writes always fail because there + * is no write handler. The test focuses on the write side, which is + * uniform across uids. + */ + int fd = open("/proc/self/oom_score", O_RDONLY); + if (fd < 0) { + FAIL("open RDONLY"); + return; + } + char buf[32] = {0}; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + close(fd); + if (n <= 0) { + FAIL("read"); + return; + } + /* Stub returns 0; real Linux computes a small positive score, but for + * a userspace bridge a constant zero is acceptable. + */ + EXPECT_TRUE(atoi(buf) >= 0, "score must be non-negative"); +} + +static void test_proc_oom_score_write_fails(void) +{ + TEST("/proc/self/oom_score write is rejected"); + int fd = open("/proc/self/oom_score", O_WRONLY); + if (fd < 0) { + /* Non-root environments cannot open read-only file for write; + * that is also acceptable proof the file is not writable. + */ + if (errno == EACCES) { + PASS(); + return; + } + FAIL("open WRONLY"); + return; + } + ssize_t w = write(fd, "0\n", 2); + int saved = errno; + close(fd); + /* Linux's proc_reg_write returns -EIO when the proc node has no + * write op. Older or stripped kernels may return other errno; the + * load-bearing assertion is that the write fails, not the exact + * errno value. 
 */
    if (w < 0)
        PASS();
    else
        printf("FAIL: write succeeded rc=%zd errno=%d\n", w, saved), fails++;
}

/* oom_score is read-only: a writable open must be rejected up front with
 * EACCES rather than deferring the failure to a later write(2). */
static void test_proc_oom_score_open_enforces_read_only(void)
{
    TEST("/proc/self/oom_score rejects writable open");
    errno = 0;
    int fd = open("/proc/self/oom_score", O_WRONLY);
    if (fd >= 0) {
        close(fd);
        FAIL("open should fail");
        return;
    }
    EXPECT_TRUE(errno == EACCES, "expected EACCES from open");
}

/* An already-open legacy oom_adj fd, rewound and reread, must observe a
 * later oom_score_adj write: 1000 back-clamps to the legacy maximum 15.
 * NOTE(review): a failed open of oom_adj is treated as PASS — presumably
 * the legacy node is optional on some configurations; confirm intent. */
static void test_proc_oom_adj_reread_tracks_score_adj_updates(void)
{
    TEST("/proc/self/oom_adj reread reflects later score_adj writes");

    /* Start from a known baseline of 0 so the initial read is predictable. */
    int reset = open("/proc/self/oom_score_adj", O_RDWR);
    if (reset < 0) {
        FAIL("reset open");
        return;
    }
    if (write(reset, "0\n", 2) != 2) {
        close(reset);
        FAIL("reset write");
        return;
    }
    close(reset);

    int fd = open("/proc/self/oom_adj", O_RDONLY);
    if (fd < 0) {
        PASS();
        return;
    }

    char buf[32] = {0};
    ssize_t n = read(fd, buf, sizeof(buf) - 1);
    if (n <= 0 || atoi(buf) != 0) {
        close(fd);
        FAIL("initial read");
        return;
    }

    /* Bump the process-wide value through the modern node while the legacy
     * fd stays open. */
    int score = open("/proc/self/oom_score_adj", O_RDWR);
    if (score < 0) {
        close(fd);
        FAIL("score_adj open");
        return;
    }
    if (write(score, "1000\n", 5) != 5) {
        close(score);
        close(fd);
        FAIL("score_adj write");
        return;
    }
    close(score);

    if (lseek(fd, 0, SEEK_SET) < 0) {
        close(fd);
        FAIL("lseek");
        return;
    }

    memset(buf, 0, sizeof(buf));
    n = read(fd, buf, sizeof(buf) - 1);
    close(fd);

    /* Best-effort restore so later tests see the baseline again. */
    reset = open("/proc/self/oom_score_adj", O_RDWR);
    if (reset >= 0) {
        write(reset, "0\n", 2);
        close(reset);
    }

    if (n <= 0) {
        FAIL("reread");
        return;
    }
    EXPECT_TRUE(atoi(buf) == 15, "oom_adj fd should reflect current score_adj");
}

/* Same shape as the legacy-scaling test above, but on oom_score_adj itself:
 * a rewound fd must read back the value written through a second fd (1000),
 * proving reads hit live process state, not a per-open snapshot. */
static void test_proc_oom_score_adj_reread_tracks_updates(void)
{
    TEST("/proc/self/oom_score_adj reread reflects later writes");

    /* Start from a known baseline of 0. */
    int reset = open("/proc/self/oom_score_adj", O_RDWR);
    if (reset < 0) {
        FAIL("reset open");
        return;
    }
    if (write(reset, "0\n", 2) != 2) {
        close(reset);
        FAIL("reset write");
        return;
    }
    close(reset);

    int fd = open("/proc/self/oom_score_adj", O_RDONLY);
    if (fd < 0) {
        FAIL("open");
        return;
    }

    char buf[32] = {0};
    ssize_t n = read(fd, buf, sizeof(buf) - 1);
    if (n <= 0 || atoi(buf) != 0) {
        close(fd);
        FAIL("initial read");
        return;
    }

    /* Update through a second fd while the first stays open. */
    int update = open("/proc/self/oom_score_adj", O_RDWR);
    if (update < 0) {
        close(fd);
        FAIL("update open");
        return;
    }
    if (write(update, "1000\n", 5) != 5) {
        close(update);
        close(fd);
        FAIL("update write");
        return;
    }
    close(update);

    if (lseek(fd, 0, SEEK_SET) < 0) {
        close(fd);
        FAIL("lseek");
        return;
    }

    memset(buf, 0, sizeof(buf));
    n = read(fd, buf, sizeof(buf) - 1);
    close(fd);

    /* Best-effort restore of the baseline. */
    reset = open("/proc/self/oom_score_adj", O_RDWR);
    if (reset >= 0) {
        write(reset, "0\n", 2);
        close(reset);
    }

    if (n <= 0) {
        FAIL("reread");
        return;
    }
    EXPECT_TRUE(atoi(buf) == 1000,
                "oom_score_adj fd should reflect current value");
}

/* A writev whose iovecs sum to zero bytes must succeed with 0, matching
 * Linux semantics for proc nodes. */
static void test_proc_oom_zero_length_writev(void)
{
    TEST("/proc/self/oom_score_adj zero-length writev returns 0");
    int fd = open("/proc/self/oom_score_adj", O_WRONLY);
    if (fd < 0) {
        FAIL("open");
        return;
    }
    /* Two empty iovecs: total length zero. Linux returns 0; the previous
     * implementation returned EINVAL via proc_parse_int_write. */
    char dummy = 0;
    struct iovec iov[2] = {{&dummy, 0}, {&dummy, 0}};
    ssize_t n = writev(fd, iov, 2);
    int saved = errno; /* capture before close() can clobber errno */
    close(fd);
    if (n == 0)
        PASS();
    else
        printf("FAIL: writev returned %zd errno=%d\n", n, saved), fails++;
}

/* Synthetic proc files must stat as size 0 so callers that size buffers
 * from st_size cannot truncate the content. */
static void test_proc_oom_stat_size_zero(void)
{
    TEST("/proc/self/oom_score_adj stat reports size 0");
    struct stat st;
    if (stat("/proc/self/oom_score_adj", &st) < 0) {
        FAIL("stat");
        return;
    }
    /* A non-zero st_size would cap stat-sized read buffers, truncating
     * "-1000\n" (6 bytes) to whatever size was hardcoded.
*/ + EXPECT_TRUE(st.st_size == 0, "st_size should be 0"); +} + +static void test_proc_fdinfo_eventfd_count(void) +{ + TEST("/proc/self/fdinfo/ exposes eventfd-count"); + int efd = eventfd(42, 0); + if (efd < 0) { + FAIL("eventfd"); + return; + } + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", efd); + int fd = open(path, O_RDONLY); + if (fd < 0) { + close(efd); + FAIL("open"); + return; + } + char buf[256] = {0}; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + close(fd); + close(efd); + if (n <= 0) { + FAIL("read"); + return; + } + /* Linux fs/eventfd.c emits "eventfd-count: %16llx" with a single + * space separator (not a tab, unlike pos:/flags:/mnt_id:). Pin the + * exact prefix so a regression to a tab is caught. Decimal 42 is 0x2a. + */ + const char *p = strstr(buf, "eventfd-count: "); + EXPECT_TRUE(p && strstr(p, "2a") != NULL, + "eventfd-count missing space separator or wrong hex value"); +} + +static void test_proc_fdinfo_signalfd_mask(void) +{ + TEST("/proc/self/fdinfo/ exposes sigmask"); + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGUSR1); + int sfd = signalfd(-1, &mask, 0); + if (sfd < 0) { + FAIL("signalfd"); + return; + } + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", sfd); + int fd = open(path, O_RDONLY); + if (fd < 0) { + close(sfd); + FAIL("open"); + return; + } + char buf[256] = {0}; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + close(fd); + close(sfd); + if (n <= 0) { + FAIL("read"); + return; + } + /* Linux fs/signalfd.c emits "sigmask:\t%016llx" with a tab separator + * (verified against a real /proc/self/fdinfo dump on Linux 6.x). + * Pin the exact prefix so a regression to a space is caught. 
+ */ + EXPECT_TRUE(strstr(buf, "sigmask:\t") != NULL, + "sigmask missing tab separator"); +} + +static void test_proc_fdinfo_timerfd_periodic_value(void) +{ + TEST("/proc/self/fdinfo/ reports periodic timerfd next expiry"); + int tfd = timerfd_create(CLOCK_MONOTONIC, 0); + if (tfd < 0) { + FAIL("timerfd_create"); + return; + } + + struct itimerspec its = {.it_value = {.tv_sec = 0, .tv_nsec = 50000000}, + .it_interval = {.tv_sec = 0, .tv_nsec = 50000000}}; + if (timerfd_settime(tfd, 0, &its, NULL) < 0) { + close(tfd); + FAIL("timerfd_settime"); + return; + } + + usleep(70000); + + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", tfd); + int fd = open(path, O_RDONLY); + if (fd < 0) { + close(tfd); + FAIL("open"); + return; + } + + char buf[256] = {0}; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + close(fd); + close(tfd); + if (n <= 0) { + FAIL("read"); + return; + } + + long long value_sec = -1, value_nsec = -1; + long long interval_sec = -1, interval_nsec = -1; + /* Linux fs/timerfd.c emits "it_value: (S, NS)" with a single space + * after the colon (unlike pos:/flags: which use tabs). */ + const char *value = strstr(buf, "it_value: ("); + const char *interval = strstr(buf, "it_interval: ("); + if (!value || !interval || + sscanf(value, "it_value: (%lld, %lld)", &value_sec, &value_nsec) != 2 || + sscanf(interval, "it_interval: (%lld, %lld)", &interval_sec, + &interval_nsec) != 2) { + FAIL("parse fdinfo"); + return; + } + + long long value_total_ns = value_sec * 1000000000LL + value_nsec; + long long interval_total_ns = interval_sec * 1000000000LL + interval_nsec; + /* it_interval is the static settime value and must round-trip; Linux's + * timerfd_get_remaining() reports 0 once the timer has fired, while + * elfuse computes time-until-next from the kqueue arm time. Both are + * non-negative and bounded by the interval, so accept either form. 
+ */ + EXPECT_TRUE(interval_total_ns == 50000000 && value_total_ns >= 0 && + value_total_ns <= interval_total_ns, + "interval should round-trip and value should be within bounds"); +} + +static void test_proc_fdinfo_timerfd_ticks_drains_kqueue(void) +{ + TEST("/proc/self/fdinfo/ ticks reflects pending kqueue fires"); + /* Arm a periodic timer, wait for several fires, then read fdinfo + * WITHOUT first reading the timerfd. The pre-fix snapshot exported + * a stale expirations counter (the kqueue events had not been folded + * in), so ticks would read 0 even after multiple fires. Linux's + * fs/timerfd.c snapshots ticks under the wait-queue lock, where the + * counter reflects every fire that hit the kernel state. */ + int tfd = timerfd_create(CLOCK_MONOTONIC, 0); + if (tfd < 0) { + FAIL("timerfd_create"); + return; + } + struct itimerspec its = {.it_value = {.tv_sec = 0, .tv_nsec = 20000000}, + .it_interval = {.tv_sec = 0, .tv_nsec = 20000000}}; + if (timerfd_settime(tfd, 0, &its, NULL) < 0) { + close(tfd); + FAIL("timerfd_settime"); + return; + } + /* Wait long enough for the timer to fire at least three times. */ + usleep(120000); + + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", tfd); + int fd = open(path, O_RDONLY); + if (fd < 0) { + close(tfd); + FAIL("open"); + return; + } + char buf[256] = {0}; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + close(fd); + close(tfd); + if (n <= 0) { + FAIL("read"); + return; + } + + /* Linux uses "ticks: %llu" with a single space; elfuse matches. */ + const char *p = strstr(buf, "ticks: "); + unsigned long long ticks = 0; + if (!p || sscanf(p, "ticks: %llu", &ticks) != 1) { + FAIL("parse ticks"); + return; + } + /* At minimum one fire should be visible; on a slow host more would + * be expected. Pre-fix elfuse would report 0 here. 
 */
    EXPECT_TRUE(ticks >= 1, "ticks should reflect at least one fire");
}

/* Two simultaneous opens of the fdinfo directory must each enumerate a
 * full, independent snapshot (at least stdin/stdout/stderr). */
static void test_proc_fdinfo_dir_concurrent_safe(void)
{
    TEST("/proc/self/fdinfo dir tolerates concurrent re-open");
    /* Open the directory twice and verify both enumerate independently.
     * The earlier shared-dir design could mutate one open's backing files
     * while another iterated. Both Linux and the per-open scratch fix
     * should at minimum surface stdin/out/err on each enumeration.
     */
    DIR *d1 = opendir("/proc/self/fdinfo");
    if (!d1) {
        FAIL("opendir 1");
        return;
    }
    DIR *d2 = opendir("/proc/self/fdinfo");
    if (!d2) {
        closedir(d1);
        FAIL("opendir 2");
        return;
    }

    /* Count non-dot entries in each snapshot. */
    int n1 = 0, n2 = 0;
    struct dirent *ent;
    while ((ent = readdir(d1)))
        if (ent->d_name[0] != '.')
            n1++;
    while ((ent = readdir(d2)))
        if (ent->d_name[0] != '.')
            n2++;
    closedir(d1);
    closedir(d2);
    EXPECT_TRUE(n1 >= 3 && n2 >= 3, "concurrent enumeration broken");
}

/* openat() relative to an open fdinfo dirfd must resolve to the synthetic
 * per-fd payload (pos:/flags: lines), not a stale on-disk file. */
static void test_proc_fdinfo_dirfd_openat_uses_virtual_entries(void)
{
    TEST("/proc/self/fdinfo dirfd openat resolves virtually");
    int dirfd = open("/proc/self/fdinfo", O_RDONLY | O_DIRECTORY);
    if (dirfd < 0) {
        FAIL("open dir");
        return;
    }

    int fd = openat(dirfd, "0", O_RDONLY);
    close(dirfd);
    if (fd < 0) {
        FAIL("openat");
        return;
    }

    char buf[256] = {0};
    ssize_t n = read(fd, buf, sizeof(buf) - 1);
    close(fd);
    if (n <= 0) {
        FAIL("read");
        return;
    }

    EXPECT_TRUE(strstr(buf, "pos:\t") && strstr(buf, "flags:\t"),
                "fdinfo openat should yield synthetic payload");
}

/* Helper: create an IPv4 TCP socket, bind it to an ephemeral loopback
 * port, and put it in the listen state. Returns the fd, or -1 on any
 * failure (the socket is closed on the error paths). The setsockopt
 * result is deliberately ignored: SO_REUSEADDR is best-effort here. */
static int bind_listen_loopback_tcp(void)
{
    int s = socket(AF_INET, SOCK_STREAM, 0);
    if (s < 0)
        return -1;
    int one = 1;
    setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
    struct sockaddr_in sa = {0};
    sa.sin_family = AF_INET;
    sa.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    sa.sin_port = 0; /* ephemeral port chosen by the stack */
    if (bind(s, (struct sockaddr *) &sa, sizeof(sa)) < 0 || listen(s, 1) < 0) {
        close(s);
        return -1;
    }

    return s;
}

/* Regression test: /proc/net/tcp row serials ("sl") must be dense even
 * when the fd table holds non-TCP sockets that the table iterator skips. */
static void test_proc_net_tcp_sl_dense(void)
{
    TEST("/proc/net/tcp sl column stays dense across mixed sockets");
    /* Interleave non-TCP sockets BEFORE the bound TCP listeners so the
     * proc_pidinfo iterator visits the rejected sockets first and the
     * pre-fix sparse-slot bug would assign nonzero sl to the first
     * emitted row. Two TCP listeners ensure the second row's sl exposes
     * any gap created by additional non-TCP visits between them.
     *
     * Pre-fix: udp1, udp2, sp[0], sp[1] all bump the iterator slot
     * counter to 4 before tcp1 emits. tcp1 row: sl=4. tcp2 row: sl=5.
     * The first-row check (sl == 0) would fail.
     * Post-fix: only emitted rows increment the visitor's row counter;
     * tcp1: sl=0, tcp2: sl=1. Dense.
     */
    int udp1 = socket(AF_INET, SOCK_DGRAM, 0);
    int udp2 = socket(AF_INET, SOCK_DGRAM, 0);
    int sp[2] = {-1, -1};
    int sp_rc = socketpair(AF_UNIX, SOCK_STREAM, 0, sp);
    int tcp1 = bind_listen_loopback_tcp();
    int tcp2 = bind_listen_loopback_tcp();
    if (udp1 < 0 || udp2 < 0 || sp_rc < 0 || tcp1 < 0 || tcp2 < 0) {
        /* Close whichever sockets did get created before bailing out. */
        if (udp1 >= 0)
            close(udp1);
        if (udp2 >= 0)
            close(udp2);
        if (sp[0] >= 0)
            close(sp[0]);
        if (sp[1] >= 0)
            close(sp[1]);
        if (tcp1 >= 0)
            close(tcp1);
        if (tcp2 >= 0)
            close(tcp2);
        FAIL("socket setup");
        return;
    }

    int fd = open("/proc/net/tcp", O_RDONLY);
    if (fd < 0) {
        close(udp1);
        close(udp2);
        close(sp[0]);
        close(sp[1]);
        close(tcp1);
        close(tcp2);
        FAIL("open");
        return;
    }
    /* Slurp the whole table; the read size shrinks as the buffer fills so
     * the final NUL always fits. */
    char buf[16384];
    ssize_t total = 0;
    for (;;) {
        ssize_t n = read(fd, buf + total, sizeof(buf) - total - 1);
        if (n <= 0)
            break;
        total += n;
    }
    close(fd);
    close(udp1);
    close(udp2);
    close(sp[0]);
    close(sp[1]);
    close(tcp1);
    close(tcp2);
    buf[total] = '\0';

    /* Skip the header line; collect each subsequent row's leading "sl"
     * field. /proc/net/tcp's row format is " N: ..." with N a decimal
     * serial. Verify the serials form 0,1,2,... with no gaps.
 */
    char *line = strchr(buf, '\n');
    if (!line) {
        FAIL("no rows");
        return;
    }
    line++;
    int expected = 0;
    while (*line) {
        char *colon = strchr(line, ':');
        char *eol = strchr(line, '\n');
        /* Stop at the first line that has no "N:" serial before its end. */
        if (!colon || (eol && colon > eol))
            break;
        int sl = atoi(line); /* atoi skips the leading spaces before N */
        if (sl != expected) {
            printf("FAIL: sl=%d expected=%d\n", sl, expected);
            fails++;
            return;
        }
        expected++;
        if (!eol)
            break;
        line = eol + 1;
    }
    if (expected == 0) {
        /* The bound listener should have produced a row. Treat absence
         * as failure since the regression coverage depends on it. */
        FAIL("no TCP rows after bind/listen");
        return;
    }
    PASS();
}

/* openat() relative to an open /proc/net dirfd must resolve "tcp" to the
 * synthetic table (identified by its "local_address" header). */
static void test_proc_net_dirfd_openat_uses_virtual_entries(void)
{
    TEST("/proc/net dirfd openat resolves virtually");
    int dirfd = open("/proc/net", O_RDONLY | O_DIRECTORY);
    if (dirfd < 0) {
        FAIL("open dir");
        return;
    }

    int fd = openat(dirfd, "tcp", O_RDONLY);
    close(dirfd);
    if (fd < 0) {
        FAIL("openat");
        return;
    }

    char buf[512] = {0};
    ssize_t n = read(fd, buf, sizeof(buf) - 1);
    close(fd);
    if (n <= 0) {
        FAIL("read");
        return;
    }

    EXPECT_TRUE(strstr(buf, "local_address"),
                "proc net dirfd should preserve synthetic tcp table");
}
 static void test_proc_cpuinfo_all_cpus(void)
 {
     TEST("/proc/cpuinfo lists all CPUs");
@@ -668,7 +1408,25 @@ int main(void)
     /* /proc */
     test_proc_oom_score_adj();
     test_proc_oom_score_adj_persists_write();
+    test_proc_oom_score_adj_rejects_out_of_range();
+    test_proc_oom_adj_scaling();
+    test_proc_oom_adj_same_fd_roundtrip();
+    test_proc_oom_adj_reread_tracks_score_adj_updates();
+    test_proc_oom_score_adj_reread_tracks_updates();
+    test_proc_oom_score_no_write();
+    test_proc_oom_score_write_fails();
+    test_proc_oom_score_open_enforces_read_only();
+    test_proc_oom_zero_length_writev();
+    test_proc_oom_stat_size_zero();
     test_proc_fdinfo();
+    test_proc_fdinfo_eventfd_count();
+    test_proc_fdinfo_signalfd_mask();
+    test_proc_fdinfo_timerfd_periodic_value();
+    test_proc_fdinfo_timerfd_ticks_drains_kqueue();
+    test_proc_fdinfo_dir_concurrent_safe();
+    test_proc_fdinfo_dirfd_openat_uses_virtual_entries();
+    test_proc_net_tcp_sl_dense();
+    test_proc_net_dirfd_openat_uses_virtual_entries();
     test_proc_cpuinfo_all_cpus();

     /* signalfd */