diff --git a/src/runtime/procemu.c b/src/runtime/procemu.c index 7219323..61698b6 100644 --- a/src/runtime/procemu.c +++ b/src/runtime/procemu.c @@ -77,6 +77,27 @@ static char proc_tmpdir[128]; static bool proc_tmpdir_ok; static pthread_mutex_t proc_tmpdir_lock = PTHREAD_MUTEX_INITIALIZER; +/* Synthetic /sys/devices/system/cpu directory backing store. Populated lazily + * on first access (Java GC, Go runtime, libnuma probe these to size thread + * pools). Layout matches the minimal subset Linux exposes: + * /online text file: "0\n" or "0-N\n" + * /possible same + * /present same + * /cpuN/ one empty dir per CPU (cache/topology stays empty + * until a real consumer asks for those subtrees) + * Population is a one-shot snapshot taken at first call: the host CPU count + * does not change at runtime, so refresh is unnecessary. + * + * syscpu_owner_pid records the pid that ran mkdtemp so atexit-driven cleanup + * runs only in that process. clone(CLONE_VM) children inherit the host atexit + * list and the populated syscpu_dir_ok state, so without the guard a child exit + * would rmdir the parent's still-active scratch tree. + */ +static char syscpu_dir[128]; +static bool syscpu_dir_ok; +static pid_t syscpu_owner_pid; +static pthread_mutex_t syscpu_dir_lock = PTHREAD_MUTEX_INITIALIZER; + /* OOM range constants from Linux include/uapi/linux/oom.h. 
*/ #define LINUX_OOM_SCORE_ADJ_MIN (-1000) #define LINUX_OOM_SCORE_ADJ_MAX 1000 @@ -608,6 +629,8 @@ static void stat_fill_proc_dir(struct stat *st, st->st_nlink = nlink; st->st_dev = PROC_SYNTH_DEV; st->st_ino = proc_synth_ino(path); + st->st_uid = proc_get_uid(); + st->st_gid = proc_get_gid(); st->st_blksize = 4096; } @@ -674,6 +697,8 @@ static void stat_fill_proc_file(struct stat *st, mode_t mode, const char *path) st->st_nlink = 1; st->st_dev = PROC_SYNTH_DEV; st->st_ino = proc_synth_ino(path); + st->st_uid = proc_get_uid(); + st->st_gid = proc_get_gid(); st->st_size = 0; st->st_blksize = 4096; st->st_blocks = 0; @@ -1097,6 +1122,279 @@ static const char *ensure_proc_tmpdir(const guest_t *g) return proc_tmpdir; } +/* Online/possible/present format the kernel uses for cpumask range files: + * single CPU -> "0\n" + * N CPUs -> "0-N-1\n" + * Mirrors Linux bitmap_print_to_pagebuf("%*pbl"), which is what every + * /sys/devices/system/cpu cpumask file emits. + */ +static int syscpu_format_range(char *buf, size_t bufsz, int ncpu) +{ + if (ncpu <= 1) + return snprintf(buf, bufsz, "0\n"); + return snprintf(buf, bufsz, "0-%d\n", ncpu - 1); +} + +static int syscpu_count(void) +{ + int n = (int) sysconf(_SC_NPROCESSORS_ONLN); + if (n < 1) + n = 1; + return n; +} + +/* Walk syscpu_dir and remove every entry plus the dir itself. Caller is + * responsible for any owner/initialized checks; the partial-init recovery + * path needs to call this even when syscpu_dir_ok is still false. + */ +static void syscpu_dir_remove_tree(void) +{ + if (syscpu_dir[0] == '\0') + return; + + DIR *d = opendir(syscpu_dir); + if (d) { + struct dirent *ent; + char path[256]; + while ((ent = readdir(d))) { + if (ent->d_name[0] == '.' && + (ent->d_name[1] == '\0' || + (ent->d_name[1] == '.' 
&& ent->d_name[2] == '\0'))) + continue; + int n = + snprintf(path, sizeof(path), "%s/%s", syscpu_dir, ent->d_name); + if (n <= 0 || (size_t) n >= sizeof(path)) + continue; + /* cpuN entries are directories, range files are regular files. + * rmdir succeeds for the dirs, fails with ENOTDIR for files; + * unlink covers the latter without an extra stat. + */ + if (rmdir(path) < 0) + unlink(path); + } + closedir(d); + } + rmdir(syscpu_dir); +} + +static void syscpu_dir_cleanup(void) +{ + if (!syscpu_dir_ok) + return; + /* Only the process that ran mkdtemp may remove the tree. CLONE_VM children + * inherit this atexit handler and the populated state, but the scratch dir + * itself belongs to the parent. + */ + if (getpid() != syscpu_owner_pid) + return; + syscpu_dir_remove_tree(); +} + +static int syscpu_write_file(const char *dir, + const char *name, + const char *data, + size_t len) +{ + char path[160]; + if (snprintf(path, sizeof(path), "%s/%s", dir, name) >= (int) sizeof(path)) + return -1; + int fd = open(path, O_CREAT | O_TRUNC | O_WRONLY, 0444); + if (fd < 0) + return -1; + int rc = 0; + size_t off = 0; + while (off < len) { + ssize_t w = write(fd, (const char *) data + off, len - off); + if (w < 0) { + if (errno == EINTR) + continue; + rc = -1; + break; + } + off += (size_t) w; + } + close(fd); + return rc; +} + +/* Lazily build /tmp/elfuse-syscpu-XXXXXX/ with the cpumask files and one + * empty cpuN directory per host CPU. Returns the temp dir path on success, + * or NULL on failure with errno set. Any failure mid-population tears down + * the partial tree so callers never observe a half-built directory. + * Thread-safe via syscpu_dir_lock. 
+ */ +static const char *ensure_syscpu_dir(void) +{ + pthread_mutex_lock(&syscpu_dir_lock); + if (syscpu_dir_ok) { + pthread_mutex_unlock(&syscpu_dir_lock); + return syscpu_dir; + } + + str_copy_trunc(syscpu_dir, "/tmp/elfuse-syscpu-XXXXXX", sizeof(syscpu_dir)); + if (!mkdtemp(syscpu_dir)) { + syscpu_dir[0] = '\0'; + pthread_mutex_unlock(&syscpu_dir_lock); + return NULL; + } + + int ncpu = syscpu_count(); + char range[32]; + int range_len = syscpu_format_range(range, sizeof(range), ncpu); + if (range_len < 0) + range_len = 0; + + int saved_errno = 0; + static const char *cpumask_files[] = {"online", "possible", "present", + NULL}; + for (const char **f = cpumask_files; *f; f++) { + if (syscpu_write_file(syscpu_dir, *f, range, (size_t) range_len) < 0) { + saved_errno = errno; + goto fail; + } + } + + char cpu_path[160]; + for (int i = 0; i < ncpu; i++) { + if (snprintf(cpu_path, sizeof(cpu_path), "%s/cpu%d", syscpu_dir, i) >= + (int) sizeof(cpu_path)) { + saved_errno = ENAMETOOLONG; + goto fail; + } + if (mkdir(cpu_path, 0555) < 0) { + saved_errno = errno; + goto fail; + } + } + + /* Record the owner before flipping syscpu_dir_ok so the cleanup hook, + * if it ever observes the populated state, also sees the right pid. + */ + syscpu_owner_pid = getpid(); + atexit(syscpu_dir_cleanup); + syscpu_dir_ok = true; + pthread_mutex_unlock(&syscpu_dir_lock); + return syscpu_dir; + +fail: + /* Tear down the partial tree so a later call can mkdtemp a fresh slot. + * Bypass the syscpu_dir_ok guard since this path runs before the flag + * is flipped. + */ + syscpu_dir_remove_tree(); + syscpu_dir[0] = '\0'; + pthread_mutex_unlock(&syscpu_dir_lock); + errno = saved_errno; + return NULL; +} + +/* Reject any '..' component in suffix so the joined host path cannot escape + * the scratch dir. 
The synthetic /sys/devices/system/cpu tree has no use + * case for parent-directory traversal, and accepting it would let a guest + * call like open("/sys/devices/system/cpu/../../etc/passwd") drive + * lstat/open on an arbitrary host path. Empty components and '.' are + * harmless and pass through unchanged. + */ +static bool syscpu_suffix_safe(const char *suffix) +{ + const char *p = suffix; + while (*p) { + const char *seg = p; + while (*p && *p != '/') + p++; + size_t len = (size_t) (p - seg); + if (len == 2 && seg[0] == '.' && seg[1] == '.') + return false; + if (*p == '/') + p++; + } + return true; +} + +/* Translate a /sys/devices/system/cpu[/...] path into the path inside the + * scratch dir. Returns 0 on success (host_path filled), -1 with errno set + * for malformed inputs (ENOENT for missing init, EACCES for traversal, + * ENAMETOOLONG for overflow). When the suffix is empty (the root dir + * itself), host_path receives just the scratch dir. + */ +static int syscpu_resolve_path(const char *suffix, + char *host_path, + size_t host_path_sz) +{ + if (!syscpu_suffix_safe(suffix)) { + errno = EACCES; + return -1; + } + const char *dir = ensure_syscpu_dir(); + if (!dir) { + errno = ENOENT; + return -1; + } + int n; + if (!*suffix) + n = snprintf(host_path, host_path_sz, "%s", dir); + else + n = snprintf(host_path, host_path_sz, "%s/%s", dir, suffix); + if (n < 0 || (size_t) n >= host_path_sz) { + errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +/* The synthetic sysfs CPU tree is read-only. Accept only descriptor flags + * that make sense for a read-only open and reject mutating flags up front + * so the guest cannot create, truncate, or request write access anywhere + * in the stub. 
+ */ +static bool syscpu_open_is_readonly(int linux_flags) +{ + int accmode = translate_open_flags(linux_flags) & O_ACCMODE; + return accmode == O_RDONLY && + !(linux_flags & (LINUX_O_CREAT | LINUX_O_TRUNC)); +} + +/* Classify a guest path against the synthetic sysfs CPU tree. + * SYSCPU_NONE - unrelated path; *suffix_out unset. + * SYSCPU_ROOT - matches "/sys/devices/system/cpu" or with a trailing '/'. + * *suffix_out is the empty string. + * SYSCPU_CHILD - matches "/sys/devices/system/cpu/<suffix>"; *suffix_out + * points at <suffix>, the text after that '/' (never + * empty: the bare trailing-slash form is classified as + * ROOT above). + * Centralizes the prefix arithmetic so proc_intercept_open and + * proc_intercept_stat share one source of truth for the SYSFS_CPU shape. + */ +#define SYSFS_CPU "/sys/devices/system/cpu" +#define SYSFS_CPU_LEN (sizeof(SYSFS_CPU) - 1) +/* Host scratch-dir path buffer size: scratch dir is /tmp/elfuse-syscpu-<6> + * (under 30 chars) plus a /sys/devices/system/cpu/<suffix> remainder bounded + * by LINUX_PATH_MAX. 256 is comfortable for the realistic suffixes the stub + * exposes (cpuN, cpumask range files). + */ +#define SYSCPU_HOST_PATH_MAX 256 +typedef enum { + SYSCPU_NONE, + SYSCPU_ROOT, + SYSCPU_CHILD, +} syscpu_match_t; + +static syscpu_match_t syscpu_classify(const char *path, const char **suffix_out) +{ + if (strncmp(path, SYSFS_CPU, SYSFS_CPU_LEN) != 0) + return SYSCPU_NONE; + char tail = path[SYSFS_CPU_LEN]; + if (tail == '\0' || (tail == '/' && path[SYSFS_CPU_LEN + 1] == '\0')) { + *suffix_out = ""; + return SYSCPU_ROOT; + } + if (tail == '/') { + *suffix_out = path + SYSFS_CPU_LEN + 1; + return SYSCPU_CHILD; + } + return SYSCPU_NONE; +} + typedef struct { int64_t *tids; int ntids; @@ -2011,6 +2309,38 @@ int proc_intercept_open(const guest_t *g, "user:x:1000:\n"); } + /* /sys/devices/system/cpu[/...] -> synthetic CPU topology stub. 
+ * Backs the lazy scratch dir that holds the cpumask range files plus + * one empty cpuN directory per host CPU. The cache/topology subtrees + * stay empty so consumers that only need cpu count (Java GC, Go + * scheduler init, libnuma) succeed; deeper queries return ENOENT. + */ + { + const char *suffix; + syscpu_match_t m = syscpu_classify(path, &suffix); + if (m != SYSCPU_NONE) { + if (!syscpu_open_is_readonly(linux_flags)) { + errno = EACCES; + return -1; + } + if (m == SYSCPU_ROOT) { + const char *dir = ensure_syscpu_dir(); + if (!dir) + return -1; + return proc_open_dir_fd(dir, linux_flags); + } + char host_path[SYSCPU_HOST_PATH_MAX]; + if (syscpu_resolve_path(suffix, host_path, sizeof(host_path)) < 0) + return -1; + /* O_NOFOLLOW: the scratch dir contents are owned by elfuse, but + * a caller could still race a symlink into the tree before this + * open. Block any cross-tree escape attempt regardless. + */ + int oflags = translate_open_flags(linux_flags); + return open(host_path, oflags | O_NOFOLLOW, mode); + } + } + return PROC_NOT_INTERCEPTED; } @@ -2176,6 +2506,39 @@ int proc_intercept_stat(const char *path, struct stat *st) return 0; } + /* /sys/devices/system/cpu[/...]: synthesize stat from the lazy scratch + * dir. Anything not present in the scratch dir (e.g. cpuN/topology, + * cpuN/cache) returns ENOENT, which matches the "stub-empty" contract. 
+ */ + { + const char *suffix; + syscpu_match_t m = syscpu_classify(path, &suffix); + if (m == SYSCPU_ROOT) { + if (!ensure_syscpu_dir()) + return -1; + stat_fill_proc_dir(st, 0555, 2, path); + return 0; + } + if (m == SYSCPU_CHILD) { + char host_path[SYSCPU_HOST_PATH_MAX]; + if (syscpu_resolve_path(suffix, host_path, sizeof(host_path)) < 0) + return -1; + struct stat host_st; + if (lstat(host_path, &host_st) < 0) + return -1; + /* Replace host inode/dev with the synthetic-procfs convention so + * the guest sees a stable identity that does not collide with + * real host files (and so st_size reads as 0 for cpumask files, + * matching real sysfs). + */ + if (S_ISDIR(host_st.st_mode)) + stat_fill_proc_dir(st, 0555, 2, path); + else + stat_fill_proc_file(st, 0444, path); + return 0; + } + } + return PROC_NOT_INTERCEPTED; } diff --git a/src/syscall/fs.c b/src/syscall/fs.c index 90a3c97..196c3be 100644 --- a/src/syscall/fs.c +++ b/src/syscall/fs.c @@ -1410,15 +1410,16 @@ int64_t sys_faccessat(guest_t *g, if (host_dirfd_ref_open(dirfd, &dir_ref) < 0) return -LINUX_EBADF; - /* Check /proc paths first since macOS has no /proc filesystem, so - * access("/proc/self/stat", R_OK) etc. must be intercepted. - * If proc_intercept_stat succeeds, the path is a known emulated - * entry and the code reports it as accessible. + /* Check intercepted stat paths first since macOS has no /proc filesystem + * and the sysfs CPU tree is synthetic. Access must reflect the synthetic + * mode bits, not just path existence. 
*/ - struct stat dummy_st; + struct stat intercepted_st; if (path_might_use_stat_intercept(access_path) && - proc_intercept_stat(access_path, &dummy_st) == 0) { + proc_intercept_stat(access_path, &intercepted_st) == 0) { host_fd_ref_close(&dir_ref); + if (path_check_intercept_access(&intercepted_st, mode, flags) < 0) + return linux_errno(); return 0; } diff --git a/src/syscall/path.c b/src/syscall/path.c index 7ef42d2..dfed81d 100644 --- a/src/syscall/path.c +++ b/src/syscall/path.c @@ -26,40 +26,106 @@ #define PROC_PATH_COMPONENTS_MAX (LINUX_PATH_MAX / 2) -int path_might_use_open_intercept(const char *path) +/* True when path equals prefix exactly, or extends it with '/'. Avoids the + * surprise where "/sys/devices/system/cpufoo" would match a bare strncmp on + * "/sys/devices/system/cpu" and pull an unrelated path through the intercept + * layer. + */ +static bool path_prefix_match(const char *path, const char *prefix, size_t plen) +{ + if (strncmp(path, prefix, plen) != 0) + return false; + return path[plen] == '\0' || path[plen] == '/'; +} + +#define SYSFS_CPU_PREFIX "/sys/devices/system/cpu" + +bool path_might_use_open_intercept(const char *path) { if (!path || path[0] != '/') - return 0; + return false; if (!strncmp(path, "/proc", 5)) - return 1; + return true; if (!strncmp(path, "/dev", 4)) - return 1; + return true; + if (path_prefix_match(path, SYSFS_CPU_PREFIX, sizeof(SYSFS_CPU_PREFIX) - 1)) + return true; if (!strcmp(path, "/etc/mtab") || !strcmp(path, "/etc/passwd") || !strcmp(path, "/etc/group")) - return 1; + return true; if (!strcmp(path, "/var/run/utmp") || !strcmp(path, "/run/utmp")) - return 1; + return true; - return 0; + return false; } -int path_might_use_stat_intercept(const char *path) +bool path_might_use_stat_intercept(const char *path) { if (!path || path[0] != '/') - return 0; + return false; if (!strncmp(path, "/proc", 5)) - return 1; + return true; if (!strncmp(path, "/dev/shm", 8)) - return 1; + return true; + if 
(path_prefix_match(path, SYSFS_CPU_PREFIX, sizeof(SYSFS_CPU_PREFIX) - 1)) + return true; - return 0; + return false; } -static int path_next_component(const char **pathp, - const char **comp, - size_t *len) +int path_check_intercept_access(const struct stat *st, int mode, int flags) +{ + if ((mode & ~(F_OK | R_OK | W_OK | X_OK)) != 0) { + errno = EINVAL; + return -1; + } + if (mode == F_OK) + return 0; + + mode_t granted = 0; + uint32_t uid = + (flags & LINUX_AT_EACCESS) ? proc_get_euid() : proc_get_uid(); + uint32_t gid = + (flags & LINUX_AT_EACCESS) ? proc_get_egid() : proc_get_gid(); + + if (uid == 0) { + /* CAP_DAC_OVERRIDE: root reads and writes any file regardless of mode + * bits. Execute still requires at least one x-bit set so non-executable + * files cannot be run as root. Matches Linux generic_permission() in + * fs/namei.c. + */ + granted |= R_OK | W_OK; + if (st->st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) + granted |= X_OK; + } else { + mode_t bits; + if (uid == st->st_uid) + bits = (st->st_mode >> 6) & 7; + else if (gid == st->st_gid) + bits = (st->st_mode >> 3) & 7; + else + bits = st->st_mode & 7; + + if (bits & 4) + granted |= R_OK; + if (bits & 2) + granted |= W_OK; + if (bits & 1) + granted |= X_OK; + } + + if ((mode & granted) == mode) + return 0; + + errno = EACCES; + return -1; +} + +static bool path_next_component(const char **pathp, + const char **comp, + size_t *len) { const char *p = *pathp; @@ -67,7 +133,7 @@ static int path_next_component(const char **pathp, p++; if (*p == '\0') { *pathp = p; - return 0; + return false; } *comp = p; @@ -75,10 +141,10 @@ static int path_next_component(const char **pathp, p++; *len = (size_t) (p - *comp); *pathp = p; - return 1; + return true; } -static int path_component_is_dot(const char *comp, size_t len) +static bool path_component_is_dot(const char *comp, size_t len) { return len == 1 && comp[0] == '.'; } diff --git a/src/syscall/path.h b/src/syscall/path.h index c781db5..29e1715 100644 --- 
a/src/syscall/path.h +++ b/src/syscall/path.h @@ -9,11 +9,13 @@ #include #include #include +#include #include "syscall/internal.h" -int path_might_use_open_intercept(const char *path); -int path_might_use_stat_intercept(const char *path); +bool path_might_use_open_intercept(const char *path); +bool path_might_use_stat_intercept(const char *path); +int path_check_intercept_access(const struct stat *st, int mode, int flags); int resolve_proc_at_path(guest_fd_t dirfd, const char *path, char *out, diff --git a/src/syscall/syscall.c b/src/syscall/syscall.c index bb8ef14..a75fa73 100644 --- a/src/syscall/syscall.c +++ b/src/syscall/syscall.c @@ -213,7 +213,11 @@ SC_FORWARD(sc_fchmodat2, sys_fchmodat(g, (int) x0, x1, (uint32_t) x2, (int) x3 SC_FORWARD(sc_fchownat, sys_fchownat(g, (int) x0, x1, (uint32_t) x2, (uint32_t) x3, (int) x4)) SC_FORWARD(sc_fchown, sys_fchown((int) x0, (uint32_t) x1, (uint32_t) x2)) SC_FORWARD(sc_utimensat, sys_utimensat(g, (int) x0, x1, x2, (int) x3)) -SC_FORWARD(sc_faccessat, sys_faccessat(g, (int) x0, x1, (int) x2, (int) x3)) +/* Linux faccessat (SYS 48) is 3-arg: dirfd, path, mode. + * The flags parameter was added in faccessat2 (SYS 439). + * x3 contains garbage from the caller's register state. 
+ */ +SC_FORWARD(sc_faccessat, sys_faccessat(g, (int) x0, x1, (int) x2, 0)) SC_FORWARD(sc_faccessat2, sys_faccessat(g, (int) x0, x1, (int) x2, (int) x3)) SC_FORWARD(sc_ftruncate, sys_ftruncate((int) x0, (int64_t) x1)) SC_FORWARD(sc_truncate, sys_truncate(g, x0, (int64_t) x1)) diff --git a/tests/manifest.txt b/tests/manifest.txt index 71eb420..3fbdb75 100644 --- a/tests/manifest.txt +++ b/tests/manifest.txt @@ -57,6 +57,7 @@ test-large-io-boundary [section] /proc and /dev emulation tests test-proc +test-sysfs-cpu [section] Network tests test-net diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh index 9235ae3..ec6e929 100755 --- a/tests/test-matrix.sh +++ b/tests/test-matrix.sh @@ -180,9 +180,12 @@ run_elfuse_sysroot() # Tests that either hang under qemu-system-aarch64 on Apple Silicon # (raw clone / PI futex / massive thread+mmap stress) or currently diverge # from the Alpine linux-virt reference kernel on the deprecated oom_adj -# procfs compatibility path exercised by test-io-opt. They still run in -# elfuse-aarch64 mode and in `make check`; the qemu reference run skips them. -QEMU_SKIP="test-thread test-stress test-futex-pi test-io-opt" +# procfs compatibility path exercised by test-io-opt. test-sysfs-cpu asserts +# the elfuse stub contract (cache/topology subtree empty, possible == online, +# cpuN count == online count) which a real kernel does not honor. All listed +# tests still run in elfuse-aarch64 mode and in `make check`; the qemu +# reference run skips them. 
+QEMU_SKIP="test-thread test-stress test-futex-pi test-io-opt test-sysfs-cpu" is_qemu_skipped() { @@ -355,6 +358,7 @@ run_unit_tests() printf "\n/proc and /dev\n" test_check "$runner" "test-proc" "0 failed" "$bindir/test-proc" + test_check "$runner" "test-sysfs-cpu" "0 failed" "$bindir/test-sysfs-cpu" printf "\nNetwork\n" test_check "$runner" "test-net" "0 failed" "$bindir/test-net" diff --git a/tests/test-sysfs-cpu.c b/tests/test-sysfs-cpu.c new file mode 100644 index 0000000..8caaa55 --- /dev/null +++ b/tests/test-sysfs-cpu.c @@ -0,0 +1,232 @@ +/* Test /sys/devices/system/cpu emulation + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Tests: /sys/devices/system/cpu/{online,possible,present} read, + * opendir + readdir on /sys/devices/system/cpu, stat on cpuN + * directories, and ENOENT on the cache/topology subtrees that the + * stub deliberately leaves empty, plus access()/faccessat() mode + * checks for the read-only synthetic tree. + * + * Syscalls exercised: openat(56), read(63), close(57), getdents64(61), + * newfstatat(79), faccessat(48). + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test-harness.h" +#include "test-util.h" + +/* Parse a "0\n" or "0-N\n" cpumask range file into the highest set CPU + * index. Returns -1 on malformed input. 
+ */ +static int parse_cpurange(const char *s, ssize_t len) +{ + if (len <= 0) + return -1; + /* Skip leading whitespace just in case */ + int i = 0; + while (i < len && (s[i] == ' ' || s[i] == '\t')) + i++; + if (i >= len || s[i] < '0' || s[i] > '9') + return -1; + int low = 0; + while (i < len && s[i] >= '0' && s[i] <= '9') + low = low * 10 + (s[i++] - '0'); + + if (i < len && s[i] == '-') { + i++; + if (i >= len || s[i] < '0' || s[i] > '9') + return -1; + int high = 0; + while (i < len && s[i] >= '0' && s[i] <= '9') + high = high * 10 + (s[i++] - '0'); + return high; + } + return low; +} + +/* Read a cpumask range file (online/possible/present) and return the + * highest CPU index it advertises. -1 on read or parse failure. + */ +static int read_cpurange(const char *path) +{ + char buf[64]; + ssize_t n = read_file_nul(path, buf, sizeof(buf)); + if (n <= 0) + return -1; + return parse_cpurange(buf, n); +} + +int main(void) +{ + int passes = 0, fails = 0; + + printf("test-sysfs-cpu: /sys/devices/system/cpu emulation\n"); + + TEST("read /sys/devices/system/cpu/online"); + int max_cpu = read_cpurange("/sys/devices/system/cpu/online"); + EXPECT_TRUE(max_cpu >= 0, "online read or parse failed"); + + TEST("possible matches online"); + EXPECT_EQ(read_cpurange("/sys/devices/system/cpu/possible"), max_cpu, + "possible disagrees with online"); + + TEST("present matches online"); + EXPECT_EQ(read_cpurange("/sys/devices/system/cpu/present"), max_cpu, + "present disagrees with online"); + + TEST("readdir lists cpu0"); + { + DIR *dir = opendir("/sys/devices/system/cpu"); + if (dir) { + int found_cpu0 = 0; + struct dirent *de; + while ((de = readdir(dir))) { + if (!strcmp(de->d_name, "cpu0")) { + found_cpu0 = 1; + break; + } + } + closedir(dir); + EXPECT_TRUE(found_cpu0, "cpu0 entry not found"); + } else + FAIL("opendir failed"); + } + + TEST("cpu0 is a directory"); + { + struct stat st; + if (stat("/sys/devices/system/cpu/cpu0", &st) == 0) + EXPECT_TRUE(S_ISDIR(st.st_mode), 
"cpu0 is not a directory"); + else + FAIL("stat failed"); + } + + TEST("ENOENT on missing topology subtree"); + { + errno = 0; + int fd = + open("/sys/devices/system/cpu/cpu0/topology/core_id", O_RDONLY); + if (fd < 0 && errno == ENOENT) { + PASS(); + } else { + if (fd >= 0) + close(fd); + FAIL("expected ENOENT for empty subtree"); + } + } + + TEST("opendir on /sys/devices/system/cpu enumerates ncpu cpuN dirs"); + { + DIR *dir = opendir("/sys/devices/system/cpu"); + if (dir) { + int ncpu_dirs = 0; + struct dirent *de; + while ((de = readdir(dir))) { + if (!strncmp(de->d_name, "cpu", 3) && de->d_name[3] >= '0' && + de->d_name[3] <= '9') { + ncpu_dirs++; + } + } + closedir(dir); + EXPECT_EQ(ncpu_dirs, max_cpu + 1, "cpuN dir count != online+1"); + } else + FAIL("opendir failed"); + } + + /* The stub is read-only: O_WRONLY / O_RDWR / O_CREAT / O_TRUNC must + * fail with EACCES so a guest cannot mutate the synthetic tree (and + * cannot pivot a creation into the host scratch dir). + */ + TEST("EACCES on O_WRONLY of online"); + { + errno = 0; + int fd = open("/sys/devices/system/cpu/online", O_WRONLY); + if (fd >= 0) + close(fd); + EXPECT_TRUE(fd < 0 && errno == EACCES, "writable open accepted"); + } + + TEST("EACCES on O_WRONLY of sysfs cpu root"); + { + errno = 0; + int fd = open("/sys/devices/system/cpu", O_WRONLY); + if (fd >= 0) + close(fd); + EXPECT_TRUE(fd < 0 && errno == EACCES, "writable root open accepted"); + } + + TEST("EACCES on O_CREAT of new entry"); + { + errno = 0; + int fd = + open("/sys/devices/system/cpu/intruder", O_WRONLY | O_CREAT, 0644); + if (fd >= 0) { + close(fd); + unlink("/sys/devices/system/cpu/intruder"); + } + EXPECT_TRUE(fd < 0 && errno == EACCES, "O_CREAT accepted"); + } + + TEST("access reports online readable but not writable or executable"); + { + EXPECT_TRUE(access("/sys/devices/system/cpu/online", F_OK) == 0, + "F_OK failed"); + EXPECT_TRUE(access("/sys/devices/system/cpu/online", R_OK) == 0, + "R_OK failed"); + errno = 0; + 
EXPECT_TRUE(access("/sys/devices/system/cpu/online", W_OK) < 0 && + errno == EACCES, + "W_OK unexpectedly succeeded"); + errno = 0; + EXPECT_TRUE(access("/sys/devices/system/cpu/online", X_OK) < 0 && + errno == EACCES, + "X_OK unexpectedly succeeded"); + } + + TEST("access reports cpu root searchable but not writable"); + { + EXPECT_TRUE(access("/sys/devices/system/cpu", R_OK) == 0, + "cpu root R_OK failed"); + EXPECT_TRUE(access("/sys/devices/system/cpu", X_OK) == 0, + "cpu root X_OK failed"); + errno = 0; + EXPECT_TRUE( + access("/sys/devices/system/cpu", W_OK) < 0 && errno == EACCES, + "cpu root W_OK unexpectedly succeeded"); + } + + /* '..' in the suffix must not let the open/stat fall through onto an + * arbitrary host path. The stub keeps the tree closed against + * traversal regardless of where the scratch dir happens to live. + */ + TEST("EACCES on dotdot traversal in open"); + { + errno = 0; + int fd = open("/sys/devices/system/cpu/../../etc/hostname", O_RDONLY); + if (fd >= 0) + close(fd); + EXPECT_TRUE(fd < 0 && errno == EACCES, "dotdot traversal accepted"); + } + + TEST("EACCES on dotdot traversal in stat"); + { + struct stat st; + errno = 0; + int rc = stat("/sys/devices/system/cpu/../../etc/hostname", &st); + EXPECT_TRUE(rc < 0 && errno == EACCES, + "dotdot traversal accepted in stat"); + } + + SUMMARY("test-sysfs-cpu"); + return fails > 0 ? 1 : 0; +}