From 79c1d2aaaebb3800eba3fd671338b16c0f108c31 Mon Sep 17 00:00:00 2001 From: Sylirre Date: Sat, 23 May 2026 18:01:48 +0000 Subject: [PATCH 01/10] Make sandbox helpers like bubblewrap work inside proot Sandbox tools (bwrap, etc.) call unshare/setns/mount/pivot_root and write to /proc/self/{uid,gid}_map to set up a namespace; under proot these all fail because we have no real namespace. Pretend they succeed and use proot's binding system to make the resulting paths actually accessible. - syscall/enter.c: void unshare/setns/umount; turn mount into a runtime binding (bind/proc/sysfs/tmpfs); turn pivot_root into a root-binding swap that re-exposes the old root at put_old; redirect open of /proc/<pid>/{uid_map,gid_map,setgroups} to /dev/null; also fix prctl(PR_SET_DUMPABLE) to actually return 0 (previously leaked -ENOSYS). - syscall/seccomp.c: filter PR_unshare/PR_setns so the handlers above run under seccomp mode 2. - path/canon.c: when a symlink's guest path aliases /proc via a binding (e.g. /oldroot/proc/self), route through readlink_proc so "self" resolves to the tracee's pid, not proot's. - extension/mountinfo: append synthesized lines for runtime bindings so bwrap's parse_mountinfo finds the mounts it just asked for. - path/temp.c: skip chmod on symlinks during temp-dir cleanup; bwrap leaves /dev/{stdin,fd,...} symlinks pointing at /proc/self/fd/N inside the emulated tmpfs dirs. 
--- src/extension/mountinfo/mountinfo.c | 143 +++++++++++---- src/path/canon.c | 76 +++++--- src/path/temp.c | 17 +- src/syscall/enter.c | 269 +++++++++++++++++++++++++--- src/syscall/seccomp.c | 2 + 5 files changed, 411 insertions(+), 96 deletions(-) diff --git a/src/extension/mountinfo/mountinfo.c b/src/extension/mountinfo/mountinfo.c index 9b66c76c..1abf2735 100644 --- a/src/extension/mountinfo/mountinfo.c +++ b/src/extension/mountinfo/mountinfo.c @@ -1,9 +1,42 @@ #include "extension/extension.h" #include "path/path.h" /* translate_path, */ +#include "path/binding.h" /* Binding, bindings */ #include "path/temp.h" /* create_temp_file, */ #include /* INT_MAX, */ #include /* PATH_MAX, */ #include /* strlen, strcmp */ +#include /* FILE, getline, fprintf */ +#include /* free, */ +#include /* CIRCLEQ_*, */ + +/** + * Append a synthesized mount-table line to @fp for each runtime + * binding (i.e. one that wasn't part of the static -r/-b set). This + * is what lets sandbox helpers like bubblewrap find the mount they + * just asked PRoot to create via emulate_mount(). + */ +static void append_runtime_binding_lines(Tracee *target_tracee, FILE *fp) +{ + Binding *binding; + int next_id = 1000000; + int parent_id = 1; + + if (target_tracee->fs->bindings.guest == NULL) + return; + + for (binding = CIRCLEQ_FIRST(target_tracee->fs->bindings.guest); + binding != (void *) target_tracee->fs->bindings.guest; + binding = CIRCLEQ_NEXT(binding, link.guest)) { + /* Skip the root binding "/" — already present as the kernel root. 
*/ + if (strcmp(binding->guest.path, "/") == 0) + continue; + + fprintf(fp, + "%d %d 0:1 / %s rw,relatime - bind %s rw,relatime\n", + next_id++, parent_id, + binding->guest.path, binding->host.path); + } +} static void mountinfo_check_open_path(Tracee *tracee, char path[PATH_MAX]) { /* Try matching "/proc//mountinfo" */ @@ -33,10 +66,29 @@ static void mountinfo_check_open_path(Tracee *tracee, char path[PATH_MAX]) { char root_path[PATH_MAX]; // Host path to guest root translate_path(target_tracee, root_path, AT_FDCWD, "/", true); Comparison compare_result = compare_paths(root_path, "/data"); - if (compare_result != PATH2_IS_PREFIX && compare_result != PATHS_ARE_EQUAL) { - return; + bool is_android_data = (compare_result == PATH2_IS_PREFIX || compare_result == PATHS_ARE_EQUAL); + + /* Are there bindings to expose as fake mounts (mount(2) + * calls from sandbox helpers are converted into + * bindings — see emulate_mount). Skip the root + * binding, which the real kernel mount table already + * covers. 
*/ + bool has_extra_bindings = false; + if (target_tracee->fs->bindings.guest != NULL) { + Binding *b; + for (b = CIRCLEQ_FIRST(target_tracee->fs->bindings.guest); + b != (void *) target_tracee->fs->bindings.guest; + b = CIRCLEQ_NEXT(b, link.guest)) { + if (strcmp(b->guest.path, "/") != 0) { + has_extra_bindings = true; + break; + } + } } + if (!is_android_data && !has_extra_bindings) + return; + /* Open real /proc//mountinfo */ FILE *real_mountinfo_fp = fopen(path, "r"); if (real_mountinfo_fp == NULL) { @@ -55,38 +107,14 @@ static void mountinfo_check_open_path(Tracee *tracee, char path[PATH_MAX]) { size_t line_buf_len = 0; ssize_t line_len = 0; bool found_line = false; - while ((line_len = getline(&line, &line_buf_len, real_mountinfo_fp)) > 0) { - char *chunk = line; - /* Skip columns before 'root' */ - for (int i = 0; i < 4 && chunk - line < line_len; i++) { - chunk = strchr(chunk, ' '); - if (chunk == NULL) goto end_line_scan; - chunk++; - } - - /* Match path */ - char *chunk_end = strchr(chunk, ' '); - if (chunk_end == NULL) continue; - if (chunk_end - chunk == 5 && 0 == memcmp(chunk, "/data", 5)) { - /* Write line into new file keeping only "/" from root column */ - fwrite(line, chunk - line + 1, 1, new_mountinfo_fp); - fwrite(chunk_end, line_len - (chunk_end - line), 1, new_mountinfo_fp); - found_line = true; - break; - } -end_line_scan: ; - } - - /* Once root was added, rescan and add other standard mounts */ - if (found_line) { - fseek(real_mountinfo_fp, 0, SEEK_SET); + if (is_android_data) { while ((line_len = getline(&line, &line_buf_len, real_mountinfo_fp)) > 0) { char *chunk = line; /* Skip columns before 'root' */ for (int i = 0; i < 4 && chunk - line < line_len; i++) { chunk = strchr(chunk, ' '); - if (chunk == NULL) goto end_line_scan2; + if (chunk == NULL) goto end_line_scan; chunk++; } @@ -94,22 +122,59 @@ end_line_scan: ; char *chunk_end = strchr(chunk, ' '); if (chunk_end == NULL) continue; - size_t mount_len = chunk_end - chunk; - if ( - 
(mount_len == 4 && 0 == memcmp(chunk, "/dev", 4)) || - (mount_len >= 5 && 0 == memcmp(chunk, "/dev/", 5)) || - (mount_len == 5 && 0 == memcmp(chunk, "/proc", 5)) || - (mount_len == 4 && 0 == memcmp(chunk, "/sys", 4)) || - (mount_len >= 5 && 0 == memcmp(chunk, "/sys/", 5)) || - (mount_len == 4 && 0 == memcmp(chunk, "/tmp", 4)) - ) { - /* Copy line into new file verbatim */ - fwrite(line, line_len, 1, new_mountinfo_fp); + if (chunk_end - chunk == 5 && 0 == memcmp(chunk, "/data", 5)) { + /* Write line into new file keeping only "/" from root column */ + fwrite(line, chunk - line + 1, 1, new_mountinfo_fp); + fwrite(chunk_end, line_len - (chunk_end - line), 1, new_mountinfo_fp); + found_line = true; + break; } +end_line_scan: ; + } + + /* Once root was added, rescan and add other standard mounts */ + if (found_line) { + fseek(real_mountinfo_fp, 0, SEEK_SET); + while ((line_len = getline(&line, &line_buf_len, real_mountinfo_fp)) > 0) { + char *chunk = line; + /* Skip columns before 'root' */ + for (int i = 0; i < 4 && chunk - line < line_len; i++) { + chunk = strchr(chunk, ' '); + if (chunk == NULL) goto end_line_scan2; + chunk++; + } + + /* Match path */ + char *chunk_end = strchr(chunk, ' '); + if (chunk_end == NULL) continue; + + size_t mount_len = chunk_end - chunk; + if ( + (mount_len == 4 && 0 == memcmp(chunk, "/dev", 4)) || + (mount_len >= 5 && 0 == memcmp(chunk, "/dev/", 5)) || + (mount_len == 5 && 0 == memcmp(chunk, "/proc", 5)) || + (mount_len == 4 && 0 == memcmp(chunk, "/sys", 4)) || + (mount_len >= 5 && 0 == memcmp(chunk, "/sys/", 5)) || + (mount_len == 4 && 0 == memcmp(chunk, "/tmp", 4)) + ) { + /* Copy line into new file verbatim */ + fwrite(line, line_len, 1, new_mountinfo_fp); + } end_line_scan2: ; + } } + } else { + /* Non-Android case: copy real mountinfo verbatim. 
*/ + while ((line_len = getline(&line, &line_buf_len, real_mountinfo_fp)) > 0) + fwrite(line, line_len, 1, new_mountinfo_fp); + found_line = true; } + /* Append synthesized entries for runtime bindings so + * helpers like bubblewrap find the mounts they think + * they just created. */ + append_runtime_binding_lines(target_tracee, new_mountinfo_fp); + free(line); fclose(new_mountinfo_fp); fclose(real_mountinfo_fp); diff --git a/src/path/canon.c b/src/path/canon.c index 271749bc..0fd58b0c 100644 --- a/src/path/canon.c +++ b/src/path/canon.c @@ -282,37 +282,59 @@ int canonicalize(Tracee *tracee, const char *user_path, bool deref_final, /* It's a link, so we have to dereference *and* * canonicalize to ensure we are not going outside the * new root. */ - comparison = compare_paths("/proc", guest_path); - switch (comparison) { - case PATHS_ARE_EQUAL: - case PATH1_IS_PREFIX: - /* Some links in "/proc" are generated - * dynamically by the kernel. PRoot has to - * emulate some of them. */ - status = readlink_proc(tracee, scratch_path, - guest_path, component, comparison); - switch (status) { - case CANONICALIZE: - /* The symlink is already dereferenced, - * now canonicalize it. */ - goto canon; - - case DONT_CANONICALIZE: - /* If and only very final, this symlink - * shouldn't be dereferenced nor canonicalized. */ - if (finality == FINAL_NORMAL) { - strcpy(guest_path, scratch_path); - return 0; + { + const char *proc_base = guest_path; + char alias_base[PATH_MAX]; + + comparison = compare_paths("/proc", guest_path); + + /* If guest_path is not under /proc directly, + * check whether it aliases /proc via a binding + * (e.g. /oldroot/proc when /oldroot is bound to + * /). Otherwise links like /oldroot/proc/self + * would be resolved by the real kernel readlink + * and return PRoot's own pid. 
*/ + if (comparison != PATHS_ARE_EQUAL && comparison != PATH1_IS_PREFIX) { + strncpy(alias_base, guest_path, PATH_MAX - 1); + alias_base[PATH_MAX - 1] = '\0'; + (void) substitute_binding(tracee, GUEST, alias_base); + if (strcmp(alias_base, guest_path) != 0) { + comparison = compare_paths("/proc", alias_base); + proc_base = alias_base; + } + } + + switch (comparison) { + case PATHS_ARE_EQUAL: + case PATH1_IS_PREFIX: + /* Some links in "/proc" are generated + * dynamically by the kernel. PRoot has to + * emulate some of them. */ + status = readlink_proc(tracee, scratch_path, + proc_base, component, comparison); + switch (status) { + case CANONICALIZE: + /* The symlink is already dereferenced, + * now canonicalize it. */ + goto canon; + + case DONT_CANONICALIZE: + /* If and only very final, this symlink + * shouldn't be dereferenced nor canonicalized. */ + if (finality == FINAL_NORMAL) { + strcpy(guest_path, scratch_path); + return 0; + } + break; + + default: + if (status < 0) + return status; } - break; default: - if (status < 0) - return status; + break; } - - default: - break; } status = readlink(host_path, scratch_path, sizeof(scratch_path)); diff --git a/src/path/temp.c b/src/path/temp.c index 8d8aa638..dde2296d 100644 --- a/src/path/temp.c +++ b/src/path/temp.c @@ -106,11 +106,18 @@ static int clean_temp_cwd() || strcmp(entry->d_name, "..") == 0) continue; - status = chmod(entry->d_name, 0700); - if (status < 0) { - note(NULL, WARNING, SYSTEM, "cant chmod '%s'", entry->d_name); - nb_errors++; - continue; + /* Skip chmod on symlinks: chmod follows them and would + * report spurious errors when the target no longer + * exists (common with the /dev/{stdin,fd,...} symlinks + * bubblewrap leaves behind in emulated tmpfs dirs). + * We only need to unlink the symlink itself. 
*/ + if (entry->d_type != DT_LNK) { + status = chmod(entry->d_name, 0700); + if (status < 0) { + note(NULL, WARNING, SYSTEM, "cant chmod '%s'", entry->d_name); + nb_errors++; + continue; + } } if (entry->d_type == DT_DIR) { diff --git a/src/syscall/enter.c b/src/syscall/enter.c index 50f1543b..5ff1f1e5 100644 --- a/src/syscall/enter.c +++ b/src/syscall/enter.c @@ -27,7 +27,9 @@ #include /* AT_FDCWD, */ #include /* PATH_MAX, */ #include /* strcpy */ +#include /* bool */ #include /* PR_SET_DUMPABLE */ +#include /* MS_BIND, MS_REMOUNT, ... */ #include /* TCSETS, TCSANOW */ #include "cli/note.h" @@ -45,6 +47,8 @@ #include "tracee/abi.h" #include "path/path.h" #include "path/canon.h" +#include "path/binding.h" +#include "path/temp.h" #include "arch.h" /** @@ -87,6 +91,187 @@ static int translate_sysarg(Tracee *tracee, Reg reg, Type type) return translate_path2(tracee, AT_FDCWD, old_path, reg, type); } +/** + * Canonicalize @user_path as a guest path, relative to the @tracee's + * cwd when @user_path is relative. Stores the result in @guest_path + * with any trailing "/" or "/." stripped, so it can be used as a + * binding key. Returns 0 on success, -errno otherwise. + */ +static int guest_canonicalize(Tracee *tracee, const char *user_path, + char guest_path[PATH_MAX]) +{ + int status; + + if (user_path[0] == '/') + strcpy(guest_path, "/"); + else { + status = getcwd2(tracee, guest_path); + if (status < 0) + return status; + } + + status = canonicalize(tracee, user_path, true, guest_path, 0); + if (status < 0) + return status; + + chop_finality(guest_path); + return 0; +} + +/** + * Emulate mount(@src_user, @target_user, @fstype, @flags) by adding a + * PRoot binding from a host directory to the canonicalized target. + * Bind mounts use the translated source; "proc"/"sysfs" use the + * matching host file-system; "tmpfs"/"devpts"/"devtmpfs" get a fresh + * empty directory. 
Any other case is silently ignored: the caller + * will still see the syscall succeed (we always void it). + */ +static void emulate_mount(Tracee *tracee, const char *src_user, + const char *target_user, const char *fstype, + unsigned long flags) +{ + char host_path[PATH_MAX]; + char guest_path[PATH_MAX]; + const char *tmpdir; + + if ((flags & MS_REMOUNT) != 0) + return; + + if ((flags & MS_BIND) != 0) { + if (translate_path(tracee, host_path, AT_FDCWD, src_user, true) < 0) + return; + } + else if (strcmp(fstype, "proc") == 0) + strcpy(host_path, "/proc"); + else if (strcmp(fstype, "sysfs") == 0) + strcpy(host_path, "/sys"); + else if ( strcmp(fstype, "tmpfs") == 0 + || strcmp(fstype, "devpts") == 0 + || strcmp(fstype, "devtmpfs") == 0) { + tmpdir = create_temp_directory(tracee->fs, "proot-tmpfs-"); + if (tmpdir == NULL) + return; + strncpy(host_path, tmpdir, PATH_MAX - 1); + host_path[PATH_MAX - 1] = '\0'; + } + else + return; + + chop_finality(host_path); + + if (guest_canonicalize(tracee, target_user, guest_path) < 0) + return; + + (void) insort_binding3(tracee, tracee->fs, host_path, guest_path); +} + +/** + * Emulate pivot_root(@new_root_user, @put_old_user) by changing the + * tracee's root binding to point at @new_root_user (translated to + * host) and re-exposing the previous root at @put_old_user, so that + * sandbox helpers like bubblewrap can keep accessing the prior + * file-system through the agreed "oldroot" path. 
+ */ +static void emulate_pivot_root(Tracee *tracee, const char *new_root_user, + const char *put_old_user) +{ + char new_root_host[PATH_MAX]; + char new_root_guest[PATH_MAX]; + char put_old_guest[PATH_MAX]; + char old_root_host[PATH_MAX]; + Binding *root_binding; + size_t prefix_len; + const char *put_old_after; + + if (translate_path(tracee, new_root_host, AT_FDCWD, new_root_user, true) < 0) + return; + chop_finality(new_root_host); + + if (guest_canonicalize(tracee, new_root_user, new_root_guest) < 0) + return; + + /* put_old is relative to new_root, so resolve it against + * new_root_guest rather than the current cwd. */ + if (put_old_user[0] == '/') + strcpy(put_old_guest, "/"); + else + strcpy(put_old_guest, new_root_guest); + if (canonicalize(tracee, put_old_user, true, put_old_guest, 0) < 0) + return; + + root_binding = get_binding(tracee, GUEST, "/"); + if (root_binding == NULL) + return; + strncpy(old_root_host, root_binding->host.path, PATH_MAX - 1); + old_root_host[PATH_MAX - 1] = '\0'; + + remove_binding_from_all_lists(tracee, root_binding); + (void) insort_binding3(tracee, tracee->fs, new_root_host, "/"); + + /* If put_old is a path strictly under new_root, expose the + * previous root there. The pivot_root(".", ".") trick used to + * detach the old root leaves new_root and put_old equal; in + * that case there is nowhere to expose the old root. */ + prefix_len = strlen(new_root_guest); + if ( prefix_len > 0 + && strncmp(put_old_guest, new_root_guest, prefix_len) == 0 + && ( put_old_guest[prefix_len] == '/' + || (prefix_len == 1 && new_root_guest[0] == '/'))) { + put_old_after = put_old_guest + (prefix_len == 1 ? 0 : prefix_len); + if (put_old_after[0] == '/' && put_old_after[1] != '\0') + (void) insort_binding3(tracee, tracee->fs, + old_root_host, put_old_after); + } +} + +/** + * Detect /proc//{uid_map,gid_map,setgroups}, which sandbox + * helpers like bubblewrap write to during user-namespace setup. 
The + * tracee cannot really create namespaces under PRoot, so silently + * redirect those writes to /dev/null. + */ +static bool is_proc_userns_file(const char *path) +{ + const char *p; + const char *suffix; + + if (strncmp(path, "/proc/", 6) != 0) + return false; + p = path + 6; + + if (strncmp(p, "self/", 5) == 0) + p += 5; + else { + const char *digits = p; + while (*p >= '0' && *p <= '9') + p++; + if (p == digits || *p != '/') + return false; + p++; + } + + suffix = p; + return strcmp(suffix, "uid_map") == 0 + || strcmp(suffix, "gid_map") == 0 + || strcmp(suffix, "setgroups") == 0; +} + +/** + * Redirect openat()/open() of /proc/.../uid_map etc. to /dev/null so + * that writes appear to succeed. @reg holds the path argument; the + * path has already been translated to host form. + */ +static void maybe_redirect_userns_file(Tracee *tracee, Reg reg) +{ + char host_path[PATH_MAX]; + + if (get_sysarg_path(tracee, host_path, reg) < 0) + return; + if (!is_proc_userns_file(host_path)) + return; + (void) set_sysarg_path(tracee, "/dev/null", reg); +} + /** * Translate the input arguments of the current @tracee's syscall in the * @tracee->pid process area. This function sets @tracee->status to @@ -390,14 +575,66 @@ int translate_syscall_enter(Tracee *tracee) case PR_swapon: case PR_truncate: case PR_truncate64: - case PR_umount: - case PR_umount2: case PR_uselib: case PR_utime: case PR_utimes: status = translate_sysarg(tracee, SYSARG_1, REGULAR); break; + /* Pretend namespace/unmount syscalls succeed without doing + * anything; PRoot can't really create namespaces, and sandbox + * helpers like bubblewrap only check the return value. 
*/ + case PR_unshare: + case PR_setns: + case PR_umount: + case PR_umount2: + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + + /* mount(2) and pivot_root(2) are emulated by translating them + * into PRoot bindings (see emulate_mount/emulate_pivot_root) + * so the resulting paths actually become accessible. */ + case PR_mount: { + char src_user[PATH_MAX]; + char target_user[PATH_MAX]; + char fstype[256]; + word_t fstype_addr; + unsigned long flags; + + fstype[0] = '\0'; + + if (get_sysarg_path(tracee, src_user, SYSARG_1) >= 0 + && get_sysarg_path(tracee, target_user, SYSARG_2) >= 0) { + fstype_addr = peek_reg(tracee, CURRENT, SYSARG_3); + if (fstype_addr != 0) + (void) read_string(tracee, fstype, fstype_addr, + sizeof(fstype) - 1); + flags = peek_reg(tracee, CURRENT, SYSARG_4); + emulate_mount(tracee, src_user, target_user, fstype, flags); + } + + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + + case PR_pivot_root: { + char new_root_user[PATH_MAX]; + char put_old_user[PATH_MAX]; + + if (get_sysarg_path(tracee, new_root_user, SYSARG_1) >= 0 + && get_sysarg_path(tracee, put_old_user, SYSARG_2) >= 0) + emulate_pivot_root(tracee, new_root_user, put_old_user); + + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + case PR_open: flags = peek_reg(tracee, CURRENT, SYSARG_2); @@ -413,6 +650,8 @@ int translate_syscall_enter(Tracee *tracee) status = translate_sysarg(tracee, SYSARG_1, SYMLINK); else status = translate_sysarg(tracee, SYSARG_1, REGULAR); + if (status >= 0) + maybe_redirect_userns_file(tracee, SYSARG_1); break; case PR_fchownat: @@ -476,14 +715,6 @@ int translate_syscall_enter(Tracee *tracee) status = translate_sysarg(tracee, SYSARG_1, SYMLINK); break; - case PR_pivot_root: - status = translate_sysarg(tracee, SYSARG_1, REGULAR); - if (status < 0) - break; - - status = translate_sysarg(tracee, SYSARG_2, REGULAR); - break; - case 
PR_linkat: olddirfd = peek_reg(tracee, CURRENT, SYSARG_1); newdirfd = peek_reg(tracee, CURRENT, SYSARG_3); @@ -507,21 +738,6 @@ int translate_syscall_enter(Tracee *tracee) status = translate_path2(tracee, newdirfd, newpath, SYSARG_4, SYMLINK); break; - case PR_mount: - status = get_sysarg_path(tracee, path, SYSARG_1); - if (status < 0) - break; - - /* The following check covers only 90% of the cases. */ - if (path[0] == '/' || path[0] == '.') { - status = translate_path2(tracee, AT_FDCWD, path, SYSARG_1, REGULAR); - if (status < 0) - break; - } - - status = translate_sysarg(tracee, SYSARG_2, REGULAR); - break; - case PR_openat: dirfd = peek_reg(tracee, CURRENT, SYSARG_1); flags = peek_reg(tracee, CURRENT, SYSARG_3); @@ -540,6 +756,8 @@ int translate_syscall_enter(Tracee *tracee) status = translate_path2(tracee, dirfd, path, SYSARG_2, SYMLINK); else status = translate_path2(tracee, dirfd, path, SYSARG_2, REGULAR); + if (status >= 0) + maybe_redirect_userns_file(tracee, SYSARG_2); break; case PR_readlinkat: @@ -617,6 +835,7 @@ int translate_syscall_enter(Tracee *tracee) /* Prevent tracees from setting dumpable flag. 
* (Otherwise it could break tracee memory access) */ if (peek_reg(tracee, CURRENT, SYSARG_1) == PR_SET_DUMPABLE) { + poke_reg(tracee, SYSARG_RESULT, 0); set_sysnum(tracee, PR_void); status = 0; } diff --git a/src/syscall/seccomp.c b/src/syscall/seccomp.c index ee961437..04bb136b 100644 --- a/src/syscall/seccomp.c +++ b/src/syscall/seccomp.c @@ -413,6 +413,8 @@ static FilteredSysnum proot_sysnums[] = { { PR_umount, 0 }, { PR_umount2, 0 }, { PR_uname, FILTER_SYSEXIT }, + { PR_unshare, 0 }, + { PR_setns, 0 }, { PR_unlink, 0 }, { PR_unlinkat, 0 }, { PR_uselib, 0 }, From 96b22df846cb7052b7e34f27f62a65e322bd147d Mon Sep 17 00:00:00 2001 From: Sylirre Date: Sat, 23 May 2026 18:01:55 +0000 Subject: [PATCH 02/10] syscall: strip CLONE_NEW* flags from clone/clone3 bubblewrap calls clone(CLONE_NEWNS|SIGCHLD) directly (without going through unshare) and the Android kernel rejects it with EPERM when unprivileged user namespaces are disabled. Drop the namespace flags before the kernel sees them so the fork/thread itself still succeeds; PRoot keeps tracking the child via PTRACE_EVENT_CLONE. clone3 takes its flags from a struct clone_args in tracee memory, so read/write via peek_word/poke_word. Add both syscalls to the seccomp filter so the enter handler runs under seccomp mode 2. --- src/syscall/enter.c | 41 +++++++++++++++++++++++++++++++++++++++++ src/syscall/seccomp.c | 2 ++ 2 files changed, 43 insertions(+) diff --git a/src/syscall/enter.c b/src/syscall/enter.c index 5ff1f1e5..c023791b 100644 --- a/src/syscall/enter.c +++ b/src/syscall/enter.c @@ -30,6 +30,7 @@ #include /* bool */ #include /* PR_SET_DUMPABLE */ #include /* MS_BIND, MS_REMOUNT, ... */ +#include /* CLONE_NEW*, */ #include /* TCSETS, TCSANOW */ #include "cli/note.h" @@ -51,6 +52,18 @@ #include "path/temp.h" #include "arch.h" +/* Older kernel headers may lack these. 
*/ +#ifndef CLONE_NEWTIME +#define CLONE_NEWTIME 0x00000080 +#endif +#ifndef CLONE_NEWCGROUP +#define CLONE_NEWCGROUP 0x02000000 +#endif + +#define CLONE_NS_MASK (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | \ + CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET | \ + CLONE_NEWCGROUP | CLONE_NEWTIME) + /** * Translate @path and put the result in the @tracee's memory address * space pointed to by the @reg argument of the current syscall. See @@ -593,6 +606,34 @@ int translate_syscall_enter(Tracee *tracee) status = 0; break; + /* Strip CLONE_NEW* flags from clone(2)/clone3(2) so the + * syscall doesn't fail with EPERM on kernels that disallow + * unprivileged namespace creation (typical on Android). The + * fork/thread itself still proceeds normally and PRoot keeps + * tracking the child through PTRACE_EVENT_CLONE. */ + case PR_clone: { + word_t flags = peek_reg(tracee, CURRENT, SYSARG_1); + if ((flags & CLONE_NS_MASK) != 0) + poke_reg(tracee, SYSARG_1, flags & ~(word_t) CLONE_NS_MASK); + status = 0; + break; + } + + case PR_clone3: { + word_t args_addr = peek_reg(tracee, CURRENT, SYSARG_1); + word_t flags; + + if (args_addr != 0) { + errno = 0; + flags = peek_word(tracee, args_addr); + if (errno == 0 && (flags & CLONE_NS_MASK) != 0) + poke_word(tracee, args_addr, + flags & ~(word_t) CLONE_NS_MASK); + } + status = 0; + break; + } + /* mount(2) and pivot_root(2) are emulated by translating them * into PRoot bindings (see emulate_mount/emulate_pivot_root) * so the resulting paths actually become accessible. 
*/ diff --git a/src/syscall/seccomp.c b/src/syscall/seccomp.c index 04bb136b..ea372e6b 100644 --- a/src/syscall/seccomp.c +++ b/src/syscall/seccomp.c @@ -340,6 +340,8 @@ static FilteredSysnum proot_sysnums[] = { { PR_chown, 0 }, { PR_chown32, 0 }, { PR_chroot, 0 }, + { PR_clone, 0 }, + { PR_clone3, 0 }, { PR_close, 0 }, { PR_connect, 0 }, { PR_creat, 0 }, From 506e7622d2d31cd79f6ef16b4a47d1f9d1a79c1d Mon Sep 17 00:00:00 2001 From: Sylirre Date: Sat, 23 May 2026 18:02:01 +0000 Subject: [PATCH 03/10] syscall: force 0 return for voided syscalls in exit handler On some aarch64 kernels (notably Android) the SYSCALL_AVOIDER trick leaks -ENOSYS through to the tracee even though set_sysnum() ran: chdir under PROOT_NO_SECCOMP and bwrap's mount(NULL, "/", ...) both returned "Function not implemented". Add an exit-stage poke so the result is always 0 for these emulated syscalls, regardless of how the kernel handled SYSCALL_AVOIDER. Requires the syscalls to be filtered with FILTER_SYSEXIT under seccomp mode so the exit handler actually runs. Also poke SYSARG_RESULT=0 at enter for getcwd/chdir/fchdir, mirroring what mount/unshare/etc. already do. 
--- src/syscall/enter.c | 2 ++ src/syscall/exit.c | 9 +++++++++ src/syscall/seccomp.c | 12 ++++++------ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/syscall/enter.c b/src/syscall/enter.c index c023791b..2ea22629 100644 --- a/src/syscall/enter.c +++ b/src/syscall/enter.c @@ -355,6 +355,7 @@ int translate_syscall_enter(Tracee *tracee) break; case PR_getcwd: + poke_reg(tracee, SYSARG_RESULT, 0); set_sysnum(tracee, PR_void); status = 0; break; @@ -425,6 +426,7 @@ int translate_syscall_enter(Tracee *tracee) tracee->fs->cwd = tmp; talloc_set_name_const(tracee->fs->cwd, "$cwd"); + poke_reg(tracee, SYSARG_RESULT, 0); set_sysnum(tracee, PR_void); status = 0; break; diff --git a/src/syscall/exit.c b/src/syscall/exit.c index 59e05046..67fe8d7f 100644 --- a/src/syscall/exit.c +++ b/src/syscall/exit.c @@ -244,6 +244,15 @@ void translate_syscall_exit(Tracee *tracee) case PR_fchdir: case PR_chdir: + /* These syscalls are voided in enter.c; make sure the + * tracee always sees a 0 return value even on kernels where + * the SYSCALL_AVOIDER trick leaks -ENOSYS through. */ + case PR_unshare: + case PR_setns: + case PR_mount: + case PR_umount: + case PR_umount2: + case PR_pivot_root: /* These syscalls are fully emulated, see enter.c for details * (like errors). 
*/ status = 0; diff --git a/src/syscall/seccomp.c b/src/syscall/seccomp.c index ea372e6b..1732039e 100644 --- a/src/syscall/seccomp.c +++ b/src/syscall/seccomp.c @@ -380,14 +380,14 @@ static FilteredSysnum proot_sysnums[] = { { PR_mkdirat, 0 }, { PR_mknod, 0 }, { PR_mknodat, 0 }, - { PR_mount, 0 }, + { PR_mount, FILTER_SYSEXIT }, { PR_name_to_handle_at, 0 }, { PR_newfstatat, 0 }, { PR_oldlstat, 0 }, { PR_oldstat, 0 }, { PR_open, 0 }, { PR_openat, 0 }, - { PR_pivot_root, 0 }, + { PR_pivot_root, FILTER_SYSEXIT }, { PR_prctl, 0 }, { PR_prlimit64, FILTER_SYSEXIT }, { PR_ptrace, FILTER_SYSEXIT }, @@ -412,11 +412,11 @@ static FilteredSysnum proot_sysnums[] = { { PR_symlinkat, 0 }, { PR_truncate, 0 }, { PR_truncate64, 0 }, - { PR_umount, 0 }, - { PR_umount2, 0 }, + { PR_umount, FILTER_SYSEXIT }, + { PR_umount2, FILTER_SYSEXIT }, { PR_uname, FILTER_SYSEXIT }, - { PR_unshare, 0 }, - { PR_setns, 0 }, + { PR_unshare, FILTER_SYSEXIT }, + { PR_setns, FILTER_SYSEXIT }, { PR_unlink, 0 }, { PR_unlinkat, 0 }, { PR_uselib, 0 }, From da1955fe9930217145a973c64a9dbf6e1bf4880e Mon Sep 17 00:00:00 2001 From: Sylirre Date: Sat, 23 May 2026 18:02:07 +0000 Subject: [PATCH 04/10] seccomp: handle SIGSYS for namespace/mount syscalls Android's parent process installs a system-wide seccomp filter that traps mount/umount/pivot_root/unshare/setns with SIGSYS. Our regular sysenter handlers in enter.c never run for those syscalls because the kernel sends SIGSYS instead of executing the call, so bwrap was getting -ENOSYS from the SIGSYS handler's default branch. Add cases in handle_seccomp_event_common that pretend the syscall succeeded (mirroring what enter.c does), and apply the mount / pivot_root binding emulation so sandbox helpers like bubblewrap see the bindings they expect. The emulation helpers in enter.c are factored out into apply_emulated_mount() / apply_emulated_pivot_root() so the SIGSYS handler and the normal enter path share the same code. 
--- src/syscall/enter.c | 79 +++++++++++++++++++++++++++---------------- src/syscall/syscall.h | 3 ++ src/tracee/seccomp.c | 22 ++++++++++++ 3 files changed, 75 insertions(+), 29 deletions(-) diff --git a/src/syscall/enter.c b/src/syscall/enter.c index 2ea22629..e5987ba3 100644 --- a/src/syscall/enter.c +++ b/src/syscall/enter.c @@ -237,6 +237,52 @@ static void emulate_pivot_root(Tracee *tracee, const char *new_root_user, } } +/** + * Read mount(2) arguments from the @tracee's registers and apply + * emulate_mount(). Safe to call from both the normal sysenter path + * and the SIGSYS handler (Android's parent seccomp filter traps + * mount, so the syscall never reaches our regular case). + */ +void apply_emulated_mount(Tracee *tracee) +{ + char src_user[PATH_MAX]; + char target_user[PATH_MAX]; + char fstype[256]; + word_t fstype_addr; + unsigned long flags; + + fstype[0] = '\0'; + + if (get_sysarg_path(tracee, src_user, SYSARG_1) < 0) + return; + if (get_sysarg_path(tracee, target_user, SYSARG_2) < 0) + return; + + fstype_addr = peek_reg(tracee, CURRENT, SYSARG_3); + if (fstype_addr != 0) + (void) read_string(tracee, fstype, fstype_addr, sizeof(fstype) - 1); + flags = peek_reg(tracee, CURRENT, SYSARG_4); + + emulate_mount(tracee, src_user, target_user, fstype, flags); +} + +/** + * Read pivot_root(2) arguments from the @tracee's registers and apply + * emulate_pivot_root(). See apply_emulated_mount() for context. + */ +void apply_emulated_pivot_root(Tracee *tracee) +{ + char new_root_user[PATH_MAX]; + char put_old_user[PATH_MAX]; + + if (get_sysarg_path(tracee, new_root_user, SYSARG_1) < 0) + return; + if (get_sysarg_path(tracee, put_old_user, SYSARG_2) < 0) + return; + + emulate_pivot_root(tracee, new_root_user, put_old_user); +} + /** * Detect /proc//{uid_map,gid_map,setgroups}, which sandbox * helpers like bubblewrap write to during user-namespace setup. 
The @@ -639,44 +685,19 @@ int translate_syscall_enter(Tracee *tracee) /* mount(2) and pivot_root(2) are emulated by translating them * into PRoot bindings (see emulate_mount/emulate_pivot_root) * so the resulting paths actually become accessible. */ - case PR_mount: { - char src_user[PATH_MAX]; - char target_user[PATH_MAX]; - char fstype[256]; - word_t fstype_addr; - unsigned long flags; - - fstype[0] = '\0'; - - if (get_sysarg_path(tracee, src_user, SYSARG_1) >= 0 - && get_sysarg_path(tracee, target_user, SYSARG_2) >= 0) { - fstype_addr = peek_reg(tracee, CURRENT, SYSARG_3); - if (fstype_addr != 0) - (void) read_string(tracee, fstype, fstype_addr, - sizeof(fstype) - 1); - flags = peek_reg(tracee, CURRENT, SYSARG_4); - emulate_mount(tracee, src_user, target_user, fstype, flags); - } - + case PR_mount: + apply_emulated_mount(tracee); poke_reg(tracee, SYSARG_RESULT, 0); set_sysnum(tracee, PR_void); status = 0; break; - } - - case PR_pivot_root: { - char new_root_user[PATH_MAX]; - char put_old_user[PATH_MAX]; - - if (get_sysarg_path(tracee, new_root_user, SYSARG_1) >= 0 - && get_sysarg_path(tracee, put_old_user, SYSARG_2) >= 0) - emulate_pivot_root(tracee, new_root_user, put_old_user); + case PR_pivot_root: + apply_emulated_pivot_root(tracee); poke_reg(tracee, SYSARG_RESULT, 0); set_sysnum(tracee, PR_void); status = 0; break; - } case PR_open: flags = peek_reg(tracee, CURRENT, SYSARG_2); diff --git a/src/syscall/syscall.h b/src/syscall/syscall.h index e255cc88..3b067c42 100644 --- a/src/syscall/syscall.h +++ b/src/syscall/syscall.h @@ -36,4 +36,7 @@ extern void translate_syscall(Tracee *tracee); extern int translate_syscall_enter(Tracee *tracee); extern void translate_syscall_exit(Tracee *tracee); +extern void apply_emulated_mount(Tracee *tracee); +extern void apply_emulated_pivot_root(Tracee *tracee); + #endif /* SYSCALL_H */ diff --git a/src/tracee/seccomp.c b/src/tracee/seccomp.c index 77d5b273..650e32f6 100644 --- a/src/tracee/seccomp.c +++ b/src/tracee/seccomp.c 
@@ -160,6 +160,28 @@ static int handle_seccomp_event_common(Tracee *tracee) set_result_after_seccomp(tracee, 0); break; + /* The Android parent process commonly installs a seccomp + * filter that traps mount/umount/pivot_root/unshare/setns + * with SIGSYS. Mirror what enter.c does for these: pretend + * they succeeded and apply the mount/pivot_root binding + * emulation so sandbox helpers like bubblewrap can proceed. */ + case PR_mount: + apply_emulated_mount(tracee); + set_result_after_seccomp(tracee, 0); + break; + + case PR_pivot_root: + apply_emulated_pivot_root(tracee); + set_result_after_seccomp(tracee, 0); + break; + + case PR_umount: + case PR_umount2: + case PR_unshare: + case PR_setns: + set_result_after_seccomp(tracee, 0); + break; + case PR_getpgrp: /* Query value with getpgid and set it as result. */ set_result_after_seccomp(tracee, getpgid(tracee->pid)); From 2ae9c383f34cb5b5c122542a3e813863ce1de1d8 Mon Sep 17 00:00:00 2001 From: Sylirre Date: Sat, 23 May 2026 18:02:13 +0000 Subject: [PATCH 05/10] syscall: re-expose recommended bindings under put_old after pivot_root bubblewrap reads /oldroot/proc/self/fd/N to verify the mount it just asked for. With only a single /oldroot binding pointing at the previous rootfs host path, /oldroot/proc resolved to the proc directory inside that old rootfs on the host (empty), not the real /proc, so the readlink failed. After installing the put_old binding, walk the existing non-root bindings and add a parallel PUT_OLD/GUEST_PATH binding for each (same host path, guest path prefixed with put_old). The host /proc bound at /proc thus also becomes reachable at /oldroot/proc, which is what bwrap (and similar sandbox helpers) expects. 
--- src/syscall/enter.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/syscall/enter.c b/src/syscall/enter.c index e5987ba3..fece415a 100644 --- a/src/syscall/enter.c +++ b/src/syscall/enter.c @@ -231,9 +231,45 @@ static void emulate_pivot_root(Tracee *tracee, const char *new_root_user, && ( put_old_guest[prefix_len] == '/' || (prefix_len == 1 && new_root_guest[0] == '/'))) { put_old_after = put_old_guest + (prefix_len == 1 ? 0 : prefix_len); - if (put_old_after[0] == '/' && put_old_after[1] != '\0') + if (put_old_after[0] == '/' && put_old_after[1] != '\0') { + Binding *iter; + Binding *next; + size_t put_old_len = strlen(put_old_after); + char aliased[PATH_MAX]; + (void) insort_binding3(tracee, tracee->fs, old_root_host, put_old_after); + + /* Snapshot existing non-root bindings and + * re-expose each one at put_old_after/GUEST_PATH, + * so sandbox helpers can still reach the host + * /proc, /dev, ... through the agreed + * "oldroot" prefix. Iterate carefully: we + * mutate the same list we walk. */ + for (iter = CIRCLEQ_FIRST(tracee->fs->bindings.guest); + iter != (void *) tracee->fs->bindings.guest; + iter = next) { + next = CIRCLEQ_NEXT(iter, link.guest); + + if (strcmp(iter->guest.path, "/") == 0) + continue; + /* Skip the binding we just added for + * put_old itself, and anything already + * sitting under put_old. 
*/ + if (strncmp(iter->guest.path, put_old_after, put_old_len) == 0 + && (iter->guest.path[put_old_len] == '\0' + || iter->guest.path[put_old_len] == '/')) + continue; + + if ((size_t) snprintf(aliased, sizeof(aliased), "%s%s", + put_old_after, iter->guest.path) + >= sizeof(aliased)) + continue; + + (void) insort_binding3(tracee, tracee->fs, + iter->host.path, aliased); + } + } } } From 3b4d807f27bc82978dca19794960ce6cbb6ee71d Mon Sep 17 00:00:00 2001 From: Sylirre Date: Sat, 23 May 2026 18:02:19 +0000 Subject: [PATCH 06/10] syscall: isolate emulated mounts to CLONE_NEWNS children, emulate umount Two related fixes for the bubblewrap-on-PRoot emulation: 1. Subsequent bwrap runs in the same shell were failing with "Creating newroot failed: No such file or directory" because the bindings added by the previous bwrap leaked into the parent. bubblewrap clones with CLONE_NEWNS (which we strip); remember that on the tracee, and in new_child() deep-copy the binding tree so emulated mount(2) calls in the child don't propagate back to the parent. 2. umount of a runtime bind was a silent no-op. Add emulate_umount() that removes the matching binding when its guest path exactly equals the unmount target, and call it from both the regular sysenter handler and the SIGSYS handler. --- src/syscall/enter.c | 74 +++++++++++++++++++++++++++++++++++++++---- src/syscall/syscall.h | 1 + src/tracee/seccomp.c | 4 +++ src/tracee/tracee.c | 38 ++++++++++++++++++---- src/tracee/tracee.h | 6 ++++ 5 files changed, 111 insertions(+), 12 deletions(-) diff --git a/src/syscall/enter.c b/src/syscall/enter.c index fece415a..fffa16b5 100644 --- a/src/syscall/enter.c +++ b/src/syscall/enter.c @@ -273,6 +273,53 @@ static void emulate_pivot_root(Tracee *tracee, const char *new_root_user, } } +/** + * Emulate umount(@target_user) by removing the matching binding (if + * any) so that a subsequent access to @target_user no longer goes + * through the now-unmounted location. 
This is the inverse of + * emulate_mount(). Bindings put in place at PRoot startup + * (recommended -R bindings, the rootfs itself) are NOT removed: we + * only drop runtime bindings whose guest path exactly matches. + */ +static void emulate_umount(Tracee *tracee, const char *target_user) +{ + char guest_path[PATH_MAX]; + Binding *binding; + + if (guest_canonicalize(tracee, target_user, guest_path) < 0) + return; + + /* Never drop the root binding. */ + if (strcmp(guest_path, "/") == 0) + return; + + binding = get_binding(tracee, GUEST, guest_path); + if (binding == NULL) + return; + + /* Only drop the binding if its guest path is exactly the + * unmount target; otherwise we'd unbind something the tracee + * didn't ask to unmount (e.g. its containing rootfs). */ + if (strcmp(binding->guest.path, guest_path) != 0) + return; + + remove_binding_from_all_lists(tracee, binding); +} + +/** + * Read umount(2)/umount2(2) arguments from the @tracee's registers + * and apply emulate_umount(). + */ +void apply_emulated_umount(Tracee *tracee) +{ + char target_user[PATH_MAX]; + + if (get_sysarg_path(tracee, target_user, SYSARG_1) < 0) + return; + + emulate_umount(tracee, target_user); +} + /** * Read mount(2) arguments from the @tracee's registers and apply * emulate_mount(). Safe to call from both the normal sysenter path @@ -678,13 +725,19 @@ int translate_syscall_enter(Tracee *tracee) status = translate_sysarg(tracee, SYSARG_1, REGULAR); break; - /* Pretend namespace/unmount syscalls succeed without doing - * anything; PRoot can't really create namespaces, and sandbox - * helpers like bubblewrap only check the return value. */ + /* Pretend namespace syscalls succeed without doing anything; + * PRoot can't really create namespaces, and sandbox helpers + * like bubblewrap only check the return value. 
*/ case PR_unshare: case PR_setns: + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + case PR_umount: case PR_umount2: + apply_emulated_umount(tracee); poke_reg(tracee, SYSARG_RESULT, 0); set_sysnum(tracee, PR_void); status = 0; @@ -694,11 +747,17 @@ int translate_syscall_enter(Tracee *tracee) * syscall doesn't fail with EPERM on kernels that disallow * unprivileged namespace creation (typical on Android). The * fork/thread itself still proceeds normally and PRoot keeps - * tracking the child through PTRACE_EVENT_CLONE. */ + * tracking the child through PTRACE_EVENT_CLONE. When the + * caller asked for CLONE_NEWNS, remember it on the tracee so + * the new child gets isolated bindings (otherwise emulated + * mount(2) calls in the child would leak into the parent). */ case PR_clone: { word_t flags = peek_reg(tracee, CURRENT, SYSARG_1); - if ((flags & CLONE_NS_MASK) != 0) + if ((flags & CLONE_NS_MASK) != 0) { + if ((flags & CLONE_NEWNS) != 0) + tracee->clone_stripped_newns = true; poke_reg(tracee, SYSARG_1, flags & ~(word_t) CLONE_NS_MASK); + } status = 0; break; } @@ -710,9 +769,12 @@ int translate_syscall_enter(Tracee *tracee) if (args_addr != 0) { errno = 0; flags = peek_word(tracee, args_addr); - if (errno == 0 && (flags & CLONE_NS_MASK) != 0) + if (errno == 0 && (flags & CLONE_NS_MASK) != 0) { + if ((flags & CLONE_NEWNS) != 0) + tracee->clone_stripped_newns = true; poke_word(tracee, args_addr, flags & ~(word_t) CLONE_NS_MASK); + } } status = 0; break; diff --git a/src/syscall/syscall.h b/src/syscall/syscall.h index 3b067c42..e0e263c9 100644 --- a/src/syscall/syscall.h +++ b/src/syscall/syscall.h @@ -38,5 +38,6 @@ extern void translate_syscall_exit(Tracee *tracee); extern void apply_emulated_mount(Tracee *tracee); extern void apply_emulated_pivot_root(Tracee *tracee); +extern void apply_emulated_umount(Tracee *tracee); #endif /* SYSCALL_H */ diff --git a/src/tracee/seccomp.c b/src/tracee/seccomp.c index 650e32f6..cd664ddc 
100644 --- a/src/tracee/seccomp.c +++ b/src/tracee/seccomp.c @@ -177,6 +177,10 @@ static int handle_seccomp_event_common(Tracee *tracee) case PR_umount: case PR_umount2: + apply_emulated_umount(tracee); + set_result_after_seccomp(tracee, 0); + break; + case PR_unshare: case PR_setns: set_result_after_seccomp(tracee, 0); diff --git a/src/tracee/tracee.c b/src/tracee/tracee.c index 678d1639..68bd10d7 100644 --- a/src/tracee/tracee.c +++ b/src/tracee/tracee.c @@ -540,12 +540,38 @@ int new_child(Tracee *parent, word_t clone_flags) return -ENOMEM; talloc_set_name_const(child->fs->cwd, "$cwd"); - /* Bindings are shared across file-system name-spaces since a - * "mount --bind" made by a process affects all other processes - * under Linux. Actually they are copied when a sub - * reconfiguration occured (nested proot or chroot(2)). */ - child->fs->bindings.guest = talloc_reference(child->fs, parent->fs->bindings.guest); - child->fs->bindings.host = talloc_reference(child->fs, parent->fs->bindings.host); + if (parent->clone_stripped_newns + && parent->fs->bindings.guest != NULL) { + /* Caller asked for CLONE_NEWNS (which we + * silently stripped). Give the child its own + * copy of the binding tree so emulated mount(2) + * calls don't propagate back to the parent. 
*/ + Binding *iter; + + child->fs->bindings.guest = talloc_zero(child->fs, Bindings); + child->fs->bindings.host = talloc_zero(child->fs, Bindings); + if ( child->fs->bindings.guest == NULL + || child->fs->bindings.host == NULL) + return -ENOMEM; + CIRCLEQ_INIT(child->fs->bindings.guest); + CIRCLEQ_INIT(child->fs->bindings.host); + + for (iter = CIRCLEQ_FIRST(parent->fs->bindings.guest); + iter != (void *) parent->fs->bindings.guest; + iter = CIRCLEQ_NEXT(iter, link.guest)) + (void) insort_binding3(child, child->fs, + iter->host.path, + iter->guest.path); + parent->clone_stripped_newns = false; + } + else { + /* Bindings are shared across file-system name-spaces since a + * "mount --bind" made by a process affects all other processes + * under Linux. Actually they are copied when a sub + * reconfiguration occured (nested proot or chroot(2)). */ + child->fs->bindings.guest = talloc_reference(child->fs, parent->fs->bindings.guest); + child->fs->bindings.host = talloc_reference(child->fs, parent->fs->bindings.host); + } } /* The path to the executable is unshared only once the child diff --git a/src/tracee/tracee.h b/src/tracee/tracee.h index 60c55175..fed9a2cc 100644 --- a/src/tracee/tracee.h +++ b/src/tracee/tracee.h @@ -103,6 +103,12 @@ typedef struct tracee { /* Is it a "clone", i.e has the same parent as its creator. */ bool clone; + /* Set when the current clone(2)/clone3(2) had CLONE_NEW* flags + * stripped (see translate_syscall_enter); the new child should + * get its own copy of the bindings so emulated mount(2) calls + * stay scoped to the would-be namespace. Reset once consumed. */ + bool clone_stripped_newns; + /* Support for ptrace emulation (tracer side). 
*/ struct { size_t nb_ptracees; From ed4ab00adb4328e06375fc1ffda51c339b756898 Mon Sep 17 00:00:00 2001 From: Sylirre Date: Sat, 23 May 2026 18:27:26 +0000 Subject: [PATCH 07/10] syscall: bind host /dev and /dev/pts for devtmpfs/devpts mounts Previously emulated mount of fstype "devtmpfs" or "devpts" got the same empty-tmpdir treatment as "tmpfs", which meant the tracee saw an empty directory instead of any real device. Bind the host /dev (for devtmpfs) and /dev/pts (for devpts) instead, so things like opening /dev/null or a pty inside the sandbox actually work. --- src/syscall/enter.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/syscall/enter.c b/src/syscall/enter.c index fffa16b5..d590622e 100644 --- a/src/syscall/enter.c +++ b/src/syscall/enter.c @@ -158,9 +158,11 @@ static void emulate_mount(Tracee *tracee, const char *src_user, strcpy(host_path, "/proc"); else if (strcmp(fstype, "sysfs") == 0) strcpy(host_path, "/sys"); - else if ( strcmp(fstype, "tmpfs") == 0 - || strcmp(fstype, "devpts") == 0 - || strcmp(fstype, "devtmpfs") == 0) { + else if (strcmp(fstype, "devtmpfs") == 0) + strcpy(host_path, "/dev"); + else if (strcmp(fstype, "devpts") == 0) + strcpy(host_path, "/dev/pts"); + else if (strcmp(fstype, "tmpfs") == 0) { tmpdir = create_temp_directory(tracee->fs, "proot-tmpfs-"); if (tmpdir == NULL) return; From aac060c89d4fa2abe115326dbc6733cd9ecd9e64 Mon Sep 17 00:00:00 2001 From: Sylirre Date: Sat, 23 May 2026 20:39:22 +0000 Subject: [PATCH 08/10] syscall: emulate AF_NETLINK / NETLINK_ROUTE for loopback_setup Bubblewrap's --unshare-net path calls loopback_setup(), which: 1. if_nametoindex("lo") -> ioctl(SIOCGIFINDEX, {ifr_name="lo"}) 2. socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE) 3. bind() the netlink socket 4. sendto/recv RTM_NEWADDR + RTM_NEWLINK On Android the underlying syscalls return EACCES because the real caller lacks CAP_NET_ADMIN. We can't really set the loopback up but we can make bwrap think we did. 
- ioctl(SIOCGIFINDEX) for "lo" is intercepted and filled with index 1. - socket(AF_NETLINK, ...) is silently rewritten to socket(AF_UNIX, SOCK_DGRAM, ...). The resulting fd is tracked on the tracee. - bind/sendto/recvfrom on a tracked fd is voided. sendto records the request's nlmsg_seq, recvfrom writes back a synthesised NLMSG_ERROR reply with error=0, nlmsg_seq from the request, and nlmsg_pid set to the tracee's pid (bwrap checks both). - close() on a tracked fd removes it from the set so a reused fd number doesn't keep being intercepted. --- src/syscall/enter.c | 238 +++++++++++++++++++++++++++++++++++++++--- src/syscall/exit.c | 23 ++++ src/syscall/seccomp.c | 5 + src/tracee/tracee.h | 12 +++ 4 files changed, 263 insertions(+), 15 deletions(-) diff --git a/src/syscall/enter.c b/src/syscall/enter.c index d590622e..8b126570 100644 --- a/src/syscall/enter.c +++ b/src/syscall/enter.c @@ -28,10 +28,15 @@ #include /* PATH_MAX, */ #include /* strcpy */ #include /* bool */ +#include /* uint32_t */ #include /* PR_SET_DUMPABLE */ #include /* MS_BIND, MS_REMOUNT, ... */ +#include /* AF_NETLINK, AF_UNIX, SOCK_DGRAM, SOCK_CLOEXEC */ #include /* CLONE_NEW*, */ #include /* TCSETS, TCSANOW */ +#include /* struct nlmsghdr, NLMSG_ERROR, struct nlmsgerr */ +#include /* SIOCGIFINDEX */ +#include /* struct ifreq, IFNAMSIZ */ #include "cli/note.h" #include "syscall/syscall.h" @@ -368,6 +373,99 @@ void apply_emulated_pivot_root(Tracee *tracee) emulate_pivot_root(tracee, new_root_user, put_old_user); } +/** + * Helpers for emulating AF_NETLINK / NETLINK_ROUTE traffic. The + * tracee asks for a real netlink socket which Android usually denies + * with EACCES (no CAP_NET_ADMIN); we silently substitute an + * AF_UNIX/SOCK_DGRAM socket and intercept the few netlink-shaped + * syscalls bubblewrap's loopback_setup() actually makes + * (bind/sendto/recvfrom), synthesising an NLMSG_ERROR success reply. 
+ */ + +static bool is_fake_netlink_fd(const Tracee *tracee, int fd) +{ + int i; + if (fd < 0) + return false; + for (i = 0; i < tracee->fake_netlink_fds_count; i++) + if (tracee->fake_netlink_fds[i] == fd) + return true; + return false; +} + +static void unmark_fake_netlink_fd(Tracee *tracee, int fd) +{ + int i; + for (i = 0; i < tracee->fake_netlink_fds_count; i++) { + if (tracee->fake_netlink_fds[i] == fd) { + tracee->fake_netlink_fds[i] = + tracee->fake_netlink_fds[--tracee->fake_netlink_fds_count]; + return; + } + } +} + +/** + * Write a synthetic NLMSG_ERROR reply (with error=0) into the + * tracee's buffer, mirroring the @seq the caller used in its request. + * Returns the number of bytes written, or 0 on failure. + * + * bubblewrap's rtnl_read_reply checks both the sequence number AND + * that nlmsg_pid matches the tracee's own pid, so set them both. + */ +static size_t write_fake_netlink_ack(Tracee *tracee, word_t buf_addr, + word_t buf_len, uint32_t seq) +{ + struct { + struct nlmsghdr hdr; + struct nlmsgerr err; + } reply; + size_t reply_len = sizeof(reply); + + if (buf_len < reply_len) + return 0; + + memset(&reply, 0, sizeof(reply)); + reply.hdr.nlmsg_len = reply_len; + reply.hdr.nlmsg_type = NLMSG_ERROR; + reply.hdr.nlmsg_flags = 0; + reply.hdr.nlmsg_seq = seq; + reply.hdr.nlmsg_pid = (uint32_t) tracee->pid; + reply.err.error = 0; + /* reply.err.msg is the (zeroed) header of the original request; + * loopback_setup() only checks the error field. */ + + if (write_data(tracee, buf_addr, &reply, reply_len) < 0) + return 0; + return reply_len; +} + +/** + * If @cmd is SIOCGIFINDEX for "lo", fake an answer of 1 in the + * tracee's ifreq buffer. Android often denies this ioctl when the + * caller lacks CAP_NET_ADMIN; bubblewrap's loopback_setup() calls + * if_nametoindex("lo") which goes through this ioctl and bails out + * with "Permission denied" on failure. 
+ */ +static bool maybe_fake_siocgifindex(Tracee *tracee, word_t cmd, word_t arg) +{ + struct ifreq ifr; + + if (cmd != SIOCGIFINDEX) + return false; + if (arg == 0) + return false; + if (read_data(tracee, &ifr, arg, sizeof(ifr)) < 0) + return false; + if (strncmp(ifr.ifr_name, "lo", IFNAMSIZ) != 0) + return false; + + ifr.ifr_ifindex = 1; + if (write_data(tracee, arg, &ifr, sizeof(ifr)) < 0) + return false; + return true; +} + /** * Detect /proc//{uid_map,gid_map,setgroups}, which sandbox * helpers like bubblewrap write to during user-namespace setup. The @@ -568,6 +666,17 @@ int translate_syscall_enter(Tracee *tracee) word_t address; word_t size; + /* If we already redirected this fd to AF_UNIX as part + * of the AF_NETLINK emulation, fail the bind silently + * (the kernel would otherwise refuse our sockaddr_nl). */ + if (syscall_number == PR_bind + && is_fake_netlink_fd(tracee, peek_reg(tracee, CURRENT, SYSARG_1))) { + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + address = peek_reg(tracee, CURRENT, SYSARG_2); size = peek_reg(tracee, CURRENT, SYSARG_3); @@ -626,6 +735,89 @@ int translate_syscall_enter(Tracee *tracee) break; } + /* Substitute an AF_UNIX/SOCK_DGRAM socket for AF_NETLINK + * requests so the kernel doesn't reject them with EACCES on + * Android, then track the resulting fd so bind/sendto/recvfrom + * on it can be faked too. 
*/ + case PR_socket: { + word_t domain = peek_reg(tracee, CURRENT, SYSARG_1); + if (domain == AF_NETLINK) { + word_t type = peek_reg(tracee, CURRENT, SYSARG_2); + poke_reg(tracee, SYSARG_1, AF_UNIX); + poke_reg(tracee, SYSARG_2, SOCK_DGRAM | (type & SOCK_CLOEXEC)); + poke_reg(tracee, SYSARG_3, 0); + tracee->pending_fake_netlink_socket = true; + tracee->sysexit_pending = true; + tracee->restart_how = PTRACE_SYSCALL; + } + status = 0; + break; + } + + case PR_sendto: { + int fd = peek_reg(tracee, CURRENT, SYSARG_1); + if (is_fake_netlink_fd(tracee, fd)) { + word_t buf = peek_reg(tracee, CURRENT, SYSARG_2); + word_t len = peek_reg(tracee, CURRENT, SYSARG_3); + struct nlmsghdr hdr; + + if (buf != 0 && len >= sizeof(hdr) + && read_data(tracee, &hdr, buf, sizeof(hdr)) == 0) + tracee->fake_netlink_pending_seq = hdr.nlmsg_seq; + + poke_reg(tracee, SYSARG_RESULT, len); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + status = 0; + break; + } + + case PR_sendmsg: { + int fd = peek_reg(tracee, CURRENT, SYSARG_1); + if (is_fake_netlink_fd(tracee, fd)) { + /* Pretend we sent everything. bubblewrap only + * uses sendto(); we accept sendmsg too just in + * case. */ + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + status = 0; + break; + } + + case PR_recvfrom: { + int fd = peek_reg(tracee, CURRENT, SYSARG_1); + if (is_fake_netlink_fd(tracee, fd)) { + word_t buf = peek_reg(tracee, CURRENT, SYSARG_2); + word_t len = peek_reg(tracee, CURRENT, SYSARG_3); + size_t n = write_fake_netlink_ack(tracee, buf, len, + tracee->fake_netlink_pending_seq); + poke_reg(tracee, SYSARG_RESULT, (word_t) n); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + status = 0; + break; + } + + case PR_recvmsg: { + int fd = peek_reg(tracee, CURRENT, SYSARG_1); + if (is_fake_netlink_fd(tracee, fd)) { + /* Same fallback as sendmsg: return EOF. 
*/ + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + status = 0; + break; + } + case PR_socketcall: { word_t args_addr; word_t sock_addr_saved; @@ -1013,31 +1205,39 @@ int translate_syscall_enter(Tracee *tracee) } break; + case PR_ioctl: { + word_t cmd = peek_reg(tracee, CURRENT, SYSARG_2); + word_t arg = peek_reg(tracee, CURRENT, SYSARG_3); + + /* SIOCGIFINDEX for "lo": Android often denies this with + * EACCES; fake an answer of 1 so bubblewrap's + * loopback_setup() can proceed. */ + if (cmd == SIOCGIFINDEX && maybe_fake_siocgifindex(tracee, cmd, arg)) { + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + break; + } + #ifdef __ANDROID__ - case PR_ioctl: /* Using literal value because Termux build system patches TCSAFLUSH */ - if (peek_reg(tracee, CURRENT, SYSARG_2) == TCSETS + 2 /* + TCSAFLUSH */) { + if (cmd == TCSETS + 2 /* + TCSAFLUSH */) poke_reg(tracee, SYSARG_2, TCSETS + TCSANOW); - } - if (peek_reg(tracee, CURRENT, SYSARG_2) == TCGETS2) { + if (cmd == TCGETS2) poke_reg(tracee, SYSARG_2, TCGETS); - } - if (peek_reg(tracee, CURRENT, SYSARG_2) == TCSETS2) { + if (cmd == TCSETS2) poke_reg(tracee, SYSARG_2, TCSETS); - } - if (peek_reg(tracee, CURRENT, SYSARG_2) == TCSETSW2) { + if (cmd == TCSETSW2) poke_reg(tracee, SYSARG_2, TCSETSW); - } - if (peek_reg(tracee, CURRENT, SYSARG_2) == TCSETSF2) { + if (cmd == TCSETSF2) poke_reg(tracee, SYSARG_2, TCSETSF); - } +#endif break; -#endif + } case PR_memfd_create: { @@ -1068,12 +1268,20 @@ int translate_syscall_enter(Tracee *tracee) } break; } - case PR_close: + case PR_close: { + int closed_fd = (int) peek_reg(tracee, CURRENT, SYSARG_1); + /* Stop tracking auxv_fd once the tracee closes it. 
*/ - if (tracee->auxv_fd >= 0 - && (int) peek_reg(tracee, CURRENT, SYSARG_1) == tracee->auxv_fd) + if (tracee->auxv_fd >= 0 && closed_fd == tracee->auxv_fd) tracee->auxv_fd = -1; + + /* Drop the fd from the fake-AF_NETLINK tracking set, + * otherwise its number could be reused for an unrelated + * file and we'd keep intercepting sendto/recvfrom on + * it. */ + unmark_fake_netlink_fd(tracee, closed_fd); break; + } } diff --git a/src/syscall/exit.c b/src/syscall/exit.c index 67fe8d7f..c646543b 100644 --- a/src/syscall/exit.c +++ b/src/syscall/exit.c @@ -673,6 +673,29 @@ void translate_syscall_exit(Tracee *tracee) } goto end; + case PR_socket: + /* Record the fd we substituted for an AF_NETLINK request. */ + if (tracee->pending_fake_netlink_socket) { + int fd = (int) peek_reg(tracee, CURRENT, SYSARG_RESULT); + if (fd >= 0) { + int i; + if (tracee->fake_netlink_fds_count < MAX_FAKE_NETLINK_FDS) { + /* Avoid duplicates. */ + bool present = false; + for (i = 0; i < tracee->fake_netlink_fds_count; i++) { + if (tracee->fake_netlink_fds[i] == fd) { + present = true; + break; + } + } + if (!present) + tracee->fake_netlink_fds[tracee->fake_netlink_fds_count++] = fd; + } + } + tracee->pending_fake_netlink_socket = false; + } + goto end; + default: goto end; } diff --git a/src/syscall/seccomp.c b/src/syscall/seccomp.c index 1732039e..af38038a 100644 --- a/src/syscall/seccomp.c +++ b/src/syscall/seccomp.c @@ -345,6 +345,11 @@ static FilteredSysnum proot_sysnums[] = { { PR_close, 0 }, { PR_connect, 0 }, { PR_creat, 0 }, + { PR_recvfrom, 0 }, + { PR_recvmsg, 0 }, + { PR_sendmsg, 0 }, + { PR_sendto, 0 }, + { PR_socket, FILTER_SYSEXIT }, { PR_execve, FILTER_SYSEXIT }, { PR_execveat, FILTER_SYSEXIT }, { PR_faccessat, 0 }, diff --git a/src/tracee/tracee.h b/src/tracee/tracee.h index fed9a2cc..05702c38 100644 --- a/src/tracee/tracee.h +++ b/src/tracee/tracee.h @@ -109,6 +109,18 @@ typedef struct tracee { * stay scoped to the would-be namespace. Reset once consumed. 
*/ bool clone_stripped_newns; + /* Emulation of AF_NETLINK / NETLINK_ROUTE sockets for + * sandbox helpers like bubblewrap that try to bring up the + * loopback interface inside their would-be net namespace. + * fake_netlink_fds holds the fds of sockets we silently + * redirected from AF_NETLINK to AF_UNIX/SOCK_DGRAM; see + * enter.c / exit.c for the intercepts. */ +#define MAX_FAKE_NETLINK_FDS 8 + int fake_netlink_fds[MAX_FAKE_NETLINK_FDS]; + int fake_netlink_fds_count; + bool pending_fake_netlink_socket; + uint32_t fake_netlink_pending_seq; + /* Support for ptrace emulation (tracer side). */ struct { size_t nb_ptracees; From 02ec0b0b30e87c398519ce984a4b8e36391dc53b Mon Sep 17 00:00:00 2001 From: Sylirre Date: Sun, 24 May 2026 23:02:03 +0000 Subject: [PATCH 09/10] syscall: only intercept AF_NETLINK when host kernel denies it Previously socket(AF_NETLINK, ...) was unconditionally rewritten to socket(AF_UNIX, SOCK_DGRAM, 0), which broke legitimate netlink users inside the rootfs (e.g. c-ares under dnf would observe a zero-byte recvmsg and abort with "Unexpected netlink response of size 0 on descriptor N (address family 1)"). The rewrite is only needed where the kernel refuses AF_NETLINK outright (typical on Android/Termux). Probe socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE) once at first use and cache the outcome; only fall back to the AF_UNIX emulation when the probe fails. When the probe succeeds, no fd is tracked, so the dependent bind/sendto/recv intercepts stay inert. --- src/syscall/enter.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/syscall/enter.c b/src/syscall/enter.c index 8b126570..ef16a952 100644 --- a/src/syscall/enter.c +++ b/src/syscall/enter.c @@ -380,8 +380,31 @@ void apply_emulated_pivot_root(Tracee *tracee) * AF_UNIX/SOCK_DGRAM socket and intercept the few netlink-shaped * syscalls bubblewrap's loopback_setup() actually makes * (bind/sendto/recvfrom), synthesising an NLMSG_ERROR success reply. 
+ * + * Only kick in when the host kernel actually denies AF_NETLINK + * (Termux/Android with restrictive seccomp). On stock Linux the + * tracee can open NETLINK_ROUTE just fine and our rewrite would only + * break legitimate users like c-ares (dnf, getaddrinfo, ...). */ +static bool host_blocks_af_netlink(void) +{ + static int cached = -1; /* -1: unknown, 0: works, 1: blocked */ + int fd; + + if (cached != -1) + return cached == 1; + + fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + if (fd >= 0) { + close(fd); + cached = 0; + return false; + } + cached = 1; + return true; +} + static bool is_fake_netlink_fd(const Tracee *tracee, int fd) { int i; @@ -741,7 +764,7 @@ int translate_syscall_enter(Tracee *tracee) * on it can be faked too. */ case PR_socket: { word_t domain = peek_reg(tracee, CURRENT, SYSARG_1); - if (domain == AF_NETLINK) { + if (domain == AF_NETLINK && host_blocks_af_netlink()) { word_t type = peek_reg(tracee, CURRENT, SYSARG_2); poke_reg(tracee, SYSARG_1, AF_UNIX); poke_reg(tracee, SYSARG_2, SOCK_DGRAM | (type & SOCK_CLOEXEC)); From 4d0c46f515d17b642590648c6061bc18541e850c Mon Sep 17 00:00:00 2001 From: Sylirre Date: Sun, 24 May 2026 23:48:47 +0000 Subject: [PATCH 10/10] syscall: include unistd.h and report when AF_NETLINK fallback engages Pulls in for close(2) so the netlink probe builds without the implicit-declaration warning. While here, replace the magic -1/0/1 cache values with a named enum, fix the comment to call out the real reasons AF_NETLINK gets denied (SELinux, inherited seccomp, hardened containers) rather than implying it is always seccomp, and emit a VERBOSE note the first time the probe falls back so users can tell from -v output whether the AF_UNIX shim is active. 
--- src/syscall/enter.c | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/src/syscall/enter.c b/src/syscall/enter.c index ef16a952..91e1520e 100644 --- a/src/syscall/enter.c +++ b/src/syscall/enter.c @@ -25,6 +25,7 @@ #include /* struct sockaddr_un, */ #include /* SYS_*, */ #include /* AT_FDCWD, */ +#include /* close(2), */ #include /* PATH_MAX, */ #include /* strcpy */ #include /* bool */ @@ -374,34 +375,43 @@ void apply_emulated_pivot_root(Tracee *tracee) } /** - * Helpers for emulating AF_NETLINK / NETLINK_ROUTE traffic. The - * tracee asks for a real netlink socket which Android usually denies - * with EACCES (no CAP_NET_ADMIN); we silently substitute an - * AF_UNIX/SOCK_DGRAM socket and intercept the few netlink-shaped - * syscalls bubblewrap's loopback_setup() actually makes - * (bind/sendto/recvfrom), synthesising an NLMSG_ERROR success reply. + * Helpers for emulating AF_NETLINK / NETLINK_ROUTE traffic. Some + * environments deny the tracee a real netlink socket (Android's + * SELinux policy on untrusted_app domains, seccomp filters inherited + * from a Termux-like launcher, hardened containers, ...); in that + * case we silently substitute an AF_UNIX/SOCK_DGRAM socket and + * intercept the few netlink-shaped syscalls bubblewrap's + * loopback_setup() actually makes (bind/sendto/recvfrom), + * synthesising an NLMSG_ERROR success reply. * - * Only kick in when the host kernel actually denies AF_NETLINK - * (Termux/Android with restrictive seccomp). On stock Linux the - * tracee can open NETLINK_ROUTE just fine and our rewrite would only - * break legitimate users like c-ares (dnf, getaddrinfo, ...). + * The substitution only happens when the host kernel actually + * refuses AF_NETLINK; otherwise the tracee gets a real netlink + * socket and ordinary users like c-ares (dnf, getaddrinfo, ...) + * keep working. 
*/ -static bool host_blocks_af_netlink(void) +static bool host_blocks_af_netlink(const Tracee *tracee) { - static int cached = -1; /* -1: unknown, 0: works, 1: blocked */ + enum { PROBE_UNKNOWN, PROBE_ALLOWED, PROBE_BLOCKED }; + static int cached = PROBE_UNKNOWN; int fd; + int saved_errno; - if (cached != -1) - return cached == 1; + if (cached != PROBE_UNKNOWN) + return cached == PROBE_BLOCKED; fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); if (fd >= 0) { close(fd); - cached = 0; + cached = PROBE_ALLOWED; return false; } - cached = 1; + + saved_errno = errno; + cached = PROBE_BLOCKED; + VERBOSE(tracee, 1, "AF_NETLINK denied by host (%s); enabling " + "AF_UNIX fallback for sandbox helpers", + strerror(saved_errno)); return true; } @@ -764,7 +774,7 @@ int translate_syscall_enter(Tracee *tracee) * on it can be faked too. */ case PR_socket: { word_t domain = peek_reg(tracee, CURRENT, SYSARG_1); - if (domain == AF_NETLINK && host_blocks_af_netlink()) { + if (domain == AF_NETLINK && host_blocks_af_netlink(tracee)) { word_t type = peek_reg(tracee, CURRENT, SYSARG_2); poke_reg(tracee, SYSARG_1, AF_UNIX); poke_reg(tracee, SYSARG_2, SOCK_DGRAM | (type & SOCK_CLOEXEC));