diff --git a/src/extension/mountinfo/mountinfo.c b/src/extension/mountinfo/mountinfo.c index 9b66c76c..1abf2735 100644 --- a/src/extension/mountinfo/mountinfo.c +++ b/src/extension/mountinfo/mountinfo.c @@ -1,9 +1,42 @@ #include "extension/extension.h" #include "path/path.h" /* translate_path, */ +#include "path/binding.h" /* Binding, bindings */ #include "path/temp.h" /* create_temp_file, */ #include /* INT_MAX, */ #include /* PATH_MAX, */ #include /* strlen, strcmp */ +#include /* FILE, getline, fprintf */ +#include /* free, */ +#include /* CIRCLEQ_*, */ + +/** + * Append a synthesized mount-table line to @fp for each runtime + * binding (i.e. one that wasn't part of the static -r/-b set). This + * is what lets sandbox helpers like bubblewrap find the mount they + * just asked PRoot to create via emulate_mount(). + */ +static void append_runtime_binding_lines(Tracee *target_tracee, FILE *fp) +{ + Binding *binding; + int next_id = 1000000; + int parent_id = 1; + + if (target_tracee->fs->bindings.guest == NULL) + return; + + for (binding = CIRCLEQ_FIRST(target_tracee->fs->bindings.guest); + binding != (void *) target_tracee->fs->bindings.guest; + binding = CIRCLEQ_NEXT(binding, link.guest)) { + /* Skip the root binding "/" — already present as the kernel root. */ + if (strcmp(binding->guest.path, "/") == 0) + continue; + + fprintf(fp, + "%d %d 0:1 / %s rw,relatime - bind %s rw,relatime\n", + next_id++, parent_id, + binding->guest.path, binding->host.path); + } +} static void mountinfo_check_open_path(Tracee *tracee, char path[PATH_MAX]) { /* Try matching "/proc//mountinfo" */ @@ -33,10 +66,29 @@ static void mountinfo_check_open_path(Tracee *tracee, char path[PATH_MAX]) { char root_path[PATH_MAX]; // Host path to guest root translate_path(target_tracee, root_path, AT_FDCWD, "/", true); Comparison compare_result = compare_paths(root_path, "/data"); - if (compare_result != PATH2_IS_PREFIX && compare_result != PATHS_ARE_EQUAL) { - return; + bool is_android_data = (compare_result == PATH2_IS_PREFIX || compare_result == PATHS_ARE_EQUAL); + + /* Are there bindings to expose as fake mounts (mount(2) + * calls from sandbox helpers are converted into + * bindings — see emulate_mount). Skip the root + * binding, which the real kernel mount table already + * covers. */ + bool has_extra_bindings = false; + if (target_tracee->fs->bindings.guest != NULL) { + Binding *b; + for (b = CIRCLEQ_FIRST(target_tracee->fs->bindings.guest); + b != (void *) target_tracee->fs->bindings.guest; + b = CIRCLEQ_NEXT(b, link.guest)) { + if (strcmp(b->guest.path, "/") != 0) { + has_extra_bindings = true; + break; + } + } } + if (!is_android_data && !has_extra_bindings) + return; + /* Open real /proc//mountinfo */ FILE *real_mountinfo_fp = fopen(path, "r"); if (real_mountinfo_fp == NULL) { @@ -55,38 +107,14 @@ static void mountinfo_check_open_path(Tracee *tracee, char path[PATH_MAX]) { size_t line_buf_len = 0; ssize_t line_len = 0; bool found_line = false; - while ((line_len = getline(&line, &line_buf_len, real_mountinfo_fp)) > 0) { - char *chunk = line; - /* Skip columns before 'root' */ - for (int i = 0; i < 4 && chunk - line < line_len; i++) { - chunk = strchr(chunk, ' '); - if (chunk == NULL) goto end_line_scan; - chunk++; - } - - /* Match path */ - char *chunk_end = strchr(chunk, ' '); - if (chunk_end == NULL) continue; - if (chunk_end - chunk == 5 && 0 == memcmp(chunk, "/data", 5)) { - /* Write line into new file keeping only "/" from root column */ - fwrite(line, chunk - line + 1, 1, new_mountinfo_fp); - fwrite(chunk_end, line_len - (chunk_end - line), 1, new_mountinfo_fp); - found_line = true; - break; - } -end_line_scan: ; - } - - /* Once root was added, rescan and add other standard mounts */ - if (found_line) { - fseek(real_mountinfo_fp, 0, SEEK_SET); + if (is_android_data) { while ((line_len = getline(&line, &line_buf_len, real_mountinfo_fp)) > 0) { char *chunk = line; /* Skip columns before 'root' */ for (int i = 0; i < 4 && chunk - line < line_len; i++) { chunk = strchr(chunk, ' '); - if (chunk == NULL) goto end_line_scan2; + if (chunk == NULL) goto end_line_scan; chunk++; } @@ -94,22 +122,59 @@ end_line_scan: ; char *chunk_end = strchr(chunk, ' '); if (chunk_end == NULL) continue; - size_t mount_len = chunk_end - chunk; - if ( - (mount_len == 4 && 0 == memcmp(chunk, "/dev", 4)) || - (mount_len >= 5 && 0 == memcmp(chunk, "/dev/", 5)) || - (mount_len == 5 && 0 == memcmp(chunk, "/proc", 5)) || - (mount_len == 4 && 0 == memcmp(chunk, "/sys", 4)) || - (mount_len >= 5 && 0 == memcmp(chunk, "/sys/", 5)) || - (mount_len == 4 && 0 == memcmp(chunk, "/tmp", 4)) - ) { - /* Copy line into new file verbatim */ - fwrite(line, line_len, 1, new_mountinfo_fp); + if (chunk_end - chunk == 5 && 0 == memcmp(chunk, "/data", 5)) { + /* Write line into new file keeping only "/" from root column */ + fwrite(line, chunk - line + 1, 1, new_mountinfo_fp); + fwrite(chunk_end, line_len - (chunk_end - line), 1, new_mountinfo_fp); + found_line = true; + break; } +end_line_scan: ; + } + + /* Once root was added, rescan and add other standard mounts */ + if (found_line) { + fseek(real_mountinfo_fp, 0, SEEK_SET); + while ((line_len = getline(&line, &line_buf_len, real_mountinfo_fp)) > 0) { + char *chunk = line; + /* Skip columns before 'root' */ + for (int i = 0; i < 4 && chunk - line < line_len; i++) { + chunk = strchr(chunk, ' '); + if (chunk == NULL) goto end_line_scan2; + chunk++; + } + + /* Match path */ + char *chunk_end = strchr(chunk, ' '); + if (chunk_end == NULL) continue; + + size_t mount_len = chunk_end - chunk; + if ( + (mount_len == 4 && 0 == memcmp(chunk, "/dev", 4)) || + (mount_len >= 5 && 0 == memcmp(chunk, "/dev/", 5)) || + (mount_len == 5 && 0 == memcmp(chunk, "/proc", 5)) || + (mount_len == 4 && 0 == memcmp(chunk, "/sys", 4)) || + (mount_len >= 5 && 0 == memcmp(chunk, "/sys/", 5)) || + (mount_len == 4 && 0 == memcmp(chunk, "/tmp", 4)) + ) { + /* Copy line into new file verbatim */ + fwrite(line, line_len, 1, new_mountinfo_fp); + } end_line_scan2: ; + } } + } else { + /* Non-Android case: copy real mountinfo verbatim. */ + while ((line_len = getline(&line, &line_buf_len, real_mountinfo_fp)) > 0) + fwrite(line, line_len, 1, new_mountinfo_fp); + found_line = true; } + /* Append synthesized entries for runtime bindings so + * helpers like bubblewrap find the mounts they think + * they just created. */ + append_runtime_binding_lines(target_tracee, new_mountinfo_fp); + free(line); fclose(new_mountinfo_fp); fclose(real_mountinfo_fp); diff --git a/src/path/canon.c b/src/path/canon.c index 271749bc..0fd58b0c 100644 --- a/src/path/canon.c +++ b/src/path/canon.c @@ -282,37 +282,59 @@ int canonicalize(Tracee *tracee, const char *user_path, bool deref_final, /* It's a link, so we have to dereference *and* * canonicalize to ensure we are not going outside the * new root. */ - comparison = compare_paths("/proc", guest_path); - switch (comparison) { - case PATHS_ARE_EQUAL: - case PATH1_IS_PREFIX: - /* Some links in "/proc" are generated - * dynamically by the kernel. PRoot has to - * emulate some of them. */ - status = readlink_proc(tracee, scratch_path, - guest_path, component, comparison); - switch (status) { - case CANONICALIZE: - /* The symlink is already dereferenced, - * now canonicalize it. */ - goto canon; - - case DONT_CANONICALIZE: - /* If and only very final, this symlink - * shouldn't be dereferenced nor canonicalized. */ - if (finality == FINAL_NORMAL) { - strcpy(guest_path, scratch_path); - return 0; + { + const char *proc_base = guest_path; + char alias_base[PATH_MAX]; + + comparison = compare_paths("/proc", guest_path); + + /* If guest_path is not under /proc directly, + * check whether it aliases /proc via a binding + * (e.g. /oldroot/proc when /oldroot is bound to + * /). Otherwise links like /oldroot/proc/self + * would be resolved by the real kernel readlink + * and return PRoot's own pid. */ + if (comparison != PATHS_ARE_EQUAL && comparison != PATH1_IS_PREFIX) { + strncpy(alias_base, guest_path, PATH_MAX - 1); + alias_base[PATH_MAX - 1] = '\0'; + (void) substitute_binding(tracee, GUEST, alias_base); + if (strcmp(alias_base, guest_path) != 0) { + comparison = compare_paths("/proc", alias_base); + proc_base = alias_base; + } + } + + switch (comparison) { + case PATHS_ARE_EQUAL: + case PATH1_IS_PREFIX: + /* Some links in "/proc" are generated + * dynamically by the kernel. PRoot has to + * emulate some of them. */ + status = readlink_proc(tracee, scratch_path, + proc_base, component, comparison); + switch (status) { + case CANONICALIZE: + /* The symlink is already dereferenced, + * now canonicalize it. */ + goto canon; + + case DONT_CANONICALIZE: + /* If and only very final, this symlink + * shouldn't be dereferenced nor canonicalized. */ + if (finality == FINAL_NORMAL) { + strcpy(guest_path, scratch_path); + return 0; + } + break; + + default: + if (status < 0) + return status; } - break; default: - if (status < 0) - return status; + break; } - - default: - break; } status = readlink(host_path, scratch_path, sizeof(scratch_path)); diff --git a/src/path/temp.c b/src/path/temp.c index 8d8aa638..dde2296d 100644 --- a/src/path/temp.c +++ b/src/path/temp.c @@ -106,11 +106,18 @@ static int clean_temp_cwd() || strcmp(entry->d_name, "..") == 0) continue; - status = chmod(entry->d_name, 0700); - if (status < 0) { - note(NULL, WARNING, SYSTEM, "cant chmod '%s'", entry->d_name); - nb_errors++; - continue; + /* Skip chmod on symlinks: chmod follows them and would + * report spurious errors when the target no longer + * exists (common with the /dev/{stdin,fd,...} symlinks + * bubblewrap leaves behind in emulated tmpfs dirs). + * We only need to unlink the symlink itself. */ + if (entry->d_type != DT_LNK) { + status = chmod(entry->d_name, 0700); + if (status < 0) { + note(NULL, WARNING, SYSTEM, "cant chmod '%s'", entry->d_name); + nb_errors++; + continue; + } } if (entry->d_type == DT_DIR) { diff --git a/src/syscall/enter.c b/src/syscall/enter.c index 50f1543b..91e1520e 100644 --- a/src/syscall/enter.c +++ b/src/syscall/enter.c @@ -25,10 +25,19 @@ #include /* struct sockaddr_un, */ #include /* SYS_*, */ #include /* AT_FDCWD, */ +#include /* close(2), */ #include /* PATH_MAX, */ #include /* strcpy */ +#include /* bool */ +#include /* uint32_t */ #include /* PR_SET_DUMPABLE */ +#include /* MS_BIND, MS_REMOUNT, ... */ +#include /* AF_NETLINK, AF_UNIX, SOCK_DGRAM, SOCK_CLOEXEC */ +#include /* CLONE_NEW*, */ #include /* TCSETS, TCSANOW */ +#include /* struct nlmsghdr, NLMSG_ERROR, struct nlmsgerr */ +#include /* SIOCGIFINDEX */ +#include /* struct ifreq, IFNAMSIZ */ #include "cli/note.h" #include "syscall/syscall.h" @@ -45,8 +54,22 @@ #include "tracee/abi.h" #include "path/path.h" #include "path/canon.h" +#include "path/binding.h" +#include "path/temp.h" #include "arch.h" +/* Older kernel headers may lack these. */ +#ifndef CLONE_NEWTIME +#define CLONE_NEWTIME 0x00000080 +#endif +#ifndef CLONE_NEWCGROUP +#define CLONE_NEWCGROUP 0x02000000 +#endif + +#define CLONE_NS_MASK (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | \ + CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET | \ + CLONE_NEWCGROUP | CLONE_NEWTIME) + /** * Translate @path and put the result in the @tracee's memory address * space pointed to by the @reg argument of the current syscall. See @@ -87,6 +110,443 @@ static int translate_sysarg(Tracee *tracee, Reg reg, Type type) return translate_path2(tracee, AT_FDCWD, old_path, reg, type); } +/** + * Canonicalize @user_path as a guest path, relative to the @tracee's + * cwd when @user_path is relative. Stores the result in @guest_path + * with any trailing "/" or "/." stripped, so it can be used as a + * binding key. Returns 0 on success, -errno otherwise. + */ +static int guest_canonicalize(Tracee *tracee, const char *user_path, + char guest_path[PATH_MAX]) +{ + int status; + + if (user_path[0] == '/') + strcpy(guest_path, "/"); + else { + status = getcwd2(tracee, guest_path); + if (status < 0) + return status; + } + + status = canonicalize(tracee, user_path, true, guest_path, 0); + if (status < 0) + return status; + + chop_finality(guest_path); + return 0; +} + +/** + * Emulate mount(@src_user, @target_user, @fstype, @flags) by adding a + * PRoot binding from a host directory to the canonicalized target. + * Bind mounts use the translated source; "proc"/"sysfs" use the + * matching host file-system; "tmpfs"/"devpts"/"devtmpfs" get a fresh + * empty directory. Any other case is silently ignored: the caller + * will still see the syscall succeed (we always void it). + */ +static void emulate_mount(Tracee *tracee, const char *src_user, + const char *target_user, const char *fstype, + unsigned long flags) +{ + char host_path[PATH_MAX]; + char guest_path[PATH_MAX]; + const char *tmpdir; + + if ((flags & MS_REMOUNT) != 0) + return; + + if ((flags & MS_BIND) != 0) { + if (translate_path(tracee, host_path, AT_FDCWD, src_user, true) < 0) + return; + } + else if (strcmp(fstype, "proc") == 0) + strcpy(host_path, "/proc"); + else if (strcmp(fstype, "sysfs") == 0) + strcpy(host_path, "/sys"); + else if (strcmp(fstype, "devtmpfs") == 0) + strcpy(host_path, "/dev"); + else if (strcmp(fstype, "devpts") == 0) + strcpy(host_path, "/dev/pts"); + else if (strcmp(fstype, "tmpfs") == 0) { + tmpdir = create_temp_directory(tracee->fs, "proot-tmpfs-"); + if (tmpdir == NULL) + return; + strncpy(host_path, tmpdir, PATH_MAX - 1); + host_path[PATH_MAX - 1] = '\0'; + } + else + return; + + chop_finality(host_path); + + if (guest_canonicalize(tracee, target_user, guest_path) < 0) + return; + + (void) insort_binding3(tracee, tracee->fs, host_path, guest_path); +} + +/** + * Emulate pivot_root(@new_root_user, @put_old_user) by changing the + * tracee's root binding to point at @new_root_user (translated to + * host) and re-exposing the previous root at @put_old_user, so that + * sandbox helpers like bubblewrap can keep accessing the prior + * file-system through the agreed "oldroot" path. + */ +static void emulate_pivot_root(Tracee *tracee, const char *new_root_user, + const char *put_old_user) +{ + char new_root_host[PATH_MAX]; + char new_root_guest[PATH_MAX]; + char put_old_guest[PATH_MAX]; + char old_root_host[PATH_MAX]; + Binding *root_binding; + size_t prefix_len; + const char *put_old_after; + + if (translate_path(tracee, new_root_host, AT_FDCWD, new_root_user, true) < 0) + return; + chop_finality(new_root_host); + + if (guest_canonicalize(tracee, new_root_user, new_root_guest) < 0) + return; + + /* put_old is relative to new_root, so resolve it against + * new_root_guest rather than the current cwd. */ + if (put_old_user[0] == '/') + strcpy(put_old_guest, "/"); + else + strcpy(put_old_guest, new_root_guest); + if (canonicalize(tracee, put_old_user, true, put_old_guest, 0) < 0) + return; + + root_binding = get_binding(tracee, GUEST, "/"); + if (root_binding == NULL) + return; + strncpy(old_root_host, root_binding->host.path, PATH_MAX - 1); + old_root_host[PATH_MAX - 1] = '\0'; + + remove_binding_from_all_lists(tracee, root_binding); + (void) insort_binding3(tracee, tracee->fs, new_root_host, "/"); + + /* If put_old is a path strictly under new_root, expose the + * previous root there. The pivot_root(".", ".") trick used to + * detach the old root leaves new_root and put_old equal; in + * that case there is nowhere to expose the old root. */ + prefix_len = strlen(new_root_guest); + if ( prefix_len > 0 + && strncmp(put_old_guest, new_root_guest, prefix_len) == 0 + && ( put_old_guest[prefix_len] == '/' + || (prefix_len == 1 && new_root_guest[0] == '/'))) { + put_old_after = put_old_guest + (prefix_len == 1 ? 0 : prefix_len); + if (put_old_after[0] == '/' && put_old_after[1] != '\0') { + Binding *iter; + Binding *next; + size_t put_old_len = strlen(put_old_after); + char aliased[PATH_MAX]; + + (void) insort_binding3(tracee, tracee->fs, + old_root_host, put_old_after); + + /* Snapshot existing non-root bindings and + * re-expose each one at put_old_after/, + * so sandbox helpers can still reach the host + * /proc, /dev, ... through the agreed + * "oldroot" prefix. Iterate carefully: we + * mutate the same list we walk. */ + for (iter = CIRCLEQ_FIRST(tracee->fs->bindings.guest); + iter != (void *) tracee->fs->bindings.guest; + iter = next) { + next = CIRCLEQ_NEXT(iter, link.guest); + + if (strcmp(iter->guest.path, "/") == 0) + continue; + /* Skip the binding we just added for + * put_old itself, and anything already + * sitting under put_old. */ + if (strncmp(iter->guest.path, put_old_after, put_old_len) == 0 + && (iter->guest.path[put_old_len] == '\0' + || iter->guest.path[put_old_len] == '/')) + continue; + + if ((size_t) snprintf(aliased, sizeof(aliased), "%s%s", + put_old_after, iter->guest.path) + >= sizeof(aliased)) + continue; + + (void) insort_binding3(tracee, tracee->fs, + iter->host.path, aliased); + } + } + } +} + +/** + * Emulate umount(@target_user) by removing the matching binding (if + * any) so that a subsequent access to @target_user no longer goes + * through the now-unmounted location. This is the inverse of + * emulate_mount(). Bindings put in place at PRoot startup + * (recommended -R bindings, the rootfs itself) are NOT removed: we + * only drop runtime bindings whose guest path exactly matches. + */ +static void emulate_umount(Tracee *tracee, const char *target_user) +{ + char guest_path[PATH_MAX]; + Binding *binding; + + if (guest_canonicalize(tracee, target_user, guest_path) < 0) + return; + + /* Never drop the root binding. */ + if (strcmp(guest_path, "/") == 0) + return; + + binding = get_binding(tracee, GUEST, guest_path); + if (binding == NULL) + return; + + /* Only drop the binding if its guest path is exactly the + * unmount target; otherwise we'd unbind something the tracee + * didn't ask to unmount (e.g. its containing rootfs). */ + if (strcmp(binding->guest.path, guest_path) != 0) + return; + + remove_binding_from_all_lists(tracee, binding); +} + +/** + * Read umount(2)/umount2(2) arguments from the @tracee's registers + * and apply emulate_umount(). + */ +void apply_emulated_umount(Tracee *tracee) +{ + char target_user[PATH_MAX]; + + if (get_sysarg_path(tracee, target_user, SYSARG_1) < 0) + return; + + emulate_umount(tracee, target_user); +} + +/** + * Read mount(2) arguments from the @tracee's registers and apply + * emulate_mount(). Safe to call from both the normal sysenter path + * and the SIGSYS handler (Android's parent seccomp filter traps + * mount, so the syscall never reaches our regular case). + */ +void apply_emulated_mount(Tracee *tracee) +{ + char src_user[PATH_MAX]; + char target_user[PATH_MAX]; + char fstype[256]; + word_t fstype_addr; + unsigned long flags; + + fstype[0] = '\0'; + + if (get_sysarg_path(tracee, src_user, SYSARG_1) < 0) + return; + if (get_sysarg_path(tracee, target_user, SYSARG_2) < 0) + return; + + fstype_addr = peek_reg(tracee, CURRENT, SYSARG_3); + if (fstype_addr != 0) + (void) read_string(tracee, fstype, fstype_addr, sizeof(fstype) - 1); + flags = peek_reg(tracee, CURRENT, SYSARG_4); + + emulate_mount(tracee, src_user, target_user, fstype, flags); +} + +/** + * Read pivot_root(2) arguments from the @tracee's registers and apply + * emulate_pivot_root(). See apply_emulated_mount() for context. + */ +void apply_emulated_pivot_root(Tracee *tracee) +{ + char new_root_user[PATH_MAX]; + char put_old_user[PATH_MAX]; + + if (get_sysarg_path(tracee, new_root_user, SYSARG_1) < 0) + return; + if (get_sysarg_path(tracee, put_old_user, SYSARG_2) < 0) + return; + + emulate_pivot_root(tracee, new_root_user, put_old_user); +} + +/** + * Helpers for emulating AF_NETLINK / NETLINK_ROUTE traffic. Some + * environments deny the tracee a real netlink socket (Android's + * SELinux policy on untrusted_app domains, seccomp filters inherited + * from a Termux-like launcher, hardened containers, ...); in that + * case we silently substitute an AF_UNIX/SOCK_DGRAM socket and + * intercept the few netlink-shaped syscalls bubblewrap's + * loopback_setup() actually makes (bind/sendto/recvfrom), + * synthesising an NLMSG_ERROR success reply. + * + * The substitution only happens when the host kernel actually + * refuses AF_NETLINK; otherwise the tracee gets a real netlink + * socket and ordinary users like c-ares (dnf, getaddrinfo, ...) + * keep working. + */ + +static bool host_blocks_af_netlink(const Tracee *tracee) +{ + enum { PROBE_UNKNOWN, PROBE_ALLOWED, PROBE_BLOCKED }; + static int cached = PROBE_UNKNOWN; + int fd; + int saved_errno; + + if (cached != PROBE_UNKNOWN) + return cached == PROBE_BLOCKED; + + fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + if (fd >= 0) { + close(fd); + cached = PROBE_ALLOWED; + return false; + } + + saved_errno = errno; + cached = PROBE_BLOCKED; + VERBOSE(tracee, 1, "AF_NETLINK denied by host (%s); enabling " + "AF_UNIX fallback for sandbox helpers", + strerror(saved_errno)); + return true; +} + +static bool is_fake_netlink_fd(const Tracee *tracee, int fd) +{ + int i; + if (fd < 0) + return false; + for (i = 0; i < tracee->fake_netlink_fds_count; i++) + if (tracee->fake_netlink_fds[i] == fd) + return true; + return false; +} + +static void unmark_fake_netlink_fd(Tracee *tracee, int fd) +{ + int i; + for (i = 0; i < tracee->fake_netlink_fds_count; i++) { + if (tracee->fake_netlink_fds[i] == fd) { + tracee->fake_netlink_fds[i] = + tracee->fake_netlink_fds[--tracee->fake_netlink_fds_count]; + return; + } + } +} + +/** + * Write a synthetic NLMSG_ERROR reply (with error=0) into the + * tracee's buffer, mirroring the @seq the caller used in its request. + * Returns the number of bytes written, or 0 on failure. + * + * bubblewrap's rtnl_read_reply checks both the sequence number AND + * that nlmsg_pid matches the tracee's own pid, so set them both. + */ +static size_t write_fake_netlink_ack(Tracee *tracee, word_t buf_addr, + word_t buf_len, uint32_t seq) +{ + struct { + struct nlmsghdr hdr; + struct nlmsgerr err; + } reply; + size_t reply_len = sizeof(reply); + + if (buf_len < reply_len) + return 0; + + memset(&reply, 0, sizeof(reply)); + reply.hdr.nlmsg_len = reply_len; + reply.hdr.nlmsg_type = NLMSG_ERROR; + reply.hdr.nlmsg_flags = 0; + reply.hdr.nlmsg_seq = seq; + reply.hdr.nlmsg_pid = (uint32_t) tracee->pid; + reply.err.error = 0; + /* reply.err.msg is the (zeroed) header of the original request; + * loopback_setup() only checks the error field. */ + + if (write_data(tracee, buf_addr, &reply, reply_len) < 0) + return 0; + return reply_len; +} + +/** + * If @cmd is SIOCGIFINDEX for "lo", fake an answer of 1 in the + * tracee's ifreq buffer. Android often denies this ioctl when the + * caller lacks CAP_NET_ADMIN; bubblewrap's loopback_setup() calls + * if_nametoindex("lo") which goes through this ioctl and bails out + * with "Permission denied" on failure. + */ +static bool maybe_fake_siocgifindex(Tracee *tracee, word_t cmd, word_t arg) +{ + struct ifreq ifr; + + if (cmd != SIOCGIFINDEX) + return false; + if (arg == 0) + return false; + if (read_data(tracee, &ifr, arg, sizeof(ifr)) < 0) + return false; + if (strncmp(ifr.ifr_name, "lo", IFNAMSIZ) != 0) + return false; + + ifr.ifr_ifindex = 1; + if (write_data(tracee, arg, &ifr, sizeof(ifr)) < 0) + return false; + return true; +} + +/** + * Detect /proc//{uid_map,gid_map,setgroups}, which sandbox + * helpers like bubblewrap write to during user-namespace setup. The + * tracee cannot really create namespaces under PRoot, so silently + * redirect those writes to /dev/null. + */ +static bool is_proc_userns_file(const char *path) +{ + const char *p; + const char *suffix; + + if (strncmp(path, "/proc/", 6) != 0) + return false; + p = path + 6; + + if (strncmp(p, "self/", 5) == 0) + p += 5; + else { + const char *digits = p; + while (*p >= '0' && *p <= '9') + p++; + if (p == digits || *p != '/') + return false; + p++; + } + + suffix = p; + return strcmp(suffix, "uid_map") == 0 + || strcmp(suffix, "gid_map") == 0 + || strcmp(suffix, "setgroups") == 0; +} + +/** + * Redirect openat()/open() of /proc/.../uid_map etc. to /dev/null so + * that writes appear to succeed. @reg holds the path argument; the + * path has already been translated to host form. + */ +static void maybe_redirect_userns_file(Tracee *tracee, Reg reg) +{ + char host_path[PATH_MAX]; + + if (get_sysarg_path(tracee, host_path, reg) < 0) + return; + if (!is_proc_userns_file(host_path)) + return; + (void) set_sysarg_path(tracee, "/dev/null", reg); +} + /** * Translate the input arguments of the current @tracee's syscall in the * @tracee->pid process area. This function sets @tracee->status to @@ -157,6 +617,7 @@ int translate_syscall_enter(Tracee *tracee) break; case PR_getcwd: + poke_reg(tracee, SYSARG_RESULT, 0); set_sysnum(tracee, PR_void); status = 0; break; @@ -227,6 +688,7 @@ int translate_syscall_enter(Tracee *tracee) tracee->fs->cwd = tmp; talloc_set_name_const(tracee->fs->cwd, "$cwd"); + poke_reg(tracee, SYSARG_RESULT, 0); set_sysnum(tracee, PR_void); status = 0; break; @@ -237,6 +699,17 @@ int translate_syscall_enter(Tracee *tracee) word_t address; word_t size; + /* If we already redirected this fd to AF_UNIX as part + * of the AF_NETLINK emulation, fail the bind silently + * (the kernel would otherwise refuse our sockaddr_nl). */ + if (syscall_number == PR_bind + && is_fake_netlink_fd(tracee, peek_reg(tracee, CURRENT, SYSARG_1))) { + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + address = peek_reg(tracee, CURRENT, SYSARG_2); size = peek_reg(tracee, CURRENT, SYSARG_3); @@ -295,6 +768,89 @@ int translate_syscall_enter(Tracee *tracee) break; } + /* Substitute an AF_UNIX/SOCK_DGRAM socket for AF_NETLINK + * requests so the kernel doesn't reject them with EACCES on + * Android, then track the resulting fd so bind/sendto/recvfrom + * on it can be faked too. */ + case PR_socket: { + word_t domain = peek_reg(tracee, CURRENT, SYSARG_1); + if (domain == AF_NETLINK && host_blocks_af_netlink(tracee)) { + word_t type = peek_reg(tracee, CURRENT, SYSARG_2); + poke_reg(tracee, SYSARG_1, AF_UNIX); + poke_reg(tracee, SYSARG_2, SOCK_DGRAM | (type & SOCK_CLOEXEC)); + poke_reg(tracee, SYSARG_3, 0); + tracee->pending_fake_netlink_socket = true; + tracee->sysexit_pending = true; + tracee->restart_how = PTRACE_SYSCALL; + } + status = 0; + break; + } + + case PR_sendto: { + int fd = peek_reg(tracee, CURRENT, SYSARG_1); + if (is_fake_netlink_fd(tracee, fd)) { + word_t buf = peek_reg(tracee, CURRENT, SYSARG_2); + word_t len = peek_reg(tracee, CURRENT, SYSARG_3); + struct nlmsghdr hdr; + + if (buf != 0 && len >= sizeof(hdr) + && read_data(tracee, &hdr, buf, sizeof(hdr)) == 0) + tracee->fake_netlink_pending_seq = hdr.nlmsg_seq; + + poke_reg(tracee, SYSARG_RESULT, len); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + status = 0; + break; + } + + case PR_sendmsg: { + int fd = peek_reg(tracee, CURRENT, SYSARG_1); + if (is_fake_netlink_fd(tracee, fd)) { + /* Pretend we sent everything. bubblewrap only + * uses sendto(); we accept sendmsg too just in + * case. */ + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + status = 0; + break; + } + + case PR_recvfrom: { + int fd = peek_reg(tracee, CURRENT, SYSARG_1); + if (is_fake_netlink_fd(tracee, fd)) { + word_t buf = peek_reg(tracee, CURRENT, SYSARG_2); + word_t len = peek_reg(tracee, CURRENT, SYSARG_3); + size_t n = write_fake_netlink_ack(tracee, buf, len, + tracee->fake_netlink_pending_seq); + poke_reg(tracee, SYSARG_RESULT, (word_t) n); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + status = 0; + break; + } + + case PR_recvmsg: { + int fd = peek_reg(tracee, CURRENT, SYSARG_1); + if (is_fake_netlink_fd(tracee, fd)) { + /* Same fallback as sendmsg: return EOF. */ + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + } + status = 0; + break; + } + case PR_socketcall: { word_t args_addr; word_t sock_addr_saved; @@ -390,14 +946,84 @@ int translate_syscall_enter(Tracee *tracee) case PR_swapon: case PR_truncate: case PR_truncate64: - case PR_umount: - case PR_umount2: case PR_uselib: case PR_utime: case PR_utimes: status = translate_sysarg(tracee, SYSARG_1, REGULAR); break; + /* Pretend namespace syscalls succeed without doing anything; + * PRoot can't really create namespaces, and sandbox helpers + * like bubblewrap only check the return value. */ + case PR_unshare: + case PR_setns: + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + + case PR_umount: + case PR_umount2: + apply_emulated_umount(tracee); + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + + /* Strip CLONE_NEW* flags from clone(2)/clone3(2) so the + * syscall doesn't fail with EPERM on kernels that disallow + * unprivileged namespace creation (typical on Android). The + * fork/thread itself still proceeds normally and PRoot keeps + * tracking the child through PTRACE_EVENT_CLONE. When the + * caller asked for CLONE_NEWNS, remember it on the tracee so + * the new child gets isolated bindings (otherwise emulated + * mount(2) calls in the child would leak into the parent). */ + case PR_clone: { + word_t flags = peek_reg(tracee, CURRENT, SYSARG_1); + if ((flags & CLONE_NS_MASK) != 0) { + if ((flags & CLONE_NEWNS) != 0) + tracee->clone_stripped_newns = true; + poke_reg(tracee, SYSARG_1, flags & ~(word_t) CLONE_NS_MASK); + } + status = 0; + break; + } + + case PR_clone3: { + word_t args_addr = peek_reg(tracee, CURRENT, SYSARG_1); + word_t flags; + + if (args_addr != 0) { + errno = 0; + flags = peek_word(tracee, args_addr); + if (errno == 0 && (flags & CLONE_NS_MASK) != 0) { + if ((flags & CLONE_NEWNS) != 0) + tracee->clone_stripped_newns = true; + poke_word(tracee, args_addr, + flags & ~(word_t) CLONE_NS_MASK); + } + } + status = 0; + break; + } + + /* mount(2) and pivot_root(2) are emulated by translating them + * into PRoot bindings (see emulate_mount/emulate_pivot_root) + * so the resulting paths actually become accessible. */ + case PR_mount: + apply_emulated_mount(tracee); + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + + case PR_pivot_root: + apply_emulated_pivot_root(tracee); + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + status = 0; + break; + case PR_open: flags = peek_reg(tracee, CURRENT, SYSARG_2); @@ -413,6 +1039,8 @@ int translate_syscall_enter(Tracee *tracee) status = translate_sysarg(tracee, SYSARG_1, SYMLINK); else status = translate_sysarg(tracee, SYSARG_1, REGULAR); + if (status >= 0) + maybe_redirect_userns_file(tracee, SYSARG_1); break; case PR_fchownat: @@ -476,14 +1104,6 @@ int translate_syscall_enter(Tracee *tracee) status = translate_sysarg(tracee, SYSARG_1, SYMLINK); break; - case PR_pivot_root: - status = translate_sysarg(tracee, SYSARG_1, REGULAR); - if (status < 0) - break; - - status = translate_sysarg(tracee, SYSARG_2, REGULAR); - break; - case PR_linkat: olddirfd = peek_reg(tracee, CURRENT, SYSARG_1); newdirfd = peek_reg(tracee, CURRENT, SYSARG_3); @@ -507,21 +1127,6 @@ int translate_syscall_enter(Tracee *tracee) status = translate_path2(tracee, newdirfd, newpath, SYSARG_4, SYMLINK); break; - case PR_mount: - status = get_sysarg_path(tracee, path, SYSARG_1); - if (status < 0) - break; - - /* The following check covers only 90% of the cases. */ - if (path[0] == '/' || path[0] == '.') { - status = translate_path2(tracee, AT_FDCWD, path, SYSARG_1, REGULAR); - if (status < 0) - break; - } - - status = translate_sysarg(tracee, SYSARG_2, REGULAR); - break; - case PR_openat: dirfd = peek_reg(tracee, CURRENT, SYSARG_1); flags = peek_reg(tracee, CURRENT, SYSARG_3); @@ -540,6 +1145,8 @@ int translate_syscall_enter(Tracee *tracee) status = translate_path2(tracee, dirfd, path, SYSARG_2, SYMLINK); else status = translate_path2(tracee, dirfd, path, SYSARG_2, REGULAR); + if (status >= 0) + maybe_redirect_userns_file(tracee, SYSARG_2); break; case PR_readlinkat: @@ -617,6 +1224,7 @@ int translate_syscall_enter(Tracee *tracee) /* Prevent tracees from setting dumpable flag. * (Otherwise it could break tracee memory access) */ if (peek_reg(tracee, CURRENT, SYSARG_1) == PR_SET_DUMPABLE) { + poke_reg(tracee, SYSARG_RESULT, 0); set_sysnum(tracee, PR_void); status = 0; } @@ -630,31 +1238,39 @@ int translate_syscall_enter(Tracee *tracee) } break; + case PR_ioctl: { + word_t cmd = peek_reg(tracee, CURRENT, SYSARG_2); + word_t arg = peek_reg(tracee, CURRENT, SYSARG_3); + + /* SIOCGIFINDEX for "lo": Android often denies this with + * EACCES; fake an answer of 1 so bubblewrap's + * loopback_setup() can proceed. */ + if (cmd == SIOCGIFINDEX && maybe_fake_siocgifindex(tracee, cmd, arg)) { + poke_reg(tracee, SYSARG_RESULT, 0); + set_sysnum(tracee, PR_void); + break; + } + #ifdef __ANDROID__ - case PR_ioctl: /* Using literal value because Termux build system patches TCSAFLUSH */ - if (peek_reg(tracee, CURRENT, SYSARG_2) == TCSETS + 2 /* + TCSAFLUSH */) { + if (cmd == TCSETS + 2 /* + TCSAFLUSH */) poke_reg(tracee, SYSARG_2, TCSETS + TCSANOW); - } - if (peek_reg(tracee, CURRENT, SYSARG_2) == TCGETS2) { + if (cmd == TCGETS2) poke_reg(tracee, SYSARG_2, TCGETS); - } - if (peek_reg(tracee, CURRENT, SYSARG_2) == TCSETS2) { + if (cmd == TCSETS2) poke_reg(tracee, SYSARG_2, TCSETS); - } - if (peek_reg(tracee, CURRENT, SYSARG_2) == TCSETSW2) { + if (cmd == TCSETSW2) poke_reg(tracee, SYSARG_2, TCSETSW); - } - if (peek_reg(tracee, CURRENT, SYSARG_2) == TCSETSF2) { + if (cmd == TCSETSF2) poke_reg(tracee, SYSARG_2, TCSETSF); - } +#endif break; -#endif + } case PR_memfd_create: { @@ -685,12 +1301,20 @@ int translate_syscall_enter(Tracee *tracee) } break; } - case PR_close: + case PR_close: { + int closed_fd = (int) peek_reg(tracee, CURRENT, SYSARG_1); + /* Stop tracking auxv_fd once the tracee closes it. */ - if (tracee->auxv_fd >= 0 - && (int) peek_reg(tracee, CURRENT, SYSARG_1) == tracee->auxv_fd) + if (tracee->auxv_fd >= 0 && closed_fd == tracee->auxv_fd) tracee->auxv_fd = -1; + + /* Drop the fd from the fake-AF_NETLINK tracking set, + * otherwise its number could be reused for an unrelated + * file and we'd keep intercepting sendto/recvfrom on + * it. */ + unmark_fake_netlink_fd(tracee, closed_fd); break; + } } diff --git a/src/syscall/exit.c b/src/syscall/exit.c index 59e05046..c646543b 100644 --- a/src/syscall/exit.c +++ b/src/syscall/exit.c @@ -244,6 +244,15 @@ void translate_syscall_exit(Tracee *tracee) case PR_fchdir: case PR_chdir: + /* These syscalls are voided in enter.c; make sure the + * tracee always sees a 0 return value even on kernels where + * the SYSCALL_AVOIDER trick leaks -ENOSYS through. */ + case PR_unshare: + case PR_setns: + case PR_mount: + case PR_umount: + case PR_umount2: + case PR_pivot_root: /* These syscalls are fully emulated, see enter.c for details * (like errors). */ status = 0; @@ -664,6 +673,29 @@ void translate_syscall_exit(Tracee *tracee) } goto end; + case PR_socket: + /* Record the fd we substituted for an AF_NETLINK request. */ + if (tracee->pending_fake_netlink_socket) { + int fd = (int) peek_reg(tracee, CURRENT, SYSARG_RESULT); + if (fd >= 0) { + int i; + if (tracee->fake_netlink_fds_count < MAX_FAKE_NETLINK_FDS) { + /* Avoid duplicates. */ + bool present = false; + for (i = 0; i < tracee->fake_netlink_fds_count; i++) { + if (tracee->fake_netlink_fds[i] == fd) { + present = true; + break; + } + } + if (!present) + tracee->fake_netlink_fds[tracee->fake_netlink_fds_count++] = fd; + } + } + tracee->pending_fake_netlink_socket = false; + } + goto end; + default: goto end; } diff --git a/src/syscall/seccomp.c b/src/syscall/seccomp.c index ee961437..af38038a 100644 --- a/src/syscall/seccomp.c +++ b/src/syscall/seccomp.c @@ -340,9 +340,16 @@ static FilteredSysnum proot_sysnums[] = { { PR_chown, 0 }, { PR_chown32, 0 }, { PR_chroot, 0 }, + { PR_clone, 0 }, + { PR_clone3, 0 }, { PR_close, 0 }, { PR_connect, 0 }, { PR_creat, 0 }, + { PR_recvfrom, 0 }, + { PR_recvmsg, 0 }, + { PR_sendmsg, 0 }, + { PR_sendto, 0 }, + { PR_socket, FILTER_SYSEXIT }, { PR_execve, FILTER_SYSEXIT }, { PR_execveat, FILTER_SYSEXIT }, { PR_faccessat, 0 }, @@ -378,14 +385,14 @@ static FilteredSysnum proot_sysnums[] = { { PR_mkdirat, 0 }, { PR_mknod, 0 }, { PR_mknodat, 0 }, - { PR_mount, 0 }, + { PR_mount, FILTER_SYSEXIT }, { PR_name_to_handle_at, 0 }, { PR_newfstatat, 0 }, { PR_oldlstat, 0 }, { PR_oldstat, 0 }, { PR_open, 0 }, { PR_openat, 0 }, - { PR_pivot_root, 0 }, + { PR_pivot_root, FILTER_SYSEXIT }, { PR_prctl, 0 }, { PR_prlimit64, FILTER_SYSEXIT }, { PR_ptrace, FILTER_SYSEXIT }, @@ -410,9 +417,11 @@ static FilteredSysnum proot_sysnums[] = { { PR_symlinkat, 0 }, { PR_truncate, 0 }, { PR_truncate64, 0 }, - { PR_umount, 0 }, - { PR_umount2, 0 }, + { PR_umount, FILTER_SYSEXIT }, + { PR_umount2, FILTER_SYSEXIT }, { PR_uname, FILTER_SYSEXIT }, + { PR_unshare, FILTER_SYSEXIT }, + { PR_setns, FILTER_SYSEXIT }, { PR_unlink, 0 }, { PR_unlinkat, 0 }, { PR_uselib, 0 }, diff --git a/src/syscall/syscall.h b/src/syscall/syscall.h index e255cc88..e0e263c9 100644 --- a/src/syscall/syscall.h +++ b/src/syscall/syscall.h @@ -36,4 +36,8 @@ extern void translate_syscall(Tracee *tracee); extern int translate_syscall_enter(Tracee *tracee); extern void translate_syscall_exit(Tracee *tracee); +extern void apply_emulated_mount(Tracee *tracee); +extern void apply_emulated_pivot_root(Tracee *tracee); +extern void apply_emulated_umount(Tracee *tracee); + #endif /* SYSCALL_H */ diff --git a/src/tracee/seccomp.c b/src/tracee/seccomp.c index 77d5b273..cd664ddc 100644 --- a/src/tracee/seccomp.c +++ b/src/tracee/seccomp.c @@ -160,6 +160,32 @@ static int handle_seccomp_event_common(Tracee *tracee) set_result_after_seccomp(tracee, 0); break; + /* The Android parent process commonly installs a seccomp + * filter that traps mount/umount/pivot_root/unshare/setns + * with SIGSYS. Mirror what enter.c does for these: pretend + * they succeeded and apply the mount/pivot_root binding + * emulation so sandbox helpers like bubblewrap can proceed. */ + case PR_mount: + apply_emulated_mount(tracee); + set_result_after_seccomp(tracee, 0); + break; + + case PR_pivot_root: + apply_emulated_pivot_root(tracee); + set_result_after_seccomp(tracee, 0); + break; + + case PR_umount: + case PR_umount2: + apply_emulated_umount(tracee); + set_result_after_seccomp(tracee, 0); + break; + + case PR_unshare: + case PR_setns: + set_result_after_seccomp(tracee, 0); + break; + case PR_getpgrp: /* Query value with getpgid and set it as result. */ set_result_after_seccomp(tracee, getpgid(tracee->pid)); diff --git a/src/tracee/tracee.c b/src/tracee/tracee.c index 678d1639..68bd10d7 100644 --- a/src/tracee/tracee.c +++ b/src/tracee/tracee.c @@ -540,12 +540,38 @@ int new_child(Tracee *parent, word_t clone_flags) return -ENOMEM; talloc_set_name_const(child->fs->cwd, "$cwd"); - /* Bindings are shared across file-system name-spaces since a - * "mount --bind" made by a process affects all other processes - * under Linux. Actually they are copied when a sub - * reconfiguration occured (nested proot or chroot(2)). */ - child->fs->bindings.guest = talloc_reference(child->fs, parent->fs->bindings.guest); - child->fs->bindings.host = talloc_reference(child->fs, parent->fs->bindings.host); + if (parent->clone_stripped_newns + && parent->fs->bindings.guest != NULL) { + /* Caller asked for CLONE_NEWNS (which we + * silently stripped). Give the child its own + * copy of the binding tree so emulated mount(2) + * calls don't propagate back to the parent. */ + Binding *iter; + + child->fs->bindings.guest = talloc_zero(child->fs, Bindings); + child->fs->bindings.host = talloc_zero(child->fs, Bindings); + if ( child->fs->bindings.guest == NULL + || child->fs->bindings.host == NULL) + return -ENOMEM; + CIRCLEQ_INIT(child->fs->bindings.guest); + CIRCLEQ_INIT(child->fs->bindings.host); + + for (iter = CIRCLEQ_FIRST(parent->fs->bindings.guest); + iter != (void *) parent->fs->bindings.guest; + iter = CIRCLEQ_NEXT(iter, link.guest)) + (void) insort_binding3(child, child->fs, + iter->host.path, + iter->guest.path); + parent->clone_stripped_newns = false; + } + else { + /* Bindings are shared across file-system name-spaces since a + * "mount --bind" made by a process affects all other processes + * under Linux. Actually they are copied when a sub + * reconfiguration occured (nested proot or chroot(2)). */ + child->fs->bindings.guest = talloc_reference(child->fs, parent->fs->bindings.guest); + child->fs->bindings.host = talloc_reference(child->fs, parent->fs->bindings.host); + } } /* The path to the executable is unshared only once the child diff --git a/src/tracee/tracee.h b/src/tracee/tracee.h index 60c55175..05702c38 100644 --- a/src/tracee/tracee.h +++ b/src/tracee/tracee.h @@ -103,6 +103,24 @@ typedef struct tracee { /* Is it a "clone", i.e has the same parent as its creator. */ bool clone; + /* Set when the current clone(2)/clone3(2) had CLONE_NEW* flags + * stripped (see translate_syscall_enter); the new child should + * get its own copy of the bindings so emulated mount(2) calls + * stay scoped to the would-be namespace. Reset once consumed. */ + bool clone_stripped_newns; + + /* Emulation of AF_NETLINK / NETLINK_ROUTE sockets for + * sandbox helpers like bubblewrap that try to bring up the + * loopback interface inside their would-be net namespace. + * fake_netlink_fds holds the fds of sockets we silently + * redirected from AF_NETLINK to AF_UNIX/SOCK_DGRAM; see + * enter.c / exit.c for the intercepts. */ +#define MAX_FAKE_NETLINK_FDS 8 + int fake_netlink_fds[MAX_FAKE_NETLINK_FDS]; + int fake_netlink_fds_count; + bool pending_fake_netlink_socket; + uint32_t fake_netlink_pending_seq; + /* Support for ptrace emulation (tracer side). */ struct { size_t nb_ptracees;