diff --git a/src/libcrun/cgroup-utils.c b/src/libcrun/cgroup-utils.c index b894e24ae1..87780f4d18 100644 --- a/src/libcrun/cgroup-utils.c +++ b/src/libcrun/cgroup-utils.c @@ -210,35 +210,13 @@ libcrun_get_cgroup_mode (libcrun_error_t *err) return cgroup_mode; } -int -libcrun_get_cgroup_process (pid_t pid, char **path, bool absolute, libcrun_error_t *err) +static int +get_cgroup_process_from_content (char *content, int cgroup_mode, char **path, bool absolute, libcrun_error_t *err) { - cleanup_free char *content = NULL; - char proc_cgroup_file[64]; char *cg_path = NULL; - size_t content_size; char *controller; char *saveptr; - int cgroup_mode; bool has_data; - int ret; - - cgroup_mode = libcrun_get_cgroup_mode (err); - if (UNLIKELY (cgroup_mode < 0)) - return cgroup_mode; - - if (pid == 0) - strcpy (proc_cgroup_file, PROC_SELF_CGROUP); - else - { - int len = snprintf (proc_cgroup_file, sizeof (proc_cgroup_file), "/proc/%d/cgroup", pid); - if (UNLIKELY (len >= (int) sizeof (proc_cgroup_file))) - return crun_make_error (err, 0, "internal error: static buffer too small"); - } - - ret = read_all_file (proc_cgroup_file, &content, &content_size, err); - if (UNLIKELY (ret < 0)) - return ret; for (has_data = read_proc_cgroup (content, &saveptr, NULL, &controller, &cg_path); has_data; @@ -266,6 +244,54 @@ libcrun_get_cgroup_process (pid_t pid, char **path, bool absolute, libcrun_error return 0; } +int +libcrun_get_cgroup_process_at (int dirfd, char **path, bool absolute, libcrun_error_t *err) +{ + cleanup_free char *content = NULL; + size_t content_size; + int cgroup_mode; + int ret; + + cgroup_mode = libcrun_get_cgroup_mode (err); + if (UNLIKELY (cgroup_mode < 0)) + return cgroup_mode; + + ret = read_all_file_at (dirfd, SELF_CGROUP, &content, &content_size, err); + if (UNLIKELY (ret < 0)) + return ret; + + return get_cgroup_process_from_content (content, cgroup_mode, path, absolute, err); +} + +int +libcrun_get_cgroup_process (pid_t pid, char **path, bool absolute, libcrun_error_t *err) +{ + cleanup_free char *content = NULL; + char proc_cgroup_file[64]; + size_t content_size; + int cgroup_mode; + int ret; + + cgroup_mode = libcrun_get_cgroup_mode (err); + if (UNLIKELY (cgroup_mode < 0)) + return cgroup_mode; + + if (pid == 0) + strcpy (proc_cgroup_file, PROC_SELF_CGROUP); + else + { + int len = snprintf (proc_cgroup_file, sizeof (proc_cgroup_file), "/proc/%d/cgroup", pid); + if (UNLIKELY (len >= (int) sizeof (proc_cgroup_file))) + return crun_make_error (err, 0, "internal error: static buffer too small"); + } + + ret = read_all_file (proc_cgroup_file, &content, &content_size, err); + if (UNLIKELY (ret < 0)) + return ret; + + return get_cgroup_process_from_content (content, cgroup_mode, path, absolute, err); +} + static int read_pids_cgroup (int dfd, bool recurse, pid_t **pids, size_t *n_pids, size_t *allocated, libcrun_error_t *err) { diff --git a/src/libcrun/cgroup-utils.h b/src/libcrun/cgroup-utils.h index 1cec12472d..3ec24d295c 100644 --- a/src/libcrun/cgroup-utils.h +++ b/src/libcrun/cgroup-utils.h @@ -28,6 +28,8 @@ int libcrun_cgroups_create_symlinks (int dirfd, libcrun_error_t *err); int libcrun_get_cgroup_process (pid_t pid, char **path, bool absolute, libcrun_error_t *err); +int libcrun_get_cgroup_process_at (int dirfd, char **path, bool absolute, libcrun_error_t *err); + int libcrun_get_cgroup_mode (libcrun_error_t *err); int libcrun_get_cgroup_dirfd (struct libcrun_cgroup_status *status, const char *sub_cgroup, libcrun_error_t *err); diff --git a/src/libcrun/cgroup.h b/src/libcrun/cgroup.h index 74aa6ca0bb..0e1acfb3ee 100644 --- a/src/libcrun/cgroup.h +++ b/src/libcrun/cgroup.h @@ -26,8 +26,12 @@ # define CGROUP_ROOT "/sys/fs/cgroup" #endif +#ifndef SELF_CGROUP +# define SELF_CGROUP "self/cgroup" +#endif + #ifndef PROC_SELF_CGROUP -# define PROC_SELF_CGROUP "/proc/self/cgroup" +# define PROC_SELF_CGROUP "/proc/" SELF_CGROUP #endif enum diff --git a/src/libcrun/container.c b/src/libcrun/container.c index 5fd045f46f..9c1f9f96ae 100644 --- a/src/libcrun/container.c +++ b/src/libcrun/container.c @@ -1339,11 +1339,6 @@ container_init_setup (void *args, pid_t own_pid, char *notify_socket, if (UNLIKELY (ret < 0)) return ret; - /* sync 2 and 3 are sent as part of libcrun_set_mounts. */ - ret = libcrun_set_mounts (entrypoint_args, container, rootfs, send_sync_cb, &sync_socket, err); - if (UNLIKELY (ret < 0)) - return ret; - if (def->hooks && def->hooks->create_container_len) { libcrun_error_t tmp_err = NULL; @@ -1357,6 +1352,15 @@ container_init_setup (void *args, pid_t own_pid, char *notify_socket, return ret; } + ret = libcrun_do_pivot_root (container, entrypoint_args->context->no_pivot, &rootfs, err); + if (UNLIKELY (ret < 0)) + return ret; + + /* sync 2 and 3 are sent as part of libcrun_set_mounts. */ + ret = libcrun_set_mounts (entrypoint_args, container, rootfs, send_sync_cb, &sync_socket, err); + if (UNLIKELY (ret < 0)) + return ret; + ret = libcrun_finalize_mounts (entrypoint_args, container, rootfs, err); if (UNLIKELY (ret < 0)) return ret; @@ -1376,13 +1380,6 @@ container_init_setup (void *args, pid_t own_pid, char *notify_socket, if (UNLIKELY (ret < 0)) crun_error_write_warning_and_release (entrypoint_args->context->output_handler_arg, &err); - if (rootfs) - { - ret = libcrun_do_pivot_root (container, entrypoint_args->context->no_pivot, rootfs, err); - if (UNLIKELY (ret < 0)) - return ret; - } - ret = libcrun_reopen_dev_null (err); if (UNLIKELY (ret < 0)) return ret; diff --git a/src/libcrun/linux.c b/src/libcrun/linux.c index 24569dec8e..bda82847ba 100644 --- a/src/libcrun/linux.c +++ b/src/libcrun/linux.c @@ -88,6 +88,10 @@ # define OPEN_TREE_CLOEXEC O_CLOEXEC #endif +#ifndef OPEN_TREE_NAMESPACE +# define OPEN_TREE_NAMESPACE 2 +#endif + #ifndef MOVE_MOUNT_F_EMPTY_PATH # define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 #endif @@ -137,13 +141,20 @@ struct private_data_s unsigned long rootfs_propagation; bool deny_setgroups; + int procfd; + const char *rootfs; int rootfsfd; int notify_socket_tree_fd; + int old_root_fd; struct libcrun_fd_map *mount_fds; struct libcrun_fd_map *dev_fds; + struct libcrun_fd_map *needed_devs_fds; + + char **copy_symlink_targets; + size_t n_copy_symlink_targets; /* Used to save stdin, stdout, stderr during checkpointing to descriptors.json * and needed during restore. */ @@ -154,6 +165,9 @@ struct private_data_s char *maskdir_proc_path; bool maskdir_bind_failed; bool maskdir_warned; + bool joined_mount_ns; + bool needs_pivot; + bool no_pivot; }; struct linux_namespace_s @@ -170,13 +184,26 @@ cleanup_private_data (void *private_data) if (p->rootfsfd >= 0) TEMP_FAILURE_RETRY (close (p->rootfsfd)); + if (p->procfd >= 0) + TEMP_FAILURE_RETRY (close (p->procfd)); if (p->maskdir_fd >= 0) TEMP_FAILURE_RETRY (close (p->maskdir_fd)); + if (p->old_root_fd >= 0) + TEMP_FAILURE_RETRY (close (p->old_root_fd)); if (p->mount_fds) cleanup_close_mapp (&(p->mount_fds)); if (p->dev_fds) cleanup_close_mapp (&(p->dev_fds)); + if (p->needed_devs_fds) + cleanup_close_mapp (&(p->needed_devs_fds)); + if (p->copy_symlink_targets) + { + size_t i; + for (i = 0; i < p->n_copy_symlink_targets; i++) + free (p->copy_symlink_targets[i]); + free (p->copy_symlink_targets); + } free (p->unified_cgroup_path); free (p->host_notify_socket_path); free (p->container_notify_socket_path); @@ -193,7 +220,9 @@ get_private_data (struct libcrun_container_s *container) struct private_data_s *p = xmalloc0 (sizeof (*p)); container->private_data = p; p->rootfsfd = -1; + p->procfd = -1; p->notify_socket_tree_fd = -1; + p->old_root_fd = -1; p->maskdir_fd = -1; container->cleanup_private_data = cleanup_private_data; } @@ -273,6 +302,24 @@ syscall_pidfd_send_signal (int pidfd, int sig, siginfo_t *info, unsigned int fla #endif } +/* Translate MS_* mount flags to MOUNT_ATTR_* values for mount_setattr(). */ +static uint64_t +ms_flags_to_mount_attr (uint64_t ms_flags) +{ + uint64_t attr = ms_flags & (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | MOUNT_ATTR_NOEXEC); + + if (ms_flags & MS_NOATIME) + attr |= MOUNT_ATTR_NOATIME; + if (ms_flags & MS_STRICTATIME) + attr |= MOUNT_ATTR_STRICTATIME; + if (ms_flags & MS_NODIRATIME) + attr |= MOUNT_ATTR_NODIRATIME; + if (ms_flags & MS_NOSYMFOLLOW) + attr |= MOUNT_ATTR_NOSYMFOLLOW; + + return attr; +} + static int do_mount_setattr (bool recursive, const char *target, int targetfd, uint64_t clear, uint64_t set, libcrun_error_t *err) { @@ -285,8 +332,11 @@ do_mount_setattr (bool recursive, const char *target, int targetfd, uint64_t cle clear &= ~MS_BIND; attr.propagation = set & ALL_PROPAGATIONS_NO_REC; - attr.attr_set = set & (~ALL_PROPAGATIONS); - attr.attr_clr = clear & (~ALL_PROPAGATIONS); + attr.attr_set = ms_flags_to_mount_attr (set & (~ALL_PROPAGATIONS)); + attr.attr_clr = ms_flags_to_mount_attr (clear & (~ALL_PROPAGATIONS)); + + if (attr.attr_set & MOUNT_ATTR__ATIME) + attr.attr_clr |= MOUNT_ATTR__ATIME; ret = syscall_mount_setattr (targetfd, "", (recursive ? AT_RECURSIVE : 0) | AT_EMPTY_PATH, &attr); if (UNLIKELY (ret < 0)) @@ -716,7 +766,26 @@ get_mount_flags_or_option (const char *name, int current_flags, unsigned long *e __attribute__ ((unused)) cleanup_free char *prev = NULL; unsigned long flags = get_mount_flags (name, current_flags, &found, extra_flags, rec_clear, rec_set); if (found) - return flags; + { + /* MS_SYNCHRONOUS and MS_DIRSYNC cannot be set through fsmount() + attr_flags. Pass them also as data so that fsconfig(SET_FLAG) + can set SB_SYNCHRONOUS / SB_DIRSYNC on the superblock. */ + const char *data_flag = NULL; + if ((flags & MS_SYNCHRONOUS) && ! (current_flags & MS_SYNCHRONOUS)) + data_flag = "sync"; + else if ((flags & MS_DIRSYNC) && ! (current_flags & MS_DIRSYNC)) + data_flag = "dirsync"; + + if (data_flag) + { + prev = *option; + if (*option && **option) + xasprintf (option, "%s,%s", *option, data_flag); + else + *option = xstrdup (data_flag); + } + return flags; + } prev = *option; if (*option && **option) @@ -760,21 +829,26 @@ make_remount (int targetfd, const char *target, unsigned long flags, const char static int do_remount (int targetfd, const char *target, unsigned long flags, const char *data, libcrun_error_t *err) { - int ret; proc_fd_path_t target_buffer; + int ret; const char *real_target = target; + /* Older kernels (seen on 4.18) fail with EINVAL if data is set when + setting MS_RDONLY. */ + if (flags & (MS_REMOUNT | MS_RDONLY)) + data = NULL; + if (targetfd >= 0) { + ret = do_mount_setattr (false, target, targetfd, 0, flags & ~MS_REMOUNT, err); + if (LIKELY (ret == 0)) + return 0; + crun_error_release (err); + get_proc_self_fd_path (target_buffer, targetfd); real_target = target_buffer; } - /* Older kernels (seen on 4.18) fail with EINVAL if data is set when - setting MS_RDONLY. */ - if (flags & (MS_REMOUNT | MS_RDONLY)) - data = NULL; - ret = mount (NULL, real_target, NULL, flags, data); if (UNLIKELY (ret < 0)) { @@ -862,7 +936,7 @@ open_mount_target (libcrun_container_t *container, const char *target_rel, libcr /* Attempt to open a mount of the specified type. */ static int -fsopen_mount (const char *type, const char *labeltype, const char *label) +fsopen_mount (const char *type, const char *source_name, const char *labeltype, const char *label, const char *data) { #ifdef HAVE_NEW_MOUNT_API cleanup_close int fsfd = -1; @@ -872,6 +946,8 @@ fsopen_mount (const char *type, const char *labeltype, const char *label) if (UNLIKELY (fsfd < 0)) return fsfd; + (void) syscall_fsconfig (fsfd, FSCONFIG_SET_STRING, "source", source_name ? source_name : type, 0); + if (labeltype) { ret = syscall_fsconfig (fsfd, FSCONFIG_SET_STRING, labeltype, label, 0); @@ -879,6 +955,29 @@ fsopen_mount (const char *type, const char *labeltype, const char *label) return ret; } + if (data && data[0]) + { + cleanup_free char *data_copy = xstrdup (data); + char *saveptr = NULL; + char *token; + + for (token = strtok_r (data_copy, ",", &saveptr); token; token = strtok_r (NULL, ",", &saveptr)) + { + char *eq = strchr (token, '='); + if (eq) + { + *eq = '\0'; + + ret = syscall_fsconfig (fsfd, FSCONFIG_SET_STRING, token, eq + 1, 0); + } + else + ret = syscall_fsconfig (fsfd, FSCONFIG_SET_FLAG, token, NULL, 0); + + if (UNLIKELY (ret < 0)) + return ret; + } + } + ret = syscall_fsconfig (fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); if (UNLIKELY (ret < 0)) return ret; @@ -886,11 +985,41 @@ fsopen_mount (const char *type, const char *labeltype, const char *label) return syscall_fsmount (fsfd, FSMOUNT_CLOEXEC, 0); #else (void) type; + (void) source_name; + (void) data; errno = ENOSYS; return -1; #endif } +static int +get_procfd (struct private_data_s *data, libcrun_error_t *err) +{ + int fd; + + if (data->procfd >= 0) + return data->procfd; + + fd = fsopen_mount ("proc", NULL, NULL, NULL, NULL); + if (fd < 0) + fd = open ("/proc", O_DIRECTORY | O_CLOEXEC); + if (UNLIKELY (fd < 0)) + return crun_make_error (err, errno, "open `/proc`"); + + data->procfd = fd; + return fd; +} + +static int +get_old_root_fd (struct private_data_s *data) +{ + if (data->old_root_fd >= 0) + return data->old_root_fd; + + data->old_root_fd = open ("/", O_PATH | O_CLOEXEC); + return data->old_root_fd; +} + static int fs_move_mount_to (int fd, int dirfd, const char *name) { @@ -956,12 +1085,14 @@ get_shared_empty_dir_cached (libcrun_container_t *container, char **proc_fd_path struct private_data_s *private_data = get_private_data (container); cleanup_close int fd = -1; cleanup_free char *empty_dir_path = NULL; + proc_fd_path_t fd_path; int ret; /* Fast path: return cached proc fd path if already set up */ if (private_data->maskdir_proc_path != NULL) { - *proc_fd_path = private_data->maskdir_proc_path; + if (proc_fd_path) + *proc_fd_path = private_data->maskdir_proc_path; return 0; } @@ -975,13 +1106,14 @@ get_shared_empty_dir_cached (libcrun_container_t *container, char **proc_fd_path if (fd < 0) return crun_make_error (err, errno, "open directory `%s`", empty_dir_path); - /* Cache the /proc/self/fd path for fast mounting */ - xasprintf (&private_data->maskdir_proc_path, "/proc/self/fd/%d", fd); + get_self_fd_path (fd_path, fd); + private_data->maskdir_proc_path = xstrdup (fd_path); private_data->maskdir_fd = fd; fd = -1; /* Don't auto-close */ - *proc_fd_path = private_data->maskdir_proc_path; + if (proc_fd_path) + *proc_fd_path = private_data->maskdir_proc_path; return 0; } @@ -989,14 +1121,15 @@ static int mount_masked_dir (libcrun_container_t *container, int pathfd, const char *rel_path, libcrun_error_t *err) { struct private_data_s *private_data = get_private_data (container); + cleanup_close int mountfd = -1; char *proc_fd_path = NULL; libcrun_error_t tmp_err = NULL; + int procfd; int ret; if (private_data->maskdir_bind_failed) goto fallback_to_tmpfs; - /* Get cached /proc/self/fd path (fast after first call) */ ret = get_shared_empty_dir_cached (container, &proc_fd_path, &tmp_err); if (ret < 0) { @@ -1006,28 +1139,20 @@ mount_masked_dir (libcrun_container_t *container, int pathfd, const char *rel_pa goto fallback_to_tmpfs; } - { - cleanup_close int mountfd = -1; - - mountfd = get_bind_mount (-1, proc_fd_path, false, true, false, MS_PRIVATE, &tmp_err); - if (mountfd >= 0) - { - ret = fs_move_mount_to (mountfd, pathfd, NULL); - if (LIKELY (ret == 0)) - return 0; - } - crun_error_release (&tmp_err); - } + procfd = get_procfd (get_private_data (container), err); + if (UNLIKELY (procfd < 0)) + return procfd; - ret = do_mount (container, proc_fd_path, pathfd, rel_path, NULL, MS_BIND | MS_RDONLY, NULL, LABEL_MOUNT, &tmp_err); - if (LIKELY (ret >= 0)) - return ret; + mountfd = get_bind_mount (procfd, proc_fd_path, false, true, false, MS_PRIVATE, &tmp_err); + if (mountfd >= 0) + { + ret = fs_move_mount_to (mountfd, pathfd, NULL); + if (LIKELY (ret == 0)) + return 0; + } - /* Bind mount failed - mark as failed and fall back for all future mounts */ private_data->maskdir_bind_failed = true; - libcrun_warning ("bind mount failed for %s to %s: %s, falling back to tmpfs", - proc_fd_path, rel_path, tmp_err->msg); - warn_tmpfs_fallback_once (private_data, tmp_err->msg); + warn_tmpfs_fallback_once (private_data, tmp_err ? tmp_err->msg : "unknown error"); crun_error_release (&tmp_err); fallback_to_tmpfs: @@ -1062,17 +1187,21 @@ do_masked_or_readonly_path (libcrun_container_t *container, const char *rel_path if (readonly) { + int procfd = get_procfd (get_private_data (container), err); proc_fd_path_t source_buffer; cleanup_close int mountfd = -1; - get_proc_self_fd_path (source_buffer, pathfd); + if (UNLIKELY (procfd < 0)) + return procfd; + + get_self_fd_path (source_buffer, pathfd); /* Try open_tree + mount_setattr to apply MS_RDONLY atomically. Only when keep_flags is false, since keep_flags needs statfs to inherit parent mount flags. */ if (! keep_flags) { - mountfd = get_bind_mount (-1, source_buffer, true, true, false, MS_PRIVATE, err); + mountfd = get_bind_mount (procfd, source_buffer, true, true, false, MS_PRIVATE, err); if (mountfd >= 0) ret = fs_move_mount_to (mountfd, pathfd, NULL); else @@ -1087,21 +1216,45 @@ do_masked_or_readonly_path (libcrun_container_t *container, const char *rel_path mount_flags = MS_BIND | MS_PRIVATE | MS_RDONLY | MS_REC; if (keep_flags) { - ret = statfs (source_buffer, &sfs); + cleanup_close int sfd = openat (procfd, source_buffer, O_PATH | O_CLOEXEC); + if (UNLIKELY (sfd < 0)) + return crun_make_error (err, errno, "openat procfd for `%s`", rel_path); + + ret = fstatfs (sfd, &sfs); if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "statfs `%s`", source_buffer); + return crun_make_error (err, errno, "fstatfs `%s`", rel_path); mount_flags = mount_flags | sfs.f_flags; - // Parent might contain `MS_REMOUNT` but the new readonly path is not - // actually mounted. Specifically in the case of `/proc` this will end - // up with EINVAL therefore remove `MS_REMOUNT` if it's getting - // inherited from the parent. + /* Parent might contain MS_REMOUNT but the new readonly path is not + actually mounted. Specifically in the case of /proc this will end + up with EINVAL therefore remove MS_REMOUNT if it is getting + inherited from the parent. */ mount_flags = mount_flags & ~MS_REMOUNT; } - ret = do_mount (container, source_buffer, pathfd, rel_path, NULL, mount_flags, NULL, - LABEL_NONE, err); - if (UNLIKELY (ret < 0)) - return ret; + + close_and_reset (&mountfd); + mountfd = get_bind_mount (procfd, source_buffer, true, true, false, MS_PRIVATE, err); + if (mountfd >= 0) + { + ret = do_mount_setattr (true, rel_path, mountfd, 0, mount_flags & ~(MS_BIND | MS_PRIVATE), err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = fs_move_mount_to (mountfd, pathfd, NULL); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "move mount for readonly path `%s`", rel_path); + } + else + { + proc_fd_path_t abs_source; + + crun_error_release (err); + get_proc_self_fd_path (abs_source, pathfd); + ret = do_mount (container, abs_source, pathfd, rel_path, NULL, mount_flags, NULL, + LABEL_NONE, err); + if (UNLIKELY (ret < 0)) + return ret; + } } } else @@ -1177,10 +1330,11 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, int label_how, libcrun_error_t *err) { cleanup_free char *data_with_label = NULL; + proc_fd_path_t target_buffer; + const char *context_type = NULL; cleanup_close int ms_move_fd = -1; const char *real_target = target; bool single_instance = false; - proc_fd_path_t target_buffer; bool needs_remount = false; cleanup_close int fd = -1; const char *label = NULL; @@ -1194,15 +1348,13 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, if (targetfd >= 0) { get_proc_self_fd_path (target_buffer, targetfd); - real_target = target_buffer; - needs_remount = true; } if (label_how == LABEL_MOUNT) { - const char *context_type = get_selinux_context_type (container, err); + context_type = get_selinux_context_type (container, err); if (UNLIKELY (context_type == NULL)) return -1; @@ -1233,7 +1385,61 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, { unsigned long flags = mountflags & ~(ALL_PROPAGATIONS_NO_REC | MS_RDONLY); - ret = mount (source, real_target, fstype, flags, data); + ret = -1; + + if (targetfd >= 0) + { + if (mountflags & MS_BIND) + { + cleanup_close int bindfd = -1; + libcrun_error_t tmp_err = NULL; + + bindfd = get_bind_mount (AT_FDCWD, source, + (mountflags & MS_REC) != 0, + false, false, 0, &tmp_err); + if (bindfd < 0) + { + int old_root_fd = get_old_root_fd (get_private_data (container)); + + if (old_root_fd >= 0 && source && source[0] == '/') + { + crun_error_release (&tmp_err); + bindfd = get_bind_mount (old_root_fd, (source[1] == '\0') ? "." : source + 1, + (mountflags & MS_REC) != 0, + false, false, 0, &tmp_err); + } + } + if (bindfd >= 0) + ret = fs_move_mount_to (bindfd, targetfd, NULL); + if (ret < 0) + crun_error_release (&tmp_err); + } + else + { + cleanup_close int newfs = fsopen_mount (fstype, source, context_type, label, data); + if (newfs >= 0) + ret = fs_move_mount_to (newfs, targetfd, NULL); + } + } + + if (ret < 0) + { + ret = mount (source, real_target, fstype, flags, data); + if (ret < 0 && (mountflags & MS_BIND) && source && source[0] == '/') + { + int old_root_fd = get_old_root_fd (get_private_data (container)); + + if (old_root_fd >= 0) + { + cleanup_free char *proc_source_buf = NULL; + proc_fd_path_t old_root_path; + + get_proc_self_fd_path (old_root_path, old_root_fd); + xasprintf (&proc_source_buf, "%s/%s", old_root_path, source + 1); + ret = mount (proc_source_buf, real_target, fstype, flags, data); + } + } + } if (UNLIKELY (ret < 0)) { int saved_errno = errno; @@ -1249,17 +1455,44 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, if (ret > 0) { cleanup_close int mountfd = -1; + int sys_old_root_fd = get_old_root_fd (get_private_data (container)); - if (! has_mount_for (container, "/sys/fs/cgroup")) + if (sys_old_root_fd >= 0) + mountfd = get_bind_mount (sys_old_root_fd, "sys", true, false, false, MS_PRIVATE, err); + if (mountfd < 0) { - ret = mount ("/sys", real_target, NULL, MS_BIND | MS_REC, NULL); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "bind mount `/sys` from the host"); + crun_error_release (err); + mountfd = get_bind_mount (AT_FDCWD, "/sys", true, false, false, MS_PRIVATE, err); + } + if (! has_mount_for (container, "/sys/fs/cgroup")) + { + if (mountfd >= 0) + { + ret = fs_move_mount_to (mountfd, targetfd, NULL); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "move mount `/sys` to `%s`", real_target); + } + else + { + crun_error_release (err); + if (sys_old_root_fd >= 0) + { + cleanup_free char *sys_path = NULL; + proc_fd_path_t sys_root_path; + + get_proc_self_fd_path (sys_root_path, sys_old_root_fd); + xasprintf (&sys_path, "%s/sys", sys_root_path); + ret = mount (sys_path, real_target, NULL, MS_BIND | MS_REC, NULL); + } + else + ret = mount ("/sys", real_target, NULL, MS_BIND | MS_REC, NULL); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "bind mount `/sys` from the host"); + } return do_masked_or_readonly_path (container, "/sys/fs/cgroup", false, false, err); } - mountfd = get_bind_mount (-1, "/sys", true, true, false, MS_PRIVATE, err); if (UNLIKELY (mountfd < 0)) return mountfd; @@ -1286,7 +1519,27 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, /* We are replacing the rootfs, reopen it. */ if (is_empty_string (target)) { - int tmp = dup (fd); + int procfd = get_procfd (get_private_data (container), err); + int tmp; + if (UNLIKELY (procfd < 0)) + return procfd; + + { + cleanup_close int nsfd = openat (procfd, "self/ns/mnt", O_RDONLY | O_CLOEXEC); + if (UNLIKELY (nsfd < 0)) + return crun_make_error (err, errno, "open `/proc/self/ns/mnt`"); + + ret = setns (nsfd, CLONE_NEWNS); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "setns `CLONE_NEWNS`"); + } + + close_and_reset (&fd); + fd = open (get_private_data (container)->rootfs, O_PATH | O_CLOEXEC); + if (UNLIKELY (fd < 0)) + return crun_make_error (err, errno, "reopen rootfs after mount on /"); + + tmp = dup (fd); if (UNLIKELY (tmp < 0)) return crun_make_error (err, errno, "dup"); @@ -1297,26 +1550,44 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, #ifdef HAVE_FGETXATTR if (label_how == LABEL_XATTR) { - proc_fd_path_t proc_file; - - get_proc_self_fd_path (proc_file, fd); + int procfd = get_procfd (get_private_data (container), err); + if (procfd >= 0) + { + proc_fd_path_t proc_self_path; + cleanup_close int xfd = -1; - /* We need to go through the proc_file since fd itself is opened as O_PATH. */ - (void) setxattr (proc_file, "security.selinux", label, strlen (label), 0); + get_self_fd_path (proc_self_path, fd); + xfd = openat (procfd, proc_self_path, O_RDONLY | O_CLOEXEC); + if (xfd >= 0) + (void) fsetxattr (xfd, "security.selinux", label, strlen (label), 0); + } } #endif targetfd = fd; - get_proc_self_fd_path (target_buffer, targetfd); - real_target = target_buffer; } } if (mountflags & ALL_PROPAGATIONS_NO_REC) { - ret = mount (NULL, real_target, NULL, mountflags & ALL_PROPAGATIONS, NULL); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "set propagation for `%s`", target); + bool propagation_done = false; + + if (targetfd >= 0) + { + libcrun_error_t tmp_err = NULL; + ret = do_mount_setattr (false, target, targetfd, 0, mountflags & ALL_PROPAGATIONS, &tmp_err); + if (LIKELY (ret == 0)) + propagation_done = true; + else + crun_error_release (&tmp_err); + } + + if (! propagation_done) + { + ret = mount (NULL, real_target, NULL, mountflags & ALL_PROPAGATIONS, NULL); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "set propagation for `%s`", target); + } } if (mountflags & (MS_BIND | MS_RDONLY)) @@ -1334,7 +1605,7 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, if ((remount_flags & MS_RDONLY) == 0) { - ret = do_remount (fd, real_target, remount_flags, data, err); + ret = do_remount (fd >= 0 ? fd : targetfd, real_target, remount_flags, data, err); if (UNLIKELY (ret < 0)) return ret; } @@ -1360,15 +1631,15 @@ do_mount (libcrun_container_t *container, const char *source, int targetfd, static void try_umount (int targetfd, const char *target) { - const char *real_target = target; proc_fd_path_t target_buffer; + const char *real_target = target; if (targetfd >= 0) { - /* Best effort cleanup for the tmpfs. */ get_proc_self_fd_path (target_buffer, targetfd); real_target = target_buffer; } + umount2 (real_target, MNT_DETACH); } @@ -1492,7 +1763,13 @@ do_mount_cgroup_v1 (libcrun_container_t *container, const char *source, int targ return tmpfsdirfd; targetfd = tmpfsdirfd; - ret = read_all_file (PROC_SELF_CGROUP, &content, NULL, err); + { + int procfd = get_procfd (get_private_data (container), err); + if (UNLIKELY (procfd < 0)) + return procfd; + + ret = read_all_file_at (procfd, SELF_CGROUP, &content, NULL, err); + } if (UNLIKELY (ret < 0)) return ret; @@ -1644,6 +1921,28 @@ relative_path_under_dev (const char *path) return NULL; } +static int +mknod_and_set_attrs (int dirfd, const char *name, struct device_s *device, mode_t type, dev_t dev, libcrun_error_t *err) +{ + int ret; + + ret = mknodat (dirfd, name, device->mode | type, dev); + if (UNLIKELY (ret < 0 && errno == EEXIST)) + return 0; + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "mknodat `%s`", device->path); + + ret = fchmodat (dirfd, name, device->mode, 0); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "fchmod `%s`", device->path); + + ret = fchownat (dirfd, name, device->uid, device->gid, AT_SYMLINK_NOFOLLOW); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "fchown `%s`", device->path); + + return 1; +} + int libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, struct device_s *device, bool binds, bool ensure_parent_dir, @@ -1653,7 +1952,6 @@ libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, dev_t dev; mode_t type = (device->type[0] == 'b') ? S_IFBLK : ((device->type[0] == 'p') ? S_IFIFO : S_IFCHR); const char *fullname = device->path; - cleanup_close int fd = -1; const char *rootfs = get_private_data (container)->rootfs; if (is_empty_string (fullname)) return crun_make_error (err, EINVAL, "device path is empty"); @@ -1723,8 +2021,6 @@ libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, } else { - proc_fd_path_t fd_buffer; - dev = makedev (device->major, device->minor); /* Check whether the path is directly under /dev. Since we already have an open fd to /dev and mknodat(2) @@ -1733,26 +2029,9 @@ libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, */ if (rel_dev) { - ret = mknodat (devfd, rel_dev, device->mode | type, dev); - /* We don't fail when the file already exists. */ - if (UNLIKELY (ret < 0 && errno == EEXIST)) - return 0; - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "mknodat `%s`", device->path); - - fd = safe_openat (devfd, rootfs, rel_dev, O_PATH | O_CLOEXEC, 0, err); - if (UNLIKELY (fd < 0)) - return fd; - - get_proc_self_fd_path (fd_buffer, fd); - - ret = chmod (fd_buffer, device->mode); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chmod `%s`", device->path); - - ret = chown (fd_buffer, device->uid, device->gid); /* lgtm [cpp/toctou-race-condition] */ - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chown `%s`", device->path); + ret = mknod_and_set_attrs (devfd, rel_dev, device, type, dev, err); + if (ret <= 0) + return ret; } else { @@ -1787,27 +2066,9 @@ libcrun_create_dev (libcrun_container_t *container, int devfd, int srcfd, return dirfd; } - ret = mknodat (dirfd, basename, device->mode | type, dev); - - /* We don't fail when the file already exists. */ - if (UNLIKELY (ret < 0 && errno == EEXIST)) - return 0; - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "mknodat `%s`", device->path); - - fd = safe_openat (dirfd, rootfs, basename, O_PATH | O_CLOEXEC, 0, err); - if (UNLIKELY (fd < 0)) - return crun_error_wrap (err, "openat `%s`", device->path); - - get_proc_self_fd_path (fd_buffer, fd); - - ret = chmod (fd_buffer, device->mode); - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chmod `%s`", device->path); - - ret = chown (fd_buffer, device->uid, device->gid); /* lgtm [cpp/toctou-race-condition] */ - if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chown `%s`", device->path); + ret = mknod_and_set_attrs (dirfd, basename, device, type, dev, err); + if (ret <= 0) + return ret; } } return 0; @@ -1838,10 +2099,14 @@ create_missing_devs (libcrun_container_t *container, bool binds, libcrun_error_t runtime_spec_schema_config_schema *def = container->container_def; const char *rootfs = get_private_data (container)->rootfs; cleanup_close_map struct libcrun_fd_map *dev_fds = NULL; + cleanup_close_map struct libcrun_fd_map *needed_fds = NULL; dev_fds = get_private_data (container)->dev_fds; get_private_data (container)->dev_fds = NULL; + needed_fds = get_private_data (container)->needed_devs_fds; + get_private_data (container)->needed_devs_fds = NULL; + if (! def || ! def->linux) return 0; @@ -1868,12 +2133,16 @@ create_missing_devs (libcrun_container_t *container, bool binds, libcrun_error_t return ret; } - for (it = needed_devs; it->path; it++) + for (it = needed_devs, i = 0; it->path; it++, i++) { + int srcfd = (needed_fds && i < needed_fds->nfds) ? needed_fds->fds[i] : -1; + /* make sure the parent directory exists only on the first iteration. */ - ret = libcrun_create_dev (container, devfd, -1, it, binds, it == needed_devs, err); + ret = libcrun_create_dev (container, devfd, srcfd, it, binds, it == needed_devs, err); if (UNLIKELY (ret < 0)) return ret; + if (srcfd >= 0) + needed_fds->fds[i] = -1; } for (i = 0; symlinks[i].target; i++) @@ -1951,9 +2220,10 @@ do_masked_and_readonly_paths (libcrun_container_t *container, libcrun_error_t *e static int do_pivot (libcrun_container_t *container, const char *rootfs, libcrun_error_t *err) { - int ret; + struct private_data_s *private_data = get_private_data (container); cleanup_close int oldrootfd = -1; cleanup_close int newrootfd = -1; + int ret; oldrootfd = open ("/", O_DIRECTORY | O_PATH | O_CLOEXEC); if (UNLIKELY (oldrootfd < 0)) @@ -1975,6 +2245,27 @@ do_pivot (libcrun_container_t *container, const char *rootfs, libcrun_error_t *e if (UNLIKELY (ret < 0)) return crun_make_error (err, errno, "fchdir `%s`", rootfs); + /* Clone the shared empty directory mount before the old root is detached. */ + if (private_data->maskdir_fd >= 0) + { + cleanup_close int new_fd = -1; + + new_fd = syscall_open_tree (private_data->maskdir_fd, "", + AT_EMPTY_PATH | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + if (new_fd >= 0) + { + proc_fd_path_t fd_path; + + close (private_data->maskdir_fd); + private_data->maskdir_fd = new_fd; + new_fd = -1; + + get_self_fd_path (fd_path, private_data->maskdir_fd); + free (private_data->maskdir_proc_path); + private_data->maskdir_proc_path = xstrdup (fd_path); + } + } + ret = do_mount (container, NULL, -1, ".", NULL, MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err); if (UNLIKELY (ret < 0)) return ret; @@ -2159,18 +2450,28 @@ safe_create_symlink (int rootfsfd, const char *rootfs, const char *target, const static int handle_copy_symlink (libcrun_container_t *container, const char *rootfs, - runtime_spec_schema_defs_mount *mount, libcrun_error_t *err) + runtime_spec_schema_defs_mount *mount, + const char *link_target, libcrun_error_t *err) { cleanup_free char *target = NULL; - ssize_t len; - /* Copy the origin symlink instead of performing the mount operation. */ - len = safe_readlinkat (AT_FDCWD, mount->source, &target, 0, err); - if (UNLIKELY (len < 0)) - return len; + if (link_target == NULL) + { + int old_root_fd = get_old_root_fd (get_private_data (container)); + const char *source = mount->source; + ssize_t len; + + if (old_root_fd >= 0 && source[0] == '/') + len = safe_readlinkat (old_root_fd, source + 1, &target, 0, err); + else + len = safe_readlinkat (AT_FDCWD, source, &target, 0, err); + if (UNLIKELY (len < 0)) + return len; + link_target = target; + } return safe_create_symlink (get_private_data (container)->rootfsfd, rootfs, - target, mount->destination, err); + link_target, mount->destination, err); } static int @@ -2214,6 +2515,12 @@ handle_tmpcopyup (libcrun_container_t *container, const char *rootfs, const char { int destfd, ret; cleanup_close int tmpfd = copy_from_fd; + + target = consume_slashes (target); + + if (is_empty_string (target)) + return crun_make_error (err, 0, "tmpcopyup on `/` is not supported"); + destfd = safe_openat (get_private_data (container)->rootfsfd, rootfs, target, O_CLOEXEC | O_DIRECTORY, 0, err); if (UNLIKELY (destfd < 0)) @@ -2222,7 +2529,6 @@ handle_tmpcopyup (libcrun_container_t *container, const char *rootfs, const char // copy_recursive_fd_to_fd closes tmpfd and destfd ret = copy_recursive_fd_to_fd (tmpfd, destfd, target, target, err); tmpfd = -1; - return ret; } @@ -2299,24 +2605,52 @@ process_single_mount (libcrun_container_t *container, const char *rootfs, proc_fd_path_t proc_buf; const char *path = mount->source; + if ((extra_flags & OPTION_COPY_SYMLINK) && (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_DEST_NOFOLLOW))) + return crun_make_error (err, 0, "`copy-symlink` is mutually exclusive with `src-nofollow` and `dest-nofollow`"); + /* If copy-symlink is provided, ignore the pre-opened file descriptor since its source was resolved. */ if (source_mountfd >= 0 && ! (extra_flags & OPTION_COPY_SYMLINK)) { - get_proc_self_fd_path (proc_buf, source_mountfd); + get_self_fd_path (proc_buf, source_mountfd); path = proc_buf; - } - if ((extra_flags & OPTION_COPY_SYMLINK) && (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_DEST_NOFOLLOW))) - return crun_make_error (err, 0, "`copy-symlink` is mutually exclusive with `src-nofollow` and `dest-nofollow`"); + ret = get_file_type_at (source_mountfd, &src_mode, true, NULL); + } + else if ((extra_flags & OPTION_COPY_SYMLINK) + && get_private_data (container)->copy_symlink_targets + && mount_index < get_private_data (container)->n_copy_symlink_targets + && get_private_data (container)->copy_symlink_targets[mount_index]) + { + src_mode = S_IFLNK; + ret = 0; + } + else if (path[0] == '/') + { + int old_root_fd = get_old_root_fd (get_private_data (container)); + bool nofollow = (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_COPY_SYMLINK)) ? true : false; - /* Do not resolve the symlink only when src-nofollow and copy-symlink are used. */ - ret = get_file_type (&src_mode, (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_COPY_SYMLINK)) ? true : false, path); + if (old_root_fd >= 0) + ret = get_file_type_at (old_root_fd, &src_mode, nofollow, (path[1] == '\0') ? NULL : path + 1); + else + ret = get_file_type_at (AT_FDCWD, &src_mode, nofollow, path); + } + else + ret = get_file_type (&src_mode, (extra_flags & (OPTION_SRC_NOFOLLOW | OPTION_COPY_SYMLINK)) ? true : false, path); if (UNLIKELY (ret < 0)) return crun_make_error (err, errno, "cannot stat `%s`", path); if (S_ISLNK (src_mode) && (extra_flags & OPTION_DEST_NOFOLLOW) && source_mountfd < 0) { - ret = get_bind_mount (AT_FDCWD, mount->source, true, true, extra_flags & OPTION_SRC_NOFOLLOW, MS_PRIVATE, err); + int bind_dirfd = AT_FDCWD; + const char *bind_source = mount->source; + int old_root_fd = get_old_root_fd (get_private_data (container)); + + if (old_root_fd >= 0 && mount->source && mount->source[0] == '/') + { + bind_dirfd = old_root_fd; + bind_source = (mount->source[1] == '\0') ? "." : mount->source + 1; + } + ret = get_bind_mount (bind_dirfd, bind_source, true, true, extra_flags & OPTION_SRC_NOFOLLOW, MS_PRIVATE, err); if (UNLIKELY (ret < 0)) return ret; @@ -2328,7 +2662,13 @@ process_single_mount (libcrun_container_t *container, const char *rootfs, if (S_ISLNK (src_mode) && (extra_flags & OPTION_COPY_SYMLINK)) { - ret = handle_copy_symlink (container, rootfs, mount, err); + const char *cached_target = NULL; + + if (get_private_data (container)->copy_symlink_targets + && mount_index < get_private_data (container)->n_copy_symlink_targets) + cached_target = get_private_data (container)->copy_symlink_targets[mount_index]; + + ret = handle_copy_symlink (container, rootfs, mount, cached_target, err); if (UNLIKELY (ret < 0)) return ret; @@ -2385,15 +2725,28 @@ process_single_mount (libcrun_container_t *container, const char *rootfs, /* Check if there is already a mount for the requested file system. */ if (! mounted && source_mountfd >= 0) { - ret = fs_move_mount_to (source_mountfd, targetfd, NULL); if (LIKELY (ret == 0)) { - /* Force no MS_BIND flag to not attempt again the bind mount. */ - ret = do_mount (container, NULL, source_mountfd, target, NULL, flags & ~MS_BIND, data, LABEL_NONE, err); - if (UNLIKELY (ret < 0)) - return ret; + unsigned long remaining_flags = flags & ~MS_BIND; + if (remaining_flags) + { + ret = do_mount (container, NULL, source_mountfd, target, NULL, remaining_flags, data, LABEL_NONE, err); + if (UNLIKELY (ret < 0)) + return ret; + } + mounted = true; + + if (is_empty_string (target)) + { + int new_rootfsfd = open (rootfs, O_PATH | O_CLOEXEC); + if (UNLIKELY (new_rootfsfd < 0)) + return crun_make_error (err, errno, "reopen rootfs after mount on /"); + + TEMP_FAILURE_RETRY (close (get_private_data (container)->rootfsfd)); + get_private_data (container)->rootfsfd = new_rootfsfd; + } } } @@ -2599,7 +2952,7 @@ get_notify_fd (libcrun_context_t *context, libcrun_container_t *container, int * #ifdef HAVE_SYSTEMD static int -do_notify_socket (libcrun_container_t *container, const char *rootfs, libcrun_error_t *err) +do_notify_socket (libcrun_container_t *container, libcrun_error_t *err) { int ret; const char *notify_socket = container->context->notify_socket; @@ -2619,7 +2972,7 @@ do_notify_socket (libcrun_container_t *container, const char *rootfs, libcrun_er if (UNLIKELY (ret < 0)) return ret; - ret = append_paths (&container_notify_socket_path, err, rootfs, notify_socket, "notify", NULL); + ret = append_paths (&container_notify_socket_path, err, "/", notify_socket, "notify", NULL); if (UNLIKELY (ret < 0)) return ret; @@ -2727,28 +3080,31 @@ static int make_parent_mount_private (const char *rootfs, libcrun_error_t *err) { cleanup_close int rootfsfd = -1; + struct stat prev_st; proc_fd_path_t proc_path; - size_t n_slashes = 1; - const char *it; - - for (it = rootfs; *it; it++) - if (*it == '/') - n_slashes++; /* rootfs could be a relative path. */ rootfsfd = open (rootfs, O_PATH | O_CLOEXEC); if (UNLIKELY (rootfsfd < 0)) return crun_make_error (err, errno, "open `%s`", rootfs); - /* prevent a potential infinite loop. */ - while (n_slashes-- > 0) + if (UNLIKELY (fstat (rootfsfd, &prev_st) < 0)) + return crun_make_error (err, errno, "fstat `%s`", rootfs); + + for (;;) { + struct stat cur_st; + libcrun_error_t tmp_err = NULL; int ret; - errno = 0; cleanup_close int parentfd = -1; - get_proc_self_fd_path (proc_path, rootfsfd); - ret = mount (NULL, proc_path, NULL, MS_PRIVATE, NULL); + ret = do_mount_setattr (false, rootfs, rootfsfd, 0, MS_PRIVATE, &tmp_err); + if (ret == 0) + return 0; + crun_error_release (&tmp_err); + + get_proc_self_fd_path (proc_path, rootfsfd); + ret = mount (NULL, proc_path, NULL, MS_PRIVATE, NULL); if (ret == 0) return 0; @@ -2762,11 +3118,18 @@ make_parent_mount_private (const char *rootfs, libcrun_error_t *err) return crun_make_error (err, saved_errno, "make `%s` private: cannot open component", rootfs); } + if (UNLIKELY (fstat (parentfd, &cur_st) < 0)) + return crun_make_error (err, errno, "fstat parent of `%s`", rootfs); + + /* Reached the root of the file system: ".." points to itself. */ + if (cur_st.st_dev == prev_st.st_dev && cur_st.st_ino == prev_st.st_ino) + break; + + prev_st = cur_st; close_and_reset (&rootfsfd); rootfsfd = get_and_reset (&parentfd); } - /* should never get this far. */ return crun_make_error (err, 0, "make `%s` private", rootfs); } @@ -2774,37 +3137,12 @@ int libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, set_mounts_cb_t cb, void *cb_data, libcrun_error_t *err) { runtime_spec_schema_config_schema *def = container->container_def; - unsigned long rootfs_propagation = 0; - int cgroup_mode; int is_user_ns = 0; int ret = 0; if (rootfs == NULL || def->mounts == NULL) return 0; - if (def->linux && def->linux->rootfs_propagation) - rootfs_propagation = get_mount_flags (def->linux->rootfs_propagation, 0, NULL, NULL, NULL, NULL); - - if ((rootfs_propagation & ALL_PROPAGATIONS_NO_REC) == 0) - rootfs_propagation = MS_REC | MS_PRIVATE; - - get_private_data (container)->rootfs_propagation = rootfs_propagation; - - if (get_private_data (container)->unshare_flags & CLONE_NEWNS) - { - ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err); - if (UNLIKELY (ret < 0)) - return ret; - - ret = make_parent_mount_private (rootfs, err); - if (UNLIKELY (ret < 0)) - return ret; - - ret = do_mount (container, rootfs, -1, rootfs, NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err); - if (UNLIKELY (ret < 0)) - return ret; - } - ret = open (rootfs, O_PATH | O_CLOEXEC); if (UNLIKELY (ret < 0)) return crun_make_error (err, errno, "open `%s`", rootfs); @@ -2831,32 +3169,7 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont get_private_data (container)->remounts = r; } - if (! container->context->force_no_cgroup) - { - cgroup_mode = libcrun_get_cgroup_mode (err); - if (UNLIKELY (cgroup_mode < 0)) - return cgroup_mode; - - if (cgroup_mode == CGROUP_MODE_UNIFIED) - { - char *unified_cgroup_path = NULL; - - /* Read the cgroup path before we enter the cgroupns. */ - ret = libcrun_get_cgroup_process (0, &unified_cgroup_path, true, err); - if (UNLIKELY (ret < 0)) - return ret; - - get_private_data (container)->unified_cgroup_path = unified_cgroup_path; - } - } - - ret = libcrun_container_enter_cgroup_ns (container, err); - if (UNLIKELY (ret < 0)) - return ret; - - ret = do_mounts (container, rootfs, err); - if (UNLIKELY (ret < 0)) - return ret; + /* Mounts are already done before pivot_root in setup_mount_namespace. */ is_user_ns = (get_private_data (container)->unshare_flags & CLONE_NEWUSER); if (! is_user_ns) @@ -2881,10 +3194,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont return ret; } - ret = do_finalize_notify_socket (container, err); - if (UNLIKELY (ret < 0)) - return ret; - if (def->process && def->process->cwd) { libcrun_error_t tmp_err = NULL; @@ -2901,25 +3210,6 @@ libcrun_set_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_cont return 0; } -int -libcrun_finalize_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, libcrun_error_t *err) -{ - int ret; - - ret = finalize_mounts (container, err); - if (UNLIKELY (ret < 0)) - return ret; - - // configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS - ret = libcrun_container_notify_handler (entrypoint_args, HANDLER_CONFIGURE_AFTER_MOUNTS, container, rootfs, err); - if (UNLIKELY (ret < 0)) - return crun_error_wrap (err, "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS"); - - close_and_reset (&(get_private_data (container)->rootfsfd)); - - return 0; -} - static int umount_or_hide (const char *target, libcrun_error_t *err) { @@ -2978,12 +3268,22 @@ move_root (const char *rootfs, libcrun_error_t *err) } int -libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, const char *rootfs, libcrun_error_t *err) +libcrun_finalize_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, libcrun_error_t *err) { int ret; - if (get_private_data (container)->unshare_flags & CLONE_NEWNS) + + // configure handler mounts for phase: HANDLER_CONFIGURE_AFTER_MOUNTS + ret = libcrun_container_notify_handler (entrypoint_args, HANDLER_CONFIGURE_AFTER_MOUNTS, container, rootfs, err); + if (UNLIKELY (ret < 0)) + return crun_error_wrap (err, "failed configuring mounts for handler at phase: HANDLER_CONFIGURE_AFTER_MOUNTS"); + + close_and_reset (&(get_private_data (container)->rootfsfd)); + + if (get_private_data (container)->needs_pivot) { - if (no_pivot) + get_private_data (container)->needs_pivot = false; + + if (get_private_data (container)->no_pivot) { ret = move_root (rootfs, err); if (UNLIKELY (ret < 0)) @@ -2996,21 +3296,445 @@ libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, const char return ret; } - ret = do_mount (container, NULL, -1, "/", NULL, get_private_data (container)->rootfs_propagation, NULL, - LABEL_MOUNT, err); + ret = do_mount (container, NULL, -1, "/", NULL, + get_private_data (container)->rootfs_propagation, + NULL, LABEL_MOUNT, err); if (UNLIKELY (ret < 0)) return ret; + + ret = chdir ("/"); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "chdir to `/`"); + } + + ret = do_finalize_notify_socket (container, err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = finalize_mounts (container, err); + if (UNLIKELY (ret < 0)) + return ret; + + return 0; +} + +static int +maybe_open_tree_namespace (const char *rootfs, int *out_fd, libcrun_error_t *err) +{ + cleanup_close int rootfs_fd = -1; + int tree_fd; + + *out_fd = -1; + + rootfs_fd = open (rootfs, O_DIRECTORY | O_PATH | O_CLOEXEC); + if (UNLIKELY (rootfs_fd < 0)) + return crun_make_error (err, errno, "open `%s`", rootfs); + + tree_fd = syscall_open_tree (rootfs_fd, "", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC + | AT_EMPTY_PATH | AT_RECURSIVE); + if (tree_fd < 0) + { + if (errno == EINVAL || errno == ENOSYS || errno == EPERM) + return 0; + return crun_make_error (err, errno, "open_tree `%s`", rootfs); + } + + *out_fd = tree_fd; + return 0; +} + +static struct libcrun_fd_map * +get_fd_map (libcrun_container_t *container) +{ + struct libcrun_fd_map *mount_fds = get_private_data (container)->mount_fds; + + if (mount_fds == NULL) + { + runtime_spec_schema_config_schema *def = container->container_def; + mount_fds = make_libcrun_fd_map (def->mounts_len); + get_private_data (container)->mount_fds = mount_fds; + } + return mount_fds; +} + +static int +open_mount_of_type (libcrun_container_t *container, + runtime_spec_schema_defs_mount *mount, + libcrun_error_t *err) +{ + runtime_spec_schema_config_schema *def = container->container_def; + cleanup_free char *mnt_data = NULL; + unsigned long mnt_flags = 0, mnt_extra = 0; + uint64_t mnt_rec_clear = 0, mnt_rec_set = 0; + const char *label_type = NULL; + const char *label_val = NULL; + const char *fstype; + int mnt_fd; + size_t j; + + if (mount->type == NULL) + return -1; + + fstype = mount->type; + + for (j = 0; j < mount->options_len; j++) + mnt_flags |= get_mount_flags_or_option (mount->options[j], + mnt_flags, &mnt_extra, &mnt_data, + &mnt_rec_clear, &mnt_rec_set); + + if (def->linux && def->linux->mount_label) + { + bool is_sysfs_or_proc = strcmp (fstype, "sysfs") == 0 + || strcmp (fstype, "proc") == 0; + int label_how = get_mount_label_how (fstype, is_sysfs_or_proc); + if (label_how == LABEL_MOUNT) + { + label_type = get_selinux_context_type (container, err); + if (label_type == NULL) + crun_error_release (err); + else + label_val = def->linux->mount_label; + } + } + + if (strcmp (fstype, "cgroup") == 0) + { + int cgroup_mode = libcrun_get_cgroup_mode (err); + if (cgroup_mode == CGROUP_MODE_UNIFIED) + fstype = "cgroup2"; + crun_error_release (err); + } + + mnt_fd = fsopen_mount (fstype, mount->source, label_type, label_val, mnt_data); + if (mnt_fd < 0 && label_type) + mnt_fd = fsopen_mount (fstype, mount->source, NULL, NULL, mnt_data); + + return mnt_fd; +} + +static bool +can_use_open_tree_namespace (libcrun_container_t *container) +{ + runtime_spec_schema_config_schema *def = container->container_def; + struct libcrun_fd_map *mount_fds; + bool has_hooks = def->hooks + && (def->hooks->prestart_len || def->hooks->create_runtime_len); + bool has_userns = get_private_data (container)->unshare_flags & CLONE_NEWUSER; + size_t i; + + if (has_hooks || has_userns) + return false; + + mount_fds = get_fd_map (container); + for (i = 0; i < def->mounts_len; i++) + { + if (mount_fds->fds[i] < 0) + return false; + } + + return true; +} + +static int +setup_mount_namespace (libcrun_container_t *container, bool no_pivot, char **rootfs, libcrun_error_t *err) +{ + runtime_spec_schema_config_schema *def = container->container_def; + unsigned long rootfs_propagation = 0; + cleanup_close int tree_fd = -1; + libcrun_error_t tmp_err = NULL; + bool use_open_tree = false; + size_t i; + int ret; + + if (def->linux && def->linux->rootfs_propagation) + rootfs_propagation = get_mount_flags (def->linux->rootfs_propagation, 0, NULL, NULL, NULL, NULL); + + if ((rootfs_propagation & ALL_PROPAGATIONS_NO_REC) == 0) + rootfs_propagation = MS_REC | MS_PRIVATE; + + get_private_data (container)->rootfs_propagation = rootfs_propagation; + + ret = libcrun_container_enter_cgroup_ns (container, err); + if (UNLIKELY (ret < 0)) + return ret; + + /* Pre-create mounts and cache paths before pivot_root, + while the host file system is still reachable. */ + for (i = 0; i < def->mounts_len; i++) + { + struct libcrun_fd_map *mount_fds = get_fd_map (container); + cleanup_free char *dest_path = NULL; + cleanup_close int mnt_fd = -1; + bool recursive_bind, nofollow; + struct stat st; + + if (mount_fds && mount_fds->fds[i] >= 0) + continue; + + /* Pre-read symlink targets for copy-symlink mounts while the host file system is still reachable. */ + if (def->mounts[i]->source != NULL + && def->mounts[i]->source[0] == '/') + { + cleanup_free char *mnt_data = NULL; + unsigned long mnt_flags = 0; + unsigned long mnt_extra = 0; + uint64_t mnt_rec_clear = 0, mnt_rec_set = 0; + size_t j; + + for (j = 0; j < def->mounts[i]->options_len; j++) + mnt_flags |= get_mount_flags_or_option (def->mounts[i]->options[j], + mnt_flags, &mnt_extra, &mnt_data, + &mnt_rec_clear, &mnt_rec_set); + + if (mnt_extra & OPTION_COPY_SYMLINK) + { + cleanup_free char *target = NULL; + libcrun_error_t tmp_err = NULL; + ssize_t len; + + len = safe_readlinkat (AT_FDCWD, def->mounts[i]->source, &target, 0, &tmp_err); + if (UNLIKELY (len < 0)) + crun_error_release (&tmp_err); + + if (get_private_data (container)->copy_symlink_targets == NULL) + { + get_private_data (container)->copy_symlink_targets + = xmalloc0 (def->mounts_len * sizeof (char *)); + get_private_data (container)->n_copy_symlink_targets = def->mounts_len; + } + get_private_data (container)->copy_symlink_targets[i] = target; + target = NULL; + } + } + + if (is_bind_mount (def->mounts[i], &recursive_bind, &nofollow)) + { + if (mount_fds && def->mounts[i]->source != NULL) + { + libcrun_error_t tmp_err = NULL; + + mount_fds->fds[i] = get_bind_mount (AT_FDCWD, def->mounts[i]->source, + recursive_bind, false, nofollow, + MS_PRIVATE, &tmp_err); + if (mount_fds->fds[i] < 0) + crun_error_release (&tmp_err); + } + continue; + } + + mnt_fd = open_mount_of_type (container, def->mounts[i], err); + if (mnt_fd < 0) + { + crun_error_release (err); + continue; + } + + ret = append_paths (&dest_path, err, *rootfs, def->mounts[i]->destination, NULL); + if (UNLIKELY (ret < 0)) + return ret; + if (stat (dest_path, &st) == 0) + { + proc_fd_path_t fd_path; + int procfd = get_procfd (get_private_data (container), err); + if (UNLIKELY (procfd < 0)) + return procfd; + + get_self_fd_path (fd_path, mnt_fd); + ret = fchmodat (procfd, fd_path, st.st_mode & 07777, 0); + (void) ret; + } + + mount_fds->fds[i] = mnt_fd; + mnt_fd = -1; + } + + if (! get_private_data (container)->joined_mount_ns + && can_use_open_tree_namespace (container)) + use_open_tree = true; + + if (use_open_tree) + { + ret = maybe_open_tree_namespace (*rootfs, &tree_fd, err); + if (UNLIKELY (ret < 0)) + return ret; + } + + if (tree_fd >= 0) + { + ret = setns (tree_fd, CLONE_NEWNS); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "setns `CLONE_NEWNS`"); + + ret = mount (NULL, "/", NULL, MS_REMOUNT | MS_BIND, NULL); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "remount `/`"); + + ret = do_mount (container, NULL, -1, "/", NULL, MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err); + if (UNLIKELY (ret < 0)) + return ret; + + get_private_data (container)->needs_pivot = false; + get_private_data (container)->no_pivot = no_pivot; + free (*rootfs); + *rootfs = xstrdup ("/"); } else { - ret = chroot (rootfs); + if (! get_private_data (container)->joined_mount_ns) + { + ret = unshare (CLONE_NEWNS); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "unshare `CLONE_NEWNS`"); + } + + ret = do_mount (container, NULL, -1, "/", NULL, rootfs_propagation, NULL, LABEL_MOUNT, err); if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chroot to `%s`", rootfs); + return ret; + + ret = make_parent_mount_private (*rootfs, err); + if (UNLIKELY (ret < 0)) + return ret; + + ret = do_mount (container, *rootfs, -1, *rootfs, NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL, LABEL_MOUNT, err); + if (UNLIKELY (ret < 0)) + return ret; + + get_private_data (container)->needs_pivot = true; + get_private_data (container)->no_pivot = no_pivot; } - ret = chdir ("/"); + /* Mount everything before pivot_root while host paths are still reachable. + Use the pre-created fd when available, fall back to mount(). */ + ret = open (*rootfs, O_PATH | O_CLOEXEC); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "open `%s`", *rootfs); + get_private_data (container)->rootfsfd = ret; + get_private_data (container)->rootfs = *rootfs; + + if (! container->context->force_no_cgroup) + { + int cgroup_mode = libcrun_get_cgroup_mode (err); + if (UNLIKELY (cgroup_mode < 0)) + return cgroup_mode; + + if (cgroup_mode == CGROUP_MODE_UNIFIED) + { + char *unified_cgroup_path = NULL; + int procfd = get_procfd (get_private_data (container), err); + if (UNLIKELY (procfd < 0)) + return procfd; + + ret = libcrun_get_cgroup_process_at (procfd, &unified_cgroup_path, true, err); + if (UNLIKELY (ret < 0)) + return ret; + + get_private_data (container)->unified_cgroup_path = unified_cgroup_path; + } + } + + ret = do_mounts (container, *rootfs, err); if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "chdir to `/`"); + { + close_and_reset (&get_private_data (container)->rootfsfd); + get_private_data (container)->rootfs = NULL; + return ret; + } + + close_and_reset (&get_private_data (container)->rootfsfd); + get_private_data (container)->rootfs = NULL; + + /* Pre-open needed device fds for rootless containers. + The parent skips open_tree for rootless (EPERM), but the child + has CAP_SYS_ADMIN in its user namespace. */ + if (get_private_data (container)->needed_devs_fds) + { + struct libcrun_fd_map *dev_fds = get_private_data (container)->needed_devs_fds; + + for (i = 0; needed_devs[i].path; i++) + { + if (i < dev_fds->nfds && dev_fds->fds[i] >= 0) + continue; + + int fd = syscall_open_tree (AT_FDCWD, needed_devs[i].path, + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + if (fd >= 0 && i < dev_fds->nfds) + dev_fds->fds[i] = fd; + else if (fd >= 0) + close (fd); + } + } + + if (get_private_data (container)->notify_socket_tree_fd < 0 + && get_private_data (container)->host_notify_socket_path) + { + int fd = syscall_open_tree (AT_FDCWD, get_private_data (container)->host_notify_socket_path, + OPEN_TREE_CLONE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd >= 0) + get_private_data (container)->notify_socket_tree_fd = fd; + } + + get_old_root_fd (get_private_data (container)); + + /* After pivot_root, /proc is not yet mounted. */ + ret = get_procfd (get_private_data (container), &tmp_err); + if (ret < 0) + crun_error_release (&tmp_err); + + check_running_in_user_namespace (&tmp_err); + crun_error_release (&tmp_err); + + ret = get_shared_empty_dir_cached (container, NULL, &tmp_err); + if (ret < 0) + { + crun_error_release (&tmp_err); + get_private_data (container)->maskdir_bind_failed = true; + } + + return 0; +} + +int +libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, char **rootfs, libcrun_error_t *err) +{ + int ret; + + if (*rootfs == NULL) + return 0; + + if (get_private_data (container)->unshare_flags & CLONE_NEWNS) + { + ret = setup_mount_namespace (container, no_pivot, rootfs, err); + if (UNLIKELY (ret < 0)) + return ret; + + /* If setup_mount_namespace used OPEN_TREE_NAMESPACE, rootfs is + already set to "/". Otherwise pivot_root is deferred until + after the mounts are created. */ + if (strcmp (*rootfs, "/") == 0) + { + ret = chdir ("/"); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "chdir to `/`"); + } + } + else + { + ret = chroot (*rootfs); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "chroot to `%s`", *rootfs); + + ret = chdir ("/"); + if (UNLIKELY (ret < 0)) + return crun_make_error (err, errno, "chdir to `/`"); + + free (*rootfs); + *rootfs = xstrdup ("/"); + } return 0; } @@ -4293,18 +5017,19 @@ get_devices_fd_map (libcrun_container_t *container) return dev_fds; } +#define NUM_NEEDED_DEVS 6 + static struct libcrun_fd_map * -get_fd_map (libcrun_container_t *container) +get_needed_devs_fd_map (libcrun_container_t *container) { - struct libcrun_fd_map *mount_fds = get_private_data (container)->mount_fds; + struct libcrun_fd_map *fds = get_private_data (container)->needed_devs_fds; - if (mount_fds == NULL) + if (fds == NULL) { - runtime_spec_schema_config_schema *def = container->container_def; - mount_fds = make_libcrun_fd_map (def->mounts_len); - get_private_data (container)->mount_fds = mount_fds; + fds = make_libcrun_fd_map (NUM_NEEDED_DEVS); + get_private_data (container)->needed_devs_fds = fds; } - return mount_fds; + return fds; } bool @@ -4313,7 +5038,7 @@ is_bind_mount (runtime_spec_schema_defs_mount *mnt, bool *recursive, bool *src_n bool ret = false; size_t i; - if (src_nofollow == NULL) + if (src_nofollow != NULL) *src_nofollow = false; for (i = 0; i < mnt->options_len; i++) @@ -4368,23 +5093,23 @@ get_idmapped_option (runtime_spec_schema_defs_mount *mnt, bool *recursive) } static int -open_mount_of_type (runtime_spec_schema_defs_mount *mnt, int *out_fd, libcrun_error_t *err) +open_mount_type (const char *type, int *out_fd, libcrun_error_t *err) { cleanup_close int fsopen_fd = -1; cleanup_close int newfs_fd = -1; int ret; - fsopen_fd = syscall_fsopen (mnt->type, FSOPEN_CLOEXEC); + fsopen_fd = syscall_fsopen (type, FSOPEN_CLOEXEC); if (UNLIKELY (fsopen_fd < 0)) - return crun_make_error (err, errno, "fsopen `%s`", mnt->type); + return crun_make_error (err, errno, "fsopen `%s`", type); ret = syscall_fsconfig (fsopen_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); if (UNLIKELY (ret < 0)) - return crun_make_error (err, errno, "fsconfig create `%s`", mnt->type); + return crun_make_error (err, errno, "fsconfig create `%s`", type); newfs_fd = syscall_fsmount (fsopen_fd, FSMOUNT_CLOEXEC, 0); if (UNLIKELY (newfs_fd < 0)) - return crun_make_error (err, errno, "fsmount `%s`", mnt->type); + return crun_make_error (err, errno, "fsmount `%s`", type); *out_fd = get_and_reset (&newfs_fd); return 0; @@ -4452,7 +5177,7 @@ maybe_get_idmapped_mount (libcrun_container_t *container, runtime_spec_schema_co } else { - ret = open_mount_of_type (mnt, &newfs_fd, err); + ret = open_mount_type (mnt->type, &newfs_fd, err); if (UNLIKELY (ret < 0)) return ret; } @@ -4575,8 +5300,6 @@ prepare_and_send_mount_mounts (libcrun_container_t *container, pid_t pid, int sy mount_fds = make_libcrun_fd_map (def->mounts_len); - /* If the container is already running in a user namespace, apply the same logic as if a new - user namespace was created as part of the container itself. */ if (! has_userns) { int is_in_userns = check_running_in_user_namespace (err); @@ -4607,7 +5330,6 @@ prepare_and_send_mount_mounts (libcrun_container_t *container, pid_t pid, int sy if (propagation == 0) propagation = MS_PRIVATE; - /* If the bind mount failed, do not fail here, but attempt to create it from within the container. */ mount_fd = get_bind_mount (-1, def->mounts[i]->source, recursive, false, nofollow, propagation, err); if (UNLIKELY (mount_fd < 0)) crun_error_release (err); @@ -4692,7 +5414,7 @@ prepare_and_send_dev_mounts (libcrun_container_t *container, int sync_socket_hos } } - devs_mountfd = fsopen_mount ("tmpfs", context_type, label); + devs_mountfd = fsopen_mount ("tmpfs", NULL, context_type, label, NULL); if (UNLIKELY (devs_mountfd < 0)) { ret = crun_make_error (err, errno, "fsopen_mount `tmpfs`"); @@ -4752,6 +5474,33 @@ prepare_and_send_dev_mounts (libcrun_container_t *container, int sync_socket_hos return ret; } +static int +prepare_and_send_needed_dev_mounts (libcrun_container_t *container, int sync_socket_host, libcrun_error_t *err) +{ + cleanup_close_map struct libcrun_fd_map *fds = NULL; + bool has_userns = (get_private_data (container)->unshare_flags & CLONE_NEWUSER) ? true : false; + size_t how_many = 0; + size_t i; + + fds = make_libcrun_fd_map (NUM_NEEDED_DEVS); + + if (! has_userns || geteuid () > 0) + return send_mounts (sync_socket_host, fds, 0, NUM_NEEDED_DEVS, err); + + for (i = 0; i < NUM_NEEDED_DEVS && needed_devs[i].path; i++) + { + int fd = syscall_open_tree (AT_FDCWD, needed_devs[i].path, + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + if (fd >= 0) + { + fds->fds[i] = fd; + how_many++; + } + } + + return send_mounts (sync_socket_host, fds, how_many, NUM_NEEDED_DEVS, err); +} + static int prepare_and_send_mounts (libcrun_container_t *container, pid_t pid, int sync_socket_host, libcrun_error_t *err) { @@ -4769,6 +5518,10 @@ prepare_and_send_mounts (libcrun_container_t *container, pid_t pid, int sync_soc if (UNLIKELY (ret < 0)) return ret; + ret = prepare_and_send_needed_dev_mounts (container, sync_socket_host, err); + if (UNLIKELY (ret < 0)) + return ret; + return 0; } @@ -4933,9 +5686,9 @@ init_container (libcrun_container_t *container, int sync_socket_container, struc An error will be generated later if it is not possible to join the namespace. */ if (init_status->join_pidns && strcmp (def->mounts[i]->type, "proc") == 0) - fd = fsopen_mount (def->mounts[i]->type, NULL, NULL); + fd = fsopen_mount (def->mounts[i]->type, def->mounts[i]->source, NULL, NULL, NULL); if (init_status->join_ipcns && strcmp (def->mounts[i]->type, "mqueue") == 0) - fd = fsopen_mount (def->mounts[i]->type, NULL, NULL); + fd = fsopen_mount (def->mounts[i]->type, def->mounts[i]->source, NULL, NULL, NULL); if (fd >= 0) { @@ -4987,10 +5740,10 @@ init_container (libcrun_container_t *container, int sync_socket_container, struc if (UNLIKELY (ret < 0)) return ret; - if (init_status->namespaces_to_unshare & ~CLONE_NEWCGROUP) + if (init_status->namespaces_to_unshare & ~(CLONE_NEWCGROUP | CLONE_NEWNS)) { /* New namespaces to create for the container. */ - ret = unshare (init_status->namespaces_to_unshare & ~CLONE_NEWCGROUP); + ret = unshare (init_status->namespaces_to_unshare & ~(CLONE_NEWCGROUP | CLONE_NEWNS)); if (UNLIKELY (ret < 0)) return crun_make_error (err, errno, "unshare"); } @@ -5072,6 +5825,10 @@ init_container (libcrun_container_t *container, int sync_socket_container, struc if (UNLIKELY (ret < 0)) return ret; + ret = receive_mounts (get_needed_devs_fd_map (container), sync_socket_container, err); + if (UNLIKELY (ret < 0)) + return ret; + ret = libcrun_container_setgroups (container, container->container_def->process, err); if (UNLIKELY (ret < 0)) return ret; @@ -5132,6 +5889,9 @@ libcrun_run_linux_container (libcrun_container_t *container, container_entrypoin return ret; get_private_data (container)->unshare_flags = init_status.all_namespaces; + get_private_data (container)->joined_mount_ns + = (init_status.all_namespaces & CLONE_NEWNS) + && ! (init_status.namespaces_to_unshare & CLONE_NEWNS); /* cgroup will be unshared later. Once the process is in the correct cgroup. */ init_status.all_namespaces &= ~CLONE_NEWCGROUP; get_private_data (container)->unshare_cgroupns = init_status.namespaces_to_unshare & CLONE_NEWCGROUP; @@ -5146,7 +5906,7 @@ libcrun_run_linux_container (libcrun_container_t *container, container_entrypoin #ifdef HAVE_SYSTEMD if (def->root) { - ret = do_notify_socket (container, def->root->path, err); + ret = do_notify_socket (container, err); if (UNLIKELY (ret < 0)) return ret; } @@ -5220,7 +5980,7 @@ libcrun_run_linux_container (libcrun_container_t *container, container_entrypoin else if ((init_status.all_namespaces & CLONE_NEWUSER) == 0) { /* If it doesn't create a user namespace or need to join one, create the new requested namespaces now. */ - first_clone_args = init_status.namespaces_to_unshare & ~(CLONE_NEWTIME | CLONE_NEWCGROUP); + first_clone_args = init_status.namespaces_to_unshare & ~(CLONE_NEWTIME | CLONE_NEWCGROUP | CLONE_NEWNS); } init_status.namespaces_to_unshare &= ~first_clone_args; @@ -6454,7 +7214,7 @@ libcrun_make_runtime_mounts (libcrun_container_t *container, libcrun_container_s } else { - ret = open_mount_of_type (mounts[i], &(fds->fds[i]), err); + ret = open_mount_type (mounts[i]->type, &(fds->fds[i]), err); if (UNLIKELY (ret < 0)) return ret; } diff --git a/src/libcrun/linux.h b/src/libcrun/linux.h index 8dc1abc4fe..4cee3037b6 100644 --- a/src/libcrun/linux.h +++ b/src/libcrun/linux.h @@ -68,7 +68,7 @@ int libcrun_set_mounts (struct container_entrypoint_s *args, libcrun_container_t int libcrun_finalize_mounts (struct container_entrypoint_s *entrypoint_args, libcrun_container_t *container, const char *rootfs, libcrun_error_t *err); int libcrun_init_caps (libcrun_container_t *container, libcrun_error_t *err); -int libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, const char *rootfs, libcrun_error_t *err); +int libcrun_do_pivot_root (libcrun_container_t *container, bool no_pivot, char **rootfs, libcrun_error_t *err); int libcrun_reopen_dev_null (libcrun_error_t *err); int libcrun_set_usernamespace (libcrun_container_t *container, pid_t pid, libcrun_error_t *err); int libcrun_set_caps (runtime_spec_schema_config_schema_process_capabilities *capabilities, uid_t uid, gid_t gid, diff --git a/src/libcrun/syscalls.h b/src/libcrun/syscalls.h index 9c90ef8606..35c5368afd 100644 --- a/src/libcrun/syscalls.h +++ b/src/libcrun/syscalls.h @@ -54,10 +54,42 @@ # define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ #endif +#ifndef MOUNT_ATTR_NOSUID +# define MOUNT_ATTR_NOSUID 0x00000002 +#endif + +#ifndef MOUNT_ATTR_NODEV +# define MOUNT_ATTR_NODEV 0x00000004 +#endif + +#ifndef MOUNT_ATTR_NOEXEC +# define MOUNT_ATTR_NOEXEC 0x00000008 +#endif + +#ifndef MOUNT_ATTR__ATIME +# define MOUNT_ATTR__ATIME 0x00000070 +#endif + +#ifndef MOUNT_ATTR_NOATIME +# define MOUNT_ATTR_NOATIME 0x00000010 +#endif + +#ifndef MOUNT_ATTR_STRICTATIME +# define MOUNT_ATTR_STRICTATIME 0x00000020 +#endif + +#ifndef MOUNT_ATTR_NODIRATIME +# define MOUNT_ATTR_NODIRATIME 0x00000080 +#endif + #ifndef MOUNT_ATTR_IDMAP # define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */ #endif +#ifndef MOUNT_ATTR_NOSYMFOLLOW +# define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 +#endif + /* close_range flags */ #ifndef CLOSE_RANGE_CLOEXEC # define CLOSE_RANGE_CLOEXEC (1U << 2) diff --git a/src/libcrun/utils.c b/src/libcrun/utils.c index 34a9364e4e..aa9c6aa6ce 100644 --- a/src/libcrun/utils.c +++ b/src/libcrun/utils.c @@ -385,9 +385,16 @@ safe_openat (int dirfd, const char *rootfs, const char *path, int flags, int mod if (UNLIKELY (fd < 0)) return crun_make_error (err, errno, "open `%s`", rootfs); - ret = check_fd_is_path (rootfs, fd, path, err); - if (UNLIKELY (ret < 0)) - return ret; + /* Skip the readlink-based check when opening the root + directory itself (rootfs="/", path=""). open("/") can + only return the root, and after setns the /proc-based + readlink may not be reachable by path yet. */ + if (rootfs[0] != '/' || rootfs[1] != '\0') + { + ret = check_fd_is_path (rootfs, fd, path, err); + if (UNLIKELY (ret < 0)) + return ret; + } ret = fd; fd = -1; diff --git a/src/libcrun/utils.h b/src/libcrun/utils.h index c916378da1..67485aa9d6 100644 --- a/src/libcrun/utils.h +++ b/src/libcrun/utils.h @@ -502,6 +502,12 @@ get_proc_self_fd_path (proc_fd_path_t path, int fd) get_proc_fd_path (path, 0, fd); } +static inline void +get_self_fd_path (proc_fd_path_t path, int fd) +{ + snprintf (path, sizeof (proc_fd_path_t), "self/fd/%d", fd); +} + static inline int validate_options (unsigned int specified_options, unsigned int supported_options, libcrun_error_t *err) { diff --git a/tests/init.c b/tests/init.c index 4eb85b8dbe..a00017d552 100644 --- a/tests/init.c +++ b/tests/init.c @@ -679,6 +679,32 @@ main (int argc, char **argv) return 0; } + if (strcmp (argv[1], "nlink") == 0) + { + struct stat st; + + if (argc < 3) + error (EXIT_FAILURE, 0, "'nlink' requires a path argument"); + if (stat (argv[2], &st) < 0) + error (EXIT_FAILURE, errno, "stat %s", argv[2]); + + printf ("%lu\n", (unsigned long) st.st_nlink); + return 0; + } + + if (strcmp (argv[1], "isdir") == 0) + { + struct stat st; + + if (argc < 3) + error (EXIT_FAILURE, 0, "'isdir' requires a path argument"); + if (stat (argv[2], &st) < 0) + error (EXIT_FAILURE, errno, "stat %s", argv[2]); + + printf ("%d\n", S_ISDIR (st.st_mode) ? 1 : 0); + return 0; + } + if (strcmp (argv[1], "id") == 0) { int ret; diff --git a/tests/test_mounts.py b/tests/test_mounts.py index 2b08b6e28d..c80b26580f 100755 --- a/tests/test_mounts.py +++ b/tests/test_mounts.py @@ -153,17 +153,21 @@ def test_mount_bind_to_rootfs(): return 0 def test_mount_tmpfs_to_rootfs(): + # tmpcopyup on "/" is rejected: after pivot_root is moved before mounts, + # there is no original rootfs content to copy from. conf = base_config() conf['process']['args'] = ['/init', 'true'] add_all_namespaces(conf) - tmpdir = tempfile.mkdtemp() mounts = [ {"destination": "/", "type": "tmpfs", "source": "tmpfs", "options": ["tmpcopyup"]}, ] conf['mounts'] = mounts + conf['mounts'] - _, _ = run_and_get_output(conf, hide_stderr=True) - return 0 + try: + _, _ = run_and_get_output(conf, hide_stderr=True) + return -1 + except Exception as e: + return 0 def test_ro_cgroup(): for cgroupns in [True, False]: @@ -1066,6 +1070,107 @@ def test_mount_propagation_slave(): logger.info("mountinfo output: %s", out) return -1 +def test_mount_tmpfs_size(): + """Verify tmpfs mounts with size= data option work via fsopen_mount.""" + conf = base_config() + conf['process']['args'] = ['/init', 'cat', '/proc/self/mountinfo'] + add_all_namespaces(conf) + conf['mounts'].append({ + "destination": "/sized-tmpfs", + "type": "tmpfs", + "source": "tmpfs", + "options": ["nosuid", "nodev", "size=64k"], + }) + out, _ = run_and_get_output(conf, hide_stderr=True) + with tempfile.NamedTemporaryFile(mode='w', delete=True) as f: + f.write(out) + f.flush() + t = libmount.Table(f.name) + m = t.find_target('/sized-tmpfs') + if m is None: + logger.info("tmpfs size test: /sized-tmpfs not found in mountinfo") + return -1 + if m.fs_options is None or 'size=64k' not in m.fs_options: + logger.info("tmpfs size test: expected 'size=64k' in fs_options, got: %s", m.fs_options) + return -1 + return 0 + +def test_mount_tmpfs_size_userns(): + """Verify tmpfs with data options works inside a user namespace. + + This exercises the fsopen_mount pre-mount path: before pivot_root, + crun uses fsopen/fsconfig/fsmount to create a detached tmpfs with + the specified data, then moves it into place via move_mount. + """ + conf = base_config() + conf['process']['args'] = ['/init', 'cat', '/proc/self/mountinfo'] + add_all_namespaces(conf, userns=True) + conf['mounts'].append({ + "destination": "/sized-tmpfs", + "type": "tmpfs", + "source": "tmpfs", + "options": ["nosuid", "nodev", "mode=1777", "size=128k"], + }) + out, _ = run_and_get_output(conf, hide_stderr=True) + with tempfile.NamedTemporaryFile(mode='w', delete=True) as f: + f.write(out) + f.flush() + t = libmount.Table(f.name) + m = t.find_target('/sized-tmpfs') + if m is None: + logger.info("tmpfs userns test: /sized-tmpfs not found in mountinfo") + return -1 + if m.fs_options is None or 'size=128k' not in m.fs_options: + logger.info("tmpfs userns test: expected 'size=128k' in fs_options, got: %s", m.fs_options) + return -1 + return 0 + +def test_mount_overlay_fs(): + """Verify overlay mounts with lowerdir/upperdir/workdir work via fsopen_mount. + + Creates overlay directories on the host and mounts them into the + container. The premount loop in setup_mount_namespace uses + fsopen/fsconfig/fsmount to create a detached overlay before + pivot_root (while host paths are still reachable), then moves + the mount into place. + """ + if is_rootless(): + return (77, "requires root privileges") + + overlay_base = tempfile.mkdtemp(prefix="crun-overlay-test-") + lower = os.path.join(overlay_base, "lower") + upper = os.path.join(overlay_base, "upper") + work = os.path.join(overlay_base, "work") + try: + for d in [lower, upper, work]: + os.makedirs(d, exist_ok=True) + with open(os.path.join(lower, "from-lower"), "w") as f: + f.write("lower-data") + + def prepare(rootfs): + os.makedirs(os.path.join(rootfs, "overlay-mnt"), exist_ok=True) + + conf = base_config() + conf['process']['args'] = ['/init', 'cat', '/overlay-mnt/from-lower'] + add_all_namespaces(conf) + conf['mounts'].append({ + "destination": "/overlay-mnt", + "type": "overlay", + "source": "overlay", + "options": [ + "lowerdir=%s" % lower, + "upperdir=%s" % upper, + "workdir=%s" % work, + ], + }) + out, _ = run_and_get_output(conf, hide_stderr=True, callback_prepare_rootfs=prepare) + if "lower-data" not in out: + logger.info("overlay test: expected 'lower-data', got: %s", out) + return -1 + finally: + shutil.rmtree(overlay_base, ignore_errors=True) + return 0 + all_tests = { "mount-ro" : test_mount_ro, "mount-rro" : test_mount_rro, @@ -1106,6 +1211,9 @@ def test_mount_propagation_slave(): "mount-propagation-private": test_mount_propagation_private, "mount-no-leak-to-host": test_mount_no_leak_to_host, "mount-propagation-slave": test_mount_propagation_slave, + "mount-tmpfs-size": test_mount_tmpfs_size, + "mount-tmpfs-size-userns": test_mount_tmpfs_size_userns, + "mount-overlay-fs": test_mount_overlay_fs, } if __name__ == "__main__": diff --git a/tests/test_paths.py b/tests/test_paths.py index 1edc2b04dc..7a6e4fdcf5 100755 --- a/tests/test_paths.py +++ b/tests/test_paths.py @@ -15,6 +15,7 @@ # You should have received a copy of the GNU General Public License # along with crun. If not, see . +import os from tests_utils import * def test_readonly_paths(): @@ -39,10 +40,35 @@ def test_masked_paths(): if len(out) > 0: return -1 return 0 - + +def test_masked_paths_no_run_crun(): + def prepare(rootfs): + os.makedirs(os.path.join(rootfs, "run"), exist_ok=True) + conf = base_config() + conf['process']['args'] = ['/init', 'ls', '/run'] + conf['linux']['maskedPaths'] = ['/proc/acpi', '/proc/kcore'] + add_all_namespaces(conf) + out, _ = run_and_get_output(conf, hide_stderr=True, callback_prepare_rootfs=prepare) + if 'crun' in out: + return -1 + return 0 + +def test_masked_dir_nlink(): + conf = base_config() + conf['process']['args'] = ['/init', 'isdir', '/proc/acpi'] + conf['linux']['maskedPaths'] = ['/proc/acpi'] + add_all_namespaces(conf) + out, _ = run_and_get_output(conf, hide_stderr=True) + is_dir = int(out.strip()) + if is_dir != 1: + return -1 + return 0 + all_tests = { "readonly-paths" : test_readonly_paths, "masked-paths" : test_masked_paths, + "masked-paths-no-run-crun" : test_masked_paths_no_run_crun, + "masked-dir-nlink" : test_masked_dir_nlink, } if __name__ == "__main__":