diff options
-rw-r--r-- | fs/fhandle.c | 115 | ||||
-rw-r--r-- | fs/libfs.c | 1 | ||||
-rw-r--r-- | fs/namespace.c | 10 | ||||
-rw-r--r-- | fs/pidfs.c | 298 | ||||
-rw-r--r-- | include/linux/exportfs.h | 20 | ||||
-rw-r--r-- | include/linux/pid.h | 2 | ||||
-rw-r--r-- | include/linux/pidfs.h | 3 | ||||
-rw-r--r-- | include/linux/pseudo_fs.h | 1 | ||||
-rw-r--r-- | kernel/pid.c | 14 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/.gitignore | 2 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/Makefile | 3 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/pidfd.h | 39 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/pidfd_bind_mount.c | 188 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/pidfd_file_handle_test.c | 503 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/pidfd_setns_test.c | 47 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/pidfd_wait.c | 47 |
16 files changed, 1110 insertions, 183 deletions
diff --git a/fs/fhandle.c b/fs/fhandle.c index ec9145047dfc..3e092ae6d142 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -187,17 +187,6 @@ static int get_path_from_fd(int fd, struct path *root) return 0; } -enum handle_to_path_flags { - HANDLE_CHECK_PERMS = (1 << 0), - HANDLE_CHECK_SUBTREE = (1 << 1), -}; - -struct handle_to_path_ctx { - struct path root; - enum handle_to_path_flags flags; - unsigned int fh_flags; -}; - static int vfs_dentry_acceptable(void *context, struct dentry *dentry) { struct handle_to_path_ctx *ctx = context; @@ -261,50 +250,55 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path, { int handle_dwords; struct vfsmount *mnt = ctx->root.mnt; + struct dentry *dentry; /* change the handle size to multiple of sizeof(u32) */ handle_dwords = handle->handle_bytes >> 2; - path->dentry = exportfs_decode_fh_raw(mnt, - (struct fid *)handle->f_handle, - handle_dwords, handle->handle_type, - ctx->fh_flags, - vfs_dentry_acceptable, ctx); - if (IS_ERR_OR_NULL(path->dentry)) { - if (path->dentry == ERR_PTR(-ENOMEM)) + dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + ctx->fh_flags, vfs_dentry_acceptable, + ctx); + if (IS_ERR_OR_NULL(dentry)) { + if (dentry == ERR_PTR(-ENOMEM)) return -ENOMEM; return -ESTALE; } + path->dentry = dentry; path->mnt = mntget(mnt); return 0; } -/* - * Allow relaxed permissions of file handles if the caller has the - * ability to mount the filesystem or create a bind-mount of the - * provided @mountdirfd. - * - * In both cases the caller may be able to get an unobstructed way to - * the encoded file handle. If the caller is only able to create a - * bind-mount we need to verify that there are no locked mounts on top - * of it that could prevent us from getting to the encoded file. 
- * - * In principle, locked mounts can prevent the caller from mounting the - * filesystem but that only applies to procfs and sysfs neither of which - * support decoding file handles. - */ -static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, - unsigned int o_flags) +static inline int may_decode_fh(struct handle_to_path_ctx *ctx, + unsigned int o_flags) { struct path *root = &ctx->root; + if (capable(CAP_DAC_READ_SEARCH)) + return 0; + /* - * Restrict to O_DIRECTORY to provide a deterministic API that avoids a - * confusing api in the face of disconnected non-dir dentries. + * Allow relaxed permissions of file handles if the caller has + * the ability to mount the filesystem or create a bind-mount of + * the provided @mountdirfd. + * + * In both cases the caller may be able to get an unobstructed + * way to the encoded file handle. If the caller is only able to + * create a bind-mount we need to verify that there are no + * locked mounts on top of it that could prevent us from getting + * to the encoded file. + * + * In principle, locked mounts can prevent the caller from + * mounting the filesystem but that only applies to procfs and + * sysfs neither of which support decoding file handles. + * + * Restrict to O_DIRECTORY to provide a deterministic API that + * avoids a confusing api in the face of disconnected non-dir + * dentries. * * There's only one dentry for each directory inode (VFS rule)... */ if (!(o_flags & O_DIRECTORY)) - return false; + return -EPERM; if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) ctx->flags = HANDLE_CHECK_PERMS; @@ -314,14 +308,14 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx, !has_locked_children(real_mount(root->mnt), root->dentry)) ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE; else - return false; + return -EPERM; /* Are we able to override DAC permissions? 
*/ if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH)) - return false; + return -EPERM; ctx->fh_flags = EXPORT_FH_DIR_ONLY; - return true; + return 0; } static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, @@ -331,15 +325,19 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, struct file_handle f_handle; struct file_handle *handle = NULL; struct handle_to_path_ctx ctx = {}; + const struct export_operations *eops; retval = get_path_from_fd(mountdirfd, &ctx.root); if (retval) goto out_err; - if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) { - retval = -EPERM; + eops = ctx.root.mnt->mnt_sb->s_export_op; + if (eops && eops->permission) + retval = eops->permission(&ctx, o_flags); + else + retval = may_decode_fh(&ctx, o_flags); + if (retval) goto out_path; - } if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { retval = -EFAULT; @@ -398,29 +396,28 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag) { long retval = 0; - struct path path; + struct path path __free(path_put) = {}; struct file *file; - int fd; + const struct export_operations *eops; retval = handle_to_path(mountdirfd, ufh, &path, open_flag); if (retval) return retval; - fd = get_unused_fd_flags(open_flag); - if (fd < 0) { - path_put(&path); + CLASS(get_unused_fd, fd)(O_CLOEXEC); + if (fd < 0) return fd; - } - file = file_open_root(&path, "", open_flag, 0); - if (IS_ERR(file)) { - put_unused_fd(fd); - retval = PTR_ERR(file); - } else { - retval = fd; - fd_install(fd, file); - } - path_put(&path); - return retval; + + eops = path.mnt->mnt_sb->s_export_op; + if (eops->open) + file = eops->open(&path, open_flag); + else + file = file_open_root(&path, "", open_flag, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + fd_install(fd, file); + return take_fd(fd); } /** diff --git a/fs/libfs.c b/fs/libfs.c index 748ac5923154..2890a9c4a414 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -673,6 
+673,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = ctx->magic; s->s_op = ctx->ops ?: &simple_super_operations; + s->s_export_op = ctx->eops; s->s_xattr = ctx->xattr; s->s_time_gran = 1; root = new_inode(s); diff --git a/fs/namespace.c b/fs/namespace.c index 851af89e8d72..64deda6f5b2c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,6 +32,7 @@ #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> +#include <linux/pidfs.h> #include <linux/nospec.h> #include "pnode.h" @@ -2736,8 +2737,13 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) if (IS_MNT_UNBINDABLE(old)) return mnt; - if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) - return mnt; + if (!check_mnt(old)) { + const struct dentry_operations *d_op = old_path->dentry->d_op; + + if (d_op != &ns_dentry_operations && + d_op != &pidfs_dentry_operations) + return mnt; + } if (!recurse && has_locked_children(old, old_path->dentry)) return mnt; diff --git a/fs/pidfs.c b/fs/pidfs.c index 618abb1fa1b8..049352f973de 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/anon_inodes.h> +#include <linux/exportfs.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/cgroup.h> @@ -23,6 +24,97 @@ #include "internal.h" #include "mount.h" +static struct rb_root pidfs_ino_tree = RB_ROOT; + +#if BITS_PER_LONG == 32 +static inline unsigned long pidfs_ino(u64 ino) +{ + return lower_32_bits(ino); +} + +/* On 32 bit the generation number is the upper 32 bits. */ +static inline u32 pidfs_gen(u64 ino) +{ + return upper_32_bits(ino); +} + +#else + +/* On 64 bit simply return ino. */ +static inline unsigned long pidfs_ino(u64 ino) +{ + return ino; +} + +/* On 64 bit the generation number is 0. 
*/ +static inline u32 pidfs_gen(u64 ino) +{ + return 0; +} +#endif + +static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct pid *pid_a = rb_entry(a, struct pid, pidfs_node); + struct pid *pid_b = rb_entry(b, struct pid, pidfs_node); + u64 pid_ino_a = pid_a->ino; + u64 pid_ino_b = pid_b->ino; + + if (pid_ino_a < pid_ino_b) + return -1; + if (pid_ino_a > pid_ino_b) + return 1; + return 0; +} + +void pidfs_add_pid(struct pid *pid) +{ + static u64 pidfs_ino_nr = 2; + + /* + * On 64 bit nothing special happens. The 64bit number assigned + * to struct pid is the inode number. + * + * On 32 bit the 64 bit number assigned to struct pid is split + * into two 32 bit numbers. The lower 32 bits are used as the + * inode number and the upper 32 bits are used as the inode + * generation number. + * + * On 32 bit pidfs_ino() will return the lower 32 bit. When + * pidfs_ino() returns zero a wrap around happened. When a + * wraparound happens the 64 bit number will be incremented by 2 + * so inode numbering starts at 2 again. + * + * On 64 bit comparing two pidfds is as simple as comparing + * inode numbers. + * + * When a wraparound happens on 32 bit multiple pidfds with the + * same inode number are likely to exist (This isn't a problem + * since before pidfs pidfds used the anonymous inode meaning + * all pidfds had the same inode number.). Userspace can + * reconstruct the 64 bit identifier by retrieving both the + * inode number and the inode generation number to compare or + * use file handles. 
+ */ + if (pidfs_ino(pidfs_ino_nr) == 0) + pidfs_ino_nr += 2; + + pid->ino = pidfs_ino_nr; + pid->stashed = NULL; + pidfs_ino_nr++; + + write_seqcount_begin(&pidmap_lock_seq); + rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp); + write_seqcount_end(&pidmap_lock_seq); +} + +void pidfs_remove_pid(struct pid *pid) +{ + write_seqcount_begin(&pidmap_lock_seq); + rb_erase(&pid->pidfs_node, &pidfs_ino_tree); + write_seqcount_end(&pidmap_lock_seq); +} + #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd @@ -190,6 +282,27 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long return 0; } +static bool pidfs_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case FS_IOC_GETVERSION: + case PIDFD_GET_CGROUP_NAMESPACE: + case PIDFD_GET_INFO: + case PIDFD_GET_IPC_NAMESPACE: + case PIDFD_GET_MNT_NAMESPACE: + case PIDFD_GET_NET_NAMESPACE: + case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: + case PIDFD_GET_TIME_NAMESPACE: + case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: + case PIDFD_GET_UTS_NAMESPACE: + case PIDFD_GET_USER_NAMESPACE: + case PIDFD_GET_PID_NAMESPACE: + return true; + } + + return false; +} + static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; @@ -198,6 +311,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; + if (!pidfs_ioctl_valid(cmd)) + return -ENOIOCTLCMD; + + if (cmd == FS_IOC_GETVERSION) { + if (!arg) + return -EINVAL; + + __u32 __user *argp = (__u32 __user *)arg; + return put_user(file_inode(file)->i_generation, argp); + } + task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; @@ -318,40 +442,6 @@ struct pid *pidfd_pid(const struct file *file) static struct vfsmount *pidfs_mnt __ro_after_init; -#if BITS_PER_LONG == 32 -/* - * Provide a fallback mechanism for 32-bit systems so processes remain - * reliably 
comparable by inode number even on those systems. - */ -static DEFINE_IDA(pidfd_inum_ida); - -static int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - int ret; - - ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, - UINT_MAX, GFP_ATOMIC); - if (ret < 0) - return -ENOSPC; - - *ino = ret; - return 0; -} - -static inline void pidfs_free_inum(unsigned long ino) -{ - if (ino > 0) - ida_free(&pidfd_inum_ida, ino); -} -#else -static inline int pidfs_inum(struct pid *pid, unsigned long *ino) -{ - *ino = pid->ino; - return 0; -} -#define pidfs_free_inum(ino) ((void)(ino)) -#endif - /* * The vfs falls back to simple_setattr() if i_op->setattr() isn't * implemented. Let's reject it completely until we have a clean @@ -403,7 +493,6 @@ static void pidfs_evict_inode(struct inode *inode) clear_inode(inode); put_pid(pid); - pidfs_free_inum(inode->i_ino); } static const struct super_operations pidfs_sops = { @@ -421,25 +510,149 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); } -static const struct dentry_operations pidfs_dentry_operations = { +const struct dentry_operations pidfs_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = pidfs_dname, .d_prune = stashed_dentry_prune, }; +static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + const struct pid *pid = inode->i_private; + + if (*max_len < 2) { + *max_len = 2; + return FILEID_INVALID; + } + + *max_len = 2; + *(u64 *)fh = pid->ino; + return FILEID_KERNFS; +} + +static int pidfs_ino_find(const void *key, const struct rb_node *node) +{ + const u64 pid_ino = *(u64 *)key; + const struct pid *pid = rb_entry(node, struct pid, pidfs_node); + + if (pid_ino < pid->ino) + return -1; + if (pid_ino > pid->ino) + return 1; + return 0; +} + +/* Find a struct pid based on the inode number. 
*/ +static struct pid *pidfs_ino_get_pid(u64 ino) +{ + struct pid *pid; + struct rb_node *node; + unsigned int seq; + + guard(rcu)(); + do { + seq = read_seqcount_begin(&pidmap_lock_seq); + node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find); + if (node) + break; + } while (read_seqcount_retry(&pidmap_lock_seq, seq)); + + if (!node) + return NULL; + + pid = rb_entry(node, struct pid, pidfs_node); + + /* Within our pid namespace hierarchy? */ + if (pid_vnr(pid) == 0) + return NULL; + + return get_pid(pid); +} + +static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) +{ + int ret; + u64 pid_ino; + struct path path; + struct pid *pid; + + if (fh_len < 2) + return NULL; + + switch (fh_type) { + case FILEID_KERNFS: + pid_ino = *(u64 *)fid; + break; + default: + return NULL; + } + + pid = pidfs_ino_get_pid(pid_ino); + if (!pid) + return NULL; + + ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); + if (ret < 0) + return ERR_PTR(ret); + + mntput(path.mnt); + return path.dentry; +} + +/* + * Make sure that we reject any nonsensical flags that users pass via + * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and + * PIDFD_NONBLOCK as O_NONBLOCK. + */ +#define VALID_FILE_HANDLE_OPEN_FLAGS \ + (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) + +static int pidfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) + return -EINVAL; + + /* + * pidfs_ino_get_pid() will verify that the struct pid is part + * of the caller's pid namespace hierarchy. No further + * permission checks are needed. + */ + return 0; +} + +static struct file *pidfs_export_open(struct path *path, unsigned int oflags) +{ + /* + * Clear O_LARGEFILE as open_by_handle_at() forces it and raise + * O_RDWR as pidfds always are. 
+ */ + oflags &= ~O_LARGEFILE; + return dentry_open(path, oflags | O_RDWR, current_cred()); +} + +static const struct export_operations pidfs_export_operations = { + .encode_fh = pidfs_encode_fh, + .fh_to_dentry = pidfs_fh_to_dentry, + .open = pidfs_export_open, + .permission = pidfs_export_permission, +}; + static int pidfs_init_inode(struct inode *inode, void *data) { + const struct pid *pid = data; + inode->i_private = data; inode->i_flags |= S_PRIVATE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; - /* - * Inode numbering for pidfs start at RESERVED_PIDS + 1. This - * avoids collisions with the root inode which is 1 for pseudo - * filesystems. - */ - return pidfs_inum(data, &inode->i_ino); + inode->i_ino = pidfs_ino(pid->ino); + inode->i_generation = pidfs_gen(pid->ino); + return 0; } static void pidfs_put_data(void *data) @@ -462,6 +675,7 @@ static int pidfs_init_fs_context(struct fs_context *fc) return -ENOMEM; ctx->ops = &pidfs_sops; + ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; fc->s_fs_info = (void *)&pidfs_stashed_ops; return 0; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 4cc8801e50e3..a087606ace19 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -3,6 +3,7 @@ #define LINUX_EXPORTFS_H 1 #include <linux/types.h> +#include <linux/path.h> struct dentry; struct iattr; @@ -156,6 +157,17 @@ struct fid { }; }; +enum handle_to_path_flags { + HANDLE_CHECK_PERMS = (1 << 0), + HANDLE_CHECK_SUBTREE = (1 << 1), +}; + +struct handle_to_path_ctx { + struct path root; + enum handle_to_path_flags flags; + unsigned int fh_flags; +}; + #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ #define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ @@ -225,6 +237,12 @@ struct fid { * is also a directory. 
In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * + * permission: + * Allow filesystems to specify a custom permission function. + * + * open: + * Allow filesystems to specify a custom open function. + * * commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * @@ -251,6 +269,8 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); + int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); + struct file * (*open)(struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ diff --git a/include/linux/pid.h b/include/linux/pid.h index a3aad9b4074c..fe575fcdb4af 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -59,6 +59,7 @@ struct pid spinlock_t lock; struct dentry *stashed; u64 ino; + struct rb_node pidfs_node; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; @@ -68,6 +69,7 @@ struct pid struct upid numbers[]; }; +extern seqcount_spinlock_t pidmap_lock_seq; extern struct pid init_struct_pid; struct file; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 75bdf9807802..7c830d0dec9a 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,5 +4,8 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); +void pidfs_add_pid(struct pid *pid); +void pidfs_remove_pid(struct pid *pid); +extern const struct dentry_operations pidfs_dentry_operations; #endif /* _LINUX_PID_FS_H */ diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h index 730f77381d55..2503f7625d65 100644 --- a/include/linux/pseudo_fs.h +++ b/include/linux/pseudo_fs.h 
@@ -5,6 +5,7 @@ struct pseudo_fs_context { const struct super_operations *ops; + const struct export_operations *eops; const struct xattr_handler * const *xattr; const struct dentry_operations *dops; unsigned long magic; diff --git a/kernel/pid.c b/kernel/pid.c index 115448e89c3e..aa2a7d4da455 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -43,6 +43,7 @@ #include <linux/sched/task.h> #include <linux/idr.h> #include <linux/pidfs.h> +#include <linux/seqlock.h> #include <net/sock.h> #include <uapi/linux/pidfd.h> @@ -64,11 +65,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -/* - * Pseudo filesystems start inode numbering after one. We use Reserved - * PIDs as a natural offset. - */ -static u64 pidfs_ino = RESERVED_PIDS; /* * PID-map pages start out as NULL, they get allocated upon @@ -108,6 +104,7 @@ EXPORT_SYMBOL_GPL(init_pid_ns); */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); +seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); void put_pid(struct pid *pid) { @@ -158,6 +155,7 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } + pidfs_remove_pid(pid); spin_unlock_irqrestore(&pidmap_lock, flags); call_rcu(&pid->rcu, delayed_put_pid); @@ -273,22 +271,24 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; + idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; - pid->stashed = NULL; - pid->ino = ++pidfs_ino; + pidfs_add_pid(pid); for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. 
*/ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); + idr_preload_end(); return pid; out_unlock: spin_unlock_irq(&pidmap_lock); + idr_preload_end(); put_pid_ns(ns); out_free: diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index 973198a3ec3d..bf92481f925c 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -6,3 +6,5 @@ pidfd_wait pidfd_fdinfo_test pidfd_getfd_test pidfd_setns_test +pidfd_file_handle_test +pidfd_bind_mount diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index d731e3e76d5b..301343a11b62 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -2,7 +2,8 @@ CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ - pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test + pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \ + pidfd_file_handle_test pidfd_bind_mount include ../lib.mk diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index 88d6830ee004..28a471c88c51 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -17,6 +17,7 @@ #include <sys/wait.h> #include "../kselftest.h" +#include "../clone3/clone3_selftests.h" #ifndef P_PIDFD #define P_PIDFD 3 @@ -68,6 +69,11 @@ #define PIDFD_SKIP 3 #define PIDFD_XFAIL 4 +static inline int sys_waitid(int which, pid_t pid, siginfo_t *info, int options) +{ + return syscall(__NR_waitid, which, pid, info, options, NULL); +} + static inline int wait_for_pid(pid_t pid) { int status, ret; @@ -114,4 +120,37 @@ static inline int sys_memfd_create(const char *name, unsigned int flags) return syscall(__NR_memfd_create, name, flags); } +static inline pid_t create_child(int *pidfd, unsigned flags) +{ + struct __clone_args args = { + 
.flags = CLONE_PIDFD | flags, + .exit_signal = SIGCHLD, + .pidfd = ptr_to_u64(pidfd), + }; + + return sys_clone3(&args, sizeof(struct __clone_args)); +} + +static inline ssize_t read_nointr(int fd, void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = read(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + +static inline ssize_t write_nointr(int fd, const void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = write(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + #endif /* __PIDFD_H */ diff --git a/tools/testing/selftests/pidfd/pidfd_bind_mount.c b/tools/testing/selftests/pidfd/pidfd_bind_mount.c new file mode 100644 index 000000000000..7822dd080258 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_bind_mount.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2024 Christian Brauner <brauner@kernel.org> + +#define _GNU_SOURCE +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <linux/fs.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/mount.h> +#include <unistd.h> + +#include "pidfd.h" +#include "../kselftest_harness.h" + +#ifndef __NR_open_tree + #if defined __alpha__ + #define __NR_open_tree 538 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_open_tree 4428 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_open_tree 6428 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_open_tree 5428 + #endif + #elif defined __ia64__ + #define __NR_open_tree (428 + 1024) + #else + #define __NR_open_tree 428 + #endif +#endif + +#ifndef __NR_move_mount + #if defined __alpha__ + #define __NR_move_mount 539 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_move_mount 4429 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_move_mount 6429 + #endif + #if _MIPS_SIM == 
_MIPS_SIM_ABI64 /* n64 */ + #define __NR_move_mount 5429 + #endif + #elif defined __ia64__ + #define __NR_move_mount (429 + 1024) + #else + #define __NR_move_mount 429 + #endif +#endif + +#ifndef MOVE_MOUNT_F_EMPTY_PATH +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#endif + +#ifndef MOVE_MOUNT_T_EMPTY_PATH +#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ +#endif + +static inline int sys_move_mount(int from_dfd, const char *from_pathname, + int to_dfd, const char *to_pathname, + unsigned int flags) +{ + return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, + to_pathname, flags); +} + +#ifndef OPEN_TREE_CLONE +#define OPEN_TREE_CLONE 1 +#endif + +#ifndef OPEN_TREE_CLOEXEC +#define OPEN_TREE_CLOEXEC O_CLOEXEC +#endif + +#ifndef AT_RECURSIVE +#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ +#endif + +static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags) +{ + return syscall(__NR_open_tree, dfd, filename, flags); +} + +FIXTURE(pidfd_bind_mount) { + char template[PATH_MAX]; + int fd_tmp; + int pidfd; + struct stat st1; + struct stat st2; + __u32 gen1; + __u32 gen2; + bool must_unmount; +}; + +FIXTURE_SETUP(pidfd_bind_mount) +{ + self->fd_tmp = -EBADF; + self->must_unmount = false; + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_LE(snprintf(self->template, PATH_MAX, "%s", P_tmpdir "/pidfd_bind_mount_XXXXXX"), PATH_MAX); + self->fd_tmp = mkstemp(self->template); + ASSERT_GE(self->fd_tmp, 0); + self->pidfd = sys_pidfd_open(getpid(), 0); + ASSERT_GE(self->pidfd, 0); + ASSERT_GE(fstat(self->pidfd, &self->st1), 0); + ASSERT_EQ(ioctl(self->pidfd, FS_IOC_GETVERSION, &self->gen1), 0); +} + +FIXTURE_TEARDOWN(pidfd_bind_mount) +{ + ASSERT_EQ(close(self->fd_tmp), 0); + if (self->must_unmount) + ASSERT_EQ(umount2(self->template, 0), 0); + ASSERT_EQ(unlink(self->template), 0); +} + +/* + * Test that a detached mount can be created for a pidfd and then + * attached to the 
filesystem hierarchy. + */ +TEST_F(pidfd_bind_mount, bind_mount) +{ + int fd_tree; + + fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH); + ASSERT_GE(fd_tree, 0); + + ASSERT_EQ(move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + self->must_unmount = true; + + ASSERT_EQ(close(fd_tree), 0); +} + +/* Test that a pidfd can be reopened through procfs. */ +TEST_F(pidfd_bind_mount, reopen) +{ + int pidfd; + char proc_path[PATH_MAX]; + + sprintf(proc_path, "/proc/self/fd/%d", self->pidfd); + pidfd = open(proc_path, O_RDONLY | O_NOCTTY | O_CLOEXEC); + ASSERT_GE(pidfd, 0); + + ASSERT_GE(fstat(self->pidfd, &self->st2), 0); + ASSERT_EQ(ioctl(self->pidfd, FS_IOC_GETVERSION, &self->gen2), 0); + + ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino); + ASSERT_TRUE(self->gen1 == self->gen2); + + ASSERT_EQ(close(pidfd), 0); +} + +/* + * Test that a detached mount can be created for a pidfd and then + * attached to the filesystem hierarchy and reopened. 
+ */
+TEST_F(pidfd_bind_mount, bind_mount_reopen)
+{
+	int fd_tree, fd_pidfd_mnt;
+
+	/* Clone the mount the pidfd lives on into a detached tree. */
+	fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
+	ASSERT_GE(fd_tree, 0);
+
+	/* Attach it on top of self->fd_tmp; remember that it must be unmounted. */
+	ASSERT_EQ(move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0);
+	self->must_unmount = true;
+
+	/* Reopen the pidfd through the path stored in self->template. */
+	fd_pidfd_mnt = openat(-EBADF, self->template, O_RDONLY | O_NOCTTY | O_CLOEXEC);
+	ASSERT_GE(fd_pidfd_mnt, 0);
+
+	ASSERT_GE(fstat(fd_tree, &self->st2), 0);
+	ASSERT_EQ(ioctl(fd_pidfd_mnt, FS_IOC_GETVERSION, &self->gen2), 0);
+
+	/* Device, inode and generation must match the values in st1/gen1. */
+	ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino);
+	ASSERT_TRUE(self->gen1 == self->gen2);
+
+	ASSERT_EQ(close(fd_tree), 0);
+	ASSERT_EQ(close(fd_pidfd_mnt), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/pidfd_file_handle_test.c b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c
new file mode 100644
index 000000000000..439b9c6c0457
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c
@@ -0,0 +1,503 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <poll.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <linux/kcmp.h>
+#include <sys/stat.h>
+
+#include "pidfd.h"
+#include "../kselftest_harness.h"
+
+/*
+ * Per-test state: a pidfd for the test process itself plus the pid and
+ * pidfd of three helper children spawned in FIXTURE_SETUP().
+ */
+FIXTURE(file_handle)
+{
+	pid_t pid;
+	int pidfd;
+
+	pid_t child_pid1;
+	int child_pidfd1;
+
+	pid_t child_pid2;
+	int child_pidfd2;
+
+	pid_t child_pid3;
+	int child_pidfd3;
+};
+
+/*
+ * Open a pidfd for ourselves and spawn three children that pause()
+ * until they are killed:
+ *   child1: CLONE_NEWUSER
+ *   child2: CLONE_NEWUSER | CLONE_NEWPID
+ *   child3: CLONE_NEWUSER | CLONE_NEWPID
+ *
+ * Each child writes a single byte over a socketpair before pausing so
+ * that the parent only continues once the child is actually running.
+ */
+FIXTURE_SETUP(file_handle)
+{
+	int ret;
+	int ipc_sockets[2];
+	char c;
+
+	self->pid = getpid();
+	self->pidfd = sys_pidfd_open(self->pid, 0);
+	ASSERT_GE(self->pidfd, 0);
+
+	/*
+	 * These must be hard failures: on error ipc_sockets[] is left
+	 * uninitialized and everything below would operate on garbage
+	 * file descriptors.
+	 */
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER);
+	ASSERT_GE(self->child_pid1, 0);
+
+	if (self->child_pid1 == 0) {
+		close(ipc_sockets[0]);
+
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+			_exit(EXIT_FAILURE);
+
+		close(ipc_sockets[1]);
+
+		pause();
+		_exit(EXIT_SUCCESS);
+	}
+
+	close(ipc_sockets[1]);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	close(ipc_sockets[0]);
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID);
+	ASSERT_GE(self->child_pid2, 0);
+
+	if (self->child_pid2 == 0) {
+		close(ipc_sockets[0]);
+
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+			_exit(EXIT_FAILURE);
+
+		close(ipc_sockets[1]);
+
+		pause();
+		_exit(EXIT_SUCCESS);
+	}
+
+	close(ipc_sockets[1]);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	close(ipc_sockets[0]);
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	self->child_pid3 = create_child(&self->child_pidfd3, CLONE_NEWUSER | CLONE_NEWPID);
+	ASSERT_GE(self->child_pid3, 0);
+
+	if (self->child_pid3 == 0) {
+		close(ipc_sockets[0]);
+
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+			_exit(EXIT_FAILURE);
+
+		close(ipc_sockets[1]);
+
+		pause();
+		_exit(EXIT_SUCCESS);
+	}
+
+	close(ipc_sockets[1]);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	close(ipc_sockets[0]);
+}
+
+/*
+ * Kill and reap the helper children and close all pidfds.
+ *
+ * child3 is special: tests that kill and reap it themselves set
+ * self->child_pidfd3 to a negative value, in which case there is
+ * nothing left to signal, close or wait for.
+ */
+FIXTURE_TEARDOWN(file_handle)
+{
+	EXPECT_EQ(close(self->pidfd), 0);
+
+	EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd1, SIGKILL, NULL, 0), 0);
+	if (self->child_pidfd1 >= 0)
+		EXPECT_EQ(0, close(self->child_pidfd1));
+
+	EXPECT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0);
+
+	EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd2, SIGKILL, NULL, 0), 0);
+	if (self->child_pidfd2 >= 0)
+		EXPECT_EQ(0, close(self->child_pidfd2));
+
+	EXPECT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0);
+
+	if (self->child_pidfd3 >= 0) {
+		EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd3, SIGKILL, NULL, 0), 0);
+		EXPECT_EQ(0, close(self->child_pidfd3));
+		EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+	}
+}
+
+/*
+ * Test that we can decode a pidfs file handle in the same pid
+ * namespace.
+ */
+TEST_F(file_handle, file_handle_same_pidns)
+{
+	int mnt_id;
+	struct file_handle *fh;
+	int pidfd = -EBADF;
+	struct stat st1, st2;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd1, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd1, &st1), 0);
+
+	/* Decoding must yield the same inode for each supported open flag. */
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, O_CLOEXEC);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, O_NONBLOCK);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	free(fh);
+}
+
+/*
+ * Test that we can decode a pidfs file handle from a child pid
+ * namespace.
+ */
+TEST_F(file_handle, file_handle_child_pidns)
+{
+	static const int open_flags[] = { 0, O_CLOEXEC, O_NONBLOCK };
+	const size_t fh_size = sizeof(struct file_handle) + MAX_HANDLE_SZ;
+	struct file_handle *fh;
+	struct stat st1, st2;
+	int pidfd = -EBADF;
+	int mnt_id;
+
+	/* Zeroed buffer large enough for the biggest possible handle. */
+	fh = calloc(1, fh_size);
+	ASSERT_NE(fh, NULL);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	/* Encode the pidfd of the child living in its own pid namespace. */
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+	/* Decode once per flag; every result must refer to the same inode. */
+	for (size_t i = 0; i < ARRAY_SIZE(open_flags); i++) {
+		pidfd = open_by_handle_at(self->pidfd, fh, open_flags[i]);
+		ASSERT_GE(pidfd, 0);
+
+		ASSERT_EQ(fstat(pidfd, &st2), 0);
+		ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+		ASSERT_EQ(close(pidfd), 0);
+	}
+
+	free(fh);
+}
+
+/*
+ * Test that we fail to decode a pidfs file handle for a process in an
+ * ancestor pid namespace when decoding from a child pid namespace.
+ */
+TEST_F(file_handle, file_handle_foreign_pidns)
+{
+	int mnt_id;
+	struct file_handle *fh;
+	pid_t pid;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	/* Encode a handle for the test process itself. */
+	ASSERT_EQ(name_to_handle_at(self->pidfd, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	/* Join child2's user and pid namespaces before forking. */
+	ASSERT_EQ(setns(self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Decoding is expected to fail here; report via exit status. */
+		int pidfd = open_by_handle_at(self->pidfd, fh, 0);
+		if (pidfd >= 0) {
+			TH_LOG("Managed to open pidfd outside of the caller's pid namespace hierarchy");
+			_exit(1);
+		}
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+
+	free(fh);
+}
+
+/*
+ * Test that we can decode a pidfs file handle of a process that has
+ * exited but not been reaped.
+ */
+TEST_F(file_handle, pid_has_exited)
+{
+	int mnt_id, pidfd, child_pidfd3;
+	struct file_handle *fh;
+	struct stat st1, st2;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0);
+
+	/* Sanity check: the handle decodes while the child is alive. */
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	/*
+	 * Take over child3 so the fixture teardown skips it, kill it and
+	 * wait with WNOWAIT so that it has exited but is not reaped yet.
+	 */
+	child_pidfd3 = self->child_pidfd3;
+	self->child_pidfd3 = -EBADF;
+	EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0);
+	EXPECT_EQ(close(child_pidfd3), 0);
+	EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED | WNOWAIT), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_GE(pidfd, 0);
+
+	EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+
+	/* Release the decoded pidfd and the handle buffer. */
+	EXPECT_EQ(close(pidfd), 0);
+	free(fh);
+}
+
+/*
+ * Test that we fail to decode a pidfs file handle of a process that
has
+ * already been reaped.
+ */
+TEST_F(file_handle, pid_has_been_reaped)
+{
+	int mnt_id, pidfd, child_pidfd3;
+	struct file_handle *fh;
+	struct stat st1, st2;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0);
+
+	/* Sanity check: the handle decodes while the child is alive. */
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	/*
+	 * Take over child3 so the fixture teardown skips it, then kill
+	 * and fully reap it.
+	 */
+	child_pidfd3 = self->child_pidfd3;
+	self->child_pidfd3 = -EBADF;
+	EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0);
+	EXPECT_EQ(close(child_pidfd3), 0);
+	EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+
+	/* Decoding must now fail, so there is no fd to close afterwards. */
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_LT(pidfd, 0);
+
+	free(fh);
+}
+
+/*
+ * Test valid flags to open a pidfd file handle. Note, that
+ * PIDFD_NONBLOCK is defined as O_NONBLOCK and O_NONBLOCK is an alias to
+ * O_NDELAY. Also note that PIDFD_THREAD is an alias for O_EXCL.
+ */
+TEST_F(file_handle, open_by_handle_at_valid_flags)
+{
+	int mnt_id;
+	struct file_handle *fh;
+	int pidfd = -EBADF;
+	struct stat st1, st2;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+	/* All accepted flags at once. */
+	pidfd = open_by_handle_at(self->pidfd, fh,
+				  O_RDONLY |
+				  O_WRONLY |
+				  O_RDWR |
+				  O_NONBLOCK |
+				  O_NDELAY |
+				  O_CLOEXEC |
+				  O_EXCL);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	free(fh);
+}
+
+/*
+ * Test that invalid flags passed to open a pidfd file handle are
+ * rejected.
+ */
+TEST_F(file_handle, open_by_handle_at_invalid_flags)
+{
+	int mnt_id;
+	struct file_handle *fh;
+	int pidfd = -EBADF;
+	/* Each flag is tried on its own; the name is only used for logging. */
+	static const struct invalid_pidfs_file_handle_flags {
+		int oflag;
+		const char *oflag_name;
+	} invalid_pidfs_file_handle_flags[] = {
+		{ FASYNC,	"FASYNC"	},
+		{ O_CREAT,	"O_CREAT"	},
+		{ O_NOCTTY,	"O_NOCTTY"	},
+		{ O_TRUNC,	"O_TRUNC"	},
+		{ O_APPEND,	"O_APPEND"	},
+		{ O_SYNC,	"O_SYNC"	},
+		{ O_DSYNC,	"O_DSYNC"	},
+		{ O_DIRECT,	"O_DIRECT"	},
+		{ O_DIRECTORY,	"O_DIRECTORY"	},
+		{ O_NOFOLLOW,	"O_NOFOLLOW"	},
+		{ O_NOATIME,	"O_NOATIME"	},
+		{ O_PATH,	"O_PATH"	},
+		{ O_TMPFILE,	"O_TMPFILE"	},
+		/*
+		 * O_LARGEFILE is added implicitly by
+		 * open_by_handle_at() so pidfs simply masks it off.
+		 */
+	};
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	for (int i = 0; i < ARRAY_SIZE(invalid_pidfs_file_handle_flags); i++) {
+		pidfd = open_by_handle_at(self->pidfd, fh, invalid_pidfs_file_handle_flags[i].oflag);
+		ASSERT_LT(pidfd, 0) {
+			TH_LOG("open_by_handle_at() succeeded with invalid flags: %s", invalid_pidfs_file_handle_flags[i].oflag_name);
+		}
+	}
+
+	free(fh);
+}
+
+/* Test that lookup fails. */
+TEST_F(file_handle, lookup_must_fail)
+{
+	int mnt_id;
+	struct file_handle *fh;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	/* A non-empty path relative to a pidfd must fail with or without AT_EMPTY_PATH. */
+	ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, AT_EMPTY_PATH), 0);
+	ASSERT_EQ(errno, ENOTDIR);
+	ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, 0), 0);
+	ASSERT_EQ(errno, ENOTDIR);
+
+	free(fh);
+}
+
+#ifndef AT_HANDLE_CONNECTABLE
+#define AT_HANDLE_CONNECTABLE 0x002
+#endif
+
+/*
+ * Test that AT_HANDLE_CONNECTABLE is rejected. Connectable file handles
+ * don't make sense for pidfs. Note that currently AT_HANDLE_CONNECTABLE
+ * is rejected because it is incompatible with AT_EMPTY_PATH which is
+ * required with pidfds as we don't support lookup.
+ */
+TEST_F(file_handle, invalid_name_to_handle_at_flags)
+{
+	int mnt_id;
+	struct file_handle *fh;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_NE(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_CONNECTABLE), 0);
+
+	free(fh);
+}
+
+#ifndef AT_HANDLE_FID
+#define AT_HANDLE_FID 0x200
+#endif
+
+/*
+ * Test that a request with AT_HANDLE_FID always leads to a decodable
+ * file handle as pidfs always provides export operations.
+ */
+TEST_F(file_handle, valid_name_to_handle_at_flags)
+{
+	int mnt_id, pidfd;
+	struct file_handle *fh;
+	struct stat st1, st2;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_FID), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+	/* The handle must decode back to the same inode. */
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	free(fh);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index 7c2a4349170a..222f8131283b 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -19,7 +19,6 @@ #include <linux/ioctl.h> #include "pidfd.h" -#include "../clone3/clone3_selftests.h" #include "../kselftest_harness.h" #ifndef PIDFS_IOCTL_MAGIC @@ -118,22 +117,6 @@ FIXTURE(current_nsset) int child_pidfd_derived_nsfds2[PIDFD_NS_MAX]; }; -static int sys_waitid(int which, pid_t pid, int options) -{ - return syscall(__NR_waitid, which, pid, NULL, options, NULL); -} - -pid_t create_child(int *pidfd, unsigned flags) 
-{ - struct __clone_args args = { - .flags = CLONE_PIDFD | flags, - .exit_signal = SIGCHLD, - .pidfd = ptr_to_u64(pidfd), - }; - - return sys_clone3(&args, sizeof(struct clone_args)); -} - static bool switch_timens(void) { int fd, ret; @@ -150,28 +133,6 @@ static bool switch_timens(void) return ret == 0; } -static ssize_t read_nointr(int fd, void *buf, size_t count) -{ - ssize_t ret; - - do { - ret = read(fd, buf, count); - } while (ret < 0 && errno == EINTR); - - return ret; -} - -static ssize_t write_nointr(int fd, const void *buf, size_t count) -{ - ssize_t ret; - - do { - ret = write(fd, buf, count); - } while (ret < 0 && errno == EINTR); - - return ret; -} - FIXTURE_SETUP(current_nsset) { int i, proc_fd, ret; @@ -229,7 +190,7 @@ FIXTURE_SETUP(current_nsset) _exit(EXIT_SUCCESS); } - ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED | WNOWAIT), 0); self->pidfd = sys_pidfd_open(self->pid, 0); EXPECT_GE(self->pidfd, 0) { @@ -432,9 +393,9 @@ FIXTURE_TEARDOWN(current_nsset) EXPECT_EQ(0, close(self->child_pidfd1)); if (self->child_pidfd2 >= 0) EXPECT_EQ(0, close(self->child_pidfd2)); - ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0); - ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0); - ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0); } static int preserve_ns(const int pid, const char *ns) diff --git a/tools/testing/selftests/pidfd/pidfd_wait.c b/tools/testing/selftests/pidfd/pidfd_wait.c index 0dcb8365ddc3..1e2d49751cde 100644 --- a/tools/testing/selftests/pidfd/pidfd_wait.c +++ b/tools/testing/selftests/pidfd/pidfd_wait.c @@ -26,22 +26,11 @@ #define SKIP(s, ...) 
XFAIL(s, ##__VA_ARGS__) #endif -static pid_t sys_clone3(struct clone_args *args) -{ - return syscall(__NR_clone3, args, sizeof(struct clone_args)); -} - -static int sys_waitid(int which, pid_t pid, siginfo_t *info, int options, - struct rusage *ru) -{ - return syscall(__NR_waitid, which, pid, info, options, ru); -} - TEST(wait_simple) { int pidfd = -1; pid_t parent_tid = -1; - struct clone_args args = { + struct __clone_args args = { .parent_tid = ptr_to_u64(&parent_tid), .pidfd = ptr_to_u64(&pidfd), .flags = CLONE_PIDFD | CLONE_PARENT_SETTID, @@ -55,7 +44,7 @@ TEST(wait_simple) pidfd = open("/proc/self", O_DIRECTORY | O_RDONLY | O_CLOEXEC); ASSERT_GE(pidfd, 0); - pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); + pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED); ASSERT_NE(pid, 0); EXPECT_EQ(close(pidfd), 0); pidfd = -1; @@ -63,18 +52,18 @@ TEST(wait_simple) pidfd = open("/dev/null", O_RDONLY | O_CLOEXEC); ASSERT_GE(pidfd, 0); - pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); + pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED); ASSERT_NE(pid, 0); EXPECT_EQ(close(pidfd), 0); pidfd = -1; - pid = sys_clone3(&args); + pid = sys_clone3(&args, sizeof(args)); ASSERT_GE(pid, 0); if (pid == 0) exit(EXIT_SUCCESS); - pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); + pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED); ASSERT_GE(pid, 0); ASSERT_EQ(WIFEXITED(info.si_status), true); ASSERT_EQ(WEXITSTATUS(info.si_status), 0); @@ -89,7 +78,7 @@ TEST(wait_states) { int pidfd = -1; pid_t parent_tid = -1; - struct clone_args args = { + struct __clone_args args = { .parent_tid = ptr_to_u64(&parent_tid), .pidfd = ptr_to_u64(&pidfd), .flags = CLONE_PIDFD | CLONE_PARENT_SETTID, @@ -102,7 +91,7 @@ TEST(wait_states) }; ASSERT_EQ(pipe(pfd), 0); - pid = sys_clone3(&args); + pid = sys_clone3(&args, sizeof(args)); ASSERT_GE(pid, 0); if (pid == 0) { @@ -117,28 +106,28 @@ TEST(wait_states) } close(pfd[0]); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0); + 
ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_STOPPED); ASSERT_EQ(info.si_pid, parent_tid); ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED, NULL), 0); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED), 0); ASSERT_EQ(write(pfd[1], "C", 1), 1); close(pfd[1]); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_CONTINUED); ASSERT_EQ(info.si_pid, parent_tid); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED, NULL), 0); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED), 0); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_STOPPED); ASSERT_EQ(info.si_pid, parent_tid); ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_KILLED); ASSERT_EQ(info.si_pid, parent_tid); @@ -151,7 +140,7 @@ TEST(wait_nonblock) int pidfd; unsigned int flags = 0; pid_t parent_tid = -1; - struct clone_args args = { + struct __clone_args args = { .parent_tid = ptr_to_u64(&parent_tid), .flags = CLONE_PARENT_SETTID, .exit_signal = SIGCHLD, @@ -173,12 +162,12 @@ TEST(wait_nonblock) SKIP(return, "Skipping PIDFD_NONBLOCK test"); } - ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); + ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED); ASSERT_LT(ret, 0); ASSERT_EQ(errno, ECHILD); EXPECT_EQ(close(pidfd), 0); - pid = sys_clone3(&args); + pid = sys_clone3(&args, sizeof(args)); ASSERT_GE(pid, 0); if (pid == 0) { @@ -201,7 +190,7 @@ TEST(wait_nonblock) * Callers need to see EAGAIN/EWOULDBLOCK with non-blocking pidfd when * child processes exist but none have exited. 
*/ - ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL); + ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED); ASSERT_LT(ret, 0); ASSERT_EQ(errno, EAGAIN); @@ -210,19 +199,19 @@ TEST(wait_nonblock) * WNOHANG raised explicitly when child processes exist but none have * exited. */ - ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG, NULL); + ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG); ASSERT_EQ(ret, 0); ASSERT_EQ(fcntl(pidfd, F_SETFL, (flags & ~O_NONBLOCK)), 0); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_STOPPED); ASSERT_EQ(info.si_pid, parent_tid); ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0); - ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0); + ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0); ASSERT_EQ(info.si_signo, SIGCHLD); ASSERT_EQ(info.si_code, CLD_EXITED); ASSERT_EQ(info.si_pid, parent_tid); |