summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/fhandle.c115
-rw-r--r--fs/libfs.c1
-rw-r--r--fs/namespace.c10
-rw-r--r--fs/pidfs.c298
-rw-r--r--include/linux/exportfs.h20
-rw-r--r--include/linux/pid.h2
-rw-r--r--include/linux/pidfs.h3
-rw-r--r--include/linux/pseudo_fs.h1
-rw-r--r--kernel/pid.c14
-rw-r--r--tools/testing/selftests/pidfd/.gitignore2
-rw-r--r--tools/testing/selftests/pidfd/Makefile3
-rw-r--r--tools/testing/selftests/pidfd/pidfd.h39
-rw-r--r--tools/testing/selftests/pidfd/pidfd_bind_mount.c188
-rw-r--r--tools/testing/selftests/pidfd/pidfd_file_handle_test.c503
-rw-r--r--tools/testing/selftests/pidfd/pidfd_setns_test.c47
-rw-r--r--tools/testing/selftests/pidfd/pidfd_wait.c47
16 files changed, 1110 insertions, 183 deletions
diff --git a/fs/fhandle.c b/fs/fhandle.c
index ec9145047dfc..3e092ae6d142 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -187,17 +187,6 @@ static int get_path_from_fd(int fd, struct path *root)
return 0;
}
-enum handle_to_path_flags {
- HANDLE_CHECK_PERMS = (1 << 0),
- HANDLE_CHECK_SUBTREE = (1 << 1),
-};
-
-struct handle_to_path_ctx {
- struct path root;
- enum handle_to_path_flags flags;
- unsigned int fh_flags;
-};
-
static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
{
struct handle_to_path_ctx *ctx = context;
@@ -261,50 +250,55 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path,
{
int handle_dwords;
struct vfsmount *mnt = ctx->root.mnt;
+ struct dentry *dentry;
/* change the handle size to multiple of sizeof(u32) */
handle_dwords = handle->handle_bytes >> 2;
- path->dentry = exportfs_decode_fh_raw(mnt,
- (struct fid *)handle->f_handle,
- handle_dwords, handle->handle_type,
- ctx->fh_flags,
- vfs_dentry_acceptable, ctx);
- if (IS_ERR_OR_NULL(path->dentry)) {
- if (path->dentry == ERR_PTR(-ENOMEM))
+ dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle,
+ handle_dwords, handle->handle_type,
+ ctx->fh_flags, vfs_dentry_acceptable,
+ ctx);
+ if (IS_ERR_OR_NULL(dentry)) {
+ if (dentry == ERR_PTR(-ENOMEM))
return -ENOMEM;
return -ESTALE;
}
+ path->dentry = dentry;
path->mnt = mntget(mnt);
return 0;
}
-/*
- * Allow relaxed permissions of file handles if the caller has the
- * ability to mount the filesystem or create a bind-mount of the
- * provided @mountdirfd.
- *
- * In both cases the caller may be able to get an unobstructed way to
- * the encoded file handle. If the caller is only able to create a
- * bind-mount we need to verify that there are no locked mounts on top
- * of it that could prevent us from getting to the encoded file.
- *
- * In principle, locked mounts can prevent the caller from mounting the
- * filesystem but that only applies to procfs and sysfs neither of which
- * support decoding file handles.
- */
-static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
- unsigned int o_flags)
+static inline int may_decode_fh(struct handle_to_path_ctx *ctx,
+ unsigned int o_flags)
{
struct path *root = &ctx->root;
+ if (capable(CAP_DAC_READ_SEARCH))
+ return 0;
+
/*
- * Restrict to O_DIRECTORY to provide a deterministic API that avoids a
- * confusing api in the face of disconnected non-dir dentries.
+ * Allow relaxed permissions of file handles if the caller has
+ * the ability to mount the filesystem or create a bind-mount of
+ * the provided @mountdirfd.
+ *
+ * In both cases the caller may be able to get an unobstructed
+ * way to the encoded file handle. If the caller is only able to
+ * create a bind-mount we need to verify that there are no
+ * locked mounts on top of it that could prevent us from getting
+ * to the encoded file.
+ *
+ * In principle, locked mounts can prevent the caller from
+ * mounting the filesystem but that only applies to procfs and
+ * sysfs neither of which support decoding file handles.
+ *
+ * Restrict to O_DIRECTORY to provide a deterministic API that
+ * avoids a confusing api in the face of disconnected non-dir
+ * dentries.
*
* There's only one dentry for each directory inode (VFS rule)...
*/
if (!(o_flags & O_DIRECTORY))
- return false;
+ return -EPERM;
if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
ctx->flags = HANDLE_CHECK_PERMS;
@@ -314,14 +308,14 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
!has_locked_children(real_mount(root->mnt), root->dentry))
ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
else
- return false;
+ return -EPERM;
/* Are we able to override DAC permissions? */
if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH))
- return false;
+ return -EPERM;
ctx->fh_flags = EXPORT_FH_DIR_ONLY;
- return true;
+ return 0;
}
static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
@@ -331,15 +325,19 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
struct file_handle f_handle;
struct file_handle *handle = NULL;
struct handle_to_path_ctx ctx = {};
+ const struct export_operations *eops;
retval = get_path_from_fd(mountdirfd, &ctx.root);
if (retval)
goto out_err;
- if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) {
- retval = -EPERM;
+ eops = ctx.root.mnt->mnt_sb->s_export_op;
+ if (eops && eops->permission)
+ retval = eops->permission(&ctx, o_flags);
+ else
+ retval = may_decode_fh(&ctx, o_flags);
+ if (retval)
goto out_path;
- }
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
retval = -EFAULT;
@@ -398,29 +396,28 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
int open_flag)
{
long retval = 0;
- struct path path;
+ struct path path __free(path_put) = {};
struct file *file;
- int fd;
+ const struct export_operations *eops;
retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
if (retval)
return retval;
- fd = get_unused_fd_flags(open_flag);
- if (fd < 0) {
- path_put(&path);
+ CLASS(get_unused_fd, fd)(O_CLOEXEC);
+ if (fd < 0)
return fd;
- }
- file = file_open_root(&path, "", open_flag, 0);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- retval = PTR_ERR(file);
- } else {
- retval = fd;
- fd_install(fd, file);
- }
- path_put(&path);
- return retval;
+
+ eops = path.mnt->mnt_sb->s_export_op;
+ if (eops->open)
+ file = eops->open(&path, open_flag);
+ else
+ file = file_open_root(&path, "", open_flag, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ fd_install(fd, file);
+ return take_fd(fd);
}
/**
diff --git a/fs/libfs.c b/fs/libfs.c
index 748ac5923154..2890a9c4a414 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -673,6 +673,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = ctx->magic;
s->s_op = ctx->ops ?: &simple_super_operations;
+ s->s_export_op = ctx->eops;
s->s_xattr = ctx->xattr;
s->s_time_gran = 1;
root = new_inode(s);
diff --git a/fs/namespace.c b/fs/namespace.c
index 851af89e8d72..64deda6f5b2c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -32,6 +32,7 @@
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
+#include <linux/pidfs.h>
#include <linux/nospec.h>
#include "pnode.h"
@@ -2736,8 +2737,13 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
if (IS_MNT_UNBINDABLE(old))
return mnt;
- if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
- return mnt;
+ if (!check_mnt(old)) {
+ const struct dentry_operations *d_op = old_path->dentry->d_op;
+
+ if (d_op != &ns_dentry_operations &&
+ d_op != &pidfs_dentry_operations)
+ return mnt;
+ }
if (!recurse && has_locked_children(old, old_path->dentry))
return mnt;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 618abb1fa1b8..049352f973de 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
+#include <linux/exportfs.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/cgroup.h>
@@ -23,6 +24,97 @@
#include "internal.h"
#include "mount.h"
+static struct rb_root pidfs_ino_tree = RB_ROOT;
+
+#if BITS_PER_LONG == 32
+static inline unsigned long pidfs_ino(u64 ino)
+{
+ return lower_32_bits(ino);
+}
+
+/* On 32 bit the generation number are the upper 32 bits. */
+static inline u32 pidfs_gen(u64 ino)
+{
+ return upper_32_bits(ino);
+}
+
+#else
+
+/* On 64 bit simply return ino. */
+static inline unsigned long pidfs_ino(u64 ino)
+{
+ return ino;
+}
+
+/* On 64 bit the generation number is 0. */
+static inline u32 pidfs_gen(u64 ino)
+{
+ return 0;
+}
+#endif
+
+static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
+ struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
+ u64 pid_ino_a = pid_a->ino;
+ u64 pid_ino_b = pid_b->ino;
+
+ if (pid_ino_a < pid_ino_b)
+ return -1;
+ if (pid_ino_a > pid_ino_b)
+ return 1;
+ return 0;
+}
+
+void pidfs_add_pid(struct pid *pid)
+{
+ static u64 pidfs_ino_nr = 2;
+
+ /*
+ * On 64 bit nothing special happens. The 64bit number assigned
+ * to struct pid is the inode number.
+ *
+ * On 32 bit the 64 bit number assigned to struct pid is split
+ * into two 32 bit numbers. The lower 32 bits are used as the
+ * inode number and the upper 32 bits are used as the inode
+ * generation number.
+ *
+ * On 32 bit pidfs_ino() will return the lower 32 bit. When
+ * pidfs_ino() returns zero a wrap around happened. When a
+ * wraparound happens the 64 bit number will be incremented by 2
+ * so inode numbering starts at 2 again.
+ *
+ * On 64 bit comparing two pidfds is as simple as comparing
+ * inode numbers.
+ *
+ * When a wraparound happens on 32 bit multiple pidfds with the
+ * same inode number are likely to exist (This isn't a problem
+ * since before pidfs pidfds used the anonymous inode meaning
+ * all pidfds had the same inode number.). Userspace can
+ * reconstruct the 64 bit identifier by retrieving both the
+ * inode number and the inode generation number to compare or
+ * use file handles.
+ */
+ if (pidfs_ino(pidfs_ino_nr) == 0)
+ pidfs_ino_nr += 2;
+
+ pid->ino = pidfs_ino_nr;
+ pid->stashed = NULL;
+ pidfs_ino_nr++;
+
+ write_seqcount_begin(&pidmap_lock_seq);
+ rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
+ write_seqcount_end(&pidmap_lock_seq);
+}
+
+void pidfs_remove_pid(struct pid *pid)
+{
+ write_seqcount_begin(&pidmap_lock_seq);
+ rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
+ write_seqcount_end(&pidmap_lock_seq);
+}
+
#ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
@@ -190,6 +282,27 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
return 0;
}
+static bool pidfs_ioctl_valid(unsigned int cmd)
+{
+ switch (cmd) {
+ case FS_IOC_GETVERSION:
+ case PIDFD_GET_CGROUP_NAMESPACE:
+ case PIDFD_GET_INFO:
+ case PIDFD_GET_IPC_NAMESPACE:
+ case PIDFD_GET_MNT_NAMESPACE:
+ case PIDFD_GET_NET_NAMESPACE:
+ case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
+ case PIDFD_GET_TIME_NAMESPACE:
+ case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
+ case PIDFD_GET_UTS_NAMESPACE:
+ case PIDFD_GET_USER_NAMESPACE:
+ case PIDFD_GET_PID_NAMESPACE:
+ return true;
+ }
+
+ return false;
+}
+
static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct task_struct *task __free(put_task) = NULL;
@@ -198,6 +311,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct ns_common *ns_common = NULL;
struct pid_namespace *pid_ns;
+ if (!pidfs_ioctl_valid(cmd))
+ return -ENOIOCTLCMD;
+
+ if (cmd == FS_IOC_GETVERSION) {
+ if (!arg)
+ return -EINVAL;
+
+ __u32 __user *argp = (__u32 __user *)arg;
+ return put_user(file_inode(file)->i_generation, argp);
+ }
+
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
@@ -318,40 +442,6 @@ struct pid *pidfd_pid(const struct file *file)
static struct vfsmount *pidfs_mnt __ro_after_init;
-#if BITS_PER_LONG == 32
-/*
- * Provide a fallback mechanism for 32-bit systems so processes remain
- * reliably comparable by inode number even on those systems.
- */
-static DEFINE_IDA(pidfd_inum_ida);
-
-static int pidfs_inum(struct pid *pid, unsigned long *ino)
-{
- int ret;
-
- ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
- UINT_MAX, GFP_ATOMIC);
- if (ret < 0)
- return -ENOSPC;
-
- *ino = ret;
- return 0;
-}
-
-static inline void pidfs_free_inum(unsigned long ino)
-{
- if (ino > 0)
- ida_free(&pidfd_inum_ida, ino);
-}
-#else
-static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
-{
- *ino = pid->ino;
- return 0;
-}
-#define pidfs_free_inum(ino) ((void)(ino))
-#endif
-
/*
* The vfs falls back to simple_setattr() if i_op->setattr() isn't
* implemented. Let's reject it completely until we have a clean
@@ -403,7 +493,6 @@ static void pidfs_evict_inode(struct inode *inode)
clear_inode(inode);
put_pid(pid);
- pidfs_free_inum(inode->i_ino);
}
static const struct super_operations pidfs_sops = {
@@ -421,25 +510,149 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
-static const struct dentry_operations pidfs_dentry_operations = {
+const struct dentry_operations pidfs_dentry_operations = {
.d_delete = always_delete_dentry,
.d_dname = pidfs_dname,
.d_prune = stashed_dentry_prune,
};
+static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ const struct pid *pid = inode->i_private;
+
+ if (*max_len < 2) {
+ *max_len = 2;
+ return FILEID_INVALID;
+ }
+
+ *max_len = 2;
+ *(u64 *)fh = pid->ino;
+ return FILEID_KERNFS;
+}
+
+static int pidfs_ino_find(const void *key, const struct rb_node *node)
+{
+ const u64 pid_ino = *(u64 *)key;
+ const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
+
+ if (pid_ino < pid->ino)
+ return -1;
+ if (pid_ino > pid->ino)
+ return 1;
+ return 0;
+}
+
+/* Find a struct pid based on the inode number. */
+static struct pid *pidfs_ino_get_pid(u64 ino)
+{
+ struct pid *pid;
+ struct rb_node *node;
+ unsigned int seq;
+
+ guard(rcu)();
+ do {
+ seq = read_seqcount_begin(&pidmap_lock_seq);
+ node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
+ if (node)
+ break;
+ } while (read_seqcount_retry(&pidmap_lock_seq, seq));
+
+ if (!node)
+ return NULL;
+
+ pid = rb_entry(node, struct pid, pidfs_node);
+
+ /* Within our pid namespace hierarchy? */
+ if (pid_vnr(pid) == 0)
+ return NULL;
+
+ return get_pid(pid);
+}
+
+static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len,
+ int fh_type)
+{
+ int ret;
+ u64 pid_ino;
+ struct path path;
+ struct pid *pid;
+
+ if (fh_len < 2)
+ return NULL;
+
+ switch (fh_type) {
+ case FILEID_KERNFS:
+ pid_ino = *(u64 *)fid;
+ break;
+ default:
+ return NULL;
+ }
+
+ pid = pidfs_ino_get_pid(pid_ino);
+ if (!pid)
+ return NULL;
+
+ ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ mntput(path.mnt);
+ return path.dentry;
+}
+
+/*
+ * Make sure that we reject any nonsensical flags that users pass via
+ * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and
+ * PIDFD_NONBLOCK as O_NONBLOCK.
+ */
+#define VALID_FILE_HANDLE_OPEN_FLAGS \
+ (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
+
+static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
+ unsigned int oflags)
+{
+ if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE))
+ return -EINVAL;
+
+ /*
+ * pidfd_ino_get_pid() will verify that the struct pid is part
+ * of the caller's pid namespace hierarchy. No further
+ * permission checks are needed.
+ */
+ return 0;
+}
+
+static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
+{
+ /*
+ * Clear O_LARGEFILE as open_by_handle_at() forces it and raise
+ * O_RDWR as pidfds always are.
+ */
+ oflags &= ~O_LARGEFILE;
+ return dentry_open(path, oflags | O_RDWR, current_cred());
+}
+
+static const struct export_operations pidfs_export_operations = {
+ .encode_fh = pidfs_encode_fh,
+ .fh_to_dentry = pidfs_fh_to_dentry,
+ .open = pidfs_export_open,
+ .permission = pidfs_export_permission,
+};
+
static int pidfs_init_inode(struct inode *inode, void *data)
{
+ const struct pid *pid = data;
+
inode->i_private = data;
inode->i_flags |= S_PRIVATE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;
- /*
- * Inode numbering for pidfs start at RESERVED_PIDS + 1. This
- * avoids collisions with the root inode which is 1 for pseudo
- * filesystems.
- */
- return pidfs_inum(data, &inode->i_ino);
+ inode->i_ino = pidfs_ino(pid->ino);
+ inode->i_generation = pidfs_gen(pid->ino);
+ return 0;
}
static void pidfs_put_data(void *data)
@@ -462,6 +675,7 @@ static int pidfs_init_fs_context(struct fs_context *fc)
return -ENOMEM;
ctx->ops = &pidfs_sops;
+ ctx->eops = &pidfs_export_operations;
ctx->dops = &pidfs_dentry_operations;
fc->s_fs_info = (void *)&pidfs_stashed_ops;
return 0;
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 4cc8801e50e3..a087606ace19 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -3,6 +3,7 @@
#define LINUX_EXPORTFS_H 1
#include <linux/types.h>
+#include <linux/path.h>
struct dentry;
struct iattr;
@@ -156,6 +157,17 @@ struct fid {
};
};
+enum handle_to_path_flags {
+ HANDLE_CHECK_PERMS = (1 << 0),
+ HANDLE_CHECK_SUBTREE = (1 << 1),
+};
+
+struct handle_to_path_ctx {
+ struct path root;
+ enum handle_to_path_flags flags;
+ unsigned int fh_flags;
+};
+
#define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */
#define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */
#define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */
@@ -225,6 +237,12 @@ struct fid {
* is also a directory. In the event that it cannot be found, or storage
* space cannot be allocated, a %ERR_PTR should be returned.
*
+ * permission:
+ * Allow filesystems to specify a custom permission function.
+ *
+ * open:
+ * Allow filesystems to specify a custom open function.
+ *
* commit_metadata:
* @commit_metadata should commit metadata changes to stable storage.
*
@@ -251,6 +269,8 @@ struct export_operations {
bool write, u32 *device_generation);
int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
int nr_iomaps, struct iattr *iattr);
+ int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags);
+ struct file * (*open)(struct path *path, unsigned int oflags);
#define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */
#define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */
#define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */
diff --git a/include/linux/pid.h b/include/linux/pid.h
index a3aad9b4074c..fe575fcdb4af 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -59,6 +59,7 @@ struct pid
spinlock_t lock;
struct dentry *stashed;
u64 ino;
+ struct rb_node pidfs_node;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
@@ -68,6 +69,7 @@ struct pid
struct upid numbers[];
};
+extern seqcount_spinlock_t pidmap_lock_seq;
extern struct pid init_struct_pid;
struct file;
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 75bdf9807802..7c830d0dec9a 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -4,5 +4,8 @@
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
void __init pidfs_init(void);
+void pidfs_add_pid(struct pid *pid);
+void pidfs_remove_pid(struct pid *pid);
+extern const struct dentry_operations pidfs_dentry_operations;
#endif /* _LINUX_PID_FS_H */
diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
index 730f77381d55..2503f7625d65 100644
--- a/include/linux/pseudo_fs.h
+++ b/include/linux/pseudo_fs.h
@@ -5,6 +5,7 @@
struct pseudo_fs_context {
const struct super_operations *ops;
+ const struct export_operations *eops;
const struct xattr_handler * const *xattr;
const struct dentry_operations *dops;
unsigned long magic;
diff --git a/kernel/pid.c b/kernel/pid.c
index 115448e89c3e..aa2a7d4da455 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,6 +43,7 @@
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <linux/pidfs.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
@@ -64,11 +65,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-/*
- * Pseudo filesystems start inode numbering after one. We use Reserved
- * PIDs as a natural offset.
- */
-static u64 pidfs_ino = RESERVED_PIDS;
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -108,6 +104,7 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
void put_pid(struct pid *pid)
{
@@ -158,6 +155,7 @@ void free_pid(struct pid *pid)
idr_remove(&ns->idr, upid->nr);
}
+ pidfs_remove_pid(pid);
spin_unlock_irqrestore(&pidmap_lock, flags);
call_rcu(&pid->rcu, delayed_put_pid);
@@ -273,22 +271,24 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level;
+ idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
- pid->stashed = NULL;
- pid->ino = ++pidfs_ino;
+ pidfs_add_pid(pid);
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
}
spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
return pid;
out_unlock:
spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
put_pid_ns(ns);
out_free:
diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore
index 973198a3ec3d..bf92481f925c 100644
--- a/tools/testing/selftests/pidfd/.gitignore
+++ b/tools/testing/selftests/pidfd/.gitignore
@@ -6,3 +6,5 @@ pidfd_wait
pidfd_fdinfo_test
pidfd_getfd_test
pidfd_setns_test
+pidfd_file_handle_test
+pidfd_bind_mount
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
index d731e3e76d5b..301343a11b62 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -2,7 +2,8 @@
CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall
TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
- pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test
+ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \
+ pidfd_file_handle_test pidfd_bind_mount
include ../lib.mk
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
index 88d6830ee004..28a471c88c51 100644
--- a/tools/testing/selftests/pidfd/pidfd.h
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -17,6 +17,7 @@
#include <sys/wait.h>
#include "../kselftest.h"
+#include "../clone3/clone3_selftests.h"
#ifndef P_PIDFD
#define P_PIDFD 3
@@ -68,6 +69,11 @@
#define PIDFD_SKIP 3
#define PIDFD_XFAIL 4
+static inline int sys_waitid(int which, pid_t pid, siginfo_t *info, int options)
+{
+ return syscall(__NR_waitid, which, pid, info, options, NULL);
+}
+
static inline int wait_for_pid(pid_t pid)
{
int status, ret;
@@ -114,4 +120,37 @@ static inline int sys_memfd_create(const char *name, unsigned int flags)
return syscall(__NR_memfd_create, name, flags);
}
+static inline pid_t create_child(int *pidfd, unsigned flags)
+{
+ struct __clone_args args = {
+ .flags = CLONE_PIDFD | flags,
+ .exit_signal = SIGCHLD,
+ .pidfd = ptr_to_u64(pidfd),
+ };
+
+ return sys_clone3(&args, sizeof(struct __clone_args));
+}
+
+static inline ssize_t read_nointr(int fd, void *buf, size_t count)
+{
+ ssize_t ret;
+
+ do {
+ ret = read(fd, buf, count);
+ } while (ret < 0 && errno == EINTR);
+
+ return ret;
+}
+
+static inline ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+ ssize_t ret;
+
+ do {
+ ret = write(fd, buf, count);
+ } while (ret < 0 && errno == EINTR);
+
+ return ret;
+}
+
#endif /* __PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_bind_mount.c b/tools/testing/selftests/pidfd/pidfd_bind_mount.c
new file mode 100644
index 000000000000..7822dd080258
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_bind_mount.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest_harness.h"
+
+#ifndef __NR_open_tree
+ #if defined __alpha__
+ #define __NR_open_tree 538
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_open_tree 4428
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_open_tree 6428
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_open_tree 5428
+ #endif
+ #elif defined __ia64__
+ #define __NR_open_tree (428 + 1024)
+ #else
+ #define __NR_open_tree 428
+ #endif
+#endif
+
+#ifndef __NR_move_mount
+ #if defined __alpha__
+ #define __NR_move_mount 539
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_move_mount 4429
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_move_mount 6429
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_move_mount 5429
+ #endif
+ #elif defined __ia64__
+ #define __NR_move_mount (428 + 1024)
+ #else
+ #define __NR_move_mount 429
+ #endif
+#endif
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
+#endif
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */
+#endif
+
+static inline int sys_move_mount(int from_dfd, const char *from_pathname,
+ int to_dfd, const char *to_pathname,
+ unsigned int flags)
+{
+ return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
+ to_pathname, flags);
+}
+
+#ifndef OPEN_TREE_CLONE
+#define OPEN_TREE_CLONE 1
+#endif
+
+#ifndef OPEN_TREE_CLOEXEC
+#define OPEN_TREE_CLOEXEC O_CLOEXEC
+#endif
+
+#ifndef AT_RECURSIVE
+#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
+#endif
+
+static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
+{
+ return syscall(__NR_open_tree, dfd, filename, flags);
+}
+
+FIXTURE(pidfd_bind_mount) {
+ char template[PATH_MAX];
+ int fd_tmp;
+ int pidfd;
+ struct stat st1;
+ struct stat st2;
+ __u32 gen1;
+ __u32 gen2;
+ bool must_unmount;
+};
+
+FIXTURE_SETUP(pidfd_bind_mount)
+{
+ self->fd_tmp = -EBADF;
+ self->must_unmount = false;
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_LE(snprintf(self->template, PATH_MAX, "%s", P_tmpdir "/pidfd_bind_mount_XXXXXX"), PATH_MAX);
+ self->fd_tmp = mkstemp(self->template);
+ ASSERT_GE(self->fd_tmp, 0);
+ self->pidfd = sys_pidfd_open(getpid(), 0);
+ ASSERT_GE(self->pidfd, 0);
+ ASSERT_GE(fstat(self->pidfd, &self->st1), 0);
+ ASSERT_EQ(ioctl(self->pidfd, FS_IOC_GETVERSION, &self->gen1), 0);
+}
+
+FIXTURE_TEARDOWN(pidfd_bind_mount)
+{
+ ASSERT_EQ(close(self->fd_tmp), 0);
+ if (self->must_unmount)
+ ASSERT_EQ(umount2(self->template, 0), 0);
+ ASSERT_EQ(unlink(self->template), 0);
+}
+
+/*
+ * Test that a detached mount can be created for a pidfd and then
+ * attached to the filesystem hierarchy.
+ */
+TEST_F(pidfd_bind_mount, bind_mount)
+{
+ int fd_tree;
+
+ fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
+ ASSERT_GE(fd_tree, 0);
+
+ ASSERT_EQ(move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0);
+ self->must_unmount = true;
+
+ ASSERT_EQ(close(fd_tree), 0);
+}
+
+/* Test that a pidfd can be reopened through procfs. */
+TEST_F(pidfd_bind_mount, reopen)
+{
+ int pidfd;
+ char proc_path[PATH_MAX];
+
+ sprintf(proc_path, "/proc/self/fd/%d", self->pidfd);
+ pidfd = open(proc_path, O_RDONLY | O_NOCTTY | O_CLOEXEC);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_GE(fstat(self->pidfd, &self->st2), 0);
+ ASSERT_EQ(ioctl(self->pidfd, FS_IOC_GETVERSION, &self->gen2), 0);
+
+ ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino);
+ ASSERT_TRUE(self->gen1 == self->gen2);
+
+ ASSERT_EQ(close(pidfd), 0);
+}
+
+/*
+ * Test that a detached mount can be created for a pidfd and then
+ * attached to the filesystem hierarchy and reopened.
+ */
+TEST_F(pidfd_bind_mount, bind_mount_reopen)
+{
+ int fd_tree, fd_pidfd_mnt;
+
+ fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
+ ASSERT_GE(fd_tree, 0);
+
+ ASSERT_EQ(move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0);
+ self->must_unmount = true;
+
+ fd_pidfd_mnt = openat(-EBADF, self->template, O_RDONLY | O_NOCTTY | O_CLOEXEC);
+ ASSERT_GE(fd_pidfd_mnt, 0);
+
+ ASSERT_GE(fstat(fd_tree, &self->st2), 0);
+ ASSERT_EQ(ioctl(fd_pidfd_mnt, FS_IOC_GETVERSION, &self->gen2), 0);
+
+ ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino);
+ ASSERT_TRUE(self->gen1 == self->gen2);
+
+ ASSERT_EQ(close(fd_tree), 0);
+ ASSERT_EQ(close(fd_pidfd_mnt), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/pidfd_file_handle_test.c b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c
new file mode 100644
index 000000000000..439b9c6c0457
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c
@@ -0,0 +1,503 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <poll.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <linux/kcmp.h>
+#include <sys/stat.h>
+
+#include "pidfd.h"
+#include "../kselftest_harness.h"
+
+FIXTURE(file_handle)
+{
+ pid_t pid;
+ int pidfd;
+
+ pid_t child_pid1;
+ int child_pidfd1;
+
+ pid_t child_pid2;
+ int child_pidfd2;
+
+ pid_t child_pid3;
+ int child_pidfd3;
+};
+
+FIXTURE_SETUP(file_handle)
+{
+ int ret;
+ int ipc_sockets[2];
+ char c;
+
+ self->pid = getpid();
+ self->pidfd = sys_pidfd_open(self->pid, 0);
+ ASSERT_GE(self->pidfd, 0);
+
+ ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ EXPECT_EQ(ret, 0);
+
+ self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER);
+ EXPECT_GE(self->child_pid1, 0);
+
+ if (self->child_pid1 == 0) {
+ close(ipc_sockets[0]);
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+ _exit(EXIT_FAILURE);
+
+ close(ipc_sockets[1]);
+
+ pause();
+ _exit(EXIT_SUCCESS);
+ }
+
+ close(ipc_sockets[1]);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ close(ipc_sockets[0]);
+
+ ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ EXPECT_EQ(ret, 0);
+
+ self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID);
+ EXPECT_GE(self->child_pid2, 0);
+
+ if (self->child_pid2 == 0) {
+ close(ipc_sockets[0]);
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+ _exit(EXIT_FAILURE);
+
+ close(ipc_sockets[1]);
+
+ pause();
+ _exit(EXIT_SUCCESS);
+ }
+
+ close(ipc_sockets[1]);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ close(ipc_sockets[0]);
+
+ ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ EXPECT_EQ(ret, 0);
+
+ self->child_pid3 = create_child(&self->child_pidfd3, CLONE_NEWUSER | CLONE_NEWPID);
+ EXPECT_GE(self->child_pid3, 0);
+
+ if (self->child_pid3 == 0) {
+ close(ipc_sockets[0]);
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+ _exit(EXIT_FAILURE);
+
+ close(ipc_sockets[1]);
+
+ pause();
+ _exit(EXIT_SUCCESS);
+ }
+
+ close(ipc_sockets[1]);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ close(ipc_sockets[0]);
+}
+
+FIXTURE_TEARDOWN(file_handle)
+{
+ EXPECT_EQ(close(self->pidfd), 0);
+
+ EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd1, SIGKILL, NULL, 0), 0);
+ if (self->child_pidfd1 >= 0)
+ EXPECT_EQ(0, close(self->child_pidfd1));
+
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0);
+
+ EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd2, SIGKILL, NULL, 0), 0);
+ if (self->child_pidfd2 >= 0)
+ EXPECT_EQ(0, close(self->child_pidfd2));
+
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0);
+
+ if (self->child_pidfd3 >= 0) {
+ EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd3, SIGKILL, NULL, 0), 0);
+ EXPECT_EQ(0, close(self->child_pidfd3));
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+ }
+}
+
+/*
+ * Test that we can decode a pidfs file handle in the same pid
+ * namespace.
+ */
+TEST_F(file_handle, file_handle_same_pidns)
+{
+ int mnt_id;
+ struct file_handle *fh;
+ int pidfd = -EBADF;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd1, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd1, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, O_CLOEXEC);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, O_NONBLOCK);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ free(fh);
+}
+
+/*
+ * Test that we can decode a pidfs file handle from a child pid
+ * namespace.
+ */
+TEST_F(file_handle, file_handle_child_pidns)
+{
+ int mnt_id;
+ struct file_handle *fh;
+ int pidfd = -EBADF;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, O_CLOEXEC);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, O_NONBLOCK);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ free(fh);
+}
+
+/*
+ * Test that we fail to decode a pidfs file handle from an ancestor
+ * child pid namespace.
+ */
+TEST_F(file_handle, file_handle_foreign_pidns)
+{
+ int mnt_id;
+ struct file_handle *fh;
+ pid_t pid;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->pidfd, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(setns(self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ if (pidfd >= 0) {
+ TH_LOG("Managed to open pidfd outside of the caller's pid namespace hierarchy");
+ _exit(1);
+ }
+ _exit(0);
+ }
+
+ ASSERT_EQ(wait_for_pid(pid), 0);
+
+ free(fh);
+}
+
+/*
+ * Test that we can decode a pidfs file handle of a process that has
+ * exited but not been reaped.
+ */
+TEST_F(file_handle, pid_has_exited)
+{
+ int mnt_id, pidfd, child_pidfd3;
+ struct file_handle *fh;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ child_pidfd3 = self->child_pidfd3;
+ self->child_pidfd3 = -EBADF;
+ EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0);
+ EXPECT_EQ(close(child_pidfd3), 0);
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED | WNOWAIT), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+}
+
+/*
+ * Test that we fail to decode a pidfs file handle of a process that has
+ * already been reaped.
+ */
+TEST_F(file_handle, pid_has_been_reaped)
+{
+ int mnt_id, pidfd, child_pidfd3;
+ struct file_handle *fh;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+
+ child_pidfd3 = self->child_pidfd3;
+ self->child_pidfd3 = -EBADF;
+ EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0);
+ EXPECT_EQ(close(child_pidfd3), 0);
+ EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_LT(pidfd, 0);
+}
+
+/*
+ * Test valid flags to open a pidfd file handle. Note, that
+ * PIDFD_NONBLOCK is defined as O_NONBLOCK and O_NONBLOCK is an alias to
+ * O_NDELAY. Also note that PIDFD_THREAD is an alias for O_EXCL.
+ */
+TEST_F(file_handle, open_by_handle_at_valid_flags)
+{
+ int mnt_id;
+ struct file_handle *fh;
+ int pidfd = -EBADF;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh,
+ O_RDONLY |
+ O_WRONLY |
+ O_RDWR |
+ O_NONBLOCK |
+ O_NDELAY |
+ O_CLOEXEC |
+ O_EXCL);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+}
+
+/*
+ * Test that invalid flags passed to open a pidfd file handle are
+ * rejected.
+ */
+TEST_F(file_handle, open_by_handle_at_invalid_flags)
+{
+ int mnt_id;
+ struct file_handle *fh;
+ int pidfd = -EBADF;
+ static const struct invalid_pidfs_file_handle_flags {
+ int oflag;
+ const char *oflag_name;
+ } invalid_pidfs_file_handle_flags[] = {
+ { FASYNC, "FASYNC" },
+ { O_CREAT, "O_CREAT" },
+ { O_NOCTTY, "O_NOCTTY" },
+ { O_CREAT, "O_CREAT" },
+ { O_TRUNC, "O_TRUNC" },
+ { O_APPEND, "O_APPEND" },
+ { O_SYNC, "O_SYNC" },
+ { O_DSYNC, "O_DSYNC" },
+ { O_DIRECT, "O_DIRECT" },
+ { O_DIRECTORY, "O_DIRECTORY" },
+ { O_NOFOLLOW, "O_NOFOLLOW" },
+ { O_NOATIME, "O_NOATIME" },
+ { O_PATH, "O_PATH" },
+ { O_TMPFILE, "O_TMPFILE" },
+ /*
+ * O_LARGEFILE is added implicitly by
+ * open_by_handle_at() so pidfs simply masks it off.
+ */
+ };
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+ for (int i = 0; i < ARRAY_SIZE(invalid_pidfs_file_handle_flags); i++) {
+ pidfd = open_by_handle_at(self->pidfd, fh, invalid_pidfs_file_handle_flags[i].oflag);
+ ASSERT_LT(pidfd, 0) {
+ TH_LOG("open_by_handle_at() succeeded with invalid flags: %s", invalid_pidfs_file_handle_flags[i].oflag_name);
+ }
+ }
+}
+
+/* Test that lookup fails. */
+TEST_F(file_handle, lookup_must_fail)
+{
+ int mnt_id;
+ struct file_handle *fh;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, AT_EMPTY_PATH), 0);
+ ASSERT_EQ(errno, ENOTDIR);
+ ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, 0), 0);
+ ASSERT_EQ(errno, ENOTDIR);
+}
+
+#ifndef AT_HANDLE_CONNECTABLE
+#define AT_HANDLE_CONNECTABLE 0x002
+#endif
+
+/*
+ * Test that AT_HANDLE_CONNECTABLE is rejected. Connectable file handles
+ * don't make sense for pidfs. Note that currently AT_HANDLE_CONNECTABLE
+ * is rejected because it is incompatible with AT_EMPTY_PATH which is
+ * required with pidfds as we don't support lookup.
+ */
+TEST_F(file_handle, invalid_name_to_handle_at_flags)
+{
+ int mnt_id;
+ struct file_handle *fh;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_NE(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_CONNECTABLE), 0);
+}
+
+#ifndef AT_HANDLE_FID
+#define AT_HANDLE_FID 0x200
+#endif
+
+/*
+ * Test that a request with AT_HANDLE_FID always leads to decodable file
+ * handle as pidfs always provides export operations.
+ */
+TEST_F(file_handle, valid_name_to_handle_at_flags)
+{
+ int mnt_id, pidfd;
+ struct file_handle *fh;
+ struct stat st1, st2;
+
+ fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ ASSERT_NE(fh, NULL);
+ memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+ fh->handle_bytes = MAX_HANDLE_SZ;
+
+ ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_FID), 0);
+
+ ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+ pidfd = open_by_handle_at(self->pidfd, fh, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_EQ(fstat(pidfd, &st2), 0);
+ ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+ ASSERT_EQ(close(pidfd), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c
index 7c2a4349170a..222f8131283b 100644
--- a/tools/testing/selftests/pidfd/pidfd_setns_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c
@@ -19,7 +19,6 @@
#include <linux/ioctl.h>
#include "pidfd.h"
-#include "../clone3/clone3_selftests.h"
#include "../kselftest_harness.h"
#ifndef PIDFS_IOCTL_MAGIC
@@ -118,22 +117,6 @@ FIXTURE(current_nsset)
int child_pidfd_derived_nsfds2[PIDFD_NS_MAX];
};
-static int sys_waitid(int which, pid_t pid, int options)
-{
- return syscall(__NR_waitid, which, pid, NULL, options, NULL);
-}
-
-pid_t create_child(int *pidfd, unsigned flags)
-{
- struct __clone_args args = {
- .flags = CLONE_PIDFD | flags,
- .exit_signal = SIGCHLD,
- .pidfd = ptr_to_u64(pidfd),
- };
-
- return sys_clone3(&args, sizeof(struct clone_args));
-}
-
static bool switch_timens(void)
{
int fd, ret;
@@ -150,28 +133,6 @@ static bool switch_timens(void)
return ret == 0;
}
-static ssize_t read_nointr(int fd, void *buf, size_t count)
-{
- ssize_t ret;
-
- do {
- ret = read(fd, buf, count);
- } while (ret < 0 && errno == EINTR);
-
- return ret;
-}
-
-static ssize_t write_nointr(int fd, const void *buf, size_t count)
-{
- ssize_t ret;
-
- do {
- ret = write(fd, buf, count);
- } while (ret < 0 && errno == EINTR);
-
- return ret;
-}
-
FIXTURE_SETUP(current_nsset)
{
int i, proc_fd, ret;
@@ -229,7 +190,7 @@ FIXTURE_SETUP(current_nsset)
_exit(EXIT_SUCCESS);
}
- ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED | WNOWAIT), 0);
self->pidfd = sys_pidfd_open(self->pid, 0);
EXPECT_GE(self->pidfd, 0) {
@@ -432,9 +393,9 @@ FIXTURE_TEARDOWN(current_nsset)
EXPECT_EQ(0, close(self->child_pidfd1));
if (self->child_pidfd2 >= 0)
EXPECT_EQ(0, close(self->child_pidfd2));
- ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0);
- ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0);
- ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0);
}
static int preserve_ns(const int pid, const char *ns)
diff --git a/tools/testing/selftests/pidfd/pidfd_wait.c b/tools/testing/selftests/pidfd/pidfd_wait.c
index 0dcb8365ddc3..1e2d49751cde 100644
--- a/tools/testing/selftests/pidfd/pidfd_wait.c
+++ b/tools/testing/selftests/pidfd/pidfd_wait.c
@@ -26,22 +26,11 @@
#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__)
#endif
-static pid_t sys_clone3(struct clone_args *args)
-{
- return syscall(__NR_clone3, args, sizeof(struct clone_args));
-}
-
-static int sys_waitid(int which, pid_t pid, siginfo_t *info, int options,
- struct rusage *ru)
-{
- return syscall(__NR_waitid, which, pid, info, options, ru);
-}
-
TEST(wait_simple)
{
int pidfd = -1;
pid_t parent_tid = -1;
- struct clone_args args = {
+ struct __clone_args args = {
.parent_tid = ptr_to_u64(&parent_tid),
.pidfd = ptr_to_u64(&pidfd),
.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
@@ -55,7 +44,7 @@ TEST(wait_simple)
pidfd = open("/proc/self", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
ASSERT_GE(pidfd, 0);
- pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_NE(pid, 0);
EXPECT_EQ(close(pidfd), 0);
pidfd = -1;
@@ -63,18 +52,18 @@ TEST(wait_simple)
pidfd = open("/dev/null", O_RDONLY | O_CLOEXEC);
ASSERT_GE(pidfd, 0);
- pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_NE(pid, 0);
EXPECT_EQ(close(pidfd), 0);
pidfd = -1;
- pid = sys_clone3(&args);
+ pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
if (pid == 0)
exit(EXIT_SUCCESS);
- pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_GE(pid, 0);
ASSERT_EQ(WIFEXITED(info.si_status), true);
ASSERT_EQ(WEXITSTATUS(info.si_status), 0);
@@ -89,7 +78,7 @@ TEST(wait_states)
{
int pidfd = -1;
pid_t parent_tid = -1;
- struct clone_args args = {
+ struct __clone_args args = {
.parent_tid = ptr_to_u64(&parent_tid),
.pidfd = ptr_to_u64(&pidfd),
.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
@@ -102,7 +91,7 @@ TEST(wait_states)
};
ASSERT_EQ(pipe(pfd), 0);
- pid = sys_clone3(&args);
+ pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
if (pid == 0) {
@@ -117,28 +106,28 @@ TEST(wait_states)
}
close(pfd[0]);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_STOPPED);
ASSERT_EQ(info.si_pid, parent_tid);
ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED), 0);
ASSERT_EQ(write(pfd[1], "C", 1), 1);
close(pfd[1]);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_CONTINUED);
ASSERT_EQ(info.si_pid, parent_tid);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_STOPPED);
ASSERT_EQ(info.si_pid, parent_tid);
ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_KILLED);
ASSERT_EQ(info.si_pid, parent_tid);
@@ -151,7 +140,7 @@ TEST(wait_nonblock)
int pidfd;
unsigned int flags = 0;
pid_t parent_tid = -1;
- struct clone_args args = {
+ struct __clone_args args = {
.parent_tid = ptr_to_u64(&parent_tid),
.flags = CLONE_PARENT_SETTID,
.exit_signal = SIGCHLD,
@@ -173,12 +162,12 @@ TEST(wait_nonblock)
SKIP(return, "Skipping PIDFD_NONBLOCK test");
}
- ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_LT(ret, 0);
ASSERT_EQ(errno, ECHILD);
EXPECT_EQ(close(pidfd), 0);
- pid = sys_clone3(&args);
+ pid = sys_clone3(&args, sizeof(args));
ASSERT_GE(pid, 0);
if (pid == 0) {
@@ -201,7 +190,7 @@ TEST(wait_nonblock)
* Callers need to see EAGAIN/EWOULDBLOCK with non-blocking pidfd when
* child processes exist but none have exited.
*/
- ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
ASSERT_LT(ret, 0);
ASSERT_EQ(errno, EAGAIN);
@@ -210,19 +199,19 @@ TEST(wait_nonblock)
* WNOHANG raised explicitly when child processes exist but none have
* exited.
*/
- ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG, NULL);
+ ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG);
ASSERT_EQ(ret, 0);
ASSERT_EQ(fcntl(pidfd, F_SETFL, (flags & ~O_NONBLOCK)), 0);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_STOPPED);
ASSERT_EQ(info.si_pid, parent_tid);
ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
- ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
+ ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0);
ASSERT_EQ(info.si_signo, SIGCHLD);
ASSERT_EQ(info.si_code, CLD_EXITED);
ASSERT_EQ(info.si_pid, parent_tid);