16 files changed, 1110 insertions, 183 deletions
diff --git a/fs/fhandle.c b/fs/fhandle.c
index ec9145047dfc..3e092ae6d142 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -187,17 +187,6 @@ static int get_path_from_fd(int fd, struct path *root)
 	return 0;
 }
 
-enum handle_to_path_flags {
-	HANDLE_CHECK_PERMS   = (1 << 0),
-	HANDLE_CHECK_SUBTREE = (1 << 1),
-};
-
-struct handle_to_path_ctx {
-	struct path root;
-	enum handle_to_path_flags flags;
-	unsigned int fh_flags;
-};
-
 static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
 {
 	struct handle_to_path_ctx *ctx = context;
@@ -261,50 +250,55 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path,
 {
 	int handle_dwords;
 	struct vfsmount *mnt = ctx->root.mnt;
+	struct dentry *dentry;
 
 	/* change the handle size to multiple of sizeof(u32) */
 	handle_dwords = handle->handle_bytes >> 2;
-	path->dentry = exportfs_decode_fh_raw(mnt,
-					  (struct fid *)handle->f_handle,
-					  handle_dwords, handle->handle_type,
-					  ctx->fh_flags,
-					  vfs_dentry_acceptable, ctx);
-	if (IS_ERR_OR_NULL(path->dentry)) {
-		if (path->dentry == ERR_PTR(-ENOMEM))
+	dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle,
+					handle_dwords, handle->handle_type,
+					ctx->fh_flags, vfs_dentry_acceptable,
+					ctx);
+	if (IS_ERR_OR_NULL(dentry)) {
+		if (dentry == ERR_PTR(-ENOMEM))
 			return -ENOMEM;
 		return -ESTALE;
 	}
+	path->dentry = dentry;
 	path->mnt = mntget(mnt);
 	return 0;
 }
 
-/*
- * Allow relaxed permissions of file handles if the caller has the
- * ability to mount the filesystem or create a bind-mount of the
- * provided @mountdirfd.
- *
- * In both cases the caller may be able to get an unobstructed way to
- * the encoded file handle. If the caller is only able to create a
- * bind-mount we need to verify that there are no locked mounts on top
- * of it that could prevent us from getting to the encoded file.
- *
- * In principle, locked mounts can prevent the caller from mounting the
- * filesystem but that only applies to procfs and sysfs neither of which
- * support decoding file handles.
- */
-static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
-				 unsigned int o_flags)
+static inline int may_decode_fh(struct handle_to_path_ctx *ctx,
+				unsigned int o_flags)
 {
 	struct path *root = &ctx->root;
 
+	if (capable(CAP_DAC_READ_SEARCH))
+		return 0;
+
 	/*
-	 * Restrict to O_DIRECTORY to provide a deterministic API that avoids a
-	 * confusing api in the face of disconnected non-dir dentries.
+	 * Allow relaxed permissions of file handles if the caller has
+	 * the ability to mount the filesystem or create a bind-mount of
+	 * the provided @mountdirfd.
+	 *
+	 * In both cases the caller may be able to get an unobstructed
+	 * way to the encoded file handle. If the caller is only able to
+	 * create a bind-mount we need to verify that there are no
+	 * locked mounts on top of it that could prevent us from getting
+	 * to the encoded file.
+	 *
+	 * In principle, locked mounts can prevent the caller from
+	 * mounting the filesystem but that only applies to procfs and
+	 * sysfs neither of which support decoding file handles.
+	 *
+	 * Restrict to O_DIRECTORY to provide a deterministic API that
+	 * avoids a confusing api in the face of disconnected non-dir
+	 * dentries.
 	 *
 	 * There's only one dentry for each directory inode (VFS rule)...
 	 */
 	if (!(o_flags & O_DIRECTORY))
-		return false;
+		return -EPERM;
 
 	if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
 		ctx->flags = HANDLE_CHECK_PERMS;
@@ -314,14 +308,14 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
 		 !has_locked_children(real_mount(root->mnt), root->dentry))
 		ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
 	else
-		return false;
+		return -EPERM;
 
 	/* Are we able to override DAC permissions? */
 	if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH))
-		return false;
+		return -EPERM;
 
 	ctx->fh_flags = EXPORT_FH_DIR_ONLY;
-	return true;
+	return 0;
 }
 
 static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
@@ -331,15 +325,19 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
 	struct file_handle f_handle;
 	struct file_handle *handle = NULL;
 	struct handle_to_path_ctx ctx = {};
+	const struct export_operations *eops;
 
 	retval = get_path_from_fd(mountdirfd, &ctx.root);
 	if (retval)
 		goto out_err;
 
-	if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) {
-		retval = -EPERM;
+	eops = ctx.root.mnt->mnt_sb->s_export_op;
+	if (eops && eops->permission)
+		retval = eops->permission(&ctx, o_flags);
+	else
+		retval = may_decode_fh(&ctx, o_flags);
+	if (retval)
 		goto out_path;
-	}
 
 	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
 		retval = -EFAULT;
@@ -398,29 +396,28 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
 			   int open_flag)
 {
 	long retval = 0;
-	struct path path;
+	struct path path __free(path_put) = {};
 	struct file *file;
-	int fd;
+	const struct export_operations *eops;
 
 	retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
 	if (retval)
 		return retval;
 
-	fd = get_unused_fd_flags(open_flag);
-	if (fd < 0) {
-		path_put(&path);
+	CLASS(get_unused_fd, fd)(O_CLOEXEC);
+	if (fd < 0)
 		return fd;
-	}
-	file = file_open_root(&path, "", open_flag, 0);
-	if (IS_ERR(file)) {
-		put_unused_fd(fd);
-		retval =  PTR_ERR(file);
-	} else {
-		retval = fd;
-		fd_install(fd, file);
-	}
-	path_put(&path);
-	return retval;
+
+	eops = path.mnt->mnt_sb->s_export_op;
+	if (eops->open)
+		file = eops->open(&path, open_flag);
+	else
+		file = file_open_root(&path, "", open_flag, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	fd_install(fd, file);
+	return take_fd(fd);
 }
 
 /**
diff --git a/fs/libfs.c b/fs/libfs.c
index 748ac5923154..2890a9c4a414 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -673,6 +673,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
 	s->s_blocksize_bits = PAGE_SHIFT;
 	s->s_magic = ctx->magic;
 	s->s_op = ctx->ops ?: &simple_super_operations;
+	s->s_export_op = ctx->eops;
 	s->s_xattr = ctx->xattr;
 	s->s_time_gran = 1;
 	root = new_inode(s);
diff --git a/fs/namespace.c b/fs/namespace.c
index 851af89e8d72..64deda6f5b2c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -32,6 +32,7 @@
 #include <linux/fs_context.h>
 #include <linux/shmem_fs.h>
 #include <linux/mnt_idmapping.h>
+#include <linux/pidfs.h>
 #include <linux/nospec.h>
 
 #include "pnode.h"
@@ -2736,8 +2737,13 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
 	if (IS_MNT_UNBINDABLE(old))
 		return mnt;
 
-	if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
-		return mnt;
+	if (!check_mnt(old)) {
+		const struct dentry_operations *d_op = old_path->dentry->d_op;
+
+		if (d_op != &ns_dentry_operations &&
+		    d_op != &pidfs_dentry_operations)
+			return mnt;
+	}
 
 	if (!recurse && has_locked_children(old, old_path->dentry))
 		return mnt;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 618abb1fa1b8..049352f973de 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/anon_inodes.h>
+#include <linux/exportfs.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/cgroup.h>
@@ -23,6 +24,97 @@
 #include "internal.h"
 #include "mount.h"
 
+static struct rb_root pidfs_ino_tree = RB_ROOT;
+
+#if BITS_PER_LONG == 32
+static inline unsigned long pidfs_ino(u64 ino)
+{
+	return lower_32_bits(ino);
+}
+
+/* On 32 bit the generation number are the upper 32 bits. */
+static inline u32 pidfs_gen(u64 ino)
+{
+	return upper_32_bits(ino);
+}
+
+#else
+
+/* On 64 bit simply return ino. */
+static inline unsigned long pidfs_ino(u64 ino)
+{
+	return ino;
+}
+
+/* On 64 bit the generation number is 0. */
+static inline u32 pidfs_gen(u64 ino)
+{
+	return 0;
+}
+#endif
+
+static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
+{
+	struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
+	struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
+	u64 pid_ino_a = pid_a->ino;
+	u64 pid_ino_b = pid_b->ino;
+
+	if (pid_ino_a < pid_ino_b)
+		return -1;
+	if (pid_ino_a > pid_ino_b)
+		return 1;
+	return 0;
+}
+
+void pidfs_add_pid(struct pid *pid)
+{
+	static u64 pidfs_ino_nr = 2;
+
+	/*
+	 * On 64 bit nothing special happens. The 64bit number assigned
+	 * to struct pid is the inode number.
+	 *
+	 * On 32 bit the 64 bit number assigned to struct pid is split
+	 * into two 32 bit numbers. The lower 32 bits are used as the
+	 * inode number and the upper 32 bits are used as the inode
+	 * generation number.
+	 *
+	 * On 32 bit pidfs_ino() will return the lower 32 bit. When
+	 * pidfs_ino() returns zero a wrap around happened. When a
+	 * wraparound happens the 64 bit number will be incremented by 2
+	 * so inode numbering starts at 2 again.
+	 *
+	 * On 64 bit comparing two pidfds is as simple as comparing
+	 * inode numbers.
+	 *
+	 * When a wraparound happens on 32 bit multiple pidfds with the
+	 * same inode number are likely to exist (This isn't a problem
+	 * since before pidfs pidfds used the anonymous inode meaning
+	 * all pidfds had the same inode number.). Userspace can
+	 * reconstruct the 64 bit identifier by retrieving both the
+	 * inode number and the inode generation number to compare or
+	 * use file handles.
+	 */
+	if (pidfs_ino(pidfs_ino_nr) == 0)
+		pidfs_ino_nr += 2;
+
+	pid->ino = pidfs_ino_nr;
+	pid->stashed = NULL;
+	pidfs_ino_nr++;
+
+	write_seqcount_begin(&pidmap_lock_seq);
+	rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
+	write_seqcount_end(&pidmap_lock_seq);
+}
+
+void pidfs_remove_pid(struct pid *pid)
+{
+	write_seqcount_begin(&pidmap_lock_seq);
+	rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
+	write_seqcount_end(&pidmap_lock_seq);
+}
+
 #ifdef CONFIG_PROC_FS
 /**
  * pidfd_show_fdinfo - print information about a pidfd
@@ -190,6 +282,27 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
 	return 0;
 }
 
+static bool pidfs_ioctl_valid(unsigned int cmd)
+{
+	switch (cmd) {
+	case FS_IOC_GETVERSION:
+	case PIDFD_GET_CGROUP_NAMESPACE:
+	case PIDFD_GET_INFO:
+	case PIDFD_GET_IPC_NAMESPACE:
+	case PIDFD_GET_MNT_NAMESPACE:
+	case PIDFD_GET_NET_NAMESPACE:
+	case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
+	case PIDFD_GET_TIME_NAMESPACE:
+	case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
+	case PIDFD_GET_UTS_NAMESPACE:
+	case PIDFD_GET_USER_NAMESPACE:
+	case PIDFD_GET_PID_NAMESPACE:
+		return true;
+	}
+
+	return false;
+}
+
 static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct task_struct *task __free(put_task) = NULL;
@@ -198,6 +311,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	struct ns_common *ns_common = NULL;
 	struct pid_namespace *pid_ns;
 
+	if (!pidfs_ioctl_valid(cmd))
+		return -ENOIOCTLCMD;
+
+	if (cmd == FS_IOC_GETVERSION) {
+		if (!arg)
+			return -EINVAL;
+
+		__u32 __user *argp = (__u32 __user *)arg;
+		return put_user(file_inode(file)->i_generation, argp);
+	}
+
 	task = get_pid_task(pid, PIDTYPE_PID);
 	if (!task)
 		return -ESRCH;
@@ -318,40 +442,6 @@ struct pid *pidfd_pid(const struct file *file)
 
 static struct vfsmount *pidfs_mnt __ro_after_init;
 
-#if BITS_PER_LONG == 32
-/*
- * Provide a fallback mechanism for 32-bit systems so processes remain
- * reliably comparable by inode number even on those systems.
- */
-static DEFINE_IDA(pidfd_inum_ida);
-
-static int pidfs_inum(struct pid *pid, unsigned long *ino)
-{
-	int ret;
-
-	ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
-			      UINT_MAX, GFP_ATOMIC);
-	if (ret < 0)
-		return -ENOSPC;
-
-	*ino = ret;
-	return 0;
-}
-
-static inline void pidfs_free_inum(unsigned long ino)
-{
-	if (ino > 0)
-		ida_free(&pidfd_inum_ida, ino);
-}
-#else
-static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
-{
-	*ino = pid->ino;
-	return 0;
-}
-#define pidfs_free_inum(ino) ((void)(ino))
-#endif
-
 /*
  * The vfs falls back to simple_setattr() if i_op->setattr() isn't
  * implemented. Let's reject it completely until we have a clean
@@ -403,7 +493,6 @@ static void pidfs_evict_inode(struct inode *inode)
 
 	clear_inode(inode);
 	put_pid(pid);
-	pidfs_free_inum(inode->i_ino);
 }
 
 static const struct super_operations pidfs_sops = {
@@ -421,25 +510,149 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
 	return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
 }
 
-static const struct dentry_operations pidfs_dentry_operations = {
+const struct dentry_operations pidfs_dentry_operations = {
 	.d_delete	= always_delete_dentry,
 	.d_dname	= pidfs_dname,
 	.d_prune	= stashed_dentry_prune,
 };
 
+static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+			   struct inode *parent)
+{
+	const struct pid *pid = inode->i_private;
+
+	if (*max_len < 2) {
+		*max_len = 2;
+		return FILEID_INVALID;
+	}
+
+	*max_len = 2;
+	*(u64 *)fh = pid->ino;
+	return FILEID_KERNFS;
+}
+
+static int pidfs_ino_find(const void *key, const struct rb_node *node)
+{
+	const u64 pid_ino = *(u64 *)key;
+	const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
+
+	if (pid_ino < pid->ino)
+		return -1;
+	if (pid_ino > pid->ino)
+		return 1;
+	return 0;
+}
+
+/* Find a struct pid based on the inode number. */
+static struct pid *pidfs_ino_get_pid(u64 ino)
+{
+	struct pid *pid;
+	struct rb_node *node;
+	unsigned int seq;
+
+	guard(rcu)();
+	do {
+		seq = read_seqcount_begin(&pidmap_lock_seq);
+		node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
+		if (node)
+			break;
+	} while (read_seqcount_retry(&pidmap_lock_seq, seq));
+
+	if (!node)
+		return NULL;
+
+	pid = rb_entry(node, struct pid, pidfs_node);
+
+	/* Within our pid namespace hierarchy? */
+	if (pid_vnr(pid) == 0)
+		return NULL;
+
+	return get_pid(pid);
+}
+
+static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
+					 struct fid *fid, int fh_len,
+					 int fh_type)
+{
+	int ret;
+	u64 pid_ino;
+	struct path path;
+	struct pid *pid;
+
+	if (fh_len < 2)
+		return NULL;
+
+	switch (fh_type) {
+	case FILEID_KERNFS:
+		pid_ino = *(u64 *)fid;
+		break;
+	default:
+		return NULL;
+	}
+
+	pid = pidfs_ino_get_pid(pid_ino);
+	if (!pid)
+		return NULL;
+
+	ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	mntput(path.mnt);
+	return path.dentry;
+}
+
+/*
+ * Make sure that we reject any nonsensical flags that users pass via
+ * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and
+ * PIDFD_NONBLOCK as O_NONBLOCK.
+ */
+#define VALID_FILE_HANDLE_OPEN_FLAGS \
+	(O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
+
+static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
+				   unsigned int oflags)
+{
+	if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE))
+		return -EINVAL;
+
+	/*
+	 * pidfd_ino_get_pid() will verify that the struct pid is part
+	 * of the caller's pid namespace hierarchy. No further
+	 * permission checks are needed.
+	 */
+	return 0;
+}
+
+static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
+{
+	/*
+	 * Clear O_LARGEFILE as open_by_handle_at() forces it and raise
+	 * O_RDWR as pidfds always are.
+	 */
+	oflags &= ~O_LARGEFILE;
+	return dentry_open(path, oflags | O_RDWR, current_cred());
+}
+
+static const struct export_operations pidfs_export_operations = {
+	.encode_fh	= pidfs_encode_fh,
+	.fh_to_dentry	= pidfs_fh_to_dentry,
+	.open		= pidfs_export_open,
+	.permission	= pidfs_export_permission,
+};
+
 static int pidfs_init_inode(struct inode *inode, void *data)
 {
+	const struct pid *pid = data;
+
 	inode->i_private = data;
 	inode->i_flags |= S_PRIVATE;
 	inode->i_mode |= S_IRWXU;
 	inode->i_op = &pidfs_inode_operations;
 	inode->i_fop = &pidfs_file_operations;
-	/*
-	 * Inode numbering for pidfs start at RESERVED_PIDS + 1. This
-	 * avoids collisions with the root inode which is 1 for pseudo
-	 * filesystems.
-	 */
-	return pidfs_inum(data, &inode->i_ino);
+	inode->i_ino = pidfs_ino(pid->ino);
+	inode->i_generation = pidfs_gen(pid->ino);
+	return 0;
 }
 
 static void pidfs_put_data(void *data)
@@ -462,6 +675,7 @@ static int pidfs_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 
 	ctx->ops = &pidfs_sops;
+	ctx->eops = &pidfs_export_operations;
 	ctx->dops = &pidfs_dentry_operations;
 	fc->s_fs_info = (void *)&pidfs_stashed_ops;
 	return 0;
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 4cc8801e50e3..a087606ace19 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -3,6 +3,7 @@
 #define LINUX_EXPORTFS_H 1
 
 #include <linux/types.h>
+#include <linux/path.h>
 
 struct dentry;
 struct iattr;
@@ -156,6 +157,17 @@ struct fid {
 	};
 };
 
+enum handle_to_path_flags {
+	HANDLE_CHECK_PERMS   = (1 << 0),
+	HANDLE_CHECK_SUBTREE = (1 << 1),
+};
+
+struct handle_to_path_ctx {
+	struct path root;
+	enum handle_to_path_flags flags;
+	unsigned int fh_flags;
+};
+
 #define EXPORT_FH_CONNECTABLE	0x1 /* Encode file handle with parent */
 #define EXPORT_FH_FID		0x2 /* File handle may be non-decodeable */
 #define EXPORT_FH_DIR_ONLY	0x4 /* Only decode file handle for a directory */
@@ -225,6 +237,12 @@ struct fid {
  *    is also a directory.  In the event that it cannot be found, or storage
  *    space cannot be allocated, a %ERR_PTR should be returned.
  *
+ * permission:
+ *    Allow filesystems to specify a custom permission function.
+ *
+ * open:
+ *    Allow filesystems to specify a custom open function.
+ *
  * commit_metadata:
  *    @commit_metadata should commit metadata changes to stable storage.
  *
@@ -251,6 +269,8 @@ struct export_operations {
 			  bool write, u32 *device_generation);
 	int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
 			     int nr_iomaps, struct iattr *iattr);
+	int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags);
+	struct file * (*open)(struct path *path, unsigned int oflags);
 #define	EXPORT_OP_NOWCC			(0x1) /* don't collect v3 wcc data */
 #define	EXPORT_OP_NOSUBTREECHK		(0x2) /* no subtree checking */
 #define	EXPORT_OP_CLOSE_BEFORE_UNLINK	(0x4) /* close files before unlink */
diff --git a/include/linux/pid.h b/include/linux/pid.h
index a3aad9b4074c..fe575fcdb4af 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -59,6 +59,7 @@ struct pid
 	spinlock_t lock;
 	struct dentry *stashed;
 	u64 ino;
+	struct rb_node pidfs_node;
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
 	struct hlist_head inodes;
@@ -68,6 +69,7 @@ struct pid
 	struct upid numbers[];
 };
 
+extern seqcount_spinlock_t pidmap_lock_seq;
 extern struct pid init_struct_pid;
 
 struct file;
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 75bdf9807802..7c830d0dec9a 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -4,5 +4,8 @@
 
 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
 void __init pidfs_init(void);
+void pidfs_add_pid(struct pid *pid);
+void pidfs_remove_pid(struct pid *pid);
+extern const struct dentry_operations pidfs_dentry_operations;
 
 #endif /* _LINUX_PID_FS_H */
diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
index 730f77381d55..2503f7625d65 100644
--- a/include/linux/pseudo_fs.h
+++ b/include/linux/pseudo_fs.h
@@ -5,6 +5,7 @@
 
 struct pseudo_fs_context {
 	const struct super_operations *ops;
+	const struct export_operations *eops;
 	const struct xattr_handler * const *xattr;
 	const struct dentry_operations *dops;
 	unsigned long magic;
diff --git a/kernel/pid.c b/kernel/pid.c
index 115448e89c3e..aa2a7d4da455 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,6 +43,7 @@
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 #include <linux/pidfs.h>
+#include <linux/seqlock.h>
 #include <net/sock.h>
 #include <uapi/linux/pidfd.h>
 
@@ -64,11 +65,6 @@ int pid_max = PID_MAX_DEFAULT;
 
 int pid_max_min = RESERVED_PIDS + 1;
 int pid_max_max = PID_MAX_LIMIT;
-/*
- * Pseudo filesystems start inode numbering after one. We use Reserved
- * PIDs as a natural offset.
- */
-static u64 pidfs_ino = RESERVED_PIDS;
 
 /*
  * PID-map pages start out as NULL, they get allocated upon
@@ -108,6 +104,7 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
  */
 
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
 
 void put_pid(struct pid *pid)
 {
@@ -158,6 +155,7 @@ void free_pid(struct pid *pid)
 
 		idr_remove(&ns->idr, upid->nr);
 	}
+	pidfs_remove_pid(pid);
 	spin_unlock_irqrestore(&pidmap_lock, flags);
 
 	call_rcu(&pid->rcu, delayed_put_pid);
@@ -273,22 +271,24 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 	INIT_HLIST_HEAD(&pid->inodes);
 
 	upid = pid->numbers + ns->level;
+	idr_preload(GFP_KERNEL);
 	spin_lock_irq(&pidmap_lock);
 	if (!(ns->pid_allocated & PIDNS_ADDING))
 		goto out_unlock;
-	pid->stashed = NULL;
-	pid->ino = ++pidfs_ino;
+	pidfs_add_pid(pid);
 	for ( ; upid >= pid->numbers; --upid) {
 		/* Make the PID visible to find_pid_ns. */
 		idr_replace(&upid->ns->idr, pid, upid->nr);
 		upid->ns->pid_allocated++;
 	}
 	spin_unlock_irq(&pidmap_lock);
+	idr_preload_end();
 
 	return pid;
 
 out_unlock:
 	spin_unlock_irq(&pidmap_lock);
+	idr_preload_end();
 	put_pid_ns(ns);
 
 out_free:
diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore
index 973198a3ec3d..bf92481f925c 100644
--- a/tools/testing/selftests/pidfd/.gitignore
+++ b/tools/testing/selftests/pidfd/.gitignore
@@ -6,3 +6,5 @@ pidfd_wait
 pidfd_fdinfo_test
 pidfd_getfd_test
 pidfd_setns_test
+pidfd_file_handle_test
+pidfd_bind_mount
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
index d731e3e76d5b..301343a11b62 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -2,7 +2,8 @@
 CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall
 
 TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
-	pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test
+	pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \
+	pidfd_file_handle_test pidfd_bind_mount
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
index 88d6830ee004..28a471c88c51 100644
--- a/tools/testing/selftests/pidfd/pidfd.h
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -17,6 +17,7 @@
 #include <sys/wait.h>
 
 #include "../kselftest.h"
+#include "../clone3/clone3_selftests.h"
 
 #ifndef P_PIDFD
 #define P_PIDFD 3
@@ -68,6 +69,11 @@
 #define PIDFD_SKIP 3
 #define PIDFD_XFAIL 4
 
+static inline int sys_waitid(int which, pid_t pid, siginfo_t *info, int options)
+{
+	return syscall(__NR_waitid, which, pid, info, options, NULL);
+}
+
 static inline int wait_for_pid(pid_t pid)
 {
 	int status, ret;
@@ -114,4 +120,37 @@ static inline int sys_memfd_create(const char *name, unsigned int flags)
 	return syscall(__NR_memfd_create, name, flags);
 }
 
+static inline pid_t create_child(int *pidfd, unsigned flags)
+{
+	struct __clone_args args = {
+		.flags		= CLONE_PIDFD | flags,
+		.exit_signal	= SIGCHLD,
+		.pidfd		= ptr_to_u64(pidfd),
+	};
+
+	return sys_clone3(&args, sizeof(struct __clone_args));
+}
+
+static inline ssize_t read_nointr(int fd, void *buf, size_t count)
+{
+	ssize_t ret;
+
+	do {
+		ret = read(fd, buf, count);
+	} while (ret < 0 && errno == EINTR);
+
+	return ret;
+}
+
+static inline ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+	ssize_t ret;
+
+	do {
+		ret = write(fd, buf, count);
+	} while (ret < 0 && errno == EINTR);
+
+	return ret;
+}
+
 #endif /* __PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_bind_mount.c b/tools/testing/selftests/pidfd/pidfd_bind_mount.c
new file mode 100644
index 000000000000..7822dd080258
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_bind_mount.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest_harness.h"
+
+#ifndef __NR_open_tree
+	#if defined __alpha__
+		#define __NR_open_tree 538
+	#elif defined _MIPS_SIM
+		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
+			#define __NR_open_tree 4428
+		#endif
+		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
+			#define __NR_open_tree 6428
+		#endif
+		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
+			#define __NR_open_tree 5428
+		#endif
+	#elif defined __ia64__
+		#define __NR_open_tree (428 + 1024)
+	#else
+		#define __NR_open_tree 428
+	#endif
+#endif
+
+#ifndef __NR_move_mount
+	#if defined __alpha__
+		#define __NR_move_mount 539
+	#elif defined _MIPS_SIM
+		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
+			#define __NR_move_mount 4429
+		#endif
+		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
+			#define __NR_move_mount 6429
+		#endif
+		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
+			#define __NR_move_mount 5429
+		#endif
+	#elif defined __ia64__
+		#define __NR_move_mount (428 + 1024)
+	#else
+		#define __NR_move_mount 429
+	#endif
+#endif
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
+#endif
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */
+#endif
+
+static inline int sys_move_mount(int from_dfd, const char *from_pathname,
+                                 int to_dfd, const char *to_pathname,
+                                 unsigned int flags)
+{
+        return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
+                       to_pathname, flags);
+}
+
+#ifndef OPEN_TREE_CLONE
+#define OPEN_TREE_CLONE 1
+#endif
+
+#ifndef OPEN_TREE_CLOEXEC
+#define OPEN_TREE_CLOEXEC O_CLOEXEC
+#endif
+
+#ifndef AT_RECURSIVE
+#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
+#endif
+
+static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
+{
+	return syscall(__NR_open_tree, dfd, filename, flags);
+}
+
+FIXTURE(pidfd_bind_mount) {
+	char template[PATH_MAX];
+	int fd_tmp;
+	int pidfd;
+	struct stat st1;
+	struct stat st2;
+	__u32 gen1;
+	__u32 gen2;
+	bool must_unmount;
+};
+
+FIXTURE_SETUP(pidfd_bind_mount)
+{
+	self->fd_tmp = -EBADF;
+	self->must_unmount = false;
+	ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+	ASSERT_LE(snprintf(self->template, PATH_MAX, "%s", P_tmpdir "/pidfd_bind_mount_XXXXXX"), PATH_MAX);
+	self->fd_tmp = mkstemp(self->template);
+	ASSERT_GE(self->fd_tmp, 0);
+	self->pidfd = sys_pidfd_open(getpid(), 0);
+	ASSERT_GE(self->pidfd, 0);
+	ASSERT_GE(fstat(self->pidfd, &self->st1), 0);
+	ASSERT_EQ(ioctl(self->pidfd, FS_IOC_GETVERSION, &self->gen1), 0);
+}
+
+FIXTURE_TEARDOWN(pidfd_bind_mount)
+{
+	ASSERT_EQ(close(self->fd_tmp), 0);
+	if (self->must_unmount)
+		ASSERT_EQ(umount2(self->template, 0), 0);
+	ASSERT_EQ(unlink(self->template), 0);
+}
+
+/*
+ * Test that a detached mount can be created for a pidfd and then
+ * attached to the filesystem hierarchy.
+ */
+TEST_F(pidfd_bind_mount, bind_mount)
+{
+	int fd_tree;
+
+	fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
+	ASSERT_GE(fd_tree, 0);
+
+	ASSERT_EQ(move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0);
+	self->must_unmount = true;
+
+	ASSERT_EQ(close(fd_tree), 0);
+}
+
+/* Test that a pidfd can be reopened through procfs. */
+TEST_F(pidfd_bind_mount, reopen)
+{
+	int pidfd;
+	char proc_path[PATH_MAX];
+
+	sprintf(proc_path, "/proc/self/fd/%d", self->pidfd);
+	pidfd = open(proc_path, O_RDONLY | O_NOCTTY | O_CLOEXEC);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_GE(fstat(self->pidfd, &self->st2), 0);
+	ASSERT_EQ(ioctl(self->pidfd, FS_IOC_GETVERSION, &self->gen2), 0);
+
+	ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino);
+	ASSERT_TRUE(self->gen1 == self->gen2);
+
+	ASSERT_EQ(close(pidfd), 0);
+}
+
+/*
+ * Test that a detached mount can be created for a pidfd and then
+ * attached to the filesystem hierarchy and reopened.
+ */
+TEST_F(pidfd_bind_mount, bind_mount_reopen)
+{
+	int fd_tree, fd_pidfd_mnt;
+
+	fd_tree = sys_open_tree(self->pidfd, "", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
+	ASSERT_GE(fd_tree, 0);
+
+	ASSERT_EQ(move_mount(fd_tree, "", self->fd_tmp, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0);
+	self->must_unmount = true;
+
+	fd_pidfd_mnt = openat(-EBADF, self->template, O_RDONLY | O_NOCTTY | O_CLOEXEC);
+	ASSERT_GE(fd_pidfd_mnt, 0);
+
+	ASSERT_GE(fstat(fd_tree, &self->st2), 0);
+	ASSERT_EQ(ioctl(fd_pidfd_mnt, FS_IOC_GETVERSION, &self->gen2), 0);
+
+	ASSERT_TRUE(self->st1.st_dev == self->st2.st_dev && self->st1.st_ino == self->st2.st_ino);
+	ASSERT_TRUE(self->gen1 == self->gen2);
+
+	ASSERT_EQ(close(fd_tree), 0);
+	ASSERT_EQ(close(fd_pidfd_mnt), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/pidfd_file_handle_test.c b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c
new file mode 100644
index 000000000000..439b9c6c0457
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_file_handle_test.c
@@ -0,0 +1,503 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <poll.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <linux/kcmp.h>
+#include <sys/stat.h>
+
+#include "pidfd.h"
+#include "../kselftest_harness.h"
+
+FIXTURE(file_handle)
+{
+	pid_t pid;
+	int pidfd;
+
+	pid_t child_pid1;
+	int child_pidfd1;
+
+	pid_t child_pid2;
+	int child_pidfd2;
+
+	pid_t child_pid3;
+	int child_pidfd3;
+};
+
+FIXTURE_SETUP(file_handle)
+{
+	int ret;
+	int ipc_sockets[2];
+	char c;
+
+	self->pid = getpid();
+	self->pidfd = sys_pidfd_open(self->pid, 0);
+	ASSERT_GE(self->pidfd, 0);
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	EXPECT_EQ(ret, 0);
+
+	self->child_pid1 = create_child(&self->child_pidfd1, CLONE_NEWUSER);
+	EXPECT_GE(self->child_pid1, 0);
+
+	if (self->child_pid1 == 0) {
+		close(ipc_sockets[0]);
+
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+			_exit(EXIT_FAILURE);
+
+		close(ipc_sockets[1]);
+
+		pause();
+		_exit(EXIT_SUCCESS);
+	}
+
+	close(ipc_sockets[1]);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	close(ipc_sockets[0]);
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	EXPECT_EQ(ret, 0);
+
+	self->child_pid2 = create_child(&self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID);
+	EXPECT_GE(self->child_pid2, 0);
+
+	if (self->child_pid2 == 0) {
+		close(ipc_sockets[0]);
+
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+			_exit(EXIT_FAILURE);
+
+		close(ipc_sockets[1]);
+
+		pause();
+		_exit(EXIT_SUCCESS);
+	}
+
+	close(ipc_sockets[1]);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	close(ipc_sockets[0]);
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	EXPECT_EQ(ret, 0);
+
+	self->child_pid3 = create_child(&self->child_pidfd3, CLONE_NEWUSER | CLONE_NEWPID);
+	EXPECT_GE(self->child_pid3, 0);
+
+	if (self->child_pid3 == 0) {
+		close(ipc_sockets[0]);
+
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+			_exit(EXIT_FAILURE);
+
+		close(ipc_sockets[1]);
+
+		pause();
+		_exit(EXIT_SUCCESS);
+	}
+
+	close(ipc_sockets[1]);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	close(ipc_sockets[0]);
+}
+
+FIXTURE_TEARDOWN(file_handle)
+{
+	EXPECT_EQ(close(self->pidfd), 0);
+
+	EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd1, SIGKILL, NULL, 0), 0);
+	if (self->child_pidfd1 >= 0)
+		EXPECT_EQ(0, close(self->child_pidfd1));
+
+	EXPECT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0);
+
+	EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd2, SIGKILL, NULL, 0), 0);
+	if (self->child_pidfd2 >= 0)
+		EXPECT_EQ(0, close(self->child_pidfd2));
+
+	EXPECT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0);
+
+	if (self->child_pidfd3 >= 0) {
+		EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd3, SIGKILL, NULL, 0), 0);
+		EXPECT_EQ(0, close(self->child_pidfd3));
+		EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+	}
+}
+
+/*
+ * Test that we can decode a pidfs file handle in the same pid
+ * namespace.
+ */
+TEST_F(file_handle, file_handle_same_pidns)
+{
+	int mnt_id;
+	struct file_handle *fh;
+	int pidfd = -EBADF;
+	struct stat st1, st2;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd1, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd1, &st1), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, O_CLOEXEC);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, O_NONBLOCK);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	free(fh);
+}
+
+/*
+ * Test that we can decode a pidfs file handle from a child pid
+ * namespace.
+ */
+TEST_F(file_handle, file_handle_child_pidns)
+{
+	int mnt_id;
+	struct file_handle *fh;
+	int pidfd = -EBADF;
+	struct stat st1, st2;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, O_CLOEXEC);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, O_NONBLOCK);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	free(fh);
+}
+
+/*
+ * Test that we fail to decode a pidfs file handle from an ancestor
+ * child pid namespace.
+ */
+TEST_F(file_handle, file_handle_foreign_pidns)
+{
+	int mnt_id;
+	struct file_handle *fh;
+	pid_t pid;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->pidfd, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	ASSERT_EQ(setns(self->child_pidfd2, CLONE_NEWUSER | CLONE_NEWPID), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int pidfd = open_by_handle_at(self->pidfd, fh, 0);
+		if (pidfd >= 0) {
+			TH_LOG("Managed to open pidfd outside of the caller's pid namespace hierarchy");
+			_exit(1);
+		}
+		_exit(0);
+	}
+
+	ASSERT_EQ(wait_for_pid(pid), 0);
+
+	free(fh);
+}
+
+/*
+ * Test that we can decode a pidfs file handle of a process that has
+ * exited but not been reaped.
+ */
+TEST_F(file_handle, pid_has_exited)
+{
+	int mnt_id, pidfd, child_pidfd3;
+	struct file_handle *fh;
+	struct stat st1, st2;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	child_pidfd3 = self->child_pidfd3;
+	self->child_pidfd3 = -EBADF;
+	EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0);
+	EXPECT_EQ(close(child_pidfd3), 0);
+	EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED | WNOWAIT), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_GE(pidfd, 0);
+
+	EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+}
+
+/*
+ * Test that we fail to decode a pidfs file handle of a process that has
+ * already been reaped.
+ */
+TEST_F(file_handle, pid_has_been_reaped)
+{
+	int mnt_id, pidfd, child_pidfd3;
+	struct file_handle *fh;
+	struct stat st1, st2;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd3, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd3, &st1), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+
+	child_pidfd3 = self->child_pidfd3;
+	self->child_pidfd3 = -EBADF;
+	EXPECT_EQ(sys_pidfd_send_signal(child_pidfd3, SIGKILL, NULL, 0), 0);
+	EXPECT_EQ(close(child_pidfd3), 0);
+	EXPECT_EQ(sys_waitid(P_PID, self->child_pid3, NULL, WEXITED), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_LT(pidfd, 0);
+}
+
+/*
+ * Test valid flags to open a pidfd file handle. Note, that
+ * PIDFD_NONBLOCK is defined as O_NONBLOCK and O_NONBLOCK is an alias to
+ * O_NDELAY. Also note that PIDFD_THREAD is an alias for O_EXCL.
+ */
+TEST_F(file_handle, open_by_handle_at_valid_flags)
+{
+	int mnt_id;
+	struct file_handle *fh;
+	int pidfd = -EBADF;
+	struct stat st1, st2;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh,
+				  O_RDONLY |
+				  O_WRONLY |
+				  O_RDWR |
+				  O_NONBLOCK |
+				  O_NDELAY |
+				  O_CLOEXEC |
+				  O_EXCL);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+}
+
+/*
+ * Test that invalid flags passed to open a pidfd file handle are
+ * rejected.
+ */
+TEST_F(file_handle, open_by_handle_at_invalid_flags)
+{
+	int mnt_id;
+	struct file_handle *fh;
+	int pidfd = -EBADF;
+	static const struct invalid_pidfs_file_handle_flags {
+		int oflag;
+		const char *oflag_name;
+	}  invalid_pidfs_file_handle_flags[] = {
+		{ FASYNC,	"FASYNC"	},
+		{ O_CREAT,	"O_CREAT"	},
+		{ O_NOCTTY,	"O_NOCTTY"	},
+		{ O_CREAT,	"O_CREAT"	},
+		{ O_TRUNC,	"O_TRUNC"	},
+		{ O_APPEND,	"O_APPEND"	},
+		{ O_SYNC,	"O_SYNC"	},
+		{ O_DSYNC,	"O_DSYNC"	},
+		{ O_DIRECT,	"O_DIRECT"	},
+		{ O_DIRECTORY,	"O_DIRECTORY"	},
+		{ O_NOFOLLOW,	"O_NOFOLLOW"	},
+		{ O_NOATIME,	"O_NOATIME"	},
+		{ O_PATH,	"O_PATH"	},
+		{ O_TMPFILE,	"O_TMPFILE"	},
+		/*
+		 * O_LARGEFILE is added implicitly by
+		 * open_by_handle_at() so pidfs simply masks it off.
+		 */
+	};
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
+
+	for (int i = 0; i < ARRAY_SIZE(invalid_pidfs_file_handle_flags); i++) {
+		pidfd = open_by_handle_at(self->pidfd, fh, invalid_pidfs_file_handle_flags[i].oflag);
+		ASSERT_LT(pidfd, 0) {
+			TH_LOG("open_by_handle_at() succeeded with invalid flags: %s", invalid_pidfs_file_handle_flags[i].oflag_name);
+		}
+	}
+}
+
+/* Test that lookup fails. */
+TEST_F(file_handle, lookup_must_fail)
+{
+	int mnt_id;
+	struct file_handle *fh;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, AT_EMPTY_PATH), 0);
+	ASSERT_EQ(errno, ENOTDIR);
+	ASSERT_NE(name_to_handle_at(self->child_pidfd2, "lookup-is-not-possible-with-pidfs", fh, &mnt_id, 0), 0);
+	ASSERT_EQ(errno, ENOTDIR);
+}
+
+#ifndef AT_HANDLE_CONNECTABLE
+#define AT_HANDLE_CONNECTABLE 0x002
+#endif
+
+/*
+ * Test that AT_HANDLE_CONNECTABLE is rejected. Connectable file handles
+ * don't make sense for pidfs. Note that currently AT_HANDLE_CONNECTABLE
+ * is rejected because it is incompatible with AT_EMPTY_PATH which is
+ * required with pidfds as we don't support lookup.
+ */
+TEST_F(file_handle, invalid_name_to_handle_at_flags)
+{
+	int mnt_id;
+	struct file_handle *fh;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_NE(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_CONNECTABLE), 0);
+}
+
+#ifndef AT_HANDLE_FID
+#define AT_HANDLE_FID 0x200
+#endif
+
+/*
+ * Test that a request with AT_HANDLE_FID always leads to decodable file
+ * handle as pidfs always provides export operations.
+ */
+TEST_F(file_handle, valid_name_to_handle_at_flags)
+{
+	int mnt_id, pidfd;
+	struct file_handle *fh;
+	struct stat st1, st2;
+
+	fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+	memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
+	fh->handle_bytes = MAX_HANDLE_SZ;
+
+	ASSERT_EQ(name_to_handle_at(self->child_pidfd2, "", fh, &mnt_id, AT_EMPTY_PATH | AT_HANDLE_FID), 0);
+
+	ASSERT_EQ(fstat(self->child_pidfd2, &st1), 0);
+
+	pidfd = open_by_handle_at(self->pidfd, fh, 0);
+	ASSERT_GE(pidfd, 0);
+
+	ASSERT_EQ(fstat(pidfd, &st2), 0);
+	ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
+
+	ASSERT_EQ(close(pidfd), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c
index 7c2a4349170a..222f8131283b 100644
--- a/tools/testing/selftests/pidfd/pidfd_setns_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c
@@ -19,7 +19,6 @@
 #include <linux/ioctl.h>
 
 #include "pidfd.h"
-#include "../clone3/clone3_selftests.h"
 #include "../kselftest_harness.h"
 
 #ifndef PIDFS_IOCTL_MAGIC
@@ -118,22 +117,6 @@ FIXTURE(current_nsset)
 	int child_pidfd_derived_nsfds2[PIDFD_NS_MAX];
 };
 
-static int sys_waitid(int which, pid_t pid, int options)
-{
-	return syscall(__NR_waitid, which, pid, NULL, options, NULL);
-}
-
-pid_t create_child(int *pidfd, unsigned flags)
-{
-	struct __clone_args args = {
-		.flags		= CLONE_PIDFD | flags,
-		.exit_signal	= SIGCHLD,
-		.pidfd		= ptr_to_u64(pidfd),
-	};
-
-	return sys_clone3(&args, sizeof(struct clone_args));
-}
-
 static bool switch_timens(void)
 {
 	int fd, ret;
@@ -150,28 +133,6 @@ static bool switch_timens(void)
 	return ret == 0;
 }
 
-static ssize_t read_nointr(int fd, void *buf, size_t count)
-{
-	ssize_t ret;
-
-	do {
-		ret = read(fd, buf, count);
-	} while (ret < 0 && errno == EINTR);
-
-	return ret;
-}
-
-static ssize_t write_nointr(int fd, const void *buf, size_t count)
-{
-	ssize_t ret;
-
-	do {
-		ret = write(fd, buf, count);
-	} while (ret < 0 && errno == EINTR);
-
-	return ret;
-}
-
 FIXTURE_SETUP(current_nsset)
 {
 	int i, proc_fd, ret;
@@ -229,7 +190,7 @@ FIXTURE_SETUP(current_nsset)
 		_exit(EXIT_SUCCESS);
 	}
 
-	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
+	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED | WNOWAIT), 0);
 
 	self->pidfd = sys_pidfd_open(self->pid, 0);
 	EXPECT_GE(self->pidfd, 0) {
@@ -432,9 +393,9 @@ FIXTURE_TEARDOWN(current_nsset)
 		EXPECT_EQ(0, close(self->child_pidfd1));
 	if (self->child_pidfd2 >= 0)
 		EXPECT_EQ(0, close(self->child_pidfd2));
-	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0);
-	ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0);
-	ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0);
+	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, NULL, WEXITED), 0);
+	ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, NULL, WEXITED), 0);
+	ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0);
 }
 
 static int preserve_ns(const int pid, const char *ns)
diff --git a/tools/testing/selftests/pidfd/pidfd_wait.c b/tools/testing/selftests/pidfd/pidfd_wait.c
index 0dcb8365ddc3..1e2d49751cde 100644
--- a/tools/testing/selftests/pidfd/pidfd_wait.c
+++ b/tools/testing/selftests/pidfd/pidfd_wait.c
@@ -26,22 +26,11 @@
 #define SKIP(s, ...)	XFAIL(s, ##__VA_ARGS__)
 #endif
 
-static pid_t sys_clone3(struct clone_args *args)
-{
-	return syscall(__NR_clone3, args, sizeof(struct clone_args));
-}
-
-static int sys_waitid(int which, pid_t pid, siginfo_t *info, int options,
-		      struct rusage *ru)
-{
-	return syscall(__NR_waitid, which, pid, info, options, ru);
-}
-
 TEST(wait_simple)
 {
 	int pidfd = -1;
 	pid_t parent_tid = -1;
-	struct clone_args args = {
+	struct __clone_args args = {
 		.parent_tid = ptr_to_u64(&parent_tid),
 		.pidfd = ptr_to_u64(&pidfd),
 		.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
@@ -55,7 +44,7 @@ TEST(wait_simple)
 	pidfd = open("/proc/self", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
 	ASSERT_GE(pidfd, 0);
 
-	pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
 	ASSERT_NE(pid, 0);
 	EXPECT_EQ(close(pidfd), 0);
 	pidfd = -1;
@@ -63,18 +52,18 @@ TEST(wait_simple)
 	pidfd = open("/dev/null", O_RDONLY | O_CLOEXEC);
 	ASSERT_GE(pidfd, 0);
 
-	pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
 	ASSERT_NE(pid, 0);
 	EXPECT_EQ(close(pidfd), 0);
 	pidfd = -1;
 
-	pid = sys_clone3(&args);
+	pid = sys_clone3(&args, sizeof(args));
 	ASSERT_GE(pid, 0);
 
 	if (pid == 0)
 		exit(EXIT_SUCCESS);
 
-	pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
 	ASSERT_GE(pid, 0);
 	ASSERT_EQ(WIFEXITED(info.si_status), true);
 	ASSERT_EQ(WEXITSTATUS(info.si_status), 0);
@@ -89,7 +78,7 @@ TEST(wait_states)
 {
 	int pidfd = -1;
 	pid_t parent_tid = -1;
-	struct clone_args args = {
+	struct __clone_args args = {
 		.parent_tid = ptr_to_u64(&parent_tid),
 		.pidfd = ptr_to_u64(&pidfd),
 		.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
@@ -102,7 +91,7 @@ TEST(wait_states)
 	};
 
 	ASSERT_EQ(pipe(pfd), 0);
-	pid = sys_clone3(&args);
+	pid = sys_clone3(&args, sizeof(args));
 	ASSERT_GE(pid, 0);
 
 	if (pid == 0) {
@@ -117,28 +106,28 @@ TEST(wait_states)
 	}
 
 	close(pfd[0]);
-	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0);
 	ASSERT_EQ(info.si_signo, SIGCHLD);
 	ASSERT_EQ(info.si_code, CLD_STOPPED);
 	ASSERT_EQ(info.si_pid, parent_tid);
 
 	ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
 
-	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED, NULL), 0);
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED), 0);
 	ASSERT_EQ(write(pfd[1], "C", 1), 1);
 	close(pfd[1]);
 	ASSERT_EQ(info.si_signo, SIGCHLD);
 	ASSERT_EQ(info.si_code, CLD_CONTINUED);
 	ASSERT_EQ(info.si_pid, parent_tid);
 
-	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED, NULL), 0);
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED), 0);
 	ASSERT_EQ(info.si_signo, SIGCHLD);
 	ASSERT_EQ(info.si_code, CLD_STOPPED);
 	ASSERT_EQ(info.si_pid, parent_tid);
 
 	ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0);
 
-	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0);
 	ASSERT_EQ(info.si_signo, SIGCHLD);
 	ASSERT_EQ(info.si_code, CLD_KILLED);
 	ASSERT_EQ(info.si_pid, parent_tid);
@@ -151,7 +140,7 @@ TEST(wait_nonblock)
 	int pidfd;
 	unsigned int flags = 0;
 	pid_t parent_tid = -1;
-	struct clone_args args = {
+	struct __clone_args args = {
 		.parent_tid = ptr_to_u64(&parent_tid),
 		.flags = CLONE_PARENT_SETTID,
 		.exit_signal = SIGCHLD,
@@ -173,12 +162,12 @@ TEST(wait_nonblock)
 		SKIP(return, "Skipping PIDFD_NONBLOCK test");
 	}
 
-	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
 	ASSERT_LT(ret, 0);
 	ASSERT_EQ(errno, ECHILD);
 	EXPECT_EQ(close(pidfd), 0);
 
-	pid = sys_clone3(&args);
+	pid = sys_clone3(&args, sizeof(args));
 	ASSERT_GE(pid, 0);
 
 	if (pid == 0) {
@@ -201,7 +190,7 @@ TEST(wait_nonblock)
 	 * Callers need to see EAGAIN/EWOULDBLOCK with non-blocking pidfd when
 	 * child processes exist but none have exited.
 	 */
-	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED);
 	ASSERT_LT(ret, 0);
 	ASSERT_EQ(errno, EAGAIN);
 
@@ -210,19 +199,19 @@ TEST(wait_nonblock)
 	 * WNOHANG raised explicitly when child processes exist but none have
 	 * exited.
 	 */
-	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG, NULL);
+	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG);
 	ASSERT_EQ(ret, 0);
 
 	ASSERT_EQ(fcntl(pidfd, F_SETFL, (flags & ~O_NONBLOCK)), 0);
 
-	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED), 0);
 	ASSERT_EQ(info.si_signo, SIGCHLD);
 	ASSERT_EQ(info.si_code, CLD_STOPPED);
 	ASSERT_EQ(info.si_pid, parent_tid);
 
 	ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
 
-	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED), 0);
 	ASSERT_EQ(info.si_signo, SIGCHLD);
 	ASSERT_EQ(info.si_code, CLD_EXITED);
 	ASSERT_EQ(info.si_pid, parent_tid);