diff options
-rw-r--r-- | fs/mount.h | 18 | ||||
-rw-r--r-- | fs/namespace.c | 160 | ||||
-rw-r--r-- | fs/nsfs.c | 5 | ||||
-rw-r--r-- | include/linux/rculist.h | 44 | ||||
-rw-r--r-- | samples/vfs/.gitignore | 1 | ||||
-rw-r--r-- | samples/vfs/Makefile | 2 | ||||
-rw-r--r-- | samples/vfs/test-list-all-mounts.c | 235 | ||||
-rw-r--r-- | tools/testing/selftests/filesystems/nsfs/.gitignore (renamed from tools/testing/selftests/nsfs/.gitignore) | 1 | ||||
-rw-r--r-- | tools/testing/selftests/filesystems/nsfs/Makefile (renamed from tools/testing/selftests/nsfs/Makefile) | 4 | ||||
-rw-r--r-- | tools/testing/selftests/filesystems/nsfs/config (renamed from tools/testing/selftests/nsfs/config) | 0 | ||||
-rw-r--r-- | tools/testing/selftests/filesystems/nsfs/iterate_mntns.c | 149 | ||||
-rw-r--r-- | tools/testing/selftests/filesystems/nsfs/owner.c (renamed from tools/testing/selftests/nsfs/owner.c) | 0 | ||||
-rw-r--r-- | tools/testing/selftests/filesystems/nsfs/pidns.c (renamed from tools/testing/selftests/nsfs/pidns.c) | 0 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/pidfd.h | 1 |
14 files changed, 540 insertions, 80 deletions
diff --git a/fs/mount.h b/fs/mount.h index 179f690a0c72..e9f48e563c0f 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -12,11 +12,15 @@ struct mnt_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ - wait_queue_head_t poll; + union { + wait_queue_head_t poll; + struct rcu_head mnt_ns_rcu; + }; u64 event; unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ + struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ } __randomize_layout; @@ -157,15 +161,9 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) } bool has_locked_children(struct mount *mnt, struct dentry *dentry); -struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous); -static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns) -{ - return __lookup_next_mnt_ns(mntns, false); -} -static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns) -{ - return __lookup_next_mnt_ns(mntns, true); -} +struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns, + bool previous); + static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); diff --git a/fs/namespace.c b/fs/namespace.c index 1af8da8e1e97..a382be402f62 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,7 +32,6 @@ #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> -#include <linux/nospec.h> #include "pnode.h" #include "internal.h" @@ -79,8 +78,10 @@ static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ -static 
DEFINE_RWLOCK(mnt_ns_tree_lock); +static DEFINE_SEQLOCK(mnt_ns_tree_lock); + static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ +static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ struct mount_kattr { unsigned int attr_set; @@ -106,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns) -{ - u64 seq_b = ns->seq; - - if (seq < seq_b) - return -1; - if (seq > seq_b) - return 1; - return 0; -} - static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { if (!node) @@ -124,24 +114,53 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); } -static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b) +static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) { struct mnt_namespace *ns_a = node_to_mnt_ns(a); struct mnt_namespace *ns_b = node_to_mnt_ns(b); u64 seq_a = ns_a->seq; + u64 seq_b = ns_b->seq; + + if (seq_a < seq_b) + return -1; + if (seq_a > seq_b) + return 1; + return 0; +} + +static inline void mnt_ns_tree_write_lock(void) +{ + write_seqlock(&mnt_ns_tree_lock); +} - return mnt_ns_cmp(seq_a, ns_b) < 0; +static inline void mnt_ns_tree_write_unlock(void) +{ + write_sequnlock(&mnt_ns_tree_lock); } static void mnt_ns_tree_add(struct mnt_namespace *ns) { - guard(write_lock)(&mnt_ns_tree_lock); - rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less); + struct rb_node *node, *prev; + + mnt_ns_tree_write_lock(); + node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); + /* + * If there's no previous entry simply add it after the + * head and if there is add it after the previous entry. 
+ */ + prev = rb_prev(&ns->mnt_ns_tree_node); + if (!prev) + list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list); + else + list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list); + mnt_ns_tree_write_unlock(); + + WARN_ON_ONCE(node); } static void mnt_ns_release(struct mnt_namespace *ns) { - lockdep_assert_not_held(&mnt_ns_tree_lock); + lockdep_assert_not_held(&mnt_ns_tree_lock.lock); /* keep alive for {list,stat}mount() */ if (refcount_dec_and_test(&ns->passive)) { @@ -151,41 +170,34 @@ static void mnt_ns_release(struct mnt_namespace *ns) } DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) +static void mnt_ns_release_rcu(struct rcu_head *rcu) +{ + mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu)); +} + static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ if (!is_anon_ns(ns)) { - guard(write_lock)(&mnt_ns_tree_lock); + mnt_ns_tree_write_lock(); rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); + list_bidir_del_rcu(&ns->mnt_ns_list); + mnt_ns_tree_write_unlock(); } - mnt_ns_release(ns); + call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); } -/* - * Returns the mount namespace which either has the specified id, or has the - * next smallest id afer the specified one. 
- */ -static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) +static int mnt_ns_find(const void *key, const struct rb_node *node) { - struct rb_node *node = mnt_ns_tree.rb_node; - struct mnt_namespace *ret = NULL; - - lockdep_assert_held(&mnt_ns_tree_lock); + const u64 mnt_ns_id = *(u64 *)key; + const struct mnt_namespace *ns = node_to_mnt_ns(node); - while (node) { - struct mnt_namespace *n = node_to_mnt_ns(node); - - if (mnt_ns_id <= n->seq) { - ret = node_to_mnt_ns(node); - if (mnt_ns_id == n->seq) - break; - node = node->rb_left; - } else { - node = node->rb_right; - } - } - return ret; + if (mnt_ns_id < ns->seq) + return -1; + if (mnt_ns_id > ns->seq) + return 1; + return 0; } /* @@ -195,18 +207,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) * namespace the @namespace_sem must first be acquired. If the namespace has * already shut down before acquiring @namespace_sem, {list,stat}mount() will * see that the mount rbtree of the namespace is empty. + * + * Note the lookup is lockless protected by a sequence counter. We only + * need to guard against false negatives as false positives aren't + * possible. So if we didn't find a mount namespace and the sequence + * counter has changed we need to retry. If the sequence counter is + * still the same we know the search actually failed. 
*/ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { - struct mnt_namespace *ns; + struct mnt_namespace *ns; + struct rb_node *node; + unsigned int seq; - guard(read_lock)(&mnt_ns_tree_lock); - ns = mnt_ns_find_id_at(mnt_ns_id); - if (!ns || ns->seq != mnt_ns_id) - return NULL; + guard(rcu)(); + do { + seq = read_seqbegin(&mnt_ns_tree_lock); + node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); + if (node) + break; + } while (read_seqretry(&mnt_ns_tree_lock, seq)); - refcount_inc(&ns->passive); - return ns; + if (!node) + return NULL; + + /* + * The last reference count is put with RCU delay so we can + * unconditionally acquire a reference here. + */ + ns = node_to_mnt_ns(node); + refcount_inc(&ns->passive); + return ns; } static inline void lock_mount_hash(void) @@ -2063,30 +2094,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) return &mnt->ns; } -struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous) +struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { - guard(read_lock)(&mnt_ns_tree_lock); + guard(rcu)(); + for (;;) { - struct rb_node *node; + struct list_head *list; if (previous) - node = rb_prev(&mntns->mnt_ns_tree_node); + list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)); else - node = rb_next(&mntns->mnt_ns_tree_node); - if (!node) + list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list)); + if (list_is_head(list, &mnt_ns_list)) return ERR_PTR(-ENOENT); - mntns = node_to_mnt_ns(node); - node = &mntns->mnt_ns_tree_node; + mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list); + /* + * The last passive reference count is put with RCU + * delay so accessing the mount namespace is not just + * safe but all relevant members are still valid. + */ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) continue; /* - * Holding mnt_ns_tree_lock prevents the mount namespace from - * being freed but it may well be on it's deathbed. 
We want an - * active reference, not just a passive one here as we're - * persisting the mount namespace. + * We need an active reference count as we're persisting + * the mount namespace and it might already be on its + * deathbed. */ if (!refcount_inc_not_zero(&mntns->ns.count)) continue; @@ -3903,6 +3938,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; + INIT_LIST_HEAD(&new_ns->mnt_ns_list); RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); @@ -3982,7 +4018,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } - mnt_ns_tree_add(new_ns); namespace_unlock(); if (rootmnt) @@ -3990,6 +4025,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (pwdmnt) mntput(pwdmnt); + mnt_ns_tree_add(new_ns); return new_ns; } diff --git a/fs/nsfs.c b/fs/nsfs.c index c675fc40ce2d..663f8656158d 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (usize < MNT_NS_INFO_SIZE_VER0) return -EINVAL; - if (previous) - mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns)); - else - mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns)); + mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous); if (IS_ERR(mnt_ns)) return PTR_ERR(mnt_ns); diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 14dfa6008467..1b11926ddd47 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -30,6 +30,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) * way, we must not access it directly */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) +/* + * Return the ->prev pointer of a list_head in an rcu safe way. Don't + * access it directly. 
+ * + * Any list traversed with list_bidir_prev_rcu() must never use + * list_del_rcu(). Doing so will poison the ->prev pointer that + * list_bidir_prev_rcu() relies on, which will result in segfaults. + * To prevent these segfaults, use list_bidir_del_rcu() instead + * of list_del_rcu(). + */ +#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev))) /** * list_tail_rcu - returns the prev pointer of the head of the list @@ -159,6 +170,39 @@ static inline void list_del_rcu(struct list_head *entry) } /** + * list_bidir_del_rcu - deletes entry from list without re-initialization + * @entry: the element to delete from the list. + * + * In contrast to list_del_rcu() doesn't poison the prev pointer thus + * allowing backwards traversal via list_bidir_prev_rcu(). + * + * Note: list_empty() on entry does not return true after this because + * the entry is in a special undefined state that permits RCU-based + * lockfree reverse traversal. In particular this means that we can not + * poison the forward and backwards pointers that may still be used for + * walking the list. + * + * The caller must take whatever precautions are necessary (such as + * holding appropriate locks) to avoid racing with another list-mutation + * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on + * this same list. However, it is perfectly legal to run concurrently + * with the _rcu list-traversal primitives, such as + * list_for_each_entry_rcu(). + * + * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on + * the same list. + * + * Note that the caller is not permitted to immediately free + * the newly deleted entry. Instead, either synchronize_rcu() + * or call_rcu() must be used to defer freeing until an RCU + * grace period has elapsed. 
+ */ +static inline void list_bidir_del_rcu(struct list_head *entry) +{ + __list_del_entry(entry); +} + +/** * hlist_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * diff --git a/samples/vfs/.gitignore b/samples/vfs/.gitignore index 33a03cffe072..8708341bc082 100644 --- a/samples/vfs/.gitignore +++ b/samples/vfs/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only /test-fsmount +/test-list-all-mounts /test-statx /mountinfo diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile index fb9bb33fdc75..6554b73a75c8 100644 --- a/samples/vfs/Makefile +++ b/samples/vfs/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -userprogs-always-y += test-fsmount test-statx mountinfo +userprogs-always-y += test-fsmount test-statx mountinfo test-list-all-mounts userccflags += -I usr/include diff --git a/samples/vfs/test-list-all-mounts.c b/samples/vfs/test-list-all-mounts.c new file mode 100644 index 000000000000..f372d5aea471 --- /dev/null +++ b/samples/vfs/test-list-all-mounts.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2024 Christian Brauner <brauner@kernel.org> + +#define _GNU_SOURCE +#include <errno.h> +#include <limits.h> +#include <linux/types.h> +#include <stdio.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> + +#include "../../tools/testing/selftests/pidfd/pidfd.h" + +#define die_errno(format, ...) \ + do { \ + fprintf(stderr, "%m | %s: %d: %s: " format "\n", __FILE__, \ + __LINE__, __func__, ##__VA_ARGS__); \ + exit(EXIT_FAILURE); \ + } while (0) + +/* Get the id for a mount namespace */ +#define NS_GET_MNTNS_ID _IO(0xb7, 0x5) +/* Get next mount namespace. */ + +struct mnt_ns_info { + __u32 size; + __u32 nr_mounts; + __u64 mnt_ns_id; +}; + +#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */ + +/* Get information about namespace. */ +#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info) +/* Get next namespace. 
*/ +#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info) +/* Get previous namespace. */ +#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info) + +#define PIDFD_GET_MNT_NAMESPACE _IO(0xFF, 3) + +#ifndef __NR_listmount +#define __NR_listmount 458 +#endif + +#ifndef __NR_statmount +#define __NR_statmount 457 +#endif + +/* @mask bits for statmount(2) */ +#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */ +#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */ +#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */ +#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ +#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ +#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ +#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ +#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ + +#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ + +struct statmount { + __u32 size; + __u32 mnt_opts; + __u64 mask; + __u32 sb_dev_major; + __u32 sb_dev_minor; + __u64 sb_magic; + __u32 sb_flags; + __u32 fs_type; + __u64 mnt_id; + __u64 mnt_parent_id; + __u32 mnt_id_old; + __u32 mnt_parent_id_old; + __u64 mnt_attr; + __u64 mnt_propagation; + __u64 mnt_peer_group; + __u64 mnt_master; + __u64 propagate_from; + __u32 mnt_root; + __u32 mnt_point; + __u64 mnt_ns_id; + __u64 __spare2[49]; + char str[]; +}; + +struct mnt_id_req { + __u32 size; + __u32 spare; + __u64 mnt_id; + __u64 param; + __u64 mnt_ns_id; +}; + +#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */ + +#define LSMT_ROOT 0xffffffffffffffff /* root mount */ + +static int __statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask, + struct statmount *stmnt, size_t bufsize, + unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER1, + .mnt_id = mnt_id, + .param = mask, + .mnt_ns_id = mnt_ns_id, + }; + + return syscall(__NR_statmount, &req, stmnt, bufsize, flags); +} + 
+static struct statmount *sys_statmount(__u64 mnt_id, __u64 mnt_ns_id, + __u64 mask, unsigned int flags) +{ + size_t bufsize = 1 << 15; + struct statmount *stmnt = NULL, *tmp = NULL; + int ret; + + for (;;) { + tmp = realloc(stmnt, bufsize); + if (!tmp) + goto out; + + stmnt = tmp; + ret = __statmount(mnt_id, mnt_ns_id, mask, stmnt, bufsize, flags); + if (!ret) + return stmnt; + + if (errno != EOVERFLOW) + goto out; + + bufsize <<= 1; + if (bufsize >= UINT_MAX / 2) + goto out; + } + +out: + free(stmnt); + return NULL; +} + +static ssize_t sys_listmount(__u64 mnt_id, __u64 last_mnt_id, __u64 mnt_ns_id, + __u64 list[], size_t num, unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER1, + .mnt_id = mnt_id, + .param = last_mnt_id, + .mnt_ns_id = mnt_ns_id, + }; + + return syscall(__NR_listmount, &req, list, num, flags); +} + +int main(int argc, char *argv[]) +{ +#define LISTMNT_BUFFER 10 + __u64 list[LISTMNT_BUFFER], last_mnt_id = 0; + int ret, pidfd, fd_mntns; + struct mnt_ns_info info = {}; + + pidfd = sys_pidfd_open(getpid(), 0); + if (pidfd < 0) + die_errno("pidfd_open failed"); + + fd_mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, 0); + if (fd_mntns < 0) + die_errno("ioctl(PIDFD_GET_MNT_NAMESPACE) failed"); + + ret = ioctl(fd_mntns, NS_MNT_GET_INFO, &info); + if (ret < 0) + die_errno("ioctl(NS_GET_MNTNS_ID) failed"); + + printf("Listing %u mounts for mount namespace %llu\n", + info.nr_mounts, info.mnt_ns_id); + for (;;) { + ssize_t nr_mounts; +next: + nr_mounts = sys_listmount(LSMT_ROOT, last_mnt_id, + info.mnt_ns_id, list, LISTMNT_BUFFER, + 0); + if (nr_mounts <= 0) { + int fd_mntns_next; + + printf("Finished listing %u mounts for mount namespace %llu\n\n", + info.nr_mounts, info.mnt_ns_id); + fd_mntns_next = ioctl(fd_mntns, NS_MNT_GET_NEXT, &info); + if (fd_mntns_next < 0) { + if (errno == ENOENT) { + printf("Finished listing all mount namespaces\n"); + exit(0); + } + die_errno("ioctl(NS_MNT_GET_NEXT) failed"); + } + close(fd_mntns); + 
fd_mntns = fd_mntns_next; + last_mnt_id = 0; + printf("Listing %u mounts for mount namespace %llu\n", + info.nr_mounts, info.mnt_ns_id); + goto next; + } + + for (size_t cur = 0; cur < nr_mounts; cur++) { + struct statmount *stmnt; + + last_mnt_id = list[cur]; + + stmnt = sys_statmount(last_mnt_id, info.mnt_ns_id, + STATMOUNT_SB_BASIC | + STATMOUNT_MNT_BASIC | + STATMOUNT_MNT_ROOT | + STATMOUNT_MNT_POINT | + STATMOUNT_MNT_NS_ID | + STATMOUNT_MNT_OPTS | + STATMOUNT_FS_TYPE, 0); + if (!stmnt) { + printf("Failed to statmount(%llu) in mount namespace(%llu)\n", + last_mnt_id, info.mnt_ns_id); + continue; + } + + printf("mnt_id:\t\t%llu\nmnt_parent_id:\t%llu\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n\n", + stmnt->mnt_id, + stmnt->mnt_parent_id, + stmnt->str + stmnt->fs_type, + stmnt->str + stmnt->mnt_root, + stmnt->str + stmnt->mnt_point, + stmnt->str + stmnt->mnt_opts); + free(stmnt); + } + } + + exit(0); +} diff --git a/tools/testing/selftests/nsfs/.gitignore b/tools/testing/selftests/filesystems/nsfs/.gitignore index ed79ebdf286e..92a8249006d1 100644 --- a/tools/testing/selftests/nsfs/.gitignore +++ b/tools/testing/selftests/filesystems/nsfs/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only owner pidns +iterate_mntns diff --git a/tools/testing/selftests/nsfs/Makefile b/tools/testing/selftests/filesystems/nsfs/Makefile index dd9bd50b7b93..231aaa7dfd95 100644 --- a/tools/testing/selftests/nsfs/Makefile +++ b/tools/testing/selftests/filesystems/nsfs/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -TEST_GEN_PROGS := owner pidns +TEST_GEN_PROGS := owner pidns iterate_mntns CFLAGS := -Wall -Werror -include ../lib.mk +include ../../lib.mk diff --git a/tools/testing/selftests/nsfs/config b/tools/testing/selftests/filesystems/nsfs/config index 598d0a225fc9..598d0a225fc9 100644 --- a/tools/testing/selftests/nsfs/config +++ b/tools/testing/selftests/filesystems/nsfs/config diff --git 
a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c new file mode 100644 index 000000000000..457cf76f3c5f --- /dev/null +++ b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2024 Christian Brauner <brauner@kernel.org> + +#define _GNU_SOURCE +#include <fcntl.h> +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/mount.h> +#include <unistd.h> + +#include "../../kselftest_harness.h" + +#define MNT_NS_COUNT 11 +#define MNT_NS_LAST_INDEX 10 + +struct mnt_ns_info { + __u32 size; + __u32 nr_mounts; + __u64 mnt_ns_id; +}; + +#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */ + +/* Get information about namespace. */ +#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info) +/* Get next namespace. */ +#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info) +/* Get previous namespace. */ +#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info) + +FIXTURE(iterate_mount_namespaces) { + int fd_mnt_ns[MNT_NS_COUNT]; + __u64 mnt_ns_id[MNT_NS_COUNT]; +}; + +FIXTURE_SETUP(iterate_mount_namespaces) +{ + for (int i = 0; i < MNT_NS_COUNT; i++) + self->fd_mnt_ns[i] = -EBADF; + + /* + * Creating a new user namespace lets us guarantee that we only see + * mount namespaces that we did actually create. 
+ */ + ASSERT_EQ(unshare(CLONE_NEWUSER), 0); + + for (int i = 0; i < MNT_NS_COUNT; i++) { + struct mnt_ns_info info = {}; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + self->fd_mnt_ns[i] = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC); + ASSERT_GE(self->fd_mnt_ns[i], 0); + ASSERT_EQ(ioctl(self->fd_mnt_ns[i], NS_MNT_GET_INFO, &info), 0); + self->mnt_ns_id[i] = info.mnt_ns_id; + } +} + +FIXTURE_TEARDOWN(iterate_mount_namespaces) +{ + for (int i = 0; i < MNT_NS_COUNT; i++) { + if (self->fd_mnt_ns[i] < 0) + continue; + ASSERT_EQ(close(self->fd_mnt_ns[i]), 0); + } +} + +TEST_F(iterate_mount_namespaces, iterate_all_forward) +{ + int fd_mnt_ns_cur, count = 0; + + fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[0], F_DUPFD_CLOEXEC); + ASSERT_GE(fd_mnt_ns_cur, 0); + + for (;; count++) { + struct mnt_ns_info info = {}; + int fd_mnt_ns_next; + + fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info); + if (fd_mnt_ns_next < 0 && errno == ENOENT) + break; + ASSERT_GE(fd_mnt_ns_next, 0); + ASSERT_EQ(close(fd_mnt_ns_cur), 0); + fd_mnt_ns_cur = fd_mnt_ns_next; + } + ASSERT_EQ(count, MNT_NS_LAST_INDEX); +} + +TEST_F(iterate_mount_namespaces, iterate_all_backwards) +{ + int fd_mnt_ns_cur, count = 0; + + fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[MNT_NS_LAST_INDEX], F_DUPFD_CLOEXEC); + ASSERT_GE(fd_mnt_ns_cur, 0); + + for (;; count++) { + struct mnt_ns_info info = {}; + int fd_mnt_ns_prev; + + fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info); + if (fd_mnt_ns_prev < 0 && errno == ENOENT) + break; + ASSERT_GE(fd_mnt_ns_prev, 0); + ASSERT_EQ(close(fd_mnt_ns_cur), 0); + fd_mnt_ns_cur = fd_mnt_ns_prev; + } + ASSERT_EQ(count, MNT_NS_LAST_INDEX); +} + +TEST_F(iterate_mount_namespaces, iterate_forward) +{ + int fd_mnt_ns_cur; + + ASSERT_EQ(setns(self->fd_mnt_ns[0], CLONE_NEWNS), 0); + + fd_mnt_ns_cur = self->fd_mnt_ns[0]; + for (int i = 1; i < MNT_NS_COUNT; i++) { + struct mnt_ns_info info = {}; + int fd_mnt_ns_next; + + fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info); + 
ASSERT_GE(fd_mnt_ns_next, 0); + ASSERT_EQ(close(fd_mnt_ns_cur), 0); + fd_mnt_ns_cur = fd_mnt_ns_next; + ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]); + } +} + +TEST_F(iterate_mount_namespaces, iterate_backward) +{ + int fd_mnt_ns_cur; + + ASSERT_EQ(setns(self->fd_mnt_ns[MNT_NS_LAST_INDEX], CLONE_NEWNS), 0); + + fd_mnt_ns_cur = self->fd_mnt_ns[MNT_NS_LAST_INDEX]; + for (int i = MNT_NS_LAST_INDEX - 1; i >= 0; i--) { + struct mnt_ns_info info = {}; + int fd_mnt_ns_prev; + + fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info); + ASSERT_GE(fd_mnt_ns_prev, 0); + ASSERT_EQ(close(fd_mnt_ns_cur), 0); + fd_mnt_ns_cur = fd_mnt_ns_prev; + ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/nsfs/owner.c b/tools/testing/selftests/filesystems/nsfs/owner.c index 96a976c74550..96a976c74550 100644 --- a/tools/testing/selftests/nsfs/owner.c +++ b/tools/testing/selftests/filesystems/nsfs/owner.c diff --git a/tools/testing/selftests/nsfs/pidns.c b/tools/testing/selftests/filesystems/nsfs/pidns.c index e3c772c6a7c7..e3c772c6a7c7 100644 --- a/tools/testing/selftests/nsfs/pidns.c +++ b/tools/testing/selftests/filesystems/nsfs/pidns.c diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index 88d6830ee004..3a96053e52e7 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -12,7 +12,6 @@ #include <stdlib.h> #include <string.h> #include <syscall.h> -#include <sys/mount.h> #include <sys/types.h> #include <sys/wait.h> |