summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-07-05 17:00:56 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-07-05 17:00:56 -0700
commite5f76a2e0e84ca2a215ecbf6feae88780d055c56 (patch)
tree50c64139c9bdc9dda7efe58e14ad2a81d13114d0
parentc96e6dabfbdb241e32b3c588dbfa1ccb87d2c95a (diff)
parent296990deb389c7da21c78030376ba244dc1badf5 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull mnt namespace updates from Eric Biederman: "A big break-through came during this development cycle as a way was found to maintain the existing umount -l semantics while allowing for optimizations that improve the performance. That is represented by the first change in this series moving the reparenting of mounts into their own pass. This has allowed addressing the horrific performance of umount -l on a carefully crafted tree of mounts with locks held (0.06s vs 60s in my testing). What allowed this was not changing where umounts propagate to while propgating umounts. The next change fixes the case where the order of the mount whose umount are being progated visits a tree where the mounts are stacked upon each other in another order. This is weird but not hard to implement. The final change takes advantage of the unchanging mount propgation tree to skip parts of the mount propgation tree that have already been visited. Yielding a very nice speed up in the worst case. There remains one outstanding question about the semantics of umount -l that I am still discussiong with Ram Pai. In practice that area of the semantics was changed by 1064f874abc0 ("mnt: Tuck mounts under others instead of creating shadow/side mounts.") and no regressions have been reported. Still I intend to finish talking that out with him to ensure there is not something a more intense use of mount propagation in the future will not cause to become significant" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: mnt: Make propagate_umount less slow for overlapping mount propagation trees mnt: In propgate_umount handle visiting mounts in any order mnt: In umount propagation reparent in a separate pass
-rw-r--r--fs/mount.h1
-rw-r--r--fs/namespace.c1
-rw-r--r--fs/pnode.c212
3 files changed, 165 insertions, 49 deletions
diff --git a/fs/mount.h b/fs/mount.h
index bf1fda6eed8f..de45d9e76748 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,6 +58,7 @@ struct mount {
struct mnt_namespace *mnt_ns; /* containing namespace */
struct mountpoint *mnt_mp; /* where is it mounted */
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
+ struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
diff --git a/fs/namespace.c b/fs/namespace.c
index 5a4438445bf7..f70914a859a4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -236,6 +236,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
INIT_HLIST_NODE(&mnt->mnt_mp_list);
+ INIT_LIST_HEAD(&mnt->mnt_umounting);
init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
}
return mnt;
diff --git a/fs/pnode.c b/fs/pnode.c
index 5bc7896d122a..53d411a371ce 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p)
return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
}
+static inline struct mount *last_slave(struct mount *p)
+{
+ return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
+}
+
static inline struct mount *next_slave(struct mount *p)
{
return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
@@ -162,6 +167,19 @@ static struct mount *propagation_next(struct mount *m,
}
}
+static struct mount *skip_propagation_subtree(struct mount *m,
+ struct mount *origin)
+{
+ /*
+ * Advance m such that propagation_next will not return
+ * the slaves of m.
+ */
+ if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ m = last_slave(m);
+
+ return m;
+}
+
static struct mount *next_group(struct mount *m, struct mount *origin)
{
while (1) {
@@ -413,65 +431,104 @@ void propagate_mount_unlock(struct mount *mnt)
}
}
-/*
- * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
- */
-static void mark_umount_candidates(struct mount *mnt)
+static void umount_one(struct mount *mnt, struct list_head *to_umount)
{
- struct mount *parent = mnt->mnt_parent;
- struct mount *m;
-
- BUG_ON(parent == mnt);
-
- for (m = propagation_next(parent, parent); m;
- m = propagation_next(m, parent)) {
- struct mount *child = __lookup_mnt(&m->mnt,
- mnt->mnt_mountpoint);
- if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
- continue;
- if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
- SET_MNT_MARK(child);
- }
- }
+ CLEAR_MNT_MARK(mnt);
+ mnt->mnt.mnt_flags |= MNT_UMOUNT;
+ list_del_init(&mnt->mnt_child);
+ list_del_init(&mnt->mnt_umounting);
+ list_move_tail(&mnt->mnt_list, to_umount);
}
/*
* NOTE: unmounting 'mnt' naturally propagates to all other mounts its
* parent propagates to.
*/
-static void __propagate_umount(struct mount *mnt)
+static bool __propagate_umount(struct mount *mnt,
+ struct list_head *to_umount,
+ struct list_head *to_restore)
{
- struct mount *parent = mnt->mnt_parent;
- struct mount *m;
+ bool progress = false;
+ struct mount *child;
- BUG_ON(parent == mnt);
+ /*
+ * The state of the parent won't change if this mount is
+ * already unmounted or marked as without children.
+ */
+ if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
+ goto out;
- for (m = propagation_next(parent, parent); m;
- m = propagation_next(m, parent)) {
- struct mount *topper;
- struct mount *child = __lookup_mnt(&m->mnt,
- mnt->mnt_mountpoint);
- /*
- * umount the child only if the child has no children
- * and the child is marked safe to unmount.
- */
- if (!child || !IS_MNT_MARKED(child))
+ /* Verify topper is the only grandchild that has not been
+ * speculatively unmounted.
+ */
+ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+ if (child->mnt_mountpoint == mnt->mnt.mnt_root)
continue;
- CLEAR_MNT_MARK(child);
+ if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
+ continue;
+ /* Found a mounted child */
+ goto children;
+ }
- /* If there is exactly one mount covering all of child
- * replace child with that mount.
- */
- topper = find_topper(child);
- if (topper)
- mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
- topper);
+ /* Mark mounts that can be unmounted if not locked */
+ SET_MNT_MARK(mnt);
+ progress = true;
+
+ /* If a mount is without children and not locked umount it. */
+ if (!IS_MNT_LOCKED(mnt)) {
+ umount_one(mnt, to_umount);
+ } else {
+children:
+ list_move_tail(&mnt->mnt_umounting, to_restore);
+ }
+out:
+ return progress;
+}
+
+static void umount_list(struct list_head *to_umount,
+ struct list_head *to_restore)
+{
+ struct mount *mnt, *child, *tmp;
+ list_for_each_entry(mnt, to_umount, mnt_list) {
+ list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
+ /* topper? */
+ if (child->mnt_mountpoint == mnt->mnt.mnt_root)
+ list_move_tail(&child->mnt_umounting, to_restore);
+ else
+ umount_one(child, to_umount);
+ }
+ }
+}
- if (list_empty(&child->mnt_mounts)) {
- list_del_init(&child->mnt_child);
- child->mnt.mnt_flags |= MNT_UMOUNT;
- list_move_tail(&child->mnt_list, &mnt->mnt_list);
+static void restore_mounts(struct list_head *to_restore)
+{
+ /* Restore mounts to a clean working state */
+ while (!list_empty(to_restore)) {
+ struct mount *mnt, *parent;
+ struct mountpoint *mp;
+
+ mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
+ CLEAR_MNT_MARK(mnt);
+ list_del_init(&mnt->mnt_umounting);
+
+ /* Should this mount be reparented? */
+ mp = mnt->mnt_mp;
+ parent = mnt->mnt_parent;
+ while (parent->mnt.mnt_flags & MNT_UMOUNT) {
+ mp = parent->mnt_mp;
+ parent = parent->mnt_parent;
}
+ if (parent != mnt->mnt_parent)
+ mnt_change_mountpoint(parent, mp, mnt);
+ }
+}
+
+static void cleanup_umount_visitations(struct list_head *visited)
+{
+ while (!list_empty(visited)) {
+ struct mount *mnt =
+ list_first_entry(visited, struct mount, mnt_umounting);
+ list_del_init(&mnt->mnt_umounting);
}
}
@@ -485,11 +542,68 @@ static void __propagate_umount(struct mount *mnt)
int propagate_umount(struct list_head *list)
{
struct mount *mnt;
+ LIST_HEAD(to_restore);
+ LIST_HEAD(to_umount);
+ LIST_HEAD(visited);
+
+ /* Find candidates for unmounting */
+ list_for_each_entry_reverse(mnt, list, mnt_list) {
+ struct mount *parent = mnt->mnt_parent;
+ struct mount *m;
+
+ /*
+ * If this mount has already been visited it is known that it's
+ * entire peer group and all of their slaves in the propagation
+ * tree for the mountpoint has already been visited and there is
+ * no need to visit them again.
+ */
+ if (!list_empty(&mnt->mnt_umounting))
+ continue;
+
+ list_add_tail(&mnt->mnt_umounting, &visited);
+ for (m = propagation_next(parent, parent); m;
+ m = propagation_next(m, parent)) {
+ struct mount *child = __lookup_mnt(&m->mnt,
+ mnt->mnt_mountpoint);
+ if (!child)
+ continue;
+
+ if (!list_empty(&child->mnt_umounting)) {
+ /*
+ * If the child has already been visited it is
+ * know that it's entire peer group and all of
+ * their slaves in the propgation tree for the
+ * mountpoint has already been visited and there
+ * is no need to visit this subtree again.
+ */
+ m = skip_propagation_subtree(m, parent);
+ continue;
+ } else if (child->mnt.mnt_flags & MNT_UMOUNT) {
+ /*
+ * We have come accross an partially unmounted
+ * mount in list that has not been visited yet.
+ * Remember it has been visited and continue
+ * about our merry way.
+ */
+ list_add_tail(&child->mnt_umounting, &visited);
+ continue;
+ }
+
+ /* Check the child and parents while progress is made */
+ while (__propagate_umount(child,
+ &to_umount, &to_restore)) {
+ /* Is the parent a umount candidate? */
+ child = child->mnt_parent;
+ if (list_empty(&child->mnt_umounting))
+ break;
+ }
+ }
+ }
- list_for_each_entry_reverse(mnt, list, mnt_list)
- mark_umount_candidates(mnt);
+ umount_list(&to_umount, &to_restore);
+ restore_mounts(&to_restore);
+ cleanup_umount_visitations(&visited);
+ list_splice_tail(&to_umount, list);
- list_for_each_entry(mnt, list, mnt_list)
- __propagate_umount(mnt);
return 0;
}