xfs: introduce online scrub freeze

Introduce a new 'online scrub freeze' that we can use to lock out all filesystem modifications and background activity so that we can perform global scans in order to rebuild metadata. This introduces a new IFLAG to the scrub ioctl to indicate that userspace is willing to allow a freeze. Signed-off-by: Darrick J. Wong <djwong@kernel.org>
author: Darrick J. Wong <djwong@kernel.org> 2021-09-01 10:46:48 -0700
committer: Darrick J. Wong <djwong@kernel.org> 2021-12-15 17:28:56 -0800
commit: e66c39ccc50ff07d006a1bdddf4c4e4c3a92ff78 (patch)
tree: 51fa741d6b24c92424ea5f637e6b5d4d8202ef13 /fs/xfs
parent: 89e522e5ddf84bdeed1668f3cc9778128ed77024 (diff)
10 files changed, 181 insertions, 5 deletions
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 7a7d9b2c6b02..95ea547f0486 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -753,7 +753,11 @@ struct xfs_scrub_metadata {
  */
 #define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7)
 
-#define XFS_SCRUB_FLAGS_IN	(XFS_SCRUB_IFLAG_REPAIR)
+/* i: Allow scrub to freeze the filesystem to perform global scans. */
+#define XFS_SCRUB_IFLAG_FREEZE_OK	(1 << 8)
+
+#define XFS_SCRUB_FLAGS_IN	(XFS_SCRUB_IFLAG_REPAIR | \
+				 XFS_SCRUB_IFLAG_FREEZE_OK)
 #define XFS_SCRUB_FLAGS_OUT	(XFS_SCRUB_OFLAG_CORRUPT | \
 				 XFS_SCRUB_OFLAG_PREEN | \
 				 XFS_SCRUB_OFLAG_XFAIL | \
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index d1273d4dd79e..c2475d9894ee 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -618,9 +618,13 @@ xchk_trans_alloc(
 	struct xfs_scrub	*sc,
 	uint			resblks)
 {
+	uint			flags = 0;
+
+	if (sc->flags & XCHK_FS_FROZEN)
+		flags |= XFS_TRANS_NO_WRITECOUNT;
 	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
 		return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
-				resblks, 0, 0, &sc->tp);
+				resblks, 0, flags, &sc->tp);
 
 	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
 }
@@ -958,3 +962,97 @@ xchk_start_reaping(
 	}
 	sc->flags &= ~XCHK_REAPING_DISABLED;
 }
+
+/*
+ * Exclusive Filesystem Access During Scrub and Repair
+ * ===================================================
+ *
+ * While most scrub activity can occur while the filesystem is live, there
+ * are certain scenarios where we cannot tolerate concurrent metadata updates.
+ * We therefore must freeze the filesystem against all other changes.
+ *
+ * The typical scenarios envisioned for scrub freezes are (a) to lock out all
+ * other filesystem changes in order to check the global summary counters,
+ * and anything else that requires unusual behavioral semantics.
+ *
+ * The typical scenarios envisioned for repair freezes are (a) to avoid ABBA
+ * deadlocks when need to take locks in an unusual order; or (b) to update
+ * global filesystem state.  For example, reconstruction of a damaged reverse
+ * mapping btree requires us to hold the AG header locks while scanning
+ * inodes, which goes against the usual inode -> AG header locking order.
+ *
+ * A note about inode reclaim: when we freeze the filesystem, users can't
+ * modify things and periodic background reclaim of speculative preallocations
+ * and copy-on-write staging extents is stopped.  However, the scrub/repair
+ * thread must be careful about evicting an inode from memory -- if the
+ * eviction would require a transaction, we must defer the iput until after
+ * the scrub freeze.  The reasons for this are twofold: first, scrub/repair
+ * already have a transaction and xfs can't nest transactions; and second, we
+ * froze the fs to prevent modifications that we can't control directly.
+ * This guarantee is made by freezing the inode inactivation worker while
+ * frozen.
+ *
+ * Userspace is prevented from freezing or thawing the filesystem during a
+ * repair freeze by the ->freeze_super and ->thaw_super superblock operations,
+ * which block any changes to the freeze state while a repair freeze is
+ * running through the use of the m_scrub_freeze mutex.  It only makes sense
+ * to run one scrub/repair freeze at a time, so the mutex is fine.
+ *
+ * Scrub/repair freezes cannot be initiated during a regular freeze because
+ * freeze_super does not allow nested freeze.  Repair activity that does not
+ * require a repair freeze is also prevented from running during a regular
+ * freeze because transaction allocation blocks on the regular freeze.  We
+ * assume that the only other users of XFS_TRANS_NO_WRITECOUNT transactions
+ * either aren't modifying space metadata in a way that would affect repair,
+ * or that we can inhibit any of the ones that do.
+ *
+ * Note that thaw_super and freeze_super can call deactivate_locked_super
+ * which can free the xfs_mount.  This can happen if someone freezes the block
+ * device, unmounts the filesystem, and thaws the block device.  Therefore, we
+ * must be careful about who gets to unlock the repair freeze mutex.  See the
+ * comments in xfs_fs_put_super.
+ */
+
+/* Start a scrub/repair freeze. */
+int
+xchk_fs_freeze(
+	struct xfs_scrub	*sc)
+{
+	int			error;
+
+	if (!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_FREEZE_OK))
+		return -EUSERS;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+		mnt_drop_write_file(sc->file);
+
+	mutex_lock(&sc->mp->m_scrub_freeze);
+
+	/*
+	 * Clear out all the NEEDS_INACTIVE inodes because we won't be able
+	 * to iget them during any metadata scan.
+	 */
+	xfs_inodegc_flush(sc->mp);
+
+	error = freeze_super(sc->mp->m_super);
+	if (error) {
+		mutex_unlock(&sc->mp->m_scrub_freeze);
+		sc->sm->sm_flags &= ~XFS_SCRUB_IFLAG_REPAIR;
+		return error;
+	}
+	sc->flags |= XCHK_FS_FROZEN;
+	return 0;
+}
+
+/* Release a scrub/repair freeze. */
+int
+xchk_fs_thaw(
+	struct xfs_scrub	*sc)
+{
+	int			error;
+
+	sc->flags &= ~XCHK_FS_FROZEN;
+	error = thaw_super(sc->mp->m_super);
+	mutex_unlock(&sc->mp->m_scrub_freeze);
+	return error;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 385b030206f7..d8b501fabfee 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -197,5 +197,7 @@ static inline bool xchk_could_repair(const struct xfs_scrub *sc)
 int xchk_metadata_inode_forks(struct xfs_scrub *sc);
 void xchk_stop_reaping(struct xfs_scrub *sc);
 void xchk_start_reaping(struct xfs_scrub *sc);
+int xchk_fs_freeze(struct xfs_scrub *sc);
+int xchk_fs_thaw(struct xfs_scrub *sc);
 
 #endif	/* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index ea3985359cdf..e354f1ea56b1 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -168,8 +168,14 @@ xchk_teardown(
 		xfs_irele(sc->ip);
 		sc->ip = NULL;
 	}
-	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+	if (sc->flags & XCHK_FS_FROZEN) {
+		int		err2 = xchk_fs_thaw(sc);
+
+		if (!error && err2)
+			error = err2;
+	} else if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
 		mnt_drop_write_file(sc->file);
+	}
 	if (sc->flags & XCHK_REAPING_DISABLED)
 		xchk_start_reaping(sc);
 	if (sc->xfile) {
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 5071534324de..bde10b43260a 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -111,6 +111,7 @@ struct xfs_scrub {
 /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
 #define XCHK_TRY_HARDER		(1 << 0)  /* can't get resources, try again */
 #define XCHK_REAPING_DISABLED	(1 << 2)  /* background block reaping paused */
+#define XCHK_FS_FROZEN		(1 << 3)  /* we froze the fs to do things */
 #define XREP_RESET_PERAG_RESV	(1 << 30) /* must reset AG space reservation */
 #define XREP_ALREADY_FIXED	(1 << 31) /* checking our repair work */
 
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 2d8e33825309..fca5ce580657 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -99,7 +99,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
 	{ XFS_SCRUB_OFLAG_XCORRUPT,		"xcorrupt" }, \
 	{ XFS_SCRUB_OFLAG_INCOMPLETE,		"incomplete" }, \
 	{ XFS_SCRUB_OFLAG_WARNING,		"warning" }, \
-	{ XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED,	"norepair" }
+	{ XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED,	"norepair" }, \
+	{ XFS_SCRUB_IFLAG_FREEZE_OK,		"freeze" }
 
 DECLARE_EVENT_CLASS(xchk_class,
 	TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 174cd8950cb6..661e871708af 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1716,6 +1716,10 @@ xfs_ioc_scrub_metadata(
 	if (copy_from_user(&scrub, arg, sizeof(scrub)))
 		return -EFAULT;
 
+	if ((scrub.sm_flags & XFS_SCRUB_IFLAG_FREEZE_OK) &&
+			!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	error = xfs_scrub_metadata(file, &scrub);
 	if (error)
 		return error;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 96097a784cf7..b9a93d782175 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -257,6 +257,12 @@ typedef struct xfs_mount {
 	unsigned int		*m_errortag;
 	struct xfs_kobj		m_errortag_kobj;
 #endif
+	/*
+	 * Only allow one thread to initiate a repair freeze at a time.  We
+	 * also use this to block userspace from changing the freeze state
+	 * while a repair freeze is in progress.
+	 */
+	struct mutex		m_scrub_freeze;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c7ac486ca5d3..129e203ca6e8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -721,6 +721,10 @@ xfs_mount_free(
 {
 	kfree(mp->m_rtname);
 	kfree(mp->m_logname);
+
+	ASSERT(!mutex_is_locked(&mp->m_scrub_freeze));
+	mutex_destroy(&mp->m_scrub_freeze);
+
 	kmem_free(mp);
 }
 
@@ -926,6 +930,50 @@ xfs_fs_unfreeze(
 	return 0;
 }
 
+STATIC int
+xfs_fs_freeze_super(
+	struct super_block	*sb)
+{
+	int			error;
+
+	/*
+	 * Don't let userspace freeze while scrub has the filesystem frozen.
+	 * Take our own private reference to the vfs superblock so that we
+	 * don't lose the xfs superblock while frozen or trying to freeze.
+	 */
+	atomic_inc(&sb->s_active);
+	mutex_lock(&XFS_M(sb)->m_scrub_freeze);
+	error = freeze_super(sb);
+	mutex_unlock(&XFS_M(sb)->m_scrub_freeze);
+	deactivate_super(sb);
+
+	return error;
+}
+
+STATIC int
+xfs_fs_thaw_super(
+	struct super_block	*sb)
+{
+	int			error;
+
+	/*
+	 * Freeze takes an s_active reference to the filesystem and fs thaw
+	 * drops it.  If a filesystem on a frozen (dm) block device is
+	 * unmounted before the block device is thawed, the freezer state could
+	 * contain the one remaining s_active reference to the vfs superblock.
+	 * Since thaw_super drops that reference (which frees the xfs_mount)
+	 * and we still have to unlock the scrub freeze mutex, we must take our
+	 * own private s_active reference to avoid a use-after-free.
+	 */
+	atomic_inc(&sb->s_active);
+	mutex_lock(&XFS_M(sb)->m_scrub_freeze);
+	error = thaw_super(sb);
+	mutex_unlock(&XFS_M(sb)->m_scrub_freeze);
+	deactivate_super(sb);
+
+	return error;
+}
+
 /*
  * This function fills in xfs_mount_t fields based on mount args.
  * Note: the superblock _has_ now been read in.
@@ -1124,6 +1172,8 @@ static const struct super_operations xfs_super_operations = {
 	.show_options		= xfs_fs_show_options,
 	.nr_cached_objects	= xfs_fs_nr_cached_objects,
 	.free_cached_objects	= xfs_fs_free_cached_objects,
+	.freeze_super		= xfs_fs_freeze_super,
+	.thaw_super		= xfs_fs_thaw_super,
 };
 
 static int
@@ -1906,6 +1956,7 @@ static int xfs_init_fs_context(
 	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
 	spin_lock_init(&mp->m_perag_lock);
 	mutex_init(&mp->m_growlock);
+	mutex_init(&mp->m_scrub_freeze);
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c87616ef62c4..434be4319bad 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -326,9 +326,12 @@ retry:
 
 	/*
 	 * Zero-reservation ("empty") transactions can't modify anything, so
-	 * they're allowed to run while we're frozen.
+	 * they're allowed to run while we're frozen.  Scrub is allowed to
+	 * freeze the filesystem in order to obtain exclusive access to the
+	 * filesystem.
 	 */
 	WARN_ON(resp->tr_logres > 0 &&
+	        !mutex_is_locked(&mp->m_scrub_freeze) &&
 		mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
 	ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) ||
 	       xfs_has_lazysbcount(mp));
author	Darrick J. Wong <djwong@kernel.org>	2021-09-01 10:46:48 -0700
committer	Darrick J. Wong <djwong@kernel.org>	2021-12-15 17:28:56 -0800
commit	e66c39ccc50ff07d006a1bdddf4c4e4c3a92ff78 (patch)
tree	51fa741d6b24c92424ea5f637e6b5d4d8202ef13 /fs/xfs
parent	89e522e5ddf84bdeed1668f3cc9778128ed77024 (diff)