xfs: allow queued AG intents to drain before scrubbing
scrub-drain-intents_2021-12-15

Currently, online scrub isn't sufficiently careful about quiescing allocation groups before checking them. While scrub does take the AG header locks, it doesn't serialize against chains of AG update intents that are being processed concurrently. If there's a collision, cross-referencing between data structures (e.g. rmapbt and refcountbt) can yield false corruption events; if repair is running, this results in incorrect repairs.

Fix this by adding a count of active intents to the perag structure and making scrub wait until there aren't any before it continues. This is a little stupid, since transactions can queue intents without taking buffer locks, but we'll also wait for those transactions.

XXX: Should we instead have a per-AG rwsem that gets taken as soon as the AG[IF] buffers are locked and stays held until the transaction commits or moves on to the next AG? Would we rather have a SIX lock, so that intents can take an IX lock and not have to upgrade to X until we actually want to make changes to that AG? Is that how those even work??

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
author: Darrick J. Wong <djwong@kernel.org> 2021-10-22 15:31:05 -0700
committer: Darrick J. Wong <djwong@kernel.org> 2021-12-15 17:29:29 -0800
commit: bb8092d02d822601113897de29384972342ed1e8 (patch)
tree: b496a8902d1bb28a09623580309e88d66f1193ae /fs/xfs/scrub
parent: be31442955a3d225b71223eebc056015fab0d698 (diff)
11 files changed, 168 insertions, 33 deletions
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 9ac01b24c9b0..dc472aa6f548 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -318,7 +318,12 @@ xchk_bmap_rt_iextent_xref(
 	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec)
 {
-	xchk_rt_init(info->sc, &info->sc->sr);
+	int			error;
+
+	error = xchk_rt_init(info->sc, &info->sc->sr);
+	if (!xchk_fblock_process_error(info->sc, info->whichfork,
+			irec->br_startoff, &error))
+		goto out_free;
 
 	xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
 			irec->br_blockcount);
@@ -336,6 +341,7 @@ xchk_bmap_rt_iextent_xref(
 		break;
 	}
 
+out_free:
 	xchk_rt_btcur_free(&info->sc->sr);
 	xchk_rt_unlock(info->sc, &info->sc->sr);
 }
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 2256b77e20fc..ad6f00c4767f 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -339,7 +339,9 @@ xrep_bmap_scan_rt(
 	if (xrep_is_rtmeta_ino(sc, sc->ip->i_ino))
 		return 0;
 
-	xchk_rt_lock(sc, &sc->sr);
+	error = xchk_rt_lock(sc, &sc->sr);
+	if (error)
+		return error;
 	xrep_rt_btcur_init(sc, &sc->sr);
 	error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_bmap_walk_rtrmap, rb);
 	xchk_rt_btcur_free(&sc->sr);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 246d33ac46b9..4987aebca74e 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -484,7 +484,35 @@ want_ag_read_header_failure(
  *
  * The headers should be released by xchk_ag_free, but as a fail safe we attach
  * all the buffers we grab to the scrub transaction so they'll all be freed
- * when we cancel it.  Returns ENOENT if we can't grab the perag structure.
+ * when we cancel it.
+ */
+static inline int
+__xchk_ag_read_headers(
+	struct xfs_scrub	*sc,
+	xfs_agnumber_t		agno,
+	struct xchk_ag		*sa)
+{
+	struct xfs_mount	*mp = sc->mp;
+	int			error;
+
+	error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
+	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
+		return error;
+
+	error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp);
+	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
+		return error;
+
+	error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp);
+	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
+		return error;
+
+	return 0;
+}
+
+/*
+ * Grab all the headers for an AG, and wait until there aren't any pending
+ * intents.  Returns -ENOENT if we can't grab the perag structure.
  */
 int
 xchk_ag_read_headers(
@@ -502,29 +530,72 @@ xchk_ag_read_headers(
 	return xchk_ag_lock(sc);
 }
 
-/* Lock the AG headers. */
+/* Lock the AG headers, waiting for pending intents to drain. */
 int
 xchk_ag_lock(
 	struct xfs_scrub	*sc)
 {
-	struct xfs_mount	*mp = sc->mp;
 	struct xchk_ag		*sa = &sc->sa;
-	xfs_agnumber_t		agno = sa->pag->pag_agno;
-	int			error;
+	int			error = 0;
 
-	error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
-	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
-		return error;
+	ASSERT(sa->pag != NULL);
+	ASSERT(sa->agi_bp == NULL);
+	ASSERT(sa->agf_bp == NULL);
+	ASSERT(sa->agfl_bp == NULL);
 
-	error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp);
-	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
-		return error;
+	do {
+		if (xchk_should_terminate(sc, &error))
+			return error;
 
-	error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp);
-	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
-		return error;
+		error = __xchk_ag_read_headers(sc, sa->pag->pag_agno, sa);
+		if (error)
+			return error;
 
-	return 0;
+		/*
+		 * Decide if this AG is quiet enough for all metadata to be
+		 * consistent with each other.  XFS allows the AG header buffer
+		 * locks to cycle across transaction rolls while processing
+		 * chains of deferred ops, which means that there could be
+		 * other threads in the middle of processing a chain of
+		 * deferred ops.  For regular operations we are careful about
+		 * ordering operations to prevent collisions between threads
+		 * (which is why we don't need a per-AG lock), but scrub and
+		 * repair have to serialize against chained operations.
+		 *
+		 * We just locked all the AG headers buffers; now take a look
+		 * to see if there are any intents in progress.  If there are,
+		 * drop the AG headers and wait for the intents to drain.
+		 * Since we hold all the AG header locks for the duration of
+		 * the scrub, this is the only time we have to sample the
+		 * intents counter; any threads increasing it after this point
+		 * can't possibly be in the middle of a chain of AG metadata
+		 * updates.
+		 *
+		 * Obviously, this should be slanted against scrub and in favor
+		 * of runtime threads.
+		 */
+		if (!xfs_drain_busy(&sa->pag->pag_intents))
+			return 0;
+
+		if (sa->agfl_bp) {
+			xfs_trans_brelse(sc->tp, sa->agfl_bp);
+			sa->agfl_bp = NULL;
+		}
+
+		if (sa->agf_bp) {
+			xfs_trans_brelse(sc->tp, sa->agf_bp);
+			sa->agf_bp = NULL;
+		}
+
+		if (sa->agi_bp) {
+			xfs_trans_brelse(sc->tp, sa->agi_bp);
+			sa->agi_bp = NULL;
+		}
+
+		error = xfs_perag_drain_intents(sa->pag);
+	} while (!error);
+
+	return error;
 }
 
 /* Release all the AG btree cursors. */
@@ -653,15 +724,63 @@ xchk_ag_init(
 	return 0;
 }
 
-/* Lock everything we need to work on realtime metadata. */
-void
+#ifdef CONFIG_XFS_RT
+/* Lock everything we need to work on realtime metadata and wait for intents. */
+int
+xchk_rt_lock(
+	struct xfs_scrub	*sc,
+	struct xchk_rt		*sr)
+{
+	int			error = 0;
+
+	do {
+		if (xchk_should_terminate(sc, &error))
+			return error;
+
+		xfs_rtlock(NULL, sc->mp, XFS_RTLOCK_ALL);
+
+		/*
+		 * Decide if the RT volume is quiet enough for all metadata to
+		 * be consistent with each other.  Regular file IO doesn't get
+		 * to lock all the rt inodes at the same time, which means that
+		 * there could be other threads in the middle of processing a
+		 * chain of deferred ops.
+		 *
+		 * We just locked all the rt inodes; now take a look to see if
+		 * there are any rt intents in progress.  If there are, drop
+		 * the rt inode locks and wait for the intents to drain.  Since
+		 * we hold the rt inode locks for the duration of the scrub,
+		 * this is the only time we have to sample the intents counter;
+		 * any threads increasing it after this point can't possibly be
+		 * in the middle of a chain of rt metadata updates.
+		 *
+		 * Obviously, this should be slanted against scrub and in favor
+		 * of runtime threads.
+		 */
+		if (!xfs_drain_busy(&sc->mp->m_rt_intents)) {
+			sr->locked = true;
+			return 0;
+		}
+
+		xfs_rtunlock(sc->mp, XFS_RTLOCK_ALL);
+
+		error = xfs_rt_drain_intents(sc->mp);
+	} while (!error);
+
+	return error;
+}
+#else
+/* Lock everything we need to work on realtime metadata and wait for intents. */
+int
 xchk_rt_lock(
 	struct xfs_scrub	*sc,
 	struct xchk_rt		*sr)
 {
 	xfs_rtlock(NULL, sc->mp, XFS_RTLOCK_ALL);
 	sr->locked = true;
+	return 0;
 }
+#endif /* CONFIG_XFS_RT */
 
 /*
  * For scrubbing a realtime file, grab all the in-core resources we'll need to
@@ -669,14 +788,17 @@ xchk_rt_lock(
  * metadata inodes.  Callers must not join these inodes to the transaction
  * with non-zero lockflags or concurrency problems will result.
  */
-void
+int
 xchk_rt_init(
 	struct xfs_scrub	*sc,
 	struct xchk_rt		*sr)
 {
 	struct xfs_mount	*mp = sc->mp;
+	int			error;
 
-	xchk_rt_lock(sc, sr);
+	error = xchk_rt_lock(sc, sr);
+	if (error)
+		return error;
 
 	if (xfs_has_rtrmapbt(mp))
 		sr->rmap_cur = xfs_rtrmapbt_init_cursor(mp, sc->tp,
@@ -685,6 +807,8 @@ xchk_rt_init(
 	if (xfs_has_reflink(mp))
 		sr->refc_cur = xfs_rtrefcountbt_init_cursor(mp, sc->tp,
 				mp->m_rrefcountip);
+
+	return 0;
 }
 
 /*
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index f2e1cf719c06..36a0be71cfbf 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -155,9 +155,9 @@ xchk_ag_init_existing(
 	return error == -ENOENT ? -EFSCORRUPTED : error;
 }
 
-void xchk_rt_init(struct xfs_scrub *sc, struct xchk_rt *sr);
+int xchk_rt_init(struct xfs_scrub *sc, struct xchk_rt *sr);
 void xchk_rt_btcur_free(struct xchk_rt *sr);
-void xchk_rt_lock(struct xfs_scrub *sc, struct xchk_rt *sr);
+int xchk_rt_lock(struct xfs_scrub *sc, struct xchk_rt *sr);
 void xchk_rt_unlock(struct xfs_scrub *sc, struct xchk_rt *sr);
 int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
 		struct xchk_ag *sa);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 97605313f097..8eec28aa1a95 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -597,7 +597,9 @@ xrep_dinode_count_rt_rmaps(
 	    xrep_is_rtmeta_ino(sc, sc->sm->sm_ino))
 		return 0;
 
-	xchk_rt_lock(sc, &sc->sr);
+	error = xchk_rt_lock(sc, &sc->sr);
+	if (error)
+		return error;
 	xrep_rt_btcur_init(sc, &sc->sr);
 	error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_dinode_walk_rtrmap,
 			dis);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index c44283b0a502..8b4951ebe2cc 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -487,6 +487,7 @@ xrep_newbt_schedule_reap(
 
 	INIT_LIST_HEAD(&efi_item.xefi_list);
 	list_add(&efi_item.xefi_list, &items);
+	xfs_fs_bump_intents(xnr->sc->mp, false, resv->fsbno);
 	resv->efi = xfs_extent_free_defer_type.create_intent(xnr->sc->tp,
 			&items, 1, false);
 }
@@ -708,6 +709,7 @@ xrep_newbt_destroy(
 			goto junkit;
 
 		list_del(&resv->list);
+		xfs_fs_drop_intents(sc->mp, false, resv->fsbno);
 		kmem_free(resv);
 	}
 
@@ -720,6 +722,7 @@ junkit:
 	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
 		xfs_extent_free_defer_type.abort_intent(resv->efi);
 		list_del(&resv->list);
+		xfs_fs_drop_intents(sc->mp, false, resv->fsbno);
 		kmem_free(resv);
 	}
 
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 8f354d27e3ea..1f3e3354749d 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -46,8 +46,7 @@ xchk_setup_rtbitmap(
 	if (error)
 		return error;
 
-	xchk_rt_init(sc, &sc->sr);
-	return 0;
+	return xchk_rt_init(sc, &sc->sr);
 }
 
 /* Realtime bitmap. */
diff --git a/fs/xfs/scrub/rtrefcount.c b/fs/xfs/scrub/rtrefcount.c
index 2a7d5268bd43..860b9feff951 100644
--- a/fs/xfs/scrub/rtrefcount.c
+++ b/fs/xfs/scrub/rtrefcount.c
@@ -33,8 +33,7 @@ xchk_setup_rtrefcountbt(
 	if (error)
 		return error;
 
-	xchk_rt_init(sc, &sc->sr);
-	return 0;
+	return xchk_rt_init(sc, &sc->sr);
 }
 
 /* Realtime Reference count btree scrubber. */
diff --git a/fs/xfs/scrub/rtrmap.c b/fs/xfs/scrub/rtrmap.c
index acd37e63c4bd..25d56cb1b506 100644
--- a/fs/xfs/scrub/rtrmap.c
+++ b/fs/xfs/scrub/rtrmap.c
@@ -50,8 +50,7 @@ xchk_setup_rtrmapbt(
 	if (error)
 		return error;
 
-	xchk_rt_init(sc, &sc->sr);
-	return 0;
+	return xchk_rt_init(sc, &sc->sr);
 }
 
 /* Realtime reverse mapping. */
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index edcdc407f6cb..b9fd891f4476 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -583,7 +583,9 @@ xrep_rtrmap_find_rmaps(
 	error = xchk_setup_fs(sc);
 	if (error)
 		return error;
-	xchk_rt_lock(sc, &sc->sr);
+	error = xchk_rt_lock(sc, &sc->sr);
+	if (error)
+		return error;
 
 	/* Scan for old rtrmap blocks. */
 	for_each_perag(sc->mp, agno, pag) {
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index be5ec3e8bf12..f4e1a7e1466b 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -71,8 +71,7 @@ xchk_setup_rtsummary(
 	if (error)
 		return error;
 
-	xchk_rt_init(sc, &sc->sr);
-	return 0;
+	return xchk_rt_init(sc, &sc->sr);
 }
 
 /* Update the summary file to reflect the free extent that we've accumulated. */
author	Darrick J. Wong <djwong@kernel.org>	2021-10-22 15:31:05 -0700
committer	Darrick J. Wong <djwong@kernel.org>	2021-12-15 17:29:29 -0800
commit	bb8092d02d822601113897de29384972342ed1e8 (patch)
tree	b496a8902d1bb28a09623580309e88d66f1193ae /fs/xfs/scrub
parent	be31442955a3d225b71223eebc056015fab0d698 (diff)