Diffstat (limited to 'fs/xfs/scrub/common.c')
-rw-r--r--  fs/xfs/scrub/common.c  168
1 file changed, 149 insertions(+), 19 deletions(-)
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 0ea9b6b299ae..323179b3d17a 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -484,7 +484,35 @@ want_ag_read_header_failure(
*
* The headers should be released by xchk_ag_free, but as a fail safe we attach
* all the buffers we grab to the scrub transaction so they'll all be freed
- * when we cancel it. Returns ENOENT if we can't grab the perag structure.
+ * when we cancel it.
+ */
+static inline int
+__xchk_ag_read_headers(
+ struct xfs_scrub *sc,
+ xfs_agnumber_t agno,
+ struct xchk_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
+ return error;
+
+ error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
+ return error;
+
+ error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp);
+ if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
+ return error;
+
+ return 0;
+}
+
+/*
+ * Grab all the headers for an AG, and wait until there aren't any pending
+ * intents. Returns -ENOENT if we can't grab the perag structure.
*/
int
xchk_ag_read_headers(
@@ -502,29 +530,83 @@ xchk_ag_read_headers(
return xchk_ag_lock(sc);
}
-/* Lock the AG headers. */
+static inline bool
+xchk_ag_intents_pending(
+ struct xfs_perag *pag)
+{
+ int intents = atomic_read(&pag->pag_intents);
+
+ trace_xchk_ag_read_headers(pag->pag_mount, pag->pag_agno, intents,
+ _RET_IP_);
+
+ return intents > 0;
+}
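
For context, pag_intents is only sampled here; the code that raises and lowers the counter lives with the deferred-ops machinery and is not part of this file. A minimal sketch of what that hold/release pair could look like, assuming the atomic_t pag_intents read above plus a hypothetical pag_intents_wq waitqueue on the perag (the helper names below are illustrative, not taken from this patch):

/* Illustrative sketch, not part of this patch. */
static inline void
xfs_perag_intent_hold(struct xfs_perag *pag)
{
	/* A chain of deferred ops is about to target this AG. */
	atomic_inc(&pag->pag_intents);
}

static inline void
xfs_perag_intent_rele(struct xfs_perag *pag)
{
	/* Wake any scrubber sleeping in xfs_perag_wait_intents(). */
	if (atomic_dec_and_test(&pag->pag_intents))
		wake_up(&pag->pag_intents_wq);	/* waitqueue name assumed */
}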
+
+/* Lock the AG headers, waiting for pending intents to drain. */
int
xchk_ag_lock(
struct xfs_scrub *sc)
{
- struct xfs_mount *mp = sc->mp;
struct xchk_ag *sa = &sc->sa;
- xfs_agnumber_t agno = sa->pag->pag_agno;
- int error;
+ int error = 0;
- error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
- if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
- return error;
+ ASSERT(sa->pag != NULL);
+ ASSERT(sa->agi_bp == NULL);
+ ASSERT(sa->agf_bp == NULL);
+ ASSERT(sa->agfl_bp == NULL);
- error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp);
- if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
- return error;
+ do {
+ if (xchk_should_terminate(sc, &error))
+ break;
- error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp);
- if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
- return error;
+ error = __xchk_ag_read_headers(sc, sa->pag->pag_agno, sa);
+ if (error)
+ break;
- return 0;
+ /*
+ * Decide if this AG is quiet enough for all metadata to be
+ * consistent with each other. XFS allows the AG header buffer
+ * locks to cycle across transaction rolls while processing
+ * chains of deferred ops, which means that there could be
+ * other threads in the middle of processing a chain of
+ * deferred ops. For regular operations we are careful about
+ * ordering operations to prevent collisions between threads
+ * (which is why we don't need a per-AG lock), but scrub and
+ * repair have to serialize against chained operations.
+ *
+ * We just locked all the AG header buffers; now take a look
+ * to see if there are any intents in progress. If there are,
+ * drop the AG headers and wait for the intents to drain.
+ * Since we hold all the AG header locks for the duration of
+ * the scrub, this is the only time we have to sample the
+ * intents counter; any threads increasing it after this point
+ * can't possibly be in the middle of a chain of AG metadata
+ * updates.
+ */
+ if (!xchk_ag_intents_pending(sa->pag)) {
+ error = 0;
+ break;
+ }
+
+ if (sa->agfl_bp) {
+ xfs_trans_brelse(sc->tp, sa->agfl_bp);
+ sa->agfl_bp = NULL;
+ }
+
+ if (sa->agf_bp) {
+ xfs_trans_brelse(sc->tp, sa->agf_bp);
+ sa->agf_bp = NULL;
+ }
+
+ if (sa->agi_bp) {
+ xfs_trans_brelse(sc->tp, sa->agi_bp);
+ sa->agi_bp = NULL;
+ }
+
+ error = xfs_perag_wait_intents(sa->pag);
+ } while (!error);
+
+ return error;
}
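
xfs_perag_wait_intents() is likewise defined outside this file. Under the same assumptions as the sketch above (an atomic counter paired with a waitqueue that the release side wakes), the waiter could be as simple as the following; returning nonzero on a fatal signal also fits the do/while loop above, which stops as soon as the wait reports an error:

/* Illustrative sketch, not part of this patch. */
int
xfs_perag_wait_intents(struct xfs_perag *pag)
{
	/*
	 * Sleep until every in-flight intent chain targeting this AG has
	 * retired; a fatal signal breaks the wait with -ERESTARTSYS.
	 */
	return wait_event_killable(pag->pag_intents_wq,	/* waitqueue assumed */
			atomic_read(&pag->pag_intents) == 0);
}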
/* Release all the AG btree cursors. */
@@ -653,14 +735,62 @@ xchk_ag_init(
return 0;
}
-/* Lock everything we need to work on realtime metadata. */
-void
+#if IS_ENABLED(CONFIG_XFS_RT)
+static inline bool
+xchk_rt_intents_pending(
+ struct xfs_mount *mp)
+{
+ int intents = atomic_read(&mp->m_rt_intents);
+
+ trace_xchk_rt_lock(mp, -1U, intents, _RET_IP_);
+
+ return intents > 0;
+}
+#else
+# define xchk_rt_intents_pending(mp) (false)
+#endif
+
+/* Lock everything we need to work on realtime metadata and wait for intents. */
+int
xchk_rt_lock(
struct xfs_scrub *sc,
struct xchk_rt *sr)
{
- xfs_rtlock(NULL, sc->mp, XFS_RTLOCK_ALL);
- sr->locked = true;
+ int error = 0;
+
+ do {
+ if (xchk_should_terminate(sc, &error))
+ break;
+
+ xfs_rtlock(NULL, sc->mp, XFS_RTLOCK_ALL);
+
+ /*
+ * Decide if the RT volume is quiet enough for all metadata to
+ * be consistent with each other. Regular file IO doesn't get
+ * to lock all the rt inodes at the same time, which means that
+ * there could be other threads in the middle of processing a
+ * chain of deferred ops.
+ *
+ * We just locked all the rt inodes; now take a look to see if
+ * there are any rt intents in progress. If there are, drop
+ * the rt inode locks and wait for the intents to drain. Since
+ * we hold the rt inode locks for the duration of the scrub,
+ * this is the only time we have to sample the intents counter;
+ * any threads increasing it after this point can't possibly be
+ * in the middle of a chain of rt metadata updates.
+ */
+ if (!xchk_rt_intents_pending(sc->mp)) {
+ sr->locked = true;
+ error = 0;
+ break;
+ }
+
+ xfs_rtunlock(sc->mp, XFS_RTLOCK_ALL);
+
+ error = xfs_rt_wait_intents(sc->mp);
+ } while (!error);
+
+ return error;
}
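
The realtime side presumably mirrors the per-AG mechanism, with the counter hung off the mount rather than a perag. Assuming an m_rt_intents_wq waitqueue next to the m_rt_intents counter (the waitqueue name is an assumption), xfs_rt_wait_intents() could follow the same drain pattern:

/* Illustrative sketch, not part of this patch. */
int
xfs_rt_wait_intents(struct xfs_mount *mp)
{
	/* Same drain pattern as the per-AG case, but mount-wide. */
	return wait_event_killable(mp->m_rt_intents_wq,
			atomic_read(&mp->m_rt_intents) == 0);
}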
/*