Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/Kconfig                   |   18
-rw-r--r--  fs/xfs/Makefile                  |   11
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h           |    6
-rw-r--r--  fs/xfs/scrub/agheader_repair.c   |  101
-rw-r--r--  fs/xfs/scrub/bitmap.c            |   78
-rw-r--r--  fs/xfs/scrub/bitmap.h            |   10
-rw-r--r--  fs/xfs/scrub/bmap.c              |   42
-rw-r--r--  fs/xfs/scrub/common.c            |  215
-rw-r--r--  fs/xfs/scrub/common.h            |   39
-rw-r--r--  fs/xfs/scrub/health.c            |   10
-rw-r--r--  fs/xfs/scrub/ialloc.c            |    3
-rw-r--r--  fs/xfs/scrub/inode.c             |   11
-rw-r--r--  fs/xfs/scrub/parent.c            |    4
-rw-r--r--  fs/xfs/scrub/quota.c             |   15
-rw-r--r--  fs/xfs/scrub/reap.c              |  498
-rw-r--r--  fs/xfs/scrub/reap.h              |   12
-rw-r--r--  fs/xfs/scrub/repair.c            |  377
-rw-r--r--  fs/xfs/scrub/repair.h            |   25
-rw-r--r--  fs/xfs/scrub/rtbitmap.c          |   48
-rw-r--r--  fs/xfs/scrub/rtsummary.c         |  264
-rw-r--r--  fs/xfs/scrub/scrub.c             |   46
-rw-r--r--  fs/xfs/scrub/scrub.h             |    4
-rw-r--r--  fs/xfs/scrub/stats.c             |  405
-rw-r--r--  fs/xfs/scrub/stats.h             |   59
-rw-r--r--  fs/xfs/scrub/trace.c             |    4
-rw-r--r--  fs/xfs/scrub/trace.h             |  391
-rw-r--r--  fs/xfs/scrub/xfarray.c           | 1083
-rw-r--r--  fs/xfs/scrub/xfarray.h           |  141
-rw-r--r--  fs/xfs/scrub/xfile.c             |  420
-rw-r--r--  fs/xfs/scrub/xfile.h             |   77
-rw-r--r--  fs/xfs/xfs_aops.c                |    2
-rw-r--r--  fs/xfs/xfs_buf.c                 |    9
-rw-r--r--  fs/xfs/xfs_buf.h                 |   13
-rw-r--r--  fs/xfs/xfs_dquot.c               |    2
-rw-r--r--  fs/xfs/xfs_icache.c              |   38
-rw-r--r--  fs/xfs/xfs_icache.h              |    4
-rw-r--r--  fs/xfs/xfs_linux.h               |    1
-rw-r--r--  fs/xfs/xfs_mount.c               |    9
-rw-r--r--  fs/xfs/xfs_mount.h               |    4
-rw-r--r--  fs/xfs/xfs_super.c               |   53
-rw-r--r--  fs/xfs/xfs_super.h               |    2
-rw-r--r--  fs/xfs/xfs_trace.h               |    3
42 files changed, 3910 insertions(+), 647 deletions(-)
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 52e1823241fb..c9d653168ad0 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -128,6 +128,7 @@ config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
default n
depends on XFS_FS
+ depends on TMPFS && SHMEM
select XFS_DRAIN_INTENTS
help
If you say Y here you will be able to check metadata on a
@@ -142,6 +143,23 @@ config XFS_ONLINE_SCRUB
If unsure, say N.
+config XFS_ONLINE_SCRUB_STATS
+ bool "XFS online metadata check usage data collection"
+ default y
+ depends on XFS_ONLINE_SCRUB
+ select FS_DEBUG
+ help
+ If you say Y here, the kernel will gather usage data about
+ the online metadata check subsystem. This includes the number
+ of invocations, the outcomes, and the results of repairs, if any.
+ This may slow down scrub slightly due to the use of high precision
+ timers and the need to merge per-invocation information into the
+ filesystem counters.
+
+ Usage data are collected in /sys/kernel/debug/xfs/scrub.
+
+ If unsure, say N.
+
config XFS_ONLINE_REPAIR
bool "XFS online metadata repair support"
default n
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 16e4eb431230..7762c01a85cf 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -164,15 +164,24 @@ xfs-y += $(addprefix scrub/, \
rmap.o \
scrub.o \
symlink.o \
+ xfarray.o \
+ xfile.o \
+ )
+
+xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o
+
+xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
+ rtbitmap.o \
+ rtsummary.o \
)
-xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o
xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
# online repair
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
+ reap.o \
repair.o \
)
endif
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 2cbf9ea39b8c..6360073865db 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -743,7 +743,11 @@ struct xfs_scrub_metadata {
*/
#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7)
-#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
+/* i: Rebuild the data structure. */
+#define XFS_SCRUB_IFLAG_FORCE_REBUILD (1u << 8)
+
+#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR | \
+ XFS_SCRUB_IFLAG_FORCE_REBUILD)
#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
XFS_SCRUB_OFLAG_PREEN | \
XFS_SCRUB_OFLAG_XFAIL | \
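
The hunk above adds XFS_SCRUB_IFLAG_FORCE_REBUILD to the accepted input flags, letting userspace ask for a rebuild even when the structure is not reported corrupt. A minimal, hypothetical userspace sketch of passing the new flag through the XFS_IOC_SCRUB_METADATA ioctl follows; the helper name and error handling are invented for illustration, and xfs_scrub is the real consumer of this interface.

/*
 * Hypothetical sketch (not part of this patch): force a rebuild of one
 * AG's free space btree through the scrub ioctl.  The helper name is
 * invented; xfs_scrub drives this interface in practice.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>

static int force_rebuild_bnobt(int fd, __u32 agno)
{
        struct xfs_scrub_metadata sm;

        memset(&sm, 0, sizeof(sm));
        sm.sm_type = XFS_SCRUB_TYPE_BNOBT;
        sm.sm_agno = agno;
        sm.sm_flags = XFS_SCRUB_IFLAG_REPAIR | XFS_SCRUB_IFLAG_FORCE_REBUILD;

        /* Returns 0 on success; the outcome is reported in sm.sm_flags. */
        return ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm);
}
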
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index bbaa65422c4f..876a2f41b063 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -26,6 +26,7 @@
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/reap.h"
/* Superblock */
@@ -48,6 +49,10 @@ xrep_superblock(
if (error)
return error;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
/* Copy AG 0's superblock to this one. */
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
@@ -423,6 +428,10 @@ xrep_agf(
if (error)
return error;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
/* Start rewriting the header and implant the btrees we found. */
xrep_agf_init_header(sc, agf_bp, &old_agf);
xrep_agf_set_roots(sc, agf, fab);
@@ -444,13 +453,13 @@ out_revert:
struct xrep_agfl {
/* Bitmap of alleged AGFL blocks that we're not going to add. */
- struct xbitmap crossed;
+ struct xagb_bitmap crossed;
/* Bitmap of other OWN_AG metadata blocks. */
- struct xbitmap agmetablocks;
+ struct xagb_bitmap agmetablocks;
/* Bitmap of free space. */
- struct xbitmap *freesp;
+ struct xagb_bitmap *freesp;
/* rmapbt cursor for finding crosslinked blocks */
struct xfs_btree_cur *rmap_cur;
@@ -466,7 +475,6 @@ xrep_agfl_walk_rmap(
void *priv)
{
struct xrep_agfl *ra = priv;
- xfs_fsblock_t fsb;
int error = 0;
if (xchk_should_terminate(ra->sc, &error))
@@ -474,14 +482,13 @@ xrep_agfl_walk_rmap(
/* Record all the OWN_AG blocks. */
if (rec->rm_owner == XFS_RMAP_OWN_AG) {
- fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- rec->rm_startblock);
- error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount);
+ error = xagb_bitmap_set(ra->freesp, rec->rm_startblock,
+ rec->rm_blockcount);
if (error)
return error;
}
- return xbitmap_set_btcur_path(&ra->agmetablocks, cur);
+ return xagb_bitmap_set_btcur_path(&ra->agmetablocks, cur);
}
/* Strike out the blocks that are cross-linked according to the rmapbt. */
@@ -492,12 +499,10 @@ xrep_agfl_check_extent(
void *priv)
{
struct xrep_agfl *ra = priv;
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start);
+ xfs_agblock_t agbno = start;
xfs_agblock_t last_agbno = agbno + len - 1;
int error;
- ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno);
-
while (agbno <= last_agbno) {
bool other_owners;
@@ -507,7 +512,7 @@ xrep_agfl_check_extent(
return error;
if (other_owners) {
- error = xbitmap_set(&ra->crossed, agbno, 1);
+ error = xagb_bitmap_set(&ra->crossed, agbno, 1);
if (error)
return error;
}
@@ -533,7 +538,7 @@ STATIC int
xrep_agfl_collect_blocks(
struct xfs_scrub *sc,
struct xfs_buf *agf_bp,
- struct xbitmap *agfl_extents,
+ struct xagb_bitmap *agfl_extents,
xfs_agblock_t *flcount)
{
struct xrep_agfl ra;
@@ -543,8 +548,8 @@ xrep_agfl_collect_blocks(
ra.sc = sc;
ra.freesp = agfl_extents;
- xbitmap_init(&ra.agmetablocks);
- xbitmap_init(&ra.crossed);
+ xagb_bitmap_init(&ra.agmetablocks);
+ xagb_bitmap_init(&ra.crossed);
/* Find all space used by the free space btrees & rmapbt. */
cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
@@ -556,7 +561,7 @@ xrep_agfl_collect_blocks(
/* Find all blocks currently being used by the bnobt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_BNO);
- error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
+ error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
goto out_bmp;
@@ -564,7 +569,7 @@ xrep_agfl_collect_blocks(
/* Find all blocks currently being used by the cntbt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_CNT);
- error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
+ error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
goto out_bmp;
@@ -573,17 +578,17 @@ xrep_agfl_collect_blocks(
* Drop the freesp meta blocks that are in use by btrees.
* The remaining blocks /should/ be AGFL blocks.
*/
- error = xbitmap_disunion(agfl_extents, &ra.agmetablocks);
+ error = xagb_bitmap_disunion(agfl_extents, &ra.agmetablocks);
if (error)
goto out_bmp;
/* Strike out the blocks that are cross-linked. */
ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
- error = xbitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
+ error = xagb_bitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
xfs_btree_del_cursor(ra.rmap_cur, error);
if (error)
goto out_bmp;
- error = xbitmap_disunion(agfl_extents, &ra.crossed);
+ error = xagb_bitmap_disunion(agfl_extents, &ra.crossed);
if (error)
goto out_bmp;
@@ -591,12 +596,12 @@ xrep_agfl_collect_blocks(
* Calculate the new AGFL size. If we found more blocks than fit in
* the AGFL we'll free them later.
*/
- *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents),
+ *flcount = min_t(uint64_t, xagb_bitmap_hweight(agfl_extents),
xfs_agfl_size(mp));
out_bmp:
- xbitmap_destroy(&ra.crossed);
- xbitmap_destroy(&ra.agmetablocks);
+ xagb_bitmap_destroy(&ra.crossed);
+ xagb_bitmap_destroy(&ra.agmetablocks);
return error;
}
@@ -615,18 +620,24 @@ xrep_agfl_update_agf(
xfs_force_summary_recalc(sc->mp);
/* Update the AGF counters. */
- if (xfs_perag_initialised_agf(sc->sa.pag))
+ if (xfs_perag_initialised_agf(sc->sa.pag)) {
sc->sa.pag->pagf_flcount = flcount;
+ clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET,
+ &sc->sa.pag->pag_opstate);
+ }
agf->agf_flfirst = cpu_to_be32(0);
agf->agf_flcount = cpu_to_be32(flcount);
- agf->agf_fllast = cpu_to_be32(flcount - 1);
+ if (flcount)
+ agf->agf_fllast = cpu_to_be32(flcount - 1);
+ else
+ agf->agf_fllast = cpu_to_be32(xfs_agfl_size(sc->mp) - 1);
xfs_alloc_log_agf(sc->tp, agf_bp,
XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
}
struct xrep_agfl_fill {
- struct xbitmap used_extents;
+ struct xagb_bitmap used_extents;
struct xfs_scrub *sc;
__be32 *agfl_bno;
xfs_agblock_t flcount;
@@ -642,17 +653,15 @@ xrep_agfl_fill(
{
struct xrep_agfl_fill *af = priv;
struct xfs_scrub *sc = af->sc;
- xfs_fsblock_t fsbno = start;
+ xfs_agblock_t agbno = start;
int error;
- while (fsbno < start + len && af->fl_off < af->flcount)
- af->agfl_bno[af->fl_off++] =
- cpu_to_be32(XFS_FSB_TO_AGBNO(sc->mp, fsbno++));
+ trace_xrep_agfl_insert(sc->sa.pag, agbno, len);
- trace_xrep_agfl_insert(sc->mp, sc->sa.pag->pag_agno,
- XFS_FSB_TO_AGBNO(sc->mp, start), len);
+ while (agbno < start + len && af->fl_off < af->flcount)
+ af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++);
- error = xbitmap_set(&af->used_extents, start, fsbno - 1);
+ error = xagb_bitmap_set(&af->used_extents, start, agbno - 1);
if (error)
return error;
@@ -667,7 +676,7 @@ STATIC int
xrep_agfl_init_header(
struct xfs_scrub *sc,
struct xfs_buf *agfl_bp,
- struct xbitmap *agfl_extents,
+ struct xagb_bitmap *agfl_extents,
xfs_agblock_t flcount)
{
struct xrep_agfl_fill af = {
@@ -695,17 +704,17 @@ xrep_agfl_init_header(
* blocks than fit in the AGFL, they will be freed in a subsequent
* step.
*/
- xbitmap_init(&af.used_extents);
+ xagb_bitmap_init(&af.used_extents);
af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp),
- xbitmap_walk(agfl_extents, xrep_agfl_fill, &af);
- error = xbitmap_disunion(agfl_extents, &af.used_extents);
+ xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af);
+ error = xagb_bitmap_disunion(agfl_extents, &af.used_extents);
if (error)
return error;
/* Write new AGFL to disk. */
xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1);
- xbitmap_destroy(&af.used_extents);
+ xagb_bitmap_destroy(&af.used_extents);
return 0;
}
@@ -714,7 +723,7 @@ int
xrep_agfl(
struct xfs_scrub *sc)
{
- struct xbitmap agfl_extents;
+ struct xagb_bitmap agfl_extents;
struct xfs_mount *mp = sc->mp;
struct xfs_buf *agf_bp;
struct xfs_buf *agfl_bp;
@@ -725,7 +734,7 @@ xrep_agfl(
if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
- xbitmap_init(&agfl_extents);
+ xagb_bitmap_init(&agfl_extents);
/*
* Read the AGF so that we can query the rmapbt. We hope that there's
@@ -753,6 +762,10 @@ xrep_agfl(
if (error)
goto err;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err;
+
/*
* Update AGF and AGFL. We reset the global free block counter when
* we adjust the AGF flcount (which can fail) so avoid updating any
@@ -774,10 +787,10 @@ xrep_agfl(
goto err;
/* Dump any AGFL overflow. */
- error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
+ error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
XFS_AG_RESV_AGFL);
err:
- xbitmap_destroy(&agfl_extents);
+ xagb_bitmap_destroy(&agfl_extents);
return error;
}
@@ -1000,6 +1013,10 @@ xrep_agi(
if (error)
return error;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
/* Start rewriting the header and implant the btrees we found. */
xrep_agi_init_header(sc, agi_bp, &old_agi);
xrep_agi_set_roots(sc, agi, fab);
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 0c959be396ea..e0c89a9a0ca0 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -301,21 +301,15 @@ xagb_bitmap_set_btblocks(
* blocks going from the leaf towards the root.
*/
int
-xbitmap_set_btcur_path(
- struct xbitmap *bitmap,
+xagb_bitmap_set_btcur_path(
+ struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur)
{
- struct xfs_buf *bp;
- xfs_fsblock_t fsb;
int i;
int error;
for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
- xfs_btree_get_block(cur, i, &bp);
- if (!bp)
- continue;
- fsb = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
- error = xbitmap_set(bitmap, fsb, 1);
+ error = xagb_bitmap_visit_btblock(cur, i, bitmap);
if (error)
return error;
}
@@ -323,35 +317,6 @@ xbitmap_set_btcur_path(
return 0;
}
-/* Collect a btree's block in the bitmap. */
-STATIC int
-xbitmap_collect_btblock(
- struct xfs_btree_cur *cur,
- int level,
- void *priv)
-{
- struct xbitmap *bitmap = priv;
- struct xfs_buf *bp;
- xfs_fsblock_t fsbno;
-
- xfs_btree_get_block(cur, level, &bp);
- if (!bp)
- return 0;
-
- fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
- return xbitmap_set(bitmap, fsbno, 1);
-}
-
-/* Walk the btree and mark the bitmap wherever a btree block is found. */
-int
-xbitmap_set_btblocks(
- struct xbitmap *bitmap,
- struct xfs_btree_cur *cur)
-{
- return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock,
- XFS_BTREE_VISIT_ALL, bitmap);
-}
-
/* How many bits are set in this bitmap? */
uint64_t
xbitmap_hweight(
@@ -385,43 +350,6 @@ xbitmap_walk(
return error;
}
-struct xbitmap_walk_bits {
- xbitmap_walk_bits_fn fn;
- void *priv;
-};
-
-/* Walk all the bits in a run. */
-static int
-xbitmap_walk_bits_in_run(
- uint64_t start,
- uint64_t len,
- void *priv)
-{
- struct xbitmap_walk_bits *wb = priv;
- uint64_t i;
- int error = 0;
-
- for (i = start; i < start + len; i++) {
- error = wb->fn(i, wb->priv);
- if (error)
- break;
- }
-
- return error;
-}
-
-/* Call a function for every set bit in this bitmap. */
-int
-xbitmap_walk_bits(
- struct xbitmap *bitmap,
- xbitmap_walk_bits_fn fn,
- void *priv)
-{
- struct xbitmap_walk_bits wb = {.fn = fn, .priv = priv};
-
- return xbitmap_walk(bitmap, xbitmap_walk_bits_in_run, &wb);
-}
-
/* Does this bitmap have no bits set at all? */
bool
xbitmap_empty(
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 84981724ecaf..4fe58bad6734 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -16,10 +16,6 @@ void xbitmap_destroy(struct xbitmap *bitmap);
int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len);
int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
-int xbitmap_set_btcur_path(struct xbitmap *bitmap,
- struct xfs_btree_cur *cur);
-int xbitmap_set_btblocks(struct xbitmap *bitmap,
- struct xfs_btree_cur *cur);
uint64_t xbitmap_hweight(struct xbitmap *bitmap);
/*
@@ -33,10 +29,6 @@ typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv);
int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn,
void *priv);
-typedef int (*xbitmap_walk_bits_fn)(uint64_t bit, void *priv);
-int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn,
- void *priv);
-
bool xbitmap_empty(struct xbitmap *bitmap);
bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len);
@@ -110,5 +102,7 @@ static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur);
+int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur);
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 5bf4326e9783..75588915572e 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -38,8 +38,7 @@ xchk_setup_inode_bmap(
if (error)
goto out;
- sc->ilock_flags = XFS_IOLOCK_EXCL;
- xfs_ilock(sc->ip, XFS_IOLOCK_EXCL);
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
/*
* We don't want any ephemeral data/cow fork updates sitting around
@@ -50,8 +49,7 @@ xchk_setup_inode_bmap(
sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) {
struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
- sc->ilock_flags |= XFS_MMAPLOCK_EXCL;
- xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL);
+ xchk_ilock(sc, XFS_MMAPLOCK_EXCL);
inode_dio_wait(VFS_I(sc->ip));
@@ -79,9 +77,8 @@ xchk_setup_inode_bmap(
error = xchk_trans_alloc(sc, 0);
if (error)
goto out;
- sc->ilock_flags |= XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode */
return error;
@@ -844,7 +841,7 @@ xchk_bmap(
/* Non-existent forks can be ignored. */
if (!ifp)
- goto out;
+ return -ENOENT;
info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
info.whichfork = whichfork;
@@ -853,10 +850,10 @@ xchk_bmap(
switch (whichfork) {
case XFS_COW_FORK:
- /* No CoW forks on non-reflink inodes/filesystems. */
- if (!xfs_is_reflink_inode(ip)) {
+ /* No CoW forks on non-reflink filesystems. */
+ if (!xfs_has_reflink(mp)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
- goto out;
+ return 0;
}
break;
case XFS_ATTR_FORK:
@@ -876,31 +873,31 @@ xchk_bmap(
/* No mappings to check. */
if (whichfork == XFS_COW_FORK)
xchk_fblock_set_corrupt(sc, whichfork, 0);
- goto out;
+ return 0;
case XFS_DINODE_FMT_EXTENTS:
break;
case XFS_DINODE_FMT_BTREE:
if (whichfork == XFS_COW_FORK) {
xchk_fblock_set_corrupt(sc, whichfork, 0);
- goto out;
+ return 0;
}
error = xchk_bmap_btree(sc, whichfork, &info);
if (error)
- goto out;
+ return error;
break;
default:
xchk_fblock_set_corrupt(sc, whichfork, 0);
- goto out;
+ return 0;
}
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- goto out;
+ return 0;
/* Find the offset of the last extent in the mapping. */
error = xfs_bmap_last_offset(ip, &endoff, whichfork);
if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
- goto out;
+ return error;
/*
* Scrub extent records. We use a special iterator function here that
@@ -913,12 +910,12 @@ xchk_bmap(
while (xchk_bmap_iext_iter(&info, &irec)) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
- goto out;
+ return 0;
if (irec.br_startoff >= endoff) {
xchk_fblock_set_corrupt(sc, whichfork,
irec.br_startoff);
- goto out;
+ return 0;
}
if (isnullstartblock(irec.br_startblock))
@@ -931,10 +928,10 @@ xchk_bmap(
if (xchk_bmap_want_check_rmaps(&info)) {
error = xchk_bmap_check_rmaps(sc, whichfork);
if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
- goto out;
+ return error;
}
-out:
- return error;
+
+ return 0;
}
/* Scrub an inode's data fork. */
@@ -958,8 +955,5 @@ int
xchk_bmap_cow(
struct xfs_scrub *sc)
{
- if (!xfs_is_reflink_inode(sc->ip))
- return -ENOENT;
-
return xchk_bmap(sc, XFS_COW_FORK);
}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 7a20256be969..de24532fe083 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -832,6 +832,25 @@ xchk_install_handle_inode(
}
/*
+ * Install an already-referenced inode for scrubbing. Get our own reference to
+ * the inode to make disposal simpler. The inode must not be in I_FREEING or
+ * I_WILL_FREE state!
+ */
+int
+xchk_install_live_inode(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (!igrab(VFS_I(ip))) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ return -EFSCORRUPTED;
+ }
+
+ sc->ip = ip;
+ return 0;
+}
+
+/*
* In preparation to scrub metadata structures that hang off of an inode,
* grab either the inode referenced in the scrub control structure or the
* inode passed in. If the inumber does not reference an allocated inode
@@ -854,10 +873,8 @@ xchk_iget_for_scrubbing(
ASSERT(sc->tp == NULL);
/* We want to scan the inode we already had opened. */
- if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
- sc->ip = ip_in;
- return 0;
- }
+ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
+ return xchk_install_live_inode(sc, ip_in);
/* Reject internal metadata files and obviously bad inode numbers. */
if (xfs_internal_inum(mp, sc->sm->sm_ino))
@@ -1005,20 +1022,48 @@ xchk_setup_inode_contents(
return error;
/* Lock the inode so the VFS cannot touch this file. */
- sc->ilock_flags = XFS_IOLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
error = xchk_trans_alloc(sc, resblks);
if (error)
goto out;
- sc->ilock_flags |= XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
-
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode for us */
return error;
}
+void
+xchk_ilock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ xfs_ilock(sc->ip, ilock_flags);
+ sc->ilock_flags |= ilock_flags;
+}
+
+bool
+xchk_ilock_nowait(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
+ sc->ilock_flags |= ilock_flags;
+ return true;
+ }
+
+ return false;
+}
+
+void
+xchk_iunlock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ sc->ilock_flags &= ~ilock_flags;
+ xfs_iunlock(sc->ip, ilock_flags);
+}
+
/*
* Predicate that decides if we need to evaluate the cross-reference check.
* If there was an error accessing the cross-reference btree, just delete
@@ -1185,3 +1230,155 @@ xchk_fsgates_enable(
sc->flags |= scrub_fsgates;
}
+
+/*
+ * Decide if this is a cached inode that's also allocated. The caller
+ * must hold a reference to an AG and the AGI buffer lock to prevent inodes
+ * from being allocated or freed.
+ *
+ * Look up an inode by number in the given file system. If the inode number
+ * is invalid, return -EINVAL. If the inode is not in cache, return -ENODATA.
+ * If the inode is being reclaimed, return -ENODATA because we know the inode
+ * cache cannot be updating the ondisk metadata.
+ *
+ * Otherwise, the incore inode is the one we want, and it is either live,
+ * somewhere in the inactivation machinery, or reclaimable. The inode is
+ * allocated if i_mode is nonzero. In all three cases, the cached inode will
+ * be more up to date than the ondisk inode buffer, so we must use the incore
+ * i_mode.
+ */
+int
+xchk_inode_is_allocated(
+ struct xfs_scrub *sc,
+ xfs_agino_t agino,
+ bool *inuse)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag = sc->sa.pag;
+ xfs_ino_t ino;
+ struct xfs_inode *ip;
+ int error;
+
+ /* caller must hold perag reference */
+ if (pag == NULL) {
+ ASSERT(pag != NULL);
+ return -EINVAL;
+ }
+
+ /* caller must have AGI buffer */
+ if (sc->sa.agi_bp == NULL) {
+ ASSERT(sc->sa.agi_bp != NULL);
+ return -EINVAL;
+ }
+
+ /* reject inode numbers outside existing AGs */
+ ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
+ if (!xfs_verify_ino(mp, ino))
+ return -EINVAL;
+
+ error = -ENODATA;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+ if (!ip) {
+ /* cache miss */
+ goto out_rcu;
+ }
+
+ /*
+ * If the inode number doesn't match, the incore inode got reused
+ * during an RCU grace period and the radix tree hasn't been updated.
+ * This isn't the inode we want.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != ino)
+ goto out_skip;
+
+ trace_xchk_inode_is_allocated(ip);
+
+ /*
+ * We have an incore inode that matches the inode we want, and the
+ * caller holds the perag structure and the AGI buffer. Let's check
+ * our assumptions below:
+ */
+
+#ifdef DEBUG
+ /*
+ * (1) If the incore inode is live (i.e. referenced from the dcache),
+ * it will not be INEW, nor will it be in the inactivation or reclaim
+ * machinery. The ondisk inode had better be allocated. This is the
+ * most trivial case.
+ */
+ if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
+ XFS_INACTIVATING))) {
+ /* live inode */
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ }
+
+ /*
+ * If the incore inode is INEW, there are several possibilities:
+ *
+ * (2) For a file that is being created, note that we allocate the
+ * ondisk inode before allocating, initializing, and adding the incore
+ * inode to the radix tree.
+ *
+ * (3) If the incore inode is being recycled, the inode has to be
+ * allocated because we don't allow freed inodes to be recycled.
+ * Recycling doesn't touch i_mode.
+ */
+ if (ip->i_flags & XFS_INEW) {
+ /* created on disk already or recycling */
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ }
+
+ /*
+ * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
+ * inactivation has not started (!INACTIVATING), it is still allocated.
+ */
+ if ((ip->i_flags & XFS_NEED_INACTIVE) &&
+ !(ip->i_flags & XFS_INACTIVATING)) {
+ /* definitely before difree */
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ }
+#endif
+
+ /*
+ * If the incore inode is undergoing inactivation (INACTIVATING), there
+ * are two possibilities:
+ *
+ * (5) It is before the point where it would get freed ondisk, in which
+ * case i_mode is still nonzero.
+ *
+ * (6) It has already been freed, in which case i_mode is zero.
+ *
+ * We don't take the ILOCK here, but difree and dialloc update the AGI,
+ * and we've taken the AGI buffer lock, which prevents that from
+ * happening.
+ */
+
+ /*
+ * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
+ * reclaim (IRECLAIMABLE) could be allocated or free. i_mode still
+ * reflects the ondisk state.
+ */
+
+ /*
+ * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
+ * the flush code uses i_mode to format the ondisk inode.
+ */
+
+ /*
+ * (9) If the inode is in IRECLAIM and was reachable via the radix
+ * tree, it still has the same i_mode as it did before it entered
+ * reclaim. The inode object is still alive because we hold the RCU
+ * read lock.
+ */
+
+ *inuse = VFS_I(ip)->i_mode != 0;
+ error = 0;
+
+out_skip:
+ spin_unlock(&ip->i_flags_lock);
+out_rcu:
+ rcu_read_unlock();
+ return error;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 791235cd9b00..cabdc0e16838 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -88,10 +88,16 @@ int xchk_setup_xattr(struct xfs_scrub *sc);
int xchk_setup_symlink(struct xfs_scrub *sc);
int xchk_setup_parent(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
-int xchk_setup_rt(struct xfs_scrub *sc);
+int xchk_setup_rtbitmap(struct xfs_scrub *sc);
+int xchk_setup_rtsummary(struct xfs_scrub *sc);
#else
static inline int
-xchk_setup_rt(struct xfs_scrub *sc)
+xchk_setup_rtbitmap(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
+static inline int
+xchk_setup_rtsummary(struct xfs_scrub *sc)
{
return -ENOENT;
}
@@ -137,6 +143,12 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log);
int xchk_iget_for_scrubbing(struct xfs_scrub *sc);
int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks);
+int xchk_install_live_inode(struct xfs_scrub *sc, struct xfs_inode *ip);
+
+void xchk_ilock(struct xfs_scrub *sc, unsigned int ilock_flags);
+bool xchk_ilock_nowait(struct xfs_scrub *sc, unsigned int ilock_flags);
+void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags);
+
void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp);
@@ -155,9 +167,29 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT);
}
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+/* Decide if a repair is required. */
+static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
+{
+ return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT |
+ XFS_SCRUB_OFLAG_PREEN);
+}
+#else
+# define xchk_needs_repair(sc) (false)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
/*
+ * Helper macros to allocate and format xfile description strings.
+ * Callers must kfree the pointer returned.
+ */
+#define xchk_xfile_descr(sc, fmt, ...) \
+ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \
+ (sc)->mp->m_super->s_id, ##__VA_ARGS__)
+
+/*
* Setting up a hook to wait for intents to drain is costly -- we have to take
* the CPU hotplug lock and force an i-cache flush on all CPUs once to set it
* up, and again to tear it down. These costs add up quickly, so we only want
@@ -171,4 +203,7 @@ static inline bool xchk_need_intent_drain(struct xfs_scrub *sc)
void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks);
+int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino,
+ bool *inuse);
+
#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index d2b2a1cb6533..5e2b09ed6e29 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -226,6 +226,16 @@ xchk_ag_btree_healthy_enough(
return true;
}
+ /*
+ * If we just repaired some AG metadata, sc->sick_mask will reflect all
+ * the per-AG metadata types that were repaired. Exclude these from
+ * the filesystem health query because we have not yet updated the
+ * health status and we want everything to be scanned.
+ */
+ if ((sc->flags & XREP_ALREADY_FIXED) &&
+ type_to_health_flag[sc->sm->sm_type].group == XHG_AG)
+ mask &= ~sc->sick_mask;
+
if (xfs_ag_has_sickness(pag, mask)) {
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
return false;
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 575f22a02ebe..fb7bbf47ae5d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -328,8 +328,7 @@ xchk_iallocbt_check_cluster_ifree(
goto out;
}
- error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
- &ino_inuse);
+ error = xchk_inode_is_allocated(bs->sc, agino, &ino_inuse);
if (error == -ENODATA) {
/* Not cached, just read the disk buffer */
freemask_ok = irec_free ^ !!(dip->di_mode);
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 3e1e02e340a6..59d7912fb75f 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -32,15 +32,13 @@ xchk_prepare_iscrub(
{
int error;
- sc->ilock_flags = XFS_IOLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
error = xchk_trans_alloc(sc, 0);
if (error)
return error;
- sc->ilock_flags |= XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
return 0;
}
@@ -83,7 +81,10 @@ xchk_setup_inode(
/* We want to scan the opened inode, so lock it and exit. */
if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
- sc->ip = ip_in;
+ error = xchk_install_live_inode(sc, ip_in);
+ if (error)
+ return error;
+
return xchk_prepare_iscrub(sc);
}
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 58d5dfb7ea21..e6155d86f791 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -150,8 +150,8 @@ xchk_parent_validate(
lock_mode = xchk_parent_ilock_dir(dp);
if (!lock_mode) {
- xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
error = -EAGAIN;
goto out_rele;
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index e6caa358cbda..5671c8153433 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -59,9 +59,12 @@ xchk_setup_quota(
error = xchk_setup_fs(sc);
if (error)
return error;
- sc->ip = xfs_quota_inode(sc->mp, dqtype);
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
- sc->ilock_flags = XFS_ILOCK_EXCL;
+
+ error = xchk_install_live_inode(sc, xfs_quota_inode(sc->mp, dqtype));
+ if (error)
+ return error;
+
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
return 0;
}
@@ -235,13 +238,11 @@ xchk_quota(
* data fork we have to drop ILOCK_EXCL to use the regular dquot
* functions.
*/
- xfs_iunlock(sc->ip, sc->ilock_flags);
- sc->ilock_flags = 0;
+ xchk_iunlock(sc, sc->ilock_flags);
sqi.sc = sc;
sqi.last_id = 0;
error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi);
- sc->ilock_flags = XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
if (error == -ECANCELED)
error = 0;
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK,
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
new file mode 100644
index 000000000000..86a62420e02c
--- /dev/null
+++ b/fs/xfs/scrub/reap.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_extent_busy.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_bmap.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/reap.h"
+
+/*
+ * Disposal of Blocks from Old Metadata
+ *
+ * Now that we've constructed a new btree to replace the damaged one, we want
+ * to dispose of the blocks that (we think) the old btree was using.
+ * Previously, we used the rmapbt to collect the extents (bitmap) with the
+ * rmap owner corresponding to the tree we rebuilt, collected extents for any
+ * blocks with the same rmap owner that are owned by another data structure
+ * (sublist), and subtracted sublist from bitmap. In theory the extents
+ * remaining in bitmap are the old btree's blocks.
+ *
+ * Unfortunately, it's possible that the btree was crosslinked with other
+ * blocks on disk. The rmap data can tell us if there are multiple owners, so
+ * if the rmapbt says there is an owner of this block other than @oinfo, then
+ * the block is crosslinked. Remove the reverse mapping and continue.
+ *
+ * If there is one rmap record, we can free the block, which removes the
+ * reverse mapping but doesn't add the block to the free space. Our repair
+ * strategy is to hope the other metadata objects crosslinked on this block
+ * will be rebuilt (atop different blocks), thereby removing all the cross
+ * links.
+ *
+ * If there are no rmap records at all, we also free the block. If the btree
+ * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
+ * supposed to be a rmap record and everything is ok. For other btrees there
+ * had to have been an rmap entry for the block to have ended up on @bitmap,
+ * so if it's gone now there's something wrong and the fs will shut down.
+ *
+ * Note: If there are multiple rmap records with only the same rmap owner as
+ * the btree we're trying to rebuild and the block is indeed owned by another
+ * data structure with the same rmap owner, then the block will be in sublist
+ * and therefore doesn't need disposal. If there are multiple rmap records
+ * with only the same rmap owner but the block is not owned by something with
+ * the same rmap owner, the block will be freed.
+ *
+ * The caller is responsible for locking the AG headers for the entire rebuild
+ * operation so that nothing else can sneak in and change the AG state while
+ * we're not looking. We must also invalidate any buffers associated with
+ * @bitmap.
+ */
+
+/* Information about reaping extents after a repair. */
+struct xreap_state {
+ struct xfs_scrub *sc;
+
+ /* Reverse mapping owner and metadata reservation type. */
+ const struct xfs_owner_info *oinfo;
+ enum xfs_ag_resv_type resv;
+
+ /* If true, roll the transaction before reaping the next extent. */
+ bool force_roll;
+
+ /* Number of deferred reaps attached to the current transaction. */
+ unsigned int deferred;
+
+ /* Number of invalidated buffers logged to the current transaction. */
+ unsigned int invalidated;
+
+ /* Number of deferred reaps queued during the whole reap sequence. */
+ unsigned long long total_deferred;
+};
+
+/* Put a block back on the AGFL. */
+STATIC int
+xreap_put_freelist(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno)
+{
+ struct xfs_buf *agfl_bp;
+ int error;
+
+ /* Make sure there's space on the freelist. */
+ error = xrep_fix_freelist(sc, true);
+ if (error)
+ return error;
+
+ /*
+ * Since we're "freeing" a lost block onto the AGFL, we have to
+ * create an rmap for the block prior to merging it or else other
+ * parts will break.
+ */
+ error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
+ &XFS_RMAP_OINFO_AG);
+ if (error)
+ return error;
+
+ /* Put the block on the AGFL. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ return error;
+
+ error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
+ agfl_bp, agbno, 0);
+ if (error)
+ return error;
+ xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+
+ return 0;
+}
+
+/* Are there any uncommitted reap operations? */
+static inline bool xreap_dirty(const struct xreap_state *rs)
+{
+ if (rs->force_roll)
+ return true;
+ if (rs->deferred)
+ return true;
+ if (rs->invalidated)
+ return true;
+ if (rs->total_deferred)
+ return true;
+ return false;
+}
+
+#define XREAP_MAX_BINVAL (2048)
+
+/*
+ * Decide if we want to roll the transaction after reaping an extent. We don't
+ * want to overrun the transaction reservation, so we prohibit more than
+ * 128 EFIs per transaction. For the same reason, we limit the number
+ * of buffer invalidations to 2048.
+ */
+static inline bool xreap_want_roll(const struct xreap_state *rs)
+{
+ if (rs->force_roll)
+ return true;
+ if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
+ return true;
+ if (rs->invalidated > XREAP_MAX_BINVAL)
+ return true;
+ return false;
+}
+
+static inline void xreap_reset(struct xreap_state *rs)
+{
+ rs->total_deferred += rs->deferred;
+ rs->deferred = 0;
+ rs->invalidated = 0;
+ rs->force_roll = false;
+}
+
+#define XREAP_MAX_DEFER_CHAIN (2048)
+
+/*
+ * Decide if we want to finish the deferred ops that are attached to the scrub
+ * transaction. We don't want to queue huge chains of deferred ops because
+ * that can consume a lot of log space and kernel memory. Hence we trigger a
+ * xfs_defer_finish if there are more than 2048 deferred reap operations or the
+ * caller did some real work.
+ */
+static inline bool
+xreap_want_defer_finish(const struct xreap_state *rs)
+{
+ if (rs->force_roll)
+ return true;
+ if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
+ return true;
+ return false;
+}
+
+static inline void xreap_defer_finish_reset(struct xreap_state *rs)
+{
+ rs->total_deferred = 0;
+ rs->deferred = 0;
+ rs->invalidated = 0;
+ rs->force_roll = false;
+}
+
+/* Try to invalidate the incore buffers for an extent that we're freeing. */
+STATIC void
+xreap_agextent_binval(
+ struct xreap_state *rs,
+ xfs_agblock_t agbno,
+ xfs_extlen_t *aglenp)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t agno = sc->sa.pag->pag_agno;
+ xfs_agblock_t agbno_next = agbno + *aglenp;
+ xfs_agblock_t bno = agbno;
+
+ /*
+ * Avoid invalidating AG headers and post-EOFS blocks because we never
+ * own those.
+ */
+ if (!xfs_verify_agbno(pag, agbno) ||
+ !xfs_verify_agbno(pag, agbno_next - 1))
+ return;
+
+ /*
+ * If there are incore buffers for these blocks, invalidate them. We
+ * assume that the lack of any other known owners means that the buffer
+ * can be locked without risk of deadlocking. The buffer cache cannot
+ * detect aliasing, so employ nested loops to scan for incore buffers
+ * of any plausible size.
+ */
+ while (bno < agbno_next) {
+ xfs_agblock_t fsbcount;
+ xfs_agblock_t max_fsbs;
+
+ /*
+ * Max buffer size is the max remote xattr buffer size, which
+ * is one fs block larger than 64k.
+ */
+ max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
+ xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
+
+ for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) {
+ struct xfs_buf *bp = NULL;
+ xfs_daddr_t daddr;
+ int error;
+
+ daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
+ error = xfs_buf_incore(mp->m_ddev_targp, daddr,
+ XFS_FSB_TO_BB(mp, fsbcount),
+ XBF_LIVESCAN, &bp);
+ if (error)
+ continue;
+
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
+ rs->invalidated++;
+
+ /*
+ * Stop invalidating if we've hit the limit; we should
+ * still have enough reservation left to free however
+ * far we've gotten.
+ */
+ if (rs->invalidated > XREAP_MAX_BINVAL) {
+ *aglenp -= agbno_next - bno;
+ goto out;
+ }
+ }
+
+ bno++;
+ }
+
+out:
+ trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
+}
+
+/*
+ * Figure out the longest run of blocks that we can dispose of with a single
+ * call. Cross-linked blocks should have their reverse mappings removed, but
+ * single-owner extents can be freed. AGFL blocks can only be put back one at
+ * a time.
+ */
+STATIC int
+xreap_agextent_select(
+ struct xreap_state *rs,
+ xfs_agblock_t agbno,
+ xfs_agblock_t agbno_next,
+ bool *crosslinked,
+ xfs_extlen_t *aglenp)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_btree_cur *cur;
+ xfs_agblock_t bno = agbno + 1;
+ xfs_extlen_t len = 1;
+ int error;
+
+ /*
+ * Determine if there are any other rmap records covering the first
+ * block of this extent. If so, the block is crosslinked.
+ */
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
+ crosslinked);
+ if (error)
+ goto out_cur;
+
+ /* AGFL blocks can only be dealt with one at a time. */
+ if (rs->resv == XFS_AG_RESV_AGFL)
+ goto out_found;
+
+ /*
+ * Figure out how many of the subsequent blocks have the same crosslink
+ * status.
+ */
+ while (bno < agbno_next) {
+ bool also_crosslinked;
+
+ error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
+ &also_crosslinked);
+ if (error)
+ goto out_cur;
+
+ if (*crosslinked != also_crosslinked)
+ break;
+
+ len++;
+ bno++;
+ }
+
+out_found:
+ *aglenp = len;
+ trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Dispose of as much of the beginning of this AG extent as possible. The
+ * number of blocks disposed of will be returned in @aglenp.
+ */
+STATIC int
+xreap_agextent_iter(
+ struct xreap_state *rs,
+ xfs_agblock_t agbno,
+ xfs_extlen_t *aglenp,
+ bool crosslinked)
+{
+ struct xfs_scrub *sc = rs->sc;
+ xfs_fsblock_t fsbno;
+ int error = 0;
+
+ fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);
+
+ /*
+ * If there are other rmappings, this block is cross linked and must
+ * not be freed. Remove the reverse mapping and move on. Otherwise,
+ * we were the only owner of the block, so free the extent, which will
+ * also remove the rmap.
+ *
+ * XXX: XFS doesn't support detecting the case where a single block
+ * metadata structure is crosslinked with a multi-block structure
+ * because the buffer cache doesn't detect aliasing problems, so we
+ * can't fix 100% of crosslinking problems (yet). The verifiers will
+ * blow on writeout, the filesystem will shut down, and the admin gets
+ * to run xfs_repair.
+ */
+ if (crosslinked) {
+ trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
+
+ rs->force_roll = true;
+ return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
+ *aglenp, rs->oinfo);
+ }
+
+ trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);
+
+ /*
+ * Invalidate as many buffers as we can, starting at agbno. If this
+ * function sets *aglenp to zero, the transaction is full of logged
+ * buffer invalidations, so we need to return early so that we can
+ * roll and retry.
+ */
+ xreap_agextent_binval(rs, agbno, aglenp);
+ if (*aglenp == 0) {
+ ASSERT(xreap_want_roll(rs));
+ return 0;
+ }
+
+ /* Put blocks back on the AGFL one at a time. */
+ if (rs->resv == XFS_AG_RESV_AGFL) {
+ ASSERT(*aglenp == 1);
+ error = xreap_put_freelist(sc, agbno);
+ if (error)
+ return error;
+
+ rs->force_roll = true;
+ return 0;
+ }
+
+ /*
+ * Use deferred frees to get rid of the old btree blocks to try to
+ * minimize the window in which we could crash and lose the old blocks.
+ */
+ error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
+ rs->resv, true);
+ if (error)
+ return error;
+
+ rs->deferred++;
+ return 0;
+}
+
+/*
+ * Break an AG metadata extent into sub-extents by fate (crosslinked, not
+ * crosslinked), and dispose of each sub-extent separately.
+ */
+STATIC int
+xreap_agmeta_extent(
+ uint64_t fsbno,
+ uint64_t len,
+ void *priv)
+{
+ struct xreap_state *rs = priv;
+ struct xfs_scrub *sc = rs->sc;
+ xfs_agblock_t agbno = fsbno;
+ xfs_agblock_t agbno_next = agbno + len;
+ int error = 0;
+
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
+ ASSERT(sc->ip == NULL);
+
+ while (agbno < agbno_next) {
+ xfs_extlen_t aglen;
+ bool crosslinked;
+
+ error = xreap_agextent_select(rs, agbno, agbno_next,
+ &crosslinked, &aglen);
+ if (error)
+ return error;
+
+ error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
+ if (error)
+ return error;
+
+ if (xreap_want_defer_finish(rs)) {
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ xreap_defer_finish_reset(rs);
+ } else if (xreap_want_roll(rs)) {
+ error = xrep_roll_ag_trans(sc);
+ if (error)
+ return error;
+ xreap_reset(rs);
+ }
+
+ agbno += aglen;
+ }
+
+ return 0;
+}
+
+/* Dispose of every block of every AG metadata extent in the bitmap. */
+int
+xrep_reap_agblocks(
+ struct xfs_scrub *sc,
+ struct xagb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ struct xreap_state rs = {
+ .sc = sc,
+ .oinfo = oinfo,
+ .resv = type,
+ };
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(sc->ip == NULL);
+
+ error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
+ if (error)
+ return error;
+
+ if (xreap_dirty(&rs))
+ return xrep_defer_finish(sc);
+
+ return 0;
+}
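
For readers of the new reaper, the intended calling pattern looks like the hypothetical sketch below; it mirrors what xrep_agfl() does earlier in this patch. The helper name and the way the bitmap gets populated are invented for illustration, and the caller must already hold the AG header buffers, as the comment at the top of reap.c requires.

/*
 * Illustrative sketch only: dispose of the blocks that an old,
 * already-replaced per-AG structure was using.  How @old_blocks gets
 * populated (typically from rmapbt data) is elided here.
 */
STATIC int
xrep_example_reap_old_blocks(
        struct xfs_scrub                *sc,
        const struct xfs_owner_info     *oinfo)
{
        struct xagb_bitmap              old_blocks;
        int                             error;

        xagb_bitmap_init(&old_blocks);

        /* ... record the old structure's extents in old_blocks ... */

        error = xrep_reap_agblocks(sc, &old_blocks, oinfo,
                        XFS_AG_RESV_NONE);
        xagb_bitmap_destroy(&old_blocks);
        return error;
}
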
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
new file mode 100644
index 000000000000..fe24626af164
--- /dev/null
+++ b/fs/xfs/scrub/reap.h
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_REAP_H__
+#define __XFS_SCRUB_REAP_H__
+
+int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+
+#endif /* __XFS_SCRUB_REAP_H__ */
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index ac6d8803e660..1b8b5439f2d7 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -26,11 +26,13 @@
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
+#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/stats.h"
/*
* Attempt to repair some metadata, if the metadata is corrupt and userspace
@@ -39,8 +41,10 @@
*/
int
xrep_attempt(
- struct xfs_scrub *sc)
+ struct xfs_scrub *sc,
+ struct xchk_stats_run *run)
{
+ u64 repair_start;
int error = 0;
trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
@@ -49,8 +53,11 @@ xrep_attempt(
/* Repair whatever's broken. */
ASSERT(sc->ops->repair);
+ run->repair_attempted = true;
+ repair_start = xchk_stats_now();
error = sc->ops->repair(sc);
trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
+ run->repair_ns += xchk_stats_elapsed_ns(repair_start);
switch (error) {
case 0:
/*
@@ -59,14 +66,17 @@ xrep_attempt(
*/
sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
sc->flags |= XREP_ALREADY_FIXED;
+ run->repair_succeeded = true;
return -EAGAIN;
case -ECHRNG:
sc->flags |= XCHK_NEED_DRAIN;
+ run->retries++;
return -EAGAIN;
case -EDEADLOCK:
/* Tell the caller to try again having grabbed all the locks. */
if (!(sc->flags & XCHK_TRY_HARDER)) {
sc->flags |= XCHK_TRY_HARDER;
+ run->retries++;
return -EAGAIN;
}
/*
@@ -166,6 +176,56 @@ xrep_roll_ag_trans(
return 0;
}
+/* Finish all deferred work attached to the repair transaction. */
+int
+xrep_defer_finish(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /*
+ * Keep the AG header buffers locked while we complete deferred work
+ * items. Ensure that both AG buffers are dirty and held when we roll
+ * the transaction so that they move forward in the log without losing
+ * the bli (and hence the bli type) when the transaction commits.
+ *
+ * Normal code would never hold clean buffers across a roll, but repair
+ * needs both buffers to maintain a total lock on the AG.
+ */
+ if (sc->sa.agi_bp) {
+ xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
+ xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+ }
+
+ if (sc->sa.agf_bp) {
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
+ xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+ }
+
+ /*
+ * Finish all deferred work items. We still hold the AG header buffers
+ * locked regardless of whether or not that succeeds. On failure, the
+ * buffers will be released during teardown on our way out of the
+ * kernel. If successful, join the buffers to the new transaction
+ * and move on.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+
+ /*
+ * Release the hold that we set above because defer_finish won't do
+ * that for us. The defer roll code redirties held buffers after each
+ * roll, so the AG header buffers should be ready for logging.
+ */
+ if (sc->sa.agi_bp)
+ xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+ if (sc->sa.agf_bp)
+ xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+
+ return 0;
+}
+
/*
* Does the given AG have enough space to rebuild a btree? Neither AG
* reservation can be critical, and we must have enough space (factoring
@@ -297,89 +357,6 @@ xrep_calc_ag_resblks(
return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}
-/* Allocate a block in an AG. */
-int
-xrep_alloc_ag_block(
- struct xfs_scrub *sc,
- const struct xfs_owner_info *oinfo,
- xfs_fsblock_t *fsbno,
- enum xfs_ag_resv_type resv)
-{
- struct xfs_alloc_arg args = {0};
- xfs_agblock_t bno;
- int error;
-
- switch (resv) {
- case XFS_AG_RESV_AGFL:
- case XFS_AG_RESV_RMAPBT:
- error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp,
- sc->sa.agf_bp, &bno, 1);
- if (error)
- return error;
- if (bno == NULLAGBLOCK)
- return -ENOSPC;
- xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
- *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
- if (resv == XFS_AG_RESV_RMAPBT)
- xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
- return 0;
- default:
- break;
- }
-
- args.tp = sc->tp;
- args.mp = sc->mp;
- args.pag = sc->sa.pag;
- args.oinfo = *oinfo;
- args.minlen = 1;
- args.maxlen = 1;
- args.prod = 1;
- args.resv = resv;
-
- error = xfs_alloc_vextent_this_ag(&args, sc->sa.pag->pag_agno);
- if (error)
- return error;
- if (args.fsbno == NULLFSBLOCK)
- return -ENOSPC;
- ASSERT(args.len == 1);
- *fsbno = args.fsbno;
-
- return 0;
-}
-
-/* Initialize a new AG btree root block with zero entries. */
-int
-xrep_init_btblock(
- struct xfs_scrub *sc,
- xfs_fsblock_t fsb,
- struct xfs_buf **bpp,
- xfs_btnum_t btnum,
- const struct xfs_buf_ops *ops)
-{
- struct xfs_trans *tp = sc->tp;
- struct xfs_mount *mp = sc->mp;
- struct xfs_buf *bp;
- int error;
-
- trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
- XFS_FSB_TO_AGBNO(mp, fsb), btnum);
-
- ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno);
- error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
- &bp);
- if (error)
- return error;
- xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno);
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
- xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
- bp->b_ops = ops;
- *bpp = bp;
-
- return 0;
-}
-
/*
* Reconstructing per-AG Btrees
*
@@ -404,91 +381,8 @@ xrep_init_btblock(
* sublist. As with the other btrees we subtract sublist from bitmap, and the
* result (since the rmapbt lives in the free space) are the blocks from the
* old rmapbt.
- *
- * Disposal of Blocks from Old per-AG Btrees
- *
- * Now that we've constructed a new btree to replace the damaged one, we want
- * to dispose of the blocks that (we think) the old btree was using.
- * Previously, we used the rmapbt to collect the extents (bitmap) with the
- * rmap owner corresponding to the tree we rebuilt, collected extents for any
- * blocks with the same rmap owner that are owned by another data structure
- * (sublist), and subtracted sublist from bitmap. In theory the extents
- * remaining in bitmap are the old btree's blocks.
- *
- * Unfortunately, it's possible that the btree was crosslinked with other
- * blocks on disk. The rmap data can tell us if there are multiple owners, so
- * if the rmapbt says there is an owner of this block other than @oinfo, then
- * the block is crosslinked. Remove the reverse mapping and continue.
- *
- * If there is one rmap record, we can free the block, which removes the
- * reverse mapping but doesn't add the block to the free space. Our repair
- * strategy is to hope the other metadata objects crosslinked on this block
- * will be rebuilt (atop different blocks), thereby removing all the cross
- * links.
- *
- * If there are no rmap records at all, we also free the block. If the btree
- * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
- * supposed to be a rmap record and everything is ok. For other btrees there
- * had to have been an rmap entry for the block to have ended up on @bitmap,
- * so if it's gone now there's something wrong and the fs will shut down.
- *
- * Note: If there are multiple rmap records with only the same rmap owner as
- * the btree we're trying to rebuild and the block is indeed owned by another
- * data structure with the same rmap owner, then the block will be in sublist
- * and therefore doesn't need disposal. If there are multiple rmap records
- * with only the same rmap owner but the block is not owned by something with
- * the same rmap owner, the block will be freed.
- *
- * The caller is responsible for locking the AG headers for the entire rebuild
- * operation so that nothing else can sneak in and change the AG state while
- * we're not looking. We also assume that the caller already invalidated any
- * buffers associated with @bitmap.
*/
-static int
-xrep_invalidate_block(
- uint64_t fsbno,
- void *priv)
-{
- struct xfs_scrub *sc = priv;
- struct xfs_buf *bp;
- int error;
-
- /* Skip AG headers and post-EOFS blocks */
- if (!xfs_verify_fsbno(sc->mp, fsbno))
- return 0;
-
- error = xfs_buf_incore(sc->mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(sc->mp, fsbno),
- XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
- if (error)
- return 0;
-
- xfs_trans_bjoin(sc->tp, bp);
- xfs_trans_binval(sc->tp, bp);
- return 0;
-}
-
-/*
- * Invalidate buffers for per-AG btree blocks we're dumping. This function
- * is not intended for use with file data repairs; we have bunmapi for that.
- */
-int
-xrep_invalidate_blocks(
- struct xfs_scrub *sc,
- struct xbitmap *bitmap)
-{
- /*
- * For each block in each extent, see if there's an incore buffer for
- * exactly that block; if so, invalidate it. The buffer cache only
- * lets us look for one buffer at a time, so we have to look one block
- * at a time. Avoid invalidating AG headers and post-EOFS blocks
- * because we never own those; and if we can't TRYLOCK the buffer we
- * assume it's owned by someone else.
- */
- return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc);
-}
-
/* Ensure the freelist is the correct size. */
int
xrep_fix_freelist(
@@ -507,155 +401,6 @@ xrep_fix_freelist(
can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
}
-/* Information about reaping extents after a repair. */
-struct xrep_reap_state {
- struct xfs_scrub *sc;
-
- /* Reverse mapping owner and metadata reservation type. */
- const struct xfs_owner_info *oinfo;
- enum xfs_ag_resv_type resv;
-};
-
-/*
- * Put a block back on the AGFL.
- */
-STATIC int
-xrep_put_freelist(
- struct xfs_scrub *sc,
- xfs_agblock_t agbno)
-{
- struct xfs_buf *agfl_bp;
- int error;
-
- /* Make sure there's space on the freelist. */
- error = xrep_fix_freelist(sc, true);
- if (error)
- return error;
-
- /*
- * Since we're "freeing" a lost block onto the AGFL, we have to
- * create an rmap for the block prior to merging it or else other
- * parts will break.
- */
- error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
- &XFS_RMAP_OINFO_AG);
- if (error)
- return error;
-
- /* Put the block on the AGFL. */
- error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
- if (error)
- return error;
-
- error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
- agfl_bp, agbno, 0);
- if (error)
- return error;
- xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
- XFS_EXTENT_BUSY_SKIP_DISCARD);
-
- return 0;
-}
-
-/* Dispose of a single block. */
-STATIC int
-xrep_reap_block(
- uint64_t fsbno,
- void *priv)
-{
- struct xrep_reap_state *rs = priv;
- struct xfs_scrub *sc = rs->sc;
- struct xfs_btree_cur *cur;
- struct xfs_buf *agf_bp = NULL;
- xfs_agblock_t agbno;
- bool has_other_rmap;
- int error;
-
- ASSERT(sc->ip != NULL ||
- XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
- trace_xrep_dispose_btree_extent(sc->mp,
- XFS_FSB_TO_AGNO(sc->mp, fsbno),
- XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
-
- agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
- ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
-
- /*
- * If we are repairing per-inode metadata, we need to read in the AGF
- * buffer. Otherwise, we're repairing a per-AG structure, so reuse
- * the AGF buffer that the setup functions already grabbed.
- */
- if (sc->ip) {
- error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
- if (error)
- return error;
- } else {
- agf_bp = sc->sa.agf_bp;
- }
- cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag);
-
- /* Can we find any other rmappings? */
- error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
- &has_other_rmap);
- xfs_btree_del_cursor(cur, error);
- if (error)
- goto out_free;
-
- /*
- * If there are other rmappings, this block is cross linked and must
- * not be freed. Remove the reverse mapping and move on. Otherwise,
- * we were the only owner of the block, so free the extent, which will
- * also remove the rmap.
- *
- * XXX: XFS doesn't support detecting the case where a single block
- * metadata structure is crosslinked with a multi-block structure
- * because the buffer cache doesn't detect aliasing problems, so we
- * can't fix 100% of crosslinking problems (yet). The verifiers will
- * blow on writeout, the filesystem will shut down, and the admin gets
- * to run xfs_repair.
- */
- if (has_other_rmap)
- error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno,
- 1, rs->oinfo);
- else if (rs->resv == XFS_AG_RESV_AGFL)
- error = xrep_put_freelist(sc, agbno);
- else
- error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo,
- rs->resv);
- if (agf_bp != sc->sa.agf_bp)
- xfs_trans_brelse(sc->tp, agf_bp);
- if (error)
- return error;
-
- if (sc->ip)
- return xfs_trans_roll_inode(&sc->tp, sc->ip);
- return xrep_roll_ag_trans(sc);
-
-out_free:
- if (agf_bp != sc->sa.agf_bp)
- xfs_trans_brelse(sc->tp, agf_bp);
- return error;
-}
-
-/* Dispose of every block of every extent in the bitmap. */
-int
-xrep_reap_extents(
- struct xfs_scrub *sc,
- struct xbitmap *bitmap,
- const struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type)
-{
- struct xrep_reap_state rs = {
- .sc = sc,
- .oinfo = oinfo,
- .resv = type,
- };
-
- ASSERT(xfs_has_rmapbt(sc->mp));
-
- return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs);
-}
-
/*
* Finding per-AG Btree Roots for AGF/AGI Reconstruction
*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index dce791c679ee..60d2a9ae5f2e 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -8,6 +8,8 @@
#include "xfs_quota_defs.h"
+struct xchk_stats_run;
+
static inline int xrep_notsupported(struct xfs_scrub *sc)
{
return -EOPNOTSUPP;
@@ -15,28 +17,28 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
#ifdef CONFIG_XFS_ONLINE_REPAIR
+/*
+ * This is the maximum number of deferred extent free intent (EFI) items
+ * that we'll attach to a transaction without rolling it, so that we don't
+ * overrun a tr_itruncate reservation.
+ */
+#define XREP_MAX_ITRUNCATE_EFIS (128)
+
+
/* Repair helpers */
-int xrep_attempt(struct xfs_scrub *sc);
+int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run);
void xrep_failure(struct xfs_mount *mp);
int xrep_roll_ag_trans(struct xfs_scrub *sc);
+int xrep_defer_finish(struct xfs_scrub *sc);
bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
enum xfs_ag_resv_type type);
xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
-int xrep_alloc_ag_block(struct xfs_scrub *sc,
- const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
- enum xfs_ag_resv_type resv);
-int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
- struct xfs_buf **bpp, xfs_btnum_t btnum,
- const struct xfs_buf_ops *ops);
struct xbitmap;
struct xagb_bitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
-int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist);
-int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist,
- const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
struct xrep_find_ag_btree {
/* in: rmap owner of the btree we're looking for */
@@ -70,7 +72,8 @@ int xrep_agi(struct xfs_scrub *sc);
static inline int
xrep_attempt(
- struct xfs_scrub *sc)
+ struct xfs_scrub *sc,
+ struct xchk_stats_run *run)
{
return -EOPNOTSUPP;
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index e7dace7b4be8..008ddb599e13 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -19,19 +19,20 @@
/* Set us up with the realtime metadata locked. */
int
-xchk_setup_rt(
+xchk_setup_rtbitmap(
struct xfs_scrub *sc)
{
int error;
- error = xchk_setup_fs(sc);
+ error = xchk_trans_alloc(sc, 0);
if (error)
return error;
- sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP;
- sc->ip = sc->mp->m_rbmip;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ error = xchk_install_live_inode(sc, sc->mp->m_rbmip);
+ if (error)
+ return error;
+ xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
return 0;
}
@@ -123,43 +124,6 @@ out:
return error;
}
-/* Scrub the realtime summary. */
-int
-xchk_rtsummary(
- struct xfs_scrub *sc)
-{
- struct xfs_inode *rsumip = sc->mp->m_rsumip;
- struct xfs_inode *old_ip = sc->ip;
- uint old_ilock_flags = sc->ilock_flags;
- int error = 0;
-
- /*
- * We ILOCK'd the rt bitmap ip in the setup routine, now lock the
- * rt summary ip in compliance with the rt inode locking rules.
- *
- * Since we switch sc->ip to rsumip we have to save the old ilock
- * flags so that we don't mix up the inode state that @sc tracks.
- */
- sc->ip = rsumip;
- sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM;
- xfs_ilock(sc->ip, sc->ilock_flags);
-
- /* Invoke the fork scrubber. */
- error = xchk_metadata_inode_forks(sc);
- if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
- goto out;
-
- /* XXX: implement this some day */
- xchk_set_incomplete(sc);
-out:
- /* Switch back to the rtbitmap inode and lock flags. */
- xfs_iunlock(sc->ip, sc->ilock_flags);
- sc->ilock_flags = old_ilock_flags;
- sc->ip = old_ip;
- return error;
-}
-
-
/* xref check that the extent is not free in the rtbitmap */
void
xchk_xref_is_used_rt_space(
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
new file mode 100644
index 000000000000..437ed9acbb27
--- /dev/null
+++ b/fs/xfs/scrub/rtsummary.c
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/xfile.h"
+
+/*
+ * Realtime Summary
+ * ================
+ *
+ * We check the realtime summary by scanning the realtime bitmap file to create
+ * a new summary file incore, and then we compare the computed version against
+ * the ondisk version. We use the 'xfile' functionality to store this
+ * (potentially large) amount of data in pageable memory.
+ */
+
+/* Set us up to check the rtsummary file. */
+int
+xchk_setup_rtsummary(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ int error;
+
+ /*
+ * Create an xfile to construct a new rtsummary file. The xfile allows
+ * us to avoid pinning kernel memory for this purpose.
+ */
+ descr = xchk_xfile_descr(sc, "realtime summary file");
+ error = xfile_create(descr, mp->m_rsumsize, &sc->xfile);
+ kfree(descr);
+ if (error)
+ return error;
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ /* Allocate a memory buffer for the summary comparison. */
+ sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ error = xchk_install_live_inode(sc, mp->m_rsumip);
+ if (error)
+ return error;
+
+ /*
+ * Locking order requires us to take the rtbitmap first; only then
+ * can we lock the rtsummary inode. We must be careful to unlock
+ * the rtbitmap ourselves when we are done with it, since the scrub
+ * infrastructure won't do that for us.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+ return 0;
+}
+
+/* Helper functions to record suminfo words in an xfile. */
+
+typedef unsigned int xchk_rtsumoff_t;
+
+static inline int
+xfsum_load(
+ struct xfs_scrub *sc,
+ xchk_rtsumoff_t sumoff,
+ xfs_suminfo_t *info)
+{
+ return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t),
+ sumoff << XFS_WORDLOG);
+}
+
+static inline int
+xfsum_store(
+ struct xfs_scrub *sc,
+ xchk_rtsumoff_t sumoff,
+ const xfs_suminfo_t info)
+{
+ return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t),
+ sumoff << XFS_WORDLOG);
+}
+
+static inline int
+xfsum_copyout(
+ struct xfs_scrub *sc,
+ xchk_rtsumoff_t sumoff,
+ xfs_suminfo_t *info,
+ unsigned int nr_words)
+{
+ return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG,
+ sumoff << XFS_WORDLOG);
+}
+
+/* Update the summary file to reflect the free extent that we've accumulated. */
+STATIC int
+xchk_rtsum_record_free(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_scrub *sc = priv;
+ xfs_fileoff_t rbmoff;
+ xfs_rtblock_t rtbno;
+ xfs_filblks_t rtlen;
+ xchk_rtsumoff_t offs;
+ unsigned int lenlog;
+ xfs_suminfo_t v = 0;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /* Compute the relevant location in the rtsum file. */
+ rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext);
+ lenlog = XFS_RTBLOCKLOG(rec->ar_extcount);
+ offs = XFS_SUMOFFS(mp, lenlog, rbmoff);
+
+ rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
+ rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize;
+
+ if (!xfs_verify_rtext(mp, rtbno, rtlen)) {
+ xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return -EFSCORRUPTED;
+ }
+
+ /* Bump the summary count. */
+ error = xfsum_load(sc, offs, &v);
+ if (error)
+ return error;
+
+ v++;
+ trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount,
+ lenlog, offs, v);
+
+ return xfsum_store(sc, offs, v);
+}
+
+/* Compute the realtime summary from the realtime bitmap. */
+STATIC int
+xchk_rtsum_compute(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long rtbmp_bytes;
+
+ /* If the bitmap size doesn't match the computed size, bail. */
+ rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY);
+ if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) !=
+ mp->m_rbmip->i_disk_size)
+ return -EFSCORRUPTED;
+
+ return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free,
+ sc);
+}
+
+/* Compare the rtsummary file against the one we computed. */
+STATIC int
+xchk_rtsum_compare(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_bmbt_irec map;
+ xfs_fileoff_t off;
+ xchk_rtsumoff_t sumoff = 0;
+ int nmap;
+
+ for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) {
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* Make sure we have a written extent. */
+ nmap = 1;
+ error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap,
+ XFS_DATA_FORK);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
+ return error;
+
+ if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ return 0;
+ }
+
+ /* Read a block's worth of ondisk rtsummary file. */
+ error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
+ return error;
+
+ /* Read a block's worth of computed rtsummary file. */
+ error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize);
+ if (error) {
+ xfs_trans_brelse(sc->tp, bp);
+ return error;
+ }
+
+ if (memcmp(bp->b_addr, sc->buf,
+ mp->m_blockwsize << XFS_WORDLOG) != 0)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+
+ xfs_trans_brelse(sc->tp, bp);
+ sumoff += mp->m_blockwsize;
+ }
+
+ return 0;
+}
+
+/* Scrub the realtime summary. */
+int
+xchk_rtsummary(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ /* Invoke the fork scrubber. */
+ error = xchk_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ goto out_rbm;
+
+ /* Construct the new summary file from the rtbitmap. */
+ error = xchk_rtsum_compute(sc);
+ if (error == -EFSCORRUPTED) {
+ /*
+ * EFSCORRUPTED means the rtbitmap is corrupt, which is an xref
+ * error since we're checking the summary file.
+ */
+ xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
+ error = 0;
+ goto out_rbm;
+ }
+ if (error)
+ goto out_rbm;
+
+ /* Does the computed summary file match the actual rtsummary file? */
+ error = xchk_rtsum_compare(sc);
+
+out_rbm:
+ /* Unlock the rtbitmap since we're done with it. */
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ return error;
+}
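
The rtsummary scrubber above is reached through the same scrub ioctl as every other checker, with sm_type set to XFS_SCRUB_TYPE_RTSUM. A rough userspace sketch follows; the header path, helper name, and error handling are illustrative assumptions rather than anything this patch defines.

/* Minimal sketch: ask the kernel to check the realtime summary file. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>		/* assumed location of the xfs_fs.h uapi definitions */

int
check_rtsummary(const char *mountpoint)
{
	struct xfs_scrub_metadata	sm = { .sm_type = XFS_SCRUB_TYPE_RTSUM };
	int				fd, ret;

	fd = open(mountpoint, O_RDONLY);	/* any fd on the filesystem will do */
	if (fd < 0)
		return -1;

	ret = ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm);
	if (ret == 0 && (sm.sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		fprintf(stderr, "rtsummary needs repair\n");

	close(fd);
	return ret;
}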
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index a0fffbcd022b..7d3aa14d81b5 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -22,6 +22,8 @@
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"
+#include "scrub/stats.h"
+#include "scrub/xfile.h"
/*
* Online Scrub and Repair
@@ -166,8 +168,6 @@ xchk_teardown(
struct xfs_scrub *sc,
int error)
{
- struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
-
xchk_ag_free(sc, &sc->sa);
if (sc->tp) {
if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
@@ -178,16 +178,18 @@ xchk_teardown(
}
if (sc->ip) {
if (sc->ilock_flags)
- xfs_iunlock(sc->ip, sc->ilock_flags);
- if (sc->ip != ip_in &&
- !xfs_internal_inum(sc->mp, sc->ip->i_ino))
- xchk_irele(sc, sc->ip);
+ xchk_iunlock(sc, sc->ilock_flags);
+ xchk_irele(sc, sc->ip);
sc->ip = NULL;
}
if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
mnt_drop_write_file(sc->file);
}
+ if (sc->xfile) {
+ xfile_destroy(sc->xfile);
+ sc->xfile = NULL;
+ }
if (sc->buf) {
if (sc->buf_cleanup)
sc->buf_cleanup(sc->buf);
@@ -322,14 +324,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_FS,
- .setup = xchk_setup_rt,
+ .setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.has = xfs_has_realtime,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_FS,
- .setup = xchk_setup_rt,
+ .setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.has = xfs_has_realtime,
.repair = xrep_notsupported,
@@ -409,6 +411,11 @@ xchk_validate_inputs(
goto out;
}
+ /* No rebuild without repair. */
+ if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) &&
+ !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ return -EINVAL;
+
/*
* We only want to repair read-write v5+ filesystems. Defer the check
* for ops->repair until after our scrub confirms that we need to
@@ -463,8 +470,10 @@ xfs_scrub_metadata(
struct file *file,
struct xfs_scrub_metadata *sm)
{
+ struct xchk_stats_run run = { };
struct xfs_scrub *sc;
struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
+ u64 check_start;
int error = 0;
BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
@@ -521,7 +530,9 @@ retry_op:
goto out_teardown;
/* Scrub for errors. */
+ check_start = xchk_stats_now();
error = sc->ops->scrub(sc);
+ run.scrub_ns += xchk_stats_elapsed_ns(check_start);
if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
goto try_harder;
if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
@@ -533,15 +544,16 @@ retry_op:
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
!(sc->flags & XREP_ALREADY_FIXED)) {
- bool needs_fix;
+ bool needs_fix = xchk_needs_repair(sc->sm);
+
+ /* Userspace asked us to rebuild the structure regardless. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
+ needs_fix = true;
/* Let debug users force us into the repair routines. */
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
- sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ needs_fix = true;
- needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
- XFS_SCRUB_OFLAG_XCORRUPT |
- XFS_SCRUB_OFLAG_PREEN));
/*
* If userspace asked for a repair but it wasn't necessary,
* report that back to userspace.
@@ -555,7 +567,7 @@ retry_op:
* If it's broken, userspace wants us to fix it, and we haven't
* already tried to fix it, then attempt a repair.
*/
- error = xrep_attempt(sc);
+ error = xrep_attempt(sc, &run);
if (error == -EAGAIN) {
/*
* Either the repair function succeeded or it couldn't
@@ -583,12 +595,15 @@ out:
sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
error = 0;
}
+ if (error != -ENOENT)
+ xchk_stats_merge(mp, sm, &run);
return error;
need_drain:
error = xchk_teardown(sc, 0);
if (error)
goto out_sc;
sc->flags |= XCHK_NEED_DRAIN;
+ run.retries++;
goto retry_op;
try_harder:
/*
@@ -600,5 +615,6 @@ try_harder:
if (error)
goto out_sc;
sc->flags |= XCHK_TRY_HARDER;
+ run.retries++;
goto retry_op;
}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index f8ba00e51ca9..1ef9c6b4842a 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -88,6 +88,10 @@ struct xfs_scrub {
*/
void (*buf_cleanup)(void *buf);
+ /* xfile used by the scrubbers; freed at teardown. */
+ struct xfile *xfile;
+
+ /* Lock flags for @ip. */
uint ilock_flags;
/* See the XCHK/XREP state flags below. */
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
new file mode 100644
index 000000000000..aeb92624176b
--- /dev/null
+++ b/fs/xfs/scrub/stats.c
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_sysfs.h"
+#include "xfs_btree.h"
+#include "xfs_super.h"
+#include "scrub/scrub.h"
+#include "scrub/stats.h"
+#include "scrub/trace.h"
+
+struct xchk_scrub_stats {
+ /* all 32-bit counters here */
+
+ /* checking stats */
+ uint32_t invocations;
+ uint32_t clean;
+ uint32_t corrupt;
+ uint32_t preen;
+ uint32_t xfail;
+ uint32_t xcorrupt;
+ uint32_t incomplete;
+ uint32_t warning;
+ uint32_t retries;
+
+ /* repair stats */
+ uint32_t repair_invocations;
+ uint32_t repair_success;
+
+ /* all 64-bit items here */
+
+ /* runtimes */
+ uint64_t checktime_us;
+ uint64_t repairtime_us;
+
+ /* non-counter state must go at the end for clearall */
+ spinlock_t css_lock;
+};
+
+struct xchk_stats {
+ struct dentry *cs_debugfs;
+ struct xchk_scrub_stats cs_stats[XFS_SCRUB_TYPE_NR];
+};
+
+
+static struct xchk_stats global_stats;
+
+static const char *name_map[XFS_SCRUB_TYPE_NR] = {
+ [XFS_SCRUB_TYPE_SB] = "sb",
+ [XFS_SCRUB_TYPE_AGF] = "agf",
+ [XFS_SCRUB_TYPE_AGFL] = "agfl",
+ [XFS_SCRUB_TYPE_AGI] = "agi",
+ [XFS_SCRUB_TYPE_BNOBT] = "bnobt",
+ [XFS_SCRUB_TYPE_CNTBT] = "cntbt",
+ [XFS_SCRUB_TYPE_INOBT] = "inobt",
+ [XFS_SCRUB_TYPE_FINOBT] = "finobt",
+ [XFS_SCRUB_TYPE_RMAPBT] = "rmapbt",
+ [XFS_SCRUB_TYPE_REFCNTBT] = "refcountbt",
+ [XFS_SCRUB_TYPE_INODE] = "inode",
+ [XFS_SCRUB_TYPE_BMBTD] = "bmapbtd",
+ [XFS_SCRUB_TYPE_BMBTA] = "bmapbta",
+ [XFS_SCRUB_TYPE_BMBTC] = "bmapbtc",
+ [XFS_SCRUB_TYPE_DIR] = "directory",
+ [XFS_SCRUB_TYPE_XATTR] = "xattr",
+ [XFS_SCRUB_TYPE_SYMLINK] = "symlink",
+ [XFS_SCRUB_TYPE_PARENT] = "parent",
+ [XFS_SCRUB_TYPE_RTBITMAP] = "rtbitmap",
+ [XFS_SCRUB_TYPE_RTSUM] = "rtsummary",
+ [XFS_SCRUB_TYPE_UQUOTA] = "usrquota",
+ [XFS_SCRUB_TYPE_GQUOTA] = "grpquota",
+ [XFS_SCRUB_TYPE_PQUOTA] = "prjquota",
+ [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters",
+};
+
+/* Format the scrub stats into a text buffer, similar to pcp style. */
+STATIC ssize_t
+xchk_stats_format(
+ struct xchk_stats *cs,
+ char *buf,
+ size_t remaining)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+ ssize_t copied = 0;
+ int ret = 0;
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) {
+ if (!name_map[i])
+ continue;
+
+ ret = scnprintf(buf, remaining,
+ "%s %u %u %u %u %u %u %u %u %u %llu %u %u %llu\n",
+ name_map[i],
+ (unsigned int)css->invocations,
+ (unsigned int)css->clean,
+ (unsigned int)css->corrupt,
+ (unsigned int)css->preen,
+ (unsigned int)css->xfail,
+ (unsigned int)css->xcorrupt,
+ (unsigned int)css->incomplete,
+ (unsigned int)css->warning,
+ (unsigned int)css->retries,
+ (unsigned long long)css->checktime_us,
+ (unsigned int)css->repair_invocations,
+ (unsigned int)css->repair_success,
+ (unsigned long long)css->repairtime_us);
+ if (ret <= 0)
+ break;
+
+ remaining -= ret;
+ copied += ret;
+ buf += ret;
+ }
+
+ return copied > 0 ? copied : ret;
+}
+
+/* Estimate the worst case buffer size required to hold the whole report. */
+STATIC size_t
+xchk_stats_estimate_bufsize(
+ struct xchk_stats *cs)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+ size_t field_width;
+ size_t ret = 0;
+
+ /* 4294967295 (at most 10 digits) plus one space for each u32 field */
+ field_width = 11 * (offsetof(struct xchk_scrub_stats, checktime_us) /
+ sizeof(uint32_t));
+
+ /* 18446744073709551615 plus one space for each u64 field */
+ field_width += 21 * ((offsetof(struct xchk_scrub_stats, css_lock) -
+ offsetof(struct xchk_scrub_stats, checktime_us)) /
+ sizeof(uint64_t));
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) {
+ if (!name_map[i])
+ continue;
+
+ /* name plus one space */
+ ret += 1 + strlen(name_map[i]);
+
+ /* all fields, plus newline */
+ ret += field_width + 1;
+ }
+
+ return ret;
+}
+
+/* Clear all counters. */
+STATIC void
+xchk_stats_clearall(
+ struct xchk_stats *cs)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) {
+ spin_lock(&css->css_lock);
+ memset(css, 0, offsetof(struct xchk_scrub_stats, css_lock));
+ spin_unlock(&css->css_lock);
+ }
+}
+
+#define XFS_SCRUB_OFLAG_UNCLEAN (XFS_SCRUB_OFLAG_CORRUPT | \
+ XFS_SCRUB_OFLAG_PREEN | \
+ XFS_SCRUB_OFLAG_XFAIL | \
+ XFS_SCRUB_OFLAG_XCORRUPT | \
+ XFS_SCRUB_OFLAG_INCOMPLETE | \
+ XFS_SCRUB_OFLAG_WARNING)
+
+STATIC void
+xchk_stats_merge_one(
+ struct xchk_stats *cs,
+ const struct xfs_scrub_metadata *sm,
+ const struct xchk_stats_run *run)
+{
+ struct xchk_scrub_stats *css;
+
+ ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR);
+
+ css = &cs->cs_stats[sm->sm_type];
+ spin_lock(&css->css_lock);
+ css->invocations++;
+ if (!(sm->sm_flags & XFS_SCRUB_OFLAG_UNCLEAN))
+ css->clean++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ css->corrupt++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_PREEN)
+ css->preen++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_XFAIL)
+ css->xfail++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)
+ css->xcorrupt++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ css->incomplete++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_WARNING)
+ css->warning++;
+ css->retries += run->retries;
+ css->checktime_us += howmany_64(run->scrub_ns, NSEC_PER_USEC);
+
+ if (run->repair_attempted)
+ css->repair_invocations++;
+ if (run->repair_succeeded)
+ css->repair_success++;
+ css->repairtime_us += howmany_64(run->repair_ns, NSEC_PER_USEC);
+ spin_unlock(&css->css_lock);
+}
+
+/* Merge these scrub-run stats into the global and mount stat data. */
+void
+xchk_stats_merge(
+ struct xfs_mount *mp,
+ const struct xfs_scrub_metadata *sm,
+ const struct xchk_stats_run *run)
+{
+ xchk_stats_merge_one(&global_stats, sm, run);
+ xchk_stats_merge_one(mp->m_scrub_stats, sm, run);
+}
+
+/* debugfs boilerplate */
+
+static ssize_t
+xchk_scrub_stats_read(
+ struct file *file,
+ char __user *ubuf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct xchk_stats *cs = file->private_data;
+ char *buf;
+ size_t bufsize;
+ ssize_t avail, ret;
+
+ /*
+ * This generates a string snapshot of all the scrub counters, and we
+ * do not want userspace to receive garbled text from multiple calls.
+ * If the file position is greater than 0, return a short read.
+ */
+ if (*ppos > 0)
+ return 0;
+
+ bufsize = xchk_stats_estimate_bufsize(cs);
+
+ buf = kvmalloc(bufsize, XCHK_GFP_FLAGS);
+ if (!buf)
+ return -ENOMEM;
+
+ avail = xchk_stats_format(cs, buf, bufsize);
+ if (avail < 0) {
+ ret = avail;
+ goto out;
+ }
+
+ ret = simple_read_from_buffer(ubuf, count, ppos, buf, avail);
+out:
+ kvfree(buf);
+ return ret;
+}
+
+static const struct file_operations scrub_stats_fops = {
+ .open = simple_open,
+ .read = xchk_scrub_stats_read,
+};
+
+static ssize_t
+xchk_clear_scrub_stats_write(
+ struct file *file,
+ const char __user *ubuf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct xchk_stats *cs = file->private_data;
+ unsigned int val;
+ int ret;
+
+ ret = kstrtouint_from_user(ubuf, count, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val != 1)
+ return -EINVAL;
+
+ xchk_stats_clearall(cs);
+ return count;
+}
+
+static const struct file_operations clear_scrub_stats_fops = {
+ .open = simple_open,
+ .write = xchk_clear_scrub_stats_write,
+};
+
+/* Initialize the stats object. */
+STATIC int
+xchk_stats_init(
+ struct xchk_stats *cs,
+ struct xfs_mount *mp)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++)
+ spin_lock_init(&css->css_lock);
+
+ return 0;
+}
+
+/* Connect the stats object to debugfs. */
+void
+xchk_stats_register(
+ struct xchk_stats *cs,
+ struct dentry *parent)
+{
+ if (!parent)
+ return;
+
+ cs->cs_debugfs = xfs_debugfs_mkdir("scrub", parent);
+ if (!cs->cs_debugfs)
+ return;
+
+ debugfs_create_file("stats", 0644, cs->cs_debugfs, cs,
+ &scrub_stats_fops);
+ debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs,
+ &clear_scrub_stats_fops);
+}
+
+/* Free all resources related to the stats object. */
+STATIC int
+xchk_stats_teardown(
+ struct xchk_stats *cs)
+{
+ return 0;
+}
+
+/* Disconnect the stats object from debugfs. */
+void
+xchk_stats_unregister(
+ struct xchk_stats *cs)
+{
+ debugfs_remove(cs->cs_debugfs);
+}
+
+/* Initialize global stats and register them */
+int __init
+xchk_global_stats_setup(
+ struct dentry *parent)
+{
+ int error;
+
+ error = xchk_stats_init(&global_stats, NULL);
+ if (error)
+ return error;
+
+ xchk_stats_register(&global_stats, parent);
+ return 0;
+}
+
+/* Unregister global stats and tear them down */
+void
+xchk_global_stats_teardown(void)
+{
+ xchk_stats_unregister(&global_stats);
+ xchk_stats_teardown(&global_stats);
+}
+
+/* Allocate per-mount stats */
+int
+xchk_mount_stats_alloc(
+ struct xfs_mount *mp)
+{
+ struct xchk_stats *cs;
+ int error;
+
+ cs = kvzalloc(sizeof(struct xchk_stats), GFP_KERNEL);
+ if (!cs)
+ return -ENOMEM;
+
+ error = xchk_stats_init(cs, mp);
+ if (error)
+ goto out_free;
+
+ mp->m_scrub_stats = cs;
+ return 0;
+out_free:
+ kvfree(cs);
+ return error;
+}
+
+/* Free per-mount stats */
+void
+xchk_mount_stats_free(
+ struct xfs_mount *mp)
+{
+ xchk_stats_teardown(mp->m_scrub_stats);
+ kvfree(mp->m_scrub_stats);
+ mp->m_scrub_stats = NULL;
+}
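
For reference, each line that xchk_stats_format() emits above follows its scnprintf format string, so the per-type columns appear in this order (the first column is the scrub type name from name_map[]):

  type invocations clean corrupt preen xfail xcorrupt incomplete warning retries checktime_us repair_invocations repair_success repairtime_us

checktime_us and repairtime_us are cumulative microseconds; all other fields are 32-bit counters.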
diff --git a/fs/xfs/scrub/stats.h b/fs/xfs/scrub/stats.h
new file mode 100644
index 000000000000..b358ad8d8b90
--- /dev/null
+++ b/fs/xfs/scrub/stats.h
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_STATS_H__
+#define __XFS_SCRUB_STATS_H__
+
+struct xchk_stats_run {
+ u64 scrub_ns;
+ u64 repair_ns;
+ unsigned int retries;
+ bool repair_attempted;
+ bool repair_succeeded;
+};
+
+#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS
+struct xchk_stats;
+
+int __init xchk_global_stats_setup(struct dentry *parent);
+void xchk_global_stats_teardown(void);
+
+int xchk_mount_stats_alloc(struct xfs_mount *mp);
+void xchk_mount_stats_free(struct xfs_mount *mp);
+
+void xchk_stats_register(struct xchk_stats *cs, struct dentry *parent);
+void xchk_stats_unregister(struct xchk_stats *cs);
+
+void xchk_stats_merge(struct xfs_mount *mp, const struct xfs_scrub_metadata *sm,
+ const struct xchk_stats_run *run);
+
+static inline u64 xchk_stats_now(void) { return ktime_get_ns(); }
+static inline u64 xchk_stats_elapsed_ns(u64 since)
+{
+ u64 now = xchk_stats_now();
+
+ /*
+ * If the system doesn't have a high enough resolution clock, charge at
+ * least one nanosecond so that our stats don't report instantaneous
+ * runtimes.
+ */
+ if (now == since)
+ return 1;
+
+ return now - since;
+}
+#else
+# define xchk_global_stats_setup(parent) (0)
+# define xchk_global_stats_teardown() ((void)0)
+# define xchk_mount_stats_alloc(mp) (0)
+# define xchk_mount_stats_free(mp) ((void)0)
+# define xchk_stats_register(cs, parent) ((void)0)
+# define xchk_stats_unregister(cs) ((void)0)
+# define xchk_stats_now() (0)
+# define xchk_stats_elapsed_ns(x) (0 * (x))
+# define xchk_stats_merge(mp, sm, run) ((void)0)
+#endif /* CONFIG_XFS_ONLINE_SCRUB_STATS */
+
+#endif /* __XFS_SCRUB_STATS_H__ */
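
Putting the helpers above together with the xfs_scrub_metadata() hunks earlier in this patch, the expected calling pattern looks roughly like the schematic fragment below (it assumes sc, sm, and mp are already in scope as they are in xfs_scrub_metadata(); the repair-side fields of the run structure are presumably filled in by xrep_attempt(), which now takes the run pointer):

	/* Schematic fragment mirroring the scrub.c changes in this patch. */
	struct xchk_stats_run	run = { };
	u64			check_start;
	int			error;

	check_start = xchk_stats_now();
	error = sc->ops->scrub(sc);		/* time only the check phase */
	run.scrub_ns += xchk_stats_elapsed_ns(check_start);

	/* ... possibly xrep_attempt(sc, &run), which accounts the repair side ... */

	if (error != -ENOENT)			/* don't count nonexistent objects */
		xchk_stats_merge(mp, sm, &run);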
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 0a975439d2b6..46249e7b17e0 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -12,8 +12,10 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "scrub/scrub.h"
#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 0b54f1a1cf0c..cbd4d01e253c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -16,6 +16,10 @@
#include <linux/tracepoint.h>
#include "xfs_bit.h"
+struct xfile;
+struct xfarray;
+struct xfarray_sortinfo;
+
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
* TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
@@ -94,7 +98,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \
{ XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \
{ XFS_SCRUB_OFLAG_WARNING, "warning" }, \
- { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }
+ { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }, \
+ { XFS_SCRUB_IFLAG_FORCE_REBUILD, "rebuild" }
#define XFS_SCRUB_STATE_STRINGS \
{ XCHK_TRY_HARDER, "try_harder" }, \
@@ -636,6 +641,28 @@ TRACE_EVENT(xchk_iallocbt_check_cluster,
__entry->cluster_ino)
)
+TRACE_EVENT(xchk_inode_is_allocated,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned long, iflags)
+ __field(umode_t, mode)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->iflags = ip->i_flags;
+ __entry->mode = VFS_I(ip)->i_mode;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx mode 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->iflags,
+ __entry->mode)
+);
+
TRACE_EVENT(xchk_fscounters_calc,
TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree,
uint64_t fdblocks, uint64_t delalloc),
@@ -751,13 +778,302 @@ TRACE_EVENT(xchk_refcount_incorrect,
__entry->seen)
)
+TRACE_EVENT(xfile_create,
+ TP_PROTO(struct xfile *xf),
+ TP_ARGS(xf),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, ino)
+ __array(char, pathname, 256)
+ ),
+ TP_fast_assign(
+ char pathname[257];
+ char *path;
+
+ __entry->ino = file_inode(xf->file)->i_ino;
+ memset(pathname, 0, sizeof(pathname));
+ path = file_path(xf->file, pathname, sizeof(pathname) - 1);
+ if (IS_ERR(path))
+ path = "(unknown)";
+ strncpy(__entry->pathname, path, sizeof(__entry->pathname));
+ ),
+ TP_printk("xfino 0x%lx path '%s'",
+ __entry->ino,
+ __entry->pathname)
+);
+
+TRACE_EVENT(xfile_destroy,
+ TP_PROTO(struct xfile *xf),
+ TP_ARGS(xf),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, bytes)
+ __field(loff_t, size)
+ ),
+ TP_fast_assign(
+ struct xfile_stat statbuf;
+ int ret;
+
+ ret = xfile_stat(xf, &statbuf);
+ if (!ret) {
+ __entry->bytes = statbuf.bytes;
+ __entry->size = statbuf.size;
+ } else {
+ __entry->bytes = -1;
+ __entry->size = -1;
+ }
+ __entry->ino = file_inode(xf->file)->i_ino;
+ ),
+ TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx",
+ __entry->ino,
+ __entry->bytes,
+ __entry->size)
+);
+
+DECLARE_EVENT_CLASS(xfile_class,
+ TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount),
+ TP_ARGS(xf, pos, bytecount),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, bytes_used)
+ __field(loff_t, pos)
+ __field(loff_t, size)
+ __field(unsigned long long, bytecount)
+ ),
+ TP_fast_assign(
+ struct xfile_stat statbuf;
+ int ret;
+
+ ret = xfile_stat(xf, &statbuf);
+ if (!ret) {
+ __entry->bytes_used = statbuf.bytes;
+ __entry->size = statbuf.size;
+ } else {
+ __entry->bytes_used = -1;
+ __entry->size = -1;
+ }
+ __entry->ino = file_inode(xf->file)->i_ino;
+ __entry->pos = pos;
+ __entry->bytecount = bytecount;
+ ),
+ TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx",
+ __entry->ino,
+ __entry->bytes_used,
+ __entry->pos,
+ __entry->bytecount,
+ __entry->size)
+);
+#define DEFINE_XFILE_EVENT(name) \
+DEFINE_EVENT(xfile_class, name, \
+ TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \
+ TP_ARGS(xf, pos, bytecount))
+DEFINE_XFILE_EVENT(xfile_pread);
+DEFINE_XFILE_EVENT(xfile_pwrite);
+DEFINE_XFILE_EVENT(xfile_seek_data);
+DEFINE_XFILE_EVENT(xfile_get_page);
+DEFINE_XFILE_EVENT(xfile_put_page);
+
+TRACE_EVENT(xfarray_create,
+ TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity),
+ TP_ARGS(xfa, required_capacity),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(uint64_t, max_nr)
+ __field(size_t, obj_size)
+ __field(int, obj_size_log)
+ __field(unsigned long long, required_capacity)
+ ),
+ TP_fast_assign(
+ __entry->max_nr = xfa->max_nr;
+ __entry->obj_size = xfa->obj_size;
+ __entry->obj_size_log = xfa->obj_size_log;
+ __entry->ino = file_inode(xfa->xfile->file)->i_ino;
+ __entry->required_capacity = required_capacity;
+ ),
+ TP_printk("xfino 0x%lx max_nr %llu reqd_nr %llu objsz %zu objszlog %d",
+ __entry->ino,
+ __entry->max_nr,
+ __entry->required_capacity,
+ __entry->obj_size,
+ __entry->obj_size_log)
+);
+
+TRACE_EVENT(xfarray_isort,
+ TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
+ TP_ARGS(si, lo, hi),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, lo)
+ __field(unsigned long long, hi)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->lo = lo;
+ __entry->hi = hi;
+ ),
+ TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu",
+ __entry->ino,
+ __entry->lo,
+ __entry->hi,
+ __entry->hi - __entry->lo)
+);
+
+TRACE_EVENT(xfarray_pagesort,
+ TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
+ TP_ARGS(si, lo, hi),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, lo)
+ __field(unsigned long long, hi)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->lo = lo;
+ __entry->hi = hi;
+ ),
+ TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu",
+ __entry->ino,
+ __entry->lo,
+ __entry->hi,
+ __entry->hi - __entry->lo)
+);
+
+TRACE_EVENT(xfarray_qsort,
+ TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
+ TP_ARGS(si, lo, hi),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, lo)
+ __field(unsigned long long, hi)
+ __field(int, stack_depth)
+ __field(int, max_stack_depth)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->lo = lo;
+ __entry->hi = hi;
+ __entry->stack_depth = si->stack_depth;
+ __entry->max_stack_depth = si->max_stack_depth;
+ ),
+ TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu stack %d/%d",
+ __entry->ino,
+ __entry->lo,
+ __entry->hi,
+ __entry->hi - __entry->lo,
+ __entry->stack_depth,
+ __entry->max_stack_depth)
+);
+
+TRACE_EVENT(xfarray_sort,
+ TP_PROTO(struct xfarray_sortinfo *si, size_t bytes),
+ TP_ARGS(si, bytes),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, nr)
+ __field(size_t, obj_size)
+ __field(size_t, bytes)
+ __field(unsigned int, max_stack_depth)
+ ),
+ TP_fast_assign(
+ __entry->nr = si->array->nr;
+ __entry->obj_size = si->array->obj_size;
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->bytes = bytes;
+ __entry->max_stack_depth = si->max_stack_depth;
+ ),
+ TP_printk("xfino 0x%lx nr %llu objsz %zu stack %u bytes %zu",
+ __entry->ino,
+ __entry->nr,
+ __entry->obj_size,
+ __entry->max_stack_depth,
+ __entry->bytes)
+);
+
+TRACE_EVENT(xfarray_sort_stats,
+ TP_PROTO(struct xfarray_sortinfo *si, int error),
+ TP_ARGS(si, error),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+#ifdef DEBUG
+ __field(unsigned long long, loads)
+ __field(unsigned long long, stores)
+ __field(unsigned long long, compares)
+ __field(unsigned long long, heapsorts)
+#endif
+ __field(unsigned int, max_stack_depth)
+ __field(unsigned int, max_stack_used)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+#ifdef DEBUG
+ __entry->loads = si->loads;
+ __entry->stores = si->stores;
+ __entry->compares = si->compares;
+ __entry->heapsorts = si->heapsorts;
+#endif
+ __entry->max_stack_depth = si->max_stack_depth;
+ __entry->max_stack_used = si->max_stack_used;
+ __entry->error = error;
+ ),
+ TP_printk(
+#ifdef DEBUG
+ "xfino 0x%lx loads %llu stores %llu compares %llu heapsorts %llu stack_depth %u/%u error %d",
+#else
+ "xfino 0x%lx stack_depth %u/%u error %d",
+#endif
+ __entry->ino,
+#ifdef DEBUG
+ __entry->loads,
+ __entry->stores,
+ __entry->compares,
+ __entry->heapsorts,
+#endif
+ __entry->max_stack_used,
+ __entry->max_stack_depth,
+ __entry->error)
+);
+
+#ifdef CONFIG_XFS_RT
+TRACE_EVENT(xchk_rtsum_record_free,
+ TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start,
+ uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v),
+ TP_ARGS(mp, start, len, log, pos, v),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(xfs_rtblock_t, start)
+ __field(unsigned long long, len)
+ __field(unsigned int, log)
+ __field(loff_t, pos)
+ __field(xfs_suminfo_t, v)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->start = start;
+ __entry->len = len;
+ __entry->log = log;
+ __entry->pos = pos;
+ __entry->v = v;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->start,
+ __entry->len,
+ __entry->log,
+ __entry->pos,
+ __entry->v)
+);
+#endif /* CONFIG_XFS_RT */
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
DECLARE_EVENT_CLASS(xrep_extent_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len),
- TP_ARGS(mp, agno, agbno, len),
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(pag, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -765,8 +1081,8 @@ DECLARE_EVENT_CLASS(xrep_extent_class,
__field(xfs_extlen_t, len)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
__entry->agbno = agbno;
__entry->len = len;
),
@@ -778,12 +1094,45 @@ DECLARE_EVENT_CLASS(xrep_extent_class,
);
#define DEFINE_REPAIR_EXTENT_EVENT(name) \
DEFINE_EVENT(xrep_extent_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- xfs_agblock_t agbno, xfs_extlen_t len), \
- TP_ARGS(mp, agno, agbno, len))
-DEFINE_REPAIR_EXTENT_EVENT(xrep_dispose_btree_extent);
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(pag, agbno, len))
+DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
+DECLARE_EVENT_CLASS(xrep_reap_find_class,
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len,
+ bool crosslinked),
+ TP_ARGS(pag, agbno, len, crosslinked),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(bool, crosslinked)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->crosslinked = crosslinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->crosslinked ? 1 : 0)
+);
+#define DEFINE_REPAIR_REAP_FIND_EVENT(name) \
+DEFINE_EVENT(xrep_reap_find_class, name, \
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \
+ bool crosslinked), \
+ TP_ARGS(pag, agbno, len, crosslinked))
+DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select);
+
DECLARE_EVENT_CLASS(xrep_rmap_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len,
@@ -853,28 +1202,6 @@ TRACE_EVENT(xrep_refcount_extent_fn,
__entry->refcount)
)
-TRACE_EVENT(xrep_init_btblock,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
- xfs_btnum_t btnum),
- TP_ARGS(mp, agno, agbno, btnum),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, agbno)
- __field(uint32_t, btnum)
- ),
- TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->agbno = agbno;
- __entry->btnum = btnum;
- ),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x btree %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->agno,
- __entry->agbno,
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS))
-)
TRACE_EVENT(xrep_findroot_block,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
uint32_t magic, uint16_t level),
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
new file mode 100644
index 000000000000..f0f532c10a5a
--- /dev/null
+++ b/fs/xfs/scrub/xfarray.c
@@ -0,0 +1,1083 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2021-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/scrub.h"
+#include "scrub/trace.h"
+
+/*
+ * Large Arrays of Fixed-Size Records
+ * ==================================
+ *
+ * This memory array uses an xfile (which itself is a memfd "file") to store
+ * large numbers of fixed-size records in memory that can be paged out. This
+ * puts less stress on the memory reclaim algorithms during an online repair
+ * because we don't have to pin so much memory. However, array access is less
+ * direct than it would be in a regular memory array. Access to the array is
+ * performed via indexed load and store methods, and an append method is
+ * provided for convenience. Array elements can be unset, which sets them to
+ * all zeroes. Unset entries are skipped during iteration, though direct loads
+ * will return a zeroed buffer. Callers are responsible for concurrency
+ * control.
+ */
+
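
As a minimal sketch of the interface described above (the record layout and the xdemo naming are invented for illustration; real callers live elsewhere in the scrub code):

/* Collect fixed-size records into an xfarray and walk the non-null entries. */
struct xdemo_rec {
	uint32_t	key;
	uint32_t	value;
};

static int
xdemo_use_xfarray(void)
{
	struct xfarray		*array;
	struct xdemo_rec	rec = { .key = 1, .value = 42 };
	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
	int			error;

	/* No fixed capacity (0); each record must fit within one page. */
	error = xfarray_create("demo records", 0, sizeof(rec), &array);
	if (error)
		return error;

	/* Append one record; wholly zeroed records must be avoided. */
	error = xfarray_append(array, &rec);
	if (error)
		goto out_destroy;

	/* Walk every non-null record; -ENODATA marks the end of the array. */
	while ((error = xfarray_load_next(array, &cur, &rec)) == 0) {
		/* ... examine rec ... */
	}
	if (error == -ENODATA)
		error = 0;

out_destroy:
	xfarray_destroy(array);
	return error;
}

Note that storing an all-zeroes record is avoided above because zeroed entries are how unset sparse slots are represented.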
+/*
+ * Pointer to scratch space. Because we can't access the xfile data directly,
+ * we allocate a small amount of memory on the end of the xfarray structure to
+ * buffer array items when we need space to store values temporarily.
+ */
+static inline void *xfarray_scratch(struct xfarray *array)
+{
+ return (array + 1);
+}
+
+/* Compute array index given an xfile offset. */
+static xfarray_idx_t
+xfarray_idx(
+ struct xfarray *array,
+ loff_t pos)
+{
+ if (array->obj_size_log >= 0)
+ return (xfarray_idx_t)pos >> array->obj_size_log;
+
+ return div_u64((xfarray_idx_t)pos, array->obj_size);
+}
+
+/* Compute xfile offset of array element. */
+static inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx)
+{
+ if (array->obj_size_log >= 0)
+ return idx << array->obj_size_log;
+
+ return idx * array->obj_size;
+}
+
+/*
+ * Initialize a big memory array. Array records cannot be larger than a
+ * page, and the array cannot span more bytes than the page cache supports.
+ * If @required_capacity is nonzero, the maximum array size will be set to this
+ * quantity and the array creation will fail if the underlying storage cannot
+ * support that many records.
+ */
+int
+xfarray_create(
+ const char *description,
+ unsigned long long required_capacity,
+ size_t obj_size,
+ struct xfarray **arrayp)
+{
+ struct xfarray *array;
+ struct xfile *xfile;
+ int error;
+
+ ASSERT(obj_size < PAGE_SIZE);
+
+ error = xfile_create(description, 0, &xfile);
+ if (error)
+ return error;
+
+ error = -ENOMEM;
+ array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS);
+ if (!array)
+ goto out_xfile;
+
+ array->xfile = xfile;
+ array->obj_size = obj_size;
+
+ if (is_power_of_2(obj_size))
+ array->obj_size_log = ilog2(obj_size);
+ else
+ array->obj_size_log = -1;
+
+ array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE);
+ trace_xfarray_create(array, required_capacity);
+
+ if (required_capacity > 0) {
+ if (array->max_nr < required_capacity) {
+ error = -ENOMEM;
+ goto out_xfarray;
+ }
+ array->max_nr = required_capacity;
+ }
+
+ *arrayp = array;
+ return 0;
+
+out_xfarray:
+ kfree(array);
+out_xfile:
+ xfile_destroy(xfile);
+ return error;
+}
+
+/* Destroy the array. */
+void
+xfarray_destroy(
+ struct xfarray *array)
+{
+ xfile_destroy(array->xfile);
+ kfree(array);
+}
+
+/* Load an element from the array. */
+int
+xfarray_load(
+ struct xfarray *array,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ if (idx >= array->nr)
+ return -ENODATA;
+
+ return xfile_obj_load(array->xfile, ptr, array->obj_size,
+ xfarray_pos(array, idx));
+}
+
+/* Is this array element potentially unset? */
+static inline bool
+xfarray_is_unset(
+ struct xfarray *array,
+ loff_t pos)
+{
+ void *temp = xfarray_scratch(array);
+ int error;
+
+ if (array->unset_slots == 0)
+ return false;
+
+ error = xfile_obj_load(array->xfile, temp, array->obj_size, pos);
+ if (!error && xfarray_element_is_null(array, temp))
+ return true;
+
+ return false;
+}
+
+/*
+ * Unset an array element. If @idx is the last element in the array, the
+ * array will be truncated. Otherwise, the entry will be zeroed.
+ */
+int
+xfarray_unset(
+ struct xfarray *array,
+ xfarray_idx_t idx)
+{
+ void *temp = xfarray_scratch(array);
+ loff_t pos = xfarray_pos(array, idx);
+ int error;
+
+ if (idx >= array->nr)
+ return -ENODATA;
+
+ if (idx == array->nr - 1) {
+ array->nr--;
+ return 0;
+ }
+
+ if (xfarray_is_unset(array, pos))
+ return 0;
+
+ memset(temp, 0, array->obj_size);
+ error = xfile_obj_store(array->xfile, temp, array->obj_size, pos);
+ if (error)
+ return error;
+
+ array->unset_slots++;
+ return 0;
+}
+
+/*
+ * Store an element in the array. The element must not be completely zeroed,
+ * because those are considered unset sparse elements.
+ */
+int
+xfarray_store(
+ struct xfarray *array,
+ xfarray_idx_t idx,
+ const void *ptr)
+{
+ int ret;
+
+ if (idx >= array->max_nr)
+ return -EFBIG;
+
+ ASSERT(!xfarray_element_is_null(array, ptr));
+
+ ret = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ xfarray_pos(array, idx));
+ if (ret)
+ return ret;
+
+ array->nr = max(array->nr, idx + 1);
+ return 0;
+}
+
+/* Is this array element NULL? */
+bool
+xfarray_element_is_null(
+ struct xfarray *array,
+ const void *ptr)
+{
+ return !memchr_inv(ptr, 0, array->obj_size);
+}
+
+/*
+ * Store an element anywhere in the array that is unset. If there are no
+ * unset slots, append the element to the array.
+ */
+int
+xfarray_store_anywhere(
+ struct xfarray *array,
+ const void *ptr)
+{
+ void *temp = xfarray_scratch(array);
+ loff_t endpos = xfarray_pos(array, array->nr);
+ loff_t pos;
+ int error;
+
+ /* Find an unset slot to put it in. */
+ for (pos = 0;
+ pos < endpos && array->unset_slots > 0;
+ pos += array->obj_size) {
+ error = xfile_obj_load(array->xfile, temp, array->obj_size,
+ pos);
+ if (error || !xfarray_element_is_null(array, temp))
+ continue;
+
+ error = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ pos);
+ if (error)
+ return error;
+
+ array->unset_slots--;
+ return 0;
+ }
+
+ /* No unset slots found; attach it on the end. */
+ array->unset_slots = 0;
+ return xfarray_append(array, ptr);
+}
+
+/* Return length of array. */
+uint64_t
+xfarray_length(
+ struct xfarray *array)
+{
+ return array->nr;
+}
+
+/*
+ * Decide which array item we're going to read as part of a _load_next call.
+ * @cur is the array index, and @pos is the file offset of that array index in
+ * the backing xfile. Returns ENODATA if we reach the end of the records.
+ *
+ * Reading from a hole in a sparse xfile causes page instantiation, so for
+ * iterating a (possibly sparse) array we need to figure out if the cursor is
+ * pointing at a totally uninitialized hole and move the cursor up if
+ * necessary.
+ */
+static inline int
+xfarray_find_data(
+ struct xfarray *array,
+ xfarray_idx_t *cur,
+ loff_t *pos)
+{
+ unsigned int pgoff = offset_in_page(*pos);
+ loff_t end_pos = *pos + array->obj_size - 1;
+ loff_t new_pos;
+
+ /*
+ * If the current array record is not adjacent to a page boundary, we
+ * are in the middle of the page. We do not need to move the cursor.
+ */
+ if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE)
+ return 0;
+
+ /*
+ * Call SEEK_DATA on the last byte in the record we're about to read.
+ * If the record ends at (or crosses) the end of a page then we know
+ * that the first byte of the record is backed by pages and don't need
+ * to query it. If instead the record begins at the start of the page
+ * then we know that querying the last byte is just as good as querying
+ * the first byte, since records cannot be larger than a page.
+ *
+ * If the call returns the same file offset, we know this record is
+ * backed by real pages. We do not need to move the cursor.
+ */
+ new_pos = xfile_seek_data(array->xfile, end_pos);
+ if (new_pos == -ENXIO)
+ return -ENODATA;
+ if (new_pos < 0)
+ return new_pos;
+ if (new_pos == end_pos)
+ return 0;
+
+ /*
+ * Otherwise, SEEK_DATA told us how far up to move the file pointer to
+ * find more data. Move the array index to the first record past the
+ * byte offset we were given.
+ */
+ new_pos = roundup_64(new_pos, array->obj_size);
+ *cur = xfarray_idx(array, new_pos);
+ *pos = xfarray_pos(array, *cur);
+ return 0;
+}
+
+/*
+ * Starting at *idx, fetch the next non-null array entry and advance the index
+ * to set up the next _load_next call. Returns ENODATA if we reach the end of
+ * the array. Callers must set @*idx to XFARRAY_CURSOR_INIT before the first
+ * call to this function.
+ */
+int
+xfarray_load_next(
+ struct xfarray *array,
+ xfarray_idx_t *idx,
+ void *rec)
+{
+ xfarray_idx_t cur = *idx;
+ loff_t pos = xfarray_pos(array, cur);
+ int error;
+
+ do {
+ if (cur >= array->nr)
+ return -ENODATA;
+
+ /*
+ * Ask the backing store for the location of next possible
+ * written record, then retrieve that record.
+ */
+ error = xfarray_find_data(array, &cur, &pos);
+ if (error)
+ return error;
+ error = xfarray_load(array, cur, rec);
+ if (error)
+ return error;
+
+ cur++;
+ pos += array->obj_size;
+ } while (xfarray_element_is_null(array, rec));
+
+ *idx = cur;
+ return 0;
+}
+
+/* Sorting functions */
+
+#ifdef DEBUG
+# define xfarray_sort_bump_loads(si) do { (si)->loads++; } while (0)
+# define xfarray_sort_bump_stores(si) do { (si)->stores++; } while (0)
+# define xfarray_sort_bump_compares(si) do { (si)->compares++; } while (0)
+# define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0)
+#else
+# define xfarray_sort_bump_loads(si)
+# define xfarray_sort_bump_stores(si)
+# define xfarray_sort_bump_compares(si)
+# define xfarray_sort_bump_heapsorts(si)
+#endif /* DEBUG */
+
+/* Load an array element for sorting. */
+static inline int
+xfarray_sort_load(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ xfarray_sort_bump_loads(si);
+ return xfarray_load(si->array, idx, ptr);
+}
+
+/* Store an array element for sorting. */
+static inline int
+xfarray_sort_store(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ xfarray_sort_bump_stores(si);
+ return xfarray_store(si->array, idx, ptr);
+}
+
+/* Compare an array element for sorting. */
+static inline int
+xfarray_sort_cmp(
+ struct xfarray_sortinfo *si,
+ const void *a,
+ const void *b)
+{
+ xfarray_sort_bump_compares(si);
+ return si->cmp_fn(a, b);
+}
+
+/* Return a pointer to the low index stack for quicksort partitioning. */
+static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si)
+{
+ return (xfarray_idx_t *)(si + 1);
+}
+
+/* Return a pointer to the high index stack for quicksort partitioning. */
+static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_lo(si) + si->max_stack_depth;
+}
+
+/* Size of each element in the quicksort pivot array. */
+static inline size_t
+xfarray_pivot_rec_sz(
+ struct xfarray *array)
+{
+ return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t);
+}
+
+/* Allocate memory to handle the sort. */
+static inline int
+xfarray_sortinfo_alloc(
+ struct xfarray *array,
+ xfarray_cmp_fn cmp_fn,
+ unsigned int flags,
+ struct xfarray_sortinfo **infop)
+{
+ struct xfarray_sortinfo *si;
+ size_t nr_bytes = sizeof(struct xfarray_sortinfo);
+ size_t pivot_rec_sz = xfarray_pivot_rec_sz(array);
+ int max_stack_depth;
+
+ /*
+ * The median-of-nine pivot algorithm doesn't work if a subset has
+ * fewer than 9 items. Make sure the in-memory sort always takes over
+ * for subsets smaller than that.
+ */
+ BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR);
+
+ /*
+ * Tail-call recursion during the partitioning phase means that
+ * quicksort will never recurse more than log2(nr) times. We need one
+ * extra level of stack to hold the initial parameters. In-memory
+ * sort will always take care of the last few levels of recursion for
+ * us, so we can reduce the stack depth by that much.
+ */
+ max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1);
+ if (max_stack_depth < 1)
+ max_stack_depth = 1;
+
+ /* Each level of quicksort uses a lo and a hi index */
+ nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2;
+
+ /* Scratchpad for in-memory sort, or finding the pivot */
+ nr_bytes += max_t(size_t,
+ (XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz,
+ XFARRAY_ISORT_NR * array->obj_size);
+
+ si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS);
+ if (!si)
+ return -ENOMEM;
+
+ si->array = array;
+ si->cmp_fn = cmp_fn;
+ si->flags = flags;
+ si->max_stack_depth = max_stack_depth;
+ si->max_stack_used = 1;
+
+ xfarray_sortinfo_lo(si)[0] = 0;
+ xfarray_sortinfo_hi(si)[0] = array->nr - 1;
+
+ trace_xfarray_sort(si, nr_bytes);
+ *infop = si;
+ return 0;
+}
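
To make the sizing above concrete: if the array held 2^20 records and XFARRAY_ISORT_SHIFT were 4 (a value assumed here purely for illustration), the partitioning stack would be ilog2(2^20) + 1 - (4 - 1) = 18 levels deep; each level stores one lo and one hi xfarray_idx_t, and the allocation also carries a shared scratchpad sized for either the in-memory sort buffer or the pivot array, whichever is larger.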
+
+/* Should this sort be terminated by a fatal signal? */
+static inline bool
+xfarray_sort_terminated(
+ struct xfarray_sortinfo *si,
+ int *error)
+{
+ /*
+ * If preemption is disabled, we need to yield to the scheduler every
+ * few seconds so that we don't run afoul of the soft lockup watchdog
+ * or RCU stall detector.
+ */
+ cond_resched();
+
+ if ((si->flags & XFARRAY_SORT_KILLABLE) &&
+ fatal_signal_pending(current)) {
+ if (*error == 0)
+ *error = -EINTR;
+ return true;
+ }
+ return false;
+}
+
+/* Do we want an in-memory sort? */
+static inline bool
+xfarray_want_isort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t start,
+ xfarray_idx_t end)
+{
+ /*
+ * For array subsets that fit in the scratchpad, it's much faster to
+ * use the kernel's heapsort than quicksort's stack machine.
+ */
+ return (end - start) < XFARRAY_ISORT_NR;
+}
+
+/* Return the scratch space within the sortinfo structure. */
+static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_hi(si) + si->max_stack_depth;
+}
+
+/*
+ * Sort a small number of array records using scratchpad memory. The records
+ * need not be contiguous in the xfile's memory pages.
+ */
+STATIC int
+xfarray_isort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ void *scratch = xfarray_sortinfo_isort_scratch(si);
+ loff_t lo_pos = xfarray_pos(si->array, lo);
+ loff_t len = xfarray_pos(si->array, hi - lo + 1);
+ int error;
+
+ trace_xfarray_isort(si, lo, hi);
+
+ xfarray_sort_bump_loads(si);
+ error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos);
+ if (error)
+ return error;
+
+ xfarray_sort_bump_heapsorts(si);
+ sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
+
+ xfarray_sort_bump_stores(si);
+ return xfile_obj_store(si->array->xfile, scratch, len, lo_pos);
+}
+
+/* Grab a page for sorting records. */
+static inline int
+xfarray_sort_get_page(
+ struct xfarray_sortinfo *si,
+ loff_t pos,
+ uint64_t len)
+{
+ int error;
+
+ error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
+ if (error)
+ return error;
+
+ /*
+ * xfile pages must never be mapped into userspace, so we skip the
+ * dcache flush when mapping the page.
+ */
+ si->page_kaddr = kmap_local_page(si->xfpage.page);
+ return 0;
+}
+
+/* Release a page we grabbed for sorting records. */
+static inline int
+xfarray_sort_put_page(
+ struct xfarray_sortinfo *si)
+{
+ if (!si->page_kaddr)
+ return 0;
+
+ kunmap_local(si->page_kaddr);
+ si->page_kaddr = NULL;
+
+ return xfile_put_page(si->array->xfile, &si->xfpage);
+}
+
+/* Decide if these records are eligible for in-page sorting. */
+static inline bool
+xfarray_want_pagesort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ pgoff_t lo_page;
+ pgoff_t hi_page;
+ loff_t end_pos;
+
+ /* We can only map one page at a time. */
+ lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
+ end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
+ hi_page = end_pos >> PAGE_SHIFT;
+
+ return lo_page == hi_page;
+}
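
The page-span test above only admits record ranges that sit entirely inside one memory page. Below is a minimal userspace sketch of the same arithmetic, assuming 4096-byte pages and a made-up 24-byte record size; it illustrates the check and is not the kernel code.

/*
 * Minimal userspace sketch of the page-span check, assuming 4096-byte pages
 * and a made-up 24-byte record size; an illustration, not the kernel code.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SIZE	4096u

/* Return 1 if records lo..hi (inclusive) live entirely in one page. */
static int fits_in_one_page(uint64_t lo, uint64_t hi, size_t obj_size)
{
	uint64_t lo_page = (lo * obj_size) / DEMO_PAGE_SIZE;
	uint64_t hi_page = (hi * obj_size + obj_size - 1) / DEMO_PAGE_SIZE;

	return lo_page == hi_page;
}

int main(void)
{
	/* With 24-byte records, indices 0..169 share page 0; 170 does not. */
	printf("%d %d\n", fits_in_one_page(0, 169, 24),
			  fits_in_one_page(0, 170, 24));
	return 0;
}
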
+
+/* Sort a bunch of records that all live in the same memory page. */
+STATIC int
+xfarray_pagesort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ void *startp;
+ loff_t lo_pos = xfarray_pos(si->array, lo);
+ uint64_t len = xfarray_pos(si->array, hi - lo + 1);
+ int error = 0;
+
+ trace_xfarray_pagesort(si, lo, hi);
+
+ xfarray_sort_bump_loads(si);
+ error = xfarray_sort_get_page(si, lo_pos, len);
+ if (error)
+ return error;
+
+ xfarray_sort_bump_heapsorts(si);
+ startp = si->page_kaddr + offset_in_page(lo_pos);
+ sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
+
+ xfarray_sort_bump_stores(si);
+ return xfarray_sort_put_page(si);
+}
+
+/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
+static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_hi(si) + si->max_stack_depth;
+}
+
+/* Return a pointer to the start of the pivot array. */
+static inline void *
+xfarray_sortinfo_pivot_array(
+ struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_pivot(si) + si->array->obj_size;
+}
+
+/* The xfarray record is stored at the start of each pivot array element. */
+static inline void *
+xfarray_pivot_array_rec(
+ void *pa,
+ size_t pa_recsz,
+ unsigned int pa_idx)
+{
+ return pa + (pa_recsz * pa_idx);
+}
+
+/* The xfarray index is stored at the end of each pivot array element. */
+static inline xfarray_idx_t *
+xfarray_pivot_array_idx(
+ void *pa,
+ size_t pa_recsz,
+ unsigned int pa_idx)
+{
+ return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) -
+ sizeof(xfarray_idx_t);
+}
+
+/*
+ * Find a pivot value for quicksort partitioning, swap it with a[lo], and save
+ * the cached pivot record for the next step.
+ *
+ * Load evenly-spaced records within the given range into memory, sort them,
+ * and choose the pivot from the median record. Using multiple points will
+ * improve the quality of the pivot selection, and hopefully avoid the worst
+ * quicksort behavior, since the arrays we sort are usually already nearly sorted.
+ */
+STATIC int
+xfarray_qsort_pivot(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ void *pivot = xfarray_sortinfo_pivot(si);
+ void *parray = xfarray_sortinfo_pivot_array(si);
+ void *recp;
+ xfarray_idx_t *idxp;
+ xfarray_idx_t step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1);
+ size_t pivot_rec_sz = xfarray_pivot_rec_sz(si->array);
+ int i, j;
+ int error;
+
+ ASSERT(step > 0);
+
+ /*
+ * Load the xfarray indexes of the records we intend to sample into the
+ * pivot array.
+ */
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0);
+ *idxp = lo;
+ for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) {
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
+ *idxp = lo + (i * step);
+ }
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR - 1);
+ *idxp = hi;
+
+ /* Load the selected xfarray records into the pivot array. */
+ for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) {
+ xfarray_idx_t idx;
+
+ recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i);
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
+
+ /* No unset records; load directly into the array. */
+ if (likely(si->array->unset_slots == 0)) {
+ error = xfarray_sort_load(si, *idxp, recp);
+ if (error)
+ return error;
+ continue;
+ }
+
+ /*
+ * Load non-null records into the scratchpad without changing
+ * the xfarray_idx_t in the pivot array.
+ */
+ idx = *idxp;
+ xfarray_sort_bump_loads(si);
+ error = xfarray_load_next(si->array, &idx, recp);
+ if (error)
+ return error;
+ }
+
+ xfarray_sort_bump_heapsorts(si);
+ sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL);
+
+ /*
+ * We sorted the pivot array records (which includes the xfarray
+ * indices) in xfarray record order. The median element of the pivot
+ * array contains the xfarray record that we will use as the pivot.
+ * Copy that xfarray record to the designated space.
+ */
+ recp = xfarray_pivot_array_rec(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR / 2);
+ memcpy(pivot, recp, si->array->obj_size);
+
+ /* If the pivot record we chose was already in a[lo] then we're done. */
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR / 2);
+ if (*idxp == lo)
+ return 0;
+
+ /*
+ * Find the cached copy of a[lo] in the pivot array so that we can swap
+ * a[lo] and a[pivot].
+ */
+ for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) {
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
+ if (*idxp == lo)
+ j = i;
+ }
+ if (j < 0) {
+ ASSERT(j >= 0);
+ return -EFSCORRUPTED;
+ }
+
+ /* Swap a[lo] and a[pivot]. */
+ error = xfarray_sort_store(si, lo, pivot);
+ if (error)
+ return error;
+
+ recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j);
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR / 2);
+ return xfarray_sort_store(si, *idxp, recp);
+}
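
The pivot selection above samples XFARRAY_QSORT_PIVOT_NR evenly spaced records, sorts the samples, and takes the median. The userspace sketch below shows the same idea with libc qsort() and a plain in-memory array of longs; the names and sizes are illustrative only.

/*
 * Userspace sketch of median-of-nine pivot selection: sample nine evenly
 * spaced elements, sort the samples, and use the middle one as the pivot.
 * Uses libc qsort() and a plain array instead of an xfarray.
 */
#include <stdio.h>
#include <stdlib.h>

#define PIVOT_NR	9

static int cmp_long(const void *a, const void *b)
{
	long x = *(const long *)a, y = *(const long *)b;

	return (x > y) - (x < y);
}

/* Pick a pivot for a[lo..hi]; the range must have at least nine elements. */
static long choose_pivot(const long *a, size_t lo, size_t hi)
{
	long samples[PIVOT_NR];
	size_t step = (hi - lo) / (PIVOT_NR - 1);
	int i;

	samples[0] = a[lo];
	for (i = 1; i < PIVOT_NR - 1; i++)
		samples[i] = a[lo + i * step];
	samples[PIVOT_NR - 1] = a[hi];

	qsort(samples, PIVOT_NR, sizeof(long), cmp_long);
	return samples[PIVOT_NR / 2];
}

int main(void)
{
	long a[100];
	size_t i;

	for (i = 0; i < 100; i++)
		a[i] = (long)((i * 37) % 100);	/* scrambled but bounded */

	printf("pivot for a[0..99]: %ld\n", choose_pivot(a, 0, 99));
	return 0;
}
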
+
+/*
+ * Set up the pointers for the next iteration. We push onto the stack all of
+ * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the
+ * current stack frame to point to the unsorted values between a[beg[i]] and
+ * a[lo] so that those values will be sorted when we pop the stack.
+ */
+static inline int
+xfarray_qsort_push(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t *si_lo,
+ xfarray_idx_t *si_hi,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ /* Check for stack overflows */
+ if (si->stack_depth >= si->max_stack_depth - 1) {
+ ASSERT(si->stack_depth < si->max_stack_depth - 1);
+ return -EFSCORRUPTED;
+ }
+
+ si->max_stack_used = max_t(uint8_t, si->max_stack_used,
+ si->stack_depth + 2);
+
+ si_lo[si->stack_depth + 1] = lo + 1;
+ si_hi[si->stack_depth + 1] = si_hi[si->stack_depth];
+ si_hi[si->stack_depth++] = lo - 1;
+
+ /*
+ * Always start with the smaller of the two partitions to keep the
+ * amount of recursion in check.
+ */
+ if (si_hi[si->stack_depth] - si_lo[si->stack_depth] >
+ si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) {
+ swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]);
+ swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]);
+ }
+
+ return 0;
+}
+
+/*
+ * Load an element from the array into the first scratchpad and cache the page,
+ * if possible.
+ */
+static inline int
+xfarray_sort_load_cached(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ loff_t idx_pos = xfarray_pos(si->array, idx);
+ pgoff_t startpage;
+ pgoff_t endpage;
+ int error = 0;
+
+ /*
+ * If this load would split a page, release the cached page, if any,
+ * and perform a traditional read.
+ */
+ startpage = idx_pos >> PAGE_SHIFT;
+ endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
+ if (startpage != endpage) {
+ error = xfarray_sort_put_page(si);
+ if (error)
+ return error;
+
+ if (xfarray_sort_terminated(si, &error))
+ return error;
+
+ return xfile_obj_load(si->array->xfile, ptr,
+ si->array->obj_size, idx_pos);
+ }
+
+ /* If the cached page is not the one we want, release it. */
+ if (xfile_page_cached(&si->xfpage) &&
+ xfile_page_index(&si->xfpage) != startpage) {
+ error = xfarray_sort_put_page(si);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If we don't have a cached page (and we know the load is contained
+ * in a single page) then grab it.
+ */
+ if (!xfile_page_cached(&si->xfpage)) {
+ if (xfarray_sort_terminated(si, &error))
+ return error;
+
+ error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
+ PAGE_SIZE);
+ if (error)
+ return error;
+ }
+
+ memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos),
+ si->array->obj_size);
+ return 0;
+}
+
+/*
+ * Sort the array elements via quicksort. This implementation incorporates
+ * four optimizations discussed in Sedgewick, plus a fifth of our own:
+ *
+ * 1. Use an explicit stack of array indices to store the next array partition
+ * to sort. This helps us to avoid recursion in the call stack, which is
+ * particularly expensive in the kernel.
+ *
+ * 2. For arrays with records in arbitrary or user-controlled order, choose the
+ * pivot element using a median-of-nine decision tree. This reduces the
+ * probability of selecting a bad pivot value which causes worst case
+ * behavior (i.e. partition sizes of 1).
+ *
+ * 3. The smaller of the two sub-partitions is pushed onto the stack to start
+ * the next level of recursion, and the larger sub-partition replaces the
+ * current stack frame. This guarantees that we won't need more than
+ * log2(nr) stack space.
+ *
+ * 4. For small sets, load the records into the scratchpad and run heapsort on
+ * them because that is very fast. In the author's experience, this yields
+ * a ~10% reduction in runtime.
+ *
+ * If a small set is contained entirely within a single xfile memory page,
+ * map the page directly and run heap sort directly on the xfile page
+ * instead of using the load/store interface. This halves the runtime.
+ *
+ * 5. This optimization is specific to the implementation. When converging lo
+ * and hi after selecting a pivot, we will try to retain the xfile memory
+ * page between load calls, which reduces run time by 50%.
+ */
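
The sketch below demonstrates optimizations 1 and 3 from the list above (an explicit partition stack, with the smaller sub-partition processed first) in plain userspace C. It deliberately omits the xfile load/store path, the median-of-nine pivot, and the small-set heapsort cutoff that xfarray_sort() adds on top; it is an illustration of the recursion-free structure only.

/*
 * Userspace sketch of the explicit-stack quicksort structure (optimizations
 * 1 and 3 above): no call-stack recursion, and the smaller sub-partition is
 * always processed first so the stack stays within log2(nr) frames.
 */
#include <stdio.h>

struct frame {
	size_t	lo;
	size_t	hi;
};

static void swap_long(long *a, long *b)
{
	long t = *a; *a = *b; *b = t;
}

static void stack_qsort(long *a, size_t nr)
{
	struct frame	stack[66];	/* log2(nr) + 1 frames is enough */
	int		depth = 0;

	if (nr < 2)
		return;

	stack[0].lo = 0;
	stack[0].hi = nr - 1;

	while (depth >= 0) {
		size_t lo = stack[depth].lo;
		size_t hi = stack[depth].hi;
		size_t i, p;

		/* Nothing left in this partition to sort; pop the stack. */
		if (lo >= hi) {
			depth--;
			continue;
		}

		/* Lomuto partition around a[hi]; p is the pivot's new home. */
		for (p = lo, i = lo; i < hi; i++)
			if (a[i] < a[hi])
				swap_long(&a[i], &a[p++]);
		swap_long(&a[p], &a[hi]);

		/* Keep the larger side in this frame, push the smaller one. */
		if (p - lo > hi - p) {
			stack[depth].hi = p - 1;
			stack[depth + 1].lo = p + 1;
			stack[depth + 1].hi = hi;
		} else {
			stack[depth + 1].lo = lo;
			stack[depth + 1].hi = (p > lo) ? p - 1 : lo;
			stack[depth].lo = p + 1;
		}
		depth++;
	}
}

int main(void)
{
	long a[] = { 9, 3, 7, 1, 8, 2, 6, 0, 5, 4 };
	size_t i;

	stack_qsort(a, 10);
	for (i = 0; i < 10; i++)
		printf("%ld ", a[i]);
	printf("\n");
	return 0;
}
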
+
+/*
+ * Due to the use of signed indices, we can only support up to 2^63 records.
+ * Files can only grow to 2^63 bytes, so this is not much of a limitation.
+ */
+#define QSORT_MAX_RECS (1ULL << 63)
+
+int
+xfarray_sort(
+ struct xfarray *array,
+ xfarray_cmp_fn cmp_fn,
+ unsigned int flags)
+{
+ struct xfarray_sortinfo *si;
+ xfarray_idx_t *si_lo, *si_hi;
+ void *pivot;
+ void *scratch = xfarray_scratch(array);
+ xfarray_idx_t lo, hi;
+ int error = 0;
+
+ if (array->nr < 2)
+ return 0;
+ if (array->nr >= QSORT_MAX_RECS)
+ return -E2BIG;
+
+ error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si);
+ if (error)
+ return error;
+ si_lo = xfarray_sortinfo_lo(si);
+ si_hi = xfarray_sortinfo_hi(si);
+ pivot = xfarray_sortinfo_pivot(si);
+
+ while (si->stack_depth >= 0) {
+ lo = si_lo[si->stack_depth];
+ hi = si_hi[si->stack_depth];
+
+ trace_xfarray_qsort(si, lo, hi);
+
+ /* Nothing left in this partition to sort; pop stack. */
+ if (lo >= hi) {
+ si->stack_depth--;
+ continue;
+ }
+
+ /*
+ * If directly mapping the page and sorting can solve our
+ * problems, we're done.
+ */
+ if (xfarray_want_pagesort(si, lo, hi)) {
+ error = xfarray_pagesort(si, lo, hi);
+ if (error)
+ goto out_free;
+ si->stack_depth--;
+ continue;
+ }
+
+ /* If the in-memory sort can solve our problems, we're done. */
+ if (xfarray_want_isort(si, lo, hi)) {
+ error = xfarray_isort(si, lo, hi);
+ if (error)
+ goto out_free;
+ si->stack_depth--;
+ continue;
+ }
+
+ /* Pick a pivot, move it to a[lo] and stash it. */
+ error = xfarray_qsort_pivot(si, lo, hi);
+ if (error)
+ goto out_free;
+
+ /*
+ * Rearrange a[lo..hi] such that everything smaller than the
+ * pivot is on the left side of the range and everything larger
+ * than the pivot is on the right side of the range.
+ */
+ while (lo < hi) {
+ /*
+ * Decrement hi until it finds an a[hi] less than the
+ * pivot value.
+ */
+ error = xfarray_sort_load_cached(si, hi, scratch);
+ if (error)
+ goto out_free;
+ while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
+ lo < hi) {
+ hi--;
+ error = xfarray_sort_load_cached(si, hi,
+ scratch);
+ if (error)
+ goto out_free;
+ }
+ error = xfarray_sort_put_page(si);
+ if (error)
+ goto out_free;
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+
+ /* Copy that item (a[hi]) to a[lo]. */
+ if (lo < hi) {
+ error = xfarray_sort_store(si, lo++, scratch);
+ if (error)
+ goto out_free;
+ }
+
+ /*
+ * Increment lo until it finds an a[lo] greater than
+ * the pivot value.
+ */
+ error = xfarray_sort_load_cached(si, lo, scratch);
+ if (error)
+ goto out_free;
+ while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
+ lo < hi) {
+ lo++;
+ error = xfarray_sort_load_cached(si, lo,
+ scratch);
+ if (error)
+ goto out_free;
+ }
+ error = xfarray_sort_put_page(si);
+ if (error)
+ goto out_free;
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+
+ /* Copy that item (a[lo]) to a[hi]. */
+ if (lo < hi) {
+ error = xfarray_sort_store(si, hi--, scratch);
+ if (error)
+ goto out_free;
+ }
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+ }
+
+ /*
+ * Put our pivot value in the correct place at a[lo]. All
+ * values between a[beg[i]] and a[lo - 1] should be less than
+ * the pivot; and all values between a[lo + 1] and a[end[i]-1]
+ * should be greater than the pivot.
+ */
+ error = xfarray_sort_store(si, lo, pivot);
+ if (error)
+ goto out_free;
+
+ /* Set up the stack frame to process the two partitions. */
+ error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi);
+ if (error)
+ goto out_free;
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+ }
+
+out_free:
+ trace_xfarray_sort_stats(si, error);
+ kvfree(si);
+ return error;
+}
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
new file mode 100644
index 000000000000..4ecac01363d9
--- /dev/null
+++ b/fs/xfs/scrub/xfarray.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2021-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_XFARRAY_H__
+#define __XFS_SCRUB_XFARRAY_H__
+
+/* xfile array index type, along with cursor initialization */
+typedef uint64_t xfarray_idx_t;
+#define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0)
+
+/* Iterate each index of an xfile array. */
+#define foreach_xfarray_idx(array, idx) \
+ for ((idx) = XFARRAY_CURSOR_INIT; \
+ (idx) < xfarray_length(array); \
+ (idx)++)
+
+struct xfarray {
+ /* Underlying file that backs the array. */
+ struct xfile *xfile;
+
+ /* Number of array elements. */
+ xfarray_idx_t nr;
+
+ /* Maximum possible array size. */
+ xfarray_idx_t max_nr;
+
+ /* Number of unset slots in the array below @nr. */
+ uint64_t unset_slots;
+
+ /* Size of an array element. */
+ size_t obj_size;
+
+ /* log2 of array element size, if possible. */
+ int obj_size_log;
+};
+
+int xfarray_create(const char *descr, unsigned long long required_capacity,
+ size_t obj_size, struct xfarray **arrayp);
+void xfarray_destroy(struct xfarray *array);
+int xfarray_load(struct xfarray *array, xfarray_idx_t idx, void *ptr);
+int xfarray_unset(struct xfarray *array, xfarray_idx_t idx);
+int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr);
+int xfarray_store_anywhere(struct xfarray *array, const void *ptr);
+bool xfarray_element_is_null(struct xfarray *array, const void *ptr);
+
+/* Append an element to the array. */
+static inline int xfarray_append(struct xfarray *array, const void *ptr)
+{
+ return xfarray_store(array, array->nr, ptr);
+}
+
+uint64_t xfarray_length(struct xfarray *array);
+int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec);
+
+/* Declarations for xfile array sort functionality. */
+
+typedef cmp_func_t xfarray_cmp_fn;
+
+/* Perform an in-memory heapsort for small subsets. */
+#define XFARRAY_ISORT_SHIFT (4)
+#define XFARRAY_ISORT_NR (1U << XFARRAY_ISORT_SHIFT)
+
+/* Evaluate this many points to find the qsort pivot. */
+#define XFARRAY_QSORT_PIVOT_NR (9)
+
+struct xfarray_sortinfo {
+ struct xfarray *array;
+
+ /* Comparison function for the sort. */
+ xfarray_cmp_fn cmp_fn;
+
+ /* Maximum height of the partition stack. */
+ uint8_t max_stack_depth;
+
+ /* Current height of the partition stack. */
+ int8_t stack_depth;
+
+ /* Maximum stack depth ever used. */
+ uint8_t max_stack_used;
+
+ /* XFARRAY_SORT_* flags; see below. */
+ unsigned int flags;
+
+ /* Cache a page here for faster access. */
+ struct xfile_page xfpage;
+ void *page_kaddr;
+
+#ifdef DEBUG
+ /* Performance statistics. */
+ uint64_t loads;
+ uint64_t stores;
+ uint64_t compares;
+ uint64_t heapsorts;
+#endif
+ /*
+ * Extra bytes are allocated beyond the end of the structure to store
+ * quicksort information. C does not permit multiple VLAs per struct,
+ * so we document all of this in a comment.
+ *
+ * Pretend that we have a typedef for array records:
+ *
+ * typedef char[array->obj_size] xfarray_rec_t;
+ *
+ * First comes the quicksort partition stack:
+ *
+ * xfarray_idx_t lo[max_stack_depth];
+ * xfarray_idx_t hi[max_stack_depth];
+ *
+ * union {
+ *
+ * If for a given subset we decide to use an in-memory sort, we use a
+ * block of scratchpad records here to compare items:
+ *
+ * xfarray_rec_t scratch[ISORT_NR];
+ *
+ * Otherwise, we are partitioning the array for quicksort.
+ * We store the chosen pivot record at the start of the scratchpad area
+ * and use the rest to sample some records to estimate the median.
+ * The format of the qsort_pivot array enables us to use the kernel
+ * heapsort function to place the median value in the middle.
+ *
+ * struct {
+ * xfarray_rec_t pivot;
+ * struct {
+ * xfarray_rec_t rec; (rounded up to 8 bytes)
+ * xfarray_idx_t idx;
+ * } qsort_pivot[QSORT_PIVOT_NR];
+ * };
+ * }
+ */
+};
+
+/* Sort can be interrupted by a fatal signal. */
+#define XFARRAY_SORT_KILLABLE (1U << 0)
+
+int xfarray_sort(struct xfarray *array, xfarray_cmp_fn cmp_fn,
+ unsigned int flags);
+
+#endif /* __XFS_SCRUB_XFARRAY_H__ */
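
The trailing-area layout documented in struct xfarray_sortinfo drives the allocation size computed in xfarray_sortinfo_alloc(): two partition-index stacks plus the larger of the in-memory sort scratchpad and the pivot sample array. The userspace sketch below reproduces that arithmetic with hypothetical record counts and sizes so the numbers can be checked by hand; it is not the kernel structure itself.

/*
 * Userspace sketch of the sortinfo sizing arithmetic documented above:
 * two partition-index stacks, then the larger of the in-memory sort
 * scratchpad and the pivot sample array.  Record count, record size, and
 * the header size are made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define ISORT_SHIFT	4
#define ISORT_NR	(1U << ISORT_SHIFT)
#define PIVOT_NR	9
#define IDX_SZ		sizeof(uint64_t)	/* one xfarray_idx_t */

static size_t round_up8(size_t x)
{
	return (x + 7) & ~(size_t)7;
}

static unsigned int ilog2_u64(uint64_t x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	uint64_t nr = 1000000;		/* number of records */
	size_t obj_size = 24;		/* bytes per record */
	size_t header = 64;		/* stand-in for the fixed struct size */
	size_t pivot_rec_sz = round_up8(obj_size) + IDX_SZ;
	size_t scratch, nr_bytes;
	int max_stack_depth;

	max_stack_depth = (int)ilog2_u64(nr) + 1 - (ISORT_SHIFT - 1);
	if (max_stack_depth < 1)
		max_stack_depth = 1;

	/* lo[] and hi[] index stacks, then the larger scratch area. */
	nr_bytes = header + (size_t)max_stack_depth * IDX_SZ * 2;
	scratch = (PIVOT_NR + 1) * pivot_rec_sz;
	if (ISORT_NR * obj_size > scratch)
		scratch = ISORT_NR * obj_size;
	nr_bytes += scratch;

	printf("stack depth %d, allocation %zu bytes\n",
			max_stack_depth, nr_bytes);
	return 0;
}
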
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
new file mode 100644
index 000000000000..d98e8e77c684
--- /dev/null
+++ b/fs/xfs/scrub/xfile.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_format.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/scrub.h"
+#include "scrub/trace.h"
+#include <linux/shmem_fs.h>
+
+/*
+ * Swappable Temporary Memory
+ * ==========================
+ *
+ * Online checking sometimes needs to be able to stage a large amount of data
+ * in memory. This information might not fit in the available memory and it
+ * doesn't all need to be accessible at all times. In other words, we want an
+ * indexed data buffer to store data that can be paged out.
+ *
+ * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
+ * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to
+ * store our staging data. This file is not installed in the file descriptor
+ * table so that user programs cannot access the data, which means that the
+ * xfile must be freed with xfile_destroy.
+ *
+ * xfiles assume that the caller will handle all required concurrency
+ * management; standard vfs locks (freezer and inode) are not taken. Reads
+ * and writes are satisfied directly from the page cache.
+ *
+ * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
+ * of a hole cause a page to be mapped into the file. If you are going to
+ * create a sparse xfile, please be careful about reading from uninitialized
+ * parts of the file. These pages are !Uptodate and will eventually be
+ * reclaimed if not written, but in the short term this boosts memory
+ * consumption.
+ */
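
As a rough userspace analogue of the xfile idea (not the code path the kernel uses), an unlinked memory-backed file from memfd_create() can serve as a pageable object store addressed by byte offset. The sketch below stores and reloads one record at a computed position; all names are illustrative.

/*
 * Rough userspace analogue of the xfile idea: an unlinked, memory-backed
 * file used as an object store addressed by byte offset.  This uses
 * memfd_create() and positional I/O; it illustrates the concept and is not
 * the kernel implementation.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

struct rec {
	unsigned long	key;
	unsigned long	value;
};

int main(void)
{
	struct rec out = { .key = 42, .value = 0xdecafbad };
	struct rec in;
	off_t pos = 7 * sizeof(struct rec);	/* "array index 7" */
	int fd;

	fd = memfd_create("xfile-demo", MFD_CLOEXEC);
	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}

	/* Store one object at a computed offset, then load it back. */
	if (pwrite(fd, &out, sizeof(out), pos) != (ssize_t)sizeof(out) ||
	    pread(fd, &in, sizeof(in), pos) != (ssize_t)sizeof(in)) {
		perror("object i/o");
		close(fd);
		return 1;
	}

	printf("key %lu value %#lx\n", in.key, in.value);
	close(fd);	/* nothing to unlink; the backing store just goes away */
	return 0;
}
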
+
+/*
+ * xfiles must not be exposed to userspace and require upper layers to
+ * coordinate access to the one handle returned by the constructor, so
+ * establish a separate lock class for xfiles to avoid confusing lockdep.
+ */
+static struct lock_class_key xfile_i_mutex_key;
+
+/*
+ * Create an xfile of the given size. The description will be used in the
+ * trace output.
+ */
+int
+xfile_create(
+ const char *description,
+ loff_t isize,
+ struct xfile **xfilep)
+{
+ struct inode *inode;
+ struct xfile *xf;
+ int error = -ENOMEM;
+
+ xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
+ if (!xf)
+ return -ENOMEM;
+
+ xf->file = shmem_file_setup(description, isize, 0);
+ if (!xf->file)
+ goto out_xfile;
+ if (IS_ERR(xf->file)) {
+ error = PTR_ERR(xf->file);
+ goto out_xfile;
+ }
+
+ /*
+ * We want a large sparse file that we can pread, pwrite, and seek.
+ * xfile users are responsible for keeping the xfile hidden away from
+ * all other callers, so we skip timestamp updates and security checks.
+ * Make the inode only accessible by root, just in case the xfile ever
+ * escapes.
+ */
+ xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
+ FMODE_LSEEK;
+ xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
+ inode = file_inode(xf->file);
+ inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
+ inode->i_mode &= ~0177;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+
+ lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);
+
+ trace_xfile_create(xf);
+
+ *xfilep = xf;
+ return 0;
+out_xfile:
+ kfree(xf);
+ return error;
+}
+
+/* Close the file and release all resources. */
+void
+xfile_destroy(
+ struct xfile *xf)
+{
+ struct inode *inode = file_inode(xf->file);
+
+ trace_xfile_destroy(xf);
+
+ lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
+ fput(xf->file);
+ kfree(xf);
+}
+
+/*
+ * Read a memory object directly from the xfile's page cache. Unlike regular
+ * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
+ * high an offset, instead of truncating the read. Otherwise, we return
+ * bytes read or an error code, like regular pread.
+ */
+ssize_t
+xfile_pread(
+ struct xfile *xf,
+ void *buf,
+ size_t count,
+ loff_t pos)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page = NULL;
+ ssize_t read = 0;
+ unsigned int pflags;
+ int error = 0;
+
+ if (count > MAX_RW_COUNT)
+ return -E2BIG;
+ if (inode->i_sb->s_maxbytes - pos < count)
+ return -EFBIG;
+
+ trace_xfile_pread(xf, pos, count);
+
+ pflags = memalloc_nofs_save();
+ while (count > 0) {
+ void *p, *kaddr;
+ unsigned int len;
+
+ len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
+
+ /*
+ * In-kernel reads of a shmem file cause it to allocate a page
+ * if the mapping shows a hole. Therefore, if we hit ENOMEM
+ * we can continue by zeroing the caller's buffer.
+ */
+ page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
+ __GFP_NOWARN);
+ if (IS_ERR(page)) {
+ error = PTR_ERR(page);
+ if (error != -ENOMEM)
+ break;
+
+ memset(buf, 0, len);
+ goto advance;
+ }
+
+ if (PageUptodate(page)) {
+ /*
+ * xfile pages must never be mapped into userspace, so
+ * we skip the dcache flush.
+ */
+ kaddr = kmap_local_page(page);
+ p = kaddr + offset_in_page(pos);
+ memcpy(buf, p, len);
+ kunmap_local(kaddr);
+ } else {
+ memset(buf, 0, len);
+ }
+ put_page(page);
+
+advance:
+ count -= len;
+ pos += len;
+ buf += len;
+ read += len;
+ }
+ memalloc_nofs_restore(pflags);
+
+ if (read > 0)
+ return read;
+ return error;
+}
+
+/*
+ * Write a memory object directly to the xfile's page cache. Unlike regular
+ * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
+ * high an offset, instead of truncating the write. Otherwise, we return
+ * bytes written or an error code, like regular pwrite.
+ */
+ssize_t
+xfile_pwrite(
+ struct xfile *xf,
+ const void *buf,
+ size_t count,
+ loff_t pos)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+ struct page *page = NULL;
+ ssize_t written = 0;
+ unsigned int pflags;
+ int error = 0;
+
+ if (count > MAX_RW_COUNT)
+ return -E2BIG;
+ if (inode->i_sb->s_maxbytes - pos < count)
+ return -EFBIG;
+
+ trace_xfile_pwrite(xf, pos, count);
+
+ pflags = memalloc_nofs_save();
+ while (count > 0) {
+ void *fsdata = NULL;
+ void *p, *kaddr;
+ unsigned int len;
+ int ret;
+
+ len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
+
+ /*
+ * We call write_begin directly here to avoid all the freezer
+ * protection lock-taking that happens in the normal path.
+ * shmem doesn't support fs freeze, but lockdep doesn't know
+ * that and will trip over that.
+ */
+ error = aops->write_begin(NULL, mapping, pos, len, &page,
+ &fsdata);
+ if (error)
+ break;
+
+ /*
+ * xfile pages must never be mapped into userspace, so we skip
+ * the dcache flush. If the page is not uptodate, zero it
+ * before writing data.
+ */
+ kaddr = kmap_local_page(page);
+ if (!PageUptodate(page)) {
+ memset(kaddr, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ }
+ p = kaddr + offset_in_page(pos);
+ memcpy(p, buf, len);
+ kunmap_local(kaddr);
+
+ ret = aops->write_end(NULL, mapping, pos, len, len, page,
+ fsdata);
+ if (ret < 0) {
+ error = ret;
+ break;
+ }
+
+ written += ret;
+ if (ret != len)
+ break;
+
+ count -= ret;
+ pos += ret;
+ buf += ret;
+ }
+ memalloc_nofs_restore(pflags);
+
+ if (written > 0)
+ return written;
+ return error;
+}
+
+/* Find the next written area in the xfile data for a given offset. */
+loff_t
+xfile_seek_data(
+ struct xfile *xf,
+ loff_t pos)
+{
+ loff_t ret;
+
+ ret = vfs_llseek(xf->file, pos, SEEK_DATA);
+ trace_xfile_seek_data(xf, pos, ret);
+ return ret;
+}
+
+/* Query stat information for an xfile. */
+int
+xfile_stat(
+ struct xfile *xf,
+ struct xfile_stat *statbuf)
+{
+ struct kstat ks;
+ int error;
+
+ error = vfs_getattr_nosec(&xf->file->f_path, &ks,
+ STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
+ if (error)
+ return error;
+
+ statbuf->size = ks.size;
+ statbuf->bytes = ks.blocks << SECTOR_SHIFT;
+ return 0;
+}
+
+/*
+ * Grab the (locked) page for a memory object. The object cannot span a page
+ * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if the
+ * object would cross a page boundary, or the usual negative errno.
+ */
+int
+xfile_get_page(
+ struct xfile *xf,
+ loff_t pos,
+ unsigned int len,
+ struct xfile_page *xfpage)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+ struct page *page = NULL;
+ void *fsdata = NULL;
+ loff_t key = round_down(pos, PAGE_SIZE);
+ unsigned int pflags;
+ int error;
+
+ if (inode->i_sb->s_maxbytes - pos < len)
+ return -ENOMEM;
+ if (len > PAGE_SIZE - offset_in_page(pos))
+ return -ENOTBLK;
+
+ trace_xfile_get_page(xf, pos, len);
+
+ pflags = memalloc_nofs_save();
+
+ /*
+ * We call write_begin directly here to avoid all the freezer
+ * protection lock-taking that happens in the normal path. shmem
+ * doesn't support fs freeze, but lockdep doesn't know that and will
+ * trip over that.
+ */
+ error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
+ &fsdata);
+ if (error)
+ goto out_pflags;
+
+ /* We got the page, so make sure we push out EOF. */
+ if (i_size_read(inode) < pos + len)
+ i_size_write(inode, pos + len);
+
+ /*
+ * If the page isn't up to date, fill it with zeroes before we hand it
+ * to the caller and make sure the backing store will hold on to them.
+ */
+ if (!PageUptodate(page)) {
+ void *kaddr;
+
+ kaddr = kmap_local_page(page);
+ memset(kaddr, 0, PAGE_SIZE);
+ kunmap_local(kaddr);
+ SetPageUptodate(page);
+ }
+
+ /*
+ * Mark the page dirty so that its contents are written to some
+ * backing store when we drop this buffer, and take an extra reference
+ * to prevent the xfile page from being swapped or removed from the
+ * page cache by reclaim if the caller unlocks the page.
+ */
+ set_page_dirty(page);
+ get_page(page);
+
+ xfpage->page = page;
+ xfpage->fsdata = fsdata;
+ xfpage->pos = key;
+out_pflags:
+ memalloc_nofs_restore(pflags);
+ return error;
+}
+
+/*
+ * Release the (locked) page for a memory object. Returns 0 or a negative
+ * errno.
+ */
+int
+xfile_put_page(
+ struct xfile *xf,
+ struct xfile_page *xfpage)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+ unsigned int pflags;
+ int ret;
+
+ trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);
+
+ /* Give back the reference that we took in xfile_get_page. */
+ put_page(xfpage->page);
+
+ pflags = memalloc_nofs_save();
+ ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
+ xfpage->page, xfpage->fsdata);
+ memalloc_nofs_restore(pflags);
+ memset(xfpage, 0, sizeof(struct xfile_page));
+
+ if (ret < 0)
+ return ret;
+ if (ret != PAGE_SIZE)
+ return -EIO;
+ return 0;
+}
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
new file mode 100644
index 000000000000..d56643b0f429
--- /dev/null
+++ b/fs/xfs/scrub/xfile.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_XFILE_H__
+#define __XFS_SCRUB_XFILE_H__
+
+struct xfile_page {
+ struct page *page;
+ void *fsdata;
+ loff_t pos;
+};
+
+static inline bool xfile_page_cached(const struct xfile_page *xfpage)
+{
+ return xfpage->page != NULL;
+}
+
+static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage)
+{
+ return xfpage->page->index;
+}
+
+struct xfile {
+ struct file *file;
+};
+
+int xfile_create(const char *description, loff_t isize, struct xfile **xfilep);
+void xfile_destroy(struct xfile *xf);
+
+ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos);
+ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count,
+ loff_t pos);
+
+/*
+ * Load an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
+ */
+static inline int
+xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos)
+{
+ ssize_t ret = xfile_pread(xf, buf, count, pos);
+
+ if (ret < 0 || ret != count)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Store an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
+ */
+static inline int
+xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos)
+{
+ ssize_t ret = xfile_pwrite(xf, buf, count, pos);
+
+ if (ret < 0 || ret != count)
+ return -ENOMEM;
+ return 0;
+}
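
These two wrappers collapse every error and every short transfer into -ENOMEM because callers treat the xfile strictly as memory. The small userspace program below sketches the same convention around pread()/pwrite() on a temporary file; demo_obj_load() is a hypothetical stand-in, not part of the patch.

/*
 * Userspace sketch of the "any short transfer is an allocation failure"
 * convention used by the wrappers above; demo_obj_load() is a hypothetical
 * stand-in, not part of the patch.
 */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

/* Return 0 only if exactly count bytes were read at pos, else -ENOMEM. */
static int demo_obj_load(int fd, void *buf, size_t count, off_t pos)
{
	ssize_t ret = pread(fd, buf, count, pos);

	if (ret < 0 || (size_t)ret != count)
		return -ENOMEM;
	return 0;
}

int main(void)
{
	FILE *fp = tmpfile();
	char buf[4];
	int fd;

	if (!fp)
		return 1;
	fd = fileno(fp);

	/* Nothing stored yet, so the load comes up short: -ENOMEM (-12). */
	printf("before store: %d\n", demo_obj_load(fd, buf, sizeof(buf), 0));

	if (pwrite(fd, "abcd", 4, 0) != 4)
		return 1;
	printf("after store:  %d\n", demo_obj_load(fd, buf, sizeof(buf), 0));

	fclose(fp);
	return 0;
}
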
+
+loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
+
+struct xfile_stat {
+ loff_t size;
+ unsigned long long bytes;
+};
+
+int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf);
+
+int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len,
+ struct xfile_page *xbuf);
+int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf);
+
+#endif /* __XFS_SCRUB_XFILE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 2fca4b4e7fd8..465d7630bb21 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -478,7 +478,7 @@ xfs_discard_folio(
folio, ip->i_ino, pos);
/*
- * The end of the punch range is always the offset of the the first
+ * The end of the punch range is always the offset of the first
* byte of the next folio. Hence the end offset is only dependent on the
* folio itself and not the start offset that is passed in.
*/
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 3b903f6bce98..c1ece4a08ff4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -481,7 +481,8 @@ _xfs_buf_obj_cmp(
* reallocating a busy extent. Skip this buffer and
* continue searching for an exact match.
*/
- ASSERT(bp->b_flags & XBF_STALE);
+ if (!(map->bm_flags & XBM_LIVESCAN))
+ ASSERT(bp->b_flags & XBF_STALE);
return 1;
}
return 0;
@@ -559,6 +560,10 @@ xfs_buf_find_lock(
* intact here.
*/
if (bp->b_flags & XBF_STALE) {
+ if (flags & XBF_LIVESCAN) {
+ xfs_buf_unlock(bp);
+ return -ENOENT;
+ }
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
bp->b_ops = NULL;
@@ -682,6 +687,8 @@ xfs_buf_get_map(
int error;
int i;
+ if (flags & XBF_LIVESCAN)
+ cmap.bm_flags |= XBM_LIVESCAN;
for (i = 0; i < nmaps; i++)
cmap.bm_len += map[i].bm_len;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 549c60942208..df8f47953bb4 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -44,6 +44,11 @@ struct xfs_buf;
#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
/* flags used only as arguments to access routines */
+/*
+ * Online fsck is scanning the buffer cache for live buffers. Do not warn
+ * about length mismatches during lookups and do not return stale buffers.
+ */
+#define XBF_LIVESCAN (1u << 28)
#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */
#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
@@ -67,6 +72,7 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
/* The following interface flags should never be set */ \
+ { XBF_LIVESCAN, "LIVESCAN" }, \
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }, \
{ XBF_UNMAPPED, "UNMAPPED" }
@@ -114,8 +120,15 @@ typedef struct xfs_buftarg {
struct xfs_buf_map {
xfs_daddr_t bm_bn; /* block number for I/O */
int bm_len; /* size of I/O */
+ unsigned int bm_flags;
};
+/*
+ * Online fsck is scanning the buffer cache for live buffers. Do not warn
+ * about length mismatches during lookups and do not return stale buffers.
+ */
+#define XBM_LIVESCAN (1U << 0)
+
#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 7f071757f278..ac6ba646624d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1386,7 +1386,7 @@ xfs_qm_dqiterate(
return error;
error = iter_fn(dq, type, priv);
- id = dq->q_id;
+ id = dq->q_id + 1;
xfs_qm_dqput(dq);
} while (error == 0 && id != 0);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 453890942d9f..e541f5c0bc25 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -803,44 +803,6 @@ out_error_or_again:
}
/*
- * "Is this a cached inode that's also allocated?"
- *
- * Look up an inode by number in the given file system. If the inode is
- * in cache and isn't in purgatory, return 1 if the inode is allocated
- * and 0 if it is not. For all other cases (not in cache, being torn
- * down, etc.), return a negative error code.
- *
- * The caller has to prevent inode allocation and freeing activity,
- * presumably by locking the AGI buffer. This is to ensure that an
- * inode cannot transition from allocated to freed until the caller is
- * ready to allow that. If the inode is in an intermediate state (new,
- * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
- * inode is not in the cache, -ENOENT will be returned. The caller must
- * deal with these scenarios appropriately.
- *
- * This is a specialized use case for the online scrubber; if you're
- * reading this, you probably want xfs_iget.
- */
-int
-xfs_icache_inode_is_allocated(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- xfs_ino_t ino,
- bool *inuse)
-{
- struct xfs_inode *ip;
- int error;
-
- error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
- if (error)
- return error;
-
- *inuse = !!(VFS_I(ip)->i_mode);
- xfs_irele(ip);
- return 0;
-}
-
-/*
* Grab the inode for reclaim exclusively.
*
* We have found this inode via a lookup under RCU, so the inode may have
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 1dcdcb23796e..2fa6f2e09d07 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -71,10 +71,6 @@ void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
void xfs_blockgc_worker(struct work_struct *work);
-
-int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_ino_t ino, bool *inuse);
-
void xfs_blockgc_stop(struct xfs_mount *mp);
void xfs_blockgc_start(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 74dcb05069e8..e9d317a3dafe 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -63,6 +63,7 @@ typedef __u32 xfs_nlink_t;
#include <linux/rhashtable.h>
#include <linux/xattr.h>
#include <linux/mnt_idmapping.h>
+#include <linux/debugfs.h>
#include <asm/page.h>
#include <asm/div64.h>
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index fb87ffb48f7f..0a0fd19573d8 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -34,6 +34,7 @@
#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_ag.h"
+#include "scrub/stats.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -716,9 +717,11 @@ xfs_mountfs(
if (error)
goto out_remove_sysfs;
+ xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);
+
error = xfs_error_sysfs_init(mp);
if (error)
- goto out_del_stats;
+ goto out_remove_scrub_stats;
error = xfs_errortag_init(mp);
if (error)
@@ -1033,7 +1036,8 @@ xfs_mountfs(
xfs_errortag_del(mp);
out_remove_error_sysfs:
xfs_error_sysfs_del(mp);
- out_del_stats:
+ out_remove_scrub_stats:
+ xchk_stats_unregister(mp->m_scrub_stats);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
xfs_sysfs_del(&mp->m_kobj);
@@ -1105,6 +1109,7 @@ xfs_unmountfs(
xfs_errortag_del(mp);
xfs_error_sysfs_del(mp);
+ xchk_stats_unregister(mp->m_scrub_stats);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e2866e7fa60c..a25eece3be2b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -208,11 +208,15 @@ typedef struct xfs_mount {
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
struct delayed_work m_reclaim_work; /* background inode reclaim */
+ struct dentry *m_debugfs; /* debugfs parent */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
struct xfs_kobj m_error_meta_kobj;
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
+#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS
+ struct xchk_stats *m_scrub_stats;
+#endif
xfs_agnumber_t m_agfrotor; /* last ag where space found */
atomic_t m_agirotor; /* last ag dir inode alloced */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c79eac048456..1f77014c6e1a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -42,6 +42,7 @@
#include "xfs_xattr.h"
#include "xfs_iunlink_item.h"
#include "xfs_dahash_test.h"
+#include "scrub/stats.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -49,6 +50,7 @@
static const struct super_operations xfs_super_operations;
+static struct dentry *xfs_debugfs; /* top-level xfs debugfs dir */
static struct kset *xfs_kset; /* top-level xfs sysfs dir */
#ifdef DEBUG
static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
@@ -783,6 +785,7 @@ xfs_mount_free(
if (mp->m_ddev_targp)
xfs_free_buftarg(mp->m_ddev_targp);
+ debugfs_remove(mp->m_debugfs);
kfree(mp->m_rtname);
kfree(mp->m_logname);
kmem_free(mp);
@@ -1163,6 +1166,7 @@ xfs_fs_put_super(
xfs_unmountfs(mp);
xfs_freesb(mp);
+ xchk_mount_stats_free(mp);
free_percpu(mp->m_stats.xs_stats);
xfs_mount_list_del(mp);
xfs_inodegc_free_percpu(mp);
@@ -1497,6 +1501,21 @@ xfs_fs_validate_params(
return 0;
}
+struct dentry *
+xfs_debugfs_mkdir(
+ const char *name,
+ struct dentry *parent)
+{
+ struct dentry *child;
+
+ /* Apparently we're expected to ignore error returns?? */
+ child = debugfs_create_dir(name, parent);
+ if (IS_ERR(child))
+ return NULL;
+
+ return child;
+}
+
static int
xfs_fs_fill_super(
struct super_block *sb,
@@ -1539,6 +1558,13 @@ xfs_fs_fill_super(
if (error)
return error;
+ if (xfs_debugfs) {
+ mp->m_debugfs = xfs_debugfs_mkdir(mp->m_super->s_id,
+ xfs_debugfs);
+ } else {
+ mp->m_debugfs = NULL;
+ }
+
error = xfs_init_mount_workqueues(mp);
if (error)
goto out_shutdown_devices;
@@ -1565,10 +1591,14 @@ xfs_fs_fill_super(
goto out_destroy_inodegc;
}
- error = xfs_readsb(mp, flags);
+ error = xchk_mount_stats_alloc(mp);
if (error)
goto out_free_stats;
+ error = xfs_readsb(mp, flags);
+ if (error)
+ goto out_free_scrub_stats;
+
error = xfs_finish_flags(mp);
if (error)
goto out_free_sb;
@@ -1746,6 +1776,8 @@ xfs_fs_fill_super(
xfs_filestream_unmount(mp);
out_free_sb:
xfs_freesb(mp);
+ out_free_scrub_stats:
+ xchk_mount_stats_free(mp);
out_free_stats:
free_percpu(mp->m_stats.xs_stats);
out_destroy_inodegc:
@@ -2377,10 +2409,12 @@ init_xfs_fs(void)
if (error)
goto out_cleanup_procfs;
+ xfs_debugfs = xfs_debugfs_mkdir("xfs", NULL);
+
xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
if (!xfs_kset) {
error = -ENOMEM;
- goto out_sysctl_unregister;
+ goto out_debugfs_unregister;
}
xfsstats.xs_kobj.kobject.kset = xfs_kset;
@@ -2396,11 +2430,15 @@ init_xfs_fs(void)
if (error)
goto out_free_stats;
+ error = xchk_global_stats_setup(xfs_debugfs);
+ if (error)
+ goto out_remove_stats_kobj;
+
#ifdef DEBUG
xfs_dbg_kobj.kobject.kset = xfs_kset;
error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
if (error)
- goto out_remove_stats_kobj;
+ goto out_remove_scrub_stats;
#endif
error = xfs_qm_init();
@@ -2417,14 +2455,17 @@ init_xfs_fs(void)
out_remove_dbg_kobj:
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
- out_remove_stats_kobj:
+ out_remove_scrub_stats:
#endif
+ xchk_global_stats_teardown();
+ out_remove_stats_kobj:
xfs_sysfs_del(&xfsstats.xs_kobj);
out_free_stats:
free_percpu(xfsstats.xs_stats);
out_kset_unregister:
kset_unregister(xfs_kset);
- out_sysctl_unregister:
+ out_debugfs_unregister:
+ debugfs_remove(xfs_debugfs);
xfs_sysctl_unregister();
out_cleanup_procfs:
xfs_cleanup_procfs();
@@ -2448,9 +2489,11 @@ exit_xfs_fs(void)
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
#endif
+ xchk_global_stats_teardown();
xfs_sysfs_del(&xfsstats.xs_kobj);
free_percpu(xfsstats.xs_stats);
kset_unregister(xfs_kset);
+ debugfs_remove(xfs_debugfs);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
xfs_mru_cache_uninit();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 364e2c2648a8..302e6e5d6c7e 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -100,4 +100,6 @@ extern struct workqueue_struct *xfs_discard_wq;
#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
+struct dentry *xfs_debugfs_mkdir(const char *name, struct dentry *parent);
+
#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index fd789e00dfd6..902c7f67a117 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -22,6 +22,9 @@
* daddr: physical block number in 512b blocks
* bbcount: number of blocks in a physical extent, in 512b blocks
*
+ * rtx: physical rt extent number for extent mappings
+ * rtxcount: number of rt extents in an extent mapping
+ *
* owner: reverse-mapping owner, usually inodes
*
* fileoff: file offset, in fs blocks