// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2018 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_bmap.h"
#include "xfs_dir2.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_defer.h"
#include "xfs_extfree_item.h"
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_imeta.h"
#include "xfs_rtrefcount_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/xfile.h"

/*
 * Attempt to repair some metadata, if the metadata is corrupt and userspace
 * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
 * and sets XREP_ALREADY_FIXED in sc->flags if it thinks it repaired anything
 * so that the rescan can report the result to userspace.
 */
int
xrep_attempt(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);

	/* Drop any btree cursors held from the scrub phase. */
	xchk_ag_btcur_free(&sc->sa);
	xchk_rt_btcur_free(&sc->sr);

	/* Repair whatever's broken. */
	ASSERT(sc->ops->repair);
	error = sc->ops->repair(sc);
	trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
	switch (error) {
	case 0:
		/*
		 * Repair succeeded.  Commit the fixes and perform a second
		 * scrub so that we can tell userspace if we fixed the
		 * problem.
		 */
		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
		sc->flags |= XREP_ALREADY_FIXED;
		return -EAGAIN;
	case -EDEADLOCK:
	case -EAGAIN:
		/* Tell the caller to try again having grabbed all the locks. */
		if (!(sc->flags & XCHK_TRY_HARDER)) {
			sc->flags |= XCHK_TRY_HARDER;
			return -EAGAIN;
		}
		/*
		 * We tried harder but still couldn't grab all the resources
		 * we needed to fix it.  The corruption has not been fixed,
		 * so report back to userspace.
		 */
		return -EFSCORRUPTED;
	default:
		return error;
	}
}

/*
 * Complain about unfixable problems in the filesystem.  We don't log
 * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
 * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
 * administrator isn't running xfs_scrub in no-repairs mode.
 *
 * Use this helper function because _ratelimited silently declares a static
 * structure to track rate limiting information.
 */
void
xrep_failure(
	struct xfs_mount	*mp)
{
	xfs_alert_ratelimited(mp,
"Corruption not fixed during online repair. Unmount and run xfs_repair.");
}

/*
 * Repair probe -- userspace uses this to probe if we're willing to repair a
 * given mountpoint.
 */
int
xrep_probe(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	/* The only work here is checking for a pending kill signal. */
	if (xchk_should_terminate(sc, &error))
		return error;

	return 0;
}

/*
 * Roll a transaction, keeping the AG headers locked and reinitializing
 * the btree cursors.
 */
int
xrep_roll_ag_trans(
	struct xfs_scrub	*sc)
{
	int			error;

	/* Keep the AG header buffers locked so we can keep going. */
	if (sc->sa.agi_bp)
		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
	if (sc->sa.agf_bp)
		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	if (sc->sa.agfl_bp)
		xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);

	/*
	 * Roll the transaction.  We still own the buffer and the buffer lock
	 * regardless of whether or not the roll succeeds.  If the roll fails,
	 * the buffers will be released during teardown on our way out of the
	 * kernel.  If it succeeds, we join them to the new transaction and
	 * move on.
	 */
	error = xfs_trans_roll(&sc->tp);
	if (error)
		return error;

	/* Join AG headers to the new transaction. */
	if (sc->sa.agi_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
	if (sc->sa.agf_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
	if (sc->sa.agfl_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);

	return 0;
}

/* Roll the scrub transaction, holding the primary metadata locked. */
int
xrep_roll_trans(
	struct xfs_scrub	*sc)
{
	int			error;

	/* With no inode in play, this is an AG repair; hold the AG headers. */
	if (!sc->ip)
		return xrep_roll_ag_trans(sc);

	/*
	 * Roll the transaction with the inode we're fixing and the temp inode,
	 * so that neither can pin the log.
	 */
	if (sc->tempip)
		xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE);
	error = xfs_trans_roll_inode(&sc->tp, sc->ip);
	if (sc->tempip)
		xfs_trans_ijoin(sc->tp, sc->tempip, 0);

	return error;
}

/*
 * Does the given AG have enough space to rebuild a btree?  Neither AG
 * reservation can be critical, and we must have enough space (factoring
 * in AG reservations) to construct a whole btree.
 */
bool
xrep_ag_has_space(
	struct xfs_perag	*pag,
	xfs_extlen_t		nr_blocks,
	enum xfs_ag_resv_type	type)
{
	return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
		!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
		pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
}

/*
 * Figure out how many blocks to reserve for an AG repair.  We calculate the
 * worst case estimate for the number of blocks we'd need to rebuild one of
 * any type of per-AG btree.
 */
xfs_extlen_t
xrep_calc_ag_resblks(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_scrub_metadata *sm = sc->sm;
	struct xfs_perag	*pag;
	struct xfs_buf		*bp;
	xfs_agino_t		icount = NULLAGINO;
	xfs_extlen_t		aglen = NULLAGBLOCK;
	xfs_extlen_t		usedlen;
	xfs_extlen_t		freelen;
	xfs_extlen_t		bnobt_sz;
	xfs_extlen_t		inobt_sz;
	xfs_extlen_t		rmapbt_sz;
	xfs_extlen_t		refcbt_sz;
	int			error;

	/* No reservation needed if we aren't going to repair anything. */
	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
		return 0;

	pag = xfs_perag_get(mp, sm->sm_agno);
	if (pag->pagi_init) {
		/* Use in-core icount if possible. */
		icount = pag->pagi_count;
	} else {
		/* Try to get the actual counters from disk. */
		error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
		if (!error) {
			/* Reading the AGI populated pag->pagi_count. */
			icount = pag->pagi_count;
			xfs_buf_relse(bp);
		}
	}

	/* Now grab the block counters from the AGF. */
	error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
	if (error) {
		/* AGF unreadable: assume the whole AG may need rebuilding. */
		aglen = xfs_ag_block_count(mp, sm->sm_agno);
		freelen = aglen;
		usedlen = aglen;
	} else {
		struct xfs_agf	*agf = bp->b_addr;

		aglen = be32_to_cpu(agf->agf_length);
		freelen = be32_to_cpu(agf->agf_freeblks);
		usedlen = aglen - freelen;
		xfs_buf_relse(bp);
	}
	xfs_perag_put(pag);

	/* If the icount is impossible, make some worst-case assumptions. */
	if (icount == NULLAGINO ||
	    !xfs_verify_agino(mp, sm->sm_agno, icount)) {
		xfs_agino_t	first, last;

		xfs_agino_range(mp, sm->sm_agno, &first, &last);
		icount = last - first + 1;
	}

	/* If the block counts are impossible, make worst-case assumptions. */
	if (aglen == NULLAGBLOCK ||
	    aglen != xfs_ag_block_count(mp, sm->sm_agno) ||
	    freelen >= aglen) {
		aglen = xfs_ag_block_count(mp, sm->sm_agno);
		freelen = aglen;
		usedlen = aglen;
	}

	trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
			freelen, usedlen);

	/*
	 * Figure out how many blocks we'd need worst case to rebuild
	 * each type of btree.  Note that we can only rebuild the
	 * bnobt/cntbt or inobt/finobt as pairs.
	 */
	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
	if (xfs_has_sparseinodes(mp))
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_HOLEMASK_BIT);
	else
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_CHUNK);
	if (xfs_has_finobt(mp))
		inobt_sz *= 2;
	if (xfs_has_reflink(mp))
		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
	else
		refcbt_sz = 0;
	if (xfs_has_rmapbt(mp)) {
		/*
		 * Guess how many blocks we need to rebuild the rmapbt.
		 * For non-reflink filesystems we can't have more records than
		 * used blocks.  However, with reflink it's possible to have
		 * more than one rmap record per AG block.  We don't know how
		 * many rmaps there could be in the AG, so we start off with
		 * what we hope is a generous over-estimation.
		 */
		if (xfs_has_reflink(mp))
			rmapbt_sz = xfs_rmapbt_calc_size(mp,
					(unsigned long long)aglen * 2);
		else
			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
	} else {
		rmapbt_sz = 0;
	}

	trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
			inobt_sz, rmapbt_sz, refcbt_sz);

	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}

/* Allocate a block in an AG. */
int
xrep_alloc_ag_block(
	struct xfs_scrub	*sc,
	const struct xfs_owner_info *oinfo,
	xfs_fsblock_t		*fsbno,
	enum xfs_ag_resv_type	resv)
{
	struct xfs_alloc_arg	args = {0};
	xfs_agblock_t		bno;
	int			error;

	switch (resv) {
	case XFS_AG_RESV_AGFL:
	case XFS_AG_RESV_RMAPBT:
		/* AGFL-backed reservations take a block off the freelist. */
		error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
		if (error)
			return error;
		if (bno == NULLAGBLOCK)
			return -ENOSPC;
		xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
		*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
		if (resv == XFS_AG_RESV_RMAPBT)
			xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
		return 0;
	default:
		break;
	}

	/* Otherwise, allocate a single block from this AG. */
	args.tp = sc->tp;
	args.mp = sc->mp;
	args.oinfo = *oinfo;
	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.pag->pag_agno, 0);
	args.minlen = 1;
	args.maxlen = 1;
	args.prod = 1;
	args.type = XFS_ALLOCTYPE_THIS_AG;
	args.resv = resv;

	error = xfs_alloc_vextent(&args);
	if (error)
		return error;
	if (args.fsbno == NULLFSBLOCK)
		return -ENOSPC;
	ASSERT(args.len == 1);
	*fsbno = args.fsbno;

	return 0;
}

/* Initialize a new AG btree root block with zero entries.
 */
int
xrep_init_btblock(
	struct xfs_scrub	*sc,
	xfs_fsblock_t		fsb,
	struct xfs_buf		**bpp,
	xfs_btnum_t		btnum,
	const struct xfs_buf_ops *ops)
{
	struct xfs_trans	*tp = sc->tp;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*bp;
	int			error;

	trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
			XFS_FSB_TO_AGBNO(mp, fsb), btnum);

	/* The new root must live in the AG that we're repairing. */
	ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno);
	error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
			XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
			&bp);
	if (error)
		return error;
	/* Zero the block, then stamp in an empty btree block header. */
	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno);
	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
	xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
	bp->b_ops = ops;
	*bpp = bp;

	return 0;
}

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc,
	const struct xfs_owner_info *oinfo,
	xfs_fsblock_t		alloc_hint,
	enum xfs_ag_resv_type	resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
}

/* Initialize accounting resources for staging a new inode fork btree. */
void
xrep_newbt_init_inode(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc,
	int			whichfork,
	const struct xfs_owner_info *oinfo)
{
	/* Hint allocations near the inode being repaired. */
	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	/*
	 * __GFP_NOFAIL means the allocation cannot fail; the fake fork is
	 * freed in xrep_newbt_destroy.
	 */
	xnr->ifake.if_fork = kmem_cache_zalloc(xfs_ifork_zone,
			GFP_NOFS | __GFP_NOFAIL);
	xnr->ifake.if_fork_size = XFS_IFORK_SIZE(sc->ip, whichfork);
}

/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt	*xnr,
	struct xfs_scrub	*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}

/*
 * Set up automatic reaping of the blocks reserved for btree reconstruction in
 * case we crash by logging a deferred free item for each extent we allocate so
 * that we can get all of the space back if we crash before we can commit the
 * new btree.  This function returns a token that can be used to cancel
 * automatic reaping if repair is successful.
 */
static void
xrep_newbt_schedule_reap(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv)
{
	struct xfs_extent_free_item	efi_item = {
		.xefi_startblock	= resv->fsbno,
		.xefi_blockcount	= resv->len,
		.xefi_oinfo		= xnr->oinfo, /* struct copy */
		.xefi_skip_discard	= true,
	};
	LIST_HEAD(items);

	/*
	 * Log an EFI for this extent; the matching EFD (or an actual free)
	 * is issued later in xrep_newbt_destroy_reservation.
	 */
	INIT_LIST_HEAD(&efi_item.xefi_list);
	list_add(&efi_item.xefi_list, &items);
	resv->efi = xfs_extent_free_defer_type.create_intent(xnr->sc->tp,
			&items, 1, false);
}

/* Designate specific blocks to be used to build our new btree. */
static int
__xrep_newbt_add_blocks(
	struct xrep_newbt	*xnr,
	xfs_fsblock_t		fsbno,
	xfs_extlen_t		len,
	bool			auto_reap)
{
	struct xrep_newbt_resv	*resv;

	resv = kmem_alloc(sizeof(struct xrep_newbt_resv), KM_MAYFAIL);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->fsbno = fsbno;
	resv->len = len;
	resv->used = 0;
	if (auto_reap)
		xrep_newbt_schedule_reap(xnr, resv);
	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
}

/*
 * Allow certain callers to add disk space directly to the reservation.
 * Callers are responsible for cleaning up the reservations.
 */
int
xrep_newbt_add_blocks(
	struct xrep_newbt	*xnr,
	xfs_fsblock_t		fsbno,
	xfs_extlen_t		len)
{
	/* No EFI logged here; the caller owns cleanup of this space. */
	return __xrep_newbt_add_blocks(xnr, fsbno, len, false);
}

/* Allocate disk space for our new btree.
 */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_alloctype_t		type;
	xfs_fsblock_t		alloc_hint = xnr->alloc_hint;
	int			error = 0;

	/*
	 * Inode-rooted btrees can allocate from any AG, whereas AG btrees
	 * require a specific AG mentioned in the alloc hint.
	 */
	type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= sc->mp,
			.type		= type,
			.fsbno		= alloc_hint,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};

		/* Let the caller override the allocator if one was given. */
		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args);
		else
			error = xfs_alloc_vextent(&args);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		trace_xrep_newbt_alloc_blocks(sc->mp,
				XFS_FSB_TO_AGNO(sc->mp, args.fsbno),
				XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
				args.len, xnr->oinfo.oi_owner);

		/* Log an EFI so a crash before commit reaps this space. */
		error = __xrep_newbt_add_blocks(xnr, args.fsbno, args.len,
				true);
		if (error)
			return error;

		nr_blocks -= args.len;
		alloc_hint = args.fsbno + args.len - 1;

		/* Roll so each allocation gets its own log transaction. */
		error = xrep_roll_trans(sc);
		if (error)
			return error;
	}

	return 0;
}

/*
 * Relog the EFIs attached to a staging btree so that we don't pin the log
 * tail.  Same logic as xfs_defer_relog.
 */
int
xrep_newbt_relog_efis(
	struct xrep_newbt	*xnr)
{
	struct xrep_newbt_resv	*resv;
	struct xfs_trans	*tp = xnr->sc->tp;

	list_for_each_entry(resv, &xnr->resv_list, list) {
		/*
		 * If the log intent item for this deferred op is in a
		 * different checkpoint, relog it to keep the log tail moving
		 * forward.  We're ok with this being racy because an incorrect
		 * decision means we'll be a little slower at pushing the tail.
		 */
		if (!resv->efi || xfs_log_item_in_current_chkpt(resv->efi))
			continue;

		resv->efi = xfs_trans_item_relog(resv->efi, tp);
	}

	/* Only roll if the relogging above actually dirtied the transaction. */
	if (tp->t_flags & XFS_TRANS_DIRTY)
		return xrep_roll_trans(xnr->sc);
	return 0;
}

/*
 * Release blocks that were reserved for a btree repair.  If the repair
 * succeeded then we log deferred frees for unused blocks.
  Otherwise, we try
 * to free the extents immediately to roll the filesystem back to where it was
 * before we started.
 */
static inline int
xrep_newbt_destroy_reservation(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			cancel_repair)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_efd_log_item	*efdp;
	struct xfs_extent	*extp;
	struct xfs_log_item	*efd_lip;

	/*
	 * Earlier, we logged EFIs for the extents that we allocated to hold
	 * the new btree so that we could automatically roll back those
	 * allocations if the system crashed.  Now we log an EFD to cancel the
	 * EFI, either because the repair succeeded and the new blocks are in
	 * use; or because the repair was cancelled and we're about to free
	 * the extents directly.
	 */
	efd_lip = xfs_extent_free_defer_type.create_done(sc->tp, resv->efi, 1);
	efdp = container_of(efd_lip, struct xfs_efd_log_item, efd_item);
	extp = efdp->efd_format.efd_extents;
	extp->ext_start = resv->fsbno;
	extp->ext_len = resv->len;
	efdp->efd_next_extent++;
	set_bit(XFS_LI_DIRTY, &efd_lip->li_flags);

	if (cancel_repair) {
		int	error;

		/* Free the extent then roll the transaction. */
		error = xfs_free_extent(sc->tp, resv->fsbno, resv->len,
				&xnr->oinfo, xnr->resv);
		if (error)
			return error;

		return xrep_roll_trans(sc);
	}

	/*
	 * Use the deferred freeing mechanism to schedule for deletion any
	 * blocks we didn't use to rebuild the tree.  This enables us to log
	 * them all in the same transaction as the root change.
	 */
	resv->fsbno += resv->used;
	resv->len -= resv->used;
	resv->used = 0;

	/* Nothing left over?  Then we're done with this reservation. */
	if (resv->len == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp,
			XFS_FSB_TO_AGNO(sc->mp, resv->fsbno),
			XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno),
			resv->len, xnr->oinfo.oi_owner);

	xfs_free_extent_later(sc->tp, resv->fsbno, resv->len, &xnr->oinfo,
			XFS_FREE_EXTENT_SKIP_DISCARD);

	return 0;
}

/* Free all the accounting info and disk space we reserved for a new btree.
 */
void
xrep_newbt_destroy(
	struct xrep_newbt	*xnr,
	int			error)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	int			err2;

	/*
	 * If the filesystem already went down, we can't free the blocks.
	 * Skip ahead to freeing the incore metadata because we can't fix
	 * anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	/* @error != 0 means the repair was cancelled; free extents eagerly. */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		err2 = xrep_newbt_destroy_reservation(xnr, resv, error != 0);
		if (err2)
			goto junkit;

		list_del(&resv->list);
		kmem_free(resv);
	}

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_extent_free_defer_type.abort_intent(resv->efi);
		list_del(&resv->list);
		kmem_free(resv);
	}

	/*
	 * NOTE(review): this assumes that sc->ip being set implies
	 * xrep_newbt_init_inode allocated ifake.if_fork -- confirm that no
	 * caller uses init_ag/init_bare with an inode attached.
	 */
	if (sc->ip) {
		kmem_cache_free(xfs_ifork_zone, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}
}

/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	xfs_fsblock_t		fsb;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	fsb = resv->fsbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(cur->bc_mp,
			XFS_FSB_TO_AGNO(cur->bc_mp, fsb),
			XFS_FSB_TO_AGBNO(cur->bc_mp, fsb),
			1, xnr->oinfo.oi_owner);

	/* Long pointers carry the whole fsblock; short ones just the agbno. */
	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
		ptr->l = cpu_to_be64(fsb);
	else
		ptr->s = cpu_to_be32(XFS_FSB_TO_AGBNO(cur->bc_mp, fsb));
	return 0;
}

/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (1) If someone turned one of the debug knobs.
 * (2) If this is a per-AG btree and the AG has less than ~9% space free.
 * (3) If this is an inode btree and the FS has less than ~9% space free.
 *
 * Note that we actually use 3/32 for the comparison to avoid division.
 */
void
xrep_bload_estimate_slack(
	struct xfs_scrub	*sc,
	struct xfs_btree_bload	*bload)
{
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 3/32ths space left. */
	if (free >= ((sz * 3) >> 5))
		return;

	/* We're low on space; load the btrees as tightly as possible. */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 0;
	if (bload->node_slack < 0)
		bload->node_slack = 0;
}

/*
 * Reconstructing per-AG Btrees
 *
 * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
 * we scan secondary space metadata to derive the records that should be in
 * the damaged btree, initialize a fresh btree root, and insert the records.
 * Note that for rebuilding the rmapbt we scan all the primary data to
 * generate the new records.
 *
 * However, that leaves the matter of removing all the metadata describing the
 * old broken structure.  For primary metadata we use the rmap data to collect
 * every extent with a matching rmap owner (bitmap); we then iterate all other
 * metadata structures with the same rmap owner to collect the extents that
 * cannot be removed (sublist).  We then subtract sublist from bitmap to
 * derive the blocks that were used by the old btree.  These blocks can be
 * reaped.
 *
 * For rmapbt reconstructions we must use different tactics for extent
 * collection.  First we iterate all primary metadata (this excludes the old
 * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
 * records are collected as bitmap.  The bnobt records are collected as
 * sublist.  As with the other btrees we subtract sublist from bitmap, and the
 * result (since the rmapbt lives in the free space) are the blocks from the
 * old rmapbt.
 *
 * Disposal of Blocks from Old per-AG Btrees
 *
 * Now that we've constructed a new btree to replace the damaged one, we want
 * to dispose of the blocks that (we think) the old btree was using.
 * Previously, we used the rmapbt to collect the extents (bitmap) with the
 * rmap owner corresponding to the tree we rebuilt, collected extents for any
 * blocks with the same rmap owner that are owned by another data structure
 * (sublist), and subtracted sublist from bitmap.  In theory the extents
 * remaining in bitmap are the old btree's blocks.
 *
 * Unfortunately, it's possible that the btree was crosslinked with other
 * blocks on disk.  The rmap data can tell us if there are multiple owners, so
 * if the rmapbt says there is an owner of this block other than @oinfo, then
 * the block is crosslinked.  Remove the reverse mapping and continue.
 *
 * If there is one rmap record, we can free the block, which removes the
 * reverse mapping but doesn't add the block to the free space.  Our repair
 * strategy is to hope the other metadata objects crosslinked on this block
 * will be rebuilt (atop different blocks), thereby removing all the cross
 * links.
 *
 * If there are no rmap records at all, we also free the block.  If the btree
 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
 * supposed to be a rmap record and everything is ok.  For other btrees there
 * had to have been an rmap entry for the block to have ended up on @bitmap,
 * so if it's gone now there's something wrong and the fs will shut down.
 *
 * Note: If there are multiple rmap records with only the same rmap owner as
 * the btree we're trying to rebuild and the block is indeed owned by another
 * data structure with the same rmap owner, then the block will be in sublist
 * and therefore doesn't need disposal.  If there are multiple rmap records
 * with only the same rmap owner but the block is not owned by something with
 * the same rmap owner, the block will be freed.
 *
 * The caller is responsible for locking the AG headers for the entire rebuild
 * operation so that nothing else can sneak in and change the AG state while
 * we're not looking.  We also assume that the caller already invalidated any
 * buffers associated with @bitmap.
 */

/* Ensure the freelist is the correct size. */
int
xrep_fix_freelist(
	struct xfs_scrub	*sc,
	int			alloc_flags)
{
	struct xfs_alloc_arg	args = {0};

	args.mp = sc->mp;
	args.tp = sc->tp;
	args.agno = sc->sa.pag->pag_agno;
	args.alignment = 1;
	args.pag = sc->sa.pag;

	return xfs_alloc_fix_freelist(&args, alloc_flags);
}

/*
 * Put a block back on the AGFL.
 */
STATIC int
xrep_put_freelist(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno)
{
	int			error;

	/* Make sure there's space on the freelist. */
	error = xrep_fix_freelist(sc, 0);
	if (error)
		return error;

	/*
	 * Since we're "freeing" a lost block onto the AGFL, we have to
	 * create an rmap for the block prior to merging it or else other
	 * parts will break.
	 */
	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
			&XFS_RMAP_OINFO_AG);
	if (error)
		return error;

	/* Put the block on the AGFL. */
	error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
			agbno, 0);
	if (error)
		return error;
	/* Mark the block busy so it can't be reused until the log commits. */
	xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	return 0;
}

/*
 * Compute the maximum length of a buffer cache scan, given a quantity of fs
 * blocks.
 */
xfs_daddr_t
xrep_max_buf_len(
	struct xfs_mount	*mp,
	xfs_extlen_t		fsblocks)
{
	int			max_fsbs;

	/* Remote xattr values are the largest buffers that we support. */
	max_fsbs = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);

	return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
}

/* Return a buffer from a sector scan, or NULL if there are no buffers. */
struct xfs_buf *
xrep_buf_scan_advance(
	struct xfs_mount	*mp,
	struct xrep_buf_scan	*scan)
{
	/*
	 * Probe the buffer cache for successively longer buffers starting at
	 * scan->daddr.
	 *
	 * NOTE(review): __sector_count is advanced in fsblock units but
	 * compared against max_daddrs (512b sector units), and daddr_step is
	 * never read here -- confirm the intended loop bound.
	 */
	while (++scan->__sector_count < scan->max_daddrs) {
		struct xfs_buf	*bp;

		bp = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
				XFS_FSB_TO_BB(mp, scan->__sector_count),
				XBF_TRYLOCK | _XBF_IGNORE_STALE);
		if (bp)
			return bp;
	}

	return NULL;
}

/* Try to invalidate the incore buffer for a block that we're about to free. */
STATIC void
xrep_reap_invalidate_extent(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*bp;
	xfs_agnumber_t		agno = sc->sa.pag->pag_agno;
	xfs_agblock_t		agbno_next = agbno + len;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	if (!xfs_verify_agbno(mp, agno, agbno) ||
	    !xfs_verify_agbno(mp, agno, agbno_next - 1))
		return;

	/*
	 * If there are incore buffers for these blocks, invalidate them.  If
	 * we can't (try)lock the buffer we assume it's owned by someone else
	 * and leave it alone.  The buffer cache cannot detect aliasing, so
	 * employ nested loops to detect incore buffers of any plausible size.
	 */
	for (; agbno < agbno_next; agbno++) {
		struct xrep_buf_scan	scan = {
			.daddr		= XFS_AGB_TO_DADDR(mp, agno, agbno),
			.max_daddrs	= xrep_max_buf_len(mp,
							agbno_next - agbno),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};

		while ((bp = xrep_buf_scan_advance(mp, &scan)) != NULL) {
			xfs_trans_bjoin(sc->tp, bp);
			xfs_trans_binval(sc->tp, bp);
		}
	}
}

/* State shared across the extents of one xrep_reap_extents call. */
struct xrep_reap_state {
	struct xfs_scrub		*sc;

	/* Reverse mapping owner of the blocks being reaped. */
	const struct xfs_owner_info	*oinfo;

	/* AG reservation type the blocks were allocated from. */
	enum xfs_ag_resv_type		resv;

	/* Number of deferred frees logged since the last transaction roll. */
	unsigned int			deferred;
};

/* Dispose of a single extent. */
STATIC int
xrep_reap_ag_extent(
	struct xrep_reap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		aglen,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_fsblock_t		fsbno;
	bool			need_roll = true;
	int			error = 0;

	fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);

	/*
	 * If there are other rmappings, this block is cross linked and must
	 * not be freed.  Remove the reverse mapping and move on.  Otherwise,
	 * we were the only owner of the block, so free the extent, which will
	 * also remove the rmap.
	 *
	 * XXX: XFS doesn't support detecting the case where a single block
	 * metadata structure is crosslinked with a multi-block structure
	 * because the buffer cache doesn't detect aliasing problems, so we
	 * can't fix 100% of crosslinking problems (yet).  The verifiers will
	 * blow on writeout, the filesystem will shut down, and the admin gets
	 * to run xfs_repair.
	 */
	if (crosslinked) {
		trace_xrep_dispose_unmap_extent(sc->sa.pag, agbno, aglen);

		error = xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag,
				agbno, aglen, rs->oinfo);
		if (error)
			return error;
		goto roll_out;
	}

	trace_xrep_dispose_free_extent(sc->sa.pag, agbno, aglen);

	xrep_reap_invalidate_extent(sc, agbno, aglen);
	if (rs->resv == XFS_AG_RESV_AGFL) {
		/* AGFL blocks go back one at a time; aglen is 1 here. */
		error = xrep_put_freelist(sc, agbno);
	} else if (rs->resv == XFS_AG_RESV_RMAPBT) {
		/*
		 * rmapbt blocks are counted as free space, so we have to pass
		 * XFS_AG_RESV_RMAPBT in the freeing operation to avoid
		 * decreasing fdblocks incorrectly.
		 */
		error = xfs_free_extent(sc->tp, fsbno, aglen, rs->oinfo,
				rs->resv);
	} else if (rs->resv == XFS_AG_RESV_IMETA) {
		/*
		 * For metadata inodes, we want to free the space used by the
		 * old btree back into the metadata inode's reservation, if
		 * necessary.  We can't do the accounting asynchronously (as
		 * part of a deferred free), which means we free the blocks
		 * directly from this function.  This comes at a slight risk
		 * of leaking blocks if the system goes down, though a
		 * re-repair will find and free the leaks.
		 */
		xfs_imeta_resv_free_extent(sc->ip, sc->tp, aglen);
		error = __xfs_free_extent(sc->tp, fsbno, aglen, rs->oinfo,
				rs->resv, true);
	} else {
		/*
		 * Use deferred frees to get rid of the old btree blocks to
		 * try to minimize the window in which we could crash and lose
		 * the old blocks.  However, we still need to roll the
		 * transaction every 100 or so EFIs so that we don't exceed
		 * the log reservation.
		 */
		xfs_free_extent_later(sc->tp, fsbno, aglen, rs->oinfo, 0);
		rs->deferred++;
		need_roll = rs->deferred > 100;
	}
	if (error || !need_roll)
		return error;

roll_out:
	rs->deferred = 0;
	if (sc->ip) {
		/*
		 * If we're reaping file data, hold the AGF buffer across the
		 * transaction roll so that we don't have to reattach it to
		 * the xchk_ag structure.
		 */
		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
		error = xfs_trans_roll_inode(&sc->tp, sc->ip);
		if (error)
			return error;
		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
		return 0;
	}

	return xrep_roll_ag_trans(sc);
}

/*
 * Figure out the longest run of blocks that we can dispose of with a single
 * call.  Cross-linked blocks should have their reverse mappings removed, but
 * single-owner extents can be freed.  AGFL blocks can only be put back one at
 * a time.
*/ STATIC int xrep_reap_find_longest( struct xrep_reap_state *rs, struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_agblock_t agbno_next, bool agbno_crosslinked, xfs_extlen_t *len) { int error; if (rs->resv == XFS_AG_RESV_AGFL) return 0; for (agbno++; agbno < agbno_next; agbno++) { bool crosslinked; error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, &crosslinked); if (error) return error; if (crosslinked != agbno_crosslinked) return 0; (*len)++; } return 0; } /* * Break a bitmap extent into sub-extents by fate, and dispose of each * sub-extent separately. */ STATIC int xrep_reap_extent( uint64_t fsbno, uint64_t len, void *priv) { struct xrep_reap_state *rs = priv; struct xfs_scrub *sc = rs->sc; struct xfs_btree_cur *cur = NULL; xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); xfs_agblock_t agbno_next = agbno + len; int error = 0; ASSERT(len <= MAXEXTLEN); if (sc->ip != NULL) { /* * We're reaping blocks after repairing file metadata, which * means that the blocks can be in any AG, so we have to init * the xchk_ag structure before we can reap each extent and * release it afterwards. */ ASSERT(!sc->sa.pag); sc->sa.pag = xfs_perag_get(sc->mp, agno); if (!sc->sa.pag) return -EFSCORRUPTED; error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &sc->sa.agf_bp); if (error) goto out_pag; } else if (sc->sa.pag == NULL || sc->sa.pag->pag_agno != agno) { ASSERT(0); return -EFSCORRUPTED; } while (agbno < agbno_next) { xfs_extlen_t len = 1; bool agbno_crosslinked; cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, sc->sa.pag); /* Find the longest extent we can reap all at once. */ error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, &agbno_crosslinked); if (error) goto out_cur; error = xrep_reap_find_longest(rs, cur, agbno, agbno_next, agbno_crosslinked, &len); if (error) goto out_cur; /* Free the cursor because reap rolls the transaction. 
*/ xfs_btree_del_cursor(cur, 0); cur = NULL; error = xrep_reap_ag_extent(rs, agbno, len, agbno_crosslinked); if (error) goto out_cur; agbno += len; } out_cur: if (cur) xfs_btree_del_cursor(cur, error); if (sc->ip != NULL) { xfs_trans_brelse(sc->tp, sc->sa.agf_bp); sc->sa.agf_bp = NULL; } out_pag: if (sc->ip != NULL) { xfs_perag_put(sc->sa.pag); sc->sa.pag = NULL; } return error; } /* Dispose of every block of every extent in the bitmap. */ int xrep_reap_extents( struct xfs_scrub *sc, struct xbitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { struct xrep_reap_state rs = { .sc = sc, .oinfo = oinfo, .resv = type, }; int error; ASSERT(xfs_has_rmapbt(sc->mp)); error = xbitmap_walk(bitmap, xrep_reap_extent, &rs); if (error || rs.deferred == 0) return error; return xrep_roll_ag_trans(sc); } /* * Finding per-AG Btree Roots for AGF/AGI Reconstruction * * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild * the AG headers by using the rmap data to rummage through the AG looking for * btree roots. This is not guaranteed to work if the AG is heavily damaged * or the rmap data are corrupt. * * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the * AGI is being rebuilt. It must maintain these locks until it's safe for * other threads to change the btrees' shapes. The caller provides * information about the btrees to look for by passing in an array of * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set. * The (root, height) fields will be set on return if anything is found. The * last element of the array should have a NULL buf_ops to mark the end of the * array. * * For every rmapbt record matching any of the rmap owners in btree_info, * read each block referenced by the rmap record. 
If the block is a btree
 * block from this filesystem matching any of the magic numbers and has a
 * level higher than what we've already seen, remember the block and the
 * height of the tree required to have such a block.  When the call completes,
 * we return the highest block we've found for each btree description; those
 * should be the roots.
 */

/* State shared by the findroot rmapbt walk callbacks. */
struct xrep_findroot {
	struct xfs_scrub		*sc;
	struct xfs_buf			*agfl_bp;
	struct xfs_agf			*agf;
	struct xrep_find_ag_btree	*btree_info;
};

/* See if our block is in the AGFL. */
STATIC int
xrep_findroot_agfl_walk(
	struct xfs_mount	*mp,
	xfs_agblock_t		bno,
	void			*priv)
{
	xfs_agblock_t		*agbno = priv;

	/* -ECANCELED stops the AGFL walk and signals "found". */
	return (*agbno == bno) ? -ECANCELED : 0;
}

/* Does this block match the btree information passed in? */
STATIC int
xrep_findroot_block(
	struct xrep_findroot		*ri,
	struct xrep_find_ag_btree	*fab,
	uint64_t			owner,
	xfs_agblock_t			agbno,
	bool				*done_with_block)
{
	struct xfs_mount		*mp = ri->sc->mp;
	struct xfs_buf			*bp;
	struct xfs_btree_block		*btblock;
	xfs_daddr_t			daddr;
	int				block_level;
	int				error = 0;

	daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);

	/*
	 * Blocks in the AGFL have stale contents that might just happen to
	 * have a matching magic and uuid.  We don't want to pull these blocks
	 * in as part of a tree root, so we have to filter out the AGFL stuff
	 * here.  If the AGFL looks insane we'll just refuse to repair.
	 */
	if (owner == XFS_RMAP_OWN_AG) {
		error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
				xrep_findroot_agfl_walk, &agbno);
		if (error == -ECANCELED)
			return 0;
		if (error)
			return error;
	}

	/*
	 * Read the buffer into memory so that we can see if it's a match for
	 * our btree type.  We have no clue if it is beforehand, and we want to
	 * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
	 * will cause needless disk reads in subsequent calls to this function)
	 * and logging metadata verifier failures.
	 *
	 * Therefore, pass in NULL buffer ops.  If the buffer was already in
	 * memory from some other caller it will already have b_ops assigned.
	 * If it was in memory from a previous unsuccessful findroot_block
	 * call, the buffer won't have b_ops but it should be clean and ready
	 * for us to try to verify if the read call succeeds.  The same applies
	 * if the buffer wasn't in memory at all.
	 *
	 * Note: If we never match a btree type with this buffer, it will be
	 * left in memory with NULL b_ops.  This shouldn't be a problem unless
	 * the buffer gets written.
	 */
	error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
			mp->m_bsize, 0, &bp, NULL);
	if (error)
		return error;

	/* Ensure the block magic matches the btree type we're looking for. */
	btblock = XFS_BUF_TO_BLOCK(bp);
	ASSERT(fab->buf_ops->magic[1] != 0);
	if (btblock->bb_magic != fab->buf_ops->magic[1])
		goto out;

	/*
	 * If the buffer already has ops applied and they're not the ones for
	 * this btree type, we know this block doesn't match the btree and we
	 * can bail out.
	 *
	 * If the buffer ops match ours, someone else has already validated
	 * the block for us, so we can move on to checking if this is a root
	 * block candidate.
	 *
	 * If the buffer does not have ops, nobody has successfully validated
	 * the contents and the buffer cannot be dirty.  If the magic, uuid,
	 * and structure match this btree type then we'll move on to checking
	 * if it's a root block candidate.  If there is no match, bail out.
	 */
	if (bp->b_ops) {
		if (bp->b_ops != fab->buf_ops)
			goto out;
	} else {
		ASSERT(!xfs_trans_buf_is_dirty(bp));
		if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
				&mp->m_sb.sb_meta_uuid))
			goto out;
		/*
		 * Read verifiers can reference b_ops, so we set the pointer
		 * here.  If the verifier fails we'll reset the buffer state
		 * to what it was before we touched the buffer.
		 */
		bp->b_ops = fab->buf_ops;
		fab->buf_ops->verify_read(bp);
		if (bp->b_error) {
			bp->b_ops = NULL;
			bp->b_error = 0;
			goto out;
		}

		/*
		 * Some read verifiers will (re)set b_ops, so we must be
		 * careful not to change b_ops after running the verifier.
		 */
	}

	/*
	 * This block passes the magic/uuid and verifier tests for this btree
	 * type.  We don't need the caller to try the other tree types.
	 */
	*done_with_block = true;

	/*
	 * Compare this btree block's level to the height of the current
	 * candidate root block.
	 *
	 * If the level matches the root we found previously, throw away both
	 * blocks because there can't be two candidate roots.
	 *
	 * If level is lower in the tree than the root we found previously,
	 * ignore this block.
	 */
	block_level = xfs_btree_get_level(btblock);
	if (block_level + 1 == fab->height) {
		fab->root = NULLAGBLOCK;
		goto out;
	} else if (block_level < fab->height) {
		goto out;
	}

	/*
	 * This is the highest block in the tree that we've found so far.
	 * Update the btree height to reflect what we've learned from this
	 * block.
	 */
	fab->height = block_level + 1;

	/*
	 * If this block doesn't have sibling pointers, then it's the new root
	 * block candidate.  Otherwise, the root will be found farther up the
	 * tree.
	 */
	if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
	    btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
		fab->root = agbno;
	else
		fab->root = NULLAGBLOCK;

	trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
			be32_to_cpu(btblock->bb_magic), fab->height - 1);
out:
	xfs_trans_brelse(ri->sc->tp, bp);
	return error;
}

/*
 * Do any of the blocks in this rmap record match one of the btrees we're
 * looking for?
 */
STATIC int
xrep_findroot_rmap(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xrep_findroot		*ri = priv;
	struct xrep_find_ag_btree	*fab;
	xfs_agblock_t			b;
	bool				done;
	int				error = 0;

	/* Ignore anything that isn't AG metadata. */
	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
		return 0;

	/* Otherwise scan each block + btree type. */
	for (b = 0; b < rec->rm_blockcount; b++) {
		done = false;
		for (fab = ri->btree_info; fab->buf_ops; fab++) {
			if (rec->rm_owner != fab->rmap_owner)
				continue;
			error = xrep_findroot_block(ri, fab,
					rec->rm_owner, rec->rm_startblock + b,
					&done);
			if (error)
				return error;
			/* Matched one btree type; skip the rest. */
			if (done)
				break;
		}
	}

	return 0;
}

/* Find the roots of the per-AG btrees described in btree_info. */
int
xrep_find_ag_btree_roots(
	struct xfs_scrub		*sc,
	struct xfs_buf			*agf_bp,
	struct xrep_find_ag_btree	*btree_info,
	struct xfs_buf			*agfl_bp)
{
	struct xfs_mount		*mp = sc->mp;
	struct xrep_findroot		ri;
	struct xrep_find_ag_btree	*fab;
	struct xfs_btree_cur		*cur;
	int				error;

	ASSERT(xfs_buf_islocked(agf_bp));
	ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));

	ri.sc = sc;
	ri.btree_info = btree_info;
	ri.agf = agf_bp->b_addr;
	ri.agfl_bp = agfl_bp;
	for (fab = btree_info; fab->buf_ops; fab++) {
		/* AGFL-owned searches require the AGFL buffer. */
		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
		ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
		fab->root = NULLAGBLOCK;
		fab->height = 0;
	}

	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
	error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
	xfs_btree_del_cursor(cur, error);

	return error;
}

/* Force a quotacheck the next time we mount. */
void
xrep_force_quotacheck(
	struct xfs_scrub	*sc,
	xfs_dqtype_t		type)
{
	uint			flag;

	flag = xfs_quota_chkd_flag(type);
	if (!(flag & sc->mp->m_qflags))
		return;

	/* Clear the CHKD flag in core and on disk under the quotaoff lock. */
	mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
	sc->mp->m_qflags &= ~flag;
	spin_lock(&sc->mp->m_sb_lock);
	sc->mp->m_sb.sb_qflags &= ~flag;
	spin_unlock(&sc->mp->m_sb_lock);
	xfs_log_sb(sc->tp);
	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
}

/*
 * Attach dquots to this inode, or schedule quotacheck to fix them.
 *
 * This function ensures that the appropriate dquots are attached to an inode.
* We cannot allow the dquot code to allocate an on-disk dquot block here * because we're already in transaction context with the inode locked. The * on-disk dquot should already exist anyway. If the quota code signals * corruption or missing quota information, schedule quotacheck, which will * repair corruptions in the quota metadata. */ int xrep_ino_dqattach( struct xfs_scrub *sc) { int error; error = xfs_qm_dqattach_locked(sc->ip, false); switch (error) { case -EFSBADCRC: case -EFSCORRUPTED: case -ENOENT: xfs_err_ratelimited(sc->mp, "inode %llu repair encountered quota error %d, quotacheck forced.", (unsigned long long)sc->ip->i_ino, error); if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot) xrep_force_quotacheck(sc, XFS_DQTYPE_USER); if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot) xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot) xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); fallthrough; case -ESRCH: error = 0; break; default: break; } return error; } /* Initialize all the btree cursors for an AG repair. */ void xrep_ag_btcur_init( struct xfs_scrub *sc, struct xchk_ag *sa) { struct xfs_mount *mp = sc->mp; /* Set up a bnobt cursor for cross-referencing. */ if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT && sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) { sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, sc->sa.pag, XFS_BTNUM_BNO); sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, sc->sa.pag, XFS_BTNUM_CNT); } /* Set up a inobt cursor for cross-referencing. */ if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT && sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) { sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, sc->sa.pag, XFS_BTNUM_INO); if (xfs_has_finobt(mp)) sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, sc->sa.pag, XFS_BTNUM_FINO); } /* Set up a rmapbt cursor for cross-referencing. 
*/ if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT && xfs_has_rmapbt(mp)) sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, sc->sa.pag); /* Set up a refcountbt cursor for cross-referencing. */ if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT && xfs_has_reflink(mp)) sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, sa->agf_bp, sc->sa.pag); } /* Given a reference to a perag structure, load AG headers and cursors. */ int xrep_ag_init( struct xfs_scrub *sc, struct xfs_perag *pag, struct xchk_ag *sa) { int error; ASSERT(!sa->pag); error = xfs_ialloc_read_agi(sc->mp, sc->tp, pag->pag_agno, &sa->agi_bp); if (error) return error; error = xfs_alloc_read_agf(sc->mp, sc->tp, pag->pag_agno, 0, &sa->agf_bp); if (error) return error; error = xfs_alloc_read_agfl(sc->mp, sc->tp, pag->pag_agno, &sa->agfl_bp); if (error) return error; /* Grab our own reference to the perag structure. */ atomic_inc(&pag->pag_ref); sa->pag = pag; xrep_ag_btcur_init(sc, sa); return 0; } /* Initialize all the btree cursors for a RT repair. */ void xrep_rt_btcur_init( struct xfs_scrub *sc, struct xchk_rt *sr) { struct xfs_mount *mp = sc->mp; if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTRMAPBT && xfs_has_rtrmapbt(mp)) sr->rmap_cur = xfs_rtrmapbt_init_cursor(mp, sc->tp, mp->m_rrmapip); if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTREFCBT && xfs_has_rtreflink(mp)) sr->refc_cur = xfs_rtrefcountbt_init_cursor(mp, sc->tp, mp->m_rrefcountip); } /* Reinitialize the per-AG block reservation for the AG we just fixed. */ int xrep_reset_perag_resv( struct xfs_scrub *sc) { if (!(sc->flags & XREP_RESET_PERAG_RESV)) return 0; ASSERT(sc->sa.pag != NULL); ASSERT(sc->ops->type == ST_PERAG); ASSERT(sc->tp); sc->flags &= ~XREP_RESET_PERAG_RESV; xfs_ag_resv_free(sc->sa.pag); return xfs_ag_resv_init(sc->sa.pag, sc->tp); } /* * Repair the ondisk forks of a metadata inode. The caller must ensure that * sc->ip points to the metadata inode and the ILOCK is held on that inode. 
 * The inode must not be joined to the transaction before the call, and will
 * not be afterwards.
 */
int
xrep_metadata_inode_forks(
	struct xfs_scrub	*sc)
{
	__u32			smtype;	/* saved sm_type, restored on exit */
	__u32			smflags; /* saved sm_flags, restored on exit */
	bool			dirty = false;
	int			error;

	/* Clear the reflink flag since metadata never shares. */
	if (xfs_is_reflink_inode(sc->ip)) {
		dirty = true;
		xfs_trans_ijoin(sc->tp, sc->ip, 0);
		error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
		if (error)
			return error;
	}

	/* Clear the attr forks since metadata shouldn't have that. */
	if (xfs_inode_hasattr(sc->ip)) {
		/* Join the inode only once across both fixups. */
		if (!dirty) {
			dirty = true;
			xfs_trans_ijoin(sc->tp, sc->ip, 0);
		}
		error = xrep_xattr_reset_fork(sc, sc->ip);
		if (error)
			return error;
	}

	/*
	 * If we modified the inode, roll the transaction but don't rejoin the
	 * inode to the new transaction because xrep_bmap_data can do that.
	 */
	if (dirty) {
		error = xfs_trans_roll(&sc->tp);
		if (error)
			return error;
		dirty = false;
	}

	/*
	 * Let's see if the forks need repair.  We're going to open-code calls
	 * to the bmapbtd scrub and repair functions so that we can hang on to
	 * the resources that we already acquired instead of using the standard
	 * setup/teardown routines.
	 */
	smtype = sc->sm->sm_type;
	smflags = sc->sm->sm_flags;
	sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD;
	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	error = xchk_metadata_inode_forks(sc);
	if (error || !xfs_scrub_needs_repair(sc->sm))
		goto out;

	/*
	 * Repair the data fork.  This will potentially join the inode to the
	 * transaction.  We do not allow unwritten extents in metadata files.
	 */
	error = xrep_bmap(sc, XFS_DATA_FORK, false);
	if (error)
		goto out;

	/*
	 * Roll the transaction but don't rejoin the inode to the new
	 * transaction because we're done making changes to the inode.
	 */
	error = xfs_trans_roll(&sc->tp);
	if (error)
		goto out;

	/* Bail out if we still need repairs. */
	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	error = xchk_metadata_inode_forks(sc);
	if (error)
		goto out;
	if (xfs_scrub_needs_repair(sc->sm))
		error = -EFSCORRUPTED;
out:
	/* Restore the caller's scrub type and flags. */
	sc->sm->sm_type = smtype;
	sc->sm->sm_flags = smflags;
	return error;
}

/*
 * See if this buffer can pass the given ->verify_struct() function.
 *
 * If the buffer already has ops attached and they're not the ones that were
 * passed in, we reject the buffer.  Otherwise, we perform the structure test
 * (note that we do not check CRCs) and return the outcome of the test.  The
 * buffer ops and error state are left unchanged.
 */
bool
xrep_buf_verify_struct(
	struct xfs_buf			*bp,
	const struct xfs_buf_ops	*ops)
{
	const struct xfs_buf_ops	*old_ops = bp->b_ops;
	xfs_failaddr_t			fa;
	int				old_error;

	if (old_ops) {
		if (old_ops != ops)
			return false;
	}

	/* Temporarily install @ops, run the check, then restore state. */
	old_error = bp->b_error;
	bp->b_ops = ops;
	fa = bp->b_ops->verify_struct(bp);
	bp->b_ops = old_ops;
	bp->b_error = old_error;

	return fa == NULL;
}

/*
 * Look up the '..' entry for @sc->ip.  Returns NULLFSINO if @sc->ip is not a
 * directory, the lookup fails, or the parent inumber fails verification.
 */
xfs_ino_t
xrep_dotdot_lookup(
	struct xfs_scrub	*sc)
{
	xfs_ino_t		parent_ino;
	int			error;

	if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
		return NULLFSINO;

	error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &parent_ino,
			NULL);
	if (error)
		return NULLFSINO;
	if (!xfs_verify_dir_ino(sc->mp, parent_ino))
		return NULLFSINO;
	return parent_ino;
}

#ifdef CONFIG_XFS_RT
/*
 * Ensure that all rt blocks in the given range are not marked free or
 * misaligned.  If @must_align is set, any range that does not start and end
 * on rt extent boundaries is treated as corruption.
 */
int
xrep_require_rtext_inuse(
	struct xfs_scrub	*sc,
	xfs_rtblock_t		rtbno,
	xfs_filblks_t		len,
	bool			must_align)
{
	struct xfs_mount	*mp = sc->mp;
	xfs_rtblock_t		startext;
	xfs_rtblock_t		endext;
	xfs_rtblock_t		extcount;
	uint32_t		mod;
	bool			is_free = false;
	int			error;

	/* Round the starting rt extent down and the end rt extent up. */
	startext = div_u64_rem(rtbno, mp->m_sb.sb_rextsize, &mod);
	if (mod != 0 && must_align)
		return -EFSCORRUPTED;

	endext = div_u64_rem(rtbno + len - 1, mp->m_sb.sb_rextsize, &mod);
	if (mod != 0 && must_align)
		return -EFSCORRUPTED;

	extcount = endext - startext + 1;
	error = xfs_rtalloc_extent_is_free(mp, sc->tp, startext, extcount,
			&is_free);
	if (error)
		return error;

	/* A free range means these blocks are not in use: corruption. */
	return is_free ? -EFSCORRUPTED : 0;
}
#endif

/* Are we looking at a realtime metadata inode? */
bool
xrep_is_rtmeta_ino(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	/* rt bitmap, rt summary, rt refcount, and rt rmap inodes. */
	return ino == sc->mp->m_rbmip->i_ino ||
	       ino == sc->mp->m_rsumip->i_ino ||
	       ino == sc->mp->m_rrefcountip->i_ino ||
	       ino == sc->mp->m_rrmapip->i_ino;
}

/* Check the sanity of a rmap record for a metadata btree inode. */
int
xrep_check_ino_btree_mapping(
	struct xfs_scrub		*sc,
	const struct xfs_rmap_irec	*rec)
{
	bool				is_freesp;
	int				error;

	/*
	 * Metadata btree inodes never have extended attributes, and all blocks
	 * should have the bmbt block flag set.
	 */
	if ((rec->rm_flags & XFS_RMAP_ATTR_FORK) ||
	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
		return -EFSCORRUPTED;

	/* Make sure the block is within the AG. */
	if (!xfs_verify_agbext(sc->mp, sc->sa.pag->pag_agno,
				rec->rm_startblock, rec->rm_blockcount))
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_record(sc->sa.bno_cur, rec->rm_startblock,
			rec->rm_blockcount, &is_freesp);
	if (error)
		return error;
	if (is_freesp)
		return -EFSCORRUPTED;

	return 0;
}

/*
 * Reset the block reservation for a metadata inode so that the incore
 * delalloc reservation matches what the inode was originally granted
 * (i_meta_resv_asked) minus the blocks it has actually mapped.
 */
int
xrep_reset_imeta_reservation(
	struct xfs_scrub	*sc)
{
	struct xfs_inode	*ip = sc->ip;
	int64_t			delta;
	int			error;

	delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked;
	if (delta == 0)
		return 0;

	if (delta > 0) {
		int64_t		give_back;

		/* Too many blocks, free from the incore reservation. */
		give_back = min_t(uint64_t, delta, ip->i_delayed_blks);
		if (give_back > 0) {
			xfs_mod_delalloc(ip->i_mount, -give_back);
			/*
			 * NOTE(review): return value ignored here —
			 * presumably returning free blocks cannot fail;
			 * confirm against xfs_mod_fdblocks.
			 */
			xfs_mod_fdblocks(ip->i_mount, give_back, true);
			ip->i_delayed_blks -= give_back;
		}
		return 0;
	}

	/* Not enough blocks, try to add more.  @delta is negative here. */
	error = xfs_mod_fdblocks(sc->mp, delta, true);
	if (error) {
		xfs_warn(sc->mp,
"Cannot replenish metadata inode reservation!");
		return error;
	}

	xfs_mod_delalloc(sc->mp, -delta);
	ip->i_delayed_blks += -delta;
	return 0;
}