summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorDarrick J. Wong <djwong@kernel.org>2021-09-01 10:45:02 -0700
committerDarrick J. Wong <djwong@kernel.org>2021-12-15 17:28:51 -0800
commit60e806ccaf1b4c8da12513ebf8e869eed5d3a387 (patch)
tree90b7e0dc8a4bb95f42e560d66cfaf1170bc7dfe4 /fs
parent72c3afb5c1f9c9c6739e7fc4202241e76110bfdc (diff)
xfs: repair free space btrees
Rebuild the free space btrees from the gaps in the rmap btree. Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c2
-rw-r--r--fs/xfs/libxfs/xfs_types.c20
-rw-r--r--fs/xfs/libxfs/xfs_types.h9
-rw-r--r--fs/xfs/scrub/alloc.c14
-rw-r--r--fs/xfs/scrub/alloc_repair.c807
-rw-r--r--fs/xfs/scrub/common.h13
-rw-r--r--fs/xfs/scrub/repair.c57
-rw-r--r--fs/xfs/scrub/repair.h26
-rw-r--r--fs/xfs/scrub/scrub.c14
-rw-r--r--fs/xfs/scrub/scrub.h8
-rw-r--r--fs/xfs/scrub/trace.h24
-rw-r--r--fs/xfs/xfs_extent_busy.c13
-rw-r--r--fs/xfs/xfs_extent_busy.h2
14 files changed, 1000 insertions, 10 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index f85f3a19d133..71183f0f7e5d 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -167,6 +167,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
+ alloc_repair.o \
bitmap.o \
repair.o \
)
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index fe94058d4e9e..89628b672b6d 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -411,6 +411,8 @@ xfs_ag_resv_free_extent(
fallthrough;
case XFS_AG_RESV_NONE:
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+ fallthrough;
+ case XFS_AG_RESV_IGNORE:
return;
}
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index e810d23f2d97..96647008f9e0 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -47,6 +47,26 @@ xfs_verify_agbno(
}
/*
+ * Verify that an AG extent is fully contained inside the AG and does not point
+ * at static metadata.
+ */
+bool
+xfs_verify_agbext(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_agblock_t len)
+{
+ xfs_agblock_t end = agbno + len;
+
+ /* Reject zero-length extents and extents whose end wraps around. */
+ if (end <= agbno)
+ return false;
+
+ /* Both the first and the last block must pass the agbno check. */
+ return xfs_verify_agbno(mp, agno, agbno) &&
+ xfs_verify_agbno(mp, agno, end - 1);
+}
+
+/*
* Verify that an FS block number pointer neither points outside the
* filesystem nor points at static AG metadata.
*/
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b6da06b40989..d0afc3d11e37 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -179,6 +179,13 @@ enum xfs_ag_resv_type {
XFS_AG_RESV_AGFL,
XFS_AG_RESV_METADATA,
XFS_AG_RESV_RMAPBT,
+
+ /*
+ * Don't increase fdblocks when freeing extent. This is a pony for
+ * the bnobt repair functions to re-free the free space without
+ * altering fdblocks. If you think you need this you're wrong.
+ */
+ XFS_AG_RESV_IGNORE,
};
/*
@@ -189,6 +196,8 @@ struct xfs_mount;
xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno);
+bool xfs_verify_agbext(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_agblock_t len);
bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno,
xfs_fsblock_t len);
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 87518e1292f8..2d7d6e8fe9cd 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -12,10 +12,11 @@
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "xfs_ag.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub free space btrees.
@@ -24,7 +25,16 @@ int
xchk_setup_ag_allocbt(
struct xfs_scrub *sc)
{
- return xchk_setup_ag_btree(sc, false);
+ int error;
+
+ error = xchk_setup_ag_btree(sc, false);
+ if (error)
+ return error;
+
+ if (xchk_could_repair(sc))
+ return xrep_setup_ag_allocbt(sc);
+
+ return 0;
}
/* Free space btree scrubber. */
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
new file mode 100644
index 000000000000..eb1349146106
--- /dev/null
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -0,0 +1,807 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2021 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_refcount.h"
+#include "xfs_extent_busy.h"
+#include "xfs_health.h"
+#include "xfs_bmap.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/xfarray.h"
+
+/*
+ * Free Space Btree Repair
+ * =======================
+ *
+ * The reverse mappings are supposed to record all space usage for the entire
+ * AG. Therefore, we can recalculate the free extents in an AG by looking for
+ * gaps in the physical extents recorded in the rmapbt. On a reflink
+ * filesystem this is a little more tricky in that we have to be aware that
+ * the rmap records are allowed to overlap.
+ *
+ * We derive which blocks belonged to the old bnobt/cntbt by recording all the
+ * OWN_AG extents and subtracting out the blocks owned by all other OWN_AG
+ * metadata: the rmapbt blocks visited while iterating the reverse mappings
+ * and the AGFL blocks.
+ *
+ * Once we have both of those pieces, we can reconstruct the bnobt and cntbt
+ * by blowing out the free block state and freeing all the extents that we
+ * found. This adds the requirement that we can't have any busy extents in
+ * the AG because the busy code cannot handle duplicate records.
+ *
+ * Note that we can only rebuild both free space btrees at the same time
+ * because the regular extent freeing infrastructure loads both btrees at the
+ * same time.
+ *
+ * We use the prefix 'xrep_abt' here because we regenerate both free space
+ * allocation btrees at the same time.
+ */
+
+struct xrep_abt {
+ /*
+ * Blocks owned by the rmapbt or the agfl. These are OWN_AG blocks
+ * that must not be reaped with the old free space btrees.
+ */
+ struct xbitmap not_allocbt_blocks;
+
+ /*
+ * All OWN_AG blocks. After xrep_abt_find_freespace subtracts
+ * not_allocbt_blocks from this bitmap, what remains is the set of
+ * blocks that belonged to the old bnobt/cntbt.
+ */
+ struct xbitmap old_allocbt_blocks;
+
+ /*
+ * New bnobt information. All btree block reservations are added to
+ * the reservation list in new_bnobt_info.
+ */
+ struct xrep_newbt new_bnobt_info;
+ struct xfs_btree_bload bno_bload;
+
+ /* new cntbt information */
+ struct xrep_newbt new_cntbt_info;
+ struct xfs_btree_bload cnt_bload;
+
+ /*
+ * Free space extents, stored as struct xfs_alloc_rec_incore. One
+ * record is appended for every gap found between rmap records.
+ */
+ struct xfarray *free_records;
+
+ struct xfs_scrub *sc;
+
+ /* Number of non-null records in @free_records. */
+ uint64_t nr_real_records;
+
+ /* get_record()'s position in the free space record array. */
+ uint64_t iter;
+
+ /*
+ * Next block we anticipate seeing in the rmap records. If the next
+ * rmap record is greater than next_bno, we have found unused space.
+ */
+ xfs_agblock_t next_bno;
+
+ /*
+ * Number of free blocks in this AG. Grows as free extents are
+ * stashed, shrinks by whatever is reserved for the new btrees, and is
+ * finally written to agf_freeblks.
+ */
+ xfs_agblock_t nr_blocks;
+
+ /*
+ * Longest free extent we found in the AG. Tracked by get_record()
+ * while bulk loading and finally written to agf_longest.
+ */
+ xfs_agblock_t longest;
+};
+
+/*
+ * Set up to repair AG free space btrees.
+ *
+ * The rebuild cannot cope with extents appearing on the busy list twice, so
+ * flush the AG's busy extents if there are any. Always returns 0.
+ */
+int
+xrep_setup_ag_allocbt(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag = sc->sa.pag;
+ unsigned int gen;
+
+ /* Sample the busy generation before we look at the busy list. */
+ gen = READ_ONCE(pag->pagb_gen);
+ if (xfs_extent_busy_list_empty(pag))
+ return 0;
+
+ xfs_extent_busy_flush(sc->mp, pag, gen);
+ return 0;
+}
+
+/*
+ * Check for any obvious conflicts in the free extent.
+ *
+ * @rec is a candidate free space record. Returns 0 if nothing contradicts
+ * the extent being free; -EFSCORRUPTED if the extent fails AG extent
+ * verification, overlaps inodes, or has a refcount record; or the error
+ * returned by the inode or refcount btree lookup.
+ */
+STATIC int
+xrep_abt_check_free_ext(
+ struct xfs_scrub *sc,
+ const struct xfs_alloc_rec_incore *rec)
+{
+ bool has_inodes, shared;
+ int error;
+
+ /* Must be within the AG and not static data. */
+ if (!xfs_verify_agbext(sc->mp, sc->sa.pag->pag_agno, rec->ar_startblock,
+ rec->ar_blockcount))
+ return -EFSCORRUPTED;
+
+ /* Must not be an inode chunk. */
+ error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
+ rec->ar_startblock, rec->ar_blockcount, &has_inodes);
+ if (error)
+ return error;
+ if (has_inodes)
+ return -EFSCORRUPTED;
+
+ /*
+ * Must not be shared or CoW staging. The refcount cursor is only
+ * checked when it exists; xrep_ag_btcur_init creates it only on
+ * reflink filesystems.
+ */
+ if (sc->sa.refc_cur) {
+ error = xfs_refcount_has_record(sc->sa.refc_cur,
+ rec->ar_startblock, rec->ar_blockcount,
+ &shared);
+ if (error)
+ return error;
+ if (shared)
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Stash a free space record for all the space since the last bno we found
+ * all the way up to @end.
+ *
+ * The record covers the blocks from ra->next_bno up to but not including
+ * @end. It is vetted with xrep_abt_check_free_ext before being appended to
+ * ra->free_records, and its length is added to ra->nr_blocks. Returns 0 on
+ * success or a negative errno, including the one set by
+ * xchk_should_terminate when the scrub has to stop.
+ */
+static int
+xrep_abt_stash(
+ struct xrep_abt *ra,
+ xfs_agblock_t end)
+{
+ struct xfs_alloc_rec_incore arec = {
+ .ar_startblock = ra->next_bno,
+ .ar_blockcount = end - ra->next_bno,
+ };
+ struct xfs_scrub *sc = ra->sc;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ error = xrep_abt_check_free_ext(ra->sc, &arec);
+ if (error)
+ return error;
+
+ trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);
+
+ error = xfarray_append(ra->free_records, &arec);
+ if (error)
+ return error;
+
+ ra->nr_blocks += arec.ar_blockcount;
+ return 0;
+}
+
+/*
+ * Record extents that aren't in use from gaps in the rmap records.
+ *
+ * This is the callback that xrep_abt_find_freespace hands to
+ * xfs_rmap_query_all; @priv is the struct xrep_abt. Returns 0 on success or
+ * the negative errno reported by the bitmap or stash helpers.
+ */
+STATIC int
+xrep_abt_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+ xfs_fsblock_t fsb;
+ int error;
+
+ /* Record all the OWN_AG blocks... */
+ if (rec->rm_owner == XFS_RMAP_OWN_AG) {
+ fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ rec->rm_startblock);
+ error = xbitmap_set(&ra->old_allocbt_blocks, fsb,
+ rec->rm_blockcount);
+ if (error)
+ return error;
+ }
+
+ /* ...and all the rmapbt blocks... */
+ error = xbitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
+ if (error)
+ return error;
+
+ /* ...and all the free space. */
+ if (rec->rm_startblock > ra->next_bno) {
+ error = xrep_abt_stash(ra, rec->rm_startblock);
+ if (error)
+ return error;
+ }
+
+ /*
+ * rmap records can overlap on reflink filesystems, so project next_bno
+ * as far out into the AG space as we currently know about.
+ */
+ ra->next_bno = max_t(xfs_agblock_t, ra->next_bno,
+ rec->rm_startblock + rec->rm_blockcount);
+ return 0;
+}
+
+/*
+ * Collect an AGFL block for the not-to-release list.
+ *
+ * Walk callback: @bno is one AGFL block and @priv is the struct xrep_abt.
+ * Returns whatever xbitmap_set returns.
+ */
+static int
+xrep_abt_walk_agfl(
+ struct xfs_mount *mp,
+ xfs_agblock_t bno,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+ xfs_agnumber_t agno = ra->sc->sa.pag->pag_agno;
+
+ /* AGFL blocks are OWN_AG but do not belong to the old btrees. */
+ return xbitmap_set(&ra->not_allocbt_blocks,
+ XFS_AGB_TO_FSB(mp, agno, bno), 1);
+}
+
+/*
+ * Sort comparator for free space extents: order by increasing start block.
+ * Returns -1, 0 or 1.
+ */
+static int
+xrep_bnobt_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_alloc_rec_incore *left = a;
+ const struct xfs_alloc_rec_incore *right = b;
+
+ if (left->ar_startblock == right->ar_startblock)
+ return 0;
+
+ return left->ar_startblock < right->ar_startblock ? -1 : 1;
+}
+
+/*
+ * Sort comparator for free space extents: order first by increasing length
+ * and then by increasing start block, so the longest extent ends up last.
+ * Returns -1, 0 or 1.
+ */
+static int
+xrep_cntbt_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_alloc_rec_incore *left = a;
+ const struct xfs_alloc_rec_incore *right = b;
+
+ if (left->ar_blockcount != right->ar_blockcount)
+ return left->ar_blockcount > right->ar_blockcount ? 1 : -1;
+
+ /* Equal lengths: fall back to ordering by start block. */
+ return xrep_bnobt_extent_cmp(a, b);
+}
+
+/*
+ * Iterate all reverse mappings to find (1) the gaps between rmap records (all
+ * unowned space), (2) the OWN_AG extents (which encompass the free space
+ * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
+ * blocks. The free space is (1) + (2) - (3) - (4).
+ *
+ * On success, ra->free_records holds the gaps, ra->old_allocbt_blocks holds
+ * only the old bnobt/cntbt blocks, and ra->nr_real_records is set to the
+ * number of free records. The AG btree cursors and the not_allocbt_blocks
+ * bitmap are torn down before returning, on success and on error alike.
+ * Returns 0 or a negative errno.
+ */
+STATIC int
+xrep_abt_find_freespace(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ xfs_agblock_t agend;
+ int error;
+
+ xbitmap_init(&ra->not_allocbt_blocks);
+
+ xrep_ag_btcur_init(sc, &sc->sa);
+
+ /*
+ * Iterate all the reverse mappings to find gaps in the physical
+ * mappings, all the OWN_AG blocks, and all the rmapbt extents.
+ */
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
+ if (error)
+ goto err;
+
+ /* Insert a record for space between the last rmap and EOAG. */
+ agend = be32_to_cpu(agf->agf_length);
+ if (ra->next_bno < agend) {
+ error = xrep_abt_stash(ra, agend);
+ if (error)
+ goto err;
+ }
+
+ /* Collect all the AGFL blocks. */
+ error = xfs_agfl_walk(mp, agf, sc->sa.agfl_bp, xrep_abt_walk_agfl, ra);
+ if (error)
+ goto err;
+
+ /*
+ * Compute the old bnobt/cntbt blocks by removing the rmapbt and AGFL
+ * blocks from the set of all OWN_AG blocks.
+ */
+ error = xbitmap_disunion(&ra->old_allocbt_blocks,
+ &ra->not_allocbt_blocks);
+ if (error)
+ goto err;
+
+ ra->nr_real_records = xfarray_length(ra->free_records);
+err:
+ xchk_ag_btcur_free(&sc->sa);
+ xbitmap_destroy(&ra->not_allocbt_blocks);
+ return error;
+}
+
+/*
+ * We're going to use the observed free space records to reserve blocks for the
+ * new free space btrees, so we play an iterative game where we try to converge
+ * on the number of blocks we need:
+ *
+ * 1. Estimate how many blocks we'll need to store the records.
+ * 2. If the longest remaining free record has more blocks than we need, we're
+ * done. We will have to re-sort the records prior to building the cntbt.
+ * 3. If that record has exactly the number of blocks we need, null out the
+ * record. We're done.
+ * 4. Otherwise, we still need more blocks. Null out the record, subtract its
+ * length from the number of blocks we need, and go back to step 1.
+ *
+ * The caller sorts the records with xrep_cntbt_extent_cmp beforehand, which
+ * orders them by increasing length, so this function consumes records from
+ * the end of the array. All reserved blocks are added to new_bnobt_info and
+ * subtracted from ra->nr_blocks. *needs_sort is set to true only when a
+ * record was shrunk in place. Returns 0, -ENOSPC if the free records run out
+ * before enough blocks are reserved, or another negative errno.
+ *
+ * Fortunately, we don't have to do any transaction work to play this game, so
+ * we don't have to tear down the staging cursors.
+ */
+STATIC int
+xrep_abt_reserve_space(
+ struct xrep_abt *ra,
+ struct xfs_btree_cur *bno_cur,
+ struct xfs_btree_cur *cnt_cur,
+ bool *needs_sort)
+{
+ struct xfs_scrub *sc = ra->sc;
+ uint64_t record_nr = xfarray_length(ra->free_records) - 1;
+ unsigned int allocated = 0;
+ int error = 0;
+
+ *needs_sort = false;
+ do {
+ struct xfs_alloc_rec_incore arec;
+ xfs_fsblock_t fsbno;
+ uint64_t required;
+ unsigned int desired;
+ unsigned int len;
+
+ /* Compute how many blocks we'll need. */
+ error = xfs_btree_bload_compute_geometry(cnt_cur,
+ &ra->cnt_bload, ra->nr_real_records);
+ if (error)
+ break;
+
+ error = xfs_btree_bload_compute_geometry(bno_cur,
+ &ra->bno_bload, ra->nr_real_records);
+ if (error)
+ break;
+
+ /* How many btree blocks do we need to store all records? */
+ required = ra->cnt_bload.nr_blocks + ra->bno_bload.nr_blocks;
+ ASSERT(required < INT_MAX);
+
+ /* If we've reserved enough blocks, we're done. */
+ if (allocated >= required)
+ break;
+
+ desired = required - allocated;
+
+ /* We need space but there's none left; bye! */
+ if (ra->nr_real_records == 0) {
+ error = -ENOSPC;
+ break;
+ }
+
+ /*
+ * Grab the last remaining record from the list; with the
+ * by-length sort order that is the longest one.
+ */
+ error = xfarray_load(ra->free_records, record_nr, &arec);
+ if (error)
+ break;
+
+ ASSERT(arec.ar_blockcount <= UINT_MAX);
+ len = min_t(unsigned int, arec.ar_blockcount, desired);
+ fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
+ arec.ar_startblock);
+
+ trace_xrep_newbt_alloc_blocks(sc->mp, sc->sa.pag->pag_agno,
+ arec.ar_startblock, len, XFS_RMAP_OWN_AG);
+
+ error = xrep_newbt_add_blocks(&ra->new_bnobt_info, fsbno, len);
+ if (error)
+ break;
+ allocated += len;
+ ra->nr_blocks -= len;
+
+ if (arec.ar_blockcount > desired) {
+ /*
+ * Record has more space than we need. The number of
+ * free records doesn't change, so shrink the free
+ * record, inform the caller that we've broken the sort
+ * order of the records, and exit.
+ */
+ arec.ar_startblock += desired;
+ arec.ar_blockcount -= desired;
+ error = xfarray_store(ra->free_records, record_nr,
+ &arec);
+ if (error)
+ break;
+ *needs_sort = true;
+ break;
+ }
+
+ /*
+ * We're going to use up the entire record, so nullify it and
+ * move on to the next one. This changes the number of free
+ * records, so we must go around the loop once more to
+ * recompute the btree geometry.
+ */
+ error = xfarray_nullify(ra->free_records, record_nr);
+ if (error)
+ break;
+ ra->nr_real_records--;
+ record_nr--;
+ } while (1);
+
+ return error;
+}
+
+/*
+ * Deal with all the space we reserved. Blocks that were allocated for the
+ * free space btrees need to have a (deferred) rmap added for the OWN_AG
+ * allocation, and blocks that didn't get used can be freed via the usual
+ * (deferred) means.
+ *
+ * @error is the outcome of the rebuild. If it is nonzero, no deferred rmap
+ * or free work is queued; the reservation records are simply released. Only
+ * the reservation list in new_bnobt_info is walked. Both xrep_newbt
+ * structures are destroyed before returning.
+ */
+STATIC void
+xrep_abt_dispose_reservations(
+ struct xrep_abt *ra,
+ int error)
+{
+ struct xrep_newbt_resv *resv, *n;
+ struct xfs_scrub *sc = ra->sc;
+
+ if (error)
+ goto junkit;
+
+ for_each_xrep_newbt_reservation(&ra->new_bnobt_info, resv, n) {
+ /* Add a deferred rmap for each extent we used. */
+ if (resv->used > 0)
+ xfs_rmap_alloc_extent(sc->tp,
+ XFS_FSB_TO_AGNO(sc->mp, resv->fsbno),
+ XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno),
+ resv->used, XFS_RMAP_OWN_AG);
+
+ /*
+ * Add a deferred free for each block we didn't use and now
+ * have to add to the free space since the new btrees are
+ * online.
+ */
+ if (resv->used < resv->len)
+ __xfs_free_extent_later(sc->tp,
+ resv->fsbno + resv->used,
+ resv->len - resv->used, NULL, true);
+ }
+
+junkit:
+ for_each_xrep_newbt_reservation(&ra->new_bnobt_info, resv, n) {
+ list_del(&resv->list);
+ kmem_free(resv);
+ }
+
+ xrep_newbt_destroy(&ra->new_bnobt_info, error);
+ xrep_newbt_destroy(&ra->new_cntbt_info, error);
+}
+
+/*
+ * Retrieve free space data for bulk load.
+ *
+ * Loads the next record from ra->free_records, starting at ra->iter, into the
+ * cursor's incore record and keeps ra->longest up to date. Returns 0 or the
+ * error from xfarray_load_next.
+ */
+STATIC int
+xrep_abt_get_record(
+ struct xfs_btree_cur *cur,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+ struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
+ int error;
+
+ error = xfarray_load_next(ra->free_records, &ra->iter, rec);
+ if (error)
+ return error;
+
+ /* Remember the longest extent we have handed to the loader. */
+ if (rec->ar_blockcount > ra->longest)
+ ra->longest = rec->ar_blockcount;
+ return 0;
+}
+
+/*
+ * Feed one of the new btree blocks to the bulk loader.
+ *
+ * Blocks are always claimed from the reservation list in new_bnobt_info,
+ * whichever of the two btrees @cur is loading.
+ */
+STATIC int
+xrep_abt_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+ struct xrep_newbt *xnr = &ra->new_bnobt_info;
+
+ return xrep_newbt_claim_block(cur, xnr, ptr);
+}
+
+/*
+ * Reset the AGF counters to reflect the free space btrees that we just
+ * rebuilt, then reinitialize the per-AG data.
+ *
+ * @freesp_btreeblks is the block count of the new bnobt and cntbt as computed
+ * by the caller, which passes (nr_blocks - 1) for each tree. agf_btreeblks
+ * is set to that value plus (agf_rmap_blocks - 1), agf_freeblks to
+ * ra->nr_blocks and agf_longest to ra->longest; the three fields are then
+ * logged. Returns the result of xfs_alloc_read_agf.
+ */
+STATIC int
+xrep_abt_reset_counters(
+ struct xrep_abt *ra,
+ unsigned int freesp_btreeblks)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_buf *bp;
+
+ /*
+ * Mark the pagf information stale and use the accessor function to
+ * forcibly reload it from the values we just logged. We still own the
+ * AGF buffer so we can safely ignore bp.
+ */
+ ASSERT(pag->pagf_init);
+ pag->pagf_init = 0;
+
+ agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
+ (be32_to_cpu(agf->agf_rmap_blocks) - 1));
+ agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
+ agf->agf_longest = cpu_to_be32(ra->longest);
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
+ XFS_AGF_LONGEST |
+ XFS_AGF_FREEBLKS);
+
+ return xfs_alloc_read_agf(sc->mp, sc->tp, sc->sa.pag->pag_agno, 0, &bp);
+}
+
+/*
+ * Prepare a bulk load descriptor for one of the free space btrees: hook up
+ * the record and block callbacks, then let the repair code pick the slack.
+ */
+static void
+xrep_abt_init_bload(
+ struct xrep_abt *ra,
+ struct xfs_btree_bload *bload)
+{
+ struct xfs_scrub *sc = ra->sc;
+
+ bload->claim_block = xrep_abt_claim_block;
+ bload->get_record = xrep_abt_get_record;
+
+ xrep_bload_estimate_slack(sc, bload);
+}
+
+/*
+ * Use the collected free space information to stage new free space btrees.
+ * If this is successful we'll return with the new btree root
+ * information logged to the repair transaction but not yet committed.
+ *
+ * The cntbt is loaded first, from records sorted by length, and the bnobt
+ * second, from the same records re-sorted by start block. On failure before
+ * the new trees are committed, the incore btree heights are restored, the
+ * staging cursors are deleted and the reservations are released. Returns 0
+ * or a negative errno.
+ */
+STATIC int
+xrep_abt_build_new_trees(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ struct xfs_perag *pag = sc->sa.pag;
+ unsigned int old_bno_level, old_cnt_level;
+ bool needs_sort;
+ int error;
+
+ xrep_abt_init_bload(ra, &ra->bno_bload);
+ xrep_abt_init_bload(ra, &ra->cnt_bload);
+
+ /*
+ * Sort the free extents by length so that we can set up the free space
+ * btrees in as few extents as possible. This reduces the amount of
+ * deferred rmap / free work we have to do at the end.
+ */
+ error = xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the AG header.
+ */
+ xrep_newbt_init_bare(&ra->new_bnobt_info, sc);
+ xrep_newbt_init_bare(&ra->new_cntbt_info, sc);
+
+ /* Allocate cursors for the staged btrees. */
+ bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt_info.afake,
+ pag, XFS_BTNUM_BNO);
+ cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt_info.afake,
+ pag, XFS_BTNUM_CNT);
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btrees. */
+ error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_sort);
+ if (error)
+ goto err_cur;
+
+ /*
+ * If we need to re-sort the free extents by length, do so so that we
+ * can put the records into the cntbt in the correct order.
+ */
+ if (needs_sort) {
+ error = xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp);
+ if (error)
+ goto err_cur;
+ }
+
+ /*
+ * Due to btree slack factors, it's possible for a new btree to be one
+ * level taller than the old btree. Update the incore btree height so
+ * that we don't trip the verifiers when writing the new btree blocks
+ * to disk.
+ */
+ old_bno_level = pag->pagf_levels[XFS_BTNUM_BNOi];
+ old_cnt_level = pag->pagf_levels[XFS_BTNUM_CNTi];
+ pag->pagf_levels[XFS_BTNUM_BNOi] = ra->bno_bload.btree_height;
+ pag->pagf_levels[XFS_BTNUM_CNTi] = ra->cnt_bload.btree_height;
+
+ /* Load the free space by length tree. */
+ ra->iter = 0;
+ ra->longest = 0;
+ error = xfs_btree_bload(cnt_cur, &ra->cnt_bload, ra);
+ if (error)
+ goto err_levels;
+
+ /*
+ * Re-sort the free extents by block number so that we can put the
+ * records into the bnobt in the correct order.
+ */
+ error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp);
+ if (error)
+ goto err_levels;
+
+ /* Load the free space by block number tree. */
+ ra->iter = 0;
+ error = xfs_btree_bload(bno_cur, &ra->bno_bload, ra);
+ if (error)
+ goto err_levels;
+
+ /*
+ * Install the new btrees in the AG header. After this point the old
+ * btree is no longer accessible and the new tree is live.
+ *
+ * Note: We re-read the AGF here to ensure the buffer type is set
+ * properly. Since we built a new tree without attaching to the AGF
+ * buffer, the buffer item may have fallen off the buffer. This ought
+ * to succeed since the AGF is held across transaction rolls.
+ */
+ error = xfs_read_agf(sc->mp, sc->tp, pag->pag_agno, 0, &sc->sa.agf_bp);
+ if (error)
+ goto err_levels;
+
+ /* Commit our new btrees. */
+ xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(bno_cur, 0);
+ xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(cnt_cur, 0);
+
+ /* Reset the AGF counters now that we've changed the btree shape. */
+ error = xrep_abt_reset_counters(ra, (ra->bno_bload.nr_blocks - 1) +
+ (ra->cnt_bload.nr_blocks - 1));
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ xrep_abt_dispose_reservations(ra, error);
+
+ return xrep_roll_ag_trans(sc);
+
+err_levels:
+ pag->pagf_levels[XFS_BTNUM_BNOi] = old_bno_level;
+ pag->pagf_levels[XFS_BTNUM_CNTi] = old_cnt_level;
+err_cur:
+ xfs_btree_del_cursor(cnt_cur, error);
+ xfs_btree_del_cursor(bno_cur, error);
+err_newbt:
+ xrep_abt_dispose_reservations(ra, error);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_abt_remove_old_trees(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+
+ /*
+ * Free the old free space btree blocks if they're not in use. They
+ * are reaped with XFS_AG_RESV_IGNORE so that fdblocks is left alone.
+ */
+ return xrep_reap_extents(sc, &ra->old_allocbt_blocks,
+ &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
+}
+
+/*
+ * Repair the freespace btrees for some AG.
+ *
+ * Collects the free space from the gaps in the rmapbt, builds new bnobt and
+ * cntbt trees from it, and then reaps the blocks of the old trees. Returns 0
+ * on success, -EOPNOTSUPP if the filesystem has no rmapbt, -ENOMEM if the
+ * repair context cannot be allocated, -EDEADLOCK if the AG still has busy
+ * extents, or the negative errno of whichever step failed.
+ */
+int
+xrep_allocbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_abt *ra;
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ ra = kmem_zalloc(sizeof(struct xrep_abt), KM_NOFS | KM_MAYFAIL);
+ if (!ra)
+ return -ENOMEM;
+ ra->sc = sc;
+
+ /* We rebuild both data structures. */
+ sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;
+
+ /*
+ * Make sure the busy extent list is clear because we can't put extents
+ * on there twice. In theory we cleared this before we started, but
+ * let's not risk the filesystem. Leave through out_ra so that the
+ * repair context allocated above is freed rather than leaked.
+ */
+ if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
+ error = -EDEADLOCK;
+ goto out_ra;
+ }
+
+ /* Set up some storage */
+ error = xfarray_create(mp, "free space extents",
+ sizeof(struct xfs_alloc_rec_incore), &ra->free_records);
+ if (error)
+ goto out_ra;
+
+ /* Collect the free space data and find the old btree blocks. */
+ xbitmap_init(&ra->old_allocbt_blocks);
+ error = xrep_abt_find_freespace(ra);
+ if (error)
+ goto out_bitmap;
+
+ /* Rebuild the free space information. */
+ error = xrep_abt_build_new_trees(ra);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old trees. */
+ error = xrep_abt_remove_old_trees(ra);
+
+out_bitmap:
+ xbitmap_destroy(&ra->old_allocbt_blocks);
+ xfarray_destroy(ra->free_records);
+out_ra:
+ kmem_free(ra);
+ return error;
+}
+
+/*
+ * Make sure both btrees are ok after we've rebuilt them.
+ *
+ * Runs the bnobt scrubber and, if that returns no error, the cntbt scrubber.
+ * The caller's sm_type is restored before returning. Returns the first
+ * error encountered, or 0.
+ */
+int
+xrep_revalidate_allocbt(
+ struct xfs_scrub *sc)
+{
+ __u32 saved_type = sc->sm->sm_type;
+ int error;
+
+ /*
+ * We must update sm_type temporarily so that the tree-to-tree cross
+ * reference checks will work in the correct direction, and also so
+ * that tracing will report correctly if there are more errors.
+ */
+ sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
+ error = xchk_bnobt(sc);
+ if (!error) {
+ sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
+ error = xchk_cntbt(sc);
+ }
+
+ sc->sm->sm_type = saved_type;
+ return error;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 5312f901826d..2045857a947b 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -166,8 +166,21 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT |
XFS_SCRUB_OFLAG_PREEN);
}
+
+/*
+ * "Should we prepare for a repair?"
+ *
+ * True only when the caller asked for repairs (XFS_SCRUB_IFLAG_REPAIR) and
+ * this is not the post-repair evaluation pass (XREP_ALREADY_FIXED).
+ */
+static inline bool xchk_could_repair(const struct xfs_scrub *sc)
+{
+ if (!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ return false;
+
+ return !(sc->flags & XREP_ALREADY_FIXED);
+}
#else
# define xchk_needs_repair(sc) (false)
+# define xchk_could_repair(sc) (false)
#endif /* CONFIG_XFS_ONLINE_REPAIR */
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1122c76dd108..6c5877f55d0d 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1015,10 +1015,22 @@ xrep_agextent_reap(
xrep_agextent_reap_binval(sc, agbno, aglen);
- if (rs->resv == XFS_AG_RESV_AGFL) {
+ switch (rs->resv) {
+ case XFS_AG_RESV_AGFL:
error = xrep_put_freelist(sc, agbno);
*want_roll = true;
- } else {
+ break;
+ case XFS_AG_RESV_IGNORE:
+ /*
+ * bnobt/cntbt blocks are counted as free space, so we pass
+ * XFS_AG_RESV_IGNORE when reaping the old free space btree
+ * blocks to avoid changing fdblocks.
+ */
+ error = __xfs_free_extent(sc->tp, fsbno, aglen, rs->oinfo,
+ rs->resv, true);
+ *want_roll = true;
+ break;
+ default:
/*
* Use deferred frees to get rid of the old btree blocks to try
* to minimize the window in which we could crash and lose the
@@ -1030,6 +1042,7 @@ xrep_agextent_reap(
__xfs_free_extent_later(sc->tp, fsbno, aglen, rs->oinfo, true);
rs->deferred++;
*want_roll = rs->deferred > 100;
+ break;
}
return error;
@@ -1501,3 +1514,43 @@ xrep_ino_dqattach(
return error;
}
+
+/*
+ * Initialize all the btree cursors for an AG repair.
+ *
+ * No cursor is created for the btree family named by the scrub type, nor for
+ * the rmapbt and refcountbt when the filesystem lacks the matching feature.
+ */
+void
+xrep_ag_btcur_init(
+ struct xfs_scrub *sc,
+ struct xchk_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag = sc->sa.pag;
+ __u32 type = sc->sm->sm_type;
+
+ /* Set up bnobt and cntbt cursors for cross-referencing. */
+ if (!(type == XFS_SCRUB_TYPE_BNOBT || type == XFS_SCRUB_TYPE_CNTBT)) {
+ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ pag, XFS_BTNUM_BNO);
+ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ pag, XFS_BTNUM_CNT);
+ }
+
+ /* Set up inobt and finobt cursors for cross-referencing. */
+ if (!(type == XFS_SCRUB_TYPE_INOBT || type == XFS_SCRUB_TYPE_FINOBT)) {
+ sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
+ pag, XFS_BTNUM_INO);
+ if (xfs_has_finobt(mp))
+ sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp,
+ sa->agi_bp, pag, XFS_BTNUM_FINO);
+ }
+
+ /* Set up a rmapbt cursor for cross-referencing. */
+ if (type != XFS_SCRUB_TYPE_RMAPBT && xfs_has_rmapbt(mp))
+ sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ pag);
+
+ /* Set up a refcountbt cursor for cross-referencing. */
+ if (type != XFS_SCRUB_TYPE_REFCNTBT && xfs_has_reflink(mp))
+ sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, pag);
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 7fc2d7d49001..f9875e695f7f 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -7,6 +7,7 @@
#define __XFS_SCRUB_REPAIR_H__
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_quota_defs.h"
#include "scrub/bitmap.h"
@@ -61,6 +62,15 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
int xrep_ino_dqattach(struct xfs_scrub *sc);
+/* Repair setup functions */
+int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
+
+void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
+
+/* Metadata revalidators */
+
+int xrep_revalidate_allocbt(struct xfs_scrub *sc);
+
/* Metadata repairers */
int xrep_probe(struct xfs_scrub *sc);
@@ -68,6 +78,7 @@ int xrep_superblock(struct xfs_scrub *sc);
int xrep_agf(struct xfs_scrub *sc);
int xrep_agfl(struct xfs_scrub *sc);
int xrep_agi(struct xfs_scrub *sc);
+int xrep_allocbt(struct xfs_scrub *sc);
struct xrep_newbt_resv {
/* Link to list of extents that we've reserved. */
@@ -107,6 +118,9 @@ struct xrep_newbt {
enum xfs_ag_resv_type resv;
};
+#define for_each_xrep_newbt_reservation(xnr, resv, n) \
+ list_for_each_entry_safe((resv), (n), &(xnr)->resv_list, list)
+
void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc);
void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint,
@@ -141,11 +155,23 @@ xrep_calc_ag_resblks(
return 0;
}
+/*
+ * repair setup functions for no-repair
+ *
+ * Stub that does nothing and reports success; xrep_setup_ag_allocbt is
+ * aliased to it just below. NOTE(review): this looks like the branch of the
+ * header used when CONFIG_XFS_ONLINE_REPAIR is disabled - confirm.
+ */
+static inline int
+xrep_setup_nothing(
+ struct xfs_scrub *sc)
+{
+ return 0;
+}
+#define xrep_setup_ag_allocbt xrep_setup_nothing
+
+#define xrep_revalidate_allocbt (NULL)
+
#define xrep_probe xrep_notsupported
#define xrep_superblock xrep_notsupported
#define xrep_agf xrep_notsupported
#define xrep_agfl xrep_notsupported
#define xrep_agi xrep_notsupported
+#define xrep_allocbt xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index c0a2275b8b73..775bc4457bc5 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -220,13 +220,15 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_PERAG,
.setup = xchk_setup_ag_allocbt,
.scrub = xchk_bnobt,
- .repair = xrep_notsupported,
+ .repair = xrep_allocbt,
+ .repair_eval = xrep_revalidate_allocbt,
},
[XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
.type = ST_PERAG,
.setup = xchk_setup_ag_allocbt,
.scrub = xchk_cntbt,
- .repair = xrep_notsupported,
+ .repair = xrep_allocbt,
+ .repair_eval = xrep_revalidate_allocbt,
},
[XFS_SCRUB_TYPE_INOBT] = { /* inobt */
.type = ST_PERAG,
@@ -511,7 +513,10 @@ retry_op:
goto out_teardown;
/* Scrub for errors. */
- error = sc->ops->scrub(sc);
+ if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL)
+ error = sc->ops->repair_eval(sc);
+ else
+ error = sc->ops->scrub(sc);
if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
/*
* Scrubbers return -EDEADLOCK to mean 'try harder'.
@@ -528,8 +533,7 @@ retry_op:
xchk_update_health(sc);
- if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
- !(sc->flags & XREP_ALREADY_FIXED)) {
+ if (xchk_could_repair(sc)) {
bool needs_fix = xchk_needs_repair(sc->sm);
/* Let debug users force us into the repair routines. */
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index ac2b277deb5c..c6f33d1b8b7b 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -26,6 +26,14 @@ struct xchk_meta_ops {
/* Repair or optimize the metadata. */
int (*repair)(struct xfs_scrub *);
+ /*
+ * Re-scrub the metadata we repaired, in case there's extra work that
+ * we need to do to check our repair work. This runs in place of
+ * ->scrub once XREP_ALREADY_FIXED is set. If this is NULL, we'll use
+ * the ->scrub function pointer, assuming the regular scrub suffices.
+ */
+ int (*repair_eval)(struct xfs_scrub *sc);
+
/* Decide if we even have this piece of metadata. */
bool (*has)(struct xfs_mount *);
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 26022810485e..cf58e3bfc7dd 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -902,11 +902,33 @@ DEFINE_EVENT(xrep_rmap_class, name, \
xfs_agblock_t agbno, xfs_extlen_t len, \
uint64_t owner, uint64_t offset, unsigned int flags), \
TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
-DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn);
DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn);
DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn);
+TRACE_EVENT(xrep_abt_found, /* captures dev, AG number and one alloc record's extent */
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ const struct xfs_alloc_rec_incore *rec),
+ TP_ARGS(mp, agno, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev; /* device taken from the mount's superblock */
+ __entry->agno = agno;
+ __entry->startblock = rec->ar_startblock; /* copy the record's start and length */
+ __entry->blockcount = rec->ar_blockcount;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", /* dev is major:minor, rest hex */
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startblock,
+ __entry->blockcount)
+)
+
TRACE_EVENT(xrep_refcount_extent_fn,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
struct xfs_refcount_irec *irec),
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index ad22a003f959..ea384d031804 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -628,3 +628,16 @@ xfs_extent_busy_ag_cmp(
diff = b1->bno - b2->bno;
return diff;
}
+
+/* Are there any busy extents in this AG?  Returns true if @pag->pagb_tree is empty. */
+bool
+xfs_extent_busy_list_empty(
+ struct xfs_perag *pag)
+{
+ bool res;
+
+ spin_lock(&pag->pagb_lock); /* sample the rbtree root under pagb_lock */
+ res = RB_EMPTY_ROOT(&pag->pagb_tree);
+ spin_unlock(&pag->pagb_lock);
+ return res; /* value sampled before the lock was dropped */
+}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 4a118131059f..19828c9854d1 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -67,4 +67,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
list_sort(NULL, list, xfs_extent_busy_ag_cmp);
}
+bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
+
#endif /* __XFS_EXTENT_BUSY_H__ */