summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChandan Babu R <chandanbabu@kernel.org>2024-02-24 10:25:31 +0530
committerChandan Babu R <chandanbabu@kernel.org>2024-02-24 10:25:31 +0530
commit74acb705354cf9f1edffbc30e896813ce69163a0 (patch)
tree4e49ba33705fcb79955cfb9fa7906d8eed771ab2
parentfd43925cad85332818f8e480b6b39e972ce77c40 (diff)
parent7fbaab57a80f1639add1c7d02adeb9d17bd50206 (diff)
Merge tag 'repair-refcount-scalability-6.9_2024-02-23' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.9-mergeC
xfs: reduce refcount repair memory usage The refcountbt repair code has serious memory usage problems when the block sharing factor of the filesystem is very high. This can happen if a deduplication tool has been run against the filesystem, or if the fs stores reflinked VM images that have been aging for a long time. Recall that the original reference counting algorithm walks the reverse mapping records of the filesystem to generate reference counts. For any given block in the AG, the rmap bag structure contains all the rmap records that cover that block; the refcount is the size of that bag. For online repair, the bag doesn't need the owner, offset, or state flag information, so it discards those. This halves the record size, but the bag structure still stores one excerpted record for each reverse mapping. If the sharing count is high, this will use a LOT of memory storing redundant records. In the extreme case, 100k mappings to the same piece of space will consume 100k*16 bytes = 1.6M of memory. For offline repair, the bag stores the owner values so that we know which inodes need to be marked as being reflink inodes. If a deduplication tool has been run and there are many blocks within a file pointing to the same physical space, this will still use a lot of memory to store redundant records. The solution to this problem is to deduplicate the bag records when possible by adding a reference count to the bag record, and changing the bag add function to detect an existing record to bump the refcount. In the above example, the 100k mappings will now use 24 bytes of memory. These lookups can be done efficiently with a btree, so we create a new refcount bag btree type (inside of online repair). This is why we refactored the btree code in the previous patchset. 
The btree conversion also dramatically reduces the runtime of the refcount generation algorithm, because the code to delete all bag records that end at a given agblock now only has to delete one record instead of (using the example above) 100k records. As an added benefit, record deletion now gives back the unused xfile space, which it did not do previously. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Chandan Babu R <chandanbabu@kernel.org> * tag 'repair-refcount-scalability-6.9_2024-02-23' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux: xfs: port refcount repair to the new refcount bag structure xfs: create refcount bag structure for btree repairs xfs: define an in-memory btree for storing refcount bag info during repairs
-rw-r--r--fs/xfs/Makefile2
-rw-r--r--fs/xfs/scrub/rcbag.c307
-rw-r--r--fs/xfs/scrub/rcbag.h28
-rw-r--r--fs/xfs/scrub/rcbag_btree.c370
-rw-r--r--fs/xfs/scrub/rcbag_btree.h81
-rw-r--r--fs/xfs/scrub/refcount.c12
-rw-r--r--fs/xfs/scrub/refcount_repair.c164
-rw-r--r--fs/xfs/scrub/repair.h2
-rw-r--r--fs/xfs/xfs_stats.c3
-rw-r--r--fs/xfs/xfs_stats.h1
-rw-r--r--fs/xfs/xfs_super.c10
11 files changed, 872 insertions, 108 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 6de02b2573c3..76674ad5833e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -198,6 +198,8 @@ xfs-y += $(addprefix scrub/, \
inode_repair.o \
newbt.o \
nlinks_repair.o \
+ rcbag_btree.o \
+ rcbag.o \
reap.o \
refcount_repair.o \
repair.o \
diff --git a/fs/xfs/scrub/rcbag.c b/fs/xfs/scrub/rcbag.c
new file mode 100644
index 000000000000..e1e52bc20713
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/scrub.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/rcbag.h"
+#include "scrub/trace.h"
+
+struct rcbag {
+ struct xfs_mount *mp; /* mount given to rcbag_init(); used to create btree cursors */
+ struct xfbtree xfbtree; /* in-memory btree holding the struct rcbag_rec records */
+ uint64_t nr_items; /* bumped once per rcbag_add(); reduced by rbg_refcount on removal */
+};
+
+/*
+ * Allocate an empty refcount bag, initialize its in-memory btree against
+ * @btp, and pass the new bag back through @bagp.
+ *
+ * Returns 0 on success or a negative errno.  On failure the bag allocation is
+ * released and @bagp is not written.
+ */
+int
+rcbag_init(
+	struct xfs_mount	*mp,
+	struct xfs_buftarg	*btp,
+	struct rcbag		**bagp)
+{
+	struct rcbag		*rcb;
+	int			ret;
+
+	rcb = kzalloc(sizeof(*rcb), XCHK_GFP_FLAGS);
+	if (!rcb)
+		return -ENOMEM;
+
+	rcb->mp = mp;
+	rcb->nr_items = 0;
+
+	ret = rcbagbt_mem_init(mp, &rcb->xfbtree, btp);
+	if (ret) {
+		kfree(rcb);
+		return ret;
+	}
+
+	*bagp = rcb;
+	return 0;
+}
+
+/*
+ * Destroy the bag's btree, free the bag itself, and reset the caller's
+ * pointer to NULL.  @bagp must point at a valid bag.
+ */
+void
+rcbag_free(
+	struct rcbag	**bagp)
+{
+	xfbtree_destroy(&(*bagp)->xfbtree);
+	kfree(*bagp);
+	*bagp = NULL;
+}
+
+/*
+ * Track an rmap in the refcount bag.
+ *
+ * Looks up the bag record whose startblock and blockcount equal those of
+ * @rmap.  If one exists, its rbg_refcount is bumped in place; otherwise a new
+ * record with a refcount of 1 is inserted.  The btree change is committed
+ * with xfbtree_trans_commit() before nr_items is incremented, and is
+ * cancelled with xfbtree_trans_cancel() if the lookup, update, or insert
+ * fails.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int
+rcbag_add(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ const struct xfs_rmap_irec *rmap)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ int has;
+ int error;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = rcbagbt_lookup_eq(cur, rmap, &has);
+ if (error)
+ goto out_cur;
+
+ if (has) { /* same extent already tracked: bump its refcount */
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) { /* lookup found a record but it could not be read back */
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ bagrec.rbg_refcount++;
+ error = rcbagbt_update(cur, &bagrec);
+ if (error)
+ goto out_cur;
+ } else { /* first rmap with this (startblock, blockcount) */
+ bagrec.rbg_startblock = rmap->rm_startblock;
+ bagrec.rbg_blockcount = rmap->rm_blockcount;
+ bagrec.rbg_refcount = 1;
+
+ error = rcbagbt_insert(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) { /* insert reported that nothing was added */
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+
+ error = xfbtree_trans_commit(&bag->xfbtree, tp);
+ if (error)
+ return error;
+
+ bag->nr_items++; /* counted only once the btree change has been committed */
+ return 0;
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ xfbtree_trans_cancel(&bag->xfbtree, tp);
+ return error;
+}
+
+/*
+ * Return the bag's item count.  This is the number of rmaps added and not yet
+ * removed; it can exceed the number of btree records because rmaps covering
+ * the same extent share one record.
+ */
+uint64_t
+rcbag_count(
+	const struct rcbag	*rcbag)
+{
+	const uint64_t		nr = rcbag->nr_items;
+
+	return nr;
+}
+
+/* Compute the first block past the end of the extent described by @r. */
+static inline uint32_t rcbag_rec_next_bno(const struct rcbag_rec *r)
+{
+	uint32_t	end = r->rbg_startblock;
+
+	end += r->rbg_blockcount;
+	return end;
+}
+
+/*
+ * Find the next block where the refcount changes, given the next rmap we
+ * looked at and the ones we're already tracking.
+ *
+ * The candidate is the smallest of next_rmap->rm_startblock (only considered
+ * when @next_valid is true) and the end block of every record currently in
+ * the bag.  On success the result is stored in @next_bnop and 0 is returned.
+ * Returns -EFSCORRUPTED if no candidate was found, or a negative errno from
+ * the btree walk.
+ */
+int
+rcbag_next_edge(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ const struct xfs_rmap_irec *next_rmap,
+ bool next_valid,
+ uint32_t *next_bnop)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ uint32_t next_bno = NULLAGBLOCK;
+ int has;
+ int error;
+
+ if (next_valid)
+ next_bno = next_rmap->rm_startblock;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ goto out_cur;
+
+ while (true) { /* visit every bag record, left to right */
+ error = xfs_btree_increment(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ next_bno = min(next_bno, rcbag_rec_next_bno(&bagrec));
+ }
+
+ /*
+ * We should have found /something/ because either next_rmap is the
+ * next interesting rmap to look at after emitting this refcount extent,
+ * or there are other rmaps in the bag contributing to the current
+ * sharing count. But if something is seriously wrong, bail out.
+ */
+ if (next_bno == NULLAGBLOCK) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+
+ *next_bnop = next_bno;
+ return 0;
+
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Pop all refcount bag records that end at next_bno.
+ *
+ * Walks the btree from its right edge towards the left and deletes every
+ * record whose startblock + blockcount equals @next_bno.  For each deleted
+ * record, nr_items is reduced by that record's rbg_refcount; note that this
+ * happens before the final xfbtree_trans_commit().  On any error the btree
+ * change is cancelled with xfbtree_trans_cancel().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int
+rcbag_remove_ending_at(
+ struct rcbag *bag,
+ struct xfs_trans *tp,
+ uint32_t next_bno)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ int has;
+ int error;
+
+ /*
+ * go to the right edge of the tree: an all-ones search record compares
+ * >= every (startblock, blockcount) key, so the GE lookup is expected
+ * to position the cursor after the last record
+ */
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ memset(&cur->bc_rec, 0xFF, sizeof(cur->bc_rec));
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &has);
+ if (error)
+ goto out_cur;
+
+ while (true) { /* step leftwards until the cursor runs off the tree */
+ error = xfs_btree_decrement(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ if (rcbag_rec_next_bno(&bagrec) != next_bno)
+ continue;
+
+ error = xfs_btree_delete(cur, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ bag->nr_items -= bagrec.rbg_refcount; /* one record stands for rbg_refcount rmaps */
+ }
+
+ xfs_btree_del_cursor(cur, 0);
+ return xfbtree_trans_commit(&bag->xfbtree, tp);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ xfbtree_trans_cancel(&bag->xfbtree, tp);
+ return error;
+}
+
+/*
+ * Dump the rcbag.
+ *
+ * Walks the btree from the left edge and logs each record (running index,
+ * startblock, blockcount, refcount) through xfs_err().  The function returns
+ * void, so a walk error simply ends the dump early; the error is only passed
+ * to xfs_btree_del_cursor().
+ */
+void
+rcbag_dump(
+ struct rcbag *bag,
+ struct xfs_trans *tp)
+{
+ struct rcbag_rec bagrec;
+ struct xfs_mount *mp = bag->mp;
+ struct xfs_btree_cur *cur;
+ unsigned long long nr = 0; /* running index printed in front of each record */
+ int has;
+ int error;
+
+ cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+ error = xfs_btree_goto_left_edge(cur);
+ if (error)
+ goto out_cur;
+
+ while (true) {
+ error = xfs_btree_increment(cur, 0, &has);
+ if (error)
+ goto out_cur;
+ if (!has)
+ break;
+
+ error = rcbagbt_get_rec(cur, &bagrec, &has);
+ if (error)
+ goto out_cur;
+ if (!has) {
+ error = -EFSCORRUPTED;
+ goto out_cur;
+ }
+
+ xfs_err(bag->mp, "[%llu]: bno 0x%x fsbcount 0x%x refcount 0x%llx\n",
+ nr++,
+ (unsigned int)bagrec.rbg_startblock,
+ (unsigned int)bagrec.rbg_blockcount,
+ (unsigned long long)bagrec.rbg_refcount);
+ }
+
+out_cur: /* reached on both the normal and the error path */
+ xfs_btree_del_cursor(cur, error);
+}
diff --git a/fs/xfs/scrub/rcbag.h b/fs/xfs/scrub/rcbag.h
new file mode 100644
index 000000000000..e29ef788ba72
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_H__
+#define __XFS_SCRUB_RCBAG_H__
+
+struct xfs_mount;
+struct rcbag;
+struct xfs_buftarg;
+
+int rcbag_init(struct xfs_mount *mp, struct xfs_buftarg *btp,
+ struct rcbag **bagp);
+void rcbag_free(struct rcbag **bagp);
+int rcbag_add(struct rcbag *bag, struct xfs_trans *tp,
+ const struct xfs_rmap_irec *rmap);
+uint64_t rcbag_count(const struct rcbag *bag);
+
+int rcbag_next_edge(struct rcbag *bag, struct xfs_trans *tp,
+ const struct xfs_rmap_irec *next_rmap, bool next_valid,
+ uint32_t *next_bnop);
+int rcbag_remove_ending_at(struct rcbag *bag, struct xfs_trans *tp,
+ uint32_t next_bno);
+
+void rcbag_dump(struct rcbag *bag, struct xfs_trans *tp);
+
+#endif /* __XFS_SCRUB_RCBAG_H__ */
diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c
new file mode 100644
index 000000000000..709356dc6256
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/trace.h"
+
+static struct kmem_cache *rcbagbt_cur_cache; /* cursor slab; set up by rcbagbt_init_cur_cache() */
+
+/* Build the btree key (startblock, blockcount) for a given bag record. */
+STATIC void
+rcbagbt_init_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	const struct rcbag_rec		*src = (const struct rcbag_rec *)rec;
+	struct rcbag_key		*dst = (struct rcbag_key *)key;
+
+	/* Both overlays must fit inside the generic btree unions. */
+	BUILD_BUG_ON(sizeof(struct rcbag_rec) > sizeof(union xfs_btree_rec));
+	BUILD_BUG_ON(sizeof(struct rcbag_key) > sizeof(union xfs_btree_key));
+
+	dst->rbg_blockcount = src->rbg_blockcount;
+	dst->rbg_startblock = src->rbg_startblock;
+}
+
+/* Copy the bag record staged in the cursor (cur->bc_rec) into @rec. */
+STATIC void
+rcbagbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	const struct rcbag_rec	*staged = (struct rcbag_rec *)&cur->bc_rec;
+	struct rcbag_rec	*out = (struct rcbag_rec *)rec;
+
+	out->rbg_refcount = staged->rbg_refcount;
+	out->rbg_blockcount = staged->rbg_blockcount;
+	out->rbg_startblock = staged->rbg_startblock;
+}
+
+/*
+ * Compare @key against the search record held in the cursor, ordering by
+ * startblock and then by blockcount.  Returns 1 if @key sorts after the
+ * cursor's record, -1 if it sorts before, and 0 if both fields match.
+ */
+STATIC int64_t
+rcbagbt_key_diff(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key)
+{
+	const struct rcbag_key		*k = (const struct rcbag_key *)key;
+	struct rcbag_rec		*want = (struct rcbag_rec *)&cur->bc_rec;
+
+	if (k->rbg_startblock != want->rbg_startblock)
+		return k->rbg_startblock > want->rbg_startblock ? 1 : -1;
+
+	if (k->rbg_blockcount != want->rbg_blockcount)
+		return k->rbg_blockcount > want->rbg_blockcount ? 1 : -1;
+
+	return 0;
+}
+
+/*
+ * Three-way comparison of two keys by startblock, then blockcount: returns 1
+ * if @k1 sorts after @k2, -1 if before, 0 if equal.  Key masks are not
+ * supported, so @mask must be NULL.
+ */
+STATIC int64_t
+rcbagbt_diff_two_keys(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
+{
+	const struct rcbag_key		*a = (const struct rcbag_key *)k1;
+	const struct rcbag_key		*b = (const struct rcbag_key *)k2;
+
+	ASSERT(mask == NULL);
+
+	if (a->rbg_startblock != b->rbg_startblock)
+		return a->rbg_startblock > b->rbg_startblock ? 1 : -1;
+
+	if (a->rbg_blockcount != b->rbg_blockcount)
+		return a->rbg_blockcount > b->rbg_blockcount ? 1 : -1;
+
+	return 0;
+}
+
+/*
+ * Return 1 only if @k1 sorts strictly before @k2 (by startblock, then
+ * blockcount); equal keys are reported as out of order.
+ */
+STATIC int
+rcbagbt_keys_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2)
+{
+	const struct rcbag_key		*a = (const struct rcbag_key *)k1;
+	const struct rcbag_key		*b = (const struct rcbag_key *)k2;
+
+	if (a->rbg_startblock != b->rbg_startblock)
+		return a->rbg_startblock < b->rbg_startblock;
+
+	return a->rbg_blockcount < b->rbg_blockcount;
+}
+
+/*
+ * Return 1 only if @r1 sorts strictly before @r2 (by startblock, then
+ * blockcount); records with equal keys are reported as out of order.
+ */
+STATIC int
+rcbagbt_recs_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_rec	*r1,
+	const union xfs_btree_rec	*r2)
+{
+	const struct rcbag_rec		*a = (const struct rcbag_rec *)r1;
+	const struct rcbag_rec		*b = (const struct rcbag_rec *)r2;
+
+	if (a->rbg_startblock != b->rbg_startblock)
+		return a->rbg_startblock < b->rbg_startblock;
+
+	return a->rbg_blockcount < b->rbg_blockcount;
+}
+
+/*
+ * Structure check for one rcbag btree block: magic number, long-format v5
+ * header, tree level within rcbagbt_maxlevels_possible(), and finally the
+ * generic in-memory block check against this level's record limit.  Returns
+ * NULL if the block looks fine, or the address of the failing check.
+ */
+static xfs_failaddr_t
+rcbagbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_btree_block	*blk = XFS_BUF_TO_BLOCK(bp);
+	xfs_failaddr_t		fa;
+	unsigned int		lvl;
+
+	if (!xfs_verify_magic(bp, blk->bb_magic))
+		return __this_address;
+
+	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+	if (fa)
+		return fa;
+
+	lvl = be16_to_cpu(blk->bb_level);
+	if (lvl >= rcbagbt_maxlevels_possible())
+		return __this_address;
+
+	return xfs_btree_memblock_verify(bp,
+			rcbagbt_maxrecs(bp->b_mount, XFBNO_BLOCKSIZE,
+					lvl == 0));
+}
+
+/*
+ * Read/write verifier: run the structure check and report any failure on the
+ * buffer as -EFSCORRUPTED.
+ */
+static void
+rcbagbt_rw_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa;
+
+	fa = rcbagbt_verify(bp);
+	if (!fa)
+		return;
+
+	xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/*
+ * Buffer ops for in-memory rcbag btree blocks.  CRC checks are skipped on
+ * in-memory btrees to save time, so reads and writes share one verifier.
+ */
+static const struct xfs_buf_ops rcbagbt_mem_buf_ops = {
+	.name			= "rcbagbt_mem",
+	.magic			= { 0, cpu_to_be32(RCBAG_MAGIC) },
+	.verify_struct		= rcbagbt_verify,
+	.verify_read		= rcbagbt_rw_verify,
+	.verify_write		= rcbagbt_rw_verify,
+};
+
+/*
+ * Btree operations for the in-memory refcount bag btree.  Block management
+ * is delegated to the generic xfbtree helpers; only record/key handling and
+ * ordering are specific to the rcbag.
+ */
+static const struct xfs_btree_ops rcbagbt_mem_ops = {
+	.name			= "rcbag",
+	.type			= XFS_BTREE_TYPE_MEM,
+
+	/* on-"disk" geometry */
+	.rec_len		= sizeof(struct rcbag_rec),
+	.key_len		= sizeof(struct rcbag_key),
+	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,
+
+	.lru_refs		= 1,
+	.statoff		= XFS_STATS_CALC_INDEX(xs_rcbag_2),
+	.buf_ops		= &rcbagbt_mem_buf_ops,
+
+	/* generic in-memory btree plumbing */
+	.dup_cursor		= xfbtree_dup_cursor,
+	.set_root		= xfbtree_set_root,
+	.alloc_block		= xfbtree_alloc_block,
+	.free_block		= xfbtree_free_block,
+	.get_minrecs		= xfbtree_get_minrecs,
+	.get_maxrecs		= xfbtree_get_maxrecs,
+	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
+
+	/* rcbag-specific record and key handling */
+	.init_key_from_rec	= rcbagbt_init_key_from_rec,
+	.init_rec_from_cur	= rcbagbt_init_rec_from_cur,
+	.key_diff		= rcbagbt_key_diff,
+	.diff_two_keys		= rcbagbt_diff_two_keys,
+	.keys_inorder		= rcbagbt_keys_inorder,
+	.recs_inorder		= rcbagbt_recs_inorder,
+};
+
+/*
+ * Allocate a cursor for the in-memory rcbag btree @xfbtree, taking the
+ * current tree height from the xfbtree.
+ */
+struct xfs_btree_cur *
+rcbagbt_mem_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfbtree		*xfbtree)
+{
+	struct xfs_btree_cur	*c;
+
+	c = xfs_btree_alloc_cursor(mp, tp, &rcbagbt_mem_ops,
+			rcbagbt_maxlevels_possible(), rcbagbt_cur_cache);
+
+	c->bc_nlevels = xfbtree->nlevels;
+	c->bc_mem.xfbtree = xfbtree;
+	return c;
+}
+
+/*
+ * Set up @xfbt as an empty in-memory refcount bag btree backed by @btp.  The
+ * owner is set to zero.  Returns 0 or a negative errno from xfbtree_init().
+ */
+int
+rcbagbt_mem_init(
+	struct xfs_mount	*mp,
+	struct xfbtree		*xfbt,
+	struct xfs_buftarg	*btp)
+{
+	int			error;
+
+	xfbt->owner = 0;
+	error = xfbtree_init(mp, xfbt, btp, &rcbagbt_mem_ops);
+	return error;
+}
+
+/*
+ * How many entries fit in @blocklen bytes of block payload: whole records in
+ * a leaf, or key + pointer pairs in a node block.
+ */
+static inline unsigned int
+rcbagbt_block_maxrecs(
+	unsigned int	blocklen,
+	bool		leaf)
+{
+	size_t		entry_len;
+
+	if (leaf)
+		entry_len = sizeof(struct rcbag_rec);
+	else
+		entry_len = sizeof(struct rcbag_key) + sizeof(rcbag_ptr_t);
+
+	return blocklen / entry_len;
+}
+
+/*
+ * Calculate the number of records in a refcount bag btree block of
+ * @blocklen bytes, after setting aside RCBAG_BLOCK_LEN bytes for the block
+ * header.  @mp is not used.
+ */
+unsigned int
+rcbagbt_maxrecs(
+	struct xfs_mount	*mp,
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	return rcbagbt_block_maxrecs(blocklen - RCBAG_BLOCK_LEN, leaf);
+}
+
+/*
+ * Compute the max possible height for refcount bag btrees, assuming every
+ * block is only half full and the tree indexes ULLONG_MAX records.
+ */
+unsigned int
+rcbagbt_maxlevels_possible(void)
+{
+	const unsigned int	payload = XFBNO_BLOCKSIZE -
+					  XFS_BTREE_LBLOCK_CRC_LEN;
+	unsigned int		minrecs[2] = {
+		rcbagbt_block_maxrecs(payload, true) / 2,
+		rcbagbt_block_maxrecs(payload, false) / 2,
+	};
+
+	return xfs_btree_space_to_height(minrecs, ULLONG_MAX);
+}
+
+/*
+ * Calculate the refcount bag btree size needed for @nr_records records,
+ * assuming every block is only half full.
+ */
+unsigned long long
+rcbagbt_calc_size(
+	unsigned long long	nr_records)
+{
+	const unsigned int	payload = XFBNO_BLOCKSIZE -
+					  XFS_BTREE_LBLOCK_CRC_LEN;
+	unsigned int		minrecs[2] = {
+		rcbagbt_block_maxrecs(payload, true) / 2,
+		rcbagbt_block_maxrecs(payload, false) / 2,
+	};
+
+	return xfs_btree_calc_size(minrecs, nr_records);
+}
+
+/*
+ * Create the slab cache for rcbag btree cursors, sized for the tallest
+ * possible tree.  Returns 0 or -ENOMEM.
+ */
+int __init
+rcbagbt_init_cur_cache(void)
+{
+	rcbagbt_cur_cache = kmem_cache_create("xfs_rcbagbt_cur",
+			xfs_btree_cur_sizeof(rcbagbt_maxlevels_possible()),
+			0, 0, NULL);
+
+	return rcbagbt_cur_cache ? 0 : -ENOMEM;
+}
+
+/* Destroy the rcbag btree cursor cache and forget the stale pointer. */
+void
+rcbagbt_destroy_cur_cache(void)
+{
+	struct kmem_cache	*cache = rcbagbt_cur_cache;
+
+	kmem_cache_destroy(cache);
+	rcbagbt_cur_cache = NULL;
+}
+
+/*
+ * Look up the refcount bag record whose startblock and blockcount match this
+ * reverse mapping.  *@success is set by xfs_btree_lookup().
+ */
+int
+rcbagbt_lookup_eq(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rmap,
+	int				*success)
+{
+	struct rcbag_rec		*want = (struct rcbag_rec *)&cur->bc_rec;
+
+	/* Only the two key fields are staged; the refcount is not set. */
+	want->rbg_blockcount = rmap->rm_blockcount;
+	want->rbg_startblock = rmap->rm_startblock;
+
+	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, success);
+}
+
+/*
+ * Copy the record the cursor points at into @rec.  If the cursor does not
+ * point at a record, *@has is zero and @rec is left untouched.
+ */
+int
+rcbagbt_get_rec(
+	struct xfs_btree_cur	*cur,
+	struct rcbag_rec	*rec,
+	int			*has)
+{
+	union xfs_btree_rec	*raw;
+	int			error;
+
+	error = xfs_btree_get_rec(cur, &raw, has);
+	if (error)
+		return error;
+	if (!*has)
+		return 0;
+
+	memcpy(rec, raw, sizeof(*rec));
+	return 0;
+}
+
+/* Overwrite the record the cursor points at with the contents of @rec. */
+int
+rcbagbt_update(
+	struct xfs_btree_cur	*cur,
+	const struct rcbag_rec	*rec)
+{
+	union xfs_btree_rec	raw;
+
+	memcpy(&raw, rec, sizeof(*rec));
+	return xfs_btree_update(cur, &raw);
+}
+
+/*
+ * Insert @rec into the btree at the cursor's current position.  The record is
+ * staged in cur->bc_rec and handed to xfs_btree_insert(), which reports
+ * through *@success whether a record was added.
+ */
+int
+rcbagbt_insert(
+ struct xfs_btree_cur *cur,
+ const struct rcbag_rec *rec,
+ int *success)
+{
+ struct rcbag_rec *btrec = (struct rcbag_rec *)&cur->bc_rec;
+
+ memcpy(btrec, rec, sizeof(struct rcbag_rec));
+ return xfs_btree_insert(cur, success);
+}
diff --git a/fs/xfs/scrub/rcbag_btree.h b/fs/xfs/scrub/rcbag_btree.h
new file mode 100644
index 000000000000..03cadb032552
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.h
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_BTREE_H__
+#define __XFS_SCRUB_RCBAG_BTREE_H__
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+#define RCBAG_MAGIC 0x74826671 /* decimal ASCII codes of 'JRBG' (74 82 66 71) written as hex digits */
+
+struct rcbag_key {
+ uint32_t rbg_startblock; /* first block of the tracked extent; primary sort field */
+ uint32_t rbg_blockcount; /* length of the tracked extent; secondary sort field */
+};
+
+struct rcbag_rec {
+ uint32_t rbg_startblock; /* copied from the rmap's rm_startblock */
+ uint32_t rbg_blockcount; /* copied from the rmap's rm_blockcount */
+ uint64_t rbg_refcount; /* number of rmaps added with this exact extent */
+};
+
+typedef __be64 rcbag_ptr_t;
+
+/* reflinks only exist on crc enabled filesystems */
+#define RCBAG_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * All three take a 1-based @index and skip RCBAG_BLOCK_LEN bytes of block
+ * header.  Records and keys are packed directly after the header; pointers
+ * start after room for @maxrecs keys.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define RCBAG_REC_ADDR(block, index) \
+ ((struct rcbag_rec *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ (((index) - 1) * sizeof(struct rcbag_rec))))
+
+#define RCBAG_KEY_ADDR(block, index) \
+ ((struct rcbag_key *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ ((index) - 1) * sizeof(struct rcbag_key)))
+
+#define RCBAG_PTR_ADDR(block, index, maxrecs) \
+ ((rcbag_ptr_t *) \
+ ((char *)(block) + RCBAG_BLOCK_LEN + \
+ (maxrecs) * sizeof(struct rcbag_key) + \
+ ((index) - 1) * sizeof(rcbag_ptr_t)))
+
+unsigned int rcbagbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
+
+unsigned long long rcbagbt_calc_size(unsigned long long nr_records);
+
+unsigned int rcbagbt_maxlevels_possible(void);
+
+int __init rcbagbt_init_cur_cache(void);
+void rcbagbt_destroy_cur_cache(void);
+
+struct xfs_btree_cur *rcbagbt_mem_cursor(struct xfs_mount *mp,
+ struct xfs_trans *tp, struct xfbtree *xfbtree);
+int rcbagbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+ struct xfs_buftarg *btp);
+
+int rcbagbt_lookup_eq(struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rmap, int *success);
+int rcbagbt_get_rec(struct xfs_btree_cur *cur, struct rcbag_rec *rec, int *has);
+int rcbagbt_update(struct xfs_btree_cur *cur, const struct rcbag_rec *rec);
+int rcbagbt_insert(struct xfs_btree_cur *cur, const struct rcbag_rec *rec,
+ int *success);
+
+#else
+# define rcbagbt_init_cur_cache() 0
+# define rcbagbt_destroy_cur_cache() ((void)0)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+#endif /* __XFS_SCRUB_RCBAG_BTREE_H__ */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index bf22f245bbfa..d0c7d4a29c0f 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -7,8 +7,10 @@
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
@@ -17,6 +19,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub reference count btrees.
@@ -27,6 +30,15 @@ xchk_setup_ag_refcountbt(
{
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+ if (xchk_could_repair(sc)) {
+ int error;
+
+ error = xrep_setup_ag_refcountbt(sc);
+ if (error)
+ return error;
+ }
+
return xchk_setup_ag_btree(sc, false);
}
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
index 8240c993061b..a00d7ce7ae5b 100644
--- a/fs/xfs/scrub/refcount_repair.c
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -38,6 +38,7 @@
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"
+#include "scrub/rcbag.h"
/*
* Rebuilding the Reference Count Btree
@@ -98,12 +99,6 @@
* insert all the records.
*/
-/* The only parts of the rmap that we care about for computing refcounts. */
-struct xrep_refc_rmap {
- xfs_agblock_t startblock;
- xfs_extlen_t blockcount;
-} __packed;
-
struct xrep_refc {
/* refcount extents */
struct xfarray *refcount_records;
@@ -123,6 +118,20 @@ struct xrep_refc {
xfs_extlen_t btblocks;
};
+/*
+ * Set us up to repair refcount btrees: build the description string for the
+ * rmap record bag, set up the in-memory btree infrastructure under that
+ * name, and free the string again.  Returns the result of
+ * xrep_setup_xfbtree().
+ */
+int
+xrep_setup_ag_refcountbt(
+	struct xfs_scrub	*sc)
+{
+	char			*name = xchk_xfile_ag_descr(sc, "rmap record bag");
+	int			ret = xrep_setup_xfbtree(sc, name);
+
+	kfree(name);
+	return ret;
+}
+
/* Check for any obvious conflicts with this shared/CoW staging extent. */
STATIC int
xrep_refc_check_ext(
@@ -224,10 +233,9 @@ xrep_refc_rmap_shareable(
STATIC int
xrep_refc_walk_rmaps(
struct xrep_refc *rr,
- struct xrep_refc_rmap *rrm,
+ struct xfs_rmap_irec *rmap,
bool *have_rec)
{
- struct xfs_rmap_irec rmap;
struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur;
struct xfs_mount *mp = cur->bc_mp;
int have_gt;
@@ -251,7 +259,7 @@ xrep_refc_walk_rmaps(
if (!have_gt)
return 0;
- error = xfs_rmap_get_rec(cur, &rmap, &have_gt);
+ error = xfs_rmap_get_rec(cur, rmap, &have_gt);
if (error)
return error;
if (XFS_IS_CORRUPT(mp, !have_gt)) {
@@ -259,23 +267,22 @@ xrep_refc_walk_rmaps(
return -EFSCORRUPTED;
}
- if (rmap.rm_owner == XFS_RMAP_OWN_COW) {
- error = xrep_refc_stash_cow(rr, rmap.rm_startblock,
- rmap.rm_blockcount);
+ if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
+ error = xrep_refc_stash_cow(rr, rmap->rm_startblock,
+ rmap->rm_blockcount);
if (error)
return error;
- } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) {
+ } else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) {
/* refcountbt block, dump it when we're done. */
- rr->btblocks += rmap.rm_blockcount;
+ rr->btblocks += rmap->rm_blockcount;
error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
- rmap.rm_startblock, rmap.rm_blockcount);
+ rmap->rm_startblock,
+ rmap->rm_blockcount);
if (error)
return error;
}
- } while (!xrep_refc_rmap_shareable(mp, &rmap));
+ } while (!xrep_refc_rmap_shareable(mp, rmap));
- rrm->startblock = rmap.rm_startblock;
- rrm->blockcount = rmap.rm_blockcount;
*have_rec = true;
return 0;
}
@@ -357,45 +364,6 @@ xrep_refc_sort_records(
return error;
}
-#define RRM_NEXT(r) ((r).startblock + (r).blockcount)
-/*
- * Find the next block where the refcount changes, given the next rmap we
- * looked at and the ones we're already tracking.
- */
-static inline int
-xrep_refc_next_edge(
- struct xfarray *rmap_bag,
- struct xrep_refc_rmap *next_rrm,
- bool next_valid,
- xfs_agblock_t *nbnop)
-{
- struct xrep_refc_rmap rrm;
- xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
- xfs_agblock_t nbno = NULLAGBLOCK;
- int error;
-
- if (next_valid)
- nbno = next_rrm->startblock;
-
- while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1)
- nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm));
-
- if (error)
- return error;
-
- /*
- * We should have found /something/ because either next_rrm is the next
- * interesting rmap to look at after emitting this refcount extent, or
- * there are other rmaps in rmap_bag contributing to the current
- * sharing count. But if something is seriously wrong, bail out.
- */
- if (nbno == NULLAGBLOCK)
- return -EFSCORRUPTED;
-
- *nbnop = nbno;
- return 0;
-}
-
/*
* Walk forward through the rmap btree to collect all rmaps starting at
* @bno in @rmap_bag. These represent the file(s) that share ownership of
@@ -405,22 +373,21 @@ xrep_refc_next_edge(
static int
xrep_refc_push_rmaps_at(
struct xrep_refc *rr,
- struct xfarray *rmap_bag,
+ struct rcbag *rcstack,
xfs_agblock_t bno,
- struct xrep_refc_rmap *rrm,
- bool *have,
- uint64_t *stack_sz)
+ struct xfs_rmap_irec *rmap,
+ bool *have)
{
struct xfs_scrub *sc = rr->sc;
int have_gt;
int error;
- while (*have && rrm->startblock == bno) {
- error = xfarray_store_anywhere(rmap_bag, rrm);
+ while (*have && rmap->rm_startblock == bno) {
+ error = rcbag_add(rcstack, rr->sc->tp, rmap);
if (error)
return error;
- (*stack_sz)++;
- error = xrep_refc_walk_rmaps(rr, rrm, have);
+
+ error = xrep_refc_walk_rmaps(rr, rmap, have);
if (error)
return error;
}
@@ -441,12 +408,9 @@ STATIC int
xrep_refc_find_refcounts(
struct xrep_refc *rr)
{
- struct xrep_refc_rmap rrm;
struct xfs_scrub *sc = rr->sc;
- struct xfarray *rmap_bag;
- char *descr;
- uint64_t old_stack_sz;
- uint64_t stack_sz = 0;
+ struct rcbag *rcstack;
+ uint64_t old_stack_height;
xfs_agblock_t sbno;
xfs_agblock_t cbno;
xfs_agblock_t nbno;
@@ -456,14 +420,11 @@ xrep_refc_find_refcounts(
xrep_ag_btcur_init(sc, &sc->sa);
/*
- * Set up a sparse array to store all the rmap records that we're
- * tracking to generate a reference count record. If this exceeds
+ * Set up a bag to store all the rmap records that we're tracking to
+ * generate a reference count record. If the size of the bag exceeds
* MAXREFCOUNT, we clamp rc_refcount.
*/
- descr = xchk_xfile_ag_descr(sc, "rmap record bag");
- error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap),
- &rmap_bag);
- kfree(descr);
+ error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
if (error)
goto out_cur;
@@ -474,62 +435,54 @@ xrep_refc_find_refcounts(
/* Process reverse mappings into refcount data. */
while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
+ struct xfs_rmap_irec rmap;
+
/* Push all rmaps with pblk == sbno onto the stack */
- error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ error = xrep_refc_walk_rmaps(rr, &rmap, &have);
if (error)
goto out_bag;
if (!have)
break;
- sbno = cbno = rrm.startblock;
- error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno,
- &rrm, &have, &stack_sz);
+ sbno = cbno = rmap.rm_startblock;
+ error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap,
+ &have);
if (error)
goto out_bag;
/* Set nbno to the bno of the next refcount change */
- error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno);
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
if (error)
goto out_bag;
ASSERT(nbno > sbno);
- old_stack_sz = stack_sz;
+ old_stack_height = rcbag_count(rcstack);
/* While stack isn't empty... */
- while (stack_sz) {
- xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
-
+ while (rcbag_count(rcstack) > 0) {
/* Pop all rmaps that end at nbno */
- while ((error = xfarray_iter(rmap_bag, &array_cur,
- &rrm)) == 1) {
- if (RRM_NEXT(rrm) != nbno)
- continue;
- error = xfarray_unset(rmap_bag, array_cur - 1);
- if (error)
- goto out_bag;
- stack_sz--;
- }
+ error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
if (error)
goto out_bag;
/* Push array items that start at nbno */
- error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ error = xrep_refc_walk_rmaps(rr, &rmap, &have);
if (error)
goto out_bag;
if (have) {
- error = xrep_refc_push_rmaps_at(rr, rmap_bag,
- nbno, &rrm, &have, &stack_sz);
+ error = xrep_refc_push_rmaps_at(rr, rcstack,
+ nbno, &rmap, &have);
if (error)
goto out_bag;
}
/* Emit refcount if necessary */
ASSERT(nbno > cbno);
- if (stack_sz != old_stack_sz) {
- if (old_stack_sz > 1) {
+ if (rcbag_count(rcstack) != old_stack_height) {
+ if (old_stack_height > 1) {
error = xrep_refc_stash(rr,
XFS_REFC_DOMAIN_SHARED,
cbno, nbno - cbno,
- old_stack_sz);
+ old_stack_height);
if (error)
goto out_bag;
}
@@ -537,13 +490,13 @@ xrep_refc_find_refcounts(
}
/* Stack empty, go find the next rmap */
- if (stack_sz == 0)
+ if (rcbag_count(rcstack) == 0)
break;
- old_stack_sz = stack_sz;
+ old_stack_height = rcbag_count(rcstack);
sbno = nbno;
/* Set nbno to the bno of the next refcount change */
- error = xrep_refc_next_edge(rmap_bag, &rrm, have,
+ error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
&nbno);
if (error)
goto out_bag;
@@ -552,14 +505,13 @@ xrep_refc_find_refcounts(
}
}
- ASSERT(stack_sz == 0);
+ ASSERT(rcbag_count(rcstack) == 0);
out_bag:
- xfarray_destroy(rmap_bag);
+ rcbag_free(&rcstack);
out_cur:
xchk_ag_btcur_free(&sc->sa);
return error;
}
-#undef RRM_NEXT
/* Retrieve refcountbt data for bulk load. */
STATIC int
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index dd1c89e8714c..ce082d941459 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -89,6 +89,7 @@ int xrep_reset_perag_resv(struct xfs_scrub *sc);
int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten);
int xrep_metadata_inode_forks(struct xfs_scrub *sc);
int xrep_setup_ag_rmapbt(struct xfs_scrub *sc);
+int xrep_setup_ag_refcountbt(struct xfs_scrub *sc);
/* Repair setup functions */
int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -186,6 +187,7 @@ xrep_setup_nothing(
}
#define xrep_setup_ag_allocbt xrep_setup_nothing
#define xrep_setup_ag_rmapbt xrep_setup_nothing
+#define xrep_setup_ag_refcountbt xrep_setup_nothing
#define xrep_setup_inode(sc, imap) ((void)0)
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 5c6773628d69..ed97d72caa66 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -51,7 +51,8 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{ "fibt2", xfsstats_offset(xs_rmap_2) },
{ "rmapbt", xfsstats_offset(xs_refcbt_2) },
{ "refcntbt", xfsstats_offset(xs_rmap_mem_2) },
- { "rmapbt_mem", xfsstats_offset(xs_qm_dqreclaims)},
+ { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) },
+ { "rcbagbt", xfsstats_offset(xs_qm_dqreclaims)},
/* we print both series of quota information together */
{ "qm", xfsstats_offset(xs_xstrat_bytes)},
};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 3b50419d8bb9..a61fb56ed2e6 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -126,6 +126,7 @@ struct __xfsstats {
uint32_t xs_rmap_2[__XBTS_MAX];
uint32_t xs_refcbt_2[__XBTS_MAX];
uint32_t xs_rmap_mem_2[__XBTS_MAX];
+ uint32_t xs_rcbag_2[__XBTS_MAX];
uint32_t xs_qm_dqreclaims;
uint32_t xs_qm_dqreclaim_misses;
uint32_t xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 74e87ed5eee1..6828c48b15e9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -44,6 +44,7 @@
#include "xfs_dahash_test.h"
#include "xfs_rtbitmap.h"
#include "scrub/stats.h"
+#include "scrub/rcbag_btree.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -2060,10 +2061,14 @@ xfs_init_caches(void)
if (error)
goto out_destroy_log_ticket_cache;
- error = xfs_defer_init_item_caches();
+ error = rcbagbt_init_cur_cache();
if (error)
goto out_destroy_btree_cur_cache;
+ error = xfs_defer_init_item_caches();
+ if (error)
+ goto out_destroy_rcbagbt_cur_cache;
+
xfs_da_state_cache = kmem_cache_create("xfs_da_state",
sizeof(struct xfs_da_state),
0, 0, NULL);
@@ -2220,6 +2225,8 @@ xfs_init_caches(void)
kmem_cache_destroy(xfs_da_state_cache);
out_destroy_defer_item_cache:
xfs_defer_destroy_item_caches();
+ out_destroy_rcbagbt_cur_cache:
+ rcbagbt_destroy_cur_cache();
out_destroy_btree_cur_cache:
xfs_btree_destroy_cur_caches();
out_destroy_log_ticket_cache:
@@ -2257,6 +2264,7 @@ xfs_destroy_caches(void)
kmem_cache_destroy(xfs_ifork_cache);
kmem_cache_destroy(xfs_da_state_cache);
xfs_defer_destroy_item_caches();
+ rcbagbt_destroy_cur_cache();
xfs_btree_destroy_cur_caches();
kmem_cache_destroy(xfs_log_ticket_cache);
kmem_cache_destroy(xfs_buf_cache);