summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDarrick J. Wong <darrick.wong@oracle.com>2020-02-19 17:01:40 -0800
committerDarrick J. Wong <darrick.wong@oracle.com>2020-06-01 21:16:29 -0700
commit7942b9890ea843896224c9d274505fe4eeee04d5 (patch)
tree51fe92e4380d9e27e2cb4ea3220cd1f2cc12ee3c
parent11158365e5f45f088fb753a660c97432319af06d (diff)
xfs: implement block reservation accounting for btrees we're staging
Create a new xrep_newbt structure to encapsulate a fake root for creating a staged btree cursor as well as to track all the blocks that we need to reserve in order to build that btree. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
-rw-r--r--fs/xfs/scrub/agheader_repair.c1
-rw-r--r--fs/xfs/scrub/common.c1
-rw-r--r--fs/xfs/scrub/repair.c302
-rw-r--r--fs/xfs/scrub/repair.h55
-rw-r--r--fs/xfs/scrub/scrub.c2
-rw-r--r--fs/xfs/scrub/trace.h36
6 files changed, 397 insertions, 0 deletions
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index a96c373248df..e7c82deb88a1 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -10,6 +10,7 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 18876056e5e0..02c118e5949f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -10,6 +10,7 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index eef4468d559f..d967c704b507 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -10,6 +10,7 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
@@ -375,6 +376,307 @@ xrep_init_btblock(
return 0;
}
+/* Initialize accounting resources for staging a new AG btree. */
+void
+xrep_newbt_init_ag(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo,
+ xfs_fsblock_t alloc_hint,
+ enum xfs_ag_resv_type resv)
+{
+ memset(xnr, 0, sizeof(struct xrep_newbt));
+ xnr->sc = sc;
+ xnr->oinfo = *oinfo; /* structure copy */
+ xnr->alloc_hint = alloc_hint;
+ xnr->resv = resv;
+ INIT_LIST_HEAD(&xnr->resv_list);
+}
+
+/* Initialize accounting resources for staging a new inode fork btree. */
+void
+xrep_newbt_init_inode(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc,
+ int whichfork,
+ const struct xfs_owner_info *oinfo)
+{
+ xrep_newbt_init_ag(xnr, sc, oinfo,
+ XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
+ XFS_AG_RESV_NONE);
+ xnr->ifake.if_fork = kmem_zone_zalloc(xfs_ifork_zone, 0);
+ xnr->ifake.if_fork_size = XFS_IFORK_SIZE(sc->ip, whichfork);
+}
+
+/*
+ * Initialize accounting resources for staging a new btree. Callers are
+ * expected to add their own reservations (and clean them up) manually.
+ */
+void
+xrep_newbt_init_bare(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc)
+{
+ xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
+ XFS_AG_RESV_NONE);
+}
+
+/* Designate specific blocks to be used to build our new btree. */
+int
+xrep_newbt_add_blocks(
+ struct xrep_newbt *xnr,
+ xfs_fsblock_t fsbno,
+ xfs_extlen_t len)
+{
+ struct xrep_newbt_resv *resv;
+
+ resv = kmem_alloc(sizeof(struct xrep_newbt_resv), KM_MAYFAIL);
+ if (!resv)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&resv->list);
+ resv->fsbno = fsbno;
+ resv->len = len;
+ resv->used = 0;
+ list_add_tail(&resv->list, &xnr->resv_list);
+ return 0;
+}
+
+/* Allocate disk space for our new btree. */
+int
+xrep_newbt_alloc_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ xfs_alloctype_t type;
+ xfs_fsblock_t alloc_hint = xnr->alloc_hint;
+ int error = 0;
+
+ /*
+ * Inode-rooted btrees can allocate from any AG, whereas AG btrees
+ * require a specific AG mentioned in the alloc hint..
+ */
+ type = sc->ip ? XFS_ALLOCTYPE_START_BNO : XFS_ALLOCTYPE_NEAR_BNO;
+
+ while (nr_blocks > 0) {
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = sc->mp,
+ .type = type,
+ .fsbno = alloc_hint,
+ .oinfo = xnr->oinfo,
+ .minlen = 1,
+ .maxlen = nr_blocks,
+ .prod = 1,
+ .resv = xnr->resv,
+ };
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ trace_xrep_newbt_alloc_blocks(sc->mp,
+ XFS_FSB_TO_AGNO(sc->mp, args.fsbno),
+ XFS_FSB_TO_AGBNO(sc->mp, args.fsbno),
+ args.len, xnr->oinfo.oi_owner);
+
+ error = xrep_newbt_add_blocks(xnr, args.fsbno, args.len);
+ if (error)
+ return error;
+
+ nr_blocks -= args.len;
+ alloc_hint = args.fsbno + args.len - 1;
+
+ error = xrep_roll_trans(sc);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Release blocks that were reserved for a btree repair. If the repair
+ * succeeded then we log deferred frees for unused blocks. Otherwise, we try
+ * to free the extents immediately to roll the filesystem back to where it was
+ * before we started.
+ */
+static inline int
+xrep_newbt_destroy_reservation(
+ struct xrep_newbt *xnr,
+ struct xrep_newbt_resv *resv,
+ bool cancel_repair)
+{
+ struct xfs_scrub *sc = xnr->sc;
+
+ if (cancel_repair) {
+ int error;
+
+ /* Free the extent then roll the transaction. */
+ error = xfs_free_extent(sc->tp, resv->fsbno, resv->len,
+ &xnr->oinfo, xnr->resv);
+ if (error)
+ return error;
+
+ return xrep_roll_trans(sc);
+ }
+
+ /*
+ * Use the deferred freeing mechanism to schedule for deletion any
+ * blocks we didn't use to rebuild the tree. This enables us to log
+ * them all in the same transaction as the root change.
+ */
+ resv->fsbno += resv->used;
+ resv->len -= resv->used;
+ resv->used = 0;
+
+ if (resv->len == 0)
+ return 0;
+
+ trace_xrep_newbt_free_blocks(sc->mp,
+ XFS_FSB_TO_AGNO(sc->mp, resv->fsbno),
+ XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno),
+ resv->len, xnr->oinfo.oi_owner);
+
+ __xfs_bmap_add_free(sc->tp, resv->fsbno, resv->len, &xnr->oinfo, true);
+
+ return 0;
+}
+
+/* Free all the accounting info and disk space we reserved for a new btree. */
+void
+xrep_newbt_destroy(
+ struct xrep_newbt *xnr,
+ int error)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xrep_newbt_resv *resv, *n;
+ int err2;
+
+ /*
+ * If the filesystem already went down, we can't free the blocks. Skip
+ * ahead to freeing the incore metadata because we can't fix anything.
+ */
+ if (XFS_FORCED_SHUTDOWN(sc->mp))
+ goto junkit;
+
+ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+ err2 = xrep_newbt_destroy_reservation(xnr, resv, error != 0);
+ if (err2)
+ goto junkit;
+
+ list_del(&resv->list);
+ kmem_free(resv);
+ }
+
+junkit:
+ /*
+ * If we still have reservations attached to @newbt, cleanup must have
+ * failed and the filesystem is about to go down. Clean up the incore
+ * reservations.
+ */
+ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+ list_del(&resv->list);
+ kmem_free(resv);
+ }
+
+ if (sc->ip) {
+ kmem_cache_free(xfs_ifork_zone, xnr->ifake.if_fork);
+ xnr->ifake.if_fork = NULL;
+ }
+}
+
+/* Feed one of the reserved btree blocks to the bulk loader. */
+int
+xrep_newbt_claim_block(
+ struct xfs_btree_cur *cur,
+ struct xrep_newbt *xnr,
+ union xfs_btree_ptr *ptr)
+{
+ struct xrep_newbt_resv *resv;
+ xfs_fsblock_t fsb;
+
+ /*
+ * The first item in the list should always have a free block unless
+ * we're completely out.
+ */
+ resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
+ if (resv->used == resv->len)
+ return -ENOSPC;
+
+ /*
+ * Peel off a block from the start of the reservation. We allocate
+ * blocks in order to place blocks on disk in increasing record or key
+ * order. The block reservations tend to end up on the list in
+ * decreasing order, which hopefully results in leaf blocks ending up
+ * together.
+ */
+ fsb = resv->fsbno + resv->used;
+ resv->used++;
+
+ /* If we used all the blocks in this reservation, move it to the end. */
+ if (resv->used == resv->len)
+ list_move_tail(&resv->list, &xnr->resv_list);
+
+ trace_xrep_newbt_claim_block(cur->bc_mp,
+ XFS_FSB_TO_AGNO(cur->bc_mp, fsb),
+ XFS_FSB_TO_AGBNO(cur->bc_mp, fsb),
+ 1, xnr->oinfo.oi_owner);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(fsb);
+ else
+ ptr->s = cpu_to_be32(XFS_FSB_TO_AGBNO(cur->bc_mp, fsb));
+ return 0;
+}
+
+/*
+ * Estimate proper slack values for a btree that's being reloaded.
+ *
+ * Under most circumstances, we'll take whatever default loading value the
+ * btree bulk loading code calculates for us. However, there are some
+ * exceptions to this rule:
+ *
+ * (1) If someone turned one of the debug knobs.
+ * (2) If this is a per-AG btree and the AG has less than ~9% space free.
+ * (3) If this is an inode btree and the FS has less than ~9% space free.
+ *
+ * Note that we actually use 3/32 for the comparison to avoid division.
+ */
+void
+xrep_bload_estimate_slack(
+ struct xfs_scrub *sc,
+ struct xfs_btree_bload *bload)
+{
+ uint64_t free;
+ uint64_t sz;
+
+ /* Let the btree code compute the default slack values. */
+ bload->leaf_slack = -1;
+ bload->node_slack = -1;
+
+ if (sc->ops->type == ST_PERAG) {
+ free = sc->sa.pag->pagf_freeblks;
+ sz = xfs_ag_block_count(sc->mp, sc->sa.agno);
+ } else {
+ free = percpu_counter_sum(&sc->mp->m_fdblocks);
+ sz = sc->mp->m_sb.sb_dblocks;
+ }
+
+ /* No further changes if there's more than 3/32ths space left. */
+ if (free >= ((sz * 3) >> 5))
+ return;
+
+ /* We're low on space; load the btrees as tightly as possible. */
+ if (bload->leaf_slack < 0)
+ bload->leaf_slack = 0;
+ if (bload->node_slack < 0)
+ bload->node_slack = 0;
+}
+
/*
* Reconstructing per-AG Btrees
*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index f6c22c7a0a3c..611e1669222d 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -6,6 +6,10 @@
#ifndef __XFS_SCRUB_REPAIR_H__
#define __XFS_SCRUB_REPAIR_H__
+#include "scrub/bitmap.h"
+#include "xfs_btree.h"
+union xfs_btree_ptr;
+
static inline int xrep_notsupported(struct xfs_scrub *sc)
{
return -EOPNOTSUPP;
@@ -60,6 +64,57 @@ int xrep_agf(struct xfs_scrub *sc);
int xrep_agfl(struct xfs_scrub *sc);
int xrep_agi(struct xfs_scrub *sc);
+struct xrep_newbt_resv {
+ /* Link to list of extents that we've reserved. */
+ struct list_head list;
+
+ /* FSB of the block we reserved. */
+ xfs_fsblock_t fsbno;
+
+ /* Length of the reservation. */
+ xfs_extlen_t len;
+
+ /* How much of this reservation has been used. */
+ xfs_extlen_t used;
+};
+
+struct xrep_newbt {
+ struct xfs_scrub *sc;
+
+ /* List of extents that we've reserved. */
+ struct list_head resv_list;
+
+ /* Fake root for new btree. */
+ union {
+ struct xbtree_afakeroot afake;
+ struct xbtree_ifakeroot ifake;
+ };
+
+ /* rmap owner of these blocks */
+ struct xfs_owner_info oinfo;
+
+ /* Allocation hint */
+ xfs_fsblock_t alloc_hint;
+
+ /* per-ag reservation type */
+ enum xfs_ag_resv_type resv;
+};
+
+void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc);
+void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint,
+ enum xfs_ag_resv_type resv);
+void xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+ int whichfork, const struct xfs_owner_info *oinfo);
+int xrep_newbt_add_blocks(struct xrep_newbt *xnr, xfs_fsblock_t fsbno,
+ xfs_extlen_t len);
+int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
+void xrep_newbt_destroy(struct xrep_newbt *xnr, int error);
+int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr,
+ union xfs_btree_ptr *ptr);
+void xrep_bload_estimate_slack(struct xfs_scrub *sc,
+ struct xfs_btree_bload *bload);
+
#else
static inline int xrep_attempt(
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 8ebf35b115ce..bcc3bf8ea813 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -17,6 +17,8 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_scrub.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index e46f5cef90da..3a816c4d0f9e 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -904,6 +904,42 @@ TRACE_EVENT(xrep_ialloc_insert,
__entry->freemask)
)
+DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ int64_t owner),
+ TP_ARGS(mp, agno, agbno, len, owner),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(int64_t, owner)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = owner;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner)
+);
+#define DEFINE_NEWBT_EXTENT_EVENT(name) \
+DEFINE_EVENT(xrep_newbt_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, \
+ int64_t owner), \
+ TP_ARGS(mp, agno, agbno, len, owner))
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block);
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */