From 4624f8c76088688180c5a5a455a89d47935fa570 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Thu, 4 Nov 2021 15:27:22 -0700
Subject: xfs: add the ability to reap entire inode forks

In preparation for supporting repair of indexed file-based metadata
(such as realtime bitmaps, directories, and extended attribute data),
add a function to reap the old blocks after a metadata repair finishes.
IOWs, this is an elaborate bunmapi call that deals with crosslinked
blocks by unmapping them without freeing them, and also scans for incore
buffers to invalidate.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/scrub/repair.c | 319 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |   1 +
 fs/xfs/scrub/trace.h  |  61 ++++++++++
 3 files changed, 381 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index cdf4995b079c..59a3022ff267 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1303,6 +1303,325 @@ xrep_reap_extents(
 	return xrep_roll_ag_trans(sc);
 }
 
+/*
+ * Metadata files are not supposed to share blocks with anything else.  If
+ * blocks are shared, we remove the reverse mapping (thus reducing the
+ * crosslink factor); if blocks are not shared, we also need to free them.
+ * This first step determines the longest run at the start of @imap that is
+ * uniformly crosslinked or not.  On success, *crosslinked is that state and
+ * *len the run length (at least 1); returns 0 or the rmap query's error.
+ */
+STATIC int
+xrep_bmapi_reap_find(
+	struct xfs_scrub		*sc,
+	struct xfs_inode		*ip,
+	int				whichfork,
+	const struct xfs_bmbt_irec	*imap,
+	bool				*crosslinked,
+	xfs_extlen_t			*len)
+{
+	struct xfs_owner_info		oinfo;
+	struct xfs_btree_cur		*cur;
+	xfs_agblock_t			agbno;
+	xfs_agblock_t			agbno_next;
+	int				error;
+
+	*len = 1;	/* the run always includes the first block */
+
+	agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
+	agbno_next = agbno + imap->br_blockcount;
+
+	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+			sc->sa.pag);
+
+	xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
+	error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
+	if (error)
+		goto out_cur;
+
+	while (++agbno < agbno_next) {
+		bool			also_crosslinked;
+
+		oinfo.oi_offset++;	/* file offset advances with agbno */
+		error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo,
+				&also_crosslinked);
+		if (error)
+			goto out_cur;
+
+		if (also_crosslinked != *crosslinked)
+			break;	/* state changed; the run ends here */
+
+		(*len)++;
+	}
+
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+	return error;
+}
+
+/*
+ * Decide if this buffer can be joined to a transaction.  This is true for most
+ * buffers, but there are two cases that we want to catch: large remote xattr
+ * value buffers are not logged and can overflow the buffer log item dirty
+ * bitmap size; and oversized cached buffers if things have really gone
+ * haywire.  Returns false if any of the buffer's maps would be too large.
+ */
+static inline bool
+xrep_buf_loggable(
+	const struct xfs_buf	*bp)
+{
+	int			i;
+
+	for (i = 0; i < bp->b_map_count; i++) {
+		int		chunks;	/* map size in XFS_BLF_CHUNK units */
+		int		map_size;	/* chunks in NBWORD units */
+
+		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
+				XFS_BLF_CHUNK);
+		map_size = DIV_ROUND_UP(chunks, NBWORD);
+		if (map_size > XFS_BLF_DATAMAP_SIZE)
+			return false;
+	}
+
+	return true;
+}
+
+/* Invalidate incore buffers for this file mapping, which starts at agbno. */
+STATIC int
+xrep_bmapi_reap_binval(
+	struct xfs_scrub		*sc,
+	struct xfs_inode		*ip,
+	int				whichfork,
+	const struct xfs_bmbt_irec	*imap,
+	xfs_agblock_t			agbno)
+{
+	struct xfs_mount		*mp = sc->mp;
+	int				bmap_flags = xfs_bmapi_aflag(whichfork);
+	xfs_fileoff_t			off;
+	xfs_fileoff_t			max_off;
+	xfs_extlen_t			scan_blocks;
+	xfs_agnumber_t			agno = sc->sa.pag->pag_agno;
+	xfs_agblock_t			agbno_next = agbno + imap->br_blockcount;
+	int				error;
+
+	/*
+	 * Avoid invalidating AG headers and post-EOFS blocks because we never
+	 * own those.
+	 */
+	if (!xfs_verify_agbno(mp, agno, agbno) ||
+	    !xfs_verify_agbno(mp, agno, agbno_next - 1))
+		return 0;
+
+	/*
+	 * Buffers for file blocks can span multiple contiguous mappings.  This
+	 * means that for each block in the mapping, there could exist an
+	 * xfs_buf indexed by that block with any length up to the maximum
+	 * buffer size (remote xattr values) or to the next hole in the fork.
+	 * To set up our binval scan, first we need to figure out the location
+	 * of the next hole.
+	 */
+	off = imap->br_startoff + imap->br_blockcount;
+	max_off = off + xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
+	while (off < max_off) {
+		struct xfs_bmbt_irec	hmap;
+		int			nhmaps = 1;
+
+		error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
+				&nhmaps, bmap_flags);
+		if (error)
+			return error;
+		if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
+			ASSERT(0);
+			return -EFSCORRUPTED;
+		}
+
+		if (!xfs_bmap_is_real_extent(&hmap))
+			break;	/* reached a hole; stop extending */
+
+		off = hmap.br_startoff + hmap.br_blockcount;
+	}
+	scan_blocks = off - imap->br_startoff;	/* up to hole or max_off */
+
+	trace_xrep_bmapi_reap_binval(sc, imap, scan_blocks);
+
+	/*
+	 * If there are incore buffers for these blocks, invalidate them.  If
+	 * we can't (try)lock the buffer we assume it's owned by someone else
+	 * and leave it alone.  The buffer cache cannot detect aliasing, so
+	 * employ nested loops to detect incore buffers of any plausible size.
+	 */
+	for (; agbno < agbno_next; agbno++, scan_blocks--) {
+		struct xrep_buf_scan	scan = {
+			.daddr		= XFS_AGB_TO_DADDR(mp, agno, agbno),
+			.max_sectors	= xrep_max_buf_sectors(mp, scan_blocks),
+			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
+		};
+		struct xfs_buf		*bp;
+
+		while ((bp = xrep_buf_scan_advance(mp, &scan)) != NULL) {
+			if (xrep_buf_loggable(bp)) {
+				xfs_trans_bjoin(sc->tp, bp);
+				xfs_trans_binval(sc->tp, bp);
+			} else {	/* unloggable: stale w/o transaction */
+				xfs_buf_stale(bp);
+				xfs_buf_relse(bp);
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Dispose of as much of this file extent as we can.  Upon successful return,
+ * imap's startoff, startblock, and blockcount are advanced past the blocks
+ * handled here, so the next round of disposal work should begin from it.
+ */
+STATIC int
+xrep_bmapi_reap_extent(
+	struct xfs_scrub		*sc,
+	struct xfs_inode		*ip,
+	int				whichfork,
+	struct xfs_bmbt_irec		*imap)
+{
+	xfs_filblks_t			remaining;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	xfs_extlen_t			len;
+	bool				crosslinked;
+	int				error;
+
+	ASSERT(sc->sa.pag == NULL);
+
+	trace_xrep_bmapi_reap_extent(sc, ip, whichfork, imap);
+
+	agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
+	sc->sa.pag = xfs_perag_get(sc->mp, agno);
+	if (!sc->sa.pag)
+		return -EFSCORRUPTED;
+
+	error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &sc->sa.agf_bp);
+	if (error)
+		goto out_pag;
+
+	/*
+	 * Decide the fate of the blocks at the beginning of the mapping, then
+	 * update the mapping to use it with the unmap calls.
+	 */
+	error = xrep_bmapi_reap_find(sc, ip, whichfork, imap, &crosslinked,
+			&len);
+	if (error)
+		goto out_agf;
+
+	remaining = imap->br_blockcount - len;	/* left for later calls */
+	imap->br_blockcount = len;	/* trim to this round's run */
+
+	/*
+	 * Schedule removal of the mapping from the fork.  We use deferred log
+	 * intents in this function to control the exact sequence of metadata
+	 * updates.
+	 */
+	xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
+	xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+			-(int64_t)len);
+
+	agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
+	if (crosslinked) {
+		/*
+		 * If there are other rmappings, this block is cross linked and
+		 * must not be freed.  Remove the reverse mapping, leave the
+		 * buffer cache in its possibly confused state, and move on.
+		 * We don't want to risk discarding valid data buffers from
+		 * anybody else who thinks they own the block, even though that
+		 * runs the risk of stale buffer warnings in the future.
+		 */
+		trace_xrep_dispose_unmap_extent(sc->sa.pag, agbno, len);
+
+		xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
+	} else {
+		/*
+		 * If the block is not crosslinked, we can invalidate all the
+		 * incore buffers for the extent, and then free the extent.
+		 * This is a bit of a mess since we don't detect discontiguous
+		 * buffers that are indexed by a block starting before the
+		 * first block of the extent but overlap anyway.
+		 */
+		trace_xrep_dispose_free_extent(sc->sa.pag, agbno, len);
+
+		error = xrep_bmapi_reap_binval(sc, ip, whichfork, imap, agbno);
+		if (error)
+			goto out_agf;
+
+		__xfs_free_extent_later(sc->tp, imap->br_startblock, len, NULL,
+				true);
+	}
+
+	/* Update the mapping to reflect the work not yet done and exit. */
+	imap->br_startoff += len;
+	imap->br_startblock += len;
+	imap->br_blockcount = remaining;
+out_agf:
+	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
+	sc->sa.agf_bp = NULL;
+out_pag:
+	xfs_perag_put(sc->sa.pag);
+	sc->sa.pag = NULL;
+	return error;
+}
+
+/*
+ * Dispose of each block mapped to the given fork of the given file.  Callers
+ * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip.  The fork
+ * must not have any delalloc reservations (else -EFSCORRUPTED is returned).
+ */
+int
+xrep_reap_fork(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	xfs_fileoff_t		off = 0;
+	int			bmap_flags = xfs_bmapi_aflag(whichfork);
+	int			error;
+
+	ASSERT(xfs_has_rmapbt(sc->mp));
+	ASSERT(ip == sc->ip || ip == sc->tempip);
+	ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));
+
+	while (off < XFS_MAX_FILEOFF) {
+		struct xfs_bmbt_irec	imap;
+		int			nimaps = 1;
+
+		/* Read the next mapping; delalloc here means corruption. */
+		error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
+				&nimaps, bmap_flags);
+		if (error)
+			return error;
+		if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
+			ASSERT(0);
+			return -EFSCORRUPTED;
+		}
+
+		if (!xfs_bmap_is_real_extent(&imap)) {
+			off = imap.br_startoff + imap.br_blockcount;
+			continue;
+		}
+
+		/* Reap the leading blocks of this extent; imap is advanced. */
+		error = xrep_bmapi_reap_extent(sc, ip, whichfork, &imap);
+		if (error)
+			return error;
+
+		error = xfs_defer_finish(&sc->tp);
+		if (error)
+			return error;
+
+		off = imap.br_startoff;	/* resume after reaped blocks */
+	}
+
+	return 0;
+}
+
 /*
  * Finding per-AG Btree Roots for AGF/AGI Reconstruction
  *
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index b46e91676621..a83c6ad50153 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -51,6 +51,7 @@ struct xbitmap;
 int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags);
 int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist,
 		const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+int xrep_reap_fork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork);
 
 struct xrep_find_ag_btree {
 	/* in: rmap owner of the btree we're looking for */
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index c49505b588cc..932e5e900410 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -1527,6 +1527,67 @@ TRACE_EVENT(xrep_tempfile_create,
 		  __entry->temp_inum)
 );
 
+TRACE_EVENT(xrep_bmapi_reap_extent,	/* fired on entry to extent reaping */
+	TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork,
+		 const struct xfs_bmbt_irec *irec),
+	TP_ARGS(sc, ip, whichfork, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, whichfork)
+		__field(xfs_fileoff_t, fileoff)
+		__field(xfs_filblks_t, len)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(int, state)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->whichfork = whichfork;
+		__entry->fileoff = irec->br_startoff;
+		__entry->len = irec->br_blockcount;	/* shown as fsbcount */
+		__entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock);
+		__entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock);
+		__entry->state = irec->br_state;
+	),
+	TP_printk("dev %d:%d ip 0x%llx whichfork %s agno 0x%x agbno 0x%x fileoff 0x%llx fsbcount 0x%llx state 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->fileoff,
+		  __entry->len,
+		  __entry->state)
+);
+
+TRACE_EVENT(xrep_bmapi_reap_binval,	/* fired before the binval scan */
+	TP_PROTO(struct xfs_scrub *sc, const struct xfs_bmbt_irec *irec,
+		 xfs_extlen_t scan_blocks),
+	TP_ARGS(sc, irec, scan_blocks),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_filblks_t, len)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, scan_blocks)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->len = irec->br_blockcount;	/* shown as fsbcount */
+		__entry->agno = XFS_FSB_TO_AGNO(sc->mp, irec->br_startblock);
+		__entry->agbno = XFS_FSB_TO_AGBNO(sc->mp, irec->br_startblock);
+		__entry->scan_blocks = scan_blocks;
+	),
+	TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%llx scan_blocks 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->scan_blocks)
+);
+
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
 
 #endif /* _TRACE_XFS_SCRUB_TRACE_H */
-- 
cgit v1.2.3