summary refs log tree commit diff
path: root/fs/xfs
diff options
context:
space:
mode:
authorDarrick J. Wong <djwong@kernel.org>2021-09-01 10:58:21 -0700
committerDarrick J. Wong <djwong@kernel.org>2021-12-15 17:29:01 -0800
commit3e15752ca0034fdfa221ca3ead3f2128a750e000 (patch)
tree842e70d3e06fbd43ecece139133994bb353c89ce /fs/xfs
parentb849353b04270126bf5a089b61bf0d28432a2a57 (diff)
xfs: make atomic extent swapping support realtime files
Now that bmap items support the realtime device, we can add the necessary pieces to the atomic extent swapping code to support such things. Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/libxfs/xfs_swapext.c116
-rw-r--r--fs/xfs/libxfs/xfs_swapext.h1
-rw-r--r--fs/xfs/xfs_bmap_util.c2
-rw-r--r--fs/xfs/xfs_inode.h5
-rw-r--r--fs/xfs/xfs_mount.h3
-rw-r--r--fs/xfs/xfs_rtalloc.c136
-rw-r--r--fs/xfs/xfs_rtalloc.h3
-rw-r--r--fs/xfs/xfs_trace.h9
-rw-r--r--fs/xfs/xfs_xchgrange.c23
9 files changed, 285 insertions, 13 deletions
diff --git a/fs/xfs/libxfs/xfs_swapext.c b/fs/xfs/libxfs/xfs_swapext.c
index 23d5cc89fc19..5db7e26a5444 100644
--- a/fs/xfs/libxfs/xfs_swapext.c
+++ b/fs/xfs/libxfs/xfs_swapext.c
@@ -225,6 +225,115 @@ sxi_advance(
sxi->sxi_blockcount -= irec->br_blockcount;
}
+/*
+ * There may be partially written rt extents lurking in the ranges to be
+ * swapped. According to the rules for realtime files with big rt extents, we
+ * must guarantee that an outside observer (an IO thread, realistically) never
+ * can see multiple physical rt extents mapped to the same logical file rt
+ * extent. The deferred bmap log intent items that we use under the hood
+ * operate on single block mappings and not rt extents, which means we must
+ * have a strategy to ensure that log recovery after a failure won't stop in
+ * the middle of an rt extent.
+ *
+ * The preferred strategy is to use deferred extent swap log intent items to
+ * track the status of the overall swap operation so that we can complete the
+ * work during crash recovery. If that isn't possible, we fall back to
+ * requiring the selected mappings in both forks to be aligned to rt extent
+ * boundaries. As an aside, the old fork swap routine didn't have this
+ * requirement, but at an extreme cost in flexibility (full files only, no attr
+ * forks, and no support if rmapbt is enabled).
+ */
+bool
+xfs_swapext_need_rt_conversion(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /* We're using the extent swap intent items */
+ if (xfs_has_atomicswap(mp))
+ return false;
+
+ /* The only supported operation is full fork swaps */
+ if (!xfs_can_atomicswap(mp))
+ return false;
+
+ /* Conversion is only needed for realtime files with big rt extents */
+ return xfs_inode_has_bigrtextents(ip);
+}
+
+#ifdef DEBUG
+static inline int
+xfs_swapext_check_rt_extents(
+ struct xfs_mount *mp,
+ const struct xfs_swapext_req *req)
+{
+ struct xfs_bmbt_irec irec1, irec2;
+ xfs_fileoff_t startoff1 = req->startoff1;
+ xfs_fileoff_t startoff2 = req->startoff2;
+ xfs_filblks_t blockcount = req->blockcount;
+ uint32_t mod;
+ int nimaps;
+ int error;
+
+ if (req->whichfork == XFS_ATTR_FORK ||
+ !xfs_swapext_need_rt_conversion(req->ip2))
+ return 0;
+
+ while (blockcount > 0) {
+ /* Read extent from the first file */
+ nimaps = 1;
+ error = xfs_bmapi_read(req->ip1, startoff1, blockcount,
+ &irec1, &nimaps, 0);
+ if (error)
+ return error;
+ ASSERT(nimaps == 1);
+
+ /* Read extent from the second file */
+ nimaps = 1;
+ error = xfs_bmapi_read(req->ip2, startoff2,
+ irec1.br_blockcount, &irec2, &nimaps,
+ 0);
+ if (error)
+ return error;
+ ASSERT(nimaps == 1);
+
+ /*
+ * We can only swap as many blocks as the smaller of the two
+ * extent maps.
+ */
+ irec1.br_blockcount = min(irec1.br_blockcount,
+ irec2.br_blockcount);
+
+ /* Both mappings must be aligned to the realtime extent size. */
+ div_u64_rem(irec1.br_startoff, mp->m_sb.sb_rextsize, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EINVAL;
+ }
+
+ div_u64_rem(irec2.br_startoff, mp->m_sb.sb_rextsize, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EINVAL;
+ }
+
+ div_u64_rem(irec1.br_blockcount, mp->m_sb.sb_rextsize, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EINVAL;
+ }
+
+ startoff1 += irec1.br_blockcount;
+ startoff2 += irec1.br_blockcount;
+ blockcount -= irec1.br_blockcount;
+ }
+
+ return 0;
+}
+#else
+# define xfs_swapext_check_rt_extents(mp, req) (0)
+#endif
+
/* Check all extents to make sure we can actually swap them. */
int
xfs_swapext_check_extents(
@@ -244,12 +353,7 @@ xfs_swapext_check_extents(
ifp2->if_format == XFS_DINODE_FMT_LOCAL)
return -EINVAL;
- /* We don't support realtime data forks yet. */
- if (!XFS_IS_REALTIME_INODE(req->ip1))
- return 0;
- if (req->whichfork == XFS_ATTR_FORK)
- return 0;
- return -EINVAL;
+ return xfs_swapext_check_rt_extents(mp, req);
}
#ifdef CONFIG_XFS_QUOTA
diff --git a/fs/xfs/libxfs/xfs_swapext.h b/fs/xfs/libxfs/xfs_swapext.h
index f9f3d1057ad1..1b9c6b398bda 100644
--- a/fs/xfs/libxfs/xfs_swapext.h
+++ b/fs/xfs/libxfs/xfs_swapext.h
@@ -109,6 +109,7 @@ int xfs_swapext_finish_one(struct xfs_trans *tp,
int xfs_swapext_check_extents(struct xfs_mount *mp,
const struct xfs_swapext_req *req);
+bool xfs_swapext_need_rt_conversion(struct xfs_inode *ip);
int xfs_swapext(struct xfs_trans **tpp, const struct xfs_swapext_req *req);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a3c71d26c8b5..f43f1d434fe2 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -981,7 +981,7 @@ xfs_free_file_space(
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/* We can only free complete realtime extents. */
- if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
+ if (xfs_inode_has_bigrtextents(ip)) {
startoffset_fsb = roundup_64(startoffset_fsb,
mp->m_sb.sb_rextsize);
endoffset_fsb = rounddown_64(endoffset_fsb,
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1e596417178d..d67b784d34d2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -218,6 +218,11 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
}
+static inline bool xfs_inode_has_bigrtextents(struct xfs_inode *ip)
+{
+ return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1;
+}
+
/*
* Return the buftarg used for data allocations on a given inode.
*/
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 504b504ebfe1..8fae34a06dce 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -389,8 +389,7 @@ __XFS_LOG_FEAT(atomicswap, ATOMIC_SWAP)
*/
static inline bool xfs_can_atomicswap(struct xfs_mount *mp)
{
- return (xfs_has_reflink(mp) || xfs_has_rmapbt(mp)) &&
- !xfs_has_realtime(mp);
+ return xfs_has_reflink(mp) || xfs_has_rmapbt(mp);
}
/*
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 0ae875f9ddb1..2c46552895ec 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -20,6 +20,7 @@
#include "xfs_rtalloc.h"
#include "xfs_sb.h"
#include "xfs_health.h"
+#include "xfs_trace.h"
/*
* Read and return the summary information for a given extent size,
@@ -1371,3 +1372,138 @@ xfs_rtpick_extent(
*pick = b;
return 0;
}
+
+/*
+ * Decide if this is an unwritten extent that isn't aligned to a rt extent
+ * boundary. If it is, shorten the mapping so that we're ready to convert
+ * everything up to the next rt extent to a zeroed written extent. If not,
+ * return false.
+ */
+static inline bool
+xfs_rtfile_want_conversion(
+ struct xfs_mount *mp,
+ struct xfs_bmbt_irec *irec)
+{
+ xfs_fileoff_t rext_next;
+ uint32_t modoff, modcnt;
+
+ if (irec->br_state != XFS_EXT_UNWRITTEN)
+ return false;
+
+ div_u64_rem(irec->br_startoff, mp->m_sb.sb_rextsize, &modoff);
+ if (modoff == 0) {
+ uint64_t rexts = div_u64_rem(irec->br_blockcount,
+ mp->m_sb.sb_rextsize, &modcnt);
+
+ if (rexts > 0) {
+ /*
+ * Unwritten mapping starts at an rt extent boundary
+ * and is longer than one rt extent. Round the length
+ * down to the nearest extent but don't select it for
+ * conversion.
+ */
+ irec->br_blockcount -= modcnt;
+ modcnt = 0;
+ }
+
+ /* Unwritten mapping is perfectly aligned, do not convert. */
+ if (modcnt == 0)
+ return false;
+ }
+
+ /*
+ * Unaligned and unwritten; trim to the current rt extent and select it
+ * for conversion.
+ */
+ rext_next = (irec->br_startoff - modoff) + mp->m_sb.sb_rextsize;
+ xfs_trim_extent(irec, irec->br_startoff, rext_next - irec->br_startoff);
+ return true;
+}
+
+/*
+ * For all realtime extents backing the given range of a file, search for
+ * unwritten mappings that do not cover a full rt extent and convert them
+ * to zeroed written mappings. The goal is to end up with one mapping per rt
+ * extent so that we can perform a remapping operation. Callers must ensure
+ * that there are no dirty pages in the given range.
+ */
+int
+xfs_rtfile_convert_unwritten(
+ struct xfs_inode *ip,
+ loff_t pos,
+ uint64_t len)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t off;
+ xfs_fileoff_t endoff;
+ unsigned int resblks;
+ int ret;
+
+ if (mp->m_sb.sb_rextsize == 1)
+ return 0;
+
+ off = rounddown_64(XFS_B_TO_FSBT(mp, pos), mp->m_sb.sb_rextsize);
+ endoff = roundup_64(XFS_B_TO_FSB(mp, pos + len), mp->m_sb.sb_rextsize);
+
+ trace_xfs_rtfile_convert_unwritten(ip, pos, len);
+
+ while (off < endoff) {
+ int nmap = 1;
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 1);
+ ret = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
+ &tp);
+ if (ret)
+ return ret;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ /*
+ * Read the mapping. If we find an unwritten extent that isn't
+ * aligned to an rt extent boundary...
+ */
+ ret = xfs_bmapi_read(ip, off, endoff - off, &irec, &nmap, 0);
+ if (ret)
+ goto err;
+ ASSERT(nmap == 1);
+ ASSERT(irec.br_startoff == off);
+ if (!xfs_rtfile_want_conversion(mp, &irec)) {
+ xfs_trans_cancel(tp);
+ off += irec.br_blockcount;
+ continue;
+ }
+
+ /*
+ * ...make sure this partially unwritten rt extent gets
+ * converted to a zeroed written extent that we can remap.
+ */
+ nmap = 1;
+ ret = xfs_bmapi_write(tp, ip, off, irec.br_blockcount,
+ XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &irec,
+ &nmap);
+ if (ret)
+ goto err;
+ ASSERT(nmap == 1);
+ if (irec.br_state != XFS_EXT_NORM) {
+ ASSERT(0);
+ ret = -EIO;
+ goto err;
+ }
+ ret = xfs_trans_commit(tp);
+ if (ret)
+ return ret;
+
+ off += irec.br_blockcount;
+ }
+
+ return 0;
+err:
+ xfs_trans_cancel(tp);
+ return ret;
+}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b0b086034cff..ca6ee2f13cbc 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -138,6 +138,8 @@ bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
bool *is_free);
+int xfs_rtfile_convert_unwritten(struct xfs_inode *ip, loff_t pos,
+ uint64_t len);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
@@ -161,6 +163,7 @@ xfs_rtmount_init(
}
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
# define xfs_rtunmount_inodes(m)
+# define xfs_rtfile_convert_unwritten(ip, pos, len) (0)
#endif /* CONFIG_XFS_RT */
#endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 13c4d20d9c74..f108f65e1397 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1524,7 +1524,7 @@ DEFINE_IMAP_EVENT(xfs_iomap_alloc);
DEFINE_IMAP_EVENT(xfs_iomap_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
- TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count),
TP_ARGS(ip, offset, count),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1532,7 +1532,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
__field(loff_t, isize)
__field(loff_t, disize)
__field(loff_t, offset)
- __field(size_t, count)
+ __field(u64, count)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -1543,7 +1543,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
__entry->count = count;
),
TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
- "pos 0x%llx bytecount 0x%zx",
+ "pos 0x%llx bytecount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->isize,
@@ -1554,7 +1554,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
#define DEFINE_SIMPLE_IO_EVENT(name) \
DEFINE_EVENT(xfs_simple_io_class, name, \
- TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count), \
TP_ARGS(ip, offset, count))
DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
@@ -3525,6 +3525,7 @@ TRACE_EVENT(xfs_ioctl_clone,
/* unshare tracepoints */
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
+DEFINE_SIMPLE_IO_EVENT(xfs_rtfile_convert_unwritten);
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
diff --git a/fs/xfs/xfs_xchgrange.c b/fs/xfs/xfs_xchgrange.c
index c16028b2290c..6ffbfb44be49 100644
--- a/fs/xfs/xfs_xchgrange.c
+++ b/fs/xfs/xfs_xchgrange.c
@@ -28,6 +28,7 @@
#include "xfs_sb.h"
#include "xfs_icache.h"
#include "xfs_log.h"
+#include "xfs_rtalloc.h"
/* Lock (and optionally join) two inodes for a file range exchange. */
void
@@ -414,6 +415,19 @@ xfs_xchg_range_prep(
return error;
}
+ /* Convert unwritten sub-extent mappings if required. */
+ if (xfs_swapext_need_rt_conversion(ip2)) {
+ error = xfs_rtfile_convert_unwritten(ip2, fxr->file2_offset,
+ fxr->length);
+ if (error)
+ return error;
+
+ error = xfs_rtfile_convert_unwritten(ip1, fxr->file1_offset,
+ fxr->length);
+ if (error)
+ return error;
+ }
+
return 0;
}
@@ -622,6 +636,15 @@ xfs_xchg_range(
if (fxr->flags & FILE_XCHG_RANGE_SKIP_FILE1_HOLES)
req.req_flags |= XFS_SWAP_REQ_SKIP_FILE1_HOLES;
+ /*
+ * Round the request length up to the nearest fundamental unit of
+ * allocation. The prep function already checked that the request
+ * offsets and length in @fxr are safe to round up.
+ */
+ if (XFS_IS_REALTIME_INODE(ip2))
+ req.blockcount = roundup_64(req.blockcount,
+ mp->m_sb.sb_rextsize);
+
error = xfs_xchg_range_estimate(&req, &res);
if (error)
return error;