4 files changed, 81 insertions, 9 deletions
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b93faa819894..0c73aa441c47 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1348,6 +1348,13 @@ xfs_reflink_remap_blocks(
 	len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
 			XFS_MAX_FILEOFF);
 
+	/*
+	 * Make sure the end is aligned with a rt extent (if desired), since
+	 * the end of the range could be EOF.
+	 */
+	if (xfs_inode_has_bigrtextents(dest))
+		len = roundup_64(len, mp->m_sb.sb_rextsize);
+
 	trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
 
 	while (len > 0) {
@@ -1421,6 +1428,50 @@ xfs_reflink_zero_posteof(
 			&xfs_buffered_write_iomap_ops);
 }
 
+/* Make *len end on an rt extent boundary, or return -EINVAL if we cannot. */
+STATIC int
+xfs_reflink_remap_adjust_rtlen(
+	struct xfs_inode	*src,
+	loff_t			pos_in,
+	struct xfs_inode	*dest,
+	loff_t			pos_out,
+	loff_t			*len,
+	unsigned int		remap_flags)
+{
+	struct xfs_mount	*mp = src->i_mount;
+	uint32_t		mod;	/* remainder of *len / rt extent size */
+
+	div_u64_rem(*len, XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize), &mod);
+
+	/*
+	 * We previously checked the rtextent alignment of both offsets, so we
+	 * now have to check the alignment of the length.  The VFS remap prep
+	 * function can change the length on us, so we can only make length
+	 * adjustments after that.  If the length is aligned to an rtextent,
+	 * we're trivially good to go.
+	 *
+	 * Otherwise, the length is not aligned to an rt extent.  If the source
+	 * file's range ends at EOF, the VFS ensured that the dest file's range
+	 * also ends at EOF.  The actual remap function will round the (byte)
+	 * length up to the nearest rtextent unit, so we're ok here too.
+	 */
+	if (mod == 0 || pos_in + *len == i_size_read(VFS_I(src)))
+		return 0;
+
+	/*
+	 * Otherwise, the only thing we can do is round the request length down
+	 * to an rt extent boundary.  If the caller doesn't allow that, we are
+	 * finished.
+	 */
+	if (!(remap_flags & REMAP_FILE_CAN_SHORTEN))
+		return -EINVAL;
+
+	/* Drop the partial rt extent from the end of the request. */
+	(*len) -= mod;
+	trace_xfs_reflink_remap_adjust_rtlen(src, pos_in, *len, dest, pos_out);
+	return 0;
+}
+
 /*
  * Prepare two files for range cloning.  Upon a successful return both inodes
  * will have the iolock and mmaplock held, the page cache of the out file will
@@ -1480,11 +1531,22 @@ xfs_reflink_remap_prep(
 	if (IS_DAX(inode_in) || IS_DAX(inode_out))
 		goto out_unlock;
 
-	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
-			len, remap_flags);
+	ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(dest)));
+
+	ret = __generic_remap_file_range_prep(file_in, pos_in, file_out,
+			pos_out, len, remap_flags,
+			xfs_inode_alloc_unitsize(dest));
 	if (ret || *len == 0)
 		goto out_unlock;
 
+	/* Make sure the end is aligned with a rt extent. */
+	if (xfs_inode_has_bigrtextents(src)) {
+		ret = xfs_reflink_remap_adjust_rtlen(src, pos_in, dest,
+				pos_out, len, remap_flags);
+		if (ret || *len == 0)
+			goto out_unlock;
+	}
+
 	/* Attach dquots to dest inode before changing block map */
 	ret = xfs_qm_dqattach(dest);
 	if (ret)
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 15f5a405d7d4..0a1114df236f 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1039,7 +1039,7 @@ xfs_growfs_rt(
 	if (!xfs_has_metadir(mp) && (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)))
 		return -EOPNOTSUPP;
 
-	if (xfs_has_reflink(mp) && in->extsize != 1)
+	if (xfs_has_reflink(mp) && !is_power_of_2(mp->m_sb.sb_rextsize))
 		return -EOPNOTSUPP;
 
 	nrblocks = in->newblocks;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index a70862935988..1c924fe18f3e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1682,13 +1682,23 @@ xfs_fs_fill_super(
 
 	if (xfs_has_reflink(mp)) {
 		/*
-		 * Reflink doesn't support rt extent sizes larger than a single
-		 * block because we would have to perform unshare-around for
-		 * rtext-unaligned write requests.
+		 * Reflink doesn't support pagecache pages that span multiple
+		 * realtime extents because iomap doesn't track subpage dirty
+		 * state.  This means that we cannot dirty all the pages
+		 * backing an rt extent without dirtying the adjoining rt
+		 * extents.  If those rt extents are shared and extend into
+		 * other pages, this leads to crazy write amplification.  The
+		 * VFS remap_range checks assume power-of-two block sizes, so
+		 * we don't support that either.
+		 *
+		 * Hence we only support rt extent sizes that are an integer
+		 * power of two because we know those will align with the page
+		 * size.
 		 */
-		if (xfs_has_realtime(mp) && mp->m_sb.sb_rextsize != 1) {
+		if (xfs_has_realtime(mp) &&
+		    !is_power_of_2(mp->m_sb.sb_rextsize)) {
 			xfs_alert(mp,
-	"reflink not compatible with realtime extent size %u!",
+	"reflink not compatible with non-power-of-2 realtime extent size %u!",
 					mp->m_sb.sb_rextsize);
 			error = -EINVAL;
 			goto out_filestream_unmount;
@@ -1707,7 +1717,6 @@ xfs_fs_fill_super(
 		}
 	}
 
-
 	error = xfs_mountfs(mp);
 	if (error)
 		goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index e3d4111ac387..de536012a04d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3616,6 +3616,7 @@ TRACE_EVENT(xfs_reflink_remap_blocks,
 		  __entry->dest_lblk)
 );
 DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_adjust_rtlen);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);