path: root/fs/xfs/xfs_file.c
author      Darrick J. Wong <djwong@kernel.org>  2022-07-14 11:16:15 -0700
committer   Darrick J. Wong <djwong@kernel.org>  2022-10-14 14:17:26 -0700
commit      bd4e97e23d67e5b9376ba3b6c621c71596950632 (patch)
tree        d8c14d5aaf49c223da4d5900db0f47808b9eb1b2 /fs/xfs/xfs_file.c
parent      b142ae8a645faa60219876671648e55bf8c6e620 (diff)
xfs: enable CoW when rt extent size is larger than 1 block
Copy on write encounters a major plot twist when the file being CoW'd lives on the realtime volume and the realtime extent size is larger than a single filesystem block. XFS can only unmap and remap full rt extents, which means that allocations are always done in units of full rt extents, and a request to unmap less than one extent is treated as a request to convert an extent to unwritten status.

This behavioral quirk is not compatible with the existing CoW mechanism, so we have to intercept every path through which files can be modified to ensure that we dirty an entire rt extent at once so that we can remap a full rt extent. Use the existing VFS unshare functions to dirty the page cache to set that up.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
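Every hook added below is gated on xfs_inode_needs_cow_around(), which is introduced elsewhere in this series. Purely for orientation, and as an assumption about its shape rather than a quote of the real helper, it presumably reduces to a test for a CoW-capable realtime inode whose rt extent size spans more than one fs block:

	/*
	 * Hypothetical sketch of the gating predicate; the real
	 * definition lives in another patch in this series.  CoW-around
	 * is only needed when CoW is possible at all, the file lives on
	 * the realtime volume, and one rt extent covers multiple fs
	 * blocks.
	 */
	static inline bool
	xfs_inode_needs_cow_around(
		struct xfs_inode	*ip)
	{
		return xfs_is_cow_inode(ip) &&
		       XFS_IS_REALTIME_INODE(ip) &&
		       ip->i_mount->m_sb.sb_rextsize > 1;
	}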
Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--  fs/xfs/xfs_file.c  181
1 file changed, 181 insertions(+), 0 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index d936e1b17e51..8cc2cfe40b18 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -32,6 +32,7 @@
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>
+#include <linux/buffer_head.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -396,6 +397,13 @@ restart:
goto restart;
}
+ if (xfs_inode_needs_cow_around(ip)) {
+ error = xfs_file_cow_around(ip, isize,
+ iocb->ki_pos - isize);
+ if (error)
+ return error;
+ }
+
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error)
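The cow-around of [isize, iocb->ki_pos) in this hunk exists because a write that starts beyond EOF forces the gap after the old EOF to be zeroed; if the rt extent containing the old EOF is shared, that zeroing would dirty only part of the extent, so the whole extent has to be unshared first.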
@@ -508,6 +516,7 @@ xfs_file_dio_write_aligned(
struct iov_iter *from)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
+ size_t count = iov_iter_count(from);
ssize_t ret;
ret = xfs_ilock_iocb(iocb, iolock);
@@ -518,6 +527,17 @@ xfs_file_dio_write_aligned(
goto out_unlock;
/*
+ * We can't unshare a partial rt extent yet, which means that we can't
+ * handle direct writes that are block-aligned but not rtextent-aligned.
+ */
+ if (xfs_inode_needs_cow_around(ip) &&
+ !xfs_is_falloc_aligned(ip, iocb->ki_pos, count)) {
+ trace_xfs_reflink_bounce_dio_write(iocb, from);
+ ret = -ENOTBLK;
+ goto out_unlock;
+ }
+
+ /*
* We don't need to hold the IOLOCK exclusively across the IO, so demote
* the iolock back to shared if we had to take the exclusive lock in
* xfs_file_write_checks() for other reasons.
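xfs_is_falloc_aligned() decides whether the -ENOTBLK fallback above fires. As a rough standalone model (assuming a byte-granular allocation unit; the in-kernel helper works from the inode's real allocation unit and uses 64-bit-safe division), the test is just two remainders:

	/*
	 * Illustrative model only: a direct write can proceed on the
	 * fast path iff both the starting offset and the byte count sit
	 * on allocation-unit boundaries; otherwise it bounces back to
	 * the buffered path via -ENOTBLK.
	 */
	static bool
	dio_is_alloc_unit_aligned(unsigned long long pos,
				  unsigned long long count,
				  unsigned int alloc_unit)
	{
		return pos % alloc_unit == 0 && count % alloc_unit == 0;
	}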
@@ -753,6 +773,82 @@ out:
return ret;
}
+/*
+ * Start the process of unsharing part of a file by dirtying the pagecache for
+ * any shared extents in the given region. Caller must ensure the range is
+ * within EOF.
+ */
+static inline int
+xfs_file_cow_around_bytes(
+ struct xfs_inode *ip,
+ xfs_off_t off,
+ xfs_off_t len)
+{
+ trace_xfs_file_cow_around(ip, off, len);
+
+ return iomap_file_unshare(VFS_I(ip), off, len,
+ &xfs_buffered_write_iomap_ops);
+}
+
+/*
+ * Dirty the pages on either side of a write request as needed to satisfy
+ * alignment requirements if we're going to perform a copy-write.
+ *
+ * This is only needed for realtime files when the rt extent size is larger
+ * than 1 fs block, because we don't allow a logical rt extent in a file to map
+ * to multiple physical rt extents. In other words, we can only map and unmap
+ * full rt extents. Note that page cache doesn't exist above EOF, so be
+ * careful to stay below EOF.
+ */
+int
+xfs_file_cow_around(
+ struct xfs_inode *ip,
+ loff_t pos,
+ long long int count)
+{
+ unsigned int extsize = xfs_inode_alloc_unitsize(ip);
+ loff_t next = pos + count;
+ loff_t isize = i_size_read(VFS_I(ip));
+ loff_t upos;
+ uint32_t mod;
+ int error;
+
+ if (xfs_is_falloc_aligned(ip, pos, count))
+ return 0;
+
+ inode_dio_wait(VFS_I(ip));
+
+ /* Unshare at the start of the extent. */
+ div_u64_rem(pos, extsize, &mod);
+ upos = pos - mod;
+ if (mod != 0 && upos < isize) {
+ loff_t ulen = extsize;
+
+ if (upos + ulen > isize)
+ ulen = isize - upos;
+
+ error = xfs_file_cow_around_bytes(ip, upos, ulen);
+ if (error)
+ return error;
+ }
+
+ /* Unshare at the end. */
+ div_u64_rem(next, extsize, &mod);
+ upos = next - mod;
+ if (mod != 0 && upos < isize) {
+ loff_t ulen = extsize;
+
+ if (upos + ulen > isize)
+ ulen = isize - upos;
+
+ error = xfs_file_cow_around_bytes(ip, upos, ulen);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
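To see what xfs_file_cow_around() actually dirties, here is a small userspace model of the start/end rounding above, with made-up numbers (an rt extent size of 65536 bytes and a 100000-byte file); it sketches only the arithmetic, not the locking or the iomap machinery:

	#include <stdio.h>
	#include <stdint.h>

	/*
	 * Model of the two unshare ranges computed above: round each end
	 * of the write down to an rt extent boundary, unshare one full
	 * extent there, and clamp to EOF since no page cache exists
	 * beyond it.
	 */
	static void
	cow_around(uint64_t pos, uint64_t count, uint64_t extsize,
		   uint64_t isize)
	{
		uint64_t ends[2] = { pos, pos + count };

		for (int i = 0; i < 2; i++) {
			uint64_t mod = ends[i] % extsize;
			uint64_t upos = ends[i] - mod;
			uint64_t ulen = extsize;

			if (mod == 0 || upos >= isize)
				continue;	/* aligned, or beyond EOF */
			if (upos + ulen > isize)
				ulen = isize - upos;
			printf("unshare [%llu, %llu)\n",
			       (unsigned long long)upos,
			       (unsigned long long)(upos + ulen));
		}
	}

	int main(void)
	{
		/*
		 * A 1000-byte write at offset 70000: both ends land in
		 * the second rt extent, so the model prints the same
		 * EOF-clamped range, [65536, 100000), twice, just as the
		 * kernel code issues two overlapping unshare calls.
		 */
		cow_around(70000, 1000, 65536, 100000);
		return 0;
	}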
STATIC ssize_t
xfs_file_write_iter(
struct kiocb *iocb,
@@ -774,6 +870,12 @@ xfs_file_write_iter(
if (IS_DAX(inode))
return xfs_file_dax_write(iocb, from);
+ if (xfs_inode_needs_cow_around(ip)) {
+ ret = xfs_file_cow_around(ip, iocb->ki_pos, ocount);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
@@ -929,6 +1031,13 @@ xfs_file_fallocate(
goto out_unlock;
if (mode & FALLOC_FL_PUNCH_HOLE) {
+ /* Unshare around the region to punch, if needed. */
+ if (xfs_inode_needs_cow_around(ip)) {
+ error = xfs_file_cow_around(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
+
error = xfs_free_file_space(ip, offset, len);
if (error)
goto out_unlock;
@@ -999,6 +1108,14 @@ xfs_file_fallocate(
trace_xfs_zero_file_space(ip);
+ /* Unshare around the region to zero, if needed. */
+ if (xfs_inode_needs_cow_around(ip)) {
+ error = xfs_file_cow_around(ip, offset,
+ len);
+ if (error)
+ goto out_unlock;
+ }
+
error = xfs_free_file_space(ip, offset, len);
if (error)
goto out_unlock;
@@ -1007,6 +1124,26 @@ xfs_file_fallocate(
round_down(offset, blksize);
offset = round_down(offset, blksize);
} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ /*
+ * Enlarge the unshare region to align to a full
+ * allocation unit.
+ */
+ if (xfs_inode_needs_cow_around(ip)) {
+ loff_t isize = i_size_read(VFS_I(ip));
+ unsigned int rextsize;
+ uint32_t mod;
+
+ rextsize = xfs_inode_alloc_unitsize(ip);
+ div_u64_rem(offset, rextsize, &mod);
+ offset -= mod;
+ len += mod;
+
+ div_u64_rem(offset + len, rextsize, &mod);
+ if (mod)
+ len += rextsize - mod;
+ if (offset + len > isize)
+ len = isize - offset;
+ }
error = xfs_reflink_unshare(ip, offset, len);
if (error)
goto out_unlock;
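A quick worked example of the enlargement above, assuming an rt extent size of 65536 bytes: an unshare request with offset = 100000 and len = 50000 rounds down to offset = 65536, len = 84464, then rounds the end up to len = 131072, so the widened range covers in full both rt extents that the original request touched; the final clamp trims len only when that widened range would reach past EOF.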
@@ -1340,6 +1477,35 @@ xfs_dax_fault(
}
#endif
+static int
+xfs_filemap_fault_around(
+ struct vm_fault *vmf,
+ struct inode *inode)
+{
+ struct folio *folio = page_folio(vmf->page);
+ loff_t pos;
+ ssize_t len;
+ int error;
+
+ if (!xfs_inode_needs_cow_around(XFS_I(inode)))
+ return 0;
+
+ folio_lock(folio);
+ len = folio_mkwrite_check_truncate(folio, inode);
+ if (len < 0) {
+ folio_unlock(folio);
+ return len;
+ }
+ pos = folio_pos(folio);
+ folio_unlock(folio);
+
+ error = xfs_file_cow_around(XFS_I(inode), pos, len);
+ if (error)
+ return error;
+
+ return 0;
+}
+
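The length handed to xfs_file_cow_around() here comes from folio_mkwrite_check_truncate(), which reports how many bytes of the folio still lie within i_size, or a negative errno if the folio has been truncated away entirely. A minimal model of that calculation, ignoring the truncation races the real helper also guards against:

	#include <errno.h>

	/*
	 * Model: bytes of a folio at file position 'pos' that lie below
	 * EOF.  Returns -EFAULT when the folio sits wholly beyond EOF,
	 * mirroring folio_mkwrite_check_truncate().
	 */
	static long long
	folio_bytes_in_file(long long pos, long long folio_size,
			    long long isize)
	{
		if (pos >= isize)
			return -EFAULT;
		if (pos + folio_size > isize)
			return isize - pos;	/* partial tail folio */
		return folio_size;
	}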
/*
* Locking for serialisation of IO during page faults. This results in a lock
* ordering of:
@@ -1377,7 +1543,21 @@ __xfs_filemap_fault(
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
if (write_fault) {
+ int error;
+
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+ /*
+ * Unshare all the blocks in this rt extent surrounding
+ * this page.
+ */
+ error = xfs_filemap_fault_around(vmf, inode);
+ if (error) {
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = block_page_mkwrite_return(error);
+ goto out;
+ }
+
ret = iomap_page_mkwrite(vmf,
&xfs_buffered_write_iomap_ops);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1386,6 +1566,7 @@ __xfs_filemap_fault(
}
}
+out:
if (write_fault)
sb_end_pagefault(inode->i_sb);
return ret;