path: root/fs/xfs/xfs_file.c
author      Darrick J. Wong <djwong@kernel.org>  2022-07-14 11:16:15 -0700
committer   Darrick J. Wong <djwong@kernel.org>  2022-10-14 14:17:26 -0700
commit      bd4e97e23d67e5b9376ba3b6c621c71596950632 (patch)
tree        d8c14d5aaf49c223da4d5900db0f47808b9eb1b2 /fs/xfs/xfs_file.c
parent      b142ae8a645faa60219876671648e55bf8c6e620 (diff)
xfs: enable CoW when rt extent size is larger than 1 block
Copy on write encounters a major plot twist when the file being CoW'd lives on the realtime volume and the realtime extent size is larger than a single filesystem block. XFS can only unmap and remap full rt extents, which means that allocations are always done in units of full rt extents, and a request to unmap less than one extent is treated as a request to convert an extent to unwritten status.

This behavioral quirk is not compatible with the existing CoW mechanism, so we have to intercept every path through which files can be modified to ensure that we dirty an entire rt extent at once so that we can remap a full rt extent. Use the existing VFS unshare functions to dirty the page cache to set that up.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
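Every hook added below is gated on xfs_inode_needs_cow_around(), which is introduced elsewhere in this series. Purely for orientation, and as an assumption about its shape rather than a quote of the real helper, it presumably reduces to a test for a CoW-capable realtime inode whose rt extent size spans more than one fs block:

	/*
	 * Hypothetical sketch of the gating predicate; the real
	 * definition lives in another patch in this series.  CoW-around
	 * is only needed when CoW is possible at all, the file lives on
	 * the realtime volume, and one rt extent covers multiple fs
	 * blocks.
	 */
	static inline bool
	xfs_inode_needs_cow_around(
		struct xfs_inode	*ip)
	{
		return xfs_is_cow_inode(ip) &&
		       XFS_IS_REALTIME_INODE(ip) &&
		       ip->i_mount->m_sb.sb_rextsize > 1;
	}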
Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--  fs/xfs/xfs_file.c  181
1 file changed, 181 insertions(+), 0 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index d936e1b17e51..8cc2cfe40b18 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -32,6 +32,7 @@
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>
+#include <linux/buffer_head.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -396,6 +397,13 @@ restart:
goto restart;
}
+ if (xfs_inode_needs_cow_around(ip)) {
+ error = xfs_file_cow_around(ip, isize,
+ iocb->ki_pos - isize);
+ if (error)
+ return error;
+ }
+
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error)
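The cow-around of [isize, iocb->ki_pos) in this hunk exists because a write that starts beyond EOF forces the gap after the old EOF to be zeroed; if the rt extent containing the old EOF is shared, that zeroing would dirty only part of the extent, so the whole extent has to be unshared first.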
@@ -508,6 +516,7 @@ xfs_file_dio_write_aligned(
struct iov_iter *from)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
+ size_t count = iov_iter_count(from);
ssize_t ret;
ret = xfs_ilock_iocb(iocb, iolock);
@@ -518,6 +527,17 @@ xfs_file_dio_write_aligned(
goto out_unlock;
/*
+ * We can't unshare a partial rt extent yet, which means that we can't
+ * handle direct writes that are block-aligned but not rtextent-aligned.
+ */
+ if (xfs_inode_needs_cow_around(ip) &&
+ !xfs_is_falloc_aligned(ip, iocb->ki_pos, count)) {
+ trace_xfs_reflink_bounce_dio_write(iocb, from);
+ ret = -ENOTBLK;
+ goto out_unlock;
+ }
+
+ /*
* We don't need to hold the IOLOCK exclusively across the IO, so demote
* the iolock back to shared if we had to take the exclusive lock in
* xfs_file_write_checks() for other reasons.
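xfs_is_falloc_aligned() decides whether the -ENOTBLK fallback above fires. As a rough standalone model (assuming a byte-granular allocation unit; the in-kernel helper works from the inode's real allocation unit and uses 64-bit-safe division), the test is just two remainders:

	/*
	 * Illustrative model only: a direct write can proceed on the
	 * fast path iff both the starting offset and the byte count sit
	 * on allocation-unit boundaries; otherwise it bounces back to
	 * the buffered path via -ENOTBLK.
	 */
	static bool
	dio_is_alloc_unit_aligned(unsigned long long pos,
				  unsigned long long count,
				  unsigned int alloc_unit)
	{
		return pos % alloc_unit == 0 && count % alloc_unit == 0;
	}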
@@ -753,6 +773,82 @@ out:
return ret;
}
+/*
+ * Start the process of unsharing part of a file by dirtying the pagecache for
+ * any shared extents in the given region. Caller must ensure the range is
+ * within EOF.
+ */
+static inline int
+xfs_file_cow_around_bytes(
+ struct xfs_inode *ip,
+ xfs_off_t off,
+ xfs_off_t len)
+{
+ trace_xfs_file_cow_around(ip, off, len);
+
+ return iomap_file_unshare(VFS_I(ip), off, len,
+ &xfs_buffered_write_iomap_ops);
+}
+
+/*
+ * Dirty the pages on either side of a write request as needed to satisfy
+ * alignment requirements if we're going to perform a copy-write.
+ *
+ * This is only needed for realtime files when the rt extent size is larger
+ * than 1 fs block, because we don't allow a logical rt extent in a file to map
+ * to multiple physical rt extents. In other words, we can only map and unmap
+ * full rt extents. Note that page cache doesn't exist above EOF, so be
+ * careful to stay below EOF.
+ */
+int
+xfs_file_cow_around(
+ struct xfs_inode *ip,
+ loff_t pos,
+ long long int count)
+{
+ unsigned int extsize = xfs_inode_alloc_unitsize(ip);
+ loff_t next = pos + count;
+ loff_t isize = i_size_read(VFS_I(ip));
+ loff_t upos;
+ uint32_t mod;
+ int error;
+
+ if (xfs_is_falloc_aligned(ip, pos, count))
+ return 0;
+
+ inode_dio_wait(VFS_I(ip));
+
+ /* Unshare at the start of the extent. */
+ div_u64_rem(pos, extsize, &mod);
+ upos = pos - mod;
+ if (mod != 0 && upos < isize) {
+ loff_t ulen = extsize;
+
+ if (upos + ulen > isize)
+ ulen = isize - upos;
+
+ error = xfs_file_cow_around_bytes(ip, upos, ulen);
+ if (error)
+ return error;
+ }
+
+ /* Unshare at the end. */
+ div_u64_rem(next, extsize, &mod);
+ upos = next - mod;
+ if (mod != 0 && upos < isize) {
+ loff_t ulen = extsize;
+
+ if (upos + ulen > isize)
+ ulen = isize - upos;
+
+ error = xfs_file_cow_around_bytes(ip, upos, ulen);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
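To see what xfs_file_cow_around() actually dirties, here is a small userspace model of the start/end rounding above, with made-up numbers (an rt extent size of 65536 bytes and a 100000-byte file); it sketches only the arithmetic, not the locking or the iomap machinery:

	#include <stdio.h>
	#include <stdint.h>

	/*
	 * Model of the two unshare ranges computed above: round each end
	 * of the write down to an rt extent boundary, unshare one full
	 * extent there, and clamp to EOF since no page cache exists
	 * beyond it.
	 */
	static void
	cow_around(uint64_t pos, uint64_t count, uint64_t extsize,
		   uint64_t isize)
	{
		uint64_t ends[2] = { pos, pos + count };

		for (int i = 0; i < 2; i++) {
			uint64_t mod = ends[i] % extsize;
			uint64_t upos = ends[i] - mod;
			uint64_t ulen = extsize;

			if (mod == 0 || upos >= isize)
				continue;	/* aligned, or beyond EOF */
			if (upos + ulen > isize)
				ulen = isize - upos;
			printf("unshare [%llu, %llu)\n",
			       (unsigned long long)upos,
			       (unsigned long long)(upos + ulen));
		}
	}

	int main(void)
	{
		/*
		 * A 1000-byte write at offset 70000: both ends land in
		 * the second rt extent, so the model prints the same
		 * EOF-clamped range, [65536, 100000), twice, just as the
		 * kernel code issues two overlapping unshare calls.
		 */
		cow_around(70000, 1000, 65536, 100000);
		return 0;
	}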
STATIC ssize_t
xfs_file_write_iter(
struct kiocb *iocb,
@@ -774,6 +870,12 @@ xfs_file_write_iter(
if (IS_DAX(inode))
return xfs_file_dax_write(iocb, from);
+ if (xfs_inode_needs_cow_around(ip)) {
+ ret = xfs_file_cow_around(ip, iocb->ki_pos, ocount);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
@@ -929,6 +1031,13 @@ xfs_file_fallocate(
goto out_unlock;
if (mode & FALLOC_FL_PUNCH_HOLE) {
+ /* Unshare around the region to punch, if needed. */
+ if (xfs_inode_needs_cow_around(ip)) {
+ error = xfs_file_cow_around(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
+
error = xfs_free_file_space(ip, offset, len);
if (error)
goto out_unlock;
@@ -999,6 +1108,14 @@ xfs_file_fallocate(
trace_xfs_zero_file_space(ip);
+ /* Unshare around the region to zero, if needed. */
+ if (xfs_inode_needs_cow_around(ip)) {
+ error = xfs_file_cow_around(ip, offset,
+ len);
+ if (error)
+ goto out_unlock;
+ }
+
error = xfs_free_file_space(ip, offset, len);
if (error)
goto out_unlock;
@@ -1007,6 +1124,26 @@ xfs_file_fallocate(
round_down(offset, blksize);
offset = round_down(offset, blksize);
} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ /*
+ * Enlarge the unshare region to align to a full
+ * allocation unit.
+ */
+ if (xfs_inode_needs_cow_around(ip)) {
+ loff_t isize = i_size_read(VFS_I(ip));
+ unsigned int rextsize;
+ uint32_t mod;
+
+ rextsize = xfs_inode_alloc_unitsize(ip);
+ div_u64_rem(offset, rextsize, &mod);
+ offset -= mod;
+ len += mod;
+
+ div_u64_rem(offset + len, rextsize, &mod);
+ if (mod)
+ len += rextsize - mod;
+ if (offset + len > isize)
+ len = isize - offset;
+ }
error = xfs_reflink_unshare(ip, offset, len);
if (error)
goto out_unlock;
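A quick worked example of the enlargement above, assuming an rt extent size of 65536 bytes: an unshare request with offset = 100000 and len = 50000 rounds down to offset = 65536, len = 84464, then rounds the end up to len = 131072, so the widened range covers in full both rt extents that the original request touched; the final clamp trims len only when that widened range would reach past EOF.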
@@ -1340,6 +1477,35 @@ xfs_dax_fault(
}
#endif
+static int
+xfs_filemap_fault_around(
+ struct vm_fault *vmf,
+ struct inode *inode)
+{
+ struct folio *folio = page_folio(vmf->page);
+ loff_t pos;
+ ssize_t len;
+ int error;
+
+ if (!xfs_inode_needs_cow_around(XFS_I(inode)))
+ return 0;
+
+ folio_lock(folio);
+ len = folio_mkwrite_check_truncate(folio, inode);
+ if (len < 0) {
+ folio_unlock(folio);
+ return len;
+ }
+ pos = folio_pos(folio);
+ folio_unlock(folio);
+
+ error = xfs_file_cow_around(XFS_I(inode), pos, len);
+ if (error)
+ return error;
+
+ return 0;
+}
+
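The length handed to xfs_file_cow_around() here comes from folio_mkwrite_check_truncate(), which reports how many bytes of the folio still lie within i_size, or a negative errno if the folio has been truncated away entirely. A minimal model of that calculation, ignoring the truncation races the real helper also guards against:

	#include <errno.h>

	/*
	 * Model: bytes of a folio at file position 'pos' that lie below
	 * EOF.  Returns -EFAULT when the folio sits wholly beyond EOF,
	 * mirroring folio_mkwrite_check_truncate().
	 */
	static long long
	folio_bytes_in_file(long long pos, long long folio_size,
			    long long isize)
	{
		if (pos >= isize)
			return -EFAULT;
		if (pos + folio_size > isize)
			return isize - pos;	/* partial tail folio */
		return folio_size;
	}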
/*
* Locking for serialisation of IO during page faults. This results in a lock
* ordering of:
@@ -1377,7 +1543,21 @@ __xfs_filemap_fault(
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
if (write_fault) {
+ int error;
+
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+ /*
+ * Unshare all the blocks in this rt extent surrounding
+ * this page.
+ */
+ error = xfs_filemap_fault_around(vmf, inode);
+ if (error) {
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = block_page_mkwrite_return(error);
+ goto out;
+ }
+
ret = iomap_page_mkwrite(vmf,
&xfs_buffered_write_iomap_ops);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1386,6 +1566,7 @@ __xfs_filemap_fault(
}
}
+out:
if (write_fault)
sb_end_pagefault(inode->i_sb);
return ret;