summary refs log tree commit diff
diff options
context:
space:
mode:
author Darrick J. Wong <djwong@kernel.org> 2021-09-01 11:19:50 -0700
committer Darrick J. Wong <djwong@kernel.org> 2021-12-15 17:29:26 -0800
commit c9da9d633c0ccbd1933ae2f58aaa99e68a4e3e0d (patch)
tree 8dafe8df1fd8fd91b370cbd3a0e81153947e4537
parent 7ce91d85f50da47ca729f30c17d6db3883d71807 (diff)
xfs: enable CoW when rt extent size is larger than 1 block
Copy on write encounters a major plot twist when the file being CoW'd lives on the realtime volume and the realtime extent size is larger than a single filesystem block. XFS can only unmap and remap full rt extents, which means that allocations are always done in units of full rt extents, and a request to unmap less than one extent is treated as a request to convert an extent to unwritten status.

This behavioral quirk is not compatible with the existing CoW mechanism, so we have to intercept every path through which files can be modified to ensure that we dirty an entire rt extent at once so that we can remap a full rt extent. Use the existing VFS unshare functions to dirty the page cache to set that up.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
-rw-r--r-- fs/xfs/xfs_file.c    147
-rw-r--r-- fs/xfs/xfs_inode.h     9
-rw-r--r-- fs/xfs/xfs_iops.c     15
-rw-r--r-- fs/xfs/xfs_reflink.c  40
-rw-r--r-- fs/xfs/xfs_trace.h     1
5 files changed, 211 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index eb691d8d1391..f251965e8781 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -423,6 +423,13 @@ restart:
goto restart;
}
+ if (xfs_inode_needs_cow_around(ip)) {
+ error = xfs_file_cow_around(ip, isize,
+ iocb->ki_pos - isize);
+ if (error)
+ return error;
+ }
+
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
NULL, &xfs_buffered_write_iomap_ops);
@@ -536,6 +543,7 @@ xfs_file_dio_write_aligned(
struct iov_iter *from)
{
int iolock = XFS_IOLOCK_SHARED;
+ size_t count = iov_iter_count(from);
ssize_t ret;
ret = xfs_ilock_iocb(iocb, iolock);
@@ -546,6 +554,17 @@ xfs_file_dio_write_aligned(
goto out_unlock;
/*
+ * We can't unshare a partial rt extent yet, which means that we can't
+ * handle direct writes that are block-aligned but not rtextent-aligned.
+ */
+ if (xfs_inode_needs_cow_around(ip) &&
+ !xfs_is_falloc_aligned(ip, iocb->ki_pos, count)) {
+ trace_xfs_reflink_bounce_dio_write(iocb, from);
+ ret = -ENOTBLK;
+ goto out_unlock;
+ }
+
+ /*
* We don't need to hold the IOLOCK exclusively across the IO, so demote
* the iolock back to shared if we had to take the exclusive lock in
* xfs_file_write_checks() for other reasons.
@@ -784,6 +803,82 @@ out:
return ret;
}
+/*
+ * Start the process of unsharing part of a file by dirtying the pagecache for
+ * any shared extents in the given region. Caller must ensure the range is
+ * within EOF.
+ */
+static inline int
+xfs_file_cow_around_bytes(
+ struct xfs_inode *ip,
+ xfs_off_t off,
+ xfs_off_t len)
+{
+ trace_xfs_file_cow_around(ip, off, len);
+
+ return iomap_file_unshare(VFS_I(ip), off, len,
+ &xfs_buffered_write_iomap_ops);
+}
+
+/*
+ * Dirty the pages on either side of a write request as needed to satisfy
+ * alignment requirements if we're going to perform a copy-write.
+ *
+ * This is only needed for realtime files when the rt extent size is larger
+ * than 1 fs block, because we don't allow a logical rt extent in a file to map
+ * to multiple physical rt extents. In other words, we can only map and unmap
+ * full rt extents. Note that page cache doesn't exist above EOF, so be
+ * careful to stay below EOF.
+ */
+int
+xfs_file_cow_around(
+ struct xfs_inode *ip,
+ loff_t pos,
+ long long int count)
+{
+ unsigned int extsize = xfs_inode_alloc_unitsize(ip);
+ loff_t next = pos + count;
+ loff_t isize = i_size_read(VFS_I(ip));
+ loff_t upos;
+ uint32_t mod;
+ int error;
+
+ if (xfs_is_falloc_aligned(ip, pos, count))
+ return 0;
+
+ inode_dio_wait(VFS_I(ip));
+
+ /* Unshare at the start of the extent. */
+ div_u64_rem(pos, extsize, &mod);
+ upos = pos - mod;
+ if (mod != 0 && upos < isize) {
+ loff_t ulen = extsize;
+
+ if (upos + ulen > isize)
+ ulen = isize - upos;
+
+ error = xfs_file_cow_around_bytes(ip, upos, ulen);
+ if (error)
+ return error;
+ }
+
+ /* Unshare at the end. */
+ div_u64_rem(next, extsize, &mod);
+ upos = next - mod;
+ if (mod != 0 && upos < isize) {
+ loff_t ulen = extsize;
+
+ if (upos + ulen > isize)
+ ulen = isize - upos;
+
+ error = xfs_file_cow_around_bytes(ip, upos, ulen);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
STATIC ssize_t
xfs_file_write_iter(
struct kiocb *iocb,
@@ -807,6 +902,12 @@ xfs_file_write_iter(
if (IS_DAX(inode))
return xfs_file_dax_write(iocb, from);
+ if (xfs_inode_needs_cow_around(ip)) {
+ ret = xfs_file_cow_around(ip, iocb->ki_pos, ocount);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
@@ -944,6 +1045,13 @@ xfs_file_fallocate(
}
if (mode & FALLOC_FL_PUNCH_HOLE) {
+ /* Unshare around the region to punch, if needed. */
+ if (xfs_inode_needs_cow_around(ip)) {
+ error = xfs_file_cow_around(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
+
error = xfs_free_file_space(ip, offset, len);
if (error)
goto out_unlock;
@@ -1016,6 +1124,14 @@ xfs_file_fallocate(
trace_xfs_zero_file_space(ip);
+ /* Unshare around the region to zero, if needed. */
+ if (xfs_inode_needs_cow_around(ip)) {
+ error = xfs_file_cow_around(ip, offset,
+ len);
+ if (error)
+ goto out_unlock;
+ }
+
error = xfs_free_file_space(ip, offset, len);
if (error)
goto out_unlock;
@@ -1024,6 +1140,26 @@ xfs_file_fallocate(
round_down(offset, blksize);
offset = round_down(offset, blksize);
} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ /*
+ * Enlarge the unshare region to align to a full
+ * allocation unit.
+ */
+ if (xfs_inode_needs_cow_around(ip)) {
+ loff_t isize = i_size_read(VFS_I(ip));
+ unsigned int rextsize;
+ uint32_t mod;
+
+ rextsize = xfs_inode_alloc_unitsize(ip);
+ div_u64_rem(offset, rextsize, &mod);
+ offset -= mod;
+ len += mod;
+
+ div_u64_rem(offset + len, rextsize, &mod);
+ if (mod)
+ len += rextsize - mod;
+ if (offset + len > isize)
+ len = isize - offset;
+ }
error = xfs_reflink_unshare(ip, offset, len);
if (error)
goto out_unlock;
@@ -1379,6 +1515,16 @@ __xfs_filemap_fault(
} else {
if (write_fault) {
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+ /* Unshare around the region to zero, if needed. */
+ if (xfs_inode_needs_cow_around(ip) &&
+ xfs_file_cow_around(ip, page_offset(vmf->page),
+ page_size(vmf->page))) {
+ ret = VM_FAULT_SIGBUS;
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ goto out;
+ }
+
ret = iomap_page_mkwrite(vmf,
&xfs_buffered_write_iomap_ops);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1387,6 +1533,7 @@ __xfs_filemap_fault(
}
}
+out:
if (write_fault)
sb_end_pagefault(inode->i_sb);
return ret;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 7bf707010219..186d20fb5747 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -232,6 +232,12 @@ static inline bool xfs_inode_has_bigrtextents(struct xfs_inode *ip)
return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1;
}
+/* Decide if we need to unshare the blocks around a range that we're writing. */
+static inline bool xfs_inode_needs_cow_around(struct xfs_inode *ip)
+{
+ return xfs_is_reflink_inode(ip) && xfs_inode_has_bigrtextents(ip);
+}
+
/*
* Return the buftarg used for data allocations on a given inode.
*/
@@ -530,4 +536,7 @@ int xfs_icreate_dqalloc(const struct xfs_icreate_args *args,
struct xfs_dquot **udqpp, struct xfs_dquot **gdqpp,
struct xfs_dquot **pdqpp);
+int xfs_file_cow_around(struct xfs_inode *ip, loff_t pos,
+ long long int count);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1f7c275ec894..e4ccef86fe28 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -24,6 +24,7 @@
#include "xfs_ioctl.h"
#include "xfs_health.h"
#include "xfs_bmap.h"
+#include "xfs_reflink.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -935,10 +936,24 @@ xfs_setattr_size(
* truncate.
*/
if (newsize > oldsize) {
+ if (xfs_inode_needs_cow_around(ip)) {
+ error = xfs_file_cow_around(ip, oldsize,
+ newsize - oldsize);
+ if (error)
+ return error;
+ }
+
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = iomap_zero_range(inode, oldsize, newsize - oldsize,
&did_zeroing, &xfs_buffered_write_iomap_ops);
} else {
+ if (xfs_inode_needs_cow_around(ip)) {
+ error = xfs_file_cow_around(ip, newsize,
+ oldsize - newsize);
+ if (error)
+ return error;
+ }
+
/*
* iomap won't detect a dirty page over an unwritten block (or a
* cow block over a hole) and subsequently skips zeroing the
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 598861944112..f38cff39acd7 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -266,9 +266,26 @@ xfs_reflink_convert_cow_locked(
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got;
struct xfs_btree_cur *dummy_cur = NULL;
+ struct xfs_mount *mp = ip->i_mount;
int dummy_logflags;
int error = 0;
+ /*
+ * We can only remap full rt extents, so make sure that we convert the
+ * entire extent. The caller must ensure that this is either a direct
+ * write that's aligned to the rt extent size, or a buffered write for
+ * which we've dirtied extra pages to make this work properly.
+ */
+ if (xfs_inode_needs_cow_around(ip)) {
+ xfs_fileoff_t new_off;
+
+ new_off = rounddown_64(offset_fsb, mp->m_sb.sb_rextsize);
+ count_fsb += offset_fsb - new_off;
+ offset_fsb = new_off;
+
+ count_fsb = roundup_64(count_fsb, mp->m_sb.sb_rextsize);
+ }
+
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
return 0;
@@ -475,11 +492,21 @@ xfs_reflink_cancel_cow_blocks(
bool cancel_real)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_mount *mp = ip->i_mount;
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
bool isrt = XFS_IS_REALTIME_INODE(ip);
int error = 0;
+ /*
+ * Shrink the range that we're cancelling if they don't align to the
+ * realtime extent size, since we can only free full extents.
+ */
+ if (xfs_inode_needs_cow_around(ip)) {
+ offset_fsb = roundup_64(offset_fsb, mp->m_sb.sb_rextsize);
+ end_fsb = rounddown_64(end_fsb, mp->m_sb.sb_rextsize);
+ }
+
if (!xfs_inode_has_cow_data(ip))
return 0;
if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
@@ -779,6 +806,7 @@ xfs_reflink_end_cow(
xfs_off_t offset,
xfs_off_t count)
{
+ struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb;
xfs_fileoff_t end_fsb;
int error = 0;
@@ -789,6 +817,16 @@ xfs_reflink_end_cow(
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
/*
+ * Make sure the end is aligned with a rt extent (if desired), since
+ * the end of the range could be EOF. The _convert_cow function should
+ * have set us up to swap only full rt extents.
+ */
+ if (xfs_inode_needs_cow_around(ip)) {
+ offset_fsb = rounddown_64(offset_fsb, mp->m_sb.sb_rextsize);
+ end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
+ }
+
+ /*
* Walk forwards until we've remapped the I/O range. The loop function
* repeatedly cycles the ILOCK to allocate one transaction per remapped
* extent.
@@ -1625,7 +1663,7 @@ xfs_reflink_unshare(
inode_dio_wait(inode);
- error = iomap_file_unshare(inode, offset, len,
+ error = iomap_file_unshare(VFS_I(ip), offset, len,
&xfs_buffered_write_iomap_ops);
if (error)
goto out;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index c3e19f0e4e3c..1cc5c4380f9b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3675,6 +3675,7 @@ TRACE_EVENT(xfs_ioctl_clone,
/* unshare tracepoints */
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
+DEFINE_SIMPLE_IO_EVENT(xfs_file_cow_around);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
DEFINE_SIMPLE_IO_EVENT(xfs_rtfile_convert_unwritten);