xfs: add a ->xchg_file_range handler

Add a function to handle file range exchange requests from the vfs. Signed-off-by: Darrick J. Wong <djwong@kernel.org>
author: Darrick J. Wong <djwong@kernel.org> 2021-01-05 17:45:09 -0800
committer: Darrick J. Wong <djwong@kernel.org> 2021-03-25 17:08:27 -0700
commit: fbb60ffbf097bdae349ddff66163f10acfa3fed0 (patch)
tree: 6d3f7cc486bd5b5cf44202a996a3fc5b9938d2b1
parent: 78181637324e5cc5aa7b324aa025689790586095 (diff)
6 files changed, 457 insertions, 0 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a007ca0711d9..84a29d01c896 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -24,6 +24,7 @@
 #include "xfs_pnfs.h"
 #include "xfs_iomap.h"
 #include "xfs_reflink.h"
+#include "xfs_xchgrange.h"
 
 #include <linux/falloc.h>
 #include <linux/backing-dev.h>
@@ -1179,6 +1180,53 @@ out_unlock:
 }
 
 STATIC int
+xfs_file_xchg_range(
+	struct file		*file1,
+	struct file		*file2,
+	struct file_xchg_range	*fxr)
+{
+	struct inode		*inode1 = file_inode(file1);
+	struct inode		*inode2 = file_inode(file2);
+	struct xfs_inode	*ip1 = XFS_I(inode1);
+	struct xfs_inode	*ip2 = XFS_I(inode2);
+	struct xfs_mount	*mp = ip1->i_mount;
+	unsigned int		priv_flags = 0;
+	int			ret;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/* Update cmtime if the fd/inode don't forbid it. */
+	if (likely(!(file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)))
+		priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME1;
+	if (likely(!(file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2)))
+		priv_flags |= XFS_XCHG_RANGE_UPD_CMTIME2;
+
+	/* Lock both files against IO */
+	ret = xfs_ilock2_io_mmap(ip1, ip2);
+	if (ret)
+		return ret;
+
+	/* Prepare and then exchange file contents. */
+	ret = xfs_xchg_range_prep(file1, file2, fxr);
+	if (ret)
+		goto out_unlock;
+
+	trace_xfs_file_xchg_range(ip1, fxr->file1_offset, fxr->length, ip2,
+			fxr->file2_offset);
+
+	ret = xfs_xchg_range(ip1, ip2, fxr, priv_flags);
+	if (ret)
+		goto out_unlock;
+
+out_unlock:
+	xfs_iunlock2_io_mmap(ip1, ip2);
+	if (ret)
+		trace_xfs_file_xchg_range_error(ip2, ret, _RET_IP_);
+	return ret;
+}
+
+STATIC int
 xfs_file_open(
 	struct inode	*inode,
 	struct file	*file)
@@ -1443,6 +1491,7 @@ const struct file_operations xfs_file_operations = {
 	.fallocate	= xfs_file_fallocate,
 	.fadvise	= xfs_file_fadvise,
 	.remap_file_range = xfs_file_remap_range,
+	.xchg_file_range = xfs_file_xchg_range,
 };
 
 const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 85287f764f4a..59706de3a9d0 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3838,3 +3838,16 @@ xfs_inode_count_blocks(
 	xfs_bmap_count_leaves(ifp, rblocks);
 	*dblocks = ip->i_d.di_nblocks - *rblocks;
 }
+
+/* Returns the size of fundamental allocation unit for a file, in bytes. */
+unsigned int
+xfs_inode_alloc_unitsize(
+	struct xfs_inode	*ip)
+{
+	unsigned int		blocks = 1;
+
+	if (XFS_IS_REALTIME_INODE(ip))
+		blocks = ip->i_mount->m_sb.sb_rextsize;
+
+	return XFS_FSB_TO_B(ip->i_mount, blocks);
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1eebd5d03d01..81c7c695fb92 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,6 +500,7 @@ void xfs_end_io(struct work_struct *work);
 
 int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
 void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip);
 
 void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_filblks_t *dblocks, xfs_filblks_t *rblocks);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index dc9cc3c67e58..f4e739e81594 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3319,6 +3319,10 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
 DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
 DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
 DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
+
+/* swapext tracepoints */
+DEFINE_DOUBLE_IO_EVENT(xfs_file_xchg_range);
+DEFINE_INODE_ERROR_EVENT(xfs_file_xchg_range_error);
 DEFINE_INODE_IREC_EVENT(xfs_swapext_extent1);
 DEFINE_INODE_IREC_EVENT(xfs_swapext_extent2);
 DEFINE_ITRUNC_EVENT(xfs_swapext_update_inode_size);
diff --git a/fs/xfs/xfs_xchgrange.c b/fs/xfs/xfs_xchgrange.c
index 5e7098d5838e..877ef9f3eb64 100644
--- a/fs/xfs/xfs_xchgrange.c
+++ b/fs/xfs/xfs_xchgrange.c
@@ -13,8 +13,15 @@
 #include "xfs_defer.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
+#include "xfs_quota.h"
+#include "xfs_bmap_util.h"
+#include "xfs_reflink.h"
+#include "xfs_trace.h"
 #include "xfs_swapext.h"
 #include "xfs_xchgrange.h"
+#include "xfs_sb.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
 
 /* Lock (and optionally join) two inodes for a file range exchange. */
 void
@@ -64,3 +71,375 @@ xfs_xchg_range_estimate(
 	xfs_xchg_range_iunlock(req->ip1, req->ip2);
 	return error;
 }
+
+/* Prepare two files to have their data exchanged. */
+int
+xfs_xchg_range_prep(
+	struct file		*file1,
+	struct file		*file2,
+	struct file_xchg_range	*fxr)
+{
+	struct xfs_inode	*ip1 = XFS_I(file_inode(file1));
+	struct xfs_inode	*ip2 = XFS_I(file_inode(file2));
+	int			ret;
+
+	/* Verify both files are either real-time or non-realtime */
+	if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
+		return -EINVAL;
+
+	/*
+	 * The alignment checks in the VFS helpers cannot deal with allocation
+	 * units that are not powers of 2.  This can happen with the realtime
+	 * volume if the extent size is set.  Note that alignment checks are
+	 * skipped if FULL_FILES is set.
+	 */
+	if (!(fxr->flags & FILE_XCHG_RANGE_FULL_FILES) &&
+	    !is_power_of_2(xfs_inode_alloc_unitsize(ip2)))
+		return -EOPNOTSUPP;
+
+	ret = generic_xchg_file_range_prep(file1, file2, fxr,
+			xfs_inode_alloc_unitsize(ip2));
+	if (ret)
+		return ret;
+
+	/* Attach dquots to both inodes before changing block maps. */
+	ret = xfs_qm_dqattach(ip2);
+	if (ret)
+		return ret;
+	ret = xfs_qm_dqattach(ip1);
+	if (ret)
+		return ret;
+
+	/* Flush the relevant ranges of both files. */
+	ret = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
+	if (ret)
+		return ret;
+	return xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
+}
+
+#define QRETRY_IP1	(0x1)
+#define QRETRY_IP2	(0x2)
+
+/*
+ * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
+ * this if quota enforcement is disabled or if both inodes' dquots are the
+ * same.  The qretry structure must be initialized to zeroes before the first
+ * call to this function.
+ */
+STATIC int
+xfs_xchg_range_reserve_quota(
+	struct xfs_trans		*tp,
+	const struct xfs_swapext_req	*req,
+	const struct xfs_swapext_res	*res,
+	unsigned int			*qretry)
+{
+	int64_t				ddelta, rdelta;
+	int				ip1_error = 0;
+	int				error;
+
+	/*
+	 * Don't bother with a quota reservation if we're not enforcing them
+	 * or the two inodes have the same dquots.
+	 */
+	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
+	    (req->ip1->i_udquot == req->ip2->i_udquot &&
+	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
+	     req->ip1->i_pdquot == req->ip2->i_pdquot))
+		return 0;
+
+	*qretry = 0;
+
+	/*
+	 * For each file, compute the net gain in the number of regular blocks
+	 * that will be mapped into that file and reserve that much quota.  The
+	 * quota counts must be able to absorb at least that much space.
+	 */
+	ddelta = res->ip2_bcount - res->ip1_bcount;
+	rdelta = res->ip2_rtbcount - res->ip1_rtbcount;
+	if (ddelta > 0 || rdelta > 0) {
+		error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
+				ddelta > 0 ? ddelta : 0,
+				rdelta > 0 ? rdelta : 0,
+				false);
+		if (error == -EDQUOT || error == -ENOSPC) {
+			/*
+			 * Save this error and see what happens if we try to
+			 * reserve quota for ip2.  Then report both.
+			 */
+			*qretry |= QRETRY_IP1;
+			ip1_error = error;
+			error = 0;
+		}
+		if (error)
+			return error;
+	}
+	if (ddelta < 0 || rdelta < 0) {
+		error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
+				ddelta < 0 ? -ddelta : 0,
+				rdelta < 0 ? -rdelta : 0,
+				false);
+		if (error == -EDQUOT || error == -ENOSPC)
+			*qretry |= QRETRY_IP2;
+		if (error)
+			return error;
+	}
+	if (ip1_error)
+		return ip1_error;
+
+	/*
+	 * For each file, forcibly reserve the gross gain in mapped blocks so
+	 * that we don't trip over any quota block reservation assertions.
+	 * We must reserve the gross gain because the quota code subtracts from
+	 * bcount the number of blocks that we unmap; it does not add that
+	 * quantity back to the quota block reservation.
+	 */
+	error = xfs_trans_reserve_quota_nblks(tp, req->ip1, res->ip1_bcount,
+			res->ip1_rtbcount, true);
+	if (error)
+		return error;
+
+	return xfs_trans_reserve_quota_nblks(tp, req->ip2, res->ip2_bcount,
+			res->ip2_rtbcount, true);
+}
+
+/*
+ * Get permission to use log-assisted atomic exchange of file extents.
+ *
+ * Callers must not be running any transactions, and they must release the
+ * permission either (1) by calling xlog_drop_incompat_feat when they're done,
+ * or (2) by setting XFS_TRANS_LOG_INCOMPAT on a transaction.
+ */
+STATIC int
+xfs_swapext_enable_log_assist(
+	struct xfs_mount	*mp,
+	bool			force,
+	bool			*enabled)
+{
+	int			error = 0;
+
+	/*
+	 * Protect ourselves from an idle log clearing the atomic swapext
+	 * log incompat feature bit.
+	 */
+	xlog_use_incompat_feat(mp->m_log);
+	*enabled = true;
+
+	/* Already enabled?  We're good to go. */
+	if (xfs_sb_version_hasatomicswap(&mp->m_sb))
+		return 0;
+
+	/*
+	 * If the caller doesn't /require/ log-assisted swapping, drop the
+	 * feature protection and exit.  They'll just have to use something
+	 * else.
+	 */
+	if (!force)
+		goto err;
+
+	/*
+	 * Caller requires log-assisted swapping but the fs feature set isn't
+	 * rich enough.  We have to bail out here.
+	 */
+	if (!xfs_sb_version_canatomicswap(&mp->m_sb)) {
+		error = -EOPNOTSUPP;
+		goto err;
+	}
+
+	/* Enable log-assisted extent swapping. */
+	xfs_warn(mp,
+ "EXPERIMENTAL atomic file range swap feature added. Use at your own risk!");
+	error = xfs_add_incompat_log_feature(mp,
+			XFS_SB_FEAT_INCOMPAT_LOG_ATOMIC_SWAP);
+	if (error)
+		goto err;
+	return 0;
+err:
+	xlog_drop_incompat_feat(mp->m_log);
+	*enabled = false;
+	return error;
+}
+
+/* Exchange the contents of two files. */
+int
+xfs_xchg_range(
+	struct xfs_inode		*ip1,
+	struct xfs_inode		*ip2,
+	const struct file_xchg_range	*fxr,
+	unsigned int			private_flags)
+{
+	struct xfs_swapext_req		req = {
+		.ip1			= ip1,
+		.ip2			= ip2,
+		.whichfork		= XFS_DATA_FORK,
+	};
+	struct xfs_swapext_res		res;
+	struct xfs_mount		*mp = ip1->i_mount;
+	struct xfs_trans		*tp;
+	loff_t				req_len;
+	unsigned int			qretry;
+	bool				retried = false;
+	bool				use_atomic = false;
+	int				error;
+
+	/* We don't support whole-fork swapping yet. */
+	if (!xfs_sb_version_canatomicswap(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	if (fxr->flags & FILE_XCHG_RANGE_TO_EOF)
+		req.flags |= XFS_SWAPEXT_SET_SIZES;
+	if (fxr->flags & FILE_XCHG_RANGE_SKIP_FILE1_HOLES)
+		req.flags |= XFS_SWAPEXT_SKIP_FILE1_HOLES;
+
+	req.startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset);
+	req.startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset);
+
+	/*
+	 * Round the request length up to the nearest fundamental unit of
+	 * allocation.  The prep function already checked that the request
+	 * offsets and length in @fxr are safe to round up.
+	 */
+	req_len = round_up(fxr->length, xfs_inode_alloc_unitsize(ip2));
+	req.blockcount = XFS_B_TO_FSB(mp, req_len);
+
+	/*
+	 * Cancel CoW fork preallocations for the ranges of both files.  The
+	 * prep function should have flushed all the dirty data, so the only
+	 * extents remaining should be speculative.
+	 */
+	if (xfs_inode_has_cow_data(ip1)) {
+		error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
+				fxr->length, true);
+		if (error)
+			return error;
+	}
+
+	if (xfs_inode_has_cow_data(ip2)) {
+		error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
+				fxr->length, true);
+		if (error)
+			return error;
+	}
+
+	error = xfs_xchg_range_estimate(&req, &res);
+	if (error)
+		return error;
+
+	error = xfs_swapext_enable_log_assist(mp,
+			!(fxr->flags & FILE_XCHG_RANGE_NONATOMIC),
+			&use_atomic);
+	if (error)
+		return error;
+
+retry:
+	/* Allocate the transaction, lock the inodes, and join them. */
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, res.resblks, 0,
+			XFS_TRANS_RES_FDBLKS, &tp);
+	if (error)
+		goto out_unlock_feat;
+
+	xfs_xchg_range_ilock(tp, ip1, ip2);
+
+	trace_xfs_swap_extent_before(ip2, 0);
+	trace_xfs_swap_extent_before(ip1, 1);
+
+	/*
+	 * Do all of the inputs checking that we can only do once we've taken
+	 * both ILOCKs.
+	 */
+	error = generic_xchg_file_range_check_fresh(VFS_I(ip1), VFS_I(ip2),
+			fxr);
+	if (error)
+		goto out_trans_cancel;
+
+	error = xfs_swapext_check_extents(mp, &req);
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * Reserve ourselves some quota if any of them are in enforcing mode.
+	 * In theory we only need enough to satisfy the change in the number
+	 * of blocks between the two ranges being remapped.
+	 */
+	error = xfs_xchg_range_reserve_quota(tp, &req, &res, &qretry);
+	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+		xfs_trans_cancel(tp);
+		xfs_xchg_range_iunlock(ip1, ip2);
+		if (qretry & QRETRY_IP1)
+			xfs_blockgc_free_quota(ip1, 0);
+		if (qretry & QRETRY_IP2)
+			xfs_blockgc_free_quota(ip2, 0);
+		retried = true;
+		goto retry;
+	}
+	if (error)
+		goto out_trans_cancel;
+
+	/* If we got this far on a dry run, all parameters are ok. */
+	if (fxr->flags & FILE_XCHG_RANGE_DRY_RUN)
+		goto out_trans_cancel;
+
+	/*
+	 * If we got permission to use the atomic extent swap feature, put the
+	 * transaction in charge of releasing that permission.
+	 */
+	if (use_atomic) {
+		tp->t_flags |= XFS_TRANS_LOG_INCOMPAT;
+		use_atomic = false;
+	}
+
+	/* Update the mtime and ctime of both files. */
+	if (private_flags & XFS_XCHG_RANGE_UPD_CMTIME1)
+		xfs_trans_ichgtime(tp, ip1,
+				XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	if (private_flags & XFS_XCHG_RANGE_UPD_CMTIME2)
+		xfs_trans_ichgtime(tp, ip2,
+				XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	/* Exchange the file contents by swapping the block mappings. */
+	error = xfs_swapext(&tp, &req);
+	if (error)
+		goto out_trans_cancel;
+
+	/*
+	 * If the caller wanted us to exchange the contents of two complete
+	 * files of unequal length, exchange the incore sizes now.  This should
+	 * be safe because we flushed both files' page caches and moved all the
+	 * post-eof extents, so there should not be anything to zero.
+	 */
+	if (fxr->flags & FILE_XCHG_RANGE_TO_EOF) {
+		loff_t	temp;
+
+		temp = i_size_read(VFS_I(ip2));
+		i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
+		i_size_write(VFS_I(ip1), temp);
+	}
+
+	/* Relog the inodes to keep transactions moving forward. */
+	xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
+	xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
+
+	/*
+	 * Force the log to persist metadata updates if the caller or the
+	 * administrator requires this.  The VFS prep function already flushed
+	 * the relevant parts of the page cache.
+	 */
+	if ((mp->m_flags & XFS_MOUNT_WSYNC) ||
+	    (fxr->flags & FILE_XCHG_RANGE_FSYNC))
+		xfs_trans_set_sync(tp);
+
+	error = xfs_trans_commit(tp);
+
+	trace_xfs_swap_extent_after(ip2, 0);
+	trace_xfs_swap_extent_after(ip1, 1);
+
+out_unlock:
+	xfs_xchg_range_iunlock(ip1, ip2);
+out_unlock_feat:
+	if (use_atomic)
+		xlog_drop_incompat_feat(mp->m_log);
+	return error;
+
+out_trans_cancel:
+	xfs_trans_cancel(tp);
+	goto out_unlock;
+}
diff --git a/fs/xfs/xfs_xchgrange.h b/fs/xfs/xfs_xchgrange.h
index ddda2bfb6f4b..cca297034689 100644
--- a/fs/xfs/xfs_xchgrange.h
+++ b/fs/xfs/xfs_xchgrange.h
@@ -15,5 +15,16 @@ void xfs_xchg_range_iunlock(struct xfs_inode *ip1, struct xfs_inode *ip2);
 
 int xfs_xchg_range_estimate(const struct xfs_swapext_req *req,
 		struct xfs_swapext_res *res);
+int xfs_xchg_range_prep(struct file *file1, struct file *file2,
+		struct file_xchg_range *fxr);
+
+/* Update ip1's change and mod time. */
+#define XFS_XCHG_RANGE_UPD_CMTIME1	(1 << 0)
+
+/* Update ip2's change and mod time. */
+#define XFS_XCHG_RANGE_UPD_CMTIME2	(1 << 1)
+
+int xfs_xchg_range(struct xfs_inode *ip1, struct xfs_inode *ip2,
+		const struct file_xchg_range *fxr, unsigned int private_flags);
 
 #endif /* __XFS_XCHGRANGE_H__ */
author	Darrick J. Wong <djwong@kernel.org>	2021-01-05 17:45:09 -0800
committer	Darrick J. Wong <djwong@kernel.org>	2021-03-25 17:08:27 -0700
commit	fbb60ffbf097bdae349ddff66163f10acfa3fed0 (patch)
tree	6d3f7cc486bd5b5cf44202a996a3fc5b9938d2b1
parent	78181637324e5cc5aa7b324aa025689790586095 (diff)