// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2021 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong
 *
 * The xfs_swap_extent_* functions are:
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * Copyright (c) 2012 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_quota.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include "xfs_trace.h"
#include "xfs_swapext.h"
#include "xfs_xchgrange.h"
#include "xfs_sb.h"
#include "xfs_icache.h"
#include "xfs_log.h"
#include "xfs_rtalloc.h"

/* Lock (and optionally join) two inodes for a file range exchange. */
void
xfs_xchg_range_ilock(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	if (ip1 != ip2)
		xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
				    ip2, XFS_ILOCK_EXCL);
	else
		xfs_ilock(ip1, XFS_ILOCK_EXCL);
	if (tp) {
		xfs_trans_ijoin(tp, ip1, 0);
		if (ip2 != ip1)
			xfs_trans_ijoin(tp, ip2, 0);
	}
}

/* Unlock two inodes after a file range exchange operation. */
void
xfs_xchg_range_iunlock(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	if (ip2 != ip1)
		xfs_iunlock(ip2, XFS_ILOCK_EXCL);
	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
}

/*
 * Estimate the resource requirements to exchange file contents between the
 * two files.  The caller is required to hold the IOLOCK and the MMAPLOCK and
 * to have flushed both inodes' pagecache and active direct-ios.
 */
int
xfs_xchg_range_estimate(
	const struct xfs_swapext_req	*req,
	struct xfs_swapext_res		*res)
{
	int				error;

	xfs_xchg_range_ilock(NULL, req->ip1, req->ip2);
	error = xfs_swapext_estimate(req, res);
	xfs_xchg_range_iunlock(req->ip1, req->ip2);
	return error;
}
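
/*
 * Example (illustrative only, not an API contract): a caller that already
 * holds the IOLOCK and MMAPLOCK of both inodes might size its transaction
 * from the estimate like so, where @req describes the proposed exchange:
 *
 *	struct xfs_swapext_res	res;
 *
 *	error = xfs_xchg_range_estimate(&req, &res);
 *	if (error)
 *		return error;
 *	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, res.resblks,
 *			0, 0, &tp);
 *
 * xfs_trans_alloc() itself may still return -ENOSPC if the block
 * reservation cannot be satisfied.
 */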
/*
 * We need to check that the format of the data fork in the temporary inode
 * is valid for the target inode before doing the swap.  This is not a
 * problem with attr1 because of the fixed fork offset, but attr2 has a
 * dynamically sized data fork whose size depends on how much space the
 * attribute fork consumes, so we can end up with invalid formats on the
 * target inode.
 *
 * E.g. the target has space for 7 extents in extent format and the temp
 * inode only has space for 6.  If we defragment down to 7 extents, then the
 * tmp format is a btree, but when swapped it needs to be in extent format.
 * Hence we can't just blindly swap data forks on attr2 filesystems.
 *
 * Note that we check the swap in both directions so that we don't end up
 * with a corrupt temporary inode, either.
 *
 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
 * inode will prevent this situation from occurring, so all we do here is
 * reject and log the attempt.  Basically we are putting the responsibility
 * on userspace to get this right.
 */
STATIC int
xfs_swap_extents_check_format(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip)	/* tmp inode */
{
	struct xfs_ifork	*ifp = &ip->i_df;
	struct xfs_ifork	*tifp = &tip->i_df;

	/* User/group/project quota ids must match if quotas are enforced. */
	if (XFS_IS_QUOTA_ON(ip->i_mount) &&
	    (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
	     !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
	     ip->i_projid != tip->i_projid))
		return -EINVAL;

	/* Should never get a local format */
	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
	    tifp->if_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	/*
	 * If the target inode has fewer extents than the temporary inode,
	 * why did userspace call us?
	 */
	if (ifp->if_nextents < tifp->if_nextents)
		return -EINVAL;

	/*
	 * If we have to use the (expensive) rmap swap method, we can
	 * handle any number of extents and any format.
	 */
	if (xfs_has_rmapbt(ip->i_mount))
		return 0;

	/*
	 * If the target inode is in extent form and the temp inode is in
	 * btree form then we will end up with the target inode in the wrong
	 * format as we already know there are fewer extents in the temp
	 * inode.
	 */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_format == XFS_DINODE_FMT_BTREE)
		return -EINVAL;

	/* Check temp in extent form to max in target */
	if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
		return -EINVAL;

	/* Check target in extent form to max in temp */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
		return -EINVAL;

	/*
	 * If we are in a btree format, check that the temp root block will
	 * fit in the target and that it has enough extents to be in btree
	 * format in the target.
	 *
	 * Note that we have to be careful to allow btree->extent conversions
	 * (a common defrag case) which will occur when the temp inode is in
	 * extent format...
	 */
	if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
		if (XFS_IFORK_Q(ip) &&
		    XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip))
			return -EINVAL;
		if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
			return -EINVAL;
	}

	/* Reciprocal target->temp btree format checks */
	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
		if (XFS_IFORK_Q(tip) &&
		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
			return -EINVAL;
		if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
			return -EINVAL;
	}

	return 0;
}

/*
 * Fix up the owners of the bmbt blocks to refer to the current inode.  The
 * change owner scan attempts to order all modified buffers in the current
 * transaction.  In the event of ordered buffer failure, the offending buffer
 * is physically logged as a fallback and the scan returns -EAGAIN.  We must
 * roll the transaction in this case to replenish the fallback log
 * reservation and restart the scan.  This process repeats until the scan
 * completes.
 */
static int
xfs_swap_change_owner(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	struct xfs_inode	*tmpip)
{
	int			error;
	struct xfs_trans	*tp = *tpp;

	do {
		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
				ip->i_ino, NULL);
		/* success or fatal error */
		if (error != -EAGAIN)
			break;

		error = xfs_trans_roll(tpp);
		if (error)
			break;
		tp = *tpp;

		/*
		 * Redirty both inodes so they can relog and keep the log
		 * tail moving forward.
		 */
		xfs_trans_ijoin(tp, ip, 0);
		xfs_trans_ijoin(tp, tmpip, 0);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
	} while (true);

	return error;
}
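
/*
 * The retry loop above is an instance of a common XFS idiom: a scan that
 * can exhaust its log reservation returns -EAGAIN, the caller rolls to a
 * fresh transaction, and the scan restarts from where it left off.  A
 * minimal sketch of the pattern (the scan function is hypothetical, for
 * illustration only):
 *
 *	do {
 *		error = scan(tp, ...);
 *		if (error != -EAGAIN)
 *			break;
 *		error = xfs_trans_roll(&tp);
 *		if (error)
 *			break;
 *		// rejoin and redirty items so they follow the new tp
 *	} while (true);
 */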
/* Swap the extents of two files by swapping data forks. */
STATIC int
xfs_swap_extent_forks(
	struct xfs_trans	**tpp,
	struct xfs_swapext_req	*req)
{
	struct xfs_inode	*ip = req->ip1;
	struct xfs_inode	*tip = req->ip2;
	xfs_filblks_t		aforkblks = 0;
	xfs_filblks_t		taforkblks = 0;
	xfs_extnum_t		junk;
	uint64_t		tmp;
	unsigned int		reflink_state;
	int			src_log_flags = XFS_ILOG_CORE;
	int			target_log_flags = XFS_ILOG_CORE;
	int			error;

	reflink_state = xfs_swapext_reflink_prep(req);

	/*
	 * Count the number of extended attribute blocks
	 */
	if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 &&
	    ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(*tpp, ip, XFS_ATTR_FORK, &junk,
				&aforkblks);
		if (error)
			return error;
	}
	if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 &&
	    tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(*tpp, tip, XFS_ATTR_FORK, &junk,
				&taforkblks);
		if (error)
			return error;
	}

	/*
	 * Btree format (v3) inodes have the inode number stamped in the bmbt
	 * block headers.  We can't start changing the bmbt blocks until the
	 * inode owner change is logged so recovery does the right thing in
	 * the event of a crash.  Set the owner change log flags now and
	 * leave the bmbt scan as the last step.
	 */
	if (xfs_has_v3inodes(ip->i_mount)) {
		if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			target_log_flags |= XFS_ILOG_DOWNER;
		if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			src_log_flags |= XFS_ILOG_DOWNER;
	}

	/*
	 * Swap the data forks of the inodes
	 */
	swap(ip->i_df, tip->i_df);

	/*
	 * Fix the on-disk inode values
	 */
	tmp = (uint64_t)ip->i_nblocks;
	ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
	tip->i_nblocks = tmp + taforkblks - aforkblks;

	/*
	 * The extents in the source inode could still contain speculative
	 * preallocation beyond EOF (e.g. the file is open but not modified
	 * while defrag is in progress).  In that case, we need to copy over
	 * the number of delalloc blocks the data fork in the source inode is
	 * tracking beyond EOF so that when the fork is truncated away when
	 * the temporary inode is unlinked we don't underrun the
	 * i_delayed_blks counter on that inode.
	 */
	ASSERT(tip->i_delayed_blks == 0);
	tip->i_delayed_blks = ip->i_delayed_blks;
	ip->i_delayed_blks = 0;

	switch (ip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		src_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
		       (src_log_flags & XFS_ILOG_DOWNER));
		src_log_flags |= XFS_ILOG_DBROOT;
		break;
	}

	switch (tip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		target_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		target_log_flags |= XFS_ILOG_DBROOT;
		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
		       (target_log_flags & XFS_ILOG_DOWNER));
		break;
	}

	xfs_swapext_reflink_finish(*tpp, req, reflink_state);

	xfs_trans_log_inode(*tpp, ip, src_log_flags);
	xfs_trans_log_inode(*tpp, tip, target_log_flags);

	/*
	 * The extent forks have been swapped, but crc=1,rmapbt=0
	 * filesystems have inode number owner values in the bmbt blocks
	 * that still refer to the old inode.  Scan each bmbt to fix up the
	 * owner values with the inode number of the current inode.
	 */
	if (src_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(tpp, ip, tip);
		if (error)
			return error;
	}
	if (target_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(tpp, tip, ip);
		if (error)
			return error;
	}

	return 0;
}
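
/*
 * Worked example for the i_nblocks fixup above: suppose ip owned 100 blocks
 * in total (90 data + 10 attr) and tip owned 40 (38 data + 2 attr).  After
 * swapping only the data forks, ip should own tip's old data blocks plus
 * its own attr blocks: 38 + 10 = 48, which is tip's old total (40) minus
 * tip's attr blocks (taforkblks = 2) plus ip's attr blocks (aforkblks =
 * 10).  That is exactly ip->i_nblocks = tip->i_nblocks - taforkblks +
 * aforkblks, and symmetrically tip ends up with 100 + 2 - 10 = 92 blocks.
 */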
/*
 * Check the alignment of an exchange request when the allocation unit size
 * isn't a power of two.  The VFS helpers use (fast) bitmask-based alignment
 * checks, but here we have to use slow long division.
 */
static int
xfs_xchg_range_check_rtalign(
	struct xfs_inode		*ip1,
	struct xfs_inode		*ip2,
	const struct file_xchg_range	*fxr)
{
	struct xfs_mount	*mp = ip1->i_mount;
	uint32_t		rextbytes;
	uint64_t		length = fxr->length;
	uint64_t		blen;
	loff_t			size1, size2;

	rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
	size1 = i_size_read(VFS_I(ip1));
	size2 = i_size_read(VFS_I(ip2));

	/* The start of both ranges must be aligned to a rt extent. */
	if (!isaligned_64(fxr->file1_offset, rextbytes) ||
	    !isaligned_64(fxr->file2_offset, rextbytes))
		return -EINVAL;

	if (fxr->flags & FILE_XCHG_RANGE_TO_EOF)
		length = max_t(int64_t, size1 - fxr->file1_offset,
					size2 - fxr->file2_offset);

	/*
	 * If the user wanted us to exchange up to the infile's EOF, round up
	 * to the next rt extent boundary for this check.  Do the same for
	 * the outfile.
	 *
	 * Otherwise, reject the range length if it's not rt extent aligned.
	 * We already confirmed the starting offsets' rt extent block
	 * alignment.
	 */
	if (fxr->file1_offset + length == size1)
		blen = roundup_64(size1, rextbytes) - fxr->file1_offset;
	else if (fxr->file2_offset + length == size2)
		blen = roundup_64(size2, rextbytes) - fxr->file2_offset;
	else if (!isaligned_64(length, rextbytes))
		return -EINVAL;
	else
		blen = length;

	/* Don't allow overlapped exchanges within the same file. */
	if (ip1 == ip2 &&
	    fxr->file2_offset + blen > fxr->file1_offset &&
	    fxr->file1_offset + blen > fxr->file2_offset)
		return -EINVAL;

	/*
	 * Ensure that we don't exchange a partial EOF rt extent into the
	 * middle of another file.
	 */
	if (isaligned_64(length, rextbytes))
		return 0;

	blen = length;
	if (fxr->file2_offset + length < size2)
		blen = rounddown_64(blen, rextbytes);

	if (fxr->file1_offset + blen < size1)
		blen = rounddown_64(blen, rextbytes);

	return blen == length ? 0 : -EINVAL;
}
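
/*
 * Worked example for the alignment rules above, assuming a realtime extent
 * size of 3 blocks of 4096 bytes each (rextbytes = 12288, not a power of
 * two):
 *
 *	file1_offset = 24576	-> 2 rt extents, aligned, ok
 *	file2_offset = 12288	-> 1 rt extent, aligned, ok
 *	length = 20480		-> not a multiple of 12288, so it is only
 *				   allowed if the range ends exactly at one
 *				   file's EOF (the partial trailing rt
 *				   extent case handled above).
 *
 * Because rextbytes need not be a power of two, these checks go through
 * the 64-bit division helpers rather than bitmask arithmetic.
 */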
/* Prepare two files to have their data exchanged. */
int
xfs_xchg_range_prep(
	struct file		*file1,
	struct file		*file2,
	struct file_xchg_range	*fxr)
{
	struct xfs_inode	*ip1 = XFS_I(file_inode(file1));
	struct xfs_inode	*ip2 = XFS_I(file_inode(file2));
	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip2);
	int			error;

	trace_xfs_xchg_range_prep(ip1, fxr, ip2, 0);

	/* Verify both files are either real-time or non-realtime */
	if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
		return -EINVAL;

	/* Check non-power-of-two alignment issues, if necessary. */
	if (XFS_IS_REALTIME_INODE(ip2) && !is_power_of_2(alloc_unit)) {
		error = xfs_xchg_range_check_rtalign(ip1, ip2, fxr);
		if (error)
			return error;

		/* Do the VFS checks with the regular block alignment. */
		alloc_unit = ip1->i_mount->m_sb.sb_blocksize;
	}

	error = generic_xchg_file_range_prep(file1, file2, fxr, alloc_unit);
	if (error || fxr->length == 0)
		return error;

	/* Attach dquots to both inodes before changing block maps. */
	error = xfs_qm_dqattach(ip2);
	if (error)
		return error;
	error = xfs_qm_dqattach(ip1);
	if (error)
		return error;

	trace_xfs_xchg_range_flush(ip1, fxr, ip2, 0);

	/* Flush the relevant ranges of both files. */
	error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
	if (error)
		return error;
	error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
	if (error)
		return error;

	/*
	 * Cancel CoW fork preallocations for the ranges of both files.  The
	 * prep function should have flushed all the dirty data, so the only
	 * extents remaining should be speculative.
	 */
	if (xfs_inode_has_cow_data(ip1)) {
		error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
				fxr->length, true);
		if (error)
			return error;
	}
	if (xfs_inode_has_cow_data(ip2)) {
		error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
				fxr->length, true);
		if (error)
			return error;
	}

	/* Convert unwritten sub-extent mappings if required. */
	if (xfs_swapext_need_rt_conversion(ip2)) {
		error = xfs_rtfile_convert_unwritten(ip2, fxr->file2_offset,
				fxr->length);
		if (error)
			return error;
		error = xfs_rtfile_convert_unwritten(ip1, fxr->file1_offset,
				fxr->length);
		if (error)
			return error;
	}

	return 0;
}
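
/*
 * In short, preparing a range exchange walks through these phases (this is
 * a summary of the function above, not a separate API):
 *
 *  1. alignment and feature checks
 *  2. dquot attachment
 *  3. pagecache flush and unmap of both ranges
 *  4. CoW fork preallocation cancellation
 *  5. unwritten extent conversion for realtime files, if needed
 *
 * Only after all of this may the caller build a transaction and call
 * xfs_xchg_range() below.
 */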
#define QRETRY_IP1	(0x1)
#define QRETRY_IP2	(0x2)

/*
 * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
 * this if quota enforcement is disabled or if both inodes' dquots are the
 * same.  The *qretry mask must be initialized to zero before the first call
 * to this function.
 */
STATIC int
xfs_xchg_range_reserve_quota(
	struct xfs_trans		*tp,
	const struct xfs_swapext_req	*req,
	const struct xfs_swapext_res	*res,
	unsigned int			*qretry)
{
	int64_t				ddelta, rdelta;
	int				ip1_error = 0;
	int				error;

	/*
	 * Don't bother with a quota reservation if we're not enforcing them
	 * or the two inodes have the same dquots.
	 */
	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
	    (req->ip1->i_udquot == req->ip2->i_udquot &&
	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
	     req->ip1->i_pdquot == req->ip2->i_pdquot))
		return 0;

	*qretry = 0;

	/*
	 * For each file, compute the net gain in the number of regular
	 * blocks that will be mapped into that file and reserve that much
	 * quota.  The quota counts must be able to absorb at least that much
	 * space.
	 */
	ddelta = res->ip2_bcount - res->ip1_bcount;
	rdelta = res->ip2_rtbcount - res->ip1_rtbcount;
	if (ddelta > 0 || rdelta > 0) {
		error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
				ddelta > 0 ? ddelta : 0,
				rdelta > 0 ? rdelta : 0, false);
		if (error == -EDQUOT || error == -ENOSPC) {
			/*
			 * Save this error and see what happens if we try to
			 * reserve quota for ip2.  Then report both.
			 */
			*qretry |= QRETRY_IP1;
			ip1_error = error;
			error = 0;
		}
		if (error)
			return error;
	}
	if (ddelta < 0 || rdelta < 0) {
		error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
				ddelta < 0 ? -ddelta : 0,
				rdelta < 0 ? -rdelta : 0, false);
		if (error == -EDQUOT || error == -ENOSPC)
			*qretry |= QRETRY_IP2;
		if (error)
			return error;
	}
	if (ip1_error)
		return ip1_error;

	/*
	 * For each file, forcibly reserve the gross gain in mapped blocks so
	 * that we don't trip over any quota block reservation assertions.
	 * We must reserve the gross gain because the quota code subtracts
	 * from bcount the number of blocks that we unmap; it does not add
	 * that quantity back to the quota block reservation.
	 */
	error = xfs_trans_reserve_quota_nblks(tp, req->ip1, res->ip1_bcount,
			res->ip1_rtbcount, true);
	if (error)
		return error;

	return xfs_trans_reserve_quota_nblks(tp, req->ip2, res->ip2_bcount,
			res->ip2_rtbcount, true);
}
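
/*
 * Callers are expected to treat -EDQUOT/-ENOSPC from this function as
 * retryable: cancel the transaction, drop the locks, flush speculative
 * preallocations for whichever inode is flagged in *qretry, and try once
 * more.  xfs_xchg_range() below does exactly that:
 *
 *	if (qretry & QRETRY_IP1)
 *		xfs_blockgc_free_quota(ip1, 0);
 *	if (qretry & QRETRY_IP2)
 *		xfs_blockgc_free_quota(ip2, 0);
 */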
/*
 * Get permission to use log-assisted atomic exchange of file extents.
 *
 * Callers must not be running any transactions or hold any inode locks, and
 * they must release the permission by calling xfs_xchg_range_rele_log_assist
 * when they're done.
 */
int
xfs_xchg_range_grab_log_assist(
	struct xfs_mount	*mp,
	bool			force,
	bool			*enabled)
{
	int			error = 0;

	/*
	 * Protect ourselves from an idle log clearing the atomic swapext
	 * log incompat feature bit.
	 */
	xlog_use_incompat_feat(mp->m_log);
	*enabled = true;

	/*
	 * If log-assisted swapping is already enabled, the caller can use
	 * the log assisted swap functions with the log-incompat reference
	 * we got.
	 */
	if (xfs_has_atomicswap(mp))
		return 0;

	/*
	 * If the caller doesn't /require/ log-assisted swapping, drop the
	 * log-incompat feature protection and exit.  The caller cannot use
	 * log assisted swapping.
	 */
	if (!force)
		goto drop_incompat;

	/*
	 * Caller requires log-assisted swapping but the fs feature set isn't
	 * rich enough to support it.  Bail out.
	 */
	if (!xfs_can_atomicswap(mp)) {
		error = -EOPNOTSUPP;
		goto drop_incompat;
	}

	error = xfs_add_atomicswap(mp);
	if (error)
		goto drop_incompat;

	xfs_warn(mp,
 "EXPERIMENTAL atomic file range swap feature added. Use at your own risk!");
	return 0;
drop_incompat:
	xlog_drop_incompat_feat(mp->m_log);
	*enabled = false;
	return error;
}

/* Release permission to use log-assisted extent swapping. */
void
xfs_xchg_range_rele_log_assist(
	struct xfs_mount	*mp)
{
	xlog_drop_incompat_feat(mp->m_log);
}

/* Decide if we can use the old data fork exchange code. */
static inline bool
xfs_xchg_use_forkswap(
	const struct file_xchg_range	*fxr,
	struct xfs_inode		*ip1,
	struct xfs_inode		*ip2)
{
	return	(fxr->flags & FILE_XCHG_RANGE_NONATOMIC) &&
		(fxr->flags & FILE_XCHG_RANGE_FULL_FILES) &&
		!(fxr->flags & FILE_XCHG_RANGE_TO_EOF) &&
		fxr->file1_offset == 0 && fxr->file2_offset == 0 &&
		fxr->length == ip1->i_disk_size &&
		fxr->length == ip2->i_disk_size;
}

enum xchg_strategy {
	SWAPEXT		= 1,	/* xfs_swapext() */
	FORKSWAP	= 2,	/* exchange forks */
};
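
/*
 * Sketch of the expected grab/rele pairing (illustrative only; the real
 * callers sit in the higher-level file exchange code):
 *
 *	bool	use_logging = false;
 *
 *	error = xfs_xchg_range_grab_log_assist(mp, !nonatomic,
 *			&use_logging);
 *	if (error)
 *		return error;
 *	error = xfs_xchg_range(ip1, ip2, fxr, xchg_flags);
 *	if (use_logging)
 *		xfs_xchg_range_rele_log_assist(mp);
 */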
/* Exchange the contents of two files. */
int
xfs_xchg_range(
	struct xfs_inode		*ip1,
	struct xfs_inode		*ip2,
	const struct file_xchg_range	*fxr,
	unsigned int			xchg_flags)
{
	struct xfs_mount	*mp = ip1->i_mount;
	struct xfs_swapext_req	req = {
		.ip1		= ip1,
		.ip2		= ip2,
		.whichfork	= XFS_DATA_FORK,
		.startoff1	= XFS_B_TO_FSBT(mp, fxr->file1_offset),
		.startoff2	= XFS_B_TO_FSBT(mp, fxr->file2_offset),
		.blockcount	= XFS_B_TO_FSB(mp, fxr->length),
	};
	struct xfs_swapext_res	res;
	struct xfs_trans	*tp;
	unsigned int		qretry;
	unsigned int		flags = 0;
	bool			retried = false;
	enum xchg_strategy	strategy;
	int			error;

	trace_xfs_xchg_range(ip1, fxr, ip2, xchg_flags);

	if (fxr->flags & FILE_XCHG_RANGE_TO_EOF)
		req.req_flags |= XFS_SWAP_REQ_SET_SIZES;
	if (fxr->flags & FILE_XCHG_RANGE_SKIP_FILE1_HOLES)
		req.req_flags |= XFS_SWAP_REQ_SKIP_FILE1_HOLES;

	/*
	 * Round the request length up to the nearest fundamental unit of
	 * allocation.  The prep function already checked that the request
	 * offsets and length in @fxr are safe to round up.
	 */
	if (XFS_IS_REALTIME_INODE(ip2))
		req.blockcount = roundup_64(req.blockcount,
					    mp->m_sb.sb_rextsize);

	error = xfs_xchg_range_estimate(&req, &res);
	if (error)
		return error;

	/*
	 * We haven't decided which exchange strategy we want to use yet, but
	 * here we must choose if we want freed blocks during the swap to be
	 * added to the transaction block reservation (RES_FDBLKS) or freed
	 * into the global fdblocks.  The legacy fork swap mechanism doesn't
	 * free any blocks, so it doesn't require it.  It is also the only
	 * option that works for older filesystems.
	 *
	 * The bmap log intent items that were added with rmap and reflink
	 * can change the bmbt shape, so the intent-based swap strategies
	 * require us to set RES_FDBLKS.
	 */
	if (xfs_has_lazysbcount(mp))
		flags |= XFS_TRANS_RES_FDBLKS;

retry:
	/* Allocate the transaction, lock the inodes, and join them. */
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, res.resblks, 0,
			flags, &tp);
	if (error)
		return error;

	xfs_xchg_range_ilock(tp, ip1, ip2);

	trace_xfs_swap_extent_before(ip2, 0);
	trace_xfs_swap_extent_before(ip1, 1);

	if (fxr->flags & FILE_XCHG_RANGE_FILE2_FRESH)
		trace_xfs_xchg_range_freshness(ip2, fxr);

	/*
	 * Now that we've excluded all other inode metadata changes by
	 * taking the ILOCK, repeat the freshness check.
	 */
	error = generic_xchg_file_range_check_fresh(VFS_I(ip2), fxr);
	if (error)
		goto out_trans_cancel;

	error = xfs_swapext_check_extents(mp, &req);
	if (error)
		goto out_trans_cancel;

	/*
	 * Reserve ourselves some quota if any of the inodes' dquots are in
	 * enforcing mode.  In theory we only need enough to satisfy the
	 * change in the number of blocks between the two ranges being
	 * remapped.
	 */
	error = xfs_xchg_range_reserve_quota(tp, &req, &res, &qretry);
	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
		xfs_trans_cancel(tp);
		xfs_xchg_range_iunlock(ip1, ip2);
		if (qretry & QRETRY_IP1)
			xfs_blockgc_free_quota(ip1, 0);
		if (qretry & QRETRY_IP2)
			xfs_blockgc_free_quota(ip2, 0);
		retried = true;
		goto retry;
	}
	if (error)
		goto out_trans_cancel;

	if (xfs_has_atomicswap(mp) || xfs_can_atomicswap(mp)) {
		/*
		 * xfs_swapext() uses deferred bmap log intent items to swap
		 * extents between file forks.  If the atomic log swap
		 * feature is enabled, it will also use swapext log intent
		 * items to restart the operation in case of failure.
		 *
		 * This means that we can use it if we previously obtained
		 * permission from the log to use log-assisted atomic extent
		 * swapping; or if the fs supports rmap or reflink and the
		 * user said NONATOMIC.
		 */
		strategy = SWAPEXT;
	} else if (xfs_xchg_use_forkswap(fxr, ip1, ip2)) {
		/*
		 * Exchange the file contents by using the old bmap fork
		 * exchange code, if we're a defrag tool doing a full file
		 * swap.
		 */
		strategy = FORKSWAP;

		error = xfs_swap_extents_check_format(ip2, ip1);
		if (error) {
			xfs_notice(mp,
	"%s: inode 0x%llx format is incompatible for exchanging.",
					__func__, ip2->i_ino);
			goto out_trans_cancel;
		}
	} else {
		/* We cannot exchange the file contents. */
		error = -EOPNOTSUPP;
		goto out_trans_cancel;
	}

	/* If we got this far on a dry run, all parameters are ok. */
	if (fxr->flags & FILE_XCHG_RANGE_DRY_RUN)
		goto out_trans_cancel;

	/* Update the mtime and ctime of both files. */
	if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME1)
		xfs_trans_ichgtime(tp, ip1,
				XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	if (xchg_flags & XFS_XCHG_RANGE_UPD_CMTIME2)
		xfs_trans_ichgtime(tp, ip2,
				XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	if (strategy == SWAPEXT)
		error = xfs_swapext(&tp, &req);
	else
		error = xfs_swap_extent_forks(&tp, &req);
	if (error)
		goto out_trans_cancel;

	/*
	 * If the caller wanted us to exchange the contents of two complete
	 * files of unequal length, exchange the incore sizes now.  This
	 * should be safe because we flushed both files' page caches and
	 * moved all the post-eof extents, so there should not be anything
	 * to zero.
	 */
	if (fxr->flags & FILE_XCHG_RANGE_TO_EOF) {
		loff_t	temp;

		temp = i_size_read(VFS_I(ip2));
		i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
		i_size_write(VFS_I(ip1), temp);
	}

	/* Relog the inodes to keep transactions moving forward. */
	xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
	xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);

	/*
	 * Force the log to persist metadata updates if the caller or the
	 * administrator requires this.  The VFS prep function already
	 * flushed the relevant parts of the page cache.
	 */
	if (xfs_has_wsync(mp) || (fxr->flags & FILE_XCHG_RANGE_FSYNC))
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);

	trace_xfs_swap_extent_after(ip2, 0);
	trace_xfs_swap_extent_after(ip1, 1);

out_unlock:
	xfs_xchg_range_iunlock(ip1, ip2);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
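
/*
 * Putting it all together, a hypothetical caller (names are illustrative,
 * not the real ioctl path) would drive an exchange roughly like this:
 *
 *	error = xfs_xchg_range_prep(file1, file2, fxr);
 *	if (error)
 *		return error;
 *	error = xfs_xchg_range(XFS_I(file_inode(file1)),
 *			XFS_I(file_inode(file2)), fxr, xchg_flags);
 *
 * with the IOLOCK and MMAPLOCK held across both calls and the log-assist
 * reference, if any, taken before and released after.
 */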