Diffstat (limited to 'fs/xfs/libxfs/xfs_swapext.c')
-rw-r--r--   fs/xfs/libxfs/xfs_swapext.c   900
1 file changed, 900 insertions(+), 0 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_swapext.c b/fs/xfs/libxfs/xfs_swapext.c
new file mode 100644
index 000000000000..f7cdfeef5822
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_swapext.c
@@ -0,0 +1,900 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2021 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_icache.h"
+#include "xfs_quota.h"
+#include "xfs_swapext.h"
+#include "xfs_trace.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+
+struct kmem_cache *xfs_swapext_intent_cache;
+
+/* bmbt mappings adjacent to a pair of records. */
+struct xfs_swapext_adjacent {
+ struct xfs_bmbt_irec left1;
+ struct xfs_bmbt_irec right1;
+ struct xfs_bmbt_irec left2;
+ struct xfs_bmbt_irec right2;
+};
+
+#define ADJACENT_INIT { \
+ .left1 = { .br_startblock = HOLESTARTBLOCK }, \
+ .right1 = { .br_startblock = HOLESTARTBLOCK }, \
+ .left2 = { .br_startblock = HOLESTARTBLOCK }, \
+ .right2 = { .br_startblock = HOLESTARTBLOCK }, \
+}
+
+/* Information to help us reset reflink flag / CoW fork state after a swap. */
+
+/* Are we swapping the data fork? */
+#define XFS_SX_REFLINK_DATAFORK (1U << 0)
+
+/* Can we swap the flags? */
+#define XFS_SX_REFLINK_SWAPFLAGS (1U << 1)
+
+/* Previous state of the two inodes' reflink flags. */
+#define XFS_SX_REFLINK_IP1_REFLINK (1U << 2)
+#define XFS_SX_REFLINK_IP2_REFLINK (1U << 3)
+
+/*
+ * Prepare both inodes' reflink state for an extent swap, and return our
+ * findings so that xfs_swapext_reflink_finish can deal with the aftermath.
+ */
+unsigned int
+xfs_swapext_reflink_prep(
+ const struct xfs_swapext_req *req)
+{
+ struct xfs_mount *mp = req->ip1->i_mount;
+ unsigned int rs = 0;
+
+ if (req->whichfork != XFS_DATA_FORK)
+ return 0;
+
+ /*
+ * If either file has shared blocks and we're swapping data forks, we
+ * must flag the other file as having shared blocks so that we get the
+ * shared-block rmap functions if we need to fix up the rmaps. The
+ * flags will be switched for real by xfs_swapext_reflink_finish.
+ */
+ if (xfs_is_reflink_inode(req->ip1))
+ rs |= XFS_SX_REFLINK_IP1_REFLINK;
+ if (xfs_is_reflink_inode(req->ip2))
+ rs |= XFS_SX_REFLINK_IP2_REFLINK;
+
+ if (rs & XFS_SX_REFLINK_IP1_REFLINK)
+ req->ip2->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+ if (rs & XFS_SX_REFLINK_IP2_REFLINK)
+ req->ip1->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+
+ /*
+ * If either file had the reflink flag set before, the two files' reflink
+ * states were different, and we're swapping the entirety of both files,
+ * then we can exchange the reflink flags at the end. Otherwise, we
+ * propagate the reflink flag from either file to the other file.
+ *
+ * Note that we've only set the _REFLINK flags of the reflink state, so
+ * we can cheat and use hweight32 for the reflink flag test.
+ */
+ if (hweight32(rs) == 1 && req->startoff1 == 0 && req->startoff2 == 0 &&
+ req->blockcount == XFS_B_TO_FSB(mp, req->ip1->i_disk_size) &&
+ req->blockcount == XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
+ rs |= XFS_SX_REFLINK_SWAPFLAGS;
+
+ rs |= XFS_SX_REFLINK_DATAFORK;
+ return rs;
+}
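
A minimal userspace sketch of the hweight32() reasoning above (illustrative only, not part of this patch): at this point rs carries only the two _REFLINK bits, so a population count of one means exactly one file was a reflink inode, which is the only case where exchanging the flags at the end differs from simply propagating them. The whole-file startoff/blockcount checks are omitted here.

    /* Illustrative only; __builtin_popcount() stands in for the kernel's hweight32(). */
    #include <stdio.h>

    #define IP1_REFLINK (1U << 2)
    #define IP2_REFLINK (1U << 3)

    int main(void)
    {
    	unsigned int combos[] = { 0, IP1_REFLINK, IP2_REFLINK,
    				  IP1_REFLINK | IP2_REFLINK };
    	int i;

    	for (i = 0; i < 4; i++) {
    		unsigned int rs = combos[i];

    		printf("ip1=%d ip2=%d -> may swap flags: %d\n",
    		       !!(rs & IP1_REFLINK), !!(rs & IP2_REFLINK),
    		       __builtin_popcount(rs) == 1);
    	}
    	return 0;
    }
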
+
+/*
+ * If the reflink flag is set on either inode, make sure it has an incore CoW
+ * fork, since all reflink inodes must have them. If there's a CoW fork and it
+ * has extents in it, make sure the inodes are tagged appropriately so that
+ * speculative preallocations can be GC'd if we run low on space.
+ */
+static inline void
+xfs_swapext_ensure_cowfork(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *cfork;
+
+ if (xfs_is_reflink_inode(ip))
+ xfs_ifork_init_cow(ip);
+
+ cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ if (!cfork)
+ return;
+ if (cfork->if_bytes > 0)
+ xfs_inode_set_cowblocks_tag(ip);
+ else
+ xfs_inode_clear_cowblocks_tag(ip);
+}
+
+/*
+ * Set both inodes' ondisk reflink flags to their final state and ensure that
+ * the incore state is ready to go.
+ */
+void
+xfs_swapext_reflink_finish(
+ struct xfs_trans *tp,
+ const struct xfs_swapext_req *req,
+ unsigned int rs)
+{
+ if (!(rs & XFS_SX_REFLINK_DATAFORK))
+ return;
+
+ if (rs & XFS_SX_REFLINK_SWAPFLAGS) {
+ /* Exchange the reflink inode flags and log them. */
+ req->ip1->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ if (rs & XFS_SX_REFLINK_IP2_REFLINK)
+ req->ip1->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+
+ req->ip2->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+ if (rs & XFS_SX_REFLINK_IP1_REFLINK)
+ req->ip2->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+
+ xfs_trans_log_inode(tp, req->ip1, XFS_ILOG_CORE);
+ xfs_trans_log_inode(tp, req->ip2, XFS_ILOG_CORE);
+ }
+
+ xfs_swapext_ensure_cowfork(req->ip1);
+ xfs_swapext_ensure_cowfork(req->ip2);
+}
+
+/* Schedule an atomic extent swap. */
+void
+xfs_swapext_schedule(
+ struct xfs_trans *tp,
+ struct xfs_swapext_intent *sxi)
+{
+ trace_xfs_swapext_defer(tp->t_mountp, sxi);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_SWAPEXT, &sxi->sxi_list);
+}
+
+/*
+ * Adjust the on-disk inode size upwards if needed so that we never map extents
+ * into the file past EOF. This is crucial so that log recovery won't get
+ * confused by the sudden appearance of post-eof extents.
+ */
+STATIC void
+xfs_swapext_update_size(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ xfs_fsize_t new_isize)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsize_t len;
+
+ if (new_isize < 0)
+ return;
+
+ len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
+ new_isize);
+
+ if (len <= ip->i_disk_size)
+ return;
+
+ trace_xfs_swapext_update_inode_size(ip, len);
+
+ ip->i_disk_size = len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
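
A worked example of the clamp above (hypothetical numbers, standalone sketch assuming 4096-byte blocks): a mapping covering blocks [20, 25) ends at byte 102400, so with a requested size of 100000 and an on-disk size of 81920, the on-disk size grows only to 100000, never past the requested EOF.

    /* Illustrative only; mirrors min(end of mapping, new_isize) above. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
    	uint64_t blksize = 4096;
    	uint64_t startoff = 20, blockcount = 5;	/* mapping covers blocks 20..24 */
    	uint64_t new_isize = 100000;		/* size requested by the caller */
    	uint64_t disk_size = 81920;		/* current on-disk size */
    	uint64_t map_end = (startoff + blockcount) * blksize;	/* 102400 */
    	uint64_t len = map_end < new_isize ? map_end : new_isize;	/* 100000 */

    	if (len > disk_size)
    		printf("grow on-disk size %llu -> %llu\n",
    		       (unsigned long long)disk_size, (unsigned long long)len);
    	return 0;
    }
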
+
+static inline bool
+sxi_has_more_swap_work(const struct xfs_swapext_intent *sxi)
+{
+ return sxi->sxi_blockcount > 0;
+}
+
+static inline void
+sxi_advance(
+ struct xfs_swapext_intent *sxi,
+ const struct xfs_bmbt_irec *irec)
+{
+ sxi->sxi_startoff1 += irec->br_blockcount;
+ sxi->sxi_startoff2 += irec->br_blockcount;
+ sxi->sxi_blockcount -= irec->br_blockcount;
+}
+
+/* Check all extents to make sure we can actually swap them. */
+int
+xfs_swapext_check_extents(
+ struct xfs_mount *mp,
+ const struct xfs_swapext_req *req)
+{
+ struct xfs_ifork *ifp1, *ifp2;
+
+ /* No fork? */
+ ifp1 = XFS_IFORK_PTR(req->ip1, req->whichfork);
+ ifp2 = XFS_IFORK_PTR(req->ip2, req->whichfork);
+ if (!ifp1 || !ifp2)
+ return -EINVAL;
+
+ /* We don't know how to swap local format forks. */
+ if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
+ ifp2->if_format == XFS_DINODE_FMT_LOCAL)
+ return -EINVAL;
+
+ /* We don't support realtime data forks yet. */
+ if (!XFS_IS_REALTIME_INODE(req->ip1))
+ return 0;
+ if (req->whichfork == XFS_ATTR_FORK)
+ return 0;
+ return -EINVAL;
+}
+
+#ifdef CONFIG_XFS_QUOTA
+/* Log the actual updates to the quota accounting. */
+static inline void
+xfs_swapext_update_quota(
+ struct xfs_trans *tp,
+ struct xfs_swapext_intent *sxi,
+ struct xfs_bmbt_irec *irec1,
+ struct xfs_bmbt_irec *irec2)
+{
+ int64_t ip1_delta = 0, ip2_delta = 0;
+ unsigned int qflag;
+
+ qflag = XFS_IS_REALTIME_INODE(sxi->sxi_ip1) ? XFS_TRANS_DQ_RTBCOUNT :
+ XFS_TRANS_DQ_BCOUNT;
+
+ if (xfs_bmap_is_real_extent(irec1)) {
+ ip1_delta -= irec1->br_blockcount;
+ ip2_delta += irec1->br_blockcount;
+ }
+
+ if (xfs_bmap_is_real_extent(irec2)) {
+ ip1_delta += irec2->br_blockcount;
+ ip2_delta -= irec2->br_blockcount;
+ }
+
+ xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip1, qflag, ip1_delta);
+ xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip2, qflag, ip2_delta);
+}
+#else
+# define xfs_swapext_update_quota(tp, sxi, irec1, irec2) ((void)0)
+#endif
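
A worked example of the accounting above (hypothetical extent sizes, standalone sketch): if file1's mapping covers 8 real blocks and file2's covers 5, the exchange leaves file1's dquot three blocks lighter and file2's three blocks heavier.

    /* Illustrative arithmetic only; mirrors xfs_swapext_update_quota() above. */
    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
    	int64_t ip1_delta = 0, ip2_delta = 0;
    	int64_t irec1_blocks = 8;	/* real blocks leaving file1 */
    	int64_t irec2_blocks = 5;	/* real blocks leaving file2 */

    	ip1_delta -= irec1_blocks;	/* file1 gives up its blocks ... */
    	ip2_delta += irec1_blocks;	/* ... which now count against file2 */
    	ip1_delta += irec2_blocks;	/* and file2's blocks flow the other way */
    	ip2_delta -= irec2_blocks;

    	/* Prints ip1 -3, ip2 +3. */
    	printf("ip1 %+" PRId64 ", ip2 %+" PRId64 "\n", ip1_delta, ip2_delta);
    	return 0;
    }
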
+
+/*
+ * Walk forward through the file ranges in @sxi until we find two different
+ * mappings to exchange. If there is work to do, return the mappings;
+ * otherwise we've reached the end of the range and sxi_blockcount will be
+ * zero.
+ *
+ * If the walk skips over a pair of mappings to the same storage, save them as
+ * the left records in @adj (if provided) so that the simulation phase can
+ * avoid an extra lookup.
+ */
+static int
+xfs_swapext_find_mappings(
+ struct xfs_swapext_intent *sxi,
+ struct xfs_bmbt_irec *irec1,
+ struct xfs_bmbt_irec *irec2,
+ struct xfs_swapext_adjacent *adj)
+{
+ int nimaps;
+ int bmap_flags;
+ int error;
+
+ bmap_flags = xfs_bmapi_aflag(xfs_swapext_whichfork(sxi));
+
+ for (; sxi_has_more_swap_work(sxi); sxi_advance(sxi, irec1)) {
+ /* Read extent from the first file */
+ nimaps = 1;
+ error = xfs_bmapi_read(sxi->sxi_ip1, sxi->sxi_startoff1,
+ sxi->sxi_blockcount, irec1, &nimaps,
+ bmap_flags);
+ if (error)
+ return error;
+ if (nimaps != 1 ||
+ irec1->br_startblock == DELAYSTARTBLOCK ||
+ irec1->br_startoff != sxi->sxi_startoff1) {
+ /*
+ * We should never get no mapping or a delalloc extent
+ * or something that doesn't match what we asked for,
+ * since the caller flushed both inodes and we hold the
+ * ILOCKs for both inodes.
+ */
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ /*
+ * If the caller told us to ignore sparse areas of file1, jump
+ * ahead to the next region.
+ */
+ if ((sxi->sxi_flags & XFS_SWAP_EXT_SKIP_FILE1_HOLES) &&
+ irec1->br_startblock == HOLESTARTBLOCK) {
+ trace_xfs_swapext_extent1(sxi->sxi_ip1, irec1);
+ continue;
+ }
+
+ /* Read extent from the second file */
+ nimaps = 1;
+ error = xfs_bmapi_read(sxi->sxi_ip2, sxi->sxi_startoff2,
+ irec1->br_blockcount, irec2, &nimaps,
+ bmap_flags);
+ if (error)
+ return error;
+ if (nimaps != 1 ||
+ irec2->br_startblock == DELAYSTARTBLOCK ||
+ irec2->br_startoff != sxi->sxi_startoff2) {
+ /*
+ * We should never get no mapping or a delalloc extent
+ * or something that doesn't match what we asked for,
+ * since the caller flushed both inodes and we hold the
+ * ILOCKs for both inodes.
+ */
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ /*
+ * We can only swap as many blocks as the smaller of the two
+ * extent maps.
+ */
+ irec1->br_blockcount = min(irec1->br_blockcount,
+ irec2->br_blockcount);
+
+ trace_xfs_swapext_extent1(sxi->sxi_ip1, irec1);
+ trace_xfs_swapext_extent2(sxi->sxi_ip2, irec2);
+
+ /* We found something to swap, so return it. */
+ if (irec1->br_startblock != irec2->br_startblock)
+ return 0;
+
+ /*
+ * Two extents mapped to the same physical block must not have
+ * different states; that's filesystem corruption. Move on to
+ * the next extent if they're both holes or both the same
+ * physical extent.
+ */
+ if (irec1->br_state != irec2->br_state)
+ return -EFSCORRUPTED;
+
+ /*
+ * Save the mappings if we're estimating work and skipping
+ * these identical mappings.
+ */
+ if (adj) {
+ memcpy(&adj->left1, irec1, sizeof(*irec1));
+ memcpy(&adj->left2, irec2, sizeof(*irec2));
+ }
+ }
+
+ return 0;
+}
+
+/* Exchange these two mappings. */
+static void
+xfs_swapext_exchange_mappings(
+ struct xfs_trans *tp,
+ struct xfs_swapext_intent *sxi,
+ struct xfs_bmbt_irec *irec1,
+ struct xfs_bmbt_irec *irec2)
+{
+ int whichfork = xfs_swapext_whichfork(sxi);
+
+ xfs_swapext_update_quota(tp, sxi, irec1, irec2);
+
+ /* Remove both mappings. */
+ xfs_bmap_unmap_extent(tp, sxi->sxi_ip1, whichfork, irec1);
+ xfs_bmap_unmap_extent(tp, sxi->sxi_ip2, whichfork, irec2);
+
+ /*
+ * Re-add both mappings. We swap the file offsets between the two maps
+ * and add the opposite map, which has the effect of filling the
+ * logical offsets we just unmapped, but with the physical mapping
+ * information swapped.
+ */
+ swap(irec1->br_startoff, irec2->br_startoff);
+ xfs_bmap_map_extent(tp, sxi->sxi_ip1, whichfork, irec2);
+ xfs_bmap_map_extent(tp, sxi->sxi_ip2, whichfork, irec1);
+
+ /* Make sure we're not mapping extents past EOF. */
+ if (whichfork == XFS_DATA_FORK) {
+ xfs_swapext_update_size(tp, sxi->sxi_ip1, irec2,
+ sxi->sxi_isize1);
+ xfs_swapext_update_size(tp, sxi->sxi_ip2, irec1,
+ sxi->sxi_isize2);
+ }
+
+ /*
+ * Advance our cursor and exit. The caller (either defer ops or log
+ * recovery) will log the SXD item, and if sxi_blockcount is nonzero, it
+ * will log a new SXI item for the remainder and call us back.
+ */
+ sxi_advance(sxi, irec1);
+}
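
A toy model of the remapping trick above (hypothetical values, standalone sketch): both mappings are unmapped, the file offsets are exchanged, and each record is then mapped into the other inode, so every logical offset stays put but ends up pointing at the other file's old physical extent.

    #include <stdint.h>
    #include <stdio.h>

    struct mapping { uint64_t startoff, startblock, blockcount; };

    int main(void)
    {
    	/* Before: file1 logical 10 -> physical 500, file2 logical 30 -> physical 900. */
    	struct mapping irec1 = { .startoff = 10, .startblock = 500, .blockcount = 4 };
    	struct mapping irec2 = { .startoff = 30, .startblock = 900, .blockcount = 4 };
    	uint64_t tmp;

    	/* "Unmap" both records and exchange the logical offsets... */
    	tmp = irec1.startoff;
    	irec1.startoff = irec2.startoff;
    	irec2.startoff = tmp;

    	/* ...then map the opposite record into each file. */
    	printf("file1: logical %llu -> physical %llu\n",	/* 10 -> 900 */
    	       (unsigned long long)irec2.startoff,
    	       (unsigned long long)irec2.startblock);
    	printf("file2: logical %llu -> physical %llu\n",	/* 30 -> 500 */
    	       (unsigned long long)irec1.startoff,
    	       (unsigned long long)irec1.startblock);
    	return 0;
    }
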
+
+/* Finish one extent swap, possibly log more. */
+int
+xfs_swapext_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_swapext_intent *sxi)
+{
+ struct xfs_bmbt_irec irec1, irec2;
+ int error;
+
+ /* Find something to swap and swap it. */
+ error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, NULL);
+ if (error)
+ return error;
+
+ if (sxi_has_more_swap_work(sxi))
+ xfs_swapext_exchange_mappings(tp, sxi, &irec1, &irec2);
+
+ /*
+ * If the caller asked us to exchange the file sizes and we're done
+ * moving extents, update the ondisk file sizes as part of the final
+ * extent swapping transaction.
+ */
+ if (!sxi_has_more_swap_work(sxi) &&
+ (sxi->sxi_flags & XFS_SWAP_EXT_SET_SIZES)) {
+ sxi->sxi_ip1->i_disk_size = sxi->sxi_isize1;
+ sxi->sxi_ip2->i_disk_size = sxi->sxi_isize2;
+
+ xfs_trans_log_inode(tp, sxi->sxi_ip1, XFS_ILOG_CORE);
+ xfs_trans_log_inode(tp, sxi->sxi_ip2, XFS_ILOG_CORE);
+ }
+
+ /* If we still have work to do, ask for a new transaction. */
+ if (sxi_has_more_swap_work(sxi)) {
+ trace_xfs_swapext_defer(tp->t_mountp, sxi);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/* Estimate the bmbt and rmapbt overhead required to exchange extents. */
+static int
+xfs_swapext_estimate_overhead(
+ const struct xfs_swapext_req *req,
+ struct xfs_swapext_res *res)
+{
+ struct xfs_mount *mp = req->ip1->i_mount;
+ unsigned int bmbt_overhead;
+
+ /*
+ * Compute the amount of bmbt blocks we should reserve for each file.
+ *
+ * Conceptually this shouldn't affect the shape of either bmbt, but
+ * since we atomically move extents one by one, we reserve enough space
+ * to handle a bmbt split for each remap operation (t1).
+ *
+ * However, we must be careful to handle a corner case where the
+ * repeated unmap and map activities could result in ping-ponging of
+ * the btree shape. This behavior can come from one of two sources:
+ *
+ * An inode's extent list could have just enough records to straddle
+ * the btree format boundary. If so, the inode could bounce between
+ * btree <-> extent format on unmap -> remap cycles, freeing and
+ * allocating a bmapbt block each time.
+ *
+ * The same thing can happen if we have just enough records in a block
+ * to bounce between one and two leaf blocks. If there aren't enough
+ * sibling blocks to absorb or donate some records, we end up reshaping
+ * the tree with every remap operation. This doesn't seem to happen if
+ * we have more than four bmbt leaf blocks, so we'll make that the
+ * lower bound on the pingponging (t2).
+ *
+ * Therefore, we use XFS_TRANS_RES_FDBLKS so that freed bmbt blocks
+ * are accounted back to the transaction block reservation.
+ */
+ bmbt_overhead = XFS_NEXTENTADD_SPACE_RES(mp, res->nr_exchanges,
+ req->whichfork);
+ res->ip1_bcount += bmbt_overhead;
+ res->ip2_bcount += bmbt_overhead;
+ res->resblks += 2 * bmbt_overhead;
+
+ /* Apply similar logic to rmapbt reservations. */
+ if (xfs_has_rmapbt(mp)) {
+ unsigned int rmapbt_overhead;
+
+ if (!XFS_IS_REALTIME_INODE(req->ip1))
+ rmapbt_overhead = XFS_NRMAPADD_SPACE_RES(mp,
+ res->nr_exchanges);
+ else
+ rmapbt_overhead = 0;
+ res->resblks += 2 * rmapbt_overhead;
+ }
+
+ trace_xfs_swapext_estimate(req, res);
+
+ if (res->resblks > UINT_MAX)
+ return -ENOSPC;
+ return 0;
+}
+
+/* Decide if we can merge two real extents. */
+static inline bool
+can_merge(
+ const struct xfs_bmbt_irec *b1,
+ const struct xfs_bmbt_irec *b2)
+{
+ /* Don't merge holes. */
+ if (b1->br_startblock == HOLESTARTBLOCK ||
+ b2->br_startblock == HOLESTARTBLOCK)
+ return false;
+
+ /* We only merge real, allocated mappings. */
+ if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
+ return false;
+
+ if (b1->br_startoff + b1->br_blockcount == b2->br_startoff &&
+ b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
+ b1->br_state == b2->br_state &&
+ b1->br_blockcount + b2->br_blockcount <= MAXEXTLEN)
+ return true;
+
+ return false;
+}
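
A standalone sketch of the contiguity test above (hypothetical values; MAXEXTLEN is assumed here to be 2^21 - 1 blocks): two mappings merge only when they are adjacent in both logical and physical space, have the same written/unwritten state, and their combined length still fits in one bmbt record.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct rec { uint64_t off, block, count; int unwritten; };

    /* Mirrors the checks in can_merge() above, outside the kernel. */
    static bool mergeable(const struct rec *a, const struct rec *b, uint64_t maxlen)
    {
    	return a->off + a->count == b->off &&
    	       a->block + a->count == b->block &&
    	       a->unwritten == b->unwritten &&
    	       a->count + b->count <= maxlen;
    }

    int main(void)
    {
    	struct rec left  = { .off = 0, .block = 100, .count = 8, .unwritten = 0 };
    	struct rec right = { .off = 8, .block = 108, .count = 4, .unwritten = 0 };

    	/* Logically and physically contiguous, same state: prints 1. */
    	printf("mergeable: %d\n", mergeable(&left, &right, (1ULL << 21) - 1));
    	return 0;
    }
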
+
+#define CLEFT_CONTIG 0x01
+#define CRIGHT_CONTIG 0x02
+#define CHOLE 0x04
+#define CBOTH_CONTIG (CLEFT_CONTIG | CRIGHT_CONTIG)
+
+#define NLEFT_CONTIG 0x10
+#define NRIGHT_CONTIG 0x20
+#define NHOLE 0x40
+#define NBOTH_CONTIG (NLEFT_CONTIG | NRIGHT_CONTIG)
+
+/* Estimate the effect of a single swap on extent count. */
+static inline int
+delta_nextents_step(
+ struct xfs_mount *mp,
+ const struct xfs_bmbt_irec *left,
+ const struct xfs_bmbt_irec *curr,
+ const struct xfs_bmbt_irec *new,
+ const struct xfs_bmbt_irec *right)
+{
+ bool lhole, rhole, chole, nhole;
+ unsigned int state = 0;
+ int ret = 0;
+
+ lhole = left->br_startblock == HOLESTARTBLOCK;
+ rhole = right->br_startblock == HOLESTARTBLOCK;
+ chole = curr->br_startblock == HOLESTARTBLOCK;
+ nhole = new->br_startblock == HOLESTARTBLOCK;
+
+ if (chole)
+ state |= CHOLE;
+ if (!lhole && !chole && can_merge(left, curr))
+ state |= CLEFT_CONTIG;
+ if (!rhole && !chole && can_merge(curr, right))
+ state |= CRIGHT_CONTIG;
+ if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
+ left->br_blockcount + curr->br_blockcount +
+ right->br_blockcount > MAXEXTLEN)
+ state &= ~CRIGHT_CONTIG;
+
+ if (nhole)
+ state |= NHOLE;
+ if (!lhole && !nhole && can_merge(left, new))
+ state |= NLEFT_CONTIG;
+ if (!rhole && !nhole && can_merge(new, right))
+ state |= NRIGHT_CONTIG;
+ if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
+ left->br_blockcount + new->br_blockcount +
+ right->br_blockcount > MAXEXTLEN)
+ state &= ~NRIGHT_CONTIG;
+
+ switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
+ case CLEFT_CONTIG | CRIGHT_CONTIG:
+ /*
+ * left/curr/right are the same extent, so deleting curr causes
+ * 2 new extents to be created.
+ */
+ ret += 2;
+ break;
+ case 0:
+ /*
+ * curr is not contiguous with any extent, so we remove curr
+ * completely
+ */
+ ret--;
+ break;
+ case CHOLE:
+ /* hole, do nothing */
+ break;
+ case CLEFT_CONTIG:
+ case CRIGHT_CONTIG:
+ /* trim either left or right, no change */
+ break;
+ }
+
+ switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
+ case NLEFT_CONTIG | NRIGHT_CONTIG:
+ /*
+ * left/curr/right will become the same extent, so adding
+ * curr causes the deletion of right.
+ */
+ ret--;
+ break;
+ case 0:
+ /* new is not contiguous with any extent */
+ ret++;
+ break;
+ case NHOLE:
+ /* hole, do nothing. */
+ break;
+ case NLEFT_CONTIG:
+ case NRIGHT_CONTIG:
+ /* new is absorbed into left or right, no change */
+ break;
+ }
+
+ trace_xfs_swapext_delta_nextents_step(mp, left, curr, new, right, ret,
+ state);
+ return ret;
+}
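
For example, suppose curr is contiguous with both left and right (so the three ranges currently occupy a single bmbt record) while new is a hole: the first switch takes the CLEFT_CONTIG | CRIGHT_CONTIG case and adds 2, the second takes the NHOLE case and adds nothing, so this step reports a worst-case growth of two records for that fork.
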
+
+/* Make sure we don't overflow the extent counters. */
+static inline int
+check_delta_nextents(
+ const struct xfs_swapext_req *req,
+ struct xfs_inode *ip,
+ int64_t delta)
+{
+ ASSERT(delta < INT_MAX);
+ ASSERT(delta > INT_MIN);
+
+ if (delta < 0)
+ return 0;
+
+ return xfs_iext_count_may_overflow(ip, req->whichfork, delta);
+}
+
+/* Find the next extent after irec. */
+static inline int
+get_next_ext(
+ struct xfs_inode *ip,
+ int bmap_flags,
+ const struct xfs_bmbt_irec *irec,
+ struct xfs_bmbt_irec *nrec)
+{
+ xfs_fileoff_t off;
+ xfs_filblks_t blockcount;
+ int nimaps = 1;
+ int error;
+
+ off = irec->br_startoff + irec->br_blockcount;
+ blockcount = XFS_MAX_FILEOFF - off;
+ error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
+ if (error)
+ return error;
+ if (nrec->br_startblock == DELAYSTARTBLOCK ||
+ nrec->br_startoff != off) {
+ /*
+ * If we don't get the extent we want, return a zero-length
+ * mapping, which our estimator function will pretend is a hole.
+ * We shouldn't get delalloc reservations.
+ */
+ nrec->br_startblock = HOLESTARTBLOCK;
+ }
+
+ return 0;
+}
+
+int __init
+xfs_swapext_intent_init_cache(void)
+{
+ xfs_swapext_intent_cache = kmem_cache_create("xfs_swapext_intent",
+ sizeof(struct xfs_swapext_intent),
+ 0, 0, NULL);
+
+ return xfs_swapext_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_swapext_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_swapext_intent_cache);
+ xfs_swapext_intent_cache = NULL;
+}
+
+/* Allocate and initialize a new incore intent item from a request. */
+struct xfs_swapext_intent *
+xfs_swapext_init_intent(
+ const struct xfs_swapext_req *req)
+{
+ struct xfs_swapext_intent *sxi;
+
+ sxi = kmem_cache_alloc(xfs_swapext_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&sxi->sxi_list);
+ sxi->sxi_ip1 = req->ip1;
+ sxi->sxi_ip2 = req->ip2;
+ sxi->sxi_startoff1 = req->startoff1;
+ sxi->sxi_startoff2 = req->startoff2;
+ sxi->sxi_blockcount = req->blockcount;
+ sxi->sxi_isize1 = sxi->sxi_isize2 = -1;
+ sxi->sxi_flags = 0;
+
+ if (req->whichfork == XFS_ATTR_FORK)
+ sxi->sxi_flags |= XFS_SWAP_EXT_ATTR_FORK;
+
+ if (req->whichfork == XFS_DATA_FORK &&
+ (req->req_flags & XFS_SWAP_REQ_SET_SIZES)) {
+ sxi->sxi_flags |= XFS_SWAP_EXT_SET_SIZES;
+ sxi->sxi_isize1 = req->ip2->i_disk_size;
+ sxi->sxi_isize2 = req->ip1->i_disk_size;
+ }
+
+ if (req->req_flags & XFS_SWAP_REQ_SKIP_FILE1_HOLES)
+ sxi->sxi_flags |= XFS_SWAP_EXT_SKIP_FILE1_HOLES;
+
+ return sxi;
+}
+
+/*
+ * Estimate the number of exchange operations and the number of file blocks
+ * in each file that will be affected by the exchange operation.
+ */
+int
+xfs_swapext_estimate(
+ const struct xfs_swapext_req *req,
+ struct xfs_swapext_res *res)
+{
+ struct xfs_swapext_intent *sxi;
+ struct xfs_bmbt_irec irec1, irec2;
+ struct xfs_swapext_adjacent adj = ADJACENT_INIT;
+ xfs_filblks_t ip1_blocks = 0, ip2_blocks = 0;
+ int64_t d_nexts1, d_nexts2;
+ int bmap_flags;
+ int error;
+
+ ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));
+
+ bmap_flags = xfs_bmapi_aflag(req->whichfork);
+ sxi = xfs_swapext_init_intent(req);
+ memset(res, 0, sizeof(struct xfs_swapext_res));
+
+ /*
+ * To guard against the possibility of overflowing the extent counters,
+ * we have to estimate an upper bound on the potential increase in that
+ * counter. We can split the extent at each end of the range, and for
+ * each step of the swap we can split the extent that we're working on
+ * if the extents do not align.
+ */
+ d_nexts1 = d_nexts2 = 3;
+
+ while (sxi_has_more_swap_work(sxi)) {
+ /*
+ * Walk through the file ranges until we find something to
+ * swap. Because we're simulating the swap, pass in adj to
+ * capture skipped mappings for correct estimation of bmbt
+ * record merges.
+ */
+ error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, &adj);
+ if (error)
+ goto out_free;
+ if (!sxi_has_more_swap_work(sxi))
+ break;
+
+ /* Update accounting. */
+ if (xfs_bmap_is_real_extent(&irec1))
+ ip1_blocks += irec1.br_blockcount;
+ if (xfs_bmap_is_real_extent(&irec2))
+ ip2_blocks += irec2.br_blockcount;
+ res->nr_exchanges++;
+
+ /* Read the next extents from both files. */
+ error = get_next_ext(req->ip1, bmap_flags, &irec1, &adj.right1);
+ if (error)
+ goto out_free;
+
+ error = get_next_ext(req->ip2, bmap_flags, &irec2, &adj.right2);
+ if (error)
+ goto out_free;
+
+ /* Update extent count deltas. */
+ d_nexts1 += delta_nextents_step(req->ip1->i_mount,
+ &adj.left1, &irec1, &irec2, &adj.right1);
+
+ d_nexts2 += delta_nextents_step(req->ip1->i_mount,
+ &adj.left2, &irec2, &irec1, &adj.right2);
+
+ /* Now pretend we swapped the extents. */
+ if (can_merge(&adj.left2, &irec1))
+ adj.left2.br_blockcount += irec1.br_blockcount;
+ else
+ memcpy(&adj.left2, &irec1, sizeof(irec1));
+
+ if (can_merge(&adj.left1, &irec2))
+ adj.left1.br_blockcount += irec2.br_blockcount;
+ else
+ memcpy(&adj.left1, &irec2, sizeof(irec2));
+
+ sxi_advance(sxi, &irec1);
+ }
+
+ /* Account for the blocks that are being exchanged. */
+ if (XFS_IS_REALTIME_INODE(req->ip1) &&
+ req->whichfork == XFS_DATA_FORK) {
+ res->ip1_rtbcount = ip1_blocks;
+ res->ip2_rtbcount = ip2_blocks;
+ } else {
+ res->ip1_bcount = ip1_blocks;
+ res->ip2_bcount = ip2_blocks;
+ }
+
+ /*
+ * Make sure that both forks have enough slack left in their extent
+ * counters that the swap operation will not overflow.
+ */
+ trace_xfs_swapext_delta_nextents(req, d_nexts1, d_nexts2);
+ if (req->ip1 == req->ip2) {
+ error = check_delta_nextents(req, req->ip1,
+ d_nexts1 + d_nexts2);
+ } else {
+ error = check_delta_nextents(req, req->ip1, d_nexts1);
+ if (error)
+ goto out_free;
+ error = check_delta_nextents(req, req->ip2, d_nexts2);
+ }
+ if (error)
+ goto out_free;
+
+ error = xfs_swapext_estimate_overhead(req, res);
+out_free:
+ kmem_free(sxi);
+ return error;
+}
+
+/*
+ * Swap a range of extents from one inode to another. If the atomic swap
+ * feature is enabled, the operation can be resumed from where it left off
+ * even if the system goes down.
+ *
+ * The caller must ensure that both inodes are joined to the transaction and
+ * ILOCKed; they will still be joined to the transaction at exit.
+ */
+int
+xfs_swapext(
+ struct xfs_trans **tpp,
+ const struct xfs_swapext_req *req)
+{
+ struct xfs_swapext_intent *sxi;
+ unsigned int reflink_state;
+ int error;
+
+ ASSERT(xfs_isilocked(req->ip1, XFS_ILOCK_EXCL));
+ ASSERT(xfs_isilocked(req->ip2, XFS_ILOCK_EXCL));
+ ASSERT(req->whichfork != XFS_COW_FORK);
+ ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));
+ if (req->req_flags & XFS_SWAP_REQ_SET_SIZES)
+ ASSERT(req->whichfork == XFS_DATA_FORK);
+
+ if (req->blockcount == 0)
+ return 0;
+
+ reflink_state = xfs_swapext_reflink_prep(req);
+
+ sxi = xfs_swapext_init_intent(req);
+ xfs_swapext_schedule(*tpp, sxi);
+
+ error = xfs_defer_finish(tpp);
+ if (error)
+ return error;
+
+ xfs_swapext_reflink_finish(*tpp, req, reflink_state);
+ return 0;
+}
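
For context, a hedged sketch of how a higher-level caller might drive this machinery; the function name, reservation policy, and locking details below are assumptions for illustration, not part of this patch. The flow is: estimate the work, allocate a transaction with that reservation, lock and join both inodes, then hand the request to xfs_swapext() and commit.

    /* Illustrative caller sketch only; error handling is abbreviated. */
    static int
    xfs_swapext_example_caller(
    	struct xfs_inode	*ip1,
    	struct xfs_inode	*ip2,
    	xfs_filblks_t		blockcount)
    {
    	struct xfs_swapext_req	req = {
    		.ip1		= ip1,
    		.ip2		= ip2,
    		.whichfork	= XFS_DATA_FORK,
    		.startoff1	= 0,
    		.startoff2	= 0,
    		.blockcount	= blockcount,
    	};
    	struct xfs_swapext_res	res;
    	struct xfs_mount	*mp = ip1->i_mount;
    	struct xfs_trans	*tp;
    	int			error;

    	/* Size the transaction reservation from the estimate. */
    	error = xfs_swapext_estimate(&req, &res);
    	if (error)
    		return error;

    	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, res.resblks, 0, 0,
    			&tp);
    	if (error)
    		return error;

    	/* Lock and join both inodes, as xfs_swapext() requires. */
    	xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL, ip2, XFS_ILOCK_EXCL);
    	xfs_trans_ijoin(tp, ip1, 0);
    	xfs_trans_ijoin(tp, ip2, 0);

    	error = xfs_swapext(&tp, &req);
    	if (error) {
    		xfs_trans_cancel(tp);
    		goto out_unlock;
    	}

    	error = xfs_trans_commit(tp);
    out_unlock:
    	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
    	xfs_iunlock(ip2, XFS_ILOCK_EXCL);
    	return error;
    }
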