diff options
Diffstat (limited to 'fs/xfs/libxfs/xfs_swapext.c')
-rw-r--r-- | fs/xfs/libxfs/xfs_swapext.c | 900 |
1 file changed, 900 insertions, 0 deletions
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2021 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_icache.h"
#include "xfs_quota.h"
#include "xfs_swapext.h"
#include "xfs_trace.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"

/* Slab cache for incore extent swap intent items. */
struct kmem_cache *xfs_swapext_intent_cache;

/* bmbt mappings adjacent to a pair of records. */
struct xfs_swapext_adjacent {
	struct xfs_bmbt_irec left1;
	struct xfs_bmbt_irec right1;
	struct xfs_bmbt_irec left2;
	struct xfs_bmbt_irec right2;
};

/* Start with every adjacent mapping marked as a hole. */
#define ADJACENT_INIT { \
	.left1 = { .br_startblock = HOLESTARTBLOCK }, \
	.right1 = { .br_startblock = HOLESTARTBLOCK }, \
	.left2 = { .br_startblock = HOLESTARTBLOCK }, \
	.right2 = { .br_startblock = HOLESTARTBLOCK }, \
}

/* Information to help us reset reflink flag / CoW fork state after a swap. */

/* Are we swapping the data fork? */
#define XFS_SX_REFLINK_DATAFORK		(1U << 0)

/* Can we swap the flags? */
#define XFS_SX_REFLINK_SWAPFLAGS	(1U << 1)

/* Previous state of the two inodes' reflink flags. */
#define XFS_SX_REFLINK_IP1_REFLINK	(1U << 2)
#define XFS_SX_REFLINK_IP2_REFLINK	(1U << 3)

/*
 * Prepare both inodes' reflink state for an extent swap, and return our
 * findings so that xfs_swapext_reflink_finish can deal with the aftermath.
 */
unsigned int
xfs_swapext_reflink_prep(
	const struct xfs_swapext_req *req)
{
	struct xfs_mount *mp = req->ip1->i_mount;
	unsigned int rs = 0;

	/* Only a data fork swap can affect the reflink flag. */
	if (req->whichfork != XFS_DATA_FORK)
		return 0;

	/*
	 * If either file has shared blocks and we're swapping data forks, we
	 * must flag the other file as having shared blocks so that we get the
	 * shared-block rmap functions if we need to fix up the rmaps. The
	 * flags will be switched for real by xfs_swapext_reflink_finish.
	 */
	if (xfs_is_reflink_inode(req->ip1))
		rs |= XFS_SX_REFLINK_IP1_REFLINK;
	if (xfs_is_reflink_inode(req->ip2))
		rs |= XFS_SX_REFLINK_IP2_REFLINK;

	if (rs & XFS_SX_REFLINK_IP1_REFLINK)
		req->ip2->i_diflags2 |= XFS_DIFLAG2_REFLINK;
	if (rs & XFS_SX_REFLINK_IP2_REFLINK)
		req->ip1->i_diflags2 |= XFS_DIFLAG2_REFLINK;

	/*
	 * If either file had the reflink flag set before; and the two files'
	 * reflink state was different; and we're swapping the entirety of both
	 * files, then we can exchange the reflink flags at the end.
	 * Otherwise, we propagate the reflink flag from either file to the
	 * other file.
	 *
	 * Note that we've only set the _REFLINK flags of the reflink state, so
	 * we can cheat and use hweight32 for the reflink flag test.
	 */
	if (hweight32(rs) == 1 && req->startoff1 == 0 && req->startoff2 == 0 &&
	    req->blockcount == XFS_B_TO_FSB(mp, req->ip1->i_disk_size) &&
	    req->blockcount == XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
		rs |= XFS_SX_REFLINK_SWAPFLAGS;

	rs |= XFS_SX_REFLINK_DATAFORK;
	return rs;
}

/*
 * If the reflink flag is set on either inode, make sure it has an incore CoW
 * fork, since all reflink inodes must have them. If there's a CoW fork and it
 * has extents in it, make sure the inodes are tagged appropriately so that
 * speculative preallocations can be GC'd if we run low of space.
 */
static inline void
xfs_swapext_ensure_cowfork(
	struct xfs_inode *ip)
{
	struct xfs_ifork *cfork;

	if (xfs_is_reflink_inode(ip))
		xfs_ifork_init_cow(ip);

	cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	if (!cfork)
		return;
	if (cfork->if_bytes > 0)
		xfs_inode_set_cowblocks_tag(ip);
	else
		xfs_inode_clear_cowblocks_tag(ip);
}

/*
 * Set both inodes' ondisk reflink flags to their final state and ensure that
 * the incore state is ready to go.
 */
void
xfs_swapext_reflink_finish(
	struct xfs_trans *tp,
	const struct xfs_swapext_req *req,
	unsigned int rs)
{
	/* Only data fork swaps carry reflink state; see _reflink_prep. */
	if (!(rs & XFS_SX_REFLINK_DATAFORK))
		return;

	if (rs & XFS_SX_REFLINK_SWAPFLAGS) {
		/* Exchange the reflink inode flags and log them. */
		req->ip1->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		if (rs & XFS_SX_REFLINK_IP2_REFLINK)
			req->ip1->i_diflags2 |= XFS_DIFLAG2_REFLINK;

		req->ip2->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		if (rs & XFS_SX_REFLINK_IP1_REFLINK)
			req->ip2->i_diflags2 |= XFS_DIFLAG2_REFLINK;

		xfs_trans_log_inode(tp, req->ip1, XFS_ILOG_CORE);
		xfs_trans_log_inode(tp, req->ip2, XFS_ILOG_CORE);
	}

	xfs_swapext_ensure_cowfork(req->ip1);
	xfs_swapext_ensure_cowfork(req->ip2);
}

/* Schedule an atomic extent swap. */
void
xfs_swapext_schedule(
	struct xfs_trans *tp,
	struct xfs_swapext_intent *sxi)
{
	trace_xfs_swapext_defer(tp->t_mountp, sxi);
	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_SWAPEXT, &sxi->sxi_list);
}

/*
 * Adjust the on-disk inode size upwards if needed so that we never map extents
 * into the file past EOF. This is crucial so that log recovery won't get
 * confused by the sudden appearance of post-eof extents.
 */
STATIC void
xfs_swapext_update_size(
	struct xfs_trans *tp,
	struct xfs_inode *ip,
	struct xfs_bmbt_irec *imap,
	xfs_fsize_t new_isize)
{
	struct xfs_mount *mp = tp->t_mountp;
	xfs_fsize_t len;

	/* A negative isize means the caller did not request size updates. */
	if (new_isize < 0)
		return;

	len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
		  new_isize);

	if (len <= ip->i_disk_size)
		return;

	trace_xfs_swapext_update_inode_size(ip, len);

	ip->i_disk_size = len;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

/* Do we still have blocks left to exchange for this intent? */
static inline bool
sxi_has_more_swap_work(const struct xfs_swapext_intent *sxi)
{
	return sxi->sxi_blockcount > 0;
}

/* Advance the intent cursor past @irec in both files. */
static inline void
sxi_advance(
	struct xfs_swapext_intent *sxi,
	const struct xfs_bmbt_irec *irec)
{
	sxi->sxi_startoff1 += irec->br_blockcount;
	sxi->sxi_startoff2 += irec->br_blockcount;
	sxi->sxi_blockcount -= irec->br_blockcount;
}

/* Check all extents to make sure we can actually swap them. */
int
xfs_swapext_check_extents(
	struct xfs_mount *mp,
	const struct xfs_swapext_req *req)
{
	struct xfs_ifork *ifp1, *ifp2;

	/* No fork? */
	ifp1 = XFS_IFORK_PTR(req->ip1, req->whichfork);
	ifp2 = XFS_IFORK_PTR(req->ip2, req->whichfork);
	if (!ifp1 || !ifp2)
		return -EINVAL;

	/* We don't know how to swap local format forks. */
	if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
	    ifp2->if_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	/* We don't support realtime data forks yet. */
	if (!XFS_IS_REALTIME_INODE(req->ip1))
		return 0;
	if (req->whichfork == XFS_ATTR_FORK)
		return 0;
	return -EINVAL;
}

#ifdef CONFIG_XFS_QUOTA
/* Log the actual updates to the quota accounting. */
static inline void
xfs_swapext_update_quota(
	struct xfs_trans *tp,
	struct xfs_swapext_intent *sxi,
	struct xfs_bmbt_irec *irec1,
	struct xfs_bmbt_irec *irec2)
{
	int64_t ip1_delta = 0, ip2_delta = 0;
	unsigned int qflag;

	/* Realtime files charge blocks against the rt quota counters. */
	qflag = XFS_IS_REALTIME_INODE(sxi->sxi_ip1) ?
			XFS_TRANS_DQ_RTBCOUNT :
			XFS_TRANS_DQ_BCOUNT;

	if (xfs_bmap_is_real_extent(irec1)) {
		ip1_delta -= irec1->br_blockcount;
		ip2_delta += irec1->br_blockcount;
	}

	if (xfs_bmap_is_real_extent(irec2)) {
		ip1_delta += irec2->br_blockcount;
		ip2_delta -= irec2->br_blockcount;
	}

	xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip1, qflag, ip1_delta);
	xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip2, qflag, ip2_delta);
}
#else
# define xfs_swapext_update_quota(tp, sxi, irec1, irec2)	((void)0)
#endif

/*
 * Walk forward through the file ranges in @sxi until we find two different
 * mappings to exchange. If there is work to do, return the mappings;
 * otherwise we've reached the end of the range and sxi_blockcount will be
 * zero.
 *
 * If the walk skips over a pair of mappings to the same storage, save them as
 * the left records in @adj (if provided) so that the simulation phase can
 * avoid an extra lookup.
 */
static int
xfs_swapext_find_mappings(
	struct xfs_swapext_intent *sxi,
	struct xfs_bmbt_irec *irec1,
	struct xfs_bmbt_irec *irec2,
	struct xfs_swapext_adjacent *adj)
{
	int nimaps;
	int bmap_flags;
	int error;

	bmap_flags = xfs_bmapi_aflag(xfs_swapext_whichfork(sxi));

	for (; sxi_has_more_swap_work(sxi); sxi_advance(sxi, irec1)) {
		/* Read extent from the first file */
		nimaps = 1;
		error = xfs_bmapi_read(sxi->sxi_ip1, sxi->sxi_startoff1,
				sxi->sxi_blockcount, irec1, &nimaps,
				bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec1->br_startblock == DELAYSTARTBLOCK ||
		    irec1->br_startoff != sxi->sxi_startoff1) {
			/*
			 * We should never get no mapping or a delalloc extent
			 * or something that doesn't match what we asked for,
			 * since the caller flushed both inodes and we hold the
			 * ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}

		/*
		 * If the caller told us to ignore sparse areas of file1, jump
		 * ahead to the next region.
		 */
		if ((sxi->sxi_flags & XFS_SWAP_EXT_SKIP_FILE1_HOLES) &&
		    irec1->br_startblock == HOLESTARTBLOCK) {
			trace_xfs_swapext_extent1(sxi->sxi_ip1, irec1);
			continue;
		}

		/* Read extent from the second file */
		nimaps = 1;
		error = xfs_bmapi_read(sxi->sxi_ip2, sxi->sxi_startoff2,
				irec1->br_blockcount, irec2, &nimaps,
				bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 ||
		    irec2->br_startblock == DELAYSTARTBLOCK ||
		    irec2->br_startoff != sxi->sxi_startoff2) {
			/*
			 * We should never get no mapping or a delalloc extent
			 * or something that doesn't match what we asked for,
			 * since the caller flushed both inodes and we hold the
			 * ILOCKs for both inodes.
			 */
			ASSERT(0);
			return -EINVAL;
		}

		/*
		 * We can only swap as many blocks as the smaller of the two
		 * extent maps.
		 */
		irec1->br_blockcount = min(irec1->br_blockcount,
					   irec2->br_blockcount);

		trace_xfs_swapext_extent1(sxi->sxi_ip1, irec1);
		trace_xfs_swapext_extent2(sxi->sxi_ip2, irec2);

		/* We found something to swap, so return it. */
		if (irec1->br_startblock != irec2->br_startblock)
			return 0;

		/*
		 * Two extents mapped to the same physical block must not have
		 * different states; that's filesystem corruption. Move on to
		 * the next extent if they're both holes or both the same
		 * physical extent.
		 */
		if (irec1->br_state != irec2->br_state)
			return -EFSCORRUPTED;

		/*
		 * Save the mappings if we're estimating work and skipping
		 * these identical mappings.
		 */
		if (adj) {
			memcpy(&adj->left1, irec1, sizeof(*irec1));
			memcpy(&adj->left2, irec2, sizeof(*irec2));
		}
	}

	return 0;
}

/* Exchange these two mappings. */
static void
xfs_swapext_exchange_mappings(
	struct xfs_trans *tp,
	struct xfs_swapext_intent *sxi,
	struct xfs_bmbt_irec *irec1,
	struct xfs_bmbt_irec *irec2)
{
	int whichfork = xfs_swapext_whichfork(sxi);

	xfs_swapext_update_quota(tp, sxi, irec1, irec2);

	/* Remove both mappings.
 */
	xfs_bmap_unmap_extent(tp, sxi->sxi_ip1, whichfork, irec1);
	xfs_bmap_unmap_extent(tp, sxi->sxi_ip2, whichfork, irec2);

	/*
	 * Re-add both mappings. We swap the file offsets between the two maps
	 * and add the opposite map, which has the effect of filling the
	 * logical offsets we just unmapped, but with the physical mapping
	 * information swapped.
	 */
	swap(irec1->br_startoff, irec2->br_startoff);
	xfs_bmap_map_extent(tp, sxi->sxi_ip1, whichfork, irec2);
	xfs_bmap_map_extent(tp, sxi->sxi_ip2, whichfork, irec1);

	/* Make sure we're not mapping extents past EOF. */
	if (whichfork == XFS_DATA_FORK) {
		xfs_swapext_update_size(tp, sxi->sxi_ip1, irec2,
				sxi->sxi_isize1);
		xfs_swapext_update_size(tp, sxi->sxi_ip2, irec1,
				sxi->sxi_isize2);
	}

	/*
	 * Advance our cursor and exit. The caller (either defer ops or log
	 * recovery) will log the SXD item, and if *blockcount is nonzero, it
	 * will log a new SXI item for the remainder and call us back.
	 */
	sxi_advance(sxi, irec1);
}

/* Finish one extent swap, possibly log more. */
int
xfs_swapext_finish_one(
	struct xfs_trans *tp,
	struct xfs_swapext_intent *sxi)
{
	struct xfs_bmbt_irec irec1, irec2;
	int error;

	/* Find something to swap and swap it. */
	error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, NULL);
	if (error)
		return error;

	if (sxi_has_more_swap_work(sxi))
		xfs_swapext_exchange_mappings(tp, sxi, &irec1, &irec2);

	/*
	 * If the caller asked us to exchange the file sizes and we're done
	 * moving extents, update the ondisk file sizes as part of the final
	 * extent swapping transaction.
	 */
	if (!sxi_has_more_swap_work(sxi) &&
	    (sxi->sxi_flags & XFS_SWAP_EXT_SET_SIZES)) {
		sxi->sxi_ip1->i_disk_size = sxi->sxi_isize1;
		sxi->sxi_ip2->i_disk_size = sxi->sxi_isize2;

		xfs_trans_log_inode(tp, sxi->sxi_ip1, XFS_ILOG_CORE);
		xfs_trans_log_inode(tp, sxi->sxi_ip2, XFS_ILOG_CORE);
	}

	/* If we still have work to do, ask for a new transaction. */
	if (sxi_has_more_swap_work(sxi)) {
		trace_xfs_swapext_defer(tp->t_mountp, sxi);
		return -EAGAIN;
	}

	return 0;
}

/* Estimate the bmbt and rmapbt overhead required to exchange extents. */
static int
xfs_swapext_estimate_overhead(
	const struct xfs_swapext_req *req,
	struct xfs_swapext_res *res)
{
	struct xfs_mount *mp = req->ip1->i_mount;
	unsigned int bmbt_overhead;

	/*
	 * Compute the amount of bmbt blocks we should reserve for each file.
	 *
	 * Conceptually this shouldn't affect the shape of either bmbt, but
	 * since we atomically move extents one by one, we reserve enough space
	 * to handle a bmbt split for each remap operation (t1).
	 *
	 * However, we must be careful to handle a corner case where the
	 * repeated unmap and map activities could result in ping-ponging of
	 * the btree shape. This behavior can come from one of two sources:
	 *
	 * An inode's extent list could have just enough records to straddle
	 * the btree format boundary. If so, the inode could bounce between
	 * btree <-> extent format on unmap -> remap cycles, freeing and
	 * allocating a bmapbt block each time.
	 *
	 * The same thing can happen if we have just enough records in a block
	 * to bounce between one and two leaf blocks. If there aren't enough
	 * sibling blocks to absorb or donate some records, we end up reshaping
	 * the tree with every remap operation. This doesn't seem to happen if
	 * we have more than four bmbt leaf blocks, so we'll make that the
	 * lower bound on the pingponging (t2).
	 *
	 * Therefore, we use XFS_TRANS_RES_FDBLKS so that freed bmbt blocks
	 * are accounted back to the transaction block reservation.
	 */
	bmbt_overhead = XFS_NEXTENTADD_SPACE_RES(mp, res->nr_exchanges,
			req->whichfork);
	res->ip1_bcount += bmbt_overhead;
	res->ip2_bcount += bmbt_overhead;
	res->resblks += 2 * bmbt_overhead;

	/* Apply similar logic to rmapbt reservations. */
	if (xfs_has_rmapbt(mp)) {
		unsigned int rmapbt_overhead;

		if (!XFS_IS_REALTIME_INODE(req->ip1))
			rmapbt_overhead = XFS_NRMAPADD_SPACE_RES(mp,
					res->nr_exchanges);
		else
			rmapbt_overhead = 0;
		res->resblks += 2 * rmapbt_overhead;
	}

	trace_xfs_swapext_estimate(req, res);

	/* The total reservation must fit in the transaction's uint field. */
	if (res->resblks > UINT_MAX)
		return -ENOSPC;
	return 0;
}

/* Decide if we can merge two real extents. */
static inline bool
can_merge(
	const struct xfs_bmbt_irec *b1,
	const struct xfs_bmbt_irec *b2)
{
	/* Don't merge holes. */
	if (b1->br_startblock == HOLESTARTBLOCK ||
	    b2->br_startblock == HOLESTARTBLOCK)
		return false;

	/* Don't merge mappings that aren't real allocated extents. */
	if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
		return false;

	/*
	 * Merge only if logically and physically contiguous, in the same
	 * state, and the combined length still fits in one bmbt record.
	 */
	if (b1->br_startoff + b1->br_blockcount == b2->br_startoff &&
	    b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
	    b1->br_state == b2->br_state &&
	    b1->br_blockcount + b2->br_blockcount <= MAXEXTLEN)
		return true;

	return false;
}

/* Flags describing the current ("C") mapping's contiguity with neighbors. */
#define CLEFT_CONTIG	0x01
#define CRIGHT_CONTIG	0x02
#define CHOLE		0x04
#define CBOTH_CONTIG	(CLEFT_CONTIG | CRIGHT_CONTIG)

/* Flags describing the new ("N") mapping's contiguity with neighbors. */
#define NLEFT_CONTIG	0x10
#define NRIGHT_CONTIG	0x20
#define NHOLE		0x40
#define NBOTH_CONTIG	(NLEFT_CONTIG | NRIGHT_CONTIG)

/* Estimate the effect of a single swap on extent count.
*/ +static inline int +delta_nextents_step( + struct xfs_mount *mp, + const struct xfs_bmbt_irec *left, + const struct xfs_bmbt_irec *curr, + const struct xfs_bmbt_irec *new, + const struct xfs_bmbt_irec *right) +{ + bool lhole, rhole, chole, nhole; + unsigned int state = 0; + int ret = 0; + + lhole = left->br_startblock == HOLESTARTBLOCK; + rhole = right->br_startblock == HOLESTARTBLOCK; + chole = curr->br_startblock == HOLESTARTBLOCK; + nhole = new->br_startblock == HOLESTARTBLOCK; + + if (chole) + state |= CHOLE; + if (!lhole && !chole && can_merge(left, curr)) + state |= CLEFT_CONTIG; + if (!rhole && !chole && can_merge(curr, right)) + state |= CRIGHT_CONTIG; + if ((state & CBOTH_CONTIG) == CBOTH_CONTIG && + left->br_startblock + curr->br_startblock + + right->br_startblock > MAXEXTLEN) + state &= ~CRIGHT_CONTIG; + + if (nhole) + state |= NHOLE; + if (!lhole && !nhole && can_merge(left, new)) + state |= NLEFT_CONTIG; + if (!rhole && !nhole && can_merge(new, right)) + state |= NRIGHT_CONTIG; + if ((state & NBOTH_CONTIG) == NBOTH_CONTIG && + left->br_startblock + new->br_startblock + + right->br_startblock > MAXEXTLEN) + state &= ~NRIGHT_CONTIG; + + switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) { + case CLEFT_CONTIG | CRIGHT_CONTIG: + /* + * left/curr/right are the same extent, so deleting curr causes + * 2 new extents to be created. + */ + ret += 2; + break; + case 0: + /* + * curr is not contiguous with any extent, so we remove curr + * completely + */ + ret--; + break; + case CHOLE: + /* hole, do nothing */ + break; + case CLEFT_CONTIG: + case CRIGHT_CONTIG: + /* trim either left or right, no change */ + break; + } + + switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) { + case NLEFT_CONTIG | NRIGHT_CONTIG: + /* + * left/curr/right will become the same extent, so adding + * curr causes the deletion of right. + */ + ret--; + break; + case 0: + /* new is not contiguous with any extent */ + ret++; + break; + case NHOLE: + /* hole, do nothing. 
 */
		break;
	case NLEFT_CONTIG:
	case NRIGHT_CONTIG:
		/* new is absorbed into left or right, no change */
		break;
	}

	trace_xfs_swapext_delta_nextents_step(mp, left, curr, new, right, ret,
			state);
	return ret;
}

/* Make sure we don't overflow the extent counters. */
static inline int
check_delta_nextents(
	const struct xfs_swapext_req *req,
	struct xfs_inode *ip,
	int64_t delta)
{
	ASSERT(delta < INT_MAX);
	ASSERT(delta > INT_MIN);

	/* Shrinking the extent count can never overflow it. */
	if (delta < 0)
		return 0;

	return xfs_iext_count_may_overflow(ip, req->whichfork, delta);
}

/* Find the next extent after irec. */
static inline int
get_next_ext(
	struct xfs_inode *ip,
	int bmap_flags,
	const struct xfs_bmbt_irec *irec,
	struct xfs_bmbt_irec *nrec)
{
	xfs_fileoff_t off;
	xfs_filblks_t blockcount;
	int nimaps = 1;
	int error;

	off = irec->br_startoff + irec->br_blockcount;
	blockcount = XFS_MAX_FILEOFF - off;
	error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
	if (error)
		return error;
	/*
	 * NOTE(review): nimaps is not rechecked here; presumably
	 * xfs_bmapi_read always fills nrec (reporting a hole) even when
	 * nothing maps past @off -- confirm against xfs_bmapi_read.
	 */
	if (nrec->br_startblock == DELAYSTARTBLOCK ||
	    nrec->br_startoff != off) {
		/*
		 * If we don't get the extent we want, return a zero-length
		 * mapping, which our estimator function will pretend is a hole.
		 * We shouldn't get delalloc reservations.
		 */
		nrec->br_startblock = HOLESTARTBLOCK;
	}

	return 0;
}

/* Set up the slab cache for incore swapext intent items. */
int __init
xfs_swapext_intent_init_cache(void)
{
	xfs_swapext_intent_cache = kmem_cache_create("xfs_swapext_intent",
			sizeof(struct xfs_swapext_intent),
			0, 0, NULL);

	return xfs_swapext_intent_cache != NULL ? 0 : -ENOMEM;
}

/* Tear down the swapext intent slab cache. */
void
xfs_swapext_intent_destroy_cache(void)
{
	kmem_cache_destroy(xfs_swapext_intent_cache);
	xfs_swapext_intent_cache = NULL;
}

/* Allocate and initialize a new incore intent item from a request.
 */
struct xfs_swapext_intent *
xfs_swapext_init_intent(
	const struct xfs_swapext_req *req)
{
	struct xfs_swapext_intent *sxi;

	/* __GFP_NOFAIL means this allocation cannot return NULL. */
	sxi = kmem_cache_alloc(xfs_swapext_intent_cache,
			GFP_NOFS | __GFP_NOFAIL);
	INIT_LIST_HEAD(&sxi->sxi_list);
	sxi->sxi_ip1 = req->ip1;
	sxi->sxi_ip2 = req->ip2;
	sxi->sxi_startoff1 = req->startoff1;
	sxi->sxi_startoff2 = req->startoff2;
	sxi->sxi_blockcount = req->blockcount;
	/* Negative isize means "do not update sizes"; see _update_size. */
	sxi->sxi_isize1 = sxi->sxi_isize2 = -1;
	sxi->sxi_flags = 0;

	if (req->whichfork == XFS_ATTR_FORK)
		sxi->sxi_flags |= XFS_SWAP_EXT_ATTR_FORK;

	if (req->whichfork == XFS_DATA_FORK &&
	    (req->req_flags & XFS_SWAP_REQ_SET_SIZES)) {
		sxi->sxi_flags |= XFS_SWAP_EXT_SET_SIZES;
		/* Each file ends up with the other file's size, so cross. */
		sxi->sxi_isize1 = req->ip2->i_disk_size;
		sxi->sxi_isize2 = req->ip1->i_disk_size;
	}

	if (req->req_flags & XFS_SWAP_REQ_SKIP_FILE1_HOLES)
		sxi->sxi_flags |= XFS_SWAP_EXT_SKIP_FILE1_HOLES;

	return sxi;
}

/*
 * Estimate the number of exchange operations and the number of file blocks
 * in each file that will be affected by the exchange operation.
 */
int
xfs_swapext_estimate(
	const struct xfs_swapext_req *req,
	struct xfs_swapext_res *res)
{
	struct xfs_swapext_intent *sxi;
	struct xfs_bmbt_irec irec1, irec2;
	struct xfs_swapext_adjacent adj = ADJACENT_INIT;
	xfs_filblks_t ip1_blocks = 0, ip2_blocks = 0;
	int64_t d_nexts1, d_nexts2;
	int bmap_flags;
	int error;

	ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));

	bmap_flags = xfs_bmapi_aflag(req->whichfork);
	sxi = xfs_swapext_init_intent(req);
	memset(res, 0, sizeof(struct xfs_swapext_res));

	/*
	 * To guard against the possibility of overflowing the extent counters,
	 * we have to estimate an upper bound on the potential increase in that
	 * counter. We can split the extent at each end of the range, and for
	 * each step of the swap we can split the extent that we're working on
	 * if the extents do not align.
 */
	d_nexts1 = d_nexts2 = 3;

	while (sxi_has_more_swap_work(sxi)) {
		/*
		 * Walk through the file ranges until we find something to
		 * swap. Because we're simulating the swap, pass in adj to
		 * capture skipped mappings for correct estimation of bmbt
		 * record merges.
		 */
		error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, &adj);
		if (error)
			goto out_free;
		if (!sxi_has_more_swap_work(sxi))
			break;

		/* Update accounting. */
		if (xfs_bmap_is_real_extent(&irec1))
			ip1_blocks += irec1.br_blockcount;
		if (xfs_bmap_is_real_extent(&irec2))
			ip2_blocks += irec2.br_blockcount;
		res->nr_exchanges++;

		/* Read the next extents from both files. */
		error = get_next_ext(req->ip1, bmap_flags, &irec1, &adj.right1);
		if (error)
			goto out_free;

		error = get_next_ext(req->ip2, bmap_flags, &irec2, &adj.right2);
		if (error)
			goto out_free;

		/* Update extent count deltas. */
		d_nexts1 += delta_nextents_step(req->ip1->i_mount,
				&adj.left1, &irec1, &irec2, &adj.right1);

		d_nexts2 += delta_nextents_step(req->ip1->i_mount,
				&adj.left2, &irec2, &irec1, &adj.right2);

		/* Now pretend we swapped the extents. */
		if (can_merge(&adj.left2, &irec1))
			adj.left2.br_blockcount += irec1.br_blockcount;
		else
			memcpy(&adj.left2, &irec1, sizeof(irec1));

		if (can_merge(&adj.left1, &irec2))
			adj.left1.br_blockcount += irec2.br_blockcount;
		else
			memcpy(&adj.left1, &irec2, sizeof(irec2));

		sxi_advance(sxi, &irec1);
	}

	/* Account for the blocks that are being exchanged. */
	if (XFS_IS_REALTIME_INODE(req->ip1) &&
	    req->whichfork == XFS_DATA_FORK) {
		res->ip1_rtbcount = ip1_blocks;
		res->ip2_rtbcount = ip2_blocks;
	} else {
		res->ip1_bcount = ip1_blocks;
		res->ip2_bcount = ip2_blocks;
	}

	/*
	 * Make sure that both forks have enough slack left in their extent
	 * counters that the swap operation will not overflow.
	 */
	trace_xfs_swapext_delta_nextents(req, d_nexts1, d_nexts2);
	if (req->ip1 == req->ip2) {
		/* Swapping within one file: both deltas land on one inode. */
		error = check_delta_nextents(req, req->ip1,
				d_nexts1 + d_nexts2);
	} else {
		error = check_delta_nextents(req, req->ip1, d_nexts1);
		if (error)
			goto out_free;
		error = check_delta_nextents(req, req->ip2, d_nexts2);
	}
	if (error)
		goto out_free;

	error = xfs_swapext_estimate_overhead(req, res);
out_free:
	kmem_free(sxi);
	return error;
}

/*
 * Swap a range of extents from one inode to another. If the atomic swap
 * feature is enabled, then the operation progress can be resumed even if the
 * system goes down.
 *
 * The caller must ensure the inodes must be joined to the transaction and
 * ILOCKd; they will still be joined to the transaction at exit.
 */
int
xfs_swapext(
	struct xfs_trans **tpp,
	const struct xfs_swapext_req *req)
{
	struct xfs_swapext_intent *sxi;
	unsigned int reflink_state;
	int error;

	ASSERT(xfs_isilocked(req->ip1, XFS_ILOCK_EXCL));
	ASSERT(xfs_isilocked(req->ip2, XFS_ILOCK_EXCL));
	ASSERT(req->whichfork != XFS_COW_FORK);
	ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));
	if (req->req_flags & XFS_SWAP_REQ_SET_SIZES)
		ASSERT(req->whichfork == XFS_DATA_FORK);

	/* An empty range is a no-op. */
	if (req->blockcount == 0)
		return 0;

	reflink_state = xfs_swapext_reflink_prep(req);

	sxi = xfs_swapext_init_intent(req);
	xfs_swapext_schedule(*tpp, sxi);

	/* Finishing the deferred work may roll *tpp. */
	error = xfs_defer_finish(tpp);
	if (error)
		return error;

	xfs_swapext_reflink_finish(*tpp, req, reflink_state);
	return 0;
}