// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2021 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_icache.h"
#include "xfs_quota.h"
#include "xfs_swapext.h"
#include "xfs_trace.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr.h"
#include "xfs_dir2_priv.h"
#include "xfs_dir2.h"

struct kmem_cache *xfs_swapext_intent_cache;

/* bmbt mappings adjacent to a pair of records. */
struct xfs_swapext_adjacent {
        struct xfs_bmbt_irec    left1;
        struct xfs_bmbt_irec    right1;
        struct xfs_bmbt_irec    left2;
        struct xfs_bmbt_irec    right2;
};

#define ADJACENT_INIT { \
        .left1  = { .br_startblock = HOLESTARTBLOCK }, \
        .right1 = { .br_startblock = HOLESTARTBLOCK }, \
        .left2  = { .br_startblock = HOLESTARTBLOCK }, \
        .right2 = { .br_startblock = HOLESTARTBLOCK }, \
}

/* Information to help us reset reflink flag / CoW fork state after a swap. */

/* Are we swapping the data fork? */
#define XFS_SX_REFLINK_DATAFORK         (1U << 0)
/* Can we swap the flags? */
#define XFS_SX_REFLINK_SWAPFLAGS        (1U << 1)
/* Previous state of the two inodes' reflink flags. */
#define XFS_SX_REFLINK_IP1_REFLINK      (1U << 2)
#define XFS_SX_REFLINK_IP2_REFLINK      (1U << 3)

/*
 * Prepare both inodes' reflink state for an extent swap, and return our
 * findings so that xfs_swapext_reflink_finish can deal with the aftermath.
 */
unsigned int
xfs_swapext_reflink_prep(
        const struct xfs_swapext_req    *req)
{
        struct xfs_mount        *mp = req->ip1->i_mount;
        unsigned int            rs = 0;

        if (req->whichfork != XFS_DATA_FORK)
                return 0;

        /*
         * If either file has shared blocks and we're swapping data forks, we
         * must flag the other file as having shared blocks so that we get
         * the shared-block rmap functions if we need to fix up the rmaps.
         * The flags will be switched for real by xfs_swapext_reflink_finish.
         */
        if (xfs_is_reflink_inode(req->ip1))
                rs |= XFS_SX_REFLINK_IP1_REFLINK;
        if (xfs_is_reflink_inode(req->ip2))
                rs |= XFS_SX_REFLINK_IP2_REFLINK;

        if (rs & XFS_SX_REFLINK_IP1_REFLINK)
                req->ip2->i_diflags2 |= XFS_DIFLAG2_REFLINK;
        if (rs & XFS_SX_REFLINK_IP2_REFLINK)
                req->ip1->i_diflags2 |= XFS_DIFLAG2_REFLINK;

        /*
         * If either file had the reflink flag set before, the two files'
         * reflink states differed, and we're swapping the entirety of both
         * files, then we can exchange the reflink flags at the end.
         * Otherwise, we propagate the reflink flag from either file to the
         * other file.
         *
         * Note that we've only set the _REFLINK flags of the reflink state,
         * so we can cheat and use hweight32 for the reflink flag test.
         */
        if (hweight32(rs) == 1 && req->startoff1 == 0 && req->startoff2 == 0 &&
            req->blockcount == XFS_B_TO_FSB(mp, req->ip1->i_disk_size) &&
            req->blockcount == XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
                rs |= XFS_SX_REFLINK_SWAPFLAGS;

        rs |= XFS_SX_REFLINK_DATAFORK;
        return rs;
}

/*
 * If the reflink flag is set on either inode, make sure it has an incore CoW
 * fork, since all reflink inodes must have them.  If there's a CoW fork and
 * it has extents in it, make sure the inodes are tagged appropriately so
 * that speculative preallocations can be GC'd if we run low on space.
 */
static inline void
xfs_swapext_ensure_cowfork(
        struct xfs_inode        *ip)
{
        struct xfs_ifork        *cfork;

        if (xfs_is_reflink_inode(ip))
                xfs_ifork_init_cow(ip);

        cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
        if (!cfork)
                return;
        if (cfork->if_bytes > 0)
                xfs_inode_set_cowblocks_tag(ip);
        else
                xfs_inode_clear_cowblocks_tag(ip);
}

/*
 * Set both inodes' ondisk reflink flags to their final state and ensure that
 * the incore state is ready to go.
 */
void
xfs_swapext_reflink_finish(
        struct xfs_trans                *tp,
        const struct xfs_swapext_req    *req,
        unsigned int                    rs)
{
        if (!(rs & XFS_SX_REFLINK_DATAFORK))
                return;

        if (rs & XFS_SX_REFLINK_SWAPFLAGS) {
                /* Exchange the reflink inode flags and log them. */
                req->ip1->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
                if (rs & XFS_SX_REFLINK_IP2_REFLINK)
                        req->ip1->i_diflags2 |= XFS_DIFLAG2_REFLINK;

                req->ip2->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
                if (rs & XFS_SX_REFLINK_IP1_REFLINK)
                        req->ip2->i_diflags2 |= XFS_DIFLAG2_REFLINK;

                xfs_trans_log_inode(tp, req->ip1, XFS_ILOG_CORE);
                xfs_trans_log_inode(tp, req->ip2, XFS_ILOG_CORE);
        }

        xfs_swapext_ensure_cowfork(req->ip1);
        xfs_swapext_ensure_cowfork(req->ip2);
}

/* Schedule an atomic extent swap. */
void
xfs_swapext_schedule(
        struct xfs_trans                *tp,
        struct xfs_swapext_intent       *sxi)
{
        trace_xfs_swapext_defer(tp->t_mountp, sxi);
        xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_SWAPEXT, &sxi->sxi_list);
}

/*
 * Adjust the on-disk inode size upwards if needed so that we never map
 * extents into the file past EOF.  This is crucial so that log recovery
 * won't get confused by the sudden appearance of post-eof extents.
 */
STATIC void
xfs_swapext_update_size(
        struct xfs_trans        *tp,
        struct xfs_inode        *ip,
        struct xfs_bmbt_irec    *imap,
        xfs_fsize_t             new_isize)
{
        struct xfs_mount        *mp = tp->t_mountp;
        xfs_fsize_t             len;

        if (new_isize < 0)
                return;

        len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
                  new_isize);

        if (len <= ip->i_disk_size)
                return;

        trace_xfs_swapext_update_inode_size(ip, len);
        ip->i_disk_size = len;
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

static inline bool
sxi_has_more_swap_work(const struct xfs_swapext_intent *sxi)
{
        return sxi->sxi_blockcount > 0;
}

static inline bool
sxi_has_postop_work(const struct xfs_swapext_intent *sxi)
{
        return sxi->sxi_flags & XFS_SWAP_EXT_FILE2_CVT_SF;
}

static inline void
sxi_advance(
        struct xfs_swapext_intent       *sxi,
        const struct xfs_bmbt_irec      *irec)
{
        sxi->sxi_startoff1 += irec->br_blockcount;
        sxi->sxi_startoff2 += irec->br_blockcount;
        sxi->sxi_blockcount -= irec->br_blockcount;
}

/*
 * There may be partially written rt extents lurking in the ranges to be
 * swapped.  According to the rules for realtime files with big rt extents,
 * we must guarantee that an outside observer (an IO thread, realistically)
 * never can see multiple physical rt extents mapped to the same logical file
 * rt extent.  The deferred bmap log intent items that we use under the hood
 * operate on single block mappings and not rt extents, which means we must
 * have a strategy to ensure that log recovery after a failure won't stop in
 * the middle of an rt extent.
 *
 * The preferred strategy is to use deferred extent swap log intent items to
 * track the status of the overall swap operation so that we can complete the
 * work during crash recovery.  If that isn't possible, we fall back to
 * requiring the selected mappings in both forks to be aligned to rt extent
 * boundaries.  As an aside, the old fork swap routine didn't have this
 * requirement, but at an extreme cost in flexibility (full files only, no
 * attr forks, and no support if rmapbt is enabled).
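 *
 * For example, with an 8-block rt extent size, a crash midway through
 * remapping one rt extent could otherwise leave some of its blocks pointing
 * at file1's old space and the rest at file2's, which is exactly the mixed
 * mapping state that must never become observable.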
 */
bool
xfs_swapext_need_rt_conversion(
        struct xfs_inode        *ip)
{
        struct xfs_mount        *mp = ip->i_mount;

        /* We're using the extent swap intent items */
        if (xfs_has_atomicswap(mp))
                return false;

        /* The only supported operation is full fork swaps */
        if (!xfs_can_atomicswap(mp))
                return false;

        /* Conversion is only needed for realtime files with big rt extents */
        return xfs_inode_has_bigrtextents(ip);
}

#ifdef DEBUG
static inline int
xfs_swapext_check_rt_extents(
        struct xfs_mount                *mp,
        const struct xfs_swapext_req    *req)
{
        struct xfs_bmbt_irec    irec1, irec2;
        xfs_fileoff_t           startoff1 = req->startoff1;
        xfs_fileoff_t           startoff2 = req->startoff2;
        xfs_filblks_t           blockcount = req->blockcount;
        uint32_t                mod;
        int                     nimaps;
        int                     error;

        if (req->whichfork == XFS_ATTR_FORK ||
            !xfs_swapext_need_rt_conversion(req->ip2))
                return 0;

        while (blockcount > 0) {
                /* Read extent from the first file */
                nimaps = 1;
                error = xfs_bmapi_read(req->ip1, startoff1, blockcount,
                                &irec1, &nimaps, 0);
                if (error)
                        return error;
                ASSERT(nimaps == 1);

                /* Read extent from the second file */
                nimaps = 1;
                error = xfs_bmapi_read(req->ip2, startoff2,
                                irec1.br_blockcount, &irec2, &nimaps, 0);
                if (error)
                        return error;
                ASSERT(nimaps == 1);

                /*
                 * We can only swap as many blocks as the smaller of the two
                 * extent maps.
                 */
                irec1.br_blockcount = min(irec1.br_blockcount,
                                          irec2.br_blockcount);

                /* Both mappings must be aligned to the realtime extent size. */
                div_u64_rem(irec1.br_startoff, mp->m_sb.sb_rextsize, &mod);
                if (mod) {
                        ASSERT(mod == 0);
                        return -EINVAL;
                }

                div_u64_rem(irec2.br_startoff, mp->m_sb.sb_rextsize, &mod);
                if (mod) {
                        ASSERT(mod == 0);
                        return -EINVAL;
                }

                div_u64_rem(irec1.br_blockcount, mp->m_sb.sb_rextsize, &mod);
                if (mod) {
                        ASSERT(mod == 0);
                        return -EINVAL;
                }

                startoff1 += irec1.br_blockcount;
                startoff2 += irec1.br_blockcount;
                blockcount -= irec1.br_blockcount;
        }

        return 0;
}
#else
# define xfs_swapext_check_rt_extents(mp, req)          (0)
#endif

/* Check all extents to make sure we can actually swap them. */
int
xfs_swapext_check_extents(
        struct xfs_mount                *mp,
        const struct xfs_swapext_req    *req)
{
        struct xfs_ifork        *ifp1, *ifp2;

        /* No fork? */
        ifp1 = XFS_IFORK_PTR(req->ip1, req->whichfork);
        ifp2 = XFS_IFORK_PTR(req->ip2, req->whichfork);
        if (!ifp1 || !ifp2)
                return -EINVAL;

        /* We don't know how to swap local format forks. */
        if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
            ifp2->if_format == XFS_DINODE_FMT_LOCAL)
                return -EINVAL;

        return xfs_swapext_check_rt_extents(mp, req);
}

#ifdef CONFIG_XFS_QUOTA
/* Log the actual updates to the quota accounting. */
static inline void
xfs_swapext_update_quota(
        struct xfs_trans                *tp,
        struct xfs_swapext_intent       *sxi,
        struct xfs_bmbt_irec            *irec1,
        struct xfs_bmbt_irec            *irec2)
{
        int64_t                 ip1_delta = 0, ip2_delta = 0;
        unsigned int            qflag;

        qflag = XFS_IS_REALTIME_INODE(sxi->sxi_ip1) ? XFS_TRANS_DQ_RTBCOUNT :
                                                      XFS_TRANS_DQ_BCOUNT;

        if (xfs_bmap_is_real_extent(irec1)) {
                ip1_delta -= irec1->br_blockcount;
                ip2_delta += irec1->br_blockcount;
        }

        if (xfs_bmap_is_real_extent(irec2)) {
                ip1_delta += irec2->br_blockcount;
                ip2_delta -= irec2->br_blockcount;
        }

        xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip1, qflag, ip1_delta);
        xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip2, qflag, ip2_delta);
}
#else
# define xfs_swapext_update_quota(tp, sxi, irec1, irec2)        ((void)0)
#endif

/*
 * Walk forward through the file ranges in @sxi until we find two different
 * mappings to exchange.  If there is work to do, return the mappings;
 * otherwise we've reached the end of the range and sxi_blockcount will be
 * zero.
 *
 * If the walk skips over a pair of mappings to the same storage, save them as
 * the left records in @adj (if provided) so that the simulation phase can
 * avoid an extra lookup.
 */
static int
xfs_swapext_find_mappings(
        struct xfs_swapext_intent       *sxi,
        struct xfs_bmbt_irec            *irec1,
        struct xfs_bmbt_irec            *irec2,
        struct xfs_swapext_adjacent     *adj)
{
        int                     nimaps;
        int                     bmap_flags;
        int                     error;

        bmap_flags = xfs_bmapi_aflag(xfs_swapext_whichfork(sxi));

        for (; sxi_has_more_swap_work(sxi); sxi_advance(sxi, irec1)) {
                /* Read extent from the first file */
                nimaps = 1;
                error = xfs_bmapi_read(sxi->sxi_ip1, sxi->sxi_startoff1,
                                sxi->sxi_blockcount, irec1, &nimaps,
                                bmap_flags);
                if (error)
                        return error;
                if (nimaps != 1 ||
                    irec1->br_startblock == DELAYSTARTBLOCK ||
                    irec1->br_startoff != sxi->sxi_startoff1) {
                        /*
                         * We should never get no mapping or a delalloc extent
                         * or something that doesn't match what we asked for,
                         * since the caller flushed both inodes and we hold
                         * the ILOCKs for both inodes.
                         */
                        ASSERT(0);
                        return -EINVAL;
                }

                /*
                 * If the caller told us to ignore sparse areas of file1, jump
                 * ahead to the next region.
                 */
                if ((sxi->sxi_flags & XFS_SWAP_EXT_SKIP_FILE1_HOLES) &&
                    irec1->br_startblock == HOLESTARTBLOCK) {
                        trace_xfs_swapext_extent1(sxi->sxi_ip1, irec1);
                        continue;
                }

                /* Read extent from the second file */
                nimaps = 1;
                error = xfs_bmapi_read(sxi->sxi_ip2, sxi->sxi_startoff2,
                                irec1->br_blockcount, irec2, &nimaps,
                                bmap_flags);
                if (error)
                        return error;
                if (nimaps != 1 ||
                    irec2->br_startblock == DELAYSTARTBLOCK ||
                    irec2->br_startoff != sxi->sxi_startoff2) {
                        /*
                         * We should never get no mapping or a delalloc extent
                         * or something that doesn't match what we asked for,
                         * since the caller flushed both inodes and we hold
                         * the ILOCKs for both inodes.
                         */
                        ASSERT(0);
                        return -EINVAL;
                }

                /*
                 * We can only swap as many blocks as the smaller of the two
                 * extent maps.
                 */
                irec1->br_blockcount = min(irec1->br_blockcount,
                                           irec2->br_blockcount);

                trace_xfs_swapext_extent1(sxi->sxi_ip1, irec1);
                trace_xfs_swapext_extent2(sxi->sxi_ip2, irec2);

                /* We found something to swap, so return it. */
                if (irec1->br_startblock != irec2->br_startblock)
                        return 0;

                /*
                 * Two extents mapped to the same physical block must not have
                 * different states; that's filesystem corruption.  Move on to
                 * the next extent if they're both holes or both the same
                 * physical extent.
                 */
                if (irec1->br_state != irec2->br_state)
                        return -EFSCORRUPTED;

                /*
                 * Save the mappings if we're estimating work and skipping
                 * these identical mappings.
                 */
                if (adj) {
                        memcpy(&adj->left1, irec1, sizeof(*irec1));
                        memcpy(&adj->left2, irec2, sizeof(*irec2));
                }
        }

        return 0;
}

/* Exchange these two mappings. */
static void
xfs_swapext_exchange_mappings(
        struct xfs_trans                *tp,
        struct xfs_swapext_intent       *sxi,
        struct xfs_bmbt_irec            *irec1,
        struct xfs_bmbt_irec            *irec2)
{
        int                     whichfork = xfs_swapext_whichfork(sxi);

        xfs_swapext_update_quota(tp, sxi, irec1, irec2);

        /* Remove both mappings. */
        xfs_bmap_unmap_extent(tp, sxi->sxi_ip1, whichfork, irec1);
        xfs_bmap_unmap_extent(tp, sxi->sxi_ip2, whichfork, irec2);

        /*
         * Re-add both mappings.  We swap the file offsets between the two
         * maps and add the opposite map, which has the effect of filling the
         * logical offsets we just unmapped, but with the physical mapping
         * information swapped.
         */
        swap(irec1->br_startoff, irec2->br_startoff);
        xfs_bmap_map_extent(tp, sxi->sxi_ip1, whichfork, irec2);
        xfs_bmap_map_extent(tp, sxi->sxi_ip2, whichfork, irec1);

        /*
         * Make sure we're not mapping extents past EOF.
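         * xfs_swapext_update_size only ever moves the ondisk size upwards;
         * if the caller asked us to exchange the file sizes outright, that
         * happens in the final transaction in xfs_swapext_finish_one.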
         */
        if (whichfork == XFS_DATA_FORK) {
                xfs_swapext_update_size(tp, sxi->sxi_ip1, irec2,
                                sxi->sxi_isize1);
                xfs_swapext_update_size(tp, sxi->sxi_ip2, irec1,
                                sxi->sxi_isize2);
        }

        /*
         * Advance our cursor and exit.  The caller (either defer ops or log
         * recovery) will log the SXD item, and if *blockcount is nonzero, it
         * will log a new SXI item for the remainder and call us back.
         */
        sxi_advance(sxi, irec1);
}

/* Convert inode2's leaf attr fork back to shortform, if possible. */
STATIC int
xfs_swapext_attr_to_sf(
        struct xfs_trans                *tp,
        struct xfs_swapext_intent       *sxi)
{
        struct xfs_da_args      args = {
                .dp             = sxi->sxi_ip2,
                .geo            = tp->t_mountp->m_attr_geo,
                .whichfork      = XFS_ATTR_FORK,
                .trans          = tp,
        };
        struct xfs_buf          *bp;
        int                     forkoff;
        int                     error;

        if (!xfs_attr_is_leaf(sxi->sxi_ip2))
                return 0;

        error = xfs_attr3_leaf_read(tp, sxi->sxi_ip2, 0, &bp);
        if (error)
                return error;

        forkoff = xfs_attr_shortform_allfit(bp, sxi->sxi_ip2);
        if (forkoff == 0)
                return 0;

        return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
}

/* Convert inode2's block dir fork back to shortform, if possible. */
STATIC int
xfs_swapext_dir_to_sf(
        struct xfs_trans                *tp,
        struct xfs_swapext_intent       *sxi)
{
        struct xfs_da_args      args = {
                .dp             = sxi->sxi_ip2,
                .geo            = tp->t_mountp->m_dir_geo,
                .whichfork      = XFS_DATA_FORK,
                .trans          = tp,
        };
        struct xfs_dir2_sf_hdr  sfh;
        struct xfs_buf          *bp;
        int                     isblock;
        int                     size;
        int                     error;

        error = xfs_dir2_isblock(&args, &isblock);
        if (error)
                return error;

        if (!isblock)
                return 0;

        error = xfs_dir3_block_read(tp, sxi->sxi_ip2, &bp);
        if (error)
                return error;

        size = xfs_dir2_block_sfsize(sxi->sxi_ip2, bp->b_addr, &sfh);
        if (size > XFS_IFORK_DSIZE(sxi->sxi_ip2))
                return 0;

        return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
}

/* Finish whatever work might come after a swap operation. */
static int
xfs_swapext_postop_work(
        struct xfs_trans                *tp,
        struct xfs_swapext_intent       *sxi)
{
        int                     error = 0;

        if (sxi->sxi_flags & XFS_SWAP_EXT_FILE2_CVT_SF) {
                if (sxi->sxi_flags & XFS_SWAP_EXT_ATTR_FORK)
                        error = xfs_swapext_attr_to_sf(tp, sxi);
                else if (S_ISDIR(VFS_I(sxi->sxi_ip2)->i_mode))
                        error = xfs_swapext_dir_to_sf(tp, sxi);
                sxi->sxi_flags &= ~XFS_SWAP_EXT_FILE2_CVT_SF;
                if (error)
                        return error;
        }

        return 0;
}

/* Finish one extent swap, possibly log more. */
int
xfs_swapext_finish_one(
        struct xfs_trans                *tp,
        struct xfs_swapext_intent       *sxi)
{
        struct xfs_bmbt_irec    irec1, irec2;
        int                     error = 0;

        /*
         * If there isn't any exchange work to do, the previous transaction
         * finished the extent swap and now we need to do some post-op
         * cleanup work on file2.
         */
        if (!sxi_has_more_swap_work(sxi)) {
                ASSERT(sxi_has_postop_work(sxi));
                return xfs_swapext_postop_work(tp, sxi);
        }

        /* Find something to swap and swap it. */
        error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, NULL);
        if (error)
                return error;

        if (sxi_has_more_swap_work(sxi))
                xfs_swapext_exchange_mappings(tp, sxi, &irec1, &irec2);

        /*
         * If the caller asked us to exchange the file sizes and we're done
         * moving extents, update the ondisk file sizes as part of the final
         * extent swapping transaction.
         */
        if (!sxi_has_more_swap_work(sxi) &&
            (sxi->sxi_flags & XFS_SWAP_EXT_SET_SIZES)) {
                sxi->sxi_ip1->i_disk_size = sxi->sxi_isize1;
                sxi->sxi_ip2->i_disk_size = sxi->sxi_isize2;

                xfs_trans_log_inode(tp, sxi->sxi_ip1, XFS_ILOG_CORE);
                xfs_trans_log_inode(tp, sxi->sxi_ip2, XFS_ILOG_CORE);
        }

        if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_SWAPEXT_FINISH_ONE))
                return -EIO;

        /*
         * If we still have work to do, ask for a new transaction.
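         * Returning -EAGAIN below asks the caller to log a new intent item
         * for the remaining work and to call us back with a fresh
         * transaction.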
         */
        if (sxi_has_more_swap_work(sxi) || sxi_has_postop_work(sxi)) {
                trace_xfs_swapext_defer(tp->t_mountp, sxi);
                return -EAGAIN;
        }

        return 0;
}

/* Estimate the bmbt and rmapbt overhead required to exchange extents. */
int
xfs_swapext_estimate_overhead(
        const struct xfs_swapext_req    *req,
        struct xfs_swapext_res          *res)
{
        struct xfs_mount        *mp = req->ip1->i_mount;
        unsigned int            bmbt_overhead;

        /*
         * Compute the amount of bmbt blocks we should reserve for each file.
         *
         * Conceptually this shouldn't affect the shape of either bmbt, but
         * since we atomically move extents one by one, we reserve enough
         * space to handle a bmbt split for each remap operation (t1).
         *
         * However, we must be careful to handle a corner case where the
         * repeated unmap and map activities could result in ping-ponging of
         * the btree shape.  This behavior can come from one of two sources:
         *
         * An inode's extent list could have just enough records to straddle
         * the btree format boundary.  If so, the inode could bounce between
         * btree <-> extent format on unmap -> remap cycles, freeing and
         * allocating a bmapbt block each time.
         *
         * The same thing can happen if we have just enough records in a
         * block to bounce between one and two leaf blocks.  If there aren't
         * enough sibling blocks to absorb or donate some records, we end up
         * reshaping the tree with every remap operation.  This doesn't seem
         * to happen if we have more than four bmbt leaf blocks, so we'll
         * make that the lower bound on the pingponging (t2).
         *
         * Therefore, we use XFS_TRANS_RES_FDBLKS so that freed bmbt blocks
         * are accounted back to the transaction block reservation.
         */
        bmbt_overhead = XFS_NEXTENTADD_SPACE_RES(mp, res->nr_exchanges,
                        req->whichfork);
        res->ip1_bcount += bmbt_overhead;
        res->ip2_bcount += bmbt_overhead;
        res->resblks += 2 * bmbt_overhead;

        /* Apply similar logic to rmapbt reservations. */
        if (xfs_has_rmapbt(mp)) {
                unsigned int    rmapbt_overhead;

                if (!XFS_IS_REALTIME_INODE(req->ip1))
                        rmapbt_overhead = XFS_NRMAPADD_SPACE_RES(mp,
                                        res->nr_exchanges);
                else
                        rmapbt_overhead = XFS_NRTRMAPADD_SPACE_RES(mp,
                                        res->nr_exchanges);
                res->resblks += 2 * rmapbt_overhead;
        }

        trace_xfs_swapext_estimate(req, res);

        if (res->resblks > UINT_MAX)
                return -ENOSPC;
        return 0;
}

/* Decide if we can merge two real extents. */
static inline bool
can_merge(
        const struct xfs_bmbt_irec      *b1,
        const struct xfs_bmbt_irec      *b2)
{
        /* Don't merge holes. */
        if (b1->br_startblock == HOLESTARTBLOCK ||
            b2->br_startblock == HOLESTARTBLOCK)
                return false;

        /* Don't merge unwritten or delalloc mappings, either. */
        if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
                return false;

        if (b1->br_startoff + b1->br_blockcount == b2->br_startoff &&
            b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
            b1->br_state == b2->br_state &&
            b1->br_blockcount + b2->br_blockcount <= MAXEXTLEN)
                return true;

        return false;
}

#define CLEFT_CONTIG    0x01
#define CRIGHT_CONTIG   0x02
#define CHOLE           0x04
#define CBOTH_CONTIG    (CLEFT_CONTIG | CRIGHT_CONTIG)

#define NLEFT_CONTIG    0x10
#define NRIGHT_CONTIG   0x20
#define NHOLE           0x40
#define NBOTH_CONTIG    (NLEFT_CONTIG | NRIGHT_CONTIG)

/*
 * Estimate the effect of a single swap on extent count.
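 *
 * @curr is the mapping being removed from the neighborhood of @left and
 * @right, and @new is the mapping from the other file that will take its
 * place.  The return value is the expected change in the file's extent
 * count.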
 */
static inline int
delta_nextents_step(
        struct xfs_mount                *mp,
        const struct xfs_bmbt_irec      *left,
        const struct xfs_bmbt_irec      *curr,
        const struct xfs_bmbt_irec      *new,
        const struct xfs_bmbt_irec      *right)
{
        bool                    lhole, rhole, chole, nhole;
        unsigned int            state = 0;
        int                     ret = 0;

        lhole = left->br_startblock == HOLESTARTBLOCK;
        rhole = right->br_startblock == HOLESTARTBLOCK;
        chole = curr->br_startblock == HOLESTARTBLOCK;
        nhole = new->br_startblock == HOLESTARTBLOCK;

        if (chole)
                state |= CHOLE;
        if (!lhole && !chole && can_merge(left, curr))
                state |= CLEFT_CONTIG;
        if (!rhole && !chole && can_merge(curr, right))
                state |= CRIGHT_CONTIG;
        if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
            left->br_blockcount + curr->br_blockcount +
                        right->br_blockcount > MAXEXTLEN)
                state &= ~CRIGHT_CONTIG;

        if (nhole)
                state |= NHOLE;
        if (!lhole && !nhole && can_merge(left, new))
                state |= NLEFT_CONTIG;
        if (!rhole && !nhole && can_merge(new, right))
                state |= NRIGHT_CONTIG;
        if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
            left->br_blockcount + new->br_blockcount +
                        right->br_blockcount > MAXEXTLEN)
                state &= ~NRIGHT_CONTIG;

        switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
        case CLEFT_CONTIG | CRIGHT_CONTIG:
                /*
                 * left/curr/right are the same extent, so deleting curr
                 * causes 2 new extents to be created.
                 */
                ret += 2;
                break;
        case 0:
                /*
                 * curr is not contiguous with any extent, so we remove curr
                 * completely
                 */
                ret--;
                break;
        case CHOLE:
                /* hole, do nothing */
                break;
        case CLEFT_CONTIG:
        case CRIGHT_CONTIG:
                /* trim either left or right, no change */
                break;
        }

        switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
        case NLEFT_CONTIG | NRIGHT_CONTIG:
                /*
                 * left/curr/right will become the same extent, so adding
                 * curr causes the deletion of right.
                 */
                ret--;
                break;
        case 0:
                /* new is not contiguous with any extent */
                ret++;
                break;
        case NHOLE:
                /* hole, do nothing. */
                break;
        case NLEFT_CONTIG:
        case NRIGHT_CONTIG:
                /* new is absorbed into left or right, no change */
                break;
        }

        trace_xfs_swapext_delta_nextents_step(mp, left, curr, new, right, ret,
                        state);
        return ret;
}

/* Make sure we don't overflow the extent counters. */
static inline int
check_delta_nextents(
        const struct xfs_swapext_req    *req,
        struct xfs_inode                *ip,
        int64_t                         delta)
{
        ASSERT(delta < INT_MAX);
        ASSERT(delta > INT_MIN);

        if (delta < 0)
                return 0;

        return xfs_iext_count_may_overflow(ip, req->whichfork, delta);
}

/* Find the next extent after irec. */
static inline int
get_next_ext(
        struct xfs_inode                *ip,
        int                             bmap_flags,
        const struct xfs_bmbt_irec      *irec,
        struct xfs_bmbt_irec            *nrec)
{
        xfs_fileoff_t           off;
        xfs_filblks_t           blockcount;
        int                     nimaps = 1;
        int                     error;

        off = irec->br_startoff + irec->br_blockcount;
        blockcount = XFS_MAX_FILEOFF - off;
        error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
        if (error)
                return error;
        if (nrec->br_startblock == DELAYSTARTBLOCK ||
            nrec->br_startoff != off) {
                /*
                 * If we don't get the extent we want, return a zero-length
                 * mapping, which our estimator function will pretend is a
                 * hole.  We shouldn't get delalloc reservations.
                 */
                nrec->br_startblock = HOLESTARTBLOCK;
        }

        return 0;
}

int __init
xfs_swapext_intent_init_cache(void)
{
        xfs_swapext_intent_cache = kmem_cache_create("xfs_swapext_intent",
                        sizeof(struct xfs_swapext_intent),
                        0, 0, NULL);

        return xfs_swapext_intent_cache != NULL ? 0 : -ENOMEM;
}

void
xfs_swapext_intent_destroy_cache(void)
{
        kmem_cache_destroy(xfs_swapext_intent_cache);
        xfs_swapext_intent_cache = NULL;
}

/*
 * Allocate and initialize a new incore intent item from a request.
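 *
 * The intent is either handed to the defer ops machinery via
 * xfs_swapext_schedule or, in the estimation path, freed directly by the
 * caller.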
 */
struct xfs_swapext_intent *
xfs_swapext_init_intent(
        const struct xfs_swapext_req    *req)
{
        struct xfs_swapext_intent       *sxi;

        sxi = kmem_cache_alloc(xfs_swapext_intent_cache,
                        GFP_NOFS | __GFP_NOFAIL);
        INIT_LIST_HEAD(&sxi->sxi_list);
        sxi->sxi_ip1 = req->ip1;
        sxi->sxi_ip2 = req->ip2;
        sxi->sxi_startoff1 = req->startoff1;
        sxi->sxi_startoff2 = req->startoff2;
        sxi->sxi_blockcount = req->blockcount;
        sxi->sxi_isize1 = sxi->sxi_isize2 = -1;
        sxi->sxi_flags = 0;

        if (req->whichfork == XFS_ATTR_FORK)
                sxi->sxi_flags |= XFS_SWAP_EXT_ATTR_FORK;

        if (req->whichfork == XFS_DATA_FORK &&
            (req->req_flags & XFS_SWAP_REQ_SET_SIZES)) {
                sxi->sxi_flags |= XFS_SWAP_EXT_SET_SIZES;
                sxi->sxi_isize1 = req->ip2->i_disk_size;
                sxi->sxi_isize2 = req->ip1->i_disk_size;
        }

        if (req->req_flags & XFS_SWAP_REQ_SKIP_FILE1_HOLES)
                sxi->sxi_flags |= XFS_SWAP_EXT_SKIP_FILE1_HOLES;
        if (req->req_flags & XFS_SWAP_REQ_FILE2_CVT_SF)
                sxi->sxi_flags |= XFS_SWAP_EXT_FILE2_CVT_SF;

        return sxi;
}

/*
 * Estimate the number of exchange operations and the number of file blocks
 * in each file that will be affected by the exchange operation.
 */
int
xfs_swapext_estimate(
        const struct xfs_swapext_req    *req,
        struct xfs_swapext_res          *res)
{
        struct xfs_swapext_intent       *sxi;
        struct xfs_bmbt_irec            irec1, irec2;
        struct xfs_swapext_adjacent     adj = ADJACENT_INIT;
        xfs_filblks_t                   ip1_blocks = 0, ip2_blocks = 0;
        int64_t                         d_nexts1, d_nexts2;
        int                             bmap_flags;
        int                             error;

        ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));

        bmap_flags = xfs_bmapi_aflag(req->whichfork);
        sxi = xfs_swapext_init_intent(req);
        memset(res, 0, sizeof(struct xfs_swapext_res));

        /*
         * To guard against the possibility of overflowing the extent
         * counters, we have to estimate an upper bound on the potential
         * increase in that counter.  We can split the extent at each end of
         * the range, and for each step of the swap we can split the extent
         * that we're working on if the extents do not align.
         */
        d_nexts1 = d_nexts2 = 3;

        while (sxi_has_more_swap_work(sxi)) {
                /*
                 * Walk through the file ranges until we find something to
                 * swap.  Because we're simulating the swap, pass in adj to
                 * capture skipped mappings for correct estimation of bmbt
                 * record merges.
                 */
                error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, &adj);
                if (error)
                        goto out_free;
                if (!sxi_has_more_swap_work(sxi))
                        break;

                /* Update accounting. */
                if (xfs_bmap_is_real_extent(&irec1))
                        ip1_blocks += irec1.br_blockcount;
                if (xfs_bmap_is_real_extent(&irec2))
                        ip2_blocks += irec2.br_blockcount;
                res->nr_exchanges++;

                /* Read the next extents from both files. */
                error = get_next_ext(req->ip1, bmap_flags, &irec1,
                                &adj.right1);
                if (error)
                        goto out_free;

                error = get_next_ext(req->ip2, bmap_flags, &irec2,
                                &adj.right2);
                if (error)
                        goto out_free;

                /* Update extent count deltas. */
                d_nexts1 += delta_nextents_step(req->ip1->i_mount,
                                &adj.left1, &irec1, &irec2, &adj.right1);

                d_nexts2 += delta_nextents_step(req->ip1->i_mount,
                                &adj.left2, &irec2, &irec1, &adj.right2);

                /* Now pretend we swapped the extents. */
                if (can_merge(&adj.left2, &irec1))
                        adj.left2.br_blockcount += irec1.br_blockcount;
                else
                        memcpy(&adj.left2, &irec1, sizeof(irec1));
                if (can_merge(&adj.left1, &irec2))
                        adj.left1.br_blockcount += irec2.br_blockcount;
                else
                        memcpy(&adj.left1, &irec2, sizeof(irec2));

                sxi_advance(sxi, &irec1);
        }

        /*
         * Account for the blocks that are being exchanged.
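         * Realtime data fork blocks are reported via the rtbcount fields
         * and everything else via the bcount fields, mirroring the
         * rt/regular split used by the quota accounting updates.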
         */
        if (XFS_IS_REALTIME_INODE(req->ip1) &&
            req->whichfork == XFS_DATA_FORK) {
                res->ip1_rtbcount = ip1_blocks;
                res->ip2_rtbcount = ip2_blocks;
        } else {
                res->ip1_bcount = ip1_blocks;
                res->ip2_bcount = ip2_blocks;
        }

        /*
         * Make sure that both forks have enough slack left in their extent
         * counters that the swap operation will not overflow.
         */
        trace_xfs_swapext_delta_nextents(req, d_nexts1, d_nexts2);
        if (req->ip1 == req->ip2) {
                error = check_delta_nextents(req, req->ip1,
                                d_nexts1 + d_nexts2);
        } else {
                error = check_delta_nextents(req, req->ip1, d_nexts1);
                if (error)
                        goto out_free;
                error = check_delta_nextents(req, req->ip2, d_nexts2);
        }
        if (error)
                goto out_free;

        error = xfs_swapext_estimate_overhead(req, res);
out_free:
        kmem_free(sxi);
        return error;
}

/*
 * Swap a range of extents from one inode to another.  If the atomic swap
 * feature is enabled, then the operation can be resumed even if the system
 * goes down.
 *
 * The caller must ensure that both inodes are joined to the transaction and
 * ILOCKed; they will still be joined to the transaction at exit.
 */
int
xfs_swapext(
        struct xfs_trans                **tpp,
        const struct xfs_swapext_req    *req)
{
        struct xfs_swapext_intent       *sxi;
        unsigned int                    reflink_state;
        int                             error;

        ASSERT(xfs_isilocked(req->ip1, XFS_ILOCK_EXCL));
        ASSERT(xfs_isilocked(req->ip2, XFS_ILOCK_EXCL));
        ASSERT(req->whichfork != XFS_COW_FORK);
        ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));
        if (req->req_flags & XFS_SWAP_REQ_SET_SIZES)
                ASSERT(req->whichfork == XFS_DATA_FORK);
        if (req->req_flags & XFS_SWAP_REQ_FILE2_CVT_SF)
                ASSERT(req->whichfork == XFS_ATTR_FORK ||
                       (req->whichfork == XFS_DATA_FORK &&
                        S_ISDIR(VFS_I(req->ip2)->i_mode)));

        if (req->blockcount == 0)
                return 0;

        reflink_state = xfs_swapext_reflink_prep(req);

        sxi = xfs_swapext_init_intent(req);
        xfs_swapext_schedule(*tpp, sxi);

        error = xfs_defer_finish(tpp);
        if (error)
                return error;

        xfs_swapext_reflink_finish(*tpp, req, reflink_state);
        return 0;
}