// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong
 *
 * The xfs_swap_extent_* functions are:
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * Copyright (c) 2012 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_quota.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include "xfs_trace.h"
#include "xfs_swapext.h"
#include "xfs_swaprange.h"

/* Lock (and optionally join) two inodes for an extent swap operation. */
void
xfs_swap_range_ilock(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	if (ip1 != ip2)
		xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
				    ip2, XFS_ILOCK_EXCL);
	else
		xfs_ilock(ip1, XFS_ILOCK_EXCL);
	if (tp) {
		xfs_trans_ijoin(tp, ip1, 0);
		if (ip2 != ip1)
			xfs_trans_ijoin(tp, ip2, 0);
	}
}

/* Unlock two inodes after an extent swap operation. */
void
xfs_swap_range_iunlock(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	if (ip2 != ip1)
		xfs_iunlock(ip2, XFS_ILOCK_EXCL);
	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
}

/*
 * Estimate the resource requirements to swap ranges between the two files.
 * The caller is required to hold the IOLOCK and the MMAPLOCK and to have
 * flushed both inodes' pagecache and active direct I/Os.
 */
int
xfs_swap_range_estimate(
	const struct xfs_swapext_req	*req,
	struct xfs_swapext_res		*res)
{
	int				error;

	xfs_swap_range_ilock(NULL, req->ip1, req->ip2);
	error = xfs_swapext_estimate(req, res);
	xfs_swap_range_iunlock(req->ip1, req->ip2);
	return error;
}
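/*
 * Illustrative sketch (not called anywhere in this file): a caller that
 * wants to size a transaction for a swap fills out the request, asks for an
 * estimate, and feeds the result to xfs_trans_alloc(), roughly the way
 * xfs_swap_range() below does it:
 *
 *	struct xfs_swapext_req	req = {
 *		.ip1		= ip1,
 *		.ip2		= ip2,
 *		.whichfork	= XFS_DATA_FORK,
 *	};
 *	struct xfs_swapext_res	res;
 *
 *	error = xfs_swap_range_estimate(&req, &res);
 *	if (error)
 *		return error;
 *	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, res.resblks, 0,
 *			XFS_TRANS_RES_FDBLKS, &tp);
 */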
/*
 * We need to check that the format of the data fork in the temporary inode
 * is valid for the target inode before doing the swap. This is not a problem
 * with attr1 because of the fixed fork offset, but attr2 has a dynamically
 * sized data fork depending on the space the attribute fork is taking, so we
 * can get invalid formats on the target inode.
 *
 * E.g. the target has space for 7 extents in extent format, but the temp
 * inode only has space for 6. If we defragment down to 7 extents, then the
 * tmp format is a btree, but when swapped it needs to be in extent format.
 * Hence we can't just blindly swap data forks on attr2 filesystems.
 *
 * Note that we check the swap in both directions so that we don't end up
 * with a corrupt temporary inode, either.
 *
 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
 * inode will prevent this situation from occurring, so all we do here is
 * reject and log the attempt. Basically we are putting the responsibility on
 * userspace to get this right.
 */
STATIC int
xfs_swap_extents_check_format(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip)	/* tmp inode */
{
	struct xfs_ifork	*ifp = &ip->i_df;
	struct xfs_ifork	*tifp = &tip->i_df;

	/* User/group/project quota ids must match if quotas are enforced. */
	if (XFS_IS_QUOTA_ON(ip->i_mount) &&
	    (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
	     !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
	     ip->i_d.di_projid != tip->i_d.di_projid))
		return -EINVAL;

	/* Should never get a local format */
	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
	    tifp->if_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	/*
	 * If the target inode has fewer extents than the temporary inode,
	 * why did userspace call us?
	 */
	if (ifp->if_nextents < tifp->if_nextents)
		return -EINVAL;

	/*
	 * If we have to use the (expensive) rmap swap method, we can
	 * handle any number of extents and any format.
	 */
	if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb))
		return 0;

	/*
	 * If the target inode is in extent form and the temp inode is in
	 * btree form then we will end up with the target inode in the wrong
	 * format, as we already know there are fewer extents in the temp
	 * inode.
	 */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_format == XFS_DINODE_FMT_BTREE)
		return -EINVAL;

	/* Check temp in extent form to max in target */
	if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
		return -EINVAL;

	/* Check target in extent form to max in temp */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
		return -EINVAL;

	/*
	 * If we are in a btree format, check that the temp root block will
	 * fit in the target and that it has enough extents to be in btree
	 * format in the target.
	 *
	 * Note that we have to be careful to allow btree->extent conversions
	 * (a common defrag case) which will occur when the temp inode is in
	 * extent format...
	 */
	if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
		if (XFS_IFORK_Q(ip) &&
		    xfs_bmap_bmdr_space(tifp->if_broot) > XFS_IFORK_BOFF(ip))
			return -EINVAL;
		if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
			return -EINVAL;
	}

	/* Reciprocal target->temp btree format checks */
	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
		if (XFS_IFORK_Q(tip) &&
		    xfs_bmap_bmdr_space(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
			return -EINVAL;
		if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
			return -EINVAL;
	}

	return 0;
}

/*
 * Fix up the owners of the bmbt blocks to refer to the current inode. The
 * change owner scan attempts to order all modified buffers in the current
 * transaction. In the event of ordered buffer failure, the offending buffer
 * is physically logged as a fallback and the scan returns -EAGAIN. We must
 * roll the transaction in this case to replenish the fallback log
 * reservation and restart the scan. This process repeats until the scan
 * completes.
 */
STATIC int
xfs_swap_change_owner(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	struct xfs_inode	*tmpip)
{
	int			error;
	struct xfs_trans	*tp = *tpp;

	do {
		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
					      NULL);
		/* success or fatal error */
		if (error != -EAGAIN)
			break;

		error = xfs_trans_roll(tpp);
		if (error)
			break;
		tp = *tpp;

		/*
		 * Redirty both inodes so they can relog and keep the log tail
		 * moving forward.
		 */
		xfs_trans_ijoin(tp, ip, 0);
		xfs_trans_ijoin(tp, tmpip, 0);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
	} while (true);

	return error;
}
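/*
 * Callers are expected to set XFS_ILOG_DOWNER in the relevant inode's log
 * flags before swapping the forks and to run the owner-change scan as the
 * very last step, e.g. (sketch of the usage in xfs_swap_extent_forks()
 * below):
 *
 *	if (src_log_flags & XFS_ILOG_DOWNER) {
 *		error = xfs_swap_change_owner(tpp, ip, tip);
 *		if (error)
 *			return error;
 *	}
 */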
/* Swap the extents of two files by swapping data forks. */
STATIC int
xfs_swap_extent_forks(
	struct xfs_trans	**tpp,
	struct xfs_swapext_req	*req)
{
	struct xfs_inode	*ip = req->ip1;
	struct xfs_inode	*tip = req->ip2;
	xfs_filblks_t		aforkblks = 0;
	xfs_filblks_t		taforkblks = 0;
	xfs_extnum_t		junk;
	uint64_t		tmp;
	unsigned int		reflink_state;
	int			src_log_flags = XFS_ILOG_CORE;
	int			target_log_flags = XFS_ILOG_CORE;
	int			error;

	reflink_state = xfs_swapext_reflink_prep(req);

	/*
	 * Count the number of extended attribute blocks.
	 */
	if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 &&
	    ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(*tpp, ip, XFS_ATTR_FORK, &junk,
				&aforkblks);
		if (error)
			return error;
	}
	if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 &&
	    tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(*tpp, tip, XFS_ATTR_FORK, &junk,
				&taforkblks);
		if (error)
			return error;
	}

	/*
	 * Btree format (v3) inodes have the inode number stamped in the bmbt
	 * block headers. We can't start changing the bmbt blocks until the
	 * inode owner change is logged so recovery does the right thing in
	 * the event of a crash. Set the owner change log flags now and leave
	 * the bmbt scan as the last step.
	 */
	if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
		if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			target_log_flags |= XFS_ILOG_DOWNER;
		if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			src_log_flags |= XFS_ILOG_DOWNER;
	}

	/*
	 * Swap the data forks of the inodes.
	 */
	swap(ip->i_df, tip->i_df);

	/*
	 * Fix the on-disk inode values.
	 */
	tmp = (uint64_t)ip->i_d.di_nblocks;
	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;

	/*
	 * The extents in the source inode could still contain speculative
	 * preallocation beyond EOF (e.g. the file is open but not modified
	 * while defrag is in progress). In that case, we need to copy over
	 * the number of delalloc blocks the data fork in the source inode is
	 * tracking beyond EOF so that when the fork is truncated away when
	 * the temporary inode is unlinked we don't underrun the
	 * i_delayed_blks counter on that inode.
	 */
	ASSERT(tip->i_delayed_blks == 0);
	tip->i_delayed_blks = ip->i_delayed_blks;
	ip->i_delayed_blks = 0;

	switch (ip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		src_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
		       (src_log_flags & XFS_ILOG_DOWNER));
		src_log_flags |= XFS_ILOG_DBROOT;
		break;
	}

	switch (tip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		target_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		target_log_flags |= XFS_ILOG_DBROOT;
		ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
		       (target_log_flags & XFS_ILOG_DOWNER));
		break;
	}

	xfs_swapext_reflink_finish(*tpp, req, reflink_state);

	xfs_trans_log_inode(*tpp, ip, src_log_flags);
	xfs_trans_log_inode(*tpp, tip, target_log_flags);

	/*
	 * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
	 * have inode number owner values in the bmbt blocks that still refer
	 * to the old inode. Scan each bmbt to fix up the owner values with
	 * the inode number of the current inode.
	 */
	if (src_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(tpp, ip, tip);
		if (error)
			return error;
	}
	if (target_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(tpp, tip, ip);
		if (error)
			return error;
	}

	return 0;
}
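/*
 * Worked example for the di_nblocks fixup above (hypothetical numbers):
 * suppose ip maps 100 blocks total, 4 of them in its attr fork
 * (aforkblks = 4), and tip maps 60 blocks, 2 in its attr fork
 * (taforkblks = 2). Only the data forks trade places, so afterwards ip
 * should own its own 4 attr blocks plus tip's 58 data blocks
 * (60 - 2 + 4 = 62), and tip should own its own 2 attr blocks plus ip's
 * 96 data blocks (100 + 2 - 4 = 98).
 */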
/* Prepare two files to have their data swapped. */
int
xfs_swap_range_prep(
	struct file		*file1,
	struct file		*file2,
	struct file_swap_range	*fsr)
{
	struct xfs_inode	*ip1 = XFS_I(file_inode(file1));
	struct xfs_inode	*ip2 = XFS_I(file_inode(file2));
	int			ret;

	/* Verify both files are either real-time or non-realtime */
	if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
		return -EINVAL;

	ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(ip2)));
	ret = generic_swap_file_range_prep(file1, file2, fsr,
			xfs_inode_alloc_unitsize(ip2));
	if (ret)
		return ret;

	/* Attach dquots to both inodes before changing block maps. */
	ret = xfs_qm_dqattach(ip2);
	if (ret)
		return ret;
	ret = xfs_qm_dqattach(ip1);
	if (ret)
		return ret;

	/* Flush the relevant ranges of both files. */
	ret = xfs_flush_unmap_range(ip2, fsr->file2_offset, fsr->length);
	if (ret)
		return ret;
	return xfs_flush_unmap_range(ip1, fsr->file1_offset, fsr->length);
}

/* Make a particular type of quota reservation. */
STATIC int
xfs_swap_range_reserve_quota_blocks(
	struct xfs_trans		*tp,
	const struct xfs_swapext_req	*req,
	xfs_filblks_t			ip1_mapped,
	xfs_filblks_t			ip2_mapped,
	unsigned int			qmopts)
{
	int				error;

	/*
	 * For each file, compute the net gain in the number of blocks that
	 * will be mapped into that file and reserve that much quota. The
	 * quota counts must be able to absorb at least that much space.
	 */
	if (ip2_mapped > ip1_mapped) {
		error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
				ip2_mapped - ip1_mapped, 0, qmopts);
		if (error)
			return error;
	}
	if (ip1_mapped > ip2_mapped) {
		error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
				ip1_mapped - ip2_mapped, 0, qmopts);
		if (error)
			return error;
	}

	/*
	 * For each file, forcibly reserve the gross gain in mapped blocks so
	 * that we don't trip over any quota block reservation assertions.
	 * We must reserve the gross gain because the quota code subtracts
	 * from bcount the number of blocks that we unmap; it does not add
	 * that quantity back to the quota block reservation. Between the net
	 * and the forced reservations, each file ends up holding enough
	 * reservation to cover every block that will be mapped into it.
	 */
	error = xfs_trans_reserve_quota_nblks(tp, req->ip1, ip1_mapped, 0,
			XFS_QMOPT_FORCE_RES | qmopts);
	if (error)
		return error;
	return xfs_trans_reserve_quota_nblks(tp, req->ip2, ip2_mapped, 0,
			XFS_QMOPT_FORCE_RES | qmopts);
}

/*
 * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
 * this if quota enforcement is disabled or if both inodes' dquots are the
 * same.
 */
STATIC int
xfs_swap_range_reserve_quota(
	struct xfs_trans		*tp,
	const struct xfs_swapext_req	*req,
	const struct xfs_swapext_res	*res)
{
	int				error;

	/*
	 * Don't bother with a quota reservation if we're not enforcing them
	 * or the two inodes have the same dquots.
	 */
	if (!XFS_IS_QUOTA_ON(tp->t_mountp) ||
	    (req->ip1->i_udquot == req->ip2->i_udquot &&
	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
	     req->ip1->i_pdquot == req->ip2->i_pdquot))
		return 0;

	error = xfs_swap_range_reserve_quota_blocks(tp, req, res->ip1_bcount,
			res->ip2_bcount, XFS_QMOPT_RES_REGBLKS);
	if (error)
		return error;
	return xfs_swap_range_reserve_quota_blocks(tp, req, res->ip1_rtbcount,
			res->ip2_rtbcount, XFS_QMOPT_RES_RTBLKS);
}

/* Decide if we can use xfs_swapext(). */
static inline bool
xfs_swapext_use_deferred(
	struct xfs_mount		*mp,
	const struct file_swap_range	*fsr)
{
	/* Fully recoverable extent swapping? Yes! */
	if (xfs_sb_version_hasatomicswap(&mp->m_sb))
		return true;

	/* Non-atomic swap requested and we have the deferred bmap log items */
	if ((fsr->flags & FILE_SWAP_RANGE_NONATOMIC) &&
	    (xfs_sb_version_hasreflink(&mp->m_sb) ||
	     xfs_sb_version_hasrmapbt(&mp->m_sb)))
		return true;

	return false;
}
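/*
 * Summary of how xfs_swap_range() below dispatches a request, based on this
 * helper and xfs_swapext_use_fork_exchange():
 *
 *  - atomicswap feature enabled:		xfs_swapext() (log assisted)
 *  - NONATOMIC plus reflink or rmapbt:		xfs_swapext() (deferred bmap)
 *  - NONATOMIC full-file swap, equal sizes:	xfs_swap_extent_forks()
 *  - anything else:				-EOPNOTSUPP
 */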
/* Decide if we can use the old data fork exchange code. */
static inline bool
xfs_swapext_use_fork_exchange(
	const struct file_swap_range	*fsr,
	struct xfs_inode		*ip1,
	struct xfs_inode		*ip2)
{
	return (fsr->flags & FILE_SWAP_RANGE_NONATOMIC) &&
	       (fsr->flags & FILE_SWAP_RANGE_FULL_FILES) &&
	       !(fsr->flags & FILE_SWAP_RANGE_TO_EOF) &&
	       fsr->file1_offset == 0 && fsr->file2_offset == 0 &&
	       fsr->length == ip1->i_d.di_size &&
	       fsr->length == ip2->i_d.di_size;
}

/* Swap parts of two files. */
int
xfs_swap_range(
	struct xfs_inode		*ip1,
	struct xfs_inode		*ip2,
	const struct file_swap_range	*fsr,
	unsigned int			private_flags)
{
	struct xfs_swapext_req		req = {
		.ip1			= ip1,
		.ip2			= ip2,
		.whichfork		= XFS_DATA_FORK,
	};
	struct xfs_swapext_res		res;
	struct xfs_mount		*mp = ip1->i_mount;
	struct xfs_trans		*tp;
	loff_t				req_len;
	int				error;

	if (fsr->flags & FILE_SWAP_RANGE_TO_EOF)
		req.flags |= XFS_SWAPEXT_SET_SIZES;
	req.startoff1 = XFS_B_TO_FSBT(mp, fsr->file1_offset);
	req.startoff2 = XFS_B_TO_FSBT(mp, fsr->file2_offset);

	/*
	 * Round the request length up to the nearest fundamental unit of
	 * allocation. The prep function already checked that the request
	 * offsets and length in @fsr are safe to round up.
	 */
	req_len = round_up(fsr->length, xfs_inode_alloc_unitsize(ip2));
	req.blockcount = XFS_B_TO_FSB(mp, req_len);

	/*
	 * Cancel CoW fork preallocations for the ranges of both files. The
	 * prep function should have flushed all the dirty data, so the only
	 * extents remaining should be speculative.
	 */
	if (xfs_inode_has_cow_data(ip1)) {
		error = xfs_reflink_cancel_cow_range(ip1, fsr->file1_offset,
				fsr->length, true);
		if (error)
			return error;
	}
	if (xfs_inode_has_cow_data(ip2)) {
		error = xfs_reflink_cancel_cow_range(ip2, fsr->file2_offset,
				fsr->length, true);
		if (error)
			return error;
	}

	error = xfs_swap_range_estimate(&req, &res);
	if (error)
		return error;

	/* Allocate the transaction, lock the inodes, and join them. */
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, res.resblks, 0,
			XFS_TRANS_RES_FDBLKS, &tp);
	if (error)
		return error;

	xfs_swap_range_ilock(tp, ip1, ip2);

	trace_xfs_swap_extent_before(ip2, 0);
	trace_xfs_swap_extent_before(ip1, 1);

	/*
	 * Do all of the input checking that we can only do once we've taken
	 * both ILOCKs.
	 */
	error = generic_swap_file_range_check_fresh(VFS_I(ip1), VFS_I(ip2),
			fsr);
	if (error)
		goto out_trans_cancel;

	error = xfs_swapext_check_extents(mp, &req);
	if (error)
		goto out_trans_cancel;

	/*
	 * Reserve ourselves some quota if any of the dquots are in enforcing
	 * mode. In theory we only need enough to satisfy the change in the
	 * number of blocks between the two ranges being remapped.
	 */
	error = xfs_swap_range_reserve_quota(tp, &req, &res);
	if (error)
		goto out_trans_cancel;

	/* Perform the file range swap... */
	if (xfs_swapext_use_deferred(mp, fsr)) {
		/* If we got this far on a dry run, all parameters are ok. */
		if (fsr->flags & FILE_SWAP_RANGE_DRY_RUN)
			goto out_trans_cancel;

		/* Update the mtime and ctime of both files. */
		if (private_flags & XFS_SWAP_RANGE_UPD_CMTIME1)
			xfs_trans_ichgtime(tp, ip1,
					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
		if (private_flags & XFS_SWAP_RANGE_UPD_CMTIME2)
			xfs_trans_ichgtime(tp, ip2,
					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

		/* ...by using the deferred op swapext, since it's available. */
		error = xfs_swapext(&tp, &req);
	} else if (xfs_swapext_use_fork_exchange(fsr, ip1, ip2)) {
		/*
		 * ...by using the old bmap fork exchange code, if we're a
		 * defrag tool doing a full file swap.
		 */
		error = xfs_swap_extents_check_format(ip2, ip1);
		if (error) {
			xfs_notice(mp,
	"%s: inode 0x%llx format is incompatible for exchanging.",
					__func__, ip2->i_ino);
			goto out_trans_cancel;
		}

		/* If we got this far on a dry run, all parameters are ok. */
		if (fsr->flags & FILE_SWAP_RANGE_DRY_RUN)
			goto out_trans_cancel;

		/* Update the mtime and ctime of both files. */
		if (private_flags & XFS_SWAP_RANGE_UPD_CMTIME1)
			xfs_trans_ichgtime(tp, ip1,
					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
		if (private_flags & XFS_SWAP_RANGE_UPD_CMTIME2)
			xfs_trans_ichgtime(tp, ip2,
					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

		error = xfs_swap_extent_forks(&tp, &req);
	} else {
		/* ...or not at all, because we cannot do it. */
		error = -EOPNOTSUPP;
	}
	if (error)
		goto out_trans_cancel;

	/*
	 * If the caller wanted us to swap two complete files of unequal
	 * length, swap the incore sizes now. This should be safe because we
	 * flushed both files' page caches and moved all the post-eof
	 * extents, so there should not be anything to zero.
	 */
	if (fsr->flags & FILE_SWAP_RANGE_TO_EOF) {
		loff_t	temp;

		temp = i_size_read(VFS_I(ip2));
		i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
		i_size_write(VFS_I(ip1), temp);
	}

	/* Relog the inodes to keep transactions moving forward. */
	xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
	xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);

	/*
	 * Force the log to persist metadata updates if the caller or the
	 * administrator requires this. The VFS prep function already flushed
	 * the relevant parts of the page cache.
	 */
	if ((mp->m_flags & XFS_MOUNT_WSYNC) ||
	    (fsr->flags & FILE_SWAP_RANGE_FSYNC))
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);

	trace_xfs_swap_extent_after(ip2, 0);
	trace_xfs_swap_extent_after(ip1, 1);

out_unlock:
	xfs_swap_range_iunlock(ip1, ip2);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
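/*
 * Overall call flow (sketch): a VFS-level caller is expected to take the
 * IOLOCK and MMAPLOCK of both inodes, call xfs_swap_range_prep() to
 * validate and flush both files, and then call xfs_swap_range() to do the
 * work:
 *
 *	error = xfs_swap_range_prep(file1, file2, fsr);
 *	if (error)
 *		return error;
 *	error = xfs_swap_range(XFS_I(file_inode(file1)),
 *			XFS_I(file_inode(file2)), fsr, priv_flags);
 *
 * The locking and flushing responsibilities are inferred from the comments
 * above; the actual caller lives elsewhere in this patch series.
 */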