// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2022 Oracle. All Rights Reserved. * Author: Darrick J. Wong */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_quota.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" #include "xfs_dir2.h" #include "xfs_xchgrange.h" #include "xfs_swapext.h" #include "xfs_defer.h" #include "xfs_symlink_remote.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" #include "scrub/trace.h" #include "scrub/tempfile.h" #include "scrub/tempswap.h" #include "scrub/xfile.h" /* * Create a temporary file for reconstructing metadata, with the intention of * atomically swapping the temporary file's contents with the file that's * being repaired. */ int xrep_tempfile_create( struct xfs_scrub *sc, uint16_t mode) { struct xfs_mount *mp = sc->mp; struct xfs_trans *tp = NULL; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; struct xfs_dquot *pdqp = NULL; struct xfs_trans_res *tres; struct xfs_inode *dp = mp->m_rootip; xfs_ino_t ino; unsigned int resblks; bool is_dir = S_ISDIR(mode); int error; if (xfs_is_shutdown(mp)) return -EIO; if (xfs_is_readonly(mp)) return -EROFS; ASSERT(sc->tp == NULL); ASSERT(sc->tempip == NULL); /* * Make sure that we have allocated dquot(s) on disk. The temporary * inode should be completely root owned so that we don't fail due to * quota limits. */ error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp); if (error) return error; if (is_dir) { resblks = XFS_MKDIR_SPACE_RES(mp, 0); tres = &M_RES(mp)->tr_mkdir; } else { resblks = XFS_IALLOC_SPACE_RES(mp); tres = &M_RES(mp)->tr_create_tmpfile; } error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, &tp); if (error) goto out_release_dquots; /* Allocate inode, set up directory. */ error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); if (error) goto out_trans_cancel; error = xfs_init_new_inode(&init_user_ns, tp, dp, ino, mode, 0, 0, 0, false, &sc->tempip); if (error) goto out_trans_cancel; /* Change the ownership of the inode to root. */ VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID; VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID; sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE); /* * Mark our temporary file as private so that LSMs and the ACL code * don't try to add their own metadata or reason about these files. * The file should never be exposed to userspace. */ VFS_I(sc->tempip)->i_flags |= S_PRIVATE; VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR; if (is_dir) { error = xfs_dir_init(tp, sc->tempip, dp); if (error) goto out_trans_cancel; } else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) { error = xfs_symlink_write_target(tp, sc->tempip, ".", 1, 0, 0); if (error) goto out_trans_cancel; } /* * Attach the dquot(s) to the inodes and modify them incore. * These ids of the inode couldn't have changed since the new * inode has been locked ever since it was created. */ xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp); /* * Put our temp file on the unlinked list so it's purged automatically. * Anything being reconstructed using this file must be atomically * swapped with the original file because the contents here will be * purged when the inode is dropped or log recovery cleans out the * unlinked list. */ error = xfs_iunlink(tp, sc->tempip); if (error) goto out_trans_cancel; error = xfs_trans_commit(tp); if (error) goto out_release_inode; trace_xrep_tempfile_create(sc); xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); /* Finish setting up the incore / vfs context. */ xfs_setup_iops(sc->tempip); xfs_finish_inode_setup(sc->tempip); sc->temp_ilock_flags = 0; return error; out_trans_cancel: xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the * setup of the inode and release the inode. This prevents recursive * transactions and deadlocks from xfs_inactive. */ if (sc->tempip) { xfs_finish_inode_setup(sc->tempip); xchk_irele(sc, sc->tempip); } out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); return error; } /* Take IOLOCK_EXCL on the temporary file, maybe. */ bool xrep_tempfile_iolock_nowait( struct xfs_scrub *sc) { if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) { sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; return true; } return false; } /* Release IOLOCK_EXCL on the temporary file. */ void xrep_tempfile_iounlock( struct xfs_scrub *sc) { xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL); sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL; } /* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */ void xrep_tempfile_ilock( struct xfs_scrub *sc) { sc->temp_ilock_flags |= XFS_ILOCK_EXCL; xfs_ilock(sc->tempip, XFS_ILOCK_EXCL); } /* Try to grab ILOCK_EXCL on the temporary file. */ bool xrep_tempfile_ilock_nowait( struct xfs_scrub *sc) { if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) { sc->temp_ilock_flags |= XFS_ILOCK_EXCL; return true; } return false; } /* Unlock ILOCK_EXCL on the temporary file after an update. */ void xrep_tempfile_iunlock( struct xfs_scrub *sc) { xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL; } /* * Begin the process of making changes to both the file being scrubbed and * the temporary file by taking ILOCK_EXCL on both. */ void xrep_tempfile_ilock_both( struct xfs_scrub *sc) { xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL); sc->ilock_flags |= XFS_ILOCK_EXCL; sc->temp_ilock_flags |= XFS_ILOCK_EXCL; } /* Release the temporary file. */ void xrep_tempfile_rele( struct xfs_scrub *sc) { if (!sc->tempip) return; if (sc->temp_ilock_flags) { xfs_iunlock(sc->tempip, sc->temp_ilock_flags); sc->temp_ilock_flags = 0; } xchk_irele(sc, sc->tempip); sc->tempip = NULL; } /* * Make sure that the given range of the data fork of the temporary file is * mapped to written blocks. The caller must ensure that both inodes are * joined to the transaction. */ int xrep_tempfile_prealloc( struct xfs_scrub *sc, xfs_fileoff_t off, xfs_filblks_t len) { struct xfs_bmbt_irec map; xfs_fileoff_t end = off + len; int error; ASSERT(sc->tempip != NULL); ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip)); for (; off < end; off = map.br_startoff + map.br_blockcount) { int nmaps = 1; /* * If we have a real extent mapping this block then we're * in ok shape. */ error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps, XFS_DATA_FORK); if (error) return error; if (nmaps == 0) { ASSERT(nmaps != 0); return -EFSCORRUPTED; } if (xfs_bmap_is_written_extent(&map)) continue; /* * If we find a delalloc reservation then something is very * very wrong. Bail out. */ if (map.br_startblock == DELAYSTARTBLOCK) return -EFSCORRUPTED; /* * Make sure this block has a real zeroed extent allocated to * it. */ nmaps = 1; error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map, &nmaps); if (error) return error; trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map); /* Commit new extent and all deferred work. */ error = xfs_defer_finish(&sc->tp); if (error) return error; } return 0; } /* * Write data to each block of a file. The given range of the tempfile's data * fork must already be populated with written extents. */ int xrep_tempfile_copyin( struct xfs_scrub *sc, xfs_fileoff_t off, xfs_filblks_t len, xrep_tempfile_copyin_fn prep_fn, void *data) { LIST_HEAD(buffers_list); struct xfs_mount *mp = sc->mp; struct xfs_buf *bp; xfs_fileoff_t flush_mask; xfs_fileoff_t end = off + len; loff_t pos = XFS_FSB_TO_B(mp, off); int error = 0; ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode)); /* Flush buffers to disk every 512K */ flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1; for (; off < end; off++, pos += mp->m_sb.sb_blocksize) { struct xfs_bmbt_irec map; int nmaps = 1; /* Read block mapping for this file block. */ error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0); if (error) goto out_err; if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) { error = -EFSCORRUPTED; goto out_err; } /* Get the metadata buffer for this offset in the file. */ error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), mp->m_bsize, 0, &bp); if (error) goto out_err; trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map); /* Read in a block's worth of data from the xfile. */ error = prep_fn(sc, bp, data); if (error) { xfs_trans_brelse(sc->tp, bp); goto out_err; } /* Queue buffer, and flush if we have too much dirty data. */ xfs_buf_delwri_queue_here(bp, &buffers_list); xfs_trans_brelse(sc->tp, bp); if (!(off & flush_mask)) { error = xfs_buf_delwri_submit(&buffers_list); if (error) goto out_err; } } /* * Write the new blocks to disk. If the ordered list isn't empty after * that, then something went wrong and we have to fail. This should * never happen, but we'll check anyway. */ error = xfs_buf_delwri_submit(&buffers_list); if (error) goto out_err; if (!list_empty(&buffers_list)) { ASSERT(list_empty(&buffers_list)); error = -EIO; goto out_err; } return 0; out_err: xfs_buf_delwri_cancel(&buffers_list); return error; } /* * Set the temporary file's size. Caller must join the tempfile to the scrub * transaction and is responsible for adjusting block mappings as needed. */ int xrep_tempfile_set_isize( struct xfs_scrub *sc, unsigned long long isize) { if (sc->tempip->i_disk_size == isize) return 0; sc->tempip->i_disk_size = isize; i_size_write(VFS_I(sc->tempip), isize); return xrep_tempfile_roll_trans(sc); } /* * Roll a repair transaction involving the temporary file. Caller must join * both the temporary file and the file being scrubbed to the transaction. * This function return with both inodes joined to a new scrub transaction, * or the usual negative errno. */ int xrep_tempfile_roll_trans( struct xfs_scrub *sc) { int error; xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE); error = xrep_roll_trans(sc); if (error) return error; xfs_trans_ijoin(sc->tp, sc->tempip, 0); return 0; } /* Enable atomic extent swapping. */ int xrep_tempswap_grab_log_assist( struct xfs_scrub *sc) { bool need_rele = false; int error; ASSERT(!(sc->flags & XREP_ATOMIC_EXCHANGE)); error = xfs_xchg_range_grab_log_assist(sc->mp, true, &need_rele); if (error) return error; if (!need_rele) { ASSERT(need_rele); return -EOPNOTSUPP; } sc->flags |= XREP_ATOMIC_EXCHANGE; return 0; } /* * Fill out the swapext request in preparation for swapping the contents of a * metadata file that we've rebuilt in the temp file. */ STATIC int xrep_tempswap_prep_request( struct xfs_scrub *sc, int whichfork, struct xrep_tempswap *tx) { struct xfs_swapext_req *req = &tx->req; memset(tx, 0, sizeof(struct xrep_tempswap)); /* COW forks don't exist on disk. */ if (whichfork == XFS_COW_FORK) { ASSERT(0); return -EINVAL; } /* Both files should have the relevant forks. */ if (!xfs_ifork_ptr(sc->ip, whichfork) || !xfs_ifork_ptr(sc->tempip, whichfork)) { ASSERT(0); return -EINVAL; } /* Swap all mappings in both forks. */ req->ip1 = sc->tempip; req->ip2 = sc->ip; req->startoff1 = 0; req->startoff2 = 0; req->whichfork = whichfork; req->blockcount = XFS_MAX_FILEOFF; req->req_flags = XFS_SWAP_REQ_LOGGED; /* Always swap sizes when we're swapping data fork mappings. */ if (whichfork == XFS_DATA_FORK) req->req_flags |= XFS_SWAP_REQ_SET_SIZES; /* * If we're repairing symlinks, xattrs, or directories, always try to * convert ip2 to short format after swapping. */ if (whichfork == XFS_ATTR_FORK || S_ISDIR(VFS_I(sc->ip)->i_mode) || S_ISLNK(VFS_I(sc->ip)->i_mode)) req->req_flags |= XFS_SWAP_REQ_CVT_INO2_SF; return 0; } /* * Fill out the swapext resource estimation structures in preparation for * swapping the contents of a metadata file that we've rebuilt in the temp * file. Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files. */ STATIC int xrep_tempswap_estimate( struct xfs_scrub *sc, struct xrep_tempswap *tx) { struct xfs_swapext_req *req = &tx->req; struct xfs_ifork *ifp; struct xfs_ifork *tifp; int state = 0; /* * Deal with either fork being in local format. The swapext code only * knows how to exchange block mappings for regular files, so we only * have to know about local format for xattrs and directories. */ ifp = xfs_ifork_ptr(sc->ip, req->whichfork); if (ifp->if_format == XFS_DINODE_FMT_LOCAL) state |= 1; tifp = xfs_ifork_ptr(sc->tempip, req->whichfork); if (tifp->if_format == XFS_DINODE_FMT_LOCAL) state |= 2; switch (state) { case 0: /* Both files have mapped extents; use the regular estimate. */ return xfs_xchg_range_estimate(req); case 1: /* * The file being repaired is in local format, but the temp * file has mapped extents. To perform the swap, the file * being repaired will be reinitialized to have an empty extent * map, so the number of exchanges is the temporary file's * extent count. */ req->ip1_bcount = sc->tempip->i_nblocks; req->nr_exchanges = tifp->if_nextents; break; case 2: /* * The temporary file is in local format, but the file being * repaired has mapped extents. To perform the swap, the temp * file will be converted to have a single block, so the number * of exchanges is (worst case) the extent count of the file * being repaired plus one more. */ req->ip1_bcount = 1; req->ip2_bcount = sc->ip->i_nblocks; req->nr_exchanges = ifp->if_nextents; break; case 3: /* * Both forks are in local format. To perform the swap, the * file being repaired will be reinitialized to have an empty * extent map and the temp file will be converted to have a * single block. Only one exchange is required. Presumably, * the caller could not exchange the two inode fork areas * directly. */ req->ip1_bcount = 1; req->nr_exchanges = 1; break; } return xfs_swapext_estimate_overhead(req); } /* * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip * this if quota enforcement is disabled or if both inodes' dquots are the * same. The qretry structure must be initialized to zeroes before the first * call to this function. */ STATIC int xrep_tempswap_reserve_quota( struct xfs_scrub *sc, const struct xrep_tempswap *tx) { struct xfs_trans *tp = sc->tp; const struct xfs_swapext_req *req = &tx->req; int64_t ddelta, rdelta; int error; /* * Don't bother with a quota reservation if we're not enforcing them * or the two inodes have the same dquots. */ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || (req->ip1->i_udquot == req->ip2->i_udquot && req->ip1->i_gdquot == req->ip2->i_gdquot && req->ip1->i_pdquot == req->ip2->i_pdquot)) return 0; /* * Quota reservation for each file comes from two sources. First, we * need to account for any net gain in mapped blocks during the swap. * Second, we need reservation for the gross gain in mapped blocks so * that we don't trip over any quota block reservation assertions. We * must reserve the gross gain because the quota code subtracts from * bcount the number of blocks that we unmap; it does not add that * quantity back to the quota block reservation. */ ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount); rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount); error = xfs_trans_reserve_quota_nblks(tp, req->ip1, ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount, true); if (error) return error; ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount); rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount); return xfs_trans_reserve_quota_nblks(tp, req->ip2, ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount, true); } /* * Prepare an existing transaction for a swap. The caller must hold * the ILOCK of both the inode being repaired and the temporary file. * Only use this when those ILOCKs cannot be dropped. * * Fill out the swapext request and resource estimation structures in * preparation for swapping the contents of a metadata file that we've rebuilt * in the temp file, then reserve space and quota to the transaction. */ int xrep_tempswap_trans_reserve( struct xfs_scrub *sc, int whichfork, struct xrep_tempswap *tx) { int error; ASSERT(sc->tp != NULL); ASSERT(xfs_isilocked(sc->ip, XFS_ILOCK_EXCL)); ASSERT(xfs_isilocked(sc->tempip, XFS_ILOCK_EXCL)); error = xrep_tempswap_prep_request(sc, whichfork, tx); if (error) return error; error = xfs_swapext_estimate(&tx->req); if (error) return error; error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0); if (error) return error; return xrep_tempswap_reserve_quota(sc, tx); } /* * Allocate a transaction, ILOCK the temporary file and the file being * repaired, and join them to the transaction in preparation to swap fork * contents as part of a repair operation. */ int xrep_tempswap_trans_alloc( struct xfs_scrub *sc, int whichfork, struct xrep_tempswap *tx) { unsigned int flags = 0; int error; ASSERT(sc->tp == NULL); error = xrep_tempswap_prep_request(sc, whichfork, tx); if (error) return error; error = xrep_tempswap_estimate(sc, tx); if (error) return error; if (xfs_has_lazysbcount(sc->mp)) flags |= XFS_TRANS_RES_FDBLKS; error = xrep_tempswap_grab_log_assist(sc); if (error) return error; error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, tx->req.resblks, 0, flags, &sc->tp); if (error) return error; sc->temp_ilock_flags |= XFS_ILOCK_EXCL; sc->ilock_flags |= XFS_ILOCK_EXCL; xfs_xchg_range_ilock(sc->tp, sc->ip, sc->tempip); return xrep_tempswap_reserve_quota(sc, tx); } /* Swap forks between the file being repaired and the temporary file. */ int xrep_tempswap_contents( struct xfs_scrub *sc, struct xrep_tempswap *tx) { int error; ASSERT(sc->flags & XREP_ATOMIC_EXCHANGE); xfs_swapext(sc->tp, &tx->req); error = xfs_defer_finish(&sc->tp); if (error) return error; /* * If we swapped the ondisk sizes of two metadata files, we must swap * the incore sizes as well. Since online fsck doesn't use swapext on * the data forks of user-accessible files, the two sizes are always * the same, so we don't need to log the inodes. */ if (tx->req.req_flags & XFS_SWAP_REQ_SET_SIZES) { loff_t temp; temp = i_size_read(VFS_I(sc->ip)); i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); i_size_write(VFS_I(sc->tempip), temp); } return 0; } /* * Write local format data from one of the temporary file's forks into the same * fork of file being repaired, and swap the file sizes, if appropriate. * Caller must ensure that the file being repaired has enough fork space to * hold all the bytes. */ void xrep_tempfile_copyout_local( struct xfs_scrub *sc, int whichfork) { struct xfs_ifork *temp_ifp; struct xfs_ifork *ifp; unsigned int ilog_flags = XFS_ILOG_CORE; temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork); ifp = xfs_ifork_ptr(sc->ip, whichfork); ASSERT(temp_ifp != NULL); ASSERT(ifp != NULL); ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); switch (whichfork) { case XFS_DATA_FORK: ASSERT(sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)); break; case XFS_ATTR_FORK: ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff); break; default: ASSERT(0); return; } xfs_idestroy_fork(ifp); xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_u1.if_data, temp_ifp->if_bytes); if (whichfork == XFS_DATA_FORK) { i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); sc->ip->i_disk_size = sc->tempip->i_disk_size; } ilog_flags |= xfs_ilog_fdata(whichfork); xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags); } /* Decide if a given XFS inode is a temporary file for a repair. */ bool xrep_is_tempfile( const struct xfs_inode *ip) { const struct inode *inode = &ip->i_vnode; if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR)) return true; return false; }