diff options
Diffstat (limited to 'fs/xfs/scrub/dir_repair.c')
-rw-r--r-- | fs/xfs/scrub/dir_repair.c | 699 |
1 files changed, 699 insertions, 0 deletions
diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c new file mode 100644 index 000000000000..5236a30bbad8 --- /dev/null +++ b/fs/xfs/scrub/dir_repair.c @@ -0,0 +1,699 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/array.h" +#include "scrub/blob.h" + +/* + * Directory Repair + * ================ + * + * We repair directories by reading the directory data blocks (or the inline + * shortform data) looking for entries, truncate the entire directory fork, + * and reinsert all the entries. Unfortunately, there's not yet a secondary + * copy of directory entry data, which means that if we blow up midway through + * there's little we can do. + */ + +/* + * In-memory record of one salvaged directory entry. The name bytes are kept + * in the name blob store and are referenced from here by cookie. + */ +struct xrep_dir_key { + /* Cookie used to fetch the entry name back out of the blob store. */ + xblob_cookie name_cookie; + /* Inode number that the entry points to. */ + xfs_ino_t ino; + /* Hash of the entry name; the records are sorted on this value. */ + unsigned int hash; + /* Length of the entry name, in bytes. */ + uint8_t namelen; + /* File type of the target inode, derived from its mode. */ + uint8_t ftype; +} __packed; + +/* Context for one directory repair. */ +struct xrep_dir { + /* Scrub context that this repair runs under. */ + struct xfs_scrub *sc; + /* Array of struct xrep_dir_key records, one per salvaged entry. */ + struct xfbma *dir_entries; + /* Blob store holding the names of the salvaged entries. */ + struct xblob *dir_names; + /* Parent directory inode number, or NULLFSINO if none was found yet. */ + xfs_ino_t parent_ino; +}; + +/* + * Decide if we want to salvage this entry. We don't bother with entries that + * point back at this directory or at an invalid inode number, nor with empty + * or oversized names or the dot entry. 
+ */ +STATIC int +xrep_dir_want_salvage( + struct xrep_dir *rd, + const char *name, + int namelen, + xfs_ino_t ino) +{ + struct xfs_mount *mp = rd->sc->mp; + + /* No pointers to ourselves or to garbage. */ + if (ino == rd->sc->ip->i_ino) + return false; + if (!xfs_verify_dir_ino(mp, ino)) + return false; + + /* No weird looking names or dot entries. */ + if (namelen > MAXNAMELEN || namelen <= 0) + return false; + if (namelen == 1 && name[0] == '.') + return false; + + return true; +} + +/* Allocate an in-core record to hold entries while we rebuild the dir data. */ +STATIC int +xrep_dir_salvage_entry( + struct xrep_dir *rd, + unsigned char *name, + unsigned int namelen, + xfs_ino_t ino) +{ + struct xrep_dir_key key = { + .ino = ino, + }; + struct xfs_inode *ip; + unsigned int i; + int error = 0; + + if (xchk_should_terminate(rd->sc, &error)) + return error; + + /* Truncate the name to the first illegal character. */ + for (i = 0; i < namelen && name[i] != 0 && name[i] != '/'; i++); + key.namelen = i; + key.hash = xfs_da_hashname(name, key.namelen); + + trace_xrep_dir_salvage_entry(rd->sc->ip, name, key.namelen, ino); + + /* Save the parent pointer. */ + if (key.namelen == 2 && name[0] == '.' && name[1] == '.') { + if (rd->parent_ino != NULLFSINO) + return -EFSCORRUPTED; + rd->parent_ino = ino; + return 0; + } + + /* + * Compute the ftype or dump the entry if we can't. We don't lock the + * inode because inodes can't change type while we have a reference. + */ + error = xfs_iget(rd->sc->mp, rd->sc->tp, ino, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip); + if (error) + return 0; + key.ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + xfs_irele(ip); + + /* Remember this for later. */ + error = xblob_put(rd->dir_names, &key.name_cookie, name, key.namelen); + if (error) + return error; + + return xfbma_append(rd->dir_entries, &key); +} + +/* Record a shortform directory entry for later reinsertion. 
*/ +STATIC int +xrep_dir_salvage_sf_entry( + struct xrep_dir *rd, + struct xfs_dir2_sf_hdr *sfp, + struct xfs_dir2_sf_entry *sfep) +{ + xfs_ino_t ino; + + ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep); + if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino)) + return 0; + + return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino); +} + +/* Record a regular directory entry for later reinsertion. */ +STATIC int +xrep_dir_salvage_data_entry( + struct xrep_dir *rd, + struct xfs_dir2_data_entry *dep) +{ + xfs_ino_t ino; + + ino = be64_to_cpu(dep->inumber); + if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino)) + return 0; + + return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino); +} + +/* Try to recover block/data format directory entries. */ +STATIC int +xrep_dir_recover_data( + struct xrep_dir *rd, + struct xfs_buf *bp) +{ + struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo; + unsigned int offset; + unsigned int end; + int error; /* error return value */ + + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). + */ + offset = geo->data_entry_offset; + end = min_t(unsigned int, BBTOB(bp->b_length), + xfs_dir3_data_end_offset(geo, bp->b_addr)); + + while (offset < end) { + struct xfs_dir2_data_unused *dup = bp->b_addr + offset; + struct xfs_dir2_data_entry *dep = bp->b_addr + offset; + + if (xchk_should_terminate(rd->sc, &error)) + break; + + /* Skip unused entries. */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + offset += be16_to_cpu(dup->length); + continue; + } + + /* Don't walk off the end of the block. */ + offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen); + if (offset > end) + break; + + /* Ok, let's save this entry. */ + error = xrep_dir_salvage_data_entry(rd, dep); + if (error) + return error; + + } + + return 0; +} + +/* Try to recover shortform directory entries. 
*/ +STATIC int +xrep_dir_recover_sf( + struct xrep_dir *rd) +{ + struct xfs_dir2_sf_hdr *sfp; + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_entry *next; + struct xfs_ifork *ifp; + unsigned char *end; + int error; + + ifp = XFS_IFORK_PTR(rd->sc->ip, XFS_DATA_FORK); + sfp = (struct xfs_dir2_sf_hdr *)rd->sc->ip->i_df.if_u1.if_data; + end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes; + + rd->parent_ino = xfs_dir2_sf_get_parent_ino(sfp); + + sfep = xfs_dir2_sf_firstentry(sfp); + while ((unsigned char *)sfep < end) { + if (xchk_should_terminate(rd->sc, &error)) + break; + + next = xfs_dir2_sf_nextentry(rd->sc->mp, sfp, sfep); + if ((unsigned char *)next > end) + break; + + /* Ok, let's save this entry. */ + error = xrep_dir_salvage_sf_entry(rd, sfp, sfep); + if (error) + return error; + + sfep = next; + } + + return 0; +} + +/* + * Try to figure out the format of this directory from the data fork mappings + * and the directory size. If we can be reasonably sure of format, we can be + * more aggressive in salvaging directory entries. On return, @magic_guess + * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format" + * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory, + * and 0 if we can't tell. + */ +STATIC void +xrep_dir_guess_format( + struct xrep_dir *rd, + __be32 *magic_guess) +{ + struct xfs_inode *ip = rd->sc->ip; + struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo; + xfs_fileoff_t last; + int error; + + ASSERT(xfs_sb_version_hascrc(&ip->i_mount->m_sb)); + + *magic_guess = 0; + + /* + * If there's a single directory block and the directory size is + * exactly one block, this has to be a single block format directory. 
+ */ + error = xfs_bmap_last_offset(ip, &last, XFS_DATA_FORK); + if (!error && XFS_FSB_TO_B(ip->i_mount, last) == geo->blksize && + ip->i_d.di_size == geo->blksize) { + *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + return; + } + + /* + * If the last extent before the leaf offset matches the directory + * size and the directory size is larger than 1 block, this is a + * data format directory. + */ + last = geo->leafblk; + error = xfs_bmap_last_before(rd->sc->tp, ip, &last, XFS_DATA_FORK); + if (!error && + XFS_FSB_TO_B(ip->i_mount, last) > geo->blksize && + XFS_FSB_TO_B(ip->i_mount, last) == ip->i_d.di_size) { + *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + return; + } +} + +/* Recover directory entries from a specific directory block. */ +STATIC int +xrep_dir_recover_dirblock( + struct xrep_dir *rd, + __be32 magic_guess, + xfs_dablk_t dabno) +{ + struct xfs_dir2_data_hdr *hdr; + struct xfs_buf *bp; + __be32 oldmagic; + int error; + + /* + * Try to read buffer. We invalidate them in the next step so we don't + * bother to set a buffer type or ops. + */ + error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno, -1, &bp, + XFS_DATA_FORK, NULL); + if (error || !bp) + return error; + + hdr = bp->b_addr; + oldmagic = hdr->magic; + + trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno, + be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess)); + + /* + * If we're sure of the block's format, proceed with the salvage + * operation using the specified magic number. + */ + if (magic_guess) { + hdr->magic = magic_guess; + goto recover; + } + + /* + * If we couldn't guess what type of directory this is, then we will + * only salvage entries from directory blocks that match the magic + * number and pass verifiers. 
+ */ + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops)) + goto out; + break; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops)) + goto out; + break; + default: + goto out; + } + +recover: + error = xrep_dir_recover_data(rd, bp); + +out: + hdr->magic = oldmagic; + xfs_trans_brelse(rd->sc->tp, bp); + return error; +} + +/* Extract as many directory entries as we can. */ +STATIC int +xrep_dir_recover( + struct xrep_dir *rd) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_scrub *sc = rd->sc; + struct xfs_ifork *ifp; + struct xfs_da_geometry *geo = sc->mp->m_dir_geo; + xfs_dablk_t dabno; + __be32 magic_guess; + int error = 0; + + if (rd->sc->ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + return xrep_dir_recover_sf(rd); + + xrep_dir_guess_format(rd, &magic_guess); + + /* Iterate each directory data block in the data fork. */ + ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &got) { + /* Leaf blocks come after all data blocks, so cut off there. */ + xfs_trim_extent(&got, 0, geo->leafblk); + if (got.br_blockcount == 0) + continue; + + for (dabno = round_up(got.br_startoff, geo->fsbcount); + dabno < got.br_startoff + got.br_blockcount; + dabno += geo->fsbcount) { + if (xchk_should_terminate(rd->sc, &error)) + return error; + + error = xrep_dir_recover_dirblock(rd, magic_guess, + dabno); + if (error) + break; + } + } + + return error; +} + +/* Reset a non-local directory. */ +STATIC int +xrep_dir_reset_nonlocal( + struct xfs_scrub *sc, + struct xfs_ifork *ifp) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_buf *bp; + struct xfs_da_geometry *geo = sc->mp->m_dir_geo; + xfs_dablk_t dabno; + int error; + + /* Invalidate each directory block. 
*/ + for_each_xfs_iext(ifp, &icur, &got) { + for (dabno = round_up(got.br_startoff, geo->fsbcount); + dabno < got.br_startoff + got.br_blockcount; + dabno += geo->fsbcount) { + error = xfs_da_get_buf(sc->tp, sc->ip, dabno, &bp, + XFS_DATA_FORK); + if (error || !bp) + continue; + xfs_trans_binval(sc->tp, bp); + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + if (error) + return error; + } + } + + /* Now free all the blocks. */ + return xfs_bunmapi_range(&sc->tp, sc->ip, XFS_DATA_FORK, 0, + XFS_MAX_FILEOFF, XFS_BMAPI_NODISCARD); +} + +/* Free all the directory blocks and delete the fork. */ +STATIC int +xrep_dir_reset_fork( + struct xrep_dir *rd) +{ + struct xfs_ifork *ifp; + struct xfs_da_args *args = rd->sc->buf; + int error; + + xfs_trans_ijoin(rd->sc->tp, rd->sc->ip, 0); + ifp = XFS_IFORK_PTR(rd->sc->ip, XFS_DATA_FORK); + + /* Unmap all the directory buffers. */ + if (xfs_ifork_has_extents(rd->sc->ip, XFS_DATA_FORK)) { + error = xrep_dir_reset_nonlocal(rd->sc, ifp); + if (error) + return error; + } + + /* Reset the data fork to an empty data fork. */ + xfs_ifork_reset(ifp); + ifp->if_flags = XFS_IFINLINE; + ifp->if_bytes = 0; + rd->sc->ip->i_d.di_size = 0; + + /* Reinitialize the short form directory. */ + set_nlink(VFS_I(rd->sc->ip), 2); + args->geo = rd->sc->mp->m_dir_geo; + args->dp = rd->sc->ip; + args->trans = rd->sc->tp; + error = xfs_dir2_sf_create(args, rd->parent_ino); + if (error) + return error; + + return xfs_trans_roll_inode(&rd->sc->tp, rd->sc->ip); +} + +/* Compare two dir keys, sorting in hash order. */ +static int +xrep_dir_key_cmp( + const void *a, + const void *b) +{ + const struct xrep_dir_key *ap = a; + const struct xrep_dir_key *bp = b; + + if (ap->hash > bp->hash) + return 1; + else if (ap->hash < bp->hash) + return -1; + return 0; +} + +/* + * Find all the directory entries for this inode by scraping them out of the + * directory leaf blocks by hand. The caller must clean up the lists if + * anything goes wrong. 
+ */ +STATIC int +xrep_dir_find_entries( + struct xrep_dir *rd) +{ + struct xfs_inode *ip = rd->sc->ip; + struct xfs_ifork *ifp; + int error; + + error = xrep_ino_dqattach(rd->sc); + if (error) + return error; + + /* Extent map should be loaded. */ + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_LOCAL && + !(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(rd->sc->tp, ip, XFS_DATA_FORK); + if (error) + return error; + } + + /* Read every directory entry and record them in memory. */ + return xrep_dir_recover(rd); +} + +/* Insert one dir entry. */ +STATIC int +xrep_dir_insert_rec( + const void *item, + void *priv) +{ + struct xfs_name name; + const struct xrep_dir_key *key = item; + struct xrep_dir *rd = priv; + struct xfs_trans *tp; + char *namebuf = rd->sc->buf; + struct xfs_mount *mp = rd->sc->mp; + uint resblks; + int error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* The entry name is stored in the in-core buffer. 
*/ + name.name = namebuf; + + error = xblob_get(rd->dir_names, key->name_cookie, namebuf, + key->namelen); + if (error) + return error; + + error = xblob_free(rd->dir_names, key->name_cookie); + if (error) + return error; + + trace_xrep_dir_insert_rec(rd->sc->ip, namebuf, key->namelen, key->ino, + key->ftype); + + error = xfs_qm_dqattach(rd->sc->ip); + if (error) + return error; + + resblks = XFS_LINK_SPACE_RES(mp, key->namelen); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp); + if (error == -ENOSPC) { + resblks = 0; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp); + } + if (error) + return error; + + xfs_ilock(rd->sc->ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, rd->sc->ip, XFS_ILOCK_EXCL); + + name.len = key->namelen; + name.type = key->ftype; + error = xfs_dir_createname(tp, rd->sc->ip, &name, key->ino, resblks); + if (error) + goto err; + + if (name.type == XFS_DIR3_FT_DIR) + inc_nlink(VFS_I(rd->sc->ip)); + xfs_trans_log_inode(tp, rd->sc->ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp); + +err: + xfs_trans_cancel(tp); + return error; +} + +/* + * Insert all the attributes that we collected. + * + * Commit the repair transaction and drop the ilock because the attribute + * setting code needs to be able to allocate special transactions and take the + * ilock on its own. Some day we'll have deferred attribute setting, at which + * point we'll be able to use that to replace the attributes atomically and + * safely. + */ +STATIC int +xrep_dir_rebuild_tree( + struct xrep_dir *rd) +{ + int error; + + /* + * Commit the existing transaction and drop the ILOCK so that we can + * use a series of small transactions to rebuild the directory. + */ + error = xfs_trans_commit(rd->sc->tp); + rd->sc->tp = NULL; + if (error) + return error; + + xfs_iunlock(rd->sc->ip, XFS_ILOCK_EXCL); + rd->sc->ilock_flags &= ~XFS_ILOCK_EXCL; + + /* + * Sort the entries hash to minimize dabtree splits when we rebuild the + * directory tree information. 
+ */ + error = xfbma_sort(rd->dir_entries, xrep_dir_key_cmp); + if (error) + return error; + + /* Re-add every entry to the directory. */ + return xfbma_iter_del(rd->dir_entries, xrep_dir_insert_rec, rd); +} + +/* + * Repair the directory metadata. + * + * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer + * cache in XFS can't handle aliased multiblock buffers, so this might + * misbehave if the directory blocks are crosslinked with other filesystem + * metadata. + * + * XXX: Is it necessary to check the dcache for this directory to make sure + * that we always recreate every cached entry? + */ +int +xrep_dir( + struct xfs_scrub *sc) +{ + struct xrep_dir rd = { + .sc = sc, + .parent_ino = NULLFSINO, + }; + int error; + + /* Set up some storage */ + rd.dir_entries = xfbma_init(sizeof(struct xrep_dir_key)); + if (IS_ERR(rd.dir_entries)) + return PTR_ERR(rd.dir_entries); + rd.dir_names = xblob_init(); + if (IS_ERR(rd.dir_names)) { + error = PTR_ERR(rd.dir_names); + goto out_arr; + } + + /* + * The directory scrubber might have dropped the ILOCK, so pick it up + * again. + */ + if (!(sc->ilock_flags & XFS_ILOCK_EXCL)) { + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + sc->ilock_flags |= XFS_ILOCK_EXCL; + } + + /* Collect directory entries by parsing raw leaf blocks. */ + error = xrep_dir_find_entries(&rd); + if (error) + goto out; + + /* If we can't find the parent pointer, we're sunk. */ + if (rd.parent_ino == NULLFSINO) + return -EFSCORRUPTED; + + /* + * Invalidate and truncate all data fork extents. This is the point at + * which we are no longer able to bail out gracefully. We commit the + * transaction here because the rebuilding step allocates its own + * transactions. + */ + error = xrep_dir_reset_fork(&rd); + if (error) + goto out; + + /* Now rebuild the directory information. */ + error = xrep_dir_rebuild_tree(&rd); +out: + xblob_destroy(rd.dir_names); +out_arr: + xfbma_destroy(rd.dir_entries); + return error; +} |