diff options
author | Darrick J. Wong <djwong@kernel.org> | 2021-09-01 11:25:16 -0700 |
---|---|---|
committer | Darrick J. Wong <djwong@kernel.org> | 2021-10-22 16:41:14 -0700 |
commit | aa7bbb4df5d4f26f134b67f7a4218a2c7a116d50 (patch) | |
tree | d81b793c3bf5840db2a0bb7b52f051b354a453f7 | |
parent | a96548dece0c260f61a0007e7f45ae968802b8c6 (diff) |
xfs: teach scrub to check file nlinks
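Adapt the online quotacheck code to check inode link counts too. Scrub walks every directory to build a shadow table of observed link counts, hooks live link count updates so that the shadow table stays current while the scan runs, and then compares the observed totals against each inode's link count.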
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
-rw-r--r-- | fs/xfs/Makefile | 1 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_fs.h | 3 | ||||
-rw-r--r-- | fs/xfs/scrub/common.h | 1 | ||||
-rw-r--r-- | fs/xfs/scrub/health.c | 1 | ||||
-rw-r--r-- | fs/xfs/scrub/nlinks.c | 761 | ||||
-rw-r--r-- | fs/xfs/scrub/nlinks.h | 56 | ||||
-rw-r--r-- | fs/xfs/scrub/scrub.c | 6 | ||||
-rw-r--r-- | fs/xfs/scrub/scrub.h | 1 | ||||
-rw-r--r-- | fs/xfs/scrub/trace.c | 1 | ||||
-rw-r--r-- | fs/xfs/scrub/trace.h | 132 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 131 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.h | 7 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.h | 18 | ||||
-rw-r--r-- | fs/xfs/xfs_super.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_symlink.c | 1 |
15 files changed, 1121 insertions, 1 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 832c2be0b634..d8167db4bac4 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -155,6 +155,7 @@ xfs-y += $(addprefix scrub/, \ ialloc.o \ inode.o \ iscan.o \ + nlinks.o \ parent.o \ refcount.o \ rmap.o \ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 0f101a190b52..c0a42019f8f5 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -744,9 +744,10 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_HEALTHY 26 /* everything checked out ok */ #define XFS_SCRUB_TYPE_RTRMAPBT 27 /* realtime reverse mapping btree */ #define XFS_SCRUB_TYPE_RTREFCBT 28 /* realtime reference count btree */ +#define XFS_SCRUB_TYPE_NLINKS 29 /* inode link counts */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 29 +#define XFS_SCRUB_TYPE_NR 30 /* * This special type code only applies to the vectored scrub implementation. diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 72c7d2636b56..74f0606174df 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -132,6 +132,7 @@ xchk_setup_quotacheck(struct xfs_scrub *sc) } #endif int xchk_setup_fscounters(struct xfs_scrub *sc); +int xchk_setup_nlinks(struct xfs_scrub *sc); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 88a3969a9161..49a806400dac 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -109,6 +109,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, [XFS_SCRUB_TYPE_RTRMAPBT] = { XHG_RT, XFS_SICK_RT_RMAPBT }, [XFS_SCRUB_TYPE_RTREFCBT] = { XHG_RT, XFS_SICK_RT_REFCNTBT }, + [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS }, }; /* Return the health status mask for this scrub type. 
*/ diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c new file mode 100644 index 000000000000..f13bb0176f3d --- /dev/null +++ b/fs/xfs/scrub/nlinks.c @@ -0,0 +1,761 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2021 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/nlinks.h" +#include "scrub/trace.h" + +/* + * Live Inode Link Count Checking + * ============================== + * + * Inode link counts are "summary" metadata, in the sense that they are + * computed as the number of directory entries referencing each file on the + * filesystem. Therefore, we compute the correct link counts by creating a + * shadow link count structure and walking every inode. + * + * Because we are scanning a live filesystem, it's possible that another thread + * will try to update the link counts for an inode that we've already scanned. + * This will cause our counts to be incorrect. Therefore, we hook all inode + * link count updates when the change is made to the incore inode. By + * shadowing transaction updates in this manner, live nlink check can ensure by + * locking the inode and the shadow structure that its own copies are not out + * of date. + * + * Note that we use srcu notifier hooks to minimize the overhead when live + * nlinks is /not/ running. Locking order for nlink observations is inode + * ILOCK -> iscan_lock/xchk_nlink_ctrs lock. + */ + +/* Set us up to scrub inode link counts. 
*/ +int +xchk_setup_nlinks( + struct xfs_scrub *sc) +{ + sc->buf = kmem_zalloc(sizeof(struct xchk_nlink_ctrs), + KM_NOFS | KM_MAYFAIL); + if (!sc->buf) + return -ENOMEM; + + return xchk_setup_fs(sc); +} + +/* Retrieve the observed link count record for the given inode. */ +STATIC int +xchk_nlinks_get_record( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + struct xchk_nlink *nl) +{ + int error; + + error = xfarray_load(xnc->nlinks, ino, nl); + if (error == -ENODATA) { + /* + * ENODATA means we tried to read beyond the end of the sparse + * array. This isn't a big deal, just zero the incore record + * and return that. + */ + memset(nl, 0, sizeof(struct xchk_nlink)); + return 0; + } + return error; +} + +/* Update incore link count information. Caller must hold the nlinks lock. */ +STATIC int +xchk_nlinks_update_incore( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *dp, + bool parent_to_child, + xfs_ino_t ino, + int delta) +{ + struct xchk_nlink nl; + int error; + + if (!xnc->nlinks) + return 0; + + error = xchk_nlinks_get_record(xnc, ino, &nl); + if (error) + return error; + + if (parent_to_child) + nl.parent += delta; + else + nl.child += delta; + + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + } + return error; +} + +/* Read the observed link count for comparison with the actual inode. 
*/ +STATIC int +xchk_nlinks_comparison_read( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + struct xchk_nlink *obs) +{ + struct xchk_nlink nl; + int error; + + error = xchk_nlinks_get_record(xnc, ino, &nl); + if (error) + return error; + + nl.flags |= XCHK_NLINK_COMPARE_SCANNED; + + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + if (error) + return error; + + obs->parent = nl.parent; + obs->child = nl.child; + return 0; +} + +/* + * Apply a link count change from the regular filesystem into our shadow link + * count structure. + */ +STATIC int +xchk_nlinks_live_update( + struct notifier_block *nb, + unsigned long arg, + void *data) +{ + struct xfs_nlink_delta_params *p = data; + struct xchk_nlink_ctrs *xnc; + bool parent_to_child; + int error; + + parent_to_child = (arg == XFS_PARENT_NLINK_DELTA); + xnc = container_of(nb, struct xchk_nlink_ctrs, nlink_delta_hook); + + if (!xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) + return NOTIFY_DONE; + + trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, parent_to_child, + p->ino, p->delta); + + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->dp, parent_to_child, p->ino, + p->delta); + mutex_unlock(&xnc->lock); + if (error) + xchk_iscan_abort(&xnc->collect_iscan); + + return NOTIFY_DONE; +} + +struct xchk_walk_dir { + struct dir_context dir_iter; + struct xchk_nlink_ctrs *xnc; + struct xfs_inode *dp; +}; + +/* Bump the observed link count for the inode referenced by this entry. 
*/ +STATIC int +xchk_nlinks_walk_dir( + struct dir_context *dir_iter, + const char *name, + int namelen, + loff_t pos, + u64 ino, + unsigned type) +{ + struct xchk_walk_dir *xwd; + struct xchk_nlink_ctrs *xnc; + bool parent_to_child = true; + int error = -ECANCELED; + + xwd = container_of(dir_iter, struct xchk_walk_dir, dir_iter); + xnc = xwd->xnc; + + if (namelen == 0) { + /* Shouldn't be any zero-length dirents... */ + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } else if (namelen == 1 && name[0] == '.') { + /* + * The dot entry has to point to the directory, and we account + * it as a "child" pointing to its parent. + */ + if (ino != xwd->dp->i_ino) { + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + parent_to_child = false; + } else if (namelen == 2 && name[0] == '.' && name[1] == '.') { + /* dotdot means child pointing to parent */ + parent_to_child = false; + } + + /* Update the shadow link counts if we haven't already failed. */ + + if (xchk_iscan_aborted(&xnc->collect_iscan)) + goto out_incomplete; + + trace_xchk_nlinks_walk_dir(xnc->sc->mp, xwd->dp, parent_to_child, ino, + name, namelen); + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, xwd->dp, parent_to_child, ino, + 1); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + + return 0; + +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(xnc->sc); + return error; +} + +/* Bump the observed link counts of every entry in this directory. */ +STATIC int +xchk_nlinks_dir( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = xnc->sc; + struct xchk_walk_dir xwd = { + .dir_iter.actor = xchk_nlinks_walk_dir, + .dir_iter.pos = 0, + .xnc = xnc, + .dp = dp, + }; + loff_t oldpos; + size_t bufsize; + unsigned int lock_mode; + int error = 0; + + /* Lock out the VFS from changing this directory while we walk it. 
*/ + xfs_ilock(dp, XFS_IOLOCK_SHARED); + + /* + * The dotdot entry of an unlinked directory still points to the last + * parent, but the parent no longer links to this directory. Skip the + * directory to avoid overcounting. + */ + if (VFS_I(dp)->i_nlink == 0) + goto out; + + /* + * If there are any blocks, read-ahead block 0 as we're almost certain + * to have the next operation be a read there. This is how we + * guarantee that the directory's extent map has been loaded, if there + * is one. + */ + lock_mode = xfs_ilock_data_map_shared(dp); + if (dp->i_df.if_nextents > 0) + error = xfs_dir3_data_readahead(dp, 0, 0); + xfs_iunlock(dp, lock_mode); + if (error) + goto out; + + /* + * Bump link counts for every dirent we see. Userspace usually asks + * for a 32k buffer, so we will too. + */ + bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, dp->i_disk_size); + do { + oldpos = xwd.dir_iter.pos; + error = xfs_readdir(sc->tp, dp, &xwd.dir_iter, bufsize); + } while (!error && oldpos < xwd.dir_iter.pos); + +out: + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return error; +} + +/* If this looks like a valid pointer, count it. */ +static inline int +xchk_nlinks_metafile( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + if (!xfs_verify_ino(xnc->sc->mp, ino)) + return 0; + + trace_xchk_nlinks_metafile(xnc->sc->mp, ino); + return xchk_nlinks_update_incore(xnc, NULL, true, ino, 1); +} + +/* Bump the link counts of metadata files rooted in the superblock. 
*/ +STATIC int +xchk_nlinks_metafiles( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = xnc->sc->mp; + int error = -ECANCELED; + + + if (xchk_iscan_aborted(&xnc->collect_iscan)) + goto out_incomplete; + + mutex_lock(&xnc->lock); + error = xchk_nlinks_metafile(xnc, mp->m_sb.sb_rbmino); + if (error) + goto out_abort; + + error = xchk_nlinks_metafile(xnc, mp->m_sb.sb_rsumino); + if (error) + goto out_abort; + + error = xchk_nlinks_metafile(xnc, mp->m_sb.sb_uquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_metafile(xnc, mp->m_sb.sb_gquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_metafile(xnc, mp->m_sb.sb_pquotino); + if (error) + goto out_abort; + mutex_unlock(&xnc->lock); + + return 0; + +out_abort: + mutex_unlock(&xnc->lock); + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(xnc->sc); + return error; +} + +/* Advance the collection scan cursor for this file. */ +static inline int +xchk_nlinks_file( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + xfs_ilock(ip, XFS_IOLOCK_SHARED); + xchk_iscan_mark_visited(&xnc->collect_iscan, ip); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; +} + +/* Walk all directories and count inode links. */ +STATIC int +xchk_nlinks_collect( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_scrub *sc = xnc->sc; + int error; + + /* Count the rt and quota files if they're rooted in the superblock. */ + if (!xfs_has_metadir(sc->mp)) { + error = xchk_nlinks_metafiles(xnc); + if (error) + return error; + } + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Cancel the transaction to release the log grant space while we scan + * the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. 
+ * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_advance(sc, &xnc->collect_iscan)) == 1) { + struct xfs_inode *ip; + + error = xchk_iscan_iget(sc, &xnc->collect_iscan, &ip); + if (error == -EAGAIN) + continue; + if (error) + break; + + if (S_ISDIR(VFS_I(ip)->i_mode)) + error = xchk_nlinks_dir(xnc, ip); + else + error = xchk_nlinks_file(xnc, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + + if (error == -ECANCELED) + xchk_set_incomplete(sc); + if (error) + return error; + + /* + * Switch out for a real transaction in preparation for building a new + * tree. + */ + xchk_trans_cancel(sc); + return xchk_setup_fs(sc); +} + +/* Check our link count against an inode. */ +STATIC int +xchk_nlinks_compare_inode( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + struct xchk_nlink obs; + struct xfs_scrub *sc = xnc->sc; + uint64_t total_links; + int error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + mutex_lock(&xnc->lock); + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_scanlock; + } + + error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs); + if (error) + goto out_scanlock; + total_links = xchk_nlink_total(&obs); + + trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs); + + /* We found more than the maximum possible link count. */ + if (total_links > U32_MAX) + xchk_ino_set_corrupt(sc, ip->i_ino); + + /* Link counts should match. */ + if (total_links != VFS_I(ip)->i_nlink) + xchk_ino_set_corrupt(sc, ip->i_ino); + + /* Directories should have at least one child due to dot entry. 
*/ + if (S_ISDIR(VFS_I(ip)->i_mode) && obs.child < 1) + xchk_ino_set_corrupt(sc, ip->i_ino); + + /* Non-directories should not have children */ + if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.child != 0) + xchk_ino_set_corrupt(sc, ip->i_ino); + + if (ip == sc->mp->m_metadirip || ip == sc->mp->m_rootip) { + /* Nothing should point to the directory tree roots. */ + if (obs.parent != 0) + xchk_ino_set_corrupt(sc, ip->i_ino); + + /* + * Directory tree roots should have at least two "child" + * references to cover dot and dotdot. + */ + if (obs.child < 2) + xchk_ino_set_corrupt(sc, ip->i_ino); + } else if (obs.parent == 0) { + /* Non-root linked files should have a parent. */ + if (VFS_I(ip)->i_nlink != 0) + xchk_ino_set_corrupt(sc, ip->i_ino); + } + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + error = -EFSCORRUPTED; + +out_scanlock: + mutex_unlock(&xnc->lock); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +/* + * Check our link count against an inode that wasn't checked previously. This + * is intended to catch directories with dangling links, though we could be + * racing with inode allocation in other threads. + */ +STATIC int +xchk_nlinks_compare_inum( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + struct xchk_nlink obs; + struct xfs_mount *mp = xnc->sc->mp; + struct xfs_trans *tp = xnc->sc->tp; + struct xfs_buf *agi_bp; + struct xfs_inode *ip; + int error; + + /* + * Lock the AGI to the transaction just in case the lookup fails and we + * need something to prevent inode allocation while we reconfirm the + * observed nlink value. + */ + error = xfs_ialloc_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ino), &agi_bp); + if (error) + return error; + + error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &ip); + if (error == 0) { + /* Actually got an inode, so use the inode compare. 
*/ + xfs_trans_brelse(tp, agi_bp); + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_irele(xnc->sc, ip); + return error; + } + if (error == -ENOENT || error == -EINVAL) { + /* No inode was found; check for zero link count below. */ + error = 0; + } + if (error) + goto out_agi; + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_agi; + } + + mutex_lock(&xnc->lock); + error = xchk_nlinks_comparison_read(xnc, ino, &obs); + if (error) + goto out_scanlock; + + trace_xchk_nlinks_check_zero(mp, ino, &obs); + + /* + * If we can't grab the inode, the link count had better be zero. We + * still hold the AGI to prevent inode allocation/freeing. + */ + if (xchk_nlink_total(&obs) != 0) { + xchk_ino_set_corrupt(xnc->sc, ino); + error = -ECANCELED; + } + +out_scanlock: + mutex_unlock(&xnc->lock); +out_agi: + xfs_trans_brelse(tp, agi_bp); + return error; +} + +/* Compare the link counts we observed against the live information. */ +STATIC int +xchk_nlinks_compare( + struct xchk_nlink_ctrs *xnc) +{ + struct xchk_nlink nl; + struct xfs_scrub *sc = xnc->sc; + uint64_t nr = 0; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Create a new empty transaction so that we can advance the iscan + * cursor without deadlocking if the inobt has a cycle and push on the + * inactivation workqueue. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + /* + * Use the inobt to walk all allocated inodes to compare the link + * counts. If we can't grab the inode, we'll try again in the second + * step. 
+ */ + xchk_iscan_start(&xnc->compare_iscan); + while ((error = xchk_iscan_advance(sc, &xnc->compare_iscan)) == 1) { + struct xfs_inode *ip; + + error = xchk_iscan_iget(sc, &xnc->compare_iscan, &ip); + if (error == -ECANCELED) + continue; + if (error) + break; + + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_iscan_mark_visited(&xnc->compare_iscan, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Walk all the non-null nlink observations that weren't checked in the + * previous step. + */ + mutex_lock(&xnc->lock); + while (!(error = xfarray_load_next(xnc->nlinks, &nr, &nl))) { + xfs_ino_t ino = nr - 1; + + if (nl.flags & XCHK_NLINK_COMPARE_SCANNED) + continue; + + mutex_unlock(&xnc->lock); + + if (xchk_should_terminate(xnc->sc, &error)) + return error; + + error = xchk_nlinks_compare_inum(xnc, ino); + if (error) + return error; + + mutex_lock(&xnc->lock); + } + mutex_unlock(&xnc->lock); + + /* ENODATA means we hit the end of the array. */ + if (error == -ENODATA) + return 0; + return error; +} + +/* Tear down everything associated with a nlinks check. */ +static void +xchk_nlinks_teardown_scan( + struct xchk_nlink_ctrs *xnc) +{ + /* Discourage any hook functions that might be running. */ + xchk_iscan_abort(&xnc->collect_iscan); + + xfs_hook_del(&xnc->sc->mp->m_nlink_delta_hooks, &xnc->nlink_delta_hook); + + xfarray_destroy(xnc->nlinks); + xnc->nlinks = NULL; + + xchk_iscan_finish(&xnc->collect_iscan); + mutex_destroy(&xnc->lock); + xnc->sc = NULL; +} + +/* + * Scan all inodes in the entire filesystem to generate link count data. If + * the scan is successful, the counts will be left alive for a repair. If any + * error occurs, we'll tear everything down. 
+ */ +STATIC int +xchk_nlinks_setup_scan( + struct xfs_scrub *sc, + struct xchk_nlink_ctrs *xnc) +{ + int error; + + ASSERT(xnc->sc == NULL); + xnc->sc = sc; + + mutex_init(&xnc->lock); + xnc->collect_iscan.iget_tries = 20; + xnc->collect_iscan.iget_retry_delay = HZ / 10; + xchk_iscan_start(&xnc->collect_iscan); + + error = -ENOMEM; + xnc->nlinks = xfarray_create("link counts", sizeof(struct xchk_nlink)); + if (!xnc->nlinks) + goto out_teardown; + + /* + * Hook into the bumplink/droplink code. The hook only triggers for + * inodes that were already scanned, and the scanner thread takes each + * inode's ILOCK, which means that any in-progress inode updates will + * finish before we can scan the inode. + */ + error = xfs_hook_add(&sc->mp->m_nlink_delta_hooks, + &xnc->nlink_delta_hook, xchk_nlinks_live_update); + if (error) + goto out_teardown; + + /* Use deferred cleanup to pass the inode link count data to repair. */ + sc->buf_cleanup = (void (*)(void *))xchk_nlinks_teardown_scan; + return 0; + +out_teardown: + xchk_nlinks_teardown_scan(xnc); + return error; +} + +/* Scrub the link count of all inodes on the filesystem. */ +int +xchk_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error = 0; + + /* Set ourselves up to check link counts on the live filesystem. */ + error = xchk_nlinks_setup_scan(sc, xnc); + if (error) + return error; + + /* Walk all inodes, picking up link count information. */ + error = xchk_nlinks_collect(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Compare link counts. */ + error = xchk_nlinks_compare(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + return 0; +} diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h new file mode 100644 index 000000000000..0ece2ab5dd38 --- /dev/null +++ b/fs/xfs/scrub/nlinks.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021 Oracle. All Rights Reserved. 
+ * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_NLINKS_H__ +#define __XFS_SCRUB_NLINKS_H__ + +/* Live link count control structure. */ +struct xchk_nlink_ctrs { + struct xfs_scrub *sc; + + /* Shadow link count data and its mutex. */ + struct xfarray *nlinks; + struct mutex lock; + + /* + * The collection step uses a separate iscan context from the compare + * step because the collection iscan coordinates live updates to the + * observation data while this scanner is running. The compare iscan + * is secondary and can be reinitialized as needed. + */ + struct xchk_iscan collect_iscan; + struct xchk_iscan compare_iscan; + + /* + * Hook into bumplink/droplink so that we can receive live updates + * from other writer threads. + */ + struct notifier_block nlink_delta_hook; +}; + +struct xchk_nlink { + /* Links from a parent directory to this inode. */ + xfs_nlink_t parent; + + /* Links from children of this inode (e.g. dot and dotdot). */ + xfs_nlink_t child; + + /* Record state flags */ + unsigned int flags; +}; + +/* This data item was seen by the check-time compare function. */ +#define XCHK_NLINK_COMPARE_SCANNED (1U << 0) + +/* Compute total link count, using large enough variables to detect overflow. */ +static inline uint64_t +xchk_nlink_total(const struct xchk_nlink *live) +{ + uint64_t ret = live->parent; + + return ret + live->child; +} + +#endif /* __XFS_SCRUB_NLINKS_H__ */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index f755c6d4ff5e..55f2c37d01fd 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -393,6 +393,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .has = xfs_has_rtreflink, .repair = xrep_rtrefcountbt, }, + [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */ + .type = ST_FS, + .setup = xchk_setup_nlinks, + .scrub = xchk_nlinks, + .repair = xrep_notsupported, + }, }; /* This isn't a stable feature, warn once per day. 
*/ diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 4adbda457782..d585ac1bfe73 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -201,6 +201,7 @@ xchk_quotacheck(struct xfs_scrub *sc) } #endif int xchk_fscounters(struct xfs_scrub *sc); +int xchk_nlinks(struct xfs_scrub *sc); /* cross-referencing helpers */ void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno, diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index e9178f1bcf39..e3702d07bc1e 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -19,6 +19,7 @@ #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/iscan.h" +#include "scrub/nlinks.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 4b2b15d9ac1e..6c2d874b9c56 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -21,6 +21,7 @@ struct xfs_scrub; struct xfile; struct xfarray; struct xchk_iscan; +struct xchk_nlink; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -97,6 +98,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); { XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \ { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" }, \ { XFS_SCRUB_TYPE_RTREFCBT, "rtrefcountbt" }, \ + { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \ { XFS_SCRUB_TYPE_BARRIER, "barrier" } #define XFS_SCRUB_FLAG_STRINGS \ @@ -987,6 +989,136 @@ TRACE_EVENT(xchk_iscan_iget, __entry->cursor, __entry->visited, __entry->error) ); +TRACE_EVENT(xchk_nlinks_walk_dir, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + bool parent_to_child, xfs_ino_t ino, const char *name, + unsigned int namelen), + TP_ARGS(mp, dp, parent_to_child, ino, name, namelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(bool, parent) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + 
__entry->dir = dp->i_ino; + __entry->parent = parent_to_child; + __entry->ino = ino; + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + ), + TP_printk("dev %d:%d dir 0x%llx parent_to_child? %d ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->parent, + __entry->ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_metafile, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino), + TP_ARGS(mp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +); + +TRACE_EVENT(xchk_nlinks_live_update, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + bool parent_to_child, xfs_ino_t ino, int delta), + TP_ARGS(mp, dp, parent_to_child, ino, delta), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(bool, parent) + __field(xfs_ino_t, ino) + __field(int, delta) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp ? dp->i_ino : NULLFSINO; + __entry->parent = parent_to_child; + __entry->ino = ino; + __entry->delta = delta; + ), + TP_printk("dev %d:%d dir 0x%llx parent_to_child? 
%d ino 0x%llx nlink_delta %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->parent, + __entry->ino, + __entry->delta) +); + +TRACE_EVENT(xchk_nlinks_check_zero, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live), + TP_ARGS(mp, ino, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parent) + __field(xfs_nlink_t, child) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parent = live->parent; + __entry->child = live->child; + ), + TP_printk("dev %d:%d ino 0x%llx parent_links %u child_links %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent, + __entry->child) +); + +DECLARE_EVENT_CLASS(xchk_nlink_diff_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, + const struct xchk_nlink *live), + TP_ARGS(mp, ip, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(umode_t, mode) + __field(xfs_nlink_t, nlink) + __field(xfs_nlink_t, parent) + __field(xfs_nlink_t, child) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->mode = VFS_I(ip)->i_mode; + __entry->nlink = VFS_I(ip)->i_nlink; + __entry->parent = live->parent; + __entry->child = live->child; + ), + TP_printk("dev %d:%d ino 0x%llx dir? 
%d nlink %u parent_links %u child_links %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + !!S_ISDIR(__entry->mode), + __entry->nlink, + __entry->parent, + __entry->child) +); +#define DEFINE_SCRUB_NLINK_DIFF_EVENT(name) \ +DEFINE_EVENT(xchk_nlink_diff_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \ + const struct xchk_nlink *live), \ + TP_ARGS(mp, ip, live)) +DEFINE_SCRUB_NLINK_DIFF_EVENT(xchk_nlinks_compare_inode); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 80f9fb2a7799..2629f39119aa 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -684,6 +684,47 @@ xfs_icreate_dqalloc( flags, udqpp, gdqpp, pdqpp); } +#ifdef CONFIG_XFS_LIVE_HOOKS +static inline void +xfs_nlink_delta( + struct xfs_inode *dp, + struct xfs_inode *ip, + enum xfs_nlink_delta_type type, + int delta) +{ + struct xfs_nlink_delta_params p; + struct xfs_mount *mp = ip->i_mount; + + p.dp = dp; + p.ino = ip->i_ino; + p.delta = delta; + + xfs_hook_call(&mp->m_nlink_delta_hooks, type, &p); +} + +/* Call a hook to capture nlink updates in real time. */ +static inline void +xfs_nlink_child_delta( + struct xfs_inode *dp, + struct xfs_inode *ip, + int delta) +{ + xfs_nlink_delta(dp, ip, XFS_CHILD_NLINK_DELTA, delta); +} + +/* Call a hook to capture nlink updates in real time. */ +void +xfs_nlink_parent_delta( + struct xfs_inode *dp, + struct xfs_inode *ip, + int delta) +{ + xfs_nlink_delta(dp, ip, XFS_PARENT_NLINK_DELTA, delta); +} +#else +# define xfs_nlink_child_delta(dp, ip, delta) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + int xfs_create( struct xfs_inode *dp, @@ -776,6 +817,16 @@ xfs_create( goto out_trans_cancel; /* + * Create ip with a reference from dp, and add '.' and '..' references + * if it's a directory. 
+ */ + xfs_nlink_parent_delta(dp, ip, 1); + if (is_dir) { + xfs_nlink_child_delta(ip, ip, 1); + xfs_nlink_child_delta(ip, dp, 1); + } + + /* * If this is a synchronous mount, make sure that the * create transaction goes to disk before returning to * the user. @@ -1070,6 +1121,7 @@ xfs_link( error = xfs_dir_link_existing_child(tp, resblks, tdp, target_name, sip); if (error) goto error_return; + xfs_nlink_parent_delta(tdp, sip, 1); /* * If this is a synchronous mount, make sure that the @@ -2211,6 +2263,16 @@ xfs_remove( goto out_trans_cancel; /* + * Drop the link from dp to ip, and if ip was a directory, remove the + * '.' and '..' references since we freed the directory. + */ + xfs_nlink_parent_delta(dp, ip, -1); + if (S_ISDIR(VFS_I(ip)->i_mode)) { + xfs_nlink_child_delta(ip, dp, -1); + xfs_nlink_child_delta(ip, ip, -1); + } + + /* * If this is a synchronous mount, make sure that the * remove transaction goes to disk before returning to * the user. @@ -2337,6 +2399,72 @@ xfs_rename_alloc_whiteout( return 0; } +#ifdef CONFIG_XFS_LIVE_HOOKS +static inline void +xfs_rename_call_nlink_hooks( + struct xfs_inode *src_dp, + struct xfs_inode *src_ip, + struct xfs_inode *target_dp, + struct xfs_inode *target_ip, + struct xfs_inode *wip, + unsigned int flags) +{ + /* If we added a whiteout, add the reference from src_dp. */ + if (wip) + xfs_nlink_parent_delta(src_dp, wip, 1); + + /* Move the src_ip reference from src_dp to target_dp. */ + xfs_nlink_parent_delta(src_dp, src_ip, -1); + xfs_nlink_parent_delta(target_dp, src_ip, 1); + + /* + * If src_ip is a dir, move its '..' reference from src_dp to + * target_dp. + */ + if (S_ISDIR(VFS_I(src_ip)->i_mode)) { + xfs_nlink_child_delta(src_ip, src_dp, -1); + xfs_nlink_child_delta(src_ip, target_dp, 1); + } + + if (!target_ip) + return; + + if (flags & RENAME_EXCHANGE) { + /* Move the target_ip reference from target_dp to src_dp. 
*/ + xfs_nlink_parent_delta(target_dp, target_ip, -1); + xfs_nlink_parent_delta(src_dp, target_ip, 1); + + /* + * If target_ip is a dir, move its '..' reference from + * target_dp to src_dp. + */ + if (S_ISDIR(VFS_I(target_ip)->i_mode)) { + xfs_nlink_child_delta(target_ip, target_dp, -1); + xfs_nlink_child_delta(target_ip, src_dp, 1); + } + + return; + } + + /* Drop target_ip's reference from target_dp. */ + xfs_nlink_parent_delta(target_dp, target_ip, -1); + + if (!S_ISDIR(VFS_I(target_ip)->i_mode)) + return; + + /* + * If target_ip was a dir, drop the '.' and '..' references since that + * was the last reference. + */ + ASSERT(VFS_I(target_ip)->i_nlink == 0); + xfs_nlink_child_delta(target_ip, target_dp, -1); + xfs_nlink_child_delta(target_ip, target_ip, -1); +} +#else /* !CONFIG_XFS_LIVE_HOOKS */ +# define xfs_rename_call_nlink_hooks(src_dp, src_ip, target_dp, \ + target_ip, wip, flags) ((void)0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + /* * xfs_rename */ @@ -2477,6 +2605,9 @@ xfs_rename( VFS_I(wip)->i_state &= ~I_LINKABLE; } + xfs_rename_call_nlink_hooks(src_dp, src_ip, target_dp, target_ip, wip, + flags); + error = xfs_finish_rename(tp); if (wip) xfs_irele(wip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3a1321b8f1c8..bf6b26311641 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -541,4 +541,11 @@ int xfs_icreate_dqalloc(const struct xfs_icreate_args *args, int xfs_file_cow_around(struct xfs_inode *ip, loff_t pos, long long int count); +#ifdef CONFIG_XFS_LIVE_HOOKS +void xfs_nlink_parent_delta(struct xfs_inode *dp, struct xfs_inode *ip, + int delta); +#else +# define xfs_nlink_parent_delta(dp, ip, delta) ((void)0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f030db220a91..8ff3fc3e83de 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -271,8 +271,26 @@ typedef struct xfs_mount { * while a repair freeze is in progress. 
*/ struct mutex m_scrub_freeze; + + /* online file link count check stuff */ + struct xfs_hook_chain m_nlink_delta_hooks; } xfs_mount_t; +/* + * Live nlink update hooks: the enum says which direction the changed link + * points, and the hook function arg is a struct xfs_nlink_delta_params. + */ +enum xfs_nlink_delta_type { + XFS_PARENT_NLINK_DELTA, /* parent pointing to child */ + XFS_CHILD_NLINK_DELTA, /* child pointing to parent */ +}; + +struct xfs_nlink_delta_params { + struct xfs_inode *dp; /* directory holding the link */ + xfs_ino_t ino; /* inode that the link points to */ + int delta; /* change in the number of links */ +}; + #define M_IGEO(mp) (&(mp)->m_ino_geo) /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 7ecfdd8537cf..f0f2e478a794 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2000,6 +2000,8 @@ static int xfs_init_fs_context( mp->m_logbsize = -1; mp->m_allocsize_log = 16; /* 64k */ + xfs_hook_init(&mp->m_nlink_delta_hooks); + /* * Copy binary VFS mount flags we are interested in. */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 177987c3716f..42f0bd0c26c7 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -332,6 +332,7 @@ xfs_symlink( goto out_trans_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + xfs_nlink_parent_delta(dp, ip, 1); /* * If this is a synchronous mount, make sure that the |