diff options
-rw-r--r-- | fs/xfs/Makefile | 1 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_fs.h | 3 | ||||
-rw-r--r-- | fs/xfs/scrub/common.h | 1 | ||||
-rw-r--r-- | fs/xfs/scrub/health.c | 1 | ||||
-rw-r--r-- | fs/xfs/scrub/nlinks.c | 744 | ||||
-rw-r--r-- | fs/xfs/scrub/nlinks.h | 56 | ||||
-rw-r--r-- | fs/xfs/scrub/scrub.c | 6 | ||||
-rw-r--r-- | fs/xfs/scrub/scrub.h | 1 | ||||
-rw-r--r-- | fs/xfs/scrub/trace.c | 1 | ||||
-rw-r--r-- | fs/xfs/scrub/trace.h | 132 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 131 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.h | 22 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.h | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_super.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_symlink.c | 1 |
15 files changed, 1104 insertions, 1 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 832c2be0b634..d8167db4bac4 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -155,6 +155,7 @@ xfs-y += $(addprefix scrub/, \ ialloc.o \ inode.o \ iscan.o \ + nlinks.o \ parent.o \ refcount.o \ rmap.o \ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 681db654cf31..5e7956dc03d0 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -743,9 +743,10 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_HEALTHY 26 /* everything checked out ok */ #define XFS_SCRUB_TYPE_RTRMAPBT 27 /* realtime reverse mapping btree */ #define XFS_SCRUB_TYPE_RTREFCBT 28 /* realtime reference count btree */ +#define XFS_SCRUB_TYPE_NLINKS 29 /* inode link counts */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 29 +#define XFS_SCRUB_TYPE_NR 30 /* * This special type code only applies to the vectored scrub implementation. diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 04f7e1731458..f2e1cf719c06 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -132,6 +132,7 @@ xchk_setup_quotacheck(struct xfs_scrub *sc) } #endif int xchk_setup_fscounters(struct xfs_scrub *sc); +int xchk_setup_nlinks(struct xfs_scrub *sc); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 88a3969a9161..49a806400dac 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -109,6 +109,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, [XFS_SCRUB_TYPE_RTRMAPBT] = { XHG_RT, XFS_SICK_RT_RMAPBT }, [XFS_SCRUB_TYPE_RTREFCBT] = { XHG_RT, XFS_SICK_RT_REFCNTBT }, + [XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS }, }; /* Return the health status mask for this scrub type. 
*/ diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c new file mode 100644 index 000000000000..aa5a3237a34f --- /dev/null +++ b/fs/xfs/scrub/nlinks.c @@ -0,0 +1,744 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2021 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/nlinks.h" +#include "scrub/trace.h" + +/* + * Live Inode Link Count Checking + * ============================== + * + * Inode link counts are "summary" metadata, in the sense that they are + * computed as the number of directory entries referencing each file on the + * filesystem. Therefore, we compute the correct link counts by creating a + * shadow link count structure and walking every inode. + * + * Because we are scanning a live filesystem, it's possible that another thread + * will try to update the link counts for an inode that we've already scanned. + * This will cause our counts to be incorrect. Therefore, we hook all inode + * link count updates when the change is made to the incore inode. By + * shadowing transaction updates in this manner, live nlink check can ensure by + * locking the inode and the shadow structure that its own copies are not out + * of date. + * + * Note that we use srcu notifier hooks to minimize the overhead when live + * nlinks is /not/ running. Locking order for nlink observations is inode + * ILOCK -> iscan_lock/xchk_nlink_ctrs lock. + */ + +/* Set us up to scrub inode link counts. 
*/ +int +xchk_setup_nlinks( + struct xfs_scrub *sc) +{ + sc->buf = kmem_zalloc(sizeof(struct xchk_nlink_ctrs), + KM_NOFS | KM_MAYFAIL); + if (!sc->buf) + return -ENOMEM; + + return xchk_setup_fs(sc); +} + +/* + * Update incore link count information. Caller must hold the nlinks lock. + * + * Loads the shadow record for @ino, adds @delta to its parent count when + * @parent_to_child is true or to its child count otherwise, and stores the + * record back. Returns 0 when there is no shadow array (xnc->nlinks is NULL) + * or on success, -ECANCELED after marking the scrub incomplete if the store + * fails with -EFBIG, or any other error from the xfarray load/store. @dp is + * not used by this function. + */ +STATIC int +xchk_nlinks_update_incore( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *dp, + bool parent_to_child, + xfs_ino_t ino, + int delta) +{ + struct xchk_nlink nl; + int error; + + if (!xnc->nlinks) + return 0; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + if (parent_to_child) + nl.parent += delta; + else + nl.child += delta; + + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + } + return error; +} + +/* + * Read the observed link count for comparison with the actual inode. + * + * Besides reading, this sets XCHK_NLINK_COMPARE_SCANNED on the stored record + * for @ino. @obs receives the parent and child counts only after that flag + * update has been stored successfully; on any error @obs is left untouched. + * A store failure of -EFBIG marks the scrub incomplete and is returned as + * -ECANCELED. + */ +STATIC int +xchk_nlinks_comparison_read( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + struct xchk_nlink *obs) +{ + struct xchk_nlink nl; + int error; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + nl.flags |= XCHK_NLINK_COMPARE_SCANNED; + + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + if (error) + return error; + + obs->parent = nl.parent; + obs->child = nl.child; + return 0; +} + +/* + * Apply a link count change from the regular filesystem into our shadow link + * count structure.
+ */ +STATIC int +xchk_nlinks_live_update( + struct notifier_block *nb, + unsigned long arg, + void *data) +{ + struct xfs_nlink_delta_params *p = data; + struct xchk_nlink_ctrs *xnc; + bool parent_to_child; + int error; + + parent_to_child = (arg == XFS_PARENT_NLINK_DELTA); + xnc = container_of(nb, struct xchk_nlink_ctrs, nlink_delta_hook); + + if (!xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) + return NOTIFY_DONE; + + trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, parent_to_child, + p->ino, p->delta); + + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->dp, parent_to_child, p->ino, + p->delta); + mutex_unlock(&xnc->lock); + if (error) + xchk_iscan_abort(&xnc->collect_iscan); + + return NOTIFY_DONE; +} + +struct xchk_walk_dir { + struct dir_context dir_iter; + struct xchk_nlink_ctrs *xnc; + struct xfs_inode *dp; +}; + +/* + * Bump the observed link count for the inode referenced by this entry. + * + * This is the dir_context actor used by xchk_nlinks_dir. A zero-length name, + * or a dot entry that does not point back at the directory being walked, + * marks the scrub incomplete and returns -ECANCELED. Dot and dotdot entries + * are recorded as child links of @ino; every other entry is recorded as a + * parent-to-child link. If the collection scan was already aborted, or the + * shadow update fails, the scrub is marked incomplete and the error returned. + */ +STATIC int +xchk_nlinks_walk_dir( + struct dir_context *dir_iter, + const char *name, + int namelen, + loff_t pos, + u64 ino, + unsigned type) +{ + struct xchk_walk_dir *xwd; + struct xchk_nlink_ctrs *xnc; + bool parent_to_child = true; + int error = -ECANCELED; + + xwd = container_of(dir_iter, struct xchk_walk_dir, dir_iter); + xnc = xwd->xnc; + + if (namelen == 0) { + /* Shouldn't be any zero-length dirents... */ + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } else if (namelen == 1 && name[0] == '.') { + /* + * The dot entry has to point to the directory, and we account + * it as a "child" pointing to its parent. + */ + if (ino != xwd->dp->i_ino) { + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + parent_to_child = false; + } else if (namelen == 2 && name[0] == '.' && name[1] == '.') { + /* dotdot means child pointing to parent */ + parent_to_child = false; + } + + /* Update the shadow link counts if we haven't already failed.
*/ + + if (xchk_iscan_aborted(&xnc->collect_iscan)) + goto out_incomplete; + + trace_xchk_nlinks_walk_dir(xnc->sc->mp, xwd->dp, parent_to_child, ino, + name, namelen); + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, xwd->dp, parent_to_child, ino, + 1); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + + return 0; + +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(xnc->sc); + return error; +} + +/* + * Bump the observed link counts of every entry in this directory. + * + * The IOLOCK is held shared for the whole walk. A directory whose VFS link + * count is zero, or whose block 0 readahead fails, is left without being + * marked visited in the collection scan; otherwise the directory is marked + * visited once the readdir loop ends, whether or not readdir returned an + * error. Returns 0 or the first error seen. + */ +STATIC int +xchk_nlinks_dir( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = xnc->sc; + struct xchk_walk_dir xwd = { + .dir_iter.actor = xchk_nlinks_walk_dir, + .dir_iter.pos = 0, + .xnc = xnc, + .dp = dp, + }; + loff_t oldpos; + size_t bufsize; + unsigned int lock_mode; + int error = 0; + + /* Lock out the VFS from changing this directory while we walk it. */ + xfs_ilock(dp, XFS_IOLOCK_SHARED); + + /* + * The dotdot entry of an unlinked directory still points to the last + * parent, but the parent no longer links to this directory. Skip the + * directory to avoid overcounting. + */ + if (VFS_I(dp)->i_nlink == 0) + goto out; + + /* + * If there are any blocks, read-ahead block 0 as we're almost certain + * to have the next operation be a read there. This is how we + * guarantee that the directory's extent map has been loaded, if there + * is one. + */ + lock_mode = xfs_ilock_data_map_shared(dp); + if (dp->i_df.if_nextents > 0) + error = xfs_dir3_data_readahead(dp, 0, 0); + xfs_iunlock(dp, lock_mode); + if (error) + goto out; + + /* + * Bump link counts for every dirent we see. Userspace usually asks + * for a 32k buffer, so we will too.
+ */ + bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, dp->i_disk_size); + do { + oldpos = xwd.dir_iter.pos; + error = xfs_readdir(sc->tp, dp, &xwd.dir_iter, bufsize); + } while (!error && oldpos < xwd.dir_iter.pos); + + xchk_iscan_mark_visited(&xnc->collect_iscan, dp); +out: + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return error; +} + +/* + * If this looks like a valid pointer, count it. + * + * Inode numbers that fail xfs_verify_ino are ignored and 0 is returned. + * Otherwise one parent-to-child link is recorded for @ino with no parent + * directory. The caller (xchk_nlinks_metafiles) holds the nlinks lock. + */ +static inline int +xchk_nlinks_metafile( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + if (!xfs_verify_ino(xnc->sc->mp, ino)) + return 0; + + trace_xchk_nlinks_metafile(xnc->sc->mp, ino); + return xchk_nlinks_update_incore(xnc, NULL, true, ino, 1); +} + +/* + * Bump the link counts of metadata files rooted in the superblock. + * + * Covers sb_rbmino, sb_rsumino, sb_uquotino, sb_gquotino and sb_pquotino, all + * under one hold of the nlinks lock. Returns -ECANCELED if the collection + * scan was already aborted; any update error aborts the collection scan. In + * both failure cases the scrub is marked incomplete. + */ +STATIC int +xchk_nlinks_metafiles( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = xnc->sc->mp; + int error = -ECANCELED; + + + if (xchk_iscan_aborted(&xnc->collect_iscan)) + goto out_incomplete; + + mutex_lock(&xnc->lock); + error = xchk_nlinks_metafile(xnc, mp->m_sb.sb_rbmino); + if (error) + goto out_abort; + + error = xchk_nlinks_metafile(xnc, mp->m_sb.sb_rsumino); + if (error) + goto out_abort; + + error = xchk_nlinks_metafile(xnc, mp->m_sb.sb_uquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_metafile(xnc, mp->m_sb.sb_gquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_metafile(xnc, mp->m_sb.sb_pquotino); + if (error) + goto out_abort; + mutex_unlock(&xnc->lock); + + return 0; + +out_abort: + mutex_unlock(&xnc->lock); + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(xnc->sc); + return error; +} + +/* + * Advance the collection scan cursor for this file. + * + * No link counts are recorded here; the inode is only marked visited in the + * collection scan while the IOLOCK is held shared. Always returns 0. + */ +static inline int +xchk_nlinks_file( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + xfs_ilock(ip, XFS_IOLOCK_SHARED); + xchk_iscan_mark_visited(&xnc->collect_iscan, ip); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; +} + +/* Walk all directories and count inode links.
*/ +STATIC int +xchk_nlinks_collect( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_scrub *sc = xnc->sc; + int error; + + /* + * Count the rt and quota files if they're rooted in the superblock, + * i.e. when the filesystem does not have the metadir feature. + */ + if (!xfs_has_metadir(sc->mp)) { + error = xchk_nlinks_metafiles(xnc); + if (error) + return error; + } + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Cancel the transaction to release the log grant space while we scan + * the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_advance(sc, &xnc->collect_iscan)) == 1) { + struct xfs_inode *ip; + + error = xchk_iscan_iget(sc, &xnc->collect_iscan, &ip); + if (error == -EAGAIN) /* skip this inode and advance the cursor again */ + continue; + if (error) + break; + + if (S_ISDIR(VFS_I(ip)->i_mode)) + error = xchk_nlinks_dir(xnc, ip); + else + error = xchk_nlinks_file(xnc, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + + /* + * The loop exits with 0 when the scan has nothing left to advance to, + * or with a negative errno. -ECANCELED additionally marks the scrub + * incomplete before it is returned. + */ + if (error == -ECANCELED) + xchk_set_incomplete(sc); + if (error) + return error; + + /* + * Switch out for a real transaction in preparation for building a new + * tree. + */ + xchk_trans_cancel(sc); + return xchk_setup_fs(sc); +} + +/* Check our link count against an inode.
*/ +STATIC int +xchk_nlinks_compare_inode( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + struct xchk_nlink obs; + struct xfs_scrub *sc = xnc->sc; + uint64_t total_links; + unsigned int actual_nlink; + int error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + mutex_lock(&xnc->lock); + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_scanlock; + } + + error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs); + if (error) + goto out_scanlock; + total_links = xchk_nlink_total(&obs); + actual_nlink = VFS_I(ip)->i_nlink; + + trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs); + + /* We found more than the maxiumum possible link count. */ + if (total_links > U32_MAX) + xchk_ino_set_corrupt(sc, ip->i_ino); + + /* Link counts should match. */ + if (total_links != actual_nlink) + xchk_ino_set_corrupt(sc, ip->i_ino); + + /* + * Directories with nonzero link count must have at least one child + * (dot entry). The collection phase ignores directories with zero + * link count, so we ignore them here too. + */ + if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0 && obs.child < 1) + xchk_ino_set_corrupt(sc, ip->i_ino); + + /* Non-directories should not have children */ + if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.child != 0) + xchk_ino_set_corrupt(sc, ip->i_ino); + + if (ip == sc->mp->m_metadirip || ip == sc->mp->m_rootip) { + /* Nothing should point to the directory tree roots. */ + if (obs.parent != 0) + xchk_ino_set_corrupt(sc, ip->i_ino); + + /* + * Directory tree roots should have at least two "child" + * references to cover dot and dotdot. + */ + if (obs.child < 2) + xchk_ino_set_corrupt(sc, ip->i_ino); + } else if (obs.parent == 0) { + /* Non-root linked files should have a parent. 
*/ + if (actual_nlink != 0) + xchk_ino_set_corrupt(sc, ip->i_ino); + } + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + error = -EFSCORRUPTED; + +out_scanlock: + mutex_unlock(&xnc->lock); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +/* + * Check our link count against an inode that wasn't checked previously. This + * is intended to catch directories with dangling links, though we could be + * racing with inode allocation in other threads. + */ +STATIC int +xchk_nlinks_compare_inum( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + struct xchk_nlink obs; + struct xfs_mount *mp = xnc->sc->mp; + struct xfs_trans *tp = xnc->sc->tp; + struct xfs_buf *agi_bp; + struct xfs_inode *ip; + int error; + + /* + * Lock the AGI to the transaction just in case the lookup fails and we + * need something to prevent inode allocation while we reconfirm the + * observed nlink value. + */ + error = xfs_ialloc_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ino), &agi_bp); + if (error) + return error; + + error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &ip); + if (error == 0) { + /* Actually got an inode, so use the inode compare. */ + xfs_trans_brelse(tp, agi_bp); + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_irele(xnc->sc, ip); + return error; + } + if (error == -ENOENT || error == -EINVAL) { + /* No inode was found; check for zero link count below. */ + error = 0; + } + if (error) + goto out_agi; + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_agi; + } + + mutex_lock(&xnc->lock); + error = xchk_nlinks_comparison_read(xnc, ino, &obs); + if (error) + goto out_scanlock; + + trace_xchk_nlinks_check_zero(mp, ino, &obs); + + /* + * If we can't grab the inode, the link count had better be zero. We + * still hold the AGI to prevent inode allocation/freeing. 
+ */ + if (xchk_nlink_total(&obs) != 0) { + xchk_ino_set_corrupt(xnc->sc, ino); + error = -ECANCELED; + } + +out_scanlock: + mutex_unlock(&xnc->lock); +out_agi: + xfs_trans_brelse(tp, agi_bp); + return error; +} + +/* Compare the link counts we observed against the live information. */ +STATIC int +xchk_nlinks_compare( + struct xchk_nlink_ctrs *xnc) +{ + struct xchk_nlink nl; + struct xfs_scrub *sc = xnc->sc; + uint64_t nr = 0; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Create a new empty transaction so that we can advance the iscan + * cursor without deadlocking if the inobt has a cycle and push on the + * inactivation workqueue. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + /* + * Use the inobt to walk all allocated inodes to compare the link + * counts. If we can't grab the inode, we'll try again in the second + * step. + */ + xchk_iscan_start(&xnc->compare_iscan); + while ((error = xchk_iscan_advance(sc, &xnc->compare_iscan)) == 1) { + struct xfs_inode *ip; + + error = xchk_iscan_iget(sc, &xnc->compare_iscan, &ip); + if (error == -ECANCELED) + continue; + if (error) + break; + + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_iscan_mark_visited(&xnc->compare_iscan, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_finish(&xnc->compare_iscan); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Walk all the non-null nlink observations that weren't checked in the + * previous step. 
+ */ + mutex_lock(&xnc->lock); + while ((error = xfarray_iter(xnc->nlinks, &nr, &nl)) == 1) { + xfs_ino_t ino = nr - 1; /* NOTE(review): presumably xfarray_iter leaves nr one past the record it returned - confirm */ + + if (nl.flags & XCHK_NLINK_COMPARE_SCANNED) + continue; + + mutex_unlock(&xnc->lock); /* xchk_nlinks_compare_inum takes xnc->lock itself */ + + error = xchk_nlinks_compare_inum(xnc, ino); + if (error) + return error; + + if (xchk_should_terminate(xnc->sc, &error)) + return error; + + mutex_lock(&xnc->lock); + } + mutex_unlock(&xnc->lock); + + return error; +} + +/* + * Tear down everything associated with a nlinks check. + * + * In order: abort the collection scan, remove the nlink delta hook, destroy + * the shadow array and clear the pointer to it, finish the collection scan, + * destroy the mutex, and clear xnc->sc. xchk_nlinks_setup_scan also installs + * this function as sc->buf_cleanup. + */ +static void +xchk_nlinks_teardown_scan( + struct xchk_nlink_ctrs *xnc) +{ + /* Discourage any hook functions that might be running. */ + xchk_iscan_abort(&xnc->collect_iscan); + + xfs_hook_del(&xnc->sc->mp->m_nlink_delta_hooks, &xnc->nlink_delta_hook); + + xfarray_destroy(xnc->nlinks); + xnc->nlinks = NULL; + + xchk_iscan_finish(&xnc->collect_iscan); + mutex_destroy(&xnc->lock); + xnc->sc = NULL; +} + +/* + * Scan all inodes in the entire filesystem to generate link count data. If + * the scan is successful, the counts will be left alive for a repair. If any + * error occurs, we'll tear everything down. + * + * This function itself only initializes the scan state: the mutex, the + * collection iscan, the shadow array and the live update hook. The inode + * walk is performed afterwards by xchk_nlinks_collect. + */ +STATIC int +xchk_nlinks_setup_scan( + struct xfs_scrub *sc, + struct xchk_nlink_ctrs *xnc) +{ + int error; + + ASSERT(xnc->sc == NULL); + xnc->sc = sc; + + mutex_init(&xnc->lock); + xnc->collect_iscan.iget_tries = 20; + xnc->collect_iscan.iget_retry_delay = HZ / 10; + xchk_iscan_start(&xnc->collect_iscan); + + error = xfarray_create(sc->mp, "file link counts", + sizeof(struct xchk_nlink), &xnc->nlinks); + if (error) + goto out_teardown; + + /* + * Hook into the bumplink/droplink code. The hook only triggers for + * inodes that were already scanned, and the scanner thread takes each + * inode's ILOCK, which means that any in-progress inode updates will + * finish before we can scan the inode.
+ */ + error = xfs_hook_add(&sc->mp->m_nlink_delta_hooks, + &xnc->nlink_delta_hook, xchk_nlinks_live_update); + if (error) + goto out_teardown; + + /* Use deferred cleanup to pass the inode link count data to repair. */ + sc->buf_cleanup = (void (*)(void *))xchk_nlinks_teardown_scan; + return 0; + +out_teardown: + xchk_nlinks_teardown_scan(xnc); + return error; +} + +/* + * Scrub the link count of all inodes on the filesystem. + * + * Uses the xchk_nlink_ctrs that xchk_setup_nlinks allocated (zeroed) in + * sc->buf. Runs three steps in order: set up the live scan, collect the + * observed link counts, and compare them against each inode. The error from + * the collect and compare steps is passed through xchk_xref_process_error, + * and the function returns early when that helper returns false. + */ +int +xchk_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error = 0; + + /* Set ourselves up to check link counts on the live filesystem. */ + error = xchk_nlinks_setup_scan(sc, xnc); + if (error) + return error; + + /* Walk all inodes, picking up link count information. */ + error = xchk_nlinks_collect(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Compare link counts. */ + error = xchk_nlinks_compare(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + return 0; +} diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h new file mode 100644 index 000000000000..0ece2ab5dd38 --- /dev/null +++ b/fs/xfs/scrub/nlinks.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_NLINKS_H__ +#define __XFS_SCRUB_NLINKS_H__ + +/* + * Live link count control structure. One of these is allocated, zeroed, into + * sc->buf by xchk_setup_nlinks and initialized by xchk_nlinks_setup_scan. + */ +struct xchk_nlink_ctrs { + struct xfs_scrub *sc; + + /* + * Shadow link count data and its mutex. Records are struct xchk_nlink + * entries indexed by inode number. + */ + struct xfarray *nlinks; + struct mutex lock; + + /* + * The collection step uses a separate iscan context from the compare + * step because the collection iscan coordinates live updates to the + * observation data while this scanner is running. The compare iscan + * is secondary and can be reinitialized as needed.
+ */ + struct xchk_iscan collect_iscan; + struct xchk_iscan compare_iscan; + + /* + * Hook into bumplink/droplink so that we can receive live updates + * from other writer threads. + */ + struct notifier_block nlink_delta_hook; +}; + +struct xchk_nlink { /* one shadow record per inode number */ + /* Links from a parent directory to this inode. */ + xfs_nlink_t parent; + + /* + * Links from children of this inode (e.g. dot and dotdot). The + * directory walk counts a directory's own dot entry here, as well as + * each dotdot entry that points at this inode. + */ + xfs_nlink_t child; + + /* Record state flags (XCHK_NLINK_*) */ + unsigned int flags; +}; + +/* This data item was seen by the check-time compare function. */ +#define XCHK_NLINK_COMPARE_SCANNED (1U << 0) + +/* + * Compute total link count, using large enough variables to detect overflow. + * The sum of the parent and child counts is formed in a uint64_t. + */ +static inline uint64_t +xchk_nlink_total(const struct xchk_nlink *live) +{ + uint64_t ret = live->parent; + + return ret + live->child; +} + +#endif /* __XFS_SCRUB_NLINKS_H__ */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index e60f2e71dac0..efd44b31997d 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -393,6 +393,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .has = xfs_has_rtreflink, .repair = xrep_rtrefcountbt, }, + [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */ + .type = ST_FS, + .setup = xchk_setup_nlinks, + .scrub = xchk_nlinks, + .repair = xrep_notsupported, + }, }; /* This isn't a stable feature, warn once per day.
*/ diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index f0a5acf7df48..d9ad8049c84d 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -201,6 +201,7 @@ xchk_quotacheck(struct xfs_scrub *sc) } #endif int xchk_fscounters(struct xfs_scrub *sc); +int xchk_nlinks(struct xfs_scrub *sc); /* cross-referencing helpers */ void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno, diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 2deee7fabb96..988b54eb2ebd 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -19,6 +19,7 @@ #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/iscan.h" +#include "scrub/nlinks.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 10c43e03e133..0ee0c7fd7400 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -21,6 +21,7 @@ struct xfs_scrub; struct xfile; struct xfarray; struct xchk_iscan; +struct xchk_nlink; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -97,6 +98,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); { XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \ { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" }, \ { XFS_SCRUB_TYPE_RTREFCBT, "rtrefcountbt" }, \ + { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \ { XFS_SCRUB_TYPE_BARRIER, "barrier" } #define XFS_SCRUB_FLAG_STRINGS \ @@ -990,6 +992,136 @@ TRACE_EVENT(xchk_iscan_iget, __entry->cursor, __entry->visited, __entry->error) ); +TRACE_EVENT(xchk_nlinks_walk_dir, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + bool parent_to_child, xfs_ino_t ino, const char *name, + unsigned int namelen), + TP_ARGS(mp, dp, parent_to_child, ino, name, namelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(bool, parent) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + 
__entry->dir = dp->i_ino; + __entry->parent = parent_to_child; + __entry->ino = ino; + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + ), + TP_printk("dev %d:%d dir 0x%llx parent_to_child? %d ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->parent, + __entry->ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_metafile, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino), + TP_ARGS(mp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +); + +TRACE_EVENT(xchk_nlinks_live_update, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + bool parent_to_child, xfs_ino_t ino, int delta), + TP_ARGS(mp, dp, parent_to_child, ino, delta), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(bool, parent) + __field(xfs_ino_t, ino) + __field(int, delta) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp ? dp->i_ino : NULLFSINO; + __entry->parent = parent_to_child; + __entry->ino = ino; + __entry->delta = delta; + ), + TP_printk("dev %d:%d dir 0x%llx parent_to_child? 
%d ino 0x%llx nlink_delta %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->parent, + __entry->ino, + __entry->delta) +); + +TRACE_EVENT(xchk_nlinks_check_zero, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live), + TP_ARGS(mp, ino, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parent) + __field(xfs_nlink_t, child) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parent = live->parent; + __entry->child = live->child; + ), + TP_printk("dev %d:%d ino 0x%llx parent_links %u child_links %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent, + __entry->child) +); + +DECLARE_EVENT_CLASS(xchk_nlink_diff_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, + const struct xchk_nlink *live), + TP_ARGS(mp, ip, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(umode_t, mode) + __field(xfs_nlink_t, nlink) + __field(xfs_nlink_t, parent) + __field(xfs_nlink_t, child) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->mode = VFS_I(ip)->i_mode; + __entry->nlink = VFS_I(ip)->i_nlink; + __entry->parent = live->parent; + __entry->child = live->child; + ), + TP_printk("dev %d:%d ino 0x%llx dir? 
%d nlink %u parent_links %u child_links %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + !!S_ISDIR(__entry->mode), + __entry->nlink, + __entry->parent, + __entry->child) +); +#define DEFINE_SCRUB_NLINK_DIFF_EVENT(name) \ +DEFINE_EVENT(xchk_nlink_diff_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \ + const struct xchk_nlink *live), \ + TP_ARGS(mp, ip, live)) +DEFINE_SCRUB_NLINK_DIFF_EVENT(xchk_nlinks_compare_inode); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 077acfaf8e1d..c2f854079f33 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -678,6 +678,47 @@ xfs_icreate_dqalloc( flags, udqpp, gdqpp, pdqpp); } +#ifdef CONFIG_XFS_LIVE_HOOKS +static inline void +xfs_nlink_delta( + struct xfs_inode *dp, + struct xfs_inode *ip, + enum xfs_nlink_delta_type type, + int delta) +{ + struct xfs_nlink_delta_params p; + struct xfs_mount *mp = ip->i_mount; + + p.dp = dp; + p.ino = ip->i_ino; + p.delta = delta; + + xfs_hook_call(&mp->m_nlink_delta_hooks, type, &p); +} + +/* Call a hook to capture nlink updates in real time. */ +static inline void +xfs_nlink_child_delta( + struct xfs_inode *dp, + struct xfs_inode *ip, + int delta) +{ + xfs_nlink_delta(dp, ip, XFS_CHILD_NLINK_DELTA, delta); +} + +/* Call a hook to capture nlink updates in real time. */ +void +xfs_nlink_parent_delta( + struct xfs_inode *dp, + struct xfs_inode *ip, + int delta) +{ + xfs_nlink_delta(dp, ip, XFS_PARENT_NLINK_DELTA, delta); +} +#else +# define xfs_nlink_child_delta(dp, ip, delta) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + int xfs_create( struct xfs_inode *dp, @@ -770,6 +811,16 @@ xfs_create( goto out_trans_cancel; /* + * Create ip with a reference from dp, and add '.' and '..' references + * if it's a directory. 
+ */ + xfs_nlink_parent_delta(dp, ip, 1); + if (is_dir) { + xfs_nlink_child_delta(ip, ip, 1); + xfs_nlink_child_delta(ip, dp, 1); + } + + /* * If this is a synchronous mount, make sure that the * create transaction goes to disk before returning to * the user. @@ -1064,6 +1115,7 @@ xfs_link( error = xfs_dir_link_existing_child(tp, resblks, tdp, target_name, sip); if (error) goto error_return; + xfs_nlink_parent_delta(tdp, sip, 1); /* * If this is a synchronous mount, make sure that the @@ -2205,6 +2257,16 @@ xfs_remove( goto out_trans_cancel; /* + * Drop the link from dp to ip, and if ip was a directory, remove the + * '.' and '..' references since we freed the directory. + */ + xfs_nlink_parent_delta(dp, ip, -1); + if (S_ISDIR(VFS_I(ip)->i_mode)) { + xfs_nlink_child_delta(ip, dp, -1); + xfs_nlink_child_delta(ip, ip, -1); + } + + /* * If this is a synchronous mount, make sure that the * remove transaction goes to disk before returning to * the user. @@ -2331,6 +2393,72 @@ xfs_rename_alloc_whiteout( return 0; } +#ifdef CONFIG_XFS_LIVE_HOOKS +static inline void +xfs_rename_call_nlink_hooks( + struct xfs_inode *src_dp, + struct xfs_inode *src_ip, + struct xfs_inode *target_dp, + struct xfs_inode *target_ip, + struct xfs_inode *wip, + unsigned int flags) +{ + /* If we added a whiteout, add the reference from src_dp. */ + if (wip) + xfs_nlink_parent_delta(src_dp, wip, 1); + + /* Move the src_ip reference from src_dp to target_dp. */ + xfs_nlink_parent_delta(src_dp, src_ip, -1); + xfs_nlink_parent_delta(target_dp, src_ip, 1); + + /* + * If src_ip is a dir, move its '..' reference from src_dp to + * target_dp. + */ + if (S_ISDIR(VFS_I(src_ip)->i_mode)) { + xfs_nlink_child_delta(src_ip, src_dp, -1); + xfs_nlink_child_delta(src_ip, target_dp, 1); + } + + if (!target_ip) + return; + + if (flags & RENAME_EXCHANGE) { + /* Move the target_ip reference from target_dp to src_dp. 
*/ + xfs_nlink_parent_delta(target_dp, target_ip, -1); + xfs_nlink_parent_delta(src_dp, target_ip, 1); + + /* + * If target_ip is a dir, move its '..' reference from + * target_dp to src_dp. + */ + if (S_ISDIR(VFS_I(target_ip)->i_mode)) { + xfs_nlink_child_delta(target_ip, target_dp, -1); + xfs_nlink_child_delta(target_ip, src_dp, 1); + } + + return; + } + + /* Drop target_ip's reference from target_dp. */ + xfs_nlink_parent_delta(target_dp, target_ip, -1); + + if (!S_ISDIR(VFS_I(target_ip)->i_mode)) + return; + + /* + * If target_ip was a dir, drop the '.' and '..' references since that + * was the last reference. + */ + ASSERT(VFS_I(target_ip)->i_nlink == 0); + xfs_nlink_child_delta(target_ip, target_dp, -1); + xfs_nlink_child_delta(target_ip, target_ip, -1); +} +#else +# define xfs_rename_call_nlink_hooks(src_dp, src_ip, target_dp, \ + target_ip, wip, flags) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + /* * xfs_rename */ @@ -2470,6 +2598,9 @@ xfs_rename( VFS_I(wip)->i_state &= ~I_LINKABLE; } + xfs_rename_call_nlink_hooks(src_dp, src_ip, target_dp, target_ip, wip, + flags); + error = xfs_finish_rename(tp); if (wip) xfs_irele(wip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8dac6fd2760a..2eb8389762ef 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -539,4 +539,26 @@ int xfs_icreate_dqalloc(const struct xfs_icreate_args *args, int xfs_file_cow_around(struct xfs_inode *ip, loff_t pos, long long int count); +/* + * Parameters for tracking bumplink and droplink operations. The hook + * function arg parameter is one of these. 
+ */ +enum xfs_nlink_delta_type { + XFS_PARENT_NLINK_DELTA, /* parent pointing to child */ + XFS_CHILD_NLINK_DELTA, /* child pointing to parent */ +}; + +struct xfs_nlink_delta_params { + struct xfs_inode *dp; + xfs_ino_t ino; + int delta; +}; + +#ifdef CONFIG_XFS_LIVE_HOOKS +void xfs_nlink_parent_delta(struct xfs_inode *dp, struct xfs_inode *ip, + int delta); +#else +# define xfs_nlink_parent_delta(dp, ip, delta) ((void)0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index fb4f170ce21b..5943f8f100a7 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -273,6 +273,9 @@ typedef struct xfs_mount { * while a repair freeze is in progress. */ struct mutex m_scrub_freeze; + + /* online file link count check stuff */ + struct xfs_hook_chain m_nlink_delta_hooks; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1edacacb4807..879e56cd8498 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1992,6 +1992,8 @@ static int xfs_init_fs_context( mp->m_logbsize = -1; mp->m_allocsize_log = 16; /* 64k */ + xfs_hook_init(&mp->m_nlink_delta_hooks); + /* * Copy binary VFS mount flags we are interested in. */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 177987c3716f..42f0bd0c26c7 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -332,6 +332,7 @@ xfs_symlink( goto out_trans_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + xfs_nlink_parent_delta(dp, ip, 1); /* * If this is a synchronous mount, make sure that the |