// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_ag.h"
#include "xfs_error.h"
#include "xfs_bit.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/iscan.h"
#include "scrub/common.h"
#include "scrub/trace.h"

/*
 * Live File Scan
 * ==============
 *
 * Live file scans walk every inode in a live filesystem.  This is more or
 * less like a regular iwalk, except that when we're advancing the scan
 * cursor, we must ensure that inodes cannot be added or deleted anywhere
 * between the old cursor value and the new cursor value.  If we're advancing
 * the cursor by one inode, the caller must hold that inode; if we're finding
 * the next inode to scan, we must grab the AGI and hold it until we've
 * updated the scan cursor.
 *
 * Callers are expected to use this code to scan all files in the filesystem
 * to construct a new metadata index of some kind.  The scan races against
 * other live updates, which means there must be a provision to update the
 * new index when updates are made to inodes that have already been scanned.
 * The iscan lock can be used in live update hook code to stop the scan and
 * protect this data structure.
 *
 * To keep the new index up to date with other metadata updates being made to
 * the live filesystem, it is assumed that the caller will add hooks as needed
 * to be notified when a metadata update occurs.  The inode scanner must tell
 * the hook code when an inode has been visited with xchk_iscan_mark_visited.
 * Hook functions can use xchk_iscan_want_live_update to decide if the
 * scanner's observations must be updated.
 */
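/*
 * A minimal sketch of the calling convention described above.  The
 * xchk_example_build_index() helper is hypothetical (it is not part of this
 * file) and stands in for whatever indexing work the caller performs; the
 * timeout and retry delay values are arbitrary example numbers:
 *
 *	struct xchk_iscan	iscan;
 *	struct xfs_inode	*ip;
 *	int			error;
 *
 *	xchk_iscan_start(&iscan, 5000, 500);
 *	while ((error = xchk_iscan_iter(sc, &iscan, &ip)) == 1) {
 *		xfs_ilock(ip, XFS_ILOCK_EXCL);
 *		error = xchk_example_build_index(sc, ip);
 *		xchk_iscan_mark_visited(&iscan, ip);
 *		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *		xfs_irele(ip);
 *		if (error)
 *			break;
 *	}
 *	xchk_iscan_finish(&iscan);
 */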
/*
 * Set the bits in @irec's free mask that correspond to the inodes before
 * @agino so that we skip them.  This is how we restart an inode walk that was
 * interrupted in the middle of an inode record.
 */
STATIC void
xchk_iscan_adjust_start(
	xfs_agino_t			agino,	/* starting inode of chunk */
	struct xfs_inobt_rec_incore	*irec)	/* btree record */
{
	int				idx;	/* index into inode chunk */

	idx = agino - irec->ir_startino;
	irec->ir_free |= xfs_inobt_maskn(0, idx);
	irec->ir_freecount = hweight64(irec->ir_free);
}
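/*
 * Worked example with made-up numbers: if irec->ir_startino is 128 and the
 * caller passes agino = 131, then idx = 3 and xfs_inobt_maskn(0, 3) sets
 * bits 0-2 in ir_free, marking inodes 128-130 "free".  A subsequent
 * xfs_lowbit64(~ir_free) search therefore cannot return an inode before
 * agino 131, and ir_freecount is recomputed as the population count of the
 * updated mask.
 */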
/*
 * Set *cursor to the next allocated inode after whatever it's set to now.
 * If there are no more inodes in this AG, cursor is set to NULLAGINO.
 */
STATIC int
xchk_iscan_find_next(
	struct xfs_scrub	*sc,
	struct xfs_buf		*agi_bp,
	struct xfs_perag	*pag,
	xfs_agino_t		*cursor)
{
	struct xfs_inobt_rec_incore	rec;
	struct xfs_btree_cur	*cur;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_trans	*tp = sc->tp;
	xfs_agnumber_t		agno = pag->pag_agno;
	xfs_agino_t		lastino = NULLAGINO;
	xfs_agino_t		first, last;
	xfs_agino_t		agino = *cursor;
	int			has_rec;
	int			error;

	/* If the cursor is beyond the end of this AG, move to the next one. */
	xfs_agino_range(mp, agno, &first, &last);
	if (agino > last) {
		*cursor = NULLAGINO;
		return 0;
	}

	/*
	 * Look up the inode chunk for the current cursor position.  If there
	 * is no chunk here, we want the next one.
	 */
	cur = xfs_inobt_init_cursor(mp, tp, agi_bp, pag, XFS_BTNUM_INO);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec);
	if (!error && !has_rec)
		error = xfs_btree_increment(cur, 0, &has_rec);
	for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) {
		/*
		 * If we've run out of inobt records in this AG, move the
		 * cursor on to the next AG and exit.  The caller can try
		 * again with the next AG.
		 */
		if (!has_rec) {
			*cursor = NULLAGINO;
			break;
		}

		error = xfs_inobt_get_rec(cur, &rec, &has_rec);
		if (error)
			break;
		if (!has_rec) {
			error = -EFSCORRUPTED;
			break;
		}

		/* Make sure that we always move forward. */
		if (lastino != NULLAGINO &&
		    XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) {
			error = -EFSCORRUPTED;
			break;
		}
		lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1;

		/*
		 * If this record only covers inodes that come before the
		 * cursor, advance to the next record.
		 */
		if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
			continue;

		/*
		 * If the incoming lookup put us in the middle of an inobt
		 * record, mark it and the previous inodes "free" so that the
		 * search for allocated inodes will start at the cursor.  Use
		 * funny math to avoid overflowing the bit shift.
		 */
		if (agino >= rec.ir_startino)
			xchk_iscan_adjust_start(agino + 1, &rec);

		/*
		 * If there are allocated inodes in this chunk, find them,
		 * and update the cursor.
		 */
		if (rec.ir_freecount < XFS_INODES_PER_CHUNK) {
			int	next = xfs_lowbit64(~rec.ir_free);

			*cursor = rec.ir_startino + next;
			break;
		}
	}

	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Prepare to return agno/agino to the iscan caller by moving the lastino
 * cursor to the previous inode.  Do this while we still hold the AGI so that
 * no other threads can create or delete inodes in this AG.
 */
static inline void
xchk_iscan_move_cursor(
	struct xfs_scrub	*sc,
	struct xchk_iscan	*iscan,
	xfs_agnumber_t		agno,
	xfs_agino_t		agino)
{
	struct xfs_mount	*mp = sc->mp;

	mutex_lock(&iscan->lock);
	iscan->cursor_ino = XFS_AGINO_TO_INO(mp, agno, agino);
	iscan->__visited_ino = iscan->cursor_ino - 1;
	trace_xchk_iscan_move_cursor(mp, iscan);
	mutex_unlock(&iscan->lock);
}

/*
 * Mark the inode scan complete by setting both cursors to NULLFSINO.  After
 * this point, xchk_iscan_want_live_update returns true for every inode, so
 * all future live updates will be applied to the new index.
 */
static inline void
xchk_iscan_finish_scan(
	struct xfs_scrub	*sc,
	struct xchk_iscan	*iscan)
{
	struct xfs_mount	*mp = sc->mp;

	mutex_lock(&iscan->lock);
	iscan->cursor_ino = NULLFSINO;

	/* All live updates will be applied from now on */
	iscan->__visited_ino = NULLFSINO;

	trace_xchk_iscan_move_cursor(mp, iscan);
	mutex_unlock(&iscan->lock);
}

/*
 * Advance ino to the next inode that the inobt thinks is allocated, being
 * careful to jump to the next AG if we've reached the right end of this AG's
 * inode btree.  Advancing ino effectively means that we've pushed the inode
 * scan forward, so set the iscan cursor to (ino - 1) so that our live update
 * predicates will track inode allocations in that part of the inode number
 * key space once we release the AGI buffer.
 *
 * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
 * -ECANCELED if the live scan aborted, or the usual negative errno.
 */
STATIC int
xchk_iscan_advance(
	struct xfs_scrub	*sc,
	struct xchk_iscan	*iscan,
	struct xfs_buf		**agi_bpp)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*agi_bp;
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;
	xfs_agino_t		agino;
	int			ret;

	ASSERT(iscan->cursor_ino >= iscan->__visited_ino);

	do {
		agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino);
		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			xchk_iscan_finish_scan(sc, iscan);
			return 0;
		}

		ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
		if (ret)
			goto out_pag;

		agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
		ret = xchk_iscan_find_next(sc, agi_bp, pag, &agino);
		if (ret)
			goto out_buf;

		if (agino != NULLAGINO)
			break;

		xchk_iscan_move_cursor(sc, iscan, agno + 1, 0);
		xfs_trans_brelse(sc->tp, agi_bp);
		xfs_perag_put(pag);

		if (xchk_iscan_aborted(iscan))
			return -ECANCELED;
	} while (1);

	xchk_iscan_move_cursor(sc, iscan, agno, agino);
	*agi_bpp = agi_bp;
	xfs_perag_put(pag);
	return 1;

out_buf:
	xfs_trans_brelse(sc->tp, agi_bp);
out_pag:
	xfs_perag_put(pag);
	return ret;
}

/*
 * Grabbing the inode failed, so we need to back up the scan and ask the
 * caller to try to _advance the scan again.  Returns -EBUSY if we've run out
 * of retry opportunities, -ECANCELED if the process has a fatal signal
 * pending, or -EAGAIN if we should try again.
 */
STATIC int
xchk_iscan_iget_retry(
	struct xfs_mount	*mp,
	struct xchk_iscan	*iscan,
	bool			wait)
{
	ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1);

	if (!iscan->iget_timeout ||
	    time_is_before_jiffies(iscan->__iget_deadline))
		return -EBUSY;

	if (wait) {
		unsigned long	relax;

		/*
		 * Sleep for a period of time to let the rest of the system
		 * catch up.  If we return early, someone sent a kill signal
		 * to the calling process.
		 */
		relax = msecs_to_jiffies(iscan->iget_retry_delay);
		trace_xchk_iscan_iget_retry_wait(mp, iscan);

		if (schedule_timeout_killable(relax) ||
		    xchk_iscan_aborted(iscan))
			return -ECANCELED;
	}

	iscan->cursor_ino--;
	return -EAGAIN;
}

/*
 * Grab an inode as part of an inode scan.  While scanning this inode, the
 * caller must ensure that no other threads can modify the inode until a call
 * to xchk_iscan_mark_visited succeeds.
 *
 * Returns 0 and an incore inode; -EAGAIN if the caller should call
 * xchk_iscan_advance again; -EBUSY if we couldn't grab an inode; -ECANCELED
 * if there's a fatal signal pending; or some other negative errno.
 */
STATIC int
xchk_iscan_iget(
	struct xfs_scrub	*sc,
	struct xchk_iscan	*iscan,
	struct xfs_buf		*agi_bp,
	struct xfs_inode	**ipp)
{
	struct xfs_mount	*mp = sc->mp;
	int			error;

	error = xfs_iget(sc->mp, sc->tp, iscan->cursor_ino, XFS_IGET_NOWAIT, 0,
			ipp);
	xfs_trans_brelse(sc->tp, agi_bp);

	trace_xchk_iscan_iget(mp, iscan, error);

	if (error == -ENOENT || error == -EAGAIN) {
		/*
		 * It's possible that this inode has lost all of its links but
		 * hasn't yet been inactivated.  If we don't have a
		 * transaction or it's not writable, flush the inodegc
		 * workers and wait.
		 */
		xfs_inodegc_flush(mp);
		return xchk_iscan_iget_retry(mp, iscan, true);
	}

	if (error == -EINVAL) {
		/*
		 * We thought the inode was allocated, but the inode btree
		 * lookup failed, which means that it was freed since the last
		 * time we advanced the cursor.  Back up and try again.  This
		 * should never happen since we still hold the AGI buffer from
		 * the inobt check, but we need to be careful about infinite
		 * loops.
		 */
		return xchk_iscan_iget_retry(mp, iscan, false);
	}

	return error;
}

/*
 * Advance the inode scan cursor to the next allocated inode and return the
 * incore inode structure associated with it.
 *
 * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
 * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not
 * be grabbed, or the usual negative errno.
 *
 * If the function returns -EBUSY and the caller can handle skipping an
 * inode, it may call this function again to continue the scan with the next
 * allocated inode.
 */
int
xchk_iscan_iter(
	struct xfs_scrub	*sc,
	struct xchk_iscan	*iscan,
	struct xfs_inode	**ipp)
{
	int			ret;

	if (iscan->iget_timeout)
		iscan->__iget_deadline = jiffies +
				msecs_to_jiffies(iscan->iget_timeout);

	do {
		struct xfs_buf	*agi_bp = NULL;

		ret = xchk_iscan_advance(sc, iscan, &agi_bp);
		if (ret != 1)
			return ret;

		if (xchk_iscan_aborted(iscan)) {
			xfs_trans_brelse(sc->tp, agi_bp);
			ret = -ECANCELED;
			break;
		}

		ret = xchk_iscan_iget(sc, iscan, agi_bp, ipp);
	} while (ret == -EAGAIN);

	if (!ret)
		return 1;

	return ret;
}
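/*
 * Sketch of a caller that chooses to skip inodes it could not grab; this
 * policy is an assumption about the caller, not something this file
 * requires.  Per the comment above, calling xchk_iscan_iter again after
 * -EBUSY resumes the scan at the next allocated inode:
 *
 *	while ((error = xchk_iscan_iter(sc, &iscan, &ip)) == 1 ||
 *	       error == -EBUSY) {
 *		if (error == -EBUSY)
 *			continue;
 *		...examine and release ip...
 *	}
 */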
/* Release inode scan resources. */
void
xchk_iscan_finish(
	struct xchk_iscan	*iscan)
{
	mutex_destroy(&iscan->lock);
	iscan->cursor_ino = NULLFSINO;
	iscan->__visited_ino = NULLFSINO;
}

/*
 * Set ourselves up to start an inode scan.  If the @iget_timeout and
 * @iget_retry_delay parameters are set, the scan will try to iget each inode
 * for @iget_timeout milliseconds.  If an iget call indicates that the inode
 * is waiting to be inactivated, the CPU will relax for @iget_retry_delay
 * milliseconds after pushing the inactivation workers.
 */
void
xchk_iscan_start(
	struct xchk_iscan	*iscan,
	unsigned int		iget_timeout,
	unsigned int		iget_retry_delay)
{
	clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
	iscan->iget_timeout = iget_timeout;
	iscan->iget_retry_delay = iget_retry_delay;
	iscan->__visited_ino = 0;
	iscan->cursor_ino = 0;
	mutex_init(&iscan->lock);
}

/*
 * Mark this inode as having been visited.  Callers must hold a sufficiently
 * exclusive lock on the inode to prevent concurrent modifications.
 */
void
xchk_iscan_mark_visited(
	struct xchk_iscan	*iscan,
	struct xfs_inode	*ip)
{
	mutex_lock(&iscan->lock);
	iscan->__visited_ino = ip->i_ino;
	trace_xchk_iscan_visit(ip->i_mount, iscan);
	mutex_unlock(&iscan->lock);
}

/*
 * Do we need a live update for this inode?  This is true if the scanner
 * thread has visited this inode and the scan hasn't been aborted due to
 * errors.  Callers must hold a sufficiently exclusive lock on the inode to
 * prevent scanners from reading any inode metadata.
 */
bool
xchk_iscan_want_live_update(
	struct xchk_iscan	*iscan,
	xfs_ino_t		ino)
{
	bool			ret;

	if (xchk_iscan_aborted(iscan))
		return false;

	mutex_lock(&iscan->lock);
	ret = iscan->__visited_ino >= ino;
	mutex_unlock(&iscan->lock);

	return ret;
}
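/*
 * Sketch of a live update hook body, assuming a hypothetical scan structure
 * that embeds the iscan and a caller-owned shadow index (neither is defined
 * in this file):
 *
 *	if (xchk_iscan_want_live_update(&scan->iscan, ip->i_ino)) {
 *		...apply this metadata update to the shadow index,
 *		   under the caller's own index lock...
 *	}
 *
 * The predicate is cheap enough to call from hook context: it takes the
 * iscan mutex only long enough to compare the inode number against the
 * scanner's visited cursor.
 */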