// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_bmap.h"
#include "xfs_quota.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_bmap_util.h"
#include "xfs_swapext.h"
#include "xfs_xchgrange.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/tempfile.h"
#include "scrub/tempswap.h"
#include "scrub/xfarray.h"
#include "scrub/xfblob.h"
#include "scrub/readdir.h"
#include "scrub/reap.h"
#include "scrub/parent.h"

/*
 * Directory Repair
 * ================
 *
 * We repair directories by reading the directory data blocks looking for
 * directory entries.  Salvaged entries are added to a private hidden temporary
 * dir without touching the link counts of the inodes found.  When we're done
 * salvaging, we rewrite the directory block owners and use an atomic extent
 * swap to commit the new directory blocks to the directory being repaired.
 * This will disrupt readdir cursors, but there's not much else we can do.
 */

/*
 * Directory entry to be restored in the new directory.  These fixed-size
 * records are staged in an xfarray; the name bytes are kept separately in an
 * xfblob and are found again through @name_cookie.
 */
struct xrep_dirent {
	/* Cookie for retrieval of the dirent name. */
	xfblob_cookie		name_cookie;

	/* Target inode number. */
	xfs_ino_t		ino;

	/* Hash of the dirent name, as computed by xfs_da_hashname(). */
	unsigned int		hash;

	/*
	 * Length of the dirent name, in bytes.  This is the length after the
	 * name has been cut short at the first NUL or '/' byte.
	 */
	uint8_t			namelen;

	/* File type of the dirent, derived from the target inode's mode. */
	uint8_t			ftype;
};

/*
 * In-core state of a directory repair.  The structure is allocated (zeroed)
 * by xrep_setup_directory() and reached through sc->buf.
 */
struct xrep_dir {
	/* Scrub context that this repair belongs to. */
	struct xfs_scrub	*sc;

	/*
	 * State for swapping the temporary directory's contents with the
	 * directory being repaired.
	 */
	struct xrep_tempswap	tx;

	/* Fixed-size array of xrep_dirent structures. */
	struct xfarray		*dir_entries;

	/* Blobs containing directory entry names. */
	struct xfblob		*dir_names;

	/*
	 * This is the parent that we're going to set on the reconstructed
	 * directory.
	 */
	xfs_ino_t		parent_ino;

	/*
	 * nlink value of the corrected directory.  Starts at 2 and is bumped
	 * once for every subdirectory entry added to the temporary directory.
	 */
	xfs_nlink_t		new_nlink;

	/*
	 * Preallocated args struct for performing dir operations.  It is
	 * reinitialized by xrep_dir_init_args() before each use.
	 */
	struct xfs_da_args	args;

	/* Directory entry name, plus the trailing null. */
	char			namebuf[MAXNAMELEN];
};

/*
 * Absorb up to 8 pages of dirents before we flush them to the temp dir.  The
 * threshold is compared against the combined size of the staged dirent
 * records and the staged name blobs.
 */
#define XREP_DIR_SALVAGE_BYTES	(PAGE_SIZE * 8)

/*
 * Prepare for a directory repair: create the temporary directory that will
 * receive the salvaged entries, then allocate the zeroed repair context and
 * attach it to sc->buf.  Returns 0 or a negative errno.
 */
int
xrep_setup_directory(
	struct xfs_scrub	*sc)
{
	int			ret;

	ret = xrep_tempfile_create(sc, S_IFDIR);
	if (ret)
		return ret;

	sc->buf = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
	return sc->buf ? 0 : -ENOMEM;
}

/*
 * Decide whether a raw directory entry is worth salvaging.  Entries that
 * point back at the directory being repaired, entries whose inode number
 * fails xfs_verify_dir_ino(), entries with an empty or oversized name, and
 * the dot entry are all rejected.  Returns true if the entry should be kept.
 */
STATIC int
xrep_dir_want_salvage(
	struct xrep_dir		*rd,
	const char		*name,
	int			namelen,
	xfs_ino_t		ino)
{
	struct xfs_scrub	*sc = rd->sc;

	/* Reject self references and implausible inode numbers. */
	if (ino == sc->ip->i_ino || !xfs_verify_dir_ino(sc->mp, ino))
		return false;

	/* Reject names that are empty or too long. */
	if (namelen <= 0 || namelen >= MAXNAMELEN)
		return false;

	/* Everything but the dot entry is a keeper. */
	return !(namelen == 1 && name[0] == '.');
}

/*
 * Stage one salvaged name -> inode mapping so that it can be added to the
 * temporary directory later.  Entries that turn out to be unusable (nothing
 * left of the name after truncation, the dotdot entry, or a target inode
 * that cannot be grabbed) are dropped without reporting an error.  Returns 0
 * or a negative errno.
 */
STATIC int
xrep_dir_salvage_entry(
	struct xrep_dir		*rd,
	unsigned char		*name,
	unsigned int		namelen,
	xfs_ino_t		ino)
{
	struct xrep_dirent	entry = {
		.ino		= ino,
	};
	struct xfs_scrub	*sc = rd->sc;
	struct xfs_inode	*target;
	unsigned int		len;
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	/*
	 * Keep only the part of the name that precedes the first NUL or
	 * slash, since either byte would trip namecheck.  If that leaves
	 * nothing, there is nothing to salvage.
	 */
	for (len = 0; len < namelen; len++) {
		if (name[len] == 0 || name[len] == '/')
			break;
	}
	if (!len)
		return 0;

	entry.namelen = len;
	entry.hash = xfs_da_hashname(name, entry.namelen);

	/* The new parent was already chosen, so '..' is only traced. */
	if (entry.namelen == 2 && name[0] == '.' && name[1] == '.') {
		trace_xrep_dir_salvaged_parent(sc->ip, ino);
		return 0;
	}

	trace_xrep_dir_salvage_entry(sc->ip, name, entry.namelen, ino);

	/*
	 * Grab the target so that we can compute the ftype; if we can't get
	 * the inode, skip the entry.  No inode lock is taken because an inode
	 * cannot change type while we hold a reference to it.
	 */
	if (xchk_iget(sc, ino, &target))
		return 0;

	entry.ftype = xfs_mode_to_ftype(VFS_I(target)->i_mode);
	xchk_irele(sc, target);

	/* Stash the name first, then the record that points to it. */
	error = xfblob_store(rd->dir_names, &entry.name_cookie, name,
			entry.namelen);
	if (error)
		return error;

	return xfarray_append(rd->dir_entries, &entry);
}

/*
 * Stage a shortform directory entry for later reinsertion, provided that it
 * passes the salvage filter.  Returns 0 or a negative errno.
 */
STATIC int
xrep_dir_salvage_sf_entry(
	struct xrep_dir			*rd,
	struct xfs_dir2_sf_hdr		*sfp,
	struct xfs_dir2_sf_entry	*sfep)
{
	xfs_ino_t			ino = xfs_dir2_sf_get_ino(rd->sc->mp,
								  sfp, sfep);

	if (xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
		return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen,
				ino);

	return 0;
}

/*
 * Stage a block/data format directory entry for later reinsertion, provided
 * that it passes the salvage filter.  Returns 0 or a negative errno.
 */
STATIC int
xrep_dir_salvage_data_entry(
	struct xrep_dir			*rd,
	struct xfs_dir2_data_entry	*dep)
{
	xfs_ino_t			ino = be64_to_cpu(dep->inumber);

	if (xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
		return xrep_dir_salvage_entry(rd, dep->name, dep->namelen,
				ino);

	return 0;
}

/*
 * Try to recover block/data format directory entries from one directory
 * block buffer.
 *
 * The data area is walked from the geometry's first entry offset up to the
 * smaller of the buffer length and the data end offset reported for the
 * block.  Free regions are skipped and every live entry that fits inside
 * that range is offered to the salvage code.  The contents are untrusted, so
 * parsing of the block stops as soon as a record cannot be stepped over.
 *
 * Returns 0 or a negative errno.
 */
STATIC int
xrep_dir_recover_data(
	struct xrep_dir		*rd,
	struct xfs_buf		*bp)
{
	struct xfs_da_geometry	*geo = rd->sc->mp->m_dir_geo;
	unsigned int		offset;
	unsigned int		end;
	int			error = 0;

	/*
	 * Loop over the data portion of the block.
	 * Each object is a real entry (dep) or an unused one (dup).
	 */
	offset = geo->data_entry_offset;
	end = min_t(unsigned int, BBTOB(bp->b_length),
			xfs_dir3_data_end_offset(geo, bp->b_addr));

	while (offset < end) {
		struct xfs_dir2_data_unused	*dup = bp->b_addr + offset;
		struct xfs_dir2_data_entry	*dep = bp->b_addr + offset;

		if (xchk_should_terminate(rd->sc, &error))
			return error;

		/* Skip unused entries. */
		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
			unsigned int	freelen = be16_to_cpu(dup->length);

			/*
			 * A free region of length zero would leave the cursor
			 * where it is and we would spin on this record
			 * forever.  There is no way to know where the next
			 * record starts, so give up on the rest of the block.
			 */
			if (freelen == 0)
				break;

			offset += freelen;
			continue;
		}

		/* Don't walk off the end of the block. */
		offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
		if (offset > end)
			break;

		/* Ok, let's save this entry. */
		error = xrep_dir_salvage_data_entry(rd, dep);
		if (error)
			return error;
	}

	return 0;
}

/* Try to recover shortform directory entries. */
STATIC int
xrep_dir_recover_sf(
	struct xrep_dir			*rd)
{
	struct xfs_dir2_sf_hdr		*sfp;
	struct xfs_dir2_sf_entry	*sfep;
	struct xfs_dir2_sf_entry	*next;
	struct xfs_ifork		*ifp;
	xfs_ino_t			ino;
	unsigned char			*end;
	int				error = 0;

	ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
	sfp = (struct xfs_dir2_sf_hdr *)rd->sc->ip->i_df.if_u1.if_data;
	end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes;

	ino = xfs_dir2_sf_get_parent_ino(sfp);
	trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);

	sfep = xfs_dir2_sf_firstentry(sfp);
	while ((unsigned char *)sfep < end) {
		if (xchk_should_terminate(rd->sc, &error))
			return error;

		next = xfs_dir2_sf_nextentry(rd->sc->mp, sfp, sfep);
		if ((unsigned char *)next > end)
			break;

		/* Ok, let's save this entry. */
		error = xrep_dir_salvage_sf_entry(rd, sfp, sfep);
		if (error)
			return error;

		sfep = next;
	}

	return 0;
}

/*
 * Use the data fork mappings and the directory size to guess the format of
 * this directory.  Being reasonably sure of the format lets us salvage
 * directory entries more aggressively.  @magic_guess is set to:
 *
 *  - DIR3_BLOCK_MAGIC if this looks like a "block format" directory;
 *  - DIR3_DATA_MAGIC if this looks like a "data format" directory;
 *  - 0 if we cannot tell.
 */
STATIC void
xrep_dir_guess_format(
	struct xrep_dir		*rd,
	__be32			*magic_guess)
{
	struct xfs_inode	*ip = rd->sc->ip;
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_da_geometry	*geo = rd->sc->mp->m_dir_geo;
	xfs_fileoff_t		last;

	ASSERT(xfs_has_crc(mp));

	*magic_guess = 0;

	/*
	 * One directory block's worth of mappings and a directory size of
	 * exactly one directory block means single block format.
	 */
	if (!xfs_bmap_last_offset(ip, &last, XFS_DATA_FORK) &&
	    ip->i_disk_size == geo->blksize &&
	    XFS_FSB_TO_B(mp, last) == geo->blksize) {
		*magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
		return;
	}

	/*
	 * Otherwise, look at where the mappings below the leaf offset end.
	 * If that matches the directory size and is more than one directory
	 * block, this is a data format directory.
	 */
	last = geo->leafblk;
	if (xfs_bmap_last_before(rd->sc->tp, ip, &last, XFS_DATA_FORK))
		return;

	if (XFS_FSB_TO_B(mp, last) > geo->blksize &&
	    XFS_FSB_TO_B(mp, last) == ip->i_disk_size)
		*magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
}

/*
 * Recover directory entries from the directory block at @dabno.
 *
 * If the caller supplied a nonzero @magic_guess, the block header's magic is
 * temporarily overwritten with the guess and the block is salvaged
 * unconditionally.  Otherwise the block is only salvaged if it carries a
 * known block/data magic and passes the matching structure and header checks.
 * The original magic is always put back before the buffer is released.
 *
 * Returns 0 (including when the block is a hole or is skipped) or a negative
 * errno.
 */
STATIC int
xrep_dir_recover_dirblock(
	struct xrep_dir		*rd,
	__be32			magic_guess,
	xfs_dablk_t		dabno)
{
	struct xfs_scrub	*sc = rd->sc;
	struct xfs_dir2_data_hdr *hdr;
	struct xfs_buf		*bp;
	__be32			saved_magic;
	int			error;

	/*
	 * Try to read buffer.  We invalidate them in the next step so we don't
	 * bother to set a buffer type or ops.
	 */
	error = xfs_da_read_buf(sc->tp, sc->ip, dabno, XFS_DABUF_MAP_HOLE_OK,
			&bp, XFS_DATA_FORK, NULL);
	if (error)
		return error;
	if (!bp)
		return 0;

	hdr = bp->b_addr;
	saved_magic = hdr->magic;

	trace_xrep_dir_recover_dirblock(sc->ip, dabno,
			be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));

	if (magic_guess) {
		/*
		 * We're sure of the format, so salvage the block as if it
		 * had the magic number that we guessed.
		 */
		hdr->magic = magic_guess;
	} else {
		/*
		 * We couldn't guess the format, so only trust blocks whose
		 * magic we recognize and that pass the verifiers.
		 */
		switch (hdr->magic) {
		case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
		case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
			if (!xrep_buf_verify_struct(bp,
					&xfs_dir3_block_buf_ops) ||
			    xfs_dir3_block_header_check(bp, sc->ip->i_ino))
				goto out_restore;
			break;
		case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
		case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
			if (!xrep_buf_verify_struct(bp,
					&xfs_dir3_data_buf_ops) ||
			    xfs_dir3_data_header_check(bp, sc->ip->i_ino))
				goto out_restore;
			break;
		default:
			goto out_restore;
		}
	}

	error = xrep_dir_recover_data(rd, bp);

out_restore:
	hdr->magic = saved_magic;
	xfs_trans_brelse(sc->tp, bp);
	return error;
}

static inline void xrep_dir_init_args(struct xrep_dir *rd)
{
	memset(&rd->args, 0, sizeof(struct xfs_da_args));
	rd->args.geo = rd->sc->mp->m_dir_geo;
	rd->args.whichfork = XFS_DATA_FORK;
	rd->args.owner = rd->sc->ip->i_ino;
	rd->args.trans = rd->sc->tp;
}

/*
 * Enter a name in directory @dp, picking the add routine that matches the
 * directory's current format (shortform, block, leaf, or node).
 *
 * A nonzero @inum is validated before anything else happens.
 * NOTE(review): this function only skips validation when @inum is 0; it does
 * not set any "check space only" flag of its own -- confirm that no caller
 * relies on a zero @inum performing a space check.
 *
 * Returns 0 or a negative errno.
 */
STATIC int
xrep_dir_createname(
	struct xrep_dir		*rd,
	struct xfs_inode	*dp,
	const struct xfs_name	*name,
	xfs_ino_t		inum,
	xfs_extlen_t		total)
{
	struct xfs_da_args	*args = &rd->args;
	struct xfs_mount	*mp = rd->sc->mp;
	bool			block_fmt;
	bool			leaf_fmt;
	int			error;

	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));

	if (inum) {
		error = xfs_dir_ino_validate(mp, inum);
		if (error)
			return error;
	}

	/* Describe the new entry. */
	xrep_dir_init_args(rd);
	args->dp = dp;
	args->inumber = inum;
	args->total = total;
	args->name = name->name;
	args->namelen = name->len;
	args->filetype = name->type;
	args->hashval = xfs_dir2_hashname(mp, name);
	args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;

	/* Dispatch on the directory format. */
	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
		return xfs_dir2_sf_addname(args);

	error = xfs_dir2_isblock(args, &block_fmt);
	if (error)
		return error;
	if (block_fmt)
		return xfs_dir2_block_addname(args);

	error = xfs_dir2_isleaf(args, &leaf_fmt);
	if (error)
		return error;
	if (leaf_fmt)
		return xfs_dir2_leaf_addname(args);

	return xfs_dir2_node_addname(args);
}

/*
 * Insert one salvaged dir entry into the temporary directory.
 *
 * The name is loaded from the name blob into rd->namebuf, a transaction with
 * a link-sized block reservation is allocated, and the temporary directory is
 * locked and joined to that transaction.  The entry is only added if a lookup
 * of the name fails with -ENOENT; if the name is already present, the
 * transaction is cancelled and 0 is returned, so duplicate names are silently
 * skipped.  Entries of type XFS_DIR3_FT_DIR bump rd->new_nlink.
 *
 * Returns 0 or a negative errno.
 */
STATIC int
xrep_dir_insert_rec(
	struct xrep_dir			*rd,
	const struct xrep_dirent	*entry)
{
	struct xfs_name			name = {
		.len			= entry->namelen,
		.type			= entry->ftype,
		.name			= rd->namebuf,
	};
	struct xfs_mount		*mp = rd->sc->mp;
	char				*namebuf = rd->namebuf;
	xfs_ino_t			ino;
	uint				resblks;
	int				error;

	/* The entry name is stored in the in-core buffer. */
	error = xfblob_load(rd->dir_names, entry->name_cookie, namebuf,
			entry->namelen);
	if (error)
		return error;
	/* Only the last byte of the buffer is cleared; name.len bounds use. */
	namebuf[MAXNAMELEN - 1] = 0;

	trace_xrep_dir_insert_rec(rd->sc->tempip, &name, entry->ino);

	error = xfs_qm_dqattach(rd->sc->tempip);
	if (error)
		return error;

	resblks = XFS_LINK_SPACE_RES(mp, entry->namelen);
	error = xchk_trans_alloc(rd->sc, resblks);
	if (error)
		return error;

	/*
	 * Lock the temporary directory and join it to the transaction, and
	 * make sure this filename is unique before we add it.
	 */
	xrep_tempfile_ilock(rd->sc);
	xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);

	/*
	 * Anything other than -ENOENT ends the insertion: a real error is
	 * passed up, and a successful lookup (error == 0) means the name is
	 * already there, so we back out and report success.
	 */
	error = xchk_dir_lookup(rd->sc, rd->sc->tempip, &name, &ino);
	if (error != -ENOENT)
		goto out_cancel;

	error = xrep_dir_createname(rd, rd->sc->tempip, &name, entry->ino,
			resblks);
	if (error)
		goto out_cancel;

	if (name.type == XFS_DIR3_FT_DIR)
		rd->new_nlink++;

	/*
	 * Commit and unlock.
	 *
	 * NOTE(review): if the commit fails we return with the tempfile ILOCK
	 * still held; presumably scrub teardown drops it -- confirm.
	 */
	error = xrep_trans_commit(rd->sc);
	if (error)
		return error;

	xrep_tempfile_iunlock(rd->sc);
	return 0;
out_cancel:
	xchk_trans_cancel(rd->sc);
	xrep_tempfile_iunlock(rd->sc);
	return error;
}

/*
 * Periodically flush salvaged directory entries to the temporary file.  This
 * is done to reduce the memory requirements of the directory rebuild, since
 * directories can contain up to 32GB of directory data.
 *
 * On success, both staging structures have been emptied and the scrub context
 * holds a freshly allocated transaction and ILOCK_EXCL on the directory being
 * repaired.  Returns 0 or a negative errno.
 */
STATIC int
xrep_dir_flush_salvaged(
	struct xrep_dir		*rd)
{
	xfarray_idx_t		array_cur;
	int			error;

	/*
	 * Entering this function, the scrub context has a reference to the
	 * inode being repaired, the temporary file, and a scrub transaction
	 * that we use during dirent salvaging to avoid livelocking if there
	 * are cycles in the directory structures.  We hold ILOCK_EXCL on both
	 * the inode being repaired and the temporary file, though they are
	 * not ijoined to the scrub transaction.
	 *
	 * To constrain kernel memory use, we occasionally write salvaged
	 * dirents from the xfarray and xfblob structures into the temporary
	 * directory in preparation for swapping the directory structures at
	 * the end.  Updating the temporary file requires a transaction, so we
	 * commit the scrub transaction and drop the two ILOCKs so that
	 * we can allocate whatever transaction we want.
	 *
	 * We still hold IOLOCK_EXCL on the inode being repaired, which
	 * prevents anyone from accessing the damaged directory data while we
	 * repair it.
	 *
	 * NOTE(review): the text above speaks of two ILOCKs, but the only
	 * ILOCK that is visibly dropped and retaken in this function is the
	 * one on the inode being repaired (xchk_iunlock/xchk_ilock); the
	 * tempfile ILOCK is cycled per entry in xrep_dir_insert_rec() --
	 * confirm which description is current.
	 */
	error = xrep_trans_commit(rd->sc);
	if (error)
		return error;
	xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);

	/*
	 * Take the IOLOCK of the temporary file while we modify dirents.  This
	 * isn't strictly required because the temporary file is never revealed
	 * to userspace, but we follow the same locking rules.
	 *
	 * The lock is polled with a short delay so that a pending termination
	 * request can break us out of the wait.
	 */
	while (!xrep_tempfile_iolock_nowait(rd->sc)) {
		if (xchk_should_terminate(rd->sc, &error))
			return error;
		delay(1);
	}

	/*
	 * Add all the salvaged dirents to the temporary directory.
	 *
	 * NOTE(review): the error returns inside this loop leave the tempfile
	 * IOLOCK held; presumably scrub teardown drops it -- confirm.
	 */
	foreach_xfarray_idx(rd->dir_entries, array_cur) {
		struct xrep_dirent	entry;

		error = xfarray_load(rd->dir_entries, array_cur, &entry);
		if (error)
			return error;

		error = xrep_dir_insert_rec(rd, &entry);
		if (error)
			return error;
	}
	xrep_tempfile_iounlock(rd->sc);

	/* Empty out both arrays now that we've added the entries. */
	xfarray_truncate(rd->dir_entries);
	xfblob_truncate(rd->dir_names);

	/*
	 * Recreate the salvage transaction (no block reservation) and retake
	 * the ILOCK on the inode being repaired.
	 */
	error = xchk_trans_alloc(rd->sc, 0);
	if (error)
		return error;
	xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
	return 0;
}

/* Extract as many directory entries as we can. */
STATIC int
xrep_dir_recover(
	struct xrep_dir		*rd)
{
	struct xfs_bmbt_irec	got;
	struct xfs_scrub	*sc = rd->sc;
	struct xfs_da_geometry	*geo = sc->mp->m_dir_geo;
	xfs_fileoff_t		offset;
	xfs_dablk_t		dabno;
	__be32			magic_guess;
	int			nmap;
	int			error;

	xrep_dir_guess_format(rd, &magic_guess);

	/* Iterate each directory data block in the data fork. */
	for (offset = 0;
	     offset < geo->leafblk;
	     offset = got.br_startoff + got.br_blockcount) {
		nmap = 1;
		error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
				&got, &nmap, 0);
		if (error)
			return error;
		if (nmap != 1)
			return -EFSCORRUPTED;
		if (!xfs_bmap_is_written_extent(&got))
			continue;

		for (dabno = round_up(got.br_startoff, geo->fsbcount);
		     dabno < got.br_startoff + got.br_blockcount;
		     dabno += geo->fsbcount) {
			if (xchk_should_terminate(rd->sc, &error))
				return error;

			error = xrep_dir_recover_dirblock(rd,
					magic_guess, dabno);
			if (error)
				return error;

			/* Flush dirents to constrain memory usage. */
			if (xfarray_bytes(rd->dir_entries) +
			    xfblob_bytes(rd->dir_names) <
			    XREP_DIR_SALVAGE_BYTES)
				continue;

			error = xrep_dir_flush_salvaged(rd);
			if (error)
				return error;
		}
	}

	return 0;
}

/*
 * Find all the directory entries for this inode by scraping them out of the
 * directory leaf blocks by hand, and flushing them into the temp dir.
 */
STATIC int
xrep_dir_find_entries(
	struct xrep_dir		*rd)
{
	struct xfs_inode	*ip = rd->sc->ip;
	int			error;

	error = xrep_ino_dqattach(rd->sc);
	if (error)
		return error;

	/*
	 * Salvage directory entries from the old directory, and write them to
	 * the temporary directory.
	 */
	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
		error = xrep_dir_recover_sf(rd);
	} else {
		error = xfs_iread_extents(rd->sc->tp, ip, XFS_DATA_FORK);
		if (error)
			return error;

		error = xrep_dir_recover(rd);
	}
	if (error)
		return error;

	return xrep_dir_flush_salvaged(rd);
}

/*
 * Free all the directory blocks of the temporary directory and reset its
 * data fork to an empty shortform directory whose parent is @parent_ino.
 * The caller must join the inode to the transaction.  This function returns
 * with the inode joined to a clean scrub transaction.
 */
STATIC int
xrep_dir_reset_fork(
	struct xrep_dir		*rd,
	xfs_ino_t		parent_ino)
{
	struct xfs_scrub	*sc = rd->sc;
	struct xfs_inode	*tempip = sc->tempip;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(tempip, XFS_DATA_FORK);
	int			error;

	/* Reap the blocks mapped by the data fork, if there are any. */
	if (xfs_ifork_has_extents(ifp)) {
		error = xrep_reap_ifork(sc, tempip, XFS_DATA_FORK);
		if (error)
			return error;
	}

	trace_xrep_dir_reset_fork(tempip, parent_ino);

	/* Throw away the incore fork contents and zero the sizes. */
	xfs_idestroy_fork(ifp);
	ifp->if_bytes = 0;
	tempip->i_disk_size = 0;

	/* Build a brand new shortform directory in the empty fork. */
	xrep_dir_init_args(rd);
	rd->args.dp = tempip;
	error = xfs_dir2_sf_create(&rd->args, parent_ino);
	if (error)
		return error;

	return xrep_tempfile_roll_trans(sc);
}

/*
 * Get both inodes' data forks ready for the extent swap.  A shortform
 * tempfile directory (@temp_local) is converted to block format, and a
 * shortform data fork in the file being repaired (@ip_local) is replaced by
 * an empty extent list.  Returns 0 or a negative errno.
 */
STATIC int
xrep_dir_swap_prep(
	struct xfs_scrub	*sc,
	bool			temp_local,
	bool			ip_local)
{
	struct xfs_ifork	*ifp;
	int			error;

	/*
	 * The atomic extent swap works on mapped blocks, so a shortform
	 * tempfile directory has to be turned into a single directory block
	 * first.
	 */
	if (temp_local) {
		struct xfs_da_args	args = {
			.dp		= sc->tempip,
			.geo		= sc->mp->m_dir_geo,
			.whichfork	= XFS_DATA_FORK,
			.trans		= sc->tp,
			.total		= 1,
			.owner		= sc->ip->i_ino,
		};

		error = xfs_dir2_sf_to_block(&args);
		if (error)
			return error;

		/* Finish the deferred work to get a clean transaction. */
		error = xfs_defer_finish(&sc->tp);
		if (error)
			return error;
	}

	if (!ip_local)
		return 0;

	/*
	 * The file being repaired has a shortform data fork.  Discard the
	 * inline data and present an empty extent list to the swap code.
	 */
	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
	xfs_idestroy_fork(ifp);
	ifp->if_format = XFS_DINODE_FMT_EXTENTS;
	ifp->if_u1.if_root = NULL;
	ifp->if_height = 0;
	ifp->if_nextents = 0;
	ifp->if_bytes = 0;

	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE | XFS_ILOG_DDATA);
	return 0;
}

/*
 * Replace the inode number of the directory entry @name in directory @dp
 * with @inum, picking the replace routine that matches the directory's
 * current format (shortform, block, leaf, or node).  @inum is validated
 * first.  Returns 0 or a negative errno.
 */
static int
xrep_dir_replace(
	struct xrep_dir		*rd,
	struct xfs_inode	*dp,
	const struct xfs_name	*name,
	xfs_ino_t		inum,
	xfs_extlen_t		total)
{
	struct xfs_da_args	*args = &rd->args;
	struct xfs_mount	*mp = rd->sc->mp;
	bool			block_fmt;
	bool			leaf_fmt;
	int			error;

	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));

	error = xfs_dir_ino_validate(mp, inum);
	if (error)
		return error;

	/* Describe the entry to be changed. */
	xrep_dir_init_args(rd);
	args->dp = dp;
	args->inumber = inum;
	args->total = total;
	args->name = name->name;
	args->namelen = name->len;
	args->filetype = name->type;
	args->hashval = xfs_dir2_hashname(mp, name);

	/* Dispatch on the directory format. */
	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
		return xfs_dir2_sf_replace(args);

	error = xfs_dir2_isblock(args, &block_fmt);
	if (error)
		return error;
	if (block_fmt)
		return xfs_dir2_block_replace(args);

	error = xfs_dir2_isleaf(args, &leaf_fmt);
	if (error)
		return error;
	if (leaf_fmt)
		return xfs_dir2_leaf_replace(args);

	return xfs_dir2_node_replace(args);
}

/* Swap the temporary directory's data fork with the one being repaired. */
STATIC int
xrep_dir_swap(
	struct xrep_dir		*rd)
{
	struct xfs_scrub	*sc = rd->sc;
	bool			ip_local, temp_local;
	int			error = 0;

	/*
	 * Take the IOLOCK on the temporary file so that we can run dir
	 * operations with the same locks held as we would for a normal file.
	 */
	while (!xrep_tempfile_iolock_nowait(rd->sc)) {
		if (xchk_should_terminate(rd->sc, &error))
			return error;
		delay(1);
	}

	error = xrep_tempswap_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
	if (error)
		return error;

	/*
	 * Reset the temporary directory's '.' entry to point to the directory
	 * we're repairing.  Note: shortform directories lack the dot entry.
	 *
	 * It's possible that this replacement could also expand a sf tempdir
	 * into block format.
	 */
	if (0) { // sc->tempip->i_df.if_format != XFS_DINODE_FMT_LOCAL) {
		error = xrep_dir_replace(rd, sc->tempip, &xfs_name_dot,
				sc->ip->i_ino, rd->tx.req.resblks);
		if (error)
			return error;
	}

	/*
	 * Reset the temporary directory's '..' entry to point to the parent
	 * that we found.  The temporary directory was created with the root
	 * directory as the parent, so we can skip this if repairing a
	 * subdirectory of the root.
	 *
	 * It's also possible that this replacement could also expand a sf
	 * tempdir into block format.
	 */
	if (rd->parent_ino != sc->mp->m_rootip->i_ino) {
		error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
				rd->parent_ino, rd->tx.req.resblks);
		if (error)
			return error;
	}

	/*
	 * Changing the dot and dotdot entries could have changed the shape of
	 * the directory, so we recompute these.
	 */
	ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
	temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;

	/*
	 * If the both files have a local format data fork and the rebuilt
	 * directory data would fit in the repaired file's data fork, copy
	 * the contents from the tempfile and declare ourselves done.
	 */
	if (ip_local && temp_local &&
	    sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
		set_nlink(VFS_I(sc->ip), rd->new_nlink);
		xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
		return 0;
	}

	/* Clean the transaction before we start working on the extent swap. */
	error = xrep_tempfile_roll_trans(rd->sc);
	if (error)
		return error;

	/* Otherwise, make sure both data forks are in block-mapping mode. */
	error = xrep_dir_swap_prep(sc, temp_local, ip_local);
	if (error)
		return error;

	/*
	 * Set nlink of the directory under repair to the number of
	 * subdirectories that will be in the new directory data.  Do this in
	 * the same transaction sequence that (atomically) commits the new
	 * data.
	 */
	set_nlink(VFS_I(sc->ip), rd->new_nlink);

	return xrep_tempswap_contents(sc, &rd->tx);
}

/*
 * Swap the new directory contents (which we created in the tempfile) into the
 * directory being repaired.
 */
STATIC int
xrep_dir_rebuild_tree(
	struct xrep_dir		*rd)
{
	struct xfs_scrub	*sc = rd->sc;
	int			error;

	trace_xrep_dir_rebuild_tree(sc->ip, rd->parent_ino);

	/*
	 * Commit the repair transaction so that we can use the atomic extent
	 * swap helper functions to compute the correct block reservations and
	 * re-lock the inodes.
	 *
	 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
	 * modifications, but there's nothing to prevent userspace from reading
	 * the directory until we're ready for the swap operation.  Reads will
	 * return -EIO without shutting down the fs, so we're ok with that.
	 */
	error = xrep_trans_commit(sc);
	if (error)
		return error;

	xchk_iunlock(sc, XFS_ILOCK_EXCL);

	/*
	 * Swap the tempdir's data fork with the file being repaired.  This
	 * recreates the transaction and re-takes the ILOCK in the scrub
	 * context.
	 */
	error = xrep_dir_swap(rd);
	if (error)
		return error;

	/*
	 * Release the old directory blocks and reset the data fork of the temp
	 * directory to an empty shortform directory because inactivation does
	 * nothing for directories.
	 */
	return xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
}

/*
 * Ask the directory for its dotdot entry and have the candidate confirmed.
 * Returns the (possibly adjusted) parent inode number, or NULLFSINO if there
 * is no candidate or if confirmation fails.
 */
static inline xfs_ino_t
xrep_dir_lookup_parent(
	struct xrep_dir		*rd)
{
	struct xfs_scrub	*sc = rd->sc;
	xfs_ino_t		candidate = xrep_dotdot_lookup(sc);

	if (candidate != NULLFSINO && xrep_parent_confirm(sc, &candidate))
		return NULLFSINO;

	return candidate;
}

/*
 * Ask the dentry cache for the parent and have the candidate confirmed.
 * Returns the (possibly adjusted) parent inode number, or NULLFSINO if the
 * dcache has no answer or if confirmation fails.
 */
static inline xfs_ino_t
xrep_dir_dcache_parent(
	struct xrep_dir		*rd)
{
	struct xfs_scrub	*sc = rd->sc;
	xfs_ino_t		candidate = xrep_parent_from_dcache(sc);

	if (candidate != NULLFSINO && xrep_parent_confirm(sc, &candidate))
		return NULLFSINO;

	return candidate;
}

/* Try to find the parent of the directory being repaired. */
STATIC int
xrep_dir_find_parent(
	struct xrep_dir		*rd)
{
	int			error;

	rd->parent_ino = xrep_parent_self_reference(rd->sc);
	if (rd->parent_ino != NULLFSINO)
		return 0;

	rd->parent_ino = xrep_dir_dcache_parent(rd);
	if (rd->parent_ino != NULLFSINO)
		return 0;

	rd->parent_ino = xrep_dir_lookup_parent(rd);
	if (rd->parent_ino != NULLFSINO)
		return 0;

	/*
	 * A full filesystem scan is the last resort.  On a busy filesystem,
	 * the scan can fail with -EBUSY if we cannot grab IOLOCKs.  That means
	 * that we don't know what who the parent is, so we should return to
	 * userspace.
	 */
	error = xrep_parent_scan(rd->sc, &rd->parent_ino);
	if (error)
		return error;

	if (rd->parent_ino != NULLFSINO)
		return 0;

	/* NOTE: A future patch will deal with moving orphans. */
	return -EFSCORRUPTED;
}

/*
 * Repair the directory metadata.
 *
 * The repair context in sc->buf (allocated by xrep_setup_directory) is
 * initialized, staging structures for salvaged dirents are created, the
 * parent of the directory is determined, the old directory data is salvaged
 * into the temporary directory, and finally the rebuilt contents are swapped
 * into the directory being repaired.
 *
 * Returns 0 on success, -EOPNOTSUPP if the filesystem has no rmapbt, or a
 * negative errno from any of the steps.
 *
 * XXX: Directory entry buffers can be multiple fsblocks in size.  The buffer
 * cache in XFS can't handle aliased multiblock buffers, so this might
 * misbehave if the directory blocks are crosslinked with other filesystem
 * metadata.
 *
 * XXX: Is it necessary to check the dcache for this directory to make sure
 * that we always recreate every cached entry?
 */
int
xrep_directory(
	struct xfs_scrub	*sc)
{
	struct xrep_dir		*rd = sc->buf;
	int			error;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(sc->mp))
		return -EOPNOTSUPP;

	/* No parent yet; the link count is bumped per subdirectory found. */
	rd->sc = sc;
	rd->parent_ino = NULLFSINO;
	rd->new_nlink = 2;

	/* Set up some staging memory for salvaging dirents. */
	error = xfarray_create(sc->mp, "directory entries", 0,
			sizeof(struct xrep_dirent), &rd->dir_entries);
	if (error)
		goto out_rd;

	error = xfblob_create(sc->mp, "dirent names", &rd->dir_names);
	if (error)
		goto out_arr;

	/*
	 * Drop the ILOCK and MMAPLOCK on this directory; we don't need to
	 * hold these to maintain control over the directory we're fixing.
	 * This should leave us holding only IOLOCK_EXCL.  If we have to scan
	 * the entire filesystem to find or confirm the parent of this
	 * directory, we may have to cycle IOLOCK_EXCL.
	 */
	if (sc->ilock_flags & XFS_ILOCK_EXCL)
		xchk_iunlock(sc, XFS_ILOCK_EXCL);
	xchk_iunlock(sc, XFS_MMAPLOCK_EXCL);

	/* Figure out who is going to be the parent of this directory. */
	error = xrep_dir_find_parent(rd);
	if (error)
		goto out_names;

	/* Re-grab the ILOCK so that we can salvage directory entries. */
	xchk_ilock(sc, XFS_ILOCK_EXCL);

	/*
	 * Collect directory entries by parsing raw leaf blocks to salvage
	 * whatever we can.  When we're done, free the staging memory before
	 * swapping the directories to reduce memory usage.
	 */
	error = xrep_dir_find_entries(rd);
	if (error)
		goto out_names;

	/*
	 * Clear the pointers after destroying the staging structures so that
	 * the cleanup labels below do not destroy them a second time.
	 */
	xfblob_destroy(rd->dir_names);
	xfarray_destroy(rd->dir_entries);
	rd->dir_names = NULL;
	rd->dir_entries = NULL;

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto out_rd;

	/* Swap in the good contents. */
	error = xrep_dir_rebuild_tree(rd);

	/* Success falls through the cleanup labels, which are NULL-safe. */
out_names:
	if (rd->dir_names)
		xfblob_destroy(rd->dir_names);
out_arr:
	if (rd->dir_entries)
		xfarray_destroy(rd->dir_entries);
out_rd:
	return error;
}