// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_bmap.h"
#include "xfs_quota.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_iwalk.h"
#include "xfs_swapext.h"
#include "xfs_bmap_util.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/array.h"
#include "scrub/blob.h"
#include "scrub/parent.h"

/*
 * Directory Repair
 * ================
 *
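 * We repair directories by reading the directory data blocks looking for
 * entries, reinserting all the entries that we find into a temporary
 * directory, and then swapping the temporary directory's data fork with the
 * data fork of the directory being repaired.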
 * Unfortunately, there's not yet a secondary copy of directory attribute data,
 * which means that if we blow up midway through there's little we can do.
 */

/*
 * Directory entry to be restored in the new directory.  One of these records
 * is appended to xrep_dir.dir_entries for every salvaged dirent; the name
 * bytes themselves live in xrep_dir.dir_names and are found via @name_cookie.
 */
struct xrep_dir_key {
	/* Cookie for retrieval of the dirent name. */
	xblob_cookie		name_cookie;

	/* Target inode number. */
	xfs_ino_t		ino;

	/*
	 * Hash of the dirent name (xfs_da_hashname of the truncated name).
	 * Records are sorted on this field before being reinserted.
	 */
	unsigned int		hash;

	/* Length of the dirent name, after truncation at the first bad byte. */
	uint8_t			namelen;

	/* File type of the dirent, derived from the target inode's mode. */
	uint8_t			ftype;
} __packed;

/* In-memory state of one directory repair operation. */
struct xrep_dir {
	/* Scrub context that this repair belongs to. */
	struct xfs_scrub	*sc;

	/* Fixed-size array of xrep_dir_key structures. */
	struct xfbma		*dir_entries;

	/* Blobs containing directory entry names. */
	struct xblob		*dir_names;

	/*
	 * Potential parent of the directory we're reconstructing.  This can
	 * be NULLFSINO if we haven't found any parents; 0 if we've found too
	 * many parents during salvaging; or a regular inode number if we've
	 * found a good candidate.
	 */
	xfs_ino_t		parent_ino;

	/*
	 * nlink value of the corrected directory.  xrep_dir() starts this at
	 * 2 and it is bumped once for every subdirectory entry reinserted.
	 */
	xfs_nlink_t		new_nlink;
};

/*
 * Filter for salvage candidates.  Returns true if the (name, ino) pair is
 * worth recording and false if it should be ignored.  Entries are ignored if
 * they point back at the directory under repair, if the inode number fails
 * xfs_verify_dir_ino(), if the name length is out of range, or if the name is
 * the single-character dot entry.
 */
STATIC int
xrep_dir_want_salvage(
	struct xrep_dir		*rd,
	const char		*name,
	int			namelen,
	xfs_ino_t		ino)
{
	struct xfs_scrub	*sc = rd->sc;

	/* Reject self-references and inode numbers that fail verification. */
	if (ino == sc->ip->i_ino || !xfs_verify_dir_ino(sc->mp, ino))
		return false;

	/* Reject names whose length is out of bounds. */
	if (namelen <= 0 || namelen > MAXNAMELEN)
		return false;

	/* Everything except the dot entry is acceptable at this point. */
	return !(namelen == 1 && name[0] == '.');
}

/*
 * Stash one salvaged directory entry so that it can be added to the rebuilt
 * directory later.
 *
 * The name is truncated at the first NUL or '/' byte.  A '..' entry is never
 * stashed; it only updates rd->parent_ino: if no candidate has been recorded
 * yet (NULLFSINO) the target becomes the candidate, otherwise the candidate is
 * zeroed to force a directory tree walk later.  Entries whose truncated name
 * is empty or is '.' are dropped, as are entries whose target inode cannot be
 * loaded to compute the file type.
 *
 * Returns 0 if the entry was stashed or skipped; otherwise the error set by
 * xchk_should_terminate() or returned by xblob_put() / xfbma_append().
 */
STATIC int
xrep_dir_salvage_entry(
	struct xrep_dir		*rd,
	unsigned char		*name,
	unsigned int		namelen,
	xfs_ino_t		ino)
{
	struct xrep_dir_key	key = {
		.ino		= ino,
	};
	struct xfs_inode	*ip;
	unsigned int		i;
	int			error = 0;

	if (xchk_should_terminate(rd->sc, &error))
		return error;

	/* Truncate the name to the first illegal character. */
	for (i = 0; i < namelen; i++) {
		if (name[i] == 0 || name[i] == '/')
			break;
	}
	key.namelen = i;
	key.hash = xfs_da_hashname(name, key.namelen);

	trace_xrep_dir_salvage_entry(rd->sc->ip, name, key.namelen, ino);

	/*
	 * Truncation can leave us with no name at all, or with a bare '.'
	 * that the caller's filter did not see because it looked at the
	 * untruncated name.  Neither can be reinserted, so drop them.
	 */
	if (key.namelen == 0)
		return 0;
	if (key.namelen == 1 && name[0] == '.')
		return 0;

	/* If this is a '..' entry, we can save it for later... */
	if (key.namelen == 2 && name[0] == '.' && name[1] == '.') {
		if (rd->parent_ino == NULLFSINO) {
			/* Found a parent, save it for later. */
			rd->parent_ino = ino;
		} else {
			/*
			 * Found more than one parent (or we already gave up),
			 * so force a directory tree walk later.
			 */
			rd->parent_ino = 0;
		}
		return 0;
	}

	/*
	 * Compute the ftype or dump the entry if we can't.  We don't lock the
	 * inode because inodes can't change type while we have a reference.
	 * Failure to load the inode is deliberately not an error; the entry
	 * is simply not salvaged.
	 */
	error = xfs_iget(rd->sc->mp, rd->sc->tp, ino, XFS_IGET_UNTRUSTED, 0,
			&ip);
	if (error)
		return 0;
	key.ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
	xfs_irele(ip);

	/* Remember this for later. */
	error = xblob_put(rd->dir_names, &key.name_cookie, name, key.namelen);
	if (error)
		return error;

	return xfbma_append(rd->dir_entries, &key);
}

/*
 * Decode the target inode of a shortform directory entry and, if the entry
 * passes xrep_dir_want_salvage(), stash it for later reinsertion.  Entries
 * that are filtered out are silently skipped (return 0).
 */
STATIC int
xrep_dir_salvage_sf_entry(
	struct xrep_dir			*rd,
	struct xfs_dir2_sf_hdr		*sfp,
	struct xfs_dir2_sf_entry	*sfep)
{
	xfs_ino_t			target;

	target = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
	if (xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, target))
		return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen,
				target);

	return 0;
}

/*
 * Decode the target inode of a block/data format directory entry and, if the
 * entry passes xrep_dir_want_salvage(), stash it for later reinsertion.
 * Entries that are filtered out are silently skipped (return 0).
 */
STATIC int
xrep_dir_salvage_data_entry(
	struct xrep_dir			*rd,
	struct xfs_dir2_data_entry	*dep)
{
	xfs_ino_t			target = be64_to_cpu(dep->inumber);

	if (xrep_dir_want_salvage(rd, dep->name, dep->namelen, target))
		return xrep_dir_salvage_entry(rd, dep->name, dep->namelen,
				target);

	return 0;
}

/*
 * Try to recover block/data format directory entries from one buffer.
 *
 * Walks the records between the geometry's data entry offset and the smaller
 * of the buffer length and xfs_dir3_data_end_offset().  Records carrying the
 * free tag are skipped; everything else is treated as a live entry and handed
 * to xrep_dir_salvage_data_entry().
 *
 * Returns 0 when the walk finishes (including when it stops early because the
 * block contents make no sense), the error set by xchk_should_terminate(), or
 * the error returned while stashing an entry.
 */
STATIC int
xrep_dir_recover_data(
	struct xrep_dir		*rd,
	struct xfs_buf		*bp)
{
	struct xfs_da_geometry	*geo = rd->sc->mp->m_dir_geo;
	unsigned int		offset;
	unsigned int		end;
	int			error = 0;

	/*
	 * Loop over the data portion of the block.
	 * Each object is a real entry (dep) or an unused one (dup).
	 */
	offset = geo->data_entry_offset;
	end = min_t(unsigned int, BBTOB(bp->b_length),
			xfs_dir3_data_end_offset(geo, bp->b_addr));

	while (offset < end) {
		struct xfs_dir2_data_unused	*dup = bp->b_addr + offset;
		struct xfs_dir2_data_entry	*dep = bp->b_addr + offset;

		/* Propagate the termination request instead of dropping it. */
		if (xchk_should_terminate(rd->sc, &error))
			return error;

		/* Skip unused entries. */
		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
			unsigned int	free_len = be16_to_cpu(dup->length);

			/*
			 * A zero-length free record would never advance the
			 * cursor; this block is too damaged to keep walking.
			 */
			if (free_len == 0)
				break;

			offset += free_len;
			continue;
		}

		/* Don't walk off the end of the block. */
		offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
		if (offset > end)
			break;

		/* Ok, let's save this entry. */
		error = xrep_dir_salvage_data_entry(rd, dep);
		if (error)
			return error;
	}

	return 0;
}

/*
 * Try to recover shortform directory entries from the inline data fork.
 *
 * The parent recorded in the shortform header becomes the initial parent
 * candidate.  Entries are then walked until the next entry would start beyond
 * the end of the inline data.
 *
 * Returns 0 when the walk finishes, the error set by xchk_should_terminate(),
 * or the error returned while stashing an entry.
 */
STATIC int
xrep_dir_recover_sf(
	struct xrep_dir			*rd)
{
	struct xfs_dir2_sf_hdr		*sfp;
	struct xfs_dir2_sf_entry	*sfep;
	struct xfs_dir2_sf_entry	*next;
	struct xfs_ifork		*ifp;
	unsigned char			*end;
	int				error = 0;

	ifp = XFS_IFORK_PTR(rd->sc->ip, XFS_DATA_FORK);
	sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data;
	end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes;

	rd->parent_ino = xfs_dir2_sf_get_parent_ino(sfp);

	sfep = xfs_dir2_sf_firstentry(sfp);
	while ((unsigned char *)sfep < end) {
		/* Propagate the termination request instead of dropping it. */
		if (xchk_should_terminate(rd->sc, &error))
			return error;

		/* Stop if this entry claims to extend past the inline data. */
		next = xfs_dir2_sf_nextentry(rd->sc->mp, sfp, sfep);
		if ((unsigned char *)next > end)
			break;

		/* Ok, let's save this entry. */
		error = xrep_dir_salvage_sf_entry(rd, sfp, sfep);
		if (error)
			return error;

		sfep = next;
	}

	return 0;
}

/*
 * Guess the on-disk format of this directory from the data fork mappings and
 * the directory size, so that salvaging can be more aggressive when we are
 * reasonably sure.  @magic_guess is set to (big-endian) XFS_DIR3_BLOCK_MAGIC
 * for a "block format" directory, XFS_DIR3_DATA_MAGIC for a "data format"
 * directory, or 0 if neither heuristic matched or a mapping lookup failed.
 */
STATIC void
xrep_dir_guess_format(
	struct xrep_dir		*rd,
	__be32			*magic_guess)
{
	struct xfs_inode	*dp = rd->sc->ip;
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_da_geometry	*geo = rd->sc->mp->m_dir_geo;
	xfs_fileoff_t		last;
	int			error;

	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));

	/* Assume that we can't tell until proven otherwise. */
	*magic_guess = 0;

	/*
	 * One directory block mapped and a directory size of exactly one
	 * directory block means single block format.
	 */
	error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
	if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
	    dp->i_d.di_size == geo->blksize) {
		*magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
		return;
	}

	/*
	 * If the mappings below the leaf offset end exactly at the directory
	 * size, and that is more than one directory block, this is a data
	 * format directory.
	 */
	last = geo->leafblk;
	error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
	if (error)
		return;

	if (XFS_FSB_TO_B(mp, last) > geo->blksize &&
	    XFS_FSB_TO_B(mp, last) == dp->i_d.di_size)
		*magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
}

/*
 * Recover directory entries from the directory block at @dabno.
 *
 * If @magic_guess is nonzero, the block header's magic is temporarily
 * overwritten with it and the block is always scanned.  Otherwise the block is
 * scanned only if its magic is one of the block/data format magics and
 * xrep_buf_verify_struct() accepts it against the matching buffer ops.  The
 * original magic is restored before the buffer is released.  A hole at @dabno
 * is not an error.
 */
STATIC int
xrep_dir_recover_dirblock(
	struct xrep_dir		*rd,
	__be32			magic_guess,
	xfs_dablk_t		dabno)
{
	struct xfs_scrub	*sc = rd->sc;
	struct xfs_dir2_data_hdr *hdr;
	struct xfs_buf		*bp;
	__be32			saved_magic;
	bool			salvage;
	int			error;

	/*
	 * Read the buffer without a verifier; we invalidate these buffers in
	 * a later step, so there's no point in setting a buffer type or ops.
	 */
	error = xfs_da_read_buf(sc->tp, sc->ip, dabno, XFS_DABUF_MAP_HOLE_OK,
			&bp, XFS_DATA_FORK, NULL);
	if (error || !bp)
		return error;

	hdr = bp->b_addr;
	saved_magic = hdr->magic;

	trace_xrep_dir_recover_dirblock(sc->ip, dabno,
			be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));

	if (magic_guess) {
		/*
		 * We're sure of the block's format, so salvage using the
		 * magic number that we were given.
		 */
		hdr->magic = magic_guess;
		salvage = true;
	} else {
		/*
		 * We couldn't guess the directory format, so only salvage
		 * blocks that have a known magic number and pass verifiers.
		 */
		switch (hdr->magic) {
		case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
		case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
			salvage = xrep_buf_verify_struct(bp,
					&xfs_dir3_block_buf_ops);
			break;
		case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
		case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
			salvage = xrep_buf_verify_struct(bp,
					&xfs_dir3_data_buf_ops);
			break;
		default:
			salvage = false;
			break;
		}
	}

	if (salvage)
		error = xrep_dir_recover_data(rd, bp);

	/* Undo any magic number substitution before letting go. */
	hdr->magic = saved_magic;
	xfs_trans_brelse(sc->tp, bp);
	return error;
}

/*
 * Extract as many directory entries as we can.
 *
 * Local format directories are handled by the shortform walker.  For all
 * other formats, every directory-block-aligned offset below the leaf offset
 * that is covered by a data fork extent is read and scanned for entries.
 *
 * Returns 0 on success, or the first error encountered; the walk stops as soon
 * as any block reports an error.
 */
STATIC int
xrep_dir_recover(
	struct xrep_dir		*rd)
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;
	struct xfs_scrub	*sc = rd->sc;
	struct xfs_ifork	*ifp;
	struct xfs_da_geometry	*geo = sc->mp->m_dir_geo;
	xfs_dablk_t		dabno;
	__be32			magic_guess;
	int			error = 0;

	if (sc->ip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
		return xrep_dir_recover_sf(rd);

	xrep_dir_guess_format(rd, &magic_guess);

	/* Iterate each directory data block in the data fork. */
	ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
	for_each_xfs_iext(ifp, &icur, &got) {
		/* Leaf blocks come after all data blocks, so cut off there. */
		xfs_trim_extent(&got, 0, geo->leafblk);
		if (got.br_blockcount == 0)
			continue;

		for (dabno = round_up(got.br_startoff, geo->fsbcount);
		     dabno < got.br_startoff + got.br_blockcount;
		     dabno += geo->fsbcount) {
			if (xchk_should_terminate(sc, &error))
				return error;

			/*
			 * Bail out of the whole walk on error; merely leaving
			 * the inner loop would let the next extent overwrite
			 * the error code.
			 */
			error = xrep_dir_recover_dirblock(rd, magic_guess,
					dabno);
			if (error)
				return error;
		}
	}

	return 0;
}

/* Invalidate a directory's blocks and unmap them. */
STATIC int
xrep_dir_reset_nonlocal(
	struct xfs_scrub	*sc,
	struct xfs_inode	*dp)
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
	struct xfs_buf		*bp;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_da_geometry	*geo = mp->m_dir_geo;
	xfs_fileoff_t		off;

	/*
	 * Invalidate each directory block.  All directory blocks are of
	 * fsbcount length and alignment, so we only need to walk those same
	 * offsets.
	 *
	 * We use TRYLOCK here (recall that we hold the ILOCK of the directory
	 * inode) so that we skip any buffer that's locked on the assumption
	 * that we don't own that block.
	 */
	for_each_xfs_iext(ifp, &icur, &got) {
		for (off = round_up(got.br_startoff, geo->fsbcount);
		     off < got.br_startoff + got.br_blockcount;
		     off += geo->fsbcount) {
			xfs_fsblock_t	fsbno;

			fsbno = (off - got.br_startoff) + got.br_startblock;
			bp = xfs_buf_incore(mp->m_ddev_targp,
					XFS_FSB_TO_DADDR(mp, fsbno),
					XFS_FSB_TO_BB(mp, geo->fsbcount),
					XBF_TRYLOCK | XBF_SCAN_STALE);
			if (bp) {
				xfs_buf_stale(bp);
				xfs_buf_relse(bp);
			}
		}
	}

	/* Now free all the blocks. */
	return xfs_bunmapi_range(&sc->tp, dp, XFS_DATA_FORK, 0,
			XFS_MAX_FILEOFF, XFS_BMAPI_NODISCARD);
}

/*
 * Free all the directory blocks and reset the data fork to an empty shortform
 * directory whose parent is @parent_ino.  The caller must join the inode to
 * the transaction.  This function returns with the inode joined to a clean
 * scrub transaction.
 *
 * Returns 0 on success or a negative errno from unmapping the old blocks,
 * creating the shortform directory, or rolling the transaction.
 */
STATIC int
xrep_dir_reset_fork(
	struct xfs_scrub	*sc,
	struct xfs_inode	*dp,
	xfs_ino_t		parent_ino)
{
	struct xfs_ifork	*ifp;
	/*
	 * Use a zeroed on-stack da args structure rather than overlaying one
	 * on sc->buf: that scratch buffer is also used to hold dirent names
	 * while rebuilding, so any field not set below would contain stale
	 * name bytes.
	 */
	struct xfs_da_args	args = {
		.geo		= sc->mp->m_dir_geo,
		.dp		= dp,
	};
	int			error;

	ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);

	/* Unmap all the directory buffers. */
	if (xfs_ifork_has_extents(dp, XFS_DATA_FORK)) {
		error = xrep_dir_reset_nonlocal(sc, dp);
		if (error)
			return error;
	}

	trace_xrep_dir_reset_fork(dp, parent_ino);

	/* Reset the data fork to an empty data fork. */
	xfs_ifork_reset(ifp);
	ifp->if_flags = XFS_IFINLINE;
	ifp->if_bytes = 0;
	dp->i_d.di_size = 0;

	/*
	 * Reinitialize the short form directory.  The transaction pointer is
	 * sampled here because unmapping the old blocks may have replaced
	 * sc->tp.
	 */
	args.trans = sc->tp;
	error = xfs_dir2_sf_create(&args, parent_ino);
	if (error)
		return error;

	return xfs_trans_roll_inode(&sc->tp, dp);
}

/*
 * Sort comparator for xrep_dir_key records: orders by ascending name hash.
 * Returns -1, 0, or 1 if @a sorts before, equal to, or after @b.
 */
static int
xrep_dir_key_cmp(
	const void			*a,
	const void			*b)
{
	const struct xrep_dir_key	*ka = a;
	const struct xrep_dir_key	*kb = b;

	if (ka->hash == kb->hash)
		return 0;

	return ka->hash > kb->hash ? 1 : -1;
}

/*
 * Find all the directory entries for this inode by scraping them out of the
 * directory leaf blocks by hand.  The caller must clean up the lists if
 * anything goes wrong.
 */
STATIC int
xrep_dir_find_entries(
	struct xrep_dir		*rd)
{
	struct xfs_inode	*ip = rd->sc->ip;
	struct xfs_ifork	*ifp;
	int			error;

	error = xrep_ino_dqattach(rd->sc);
	if (error)
		return error;

	/* Extent map should be loaded. */
	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
	if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_LOCAL &&
	    !(ifp->if_flags & XFS_IFEXTENTS)) {
		error = xfs_iread_extents(rd->sc->tp, ip, XFS_DATA_FORK);
		if (error)
			return error;
	}

	/* Read every directory entry and record them in memory. */
	return xrep_dir_recover(rd);
}

/* Insert one dir entry. */
STATIC int
xrep_dir_insert_rec(
	const void			*item,
	void				*priv)
{
	struct xfs_name			name;
	const struct xrep_dir_key	*key = item;
	struct xrep_dir			*rd = priv;
	struct xfs_trans		*tp;
	char				*namebuf = rd->sc->buf;
	struct xfs_mount		*mp = rd->sc->mp;
	uint				resblks;
	int				error;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/* The entry name is stored in the in-core buffer. */
	name.name = namebuf;

	error = xblob_get(rd->dir_names, key->name_cookie, namebuf,
			key->namelen);
	if (error)
		return error;

	error = xblob_free(rd->dir_names, key->name_cookie);
	if (error)
		return error;

	trace_xrep_dir_insert_rec(rd->sc->tempip, namebuf, key->namelen,
			key->ino, key->ftype);

	error = xfs_qm_dqattach(rd->sc->tempip);
	if (error)
		return error;

	resblks = XFS_LINK_SPACE_RES(mp, key->namelen);
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
	if (error == -ENOSPC) {
		resblks = 0;
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
	}
	if (error)
		return error;

	xfs_ilock(rd->sc->tempip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, rd->sc->tempip, XFS_ILOCK_EXCL);

	name.len = key->namelen;
	name.type = key->ftype;
	error = xfs_dir_createname(tp, rd->sc->tempip, &name, key->ino,
			resblks);
	if (error)
		goto err;

	if (name.type == XFS_DIR3_FT_DIR)
		rd->new_nlink++;
	xfs_trans_log_inode(tp, rd->sc->tempip, XFS_ILOG_CORE);
	return xfs_trans_commit(tp);

err:
	xfs_trans_cancel(tp);
	return error;
}

/*
 * Prepare both inodes' data forks for extent swapping.  If @temp_local is
 * set, promote the temporary directory from shortform to block format; if
 * @ip_local is set, turn the data fork of the directory being repaired into
 * an empty extent list.
 */
STATIC int
xrep_dir_swap_prep(
	struct xfs_scrub	*sc,
	bool			temp_local,
	bool			ip_local)
{
	struct xfs_ifork	*ifp;
	int			error;

	/*
	 * A shortform temporary directory has no blocks to swap, so convert
	 * it to block format so that we can use the atomic extent swap.
	 */
	if (temp_local) {
		struct xfs_da_args	args = {
			.trans		= sc->tp,
			.dp		= sc->tempip,
			.geo		= sc->mp->m_dir_geo,
			.whichfork	= XFS_DATA_FORK,
			.total		= 1,
		};

		error = xfs_dir2_sf_to_block(&args);
		if (!error)
			error = xfs_defer_finish(&sc->tp);
		if (error)
			return error;
	}

	if (!ip_local)
		return 0;

	/*
	 * The directory being repaired is in shortform format, so replace its
	 * data fork with an empty extent list in preparation for the atomic
	 * extent swap.
	 */
	sc->ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
	sc->ip->i_d.di_nextents = 0;

	ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
	xfs_ifork_reset(ifp);
	ifp->if_bytes = 0;
	ifp->if_u1.if_root = NULL;
	ifp->if_height = 0;
	ifp->if_flags |= XFS_IFEXTENTS;

	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE | XFS_ILOG_DDATA);
	return 0;
}

/*
 * Set the owner for this directory block to the directory being repaired.
 *
 * The kind of block is inferred from where @dabno falls relative to the
 * geometry's leaf and free offsets.  The magic number that was found is
 * passed back through @magic.  Returns 0 on success, or -EFSCORRUPTED if the
 * magic number is not one of the V5 magics expected for that region (or if a
 * block format directory does not start with a '.' entry).
 */
STATIC int
xrep_dir_reset_owner(
	struct xfs_scrub		*sc,
	xfs_dablk_t			dabno,
	struct xfs_buf			*bp,
	unsigned int			*magic)
{
	struct xfs_da_geometry		*geo = sc->mp->m_dir_geo;
	struct xfs_dir3_data_hdr	*data3 = bp->b_addr;
	struct xfs_da3_blkinfo		*info3 = bp->b_addr;
	struct xfs_dir3_free_hdr	*free3 = bp->b_addr;
	struct xfs_dir2_data_entry	*dep;

	/* Directory data blocks. */
	if (dabno < geo->leafblk) {
		*magic = be32_to_cpu(data3->hdr.magic);
		if (*magic != XFS_DIR3_BLOCK_MAGIC &&
		    *magic != XFS_DIR3_DATA_MAGIC)
			return -EFSCORRUPTED;

		/*
		 * If this is a block format directory, it's possible that the
		 * block was created as part of converting the temp directory
		 * from short format to block format in order to use the atomic
		 * extent swap.  In that case, the '.' entry will be set to
		 * the temp dir, so find the dot entry and reset it.
		 */
		if (*magic == XFS_DIR3_BLOCK_MAGIC) {
			dep = bp->b_addr + geo->data_entry_offset;
			if (dep->namelen != 1 || dep->name[0] != '.')
				return -EFSCORRUPTED;

			dep->inumber = cpu_to_be64(sc->ip->i_ino);
		}

		/* The owner field is on-disk (big-endian) data. */
		data3->hdr.owner = cpu_to_be64(sc->ip->i_ino);
		return 0;
	}

	/* Directory leaf and da node blocks. */
	if (dabno < geo->freeblk) {
		*magic = be16_to_cpu(info3->hdr.magic);
		switch (*magic) {
		case XFS_DA3_NODE_MAGIC:
		case XFS_DIR3_LEAF1_MAGIC:
		case XFS_DIR3_LEAFN_MAGIC:
			break;
		default:
			return -EFSCORRUPTED;
		}

		info3->owner = cpu_to_be64(sc->ip->i_ino);
		return 0;
	}

	/* Directory free blocks. */
	*magic = be32_to_cpu(free3->hdr.magic);
	if (*magic != XFS_DIR3_FREE_MAGIC)
		return -EFSCORRUPTED;

	free3->hdr.owner = cpu_to_be64(sc->ip->i_ino);
	return 0;
}

/*
 * If the buffer didn't have buffer ops set, we need to set them now that we've
 * dirtied the directory block.
 */
STATIC void
xrep_dir_set_verifier(
	unsigned int		magic,
	struct xfs_buf		*bp)
{
	switch (magic) {
	case XFS_DIR3_BLOCK_MAGIC:
		bp->b_ops = &xfs_dir3_block_buf_ops;
		break;
	case XFS_DIR3_DATA_MAGIC:
		bp->b_ops = &xfs_dir3_data_buf_ops;
		break;
	case XFS_DA3_NODE_MAGIC:
		bp->b_ops = &xfs_da3_node_buf_ops;
		break;
	case XFS_DIR3_LEAF1_MAGIC:
		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
		break;
	case XFS_DIR3_LEAFN_MAGIC:
		bp->b_ops = &xfs_dir3_leafn_buf_ops;
		break;
	case XFS_DIR3_FREE_MAGIC:
		bp->b_ops = &xfs_dir3_free_buf_ops;
		break;
	}

	xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
}

/*
 * Change the owner field of every block in the data fork to match the
 * directory being repaired.
 */
STATIC int
xrep_dir_swap_owner(
	struct xfs_scrub		*sc)
{
	struct xfs_bmbt_irec		map;
	struct xfs_da_geometry		*geo = sc->mp->m_dir_geo;
	struct xfs_buf			*bp;
	xfs_fileoff_t			offset = 0;
	xfs_fileoff_t			end = XFS_MAX_FILEOFF;
	xfs_dablk_t			dabno;
	int				nmap;
	int				error;

	for (offset = 0;
	     offset < end;
	     offset = map.br_startoff + map.br_blockcount) {
		nmap = 1;
		error = xfs_bmapi_read(sc->tempip, offset, end - offset,
				&map, &nmap, 0);
		if (error)
			return error;
		if (nmap != 1)
			return -EFSCORRUPTED;
		if (!xfs_bmap_is_real_extent(&map))
			continue;


		for (dabno = round_up(map.br_startoff, geo->fsbcount);
		     dabno < map.br_startoff + map.br_blockcount;
		     dabno += geo->fsbcount) {
			unsigned int	magic;

			error = xfs_da_read_buf(sc->tp, sc->tempip,
					dabno, 0, &bp, XFS_DATA_FORK, NULL);
			if (error)
				return error;
			if (!bp)
				return -EFSCORRUPTED;

			error = xrep_dir_reset_owner(sc, dabno, bp, &magic);
			if (error) {
				xfs_trans_brelse(sc->tp, bp);
				return error;
			}

			if (bp->b_ops == NULL)
				xrep_dir_set_verifier(magic, bp);

			xfs_trans_ordered_buf(sc->tp, bp);
			xfs_trans_brelse(sc->tp, bp);
		}
	}

	return 0;
}

/*
 * If both files' directory structure are in short format, we can copy
 * the short format data from the tempfile to the repaired file if it'll
 * fit.
 *
 * @newsize is the number of bytes of shortform directory data to copy from
 * the temporary directory; the caller is responsible for checking that it
 * fits in the data fork of the directory being repaired.
 */
STATIC void
xrep_dir_swap_local(
	struct xfs_scrub	*sc,
	int			newsize)
{
	struct xfs_ifork	*ifp1, *ifp2;

	ifp1 = XFS_IFORK_PTR(sc->tempip, XFS_DATA_FORK);
	ifp2 = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);

	/*
	 * Resize the repaired directory's inline data to match the tempfile.
	 * The realloc helper takes a delta (new size minus current size), so
	 * this must be the tempfile's size minus our current size.
	 */
	xfs_idata_realloc(sc->ip, ifp1->if_bytes - ifp2->if_bytes,
			XFS_DATA_FORK);

	memcpy(ifp2->if_u1.if_data, ifp1->if_u1.if_data, newsize);

	/* The directory size has to follow the new inline contents. */
	sc->ip->i_d.di_size = newsize;
	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}

/*
 * Name of the '.' (self) directory entry: a one byte name with directory file
 * type.  xrep_dir_swap() passes this to xfs_dir_replace() to repoint the
 * temporary directory's dot entry.
 */
struct xfs_name xfs_name_dot = { (unsigned char *)".", 1, XFS_DIR3_FT_DIR };

/* Swap the temporary directory's data fork with the one being repaired. */
STATIC int
xrep_dir_swap(
	struct xrep_dir		*rd)
{
	struct xfs_scrub	*sc = rd->sc;
	unsigned int		resblks;
	bool			ip_local, temp_local;
	int			error;

	resblks = xfs_swap_range_calc_resblks(sc->tempip, sc->ip,
			XFS_DATA_FORK);
	error = xchk_trans_alloc(sc, max(1U, resblks));
	if (error)
		return error;

	/*
	 * Lock and join the inodes to the tansaction so that transaction commit
	 * or cancel will unlock the inodes from this point onwards.
	 */
	xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip,
			XFS_ILOCK_EXCL);
	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
	sc->ilock_flags |= XFS_ILOCK_EXCL;
	xfs_trans_ijoin(sc->tp, sc->ip, 0);
	xfs_trans_ijoin(sc->tp, sc->tempip, 0);

	/*
	 * Reset the temporary directory's '.' entry to point to the directory
	 * we're repairing.  Note: shortform directories lack the dot entry.
	 *
	 * It's possible that this replacement could also expand a sf tempdir
	 * into block format.
	 */
	if (XFS_IFORK_FORMAT(sc->tempip, XFS_DATA_FORK) !=
			XFS_DINODE_FMT_LOCAL) {
		error = xfs_dir_replace(sc->tp, sc->tempip, &xfs_name_dot,
				sc->ip->i_ino, resblks);
		if (error)
			return error;
	}

	/*
	 * Reset the temporary directory's '..' entry to point to the parent
	 * that we found.  The temporary directory was created with the root
	 * directory as the parent, so we can skip this if repairing a
	 * subdirectory of the root.
	 *
	 * It's also possible that this replacement could also expand a sf
	 * tempdir into block format.
	 */
	if (rd->parent_ino != sc->mp->m_rootip->i_ino) {
		error = xfs_dir_replace(sc->tp, rd->sc->tempip,
				&xfs_name_dotdot, rd->parent_ino, resblks);
		if (error)
			return error;
	}

	/* XXX: do we need to roll the transaction here? */

	/*
	 * Changing the dot and dotdot entries could have changed the shape of
	 * the directory, so we recompute these.
	 */
	ip_local = XFS_IFORK_FORMAT(sc->ip, XFS_DATA_FORK) ==
				XFS_DINODE_FMT_LOCAL;
	temp_local = XFS_IFORK_FORMAT(sc->tempip, XFS_DATA_FORK) ==
				XFS_DINODE_FMT_LOCAL;

	/*
	 * If the both files have a local format data fork and the rebuilt
	 * directory data would fit in the repaired file's data fork, copy
	 * the contents from the tempfile and declare ourselves done.
	 */
	if (ip_local && temp_local) {
		if (sc->tempip->i_d.di_size <= XFS_IFORK_DSIZE(sc->ip)) {
			xrep_dir_swap_local(sc, sc->tempip->i_d.di_size);
			set_nlink(VFS_I(sc->ip), rd->new_nlink);
			return 0;
		}
	}

	/* Otherwise, make sure both data forks are in block-mapping mode. */
	error = xrep_dir_swap_prep(sc, temp_local, ip_local);
	if (error)
		return error;

	/* Rewrite the owner field of all attr blocks in the temporary file. */
	error = xrep_dir_swap_owner(sc);
	if (error)
		return error;

	/*
	 * Set nlink of the directory under repair to the number of
	 * subdirectories that will be in the new directory data.  Do this in
	 * the same transaction sequence that (atomically) commits the new
	 * data.
	 */
	set_nlink(VFS_I(sc->ip), rd->new_nlink);

	return xfs_swapext_atomic(&sc->tp, sc->tempip, sc->ip, XFS_DATA_FORK,
			0, 0, NULLFILEOFF,
			XFS_SWAPEXT_SET_SIZES | XFS_SWAPEXT_TO_SHORTFORM2);
}

/*
 * Insert all the attributes that we collected.
 *
 * Commit the repair transaction and drop the ilock because the attribute
 * setting code needs to be able to allocate special transactions and take the
 * ilock on its own.  Some day we'll have deferred attribute setting, at which
 * point we'll be able to use that to replace the attributes atomically and
 * safely.
 */
STATIC int
xrep_dir_rebuild_tree(
	struct xrep_dir		*rd)
{
	int			error;

	/*
	 * Commit the existing transaction and drop the ILOCK so that we can
	 * use a series of small transactions to rebuild the directory.
	 */
	error = xfs_trans_commit(rd->sc->tp);
	rd->sc->tp = NULL;
	if (error)
		return error;

	/*
	 * Drop the ILOCK so that we don't pin the tail of the log.  We still
	 * hold the IOLOCK (aka i_rwsem) which will prevent directory access.
	 */
	xfs_iunlock(rd->sc->ip, XFS_ILOCK_EXCL);
	rd->sc->ilock_flags &= ~XFS_ILOCK_EXCL;

	/*
	 * Sort the entries hash to minimize dabtree splits when we rebuild the
	 * directory tree information.
	 */
	error = xfbma_sort(rd->dir_entries, xrep_dir_key_cmp);
	if (error)
		return error;

	/* Re-add every entry to the temporary directory. */
	error = xfbma_iter_del(rd->dir_entries, xrep_dir_insert_rec, rd);
	if (error)
		return error;

	/* Swap the tempdir's data fork with the file being repaired. */
	error = xrep_dir_swap(rd);
	if (error)
		return error;

	/*
	 * Now reset the data fork of the temp directory to an empty shortform
	 * directory because inactivation does nothing for directories.  We're
	 * done with the inode that we want to repair, so roll the transaction
	 * and drop its ILOCK before we tackle the temporary file.
	 */
	error = xfs_trans_roll_inode(&rd->sc->tp, rd->sc->tempip);
	if (error)
		return error;
	xfs_iunlock(rd->sc->ip, XFS_ILOCK_EXCL);
	rd->sc->ilock_flags &= ~XFS_ILOCK_EXCL;

	return xrep_dir_reset_fork(rd->sc, rd->sc->tempip,
			rd->sc->mp->m_rootip->i_ino);
}

/*
 * Parent scan callback.  The directory @dp was found to contain an entry that
 * points to the directory we're rebuilding, which makes @dp the parent.
 * Record it, or fail with -EFSCORRUPTED if a parent has already been
 * recorded.
 */
STATIC int
xrep_dir_absorb_parent(
	struct xfs_inode	*dp,
	struct xfs_name		*name,
	unsigned int		dtype,
	void			*data)
{
	struct xrep_dir		*rd = data;
	int			ret = 0;

	/* A directory can't have more than one parent. */
	if (rd->parent_ino != NULLFSINO)
		return -EFSCORRUPTED;

	if (xchk_should_terminate(rd->sc, &ret))
		return ret;

	/* Remember this potential parent. */
	rd->parent_ino = dp->i_ino;
	return 0;
}

/*
 * Make sure we return with a valid parent inode.
 *
 * If the directory salvaging step found a single '..' entry, check the
 * alleged parent for a dentry pointing to the directory.  If this succeeds,
 * we're done.  Otherwise, scan the entire filesystem for a parent.
 *
 * Returns 0 with rd->parent_ino set to the parent, -EFSCORRUPTED if the
 * filesystem scan finds no parent, or whatever error the scan itself returns.
 */
STATIC int
xrep_dir_validate_parent(
	struct xrep_dir		*rd)
{
	struct xfs_scrub	*sc = rd->sc;
	struct xfs_inode	*parent;
	xfs_nlink_t		expected_nlink, nlink;
	int			error;

	/*
	 * If the directory salvage scan found no parent or found an obviously
	 * incorrect parent, try asking the dcache for the parent.
	 *
	 * If the dcache doesn't know about a parent or the parent seems
	 * obviously incorrect, jump to the filesystem scan.
	 *
	 * Otherwise, if the alleged parent seems plausible, scan the directory
	 * to make sure it really points to us.
	 */
	if (!xrep_parent_acceptable(sc, rd->parent_ino))
		rd->parent_ino = xrep_parent_check_dcache(sc->ip);
	if (!xrep_parent_acceptable(sc, rd->parent_ino))
		goto scan;

	/*
	 * Grab this parent inode.  Since we release the inode before we cancel
	 * the scrub transaction and don't know if releasing the inode will
	 * trigger eofblocks cleanup (which allocates what would be a nested
	 * transaction), we avoid DONTCACHE here.
	 */
	error = xfs_iget(sc->mp, sc->tp, rd->parent_ino, XFS_IGET_UNTRUSTED, 0,
			&parent);
	if (error)
		goto scan;
	if (!S_ISDIR(VFS_I(parent)->i_mode))
		goto rele_scan;

	/*
	 * We prefer to keep the inode locked while we lock and search its
	 * alleged parent for a forward reference.  If we can grab the iolock,
	 * validate the pointers and we're done.  We must use nowait here to
	 * avoid an ABBA deadlock on the parent and the child inodes.
	 */
	if (!xfs_ilock_nowait(parent, XFS_IOLOCK_SHARED))
		goto rele_scan;

	/*
	 * If we're an unlinked directory, the parent /won't/ have a link
	 * to us.  Otherwise, it should have one link.
	 */
	expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;

	error = xchk_parent_count_parent_dentries(sc, parent, &nlink);
	if (error)
		goto unlock_rele_scan;

	/* The parent is an exact match, we're done. */
	if (nlink == expected_nlink) {
		xfs_iunlock(parent, XFS_IOLOCK_SHARED);
		xfs_irele(parent);
		return 0;
	}

unlock_rele_scan:
	xfs_iunlock(parent, XFS_IOLOCK_SHARED);
rele_scan:
	xfs_irele(parent);
scan:
	/*
	 * If we're an unlinked directory, the parent /won't/ have a link
	 * to us.  Set the parent directory to the root.
	 */
	if (VFS_I(sc->ip)->i_nlink == 0) {
		rd->parent_ino = sc->mp->m_sb.sb_rootino;
		return 0;
	}

	/*
	 * Forget whatever candidate got us here (a rejected inode number, or
	 * zero if salvaging found too many parents).  The scan callback treats
	 * anything other than NULLFSINO as "parent already found" and would
	 * otherwise fail on the very first parent that the scan turns up.
	 */
	rd->parent_ino = NULLFSINO;

	/* Scan the entire directory tree for the directory's parent. */
	error = xrep_scan_for_parents(sc, sc->ip->i_ino,
			xrep_dir_absorb_parent, rd);
	if (error)
		return error;

	return rd->parent_ino == NULLFSINO ? -EFSCORRUPTED : 0;
}

/*
 * Repair the directory metadata.
 *
 * Salvage whatever entries can be found in the directory, settle on a parent,
 * and then rebuild the directory from the salvaged entries.  The in-memory
 * entry and name stores are torn down before returning, whatever the outcome.
 *
 * XXX: Directory entry buffers can be multiple fsblocks in size.  The buffer
 * cache in XFS can't handle aliased multiblock buffers, so this might
 * misbehave if the directory blocks are crosslinked with other filesystem
 * metadata.
 *
 * XXX: Is it necessary to check the dcache for this directory to make sure
 * that we always recreate every cached entry?
 */
int
xrep_dir(
	struct xfs_scrub	*sc)
{
	struct xrep_dir		rd = {
		.sc		= sc,
		.parent_ino	= NULLFSINO,
		.new_nlink	= 2,
	};
	int			error;

	/* Set up storage for the salvaged entries and their names. */
	rd.dir_entries = xfbma_init(sizeof(struct xrep_dir_key));
	if (IS_ERR(rd.dir_entries))
		return PTR_ERR(rd.dir_entries);

	rd.dir_names = xblob_init();
	if (IS_ERR(rd.dir_names)) {
		error = PTR_ERR(rd.dir_names);
		goto out_entries;
	}

	/*
	 * The directory scrubber might have dropped the ILOCK, so pick it up
	 * again.
	 */
	if (!(sc->ilock_flags & XFS_ILOCK_EXCL)) {
		xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
		sc->ilock_flags |= XFS_ILOCK_EXCL;
	}

	/* Collect directory entries by parsing the raw directory blocks. */
	error = xrep_dir_find_entries(&rd);
	if (error)
		goto out_names;

	/*
	 * Validate the parent pointer that we observed while salvaging the
	 * directory; or scan the filesystem to find one.  We drop the ILOCK
	 * on the directory being repaired to avoid ABBA deadlocks, though we
	 * maintain the directory IOLOCK to prevent concurrent modifications.
	 */
	xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
	error = xrep_dir_validate_parent(&rd);
	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
	if (error)
		goto out_names;

	/* Now rebuild the directory information. */
	error = xrep_dir_rebuild_tree(&rd);

out_names:
	xblob_destroy(rd.dir_names);
out_entries:
	xfbma_destroy(rd.dir_entries);
	return error;
}