author	Darrick J. Wong <djwong@kernel.org>	2021-09-01 10:54:13 -0700
committer	Darrick J. Wong <djwong@kernel.org>	2021-10-22 16:40:43 -0700
commit	885e9a90393720dc9d061d544d69143c5aff1ef4 (patch)
tree	2fd2d244788d353256c96d4faf6cf265c08b864b /fs/xfs/xfs_swapext_item.c
parent	ef2190a3df498cad2a7b956cd9a0b55c08d2b9f8 (diff)
xfs: create deferred log items for extent swapping
Now that we've created the skeleton of a log intent item to track and
restart extent swap operations, add the upper level logic to commit
intent items and turn them into concrete work recorded in the log.

We use the deferred item "multihop" feature that was introduced a few
patches ago to constrain the number of active swap operations to one
per thread.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
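As context for review, here is a minimal sketch of how a caller could queue one of these deferred swaps once this defer type is wired up. xfs_defer_add() is the existing defer machinery entry point; XFS_DEFER_OPS_TYPE_SWAPEXT and the exact intent layout are assumed from elsewhere in this series rather than shown in this patch:

/*
 * Minimal sketch, not part of this patch: queue an extent swap as
 * deferred work.  The defer machinery logs an SXI intent item when the
 * transaction commits and calls ->finish_item to do the actual swap.
 * XFS_DEFER_OPS_TYPE_SWAPEXT is an assumption from the wider series.
 */
static void
example_schedule_swapext(
	struct xfs_trans		*tp,
	struct xfs_swapext_intent	*sxi)
{
	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_SWAPEXT, &sxi->sxi_list);
}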
Diffstat (limited to 'fs/xfs/xfs_swapext_item.c')
-rw-r--r--	fs/xfs/xfs_swapext_item.c	343
1 file changed, 340 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_swapext_item.c b/fs/xfs/xfs_swapext_item.c
index fb260f90d357..95041fe69ba7 100644
--- a/fs/xfs/xfs_swapext_item.c
+++ b/fs/xfs/xfs_swapext_item.c
@@ -16,13 +16,16 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_swapext_item.h"
+#include "xfs_swapext.h"
#include "xfs_log.h"
#include "xfs_bmap.h"
#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_xchgrange.h"
struct kmem_cache *xfs_sxi_zone;
struct kmem_cache *xfs_sxd_zone;
@@ -195,13 +198,334 @@ static const struct xfs_item_ops xfs_sxd_item_ops = {
.iop_release = xfs_sxd_item_release,
};
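+/*
+ * Allocate an SXD "done" log item, point it at the given SXI intent item,
+ * and attach it to the transaction.
+ */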
+static struct xfs_sxd_log_item *
+xfs_trans_get_sxd(
+ struct xfs_trans *tp,
+ struct xfs_sxi_log_item *sxi_lip)
+{
+ struct xfs_sxd_log_item *sxd_lip;
+
+ sxd_lip = kmem_cache_zalloc(xfs_sxd_zone, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &sxd_lip->sxd_item, XFS_LI_SXD,
+ &xfs_sxd_item_ops);
+ sxd_lip->sxd_intent_log_item = sxi_lip;
+ sxd_lip->sxd_format.sxd_sxi_id = sxi_lip->sxi_format.sxi_id;
+
+ xfs_trans_add_item(tp, &sxd_lip->sxd_item);
+ return sxd_lip;
+}
+
+/*
+ * Finish a swapext update and log it to the SXD. Note that the transaction is
+ * marked dirty regardless of whether the swapext update succeeds or fails to
+ * support the SXI/SXD lifecycle rules.
+ */
+static int
+xfs_swapext_finish_update(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct xfs_swapext_intent *sxi)
+{
+ int error;
+
+ error = xfs_swapext_finish_one(tp, sxi);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the SXI and frees the SXD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ if (done)
+ set_bit(XFS_LI_DIRTY, &done->li_flags);
+
+ return error;
+}
+
+/* Log swapext updates in the intent item. */
+STATIC void
+xfs_swapext_log_item(
+ struct xfs_trans *tp,
+ struct xfs_sxi_log_item *sxi_lip,
+ struct xfs_swapext_intent *sxi)
+{
+ struct xfs_swap_extent *sx;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &sxi_lip->sxi_item.li_flags);
+
+ sx = &sxi_lip->sxi_format.sxi_extent;
+ sx->sx_inode1 = sxi->sxi_ip1->i_ino;
+ sx->sx_inode2 = sxi->sxi_ip2->i_ino;
+ sx->sx_startoff1 = sxi->sxi_startoff1;
+ sx->sx_startoff2 = sxi->sxi_startoff2;
+ sx->sx_blockcount = sxi->sxi_blockcount;
+ sx->sx_isize1 = sxi->sxi_isize1;
+ sx->sx_isize2 = sxi->sxi_isize2;
+ sx->sx_flags = sxi->sxi_flags;
+}
+
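+/* Create an SXI log intent item to capture the deferred swap operations. */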
+STATIC struct xfs_log_item *
+xfs_swapext_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_sxi_log_item *sxi_lip;
+ struct xfs_swapext_intent *sxi;
+
+ ASSERT(count == XFS_SXI_MAX_FAST_EXTENTS);
+
+ /*
+ * We use the same defer ops control machinery to perform extent swaps
+ * even if we lack the machinery to track the operation status through
+ * log items, so don't allocate an intent item unless we know we'll
+ * need it.
+ */
+ if (!xfs_has_atomicswap(tp->t_mountp))
+ return NULL;
+
+ sxi_lip = xfs_sxi_init(tp->t_mountp);
+ xfs_trans_add_item(tp, &sxi_lip->sxi_item);
+ list_for_each_entry(sxi, items, sxi_list)
+ xfs_swapext_log_item(tp, sxi_lip, sxi);
+ return &sxi_lip->sxi_item;
+}
+
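+/* Attach an SXD "done" item to the transaction for the given SXI intent. */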
+STATIC struct xfs_log_item *
+xfs_swapext_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ if (intent == NULL)
+ return NULL;
+ return &xfs_trans_get_sxd(tp, SXI_ITEM(intent))->sxd_item;
+}
+
+/* Process a deferred swapext update. */
+STATIC int
+xfs_swapext_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_swapext_intent *sxi;
+ int error;
+
+ sxi = container_of(item, struct xfs_swapext_intent, sxi_list);
+
+ /*
+ * Swap one more extent between the two files. If there's still more
+ * work to do, we want to requeue ourselves after all other pending
+ * deferred operations have finished. This includes all of the dfops
+ * that we queued directly as well as any new ones created in the
+ * process of finishing the others. Doing so prevents us from queuing
+ * a large number of SXI log items in kernel memory, which in turn
+ * prevents us from pinning the tail of the log (while logging those
+ * new SXI items) until the first SXI items can be processed.
+ */
+ error = xfs_swapext_finish_update(tp, done, sxi);
+ if (error == -EAGAIN)
+ return error;
+
+ kmem_free(sxi);
+ return error;
+}
+
+/* Abort all pending SXIs. */
+STATIC void
+xfs_swapext_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_sxi_release(SXI_ITEM(intent));
+}
+
+/* Cancel a deferred swapext update. */
+STATIC void
+xfs_swapext_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_swapext_intent *sxi;
+
+ sxi = container_of(item, struct xfs_swapext_intent, sxi_list);
+ kmem_free(sxi);
+}
+
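+/* Deferred-work handlers that the defer machinery calls for extent swaps. */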
+const struct xfs_defer_op_type xfs_swapext_defer_type = {
+ .max_items = XFS_SXI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_swapext_create_intent,
+ .abort_intent = xfs_swapext_abort_intent,
+ .create_done = xfs_swapext_create_done,
+ .finish_item = xfs_swapext_finish_item,
+ .cancel_item = xfs_swapext_cancel_item,
+};
+
+/* Is this recovered SXI ok? */
+static inline bool
+xfs_sxi_validate(
+ struct xfs_mount *mp,
+ struct xfs_sxi_log_item *sxi_lip)
+{
+ struct xfs_swap_extent *sx = &sxi_lip->sxi_format.sxi_extent;
+
+ if (!xfs_has_atomicswap(mp))
+ return false;
+
+ if (sxi_lip->sxi_format.__pad != 0)
+ return false;
+
+ if (sx->sx_flags & ~XFS_SWAP_EXT_FLAGS)
+ return false;
+
+ if (!xfs_verify_ino(mp, sx->sx_inode1) ||
+ !xfs_verify_ino(mp, sx->sx_inode2))
+ return false;
+
+ if ((sx->sx_flags & XFS_SWAP_EXT_SET_SIZES) &&
+ (sx->sx_isize1 < 0 || sx->sx_isize2 < 0))
+ return false;
+
+ if (!xfs_verify_fileext(mp, sx->sx_startoff1, sx->sx_blockcount))
+ return false;
+
+ return xfs_verify_fileext(mp, sx->sx_startoff2, sx->sx_blockcount);
+}
+
+/*
+ * Use the recovered log state to create a new request, estimate resource
+ * requirements, and create a new incore intent state.
+ */
+STATIC struct xfs_swapext_intent *
+xfs_sxi_item_recover_intent(
+ struct xfs_mount *mp,
+ const struct xfs_swap_extent *sx,
+ struct xfs_swapext_res *res)
+{
+ struct xfs_swapext_req req;
+ struct xfs_inode *ip1, *ip2;
+ int error;
+
+ /*
+ * Grab both inodes and set IRECOVERY to prevent trimming of post-eof
+ * extents and freeing of unlinked inodes until we're totally done
+ * processing files.
+ */
+ error = xlog_recover_iget(mp, sx->sx_inode1, &ip1);
+ if (error)
+ return ERR_PTR(error);
+ error = xlog_recover_iget(mp, sx->sx_inode2, &ip2);
+ if (error)
+ goto err_rele1;
+
+ req.ip1 = ip1;
+ req.ip2 = ip2;
+ req.startoff1 = sx->sx_startoff1;
+ req.startoff2 = sx->sx_startoff2;
+ req.blockcount = sx->sx_blockcount;
+ req.req_flags = 0;
+
+ if (sx->sx_flags & XFS_SWAP_EXT_ATTR_FORK)
+ req.whichfork = XFS_ATTR_FORK;
+ else
+ req.whichfork = XFS_DATA_FORK;
+
+ if (sx->sx_flags & XFS_SWAP_EXT_SET_SIZES)
+ req.req_flags |= XFS_SWAP_REQ_SET_SIZES;
+ if (sx->sx_flags & XFS_SWAP_EXT_SKIP_FILE1_HOLES)
+ req.req_flags |= XFS_SWAP_REQ_SKIP_FILE1_HOLES;
+
+ xfs_xchg_range_ilock(NULL, ip1, ip2);
+ error = xfs_swapext_estimate(&req, res);
+ xfs_xchg_range_iunlock(ip1, ip2);
+ if (error)
+ goto err_rele2;
+
+ return xfs_swapext_init_intent(&req);
+
+err_rele2:
+ xfs_irele(ip2);
+err_rele1:
+ xfs_irele(ip1);
+ return ERR_PTR(error);
+}
+
/* Process a swapext update intent item that was recovered from the log. */
STATIC int
xfs_sxi_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
- return -EFSCORRUPTED;
+ struct xfs_swapext_intent *sxi;
+ struct xfs_swapext_res res;
+ struct xfs_sxi_log_item *sxi_lip = SXI_ITEM(lip);
+ struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_swap_extent *sx = &sxi_lip->sxi_format.sxi_extent;
+ struct xfs_sxd_log_item *sxd_lip = NULL;
+ struct xfs_trans *tp;
+ struct xfs_inode *ip1, *ip2;
+ int error = 0;
+
+ if (!xfs_sxi_validate(mp, sxi_lip)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &sxi_lip->sxi_format,
+ sizeof(sxi_lip->sxi_format));
+ return -EFSCORRUPTED;
+ }
+
+ sxi = xfs_sxi_item_recover_intent(mp, sx, &res);
+ if (IS_ERR(sxi))
+ return PTR_ERR(sxi);
+
+ ip1 = sxi->sxi_ip1;
+ ip2 = sxi->sxi_ip2;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, res.resblks, 0, 0,
+ &tp);
+ if (error)
+ goto err_rele;
+
+ sxd_lip = xfs_trans_get_sxd(tp, sxi_lip);
+
+ xfs_xchg_range_ilock(tp, ip1, ip2);
+
+ error = xfs_swapext_finish_update(tp, &sxd_lip->sxd_item, sxi);
+ if (error == -EAGAIN) {
+ /*
+ * If there's more extent swapping to be done, we have to
+ * schedule that as a separate deferred operation to be run
+ * after we've finished replaying all of the intents we
+ * recovered from the log. Transfer ownership of the sxi to
+ * the transaction.
+ */
+ xfs_swapext_schedule(tp, sxi);
+ error = 0;
+ sxi = NULL;
+ }
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, sx,
+ sizeof(*sx));
+ if (error)
+ goto err_cancel;
+
+ /*
+ * Commit the transaction.  This frees the transaction and stashes the
+ * inodes for the remaining replay activities.
+ */
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ goto err_unlock;
+
+err_cancel:
+ xfs_trans_cancel(tp);
+err_unlock:
+ xfs_xchg_range_iunlock(ip1, ip2);
+err_rele:
+ kmem_free(sxi);
+ xfs_irele(ip2);
+ xfs_irele(ip1);
+ return error;
}
STATIC bool
@@ -218,8 +542,21 @@ xfs_sxi_item_relog(
struct xfs_log_item *intent,
struct xfs_trans *tp)
{
- ASSERT(0);
- return NULL;
+ struct xfs_sxd_log_item *sxd_lip;
+ struct xfs_sxi_log_item *sxi_lip;
+ struct xfs_swap_extent *sx;
+
+ sx = &SXI_ITEM(intent)->sxi_format.sxi_extent;
+
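+ /*
+ * Mark the old intent done with a new SXD, then log a fresh SXI
+ * carrying the same swap state so the log tail can move past the
+ * old intent.
+ */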
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ sxd_lip = xfs_trans_get_sxd(tp, SXI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &sxd_lip->sxd_item.li_flags);
+
+ sxi_lip = xfs_sxi_init(tp->t_mountp);
+ memcpy(&sxi_lip->sxi_format.sxi_extent, sx, sizeof(*sx));
+ xfs_trans_add_item(tp, &sxi_lip->sxi_item);
+ set_bit(XFS_LI_DIRTY, &sxi_lip->sxi_item.li_flags);
+ return &sxi_lip->sxi_item;
}
static const struct xfs_item_ops xfs_sxi_item_ops = {