summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Darrick J. Wong <darrick.wong@oracle.com> 2020-05-15 17:15:03 -0700
committer Darrick J. Wong <darrick.wong@oracle.com> 2020-06-24 18:12:16 -0700
commit dba9435b226c01df4bcd3bd1fdd42a5c189968e5 (patch)
tree 30b9ca166d9627b1269fa3b67e21d2910cb10a33
parent e812e6bd89dc6489ca924756635fb81855091700 (diff)
xfs: proper replay of deferred ops queued during log recovery
When we replay unfinished intent items that have been recovered from the log, it's possible that the replay will cause the creation of more deferred work items. As outlined in commit 509955823cc9c ("xfs: log recovery should replay deferred ops in order"), later work items have an implicit ordering dependency on earlier work items. Therefore, recovery must replay the items (both recovered and created) in the same order that they would have been during normal operation.

For log recovery, we enforce this ordering by using an empty transaction to collect deferred ops that get created in the process of recovering a log intent item to prevent them from being committed before the rest of the recovered intent items. After we finish committing all the recovered log items, we allocate a transaction with an enormous block reservation, splice our huge list of created deferred ops into that transaction, and commit it, thereby finishing all those ops.

This is /really/ hokey -- it's the one place in XFS where we allow nested transactions; the splicing of the defer ops list is inelegant and has to be done twice per recovery function; and the broken way we handle inode pointers and block reservations causes subtle use-after-free and allocator problems that will be fixed by this patch and the two patches after it.

Therefore, replace the hokey empty transaction with a structure designed to capture each chain of deferred ops that are created as part of recovering a single unfinished log intent. Finally, refactor the loop that replays those chains to do so using one transaction per chain.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
-rw-r--r-- fs/xfs/libxfs/xfs_defer.c 56
-rw-r--r-- fs/xfs/libxfs/xfs_defer.h 24
-rw-r--r-- fs/xfs/libxfs/xfs_log_recover.h 4
-rw-r--r-- fs/xfs/xfs_bmap_item.c 16
-rw-r--r-- fs/xfs/xfs_extfree_item.c 7
-rw-r--r-- fs/xfs/xfs_log_recover.c 127
-rw-r--r-- fs/xfs/xfs_refcount_item.c 16
-rw-r--r-- fs/xfs/xfs_rmap_item.c 7
-rw-r--r-- fs/xfs/xfs_trans.h 4
9 files changed, 184 insertions, 77 deletions
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index d8f586256add..11fe61a3026b 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -552,3 +552,59 @@ xfs_defer_move(
xfs_defer_reset(stp);
}
+
+/*
+ * Capture the chain of deferred ops attached to @tp in a new capture structure
+ * returned via @dfcp. Returns 0 with *dfcp == NULL if @tp has no dfops.
+ */
+int
+xfs_defer_capture(
+ struct xfs_trans *tp,
+ struct xfs_defer_capture **dfcp)
+{
+ struct xfs_defer_capture *dfc;
+
+ *dfcp = NULL;
+ if (list_empty(&tp->t_dfops))
+ return 0;
+
+ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+ if (!dfc)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&dfc->dfc_list);
+ INIT_LIST_HEAD(&dfc->dfc_dfops);
+
+ /* Move the dfops chain and the LOWMODE flag to the capture structure. */
+ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
+ dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
+ xfs_defer_reset(tp);
+
+ *dfcp = dfc;
+ return 0;
+}
+
+/* Attach captured dfops to a clean, permanent-reservation transaction. */
+void
+xfs_defer_continue(
+ struct xfs_defer_capture *dfc,
+ struct xfs_trans *tp)
+{
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
+ /* Move captured dfops chain and saved flags to tp; dfc is left empty. */
+ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
+ tp->t_flags |= dfc->dfc_tpflags;
+ dfc->dfc_tpflags = 0;
+}
+
+/* Cancel any dfops still held by the capture structure, then free it. */
+void
+xfs_defer_capture_free(
+ struct xfs_mount *mp,
+ struct xfs_defer_capture *dfc)
+{
+ xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
+ kmem_free(dfc);
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 6b2ca580f2b0..d61ef0a750ab 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -8,6 +8,7 @@
struct xfs_btree_cur;
struct xfs_defer_op_type;
+struct xfs_defer_capture;
/*
* Header for deferred operation list.
@@ -63,4 +64,27 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+/*
+ * Deferred operation capture. This structure enables a dfops user to detach
+ * the chain of deferred operations from a transaction so that they can be
+ * continued later.
+ */
+struct xfs_defer_capture {
+ /* Links this capture into a caller-owned list of captures. */
+ struct list_head dfc_list;
+
+ /* Deferred ops state saved from the transaction. */
+ struct list_head dfc_dfops;
+ unsigned int dfc_tpflags;
+};
+
+/*
+ * Functions to capture a chain of deferred operations and continue them later.
+ * This doesn't normally happen except log recovery.
+ */
+int xfs_defer_capture(struct xfs_trans *tp, struct xfs_defer_capture **dfcp);
+void xfs_defer_continue(struct xfs_defer_capture *dfc, struct xfs_trans *tp);
+void xfs_defer_capture_free(struct xfs_mount *mp,
+ struct xfs_defer_capture *dfc);
+
#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 641132d0e39d..c3563c5c033c 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -6,6 +6,8 @@
#ifndef __XFS_LOG_RECOVER_H__
#define __XFS_LOG_RECOVER_H__
+struct xfs_defer_capture;
+
/*
* Each log item type (XFS_LI_*) gets its own xlog_recover_item_ops to
* define how recovery should work for that type of log item.
@@ -125,5 +127,7 @@ void xlog_recover_iodone(struct xfs_buf *bp);
void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
uint64_t intent_id);
+int xlog_recover_trans_commit(struct xfs_trans *tp,
+ struct xfs_defer_capture **dfcp);
#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 6736c5ab188f..fd56f354cf20 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -423,13 +423,13 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
STATIC int
xfs_bui_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct xfs_defer_capture **dfcp)
{
struct xfs_bmbt_irec irec;
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
struct xfs_map_extent *bmap;
struct xfs_bud_log_item *budp;
xfs_fsblock_t startblock_fsb;
@@ -485,12 +485,7 @@ xfs_bui_item_recover(
XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
if (error)
return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
+
budp = xfs_trans_get_bud(tp, buip);
/* Grab the inode. */
@@ -534,15 +529,12 @@ xfs_bui_item_recover(
xfs_bmap_unmap_extent(tp, ip, &irec);
}
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
+ error = xlog_recover_trans_commit(tp, dfcp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
-
return error;
err_inode:
- xfs_defer_move(parent_tp, tp);
xfs_trans_cancel(tp);
if (ip) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index b9c333bae0a1..951855364eae 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -583,10 +583,10 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
STATIC int
xfs_efi_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct xfs_defer_capture **dfcp)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
struct xfs_extent *extp;
@@ -631,8 +631,7 @@ xfs_efi_item_recover(
}
- error = xfs_trans_commit(tp);
- return error;
+ return xlog_recover_trans_commit(tp, dfcp);
abort_error:
xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index ec015df55b77..0cc7af4247e0 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1792,6 +1792,26 @@ xlog_recover_release_intent(
spin_unlock(&ailp->ail_lock);
}
+/*
+ * Capture any deferred ops in @dfcp, then commit @tp (or cancel it if capture
+ * fails). This is the last step in recovering a log intent item.
+ */
+int
+xlog_recover_trans_commit(
+ struct xfs_trans *tp,
+ struct xfs_defer_capture **dfcp)
+{
+ int error;
+
+ error = xfs_defer_capture(tp, dfcp);
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ return xfs_trans_commit(tp);
+}
+
/******************************************************************************
*
* Log recover routines
@@ -2468,38 +2488,64 @@ xlog_recover_process_data(
return 0;
}
+static void
+xlog_cancel_defer_ops(
+ struct xfs_mount *mp,
+ struct list_head *dfops_freezers) /* captures to cancel and free */
+{
+ struct xfs_defer_capture *dfc, *next;
+
+ list_for_each_entry_safe(dfc, next, dfops_freezers, dfc_list) {
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_capture_free(mp, dfc);
+ }
+}
+
/* Take all the collected deferred ops and finish them in order. */
static int
xlog_finish_defer_ops(
- struct xfs_trans *parent_tp)
+ struct xfs_mount *mp,
+ struct list_head *dfops_freezers)
{
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_defer_capture *dfc, *next;
struct xfs_trans *tp;
int64_t freeblks;
- uint resblks;
- int error;
+ uint64_t resblks;
+ int error = 0;
- /*
- * We're finishing the defer_ops that accumulated as a result of
- * recovering unfinished intent items during log recovery. We
- * reserve an itruncate transaction because it is the largest
- * permanent transaction type. Since we're the only user of the fs
- * right now, take 93% (15/16) of the available free blocks. Use
- * weird math to avoid a 64-bit division.
- */
- freeblks = percpu_counter_sum(&mp->m_fdblocks);
- if (freeblks <= 0)
- return -ENOSPC;
- resblks = min_t(int64_t, UINT_MAX, freeblks);
- resblks = (resblks * 15) >> 4;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
- /* transfer all collected dfops to this transaction */
- xfs_defer_move(tp, parent_tp);
+ list_for_each_entry_safe(dfc, next, dfops_freezers, dfc_list) {
+ /*
+ * We're finishing the defer_ops that accumulated as a result
+ * of recovering unfinished intent items during log recovery.
+ * We reserve an itruncate transaction because it is the
+ * largest permanent transaction type. Since we're the only
+ * user of the fs right now, take 15/16 (~93%) of the available
+ * free blocks, using multiply-and-shift to avoid a division.
+ */
+ freeblks = percpu_counter_sum(&mp->m_fdblocks);
+ if (freeblks <= 0) {
+ error = -ENOSPC;
+ break;
+ }
- return xfs_trans_commit(tp);
+ resblks = min_t(uint64_t, UINT_MAX, freeblks);
+ resblks = (resblks * 15) >> 4;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
+ 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ break;
+
+ /* transfer this chain's captured dfops to the transaction */
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_continue(dfc, tp);
+
+ error = xfs_trans_commit(tp);
+ xfs_defer_capture_free(mp, dfc);
+ if (error)
+ break;
+ }
+
+ return error;
}
/* Is this log item a deferred action intent? */
@@ -2529,8 +2575,9 @@ STATIC int
xlog_recover_process_intents(
struct xlog *log)
{
- struct xfs_trans *parent_tp;
+ LIST_HEAD(dfops_freezers);
struct xfs_ail_cursor cur;
+ struct xfs_defer_capture *freezer = NULL;
struct xfs_log_item *lip;
struct xfs_ail *ailp;
int error;
@@ -2538,19 +2585,6 @@ xlog_recover_process_intents(
xfs_lsn_t last_lsn;
#endif
- /*
- * The intent recovery handlers commit transactions to complete recovery
- * for individual intents, but any new deferred operations that are
- * queued during that process are held off until the very end. The
- * purpose of this transaction is to serve as a container for deferred
- * operations. Each intent recovery handler must transfer dfops here
- * before its local transaction commits, and we'll finish the entire
- * list below.
- */
- error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
- if (error)
- return error;
-
ailp = log->l_ailp;
spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
@@ -2579,26 +2613,31 @@ xlog_recover_process_intents(
/*
* NOTE: If your intent processing routine can create more
- * deferred ops, you /must/ attach them to the transaction in
+ * deferred ops, you /must/ attach them to the freezer in
* this routine or else those subsequent intents will get
* replayed in the wrong order!
*/
if (!test_and_set_bit(XFS_LI_RECOVERED, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
- error = lip->li_ops->iop_recover(lip, parent_tp);
+ error = lip->li_ops->iop_recover(lip, &freezer);
spin_lock(&ailp->ail_lock);
}
+ if (freezer) {
+ list_add_tail(&freezer->dfc_list, &dfops_freezers);
+ freezer = NULL;
+ }
if (error)
- goto out;
+ break;
+
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
-out:
+
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
if (!error)
- error = xlog_finish_defer_ops(parent_tp);
- xfs_trans_cancel(parent_tp);
+ error = xlog_finish_defer_ops(log->l_mp, &dfops_freezers);
+ xlog_cancel_defer_ops(log->l_mp, &dfops_freezers);
return error;
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index c81639891e29..69699d22b254 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -423,7 +423,7 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
STATIC int
xfs_cui_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct xfs_defer_capture **dfcp)
{
struct xfs_bmbt_irec irec;
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
@@ -431,7 +431,7 @@ xfs_cui_item_recover(
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
xfs_fsblock_t startblock_fsb;
xfs_fsblock_t new_fsb;
xfs_extlen_t new_len;
@@ -492,12 +492,7 @@ xfs_cui_item_recover(
mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
+
cudp = xfs_trans_get_cud(tp, cuip);
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
@@ -554,13 +549,10 @@ xfs_cui_item_recover(
}
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
- return error;
+ return xlog_recover_trans_commit(tp, dfcp);
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- xfs_defer_move(parent_tp, tp);
xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index a86599db20a6..11520e1d352f 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -466,14 +466,14 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
STATIC int
xfs_rui_item_recover(
struct xfs_log_item *lip,
- struct xfs_trans *parent_tp)
+ struct xfs_defer_capture **dfcp)
{
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
struct xfs_map_extent *rmap;
struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = lip->li_mountp;
xfs_fsblock_t startblock_fsb;
enum xfs_rmap_intent_type type;
xfs_exntst_t state;
@@ -572,8 +572,7 @@ xfs_rui_item_recover(
}
xfs_rmap_finish_one_cleanup(tp, rcur, error);
- error = xfs_trans_commit(tp);
- return error;
+ return xlog_recover_trans_commit(tp, dfcp);
abort_error:
xfs_rmap_finish_one_cleanup(tp, rcur, error);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 8308bf6d7e40..78fa4267bb8e 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -26,6 +26,7 @@ struct xfs_cui_log_item;
struct xfs_cud_log_item;
struct xfs_bui_log_item;
struct xfs_bud_log_item;
+struct xfs_defer_capture;
struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
@@ -79,7 +80,8 @@ struct xfs_item_ops {
void (*iop_release)(struct xfs_log_item *);
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
void (*iop_error)(struct xfs_log_item *, xfs_buf_t *);
- int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp);
+ int (*iop_recover)(struct xfs_log_item *lip,
+ struct xfs_defer_capture **dfcp);
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
};