bcachefs: Journal replay refactoring

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
author: Kent Overstreet <kent.overstreet@gmail.com> 2019-04-11 22:39:39 -0400
committer: Kent Overstreet <kent.overstreet@linux.dev> 2023-10-22 17:08:20 -0400
commit: 644d180b055fa47be7e6ca8b684f45e2350dfafd (patch)
tree: e7842030427308ac1f4b7c69b5f365e7e6bb39aa
parent: 3ea2b1e12898154d6fae49b22a3509521ba49d38 (diff)
5 files changed, 251 insertions, 240 deletions
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 969612e612e0..25d0631c43dd 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -984,9 +984,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
 	u64 last_seq = cur_seq, nr, seq;
 
 	if (!list_empty(journal_entries))
-		last_seq = le64_to_cpu(list_last_entry(journal_entries,
-						       struct journal_replay,
-						       list)->j.last_seq);
+		last_seq = le64_to_cpu(list_first_entry(journal_entries,
+							struct journal_replay,
+							list)->j.seq);
 
 	nr = cur_seq - last_seq;
 
@@ -999,6 +999,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
 		}
 	}
 
+	j->replay_journal_seq	= last_seq;
+	j->replay_journal_seq_end = cur_seq;
 	j->last_seq_ondisk	= last_seq;
 	j->pin.front		= last_seq;
 	j->pin.back		= cur_seq;
@@ -1007,7 +1009,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
 	fifo_for_each_entry_ptr(p, &j->pin, seq) {
 		INIT_LIST_HEAD(&p->list);
 		INIT_LIST_HEAD(&p->flushed);
-		atomic_set(&p->count, 0);
+		atomic_set(&p->count, 1);
 		p->devs.nr = 0;
 	}
 
@@ -1016,10 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
 
 		BUG_ON(seq < last_seq || seq >= cur_seq);
 
-		p = journal_seq_pin(j, seq);
-
-		atomic_set(&p->count, 1);
-		p->devs = i->devs;
+		journal_seq_pin(j, seq)->devs = i->devs;
 	}
 
 	spin_lock(&j->lock);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 8010b38114ac..4fd7b048050b 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1,9 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
-#include "alloc_background.h"
 #include "alloc_foreground.h"
-#include "btree_gc.h"
-#include "btree_update.h"
 #include "buckets.h"
 #include "checksum.h"
 #include "error.h"
@@ -642,18 +639,6 @@ err:
 	goto out;
 }
 
-void bch2_journal_entries_free(struct list_head *list)
-{
-
-	while (!list_empty(list)) {
-		struct journal_replay *i =
-			list_first_entry(list, struct journal_replay, list);
-		list_del(&i->list);
-		kvpfree(i, offsetof(struct journal_replay, j) +
-			vstruct_bytes(&i->j));
-	}
-}
-
 int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 {
 	struct journal_list jlist;
@@ -733,121 +718,6 @@ fsck_err:
 	return ret;
 }
 
-/* journal replay: */
-
-static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
-{
-	struct btree_trans trans;
-	struct btree_iter *iter;
-	/*
-	 * We might cause compressed extents to be
-	 * split, so we need to pass in a
-	 * disk_reservation:
-	 */
-	struct disk_reservation disk_res =
-		bch2_disk_reservation_init(c, 0);
-	BKEY_PADDED(k) split;
-	int ret;
-
-	bch2_trans_init(&trans, c);
-
-	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-				   bkey_start_pos(&k->k),
-				   BTREE_ITER_INTENT);
-	do {
-		ret = bch2_btree_iter_traverse(iter);
-		if (ret)
-			break;
-
-		bkey_copy(&split.k, k);
-		bch2_cut_front(iter->pos, &split.k);
-		bch2_extent_trim_atomic(&split.k, iter);
-
-		ret = bch2_disk_reservation_add(c, &disk_res,
-				split.k.k.size *
-				bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)),
-				BCH_DISK_RESERVATION_NOFAIL);
-		BUG_ON(ret);
-
-		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k));
-		ret = bch2_trans_commit(&trans, &disk_res, NULL,
-					BTREE_INSERT_ATOMIC|
-					BTREE_INSERT_NOFAIL|
-					BTREE_INSERT_LAZY_RW|
-					BTREE_INSERT_JOURNAL_REPLAY);
-	} while ((!ret || ret == -EINTR) &&
-		 bkey_cmp(k->k.p, iter->pos));
-
-	bch2_disk_reservation_put(c, &disk_res);
-
-	/*
-	 * This isn't strictly correct - we should only be relying on the btree
-	 * node lock for synchronization with gc when we've got a write lock
-	 * held.
-	 *
-	 * but - there are other correctness issues if btree gc were to run
-	 * before journal replay finishes
-	 */
-	BUG_ON(c->gc_pos.phase);
-
-	bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
-		      NULL, 0, 0);
-	bch2_trans_exit(&trans);
-
-	return ret;
-}
-
-int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
-{
-	struct journal *j = &c->journal;
-	struct bkey_i *k, *_n;
-	struct jset_entry *entry;
-	struct journal_replay *i, *n;
-	int ret = 0;
-
-	list_for_each_entry_safe(i, n, list, list) {
-		j->replay_journal_seq = le64_to_cpu(i->j.seq);
-
-		for_each_jset_key(k, _n, entry, &i->j) {
-			switch (entry->btree_id) {
-			case BTREE_ID_ALLOC:
-				ret = bch2_alloc_replay_key(c, k);
-				break;
-			case BTREE_ID_EXTENTS:
-				ret = bch2_extent_replay_key(c, k);
-				break;
-			default:
-				ret = bch2_btree_insert(c, entry->btree_id, k,
-						NULL, NULL,
-						BTREE_INSERT_NOFAIL|
-						BTREE_INSERT_LAZY_RW|
-						BTREE_INSERT_JOURNAL_REPLAY|
-						BTREE_INSERT_NOMARK);
-				break;
-			}
-
-			if (ret) {
-				bch_err(c, "journal replay: error %d while replaying key",
-					ret);
-				goto err;
-			}
-
-			cond_resched();
-		}
-
-		bch2_journal_pin_put(j, j->replay_journal_seq);
-	}
-
-	j->replay_journal_seq = 0;
-
-	bch2_journal_set_replay_done(j);
-	bch2_journal_flush_all_pins(j);
-	ret = bch2_journal_error(j);
-err:
-	bch2_journal_entries_free(list);
-	return ret;
-}
-
 /* journal write: */
 
 static void __journal_write_alloc(struct journal *j,
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 4bb174839956..72e575f360af 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -36,8 +36,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
 		vstruct_for_each_safe(entry, k, _n)
 
 int bch2_journal_read(struct bch_fs *, struct list_head *);
-void bch2_journal_entries_free(struct list_head *);
-int bch2_journal_replay(struct bch_fs *, struct list_head *);
 
 void bch2_journal_write(struct closure *);
 
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 7349b50bc5e7..0585e9b6e230 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -203,6 +203,7 @@ struct journal {
 	}			pin;
 
 	u64			replay_journal_seq;
+	u64			replay_journal_seq_end;
 
 	struct write_point	wp;
 	spinlock_t		err_lock;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index b1fcc105cffd..2e849135195d 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -12,94 +12,162 @@
 #include "error.h"
 #include "fsck.h"
 #include "journal_io.h"
+#include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
 #include "quota.h"
 #include "recovery.h"
 #include "replicas.h"
 #include "super-io.h"
 
+#include <linux/sort.h>
 #include <linux/stat.h>
 
 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
 
-static struct bkey_i *btree_root_find(struct bch_fs *c,
-				      struct bch_sb_field_clean *clean,
-				      struct jset *j,
-				      enum btree_id id, unsigned *level)
+/* journal replay: */
+
+static void bch2_journal_entries_free(struct list_head *list)
 {
-	struct bkey_i *k;
-	struct jset_entry *entry, *start, *end;
 
-	if (clean) {
-		start = clean->start;
-		end = vstruct_end(&clean->field);
-	} else {
-		start = j->start;
-		end = vstruct_last(j);
+	while (!list_empty(list)) {
+		struct journal_replay *i =
+			list_first_entry(list, struct journal_replay, list);
+		list_del(&i->list);
+		kvpfree(i, offsetof(struct journal_replay, j) +
+			vstruct_bytes(&i->j));
 	}
+}
 
-	for (entry = start; entry < end; entry = vstruct_next(entry))
-		if (entry->type == BCH_JSET_ENTRY_btree_root &&
-		    entry->btree_id == id)
-			goto found;
+static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	/*
+	 * We might cause compressed extents to be
+	 * split, so we need to pass in a
+	 * disk_reservation:
+	 */
+	struct disk_reservation disk_res =
+		bch2_disk_reservation_init(c, 0);
+	BKEY_PADDED(k) split;
+	int ret;
 
-	return NULL;
-found:
-	if (!entry->u64s)
-		return ERR_PTR(-EINVAL);
+	bch2_trans_init(&trans, c);
 
-	k = entry->start;
-	*level = entry->level;
-	return k;
-}
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+				   bkey_start_pos(&k->k),
+				   BTREE_ITER_INTENT);
+	do {
+		ret = bch2_btree_iter_traverse(iter);
+		if (ret)
+			break;
 
-static int verify_superblock_clean(struct bch_fs *c,
-				   struct bch_sb_field_clean **cleanp,
-				   struct jset *j)
-{
-	unsigned i;
-	struct bch_sb_field_clean *clean = *cleanp;
-	int ret = 0;
+		bkey_copy(&split.k, k);
+		bch2_cut_front(iter->pos, &split.k);
+		bch2_extent_trim_atomic(&split.k, iter);
 
-	if (!clean || !j)
-		return 0;
+		ret = bch2_disk_reservation_add(c, &disk_res,
+				split.k.k.size *
+				bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)),
+				BCH_DISK_RESERVATION_NOFAIL);
+		BUG_ON(ret);
 
-	if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
-			"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
-			le64_to_cpu(clean->journal_seq),
-			le64_to_cpu(j->seq))) {
-		kfree(clean);
-		*cleanp = NULL;
-		return 0;
+		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k));
+		ret = bch2_trans_commit(&trans, &disk_res, NULL,
+					BTREE_INSERT_ATOMIC|
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_LAZY_RW|
+					BTREE_INSERT_JOURNAL_REPLAY);
+	} while ((!ret || ret == -EINTR) &&
+		 bkey_cmp(k->k.p, iter->pos));
+
+	bch2_disk_reservation_put(c, &disk_res);
+
+	/*
+	 * This isn't strictly correct - we should only be relying on the btree
+	 * node lock for synchronization with gc when we've got a write lock
+	 * held.
+	 *
+	 * but - there are other correctness issues if btree gc were to run
+	 * before journal replay finishes
+	 */
+	BUG_ON(c->gc_pos.phase);
+
+	bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
+		      NULL, 0, 0);
+	bch2_trans_exit(&trans);
+
+	return ret;
+}
+
+static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id btree_id,
+				   struct bkey_i *k)
+{
+	switch (btree_id) {
+	case BTREE_ID_ALLOC:
+		return bch2_alloc_replay_key(c, k);
+	case BTREE_ID_EXTENTS:
+		return bch2_extent_replay_key(c, k);
+	default:
+		return bch2_btree_insert(c, btree_id, k,
+					 NULL, NULL,
+					 BTREE_INSERT_NOFAIL|
+					 BTREE_INSERT_LAZY_RW|
+					 BTREE_INSERT_JOURNAL_REPLAY|
+					 BTREE_INSERT_NOMARK);
 	}
+}
 
-	mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
-			"superblock read clock doesn't match journal after clean shutdown");
-	mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
-			"superblock read clock doesn't match journal after clean shutdown");
+static void replay_now_at(struct journal *j, u64 seq)
+{
+	BUG_ON(seq < j->replay_journal_seq);
+	BUG_ON(seq > j->replay_journal_seq_end);
 
-	for (i = 0; i < BTREE_ID_NR; i++) {
-		struct bkey_i *k1, *k2;
-		unsigned l1 = 0, l2 = 0;
+	while (j->replay_journal_seq < seq)
+		bch2_journal_pin_put(j, j->replay_journal_seq++);
+}
 
-		k1 = btree_root_find(c, clean, NULL, i, &l1);
-		k2 = btree_root_find(c, NULL, j, i, &l2);
+static int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
+{
+	struct journal *j = &c->journal;
+	struct bkey_i *k, *_n;
+	struct jset_entry *entry;
+	struct journal_replay *i, *n;
+	int ret = 0;
 
-		if (!k1 && !k2)
-			continue;
+	list_for_each_entry_safe(i, n, list, list) {
+		replay_now_at(j, le64_to_cpu(i->j.seq));
 
-		mustfix_fsck_err_on(!k1 || !k2 ||
-				    IS_ERR(k1) ||
-				    IS_ERR(k2) ||
-				    k1->k.u64s != k2->k.u64s ||
-				    memcmp(k1, k2, bkey_bytes(k1)) ||
-				    l1 != l2, c,
-			"superblock btree root doesn't match journal after clean shutdown");
+		for_each_jset_key(k, _n, entry, &i->j) {
+			ret = bch2_journal_replay_key(c, entry->btree_id, k);
+			if (ret) {
+				bch_err(c, "journal replay: error %d while replaying key",
+					ret);
+				goto err;
+			}
+
+			cond_resched();
+		}
 	}
-fsck_err:
+
+	replay_now_at(j, j->replay_journal_seq_end);
+	j->replay_journal_seq = 0;
+
+	bch2_journal_set_replay_done(j);
+	bch2_journal_flush_all_pins(j);
+	ret = bch2_journal_error(j);
+err:
+	bch2_journal_entries_free(list);
 	return ret;
 }
 
+static bool journal_empty(struct list_head *journal)
+{
+	return list_empty(journal) ||
+		journal_entry_empty(&list_last_entry(journal,
+					struct journal_replay, list)->j);
+}
+
 static int
 verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
 						  struct list_head *journal)
@@ -130,40 +198,7 @@ fsck_err:
 	return ret;
 }
 
-static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
-{
-	struct bch_sb_field_clean *clean, *sb_clean;
-	int ret;
-
-	mutex_lock(&c->sb_lock);
-	sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
-
-	if (fsck_err_on(!sb_clean, c,
-			"superblock marked clean but clean section not present")) {
-		SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-		c->sb.clean = false;
-		mutex_unlock(&c->sb_lock);
-		return NULL;
-	}
-
-	clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
-			GFP_KERNEL);
-	if (!clean) {
-		mutex_unlock(&c->sb_lock);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	if (le16_to_cpu(c->disk_sb.sb->version) <
-	    bcachefs_metadata_version_bkey_renumber)
-		bch2_sb_clean_renumber(clean, READ);
-
-	mutex_unlock(&c->sb_lock);
-
-	return clean;
-fsck_err:
-	mutex_unlock(&c->sb_lock);
-	return ERR_PTR(ret);
-}
+/* journal replay early: */
 
 static int journal_replay_entry_early(struct bch_fs *c,
 				      struct jset_entry *entry)
@@ -275,6 +310,121 @@ static int journal_replay_early(struct bch_fs *c,
 	return 0;
 }
 
+/* sb clean section: */
+
+static struct bkey_i *btree_root_find(struct bch_fs *c,
+				      struct bch_sb_field_clean *clean,
+				      struct jset *j,
+				      enum btree_id id, unsigned *level)
+{
+	struct bkey_i *k;
+	struct jset_entry *entry, *start, *end;
+
+	if (clean) {
+		start = clean->start;
+		end = vstruct_end(&clean->field);
+	} else {
+		start = j->start;
+		end = vstruct_last(j);
+	}
+
+	for (entry = start; entry < end; entry = vstruct_next(entry))
+		if (entry->type == BCH_JSET_ENTRY_btree_root &&
+		    entry->btree_id == id)
+			goto found;
+
+	return NULL;
+found:
+	if (!entry->u64s)
+		return ERR_PTR(-EINVAL);
+
+	k = entry->start;
+	*level = entry->level;
+	return k;
+}
+
+static int verify_superblock_clean(struct bch_fs *c,
+				   struct bch_sb_field_clean **cleanp,
+				   struct jset *j)
+{
+	unsigned i;
+	struct bch_sb_field_clean *clean = *cleanp;
+	int ret = 0;
+
+	if (!clean || !j)
+		return 0;
+
+	if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+			"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+			le64_to_cpu(clean->journal_seq),
+			le64_to_cpu(j->seq))) {
+		kfree(clean);
+		*cleanp = NULL;
+		return 0;
+	}
+
+	mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
+			"superblock read clock doesn't match journal after clean shutdown");
+	mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
+			"superblock write clock doesn't match journal after clean shutdown");
+
+	for (i = 0; i < BTREE_ID_NR; i++) {
+		struct bkey_i *k1, *k2;
+		unsigned l1 = 0, l2 = 0;
+
+		k1 = btree_root_find(c, clean, NULL, i, &l1);
+		k2 = btree_root_find(c, NULL, j, i, &l2);
+
+		if (!k1 && !k2)
+			continue;
+
+		mustfix_fsck_err_on(!k1 || !k2 ||
+				    IS_ERR(k1) ||
+				    IS_ERR(k2) ||
+				    k1->k.u64s != k2->k.u64s ||
+				    memcmp(k1, k2, bkey_bytes(k1)) ||
+				    l1 != l2, c,
+			"superblock btree root doesn't match journal after clean shutdown");
+	}
+fsck_err:
+	return ret;
+}
+
+static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
+{
+	struct bch_sb_field_clean *clean, *sb_clean;
+	int ret;
+
+	mutex_lock(&c->sb_lock);
+	sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
+
+	if (fsck_err_on(!sb_clean, c,
+			"superblock marked clean but clean section not present")) {
+		SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+		c->sb.clean = false;
+		mutex_unlock(&c->sb_lock);
+		return NULL;
+	}
+
+	clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+			GFP_KERNEL);
+	if (!clean) {
+		mutex_unlock(&c->sb_lock);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (le16_to_cpu(c->disk_sb.sb->version) <
+	    bcachefs_metadata_version_bkey_renumber)
+		bch2_sb_clean_renumber(clean, READ);
+
+	mutex_unlock(&c->sb_lock);
+
+	return clean;
+fsck_err:
+	mutex_unlock(&c->sb_lock);
+	return ERR_PTR(ret);
+}
+
 static int read_btree_roots(struct bch_fs *c)
 {
 	unsigned i;
@@ -320,13 +470,6 @@ fsck_err:
 	return ret;
 }
 
-static bool journal_empty(struct list_head *journal)
-{
-	return list_empty(journal) ||
-		journal_entry_empty(&list_last_entry(journal,
-					struct journal_replay, list)->j);
-}
-
 int bch2_fs_recovery(struct bch_fs *c)
 {
 	const char *err = "cannot allocate memory";
author	Kent Overstreet <kent.overstreet@gmail.com>	2019-04-11 22:39:39 -0400
committer	Kent Overstreet <kent.overstreet@linux.dev>	2023-10-22 17:08:20 -0400
commit	644d180b055fa47be7e6ca8b684f45e2350dfafd (patch)
tree	e7842030427308ac1f4b7c69b5f365e7e6bb39aa
parent	3ea2b1e12898154d6fae49b22a3509521ba49d38 (diff)