summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Kent Overstreet <kent.overstreet@gmail.com> 2019-04-11 22:39:39 -0400
committer: Kent Overstreet <kent.overstreet@linux.dev> 2023-10-22 17:08:20 -0400
commit: 644d180b055fa47be7e6ca8b684f45e2350dfafd (patch)
tree: e7842030427308ac1f4b7c69b5f365e7e6bb39aa
parent: 3ea2b1e12898154d6fae49b22a3509521ba49d38 (diff)
bcachefs: Journal replay refactoring
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r-- fs/bcachefs/journal.c | 15
-rw-r--r-- fs/bcachefs/journal_io.c | 130
-rw-r--r-- fs/bcachefs/journal_io.h | 2
-rw-r--r-- fs/bcachefs/journal_types.h | 1
-rw-r--r-- fs/bcachefs/recovery.c | 343
5 files changed, 251 insertions, 240 deletions
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 969612e612e0..25d0631c43dd 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -984,9 +984,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
u64 last_seq = cur_seq, nr, seq;
if (!list_empty(journal_entries))
- last_seq = le64_to_cpu(list_last_entry(journal_entries,
- struct journal_replay,
- list)->j.last_seq);
+ last_seq = le64_to_cpu(list_first_entry(journal_entries,
+ struct journal_replay,
+ list)->j.seq);
nr = cur_seq - last_seq;
@@ -999,6 +999,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
}
}
+ j->replay_journal_seq = last_seq;
+ j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq;
j->pin.front = last_seq;
j->pin.back = cur_seq;
@@ -1007,7 +1009,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
fifo_for_each_entry_ptr(p, &j->pin, seq) {
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, 0);
+ atomic_set(&p->count, 1);
p->devs.nr = 0;
}
@@ -1016,10 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
BUG_ON(seq < last_seq || seq >= cur_seq);
- p = journal_seq_pin(j, seq);
-
- atomic_set(&p->count, 1);
- p->devs = i->devs;
+ journal_seq_pin(j, seq)->devs = i->devs;
}
spin_lock(&j->lock);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 8010b38114ac..4fd7b048050b 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1,9 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "alloc_background.h"
#include "alloc_foreground.h"
-#include "btree_gc.h"
-#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "error.h"
@@ -642,18 +639,6 @@ err:
goto out;
}
-void bch2_journal_entries_free(struct list_head *list)
-{
-
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
-}
-
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
struct journal_list jlist;
@@ -733,121 +718,6 @@ fsck_err:
return ret;
}
-/* journal replay: */
-
-static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- /*
- * We might cause compressed extents to be
- * split, so we need to pass in a
- * disk_reservation:
- */
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- BKEY_PADDED(k) split;
- int ret;
-
- bch2_trans_init(&trans, c);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
- do {
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- break;
-
- bkey_copy(&split.k, k);
- bch2_cut_front(iter->pos, &split.k);
- bch2_extent_trim_atomic(&split.k, iter);
-
- ret = bch2_disk_reservation_add(c, &disk_res,
- split.k.k.size *
- bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)),
- BCH_DISK_RESERVATION_NOFAIL);
- BUG_ON(ret);
-
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k));
- ret = bch2_trans_commit(&trans, &disk_res, NULL,
- BTREE_INSERT_ATOMIC|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY);
- } while ((!ret || ret == -EINTR) &&
- bkey_cmp(k->k.p, iter->pos));
-
- bch2_disk_reservation_put(c, &disk_res);
-
- /*
- * This isn't strictly correct - we should only be relying on the btree
- * node lock for synchronization with gc when we've got a write lock
- * held.
- *
- * but - there are other correctness issues if btree gc were to run
- * before journal replay finishes
- */
- BUG_ON(c->gc_pos.phase);
-
- bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
- NULL, 0, 0);
- bch2_trans_exit(&trans);
-
- return ret;
-}
-
-int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
-{
- struct journal *j = &c->journal;
- struct bkey_i *k, *_n;
- struct jset_entry *entry;
- struct journal_replay *i, *n;
- int ret = 0;
-
- list_for_each_entry_safe(i, n, list, list) {
- j->replay_journal_seq = le64_to_cpu(i->j.seq);
-
- for_each_jset_key(k, _n, entry, &i->j) {
- switch (entry->btree_id) {
- case BTREE_ID_ALLOC:
- ret = bch2_alloc_replay_key(c, k);
- break;
- case BTREE_ID_EXTENTS:
- ret = bch2_extent_replay_key(c, k);
- break;
- default:
- ret = bch2_btree_insert(c, entry->btree_id, k,
- NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY|
- BTREE_INSERT_NOMARK);
- break;
- }
-
- if (ret) {
- bch_err(c, "journal replay: error %d while replaying key",
- ret);
- goto err;
- }
-
- cond_resched();
- }
-
- bch2_journal_pin_put(j, j->replay_journal_seq);
- }
-
- j->replay_journal_seq = 0;
-
- bch2_journal_set_replay_done(j);
- bch2_journal_flush_all_pins(j);
- ret = bch2_journal_error(j);
-err:
- bch2_journal_entries_free(list);
- return ret;
-}
-
/* journal write: */
static void __journal_write_alloc(struct journal *j,
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 4bb174839956..72e575f360af 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -36,8 +36,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
vstruct_for_each_safe(entry, k, _n)
int bch2_journal_read(struct bch_fs *, struct list_head *);
-void bch2_journal_entries_free(struct list_head *);
-int bch2_journal_replay(struct bch_fs *, struct list_head *);
void bch2_journal_write(struct closure *);
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 7349b50bc5e7..0585e9b6e230 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -203,6 +203,7 @@ struct journal {
} pin;
u64 replay_journal_seq;
+ u64 replay_journal_seq_end;
struct write_point wp;
spinlock_t err_lock;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index b1fcc105cffd..2e849135195d 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -12,94 +12,162 @@
#include "error.h"
#include "fsck.h"
#include "journal_io.h"
+#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
+#include <linux/sort.h>
#include <linux/stat.h>
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-static struct bkey_i *btree_root_find(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct jset *j,
- enum btree_id id, unsigned *level)
+/* journal replay: */
+
+static void bch2_journal_entries_free(struct list_head *list)
{
- struct bkey_i *k;
- struct jset_entry *entry, *start, *end;
- if (clean) {
- start = clean->start;
- end = vstruct_end(&clean->field);
- } else {
- start = j->start;
- end = vstruct_last(j);
+ while (!list_empty(list)) {
+ struct journal_replay *i =
+ list_first_entry(list, struct journal_replay, list);
+ list_del(&i->list);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
}
+}
- for (entry = start; entry < end; entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_btree_root &&
- entry->btree_id == id)
- goto found;
+static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ /*
+ * We might cause compressed extents to be
+ * split, so we need to pass in a
+ * disk_reservation:
+ */
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(c, 0);
+ BKEY_PADDED(k) split;
+ int ret;
- return NULL;
-found:
- if (!entry->u64s)
- return ERR_PTR(-EINVAL);
+ bch2_trans_init(&trans, c);
- k = entry->start;
- *level = entry->level;
- return k;
-}
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ bkey_start_pos(&k->k),
+ BTREE_ITER_INTENT);
+ do {
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ break;
-static int verify_superblock_clean(struct bch_fs *c,
- struct bch_sb_field_clean **cleanp,
- struct jset *j)
-{
- unsigned i;
- struct bch_sb_field_clean *clean = *cleanp;
- int ret = 0;
+ bkey_copy(&split.k, k);
+ bch2_cut_front(iter->pos, &split.k);
+ bch2_extent_trim_atomic(&split.k, iter);
- if (!clean || !j)
- return 0;
+ ret = bch2_disk_reservation_add(c, &disk_res,
+ split.k.k.size *
+ bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)),
+ BCH_DISK_RESERVATION_NOFAIL);
+ BUG_ON(ret);
- if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
- "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
- le64_to_cpu(clean->journal_seq),
- le64_to_cpu(j->seq))) {
- kfree(clean);
- *cleanp = NULL;
- return 0;
+ bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k));
+ ret = bch2_trans_commit(&trans, &disk_res, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_JOURNAL_REPLAY);
+ } while ((!ret || ret == -EINTR) &&
+ bkey_cmp(k->k.p, iter->pos));
+
+ bch2_disk_reservation_put(c, &disk_res);
+
+ /*
+ * This isn't strictly correct - we should only be relying on the btree
+ * node lock for synchronization with gc when we've got a write lock
+ * held.
+ *
+ * but - there are other correctness issues if btree gc were to run
+ * before journal replay finishes
+ */
+ BUG_ON(c->gc_pos.phase);
+
+ bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
+ NULL, 0, 0);
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
+
+static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id btree_id,
+ struct bkey_i *k)
+{
+ switch (btree_id) {
+ case BTREE_ID_ALLOC:
+ return bch2_alloc_replay_key(c, k);
+ case BTREE_ID_EXTENTS:
+ return bch2_extent_replay_key(c, k);
+ default:
+ return bch2_btree_insert(c, btree_id, k,
+ NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_JOURNAL_REPLAY|
+ BTREE_INSERT_NOMARK);
}
+}
- mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
- "superblock read clock doesn't match journal after clean shutdown");
- mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
- "superblock read clock doesn't match journal after clean shutdown");
+static void replay_now_at(struct journal *j, u64 seq)
+{
+ BUG_ON(seq < j->replay_journal_seq);
+ BUG_ON(seq > j->replay_journal_seq_end);
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct bkey_i *k1, *k2;
- unsigned l1 = 0, l2 = 0;
+ while (j->replay_journal_seq < seq)
+ bch2_journal_pin_put(j, j->replay_journal_seq++);
+}
- k1 = btree_root_find(c, clean, NULL, i, &l1);
- k2 = btree_root_find(c, NULL, j, i, &l2);
+static int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
+{
+ struct journal *j = &c->journal;
+ struct bkey_i *k, *_n;
+ struct jset_entry *entry;
+ struct journal_replay *i, *n;
+ int ret = 0;
- if (!k1 && !k2)
- continue;
+ list_for_each_entry_safe(i, n, list, list) {
+ replay_now_at(j, le64_to_cpu(i->j.seq));
- mustfix_fsck_err_on(!k1 || !k2 ||
- IS_ERR(k1) ||
- IS_ERR(k2) ||
- k1->k.u64s != k2->k.u64s ||
- memcmp(k1, k2, bkey_bytes(k1)) ||
- l1 != l2, c,
- "superblock btree root doesn't match journal after clean shutdown");
+ for_each_jset_key(k, _n, entry, &i->j) {
+ ret = bch2_journal_replay_key(c, entry->btree_id, k);
+ if (ret) {
+ bch_err(c, "journal replay: error %d while replaying key",
+ ret);
+ goto err;
+ }
+
+ cond_resched();
+ }
}
-fsck_err:
+
+ replay_now_at(j, j->replay_journal_seq_end);
+ j->replay_journal_seq = 0;
+
+ bch2_journal_set_replay_done(j);
+ bch2_journal_flush_all_pins(j);
+ ret = bch2_journal_error(j);
+err:
+ bch2_journal_entries_free(list);
return ret;
}
+static bool journal_empty(struct list_head *journal)
+{
+ return list_empty(journal) ||
+ journal_entry_empty(&list_last_entry(journal,
+ struct journal_replay, list)->j);
+}
+
static int
verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
struct list_head *journal)
@@ -130,40 +198,7 @@ fsck_err:
return ret;
}
-static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
-{
- struct bch_sb_field_clean *clean, *sb_clean;
- int ret;
-
- mutex_lock(&c->sb_lock);
- sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
-
- if (fsck_err_on(!sb_clean, c,
- "superblock marked clean but clean section not present")) {
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->sb.clean = false;
- mutex_unlock(&c->sb_lock);
- return NULL;
- }
-
- clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
- GFP_KERNEL);
- if (!clean) {
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(-ENOMEM);
- }
-
- if (le16_to_cpu(c->disk_sb.sb->version) <
- bcachefs_metadata_version_bkey_renumber)
- bch2_sb_clean_renumber(clean, READ);
-
- mutex_unlock(&c->sb_lock);
-
- return clean;
-fsck_err:
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(ret);
-}
+/* journal replay early: */
static int journal_replay_entry_early(struct bch_fs *c,
struct jset_entry *entry)
@@ -275,6 +310,121 @@ static int journal_replay_early(struct bch_fs *c,
return 0;
}
+/* sb clean section: */
+
+static struct bkey_i *btree_root_find(struct bch_fs *c,
+ struct bch_sb_field_clean *clean,
+ struct jset *j,
+ enum btree_id id, unsigned *level)
+{
+ struct bkey_i *k;
+ struct jset_entry *entry, *start, *end;
+
+ if (clean) {
+ start = clean->start;
+ end = vstruct_end(&clean->field);
+ } else {
+ start = j->start;
+ end = vstruct_last(j);
+ }
+
+ for (entry = start; entry < end; entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_btree_root &&
+ entry->btree_id == id)
+ goto found;
+
+ return NULL;
+found:
+ if (!entry->u64s)
+ return ERR_PTR(-EINVAL);
+
+ k = entry->start;
+ *level = entry->level;
+ return k;
+}
+
+static int verify_superblock_clean(struct bch_fs *c,
+ struct bch_sb_field_clean **cleanp,
+ struct jset *j)
+{
+ unsigned i;
+ struct bch_sb_field_clean *clean = *cleanp;
+ int ret = 0;
+
+ if (!clean || !j)
+ return 0;
+
+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+ le64_to_cpu(clean->journal_seq),
+ le64_to_cpu(j->seq))) {
+ kfree(clean);
+ *cleanp = NULL;
+ return 0;
+ }
+
+ mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
+ "superblock read clock doesn't match journal after clean shutdown");
+ mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
+ "superblock read clock doesn't match journal after clean shutdown");
+
+ for (i = 0; i < BTREE_ID_NR; i++) {
+ struct bkey_i *k1, *k2;
+ unsigned l1 = 0, l2 = 0;
+
+ k1 = btree_root_find(c, clean, NULL, i, &l1);
+ k2 = btree_root_find(c, NULL, j, i, &l2);
+
+ if (!k1 && !k2)
+ continue;
+
+ mustfix_fsck_err_on(!k1 || !k2 ||
+ IS_ERR(k1) ||
+ IS_ERR(k2) ||
+ k1->k.u64s != k2->k.u64s ||
+ memcmp(k1, k2, bkey_bytes(k1)) ||
+ l1 != l2, c,
+ "superblock btree root doesn't match journal after clean shutdown");
+ }
+fsck_err:
+ return ret;
+}
+
+static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
+{
+ struct bch_sb_field_clean *clean, *sb_clean;
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
+
+ if (fsck_err_on(!sb_clean, c,
+ "superblock marked clean but clean section not present")) {
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+ c->sb.clean = false;
+ mutex_unlock(&c->sb_lock);
+ return NULL;
+ }
+
+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+ GFP_KERNEL);
+ if (!clean) {
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (le16_to_cpu(c->disk_sb.sb->version) <
+ bcachefs_metadata_version_bkey_renumber)
+ bch2_sb_clean_renumber(clean, READ);
+
+ mutex_unlock(&c->sb_lock);
+
+ return clean;
+fsck_err:
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(ret);
+}
+
static int read_btree_roots(struct bch_fs *c)
{
unsigned i;
@@ -320,13 +470,6 @@ fsck_err:
return ret;
}
-static bool journal_empty(struct list_head *journal)
-{
- return list_empty(journal) ||
- journal_entry_empty(&list_last_entry(journal,
- struct journal_replay, list)->j);
-}
-
int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";