summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2022-03-21 00:15:53 -0400
committerKent Overstreet <kent.overstreet@gmail.com>2022-04-08 19:26:03 -0400
commit6ddf061e68560a2bb263b126af7e894a6c1afb5f (patch)
treec7a2f3315a2f336a18dabbdc7e0ce350f512ed68
parentce1e90826806edb0a07950a3afa487811b1d3857 (diff)
bcachefs: Use a genradix for reading journal entries
Previously, the journal read path used a linked list for storing the journal entries we read from disk. But there's been a bug that's been causing journal_flush_delay to incorrectly be set to 0, leading to far more journal entries than is normal being written out, which then means filesystems are no longer able to start due to the O(n^2) behaviour of inserting into/searching that linked list. Fix this by switching to a radix tree. Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-rw-r--r--fs/bcachefs/bcachefs.h3
-rw-r--r--fs/bcachefs/journal.c34
-rw-r--r--fs/bcachefs/journal.h2
-rw-r--r--fs/bcachefs/journal_io.c136
-rw-r--r--fs/bcachefs/journal_io.h3
-rw-r--r--fs/bcachefs/recovery.c68
-rw-r--r--fs/bcachefs/recovery.h2
-rw-r--r--fs/bcachefs/super.c3
8 files changed, 150 insertions, 101 deletions
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index ab6df637ee26..e29a0891134b 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -894,7 +894,8 @@ struct bch_fs {
mempool_t btree_bounce_pool;
struct journal journal;
- struct list_head journal_entries;
+ GENRADIX(struct journal_replay *) journal_entries;
+ u64 journal_entries_base_seq;
struct journal_keys journal_keys;
struct list_head journal_iters;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index d01b1cd4000d..75be0a5f708c 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1040,17 +1040,25 @@ void bch2_fs_journal_stop(struct journal *j)
cancel_delayed_work_sync(&j->write_work);
}
-int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
- struct list_head *journal_entries)
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
- struct journal_replay *i;
+ struct journal_replay *i, **_i;
+ struct genradix_iter iter;
+ bool had_entries = false;
+ unsigned ptr;
u64 last_seq = cur_seq, nr, seq;
- if (!list_empty(journal_entries))
- last_seq = le64_to_cpu(list_last_entry(journal_entries,
- struct journal_replay, list)->j.last_seq);
+ genradix_for_each_reverse(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
+
+ last_seq = le64_to_cpu(i->j.last_seq);
+ break;
+ }
nr = cur_seq - last_seq;
@@ -1072,14 +1080,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
- if (list_empty(journal_entries))
- j->last_empty_seq = cur_seq - 1;
-
fifo_for_each_entry_ptr(p, &j->pin, seq)
journal_pin_list_init(p, 1);
- list_for_each_entry(i, journal_entries, list) {
- unsigned ptr;
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
+ continue;
seq = le64_to_cpu(i->j.seq);
BUG_ON(seq >= cur_seq);
@@ -1095,9 +1103,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
p->devs.nr = 0;
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+
+ had_entries = true;
}
- if (list_empty(journal_entries))
+ if (!had_entries)
j->last_empty_seq = cur_seq;
spin_lock(&j->lock);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index e7321c327d9d..7fee0c05aa7d 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -512,7 +512,7 @@ int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64, struct list_head *);
+int bch2_fs_journal_start(struct journal *, u64);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 5e08932c1f7c..eb20bf05cba8 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -17,12 +17,22 @@
#include <trace/events/bcachefs.h>
-static void __journal_replay_free(struct journal_replay *i)
+static inline u32 journal_entry_radix_idx(struct bch_fs *c,
+ struct jset *j)
{
- list_del(&i->list);
+ return (le64_to_cpu(j->seq) - c->journal_entries_base_seq) & (~0U >> 1);
+}
+
+static void __journal_replay_free(struct bch_fs *c,
+ struct journal_replay *i)
+{
+ struct journal_replay **p =
+ genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, &i->j));
+
+ BUG_ON(*p != i);
+ *p = NULL;
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
-
}
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
@@ -30,13 +40,12 @@ static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
i->ignore = true;
if (!c->opts.read_entire_journal)
- __journal_replay_free(i);
+ __journal_replay_free(c, i);
}
struct journal_list {
struct closure cl;
struct mutex lock;
- struct list_head *head;
int ret;
};
@@ -52,19 +61,30 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct journal_list *jlist, struct jset *j,
bool bad)
{
- struct journal_replay *i, *pos, *dup = NULL;
+ struct genradix_iter iter;
+ struct journal_replay **_i, *i, *dup;
struct journal_ptr *ptr;
- struct list_head *where;
size_t bytes = vstruct_bytes(j);
u64 last_seq = 0;
int ret = JOURNAL_ENTRY_ADD_OK;
+ /*
+ * Xarrays are indexed by a ulong, not a u64, so we can't index them by
+ * sequence number directly:
+ * Assume instead that they will all fall within the range of +-2billion
+ * of the filrst one we find.
+ */
+ if (!c->journal_entries_base_seq)
+ c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
+
+#if 0
list_for_each_entry_reverse(i, jlist->head, list) {
if (!JSET_NO_FLUSH(&i->j)) {
last_seq = le64_to_cpu(i->j.last_seq);
break;
}
}
+#endif
/* Is this entry older than the range we need? */
if (!c->opts.read_entire_journal &&
@@ -75,28 +95,20 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
/* Drop entries we don't need anymore */
if (!JSET_NO_FLUSH(j)) {
- list_for_each_entry_safe(i, pos, jlist->head, list) {
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i)
+ continue;
+
if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
break;
journal_replay_free(c, i);
}
}
- list_for_each_entry_reverse(i, jlist->head, list) {
- if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
- where = &i->list;
- goto add;
- }
- }
-
- where = jlist->head;
-add:
- dup = where->next != jlist->head
- ? container_of(where->next, struct journal_replay, list)
- : NULL;
-
- if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
- dup = NULL;
+ _i = genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, j));
+ dup = _i ? *_i : NULL;
/*
* Duplicate journal entries? If so we want the one that didn't have a
@@ -132,10 +144,19 @@ add:
if (dup) {
i->nr_ptrs = dup->nr_ptrs;
memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
- __journal_replay_free(dup);
+ __journal_replay_free(c, dup);
}
- list_add(&i->list, where);
+ _i = genradix_ptr_alloc(&c->journal_entries,
+ journal_entry_radix_idx(c, &i->j),
+ GFP_KERNEL);
+ if (!_i) {
+ bch_err(c, "failed to allocate c->journal_entries entry");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ *_i = i;
found:
for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
if (ptr->dev == ca->dev_idx) {
@@ -914,7 +935,8 @@ static void bch2_journal_read_device(struct closure *cl)
struct bch_fs *c = ca->fs;
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
- struct journal_replay *r;
+ struct journal_replay *r, **_r;
+ struct genradix_iter iter;
struct journal_read_buf buf = { NULL, 0 };
u64 min_seq = U64_MAX;
unsigned i;
@@ -957,7 +979,12 @@ static void bch2_journal_read_device(struct closure *cl)
ja->sectors_free = ca->mi.bucket_size;
mutex_lock(&jlist->lock);
- list_for_each_entry(r, jlist->head, list) {
+ genradix_for_each(&c->journal_entries, iter, _r) {
+ r = *_r;
+
+ if (!r)
+ continue;
+
for (i = 0; i < r->nr_ptrs; i++) {
if (r->ptrs[i].dev == ca->dev_idx &&
sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
@@ -1023,11 +1050,11 @@ void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
}
}
-int bch2_journal_read(struct bch_fs *c, struct list_head *list,
- u64 *blacklist_seq, u64 *start_seq)
+int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
{
struct journal_list jlist;
- struct journal_replay *i, *t;
+ struct journal_replay *i, **_i, *prev = NULL;
+ struct genradix_iter radix_iter;
struct bch_dev *ca;
unsigned iter;
struct printbuf buf = PRINTBUF;
@@ -1038,7 +1065,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
closure_init_stack(&jlist.cl);
mutex_init(&jlist.lock);
- jlist.head = list;
jlist.ret = 0;
for_each_member_device(ca, c, iter) {
@@ -1062,22 +1088,21 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
if (jlist.ret)
return jlist.ret;
- if (list_empty(list)) {
- bch_info(c, "journal read done, but no entries found");
- return 0;
- }
-
- i = list_last_entry(list, struct journal_replay, list);
- *start_seq = le64_to_cpu(i->j.seq) + 1;
+ *start_seq = 0;
/*
* Find most recent flush entry, and ignore newer non flush entries -
* those entries will be blacklisted:
*/
- list_for_each_entry_safe_reverse(i, t, list, list) {
- if (i->ignore)
+ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
+ if (!*start_seq)
+ *start_seq = le64_to_cpu(i->j.seq) + 1;
+
if (!JSET_NO_FLUSH(&i->j)) {
last_seq = le64_to_cpu(i->j.last_seq);
*blacklist_seq = le64_to_cpu(i->j.seq) + 1;
@@ -1087,6 +1112,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
journal_replay_free(c, i);
}
+ if (!*start_seq) {
+ bch_info(c, "journal read done, but no entries found");
+ return 0;
+ }
+
if (!last_seq) {
fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
ret = -1;
@@ -1094,8 +1124,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
}
/* Drop blacklisted entries and entries older than last_seq: */
- list_for_each_entry_safe(i, t, list, list) {
- if (i->ignore)
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
seq = le64_to_cpu(i->j.seq);
@@ -1114,8 +1146,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
/* Check for missing entries: */
seq = last_seq;
- list_for_each_entry(i, list, list) {
- if (i->ignore)
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
@@ -1137,11 +1171,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
!bch2_journal_seq_is_blacklisted(c, seq, false))
seq++;
- if (i->list.prev != list) {
- struct journal_replay *p = list_prev_entry(i, list);
-
- bch2_journal_ptrs_to_text(&buf1, c, p);
- pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits));
+ if (prev) {
+ bch2_journal_ptrs_to_text(&buf1, c, prev);
+ pr_buf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
} else
pr_buf(&buf1, "(none)");
bch2_journal_ptrs_to_text(&buf2, c, i);
@@ -1158,10 +1190,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
printbuf_exit(&buf2);
}
+ prev = i;
seq++;
}
- list_for_each_entry(i, list, list) {
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct bch_replicas_padded replicas = {
@@ -1170,7 +1203,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
};
unsigned ptr;
- if (i->ignore)
+ i = *_i;
+ if (!i || i->ignore)
continue;
ret = jset_validate_entries(c, &i->j, READ);
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index f2001835e43e..30e995c81fc4 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -7,7 +7,6 @@
* during cache_registration
*/
struct journal_replay {
- struct list_head list;
struct journal_ptr {
u8 dev;
u32 bucket;
@@ -53,7 +52,7 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct journal_replay *);
-int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
+int bch2_journal_read(struct bch_fs *, u64 *, u64 *);
void bch2_journal_write(struct closure *);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 12502f1d3ee5..9aa507c19198 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -433,16 +433,16 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *i
/* sort and dedup all keys in the journal: */
-void bch2_journal_entries_free(struct list_head *list)
+void bch2_journal_entries_free(struct bch_fs *c)
{
-
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
+ struct journal_replay **i;
+ struct genradix_iter iter;
+
+ genradix_for_each(&c->journal_entries, iter, i)
+ if (*i)
+ kvpfree(*i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&(*i)->j));
+ genradix_free(&c->journal_entries);
}
/*
@@ -476,15 +476,18 @@ void bch2_journal_keys_free(struct journal_keys *keys)
static int journal_keys_sort(struct bch_fs *c)
{
- struct journal_replay *i;
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct journal_keys *keys = &c->journal_keys;
struct journal_key *src, *dst;
size_t nr_keys = 0;
- list_for_each_entry(i, &c->journal_entries, list) {
- if (i->ignore)
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
if (!keys->journal_seq_base)
@@ -503,8 +506,10 @@ static int journal_keys_sort(struct bch_fs *c)
if (!keys->d)
return -ENOMEM;
- list_for_each_entry(i, &c->journal_entries, list) {
- if (i->ignore)
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
BUG_ON(le64_to_cpu(i->j.seq) - keys->journal_seq_base > U32_MAX);
@@ -751,10 +756,8 @@ static int journal_replay_entry_early(struct bch_fs *c,
}
static int journal_replay_early(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct list_head *journal)
+ struct bch_sb_field_clean *clean)
{
- struct journal_replay *i;
struct jset_entry *entry;
int ret;
@@ -767,8 +770,13 @@ static int journal_replay_early(struct bch_fs *c,
return ret;
}
} else {
- list_for_each_entry(i, journal, list) {
- if (i->ignore)
+ struct genradix_iter iter;
+ struct journal_replay *i, **_i;
+
+ genradix_for_each(&c->journal_entries, iter, _i) {
+ i = *_i;
+
+ if (!i || i->ignore)
continue;
vstruct_for_each(&i->j, entry) {
@@ -1093,17 +1101,17 @@ int bch2_fs_recovery(struct bch_fs *c)
}
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
- struct journal_replay *i;
+ struct genradix_iter iter;
+ struct journal_replay **i;
bch_verbose(c, "starting journal read");
- ret = bch2_journal_read(c, &c->journal_entries,
- &blacklist_seq, &journal_seq);
+ ret = bch2_journal_read(c, &blacklist_seq, &journal_seq);
if (ret)
goto err;
- list_for_each_entry_reverse(i, &c->journal_entries, list)
- if (!i->ignore) {
- last_journal_entry = &i->j;
+ genradix_for_each_reverse(&c->journal_entries, iter, i)
+ if (*i && !(*i)->ignore) {
+ last_journal_entry = &(*i)->j;
break;
}
@@ -1152,7 +1160,7 @@ use_clean:
zero_out_btree_mem_ptr(&c->journal_keys);
- ret = journal_replay_early(c, clean, &c->journal_entries);
+ ret = journal_replay_early(c, clean);
if (ret)
goto err;
@@ -1175,8 +1183,7 @@ use_clean:
}
}
- ret = bch2_fs_journal_start(&c->journal, journal_seq,
- &c->journal_entries);
+ ret = bch2_fs_journal_start(&c->journal, journal_seq);
if (ret)
goto err;
@@ -1380,7 +1387,7 @@ out:
if (!c->opts.keep_journal) {
bch2_journal_keys_free(&c->journal_keys);
- bch2_journal_entries_free(&c->journal_entries);
+ bch2_journal_entries_free(c);
}
kfree(clean);
if (ret)
@@ -1401,7 +1408,6 @@ int bch2_fs_initialize(struct bch_fs *c)
struct qstr lostfound = QSTR("lost+found");
const char *err = "cannot allocate memory";
struct bch_dev *ca;
- LIST_HEAD(journal);
unsigned i;
int ret;
@@ -1441,7 +1447,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
- bch2_fs_journal_start(&c->journal, 1, &journal);
+ bch2_fs_journal_start(&c->journal, 1);
bch2_journal_set_replay_done(&c->journal);
err = "error going read-write";
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 30580a8984a1..ab8b116ac7db 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -55,7 +55,7 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct btree *);
void bch2_journal_keys_free(struct journal_keys *);
-void bch2_journal_entries_free(struct list_head *);
+void bch2_journal_entries_free(struct bch_fs *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index aee3206cd0a0..e03c03ffe493 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -450,7 +450,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
bch2_journal_keys_free(&c->journal_keys);
- bch2_journal_entries_free(&c->journal_entries);
+ bch2_journal_entries_free(c);
percpu_free_rwsem(&c->mark_lock);
if (c->btree_paths_bufs)
@@ -668,7 +668,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_WORK(&c->journal_seq_blacklist_gc_work,
bch2_blacklist_entries_gc);
- INIT_LIST_HEAD(&c->journal_entries);
INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_errors);