author    Kent Overstreet <kent.overstreet@gmail.com>    2020-05-25 14:57:06 -0400
committer Kent Overstreet <kent.overstreet@gmail.com>    2020-06-09 21:32:46 -0400
commit    49c3abc9a6a5ad399f6b1e6b6dcda2efe31d58e5 (patch)
tree      3a03bd44892712f943b4e0ce9a6560453ca7a532
parent    fa9e6109dc1daf8a808f463f3176d1056de69154 (diff)
bcachefs: Interior btree updates are now fully transactional
We now update the alloc info (bucket sector counts) atomically with journalling the update to the interior btree nodes, and we also set new btree roots atomically with the journalled part of the btree update.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
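For context: new btree roots are now made persistent by emitting them as journal entries, via the journal_entry_set() helper this patch factors out (see the journal.h hunk below). The following is a minimal userspace sketch of that helper, with stub types and an illustrative entry type standing in for the real bcachefs definitions; it is not the in-tree code:

	/* Sketch of journal_entry_set(): fill a jset_entry header and payload,
	 * returning how many u64s of journal reservation were consumed. */
	#include <stdint.h>
	#include <string.h>
	#include <stdio.h>

	struct jset_entry {
		uint16_t u64s;      /* payload size, in u64s (little-endian in-tree) */
		uint8_t  btree_id;
		uint8_t  level;
		uint8_t  type;      /* e.g. BCH_JSET_ENTRY_btree_root */
		uint8_t  pad[3];
		uint64_t _data[];
	};

	/* jset_u64s(): entry header (1 u64) plus payload */
	static unsigned jset_u64s(unsigned u64s) { return u64s + 1; }

	static unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
					  unsigned id, unsigned level,
					  const void *data, unsigned u64s)
	{
		memset(entry, 0, sizeof(*entry));
		entry->u64s     = u64s;   /* the kernel uses cpu_to_le16() here */
		entry->type     = type;
		entry->btree_id = id;
		entry->level    = level;
		memcpy(entry->_data, data, u64s * sizeof(uint64_t));

		return jset_u64s(u64s);
	}

	int main(void)
	{
		uint64_t buf[32];
		uint64_t root_key[3] = { 1, 2, 3 };  /* stand-in for a btree root bkey */
		unsigned used = journal_entry_set((struct jset_entry *) buf,
						  1 /* hypothetical btree_root type */,
						  0 /* btree_id */, 1 /* level */,
						  root_key, 3);
		printf("journal res u64s consumed: %u\n", used);
		return 0;
	}

In the patch, btree_update_updated_root() and bch2_insert_fixup_btree_ptr() both accumulate entries this way into as->journal_entries, which the transaction commit then copies into the journal reservation, making the root update and the interior node update part of one journalled operation.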
-rw-r--r--  fs/bcachefs/alloc_background.c     |   5
-rw-r--r--  fs/bcachefs/bcachefs.h             |   6
-rw-r--r--  fs/bcachefs/btree_gc.c             |  12
-rw-r--r--  fs/bcachefs/btree_types.h          |   1
-rw-r--r--  fs/bcachefs/btree_update_interior.c | 816
-rw-r--r--  fs/bcachefs/btree_update_interior.h |  64
-rw-r--r--  fs/bcachefs/btree_update_leaf.c    |   7
-rw-r--r--  fs/bcachefs/buckets.c              |   2
-rw-r--r--  fs/bcachefs/buckets.h              |   2
-rw-r--r--  fs/bcachefs/journal.c              |   5
-rw-r--r--  fs/bcachefs/journal.h              |  27
-rw-r--r--  fs/bcachefs/journal_io.c           |  20
-rw-r--r--  fs/bcachefs/journal_reclaim.c      |   2
-rw-r--r--  fs/bcachefs/journal_reclaim.h      |   2
-rw-r--r--  fs/bcachefs/keylist.c              |   4
-rw-r--r--  fs/bcachefs/keylist.h              |   4
-rw-r--r--  fs/bcachefs/migrate.c              |  11
-rw-r--r--  fs/bcachefs/move.c                 |  10
-rw-r--r--  fs/bcachefs/recovery.c             |   7
-rw-r--r--  fs/bcachefs/super-io.c             |  22
-rw-r--r--  fs/bcachefs/super.c                |   5
21 files changed, 410 insertions, 624 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 91768b7162f8..f511d63bc0f7 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -1461,11 +1461,6 @@ again:
}
rcu_read_unlock();
- if (c->btree_roots_dirty) {
- bch2_journal_meta(&c->journal);
- goto again;
- }
-
return !nodes_unwritten &&
!bch2_btree_interior_updates_nr_pending(c);
}
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index ee3c46328754..327ef01c3f22 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -601,13 +601,10 @@ struct bch_fs {
struct bio_set btree_bio;
struct btree_root btree_roots[BTREE_ID_NR];
- bool btree_roots_dirty;
struct mutex btree_root_lock;
struct btree_cache btree_cache;
- mempool_t btree_reserve_pool;
-
/*
* Cache of allocated btree nodes - if we allocate a btree node and
* don't use it, if we free it that space can't be reused until going
@@ -625,6 +622,9 @@ struct bch_fs {
struct mutex btree_interior_update_lock;
struct closure_waitlist btree_interior_update_wait;
+ struct workqueue_struct *btree_interior_update_worker;
+ struct work_struct btree_interior_update_work;
+
mempool_t btree_iters_pool;
struct workqueue_struct *wq;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 2ee33887b364..65b01e865015 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -464,6 +464,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
}
+#if 0
/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
@@ -481,6 +482,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
mutex_unlock(&c->btree_interior_update_lock);
}
+#endif
static void bch2_mark_allocator_buckets(struct bch_fs *c)
{
@@ -799,6 +801,10 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
trace_gc_start(c);
down_write(&c->gc_lock);
+
+ /* flush interior btree updates: */
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c));
again:
ret = bch2_gc_start(c, metadata_only);
if (ret)
@@ -810,7 +816,9 @@ again:
if (ret)
goto out;
+#if 0
bch2_mark_pending_btree_node_frees(c);
+#endif
bch2_mark_allocator_buckets(c);
c->gc_count++;
@@ -1035,6 +1043,8 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
btree_node_reset_sib_u64s(n);
bch2_btree_build_aux_trees(n);
+
+ bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->lock);
bch2_btree_node_write(c, n, SIX_LOCK_intent);
@@ -1083,7 +1093,7 @@ next:
bch2_btree_iter_node_replace(iter, new_nodes[0]);
for (i = 0; i < nr_new_nodes; i++)
- bch2_open_buckets_put(c, &new_nodes[i]->ob);
+ bch2_btree_update_get_open_buckets(as, new_nodes[i]);
/* Free the old nodes and update our sliding window */
for (i = 0; i < nr_old_nodes; i++) {
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index f957dd2cbbef..8357b5251a43 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -307,6 +307,7 @@ struct btree_trans {
/* update path: */
struct jset_entry *extra_journal_entries;
unsigned extra_journal_entry_u64s;
+ struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
struct journal_preres journal_preres;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 0fa011231493..455a7093af45 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -21,10 +21,6 @@
#include <linux/random.h>
#include <trace/events/bcachefs.h>
-static void btree_node_will_make_reachable(struct btree_update *,
- struct btree *);
-static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-
/* Debug code: */
/*
@@ -124,74 +120,6 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
/* Btree node freeing/allocation: */
-static bool btree_key_matches(struct bch_fs *c,
- struct bkey_s_c l,
- struct bkey_s_c r)
-{
- struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l);
- struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r);
- const struct bch_extent_ptr *ptr1, *ptr2;
-
- bkey_for_each_ptr(ptrs1, ptr1)
- bkey_for_each_ptr(ptrs2, ptr2)
- if (ptr1->dev == ptr2->dev &&
- ptr1->gen == ptr2->gen &&
- ptr1->offset == ptr2->offset)
- return true;
-
- return false;
-}
-
-/*
- * We're doing the index update that makes @b unreachable, update stuff to
- * reflect that:
- *
- * Must be called _before_ btree_update_updated_root() or
- * btree_update_updated_node:
- */
-static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
- struct bkey_s_c k,
- struct bch_fs_usage *stats)
-{
- struct bch_fs *c = as->c;
- struct pending_btree_node_free *d;
-
- for (d = as->pending; d < as->pending + as->nr_pending; d++)
- if (!bkey_cmp(k.k->p, d->key.k.p) &&
- btree_key_matches(c, k, bkey_i_to_s_c(&d->key)))
- goto found;
- BUG();
-found:
- BUG_ON(d->index_update_done);
- d->index_update_done = true;
-
- /*
- * We're dropping @k from the btree, but it's still live until the
- * index update is persistent so we need to keep a reference around for
- * mark and sweep to find - that's primarily what the
- * btree_node_pending_free list is for.
- *
- * So here (when we set index_update_done = true), we're moving an
- * existing reference to a different part of the larger "gc keyspace" -
- * and the new position comes after the old position, since GC marks
- * the pending free list after it walks the btree.
- *
- * If we move the reference while mark and sweep is _between_ the old
- * and the new position, mark and sweep will see the reference twice
- * and it'll get double accounted - so check for that here and subtract
- * to cancel out one of mark and sweep's markings if necessary:
- */
-
- if (gc_pos_cmp(c->gc_pos, b
- ? gc_pos_btree_node(b)
- : gc_pos_btree_root(as->btree_id)) >= 0 &&
- gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
- bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key),
- 0, 0, NULL, 0,
- BTREE_TRIGGER_OVERWRITE|
- BTREE_TRIGGER_GC);
-}
-
static void __btree_node_free(struct bch_fs *c, struct btree *b)
{
trace_btree_node_free(c, b);
@@ -216,8 +144,6 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
{
struct open_buckets ob = b->ob;
- btree_update_drop_new_node(c, b);
-
b->ob.nr = 0;
clear_btree_node_dirty(b);
@@ -237,39 +163,12 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
trans_for_each_iter(iter->trans, linked)
BUG_ON(linked->l[b->level].b == b);
- /*
- * Is this a node that isn't reachable on disk yet?
- *
- * Nodes that aren't reachable yet have writes blocked until they're
- * reachable - now that we've cancelled any pending writes and moved
- * things waiting on that write to wait on this update, we can drop this
- * node from the list of nodes that the other update is making
- * reachable, prior to freeing it:
- */
- btree_update_drop_new_node(c, b);
-
six_lock_write(&b->lock);
__btree_node_free(c, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
}
-static void bch2_btree_node_free_ondisk(struct bch_fs *c,
- struct pending_btree_node_free *pending,
- u64 journal_seq)
-{
- BUG_ON(!pending->index_update_done);
-
- bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
- 0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
-
- if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
- bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
- 0, 0, NULL, journal_seq,
- BTREE_TRIGGER_OVERWRITE|
- BTREE_TRIGGER_GC);
-}
-
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
struct disk_reservation *res,
struct closure *cl,
@@ -357,9 +256,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
int ret;
BUG_ON(level >= BTREE_MAX_DEPTH);
- BUG_ON(!as->reserve->nr);
+ BUG_ON(!as->nr_prealloc_nodes);
- b = as->reserve->b[--as->reserve->nr];
+ b = as->prealloc_nodes[--as->nr_prealloc_nodes];
set_btree_node_accessed(b);
set_btree_node_dirty(b);
@@ -394,8 +293,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
bch2_btree_build_aux_trees(b);
- btree_node_will_make_reachable(as, b);
-
ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
BUG_ON(ret);
@@ -466,19 +363,20 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
btree_node_set_format(b, b->data->format);
bch2_btree_build_aux_trees(b);
+ bch2_btree_update_add_new_node(as, b);
six_unlock_write(&b->lock);
return b;
}
-static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve)
+static void bch2_btree_reserve_put(struct btree_update *as)
{
- bch2_disk_reservation_put(c, &reserve->disk_res);
+ struct bch_fs *c = as->c;
mutex_lock(&c->btree_reserve_cache_lock);
- while (reserve->nr) {
- struct btree *b = reserve->b[--reserve->nr];
+ while (as->nr_prealloc_nodes) {
+ struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
six_unlock_write(&b->lock);
@@ -502,36 +400,14 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser
}
mutex_unlock(&c->btree_reserve_cache_lock);
-
- mempool_free(reserve, &c->btree_reserve_pool);
}
-static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
- unsigned nr_nodes,
- unsigned flags,
- struct closure *cl)
+static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
+ unsigned flags, struct closure *cl)
{
- struct btree_reserve *reserve;
+ struct bch_fs *c = as->c;
struct btree *b;
- struct disk_reservation disk_res = { 0, 0 };
- unsigned sectors = nr_nodes * c->opts.btree_node_size;
- int ret, disk_res_flags = 0;
-
- if (flags & BTREE_INSERT_NOFAIL)
- disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
-
- /*
- * This check isn't necessary for correctness - it's just to potentially
- * prevent us from doing a lot of work that'll end up being wasted:
- */
- ret = bch2_journal_error(&c->journal);
- if (ret)
- return ERR_PTR(ret);
-
- if (bch2_disk_reservation_get(c, &disk_res, sectors,
- c->opts.metadata_replicas,
- disk_res_flags))
- return ERR_PTR(-ENOSPC);
+ int ret;
BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
@@ -540,18 +416,11 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
* open bucket reserve:
*/
ret = bch2_btree_cache_cannibalize_lock(c, cl);
- if (ret) {
- bch2_disk_reservation_put(c, &disk_res);
- return ERR_PTR(ret);
- }
-
- reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
-
- reserve->disk_res = disk_res;
- reserve->nr = 0;
+ if (ret)
+ return ret;
- while (reserve->nr < nr_nodes) {
- b = __bch2_btree_node_alloc(c, &disk_res,
+ while (as->nr_prealloc_nodes < nr_nodes) {
+ b = __bch2_btree_node_alloc(c, &as->disk_res,
flags & BTREE_INSERT_NOWAIT
? NULL : cl, flags);
if (IS_ERR(b)) {
@@ -563,21 +432,20 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
if (ret)
goto err_free;
- reserve->b[reserve->nr++] = b;
+ as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
}
bch2_btree_cache_cannibalize_unlock(c);
- return reserve;
+ return 0;
err_free:
- bch2_btree_reserve_put(c, reserve);
bch2_btree_cache_cannibalize_unlock(c);
trace_btree_reserve_get_fail(c, nr_nodes, cl);
- return ERR_PTR(ret);
+ return ret;
}
/* Asynchronous interior node update machinery */
-static void __bch2_btree_update_free(struct btree_update *as)
+static void bch2_btree_update_free(struct btree_update *as)
{
struct bch_fs *c = as->c;
@@ -585,14 +453,13 @@ static void __bch2_btree_update_free(struct btree_update *as)
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
+ bch2_disk_reservation_put(c, &as->disk_res);
+ bch2_btree_reserve_put(as);
- BUG_ON(as->nr_new_nodes || as->nr_pending);
-
- if (as->reserve)
- bch2_btree_reserve_put(c, as->reserve);
-
+ mutex_lock(&c->btree_interior_update_lock);
list_del(&as->unwritten_list);
list_del(&as->list);
+ mutex_unlock(&c->btree_interior_update_lock);
closure_debug_destroy(&as->cl);
mempool_free(as, &c->btree_interior_update_pool);
@@ -600,37 +467,59 @@ static void __bch2_btree_update_free(struct btree_update *as)
closure_wake_up(&c->btree_interior_update_wait);
}
-static void bch2_btree_update_free(struct btree_update *as)
+static void btree_update_will_delete_key(struct btree_update *as,
+ struct bkey_i *k)
{
- struct bch_fs *c = as->c;
+ BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s >
+ ARRAY_SIZE(as->_old_keys));
+ bch2_keylist_add(&as->old_keys, k);
+}
- mutex_lock(&c->btree_interior_update_lock);
- __bch2_btree_update_free(as);
- mutex_unlock(&c->btree_interior_update_lock);
+static void btree_update_will_add_key(struct btree_update *as,
+ struct bkey_i *k)
+{
+ BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s >
+ ARRAY_SIZE(as->_new_keys));
+ bch2_keylist_add(&as->new_keys, k);
}
-static inline bool six_trylock_intentwrite(struct six_lock *lock)
+/*
+ * The transactional part of an interior btree node update, where we journal the
+ * update we did to the interior node and update alloc info:
+ */
+static int btree_update_nodes_written_trans(struct btree_trans *trans,
+ struct btree_update *as)
{
- if (!six_trylock_intent(lock))
- return false;
+ struct bkey_i *k;
+ int ret;
+
+ trans->extra_journal_entries = (void *) &as->journal_entries[0];
+ trans->extra_journal_entry_u64s = as->journal_u64s;
+ trans->journal_pin = &as->journal;
- if (!six_trylock_write(lock)) {
- six_unlock_intent(lock);
- return false;
+ for_each_keylist_key(&as->new_keys, k) {
+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
+ 0, 0, BTREE_TRIGGER_INSERT);
+ if (ret)
+ return ret;
}
- return true;
+ for_each_keylist_key(&as->old_keys, k) {
+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
+ 0, 0, BTREE_TRIGGER_OVERWRITE);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
-static void btree_update_nodes_written(struct closure *cl)
+static void btree_update_nodes_written(struct btree_update *as)
{
- struct btree_update *as = container_of(cl, struct btree_update, cl);
- struct btree *nodes_need_write[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES + 1];
- unsigned nr_nodes_need_write;
- struct journal_res res = { 0 };
struct bch_fs *c = as->c;
- struct btree_root *r;
- struct btree *b;
+ struct btree *b = as->b;
+ u64 journal_seq = 0;
+ unsigned i;
int ret;
/*
@@ -638,78 +527,17 @@ static void btree_update_nodes_written(struct closure *cl)
* to child nodes that weren't written yet: now, the child nodes have
* been written so we can write out the update to the interior node.
*/
- mutex_lock(&c->btree_interior_update_lock);
- as->nodes_written = true;
-again:
- nr_nodes_need_write = 0;
- as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
- struct btree_update, unwritten_list);
- if (!as || !as->nodes_written) {
- mutex_unlock(&c->btree_interior_update_lock);
- return;
- }
-
- b = as->b;
- if (b && !six_trylock_intentwrite(&b->lock)) {
- mutex_unlock(&c->btree_interior_update_lock);
-
- btree_node_lock_type(c, b, SIX_LOCK_intent);
- six_lock_write(&b->lock);
-
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
-
- mutex_lock(&c->btree_interior_update_lock);
- goto again;
- }
-
- ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s,
- JOURNAL_RES_GET_NONBLOCK|
- JOURNAL_RES_GET_RESERVED);
- if (ret == -EAGAIN) {
- unsigned u64s = as->journal_u64s;
-
- if (b) {
- six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
-
- ret = bch2_journal_res_get(&c->journal, &res, u64s,
- JOURNAL_RES_GET_CHECK|
- JOURNAL_RES_GET_RESERVED);
- if (!ret) {
- mutex_lock(&c->btree_interior_update_lock);
- goto again;
- }
- }
-
- if (!ret) {
- struct journal_buf *buf = &c->journal.buf[res.idx];
- struct jset_entry *entry = vstruct_idx(buf->data, res.offset);
-
- res.offset += as->journal_u64s;
- res.u64s -= as->journal_u64s;
- memcpy_u64s(entry, as->journal_entries, as->journal_u64s);
- } else {
- /*
- * On journal error we have to run most of the normal path so
- * that shutdown works - unblocking btree node writes in
- * particular and writing them if needed - except for
- * journalling the update:
- */
-
- BUG_ON(!bch2_journal_error(&c->journal));
- }
-
- switch (as->mode) {
- case BTREE_INTERIOR_NO_UPDATE:
- BUG();
- case BTREE_INTERIOR_UPDATING_NODE:
- /* @b is the node we did the final insert into: */
-
+ ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_JOURNAL_RESERVED,
+ btree_update_nodes_written_trans(&trans, as));
+ BUG_ON(ret && !bch2_journal_error(&c->journal));
+
+ if (b) {
/*
+ * @b is the node we did the final insert into:
+ *
* On failure to get a journal reservation, we still have to
* unblock the write and allow most of the write path to happen
* so that shutdown works, but the i->journal_seq mechanism
@@ -719,83 +547,90 @@ again:
* we're in journal error state:
*/
+ btree_node_lock_type(c, b, SIX_LOCK_intent);
+ btree_node_lock_type(c, b, SIX_LOCK_write);
+ mutex_lock(&c->btree_interior_update_lock);
+
list_del(&as->write_blocked_list);
- if (!ret) {
+ if (!ret && as->b == b) {
struct bset *i = btree_bset_last(b);
+ BUG_ON(!b->level);
+ BUG_ON(!btree_node_dirty(b));
+
i->journal_seq = cpu_to_le64(
- max(res.seq,
+ max(journal_seq,
le64_to_cpu(i->journal_seq)));
- bch2_btree_add_journal_pin(c, b, res.seq);
+ bch2_btree_add_journal_pin(c, b, journal_seq);
}
- nodes_need_write[nr_nodes_need_write++] = b;
-
+ mutex_unlock(&c->btree_interior_update_lock);
six_unlock_write(&b->lock);
- six_unlock_intent(&b->lock);
- break;
-
- case BTREE_INTERIOR_UPDATING_AS:
- BUG_ON(b);
- break;
-
- case BTREE_INTERIOR_UPDATING_ROOT:
- r = &c->btree_roots[as->btree_id];
- BUG_ON(b);
-
- mutex_lock(&c->btree_root_lock);
- bkey_copy(&r->key, as->parent_keys.keys);
- r->level = as->level;
- r->alive = true;
- c->btree_roots_dirty = true;
- mutex_unlock(&c->btree_root_lock);
- break;
+ btree_node_write_if_need(c, b, SIX_LOCK_intent);
+ six_unlock_intent(&b->lock);
}
bch2_journal_pin_drop(&c->journal, &as->journal);
- bch2_journal_res_put(&c->journal, &res);
bch2_journal_preres_put(&c->journal, &as->journal_preres);
- while (as->nr_new_nodes) {
- b = as->new_nodes[--as->nr_new_nodes];
+ mutex_lock(&c->btree_interior_update_lock);
+ for (i = 0; i < as->nr_new_nodes; i++) {
+ b = as->new_nodes[i];
BUG_ON(b->will_make_reachable != (unsigned long) as);
b->will_make_reachable = 0;
+ }
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ for (i = 0; i < as->nr_new_nodes; i++) {
+ b = as->new_nodes[i];
- nodes_need_write[nr_nodes_need_write++] = b;
+ btree_node_lock_type(c, b, SIX_LOCK_read);
+ btree_node_write_if_need(c, b, SIX_LOCK_read);
+ six_unlock_read(&b->lock);
}
- while (as->nr_pending)
- bch2_btree_node_free_ondisk(c,
- &as->pending[--as->nr_pending], res.seq);
+ for (i = 0; i < as->nr_open_buckets; i++)
+ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
- __bch2_btree_update_free(as);
- /*
- * for flush_held_btree_writes() waiting on updates to flush or
- * nodes to be writeable:
- */
- closure_wake_up(&c->btree_interior_update_wait);
+ bch2_btree_update_free(as);
+}
- /*
- * Can't take btree node locks while holding btree_interior_update_lock:
- * */
- mutex_unlock(&c->btree_interior_update_lock);
+static void btree_interior_update_work(struct work_struct *work)
+{
+ struct bch_fs *c =
+ container_of(work, struct bch_fs, btree_interior_update_work);
+ struct btree_update *as;
- /* Do btree writes after dropping journal res/locks: */
- while (nr_nodes_need_write) {
- b = nodes_need_write[--nr_nodes_need_write];
+ while (1) {
+ mutex_lock(&c->btree_interior_update_lock);
+ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
+ struct btree_update, unwritten_list);
+ if (as && !as->nodes_written)
+ as = NULL;
+ mutex_unlock(&c->btree_interior_update_lock);
- btree_node_lock_type(c, b, SIX_LOCK_read);
- bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
- six_unlock_read(&b->lock);
+ if (!as)
+ break;
+
+ btree_update_nodes_written(as);
}
+}
+
+static void btree_update_set_nodes_written(struct closure *cl)
+{
+ struct btree_update *as = container_of(cl, struct btree_update, cl);
+ struct bch_fs *c = as->c;
mutex_lock(&c->btree_interior_update_lock);
- goto again;
+ as->nodes_written = true;
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
}
/*
@@ -814,7 +649,6 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
as->mode = BTREE_INTERIOR_UPDATING_NODE;
as->b = b;
- as->level = b->level;
list_add(&as->write_blocked_list, &b->write_blocked);
mutex_unlock(&c->btree_interior_update_lock);
@@ -845,25 +679,45 @@ static void btree_update_reparent(struct btree_update *as,
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
+ struct bkey_i *insert = &b->key;
struct bch_fs *c = as->c;
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
- BUG_ON(!bch2_keylist_empty(&as->parent_keys));
+
+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
+ ARRAY_SIZE(as->journal_entries));
+
+ as->journal_u64s +=
+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+ BCH_JSET_ENTRY_btree_root,
+ b->btree_id, b->level,
+ insert, insert->k.u64s);
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
as->mode = BTREE_INTERIOR_UPDATING_ROOT;
- as->level = b->level;
- bch2_keylist_add(&as->parent_keys, &b->key);
mutex_unlock(&c->btree_interior_update_lock);
}
-static void btree_node_will_make_reachable(struct btree_update *as,
- struct btree *b)
+/*
+ * bch2_btree_update_add_new_node:
+ *
+ * This causes @as to wait on @b to be written, before it gets to
+ * bch2_btree_update_nodes_written
+ *
+ * Additionally, it sets b->will_make_reachable to prevent any additional writes
+ * to @b from happening besides the first until @b is reachable on disk
+ *
+ * And it adds @b to the list of @as's new nodes, so that we can update sector
+ * counts in bch2_btree_update_nodes_written:
+ */
+void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
+ closure_get(&as->cl);
+
mutex_lock(&c->btree_interior_update_lock);
BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
BUG_ON(b->will_make_reachable);
@@ -871,10 +725,14 @@ static void btree_node_will_make_reachable(struct btree_update *as,
as->new_nodes[as->nr_new_nodes++] = b;
b->will_make_reachable = 1UL|(unsigned long) as;
- closure_get(&as->cl);
mutex_unlock(&c->btree_interior_update_lock);
+
+ btree_update_will_add_key(as, &b->key);
}
+/*
+ * returns true if @b was a new node
+ */
static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
{
struct btree_update *as;
@@ -882,6 +740,11 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
unsigned i;
mutex_lock(&c->btree_interior_update_lock);
+ /*
+ * When b->will_make_reachable != 0, it owns a ref on as->cl that's
+ * dropped when it gets written by bch2_btree_complete_write - the
+ * xchg() is for synchronization with bch2_btree_complete_write:
+ */
v = xchg(&b->will_make_reachable, 0);
as = (struct btree_update *) (v & ~1UL);
@@ -903,25 +766,11 @@ found:
closure_put(&as->cl);
}
-static void btree_interior_update_add_node_reference(struct btree_update *as,
- struct btree *b)
+void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
{
- struct bch_fs *c = as->c;
- struct pending_btree_node_free *d;
-
- mutex_lock(&c->btree_interior_update_lock);
-
- /* Add this node to the list of nodes being freed: */
- BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
-
- d = &as->pending[as->nr_pending++];
- d->index_update_done = false;
- d->seq = b->data->keys.seq;
- d->btree_id = b->btree_id;
- d->level = b->level;
- bkey_copy(&d->key, &b->key);
-
- mutex_unlock(&c->btree_interior_update_lock);
+ while (b->ob.nr)
+ as->open_buckets[as->nr_open_buckets++] =
+ b->ob.v[--b->ob.nr];
}
/*
@@ -941,8 +790,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
if (btree_node_fake(b))
return;
- btree_interior_update_add_node_reference(as, b);
-
mutex_lock(&c->btree_interior_update_lock);
/*
@@ -984,16 +831,28 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
+
+ /*
+ * Is this a node that isn't reachable on disk yet?
+ *
+ * Nodes that aren't reachable yet have writes blocked until they're
+ * reachable - now that we've cancelled any pending writes and moved
+ * things waiting on that write to wait on this update, we can drop this
+ * node from the list of nodes that the other update is making
+ * reachable, prior to freeing it:
+ */
+ btree_update_drop_new_node(c, b);
+
+ btree_update_will_delete_key(as, &b->key);
}
void bch2_btree_update_done(struct btree_update *as)
{
BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
- bch2_btree_reserve_put(as->c, as->reserve);
- as->reserve = NULL;
+ bch2_btree_reserve_put(as);
- continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq);
+ continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq);
}
struct btree_update *
@@ -1002,12 +861,32 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct journal_preres journal_preres = { 0 };
- struct btree_reserve *reserve;
struct btree_update *as;
- int ret;
+ int ret, disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
+
+ /*
+ * This check isn't necessary for correctness - it's just to potentially
+ * prevent us from doing a lot of work that'll end up being wasted:
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ return ERR_PTR(ret);
+
+ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
+ memset(as, 0, sizeof(*as));
+ closure_init(&as->cl, NULL);
+ as->c = c;
+ as->mode = BTREE_INTERIOR_NO_UPDATE;
+ as->btree_id = id;
+ INIT_LIST_HEAD(&as->list);
+ INIT_LIST_HEAD(&as->unwritten_list);
+ INIT_LIST_HEAD(&as->write_blocked_list);
+ bch2_keylist_init(&as->old_keys, as->_old_keys);
+ bch2_keylist_init(&as->new_keys, as->_new_keys);
+ bch2_keylist_init(&as->parent_keys, as->inline_keys);
- ret = bch2_journal_preres_get(&c->journal, &journal_preres,
+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
JOURNAL_RES_GET_NONBLOCK);
if (ret == -EAGAIN) {
@@ -1016,46 +895,41 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
bch2_trans_unlock(trans);
- ret = bch2_journal_preres_get(&c->journal, &journal_preres,
+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES, 0);
if (ret)
return ERR_PTR(ret);
if (!bch2_trans_relock(trans)) {
- bch2_journal_preres_put(&c->journal, &journal_preres);
- return ERR_PTR(-EINTR);
+ ret = -EINTR;
+ goto err;
}
}
- reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
- if (IS_ERR(reserve)) {
- bch2_journal_preres_put(&c->journal, &journal_preres);
- return ERR_CAST(reserve);
- }
-
- as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
- memset(as, 0, sizeof(*as));
- closure_init(&as->cl, NULL);
- as->c = c;
- as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->btree_id = id;
- as->reserve = reserve;
- INIT_LIST_HEAD(&as->write_blocked_list);
- INIT_LIST_HEAD(&as->unwritten_list);
- as->journal_preres = journal_preres;
+ ret = bch2_disk_reservation_get(c, &as->disk_res,
+ nr_nodes * c->opts.btree_node_size,
+ c->opts.metadata_replicas,
+ disk_res_flags);
+ if (ret)
+ goto err;
- bch2_keylist_init(&as->parent_keys, as->inline_keys);
+ ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl);
+ if (ret)
+ goto err;
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
return as;
+err:
+ bch2_btree_update_free(as);
+ return ERR_PTR(ret);
}
/* Btree root updates: */
-static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
+static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
{
/* Root nodes cannot be reaped */
mutex_lock(&c->btree_cache.lock);
@@ -1073,38 +947,6 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
bch2_recalc_btree_reserve(c);
}
-static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
-{
- struct bch_fs *c = as->c;
- struct btree *old = btree_node_root(c, b);
- struct bch_fs_usage *fs_usage;
-
- __bch2_btree_set_root_inmem(c, b);
-
- mutex_lock(&c->btree_interior_update_lock);
- percpu_down_read(&c->mark_lock);
- fs_usage = bch2_fs_usage_scratch_get(c);
-
- bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
- 0, 0, fs_usage, 0,
- BTREE_TRIGGER_INSERT);
- if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
- bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
- 0, 0, NULL, 0,
- BTREE_TRIGGER_INSERT|
- BTREE_TRIGGER_GC);
-
- if (old && !btree_node_fake(old))
- bch2_btree_node_free_index(as, NULL,
- bkey_i_to_s_c(&old->key),
- fs_usage);
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
-
- bch2_fs_usage_scratch_put(c, fs_usage);
- percpu_up_read(&c->mark_lock);
- mutex_unlock(&c->btree_interior_update_lock);
-}
-
/**
* bch_btree_set_root - update the root in memory and on disk
*
@@ -1135,7 +977,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
*/
bch2_btree_node_lock_write(old, iter);
- bch2_btree_set_root_inmem(as, b);
+ bch2_btree_set_root_inmem(c, b);
btree_update_updated_root(as, b);
@@ -1156,57 +998,21 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
struct bkey_i *insert,
struct btree_node_iter *node_iter)
{
- struct bch_fs *c = as->c;
- struct bch_fs_usage *fs_usage;
- struct jset_entry *entry;
struct bkey_packed *k;
- struct bkey tmp;
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
- entry = (void *) &as->journal_entries[as->journal_u64s];
- memset(entry, 0, sizeof(*entry));
- entry->u64s = cpu_to_le16(insert->k.u64s);
- entry->type = BCH_JSET_ENTRY_btree_keys;
- entry->btree_id = b->btree_id;
- entry->level = b->level;
- memcpy_u64s_small(entry->_data, insert, insert->k.u64s);
- as->journal_u64s += jset_u64s(insert->k.u64s);
-
- mutex_lock(&c->btree_interior_update_lock);
- percpu_down_read(&c->mark_lock);
- fs_usage = bch2_fs_usage_scratch_get(c);
-
- bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
- 0, 0, fs_usage, 0,
- BTREE_TRIGGER_INSERT);
-
- if (gc_visited(c, gc_pos_btree_node(b)))
- bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
- 0, 0, NULL, 0,
- BTREE_TRIGGER_INSERT|
- BTREE_TRIGGER_GC);
+ as->journal_u64s +=
+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
+ BCH_JSET_ENTRY_btree_keys,
+ b->btree_id, b->level,
+ insert, insert->k.u64s);
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
bch2_btree_node_iter_advance(node_iter, b);
- /*
- * If we're overwriting, look up pending delete and mark so that gc
- * marks it on the pending delete list:
- */
- if (k && !bkey_cmp_packed(b, k, &insert->k))
- bch2_btree_node_free_index(as, b,
- bkey_disassemble(b, k, &tmp),
- fs_usage);
-
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
-
- bch2_fs_usage_scratch_put(c, fs_usage);
- percpu_up_read(&c->mark_lock);
- mutex_unlock(&c->btree_interior_update_lock);
-
bch2_btree_bset_insert_key(iter, b, node_iter, insert);
set_btree_node_dirty(b);
set_btree_node_need_write(b);
@@ -1226,6 +1032,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
struct bkey_packed *k, *prev = NULL;
n2 = bch2_btree_node_alloc(as, n1->level);
+ bch2_btree_update_add_new_node(as, n2);
n2->data->max_key = n1->data->max_key;
n2->data->format = n1->format;
@@ -1321,14 +1128,6 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
struct bkey_packed *src, *dst, *n;
struct bset *i;
- /*
- * XXX
- *
- * these updates must be journalled
- *
- * oops
- */
-
BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
@@ -1380,6 +1179,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_interior_update_will_free_node(as, b);
n1 = bch2_btree_node_alloc_replacement(as, b);
+ bch2_btree_update_add_new_node(as, n1);
if (keys)
btree_split_insert_keys(as, n1, iter, keys);
@@ -1439,11 +1239,11 @@ static void btree_split(struct btree_update *as, struct btree *b,
bch2_btree_set_root(as, n1, iter);
}
- bch2_open_buckets_put(c, &n1->ob);
+ bch2_btree_update_get_open_buckets(as, n1);
if (n2)
- bch2_open_buckets_put(c, &n2->ob);
+ bch2_btree_update_get_open_buckets(as, n2);
if (n3)
- bch2_open_buckets_put(c, &n3->ob);
+ bch2_btree_update_get_open_buckets(as, n3);
/* Successful split, update the iterator to point to the new nodes: */
@@ -1538,7 +1338,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
bch2_btree_node_lock_for_insert(c, b, iter);
- if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
+ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
bch2_btree_node_unlock_write(b, iter);
goto split;
}
@@ -1749,6 +1549,7 @@ retry:
bch2_btree_interior_update_will_free_node(as, m);
n = bch2_btree_node_alloc(as, b->level);
+ bch2_btree_update_add_new_node(as, n);
btree_set_min(n, prev->data->min_key);
btree_set_max(n, next->data->max_key);
@@ -1771,7 +1572,7 @@ retry:
bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
- bch2_open_buckets_put(c, &n->ob);
+ bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->lock, SIX_LOCK_intent);
bch2_btree_iter_node_drop(iter, b);
@@ -1859,6 +1660,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_interior_update_will_free_node(as, b);
n = bch2_btree_node_alloc_replacement(as, b);
+ bch2_btree_update_add_new_node(as, n);
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->lock);
@@ -1874,7 +1676,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_set_root(as, n, iter);
}
- bch2_open_buckets_put(c, &n->ob);
+ bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->lock, SIX_LOCK_intent);
bch2_btree_iter_node_drop(iter, b);
@@ -1949,49 +1751,8 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
struct btree *parent;
int ret;
- /*
- * Two corner cases that need to be thought about here:
- *
- * @b may not be reachable yet - there might be another interior update
- * operation waiting on @b to be written, and we're gonna deliver the
- * write completion to that interior update operation _before_
- * persisting the new_key update
- *
- * That ends up working without us having to do anything special here:
- * the reason is, we do kick off (and do the in memory updates) for the
- * update for @new_key before we return, creating a new interior_update
- * operation here.
- *
- * The new interior update operation here will in effect override the
- * previous one. The previous one was going to terminate - make @b
- * reachable - in one of two ways:
- * - updating the btree root pointer
- * In that case,
- * no, this doesn't work. argh.
- */
-
- if (b->will_make_reachable)
- as->must_rewrite = true;
-
- btree_interior_update_add_node_reference(as, b);
-
- /*
- * XXX: the rest of the update path treats this like we're actually
- * inserting a new node and deleting the existing node, so the
- * reservation needs to include enough space for @b
- *
- * that is actually sketch as fuck though and I am surprised the code
- * seems to work like that, definitely need to go back and rework it
- * into something saner.
- *
- * (I think @b is just getting double counted until the btree update
- * finishes and "deletes" @b on disk)
- */
- ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
- c->opts.btree_node_size *
- bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)),
- BCH_DISK_RESERVATION_NOFAIL);
- BUG_ON(ret);
+ btree_update_will_delete_key(as, &b->key);
+ btree_update_will_add_key(as, new_key);
parent = btree_node_parent(iter, b);
if (parent) {
@@ -2019,44 +1780,18 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
bkey_copy(&b->key, new_key);
}
} else {
- struct bch_fs_usage *fs_usage;
-
BUG_ON(btree_node_root(c, b) != b);
bch2_btree_node_lock_write(b, iter);
+ bkey_copy(&b->key, new_key);
- mutex_lock(&c->btree_interior_update_lock);
- percpu_down_read(&c->mark_lock);
- fs_usage = bch2_fs_usage_scratch_get(c);
-
- bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
- 0, 0, fs_usage, 0,
- BTREE_TRIGGER_INSERT);
- if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
- bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
- 0, 0, NULL, 0,
- BTREE_TRIGGER_INSERT||
- BTREE_TRIGGER_GC);
-
- bch2_btree_node_free_index(as, NULL,
- bkey_i_to_s_c(&b->key),
- fs_usage);
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
-
- bch2_fs_usage_scratch_put(c, fs_usage);
- percpu_up_read(&c->mark_lock);
- mutex_unlock(&c->btree_interior_update_lock);
-
- if (btree_ptr_hash_val(new_key) != b->hash_val) {
+ if (btree_ptr_hash_val(&b->key) != b->hash_val) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
- bkey_copy(&b->key, new_key);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
- } else {
- bkey_copy(&b->key, new_key);
}
btree_update_updated_root(as, b);
@@ -2171,7 +1906,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
{
BUG_ON(btree_node_root(c, b));
- __bch2_btree_set_root_inmem(c, b);
+ bch2_btree_set_root_inmem(c, b);
}
void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
@@ -2210,7 +1945,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id);
BUG_ON(ret);
- __bch2_btree_set_root_inmem(c, b);
+ bch2_btree_set_root_inmem(c, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
@@ -2247,10 +1982,59 @@ size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
return ret;
}
+void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
+{
+ struct btree_root *r;
+ struct jset_entry *entry;
+
+ mutex_lock(&c->btree_root_lock);
+
+ vstruct_for_each(jset, entry)
+ if (entry->type == BCH_JSET_ENTRY_btree_root) {
+ r = &c->btree_roots[entry->btree_id];
+ r->level = entry->level;
+ r->alive = true;
+ bkey_copy(&r->key, &entry->start[0]);
+ }
+
+ mutex_unlock(&c->btree_root_lock);
+}
+
+struct jset_entry *
+bch2_btree_roots_to_journal_entries(struct bch_fs *c,
+ struct jset_entry *start,
+ struct jset_entry *end)
+{
+ struct jset_entry *entry;
+ unsigned long have = 0;
+ unsigned i;
+
+ for (entry = start; entry < end; entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_btree_root)
+ __set_bit(entry->btree_id, &have);
+
+ mutex_lock(&c->btree_root_lock);
+
+ for (i = 0; i < BTREE_ID_NR; i++)
+ if (c->btree_roots[i].alive && !test_bit(i, &have)) {
+ journal_entry_set(end,
+ BCH_JSET_ENTRY_btree_root,
+ i, c->btree_roots[i].level,
+ &c->btree_roots[i].key,
+ c->btree_roots[i].key.u64s);
+ end = vstruct_next(end);
+ }
+
+ mutex_unlock(&c->btree_root_lock);
+
+ return end;
+}
+
void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
+ if (c->btree_interior_update_worker)
+ destroy_workqueue(c->btree_interior_update_worker);
mempool_exit(&c->btree_interior_update_pool);
- mempool_exit(&c->btree_reserve_pool);
}
int bch2_fs_btree_interior_update_init(struct bch_fs *c)
@@ -2259,9 +2043,13 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c)
INIT_LIST_HEAD(&c->btree_interior_update_list);
INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
mutex_init(&c->btree_interior_update_lock);
+ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
+
+ c->btree_interior_update_worker =
+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+ if (!c->btree_interior_update_worker)
+ return -ENOMEM;
- return mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
- sizeof(struct btree_reserve)) ?:
- mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
- sizeof(struct btree_update));
+ return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
+ sizeof(struct btree_update));
}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 739a5ac536b8..a6be62d3a18f 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -6,34 +6,13 @@
#include "btree_locking.h"
#include "btree_update.h"
-struct btree_reserve {
- struct disk_reservation disk_res;
- unsigned nr;
- struct btree *b[BTREE_RESERVE_MAX];
-};
-
void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
struct bkey_format *);
-/* Btree node freeing/allocation: */
-
-/*
- * Tracks a btree node that has been (or is about to be) freed in memory, but
- * has _not_ yet been freed on disk (because the write that makes the new
- * node(s) visible and frees the old hasn't completed yet)
- */
-struct pending_btree_node_free {
- bool index_update_done;
-
- __le64 seq;
- enum btree_id btree_id;
- unsigned level;
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
-};
+#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
-#define BTREE_UPDATE_JOURNAL_RES \
- ((BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2)
+#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
@@ -72,9 +51,8 @@ struct btree_update {
unsigned nodes_written:1;
enum btree_id btree_id;
- u8 level;
- struct btree_reserve *reserve;
+ struct disk_reservation disk_res;
struct journal_preres journal_preres;
/*
@@ -96,17 +74,28 @@ struct btree_update {
*/
struct journal_entry_pin journal;
- /*
- * Nodes being freed:
- * Protected by c->btree_node_pending_free_lock
- */
- struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
- unsigned nr_pending;
+ /* Preallocated nodes we reserve when we start the update: */
+ struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
+ unsigned nr_prealloc_nodes;
+
+ /* Nodes being freed: */
+ struct keylist old_keys;
+ u64 _old_keys[BTREE_UPDATE_NODES_MAX *
+ BKEY_BTREE_PTR_VAL_U64s_MAX];
+
+ /* Nodes being added: */
+ struct keylist new_keys;
+ u64 _new_keys[BTREE_UPDATE_NODES_MAX *
+ BKEY_BTREE_PTR_VAL_U64s_MAX];
/* New nodes, that will be made reachable by this update: */
- struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
+ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
unsigned nr_new_nodes;
+ u8 open_buckets[BTREE_UPDATE_NODES_MAX *
+ BCH_REPLICAS_MAX];
+ u8 nr_open_buckets;
+
unsigned journal_u64s;
u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
@@ -120,14 +109,12 @@ struct btree_update {
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
-#define for_each_pending_btree_node_free(c, as, p) \
- list_for_each_entry(as, &c->btree_interior_update_list, list) \
- for (p = as->pending; p < as->pending + as->nr_pending; p++)
-
void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
struct btree_iter *);
void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
+void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *);
+
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
struct btree *,
struct bkey_format);
@@ -139,6 +126,7 @@ bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned,
void bch2_btree_interior_update_will_free_node(struct btree_update *,
struct btree *);
+void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
void bch2_btree_insert_node(struct btree_update *, struct btree *,
struct btree_iter *, struct keylist *,
@@ -333,6 +321,10 @@ ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
+void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *);
+struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
+ struct jset_entry *, struct jset_entry *);
+
void bch2_fs_btree_interior_update_exit(struct bch_fs *);
int bch2_fs_btree_interior_update_init(struct bch_fs *);
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 6e402027c63f..e343d80fede3 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -414,8 +414,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
}
if (unlikely(trans->extra_journal_entry_u64s)) {
- memcpy_u64s_small(bch2_journal_reservation_entry(&c->journal,
- &trans->journal_res),
+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
trans->extra_journal_entries,
trans->extra_journal_entry_u64s);
@@ -521,6 +520,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
i->iter);
+ if (!ret && trans->journal_pin)
+ bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq,
+ trans->journal_pin, NULL);
+
/*
* Drop journal reservation after dropping write locks, since dropping
* the journal reservation may kick off a journal write:
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 2fe33d744d33..1d8381656d81 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -1180,7 +1180,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
-int bch2_mark_key_locked(struct bch_fs *c,
+static int bch2_mark_key_locked(struct bch_fs *c,
struct bkey_s_c k,
unsigned offset, s64 sectors,
struct bch_fs_usage *fs_usage,
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 765650ce9d0a..97265fe90e96 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -259,8 +259,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64,
- struct bch_fs_usage *, u64, unsigned);
int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64,
struct bch_fs_usage *, u64, unsigned);
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 0a4538b3dc60..32999161bdd8 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -959,15 +959,12 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
void bch2_fs_journal_stop(struct journal *j)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
bch2_journal_flush_all_pins(j);
wait_event(j->wait, journal_entry_close(j));
/* do we need to write another journal entry? */
- if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) ||
- c->btree_roots_dirty)
+ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags))
bch2_journal_meta(j);
journal_quiesce(j);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index e4b7fe8ffa82..997a28ae862e 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -200,31 +200,38 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
}
static inline struct jset_entry *
-bch2_journal_reservation_entry(struct journal *j, struct journal_res *res)
+journal_res_entry(struct journal *j, struct journal_res *res)
{
return vstruct_idx(j->buf[res->idx].data, res->offset);
}
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+ enum btree_id id, unsigned level,
+ const void *data, unsigned u64s)
+{
+ memset(entry, 0, sizeof(*entry));
+ entry->u64s = cpu_to_le16(u64s);
+ entry->type = type;
+ entry->btree_id = id;
+ entry->level = level;
+ memcpy_u64s_small(entry->_data, data, u64s);
+
+ return jset_u64s(u64s);
+}
+
static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
unsigned type, enum btree_id id,
unsigned level,
const void *data, unsigned u64s)
{
- struct jset_entry *entry = bch2_journal_reservation_entry(j, res);
- unsigned actual = jset_u64s(u64s);
+ unsigned actual = journal_entry_set(journal_res_entry(j, res),
+ type, id, level, data, u64s);
EBUG_ON(!res->ref);
EBUG_ON(actual > res->u64s);
res->offset += actual;
res->u64s -= actual;
-
- memset(entry, 0, sizeof(*entry));
- entry->u64s = cpu_to_le16(u64s);
- entry->type = type;
- entry->btree_id = id;
- entry->level = level;
- memcpy_u64s_small(entry->_data, data, u64s);
}
static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 39bb2154cce1..b923efc42099 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_io.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "error.h"
@@ -993,8 +994,23 @@ void bch2_journal_write(struct closure *cl)
j->write_start_time = local_clock();
- start = vstruct_last(jset);
- end = bch2_journal_super_entries_add_common(c, start,
+ /*
+ * New btree roots are set by journalling them; when the journal entry
+ * gets written we have to propagate them to c->btree_roots
+ *
+ * But, every journal entry we write has to contain all the btree roots
+ * (at least for now); so after we copy btree roots to c->btree_roots we
+ * have to get any missing btree roots and add them to this journal
+ * entry:
+ */
+
+ bch2_journal_entries_to_btree_roots(c, jset);
+
+ start = end = vstruct_last(jset);
+
+ end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
+
+ end = bch2_journal_super_entries_add_common(c, end,
le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
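The journal_io.c change above makes every journal write carry a complete set of btree roots: roots this entry journalled itself are first propagated into c->btree_roots, then any live roots missing from the entry are appended. A minimal userspace sketch of the "append what's missing" bitmap logic from bch2_btree_roots_to_journal_entries() follows; the constants and data are illustrative only:

	/* Sketch: note which roots are already in the entry, append the rest. */
	#include <stdio.h>

	#define BTREE_ID_NR 9   /* illustrative count, not the real constant */

	struct root { int alive; int level; };

	int main(void)
	{
		struct root roots[BTREE_ID_NR] = {
			[0] = { 1, 2 }, [3] = { 1, 1 }, [5] = { 1, 3 },
		};
		unsigned long have = 0;
		unsigned ids_in_entry[] = { 0, 5 }; /* roots this update journalled */
		unsigned i;

		for (i = 0; i < 2; i++)
			have |= 1UL << ids_in_entry[i];

		for (i = 0; i < BTREE_ID_NR; i++)
			if (roots[i].alive && !(have & (1UL << i)))
				printf("appending missing root for btree %u (level %u)\n",
				       i, roots[i].level);
		return 0;
	}

This round-trip is what lets the patch delete btree_roots_dirty: a root becomes persistent the moment its journal entry is written, and bch2_journal_entries_to_btree_roots() keeps the in-memory copy in sync at write time.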
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index d34434f62454..d5eed53f1298 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -330,7 +330,7 @@ static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
__journal_pin_drop(j, pin);
- BUG_ON(!atomic_read(&pin_list->count));
+ BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j));
atomic_inc(&pin_list->count);
pin->seq = seq;
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index 883a0a5680af..3ef641f7ce30 100644
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@@ -38,7 +38,7 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
- if (unlikely(!journal_pin_active(pin)))
+ if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
__bch2_journal_pin_add(j, seq, pin, flush_fn);
}
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
index 5da54ced9cad..864dfaa67b7a 100644
--- a/fs/bcachefs/keylist.c
+++ b/fs/bcachefs/keylist.c
@@ -6,7 +6,7 @@
int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
size_t nr_inline_u64s, size_t new_u64s)
{
- size_t oldsize = bch_keylist_u64s(l);
+ size_t oldsize = bch2_keylist_u64s(l);
size_t newsize = oldsize + new_u64s;
u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
u64 *new_keys;
@@ -52,7 +52,7 @@ void bch2_keylist_pop_front(struct keylist *l)
memmove_u64s_down(l->keys,
bkey_next(l->keys),
- bch_keylist_u64s(l));
+ bch2_keylist_u64s(l));
}
#ifdef CONFIG_BCACHEFS_DEBUG
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
index a7ff86b08abc..195799bb20bc 100644
--- a/fs/bcachefs/keylist.h
+++ b/fs/bcachefs/keylist.h
@@ -36,14 +36,14 @@ static inline bool bch2_keylist_empty(struct keylist *l)
return l->top == l->keys;
}
-static inline size_t bch_keylist_u64s(struct keylist *l)
+static inline size_t bch2_keylist_u64s(struct keylist *l)
{
return l->top_p - l->keys_p;
}
static inline size_t bch2_keylist_bytes(struct keylist *l)
{
- return bch_keylist_u64s(l) * sizeof(u64);
+ return bch2_keylist_u64s(l) * sizeof(u64);
}
static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index e26fa1608f39..96c8690adc5b 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -151,15 +151,8 @@ retry:
}
/* flush relevant btree updates */
- while (1) {
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c) ||
- c->btree_roots_dirty);
- if (c->btree_roots_dirty)
- bch2_journal_meta(&c->journal);
- if (!bch2_btree_interior_updates_nr_pending(c))
- break;
- }
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c));
ret = 0;
err:
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 67e495bc8aba..11a92c099afd 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -775,14 +775,8 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
- while (1) {
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c) ||
- c->btree_roots_dirty);
- if (!bch2_btree_interior_updates_nr_pending(c))
- break;
- bch2_journal_meta(&c->journal);
- }
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c));
ret = bch2_replicas_gc2(c) ?: ret;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index e1cfb374b842..674febf0ded5 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -763,6 +763,7 @@ static int verify_superblock_clean(struct bch_fs *c,
"superblock read clock doesn't match journal after clean shutdown");
for (i = 0; i < BTREE_ID_NR; i++) {
+ char buf1[200], buf2[200];
struct bkey_i *k1, *k2;
unsigned l1 = 0, l2 = 0;
@@ -778,7 +779,11 @@ static int verify_superblock_clean(struct bch_fs *c,
k1->k.u64s != k2->k.u64s ||
memcmp(k1, k2, bkey_bytes(k1)) ||
l1 != l2, c,
- "superblock btree root doesn't match journal after clean shutdown");
+ "superblock btree root %u doesn't match journal after clean shutdown\n"
+ "sb: l=%u %s\n"
+ "journal: l=%u %s\n", i,
+ l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1),
+ l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2));
}
fsck_err:
return ret;
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 6596764c8421..f2be64c869df 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
@@ -955,7 +956,6 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
@@ -989,27 +989,8 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry *entry,
u64 journal_seq)
{
- struct btree_root *r;
unsigned i;
- mutex_lock(&c->btree_root_lock);
-
- for (r = c->btree_roots;
- r < c->btree_roots + BTREE_ID_NR;
- r++)
- if (r->alive) {
- entry_init_u64s(entry, r->key.u64s + 1);
- entry->btree_id = r - c->btree_roots;
- entry->level = r->level;
- entry->type = BCH_JSET_ENTRY_btree_root;
- bkey_copy(&entry->start[0], &r->key);
-
- entry = vstruct_next(entry);
- }
- c->btree_roots_dirty = false;
-
- mutex_unlock(&c->btree_root_lock);
-
percpu_down_write(&c->mark_lock);
if (!journal_seq) {
@@ -1110,6 +1091,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
entry = sb_clean->start;
entry = bch2_journal_super_entries_add_common(c, entry, 0);
+ entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
memset(entry, 0,
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 138fd7108642..db5edb5cdf9e 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -259,6 +259,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
+ flush_work(&c->btree_interior_update_work);
clean_passes = wrote ? 0 : clean_passes + 1;
} while (clean_passes < 2);
@@ -266,6 +267,10 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch_verbose(c, "writing alloc info complete");
set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
nowrote_alloc:
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c));
+ flush_work(&c->btree_interior_update_work);
+
for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca);