author	Kent Overstreet <kent.overstreet@gmail.com>	2021-07-10 23:03:15 -0400
committer	Kent Overstreet <kent.overstreet@linux.dev>	2023-10-22 17:09:08 -0400
commit	19d5432445fd0898231063518c375f7551d7c3d2 (patch)
tree	8c45d7d67a972bd136aa756d0866ae7a3210ed81
parent	e3a67bdb6e3ab6a6850c546a3772998fe5242069 (diff)
bcachefs: Really don't hold btree locks while btree IOs are in flight
This is something we've attempted to stick to for quite some time, as it helps guarantee filesystem latency - but there are a few remaining paths that this patch fixes.

This is also necessary for an upcoming patch to update btree pointers after every btree write - since the btree write completion path will now be doing btree operations.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--	fs/bcachefs/btree_cache.c	95
-rw-r--r--	fs/bcachefs/btree_io.c	51
-rw-r--r--	fs/bcachefs/btree_io.h	26
-rw-r--r--	fs/bcachefs/btree_update_interior.c	2
-rw-r--r--	fs/bcachefs/debug.c	4
5 files changed, 116 insertions, 62 deletions
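
Before the per-file diffs, it may help to see the core pattern the patch converges on, condensed from the bch2_btree_node_fill() hunk below. This is an annotated excerpt for orientation, not standalone compilable code; the variables c, b, iter, sync, seq and lock_type come from the surrounding function:

	/*
	 * Mark the IO as in flight, remember the lock sequence number, drop
	 * the node locks entirely, do the IO, then try to retake the lock at
	 * the same sequence number.  If someone else locked the node in the
	 * meantime, the relock fails and the caller restarts instead of
	 * blocking on IO with btree locks held.
	 */
	set_btree_node_read_in_flight(b);

	six_unlock_write(&b->c.lock);
	seq = b->c.lock.state.seq;		/* remember where we left off */
	six_unlock_intent(&b->c.lock);

	bch2_btree_node_read(c, b, sync);	/* IO runs with no btree locks held */

	if (iter && !bch2_trans_relock(iter->trans))
		return ERR_PTR(-EINTR);

	if (!six_relock_type(&b->c.lock, lock_type, seq))
		return ERR_PTR(-EINTR);		/* lock was taken meanwhile */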
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 15f597ab03e7..051d2867ad63 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -187,6 +187,17 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
int ret = 0;
lockdep_assert_held(&bc->lock);
+wait_on_io:
+ if (b->flags & ((1U << BTREE_NODE_dirty)|
+ (1U << BTREE_NODE_read_in_flight)|
+ (1U << BTREE_NODE_write_in_flight))) {
+ if (!flush)
+ return -ENOMEM;
+
+ /* XXX: waiting on IO with btree cache lock held */
+ bch2_btree_node_wait_on_read(b);
+ bch2_btree_node_wait_on_write(b);
+ }
if (!six_trylock_intent(&b->c.lock))
return -ENOMEM;
@@ -194,25 +205,26 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
if (!six_trylock_write(&b->c.lock))
goto out_unlock_intent;
+ /* recheck under lock */
+ if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
+ (1U << BTREE_NODE_write_in_flight))) {
+ if (!flush)
+ goto out_unlock;
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
+
if (btree_node_noevict(b))
goto out_unlock;
if (!btree_node_may_write(b))
goto out_unlock;
- if (btree_node_dirty(b) &&
- test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
- goto out_unlock;
-
- if (btree_node_dirty(b) ||
- btree_node_write_in_flight(b) ||
- btree_node_read_in_flight(b)) {
- if (!flush)
+ if (btree_node_dirty(b)) {
+ if (!flush ||
+ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
goto out_unlock;
-
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
-
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
@@ -224,8 +236,9 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
else
__bch2_btree_node_write(c, b);
- /* wait for any in flight btree write */
- btree_node_wait_on_io(b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
}
out:
if (b->hash_val && !ret)
@@ -581,6 +594,7 @@ got_node:
}
BUG_ON(btree_node_hashed(b));
+ BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_write_in_flight(b));
out:
b->flags = 0;
@@ -634,6 +648,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
+ u32 seq;
BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
/*
@@ -663,31 +678,31 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
return NULL;
}
+ set_btree_node_read_in_flight(b);
+
+ six_unlock_write(&b->c.lock);
+ seq = b->c.lock.state.seq;
+ six_unlock_intent(&b->c.lock);
+
/* Unlock before doing IO: */
if (iter && sync)
bch2_trans_unlock(iter->trans);
bch2_btree_node_read(c, b, sync);
- six_unlock_write(&b->c.lock);
-
- if (!sync) {
- six_unlock_intent(&b->c.lock);
+ if (!sync)
return NULL;
- }
/*
* XXX: this will probably always fail because btree_iter_relock()
* currently fails for iterators that aren't pointed at a valid btree
* node
*/
- if (iter && !bch2_trans_relock(iter->trans)) {
- six_unlock_intent(&b->c.lock);
+ if (iter && !bch2_trans_relock(iter->trans))
return ERR_PTR(-EINTR);
- }
- if (lock_type == SIX_LOCK_read)
- six_lock_downgrade(&b->c.lock);
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ return ERR_PTR(-EINTR);
return b;
}
@@ -831,11 +846,12 @@ lock_node:
}
if (unlikely(btree_node_read_in_flight(b))) {
+ u32 seq = b->c.lock.state.seq;
+
six_unlock_type(&b->c.lock, lock_type);
bch2_trans_unlock(iter->trans);
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
+ bch2_btree_node_wait_on_read(b);
/*
* XXX: check if this always fails - btree_iter_relock()
@@ -844,7 +860,9 @@ lock_node:
*/
if (iter && !bch2_trans_relock(iter->trans))
return ERR_PTR(-EINTR);
- goto retry;
+
+ if (!six_relock_type(&b->c.lock, lock_type, seq))
+ goto retry;
}
prefetch(b->aux_data);
@@ -923,8 +941,7 @@ lock_node:
}
/* XXX: waiting on IO with btree locks held: */
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
+ __bch2_btree_node_wait_on_read(b);
prefetch(b->aux_data);
@@ -979,16 +996,24 @@ void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
b = btree_cache_find(bc, k);
if (!b)
return;
+wait_on_io:
+ /* not allowed to wait on io with btree locks held: */
+
+ /* XXX we're called from btree_gc which will be holding other btree
+ * nodes locked
+ * */
+ __bch2_btree_node_wait_on_read(b);
+ __bch2_btree_node_wait_on_write(b);
six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL);
- wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
- TASK_UNINTERRUPTIBLE);
- __bch2_btree_node_write(c, b);
-
- /* wait for any in flight btree write */
- btree_node_wait_on_io(b);
+ if (btree_node_dirty(b)) {
+ __bch2_btree_node_write(c, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ goto wait_on_io;
+ }
BUG_ON(btree_node_dirty(b));
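
The __btree_node_reclaim() and bch2_btree_node_evict() hunks above implement a "wait with no locks held, trylock, recheck under the lock" loop. As a rough user-space analogue of that shape (hypothetical struct and helper names, with pthreads and C11 atomics standing in for six locks and wait_on_bit_io(); this is not bcachefs code):

	#include <errno.h>
	#include <pthread.h>
	#include <sched.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	struct node {
		pthread_mutex_t	lock;		/* stands in for the six lock */
		atomic_bool	io_in_flight;	/* stands in for BTREE_NODE_*_in_flight */
	};

	/* stand-in for wait_on_bit_io(): spin until the flag clears */
	static void node_wait_on_io(struct node *n)
	{
		while (atomic_load(&n->io_in_flight))
			sched_yield();
	}

	static int reclaim_node(struct node *n, bool flush)
	{
	wait_on_io:
		if (atomic_load(&n->io_in_flight)) {
			if (!flush)
				return -ENOMEM;
			node_wait_on_io(n);		/* wait with no lock held */
		}

		if (pthread_mutex_trylock(&n->lock))
			return -ENOMEM;

		/* recheck under the lock: IO may have restarted in the window */
		if (atomic_load(&n->io_in_flight)) {
			if (!flush) {
				pthread_mutex_unlock(&n->lock);
				return -ENOMEM;
			}
			pthread_mutex_unlock(&n->lock);
			goto wait_on_io;		/* drop the lock before waiting again */
		}

		/* node is locked and idle: safe to evict here */
		pthread_mutex_unlock(&n->lock);
		return 0;
	}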
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 0095c789f6e5..2974b2ad6966 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -22,6 +22,44 @@
#include <linux/sched/mm.h>
+void bch2_btree_node_io_unlock(struct btree *b)
+{
+ EBUG_ON(!btree_node_write_in_flight(b));
+
+ clear_btree_node_write_in_flight(b);
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+void bch2_btree_node_io_lock(struct btree *b)
+{
+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_read(struct btree *b)
+{
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_write(struct btree *b)
+{
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_read(struct btree *b)
+{
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_write(struct btree *b)
+{
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+ TASK_UNINTERRUPTIBLE);
+}
+
static void verify_no_dups(struct btree *b,
struct bkey_packed *start,
struct bkey_packed *end)
@@ -432,7 +470,8 @@ void bch2_btree_init_next(struct btree_trans *trans,
EBUG_ON(iter && iter->l[b->c.level].b != b);
BUG_ON(bset_written(b, bset(b, &b->set[1])));
- if (b->nsets == MAX_BSETS) {
+ if (b->nsets == MAX_BSETS &&
+ !btree_node_write_in_flight(b)) {
unsigned log_u64s[] = {
ilog2(bset_u64s(&b->set[0])),
ilog2(bset_u64s(&b->set[1])),
@@ -1402,8 +1441,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
btree_pos_to_text(&PBUF(buf), c, b);
trace_btree_read(c, b);
- set_btree_node_read_in_flight(b);
-
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
return;
@@ -1480,6 +1517,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
bkey_copy(&b->key, k);
BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
+ set_btree_node_read_in_flight(b);
+
bch2_btree_node_read(c, b, true);
if (btree_node_read_error(b)) {
@@ -1525,7 +1564,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
struct btree_write *w = btree_prev_write(b);
bch2_btree_complete_write(c, b, w);
- btree_node_io_unlock(b);
+ bch2_btree_node_io_unlock(b);
}
static void bch2_btree_node_write_error(struct bch_fs *c,
@@ -1707,6 +1746,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
bool validate_before_checksum = false;
void *data;
+ BUG_ON(btree_node_write_in_flight(b));
+
if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
return;
@@ -1734,7 +1775,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
* XXX waiting on btree writes with btree locks held -
* this can deadlock, and we hit the write error path
*/
- btree_node_wait_on_io(b);
+ bch2_btree_node_wait_on_write(b);
continue;
}
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index fae67622c127..89fd4aba5218 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -52,24 +52,12 @@ struct btree_write_bio {
struct bch_write_bio wbio;
};
-static inline void btree_node_io_unlock(struct btree *b)
-{
- EBUG_ON(!btree_node_write_in_flight(b));
- clear_btree_node_write_in_flight(b);
- wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-static inline void btree_node_io_lock(struct btree *b)
-{
- wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
-
-static inline void btree_node_wait_on_io(struct btree *b)
-{
- wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
- TASK_UNINTERRUPTIBLE);
-}
+void bch2_btree_node_io_unlock(struct btree *);
+void bch2_btree_node_io_lock(struct btree *);
+void __bch2_btree_node_wait_on_read(struct btree *);
+void __bch2_btree_node_wait_on_write(struct btree *);
+void bch2_btree_node_wait_on_read(struct btree *);
+void bch2_btree_node_wait_on_write(struct btree *);
static inline bool btree_node_may_write(struct btree *b)
{
@@ -169,7 +157,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
}
six_unlock_type(&b->c.lock, lock_held);
- btree_node_wait_on_io(b);
+ bch2_btree_node_wait_on_write(b);
btree_node_lock_type(c, b, lock_held);
}
}
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index a5d3973e163a..37dadbae41e5 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -567,7 +567,7 @@ static void btree_update_nodes_written(struct btree_update *as)
six_unlock_read(&old->c.lock);
if (seq == as->old_nodes_seq[i])
- btree_node_wait_on_io(old);
+ bch2_btree_node_wait_on_write(old);
}
/*
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index cbadb38f680f..6a28de30ea3b 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -133,7 +133,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
if (c->opts.nochanges)
return;
- btree_node_io_lock(b);
+ bch2_btree_node_io_lock(b);
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
@@ -176,7 +176,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
}
out:
mutex_unlock(&c->verify_lock);
- btree_node_io_unlock(b);
+ bch2_btree_node_io_unlock(b);
}
#ifdef CONFIG_DEBUG_FS