From d01f633041c50452f9e837b7cc0223e6f37d42da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 4 Sep 2022 14:21:58 -0400 Subject: Update bcachefs sources to 176718966e bcachefs: Re-enable hash_redo_key() --- .bcachefs_revision | 2 +- include/linux/list.h | 1 + include/linux/six.h | 30 ++- include/trace/events/bcachefs.h | 246 +++++++++-------- libbcachefs/alloc_background.c | 3 +- libbcachefs/alloc_foreground.c | 29 +-- libbcachefs/bcachefs.h | 16 +- libbcachefs/bcachefs_format.h | 81 +++++- libbcachefs/btree_cache.c | 47 ++-- libbcachefs/btree_cache.h | 6 +- libbcachefs/btree_gc.c | 44 ++-- libbcachefs/btree_io.c | 12 +- libbcachefs/btree_iter.c | 507 +++--------------------------------- libbcachefs/btree_iter.h | 25 +- libbcachefs/btree_key_cache.c | 244 +++++++++++------ libbcachefs/btree_locking.c | 466 +++++++++++++++++++++++++++++++++ libbcachefs/btree_locking.h | 279 ++++++++++++-------- libbcachefs/btree_types.h | 17 +- libbcachefs/btree_update_interior.c | 170 ++++++------ libbcachefs/btree_update_interior.h | 1 + libbcachefs/btree_update_leaf.c | 65 +++-- libbcachefs/data_update.c | 19 +- libbcachefs/debug.c | 3 + libbcachefs/disk_groups.c | 28 +- libbcachefs/disk_groups.h | 1 + libbcachefs/fsck.c | 13 +- libbcachefs/io.c | 10 +- libbcachefs/journal.c | 4 +- libbcachefs/journal_io.c | 2 +- libbcachefs/journal_reclaim.c | 5 +- libbcachefs/move.c | 6 +- libbcachefs/movinggc.c | 4 +- libbcachefs/str_hash.h | 43 ++- libbcachefs/super-io.c | 2 +- libbcachefs/super.c | 18 ++ libbcachefs/sysfs.c | 19 -- linux/six.c | 461 +++++++++++++++----------------- 37 files changed, 1632 insertions(+), 1297 deletions(-) create mode 100644 libbcachefs/btree_locking.c diff --git a/.bcachefs_revision b/.bcachefs_revision index 23636ff3..f3aeb2c0 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -a7694865a3008d6752370caee2ed3c64c1b0f973 +176718966e14c5f832ead8cea2e0e45aba51f5ef diff --git a/include/linux/list.h b/include/linux/list.h index 3639dc99..dcc4745f 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -10,6 +10,7 @@ #define list_add(n, h) cds_list_add(n, h) #define list_add_tail(n, h) cds_list_add_tail(n, h) #define __list_del_entry(l) cds_list_del(l) +#define __list_del(p, n) __cds_list_del(p, n) #define list_del(l) cds_list_del(l) #define list_del_init(l) cds_list_del_init(l) #define list_replace(o, n) cds_list_replace(o, n) diff --git a/include/linux/six.h b/include/linux/six.h index 41ddf63b..f336ae04 100644 --- a/include/linux/six.h +++ b/include/linux/six.h @@ -59,7 +59,6 @@ */ #include -#include #include #include @@ -105,18 +104,23 @@ enum six_lock_type { struct six_lock { union six_lock_state state; - unsigned intent_lock_recurse; struct task_struct *owner; - struct optimistic_spin_queue osq; unsigned __percpu *readers; - + unsigned intent_lock_recurse; raw_spinlock_t wait_lock; - struct list_head wait_list[2]; + struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; +struct six_lock_waiter { + struct list_head list; + struct task_struct *task; + enum six_lock_type lock_want; + bool lock_acquired; +}; + typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); static __always_inline void __six_lock_init(struct six_lock *lock, @@ -125,8 +129,7 @@ static __always_inline void __six_lock_init(struct six_lock *lock, { atomic64_set(&lock->state.counter, 0); raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); - INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); + 
INIT_LIST_HEAD(&lock->wait_list); #ifdef CONFIG_DEBUG_LOCK_ALLOC debug_check_no_locks_freed((void *) lock, sizeof(*lock)); lockdep_init_map(&lock->dep_map, name, key, 0); @@ -146,6 +149,8 @@ do { \ bool six_trylock_##type(struct six_lock *); \ bool six_relock_##type(struct six_lock *, u32); \ int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ +int six_lock_waiter_##type(struct six_lock *, struct six_lock_waiter *, \ + six_lock_should_sleep_fn, void *); \ void six_unlock_##type(struct six_lock *); __SIX_LOCK(read) @@ -182,6 +187,13 @@ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); } +static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p); +} + static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) { SIX_LOCK_DISPATCH(type, six_unlock, lock); @@ -196,13 +208,11 @@ void six_lock_increment(struct six_lock *, enum six_lock_type); void six_lock_wakeup_all(struct six_lock *); -void six_lock_pcpu_free_rcu(struct six_lock *); void six_lock_pcpu_free(struct six_lock *); void six_lock_pcpu_alloc(struct six_lock *); struct six_lock_count { - unsigned read; - unsigned intent; + unsigned n[3]; }; struct six_lock_count six_lock_counts(struct six_lock *); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index a18c59a3..ff5e6f7c 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -18,7 +18,7 @@ __entry->dst##_snapshot = (src).snapshot DECLARE_EVENT_CLASS(bpos, - TP_PROTO(struct bpos *p), + TP_PROTO(const struct bpos *p), TP_ARGS(p), TP_STRUCT__entry( @@ -52,6 +52,31 @@ DECLARE_EVENT_CLASS(bkey, __entry->offset, __entry->size) ); +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u8, level ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->level = b->c.level; + __entry->btree_id = b->c.btree_id; + TRACE_BPOS_assign(pos, b->key.k.p); + ), + + TP_printk("%d,%d %u %s %llu:%llu:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->level, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +); + DECLARE_EVENT_CLASS(bch_fs, TP_PROTO(struct bch_fs *c), TP_ARGS(c), @@ -112,7 +137,7 @@ TRACE_EVENT(write_super, /* io.c: */ -DEFINE_EVENT(bio, read_split, +DEFINE_EVENT(bio, read_promote, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); @@ -122,12 +147,17 @@ DEFINE_EVENT(bio, read_bounce, TP_ARGS(bio) ); +DEFINE_EVENT(bio, read_split, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + DEFINE_EVENT(bio, read_retry, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, promote, +DEFINE_EVENT(bio, read_reuse_race, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); @@ -220,48 +250,68 @@ TRACE_EVENT(journal_reclaim_finish, __entry->nr_flushed) ); -/* allocator: */ - /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, - TP_PROTO(struct bpos *p), + TP_PROTO(const struct bpos *p), TP_ARGS(p) ); -/* Btree */ +/* Btree cache: */ -DECLARE_EVENT_CLASS(btree_node, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b), +TRACE_EVENT(btree_cache_scan, + TP_PROTO(long nr_to_scan, long can_free, long ret), + 
TP_ARGS(nr_to_scan, can_free, ret), TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u8, level ) - __field(u8, btree_id ) - TRACE_BPOS_entries(pos) + __field(long, nr_to_scan ) + __field(long, can_free ) + __field(long, ret ) ), TP_fast_assign( - __entry->dev = c->dev; - __entry->level = b->c.level; - __entry->btree_id = b->c.btree_id; - TRACE_BPOS_assign(pos, b->key.k.p); + __entry->nr_to_scan = nr_to_scan; + __entry->can_free = can_free; + __entry->ret = ret; ), - TP_printk("%d,%d %u %s %llu:%llu:%u", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->level, - bch2_btree_ids[__entry->btree_id], - __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) + TP_printk("scanned for %li nodes, can free %li, ret %li", + __entry->nr_to_scan, __entry->can_free, __entry->ret) ); -DEFINE_EVENT(btree_node, btree_read, +DEFINE_EVENT(btree_node, btree_cache_reap, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(btree_write, +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +/* Btree */ + +DEFINE_EVENT(btree_node, btree_node_read, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +TRACE_EVENT(btree_node_write, TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), TP_ARGS(b, bytes, sectors), @@ -291,31 +341,6 @@ DEFINE_EVENT(btree_node, btree_node_free, TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_node_reap, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - TRACE_EVENT(btree_reserve_get_fail, TP_PROTO(const char *trans_fn, unsigned long caller_ip, @@ -323,7 +348,7 @@ TRACE_EVENT(btree_reserve_get_fail, TP_ARGS(trans_fn, caller_ip, required), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(size_t, required ) ), @@ -340,52 +365,32 @@ TRACE_EVENT(btree_reserve_get_fail, __entry->required) ); -DEFINE_EVENT(btree_node, btree_split, +DEFINE_EVENT(btree_node, btree_node_compact, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_compact, +DEFINE_EVENT(btree_node, btree_node_merge, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_merge, +DEFINE_EVENT(btree_node, btree_node_split, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_rewrite, +DEFINE_EVENT(btree_node, btree_node_rewrite, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -DEFINE_EVENT(btree_node, btree_set_root, +DEFINE_EVENT(btree_node, btree_node_set_root, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) ); -TRACE_EVENT(btree_cache_scan, - TP_PROTO(long nr_to_scan, long can_free, long ret), - TP_ARGS(nr_to_scan, can_free, ret), - - TP_STRUCT__entry( - __field(long, nr_to_scan ) - __field(long, 
can_free ) - __field(long, ret ) - ), - - TP_fast_assign( - __entry->nr_to_scan = nr_to_scan; - __entry->can_free = can_free; - __entry->ret = ret; - ), - - TP_printk("scanned for %li nodes, can free %li, ret %li", - __entry->nr_to_scan, __entry->can_free, __entry->ret) -); - -TRACE_EVENT(btree_node_relock_fail, +TRACE_EVENT(btree_path_relock_fail, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, struct btree_path *path, @@ -393,26 +398,31 @@ TRACE_EVENT(btree_node_relock_fail, TP_ARGS(trans, caller_ip, path, level), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) TRACE_BPOS_entries(pos) - __field(unsigned long, node ) + __array(char, node, 24 ) __field(u32, iter_lock_seq ) __field(u32, node_lock_seq ) ), TP_fast_assign( + struct btree *b = btree_path_node(path, level); + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; TRACE_BPOS_assign(pos, path->pos); - __entry->node = (unsigned long) btree_path_node(path, level); + if (IS_ERR(b)) + strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); + else + scnprintf(__entry->node, sizeof(__entry->node), "%px", b); __entry->iter_lock_seq = path->l[level].lock_seq; __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u, node %s iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_ids[__entry->btree_id], @@ -424,7 +434,7 @@ TRACE_EVENT(btree_node_relock_fail, __entry->node_lock_seq) ); -TRACE_EVENT(btree_node_upgrade_fail, +TRACE_EVENT(btree_path_upgrade_fail, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, struct btree_path *path, @@ -432,7 +442,7 @@ TRACE_EVENT(btree_node_upgrade_fail, TP_ARGS(trans, caller_ip, path, level), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) TRACE_BPOS_entries(pos) @@ -452,12 +462,12 @@ TRACE_EVENT(btree_node_upgrade_fail, TRACE_BPOS_assign(pos, path->pos); __entry->locked = btree_node_locked(path, level); - c = bch2_btree_node_lock_counts(trans, NULL, path->l[level].b, level), - __entry->self_read_count = c.read; - __entry->self_intent_count = c.intent; + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), + __entry->self_read_count = c.n[SIX_LOCK_read]; + __entry->self_intent_count = c.n[SIX_LOCK_intent]; c = six_lock_counts(&path->l[level].b->c.lock); - __entry->read_count = c.read; - __entry->intent_count = c.intent; + __entry->read_count = c.n[SIX_LOCK_read]; + __entry->intent_count = c.n[SIX_LOCK_read]; ), TP_printk("%s %pS btree %s pos %llu:%llu:%u, locked %u held %u:%u lock count %u:%u", @@ -599,7 +609,7 @@ TRACE_EVENT(discard_buckets, __entry->err) ); -TRACE_EVENT(invalidate_bucket, +TRACE_EVENT(bucket_invalidate, TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), TP_ARGS(c, dev, bucket, sectors), @@ -625,17 +635,27 @@ TRACE_EVENT(invalidate_bucket, /* Moving IO */ -DEFINE_EVENT(bkey, move_extent, +DEFINE_EVENT(bkey, move_extent_read, + TP_PROTO(const struct bkey *k), + TP_ARGS(k) +); + +DEFINE_EVENT(bkey, move_extent_write, + TP_PROTO(const struct bkey *k), + TP_ARGS(k) +); + +DEFINE_EVENT(bkey, move_extent_finish, TP_PROTO(const struct bkey *k), TP_ARGS(k) ); 
-DEFINE_EVENT(bkey, move_alloc_mem_fail, +DEFINE_EVENT(bkey, move_extent_race, TP_PROTO(const struct bkey *k), TP_ARGS(k) ); -DEFINE_EVENT(bkey, move_race, +DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, TP_PROTO(const struct bkey *k), TP_ARGS(k) ); @@ -714,13 +734,15 @@ TRACE_EVENT(copygc_wait, __entry->wait_amount, __entry->until) ); +/* btree transactions: */ + DECLARE_EVENT_CLASS(transaction_event, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), TP_ARGS(trans, caller_ip), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) ), @@ -738,7 +760,7 @@ DEFINE_EVENT(transaction_event, transaction_commit, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, transaction_restart_injected, +DEFINE_EVENT(transaction_event, trans_restart_injected, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), TP_ARGS(trans, caller_ip) @@ -756,10 +778,28 @@ DEFINE_EVENT(transaction_event, trans_restart_journal_res_get, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, trans_restart_journal_preres_get, + +TRACE_EVENT(trans_restart_journal_preres_get, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + unsigned long caller_ip, + unsigned flags), + TP_ARGS(trans, caller_ip, flags), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(unsigned, flags ) + ), + + TP_fast_assign( + strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), + + TP_printk("%s %pS %x", __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->flags) ); DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim, @@ -805,7 +845,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, TP_ARGS(trans, caller_ip, path), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) TRACE_BPOS_entries(pos) @@ -883,7 +923,7 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, TP_ARGS(trans, caller_ip, path) ); -DEFINE_EVENT(transaction_event, transaction_restart_key_cache_upgrade, +DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), TP_ARGS(trans, caller_ip) @@ -935,7 +975,7 @@ TRACE_EVENT(trans_restart_would_deadlock, have, want, want_pos), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(u8, in_traverse_all ) __field(u8, reason ) @@ -982,7 +1022,7 @@ TRACE_EVENT(trans_restart_would_deadlock_write, TP_ARGS(trans), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) ), TP_fast_assign( @@ -999,7 +1039,7 @@ TRACE_EVENT(trans_restart_mem_realloced, TP_ARGS(trans, caller_ip, bytes), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(unsigned long, bytes ) ), @@ -1025,7 +1065,7 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), TP_STRUCT__entry( - __array(char, trans_fn, 24 ) + __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) __field(enum btree_id, btree_id ) TRACE_BPOS_entries(pos) diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 2281b8d4..d0d7690a 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -1217,8 
+1217,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (ret) goto out; - trace_invalidate_bucket(c, bucket.inode, bucket.offset, cached_sectors); - this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]); + trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); --*nr_to_invalidate; out: bch2_trans_iter_exit(trans, &alloc_iter); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index c57baa1f..dce227c5 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -268,7 +268,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * spin_unlock(&c->freelist_lock); - trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]); + trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve]); return ob; } @@ -575,20 +575,19 @@ err: if (!ob) ob = ERR_PTR(-BCH_ERR_no_buckets_found); - if (IS_ERR(ob)) { - trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], - usage.d[BCH_DATA_free].buckets, - avail, - bch2_copygc_wait_amount(c), - c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), - buckets_seen, - skipped_open, - skipped_need_journal_commit, - skipped_nouse, - cl == NULL, - bch2_err_str(PTR_ERR(ob))); - atomic_long_inc(&c->bucket_alloc_fail); - } + if (IS_ERR(ob)) + trace_and_count(c, bucket_alloc_fail, + ca, bch2_alloc_reserves[reserve], + usage.d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + buckets_seen, + skipped_open, + skipped_need_journal_commit, + skipped_nouse, + cl == NULL, + bch2_err_str(PTR_ERR(ob))); return ob; } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index a5bf8087..53e7b5a0 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -212,6 +212,12 @@ #define dynamic_fault(...) 0 #define race_fault(...) 0 +#define trace_and_count(_c, _name, ...) 
\ +do { \ + this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \ + trace_##_name(__VA_ARGS__); \ +} while (0) + #define bch2_fs_init_fault(name) \ dynamic_fault("bcachefs:bch_fs_init:" name) #define bch2_meta_read_fault(name) \ @@ -329,9 +335,6 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_interior_update_foreground) \ x(btree_interior_update_total) \ x(btree_gc) \ - x(btree_lock_contended_read) \ - x(btree_lock_contended_intent) \ - x(btree_lock_contended_write) \ x(data_write) \ x(data_read) \ x(data_promote) \ @@ -535,6 +538,7 @@ struct btree_transaction_stats { struct mutex lock; struct time_stats lock_hold_times; unsigned nr_max_paths; + unsigned max_mem; char *max_paths_text; }; @@ -917,12 +921,6 @@ struct bch_fs { u64 last_bucket_seq_cleanup; - /* TODO rewrite as counters - The rest of this all shows up in sysfs */ - atomic_long_t read_realloc_races; - atomic_long_t extent_migrate_done; - atomic_long_t extent_migrate_raced; - atomic_long_t bucket_alloc_fail; - u64 counters_on_mount[BCH_COUNTER_NR]; u64 __percpu *counters; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 147fde14..7730e955 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1337,12 +1337,81 @@ struct bch_sb_field_disk_groups { /* BCH_SB_FIELD_counters */ -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) \ + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) \ + x(bucket_alloc, 5) \ + x(bucket_alloc_fail, 6) \ + x(btree_cache_scan, 7) \ + x(btree_cache_reap, 8) \ + x(btree_cache_cannibalize, 9) \ + x(btree_cache_cannibalize_lock, 10) \ + x(btree_cache_cannibalize_lock_fail, 11) \ + x(btree_cache_cannibalize_unlock, 12) \ + x(btree_node_write, 13) \ + x(btree_node_read, 14) \ + x(btree_node_compact, 15) \ + x(btree_node_merge, 16) \ + x(btree_node_split, 17) \ + x(btree_node_rewrite, 18) \ + x(btree_node_alloc, 19) \ + x(btree_node_free, 20) \ + x(btree_node_set_root, 21) \ + x(btree_path_relock_fail, 22) \ + x(btree_path_upgrade_fail, 23) \ + x(btree_reserve_get_fail, 24) \ + x(journal_entry_full, 25) \ + x(journal_full, 26) \ + x(journal_reclaim_finish, 27) \ + x(journal_reclaim_start, 28) \ + x(journal_write, 29) \ + x(read_promote, 30) \ + x(read_bounce, 31) \ + x(read_split, 33) \ + x(read_retry, 32) \ + x(read_reuse_race, 34) \ + x(move_extent_read, 35) \ + x(move_extent_write, 36) \ + x(move_extent_finish, 37) \ + x(move_extent_race, 38) \ + x(move_extent_alloc_mem_fail, 39) \ + x(copygc, 40) \ + x(copygc_wait, 41) \ + x(gc_gens_end, 42) \ + x(gc_gens_start, 43) \ + x(trans_blocked_journal_reclaim, 44) \ + x(trans_restart_btree_node_reused, 45) \ + x(trans_restart_btree_node_split, 46) \ + x(trans_restart_fault_inject, 47) \ + x(trans_restart_iter_upgrade, 48) \ + x(trans_restart_journal_preres_get, 49) \ + x(trans_restart_journal_reclaim, 50) \ + x(trans_restart_journal_res_get, 51) \ + x(trans_restart_key_cache_key_realloced, 52) \ + x(trans_restart_key_cache_raced, 53) \ + x(trans_restart_mark_replicas, 54) \ + x(trans_restart_mem_realloced, 55) \ + x(trans_restart_memory_allocation_failure, 56) \ + x(trans_restart_relock, 57) \ + x(trans_restart_relock_after_fill, 58) \ + x(trans_restart_relock_key_cache_fill, 59) \ + x(trans_restart_relock_next_node, 60) \ + x(trans_restart_relock_parent_for_fill, 61) \ + x(trans_restart_relock_path, 62) \ + x(trans_restart_relock_path_intent, 
63) \ + x(trans_restart_too_many_iters, 64) \ + x(trans_restart_traverse, 65) \ + x(trans_restart_upgrade, 66) \ + x(trans_restart_would_deadlock, 67) \ + x(trans_restart_would_deadlock_write, 68) \ + x(trans_restart_injected, 69) \ + x(trans_restart_key_cache_upgrade, 70) \ + x(trans_traverse_all, 71) \ + x(transaction_commit, 72) \ + x(write_super, 73) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 5a6c93d1..dabdb25c 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -14,8 +14,6 @@ #include #include -struct lock_class_key bch2_btree_node_lock_key; - const char * const bch2_btree_node_flags[] = { #define x(f) #f, BTREE_FLAGS() @@ -254,7 +252,7 @@ wait_on_io: } out: if (b->hash_val && !ret) - trace_btree_node_reap(c, b); + trace_and_count(c, btree_cache_reap, c, b); return ret; out_unlock: six_unlock_write(&b->c.lock); @@ -378,7 +376,7 @@ out: ret = freed; memalloc_nofs_restore(flags); out_norestore: - trace_btree_cache_scan(sc->nr_to_scan, can_free, ret); + trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret); return ret; } @@ -514,7 +512,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) struct btree_cache *bc = &c->btree_cache; if (bc->alloc_lock == current) { - trace_btree_node_cannibalize_unlock(c); + trace_and_count(c, btree_cache_cannibalize_unlock, c); bc->alloc_lock = NULL; closure_wake_up(&bc->alloc_wait); } @@ -530,7 +528,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; if (!cl) { - trace_btree_node_cannibalize_lock_fail(c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, c); return -ENOMEM; } @@ -544,11 +542,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; } - trace_btree_node_cannibalize_lock_fail(c); + trace_and_count(c, btree_cache_cannibalize_lock_fail, c); return -EAGAIN; success: - trace_btree_node_cannibalize_lock(c); + trace_and_count(c, btree_cache_cannibalize_lock, c); return 0; } @@ -672,7 +670,7 @@ err_locked: mutex_unlock(&bc->lock); - trace_btree_node_cannibalize(c); + trace_and_count(c, btree_cache_cannibalize, c); goto out; } @@ -701,7 +699,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * been freed: */ if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { - trace_trans_restart_relock_parent_for_fill(trans, _THIS_IP_, path); + trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); } @@ -709,7 +707,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (trans && b == ERR_PTR(-ENOMEM)) { trans->memory_allocation_failure = true; - trace_trans_restart_memory_allocation_failure(trans, _THIS_IP_, path); + trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); } @@ -758,7 +756,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (!six_relock_type(&b->c.lock, lock_type, seq)) { if (trans) - trace_trans_restart_relock_after_fill(trans, _THIS_IP_, path); + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } @@ -896,7 +894,7 @@ lock_node: if (btree_node_read_locked(path, level + 1)) 
btree_node_unlock(trans, path, level + 1); - ret = btree_node_lock(trans, path, b, k->k.p, level, lock_type, + ret = btree_node_lock(trans, path, &b->c, k->k.p, level, lock_type, lock_node_check_fn, (void *) k, trace_ip); if (unlikely(ret)) { if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) @@ -913,7 +911,7 @@ lock_node: if (bch2_btree_node_relock(trans, path, level + 1)) goto retry; - trace_trans_restart_btree_node_reused(trans, trace_ip, path); + trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); } } @@ -969,12 +967,13 @@ lock_node: return b; } -struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, +struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, const struct bkey_i *k, enum btree_id btree_id, unsigned level, bool nofill) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; @@ -1008,9 +1007,14 @@ retry: goto out; } else { lock_node: - ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); - if (ret) - goto retry; + ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused)) + goto retry; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + BUG(); + } if (unlikely(b->hash_val != btree_ptr_hash_val(k) || b->c.btree_id != btree_id || @@ -1072,8 +1076,9 @@ int bch2_btree_node_prefetch(struct bch_fs *c, return PTR_ERR_OR_ZERO(b); } -void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) +void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -1089,8 +1094,8 @@ wait_on_io: __bch2_btree_node_wait_on_read(b); __bch2_btree_node_wait_on_write(b); - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); if (btree_node_dirty(b)) { __bch2_btree_node_write(c, b, 0); diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 25906127..a4df3e86 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -5,8 +5,6 @@ #include "bcachefs.h" #include "btree_types.h" -extern struct lock_class_key bch2_btree_node_lock_key; - extern const char * const bch2_btree_node_flags[]; struct btree_iter; @@ -28,13 +26,13 @@ struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, const struct bkey_i *, unsigned, enum six_lock_type, unsigned long); -struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, +struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *, enum btree_id, unsigned, bool); int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *, const struct bkey_i *, enum btree_id, unsigned); -void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); +void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *); void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 2f563365..663c66d0 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -165,10 +165,11 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) } } 
-static void bch2_btree_node_update_key_early(struct bch_fs *c, +static void bch2_btree_node_update_key_early(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_i *new) { + struct bch_fs *c = trans->c; struct btree *b; struct bkey_buf tmp; int ret; @@ -176,7 +177,7 @@ static void bch2_btree_node_update_key_early(struct bch_fs *c, bch2_bkey_buf_init(&tmp); bch2_bkey_buf_reassemble(&tmp, c, old); - b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true); + b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); if (!IS_ERR_OR_NULL(b)) { mutex_lock(&c->btree_cache.lock); @@ -352,8 +353,9 @@ fsck_err: return ret; } -static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b) +static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) { + struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf prev_k, cur_k; @@ -378,7 +380,7 @@ again: bch2_btree_and_journal_iter_advance(&iter); bch2_bkey_buf_reassemble(&cur_k, c, k); - cur = bch2_btree_node_get_noiter(c, cur_k.k, + cur = bch2_btree_node_get_noiter(trans, cur_k.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(cur); @@ -392,7 +394,7 @@ again: bch2_btree_ids[b->c.btree_id], b->c.level - 1, buf.buf)) { - bch2_btree_node_evict(c, cur_k.k); + bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); cur = NULL; @@ -411,7 +413,7 @@ again: if (ret == DROP_THIS_NODE) { six_unlock_read(&cur->c.lock); - bch2_btree_node_evict(c, cur_k.k); + bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); cur = NULL; @@ -425,7 +427,7 @@ again: prev = NULL; if (ret == DROP_PREV_NODE) { - bch2_btree_node_evict(c, prev_k.k); + bch2_btree_node_evict(trans, prev_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, prev_k.k->k.p); if (ret) @@ -465,7 +467,7 @@ again: bch2_bkey_buf_reassemble(&cur_k, c, k); bch2_btree_and_journal_iter_advance(&iter); - cur = bch2_btree_node_get_noiter(c, cur_k.k, + cur = bch2_btree_node_get_noiter(trans, cur_k.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(cur); @@ -476,12 +478,12 @@ again: goto err; } - ret = bch2_btree_repair_topology_recurse(c, cur); + ret = bch2_btree_repair_topology_recurse(trans, cur); six_unlock_read(&cur->c.lock); cur = NULL; if (ret == DROP_THIS_NODE) { - bch2_btree_node_evict(c, cur_k.k); + bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); dropped_children = true; @@ -522,18 +524,21 @@ fsck_err: static int bch2_repair_topology(struct bch_fs *c) { + struct btree_trans trans; struct btree *b; unsigned i; int ret = 0; + bch2_trans_init(&trans, c, 0, 0); + for (i = 0; i < BTREE_ID_NR && !ret; i++) { b = c->btree_roots[i].b; if (btree_node_fake(b)) continue; - six_lock_read(&b->c.lock, NULL, NULL); - ret = bch2_btree_repair_topology_recurse(c, b); six_unlock_read(&b->c.lock); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + ret = bch2_btree_repair_topology_recurse(&trans, b); if (ret == DROP_THIS_NODE) { bch_err(c, "empty btree root - repair unimplemented"); @@ -541,13 +546,16 @@ static int bch2_repair_topology(struct bch_fs *c) } } + bch2_trans_exit(&trans); + return ret; } -static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, +static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id, 
unsigned level, bool is_root, struct bkey_s_c *k) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); const union bch_extent_entry *entry; struct extent_ptr_decoded p = { 0 }; @@ -747,7 +755,7 @@ found: } if (level) - bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); + bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); if (c->opts.verbose) { printbuf_reset(&buf); @@ -788,7 +796,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, BUG_ON(bch2_journal_seq_verify && k->k->version.lo > atomic64_read(&c->journal.seq)); - ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); + ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); if (ret) goto err; @@ -941,7 +949,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b bch2_bkey_buf_reassemble(&cur, c, k); bch2_btree_and_journal_iter_advance(&iter); - child = bch2_btree_node_get_noiter(c, cur.k, + child = bch2_btree_node_get_noiter(trans, cur.k, b->c.btree_id, b->c.level - 1, false); ret = PTR_ERR_OR_ZERO(child); @@ -1934,7 +1942,7 @@ int bch2_gc_gens(struct bch_fs *c) if (!mutex_trylock(&c->gc_gens_lock)) return 0; - trace_gc_gens_start(c); + trace_and_count(c, gc_gens_start, c); down_read(&c->gc_lock); bch2_trans_init(&trans, c, 0, 0); @@ -1995,7 +2003,7 @@ int bch2_gc_gens(struct bch_fs *c) c->gc_count++; bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); - trace_gc_gens_end(c); + trace_and_count(c, gc_gens_end, c); err: for_each_member_device(ca, c, i) { kvfree(ca->oldest_gen); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 8aad87ea..177fd49d 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1490,7 +1490,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, struct bio *bio; int ret; - trace_btree_read(c, b); + trace_and_count(c, btree_node_read, c, b); if (bch2_verify_all_btree_replicas && !btree_node_read_all_replicas(c, b, sync)) @@ -1657,9 +1657,15 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) static void btree_node_write_done(struct bch_fs *c, struct btree *b) { - six_lock_read(&b->c.lock, NULL, NULL); + struct btree_trans trans; + + bch2_trans_init(&trans, c, 0, 0); + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); __btree_node_write_done(c, b); six_unlock_read(&b->c.lock); + + bch2_trans_exit(&trans); } static void btree_node_write_work(struct work_struct *work) @@ -1979,7 +1985,7 @@ do_write: c->opts.nochanges) goto err; - trace_btree_write(b, bytes_to_write, sectors_to_write); + trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); wbio = container_of(bio_alloc_bioset(NULL, buf_pages(data, sectors_to_write << 9), diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 1d4b9fde..512c3b2b 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -130,443 +130,6 @@ static inline bool btree_path_pos_in_node(struct btree_path *path, !btree_path_pos_after_node(path, b); } -/* Btree node locking: */ - -void bch2_btree_node_unlock_write(struct btree_trans *trans, - struct btree_path *path, struct btree *b) -{ - bch2_btree_node_unlock_write_inlined(trans, path, b); -} - -struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, - struct btree_path *skip, - struct btree *b, - unsigned level) -{ - struct btree_path *path; - struct six_lock_count ret = { 0, 0 }; - - if (IS_ERR_OR_NULL(b)) - return ret; - - trans_for_each_path(trans, 
path) - if (path != skip && path->l[level].b == b) { - ret.read += btree_node_read_locked(path, level); - ret.intent += btree_node_intent_locked(path, level); - } - - return ret; -} - -static inline void six_lock_readers_add(struct six_lock *lock, int nr) -{ - if (!lock->readers) - atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); - else - this_cpu_add(*lock->readers, nr); -} - -void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) -{ - int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->c.level).read; - - /* - * Must drop our read locks before calling six_lock_write() - - * six_unlock() won't do wakeups until the reader count - * goes to 0, and it's safe because we have the node intent - * locked: - */ - six_lock_readers_add(&b->c.lock, -readers); - six_lock_write(&b->c.lock, NULL, NULL); - six_lock_readers_add(&b->c.lock, readers); -} - -bool __bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = btree_path_node(path, level); - int want = __btree_lock_want(path, level); - - if (!is_btree_node(path, level)) - goto fail; - - if (race_fault()) - goto fail; - - if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || - (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, want))) { - mark_btree_node_locked(trans, path, level, want); - return true; - } -fail: - if (b != ERR_PTR(-BCH_ERR_no_btree_node_cached) && - b != ERR_PTR(-BCH_ERR_no_btree_node_init)) - trace_btree_node_relock_fail(trans, _RET_IP_, path, level); - return false; -} - -bool bch2_btree_node_upgrade(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = path->l[level].b; - - if (!is_btree_node(path, level)) - return false; - - switch (btree_lock_want(path, level)) { - case BTREE_NODE_UNLOCKED: - BUG_ON(btree_node_locked(path, level)); - return true; - case BTREE_NODE_READ_LOCKED: - BUG_ON(btree_node_intent_locked(path, level)); - return bch2_btree_node_relock(trans, path, level); - case BTREE_NODE_INTENT_LOCKED: - break; - } - - if (btree_node_intent_locked(path, level)) - return true; - - if (race_fault()) - return false; - - if (btree_node_locked(path, level) - ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; - - if (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(trans, path, level); - goto success; - } - - trace_btree_node_upgrade_fail(trans, _RET_IP_, path, level); - return false; -success: - mark_btree_node_intent_locked(trans, path, level); - return true; -} - -static inline bool btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade) -{ - unsigned l = path->level; - int fail_idx = -1; - - do { - if (!btree_path_node(path, l)) - break; - - if (!(upgrade - ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) - fail_idx = l; - - l++; - } while (l < path->locks_want); - - /* - * When we fail to get a lock, we have to ensure that any child nodes - * can't be relocked so bch2_btree_path_traverse has to walk back up to - * the node that we failed to relock: - */ - if (fail_idx >= 0) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - - do { - path->l[fail_idx].b = upgrade - ? 
ERR_PTR(-BCH_ERR_no_btree_node_upgrade) - : ERR_PTR(-BCH_ERR_no_btree_node_relock); - --fail_idx; - } while (fail_idx >= 0); - } - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; - - bch2_trans_verify_locks(trans); - - return path->uptodate < BTREE_ITER_NEED_RELOCK; -} - -static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, - bool cached) -{ - return !cached - ? container_of(_b, struct btree, c)->key.k.p - : container_of(_b, struct bkey_cached, c)->key.pos; -} - -/* Slowpath: */ -int __bch2_btree_node_lock(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct bpos pos, unsigned level, - enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - struct btree_path *linked; - unsigned reason; - - /* Check if it's safe to block: */ - trans_for_each_path(trans, linked) { - if (!linked->nodes_locked) - continue; - - /* - * Can't block taking an intent lock if we have _any_ nodes read - * locked: - * - * - Our read lock blocks another thread with an intent lock on - * the same node from getting a write lock, and thus from - * dropping its intent lock - * - * - And the other thread may have multiple nodes intent locked: - * both the node we want to intent lock, and the node we - * already have read locked - deadlock: - */ - if (type == SIX_LOCK_intent && - linked->nodes_locked != linked->nodes_intent_locked) { - reason = 1; - goto deadlock; - } - - if (linked->btree_id != path->btree_id) { - if (linked->btree_id < path->btree_id) - continue; - - reason = 3; - goto deadlock; - } - - /* - * Within the same btree, non-cached paths come before cached - * paths: - */ - if (linked->cached != path->cached) { - if (!linked->cached) - continue; - - reason = 4; - goto deadlock; - } - - /* - * Interior nodes must be locked before their descendants: if - * another path has possible descendants locked of the node - * we're about to lock, it must have the ancestors locked too: - */ - if (level > __fls(linked->nodes_locked)) { - reason = 5; - goto deadlock; - } - - /* Must lock btree nodes in key order: */ - if (btree_node_locked(linked, level) && - bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, - linked->cached)) <= 0) { - reason = 7; - goto deadlock; - } - } - - return btree_node_lock_type(trans, path, b, pos, level, - type, should_sleep_fn, p); -deadlock: - trace_trans_restart_would_deadlock(trans, ip, reason, linked, path, &pos); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); -} - -/* Btree iterator locking: */ - -#ifdef CONFIG_BCACHEFS_DEBUG - -static void bch2_btree_path_verify_locks(struct btree_path *path) -{ - unsigned l; - - if (!path->nodes_locked) { - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level)); - return; - } - - for (l = 0; btree_path_node(path, l); l++) - BUG_ON(btree_lock_want(path, l) != - btree_node_locked_type(path, l)); -} - -void bch2_trans_verify_locks(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - bch2_btree_path_verify_locks(path); -} -#else -static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} -#endif - -/* Btree path locking: */ - -/* - * Only for btree_cache.c - only relocks intent locks - */ -int bch2_btree_path_relock_intent(struct btree_trans *trans, - struct btree_path *path) -{ - unsigned l; - - for (l = path->level; - l < path->locks_want && btree_path_node(path, l); - l++) { - if 
(!bch2_btree_node_relock(trans, path, l)) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_trans_restart_relock_path_intent(trans, _RET_IP_, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); - } - } - - return 0; -} - -__flatten -static bool bch2_btree_path_relock_norestart(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - return btree_path_get_locks(trans, path, false); -} - -static int bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { - trace_trans_restart_relock_path(trans, trace_ip, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); - } - - return 0; -} - -bool __bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - struct btree_path *linked; - - EBUG_ON(path->locks_want >= new_locks_want); - - path->locks_want = new_locks_want; - - if (btree_path_get_locks(trans, path, true)) - return true; - - /* - * XXX: this is ugly - we'd prefer to not be mucking with other - * iterators in the btree_trans here. - * - * On failure to upgrade the iterator, setting iter->locks_want and - * calling get_locks() is sufficient to make bch2_btree_path_traverse() - * get the locks we want on transaction restart. - * - * But if this iterator was a clone, on transaction restart what we did - * to this iterator isn't going to be preserved. - * - * Possibly we could add an iterator field for the parent iterator when - * an iterator is a copy - for now, we'll just upgrade any other - * iterators with the same btree id. - * - * The code below used to be needed to ensure ancestor nodes get locked - * before interior nodes - now that's handled by - * bch2_btree_path_traverse_all(). 
- */ - if (!path->cached && !trans->in_traverse_all) - trans_for_each_path(trans, linked) - if (linked != path && - linked->cached == path->cached && - linked->btree_id == path->btree_id && - linked->locks_want < new_locks_want) { - linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true); - } - - return false; -} - -void __bch2_btree_path_downgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - unsigned l; - - EBUG_ON(path->locks_want < new_locks_want); - - path->locks_want = new_locks_want; - - while (path->nodes_locked && - (l = __fls(path->nodes_locked)) >= path->locks_want) { - if (l > path->level) { - btree_node_unlock(trans, path, l); - } else { - if (btree_node_intent_locked(path, l)) { - six_lock_downgrade(&path->l[l].b->c.lock); - path->nodes_intent_locked ^= 1 << l; - } - break; - } - } - - bch2_btree_path_verify_locks(path); -} - -void bch2_trans_downgrade(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - bch2_btree_path_downgrade(trans, path); -} - -/* Btree transaction locking: */ - -int bch2_trans_relock(struct btree_trans *trans) -{ - struct btree_path *path; - - if (unlikely(trans->restarted)) - return -BCH_ERR_transaction_restart_relock; - - trans_for_each_path(trans, path) - if (path->should_be_locked && - bch2_btree_path_relock(trans, path, _RET_IP_)) { - trace_trans_restart_relock(trans, _RET_IP_, path); - BUG_ON(!trans->restarted); - return -BCH_ERR_transaction_restart_relock; - } - return 0; -} - -void bch2_trans_unlock(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - __bch2_btree_path_unlock(trans, path); - - /* - * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking - * btree nodes, it implements its own walking: - */ - BUG_ON(!trans->is_initial_gc && - lock_class_is_held(&bch2_btree_node_lock_key)); -} - /* Btree iterator: */ #ifdef CONFIG_BCACHEFS_DEBUG @@ -795,7 +358,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, if (cmp < 0) continue; - if (!(path->nodes_locked & 1) || + if (!btree_node_locked(path, 0) || !path->should_be_locked) continue; @@ -1161,13 +724,13 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) struct btree_path *path; trans_for_each_path(trans, path) - if (!path->cached && + if (path->uptodate == BTREE_ITER_UPTODATE && + !path->cached && btree_path_pos_in_node(path, b)) { enum btree_node_locked_type t = btree_lock_want(path, b->c.level); - if (path->nodes_locked && - t != BTREE_NODE_UNLOCKED) { + if (t != BTREE_NODE_UNLOCKED) { btree_node_unlock(trans, path, b->c.level); six_lock_increment(&b->c.lock, t); mark_btree_node_locked(trans, path, b->c.level, t); @@ -1232,7 +795,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, } lock_type = __btree_lock_want(path, path->level); - ret = btree_node_lock(trans, path, b, SPOS_MAX, + ret = btree_node_lock(trans, path, &b->c, SPOS_MAX, path->level, lock_type, lock_root_check_fn, rootp, trace_ip); @@ -1517,7 +1080,7 @@ err: trans->in_traverse_all = false; - trace_trans_traverse_all(trans, trace_ip); + trace_and_count(c, trans_traverse_all, trans, trace_ip); return ret; } @@ -1654,7 +1217,7 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans, u64 mask = ~(~0ULL << restart_probability_bits); if ((prandom_u32() & mask) == mask) { - trace_transaction_restart_injected(trans, _RET_IP_); + trace_and_count(trans->c, trans_restart_injected, trans, 
_RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); } } @@ -1956,7 +1519,6 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans, path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; - path->nodes_intent_locked = 0; btree_path_list_add(trans, pos, path); return path; @@ -2006,7 +1568,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->level = level; path->locks_want = locks_want; path->nodes_locked = 0; - path->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(path->l); i++) path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); #ifdef CONFIG_BCACHEFS_DEBUG @@ -2030,10 +1591,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, */ locks_want = min(locks_want, BTREE_MAX_DEPTH); - if (locks_want > path->locks_want) { - path->locks_want = locks_want; - btree_path_get_locks(trans, path, true); - } + if (locks_want > path->locks_want) + bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want); return path; } @@ -2166,7 +1725,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_trans_restart_relock_next_node(trans, _THIS_IP_, path); + trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); goto err; } @@ -3185,9 +2744,11 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { - size_t new_top = trans->mem_top + size; + unsigned new_top = trans->mem_top + size; void *p; + trans->mem_max = max(trans->mem_max, new_top); + if (new_top > trans->mem_bytes) { size_t old_bytes = trans->mem_bytes; size_t new_bytes = roundup_pow_of_two(new_top); @@ -3209,7 +2770,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trace_trans_restart_mem_realloced(trans, _RET_IP_, new_bytes); + trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } } @@ -3325,12 +2886,10 @@ static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct b return i; } -void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, - unsigned expected_nr_iters, - size_t expected_mem_bytes, - const char *fn) +void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn) __acquires(&c->btree_trans_barrier) { + struct btree_transaction_stats *s; struct btree_trans *pos; BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); @@ -3344,7 +2903,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, bch2_trans_alloc_paths(trans, c); - if (expected_mem_bytes) { + s = btree_trans_stats(trans); + if (s) { + unsigned expected_mem_bytes = s->max_mem; + trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL); @@ -3395,9 +2957,13 @@ void bch2_trans_exit(struct btree_trans *trans) { struct btree_insert_entry *i; struct bch_fs *c = trans->c; + struct btree_transaction_stats *s = btree_trans_stats(trans); bch2_trans_unlock(trans); + if (s) + s->max_mem = max(s->max_mem, trans->mem_max); + trans_for_each_update(trans, i) __btree_path_put(i->path, true); 
trans->nr_updates = 0; @@ -3444,12 +3010,23 @@ void bch2_trans_exit(struct btree_trans *trans) static void __maybe_unused bch2_btree_path_node_to_text(struct printbuf *out, - struct btree_bkey_cached_common *b, - bool cached) + struct btree_bkey_cached_common *b) { + struct six_lock_count c = six_lock_counts(&b->lock); + struct task_struct *owner; + pid_t pid; + + rcu_read_lock(); + owner = READ_ONCE(b->lock.owner); + pid = owner ? owner->pid : 0;; + rcu_read_unlock(); + prt_printf(out, " l=%u %s:", b->level, bch2_btree_ids[b->btree_id]); - bch2_bpos_to_text(out, btree_node_pos(b, cached)); + bch2_bpos_to_text(out, btree_node_pos(b)); + + prt_printf(out, " locks %u:%u:%u held by pid %u", + c.n[0], c.n[1], c.n[2], pid); } void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) @@ -3476,9 +3053,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) for (l = 0; l < BTREE_MAX_DEPTH; l++) { if (btree_node_locked(path, l) && !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { - prt_printf(out, " %s l=%u ", - btree_node_intent_locked(path, l) ? "i" : "r", l); - bch2_btree_path_node_to_text(out, b, path->cached); + prt_printf(out, " %c l=%u ", + lock_types[btree_node_locked_type(path, l)], l); + bch2_btree_path_node_to_text(out, b); prt_printf(out, "\n"); } } @@ -3496,7 +3073,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) bch2_bpos_to_text(out, trans->locking_pos); prt_printf(out, " node "); - bch2_btree_path_node_to_text(out, b, path->cached); + bch2_btree_path_node_to_text(out, b); prt_printf(out, "\n"); } } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 6ad28ff6..7b47b880 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -145,12 +145,10 @@ struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); -void bch2_trans_verify_locks(struct btree_trans *); void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos, bool); #else static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} -static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos, bool key_cache) {} #endif @@ -195,20 +193,6 @@ static inline int btree_trans_restart(struct btree_trans *trans, int err) bool bch2_btree_node_upgrade(struct btree_trans *, struct btree_path *, unsigned); -bool __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned); - -static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - - return path->locks_want < new_locks_want - ? 
__bch2_btree_path_upgrade(trans, path, new_locks_want) - : path->uptodate == BTREE_ITER_UPTODATE; -} - void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); static inline void bch2_btree_path_downgrade(struct btree_trans *trans, @@ -367,8 +351,8 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) { - trace_trans_restart_too_many_iters(trans, _THIS_IP_); + if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) { + trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); } @@ -544,11 +528,10 @@ void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); -void __bch2_trans_init(struct btree_trans *, struct bch_fs *, - unsigned, size_t, const char *); +void __bch2_trans_init(struct btree_trans *, struct bch_fs *, const char *); void bch2_trans_exit(struct btree_trans *); -#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__) +#define bch2_trans_init(_trans, _c, _nr_iters, _mem) __bch2_trans_init(_trans, _c, __func__) void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 38b16f95..d900ff42 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -13,6 +13,11 @@ #include #include +static inline bool btree_uses_pcpu_readers(enum btree_id id) +{ + return id == BTREE_ID_subvolumes; +} + static struct kmem_cache *bch2_key_cache; static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, @@ -84,7 +89,10 @@ static void bkey_cached_free(struct btree_key_cache *bc, ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - list_move_tail(&ck->list, &bc->freed); + if (ck->c.lock.readers) + list_move_tail(&ck->list, &bc->freed_pcpu); + else + list_move_tail(&ck->list, &bc->freed_nonpcpu); atomic_long_inc(&bc->nr_freed); kfree(ck->k); @@ -95,15 +103,51 @@ static void bkey_cached_free(struct btree_key_cache *bc, six_unlock_intent(&ck->c.lock); } -static void bkey_cached_free_fast(struct btree_key_cache *bc, - struct bkey_cached *ck) +static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, + struct bkey_cached *ck) { - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct btree_key_cache_freelist *f; bool freed = false; BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + if (!ck->c.lock.readers) { + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + if (f->nr < ARRAY_SIZE(f->objs)) { + f->objs[f->nr++] = ck; + freed = true; + } + preempt_enable(); + + if (!freed) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (f->nr > ARRAY_SIZE(f->objs) / 2) { + struct bkey_cached *ck2 = f->objs[--f->nr]; + + list_move_tail(&ck2->list, &bc->freed_nonpcpu); + } + preempt_enable(); + + list_move_tail(&ck->list, &bc->freed_nonpcpu); + mutex_unlock(&bc->lock); + } + } else { + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_pcpu); + mutex_unlock(&bc->lock); + } +} + +static void bkey_cached_free_fast(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + 
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); @@ -114,74 +158,84 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc, ck->k = NULL; ck->u64s = 0; - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - if (f->nr < ARRAY_SIZE(f->objs)) { - f->objs[f->nr++] = ck; - freed = true; - } - preempt_enable(); - - if (!freed) { - mutex_lock(&bc->lock); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - while (f->nr > ARRAY_SIZE(f->objs) / 2) { - struct bkey_cached *ck2 = f->objs[--f->nr]; - - list_move_tail(&ck2->list, &bc->freed); - } - preempt_enable(); - - list_move_tail(&ck->list, &bc->freed); - mutex_unlock(&bc->lock); - } + bkey_cached_move_to_freelist(bc, ck); six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); } static struct bkey_cached * -bkey_cached_alloc(struct btree_key_cache *c) +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) { + struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = NULL; struct btree_key_cache_freelist *f; + bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); - preempt_disable(); - f = this_cpu_ptr(c->pcpu_freed); - if (f->nr) - ck = f->objs[--f->nr]; - preempt_enable(); - - if (!ck) { - mutex_lock(&c->lock); + if (!pcpu_readers) { preempt_disable(); - f = this_cpu_ptr(c->pcpu_freed); + f = this_cpu_ptr(bc->pcpu_freed); + if (f->nr) + ck = f->objs[--f->nr]; + preempt_enable(); + + if (!ck) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (!list_empty(&bc->freed_nonpcpu) && + f->nr < ARRAY_SIZE(f->objs) / 2) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); + list_del_init(&ck->list); + f->objs[f->nr++] = ck; + } - while (!list_empty(&c->freed) && - f->nr < ARRAY_SIZE(f->objs) / 2) { - ck = list_last_entry(&c->freed, struct bkey_cached, list); + ck = f->nr ? f->objs[--f->nr] : NULL; + preempt_enable(); + mutex_unlock(&bc->lock); + } + } else { + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_pcpu)) { + ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); list_del_init(&ck->list); - f->objs[f->nr++] = ck; } - - ck = f->nr ? 
f->objs[--f->nr] : NULL; - preempt_enable(); - mutex_unlock(&c->lock); + mutex_unlock(&bc->lock); } if (ck) { - six_lock_intent(&ck->c.lock, NULL, NULL); - six_lock_write(&ck->c.lock, NULL, NULL); + int ret; + + ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent); + if (unlikely(ret)) { + bkey_cached_move_to_freelist(bc, ck); + return ERR_PTR(ret); + } + + path->l[0].b = (void *) ck; + path->l[0].lock_seq = ck->c.lock.state.seq; + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + + ret = bch2_btree_node_lock_write(trans, path, &ck->c); + if (unlikely(ret)) { + btree_node_unlock(trans, path, 0); + bkey_cached_move_to_freelist(bc, ck); + return ERR_PTR(ret); + } + return ck; } ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); if (likely(ck)) { INIT_LIST_HEAD(&ck->list); - six_lock_init(&ck->c.lock); + __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key); + if (pcpu_readers) + six_lock_pcpu_alloc(&ck->c.lock); + + ck->c.cached = true; BUG_ON(!six_trylock_intent(&ck->c.lock)); BUG_ON(!six_trylock_write(&ck->c.lock)); return ck; @@ -215,36 +269,36 @@ bkey_cached_reuse(struct btree_key_cache *c) } static struct bkey_cached * -btree_key_cache_create(struct bch_fs *c, - enum btree_id btree_id, - struct bpos pos) +btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) { + struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck; bool was_new = true; - ck = bkey_cached_alloc(bc); + ck = bkey_cached_alloc(trans, path); + if (unlikely(IS_ERR(ck))) + return ck; if (unlikely(!ck)) { ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_ids[btree_id]); + bch2_btree_ids[path->btree_id]); return ERR_PTR(-ENOMEM); } + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); was_new = false; } else { - if (btree_id == BTREE_ID_subvolumes) + if (path->btree_id == BTREE_ID_subvolumes) six_lock_pcpu_alloc(&ck->c.lock); - else - six_lock_pcpu_free(&ck->c.lock); } ck->c.level = 0; - ck->c.btree_id = btree_id; - ck->key.btree_id = btree_id; - ck->key.pos = pos; + ck->c.btree_id = path->btree_id; + ck->key.btree_id = path->btree_id; + ck->key.pos = path->pos; ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; @@ -256,6 +310,7 @@ btree_key_cache_create(struct bch_fs *c, if (likely(was_new)) { six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); kfree(ck); } else { bkey_cached_free_fast(bc, ck); @@ -291,7 +346,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, k = bch2_btree_path_peek_slot(path, &u); if (!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_trans_restart_relock_key_cache_fill(trans, _THIS_IP_, ck_path); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); goto err; } @@ -320,11 +375,12 @@ static int btree_key_cache_fill(struct btree_trans *trans, } } - /* - * XXX: not allowed to be holding read locks when we take a write lock, - * currently - */ - bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); + ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); + if (ret) { + kfree(new_k); + goto err; + } + if (new_k) { kfree(ck->k); ck->u64s = new_u64s; @@ -372,7 +428,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path retry: ck = bch2_btree_key_cache_find(c, 
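
The key cache free path above now splits freed entries between freed_pcpu and freed_nonpcpu, and batches the non-pcpu case through a small per-CPU array that spills half of its entries to the shared list when it fills. A minimal userspace sketch of that spill-half-on-overflow pattern follows; the sizes, names and pthread locking are illustrative stand-ins, not the kernel code.

#include <pthread.h>
#include <stdio.h>

#define PCPU_NR 16				/* illustrative per-CPU cache size */

struct obj { int id; };

struct pcpu_cache {
	struct obj *objs[PCPU_NR];
	unsigned    nr;
};

/* shared overflow list, guarded by a lock (stands in for bc->freed_nonpcpu) */
static struct obj     *shared[1024];		/* toy: no overflow checking */
static unsigned        shared_nr;
static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;

/* free fast path: local array first, spill half of it on overflow */
static void obj_free(struct pcpu_cache *f, struct obj *o)
{
	if (f->nr < PCPU_NR) {
		f->objs[f->nr++] = o;
		return;
	}

	pthread_mutex_lock(&shared_lock);
	while (f->nr > PCPU_NR / 2)		/* move the older half out */
		shared[shared_nr++] = f->objs[--f->nr];
	shared[shared_nr++] = o;
	pthread_mutex_unlock(&shared_lock);
}

/* alloc fast path: local array first, refill from the shared list if empty */
static struct obj *obj_alloc(struct pcpu_cache *f)
{
	if (!f->nr) {
		pthread_mutex_lock(&shared_lock);
		while (shared_nr && f->nr < PCPU_NR / 2)
			f->objs[f->nr++] = shared[--shared_nr];
		pthread_mutex_unlock(&shared_lock);
	}
	return f->nr ? f->objs[--f->nr] : NULL;
}

int main(void)
{
	static struct obj pool[64];
	struct pcpu_cache cache = { .nr = 0 };

	for (int i = 0; i < 64; i++) {
		pool[i].id = i;
		obj_free(&cache, &pool[i]);
	}
	printf("local %u, shared %u\n", cache.nr, shared_nr);
	return obj_alloc(&cache) ? 0 : 1;
}
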
path->btree_id, path->pos); if (!ck) { - ck = btree_key_cache_create(c, path->btree_id, path->pos); + ck = btree_key_cache_create(trans, path); ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; @@ -414,7 +470,7 @@ fill: */ if (!path->locks_want && !__bch2_btree_path_upgrade(trans, path, 1)) { - trace_transaction_restart_key_cache_upgrade(trans, _THIS_IP_); + trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); goto err; } @@ -518,21 +574,21 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, atomic_long_dec(&c->btree_key_cache.nr_dirty); } } else { + struct btree_path *path2; evict: - BUG_ON(!btree_node_intent_locked(c_iter.path, 0)); - - mark_btree_node_unlocked(c_iter.path, 0); - c_iter.path->l[0].b = NULL; + trans_for_each_path(trans, path2) + if (path2 != c_iter.path) + __bch2_btree_path_unlock(trans, path2); - six_lock_write(&ck->c.lock, NULL, NULL); + bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_dec(&c->btree_key_cache.nr_dirty); } + mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED); bkey_cached_evict(&c->btree_key_cache, ck); - bkey_cached_free_fast(&c->btree_key_cache, ck); } out: @@ -548,11 +604,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; + struct btree_trans trans; + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); int ret = 0; - int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + bch2_trans_init(&trans, c, 0, 0); - six_lock_read(&ck->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read); key = ck->key; if (ck->journal.seq != seq || @@ -562,12 +620,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, } six_unlock_read(&ck->c.lock); - ret = bch2_trans_do(c, NULL, NULL, 0, + ret = commit_do(&trans, NULL, NULL, 0, btree_key_cache_flush_pos(&trans, key, seq, BTREE_INSERT_JOURNAL_RECLAIM, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + bch2_trans_exit(&trans); return ret; } @@ -674,12 +733,29 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, * Newest freed entries are at the end of the list - once we hit one * that's too new to be freed, we can bail out: */ - list_for_each_entry_safe(ck, t, &bc->freed, list) { + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) break; list_del(&ck->list); + six_lock_pcpu_free(&ck->c.lock); + kmem_cache_free(bch2_key_cache, ck); + atomic_long_dec(&bc->nr_freed); + scanned++; + freed++; + } + + if (scanned >= nr) + goto out; + + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; + + list_del(&ck->list); + six_lock_pcpu_free(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); scanned++; @@ -767,7 +843,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) for (i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { bkey_cached_evict(bc, ck); - list_add(&ck->list, &bc->freed); + list_add(&ck->list, &bc->freed_nonpcpu); } rcu_read_unlock(); @@ -777,11 +853,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) 
for (i = 0; i < f->nr; i++) { ck = f->objs[i]; - list_add(&ck->list, &bc->freed); + list_add(&ck->list, &bc->freed_nonpcpu); } } - list_for_each_entry_safe(ck, n, &bc->freed, list) { + list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); + + list_for_each_entry_safe(ck, n, &bc->freed_nonpcpu, list) { cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); @@ -789,6 +867,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_del(&ck->list); kfree(ck->k); + six_lock_pcpu_free(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); } @@ -808,7 +887,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { mutex_init(&c->lock); - INIT_LIST_HEAD(&c->freed); + INIT_LIST_HEAD(&c->freed_pcpu); + INIT_LIST_HEAD(&c->freed_nonpcpu); } static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c new file mode 100644 index 00000000..1cdf7d4f --- /dev/null +++ b/libbcachefs/btree_locking.c @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_locking.h" +#include "btree_types.h" + +struct lock_class_key bch2_btree_node_lock_key; + +/* Btree node locking: */ + +static inline void six_lock_readers_add(struct six_lock *lock, int nr) +{ + if (lock->readers) + this_cpu_add(*lock->readers, nr); + else if (nr > 0) + atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); + else + atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter); +} + +struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, + struct btree_path *skip, + struct btree_bkey_cached_common *b, + unsigned level) +{ + struct btree_path *path; + struct six_lock_count ret; + + memset(&ret, 0, sizeof(ret)); + + if (IS_ERR_OR_NULL(b)) + return ret; + + trans_for_each_path(trans, path) + if (path != skip && &path->l[level].b->c == b) { + int t = btree_node_locked_type(path, level); + + if (t != BTREE_NODE_UNLOCKED) + ret.n[t]++; + } + + return ret; +} + +/* unlock */ + +void bch2_btree_node_unlock_write(struct btree_trans *trans, + struct btree_path *path, struct btree *b) +{ + bch2_btree_node_unlock_write_inlined(trans, path, b); +} + +/* lock */ + +void __bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_bkey_cached_common *b) +{ + int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; + + /* + * Must drop our read locks before calling six_lock_write() - + * six_unlock() won't do wakeups until the reader count + * goes to 0, and it's safe because we have the node intent + * locked: + */ + six_lock_readers_add(&b->lock, -readers); + btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write); + six_lock_readers_add(&b->lock, readers); +} + +static inline bool path_has_read_locks(struct btree_path *path) +{ + unsigned l; + + for (l = 0; l < BTREE_MAX_DEPTH; l++) + if (btree_node_read_locked(path, l)) + return true; + return false; +} + +/* Slowpath: */ +int __bch2_btree_node_lock(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b, + struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + struct btree_path *linked; + unsigned reason; + + /* Check if it's safe to block: */ + trans_for_each_path(trans, linked) { + if (!linked->nodes_locked) + continue; + + /* + * Can't block taking an intent lock if we have _any_ 
nodes read + * locked: + * + * - Our read lock blocks another thread with an intent lock on + * the same node from getting a write lock, and thus from + * dropping its intent lock + * + * - And the other thread may have multiple nodes intent locked: + * both the node we want to intent lock, and the node we + * already have read locked - deadlock: + */ + if (type == SIX_LOCK_intent && + path_has_read_locks(linked)) { + reason = 1; + goto deadlock; + } + + if (linked->btree_id != path->btree_id) { + if (linked->btree_id < path->btree_id) + continue; + + reason = 3; + goto deadlock; + } + + /* + * Within the same btree, non-cached paths come before cached + * paths: + */ + if (linked->cached != path->cached) { + if (!linked->cached) + continue; + + reason = 4; + goto deadlock; + } + + /* + * Interior nodes must be locked before their descendants: if + * another path has possible descendants locked of the node + * we're about to lock, it must have the ancestors locked too: + */ + if (level > btree_path_highest_level_locked(linked)) { + reason = 5; + goto deadlock; + } + + /* Must lock btree nodes in key order: */ + if (btree_node_locked(linked, level) && + bpos_cmp(pos, btree_node_pos(&linked->l[level].b->c)) <= 0) { + reason = 7; + goto deadlock; + } + } + + return btree_node_lock_type(trans, path, b, pos, level, + type, should_sleep_fn, p); +deadlock: + trace_and_count(trans->c, trans_restart_would_deadlock, trans, ip, reason, linked, path, &pos); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); +} + +/* relock */ + +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade) +{ + unsigned l = path->level; + int fail_idx = -1; + + do { + if (!btree_path_node(path, l)) + break; + + if (!(upgrade + ? bch2_btree_node_upgrade(trans, path, l) + : bch2_btree_node_relock(trans, path, l))) + fail_idx = l; + + l++; + } while (l < path->locks_want); + + /* + * When we fail to get a lock, we have to ensure that any child nodes + * can't be relocked so bch2_btree_path_traverse has to walk back up to + * the node that we failed to relock: + */ + if (fail_idx >= 0) { + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + + do { + path->l[fail_idx].b = upgrade + ? 
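
__bch2_btree_node_lock() only blocks when doing so cannot deadlock against locks the transaction already holds; otherwise it restarts the transaction and records a reason code. A condensed sketch of those ordering rules as a standalone predicate, with a toy struct standing in for struct btree_path and the same reason numbering:

#include <stdbool.h>

enum lock_type { LOCK_READ, LOCK_INTENT };

/* a stripped-down stand-in for the locks one btree_path already holds */
struct held_locks {
	unsigned btree_id;
	bool     cached;
	bool     has_read_locks;
	int      highest_level_locked;	/* -1 if nothing locked */
	unsigned long long pos_locked;	/* key position locked at @level */
};

/*
 * 0 means it is safe to block taking (type, level, pos); a nonzero value
 * mirrors the "reason" codes used when restarting the transaction instead.
 */
static int would_deadlock(const struct held_locks *held,
			  unsigned want_btree_id, bool want_cached,
			  enum lock_type type, unsigned level,
			  unsigned long long pos)
{
	if (held->highest_level_locked < 0)
		return 0;			/* nothing held, always safe */

	if (type == LOCK_INTENT && held->has_read_locks)
		return 1;			/* our read lock could block the intent holder */

	if (held->btree_id != want_btree_id)	/* btrees in a fixed order */
		return held->btree_id < want_btree_id ? 0 : 3;

	if (held->cached != want_cached)	/* non-cached before cached */
		return !held->cached ? 0 : 4;

	if ((int) level > held->highest_level_locked)
		return 5;			/* ancestors before descendants */

	if (pos <= held->pos_locked)
		return 7;			/* nodes within a level in key order */

	return 0;
}
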
ERR_PTR(-BCH_ERR_no_btree_node_upgrade) + : ERR_PTR(-BCH_ERR_no_btree_node_relock); + --fail_idx; + } while (fail_idx >= 0); + } + + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; + + bch2_trans_verify_locks(trans); + + return path->uptodate < BTREE_ITER_NEED_RELOCK; +} + +bool __bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + struct btree *b = btree_path_node(path, level); + int want = __btree_lock_want(path, level); + + if (race_fault()) + goto fail; + + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, &b->c, level, want))) { + mark_btree_node_locked(trans, path, level, want); + return true; + } +fail: + trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); + return false; +} + +/* upgrade */ + +bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + struct btree *b = path->l[level].b; + + if (!is_btree_node(path, level)) + return false; + + switch (btree_lock_want(path, level)) { + case BTREE_NODE_UNLOCKED: + BUG_ON(btree_node_locked(path, level)); + return true; + case BTREE_NODE_READ_LOCKED: + BUG_ON(btree_node_intent_locked(path, level)); + return bch2_btree_node_relock(trans, path, level); + case BTREE_NODE_INTENT_LOCKED: + break; + case BTREE_NODE_WRITE_LOCKED: + BUG(); + } + + if (btree_node_intent_locked(path, level)) + return true; + + if (race_fault()) + return false; + + if (btree_node_locked(path, level) + ? six_lock_tryupgrade(&b->c.lock) + : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) + goto success; + + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(trans, path, level); + goto success; + } + + trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); + return false; +success: + mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); + return true; +} + +/* Btree path locking: */ + +/* + * Only for btree_cache.c - only relocks intent locks + */ +int bch2_btree_path_relock_intent(struct btree_trans *trans, + struct btree_path *path) +{ + unsigned l; + + for (l = path->level; + l < path->locks_want && btree_path_node(path, l); + l++) { + if (!bch2_btree_node_relock(trans, path, l)) { + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); + } + } + + return 0; +} + +__flatten +bool bch2_btree_path_relock_norestart(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_path_get_locks(trans, path, false); +} + +__flatten +bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_path_get_locks(trans, path, true); +} + +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + EBUG_ON(path->locks_want >= new_locks_want); + + path->locks_want = new_locks_want; + + return btree_path_get_locks(trans, path, true); +} + +bool __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + struct btree_path *linked; + + if 
(bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want)) + return true; + + /* + * XXX: this is ugly - we'd prefer to not be mucking with other + * iterators in the btree_trans here. + * + * On failure to upgrade the iterator, setting iter->locks_want and + * calling get_locks() is sufficient to make bch2_btree_path_traverse() + * get the locks we want on transaction restart. + * + * But if this iterator was a clone, on transaction restart what we did + * to this iterator isn't going to be preserved. + * + * Possibly we could add an iterator field for the parent iterator when + * an iterator is a copy - for now, we'll just upgrade any other + * iterators with the same btree id. + * + * The code below used to be needed to ensure ancestor nodes get locked + * before interior nodes - now that's handled by + * bch2_btree_path_traverse_all(). + */ + if (!path->cached && !trans->in_traverse_all) + trans_for_each_path(trans, linked) + if (linked != path && + linked->cached == path->cached && + linked->btree_id == path->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; + btree_path_get_locks(trans, linked, true); + } + + return false; +} + +void __bch2_btree_path_downgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + unsigned l; + + EBUG_ON(path->locks_want < new_locks_want); + + path->locks_want = new_locks_want; + + while (path->nodes_locked && + (l = btree_path_highest_level_locked(path)) >= path->locks_want) { + if (l > path->level) { + btree_node_unlock(trans, path, l); + } else { + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); + mark_btree_node_locked_noreset(path, l, SIX_LOCK_read); + } + break; + } + } + + bch2_btree_path_verify_locks(path); +} + +/* Btree transaction locking: */ + +void bch2_trans_downgrade(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_downgrade(trans, path); +} + +int bch2_trans_relock(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) + return - ((int) trans->restarted); + + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + } + return 0; +} + +void bch2_trans_unlock(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(trans, path); + + /* + * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking + * btree nodes, it implements its own walking: + */ + BUG_ON(!trans->is_initial_gc && + lock_class_is_held(&bch2_btree_node_lock_key)); +} + +/* Debug */ + +#ifdef CONFIG_BCACHEFS_DEBUG + +void bch2_btree_path_verify_locks(struct btree_path *path) +{ + unsigned l; + + if (!path->nodes_locked) { + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level)); + return; + } + + for (l = 0; l < BTREE_MAX_DEPTH; l++) { + int want = btree_lock_want(path, l); + int have = btree_node_locked_type(path, l); + + BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); + + BUG_ON(is_btree_node(path, l) && + (want == BTREE_NODE_UNLOCKED || + have != BTREE_NODE_WRITE_LOCKED) && + want != have); + } +} + +void bch2_trans_verify_locks(struct btree_trans *trans) +{ + struct btree_path *path; + + 
trans_for_each_path(trans, path) + bch2_btree_path_verify_locks(path); +} + +#endif diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 205c6b59..3bc490bc 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -14,68 +14,71 @@ #include "btree_iter.h" +extern struct lock_class_key bch2_btree_node_lock_key; + static inline bool is_btree_node(struct btree_path *path, unsigned l) { return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); } +static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) +{ + return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) + ? &trans->c->btree_transaction_stats[trans->fn_idx] + : NULL; +} + /* matches six lock types */ enum btree_node_locked_type { BTREE_NODE_UNLOCKED = -1, BTREE_NODE_READ_LOCKED = SIX_LOCK_read, BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, + BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write, }; static inline int btree_node_locked_type(struct btree_path *path, unsigned level) { - /* - * We're relying on the fact that if nodes_intent_locked is set - * nodes_locked must be set as well, so that we can compute without - * branches: - */ - return BTREE_NODE_UNLOCKED + - ((path->nodes_locked >> level) & 1) + - ((path->nodes_intent_locked >> level) & 1); + return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); } -static inline bool btree_node_intent_locked(struct btree_path *path, - unsigned level) +static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) { - return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED; + return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; } -static inline bool btree_node_read_locked(struct btree_path *path, - unsigned level) +static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l) { - return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED; + return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED; } -static inline bool btree_node_locked(struct btree_path *path, unsigned level) +static inline bool btree_node_read_locked(struct btree_path *path, unsigned l) { - return path->nodes_locked & (1 << level); + return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED; } -static inline void mark_btree_node_unlocked(struct btree_path *path, - unsigned level) +static inline bool btree_node_locked(struct btree_path *path, unsigned level) { - path->nodes_locked &= ~(1 << level); - path->nodes_intent_locked &= ~(1 << level); + return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED; } -static inline void mark_btree_node_locked_noreset(struct btree_trans *trans, - struct btree_path *path, - unsigned level, - enum six_lock_type type) +static inline void mark_btree_node_locked_noreset(struct btree_path *path, + unsigned level, + enum btree_node_locked_type type) { /* relying on this to avoid a branch */ BUILD_BUG_ON(SIX_LOCK_read != 0); BUILD_BUG_ON(SIX_LOCK_intent != 1); - BUG_ON(trans->in_traverse_all && path->sorted_idx > trans->traverse_all_idx); + path->nodes_locked &= ~(3U << (level << 1)); + path->nodes_locked |= (type + 1) << (level << 1); +} - path->nodes_locked |= 1 << level; - path->nodes_intent_locked |= type << level; +static inline void mark_btree_node_unlocked(struct btree_path *path, + unsigned level) +{ + EBUG_ON(btree_node_write_locked(path, level)); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); } static inline void mark_btree_node_locked(struct btree_trans *trans, @@ -83,19 
+86,12 @@ static inline void mark_btree_node_locked(struct btree_trans *trans, unsigned level, enum six_lock_type type) { - mark_btree_node_locked_noreset(trans, path, level, type); + mark_btree_node_locked_noreset(path, level, type); #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS path->l[level].lock_taken_time = ktime_get_ns(); #endif } -static inline void mark_btree_node_intent_locked(struct btree_trans *trans, - struct btree_path *path, - unsigned level) -{ - mark_btree_node_locked_noreset(trans, path, level, SIX_LOCK_intent); -} - static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) { return level < path->locks_want @@ -115,13 +111,6 @@ btree_lock_want(struct btree_path *path, int level) return BTREE_NODE_UNLOCKED; } -static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) -{ - return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) - ? &trans->c->btree_transaction_stats[trans->fn_idx] - : NULL; -} - static void btree_trans_lock_hold_time_update(struct btree_trans *trans, struct btree_path *path, unsigned level) { @@ -135,6 +124,8 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, #endif } +/* unlock: */ + static inline void btree_node_unlock(struct btree_trans *trans, struct btree_path *path, unsigned level) { @@ -149,59 +140,91 @@ static inline void btree_node_unlock(struct btree_trans *trans, mark_btree_node_unlocked(path, level); } +static inline int btree_path_lowest_level_locked(struct btree_path *path) +{ + return __ffs(path->nodes_locked) >> 1; +} + +static inline int btree_path_highest_level_locked(struct btree_path *path) +{ + return __fls(path->nodes_locked) >> 1; +} + static inline void __bch2_btree_path_unlock(struct btree_trans *trans, struct btree_path *path) { btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); while (path->nodes_locked) - btree_node_unlock(trans, path, __ffs(path->nodes_locked)); + btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); } -static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) +/* + * Updates the saved lock sequence number, so that bch2_btree_node_relock() will + * succeed: + */ +static inline void +bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, + struct btree *b) { - switch (type) { - case SIX_LOCK_read: - return BCH_TIME_btree_lock_contended_read; - case SIX_LOCK_intent: - return BCH_TIME_btree_lock_contended_intent; - case SIX_LOCK_write: - return BCH_TIME_btree_lock_contended_write; - default: - BUG(); - } + struct btree_path *linked; + + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); + + mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); + + trans_for_each_path_with_node(trans, b, linked) + linked->l[b->c.level].lock_seq += 2; + + six_unlock_write(&b->c.lock); +} + +void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_path *, struct btree *); + +/* lock: */ + +static inline int __must_check +btree_node_lock_nopath(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type) +{ + six_lock_type(&b->lock, type, NULL, NULL); + return 0; +} + +static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type) +{ + int ret = btree_node_lock_nopath(trans, b, type); + + BUG_ON(ret); } static inline int 
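
The old nodes_locked/nodes_intent_locked bitmasks are merged into one byte with two bits per level, which also lets a path record a held write lock. A self-contained sketch of the encoding and of the __ffs()/__fls()-style level helpers, using compiler builtins in place of the kernel bitops:

#include <stdio.h>

enum node_locked_type {		/* matches the six lock types, plus "unlocked" */
	NODE_UNLOCKED	= -1,
	NODE_READ	= 0,
	NODE_INTENT	= 1,
	NODE_WRITE	= 2,
};

/* two bits per level: 0 = unlocked, otherwise lock type + 1 */
static inline int node_locked_type(unsigned char nodes_locked, unsigned level)
{
	return NODE_UNLOCKED + ((nodes_locked >> (level << 1)) & 3);
}

static inline unsigned char mark_locked(unsigned char nodes_locked,
					unsigned level,
					enum node_locked_type type)
{
	nodes_locked &= ~(3U << (level << 1));
	nodes_locked |= (type + 1) << (level << 1);
	return nodes_locked;
}

/* callers must hold at least one lock, as with the kernel helpers */
static inline int lowest_level_locked(unsigned char nodes_locked)
{
	return __builtin_ctz(nodes_locked) >> 1;	/* like __ffs() >> 1 */
}

static inline int highest_level_locked(unsigned char nodes_locked)
{
	return (31 - __builtin_clz(nodes_locked)) >> 1;	/* like __fls() >> 1 */
}

int main(void)
{
	unsigned char locked = 0;

	locked = mark_locked(locked, 0, NODE_INTENT);
	locked = mark_locked(locked, 1, NODE_READ);
	locked = mark_locked(locked, 2, NODE_WRITE);

	printf("level 2: %d, lowest %d, highest %d\n",
	       node_locked_type(locked, 2),
	       lowest_level_locked(locked),
	       highest_level_locked(locked));
	return 0;
}
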
btree_node_lock_type(struct btree_trans *trans, struct btree_path *path, - struct btree *b, + struct btree_bkey_cached_common *b, struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p) { - struct bch_fs *c = trans->c; - u64 start_time; int ret; - if (six_trylock_type(&b->c.lock, type)) + if (six_trylock_type(&b->lock, type)) return 0; - start_time = local_clock(); - trans->locking_path_idx = path->idx; trans->locking_pos = pos; trans->locking_btree_id = path->btree_id; trans->locking_level = level; trans->locking_lock_type = type; - trans->locking = &b->c; - ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p); + trans->locking = b; + ret = six_lock_type(&b->lock, type, should_sleep_fn, p); trans->locking = NULL; - - if (ret) - return ret; - - bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); - return 0; + return ret; } /* @@ -209,15 +232,16 @@ static inline int btree_node_lock_type(struct btree_trans *trans, * iterators: */ static inline bool btree_node_lock_increment(struct btree_trans *trans, - struct btree *b, unsigned level, + struct btree_bkey_cached_common *b, + unsigned level, enum btree_node_locked_type want) { struct btree_path *path; trans_for_each_path(trans, path) - if (path->l[level].b == b && + if (&path->l[level].b->c == b && btree_node_locked_type(path, level) >= want) { - six_lock_increment(&b->c.lock, want); + six_lock_increment(&b->lock, want); return true; } @@ -225,14 +249,16 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, } int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, - struct btree *, struct bpos, unsigned, + struct btree_bkey_cached_common *, + struct bpos, unsigned, enum six_lock_type, six_lock_should_sleep_fn, void *, unsigned long); static inline int btree_node_lock(struct btree_trans *trans, struct btree_path *path, - struct btree *b, struct bpos pos, unsigned level, + struct btree_bkey_cached_common *b, + struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) @@ -242,67 +268,97 @@ static inline int btree_node_lock(struct btree_trans *trans, EBUG_ON(level >= BTREE_MAX_DEPTH); EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); - if (likely(six_trylock_type(&b->c.lock, type)) || + if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, type) || !(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type, should_sleep_fn, p, ip))) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[b->c.level].lock_taken_time = ktime_get_ns(); + path->l[b->level].lock_taken_time = ktime_get_ns(); #endif } return ret; } +void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *); + +static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + EBUG_ON(&path->l[b->level].b->c != b); + EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq); + EBUG_ON(!btree_node_intent_locked(path, b->level)); + + /* + * six locks are unfair, and read locks block while a thread wants a + * write lock: thus, we need to tell the cycle detector we have a write + * lock _before_ taking the lock: + */ + mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); + + if (unlikely(!six_trylock_write(&b->lock))) + __bch2_btree_node_lock_write(trans, b); +} + +static inline int __must_check +bch2_btree_node_lock_write(struct btree_trans 
*trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + bch2_btree_node_lock_write_nofail(trans, path, b); + return 0; +} + +/* relock: */ + +bool bch2_btree_path_relock_norestart(struct btree_trans *, + struct btree_path *, unsigned long); bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); static inline bool bch2_btree_node_relock(struct btree_trans *trans, struct btree_path *path, unsigned level) { EBUG_ON(btree_node_locked(path, level) && - btree_node_locked_type(path, level) != - __btree_lock_want(path, level)); + !btree_node_write_locked(path, level) && + btree_node_locked_type(path, level) != __btree_lock_want(path, level)); return likely(btree_node_locked(path, level)) || - __bch2_btree_node_relock(trans, path, level); + (!IS_ERR_OR_NULL(path->l[level].b) && + __bch2_btree_node_relock(trans, path, level)); } -/* - * Updates the saved lock sequence number, so that bch2_btree_node_relock() will - * succeed: - */ -static inline void -bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, - struct btree *b) +static inline int bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) { - struct btree_path *linked; - - EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); - - trans_for_each_path_with_node(trans, b, linked) - linked->l[b->c.level].lock_seq += 2; + if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); + } - six_unlock_write(&b->c.lock); + return 0; } -void bch2_btree_node_unlock_write(struct btree_trans *, - struct btree_path *, struct btree *); +/* upgrade */ -void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, + struct btree_path *, unsigned); +bool __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); -static inline void bch2_btree_node_lock_write(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) { - EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq); - EBUG_ON(!btree_node_intent_locked(path, b->c.level)); + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - if (unlikely(!six_trylock_write(&b->c.lock))) - __bch2_btree_node_lock_write(trans, b); + return path->locks_want < new_locks_want + ? 
__bch2_btree_path_upgrade(trans, path, new_locks_want) + : path->uptodate == BTREE_ITER_UPTODATE; } +/* misc: */ + static inline void btree_path_set_should_be_locked(struct btree_path *path) { EBUG_ON(!btree_node_locked(path, path->level)); @@ -326,7 +382,20 @@ static inline void btree_path_set_level_up(struct btree_trans *trans, btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } +/* debug */ + struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, - struct btree_path *, struct btree *, unsigned); + struct btree_path *, + struct btree_bkey_cached_common *b, + unsigned); + + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_path_verify_locks(struct btree_path *); +void bch2_trans_verify_locks(struct btree_trans *); +#else +static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} +static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} +#endif #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 21d76181..7c016637 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -63,6 +63,7 @@ struct btree_bkey_cached_common { struct six_lock lock; u8 level; u8 btree_id; + bool cached; }; struct btree { @@ -232,9 +233,8 @@ struct btree_path { */ bool should_be_locked:1; unsigned level:3, - locks_want:4, - nodes_locked:4, - nodes_intent_locked:4; + locks_want:4; + u8 nodes_locked; struct btree_path_level { struct btree *b; @@ -302,7 +302,8 @@ struct btree_key_cache { struct mutex lock; struct rhashtable table; bool table_init_done; - struct list_head freed; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; struct shrinker shrink; unsigned shrink_iter; struct btree_key_cache_freelist __percpu *pcpu_freed; @@ -338,6 +339,13 @@ struct bkey_cached { struct bkey_i *k; }; +static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) +{ + return !b->cached + ? 
container_of(b, struct btree, c)->key.k.p + : container_of(b, struct bkey_cached, c)->key.pos; +} + struct btree_insert_entry { unsigned flags; u8 bkey_type; @@ -413,6 +421,7 @@ struct btree_trans { u64 paths_allocated; unsigned mem_top; + unsigned mem_max; unsigned mem_bytes; void *mem; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 0409737f..d31c6eeb 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -143,7 +143,7 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, static void __btree_node_free(struct bch_fs *c, struct btree *b) { - trace_btree_node_free(c, b); + trace_and_count(c, btree_node_free, c, b); BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_need_write(b)); @@ -160,22 +160,23 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) } static void bch2_btree_node_free_inmem(struct btree_trans *trans, + struct btree_path *path, struct btree *b) { struct bch_fs *c = trans->c; - struct btree_path *path; - - trans_for_each_path(trans, path) - BUG_ON(path->l[b->c.level].b == b && - path->l[b->c.level].lock_seq == b->c.lock.state.seq); - - six_lock_write(&b->c.lock, NULL, NULL); + unsigned level = b->c.level; + bch2_btree_node_lock_write_nofail(trans, path, &b->c); bch2_btree_node_hash_remove(&c->btree_cache, b); __btree_node_free(c, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); + mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); + + trans_for_each_path(trans, path) + if (path->l[level].b == b) { + btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } } static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, @@ -258,7 +259,9 @@ mem_alloc: return b; } -static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) +static struct btree *bch2_btree_node_alloc(struct btree_update *as, + struct btree_trans *trans, + unsigned level) { struct bch_fs *c = as->c; struct btree *b; @@ -270,8 +273,8 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b = p->b[--p->nr]; - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); set_btree_node_accessed(b); set_btree_node_dirty_acct(c, b); @@ -304,7 +307,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); BUG_ON(ret); - trace_btree_node_alloc(c, b); + trace_and_count(c, btree_node_alloc, c, b); return b; } @@ -322,12 +325,13 @@ static void btree_set_max(struct btree *b, struct bpos pos) } struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree_trans *trans, struct btree *b, struct bkey_format format) { struct btree *n; - n = bch2_btree_node_alloc(as, b->c.level); + n = bch2_btree_node_alloc(as, trans, b->c.level); SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); @@ -346,6 +350,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, } static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree_trans *trans, struct btree *b) { struct bkey_format new_f = bch2_btree_calc_format(b); @@ -357,12 +362,13 @@ static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, if (!bch2_btree_node_format_fits(as->c, b, 
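
With the new cached flag in struct btree_bkey_cached_common, helpers such as btree_node_pos() take the common header and recover the containing object via container_of(). A generic sketch of that embedded-header dispatch, with toy types in place of struct btree and struct bkey_cached:

#include <stddef.h>
#include <stdio.h>
#include <stdbool.h>

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

/* shared header embedded in both kinds of object */
struct common {
	bool cached;
	unsigned level;
};

struct node {			/* stands in for struct btree */
	struct common c;
	unsigned long long max_key;
};

struct cached_item {		/* stands in for struct bkey_cached */
	struct common c;
	unsigned long long pos;
};

/* one helper works on either type, dispatching on the flag */
static unsigned long long node_pos(struct common *c)
{
	return !c->cached
		? container_of(c, struct node, c)->max_key
		: container_of(c, struct cached_item, c)->pos;
}

int main(void)
{
	struct node n        = { .c = { .cached = false }, .max_key = 100 };
	struct cached_item k = { .c = { .cached = true  }, .pos     = 42  };

	printf("%llu %llu\n", node_pos(&n.c), node_pos(&k.c));
	return 0;
}
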
&new_f)) new_f = b->format; - return __bch2_btree_node_alloc_replacement(as, b, new_f); + return __bch2_btree_node_alloc_replacement(as, trans, b, new_f); } -static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) +static struct btree *__btree_root_alloc(struct btree_update *as, + struct btree_trans *trans, unsigned level) { - struct btree *b = bch2_btree_node_alloc(as, level); + struct btree *b = bch2_btree_node_alloc(as, trans, level); btree_set_min(b, POS_MIN); btree_set_max(b, SPOS_MAX); @@ -377,7 +383,7 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) return b; } -static void bch2_btree_reserve_put(struct btree_update *as) +static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans) { struct bch_fs *c = as->c; struct prealloc_nodes *p; @@ -404,8 +410,8 @@ static void bch2_btree_reserve_put(struct btree_update *as) mutex_unlock(&c->btree_reserve_cache_lock); - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); __btree_node_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -459,7 +465,7 @@ err: /* Asynchronous interior node update machinery */ -static void bch2_btree_update_free(struct btree_update *as) +static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans) { struct bch_fs *c = as->c; @@ -472,7 +478,7 @@ static void bch2_btree_update_free(struct btree_update *as) bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); bch2_disk_reservation_put(c, &as->disk_res); - bch2_btree_reserve_put(as); + bch2_btree_reserve_put(as, trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], as->start_time); @@ -550,12 +556,13 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; - struct btree *b = as->b; + struct btree *b; struct btree_trans trans; u64 journal_seq = 0; unsigned i; int ret; + bch2_trans_init(&trans, c, 0, 512); /* * If we're already in an error state, it might be because a btree node * was never written, and we might be trying to free that same btree @@ -572,15 +579,16 @@ static void btree_update_nodes_written(struct btree_update *as) * on disk: */ for (i = 0; i < as->nr_old_nodes; i++) { - struct btree *old = as->old_nodes[i]; __le64 seq; - six_lock_read(&old->c.lock, NULL, NULL); - seq = old->data ? old->data->keys.seq : 0; - six_unlock_read(&old->c.lock); + b = as->old_nodes[i]; + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + seq = b->data ? b->data->keys.seq : 0; + six_unlock_read(&b->c.lock); if (seq == as->old_nodes_seq[i]) - wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner, + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } @@ -597,19 +605,19 @@ static void btree_update_nodes_written(struct btree_update *as) * journal reclaim does btree updates when flushing bkey_cached entries, * which may require allocations as well. 
*/ - bch2_trans_init(&trans, c, 0, 512); ret = commit_do(&trans, &as->disk_res, &journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_JOURNAL_RECLAIM| - JOURNAL_WATERMARK_reserved, - btree_update_nodes_written_trans(&trans, as)); - bch2_trans_exit(&trans); + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| + JOURNAL_WATERMARK_reserved, + btree_update_nodes_written_trans(&trans, as)); + bch2_trans_unlock(&trans); bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "error %i in btree_update_nodes_written()", ret); err: - if (b) { + if (as->b) { + b = as->b; /* * @b is the node we did the final insert into: * @@ -622,8 +630,8 @@ err: * we're in journal error state: */ - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_write); mutex_lock(&c->btree_interior_update_lock); list_del(&as->write_blocked_list); @@ -680,7 +688,7 @@ err: for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; - six_lock_read(&b->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } @@ -688,7 +696,8 @@ err: for (i = 0; i < as->nr_open_buckets; i++) bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); - bch2_btree_update_free(as); + bch2_btree_update_free(as, &trans); + bch2_trans_exit(&trans); } static void btree_interior_update_work(struct work_struct *work) @@ -935,7 +944,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, as->nr_old_nodes++; } -static void bch2_btree_update_done(struct btree_update *as) +static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans) { struct bch_fs *c = as->c; u64 start_time = as->start_time; @@ -946,7 +955,7 @@ static void bch2_btree_update_done(struct btree_update *as) up_read(&as->c->gc_lock); as->took_gc_lock = false; - bch2_btree_reserve_put(as); + bch2_btree_reserve_put(as, trans); continue_at(&as->cl, btree_update_set_nodes_written, as->c->btree_interior_update_worker); @@ -994,7 +1003,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, nr_nodes[1] += 1; if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { - trace_trans_restart_iter_upgrade(trans, _RET_IP_, path); + trace_and_count(c, trans_restart_iter_upgrade, trans, _RET_IP_, path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); return ERR_PTR(ret); } @@ -1048,11 +1057,16 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) { bch2_trans_unlock(trans); + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + goto err; + } + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, journal_flags); if (ret) { - trace_trans_restart_journal_preres_get(trans, _RET_IP_); + trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); goto err; } @@ -1085,8 +1099,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, } if (ret) { - trace_btree_reserve_get_fail(trans->fn, _RET_IP_, - nr_nodes[0] + nr_nodes[1]); + trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]); goto err; } @@ -1097,7 +1110,7 @@ 
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, bch2_trans_verify_not_restarted(trans, restart_count); return as; err: - bch2_btree_update_free(as); + bch2_btree_update_free(as, trans); return ERR_PTR(ret); } @@ -1141,7 +1154,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct bch_fs *c = as->c; struct btree *old; - trace_btree_set_root(c, b); + trace_and_count(c, btree_node_set_root, c, b); BUG_ON(!b->written); old = btree_node_root(c, b); @@ -1150,7 +1163,7 @@ static void bch2_btree_set_root(struct btree_update *as, * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write(trans, path, old); + bch2_btree_node_lock_write_nofail(trans, path, &old->c); bch2_btree_set_root_inmem(c, b); @@ -1249,6 +1262,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, * node) */ static struct btree *__btree_split_node(struct btree_update *as, + struct btree_trans *trans, struct btree *n1) { struct bkey_format_state s; @@ -1258,7 +1272,7 @@ static struct btree *__btree_split_node(struct btree_update *as, struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; struct bpos n1_pos; - n2 = bch2_btree_node_alloc(as, n1->c.level); + n2 = bch2_btree_node_alloc(as, trans, n1->c.level); n2->data->max_key = n1->data->max_key; n2->data->format = n1->format; @@ -1422,15 +1436,15 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, b); - n1 = bch2_btree_node_alloc_replacement(as, b); + n1 = bch2_btree_node_alloc_replacement(as, trans, b); if (keys) btree_split_insert_keys(as, trans, path, n1, keys); if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { - trace_btree_split(c, b); + trace_and_count(c, btree_node_split, c, b); - n2 = __btree_split_node(as, n1); + n2 = __btree_split_node(as, trans, n1); bch2_btree_build_aux_trees(n2); bch2_btree_build_aux_trees(n1); @@ -1452,7 +1466,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, if (!parent) { /* Depth increases, make a new root */ - n3 = __btree_root_alloc(as, b->c.level + 1); + n3 = __btree_root_alloc(as, trans, b->c.level + 1); n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; @@ -1462,7 +1476,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); } } else { - trace_btree_compact(c, b); + trace_and_count(c, btree_node_compact, c, b); bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->c.lock); @@ -1493,22 +1507,19 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, if (n3) bch2_btree_update_get_open_buckets(as, n3); - /* Successful split, update the path to point to the new nodes: */ - - six_lock_increment(&b->c.lock, SIX_LOCK_intent); - if (n3) - bch2_trans_node_add(trans, n3); - if (n2) - bch2_trans_node_add(trans, n2); - bch2_trans_node_add(trans, n1); - /* * The old node must be freed (in memory) _before_ unlocking the new * nodes - else another thread could re-acquire a read lock on the old * node after another thread has locked and updated the new node, thus * seeing stale data: */ - bch2_btree_node_free_inmem(trans, b); + bch2_btree_node_free_inmem(trans, path, b); + + if (n3) + bch2_trans_node_add(trans, n3); + if (n2) + bch2_trans_node_add(trans, n2); + bch2_trans_node_add(trans, n1); if (n3) six_unlock_intent(&n3->c.lock); @@ -1617,7 +1628,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, return PTR_ERR(as); btree_split(as, 
trans, path, b, NULL, flags); - bch2_btree_update_done(as); + bch2_btree_update_done(as, trans); for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) ret = bch2_foreground_maybe_merge(trans, path, l, flags); @@ -1731,12 +1742,12 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err; - trace_btree_merge(c, b); + trace_and_count(c, btree_node_merge, c, b); bch2_btree_interior_update_will_free_node(as, b); bch2_btree_interior_update_will_free_node(as, m); - n = bch2_btree_node_alloc(as, b->c.level); + n = bch2_btree_node_alloc(as, trans, b->c.level); SET_BTREE_NODE_SEQ(n->data, max(BTREE_NODE_SEQ(b->data), @@ -1771,19 +1782,16 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_get_open_buckets(as, n); - six_lock_increment(&b->c.lock, SIX_LOCK_intent); - six_lock_increment(&m->c.lock, SIX_LOCK_intent); + bch2_btree_node_free_inmem(trans, path, b); + bch2_btree_node_free_inmem(trans, sib_path, m); bch2_trans_node_add(trans, n); bch2_trans_verify_paths(trans); - bch2_btree_node_free_inmem(trans, b); - bch2_btree_node_free_inmem(trans, m); - six_unlock_intent(&n->c.lock); - bch2_btree_update_done(as); + bch2_btree_update_done(as, trans); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); out: @@ -1817,13 +1825,13 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, b); - n = bch2_btree_node_alloc_replacement(as, b); + n = bch2_btree_node_alloc_replacement(as, trans, b); bch2_btree_update_add_new_node(as, n); bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); - trace_btree_rewrite(c, b); + trace_and_count(c, btree_node_rewrite, c, b); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); @@ -1837,12 +1845,12 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_update_get_open_buckets(as, n); - six_lock_increment(&b->c.lock, SIX_LOCK_intent); + bch2_btree_node_free_inmem(trans, iter->path, b); + bch2_trans_node_add(trans, n); - bch2_btree_node_free_inmem(trans, b); six_unlock_intent(&n->c.lock); - bch2_btree_update_done(as); + bch2_btree_update_done(as, trans); out: bch2_btree_path_downgrade(trans, iter->path); return ret; @@ -1989,7 +1997,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (ret) goto err; - bch2_btree_node_lock_write(trans, iter->path, b); + bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c); if (new_hash) { mutex_lock(&c->btree_cache.lock); diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index adfc6c24..7af810df 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -117,6 +117,7 @@ struct btree_update { }; struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + struct btree_trans *, struct btree *, struct bkey_format); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index ca988392..e9518fbc 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -81,7 +81,7 @@ void bch2_btree_node_lock_for_insert(struct btree_trans *trans, struct btree_path *path, struct btree *b) { - bch2_btree_node_lock_write(trans, path, b); + bch2_btree_node_lock_write_nofail(trans, path, &b->c); bch2_btree_node_prep_for_write(trans, path, b); } @@ -169,10 +169,13 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct bch_fs *c = container_of(j, struct bch_fs, journal); struct btree_write *w = container_of(pin, struct 
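
Most bare trace_*() calls become trace_and_count(), which bumps a per-filesystem event counter and fires the tracepoint from one call site. The real macro lives in the bcachefs headers; the sketch below only illustrates the likely shape, with made-up counter storage and stub trace functions:

#include <stdio.h>

enum counter {
	COUNTER_transaction_commit,
	COUNTER_trans_restart_upgrade,
	COUNTER_NR,
};

struct fs {
	unsigned long counters[COUNTER_NR];	/* per-cpu in the real thing */
};

/* stand-ins for the generated tracepoint functions */
static void trace_transaction_commit(const char *fn)    { printf("commit from %s\n", fn); }
static void trace_trans_restart_upgrade(const char *fn) { printf("restart from %s\n", fn); }

/* count the event and emit the tracepoint from one call site */
#define trace_and_count(_c, _name, ...)			\
do {							\
	(_c)->counters[COUNTER_##_name]++;		\
	trace_##_name(__VA_ARGS__);			\
} while (0)

int main(void)
{
	struct fs c = { { 0 } };

	trace_and_count(&c, transaction_commit, "main");
	trace_and_count(&c, trans_restart_upgrade, "main");

	printf("commits %lu, restarts %lu\n",
	       c.counters[COUNTER_transaction_commit],
	       c.counters[COUNTER_trans_restart_upgrade]);
	return 0;
}
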
btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); + struct btree_trans trans; unsigned long old, new, v; unsigned idx = w - b->writes; - six_lock_read(&b->c.lock, NULL, NULL); + bch2_trans_init(&trans, c, 0, 0); + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); v = READ_ONCE(b->flags); do { @@ -188,6 +191,8 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); + + bch2_trans_exit(&trans); return 0; } @@ -285,7 +290,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, ret = bch2_trans_relock(trans); if (ret) { - trace_trans_restart_journal_preres_get(trans, trace_ip); + trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0); return ret; } @@ -375,7 +380,7 @@ btree_key_can_insert_cached(struct btree_trans *trans, * Keys returned by peek() are no longer valid pointers, so we need a * transaction restart: */ - trace_trans_restart_key_cache_key_realloced(trans, _RET_IP_, path, old_u64s, new_u64s); + trace_and_count(c, trans_restart_key_cache_key_realloced, trans, _RET_IP_, path, old_u64s, new_u64s); return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced); } @@ -567,7 +572,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, int ret; if (race_fault()) { - trace_trans_restart_fault_inject(trans, trace_ip); + trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); } @@ -741,11 +746,12 @@ static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_ static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) { struct btree *b = path_l(path)->b; + unsigned l; do { - if (path->nodes_locked && - path->nodes_locked != path->nodes_intent_locked) - path_upgrade_readers(trans, path); + for (l = 0; l < BTREE_MAX_DEPTH; l++) + if (btree_node_read_locked(path, l)) + path_upgrade_readers(trans, path); } while ((path = prev_btree_path(trans, path)) && path_l(path)->b == b); } @@ -764,11 +770,13 @@ static inline void normalize_read_intent_locks(struct btree_trans *trans) ? 
trans->paths + trans->sorted[i + 1] : NULL; - if (path->nodes_locked) { - if (path->nodes_intent_locked) - nr_intent++; - else - nr_read++; + switch (btree_node_locked_type(path, path->level)) { + case BTREE_NODE_READ_LOCKED: + nr_read++; + break; + case BTREE_NODE_INTENT_LOCKED: + nr_intent++; + break; } if (!next || path_l(path)->b != path_l(next)->b) { @@ -791,7 +799,7 @@ static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct //if (path == pos) // break; - if (path->nodes_locked != path->nodes_intent_locked && + if (btree_node_read_locked(path, path->level) && !bch2_btree_path_upgrade(trans, path, path->level + 1)) return true; } @@ -808,12 +816,19 @@ static inline int trans_lock_write(struct btree_trans *trans) if (same_leaf_as_prev(trans, i)) continue; + /* + * six locks are unfair, and read locks block while a thread + * wants a write lock: thus, we need to tell the cycle detector + * we have a write lock _before_ taking the lock: + */ + mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_write); + if (!six_trylock_write(&insert_l(i)->b->c.lock)) { if (have_conflicting_read_lock(trans, i->path)) goto fail; ret = btree_node_lock_type(trans, i->path, - insert_l(i)->b, + &insert_l(i)->b->c, i->path->pos, i->level, SIX_LOCK_write, NULL, NULL); BUG_ON(ret); @@ -824,6 +839,8 @@ static inline int trans_lock_write(struct btree_trans *trans) return 0; fail: + mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_intent); + while (--i >= trans->updates) { if (same_leaf_as_prev(trans, i)) continue; @@ -831,7 +848,7 @@ fail: bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); } - trace_trans_restart_would_deadlock_write(trans); + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); } @@ -964,7 +981,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_BTREE_NODE_FULL: ret = bch2_btree_split_leaf(trans, i->path, trans->flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_trans_restart_btree_node_split(trans, trace_ip, i->path); + trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); break; case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); @@ -975,7 +992,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = bch2_trans_relock(trans); if (ret) - trace_trans_restart_mark_replicas(trans, trace_ip); + trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip); break; case BTREE_INSERT_NEED_JOURNAL_RES: bch2_trans_unlock(trans); @@ -992,12 +1009,12 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = bch2_trans_relock(trans); if (ret) - trace_trans_restart_journal_res_get(trans, trace_ip); + trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip); break; case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - trace_trans_blocked_journal_reclaim(trans, trace_ip); + trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); @@ -1006,7 +1023,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, ret = bch2_trans_relock(trans); if (ret) - trace_trans_restart_journal_reclaim(trans, trace_ip); + trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip); break; default: BUG_ON(ret >= 0); @@ -1107,7 +1124,7 @@ int __bch2_trans_commit(struct btree_trans *trans) BUG_ON(!i->path->should_be_locked); if 
(unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { - trace_trans_restart_upgrade(trans, _RET_IP_, i->path); + trace_and_count(c, trans_restart_upgrade, trans, _RET_IP_, i->path); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); goto out; } @@ -1148,7 +1165,7 @@ retry: if (ret) goto err; - trace_transaction_commit(trans, _RET_IP_); + trace_and_count(c, transaction_commit, trans, _RET_IP_); out: bch2_journal_preres_put(&c->journal, &trans->journal_preres); @@ -1617,7 +1634,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter ck = (void *) iter->key_cache_path->l[0].b; if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_trans_restart_key_cache_raced(trans, _RET_IP_); + trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 3b442b01..cb25efb6 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -231,9 +231,12 @@ static int bch2_data_update_index_update(struct bch_write_op *op) m->data_opts.btree_insert_flags); if (!ret) { bch2_btree_iter_set_pos(&iter, next_pos); - atomic_long_inc(&c->extent_migrate_done); + if (ec_ob) bch2_ob_add_backpointer(c, ec_ob, &insert->k); + + this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); + trace_move_extent_finish(&new->k); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -248,22 +251,16 @@ next: } continue; nomatch: - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, old); - bch_info(c, "no match for %s", buf.buf); - printbuf_exit(&buf); - } - if (m->ctxt) { BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->ctxt->stats->keys_raced); atomic64_add(k.k->p.offset - iter.pos.offset, &m->ctxt->stats->sectors_raced); } - atomic_long_inc(&c->extent_migrate_raced); - trace_move_race(&new->k); + + this_cpu_add(c->counters[BCH_COUNTER_move_extent_race], new->k.size); + trace_move_extent_race(&new->k); + bch2_btree_iter_advance(&iter); goto next; } diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index f35e714e..fb518d59 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -666,6 +666,9 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf, mutex_lock(&s->lock); + prt_printf(&i->buf, "Max mem used: %u", s->max_mem); + prt_newline(&i->buf); + if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { prt_printf(&i->buf, "Lock hold times:"); prt_newline(&i->buf); diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c index 7bd44136..22b6b841 100644 --- a/libbcachefs/disk_groups.c +++ b/libbcachefs/disk_groups.c @@ -384,32 +384,34 @@ inval: prt_printf(out, "invalid label %u", v); } -int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { struct bch_member *mi; - int v = -1; - int ret = 0; - - mutex_lock(&c->sb_lock); + int ret, v = -1; if (!strlen(name) || !strcmp(name, "none")) - goto write_sb; + return 0; v = bch2_disk_path_find_or_create(&c->disk_sb, name); - if (v < 0) { - mutex_unlock(&c->sb_lock); + if (v < 0) return v; - } ret = bch2_sb_disk_groups_to_cpu(c); if (ret) - goto unlock; -write_sb: + return ret; + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; SET_BCH_MEMBER_GROUP(mi, v + 1); + return 0; +} - bch2_write_super(c); -unlock: 
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +{ + int ret; + + mutex_lock(&c->sb_lock); + ret = __bch2_dev_group_set(c, ca, name) ?: + bch2_write_super(c); mutex_unlock(&c->sb_lock); return ret; diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h index de915480..e4470c35 100644 --- a/libbcachefs/disk_groups.h +++ b/libbcachefs/disk_groups.h @@ -82,6 +82,7 @@ void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb * int bch2_sb_disk_groups_to_cpu(struct bch_fs *); +int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); const char *bch2_sb_validate_disk_groups(struct bch_sb *, diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 9f768d77..12f2ef44 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -772,9 +772,6 @@ static int hash_redo_key(struct btree_trans *trans, struct bch_hash_info *hash_info, struct btree_iter *k_iter, struct bkey_s_c k) { - bch_err(trans->c, "hash_redo_key() not implemented yet"); - return -EINVAL; -#if 0 struct bkey_i *delete; struct bkey_i *tmp; @@ -792,8 +789,14 @@ static int hash_redo_key(struct btree_trans *trans, delete->k.p = k_iter->pos; return bch2_btree_iter_traverse(k_iter) ?: bch2_trans_update(trans, k_iter, delete, 0) ?: - bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); -#endif + bch2_hash_set_snapshot(trans, desc, hash_info, + (subvol_inum) { 0, k.k->p.inode }, + k.k->p.snapshot, tmp, + BCH_HASH_SET_MUST_CREATE, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); } static int hash_check_key(struct btree_trans *trans, diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 93771f83..a683a689 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -1387,7 +1387,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) struct closure *cl = &op->cl; struct bio *bio = &op->write.op.wbio.bio; - trace_promote(&rbio->bio); + trace_and_count(op->write.op.c, read_promote, &rbio->bio); /* we now own pages: */ BUG_ON(!rbio->bounce); @@ -1653,7 +1653,7 @@ static void bch2_rbio_retry(struct work_struct *work) }; struct bch_io_failures failed = { .nr = 0 }; - trace_read_retry(&rbio->bio); + trace_and_count(c, read_retry, &rbio->bio); if (rbio->retry == READ_RETRY_AVOID) bch2_mark_io_failure(&failed, &rbio->pick); @@ -1909,7 +1909,7 @@ static void bch2_read_endio(struct bio *bio) if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ptr_stale(ca, &rbio->pick.ptr)) { - atomic_long_inc(&c->read_realloc_races); + trace_and_count(c, read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_RETRY_IF_STALE) bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); @@ -2197,7 +2197,7 @@ get_bio: rbio->bio.bi_end_io = bch2_read_endio; if (rbio->bounce) - trace_read_bounce(&rbio->bio); + trace_and_count(c, read_bounce, &rbio->bio); this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); @@ -2212,7 +2212,7 @@ get_bio: if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { bio_inc_remaining(&orig->bio); - trace_read_split(&orig->bio); + trace_and_count(c, read_split, &orig->bio); } if (!rbio->pick.idx) { diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 3f1cf1ac..3e8972c2 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -391,12 +391,12 @@ retry: ret = journal_entry_open(j); if 
(ret == JOURNAL_ERR_max_in_flight) - trace_journal_entry_full(c); + trace_and_count(c, journal_entry_full, c); unlock: if ((ret && ret != JOURNAL_ERR_insufficient_devices) && !j->res_get_blocked_start) { j->res_get_blocked_start = local_clock() ?: 1; - trace_journal_full(c); + trace_and_count(c, journal_full, c); } can_discard = j->can_discard; diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 107521e1..55b86cbd 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1551,7 +1551,7 @@ static void do_journal_write(struct closure *cl) bch2_bio_map(bio, w->data, sectors << 9); - trace_journal_write(bio); + trace_and_count(c, journal_write, bio); closure_bio_submit(bio, cl); ca->journal.bucket_seq[ca->journal.cur_idx] = diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 9f8b63b3..e69595bd 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -641,7 +641,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); - trace_journal_reclaim_start(c, direct, kicked, + trace_and_count(c, journal_reclaim_start, c, + direct, kicked, min_nr, min_key_cache, j->prereserved.reserved, j->prereserved.remaining, @@ -657,7 +658,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) j->nr_direct_reclaim += nr_flushed; else j->nr_background_reclaim += nr_flushed; - trace_journal_reclaim_finish(c, nr_flushed); + trace_and_count(c, journal_reclaim_finish, c, nr_flushed); if (nr_flushed) wake_up(&j->reclaim_wait); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 22470067..e85c3143 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -252,8 +252,8 @@ static int bch2_move_extent(struct btree_trans *trans, atomic64_inc(&ctxt->stats->keys_moved); atomic64_add(k.k->size, &ctxt->stats->sectors_moved); this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); - - trace_move_extent(k.k); + this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); + trace_move_extent_read(k.k); atomic_add(io->read_sectors, &ctxt->read_sectors); list_add_tail(&io->list, &ctxt->reads); @@ -275,7 +275,7 @@ err_free: kfree(io); err: percpu_ref_put(&c->writes); - trace_move_alloc_mem_fail(k.k); + trace_and_count(c, move_extent_alloc_mem_fail, k.k); return ret; } diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index f913864e..35958c6b 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -165,7 +165,7 @@ static int bch2_copygc(struct bch_fs *c) if (ret < 0) bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); - trace_copygc(c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); + trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); return ret; } @@ -221,7 +221,7 @@ static int bch2_copygc_thread(void *arg) wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { - trace_copygc_wait(c, wait, last + wait); + trace_and_count(c, copygc_wait, c, wait, last + wait); c->copygc_wait = last + wait; bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 591bbb9f..5c327b31 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -239,29 +239,26 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int bch2_hash_set(struct btree_trans *trans, - const struct bch_hash_desc desc, - const struct bch_hash_info 
*info, - subvol_inum inum, - struct bkey_i *insert, int flags) +int bch2_hash_set_snapshot(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, u32 snapshot, + struct bkey_i *insert, + int flags, + int update_flags) { struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; bool found = false; - u32 snapshot; int ret; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - return ret; - for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, - SPOS(inum.inum, + SPOS(insert->k.p.inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), - POS(inum.inum, U64_MAX), + POS(insert->k.p.inode, U64_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (is_visible_key(desc, inum, k)) { + if (!inum.subvol || is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; @@ -303,6 +300,26 @@ not_found: goto out; } +static __always_inline +int bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, + struct bkey_i *insert, int flags) +{ + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + insert->k.p.inode = inum.inum; + + return bch2_hash_set_snapshot(trans, desc, info, inum, + snapshot, insert, flags, 0); +} + static __always_inline int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index ade09bdf..e1e70d35 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -796,7 +796,7 @@ int bch2_write_super(struct bch_fs *c) unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; int ret = 0; - trace_write_super(c, _RET_IP_); + trace_and_count(c, write_super, c, _RET_IP_); if (c->opts.very_degraded) degraded_flags |= BCH_FORCE_IF_LOST; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 7c634800..8b3ce780 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -1530,6 +1530,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_member dev_mi; unsigned dev_idx, nr_devices, u64s; struct printbuf errbuf = PRINTBUF; + struct printbuf label = PRINTBUF; int ret; ret = bch2_read_super(path, &opts, &sb); @@ -1540,6 +1541,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path) dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; + if (BCH_MEMBER_GROUP(&dev_mi)) { + bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); + if (label.allocation_failure) { + ret = -ENOMEM; + goto err; + } + } + err = bch2_dev_may_add(sb.sb, c); if (err) { bch_err(c, "device add error: %s", err); @@ -1620,6 +1629,14 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); + if (BCH_MEMBER_GROUP(&dev_mi)) { + ret = __bch2_dev_group_set(c, ca, label.buf); + if (ret) { + bch_err(c, "device add error: error setting label"); + goto err_unlock; + } + } + bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1652,6 +1669,7 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); + printbuf_exit(&label); printbuf_exit(&errbuf); return ret; err_late: diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 2dfed1ff..98449e42 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -190,11 +190,6 @@ read_attribute(internal_uuid); read_attribute(has_data); read_attribute(alloc_debug); -read_attribute(read_realloc_races); -read_attribute(extent_migrate_done); 
-read_attribute(extent_migrate_raced); -read_attribute(bucket_alloc_fail); - #define x(t, n, ...) read_attribute(t); BCH_PERSISTENT_COUNTERS() #undef x @@ -378,15 +373,6 @@ SHOW(bch2_fs) sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); - sysfs_print(read_realloc_races, - atomic_long_read(&c->read_realloc_races)); - sysfs_print(extent_migrate_done, - atomic_long_read(&c->extent_migrate_done)); - sysfs_print(extent_migrate_raced, - atomic_long_read(&c->extent_migrate_raced)); - sysfs_print(bucket_alloc_fail, - atomic_long_read(&c->bucket_alloc_fail)); - sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); if (attr == &sysfs_gc_gens_pos) @@ -625,11 +611,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_invalidates, &sysfs_prune_cache, - &sysfs_read_realloc_races, - &sysfs_extent_migrate_done, - &sysfs_extent_migrate_raced, - &sysfs_bucket_alloc_fail, - &sysfs_gc_gens_pos, &sysfs_copy_gc_enabled, diff --git a/linux/six.c b/linux/six.c index 5b2d92c6..d2275055 100644 --- a/linux/six.c +++ b/linux/six.c @@ -19,6 +19,8 @@ #define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) #define six_release(l) lock_release(l, _RET_IP_) +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); + struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ u64 lock_val; @@ -65,14 +67,15 @@ struct six_lock_vals { } static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, - union six_lock_state old) + union six_lock_state old, + struct task_struct *owner) { if (type != SIX_LOCK_intent) return; if (!old.intent_lock) { EBUG_ON(lock->owner); - lock->owner = current; + lock->owner = owner; } else { EBUG_ON(lock->owner != current); } @@ -88,64 +91,21 @@ static inline unsigned pcpu_read_count(struct six_lock *lock) return read_count; } -struct six_lock_waiter { - struct list_head list; - struct task_struct *task; -}; - /* This is probably up there with the more evil things I've done */ #define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) -static inline void six_lock_wakeup(struct six_lock *lock, - union six_lock_state state, - unsigned waitlist_id) -{ - if (waitlist_id == SIX_LOCK_write) { - if (state.write_locking && !state.read_lock) { - struct task_struct *p = READ_ONCE(lock->owner); - if (p) - wake_up_process(p); - } - } else { - struct list_head *wait_list = &lock->wait_list[waitlist_id]; - struct six_lock_waiter *w, *next; - - if (!(state.waiters & (1 << waitlist_id))) - return; - - clear_bit(waitlist_bitnr(waitlist_id), - (unsigned long *) &lock->state.v); - - raw_spin_lock(&lock->wait_lock); - - list_for_each_entry_safe(w, next, wait_list, list) { - list_del_init(&w->list); - - if (wake_up_process(w->task) && - waitlist_id != SIX_LOCK_read) { - if (!list_empty(wait_list)) - set_bit(waitlist_bitnr(waitlist_id), - (unsigned long *) &lock->state.v); - break; - } - } - - raw_spin_unlock(&lock->wait_lock); - } -} - -static __always_inline bool do_six_trylock_type(struct six_lock *lock, - enum six_lock_type type, - bool try) +static int __do_six_trylock_type(struct six_lock *lock, + enum six_lock_type type, + struct task_struct *task, + bool try) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old, new; - bool ret; + int ret; u64 v; - EBUG_ON(type == SIX_LOCK_write && lock->owner != current); + EBUG_ON(type == SIX_LOCK_write && lock->owner != task); EBUG_ON(type == 
SIX_LOCK_write && (lock->state.seq & 1)); - EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking))); /* @@ -176,18 +136,6 @@ retry: this_cpu_sub(*lock->readers, !ret); preempt_enable(); - /* - * If we failed because a writer was trying to take the - * lock, issue a wakeup because we might have caused a - * spurious trylock failure: - */ - if (old.write_locking) { - struct task_struct *p = READ_ONCE(lock->owner); - - if (p) - wake_up_process(p); - } - /* * If we failed from the lock path and the waiting bit wasn't * set, set it: @@ -208,6 +156,14 @@ retry: } while ((v = atomic64_cmpxchg(&lock->state.counter, old.v, new.v)) != old.v); } + + /* + * If we failed because a writer was trying to take the + * lock, issue a wakeup because we might have caused a + * spurious trylock failure: + */ + if (old.write_locking) + ret = -1 - SIX_LOCK_write; } else if (type == SIX_LOCK_write && lock->readers) { if (try) { atomic64_add(__SIX_VAL(write_locking, 1), @@ -227,9 +183,13 @@ retry: if (ret || try) v -= __SIX_VAL(write_locking, 1); + if (!ret && !try && !(lock->state.waiters & (1 << SIX_LOCK_write))) + v += __SIX_VAL(waiters, 1 << SIX_LOCK_write); + if (try && !ret) { old.v = atomic64_add_return(v, &lock->state.counter); - six_lock_wakeup(lock, old, SIX_LOCK_read); + if (old.waiters & (1 << SIX_LOCK_read)) + ret = -1 - SIX_LOCK_read; } else { atomic64_add(v, &lock->state.counter); } @@ -243,8 +203,7 @@ retry: if (type == SIX_LOCK_write) new.write_locking = 0; - } else if (!try && type != SIX_LOCK_write && - !(new.waiters & (1 << type))) + } else if (!try && !(new.waiters & (1 << type))) new.waiters |= 1 << type; else break; /* waiting bit already set */ @@ -256,14 +215,84 @@ retry: EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); } - if (ret) - six_set_owner(lock, type, old); + if (ret > 0) + six_set_owner(lock, type, old, task); - EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking)); + EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking)); return ret; } +static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) +{ + struct six_lock_waiter *w, *next; + struct task_struct *task; + bool saw_one; + int ret; +again: + ret = 0; + saw_one = false; + raw_spin_lock(&lock->wait_lock); + + list_for_each_entry_safe(w, next, &lock->wait_list, list) { + if (w->lock_want != lock_type) + continue; + + if (saw_one && lock_type != SIX_LOCK_read) + goto unlock; + saw_one = true; + + ret = __do_six_trylock_type(lock, lock_type, w->task, false); + if (ret <= 0) + goto unlock; + + __list_del(w->list.prev, w->list.next); + task = w->task; + /* + * Do no writes to @w besides setting lock_acquired - otherwise + * we would need a memory barrier: + */ + barrier(); + w->lock_acquired = true; + wake_up_process(task); + } + + clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v); +unlock: + raw_spin_unlock(&lock->wait_lock); + + if (ret < 0) { + lock_type = -ret - 1; + goto again; + } +} + +static inline void six_lock_wakeup(struct six_lock *lock, + union six_lock_state state, + enum six_lock_type lock_type) +{ + if (lock_type == SIX_LOCK_write && state.read_lock) + return; + + if (!(state.waiters & (1 << lock_type))) + return; + + __six_lock_wakeup(lock, lock_type); +} + +static bool do_six_trylock_type(struct six_lock *lock, + enum six_lock_type type, + bool try) +{ + int ret; + + ret = __do_six_trylock_type(lock, type, current, try); + if (ret < 0) + __six_lock_wakeup(lock, -ret - 1); + + return ret > 
0; +} + __always_inline __flatten static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) { @@ -304,12 +333,8 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, * Similar to the lock path, we may have caused a spurious write * lock fail and need to issue a wakeup: */ - if (old.write_locking) { - struct task_struct *p = READ_ONCE(lock->owner); - - if (p) - wake_up_process(p); - } + if (old.write_locking) + six_lock_wakeup(lock, old, SIX_LOCK_write); if (ret) six_acquire(&lock->dep_map, 1); @@ -327,7 +352,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, old.v, old.v + l[type].lock_val)) != old.v); - six_set_owner(lock, type, old); + six_set_owner(lock, type, old, current); if (type != SIX_LOCK_write) six_acquire(&lock->dep_map, 1); return true; @@ -335,33 +360,26 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, #ifdef CONFIG_LOCK_SPIN_ON_OWNER -static inline int six_can_spin_on_owner(struct six_lock *lock) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait) { - struct task_struct *owner; - int retval = 1; + struct task_struct *owner, *task = current; - if (need_resched()) - return 0; + switch (wait->lock_want) { + case SIX_LOCK_read: + break; + case SIX_LOCK_intent: + if (lock->wait_list.next != &wait->list) + return false; + break; + case SIX_LOCK_write: + return false; + } rcu_read_lock(); owner = READ_ONCE(lock->owner); - if (owner) - retval = owner->on_cpu; - rcu_read_unlock(); - /* - * if lock->owner is not set, the mutex owner may have just acquired - * it and not set the owner yet or the mutex has been released. - */ - return retval; -} -static inline bool six_spin_on_owner(struct six_lock *lock, - struct task_struct *owner) -{ - bool ret = true; - - rcu_read_lock(); - while (lock->owner == owner) { + while (owner && lock->owner == owner) { /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking lock->owner still matches owner. If that fails, @@ -370,85 +388,27 @@ static inline bool six_spin_on_owner(struct six_lock *lock, */ barrier(); - if (!owner->on_cpu || need_resched()) { - ret = false; - break; - } - - cpu_relax(); - } - rcu_read_unlock(); - - return ret; -} - -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -{ - struct task_struct *task = current; - - if (type == SIX_LOCK_write) - return false; - - preempt_disable(); - if (!six_can_spin_on_owner(lock)) - goto fail; - - if (!osq_lock(&lock->osq)) - goto fail; - - while (1) { - struct task_struct *owner; - /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. - */ - owner = READ_ONCE(lock->owner); - if (owner && !six_spin_on_owner(lock, owner)) - break; - - if (do_six_trylock_type(lock, type, false)) { - osq_unlock(&lock->osq); - preempt_enable(); - return true; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let + * If we're an RT task that will live-lock because we won't let * the owner complete. */ - if (!owner && (need_resched() || rt_task(task))) + if (wait->lock_acquired || + !owner->on_cpu || + rt_task(task) || + need_resched()) break; - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. 
We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ cpu_relax(); } + rcu_read_unlock(); - osq_unlock(&lock->osq); -fail: - preempt_enable(); - - /* - * If we fell out of the spin path because of need_resched(), - * reschedule now, before we try-lock again. This avoids getting - * scheduled out right after we obtained the lock. - */ - if (need_resched()) - schedule(); - - return false; + return wait->lock_acquired; } #else /* CONFIG_LOCK_SPIN_ON_OWNER */ -static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait) { return false; } @@ -457,10 +417,10 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type noinline static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, six_lock_should_sleep_fn should_sleep_fn, void *p) { union six_lock_state old; - struct six_lock_waiter wait; int ret = 0; if (type == SIX_LOCK_write) { @@ -469,46 +429,58 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty smp_mb__after_atomic(); } - ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; - if (ret) - goto out_before_sleep; + lock_contended(&lock->dep_map, _RET_IP_); - if (six_optimistic_spin(lock, type)) - goto out_before_sleep; + wait->task = current; + wait->lock_want = type; + wait->lock_acquired = false; - lock_contended(&lock->dep_map, _RET_IP_); + raw_spin_lock(&lock->wait_lock); + /* + * Retry taking the lock after taking waitlist lock, have raced with an + * unlock: + */ + ret = __do_six_trylock_type(lock, type, current, false); + if (ret <= 0) + list_add_tail(&wait->list, &lock->wait_list); + raw_spin_unlock(&lock->wait_lock); - INIT_LIST_HEAD(&wait.list); - wait.task = current; + if (unlikely(ret > 0)) { + ret = 0; + goto out; + } + + if (unlikely(ret < 0)) { + __six_lock_wakeup(lock, -ret - 1); + ret = 0; + } + + if (six_optimistic_spin(lock, wait)) + goto out; while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (type == SIX_LOCK_write) - EBUG_ON(lock->owner != current); - else if (list_empty_careful(&wait.list)) { - raw_spin_lock(&lock->wait_lock); - list_add_tail(&wait.list, &lock->wait_list[type]); - raw_spin_unlock(&lock->wait_lock); - } - if (do_six_trylock_type(lock, type, false)) + if (wait->lock_acquired) break; ret = should_sleep_fn ? 
should_sleep_fn(lock, p) : 0; - if (ret) + if (unlikely(ret)) { + raw_spin_lock(&lock->wait_lock); + if (!wait->lock_acquired) + list_del(&wait->list); + raw_spin_unlock(&lock->wait_lock); + + if (wait->lock_acquired) + do_six_unlock_type(lock, type); break; + } schedule(); } __set_current_state(TASK_RUNNING); - - if (!list_empty_careful(&wait.list)) { - raw_spin_lock(&lock->wait_lock); - list_del_init(&wait.list); - raw_spin_unlock(&lock->wait_lock); - } -out_before_sleep: +out: if (ret && type == SIX_LOCK_write) { old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), &lock->state.counter); @@ -518,9 +490,10 @@ out_before_sleep: return ret; } -__always_inline -static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) +__always_inline __flatten +static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) { int ret; @@ -528,7 +501,7 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, six_acquire(&lock->dep_map, 0); ret = do_six_trylock_type(lock, type, true) ? 0 - : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); + : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p); if (ret && type != SIX_LOCK_write) six_release(&lock->dep_map); @@ -538,28 +511,23 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, return ret; } +__always_inline +static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + struct six_lock_waiter wait; + + return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p); +} + __always_inline __flatten -static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state state; - EBUG_ON(type == SIX_LOCK_write && - !(lock->state.v & __SIX_LOCK_HELD_intent)); - - if (type != SIX_LOCK_write) - six_release(&lock->dep_map); - - if (type == SIX_LOCK_intent) { - EBUG_ON(lock->owner != current); - - if (lock->intent_lock_recurse) { - --lock->intent_lock_recurse; - return; - } - + if (type == SIX_LOCK_intent) lock->owner = NULL; - } if (type == SIX_LOCK_read && lock->readers) { @@ -576,6 +544,27 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) six_lock_wakeup(lock, state, l[type].unlock_wakeup); } +__always_inline __flatten +static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + EBUG_ON(type == SIX_LOCK_write && + !(lock->state.v & __SIX_LOCK_HELD_intent)); + EBUG_ON((type == SIX_LOCK_write || + type == SIX_LOCK_intent) && + lock->owner != current); + + if (type != SIX_LOCK_write) + six_release(&lock->dep_map); + + if (type == SIX_LOCK_intent && + lock->intent_lock_recurse) { + --lock->intent_lock_recurse; + return; + } + + do_six_unlock_type(lock, type); +} + #define __SIX_LOCK(type) \ bool six_trylock_##type(struct six_lock *lock) \ { \ @@ -596,6 +585,14 @@ int six_lock_##type(struct six_lock *lock, \ } \ EXPORT_SYMBOL_GPL(six_lock_##type); \ \ +int six_lock_waiter_##type(struct six_lock *lock, \ + struct six_lock_waiter *wait, \ + six_lock_should_sleep_fn should_sleep_fn, void *p)\ +{ \ + return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p);\ +} \ +EXPORT_SYMBOL_GPL(six_lock_waiter_##type); \ + \ void 
six_unlock_##type(struct six_lock *lock) \ { \ __six_unlock_type(lock, SIX_LOCK_##type); \ @@ -639,7 +636,7 @@ bool six_lock_tryupgrade(struct six_lock *lock) if (lock->readers) this_cpu_dec(*lock->readers); - six_set_owner(lock, SIX_LOCK_intent, old); + six_set_owner(lock, SIX_LOCK_intent, old, current); return true; } @@ -701,44 +698,12 @@ void six_lock_wakeup_all(struct six_lock *lock) struct six_lock_waiter *w; raw_spin_lock(&lock->wait_lock); - - list_for_each_entry(w, &lock->wait_list[0], list) - wake_up_process(w->task); - list_for_each_entry(w, &lock->wait_list[1], list) + list_for_each_entry(w, &lock->wait_list, list) wake_up_process(w->task); - raw_spin_unlock(&lock->wait_lock); } EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -struct free_pcpu_rcu { - struct rcu_head rcu; - void __percpu *p; -}; - -static void free_pcpu_rcu_fn(struct rcu_head *_rcu) -{ - struct free_pcpu_rcu *rcu = - container_of(_rcu, struct free_pcpu_rcu, rcu); - - free_percpu(rcu->p); - kfree(rcu); -} - -void six_lock_pcpu_free_rcu(struct six_lock *lock) -{ - struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL); - - if (!rcu) - return; - - rcu->p = lock->readers; - lock->readers = NULL; - - call_rcu(&rcu->rcu, free_pcpu_rcu_fn); -} -EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu); - void six_lock_pcpu_free(struct six_lock *lock) { BUG_ON(lock->readers && pcpu_read_count(lock)); @@ -763,15 +728,19 @@ EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); */ struct six_lock_count six_lock_counts(struct six_lock *lock) { - struct six_lock_count ret = { 0, lock->state.intent_lock }; + struct six_lock_count ret; + + ret.n[SIX_LOCK_read] = 0; + ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse; + ret.n[SIX_LOCK_write] = lock->state.seq & 1; if (!lock->readers) - ret.read += lock->state.read_lock; + ret.n[SIX_LOCK_read] += lock->state.read_lock; else { int cpu; for_each_possible_cpu(cpu) - ret.read += *per_cpu_ptr(lock->readers, cpu); + ret.n[SIX_LOCK_read] += *per_cpu_ptr(lock->readers, cpu); } return ret; -- cgit v1.2.3
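The six-lock rework above replaces the per-type wait lists with a single lock->wait_list of struct six_lock_waiter entries and exports six_lock_waiter_read/intent/write(), which take a caller-supplied waiter instead of building one on the slowpath stack. A minimal sketch of how a caller might use the new entry point follows; it assumes the matching declarations in six.h, and my_should_sleep() is purely illustrative (passing NULL, as the fast paths above do, also works).

/*
 * Sketch only: take an intent lock through the waiter-aware API added by
 * this patch.  The six_lock_waiter lives on our stack and stays on
 * lock->wait_list until the unlock path sets wait.lock_acquired.
 */
#include <linux/six.h>

static int my_should_sleep(struct six_lock *lock, void *p)	/* illustrative */
{
	return 0;	/* return -errno here to abort the wait */
}

static int example_lock_intent(struct six_lock *lock)
{
	struct six_lock_waiter wait;
	int ret;

	ret = six_lock_waiter_intent(lock, &wait, my_should_sleep, NULL);
	if (ret)
		return ret;	/* should_sleep_fn told us to bail */

	/* ... do work under the intent lock ... */

	six_unlock_intent(lock);
	return 0;
}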
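six_lock_counts() likewise switches from the old { read, intent } pair to an array indexed by enum six_lock_type, folding intent_lock_recurse and the write bit of the sequence number into the result. A small usage sketch, assuming struct six_lock_count in six.h is now essentially "unsigned n[3]":

/* Sketch: report how a six lock is currently held. */
#include <linux/printk.h>
#include <linux/six.h>

static void example_dump_six_lock(struct six_lock *lock)
{
	struct six_lock_count c = six_lock_counts(lock);

	printk(KERN_INFO "read %u intent %u write %u\n",
	       c.n[SIX_LOCK_read], c.n[SIX_LOCK_intent], c.n[SIX_LOCK_write]);
}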
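Most of the remaining churn is mechanical: bare trace_foo() calls become trace_and_count(c, foo, ...), and the old ad-hoc atomic_long counters (read_realloc_races, extent_migrate_done, extent_migrate_raced, bucket_alloc_fail) and their sysfs files are dropped in favour of the persistent BCH_COUNTER_* counters. The macro definition lives in bcachefs.h and is not part of the hunks quoted here; the following is only an assumed sketch of its shape, inferred from the call sites, which always pair a struct bch_fs * with an event name that exists both as a tracepoint and as a BCH_PERSISTENT_COUNTERS() entry.

/*
 * Assumed shape of trace_and_count() -- the real definition is in
 * bcachefs.h, outside this excerpt: bump the per-cpu persistent counter
 * for the event, then emit the tracepoint with the remaining arguments.
 */
#define trace_and_count(_c, _name, ...)					\
do {									\
	this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]);		\
	trace_##_name(__VA_ARGS__);					\
} while (0)

That shape would also explain why data_update.c and move.c still call trace_move_extent_finish()/trace_move_extent_read() directly while bumping the matching counter with this_cpu_add(..., size): there the counter is accumulated in sectors rather than incremented once per event.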