author    Kent Overstreet <kent.overstreet@linux.dev>  2022-09-04 14:21:58 -0400
committer Kent Overstreet <kent.overstreet@linux.dev>  2022-09-04 14:42:05 -0400
commit    d01f633041c50452f9e837b7cc0223e6f37d42da
tree      2aa5423e5005b379f4a1a3c45b245e094918d3f4
parent    d2c2c5954c3304598cb2ecc2e8f11788356f5afc
Update bcachefs sources to 176718966e bcachefs: Re-enable hash_redo_key()
-rw-r--r--  .bcachefs_revision                     2
-rw-r--r--  include/linux/list.h                   1
-rw-r--r--  include/linux/six.h                   30
-rw-r--r--  include/trace/events/bcachefs.h      246
-rw-r--r--  libbcachefs/alloc_background.c         3
-rw-r--r--  libbcachefs/alloc_foreground.c        29
-rw-r--r--  libbcachefs/bcachefs.h                16
-rw-r--r--  libbcachefs/bcachefs_format.h         81
-rw-r--r--  libbcachefs/btree_cache.c             47
-rw-r--r--  libbcachefs/btree_cache.h              6
-rw-r--r--  libbcachefs/btree_gc.c                44
-rw-r--r--  libbcachefs/btree_io.c                12
-rw-r--r--  libbcachefs/btree_iter.c             507
-rw-r--r--  libbcachefs/btree_iter.h              25
-rw-r--r--  libbcachefs/btree_key_cache.c        244
-rw-r--r--  libbcachefs/btree_locking.c          466
-rw-r--r--  libbcachefs/btree_locking.h          279
-rw-r--r--  libbcachefs/btree_types.h             17
-rw-r--r--  libbcachefs/btree_update_interior.c  170
-rw-r--r--  libbcachefs/btree_update_interior.h    1
-rw-r--r--  libbcachefs/btree_update_leaf.c       65
-rw-r--r--  libbcachefs/data_update.c             19
-rw-r--r--  libbcachefs/debug.c                    3
-rw-r--r--  libbcachefs/disk_groups.c             28
-rw-r--r--  libbcachefs/disk_groups.h              1
-rw-r--r--  libbcachefs/fsck.c                    13
-rw-r--r--  libbcachefs/io.c                      10
-rw-r--r--  libbcachefs/journal.c                  4
-rw-r--r--  libbcachefs/journal_io.c               2
-rw-r--r--  libbcachefs/journal_reclaim.c          5
-rw-r--r--  libbcachefs/move.c                     6
-rw-r--r--  libbcachefs/movinggc.c                 4
-rw-r--r--  libbcachefs/str_hash.h                43
-rw-r--r--  libbcachefs/super-io.c                 2
-rw-r--r--  libbcachefs/super.c                   18
-rw-r--r--  libbcachefs/sysfs.c                   19
-rw-r--r--  linux/six.c                          461
37 files changed, 1632 insertions, 1297 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 23636ff3..f3aeb2c0 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-a7694865a3008d6752370caee2ed3c64c1b0f973
+176718966e14c5f832ead8cea2e0e45aba51f5ef
diff --git a/include/linux/list.h b/include/linux/list.h
index 3639dc99..dcc4745f 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -10,6 +10,7 @@
#define list_add(n, h) cds_list_add(n, h)
#define list_add_tail(n, h) cds_list_add_tail(n, h)
#define __list_del_entry(l) cds_list_del(l)
+#define __list_del(p, n) __cds_list_del(p, n)
#define list_del(l) cds_list_del(l)
#define list_del_init(l) cds_list_del_init(l)
#define list_replace(o, n) cds_list_replace(o, n)
diff --git a/include/linux/six.h b/include/linux/six.h
index 41ddf63b..f336ae04 100644
--- a/include/linux/six.h
+++ b/include/linux/six.h
@@ -59,7 +59,6 @@
*/
#include <linux/lockdep.h>
-#include <linux/osq_lock.h>
#include <linux/sched.h>
#include <linux/types.h>
@@ -105,18 +104,23 @@ enum six_lock_type {
struct six_lock {
union six_lock_state state;
- unsigned intent_lock_recurse;
struct task_struct *owner;
- struct optimistic_spin_queue osq;
unsigned __percpu *readers;
-
+ unsigned intent_lock_recurse;
raw_spinlock_t wait_lock;
- struct list_head wait_list[2];
+ struct list_head wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
};
+struct six_lock_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum six_lock_type lock_want;
+ bool lock_acquired;
+};
+
typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
static __always_inline void __six_lock_init(struct six_lock *lock,
@@ -125,8 +129,7 @@ static __always_inline void __six_lock_init(struct six_lock *lock,
{
atomic64_set(&lock->state.counter, 0);
raw_spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
- INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
+ INIT_LIST_HEAD(&lock->wait_list);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
debug_check_no_locks_freed((void *) lock, sizeof(*lock));
lockdep_init_map(&lock->dep_map, name, key, 0);
@@ -146,6 +149,8 @@ do { \
bool six_trylock_##type(struct six_lock *); \
bool six_relock_##type(struct six_lock *, u32); \
int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\
+int six_lock_waiter_##type(struct six_lock *, struct six_lock_waiter *, \
+ six_lock_should_sleep_fn, void *); \
void six_unlock_##type(struct six_lock *);
__SIX_LOCK(read)
@@ -182,6 +187,13 @@ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
}
+static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p);
+}
+
static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
SIX_LOCK_DISPATCH(type, six_unlock, lock);
@@ -196,13 +208,11 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
void six_lock_wakeup_all(struct six_lock *);
-void six_lock_pcpu_free_rcu(struct six_lock *);
void six_lock_pcpu_free(struct six_lock *);
void six_lock_pcpu_alloc(struct six_lock *);
struct six_lock_count {
- unsigned read;
- unsigned intent;
+ unsigned n[3];
};
struct six_lock_count six_lock_counts(struct six_lock *);
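
The six_lock_count change above replaces the named read/intent fields with an array indexed by enum six_lock_type, so all three lock types are now counted uniformly. A minimal sketch of a consumer of the new layout; print_lock_counts() is a hypothetical helper, not part of this patch:

	/* Hypothetical consumer of the new six_lock_count layout:
	 * n[] is indexed directly by SIX_LOCK_read/intent/write. */
	static void print_lock_counts(struct six_lock *lock)
	{
		struct six_lock_count c = six_lock_counts(lock);

		printk(KERN_INFO "read %u intent %u write %u\n",
		       c.n[SIX_LOCK_read],
		       c.n[SIX_LOCK_intent],
		       c.n[SIX_LOCK_write]);
	}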
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index a18c59a3..ff5e6f7c 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -18,7 +18,7 @@
__entry->dst##_snapshot = (src).snapshot
DECLARE_EVENT_CLASS(bpos,
- TP_PROTO(struct bpos *p),
+ TP_PROTO(const struct bpos *p),
TP_ARGS(p),
TP_STRUCT__entry(
@@ -52,6 +52,31 @@ DECLARE_EVENT_CLASS(bkey,
__entry->offset, __entry->size)
);
+DECLARE_EVENT_CLASS(btree_node,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u8, level )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->level = b->c.level;
+ __entry->btree_id = b->c.btree_id;
+ TRACE_BPOS_assign(pos, b->key.k.p);
+ ),
+
+ TP_printk("%d,%d %u %s %llu:%llu:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->level,
+ bch2_btree_ids[__entry->btree_id],
+ __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
DECLARE_EVENT_CLASS(bch_fs,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c),
@@ -112,7 +137,7 @@ TRACE_EVENT(write_super,
/* io.c: */
-DEFINE_EVENT(bio, read_split,
+DEFINE_EVENT(bio, read_promote,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
@@ -122,12 +147,17 @@ DEFINE_EVENT(bio, read_bounce,
TP_ARGS(bio)
);
+DEFINE_EVENT(bio, read_split,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
DEFINE_EVENT(bio, read_retry,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
-DEFINE_EVENT(bio, promote,
+DEFINE_EVENT(bio, read_reuse_race,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
@@ -220,48 +250,68 @@ TRACE_EVENT(journal_reclaim_finish,
__entry->nr_flushed)
);
-/* allocator: */
-
/* bset.c: */
DEFINE_EVENT(bpos, bkey_pack_pos_fail,
- TP_PROTO(struct bpos *p),
+ TP_PROTO(const struct bpos *p),
TP_ARGS(p)
);
-/* Btree */
+/* Btree cache: */
-DECLARE_EVENT_CLASS(btree_node,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b),
+TRACE_EVENT(btree_cache_scan,
+ TP_PROTO(long nr_to_scan, long can_free, long ret),
+ TP_ARGS(nr_to_scan, can_free, ret),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u8, level )
- __field(u8, btree_id )
- TRACE_BPOS_entries(pos)
+ __field(long, nr_to_scan )
+ __field(long, can_free )
+ __field(long, ret )
),
TP_fast_assign(
- __entry->dev = c->dev;
- __entry->level = b->c.level;
- __entry->btree_id = b->c.btree_id;
- TRACE_BPOS_assign(pos, b->key.k.p);
+ __entry->nr_to_scan = nr_to_scan;
+ __entry->can_free = can_free;
+ __entry->ret = ret;
),
- TP_printk("%d,%d %u %s %llu:%llu:%u",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->level,
- bch2_btree_ids[__entry->btree_id],
- __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+ TP_printk("scanned for %li nodes, can free %li, ret %li",
+ __entry->nr_to_scan, __entry->can_free, __entry->ret)
);
-DEFINE_EVENT(btree_node, btree_read,
+DEFINE_EVENT(btree_node, btree_cache_reap,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-TRACE_EVENT(btree_write,
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
+ TP_PROTO(struct bch_fs *c),
+ TP_ARGS(c)
+);
+
+/* Btree */
+
+DEFINE_EVENT(btree_node, btree_node_read,
+ TP_PROTO(struct bch_fs *c, struct btree *b),
+ TP_ARGS(c, b)
+);
+
+TRACE_EVENT(btree_node_write,
TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
TP_ARGS(b, bytes, sectors),
@@ -291,31 +341,6 @@ DEFINE_EVENT(btree_node, btree_node_free,
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_node_reap,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
-);
-
TRACE_EVENT(btree_reserve_get_fail,
TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
@@ -323,7 +348,7 @@ TRACE_EVENT(btree_reserve_get_fail,
TP_ARGS(trans_fn, caller_ip, required),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(size_t, required )
),
@@ -340,52 +365,32 @@ TRACE_EVENT(btree_reserve_get_fail,
__entry->required)
);
-DEFINE_EVENT(btree_node, btree_split,
+DEFINE_EVENT(btree_node, btree_node_compact,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_compact,
+DEFINE_EVENT(btree_node, btree_node_merge,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_merge,
+DEFINE_EVENT(btree_node, btree_node_split,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_rewrite,
+DEFINE_EVENT(btree_node, btree_node_rewrite,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(btree_node, btree_set_root,
+DEFINE_EVENT(btree_node, btree_node_set_root,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-TRACE_EVENT(btree_cache_scan,
- TP_PROTO(long nr_to_scan, long can_free, long ret),
- TP_ARGS(nr_to_scan, can_free, ret),
-
- TP_STRUCT__entry(
- __field(long, nr_to_scan )
- __field(long, can_free )
- __field(long, ret )
- ),
-
- TP_fast_assign(
- __entry->nr_to_scan = nr_to_scan;
- __entry->can_free = can_free;
- __entry->ret = ret;
- ),
-
- TP_printk("scanned for %li nodes, can free %li, ret %li",
- __entry->nr_to_scan, __entry->can_free, __entry->ret)
-);
-
-TRACE_EVENT(btree_node_relock_fail,
+TRACE_EVENT(btree_path_relock_fail,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
struct btree_path *path,
@@ -393,26 +398,31 @@ TRACE_EVENT(btree_node_relock_fail,
TP_ARGS(trans, caller_ip, path, level),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, btree_id )
TRACE_BPOS_entries(pos)
- __field(unsigned long, node )
+ __array(char, node, 24 )
__field(u32, iter_lock_seq )
__field(u32, node_lock_seq )
),
TP_fast_assign(
+ struct btree *b = btree_path_node(path, level);
+
strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->btree_id = path->btree_id;
TRACE_BPOS_assign(pos, path->pos);
- __entry->node = (unsigned long) btree_path_node(path, level);
+ if (IS_ERR(b))
+ strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
+ else
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
__entry->iter_lock_seq = path->l[level].lock_seq;
__entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u, node %s iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
bch2_btree_ids[__entry->btree_id],
@@ -424,7 +434,7 @@ TRACE_EVENT(btree_node_relock_fail,
__entry->node_lock_seq)
);
-TRACE_EVENT(btree_node_upgrade_fail,
+TRACE_EVENT(btree_path_upgrade_fail,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
struct btree_path *path,
@@ -432,7 +442,7 @@ TRACE_EVENT(btree_node_upgrade_fail,
TP_ARGS(trans, caller_ip, path, level),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, btree_id )
TRACE_BPOS_entries(pos)
@@ -452,12 +462,12 @@ TRACE_EVENT(btree_node_upgrade_fail,
TRACE_BPOS_assign(pos, path->pos);
__entry->locked = btree_node_locked(path, level);
- c = bch2_btree_node_lock_counts(trans, NULL, path->l[level].b, level),
- __entry->self_read_count = c.read;
- __entry->self_intent_count = c.intent;
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level),
+ __entry->self_read_count = c.n[SIX_LOCK_read];
+ __entry->self_intent_count = c.n[SIX_LOCK_intent];
c = six_lock_counts(&path->l[level].b->c.lock);
- __entry->read_count = c.read;
- __entry->intent_count = c.intent;
+ __entry->read_count = c.n[SIX_LOCK_read];
+ __entry->intent_count = c.n[SIX_LOCK_intent];
),
TP_printk("%s %pS btree %s pos %llu:%llu:%u, locked %u held %u:%u lock count %u:%u",
@@ -599,7 +609,7 @@ TRACE_EVENT(discard_buckets,
__entry->err)
);
-TRACE_EVENT(invalidate_bucket,
+TRACE_EVENT(bucket_invalidate,
TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
TP_ARGS(c, dev, bucket, sectors),
@@ -625,17 +635,27 @@ TRACE_EVENT(invalidate_bucket,
/* Moving IO */
-DEFINE_EVENT(bkey, move_extent,
+DEFINE_EVENT(bkey, move_extent_read,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k)
+);
+
+DEFINE_EVENT(bkey, move_extent_write,
+ TP_PROTO(const struct bkey *k),
+ TP_ARGS(k)
+);
+
+DEFINE_EVENT(bkey, move_extent_finish,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
-DEFINE_EVENT(bkey, move_alloc_mem_fail,
+DEFINE_EVENT(bkey, move_extent_race,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
-DEFINE_EVENT(bkey, move_race,
+DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
TP_PROTO(const struct bkey *k),
TP_ARGS(k)
);
@@ -714,13 +734,15 @@ TRACE_EVENT(copygc_wait,
__entry->wait_amount, __entry->until)
);
+/* btree transactions: */
+
DECLARE_EVENT_CLASS(transaction_event,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
),
@@ -738,7 +760,7 @@ DEFINE_EVENT(transaction_event, transaction_commit,
TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_event, transaction_restart_injected,
+DEFINE_EVENT(transaction_event, trans_restart_injected,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip)
@@ -756,10 +778,28 @@ DEFINE_EVENT(transaction_event, trans_restart_journal_res_get,
TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_event, trans_restart_journal_preres_get,
+
+TRACE_EVENT(trans_restart_journal_preres_get,
TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+ unsigned long caller_ip,
+ unsigned flags),
+ TP_ARGS(trans, caller_ip, flags),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(unsigned, flags )
+ ),
+
+ TP_fast_assign(
+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("%s %pS %x", __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->flags)
);
DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim,
@@ -805,7 +845,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
TP_ARGS(trans, caller_ip, path),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, btree_id )
TRACE_BPOS_entries(pos)
@@ -883,7 +923,7 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
TP_ARGS(trans, caller_ip, path)
);
-DEFINE_EVENT(transaction_event, transaction_restart_key_cache_upgrade,
+DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip)
@@ -935,7 +975,7 @@ TRACE_EVENT(trans_restart_would_deadlock,
have, want, want_pos),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, in_traverse_all )
__field(u8, reason )
@@ -982,7 +1022,7 @@ TRACE_EVENT(trans_restart_would_deadlock_write,
TP_ARGS(trans),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
),
TP_fast_assign(
@@ -999,7 +1039,7 @@ TRACE_EVENT(trans_restart_mem_realloced,
TP_ARGS(trans, caller_ip, bytes),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(unsigned long, bytes )
),
@@ -1025,7 +1065,7 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
TP_STRUCT__entry(
- __array(char, trans_fn, 24 )
+ __array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(enum btree_id, btree_id )
TRACE_BPOS_entries(pos)
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 2281b8d4..d0d7690a 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -1217,8 +1217,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
if (ret)
goto out;
- trace_invalidate_bucket(c, bucket.inode, bucket.offset, cached_sectors);
- this_cpu_inc(c->counters[BCH_COUNTER_bucket_invalidate]);
+ trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
--*nr_to_invalidate;
out:
bch2_trans_iter_exit(trans, &alloc_iter);
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index c57baa1f..dce227c5 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -268,7 +268,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
spin_unlock(&c->freelist_lock);
- trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]);
+ trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve]);
return ob;
}
@@ -575,20 +575,19 @@ err:
if (!ob)
ob = ERR_PTR(-BCH_ERR_no_buckets_found);
- if (IS_ERR(ob)) {
- trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve],
- usage.d[BCH_DATA_free].buckets,
- avail,
- bch2_copygc_wait_amount(c),
- c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
- buckets_seen,
- skipped_open,
- skipped_need_journal_commit,
- skipped_nouse,
- cl == NULL,
- bch2_err_str(PTR_ERR(ob)));
- atomic_long_inc(&c->bucket_alloc_fail);
- }
+ if (IS_ERR(ob))
+ trace_and_count(c, bucket_alloc_fail,
+ ca, bch2_alloc_reserves[reserve],
+ usage.d[BCH_DATA_free].buckets,
+ avail,
+ bch2_copygc_wait_amount(c),
+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
+ buckets_seen,
+ skipped_open,
+ skipped_need_journal_commit,
+ skipped_nouse,
+ cl == NULL,
+ bch2_err_str(PTR_ERR(ob)));
return ob;
}
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index a5bf8087..53e7b5a0 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -212,6 +212,12 @@
#define dynamic_fault(...) 0
#define race_fault(...) 0
+#define trace_and_count(_c, _name, ...) \
+do { \
+ this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \
+ trace_##_name(__VA_ARGS__); \
+} while (0)
+
#define bch2_fs_init_fault(name) \
dynamic_fault("bcachefs:bch_fs_init:" name)
#define bch2_meta_read_fault(name) \
@@ -329,9 +335,6 @@ BCH_DEBUG_PARAMS_DEBUG()
x(btree_interior_update_foreground) \
x(btree_interior_update_total) \
x(btree_gc) \
- x(btree_lock_contended_read) \
- x(btree_lock_contended_intent) \
- x(btree_lock_contended_write) \
x(data_write) \
x(data_read) \
x(data_promote) \
@@ -535,6 +538,7 @@ struct btree_transaction_stats {
struct mutex lock;
struct time_stats lock_hold_times;
unsigned nr_max_paths;
+ unsigned max_mem;
char *max_paths_text;
};
@@ -917,12 +921,6 @@ struct bch_fs {
u64 last_bucket_seq_cleanup;
- /* TODO rewrite as counters - The rest of this all shows up in sysfs */
- atomic_long_t read_realloc_races;
- atomic_long_t extent_migrate_done;
- atomic_long_t extent_migrate_raced;
- atomic_long_t bucket_alloc_fail;
-
u64 counters_on_mount[BCH_COUNTER_NR];
u64 __percpu *counters;
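
The trace_and_count() macro added above is what makes the ad-hoc atomic_long_t counters deleted here redundant: each call bumps the per-cpu persistent counter and fires the tracepoint of the same name in one step. For one call site from this patch, the expansion is roughly:

	/* trace_and_count(c, bucket_invalidate, c, bucket.inode,
	 *                 bucket.offset, cached_sectors) expands to: */
	do {
		this_cpu_inc((c)->counters[BCH_COUNTER_bucket_invalidate]);
		trace_bucket_invalidate(c, bucket.inode, bucket.offset,
					cached_sectors);
	} while (0);

This also explains the tracepoint renames throughout the patch (btree_read becomes btree_node_read, invalidate_bucket becomes bucket_invalidate, and so on): the token pasted into BCH_COUNTER_##_name must match an entry in the persistent counters list below.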
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 147fde14..7730e955 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1337,12 +1337,81 @@ struct bch_sb_field_disk_groups {
/* BCH_SB_FIELD_counters */
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0) \
- x(io_write, 1) \
- x(io_move, 2) \
- x(bucket_invalidate, 3) \
- x(bucket_discard, 4)
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0) \
+ x(io_write, 1) \
+ x(io_move, 2) \
+ x(bucket_invalidate, 3) \
+ x(bucket_discard, 4) \
+ x(bucket_alloc, 5) \
+ x(bucket_alloc_fail, 6) \
+ x(btree_cache_scan, 7) \
+ x(btree_cache_reap, 8) \
+ x(btree_cache_cannibalize, 9) \
+ x(btree_cache_cannibalize_lock, 10) \
+ x(btree_cache_cannibalize_lock_fail, 11) \
+ x(btree_cache_cannibalize_unlock, 12) \
+ x(btree_node_write, 13) \
+ x(btree_node_read, 14) \
+ x(btree_node_compact, 15) \
+ x(btree_node_merge, 16) \
+ x(btree_node_split, 17) \
+ x(btree_node_rewrite, 18) \
+ x(btree_node_alloc, 19) \
+ x(btree_node_free, 20) \
+ x(btree_node_set_root, 21) \
+ x(btree_path_relock_fail, 22) \
+ x(btree_path_upgrade_fail, 23) \
+ x(btree_reserve_get_fail, 24) \
+ x(journal_entry_full, 25) \
+ x(journal_full, 26) \
+ x(journal_reclaim_finish, 27) \
+ x(journal_reclaim_start, 28) \
+ x(journal_write, 29) \
+ x(read_promote, 30) \
+ x(read_bounce, 31) \
+ x(read_split, 33) \
+ x(read_retry, 32) \
+ x(read_reuse_race, 34) \
+ x(move_extent_read, 35) \
+ x(move_extent_write, 36) \
+ x(move_extent_finish, 37) \
+ x(move_extent_race, 38) \
+ x(move_extent_alloc_mem_fail, 39) \
+ x(copygc, 40) \
+ x(copygc_wait, 41) \
+ x(gc_gens_end, 42) \
+ x(gc_gens_start, 43) \
+ x(trans_blocked_journal_reclaim, 44) \
+ x(trans_restart_btree_node_reused, 45) \
+ x(trans_restart_btree_node_split, 46) \
+ x(trans_restart_fault_inject, 47) \
+ x(trans_restart_iter_upgrade, 48) \
+ x(trans_restart_journal_preres_get, 49) \
+ x(trans_restart_journal_reclaim, 50) \
+ x(trans_restart_journal_res_get, 51) \
+ x(trans_restart_key_cache_key_realloced, 52) \
+ x(trans_restart_key_cache_raced, 53) \
+ x(trans_restart_mark_replicas, 54) \
+ x(trans_restart_mem_realloced, 55) \
+ x(trans_restart_memory_allocation_failure, 56) \
+ x(trans_restart_relock, 57) \
+ x(trans_restart_relock_after_fill, 58) \
+ x(trans_restart_relock_key_cache_fill, 59) \
+ x(trans_restart_relock_next_node, 60) \
+ x(trans_restart_relock_parent_for_fill, 61) \
+ x(trans_restart_relock_path, 62) \
+ x(trans_restart_relock_path_intent, 63) \
+ x(trans_restart_too_many_iters, 64) \
+ x(trans_restart_traverse, 65) \
+ x(trans_restart_upgrade, 66) \
+ x(trans_restart_would_deadlock, 67) \
+ x(trans_restart_would_deadlock_write, 68) \
+ x(trans_restart_injected, 69) \
+ x(trans_restart_key_cache_upgrade, 70) \
+ x(trans_traverse_all, 71) \
+ x(transaction_commit, 72) \
+ x(write_super, 73)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
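
The counter list above is a standard x-macro: each x(name, nr) entry expands here into the BCH_COUNTER_* enum, and again wherever the same list is needed as, e.g., a string table. The hunk is cut off, but the enum plausibly closes as in this sketch:

	/* Sketch of the x-macro expansion pattern (the closing lines
	 * fall outside the hunk shown above): */
	enum bch_persistent_counters {
	#define x(t, n, ...) BCH_COUNTER_##t,
		BCH_PERSISTENT_COUNTERS()
	#undef x
		BCH_COUNTER_NR
	};
	/* BCH_COUNTER_io_read == 0, BCH_COUNTER_io_write == 1, ... */

Because these counters persist in the superblock, entries are appended rather than renumbered; the out-of-numeric-order read_split/read_retry pair appears in the source in exactly this order.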
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 5a6c93d1..dabdb25c 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -14,8 +14,6 @@
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
-struct lock_class_key bch2_btree_node_lock_key;
-
const char * const bch2_btree_node_flags[] = {
#define x(f) #f,
BTREE_FLAGS()
@@ -254,7 +252,7 @@ wait_on_io:
}
out:
if (b->hash_val && !ret)
- trace_btree_node_reap(c, b);
+ trace_and_count(c, btree_cache_reap, c, b);
return ret;
out_unlock:
six_unlock_write(&b->c.lock);
@@ -378,7 +376,7 @@ out:
ret = freed;
memalloc_nofs_restore(flags);
out_norestore:
- trace_btree_cache_scan(sc->nr_to_scan, can_free, ret);
+ trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
return ret;
}
@@ -514,7 +512,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
struct btree_cache *bc = &c->btree_cache;
if (bc->alloc_lock == current) {
- trace_btree_node_cannibalize_unlock(c);
+ trace_and_count(c, btree_cache_cannibalize_unlock, c);
bc->alloc_lock = NULL;
closure_wake_up(&bc->alloc_wait);
}
@@ -530,7 +528,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
if (!cl) {
- trace_btree_node_cannibalize_lock_fail(c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
return -ENOMEM;
}
@@ -544,11 +542,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
}
- trace_btree_node_cannibalize_lock_fail(c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
return -EAGAIN;
success:
- trace_btree_node_cannibalize_lock(c);
+ trace_and_count(c, btree_cache_cannibalize_lock, c);
return 0;
}
@@ -672,7 +670,7 @@ err_locked:
mutex_unlock(&bc->lock);
- trace_btree_node_cannibalize(c);
+ trace_and_count(c, btree_cache_cannibalize, c);
goto out;
}
@@ -701,7 +699,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
* been freed:
*/
if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
- trace_trans_restart_relock_parent_for_fill(trans, _THIS_IP_, path);
+ trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
}
@@ -709,7 +707,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
if (trans && b == ERR_PTR(-ENOMEM)) {
trans->memory_allocation_failure = true;
- trace_trans_restart_memory_allocation_failure(trans, _THIS_IP_, path);
+ trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
}
@@ -758,7 +756,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
if (!six_relock_type(&b->c.lock, lock_type, seq)) {
if (trans)
- trace_trans_restart_relock_after_fill(trans, _THIS_IP_, path);
+ trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
}
@@ -896,7 +894,7 @@ lock_node:
if (btree_node_read_locked(path, level + 1))
btree_node_unlock(trans, path, level + 1);
- ret = btree_node_lock(trans, path, b, k->k.p, level, lock_type,
+ ret = btree_node_lock(trans, path, &b->c, k->k.p, level, lock_type,
lock_node_check_fn, (void *) k, trace_ip);
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused))
@@ -913,7 +911,7 @@ lock_node:
if (bch2_btree_node_relock(trans, path, level + 1))
goto retry;
- trace_trans_restart_btree_node_reused(trans, trace_ip, path);
+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
}
}
@@ -969,12 +967,13 @@ lock_node:
return b;
}
-struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
const struct bkey_i *k,
enum btree_id btree_id,
unsigned level,
bool nofill)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
struct bset_tree *t;
@@ -1008,9 +1007,14 @@ retry:
goto out;
} else {
lock_node:
- ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
- if (ret)
- goto retry;
+ ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read);
+ if (unlikely(ret)) {
+ if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused))
+ goto retry;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ERR_PTR(ret);
+ BUG();
+ }
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
b->c.btree_id != btree_id ||
@@ -1072,8 +1076,9 @@ int bch2_btree_node_prefetch(struct bch_fs *c,
return PTR_ERR_OR_ZERO(b);
}
-void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
+void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
@@ -1089,8 +1094,8 @@ wait_on_io:
__bch2_btree_node_wait_on_read(b);
__bch2_btree_node_wait_on_write(b);
- six_lock_intent(&b->c.lock, NULL, NULL);
- six_lock_write(&b->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
if (btree_node_dirty(b)) {
__bch2_btree_node_write(c, b, 0);
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index 25906127..a4df3e86 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -5,8 +5,6 @@
#include "bcachefs.h"
#include "btree_types.h"
-extern struct lock_class_key bch2_btree_node_lock_key;
-
extern const char * const bch2_btree_node_flags[];
struct btree_iter;
@@ -28,13 +26,13 @@ struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
const struct bkey_i *, unsigned,
enum six_lock_type, unsigned long);
-struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
+struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *,
enum btree_id, unsigned, bool);
int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *,
const struct bkey_i *, enum btree_id, unsigned);
-void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);
+void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *);
void bch2_fs_btree_cache_exit(struct bch_fs *);
int bch2_fs_btree_cache_init(struct bch_fs *);
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 2f563365..663c66d0 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -165,10 +165,11 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
}
}
-static void bch2_btree_node_update_key_early(struct bch_fs *c,
+static void bch2_btree_node_update_key_early(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_i *new)
{
+ struct bch_fs *c = trans->c;
struct btree *b;
struct bkey_buf tmp;
int ret;
@@ -176,7 +177,7 @@ static void bch2_btree_node_update_key_early(struct bch_fs *c,
bch2_bkey_buf_init(&tmp);
bch2_bkey_buf_reassemble(&tmp, c, old);
- b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true);
+ b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
if (!IS_ERR_OR_NULL(b)) {
mutex_lock(&c->btree_cache.lock);
@@ -352,8 +353,9 @@ fsck_err:
return ret;
}
-static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
{
+ struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf prev_k, cur_k;
@@ -378,7 +380,7 @@ again:
bch2_btree_and_journal_iter_advance(&iter);
bch2_bkey_buf_reassemble(&cur_k, c, k);
- cur = bch2_btree_node_get_noiter(c, cur_k.k,
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(cur);
@@ -392,7 +394,7 @@ again:
bch2_btree_ids[b->c.btree_id],
b->c.level - 1,
buf.buf)) {
- bch2_btree_node_evict(c, cur_k.k);
+ bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
cur = NULL;
@@ -411,7 +413,7 @@ again:
if (ret == DROP_THIS_NODE) {
six_unlock_read(&cur->c.lock);
- bch2_btree_node_evict(c, cur_k.k);
+ bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
cur = NULL;
@@ -425,7 +427,7 @@ again:
prev = NULL;
if (ret == DROP_PREV_NODE) {
- bch2_btree_node_evict(c, prev_k.k);
+ bch2_btree_node_evict(trans, prev_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, prev_k.k->k.p);
if (ret)
@@ -465,7 +467,7 @@ again:
bch2_bkey_buf_reassemble(&cur_k, c, k);
bch2_btree_and_journal_iter_advance(&iter);
- cur = bch2_btree_node_get_noiter(c, cur_k.k,
+ cur = bch2_btree_node_get_noiter(trans, cur_k.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(cur);
@@ -476,12 +478,12 @@ again:
goto err;
}
- ret = bch2_btree_repair_topology_recurse(c, cur);
+ ret = bch2_btree_repair_topology_recurse(trans, cur);
six_unlock_read(&cur->c.lock);
cur = NULL;
if (ret == DROP_THIS_NODE) {
- bch2_btree_node_evict(c, cur_k.k);
+ bch2_btree_node_evict(trans, cur_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
dropped_children = true;
@@ -522,18 +524,21 @@ fsck_err:
static int bch2_repair_topology(struct bch_fs *c)
{
+ struct btree_trans trans;
struct btree *b;
unsigned i;
int ret = 0;
+ bch2_trans_init(&trans, c, 0, 0);
+
for (i = 0; i < BTREE_ID_NR && !ret; i++) {
b = c->btree_roots[i].b;
if (btree_node_fake(b))
continue;
- six_lock_read(&b->c.lock, NULL, NULL);
- ret = bch2_btree_repair_topology_recurse(c, b);
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ ret = bch2_btree_repair_topology_recurse(&trans, b);
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
bch_err(c, "empty btree root - repair unimplemented");
@@ -541,13 +546,16 @@ static int bch2_repair_topology(struct bch_fs *c)
}
}
+ bch2_trans_exit(&trans);
+
return ret;
}
-static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id,
unsigned level, bool is_root,
struct bkey_s_c *k)
{
+ struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p = { 0 };
@@ -747,7 +755,7 @@ found:
}
if (level)
- bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new);
+ bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
if (c->opts.verbose) {
printbuf_reset(&buf);
@@ -788,7 +796,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
BUG_ON(bch2_journal_seq_verify &&
k->k->version.lo > atomic64_read(&c->journal.seq));
- ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k);
+ ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
if (ret)
goto err;
@@ -941,7 +949,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
bch2_bkey_buf_reassemble(&cur, c, k);
bch2_btree_and_journal_iter_advance(&iter);
- child = bch2_btree_node_get_noiter(c, cur.k,
+ child = bch2_btree_node_get_noiter(trans, cur.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(child);
@@ -1934,7 +1942,7 @@ int bch2_gc_gens(struct bch_fs *c)
if (!mutex_trylock(&c->gc_gens_lock))
return 0;
- trace_gc_gens_start(c);
+ trace_and_count(c, gc_gens_start, c);
down_read(&c->gc_lock);
bch2_trans_init(&trans, c, 0, 0);
@@ -1995,7 +2003,7 @@ int bch2_gc_gens(struct bch_fs *c)
c->gc_count++;
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
- trace_gc_gens_end(c);
+ trace_and_count(c, gc_gens_end, c);
err:
for_each_member_device(ca, c, i) {
kvfree(ca->oldest_gen);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 8aad87ea..177fd49d 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1490,7 +1490,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
struct bio *bio;
int ret;
- trace_btree_read(c, b);
+ trace_and_count(c, btree_node_read, c, b);
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
@@ -1657,9 +1657,15 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
{
- six_lock_read(&b->c.lock, NULL, NULL);
+ struct btree_trans trans;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
__btree_node_write_done(c, b);
six_unlock_read(&b->c.lock);
+
+ bch2_trans_exit(&trans);
}
static void btree_node_write_work(struct work_struct *work)
@@ -1979,7 +1985,7 @@ do_write:
c->opts.nochanges)
goto err;
- trace_btree_write(b, bytes_to_write, sectors_to_write);
+ trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
wbio = container_of(bio_alloc_bioset(NULL,
buf_pages(data, sectors_to_write << 9),
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 1d4b9fde..512c3b2b 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -130,443 +130,6 @@ static inline bool btree_path_pos_in_node(struct btree_path *path,
!btree_path_pos_after_node(path, b);
}
-/* Btree node locking: */
-
-void bch2_btree_node_unlock_write(struct btree_trans *trans,
- struct btree_path *path, struct btree *b)
-{
- bch2_btree_node_unlock_write_inlined(trans, path, b);
-}
-
-struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
- struct btree_path *skip,
- struct btree *b,
- unsigned level)
-{
- struct btree_path *path;
- struct six_lock_count ret = { 0, 0 };
-
- if (IS_ERR_OR_NULL(b))
- return ret;
-
- trans_for_each_path(trans, path)
- if (path != skip && path->l[level].b == b) {
- ret.read += btree_node_read_locked(path, level);
- ret.intent += btree_node_intent_locked(path, level);
- }
-
- return ret;
-}
-
-static inline void six_lock_readers_add(struct six_lock *lock, int nr)
-{
- if (!lock->readers)
- atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter);
- else
- this_cpu_add(*lock->readers, nr);
-}
-
-void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
-{
- int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->c.level).read;
-
- /*
- * Must drop our read locks before calling six_lock_write() -
- * six_unlock() won't do wakeups until the reader count
- * goes to 0, and it's safe because we have the node intent
- * locked:
- */
- six_lock_readers_add(&b->c.lock, -readers);
- six_lock_write(&b->c.lock, NULL, NULL);
- six_lock_readers_add(&b->c.lock, readers);
-}
-
-bool __bch2_btree_node_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree *b = btree_path_node(path, level);
- int want = __btree_lock_want(path, level);
-
- if (!is_btree_node(path, level))
- goto fail;
-
- if (race_fault())
- goto fail;
-
- if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
- (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, b, level, want))) {
- mark_btree_node_locked(trans, path, level, want);
- return true;
- }
-fail:
- if (b != ERR_PTR(-BCH_ERR_no_btree_node_cached) &&
- b != ERR_PTR(-BCH_ERR_no_btree_node_init))
- trace_btree_node_relock_fail(trans, _RET_IP_, path, level);
- return false;
-}
-
-bool bch2_btree_node_upgrade(struct btree_trans *trans,
- struct btree_path *path, unsigned level)
-{
- struct btree *b = path->l[level].b;
-
- if (!is_btree_node(path, level))
- return false;
-
- switch (btree_lock_want(path, level)) {
- case BTREE_NODE_UNLOCKED:
- BUG_ON(btree_node_locked(path, level));
- return true;
- case BTREE_NODE_READ_LOCKED:
- BUG_ON(btree_node_intent_locked(path, level));
- return bch2_btree_node_relock(trans, path, level);
- case BTREE_NODE_INTENT_LOCKED:
- break;
- }
-
- if (btree_node_intent_locked(path, level))
- return true;
-
- if (race_fault())
- return false;
-
- if (btree_node_locked(path, level)
- ? six_lock_tryupgrade(&b->c.lock)
- : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
- goto success;
-
- if (btree_node_lock_seq_matches(path, b, level) &&
- btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
- btree_node_unlock(trans, path, level);
- goto success;
- }
-
- trace_btree_node_upgrade_fail(trans, _RET_IP_, path, level);
- return false;
-success:
- mark_btree_node_intent_locked(trans, path, level);
- return true;
-}
-
-static inline bool btree_path_get_locks(struct btree_trans *trans,
- struct btree_path *path,
- bool upgrade)
-{
- unsigned l = path->level;
- int fail_idx = -1;
-
- do {
- if (!btree_path_node(path, l))
- break;
-
- if (!(upgrade
- ? bch2_btree_node_upgrade(trans, path, l)
- : bch2_btree_node_relock(trans, path, l)))
- fail_idx = l;
-
- l++;
- } while (l < path->locks_want);
-
- /*
- * When we fail to get a lock, we have to ensure that any child nodes
- * can't be relocked so bch2_btree_path_traverse has to walk back up to
- * the node that we failed to relock:
- */
- if (fail_idx >= 0) {
- __bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-
- do {
- path->l[fail_idx].b = upgrade
- ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
- : ERR_PTR(-BCH_ERR_no_btree_node_relock);
- --fail_idx;
- } while (fail_idx >= 0);
- }
-
- if (path->uptodate == BTREE_ITER_NEED_RELOCK)
- path->uptodate = BTREE_ITER_UPTODATE;
-
- bch2_trans_verify_locks(trans);
-
- return path->uptodate < BTREE_ITER_NEED_RELOCK;
-}
-
-static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
- bool cached)
-{
- return !cached
- ? container_of(_b, struct btree, c)->key.k.p
- : container_of(_b, struct bkey_cached, c)->key.pos;
-}
-
-/* Slowpath: */
-int __bch2_btree_node_lock(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct bpos pos, unsigned level,
- enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- struct btree_path *linked;
- unsigned reason;
-
- /* Check if it's safe to block: */
- trans_for_each_path(trans, linked) {
- if (!linked->nodes_locked)
- continue;
-
- /*
- * Can't block taking an intent lock if we have _any_ nodes read
- * locked:
- *
- * - Our read lock blocks another thread with an intent lock on
- * the same node from getting a write lock, and thus from
- * dropping its intent lock
- *
- * - And the other thread may have multiple nodes intent locked:
- * both the node we want to intent lock, and the node we
- * already have read locked - deadlock:
- */
- if (type == SIX_LOCK_intent &&
- linked->nodes_locked != linked->nodes_intent_locked) {
- reason = 1;
- goto deadlock;
- }
-
- if (linked->btree_id != path->btree_id) {
- if (linked->btree_id < path->btree_id)
- continue;
-
- reason = 3;
- goto deadlock;
- }
-
- /*
- * Within the same btree, non-cached paths come before cached
- * paths:
- */
- if (linked->cached != path->cached) {
- if (!linked->cached)
- continue;
-
- reason = 4;
- goto deadlock;
- }
-
- /*
- * Interior nodes must be locked before their descendants: if
- * another path has possible descendants locked of the node
- * we're about to lock, it must have the ancestors locked too:
- */
- if (level > __fls(linked->nodes_locked)) {
- reason = 5;
- goto deadlock;
- }
-
- /* Must lock btree nodes in key order: */
- if (btree_node_locked(linked, level) &&
- bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
- linked->cached)) <= 0) {
- reason = 7;
- goto deadlock;
- }
- }
-
- return btree_node_lock_type(trans, path, b, pos, level,
- type, should_sleep_fn, p);
-deadlock:
- trace_trans_restart_would_deadlock(trans, ip, reason, linked, path, &pos);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
-}
-
-/* Btree iterator locking: */
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_btree_path_verify_locks(struct btree_path *path)
-{
- unsigned l;
-
- if (!path->nodes_locked) {
- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
- btree_path_node(path, path->level));
- return;
- }
-
- for (l = 0; btree_path_node(path, l); l++)
- BUG_ON(btree_lock_want(path, l) !=
- btree_node_locked_type(path, l));
-}
-
-void bch2_trans_verify_locks(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- trans_for_each_path(trans, path)
- bch2_btree_path_verify_locks(path);
-}
-#else
-static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
-#endif
-
-/* Btree path locking: */
-
-/*
- * Only for btree_cache.c - only relocks intent locks
- */
-int bch2_btree_path_relock_intent(struct btree_trans *trans,
- struct btree_path *path)
-{
- unsigned l;
-
- for (l = path->level;
- l < path->locks_want && btree_path_node(path, l);
- l++) {
- if (!bch2_btree_node_relock(trans, path, l)) {
- __bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- trace_trans_restart_relock_path_intent(trans, _RET_IP_, path);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
- }
- }
-
- return 0;
-}
-
-__flatten
-static bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
-{
- return btree_path_get_locks(trans, path, false);
-}
-
-static int bch2_btree_path_relock(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
-{
- if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
- trace_trans_restart_relock_path(trans, trace_ip, path);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
- }
-
- return 0;
-}
-
-bool __bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- struct btree_path *linked;
-
- EBUG_ON(path->locks_want >= new_locks_want);
-
- path->locks_want = new_locks_want;
-
- if (btree_path_get_locks(trans, path, true))
- return true;
-
- /*
- * XXX: this is ugly - we'd prefer to not be mucking with other
- * iterators in the btree_trans here.
- *
- * On failure to upgrade the iterator, setting iter->locks_want and
- * calling get_locks() is sufficient to make bch2_btree_path_traverse()
- * get the locks we want on transaction restart.
- *
- * But if this iterator was a clone, on transaction restart what we did
- * to this iterator isn't going to be preserved.
- *
- * Possibly we could add an iterator field for the parent iterator when
- * an iterator is a copy - for now, we'll just upgrade any other
- * iterators with the same btree id.
- *
- * The code below used to be needed to ensure ancestor nodes get locked
- * before interior nodes - now that's handled by
- * bch2_btree_path_traverse_all().
- */
- if (!path->cached && !trans->in_traverse_all)
- trans_for_each_path(trans, linked)
- if (linked != path &&
- linked->cached == path->cached &&
- linked->btree_id == path->btree_id &&
- linked->locks_want < new_locks_want) {
- linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true);
- }
-
- return false;
-}
-
-void __bch2_btree_path_downgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- unsigned l;
-
- EBUG_ON(path->locks_want < new_locks_want);
-
- path->locks_want = new_locks_want;
-
- while (path->nodes_locked &&
- (l = __fls(path->nodes_locked)) >= path->locks_want) {
- if (l > path->level) {
- btree_node_unlock(trans, path, l);
- } else {
- if (btree_node_intent_locked(path, l)) {
- six_lock_downgrade(&path->l[l].b->c.lock);
- path->nodes_intent_locked ^= 1 << l;
- }
- break;
- }
- }
-
- bch2_btree_path_verify_locks(path);
-}
-
-void bch2_trans_downgrade(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- trans_for_each_path(trans, path)
- bch2_btree_path_downgrade(trans, path);
-}
-
-/* Btree transaction locking: */
-
-int bch2_trans_relock(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- if (unlikely(trans->restarted))
- return -BCH_ERR_transaction_restart_relock;
-
- trans_for_each_path(trans, path)
- if (path->should_be_locked &&
- bch2_btree_path_relock(trans, path, _RET_IP_)) {
- trace_trans_restart_relock(trans, _RET_IP_, path);
- BUG_ON(!trans->restarted);
- return -BCH_ERR_transaction_restart_relock;
- }
- return 0;
-}
-
-void bch2_trans_unlock(struct btree_trans *trans)
-{
- struct btree_path *path;
-
- trans_for_each_path(trans, path)
- __bch2_btree_path_unlock(trans, path);
-
- /*
- * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking
- * btree nodes, it implements its own walking:
- */
- BUG_ON(!trans->is_initial_gc &&
- lock_class_is_held(&bch2_btree_node_lock_key));
-}
-
/* Btree iterator: */
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -795,7 +358,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
if (cmp < 0)
continue;
- if (!(path->nodes_locked & 1) ||
+ if (!btree_node_locked(path, 0) ||
!path->should_be_locked)
continue;
@@ -1161,13 +724,13 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
struct btree_path *path;
trans_for_each_path(trans, path)
- if (!path->cached &&
+ if (path->uptodate == BTREE_ITER_UPTODATE &&
+ !path->cached &&
btree_path_pos_in_node(path, b)) {
enum btree_node_locked_type t =
btree_lock_want(path, b->c.level);
- if (path->nodes_locked &&
- t != BTREE_NODE_UNLOCKED) {
+ if (t != BTREE_NODE_UNLOCKED) {
btree_node_unlock(trans, path, b->c.level);
six_lock_increment(&b->c.lock, t);
mark_btree_node_locked(trans, path, b->c.level, t);
@@ -1232,7 +795,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
}
lock_type = __btree_lock_want(path, path->level);
- ret = btree_node_lock(trans, path, b, SPOS_MAX,
+ ret = btree_node_lock(trans, path, &b->c, SPOS_MAX,
path->level, lock_type,
lock_root_check_fn, rootp,
trace_ip);
@@ -1517,7 +1080,7 @@ err:
trans->in_traverse_all = false;
- trace_trans_traverse_all(trans, trace_ip);
+ trace_and_count(c, trans_traverse_all, trans, trace_ip);
return ret;
}
@@ -1654,7 +1217,7 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
u64 mask = ~(~0ULL << restart_probability_bits);
if ((prandom_u32() & mask) == mask) {
- trace_transaction_restart_injected(trans, _RET_IP_);
+ trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
}
}
@@ -1956,7 +1519,6 @@ static struct btree_path *btree_path_alloc(struct btree_trans *trans,
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
- path->nodes_intent_locked = 0;
btree_path_list_add(trans, pos, path);
return path;
@@ -2006,7 +1568,6 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
path->level = level;
path->locks_want = locks_want;
path->nodes_locked = 0;
- path->nodes_intent_locked = 0;
for (i = 0; i < ARRAY_SIZE(path->l); i++)
path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -2030,10 +1591,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
*/
locks_want = min(locks_want, BTREE_MAX_DEPTH);
- if (locks_want > path->locks_want) {
- path->locks_want = locks_want;
- btree_path_get_locks(trans, path, true);
- }
+ if (locks_want > path->locks_want)
+ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
return path;
}
@@ -2166,7 +1725,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- trace_trans_restart_relock_next_node(trans, _THIS_IP_, path);
+ trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
goto err;
}
@@ -3185,9 +2744,11 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
- size_t new_top = trans->mem_top + size;
+ unsigned new_top = trans->mem_top + size;
void *p;
+ trans->mem_max = max(trans->mem_max, new_top);
+
if (new_top > trans->mem_bytes) {
size_t old_bytes = trans->mem_bytes;
size_t new_bytes = roundup_pow_of_two(new_top);
@@ -3209,7 +2770,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
trans->mem_bytes = new_bytes;
if (old_bytes) {
- trace_trans_restart_mem_realloced(trans, _RET_IP_, new_bytes);
+ trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
}
}
@@ -3325,12 +2886,10 @@ static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct b
return i;
}
-void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
- unsigned expected_nr_iters,
- size_t expected_mem_bytes,
- const char *fn)
+void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn)
__acquires(&c->btree_trans_barrier)
{
+ struct btree_transaction_stats *s;
struct btree_trans *pos;
BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
@@ -3344,7 +2903,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
bch2_trans_alloc_paths(trans, c);
- if (expected_mem_bytes) {
+ s = btree_trans_stats(trans);
+ if (s) {
+ unsigned expected_mem_bytes = s->max_mem;
+
trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
@@ -3395,9 +2957,13 @@ void bch2_trans_exit(struct btree_trans *trans)
{
struct btree_insert_entry *i;
struct bch_fs *c = trans->c;
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
bch2_trans_unlock(trans);
+ if (s)
+ s->max_mem = max(s->max_mem, trans->mem_max);
+
trans_for_each_update(trans, i)
__btree_path_put(i->path, true);
trans->nr_updates = 0;
@@ -3444,12 +3010,23 @@ void bch2_trans_exit(struct btree_trans *trans)
static void __maybe_unused
bch2_btree_path_node_to_text(struct printbuf *out,
- struct btree_bkey_cached_common *b,
- bool cached)
+ struct btree_bkey_cached_common *b)
{
+ struct six_lock_count c = six_lock_counts(&b->lock);
+ struct task_struct *owner;
+ pid_t pid;
+
+ rcu_read_lock();
+ owner = READ_ONCE(b->lock.owner);
+ pid = owner ? owner->pid : 0;
+ rcu_read_unlock();
+
prt_printf(out, " l=%u %s:",
b->level, bch2_btree_ids[b->btree_id]);
- bch2_bpos_to_text(out, btree_node_pos(b, cached));
+ bch2_bpos_to_text(out, btree_node_pos(b));
+
+ prt_printf(out, " locks %u:%u:%u held by pid %u",
+ c.n[0], c.n[1], c.n[2], pid);
}
void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
@@ -3476,9 +3053,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
for (l = 0; l < BTREE_MAX_DEPTH; l++) {
if (btree_node_locked(path, l) &&
!IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
- prt_printf(out, " %s l=%u ",
- btree_node_intent_locked(path, l) ? "i" : "r", l);
- bch2_btree_path_node_to_text(out, b, path->cached);
+ prt_printf(out, " %c l=%u ",
+ lock_types[btree_node_locked_type(path, l)], l);
+ bch2_btree_path_node_to_text(out, b);
prt_printf(out, "\n");
}
}
@@ -3496,7 +3073,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
bch2_bpos_to_text(out, trans->locking_pos);
prt_printf(out, " node ");
- bch2_btree_path_node_to_text(out, b, path->cached);
+ bch2_btree_path_node_to_text(out, b);
prt_printf(out, "\n");
}
}
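
The btree_iter.c changes above add a small feedback loop for transaction memory: bch2_trans_kmalloc() records a high-water mark in trans->mem_max, bch2_trans_exit() folds it into the per-call-site struct btree_transaction_stats, and __bch2_trans_init() preallocates that much next time, avoiding the reallocation path that forces a transaction restart. A self-contained toy model of the loop (all names are illustrative, not bcachefs API):

	#include <stdlib.h>
	#include <string.h>

	struct site_stats { size_t max_mem; };	/* one per call site, persists */

	struct txn {
		struct site_stats *stats;
		char *mem;
		size_t mem_bytes, mem_top, mem_max;
	};

	static void txn_init(struct txn *t, struct site_stats *s)
	{
		memset(t, 0, sizeof(*t));
		t->stats = s;
		if (s->max_mem) {	/* preallocate last run's high-water mark */
			t->mem = malloc(s->max_mem);
			t->mem_bytes = t->mem ? s->max_mem : 0;
		}
	}

	static void *txn_alloc(struct txn *t, size_t size)
	{
		size_t new_top = t->mem_top + size;

		if (new_top > t->mem_max)
			t->mem_max = new_top;	/* record high-water mark */
		if (new_top > t->mem_bytes) {	/* grow: the case bcachefs
						 * must restart on */
			char *p = realloc(t->mem, new_top);
			if (!p)
				return NULL;
			t->mem = p;
			t->mem_bytes = new_top;
		}
		t->mem_top = new_top;
		return t->mem + t->mem_top - size;
	}

	static void txn_exit(struct txn *t)
	{
		if (t->mem_max > t->stats->max_mem)
			t->stats->max_mem = t->mem_max;	/* feeds the next init */
		free(t->mem);
	}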
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 6ad28ff6..7b47b880 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -145,12 +145,10 @@ struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_trans_verify_paths(struct btree_trans *);
-void bch2_trans_verify_locks(struct btree_trans *);
void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
struct bpos, bool);
#else
static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
-static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos, bool key_cache) {}
#endif
@@ -195,20 +193,6 @@ static inline int btree_trans_restart(struct btree_trans *trans, int err)
bool bch2_btree_node_upgrade(struct btree_trans *,
struct btree_path *, unsigned);
-bool __bch2_btree_path_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
-
-static inline bool bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want)
-{
- new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
-
- return path->locks_want < new_locks_want
- ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
- : path->uptodate == BTREE_ITER_UPTODATE;
-}
-
void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned);
static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
@@ -367,8 +351,8 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2) {
- trace_trans_restart_too_many_iters(trans, _THIS_IP_);
+ if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) {
+ trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
}
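Throughout this patch, bare trace_*() calls become trace_and_count(), which pairs the tracepoint with a per-filesystem event counter. A hedged sketch of what such a macro can look like — the counter enum, struct fs, and printf-based tracepoint below are illustrative stand-ins, not the exact bcachefs definitions:

#include <stdio.h>

/* illustrative event counters; bcachefs keeps these per struct bch_fs */
enum { COUNTER_trans_restart_too_many_iters, COUNTER_NR };

struct fs { unsigned long counters[COUNTER_NR]; };

/* toy tracepoint: a real kernel build would emit a trace event instead */
#define trace_trans_restart_too_many_iters(ip) \
	printf("trans_restart_too_many_iters at %p\n", (void *) (ip))

/* bump the counter, then emit the tracepoint with the remaining arguments */
#define trace_and_count(_c, _name, ...) do {		\
	(_c)->counters[COUNTER_##_name]++;		\
	trace_##_name(__VA_ARGS__);			\
} while (0)

int main(void)
{
	struct fs fs = { { 0 } };

	trace_and_count(&fs, trans_restart_too_many_iters, (unsigned long) &fs);
	printf("count = %lu\n", fs.counters[COUNTER_trans_restart_too_many_iters]);
	return 0;
}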
@@ -544,11 +528,10 @@ void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
-void __bch2_trans_init(struct btree_trans *, struct bch_fs *,
- unsigned, size_t, const char *);
+void __bch2_trans_init(struct btree_trans *, struct bch_fs *, const char *);
void bch2_trans_exit(struct btree_trans *);
-#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__)
+#define bch2_trans_init(_trans, _c, _nr_iters, _mem) __bch2_trans_init(_trans, _c, __func__)
void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 38b16f95..d900ff42 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -13,6 +13,11 @@
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
+static inline bool btree_uses_pcpu_readers(enum btree_id id)
+{
+ return id == BTREE_ID_subvolumes;
+}
+
static struct kmem_cache *bch2_key_cache;
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -84,7 +89,10 @@ static void bkey_cached_free(struct btree_key_cache *bc,
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
- list_move_tail(&ck->list, &bc->freed);
+ if (ck->c.lock.readers)
+ list_move_tail(&ck->list, &bc->freed_pcpu);
+ else
+ list_move_tail(&ck->list, &bc->freed_nonpcpu);
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
@@ -95,15 +103,51 @@ static void bkey_cached_free(struct btree_key_cache *bc,
six_unlock_intent(&ck->c.lock);
}
-static void bkey_cached_free_fast(struct btree_key_cache *bc,
- struct bkey_cached *ck)
+static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct btree_key_cache_freelist *f;
bool freed = false;
BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+ if (!ck->c.lock.readers) {
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ if (f->nr < ARRAY_SIZE(f->objs)) {
+ f->objs[f->nr++] = ck;
+ freed = true;
+ }
+ preempt_enable();
+
+ if (!freed) {
+ mutex_lock(&bc->lock);
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ while (f->nr > ARRAY_SIZE(f->objs) / 2) {
+ struct bkey_cached *ck2 = f->objs[--f->nr];
+
+ list_move_tail(&ck2->list, &bc->freed_nonpcpu);
+ }
+ preempt_enable();
+
+ list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ mutex_unlock(&bc->lock);
+ }
+ } else {
+ mutex_lock(&bc->lock);
+ list_move_tail(&ck->list, &bc->freed_pcpu);
+ mutex_unlock(&bc->lock);
+ }
+}
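bkey_cached_move_to_freelist() above implements a common two-level freelist: frees go into a small per-cpu array, and when the array fills up, half of it is spilled to a shared list under the mutex. A standalone, single-threaded sketch of that spill policy — a plain array and counters stand in for the per-cpu buffer and for bc->freed_nonpcpu:

#include <stdio.h>

#define PCPU_OBJS 16

struct freelist {
	void *pcpu[PCPU_OBJS];	/* fast path: per-cpu array in the real code */
	unsigned pcpu_nr;
	void *shared[64];	/* slow path: shared list, taken under a mutex */
	unsigned shared_nr;
};

static void free_obj(struct freelist *f, void *obj)
{
	if (f->pcpu_nr < PCPU_OBJS) {		/* fast path, no lock needed */
		f->pcpu[f->pcpu_nr++] = obj;
		return;
	}

	/* slow path: spill half the per-cpu array, then add the new object */
	while (f->pcpu_nr > PCPU_OBJS / 2)
		f->shared[f->shared_nr++] = f->pcpu[--f->pcpu_nr];
	f->shared[f->shared_nr++] = obj;
}

int main(void)
{
	struct freelist f = { .pcpu_nr = 0, .shared_nr = 0 };
	int objs[20];

	for (int i = 0; i < 20; i++)
		free_obj(&f, &objs[i]);

	printf("pcpu %u shared %u\n", f.pcpu_nr, f.shared_nr);	/* pcpu 11 shared 9 */
	return 0;
}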
+
+static void bkey_cached_free_fast(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
@@ -114,74 +158,84 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc,
ck->k = NULL;
ck->u64s = 0;
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- if (f->nr < ARRAY_SIZE(f->objs)) {
- f->objs[f->nr++] = ck;
- freed = true;
- }
- preempt_enable();
-
- if (!freed) {
- mutex_lock(&bc->lock);
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- while (f->nr > ARRAY_SIZE(f->objs) / 2) {
- struct bkey_cached *ck2 = f->objs[--f->nr];
-
- list_move_tail(&ck2->list, &bc->freed);
- }
- preempt_enable();
-
- list_move_tail(&ck->list, &bc->freed);
- mutex_unlock(&bc->lock);
- }
+ bkey_cached_move_to_freelist(bc, ck);
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
static struct bkey_cached *
-bkey_cached_alloc(struct btree_key_cache *c)
+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path)
{
+ struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck = NULL;
struct btree_key_cache_freelist *f;
+ bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
- preempt_disable();
- f = this_cpu_ptr(c->pcpu_freed);
- if (f->nr)
- ck = f->objs[--f->nr];
- preempt_enable();
-
- if (!ck) {
- mutex_lock(&c->lock);
+ if (!pcpu_readers) {
preempt_disable();
- f = this_cpu_ptr(c->pcpu_freed);
+ f = this_cpu_ptr(bc->pcpu_freed);
+ if (f->nr)
+ ck = f->objs[--f->nr];
+ preempt_enable();
+
+ if (!ck) {
+ mutex_lock(&bc->lock);
+ preempt_disable();
+ f = this_cpu_ptr(bc->pcpu_freed);
+
+ while (!list_empty(&bc->freed_nonpcpu) &&
+ f->nr < ARRAY_SIZE(f->objs) / 2) {
+ ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
+ list_del_init(&ck->list);
+ f->objs[f->nr++] = ck;
+ }
- while (!list_empty(&c->freed) &&
- f->nr < ARRAY_SIZE(f->objs) / 2) {
- ck = list_last_entry(&c->freed, struct bkey_cached, list);
+ ck = f->nr ? f->objs[--f->nr] : NULL;
+ preempt_enable();
+ mutex_unlock(&bc->lock);
+ }
+ } else {
+ mutex_lock(&bc->lock);
+ if (!list_empty(&bc->freed_pcpu)) {
+ ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
- f->objs[f->nr++] = ck;
}
-
- ck = f->nr ? f->objs[--f->nr] : NULL;
- preempt_enable();
- mutex_unlock(&c->lock);
+ mutex_unlock(&bc->lock);
}
if (ck) {
- six_lock_intent(&ck->c.lock, NULL, NULL);
- six_lock_write(&ck->c.lock, NULL, NULL);
+ int ret;
+
+ ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent);
+ if (unlikely(ret)) {
+ bkey_cached_move_to_freelist(bc, ck);
+ return ERR_PTR(ret);
+ }
+
+ path->l[0].b = (void *) ck;
+ path->l[0].lock_seq = ck->c.lock.state.seq;
+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+
+ ret = bch2_btree_node_lock_write(trans, path, &ck->c);
+ if (unlikely(ret)) {
+ btree_node_unlock(trans, path, 0);
+ bkey_cached_move_to_freelist(bc, ck);
+ return ERR_PTR(ret);
+ }
+
return ck;
}
ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
if (likely(ck)) {
INIT_LIST_HEAD(&ck->list);
- six_lock_init(&ck->c.lock);
+ __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
+ if (pcpu_readers)
+ six_lock_pcpu_alloc(&ck->c.lock);
+
+ ck->c.cached = true;
BUG_ON(!six_trylock_intent(&ck->c.lock));
BUG_ON(!six_trylock_write(&ck->c.lock));
return ck;
@@ -215,36 +269,36 @@ bkey_cached_reuse(struct btree_key_cache *c)
}
static struct bkey_cached *
-btree_key_cache_create(struct bch_fs *c,
- enum btree_id btree_id,
- struct bpos pos)
+btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
+ struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck;
bool was_new = true;
- ck = bkey_cached_alloc(bc);
+ ck = bkey_cached_alloc(trans, path);
+ if (unlikely(IS_ERR(ck)))
+ return ck;
if (unlikely(!ck)) {
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
- bch2_btree_ids[btree_id]);
+ bch2_btree_ids[path->btree_id]);
return ERR_PTR(-ENOMEM);
}
+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
was_new = false;
} else {
- if (btree_id == BTREE_ID_subvolumes)
+ if (path->btree_id == BTREE_ID_subvolumes)
six_lock_pcpu_alloc(&ck->c.lock);
- else
- six_lock_pcpu_free(&ck->c.lock);
}
ck->c.level = 0;
- ck->c.btree_id = btree_id;
- ck->key.btree_id = btree_id;
- ck->key.pos = pos;
+ ck->c.btree_id = path->btree_id;
+ ck->key.btree_id = path->btree_id;
+ ck->key.pos = path->pos;
ck->valid = false;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
@@ -256,6 +310,7 @@ btree_key_cache_create(struct bch_fs *c,
if (likely(was_new)) {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
kfree(ck);
} else {
bkey_cached_free_fast(bc, ck);
@@ -291,7 +346,7 @@ static int btree_key_cache_fill(struct btree_trans *trans,
k = bch2_btree_path_peek_slot(path, &u);
if (!bch2_btree_node_relock(trans, ck_path, 0)) {
- trace_trans_restart_relock_key_cache_fill(trans, _THIS_IP_, ck_path);
+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
goto err;
}
@@ -320,11 +375,12 @@ static int btree_key_cache_fill(struct btree_trans *trans,
}
}
- /*
- * XXX: not allowed to be holding read locks when we take a write lock,
- * currently
- */
- bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b);
+ ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
+ if (ret) {
+ kfree(new_k);
+ goto err;
+ }
+
if (new_k) {
kfree(ck->k);
ck->u64s = new_u64s;
@@ -372,7 +428,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path
retry:
ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
if (!ck) {
- ck = btree_key_cache_create(c, path->btree_id, path->pos);
+ ck = btree_key_cache_create(trans, path);
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
@@ -414,7 +470,7 @@ fill:
*/
if (!path->locks_want &&
!__bch2_btree_path_upgrade(trans, path, 1)) {
- trace_transaction_restart_key_cache_upgrade(trans, _THIS_IP_);
+ trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
goto err;
}
@@ -518,21 +574,21 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
} else {
+ struct btree_path *path2;
evict:
- BUG_ON(!btree_node_intent_locked(c_iter.path, 0));
-
- mark_btree_node_unlocked(c_iter.path, 0);
- c_iter.path->l[0].b = NULL;
+ trans_for_each_path(trans, path2)
+ if (path2 != c_iter.path)
+ __bch2_btree_path_unlock(trans, path2);
- six_lock_write(&ck->c.lock, NULL, NULL);
+ bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
+ mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
bkey_cached_evict(&c->btree_key_cache, ck);
-
bkey_cached_free_fast(&c->btree_key_cache, ck);
}
out:
@@ -548,11 +604,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
struct bkey_cached *ck =
container_of(pin, struct bkey_cached, journal);
struct bkey_cached_key key;
+ struct btree_trans trans;
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
int ret = 0;
- int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ bch2_trans_init(&trans, c, 0, 0);
- six_lock_read(&ck->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read);
key = ck->key;
if (ck->journal.seq != seq ||
@@ -562,12 +620,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
}
six_unlock_read(&ck->c.lock);
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = commit_do(&trans, NULL, NULL, 0,
btree_key_cache_flush_pos(&trans, key, seq,
BTREE_INSERT_JOURNAL_RECLAIM, false));
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+ bch2_trans_exit(&trans);
return ret;
}
@@ -674,12 +733,29 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
- list_for_each_entry_safe(ck, t, &bc->freed, list) {
+ list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
break;
list_del(&ck->list);
+ six_lock_pcpu_free(&ck->c.lock);
+ kmem_cache_free(bch2_key_cache, ck);
+ atomic_long_dec(&bc->nr_freed);
+ scanned++;
+ freed++;
+ }
+
+ if (scanned >= nr)
+ goto out;
+
+ list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+ ck->btree_trans_barrier_seq))
+ break;
+
+ list_del(&ck->list);
+ six_lock_pcpu_free(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
scanned++;
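The shrinker walk above relies on the SRCU polling API: each freed entry records a cookie from start_poll_synchronize_srcu(), and since the lists are ordered oldest-first, the scan can stop at the first entry whose grace period has not yet elapsed. A toy model of that cookie protocol — a plain counter stands in for the SRCU grace-period state, so this is only a sketch of the ordering argument:

#include <stdio.h>

static unsigned long gp_seq;	/* stand-in for SRCU grace-period state */

/* models start_poll_synchronize_srcu(): returns a cookie for "now" */
static unsigned long start_poll(void) { return gp_seq + 1; }

/* models poll_state_synchronize_srcu(): has this cookie's GP completed? */
static int poll_done(unsigned long cookie) { return gp_seq >= cookie; }

int main(void)
{
	unsigned long cookies[4];

	for (int i = 0; i < 4; i++) {
		cookies[i] = start_poll();
		gp_seq++;		/* pretend a grace period elapses */
	}
	cookies[3] = start_poll();	/* re-freed just now: not yet safe */

	/* oldest-first scan, bail at the first not-yet-elapsed entry */
	for (int i = 0; i < 4; i++) {
		if (!poll_done(cookies[i])) {
			printf("entry %d: too new, stop scanning\n", i);
			break;
		}
		printf("entry %d: safe to kmem_cache_free()\n", i);
	}
	return 0;
}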
@@ -767,7 +843,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
bkey_cached_evict(bc, ck);
- list_add(&ck->list, &bc->freed);
+ list_add(&ck->list, &bc->freed_nonpcpu);
}
rcu_read_unlock();
@@ -777,11 +853,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
for (i = 0; i < f->nr; i++) {
ck = f->objs[i];
- list_add(&ck->list, &bc->freed);
+ list_add(&ck->list, &bc->freed_nonpcpu);
}
}
- list_for_each_entry_safe(ck, n, &bc->freed, list) {
+ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+ list_for_each_entry_safe(ck, n, &bc->freed_nonpcpu, list) {
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
@@ -789,6 +867,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
list_del(&ck->list);
kfree(ck->k);
+ six_lock_pcpu_free(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
}
@@ -808,7 +887,8 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
mutex_init(&c->lock);
- INIT_LIST_HEAD(&c->freed);
+ INIT_LIST_HEAD(&c->freed_pcpu);
+ INIT_LIST_HEAD(&c->freed_nonpcpu);
}
static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
new file mode 100644
index 00000000..1cdf7d4f
--- /dev/null
+++ b/libbcachefs/btree_locking.c
@@ -0,0 +1,466 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_types.h"
+
+struct lock_class_key bch2_btree_node_lock_key;
+
+/* Btree node locking: */
+
+static inline void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+ if (lock->readers)
+ this_cpu_add(*lock->readers, nr);
+ else if (nr > 0)
+ atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter);
+ else
+ atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter);
+}
+
+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
+ struct btree_path *skip,
+ struct btree_bkey_cached_common *b,
+ unsigned level)
+{
+ struct btree_path *path;
+ struct six_lock_count ret;
+
+ memset(&ret, 0, sizeof(ret));
+
+ if (IS_ERR_OR_NULL(b))
+ return ret;
+
+ trans_for_each_path(trans, path)
+ if (path != skip && &path->l[level].b->c == b) {
+ int t = btree_node_locked_type(path, level);
+
+ if (t != BTREE_NODE_UNLOCKED)
+ ret.n[t]++;
+ }
+
+ return ret;
+}
+
+/* unlock */
+
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+ struct btree_path *path, struct btree *b)
+{
+ bch2_btree_node_unlock_write_inlined(trans, path, b);
+}
+
+/* lock */
+
+void __bch2_btree_node_lock_write(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b)
+{
+ int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+
+ /*
+ * Must drop our read locks before calling six_lock_write() -
+ * six_unlock() won't do wakeups until the reader count
+ * goes to 0, and it's safe because we have the node intent
+ * locked:
+ */
+ six_lock_readers_add(&b->lock, -readers);
+ btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write);
+ six_lock_readers_add(&b->lock, readers);
+}
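__bch2_btree_node_lock_write() depends on a subtle six-lock property: a write lock waits for the reader count to reach zero, so a thread that itself holds read locks on the node must temporarily subtract them before blocking. A toy single-threaded model of that subtract/re-add dance — an int stands in for the lock's reader count, and the assert stands in for blocking:

#include <assert.h>
#include <stdio.h>

struct toy_lock {
	int readers;		/* models the read count in lock->state */
	int write_locked;
};

/* models six_lock_write(): only proceeds once no readers remain */
static void lock_write(struct toy_lock *l)
{
	assert(l->readers == 0);	/* a real lock would block here */
	l->write_locked = 1;
}

static void lock_write_with_own_readers(struct toy_lock *l, int own_readers)
{
	/*
	 * Drop our own read references before blocking on the write lock -
	 * safe because we also hold the node intent locked, so no one else
	 * can take new read locks out from under us:
	 */
	l->readers -= own_readers;
	lock_write(l);
	l->readers += own_readers;
}

int main(void)
{
	struct toy_lock l = { .readers = 2, .write_locked = 0 };

	lock_write_with_own_readers(&l, 2);
	printf("write_locked=%d readers=%d\n", l.write_locked, l.readers);
	return 0;
}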
+
+static inline bool path_has_read_locks(struct btree_path *path)
+{
+ unsigned l;
+
+ for (l = 0; l < BTREE_MAX_DEPTH; l++)
+ if (btree_node_read_locked(path, l))
+ return true;
+ return false;
+}
+
+/* Slowpath: */
+int __bch2_btree_node_lock(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ struct bpos pos, unsigned level,
+ enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
+{
+ struct btree_path *linked;
+ unsigned reason;
+
+ /* Check if it's safe to block: */
+ trans_for_each_path(trans, linked) {
+ if (!linked->nodes_locked)
+ continue;
+
+ /*
+ * Can't block taking an intent lock if we have _any_ nodes read
+ * locked:
+ *
+ * - Our read lock blocks another thread with an intent lock on
+ * the same node from getting a write lock, and thus from
+ * dropping its intent lock
+ *
+ * - And the other thread may have multiple nodes intent locked:
+ * both the node we want to intent lock, and the node we
+ * already have read locked - deadlock:
+ */
+ if (type == SIX_LOCK_intent &&
+ path_has_read_locks(linked)) {
+ reason = 1;
+ goto deadlock;
+ }
+
+ if (linked->btree_id != path->btree_id) {
+ if (linked->btree_id < path->btree_id)
+ continue;
+
+ reason = 3;
+ goto deadlock;
+ }
+
+ /*
+ * Within the same btree, non-cached paths come before cached
+ * paths:
+ */
+ if (linked->cached != path->cached) {
+ if (!linked->cached)
+ continue;
+
+ reason = 4;
+ goto deadlock;
+ }
+
+ /*
+ * Interior nodes must be locked before their descendants: if
+ * another path has possible descendants locked of the node
+ * we're about to lock, it must have the ancestors locked too:
+ */
+ if (level > btree_path_highest_level_locked(linked)) {
+ reason = 5;
+ goto deadlock;
+ }
+
+ /* Must lock btree nodes in key order: */
+ if (btree_node_locked(linked, level) &&
+ bpos_cmp(pos, btree_node_pos(&linked->l[level].b->c)) <= 0) {
+ reason = 7;
+ goto deadlock;
+ }
+ }
+
+ return btree_node_lock_type(trans, path, b, pos, level,
+ type, should_sleep_fn, p);
+deadlock:
+ trace_and_count(trans->c, trans_restart_would_deadlock, trans, ip, reason, linked, path, &pos);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+}
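The slowpath above encodes a global lock order for btree nodes: ascending btree id, non-cached paths before cached ones within a btree, ancestors before descendants, and ascending key order at the same level. A hedged sketch of that order as a standalone predicate — field names are simplified stand-ins for struct btree_path, it assumes the held path has one locked node, and it omits the read-lock-vs-intent restriction (reason 1):

#include <stdbool.h>
#include <stdio.h>

struct held {		/* simplified view of a locked btree_path */
	int btree_id;
	bool cached;
	int highest_level_locked;
	int level;	/* level of the node it has locked */
	long pos;	/* key-order stand-in for struct bpos */
};

/* may we block acquiring (btree_id, cached, level, pos) given what's held? */
static bool lock_order_ok(const struct held *h,
			  int btree_id, bool cached, int level, long pos)
{
	if (h->btree_id != btree_id)
		return h->btree_id < btree_id;		/* ascending btree id */
	if (h->cached != cached)
		return !h->cached;			/* non-cached first */
	if (level > h->highest_level_locked)
		return false;				/* ancestors first */
	if (h->level == level)
		return h->pos < pos;			/* ascending key order */
	return true;
}

int main(void)
{
	struct held h = { .btree_id = 1, .cached = false,
			  .highest_level_locked = 2, .level = 0, .pos = 100 };

	printf("%d\n", lock_order_ok(&h, 1, false, 0, 200));	/* 1: ok */
	printf("%d\n", lock_order_ok(&h, 1, false, 3, 200));	/* 0: would deadlock */
	return 0;
}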
+
+/* relock */
+
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+ struct btree_path *path,
+ bool upgrade)
+{
+ unsigned l = path->level;
+ int fail_idx = -1;
+
+ do {
+ if (!btree_path_node(path, l))
+ break;
+
+ if (!(upgrade
+ ? bch2_btree_node_upgrade(trans, path, l)
+ : bch2_btree_node_relock(trans, path, l)))
+ fail_idx = l;
+
+ l++;
+ } while (l < path->locks_want);
+
+ /*
+ * When we fail to get a lock, we have to ensure that any child nodes
+ * can't be relocked so bch2_btree_path_traverse has to walk back up to
+ * the node that we failed to relock:
+ */
+ if (fail_idx >= 0) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+ do {
+ path->l[fail_idx].b = upgrade
+ ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
+ : ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ --fail_idx;
+ } while (fail_idx >= 0);
+ }
+
+ if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+ path->uptodate = BTREE_ITER_UPTODATE;
+
+ bch2_trans_verify_locks(trans);
+
+ return path->uptodate < BTREE_ITER_NEED_RELOCK;
+}
+
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ struct btree *b = btree_path_node(path, level);
+ int want = __btree_lock_want(path, level);
+
+ if (race_fault())
+ goto fail;
+
+ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+ (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, want))) {
+ mark_btree_node_locked(trans, path, level, want);
+ return true;
+ }
+fail:
+ trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level);
+ return false;
+}
+
+/* upgrade */
+
+bool bch2_btree_node_upgrade(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+{
+ struct btree *b = path->l[level].b;
+
+ if (!is_btree_node(path, level))
+ return false;
+
+ switch (btree_lock_want(path, level)) {
+ case BTREE_NODE_UNLOCKED:
+ BUG_ON(btree_node_locked(path, level));
+ return true;
+ case BTREE_NODE_READ_LOCKED:
+ BUG_ON(btree_node_intent_locked(path, level));
+ return bch2_btree_node_relock(trans, path, level);
+ case BTREE_NODE_INTENT_LOCKED:
+ break;
+ case BTREE_NODE_WRITE_LOCKED:
+ BUG();
+ }
+
+ if (btree_node_intent_locked(path, level))
+ return true;
+
+ if (race_fault())
+ return false;
+
+ if (btree_node_locked(path, level)
+ ? six_lock_tryupgrade(&b->c.lock)
+ : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
+ goto success;
+
+ if (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) {
+ btree_node_unlock(trans, path, level);
+ goto success;
+ }
+
+ trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level);
+ return false;
+success:
+ mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent);
+ return true;
+}
+
+/* Btree path locking: */
+
+/*
+ * Only for btree_cache.c - only relocks intent locks
+ */
+int bch2_btree_path_relock_intent(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ unsigned l;
+
+ for (l = path->level;
+ l < path->locks_want && btree_path_node(path, l);
+ l++) {
+ if (!bch2_btree_node_relock(trans, path, l)) {
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
+ }
+ }
+
+ return 0;
+}
+
+__flatten
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ return btree_path_get_locks(trans, path, false);
+}
+
+__flatten
+bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
+{
+ return btree_path_get_locks(trans, path, true);
+}
+
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ EBUG_ON(path->locks_want >= new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ return btree_path_get_locks(trans, path, true);
+}
+
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ struct btree_path *linked;
+
+ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
+ return true;
+
+ /*
+ * XXX: this is ugly - we'd prefer to not be mucking with other
+ * iterators in the btree_trans here.
+ *
+ * On failure to upgrade the iterator, setting iter->locks_want and
+ * calling get_locks() is sufficient to make bch2_btree_path_traverse()
+ * get the locks we want on transaction restart.
+ *
+ * But if this iterator was a clone, on transaction restart what we did
+ * to this iterator isn't going to be preserved.
+ *
+ * Possibly we could add an iterator field for the parent iterator when
+ * an iterator is a copy - for now, we'll just upgrade any other
+ * iterators with the same btree id.
+ *
+ * The code below used to be needed to ensure ancestor nodes get locked
+ * before interior nodes - now that's handled by
+ * bch2_btree_path_traverse_all().
+ */
+ if (!path->cached && !trans->in_traverse_all)
+ trans_for_each_path(trans, linked)
+ if (linked != path &&
+ linked->cached == path->cached &&
+ linked->btree_id == path->btree_id &&
+ linked->locks_want < new_locks_want) {
+ linked->locks_want = new_locks_want;
+ btree_path_get_locks(trans, linked, true);
+ }
+
+ return false;
+}
+
+void __bch2_btree_path_downgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ unsigned l;
+
+ EBUG_ON(path->locks_want < new_locks_want);
+
+ path->locks_want = new_locks_want;
+
+ while (path->nodes_locked &&
+ (l = btree_path_highest_level_locked(path)) >= path->locks_want) {
+ if (l > path->level) {
+ btree_node_unlock(trans, path, l);
+ } else {
+ if (btree_node_intent_locked(path, l)) {
+ six_lock_downgrade(&path->l[l].b->c.lock);
+ mark_btree_node_locked_noreset(path, l, SIX_LOCK_read);
+ }
+ break;
+ }
+ }
+
+ bch2_btree_path_verify_locks(path);
+}
+
+/* Btree transaction locking: */
+
+void bch2_trans_downgrade(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_downgrade(trans, path);
+}
+
+int bch2_trans_relock(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ if (unlikely(trans->restarted))
+ return -((int) trans->restarted);
+
+ trans_for_each_path(trans, path)
+ if (path->should_be_locked &&
+ !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ }
+ return 0;
+}
+
+void bch2_trans_unlock(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ __bch2_btree_path_unlock(trans, path);
+
+ /*
+ * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking
+ * btree nodes, it implements its own walking:
+ */
+ BUG_ON(!trans->is_initial_gc &&
+ lock_class_is_held(&bch2_btree_node_lock_key));
+}
+
+/* Debug */
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void bch2_btree_path_verify_locks(struct btree_path *path)
+{
+ unsigned l;
+
+ if (!path->nodes_locked) {
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+ btree_path_node(path, path->level));
+ return;
+ }
+
+ for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+ int want = btree_lock_want(path, l);
+ int have = btree_node_locked_type(path, l);
+
+ BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
+
+ BUG_ON(is_btree_node(path, l) &&
+ (want == BTREE_NODE_UNLOCKED ||
+ have != BTREE_NODE_WRITE_LOCKED) &&
+ want != have);
+ }
+}
+
+void bch2_trans_verify_locks(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_verify_locks(path);
+}
+
+#endif
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 205c6b59..3bc490bc 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -14,68 +14,71 @@
#include "btree_iter.h"
+extern struct lock_class_key bch2_btree_node_lock_key;
+
static inline bool is_btree_node(struct btree_path *path, unsigned l)
{
return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b);
}
+static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
+{
+ return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
+ ? &trans->c->btree_transaction_stats[trans->fn_idx]
+ : NULL;
+}
+
/* matches six lock types */
enum btree_node_locked_type {
BTREE_NODE_UNLOCKED = -1,
BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
+ BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write,
};
static inline int btree_node_locked_type(struct btree_path *path,
unsigned level)
{
- /*
- * We're relying on the fact that if nodes_intent_locked is set
- * nodes_locked must be set as well, so that we can compute without
- * branches:
- */
- return BTREE_NODE_UNLOCKED +
- ((path->nodes_locked >> level) & 1) +
- ((path->nodes_intent_locked >> level) & 1);
+ return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
}
-static inline bool btree_node_intent_locked(struct btree_path *path,
- unsigned level)
+static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
{
- return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED;
+ return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
}
-static inline bool btree_node_read_locked(struct btree_path *path,
- unsigned level)
+static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l)
{
- return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED;
+ return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED;
}
-static inline bool btree_node_locked(struct btree_path *path, unsigned level)
+static inline bool btree_node_read_locked(struct btree_path *path, unsigned l)
{
- return path->nodes_locked & (1 << level);
+ return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED;
}
-static inline void mark_btree_node_unlocked(struct btree_path *path,
- unsigned level)
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
{
- path->nodes_locked &= ~(1 << level);
- path->nodes_intent_locked &= ~(1 << level);
+ return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED;
}
-static inline void mark_btree_node_locked_noreset(struct btree_trans *trans,
- struct btree_path *path,
- unsigned level,
- enum six_lock_type type)
+static inline void mark_btree_node_locked_noreset(struct btree_path *path,
+ unsigned level,
+ enum btree_node_locked_type type)
{
/* relying on this to avoid a branch */
BUILD_BUG_ON(SIX_LOCK_read != 0);
BUILD_BUG_ON(SIX_LOCK_intent != 1);
- BUG_ON(trans->in_traverse_all && path->sorted_idx > trans->traverse_all_idx);
+ path->nodes_locked &= ~(3U << (level << 1));
+ path->nodes_locked |= (type + 1) << (level << 1);
+}
- path->nodes_locked |= 1 << level;
- path->nodes_intent_locked |= type << level;
+static inline void mark_btree_node_unlocked(struct btree_path *path,
+ unsigned level)
+{
+ EBUG_ON(btree_node_write_locked(path, level));
+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
}
static inline void mark_btree_node_locked(struct btree_trans *trans,
@@ -83,19 +86,12 @@ static inline void mark_btree_node_locked(struct btree_trans *trans,
unsigned level,
enum six_lock_type type)
{
- mark_btree_node_locked_noreset(trans, path, level, type);
+ mark_btree_node_locked_noreset(path, level, type);
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
path->l[level].lock_taken_time = ktime_get_ns();
#endif
}
-static inline void mark_btree_node_intent_locked(struct btree_trans *trans,
- struct btree_path *path,
- unsigned level)
-{
- mark_btree_node_locked_noreset(trans, path, level, SIX_LOCK_intent);
-}
-
static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
{
return level < path->locks_want
@@ -115,13 +111,6 @@ btree_lock_want(struct btree_path *path, int level)
return BTREE_NODE_UNLOCKED;
}
-static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans)
-{
- return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats)
- ? &trans->c->btree_transaction_stats[trans->fn_idx]
- : NULL;
-}
-
static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
@@ -135,6 +124,8 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
#endif
}
+/* unlock: */
+
static inline void btree_node_unlock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
@@ -149,59 +140,91 @@ static inline void btree_node_unlock(struct btree_trans *trans,
mark_btree_node_unlocked(path, level);
}
+static inline int btree_path_lowest_level_locked(struct btree_path *path)
+{
+ return __ffs(path->nodes_locked) >> 1;
+}
+
+static inline int btree_path_highest_level_locked(struct btree_path *path)
+{
+ return __fls(path->nodes_locked) >> 1;
+}
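nodes_locked is now a single u8 holding two bits per level, replacing the old paired nodes_locked/nodes_intent_locked bitmasks; that makes room for the fourth state, BTREE_NODE_WRITE_LOCKED, and __ffs()/__fls() results need only a right shift by one to recover a level. A self-contained sketch of the encoding (toy names; __builtin_clz stands in for the kernel's __fls(), and a u8 covers the four levels of BTREE_MAX_DEPTH):

#include <stdio.h>

enum lock_type {	/* matches the six lock types, plus "unlocked" */
	NODE_UNLOCKED	= -1,
	NODE_READ	= 0,
	NODE_INTENT	= 1,
	NODE_WRITE	= 2,
};

/* two bits per level: 0 = unlocked, 1..3 = read/intent/write */
static unsigned char set_locked(unsigned char nodes_locked,
				unsigned level, enum lock_type t)
{
	nodes_locked &= ~(3U << (level << 1));
	nodes_locked |= (t + 1) << (level << 1);
	return nodes_locked;
}

static int locked_type(unsigned char nodes_locked, unsigned level)
{
	return NODE_UNLOCKED + ((nodes_locked >> (level << 1)) & 3);
}

static int highest_level_locked(unsigned char nodes_locked)
{
	/* __fls() in the kernel; caller must hold at least one lock */
	int bit = 31 - __builtin_clz(nodes_locked);
	return bit >> 1;
}

int main(void)
{
	unsigned char locked = 0;

	locked = set_locked(locked, 0, NODE_INTENT);
	locked = set_locked(locked, 2, NODE_READ);

	printf("l0=%d l1=%d l2=%d highest=%d\n",
	       locked_type(locked, 0), locked_type(locked, 1),
	       locked_type(locked, 2), highest_level_locked(locked));
	return 0;
}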
+
static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
struct btree_path *path)
{
btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
while (path->nodes_locked)
- btree_node_unlock(trans, path, __ffs(path->nodes_locked));
+ btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
}
-static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
+/*
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed:
+ */
+static inline void
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+ struct btree *b)
{
- switch (type) {
- case SIX_LOCK_read:
- return BCH_TIME_btree_lock_contended_read;
- case SIX_LOCK_intent:
- return BCH_TIME_btree_lock_contended_intent;
- case SIX_LOCK_write:
- return BCH_TIME_btree_lock_contended_write;
- default:
- BUG();
- }
+ struct btree_path *linked;
+
+ EBUG_ON(path->l[b->c.level].b != b);
+ EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+ EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
+
+ mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent);
+
+ trans_for_each_path_with_node(trans, b, linked)
+ linked->l[b->c.level].lock_seq += 2;
+
+ six_unlock_write(&b->c.lock);
+}
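bch2_btree_node_unlock_write_inlined() bumps every affected path's saved lock_seq by 2 because the six lock's sequence number is incremented on write lock and again on write unlock: if the saved and current values still match later, nobody else modified the node and six_relock_type() may succeed. A toy model of sequence-based relocking, under that assumption about the seq discipline:

#include <stdbool.h>
#include <stdio.h>

/* seq is even while unlocked and changes whenever anyone write-locked it */
struct toy_node { unsigned seq; };

static void write_lock(struct toy_node *n)   { n->seq++; }
static void write_unlock(struct toy_node *n) { n->seq++; }

/* models six_relock_type(): only valid if nobody wrote since we saved seq */
static bool relock(struct toy_node *n, unsigned saved_seq)
{
	return n->seq == saved_seq;
}

int main(void)
{
	struct toy_node n = { .seq = 0 };
	unsigned saved = n.seq;		/* path->l[level].lock_seq */

	/* we write-lock and unlock the node ourselves: bump saved by 2 */
	write_lock(&n);
	saved += 2;			/* what unlock_write_inlined() does */
	write_unlock(&n);
	printf("relock after own write: %d\n", relock(&n, saved));	/* 1 */

	/* some other thread modifies the node: relock must fail */
	write_lock(&n);
	write_unlock(&n);
	printf("relock after foreign write: %d\n", relock(&n, saved));	/* 0 */
	return 0;
}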
+
+void bch2_btree_node_unlock_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
+
+/* lock: */
+
+static inline int __must_check
+btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type)
+{
+ six_lock_type(&b->lock, type, NULL, NULL);
+ return 0;
+}
+
+static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type)
+{
+ int ret = btree_node_lock_nopath(trans, b, type);
+
+ BUG_ON(ret);
}
static inline int btree_node_lock_type(struct btree_trans *trans,
struct btree_path *path,
- struct btree *b,
+ struct btree_bkey_cached_common *b,
struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
- struct bch_fs *c = trans->c;
- u64 start_time;
int ret;
- if (six_trylock_type(&b->c.lock, type))
+ if (six_trylock_type(&b->lock, type))
return 0;
- start_time = local_clock();
-
trans->locking_path_idx = path->idx;
trans->locking_pos = pos;
trans->locking_btree_id = path->btree_id;
trans->locking_level = level;
trans->locking_lock_type = type;
- trans->locking = &b->c;
- ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p);
+ trans->locking = b;
+ ret = six_lock_type(&b->lock, type, should_sleep_fn, p);
trans->locking = NULL;
-
- if (ret)
- return ret;
-
- bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
- return 0;
+ return ret;
}
/*
@@ -209,15 +232,16 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
* iterators:
*/
static inline bool btree_node_lock_increment(struct btree_trans *trans,
- struct btree *b, unsigned level,
+ struct btree_bkey_cached_common *b,
+ unsigned level,
enum btree_node_locked_type want)
{
struct btree_path *path;
trans_for_each_path(trans, path)
- if (path->l[level].b == b &&
+ if (&path->l[level].b->c == b &&
btree_node_locked_type(path, level) >= want) {
- six_lock_increment(&b->c.lock, want);
+ six_lock_increment(&b->lock, want);
return true;
}
@@ -225,14 +249,16 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
}
int __bch2_btree_node_lock(struct btree_trans *, struct btree_path *,
- struct btree *, struct bpos, unsigned,
+ struct btree_bkey_cached_common *,
+ struct bpos, unsigned,
enum six_lock_type,
six_lock_should_sleep_fn, void *,
unsigned long);
static inline int btree_node_lock(struct btree_trans *trans,
struct btree_path *path,
- struct btree *b, struct bpos pos, unsigned level,
+ struct btree_bkey_cached_common *b,
+ struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
@@ -242,67 +268,97 @@ static inline int btree_node_lock(struct btree_trans *trans,
EBUG_ON(level >= BTREE_MAX_DEPTH);
EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
- if (likely(six_trylock_type(&b->c.lock, type)) ||
+ if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, type) ||
!(ret = __bch2_btree_node_lock(trans, path, b, pos, level, type,
should_sleep_fn, p, ip))) {
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- path->l[b->c.level].lock_taken_time = ktime_get_ns();
+ path->l[b->level].lock_taken_time = ktime_get_ns();
#endif
}
return ret;
}
+void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *);
+
+static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ EBUG_ON(&path->l[b->level].b->c != b);
+ EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
+ EBUG_ON(!btree_node_intent_locked(path, b->level));
+
+ /*
+ * six locks are unfair, and read locks block while a thread wants a
+ * write lock: thus, we need to tell the cycle detector we have a write
+ * lock _before_ taking the lock:
+ */
+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
+
+ if (unlikely(!six_trylock_write(&b->lock)))
+ __bch2_btree_node_lock_write(trans, b);
+}
+
+static inline int __must_check
+bch2_btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ bch2_btree_node_lock_write_nofail(trans, path, b);
+ return 0;
+}
+
+/* relock: */
+
+bool bch2_btree_path_relock_norestart(struct btree_trans *,
+ struct btree_path *, unsigned long);
bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned);
static inline bool bch2_btree_node_relock(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
EBUG_ON(btree_node_locked(path, level) &&
- btree_node_locked_type(path, level) !=
- __btree_lock_want(path, level));
+ !btree_node_write_locked(path, level) &&
+ btree_node_locked_type(path, level) != __btree_lock_want(path, level));
return likely(btree_node_locked(path, level)) ||
- __bch2_btree_node_relock(trans, path, level);
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+ __bch2_btree_node_relock(trans, path, level));
}
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
-static inline void
-bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
- struct btree *b)
+static inline int bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
{
- struct btree_path *linked;
-
- EBUG_ON(path->l[b->c.level].b != b);
- EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
-
- trans_for_each_path_with_node(trans, b, linked)
- linked->l[b->c.level].lock_seq += 2;
+ if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+ trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
+ }
- six_unlock_write(&b->c.lock);
+ return 0;
}
-void bch2_btree_node_unlock_write(struct btree_trans *,
- struct btree_path *, struct btree *);
+/* upgrade */
-void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *);
+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
+ struct btree_path *, unsigned);
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
-static inline void bch2_btree_node_lock_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
+static inline bool bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
{
- EBUG_ON(path->l[b->c.level].b != b);
- EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq);
- EBUG_ON(!btree_node_intent_locked(path, b->c.level));
+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
- if (unlikely(!six_trylock_write(&b->c.lock)))
- __bch2_btree_node_lock_write(trans, b);
+ return path->locks_want < new_locks_want
+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+ : path->uptodate == BTREE_ITER_UPTODATE;
}
+/* misc: */
+
static inline void btree_path_set_should_be_locked(struct btree_path *path)
{
EBUG_ON(!btree_node_locked(path, path->level));
@@ -326,7 +382,20 @@ static inline void btree_path_set_level_up(struct btree_trans *trans,
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
}
+/* debug */
+
struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
- struct btree_path *, struct btree *, unsigned);
+ struct btree_path *,
+ struct btree_bkey_cached_common *b,
+ unsigned);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_path_verify_locks(struct btree_path *);
+void bch2_trans_verify_locks(struct btree_trans *);
+#else
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+#endif
#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 21d76181..7c016637 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -63,6 +63,7 @@ struct btree_bkey_cached_common {
struct six_lock lock;
u8 level;
u8 btree_id;
+ bool cached;
};
struct btree {
@@ -232,9 +233,8 @@ struct btree_path {
*/
bool should_be_locked:1;
unsigned level:3,
- locks_want:4,
- nodes_locked:4,
- nodes_intent_locked:4;
+ locks_want:4;
+ u8 nodes_locked;
struct btree_path_level {
struct btree *b;
@@ -302,7 +302,8 @@ struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
bool table_init_done;
- struct list_head freed;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
struct shrinker shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;
@@ -338,6 +339,13 @@ struct bkey_cached {
struct bkey_i *k;
};
+static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
+{
+ return !b->cached
+ ? container_of(b, struct btree, c)->key.k.p
+ : container_of(b, struct bkey_cached, c)->key.pos;
+}
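With the new cached flag in struct btree_bkey_cached_common, btree_node_pos() no longer needs a caller-supplied cached argument: it dispatches via container_of() to whichever outer struct embeds the common header. A minimal standalone illustration of the pattern — toy structs, not the bcachefs ones:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct common { bool cached; };		/* shared header, like b->c */

struct node  { struct common c; long pos; };	/* like struct btree */
struct entry { struct common c; long pos; };	/* like struct bkey_cached */

static long node_pos(struct common *c)
{
	return !c->cached
		? container_of(c, struct node, c)->pos
		: container_of(c, struct entry, c)->pos;
}

int main(void)
{
	struct node  n = { .c = { .cached = false }, .pos = 10 };
	struct entry e = { .c = { .cached = true  }, .pos = 20 };

	printf("%ld %ld\n", node_pos(&n.c), node_pos(&e.c));	/* 10 20 */
	return 0;
}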
+
struct btree_insert_entry {
unsigned flags;
u8 bkey_type;
@@ -413,6 +421,7 @@ struct btree_trans {
u64 paths_allocated;
unsigned mem_top;
+ unsigned mem_max;
unsigned mem_bytes;
void *mem;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 0409737f..d31c6eeb 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -143,7 +143,7 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
static void __btree_node_free(struct bch_fs *c, struct btree *b)
{
- trace_btree_node_free(c, b);
+ trace_and_count(c, btree_node_free, c, b);
BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_need_write(b));
@@ -160,22 +160,23 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
}
static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+ struct btree_path *path,
struct btree *b)
{
struct bch_fs *c = trans->c;
- struct btree_path *path;
-
- trans_for_each_path(trans, path)
- BUG_ON(path->l[b->c.level].b == b &&
- path->l[b->c.level].lock_seq == b->c.lock.state.seq);
-
- six_lock_write(&b->c.lock, NULL, NULL);
+ unsigned level = b->c.level;
+ bch2_btree_node_lock_write_nofail(trans, path, &b->c);
bch2_btree_node_hash_remove(&c->btree_cache, b);
__btree_node_free(c, b);
-
six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
+ mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent);
+
+ trans_for_each_path(trans, path)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
}
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
@@ -258,7 +259,9 @@ mem_alloc:
return b;
}
-static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
+static struct btree *bch2_btree_node_alloc(struct btree_update *as,
+ struct btree_trans *trans,
+ unsigned level)
{
struct bch_fs *c = as->c;
struct btree *b;
@@ -270,8 +273,8 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
b = p->b[--p->nr];
- six_lock_intent(&b->c.lock, NULL, NULL);
- six_lock_write(&b->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
set_btree_node_accessed(b);
set_btree_node_dirty_acct(c, b);
@@ -304,7 +307,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
BUG_ON(ret);
- trace_btree_node_alloc(c, b);
+ trace_and_count(c, btree_node_alloc, c, b);
return b;
}
@@ -322,12 +325,13 @@ static void btree_set_max(struct btree *b, struct bpos pos)
}
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
+ struct btree_trans *trans,
struct btree *b,
struct bkey_format format)
{
struct btree *n;
- n = bch2_btree_node_alloc(as, b->c.level);
+ n = bch2_btree_node_alloc(as, trans, b->c.level);
SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
@@ -346,6 +350,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
}
static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
+ struct btree_trans *trans,
struct btree *b)
{
struct bkey_format new_f = bch2_btree_calc_format(b);
@@ -357,12 +362,13 @@ static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
if (!bch2_btree_node_format_fits(as->c, b, &new_f))
new_f = b->format;
- return __bch2_btree_node_alloc_replacement(as, b, new_f);
+ return __bch2_btree_node_alloc_replacement(as, trans, b, new_f);
}
-static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
+static struct btree *__btree_root_alloc(struct btree_update *as,
+ struct btree_trans *trans, unsigned level)
{
- struct btree *b = bch2_btree_node_alloc(as, level);
+ struct btree *b = bch2_btree_node_alloc(as, trans, level);
btree_set_min(b, POS_MIN);
btree_set_max(b, SPOS_MAX);
@@ -377,7 +383,7 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
return b;
}
-static void bch2_btree_reserve_put(struct btree_update *as)
+static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
struct prealloc_nodes *p;
@@ -404,8 +410,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
mutex_unlock(&c->btree_reserve_cache_lock);
- six_lock_intent(&b->c.lock, NULL, NULL);
- six_lock_write(&b->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
__btree_node_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -459,7 +465,7 @@ err:
/* Asynchronous interior node update machinery */
-static void bch2_btree_update_free(struct btree_update *as)
+static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
@@ -472,7 +478,7 @@ static void bch2_btree_update_free(struct btree_update *as)
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res);
- bch2_btree_reserve_put(as);
+ bch2_btree_reserve_put(as, trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
as->start_time);
@@ -550,12 +556,13 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
static void btree_update_nodes_written(struct btree_update *as)
{
struct bch_fs *c = as->c;
- struct btree *b = as->b;
+ struct btree *b;
struct btree_trans trans;
u64 journal_seq = 0;
unsigned i;
int ret;
+ bch2_trans_init(&trans, c, 0, 512);
/*
* If we're already in an error state, it might be because a btree node
* was never written, and we might be trying to free that same btree
@@ -572,15 +579,16 @@ static void btree_update_nodes_written(struct btree_update *as)
* on disk:
*/
for (i = 0; i < as->nr_old_nodes; i++) {
- struct btree *old = as->old_nodes[i];
__le64 seq;
- six_lock_read(&old->c.lock, NULL, NULL);
- seq = old->data ? old->data->keys.seq : 0;
- six_unlock_read(&old->c.lock);
+ b = as->old_nodes[i];
+
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ seq = b->data ? b->data->keys.seq : 0;
+ six_unlock_read(&b->c.lock);
if (seq == as->old_nodes_seq[i])
- wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner,
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
@@ -597,19 +605,19 @@ static void btree_update_nodes_written(struct btree_update *as)
* journal reclaim does btree updates when flushing bkey_cached entries,
* which may require allocations as well.
*/
- bch2_trans_init(&trans, c, 0, 512);
ret = commit_do(&trans, &as->disk_res, &journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_JOURNAL_RECLAIM|
- JOURNAL_WATERMARK_reserved,
- btree_update_nodes_written_trans(&trans, as));
- bch2_trans_exit(&trans);
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_JOURNAL_RECLAIM|
+ JOURNAL_WATERMARK_reserved,
+ btree_update_nodes_written_trans(&trans, as));
+ bch2_trans_unlock(&trans);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
"error %i in btree_update_nodes_written()", ret);
err:
- if (b) {
+ if (as->b) {
+ b = as->b;
/*
* @b is the node we did the final insert into:
*
@@ -622,8 +630,8 @@ err:
* we're in journal error state:
*/
- six_lock_intent(&b->c.lock, NULL, NULL);
- six_lock_write(&b->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_write);
mutex_lock(&c->btree_interior_update_lock);
list_del(&as->write_blocked_list);
@@ -680,7 +688,7 @@ err:
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
- six_lock_read(&b->c.lock, NULL, NULL);
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
}
@@ -688,7 +696,8 @@ err:
for (i = 0; i < as->nr_open_buckets; i++)
bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
- bch2_btree_update_free(as);
+ bch2_btree_update_free(as, &trans);
+ bch2_trans_exit(&trans);
}
static void btree_interior_update_work(struct work_struct *work)
@@ -935,7 +944,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
as->nr_old_nodes++;
}
-static void bch2_btree_update_done(struct btree_update *as)
+static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
{
struct bch_fs *c = as->c;
u64 start_time = as->start_time;
@@ -946,7 +955,7 @@ static void bch2_btree_update_done(struct btree_update *as)
up_read(&as->c->gc_lock);
as->took_gc_lock = false;
- bch2_btree_reserve_put(as);
+ bch2_btree_reserve_put(as, trans);
continue_at(&as->cl, btree_update_set_nodes_written,
as->c->btree_interior_update_worker);
@@ -994,7 +1003,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
nr_nodes[1] += 1;
if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
- trace_trans_restart_iter_upgrade(trans, _RET_IP_, path);
+ trace_and_count(c, trans_restart_iter_upgrade, trans, _RET_IP_, path);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
return ERR_PTR(ret);
}
@@ -1048,11 +1057,16 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (ret) {
bch2_trans_unlock(trans);
+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ goto err;
+ }
+
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags);
if (ret) {
- trace_trans_restart_journal_preres_get(trans, _RET_IP_);
+ trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
goto err;
}
@@ -1085,8 +1099,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
}
if (ret) {
- trace_btree_reserve_get_fail(trans->fn, _RET_IP_,
- nr_nodes[0] + nr_nodes[1]);
+ trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]);
goto err;
}
@@ -1097,7 +1110,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
bch2_trans_verify_not_restarted(trans, restart_count);
return as;
err:
- bch2_btree_update_free(as);
+ bch2_btree_update_free(as, trans);
return ERR_PTR(ret);
}
@@ -1141,7 +1154,7 @@ static void bch2_btree_set_root(struct btree_update *as,
struct bch_fs *c = as->c;
struct btree *old;
- trace_btree_set_root(c, b);
+ trace_and_count(c, btree_node_set_root, c, b);
BUG_ON(!b->written);
old = btree_node_root(c, b);
@@ -1150,7 +1163,7 @@ static void bch2_btree_set_root(struct btree_update *as,
* Ensure no one is using the old root while we switch to the
* new root:
*/
- bch2_btree_node_lock_write(trans, path, old);
+ bch2_btree_node_lock_write_nofail(trans, path, &old->c);
bch2_btree_set_root_inmem(c, b);
@@ -1249,6 +1262,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
* node)
*/
static struct btree *__btree_split_node(struct btree_update *as,
+ struct btree_trans *trans,
struct btree *n1)
{
struct bkey_format_state s;
@@ -1258,7 +1272,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
struct bpos n1_pos;
- n2 = bch2_btree_node_alloc(as, n1->c.level);
+ n2 = bch2_btree_node_alloc(as, trans, n1->c.level);
n2->data->max_key = n1->data->max_key;
n2->data->format = n1->format;
@@ -1422,15 +1436,15 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_interior_update_will_free_node(as, b);
- n1 = bch2_btree_node_alloc_replacement(as, b);
+ n1 = bch2_btree_node_alloc_replacement(as, trans, b);
if (keys)
btree_split_insert_keys(as, trans, path, n1, keys);
if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
- trace_btree_split(c, b);
+ trace_and_count(c, btree_node_split, c, b);
- n2 = __btree_split_node(as, n1);
+ n2 = __btree_split_node(as, trans, n1);
bch2_btree_build_aux_trees(n2);
bch2_btree_build_aux_trees(n1);
@@ -1452,7 +1466,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
if (!parent) {
/* Depth increases, make a new root */
- n3 = __btree_root_alloc(as, b->c.level + 1);
+ n3 = __btree_root_alloc(as, trans, b->c.level + 1);
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
@@ -1462,7 +1476,7 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
}
} else {
- trace_btree_compact(c, b);
+ trace_and_count(c, btree_node_compact, c, b);
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n1->c.lock);
@@ -1493,22 +1507,19 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
if (n3)
bch2_btree_update_get_open_buckets(as, n3);
- /* Successful split, update the path to point to the new nodes: */
-
- six_lock_increment(&b->c.lock, SIX_LOCK_intent);
- if (n3)
- bch2_trans_node_add(trans, n3);
- if (n2)
- bch2_trans_node_add(trans, n2);
- bch2_trans_node_add(trans, n1);
-
/*
* The old node must be freed (in memory) _before_ unlocking the new
* nodes - else another thread could re-acquire a read lock on the old
* node after another thread has locked and updated the new node, thus
* seeing stale data:
*/
- bch2_btree_node_free_inmem(trans, b);
+ bch2_btree_node_free_inmem(trans, path, b);
+
+ if (n3)
+ bch2_trans_node_add(trans, n3);
+ if (n2)
+ bch2_trans_node_add(trans, n2);
+ bch2_trans_node_add(trans, n1);
if (n3)
six_unlock_intent(&n3->c.lock);
@@ -1617,7 +1628,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
return PTR_ERR(as);
btree_split(as, trans, path, b, NULL, flags);
- bch2_btree_update_done(as);
+ bch2_btree_update_done(as, trans);
for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
ret = bch2_foreground_maybe_merge(trans, path, l, flags);
@@ -1731,12 +1742,12 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (ret)
goto err;
- trace_btree_merge(c, b);
+ trace_and_count(c, btree_node_merge, c, b);
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_interior_update_will_free_node(as, m);
- n = bch2_btree_node_alloc(as, b->c.level);
+ n = bch2_btree_node_alloc(as, trans, b->c.level);
SET_BTREE_NODE_SEQ(n->data,
max(BTREE_NODE_SEQ(b->data),
@@ -1771,19 +1782,16 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_get_open_buckets(as, n);
- six_lock_increment(&b->c.lock, SIX_LOCK_intent);
- six_lock_increment(&m->c.lock, SIX_LOCK_intent);
+ bch2_btree_node_free_inmem(trans, path, b);
+ bch2_btree_node_free_inmem(trans, sib_path, m);
bch2_trans_node_add(trans, n);
bch2_trans_verify_paths(trans);
- bch2_btree_node_free_inmem(trans, b);
- bch2_btree_node_free_inmem(trans, m);
-
six_unlock_intent(&n->c.lock);
- bch2_btree_update_done(as);
+ bch2_btree_update_done(as, trans);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
@@ -1817,13 +1825,13 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
bch2_btree_interior_update_will_free_node(as, b);
- n = bch2_btree_node_alloc_replacement(as, b);
+ n = bch2_btree_node_alloc_replacement(as, trans, b);
bch2_btree_update_add_new_node(as, n);
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->c.lock);
- trace_btree_rewrite(c, b);
+ trace_and_count(c, btree_node_rewrite, c, b);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
@@ -1837,12 +1845,12 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
bch2_btree_update_get_open_buckets(as, n);
- six_lock_increment(&b->c.lock, SIX_LOCK_intent);
+ bch2_btree_node_free_inmem(trans, iter->path, b);
+
bch2_trans_node_add(trans, n);
- bch2_btree_node_free_inmem(trans, b);
six_unlock_intent(&n->c.lock);
- bch2_btree_update_done(as);
+ bch2_btree_update_done(as, trans);
out:
bch2_btree_path_downgrade(trans, iter->path);
return ret;
@@ -1989,7 +1997,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
if (ret)
goto err;
- bch2_btree_node_lock_write(trans, iter->path, b);
+ bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
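
The recurring shape in btree_split(), __bch2_foreground_maybe_merge() and bch2_btree_node_rewrite() above: the six_lock_increment() calls are gone, bch2_btree_node_free_inmem() grows a path argument (so the path's lock accounting is presumably handled inside it now), and the old node is freed in memory before the replacements are added to the transaction, per the stale-data comment in btree_split(). Condensed, using only names from the hunks above:

	/* new node(s) written, open buckets released: */
	bch2_btree_node_free_inmem(trans, path, b);	/* old node first */

	if (n3)
		bch2_trans_node_add(trans, n3);
	if (n2)
		bch2_trans_node_add(trans, n2);
	bch2_trans_node_add(trans, n1);		/* paths now see the new nodes */

	six_unlock_intent(&n1->c.lock);		/* new-node locks dropped last */
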
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index adfc6c24..7af810df 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -117,6 +117,7 @@ struct btree_update {
};
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
+ struct btree_trans *,
struct btree *,
struct bkey_format);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index ca988392..e9518fbc 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -81,7 +81,7 @@ void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
struct btree_path *path,
struct btree *b)
{
- bch2_btree_node_lock_write(trans, path, b);
+ bch2_btree_node_lock_write_nofail(trans, path, &b->c);
bch2_btree_node_prep_for_write(trans, path, b);
}
@@ -169,10 +169,13 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
+ struct btree_trans trans;
unsigned long old, new, v;
unsigned idx = w - b->writes;
- six_lock_read(&b->c.lock, NULL, NULL);
+ bch2_trans_init(&trans, c, 0, 0);
+
+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
v = READ_ONCE(b->flags);
do {
@@ -188,6 +191,8 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
+
+ bch2_trans_exit(&trans);
return 0;
}
@@ -285,7 +290,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s,
ret = bch2_trans_relock(trans);
if (ret) {
- trace_trans_restart_journal_preres_get(trans, trace_ip);
+ trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0);
return ret;
}
@@ -375,7 +380,7 @@ btree_key_can_insert_cached(struct btree_trans *trans,
* Keys returned by peek() are no longer valid pointers, so we need a
* transaction restart:
*/
- trace_trans_restart_key_cache_key_realloced(trans, _RET_IP_, path, old_u64s, new_u64s);
+ trace_and_count(c, trans_restart_key_cache_key_realloced, trans, _RET_IP_, path, old_u64s, new_u64s);
return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced);
}
@@ -567,7 +572,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
int ret;
if (race_fault()) {
- trace_trans_restart_fault_inject(trans, trace_ip);
+ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
}
@@ -741,11 +746,12 @@ static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_
static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path)
{
struct btree *b = path_l(path)->b;
+ unsigned l;
do {
- if (path->nodes_locked &&
- path->nodes_locked != path->nodes_intent_locked)
- path_upgrade_readers(trans, path);
+ for (l = 0; l < BTREE_MAX_DEPTH; l++)
+ if (btree_node_read_locked(path, l))
+ path_upgrade_readers(trans, path);
} while ((path = prev_btree_path(trans, path)) &&
path_l(path)->b == b);
}
@@ -764,11 +770,13 @@ static inline void normalize_read_intent_locks(struct btree_trans *trans)
? trans->paths + trans->sorted[i + 1]
: NULL;
- if (path->nodes_locked) {
- if (path->nodes_intent_locked)
- nr_intent++;
- else
- nr_read++;
+ switch (btree_node_locked_type(path, path->level)) {
+ case BTREE_NODE_READ_LOCKED:
+ nr_read++;
+ break;
+ case BTREE_NODE_INTENT_LOCKED:
+ nr_intent++;
+ break;
}
if (!next || path_l(path)->b != path_l(next)->b) {
@@ -791,7 +799,7 @@ static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct
//if (path == pos)
// break;
- if (path->nodes_locked != path->nodes_intent_locked &&
+ if (btree_node_read_locked(path, path->level) &&
!bch2_btree_path_upgrade(trans, path, path->level + 1))
return true;
}
@@ -808,12 +816,19 @@ static inline int trans_lock_write(struct btree_trans *trans)
if (same_leaf_as_prev(trans, i))
continue;
+ /*
+ * six locks are unfair, and new read locks are blocked while a
+ * thread is waiting on a write lock: thus, we need to tell the
+ * cycle detector we have a write lock _before_ taking the lock:
+ */
+ mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_write);
+
if (!six_trylock_write(&insert_l(i)->b->c.lock)) {
if (have_conflicting_read_lock(trans, i->path))
goto fail;
ret = btree_node_lock_type(trans, i->path,
- insert_l(i)->b,
+ &insert_l(i)->b->c,
i->path->pos, i->level,
SIX_LOCK_write, NULL, NULL);
BUG_ON(ret);
@@ -824,6 +839,8 @@ static inline int trans_lock_write(struct btree_trans *trans)
return 0;
fail:
+ mark_btree_node_locked_noreset(i->path, i->level, SIX_LOCK_intent);
+
while (--i >= trans->updates) {
if (same_leaf_as_prev(trans, i))
continue;
@@ -831,7 +848,7 @@ fail:
bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
}
- trace_trans_restart_would_deadlock_write(trans);
+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}
@@ -964,7 +981,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
case BTREE_INSERT_BTREE_NODE_FULL:
ret = bch2_btree_split_leaf(trans, i->path, trans->flags);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- trace_trans_restart_btree_node_split(trans, trace_ip, i->path);
+ trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
break;
case BTREE_INSERT_NEED_MARK_REPLICAS:
bch2_trans_unlock(trans);
@@ -975,7 +992,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
ret = bch2_trans_relock(trans);
if (ret)
- trace_trans_restart_mark_replicas(trans, trace_ip);
+ trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip);
break;
case BTREE_INSERT_NEED_JOURNAL_RES:
bch2_trans_unlock(trans);
@@ -992,12 +1009,12 @@ int bch2_trans_commit_error(struct btree_trans *trans,
ret = bch2_trans_relock(trans);
if (ret)
- trace_trans_restart_journal_res_get(trans, trace_ip);
+ trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip);
break;
case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
bch2_trans_unlock(trans);
- trace_trans_blocked_journal_reclaim(trans, trace_ip);
+ trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
wait_event_freezable(c->journal.reclaim_wait,
(ret = journal_reclaim_wait_done(c)));
@@ -1006,7 +1023,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
ret = bch2_trans_relock(trans);
if (ret)
- trace_trans_restart_journal_reclaim(trans, trace_ip);
+ trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip);
break;
default:
BUG_ON(ret >= 0);
@@ -1107,7 +1124,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
BUG_ON(!i->path->should_be_locked);
if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) {
- trace_trans_restart_upgrade(trans, _RET_IP_, i->path);
+ trace_and_count(c, trans_restart_upgrade, trans, _RET_IP_, i->path);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
goto out;
}
@@ -1148,7 +1165,7 @@ retry:
if (ret)
goto err;
- trace_transaction_commit(trans, _RET_IP_);
+ trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
@@ -1617,7 +1634,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
ck = (void *) iter->key_cache_path->l[0].b;
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- trace_trans_restart_key_cache_raced(trans, _RET_IP_);
+ trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
}
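
Most of the churn in this file (and in the ones that follow) is the trace_foo() -> trace_and_count(c, foo, ...) conversion: one macro both bumps a persistent per-cpu counter and fires the tracepoint. The definition lives in bcachefs.h; it is likely along these lines, assuming a BCH_COUNTER_* enum generated from BCH_PERSISTENT_COUNTERS():

	#define trace_and_count(_c, _name, ...)				\
	do {								\
		this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]);	\
		trace_##_name(__VA_ARGS__);				\
	} while (0)
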
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 3b442b01..cb25efb6 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -231,9 +231,12 @@ static int bch2_data_update_index_update(struct bch_write_op *op)
m->data_opts.btree_insert_flags);
if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos);
- atomic_long_inc(&c->extent_migrate_done);
+
if (ec_ob)
bch2_ob_add_backpointer(c, ec_ob, &insert->k);
+
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
+ trace_move_extent_finish(&new->k);
}
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -248,22 +251,16 @@ next:
}
continue;
nomatch:
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, old);
- bch_info(c, "no match for %s", buf.buf);
- printbuf_exit(&buf);
- }
-
if (m->ctxt) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->ctxt->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->ctxt->stats->sectors_raced);
}
- atomic_long_inc(&c->extent_migrate_raced);
- trace_move_race(&new->k);
+
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_race], new->k.size);
+ trace_move_extent_race(&new->k);
+
bch2_btree_iter_advance(&iter);
goto next;
}
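
Two things change here at once: the mechanism (atomic_long_inc() on dedicated bch_fs fields becomes this_cpu_add() into the persistent counters array) and the units (extent_migrate_done/raced counted events, while move_extent_finish/race accumulate new->k.size, i.e. sectors). Assuming c->counters is a per-cpu array of u64 indexed by BCH_COUNTER_*, a reader sums across CPUs; a hypothetical helper:

	static u64 counter_read(struct bch_fs *c, unsigned id)	/* hypothetical */
	{
		u64 sum = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			sum += per_cpu_ptr(c->counters, cpu)[id];
		return sum;
	}
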
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index f35e714e..fb518d59 100644
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -666,6 +666,9 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
mutex_lock(&s->lock);
+ prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
+ prt_newline(&i->buf);
+
if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
prt_printf(&i->buf, "Lock hold times:");
prt_newline(&i->buf);
diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c
index 7bd44136..22b6b841 100644
--- a/libbcachefs/disk_groups.c
+++ b/libbcachefs/disk_groups.c
@@ -384,32 +384,34 @@ inval:
prt_printf(out, "invalid label %u", v);
}
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
struct bch_member *mi;
- int v = -1;
- int ret = 0;
-
- mutex_lock(&c->sb_lock);
+ int ret, v = -1;
if (!strlen(name) || !strcmp(name, "none"))
- goto write_sb;
+ return 0;
v = bch2_disk_path_find_or_create(&c->disk_sb, name);
- if (v < 0) {
- mutex_unlock(&c->sb_lock);
+ if (v < 0)
return v;
- }
ret = bch2_sb_disk_groups_to_cpu(c);
if (ret)
- goto unlock;
-write_sb:
+ return ret;
+
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
SET_BCH_MEMBER_GROUP(mi, v + 1);
+ return 0;
+}
- bch2_write_super(c);
-unlock:
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+ int ret;
+
+ mutex_lock(&c->sb_lock);
+ ret = __bch2_dev_group_set(c, ca, name) ?:
+ bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return ret;
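
A standard lock-split: __bch2_dev_group_set() assumes c->sb_lock is held and does only the work, while bch2_dev_group_set() keeps the old interface, chaining the helper and the superblock write with GNU ?: so the first nonzero error wins. That split is what lets bch2_dev_add() (later in this patch) set a label while already holding sb_lock. The ?: chain is shorthand for:

	mutex_lock(&c->sb_lock);
	ret = __bch2_dev_group_set(c, ca, name);
	if (!ret)
		ret = bch2_write_super(c);
	mutex_unlock(&c->sb_lock);
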
diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h
index de915480..e4470c35 100644
--- a/libbcachefs/disk_groups.h
+++ b/libbcachefs/disk_groups.h
@@ -82,6 +82,7 @@ void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *
int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
+int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
const char *bch2_sb_validate_disk_groups(struct bch_sb *,
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 9f768d77..12f2ef44 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -772,9 +772,6 @@ static int hash_redo_key(struct btree_trans *trans,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c k)
{
- bch_err(trans->c, "hash_redo_key() not implemented yet");
- return -EINVAL;
-#if 0
struct bkey_i *delete;
struct bkey_i *tmp;
@@ -792,8 +789,14 @@ static int hash_redo_key(struct btree_trans *trans,
delete->k.p = k_iter->pos;
return bch2_btree_iter_traverse(k_iter) ?:
bch2_trans_update(trans, k_iter, delete, 0) ?:
- bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
-#endif
+ bch2_hash_set_snapshot(trans, desc, hash_info,
+ (subvol_inum) { 0, k.k->p.inode },
+ k.k->p.snapshot, tmp,
+ BCH_HASH_SET_MUST_CREATE,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
}
static int hash_check_key(struct btree_trans *trans,
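
This is the re-enablement the commit subject refers to: instead of the bch_err()/#if 0 stub, hash_redo_key() now deletes the key at its current (badly hashed) position and re-inserts it at the correct position via bch2_hash_set_snapshot(), committing with BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW (presumably because fsck may still be running read-only at this point). Passing a zero subvolume in (subvol_inum) { 0, k.k->p.inode } relies on the relaxed visibility check added to str_hash.h later in this patch:

	/* from bch2_hash_set_snapshot(), below: a zero subvol skips the
	 * subvolume visibility check entirely: */
	if (!inum.subvol || is_visible_key(desc, inum, k))
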
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 93771f83..a683a689 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -1387,7 +1387,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
struct closure *cl = &op->cl;
struct bio *bio = &op->write.op.wbio.bio;
- trace_promote(&rbio->bio);
+ trace_and_count(op->write.op.c, read_promote, &rbio->bio);
/* we now own pages: */
BUG_ON(!rbio->bounce);
@@ -1653,7 +1653,7 @@ static void bch2_rbio_retry(struct work_struct *work)
};
struct bch_io_failures failed = { .nr = 0 };
- trace_read_retry(&rbio->bio);
+ trace_and_count(c, read_retry, &rbio->bio);
if (rbio->retry == READ_RETRY_AVOID)
bch2_mark_io_failure(&failed, &rbio->pick);
@@ -1909,7 +1909,7 @@ static void bch2_read_endio(struct bio *bio)
if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
ptr_stale(ca, &rbio->pick.ptr)) {
- atomic_long_inc(&c->read_realloc_races);
+ trace_and_count(c, read_reuse_race, &rbio->bio);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
@@ -2197,7 +2197,7 @@ get_bio:
rbio->bio.bi_end_io = bch2_read_endio;
if (rbio->bounce)
- trace_read_bounce(&rbio->bio);
+ trace_and_count(c, read_bounce, &rbio->bio);
this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
@@ -2212,7 +2212,7 @@ get_bio:
if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
bio_inc_remaining(&orig->bio);
- trace_read_split(&orig->bio);
+ trace_and_count(c, read_split, &orig->bio);
}
if (!rbio->pick.idx) {
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 3f1cf1ac..3e8972c2 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -391,12 +391,12 @@ retry:
ret = journal_entry_open(j);
if (ret == JOURNAL_ERR_max_in_flight)
- trace_journal_entry_full(c);
+ trace_and_count(c, journal_entry_full, c);
unlock:
if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
!j->res_get_blocked_start) {
j->res_get_blocked_start = local_clock() ?: 1;
- trace_journal_full(c);
+ trace_and_count(c, journal_full, c);
}
can_discard = j->can_discard;
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 107521e1..55b86cbd 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1551,7 +1551,7 @@ static void do_journal_write(struct closure *cl)
bch2_bio_map(bio, w->data, sectors << 9);
- trace_journal_write(bio);
+ trace_and_count(c, journal_write, bio);
closure_bio_submit(bio, cl);
ca->journal.bucket_seq[ca->journal.cur_idx] =
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 9f8b63b3..e69595bd 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -641,7 +641,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
- trace_journal_reclaim_start(c, direct, kicked,
+ trace_and_count(c, journal_reclaim_start, c,
+ direct, kicked,
min_nr, min_key_cache,
j->prereserved.reserved,
j->prereserved.remaining,
@@ -657,7 +658,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
j->nr_direct_reclaim += nr_flushed;
else
j->nr_background_reclaim += nr_flushed;
- trace_journal_reclaim_finish(c, nr_flushed);
+ trace_and_count(c, journal_reclaim_finish, c, nr_flushed);
if (nr_flushed)
wake_up(&j->reclaim_wait);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 22470067..e85c3143 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -252,8 +252,8 @@ static int bch2_move_extent(struct btree_trans *trans,
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
-
- trace_move_extent(k.k);
+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
+ trace_move_extent_read(k.k);
atomic_add(io->read_sectors, &ctxt->read_sectors);
list_add_tail(&io->list, &ctxt->reads);
@@ -275,7 +275,7 @@ err_free:
kfree(io);
err:
percpu_ref_put(&c->writes);
- trace_move_alloc_mem_fail(k.k);
+ trace_and_count(c, move_extent_alloc_mem_fail, k.k);
return ret;
}
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index f913864e..35958c6b 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -165,7 +165,7 @@ static int bch2_copygc(struct bch_fs *c)
if (ret < 0)
bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
- trace_copygc(c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0);
+ trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0);
return ret;
}
@@ -221,7 +221,7 @@ static int bch2_copygc_thread(void *arg)
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
- trace_copygc_wait(c, wait, last + wait);
+ trace_and_count(c, copygc_wait, c, wait, last + wait);
c->copygc_wait = last + wait;
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index 591bbb9f..5c327b31 100644
--- a/libbcachefs/str_hash.h
+++ b/libbcachefs/str_hash.h
@@ -239,29 +239,26 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
-int bch2_hash_set(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- subvol_inum inum,
- struct bkey_i *insert, int flags)
+int bch2_hash_set_snapshot(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, u32 snapshot,
+ struct bkey_i *insert,
+ int flags,
+ int update_flags)
{
struct btree_iter iter, slot = { NULL };
struct bkey_s_c k;
bool found = false;
- u32 snapshot;
int ret;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
- SPOS(inum.inum,
+ SPOS(insert->k.p.inode,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
- POS(inum.inum, U64_MAX),
+ POS(insert->k.p.inode, U64_MAX),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (is_visible_key(desc, inum, k)) {
+ if (!inum.subvol || is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
@@ -304,6 +301,26 @@ not_found:
}
static __always_inline
+int bch2_hash_set(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum,
+ struct bkey_i *insert, int flags)
+{
+ u32 snapshot;
+ int ret;
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ insert->k.p.inode = inum.inum;
+
+ return bch2_hash_set_snapshot(trans, desc, info, inum,
+ snapshot, insert, flags, 0);
+}
+
+static __always_inline
int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
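
bch2_hash_set() survives as a thin wrapper: resolve the subvolume to a snapshot, stamp the inode number into the key's position, then delegate with update_flags = 0. Callers that already know their snapshot - fsck above - call the _snapshot variant directly. A hedged caller-side sketch (the dirent hash desc and the surrounding setup are illustrative, not from this patch):

	/* insert @new at its hashed position in one snapshot, bypassing
	 * subvolume visibility (inum.subvol == 0): */
	ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
				     (subvol_inum) { 0, new->k.p.inode },
				     new->k.p.snapshot, new,
				     BCH_HASH_SET_MUST_CREATE, 0);
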
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index ade09bdf..e1e70d35 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -796,7 +796,7 @@ int bch2_write_super(struct bch_fs *c)
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
int ret = 0;
- trace_write_super(c, _RET_IP_);
+ trace_and_count(c, write_super, c, _RET_IP_);
if (c->opts.very_degraded)
degraded_flags |= BCH_FORCE_IF_LOST;
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 7c634800..8b3ce780 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -1530,6 +1530,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
struct printbuf errbuf = PRINTBUF;
+ struct printbuf label = PRINTBUF;
int ret;
ret = bch2_read_super(path, &opts, &sb);
@@ -1540,6 +1541,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+ if (label.allocation_failure) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+
err = bch2_dev_may_add(sb.sb, c);
if (err) {
bch_err(c, "device add error: %s", err);
@@ -1620,6 +1629,14 @@ have_slot:
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
+ if (BCH_MEMBER_GROUP(&dev_mi)) {
+ ret = __bch2_dev_group_set(c, ca, label.buf);
+ if (ret) {
+ bch_err(c, "device add error: error setting label");
+ goto err_unlock;
+ }
+ }
+
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -1652,6 +1669,7 @@ err:
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
+ printbuf_exit(&label);
printbuf_exit(&errbuf);
return ret;
err_late:
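
The label handling needs the detour through text because BCH_MEMBER_GROUP() in the new device's superblock indexes that superblock's own disk-groups section, which means nothing to the filesystem it is joining; so the label is rendered to a string up front and re-resolved with __bch2_dev_group_set() once the device is attached and sb_lock is held. The printbuf lifecycle used here, in isolation (use_label() is hypothetical):

	struct printbuf label = PRINTBUF;	/* empty; allocates on demand */

	bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
	if (label.allocation_failure)		/* prints fail silently... */
		return -ENOMEM;			/* ...so check before use */

	use_label(label.buf);			/* hypothetical consumer */
	printbuf_exit(&label);			/* frees the buffer */
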
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 2dfed1ff..98449e42 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -190,11 +190,6 @@ read_attribute(internal_uuid);
read_attribute(has_data);
read_attribute(alloc_debug);
-read_attribute(read_realloc_races);
-read_attribute(extent_migrate_done);
-read_attribute(extent_migrate_raced);
-read_attribute(bucket_alloc_fail);
-
#define x(t, n, ...) read_attribute(t);
BCH_PERSISTENT_COUNTERS()
#undef x
@@ -378,15 +373,6 @@ SHOW(bch2_fs)
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c));
- sysfs_print(read_realloc_races,
- atomic_long_read(&c->read_realloc_races));
- sysfs_print(extent_migrate_done,
- atomic_long_read(&c->extent_migrate_done));
- sysfs_print(extent_migrate_raced,
- atomic_long_read(&c->extent_migrate_raced));
- sysfs_print(bucket_alloc_fail,
- atomic_long_read(&c->bucket_alloc_fail));
-
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
if (attr == &sysfs_gc_gens_pos)
@@ -625,11 +611,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_invalidates,
&sysfs_prune_cache,
- &sysfs_read_realloc_races,
- &sysfs_extent_migrate_done,
- &sysfs_extent_migrate_raced,
- &sysfs_bucket_alloc_fail,
-
&sysfs_gc_gens_pos,
&sysfs_copy_gc_enabled,
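
The four hand-rolled counters can go because the persistent counters now cover them: the x-macro pass kept above stamps out one read_attribute() per BCH_PERSISTENT_COUNTERS() entry. Expanded by hand for the counters this patch touches (the authoritative list lives in bcachefs_format.h):

	/* #define x(t, n, ...) read_attribute(t); expands to: */
	read_attribute(move_extent_read);
	read_attribute(move_extent_finish);
	read_attribute(move_extent_race);
	/* ...one per BCH_PERSISTENT_COUNTERS() entry */
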
diff --git a/linux/six.c b/linux/six.c
index 5b2d92c6..d2275055 100644
--- a/linux/six.c
+++ b/linux/six.c
@@ -19,6 +19,8 @@
#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
#define six_release(l) lock_release(l, _RET_IP_)
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
+
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */
u64 lock_val;
@@ -65,14 +67,15 @@ struct six_lock_vals {
}
static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
- union six_lock_state old)
+ union six_lock_state old,
+ struct task_struct *owner)
{
if (type != SIX_LOCK_intent)
return;
if (!old.intent_lock) {
EBUG_ON(lock->owner);
- lock->owner = current;
+ lock->owner = owner;
} else {
EBUG_ON(lock->owner != current);
}
@@ -88,64 +91,21 @@ static inline unsigned pcpu_read_count(struct six_lock *lock)
return read_count;
}
-struct six_lock_waiter {
- struct list_head list;
- struct task_struct *task;
-};
-
/* This is probably up there with the more evil things I've done */
#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
-static inline void six_lock_wakeup(struct six_lock *lock,
- union six_lock_state state,
- unsigned waitlist_id)
-{
- if (waitlist_id == SIX_LOCK_write) {
- if (state.write_locking && !state.read_lock) {
- struct task_struct *p = READ_ONCE(lock->owner);
- if (p)
- wake_up_process(p);
- }
- } else {
- struct list_head *wait_list = &lock->wait_list[waitlist_id];
- struct six_lock_waiter *w, *next;
-
- if (!(state.waiters & (1 << waitlist_id)))
- return;
-
- clear_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
-
- raw_spin_lock(&lock->wait_lock);
-
- list_for_each_entry_safe(w, next, wait_list, list) {
- list_del_init(&w->list);
-
- if (wake_up_process(w->task) &&
- waitlist_id != SIX_LOCK_read) {
- if (!list_empty(wait_list))
- set_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
- break;
- }
- }
-
- raw_spin_unlock(&lock->wait_lock);
- }
-}
-
-static __always_inline bool do_six_trylock_type(struct six_lock *lock,
- enum six_lock_type type,
- bool try)
+static int __do_six_trylock_type(struct six_lock *lock,
+ enum six_lock_type type,
+ struct task_struct *task,
+ bool try)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old, new;
- bool ret;
+ int ret;
u64 v;
- EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
+ EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
-
EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
/*
@@ -177,18 +137,6 @@ retry:
preempt_enable();
/*
- * If we failed because a writer was trying to take the
- * lock, issue a wakeup because we might have caused a
- * spurious trylock failure:
- */
- if (old.write_locking) {
- struct task_struct *p = READ_ONCE(lock->owner);
-
- if (p)
- wake_up_process(p);
- }
-
- /*
* If we failed from the lock path and the waiting bit wasn't
* set, set it:
*/
@@ -208,6 +156,14 @@ retry:
} while ((v = atomic64_cmpxchg(&lock->state.counter,
old.v, new.v)) != old.v);
}
+
+ /*
+ * If we failed because a writer was trying to take the
+ * lock, issue a wakeup because we might have caused a
+ * spurious trylock failure:
+ */
+ if (old.write_locking)
+ ret = -1 - SIX_LOCK_write;
} else if (type == SIX_LOCK_write && lock->readers) {
if (try) {
atomic64_add(__SIX_VAL(write_locking, 1),
@@ -227,9 +183,13 @@ retry:
if (ret || try)
v -= __SIX_VAL(write_locking, 1);
+ if (!ret && !try && !(lock->state.waiters & (1 << SIX_LOCK_write)))
+ v += __SIX_VAL(waiters, 1 << SIX_LOCK_write);
+
if (try && !ret) {
old.v = atomic64_add_return(v, &lock->state.counter);
- six_lock_wakeup(lock, old, SIX_LOCK_read);
+ if (old.waiters & (1 << SIX_LOCK_read))
+ ret = -1 - SIX_LOCK_read;
} else {
atomic64_add(v, &lock->state.counter);
}
@@ -243,8 +203,7 @@ retry:
if (type == SIX_LOCK_write)
new.write_locking = 0;
- } else if (!try && type != SIX_LOCK_write &&
- !(new.waiters & (1 << type)))
+ } else if (!try && !(new.waiters & (1 << type)))
new.waiters |= 1 << type;
else
break; /* waiting bit already set */
@@ -256,14 +215,84 @@ retry:
EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
}
- if (ret)
- six_set_owner(lock, type, old);
+ if (ret > 0)
+ six_set_owner(lock, type, old, task);
- EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
+ EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking));
return ret;
}
+static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
+{
+ struct six_lock_waiter *w, *next;
+ struct task_struct *task;
+ bool saw_one;
+ int ret;
+again:
+ ret = 0;
+ saw_one = false;
+ raw_spin_lock(&lock->wait_lock);
+
+ list_for_each_entry_safe(w, next, &lock->wait_list, list) {
+ if (w->lock_want != lock_type)
+ continue;
+
+ if (saw_one && lock_type != SIX_LOCK_read)
+ goto unlock;
+ saw_one = true;
+
+ ret = __do_six_trylock_type(lock, lock_type, w->task, false);
+ if (ret <= 0)
+ goto unlock;
+
+ __list_del(w->list.prev, w->list.next);
+ task = w->task;
+ /*
+ * Do no writes to @w besides setting lock_acquired - otherwise
+ * we would need a memory barrier:
+ */
+ barrier();
+ w->lock_acquired = true;
+ wake_up_process(task);
+ }
+
+ clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v);
+unlock:
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (ret < 0) {
+ lock_type = -ret - 1;
+ goto again;
+ }
+}
+
+static inline void six_lock_wakeup(struct six_lock *lock,
+ union six_lock_state state,
+ enum six_lock_type lock_type)
+{
+ if (lock_type == SIX_LOCK_write && state.read_lock)
+ return;
+
+ if (!(state.waiters & (1 << lock_type)))
+ return;
+
+ __six_lock_wakeup(lock, lock_type);
+}
+
+static bool do_six_trylock_type(struct six_lock *lock,
+ enum six_lock_type type,
+ bool try)
+{
+ int ret;
+
+ ret = __do_six_trylock_type(lock, type, current, try);
+ if (ret < 0)
+ __six_lock_wakeup(lock, -ret - 1);
+
+ return ret > 0;
+}
+
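
The handoff protocol above inverts the old scheme: rather than waking a waiter and letting it retry, __six_lock_wakeup() acquires the lock on the waiter's behalf - __do_six_trylock_type() takes the owning task as a parameter precisely so six_set_owner() can record the still-sleeping waiter - then publishes success through the single lock_acquired write. Negative returns encode "a wakeup on list N is owed" as -1 - N. Both halves, condensed from the code above and the slowpath below:

	/* waker, under wait_lock (see __six_lock_wakeup() above): */
	ret = __do_six_trylock_type(lock, w->lock_want, w->task, false);
	if (ret > 0) {
		__list_del(w->list.prev, w->list.next);
		task = w->task;		/* save: *w may vanish once... */
		barrier();
		w->lock_acquired = true;/* ...this lets the waiter return */
		wake_up_process(task);
	}

	/* waiter, in the slowpath below: no trylock retry, just the flag */
	while (!wait->lock_acquired)
		schedule();		/* with set_current_state() etc. */
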
__always_inline __flatten
static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
{
@@ -304,12 +333,8 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
* Similar to the lock path, we may have caused a spurious write
* lock fail and need to issue a wakeup:
*/
- if (old.write_locking) {
- struct task_struct *p = READ_ONCE(lock->owner);
-
- if (p)
- wake_up_process(p);
- }
+ if (old.write_locking)
+ six_lock_wakeup(lock, old, SIX_LOCK_write);
if (ret)
six_acquire(&lock->dep_map, 1);
@@ -327,7 +352,7 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
old.v,
old.v + l[type].lock_val)) != old.v);
- six_set_owner(lock, type, old);
+ six_set_owner(lock, type, old, current);
if (type != SIX_LOCK_write)
six_acquire(&lock->dep_map, 1);
return true;
@@ -335,33 +360,26 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
-static inline int six_can_spin_on_owner(struct six_lock *lock)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait)
{
- struct task_struct *owner;
- int retval = 1;
+ struct task_struct *owner, *task = current;
- if (need_resched())
- return 0;
+ switch (wait->lock_want) {
+ case SIX_LOCK_read:
+ break;
+ case SIX_LOCK_intent:
+ if (lock->wait_list.next != &wait->list)
+ return false;
+ break;
+ case SIX_LOCK_write:
+ return false;
+ }
rcu_read_lock();
owner = READ_ONCE(lock->owner);
- if (owner)
- retval = owner->on_cpu;
- rcu_read_unlock();
- /*
- * if lock->owner is not set, the mutex owner may have just acquired
- * it and not set the owner yet or the mutex has been released.
- */
- return retval;
-}
-static inline bool six_spin_on_owner(struct six_lock *lock,
- struct task_struct *owner)
-{
- bool ret = true;
-
- rcu_read_lock();
- while (lock->owner == owner) {
+ while (owner && lock->owner == owner) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
* checking lock->owner still matches owner. If that fails,
@@ -370,85 +388,27 @@ static inline bool six_spin_on_owner(struct six_lock *lock,
*/
barrier();
- if (!owner->on_cpu || need_resched()) {
- ret = false;
- break;
- }
-
- cpu_relax();
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
-{
- struct task_struct *task = current;
-
- if (type == SIX_LOCK_write)
- return false;
-
- preempt_disable();
- if (!six_can_spin_on_owner(lock))
- goto fail;
-
- if (!osq_lock(&lock->osq))
- goto fail;
-
- while (1) {
- struct task_struct *owner;
-
/*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
- */
- owner = READ_ONCE(lock->owner);
- if (owner && !six_spin_on_owner(lock, owner))
- break;
-
- if (do_six_trylock_type(lock, type, false)) {
- osq_unlock(&lock->osq);
- preempt_enable();
- return true;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
+ * Stop if we're an RT task - we'd live-lock, since we'd never let
* the owner complete.
*/
- if (!owner && (need_resched() || rt_task(task)))
+ if (wait->lock_acquired ||
+ !owner->on_cpu ||
+ rt_task(task) ||
+ need_resched())
break;
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
cpu_relax();
}
+ rcu_read_unlock();
- osq_unlock(&lock->osq);
-fail:
- preempt_enable();
-
- /*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock again. This avoids getting
- * scheduled out right after we obtained the lock.
- */
- if (need_resched())
- schedule();
-
- return false;
+ return wait->lock_acquired;
}
#else /* CONFIG_LOCK_SPIN_ON_OWNER */
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait)
{
return false;
}
@@ -457,10 +417,10 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
noinline
static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
union six_lock_state old;
- struct six_lock_waiter wait;
int ret = 0;
if (type == SIX_LOCK_write) {
@@ -469,46 +429,58 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
smp_mb__after_atomic();
}
- ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
- if (ret)
- goto out_before_sleep;
+ lock_contended(&lock->dep_map, _RET_IP_);
- if (six_optimistic_spin(lock, type))
- goto out_before_sleep;
+ wait->task = current;
+ wait->lock_want = type;
+ wait->lock_acquired = false;
- lock_contended(&lock->dep_map, _RET_IP_);
+ raw_spin_lock(&lock->wait_lock);
+ /*
+ * Retry taking the lock after taking the waitlist lock - we may have
+ * raced with an unlock:
+ */
+ ret = __do_six_trylock_type(lock, type, current, false);
+ if (ret <= 0)
+ list_add_tail(&wait->list, &lock->wait_list);
+ raw_spin_unlock(&lock->wait_lock);
- INIT_LIST_HEAD(&wait.list);
- wait.task = current;
+ if (unlikely(ret > 0)) {
+ ret = 0;
+ goto out;
+ }
+
+ if (unlikely(ret < 0)) {
+ __six_lock_wakeup(lock, -ret - 1);
+ ret = 0;
+ }
+
+ if (six_optimistic_spin(lock, wait))
+ goto out;
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (type == SIX_LOCK_write)
- EBUG_ON(lock->owner != current);
- else if (list_empty_careful(&wait.list)) {
- raw_spin_lock(&lock->wait_lock);
- list_add_tail(&wait.list, &lock->wait_list[type]);
- raw_spin_unlock(&lock->wait_lock);
- }
- if (do_six_trylock_type(lock, type, false))
+ if (wait->lock_acquired)
break;
ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
- if (ret)
+ if (unlikely(ret)) {
+ raw_spin_lock(&lock->wait_lock);
+ if (!wait->lock_acquired)
+ list_del(&wait->list);
+ raw_spin_unlock(&lock->wait_lock);
+
+ if (wait->lock_acquired)
+ do_six_unlock_type(lock, type);
break;
+ }
schedule();
}
__set_current_state(TASK_RUNNING);
-
- if (!list_empty_careful(&wait.list)) {
- raw_spin_lock(&lock->wait_lock);
- list_del_init(&wait.list);
- raw_spin_unlock(&lock->wait_lock);
- }
-out_before_sleep:
+out:
if (ret && type == SIX_LOCK_write) {
old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
&lock->state.counter);
@@ -518,9 +490,10 @@ out_before_sleep:
return ret;
}
-__always_inline
-static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
+__always_inline __flatten
+static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
{
int ret;
@@ -528,7 +501,7 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
six_acquire(&lock->dep_map, 0);
ret = do_six_trylock_type(lock, type, true) ? 0
- : __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
+ : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p);
if (ret && type != SIX_LOCK_write)
six_release(&lock->dep_map);
@@ -538,28 +511,23 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
return ret;
}
+__always_inline
+static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ struct six_lock_waiter wait;
+
+ return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p);
+}
+
__always_inline __flatten
-static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state state;
- EBUG_ON(type == SIX_LOCK_write &&
- !(lock->state.v & __SIX_LOCK_HELD_intent));
-
- if (type != SIX_LOCK_write)
- six_release(&lock->dep_map);
-
- if (type == SIX_LOCK_intent) {
- EBUG_ON(lock->owner != current);
-
- if (lock->intent_lock_recurse) {
- --lock->intent_lock_recurse;
- return;
- }
-
+ if (type == SIX_LOCK_intent)
lock->owner = NULL;
- }
if (type == SIX_LOCK_read &&
lock->readers) {
@@ -576,6 +544,27 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
six_lock_wakeup(lock, state, l[type].unlock_wakeup);
}
+__always_inline __flatten
+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ EBUG_ON(type == SIX_LOCK_write &&
+ !(lock->state.v & __SIX_LOCK_HELD_intent));
+ EBUG_ON((type == SIX_LOCK_write ||
+ type == SIX_LOCK_intent) &&
+ lock->owner != current);
+
+ if (type != SIX_LOCK_write)
+ six_release(&lock->dep_map);
+
+ if (type == SIX_LOCK_intent &&
+ lock->intent_lock_recurse) {
+ --lock->intent_lock_recurse;
+ return;
+ }
+
+ do_six_unlock_type(lock, type);
+}
+
#define __SIX_LOCK(type) \
bool six_trylock_##type(struct six_lock *lock) \
{ \
@@ -596,6 +585,14 @@ int six_lock_##type(struct six_lock *lock, \
} \
EXPORT_SYMBOL_GPL(six_lock_##type); \
\
+int six_lock_waiter_##type(struct six_lock *lock, \
+ struct six_lock_waiter *wait, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p)\
+{ \
+ return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p);\
+} \
+EXPORT_SYMBOL_GPL(six_lock_waiter_##type); \
+ \
void six_unlock_##type(struct six_lock *lock) \
{ \
__six_unlock_type(lock, SIX_LOCK_##type); \
@@ -639,7 +636,7 @@ bool six_lock_tryupgrade(struct six_lock *lock)
if (lock->readers)
this_cpu_dec(*lock->readers);
- six_set_owner(lock, SIX_LOCK_intent, old);
+ six_set_owner(lock, SIX_LOCK_intent, old, current);
return true;
}
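
six_lock_waiter_read/intent/write, generated above, take the waiter from the caller instead of hiding it on the slowpath's stack, so callers (presumably the btree locking code, given the cycle-detector comment earlier in this patch) can see who is blocked on what while the thread sleeps. A minimal hedged usage sketch (lock initialized elsewhere, no should_sleep hook):

	struct six_lock_waiter w;
	int ret;

	/* same semantics as six_lock_intent(), but our wait-list entry -
	 * w.lock_want, w.lock_acquired - stays visible to other threads
	 * for as long as we block: */
	ret = six_lock_waiter_intent(&lock, &w, NULL, NULL);
	if (!ret)
		six_unlock_intent(&lock);
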
@@ -701,44 +698,12 @@ void six_lock_wakeup_all(struct six_lock *lock)
struct six_lock_waiter *w;
raw_spin_lock(&lock->wait_lock);
-
- list_for_each_entry(w, &lock->wait_list[0], list)
- wake_up_process(w->task);
- list_for_each_entry(w, &lock->wait_list[1], list)
+ list_for_each_entry(w, &lock->wait_list, list)
wake_up_process(w->task);
-
raw_spin_unlock(&lock->wait_lock);
}
EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-struct free_pcpu_rcu {
- struct rcu_head rcu;
- void __percpu *p;
-};
-
-static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
-{
- struct free_pcpu_rcu *rcu =
- container_of(_rcu, struct free_pcpu_rcu, rcu);
-
- free_percpu(rcu->p);
- kfree(rcu);
-}
-
-void six_lock_pcpu_free_rcu(struct six_lock *lock)
-{
- struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
-
- if (!rcu)
- return;
-
- rcu->p = lock->readers;
- lock->readers = NULL;
-
- call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
-}
-EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
-
void six_lock_pcpu_free(struct six_lock *lock)
{
BUG_ON(lock->readers && pcpu_read_count(lock));
@@ -763,15 +728,19 @@ EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
*/
struct six_lock_count six_lock_counts(struct six_lock *lock)
{
- struct six_lock_count ret = { 0, lock->state.intent_lock };
+ struct six_lock_count ret;
+
+ ret.n[SIX_LOCK_read] = 0;
+ ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse;
+ ret.n[SIX_LOCK_write] = lock->state.seq & 1;
if (!lock->readers)
- ret.read += lock->state.read_lock;
+ ret.n[SIX_LOCK_read] += lock->state.read_lock;
else {
int cpu;
for_each_possible_cpu(cpu)
- ret.read += *per_cpu_ptr(lock->readers, cpu);
+ ret.n[SIX_LOCK_read] += *per_cpu_ptr(lock->readers, cpu);
}
return ret;