From f44cc269a1c148ad83332d85fe54607e8874ca79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 20:52:39 -0400 Subject: bcachefs: fix seqmutex_relock() We were grabbing the sequence number before unlock incremented it - fix this by moving the increment to seqmutex_lock() (so the seqmutex_relock() failure path skips the mutex_trylock()), and returning the sequence number from unlock(), to make the API simpler and safer. Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/bcachefs/debug.c') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 51cbf3928361..8ec2d44e4956 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -575,7 +575,6 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; - u32 seq; i->ubuf = buf; i->size = size; @@ -589,8 +588,7 @@ restart: continue; closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + u32 seq = seqmutex_unlock(&c->btree_trans_lock); ret = flush_buf(i); if (ret) { @@ -811,7 +809,6 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; - u32 seq; i->ubuf = buf; i->size = size; @@ -828,8 +825,7 @@ restart: continue; closure_get(&trans->ref); - seq = seqmutex_seq(&c->btree_trans_lock); - seqmutex_unlock(&c->btree_trans_lock); + u32 seq = seqmutex_unlock(&c->btree_trans_lock); ret = flush_buf(i); if (ret) { -- cgit v1.2.3 From 18e92841e87bc548fcb91530115a66e72eecb10c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 20:59:09 -0400 Subject: bcachefs: Make btree_deadlock_to_text() clearer btree_deadlock_to_text() searches the list of btree transactions to find a deadlock - when it finds one it's done; unlike the other *_read() functions, it does not print
each object. Factor out btree_deadlock_to_text() to make this clearer. Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 52 +++++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 23 deletions(-) (limited to 'fs/bcachefs/debug.c') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 8ec2d44e4956..ecfdb21ebade 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -802,48 +802,54 @@ static const struct file_operations btree_transaction_stats_op = { .read = btree_transaction_stats_read, }; -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +/* walk btree transactions until we find a deadlock and print it */ +static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) { - struct dump_iter *i = file->private_data; - struct bch_fs *c = i->c; struct btree_trans *trans; - ssize_t ret = 0; - - i->ubuf = buf; - i->size = size; - i->ret = 0; - - if (i->iter) - goto out; + pid_t iter = 0; restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { struct task_struct *task = READ_ONCE(trans->locking_wait.task); - if (!task || task->pid <= i->iter) + if (!task || task->pid <= iter) continue; + iter = task->pid; + closure_get(&trans->ref); - u32 seq = seqmutex_unlock(&c->btree_trans_lock); - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto out; - } + u32 seq = seqmutex_unlock(&c->btree_trans_lock); - bch2_check_for_deadlock(trans, &i->buf); - - i->iter = task->pid; + bool found = bch2_check_for_deadlock(trans, out) != 0; closure_put(&trans->ref); + if (found) + return; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } seqmutex_unlock(&c->btree_trans_lock); -out: +} + +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + ssize_t ret = 0; + + i->ubuf = 
buf; + i->size = size; + i->ret = 0; + + if (!i->iter) { + btree_deadlock_to_text(&i->buf, c); + i->iter++; + } + if (i->buf.allocation_failure) ret = -ENOMEM; -- cgit v1.2.3 From de611ab6fc5ed0d68dd46319b9913353e3b459e9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 22:02:09 -0400 Subject: bcachefs: Fix race between trans_put() and btree_transactions_read() debug.c was using closure_get() on a different thread's closure where we don't know if the object being refcounted is alive. We keep btree_trans objects on a list so they can be printed by debug code, and because it is cost prohibitive to touch the btree_trans list every time we allocate and free btree_trans objects, cached objects are also on this list. However, we do not want the debug code to see cached but not in use btree_trans objects - critically because the btree_paths array will have been freed (if it was reallocated). closure_get() is also incorrect to use when that get may race with it hitting zero, i.e. we must already have a ref on the object or know the ref can't currently hit 0 for other reasons (as used in the cycle detector). To fix this, use the previously introduced closure_get_not_zero(), closure_return_sync(), and closure_init_stack_release(); the debug code now can only take a ref on a trans object if it's alive and in use.
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 10 ++++------ fs/bcachefs/debug.c | 19 +++++++++---------- 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'fs/bcachefs/debug.c') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3a1419d17888..15c1c7cfefe6 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3130,7 +3130,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); memset(trans, 0, sizeof(*trans)); - closure_init_stack(&trans->ref); seqmutex_lock(&c->btree_trans_lock); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { @@ -3161,7 +3160,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) list_add_done: seqmutex_unlock(&c->btree_trans_lock); got_trans: - trans->ref.closure_get_happened = false; trans->c = c; trans->last_begin_time = local_clock(); trans->fn_idx = fn_idx; @@ -3200,6 +3198,8 @@ got_trans: trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; + + closure_init_stack_release(&trans->ref); return trans; } @@ -3257,10 +3257,10 @@ void bch2_trans_put(struct btree_trans *trans) bch2_journal_keys_put(c); /* - * trans->ref protects trans->locking_wait.task, btree_paths arary; used + * trans->ref protects trans->locking_wait.task, btree_paths array; used * by cycle detector */ - closure_sync(&trans->ref); + closure_return_sync(&trans->ref); trans->locking_wait.task = NULL; unsigned long *paths_allocated = trans->paths_allocated; @@ -3385,8 +3385,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; if (trans) { - closure_sync(&trans->ref); - seqmutex_lock(&c->btree_trans_lock); list_del(&trans->list); seqmutex_unlock(&c->btree_trans_lock); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index ecfdb21ebade..61c50522abb9 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ 
-587,14 +587,10 @@ restart: if (!task || task->pid <= i->iter) continue; - closure_get(&trans->ref); - u32 seq = seqmutex_unlock(&c->btree_trans_lock); + if (!closure_get_not_zero(&trans->ref)) + continue; - ret = flush_buf(i); - if (ret) { - closure_put(&trans->ref); - goto unlocked; - } + u32 seq = seqmutex_unlock(&c->btree_trans_lock); bch2_btree_trans_to_text(&i->buf, trans); @@ -604,10 +600,12 @@ restart: printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = task->pid; - closure_put(&trans->ref); + ret = flush_buf(i); + if (ret) + goto unlocked; + if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } @@ -817,7 +815,8 @@ restart: iter = task->pid; - closure_get(&trans->ref); + if (!closure_get_not_zero(&trans->ref)) + continue; u32 seq = seqmutex_unlock(&c->btree_trans_lock); -- cgit v1.2.3 From 1aaf5cb41b8e92dcd3ac7e047124cb0e3e27f1c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 22:11:01 -0400 Subject: bcachefs: Fix btree_trans list ordering The debug code relies on btree_trans_list being ordered so that it can resume on subsequent calls or lock restarts. However, it was using trans->locking_wait.task->pid, which is incorrect since btree_trans objects are cached and reused - typically by different tasks. Fix this by switching to pointer order, and also sort them lazily when required - speeding up the btree_trans_get() fastpath.
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 9 ++------- fs/bcachefs/debug.c | 36 ++++++++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 11 deletions(-) (limited to 'fs/bcachefs/debug.c') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 15c1c7cfefe6..0ed9e6574fcd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3149,15 +3149,10 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) BUG_ON(pos_task && pid == pos_task->pid && pos->locked); - - if (pos_task && pid < pos_task->pid) { - list_add_tail(&trans->list, &pos->list); - goto list_add_done; - } } } - list_add_tail(&trans->list, &c->btree_trans_list); -list_add_done: + + list_add(&trans->list, &c->btree_trans_list); seqmutex_unlock(&c->btree_trans_lock); got_trans: trans->c = c; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 61c50522abb9..f0d4727c4dc2 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -568,6 +568,32 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; +typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r); + +static void list_sort(struct list_head *head, list_cmp_fn cmp) +{ + struct list_head *pos; + + list_for_each(pos, head) + while (!list_is_last(pos, head) && + cmp(pos, pos->next) > 0) { + struct list_head *pos2, *next = pos->next; + + list_del(next); + list_for_each(pos2, head) + if (cmp(next, pos2) < 0) + goto pos_found; + BUG(); +pos_found: + list_add_tail(next, pos2); + } +} + +static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r) +{ + return cmp_int(l, r); +} + static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -581,12 +607,14 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, i->ret = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, 
&c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= i->iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans < i->iter) continue; + i->iter = (ulong) trans; + if (!closure_get_not_zero(&trans->ref)) continue; @@ -596,7 +624,7 @@ restart: prt_printf(&i->buf, "backtrace:\n"); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL); + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); -- cgit v1.2.3