From 13f53aa228c83731226f4a359983215f1f7c2a47 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Fri, 13 Nov 2020 14:41:06 -0500
Subject: Update bcachefs sources to d1fd471830 bcachefs: Add more debug checks

---
 .bcachefs_revision                  |   2 +-
 include/linux/bitops.h              |  11 ++
 include/linux/kernel.h              |   2 +
 include/linux/srcu.h                |  31 +++
 include/linux/types.h               |   1 +
 include/trace/events/bcachefs.h     |  10 +-
 libbcachefs/bcachefs.h              |   3 +
 libbcachefs/bkey_methods.c          |  18 ++-
 libbcachefs/bset.c                  |  94 ++++++-------
 libbcachefs/btree_cache.c           |   8 +-
 libbcachefs/btree_io.c              |   9 +-
 libbcachefs/btree_io.h              |  17 +++
 libbcachefs/btree_iter.c            |  28 ++++-
 libbcachefs/btree_key_cache.c       | 130 ++++++++++++++++++--
 libbcachefs/btree_types.h           |   9 +-
 libbcachefs/btree_update_interior.c |  39 ++++--
 libbcachefs/btree_update_interior.h |   3 +
 libbcachefs/btree_update_leaf.c     |   2 +-
 libbcachefs/buckets.c               |   2 +-
 libbcachefs/fs-io.c                 |  68 ++++++++++-
 libbcachefs/fs.c                    |   8 +-
 libbcachefs/fs.h                    |   1 +
 libbcachefs/inode.c                 |  15 +--
 libbcachefs/journal.c               | 230 ++++++++----------------------
 libbcachefs/journal.h               |   7 +-
 libbcachefs/journal_io.c            | 117 ++++++++--------
 libbcachefs/journal_reclaim.c       |   7 ++
 libbcachefs/journal_types.h         |   3 +-
 libbcachefs/recovery.c              |   3 +
 libbcachefs/sysfs.c                 |   2 +-
 30 files changed, 548 insertions(+), 332 deletions(-)
 create mode 100644 include/linux/srcu.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index dc583047..9c20ba85 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-1d669389f79de8571732c13fdf4d23039e2308fd
+d1fd47183051729471bce1c9f84fa63cb84dc557
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index f2183d54..2fe736e9 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -85,6 +85,17 @@ static inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
 	return (old & mask) != 0;
 }
 
+static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
+	unsigned long old;
+
+	old = __atomic_fetch_and(p, ~mask, __ATOMIC_RELAXED);
+
+	return (old & mask) != 0;
+}
+
 static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 10d94c5e..4b45306d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -219,4 +219,6 @@ struct qstr {
 
 #define POISON_FREE 0x6b
 
+static inline void dump_stack(void) {}
+
 #endif
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
new file mode 100644
index 00000000..75823cf2
--- /dev/null
+++ b/include/linux/srcu.h
@@ -0,0 +1,31 @@
+#ifndef __TOOLS_LINUX_SRCU_H
+#define __TOOLS_LINUX_SRCU_H
+
+struct srcu_struct {
+};
+
+static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) {}
+
+static inline int srcu_read_lock(struct srcu_struct *ssp)
+{
+	return 0;
+}
+
+static inline bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+	return false;
+}
+
+static inline unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+	return 0;
+}
+
+static inline void cleanup_srcu_struct(struct srcu_struct *ssp) {}
+
+static inline int init_srcu_struct(struct srcu_struct *ssp)
+{
+	return 0;
+}
+
+#endif /* __TOOLS_LINUX_SRCU_H */
diff --git a/include/linux/types.h b/include/linux/types.h
index 387c3831..1e125550 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -31,6 +31,7 @@ typedef unsigned gfp_t;
 #define __GFP_IO	0
 #define __GFP_NOWARN	0
 #define __GFP_NORETRY	0
+#define __GFP_NOFAIL	0
 #define __GFP_ZERO	1
 
 #define PAGE_ALLOC_COSTLY_ORDER	6
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index ba2c5555..a8b8c5b6 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -513,7 +513,7 @@ TRACE_EVENT(transaction_restart_ip,
 		__entry->ip = ip;
 	),
 
-	TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip)
+	TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip)
 );
 
 DECLARE_EVENT_CLASS(transaction_restart,
@@ -528,7 +528,7 @@ DECLARE_EVENT_CLASS(transaction_restart,
 		__entry->ip = ip;
 	),
 
-	TP_printk("%pf", (void *) __entry->ip)
+	TP_printk("%ps", (void *) __entry->ip)
 );
 
 DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused,
@@ -568,7 +568,7 @@ TRACE_EVENT(trans_restart_would_deadlock,
 		__entry->want_iter_type	= want_iter_type;
 	),
 
-	TP_printk("%pF %pF because %u have %u:%u want %u:%u",
+	TP_printk("%ps %pS because %u have %u:%u want %u:%u",
 		  (void *) __entry->trans_ip,
 		  (void *) __entry->caller_ip,
 		  __entry->reason,
@@ -592,7 +592,7 @@ TRACE_EVENT(trans_restart_iters_realloced,
 		__entry->nr	= nr;
 	),
 
-	TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr)
+	TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr)
 );
 
 TRACE_EVENT(trans_restart_mem_realloced,
@@ -609,7 +609,7 @@ TRACE_EVENT(trans_restart_mem_realloced,
 		__entry->bytes	= bytes;
 	),
 
-	TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes)
+	TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes)
 );
 
 DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get,
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 35311dbb..b20895a4 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -193,6 +193,7 @@
 #include <linux/rwsem.h>
 #include <linux/seqlock.h>
 #include <linux/shrinker.h>
+#include <linux/srcu.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <linux/zstd.h>
@@ -642,6 +643,8 @@ struct bch_fs {
 	mempool_t		btree_iters_pool;
 	struct btree_iter_buf  __percpu	*btree_iters_bufs;
 
+	struct srcu_struct	btree_trans_barrier;
+
 	struct btree_key_cache	btree_key_cache;
 
 	struct workqueue_struct	*wq;
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 99b7fce2..f5779795 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -181,8 +181,12 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
 void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
 {
 	if (k) {
-		pr_buf(out, "u64s %u type %s ", k->u64s,
-		       bch2_bkey_types[k->type]);
+		pr_buf(out, "u64s %u type ", k->u64s);
+
+		if (k->type < KEY_TYPE_MAX)
+			pr_buf(out, "%s ", bch2_bkey_types[k->type]);
+		else
+			pr_buf(out, "%u ", k->type);
 
 		bch2_bpos_to_text(out, k->p);
 
@@ -196,10 +200,14 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
 void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
 		      struct bkey_s_c k)
 {
-	const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
+	if (k.k->type < KEY_TYPE_MAX) {
+		const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
 
-	if (likely(ops->val_to_text))
-		ops->val_to_text(out, c, k);
+		if (likely(ops->val_to_text))
+			ops->val_to_text(out, c, k);
+	} else {
+		pr_buf(out, "(invalid type %u)", k.k->type);
+	}
 }
 
 void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 26716657..1c7318c6 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -604,53 +604,23 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
 	return (u16) v;
 }
 
-static void make_bfloat(struct btree *b, struct bset_tree *t,
-			unsigned j,
-			struct bkey_packed *min_key,
-			struct bkey_packed *max_key)
+__always_inline
+static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
+				 unsigned j,
+				 struct bkey_packed *min_key,
+				 struct bkey_packed *max_key)
 {
 	struct bkey_float *f = bkey_float(b, t, j);
 	struct bkey_packed *m = tree_to_bkey(b, t, j);
-	struct bkey_packed *l, *r;
+	struct bkey_packed *l = is_power_of_2(j)
+		? min_key
+		: tree_to_prev_bkey(b, t, j >> ffs(j));
+	struct bkey_packed *r = is_power_of_2(j + 1)
+		? max_key
+		: tree_to_bkey(b, t, j >> (ffz(j) + 1));
 	unsigned mantissa;
 	int shift, exponent, high_bit;
 
-	if (is_power_of_2(j)) {
-		l = min_key;
-
-		if (!l->u64s) {
-			if (!bkey_pack_pos(l, b->data->min_key, b)) {
-				struct bkey_i tmp;
-
-				bkey_init(&tmp.k);
-				tmp.k.p = b->data->min_key;
-				bkey_copy(l, &tmp);
-			}
-		}
-	} else {
-		l = tree_to_prev_bkey(b, t, j >> ffs(j));
-
-		EBUG_ON(m < l);
-	}
-
-	if (is_power_of_2(j + 1)) {
-		r = max_key;
-
-		if (!r->u64s) {
-			if (!bkey_pack_pos(r, t->max_key, b)) {
-				struct bkey_i tmp;
-
-				bkey_init(&tmp.k);
-				tmp.k.p = t->max_key;
-				bkey_copy(r, &tmp);
-			}
-		}
-	} else {
-		r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
-		EBUG_ON(m > r);
-	}
-
 	/*
 	 * for failed bfloats, the lookup code falls back to comparing against
 	 * the original key.
@@ -707,6 +677,30 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
 	f->mantissa = mantissa;
 }
 
+static void make_bfloat(struct btree *b, struct bset_tree *t,
+			unsigned j,
+			struct bkey_packed *min_key,
+			struct bkey_packed *max_key)
+{
+	struct bkey_i *k;
+
+	if (is_power_of_2(j) &&
+	    !min_key->u64s) {
+		k = (void *) min_key;
+		bkey_init(&k->k);
+		k->k.p = b->data->min_key;
+	}
+
+	if (is_power_of_2(j + 1) &&
+	    !max_key->u64s) {
+		k = (void *) max_key;
+		bkey_init(&k->k);
+		k->k.p = t->max_key;
+	}
+
+	__make_bfloat(b, t, j, min_key, max_key);
+}
+
 /* bytes remaining - only valid for last bset: */
 static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
 {
@@ -726,7 +720,7 @@ static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_t
 	return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
 }
 
-static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
 {
 	struct bkey_packed *k;
 
@@ -745,15 +739,12 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
 	}
 }
 
-static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
 {
 	struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
-	struct bkey_packed min_key, max_key;
+	struct bkey_i min_key, max_key;
 	unsigned j, cacheline = 1;
 
-	/* signal to make_bfloat() that they're uninitialized: */
-	min_key.u64s = max_key.u64s = 0;
-
 	t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
 		      bset_ro_tree_capacity(b, t));
 retry:
@@ -789,9 +780,16 @@ retry:
 
 	t->max_key = bkey_unpack_pos(b, prev);
 
+	bkey_init(&min_key.k);
+	min_key.k.p = b->data->min_key;
+	bkey_init(&max_key.k);
+	max_key.k.p = t->max_key;
+
 	/* Then we build the tree */
 	eytzinger1_for_each(j, t->size)
-		make_bfloat(b, t, j, &min_key, &max_key);
+		__make_bfloat(b, t, j,
+			      bkey_to_packed(&min_key),
+			      bkey_to_packed(&max_key));
 }
 
 static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 325a1661..5bceff48 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -328,9 +328,9 @@ restart:
 			clear_btree_node_accessed(b);
 	}
 
-	memalloc_nofs_restore(flags);
 	mutex_unlock(&bc->lock);
 out:
+	memalloc_nofs_restore(flags);
 	return (unsigned long) freed * btree_pages(c);
 }
 
@@ -381,11 +381,13 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 		if (btree_node_dirty(b))
 			bch2_btree_complete_write(c, b, btree_current_write(b));
 
-		clear_btree_node_dirty(b);
+		clear_btree_node_dirty(c, b);
 
 		btree_node_data_free(c, b);
 	}
 
+	BUG_ON(atomic_read(&c->btree_cache.dirty));
+
 	while (!list_empty(&bc->freed)) {
 		b = list_first_entry(&bc->freed, struct btree, list);
 		list_del(&b->list);
@@ -445,7 +447,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 	bc->shrink.scan_objects	= bch2_btree_cache_scan;
 	bc->shrink.seeks	= 4;
 	bc->shrink.batch	= btree_pages(c) * 2;
-	register_shrinker(&bc->shrink);
+	ret = register_shrinker(&bc->shrink);
 out:
 	pr_verbose_init(c->opts, "ret %i", ret);
 	return ret;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 10a00085..2406745f 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1442,8 +1442,10 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
 
 	ret = validate_bset(c, b, i, sectors, WRITE, false) ?:
 		validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
-	if (ret)
+	if (ret) {
 		bch2_inconsistent_error(c);
+		dump_stack();
+	}
 
 	return ret;
 }
@@ -1498,6 +1500,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 		new ^= (1 << BTREE_NODE_write_idx);
 	} while (cmpxchg_acquire(&b->flags, old, new) != old);
 
+	atomic_dec(&c->btree_cache.dirty);
+
 	BUG_ON(btree_node_fake(b));
 	BUG_ON((b->will_make_reachable != 0) != !b->written);
 
@@ -1530,6 +1534,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 		seq = max(seq, le64_to_cpu(i->journal_seq));
 	}
 
+	/* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
+	bytes += 8;
+
 	data = btree_bounce_alloc(c, bytes, &used_mempool);
 
 	if (!b->written) {
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 626d0f07..1a4b11e9 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -14,6 +14,23 @@ struct btree_write;
 struct btree;
 struct btree_iter;
 
+static inline bool btree_node_dirty(struct btree *b)
+{
+	return test_bit(BTREE_NODE_dirty, &b->flags);
+}
+
+static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+	if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
+		atomic_inc(&c->btree_cache.dirty);
+}
+
+static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+	if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
+		atomic_dec(&c->btree_cache.dirty);
+}
+
 struct btree_read_bio {
 	struct bch_fs		*c;
 	u64			start_time;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 58f1a3dd..96cc5394 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -2342,12 +2342,15 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
 	unsigned new_size = BTREE_ITER_MAX;
 	size_t iters_bytes	= sizeof(struct btree_iter) * new_size;
 	size_t updates_bytes	= sizeof(struct btree_insert_entry) * new_size;
-	void *p;
+	void *p = NULL;
 
 	BUG_ON(trans->used_mempool);
 
-	p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?:
-		mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+#ifdef __KERNEL__
+	p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
+#endif
+	if (!p)
+		p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
 
 	trans->iters		= p; p += iters_bytes;
 	trans->updates		= p; p += updates_bytes;
@@ -2369,8 +2372,12 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
 	 */
 	bch2_trans_alloc_iters(trans, c);
 
-	if (expected_mem_bytes)
-		bch2_trans_preload_mem(trans, expected_mem_bytes);
+	if (expected_mem_bytes) {
+		trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
+		trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+	}
+
+	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 
 #ifdef CONFIG_BCACHEFS_DEBUG
 	trans->pid = current->pid;
@@ -2392,12 +2399,19 @@ int bch2_trans_exit(struct btree_trans *trans)
 	mutex_unlock(&trans->c->btree_trans_lock);
 #endif
 
+	srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+
 	bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
 
 	kfree(trans->fs_usage_deltas);
 	kfree(trans->mem);
 
+#ifdef __KERNEL__
+	/*
+	 * Userspace doesn't have a real percpu implementation:
+	 */
 	trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+#endif
 	if (trans->iters)
 		mempool_free(trans->iters, &trans->c->btree_iters_pool);
 
@@ -2474,6 +2488,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
 {
 	mempool_exit(&c->btree_iters_pool);
+	cleanup_srcu_struct(&c->btree_trans_barrier);
 }
 
 int bch2_fs_btree_iter_init(struct bch_fs *c)
@@ -2483,7 +2498,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
 	INIT_LIST_HEAD(&c->btree_trans_list);
 	mutex_init(&c->btree_trans_lock);
 
-	return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
+	return  init_srcu_struct(&c->btree_trans_barrier) ?:
+		mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
 			sizeof(struct btree_iter) * nr +
 			sizeof(struct btree_insert_entry) * nr +
 			sizeof(struct btree_insert_entry) * nr);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 0ee4f78c..d605ff18 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -9,6 +9,7 @@
 #include "journal.h"
 #include "journal_reclaim.h"
 
+#include <linux/sched/mm.h>
 #include <trace/events/bcachefs.h>
 
 static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -66,12 +67,19 @@ static void bkey_cached_evict(struct btree_key_cache *c,
 	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
 				      bch2_btree_key_cache_params));
 	memset(&ck->key, ~0, sizeof(ck->key));
+
+	c->nr_keys--;
 }
 
-static void bkey_cached_free(struct btree_key_cache *c,
+static void bkey_cached_free(struct btree_key_cache *bc,
 			     struct bkey_cached *ck)
 {
-	list_move(&ck->list, &c->freed);
+	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+	ck->btree_trans_barrier_seq =
+		start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+	list_move(&ck->list, &bc->freed);
 
 	kfree(ck->k);
 	ck->k		= NULL;
@@ -135,6 +143,8 @@ btree_key_cache_create(struct btree_key_cache *c,
 		return NULL;
 	}
 
+	c->nr_keys++;
+
 	list_move(&ck->list, &c->clean);
 	six_unlock_write(&ck->c.lock);
 
@@ -355,10 +365,14 @@ err:
 	bch2_journal_pin_drop(j, &ck->journal);
 	bch2_journal_preres_put(j, &ck->res);
-	clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
 
 	if (!evict) {
 		mutex_lock(&c->btree_key_cache.lock);
+		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+			c->btree_key_cache.nr_dirty--;
+		}
+
 		list_move_tail(&ck->list, &c->btree_key_cache.clean);
 		mutex_unlock(&c->btree_key_cache.lock);
 	} else {
@@ -371,6 +385,11 @@ evict:
 		six_lock_write(&ck->c.lock, NULL, NULL);
 
 		mutex_lock(&c->btree_key_cache.lock);
+		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+			c->btree_key_cache.nr_dirty--;
+		}
+
 		bkey_cached_evict(&c->btree_key_cache, ck);
 		bkey_cached_free(&c->btree_key_cache, ck);
 		mutex_unlock(&c->btree_key_cache.lock);
@@ -391,19 +410,23 @@ static void btree_key_cache_journal_flush(struct journal *j,
 	struct bkey_cached_key key;
 	struct btree_trans trans;
 
+	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+
 	six_lock_read(&ck->c.lock, NULL, NULL);
 	key = ck->key;
 
 	if (ck->journal.seq != seq ||
 	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 		six_unlock_read(&ck->c.lock);
-		return;
+		goto unlock;
 	}
 	six_unlock_read(&ck->c.lock);
 
 	bch2_trans_init(&trans, c, 0, 0);
 	btree_key_cache_flush_pos(&trans, key, seq, false);
 	bch2_trans_exit(&trans);
+unlock:
+	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 }
 
 /*
@@ -448,9 +471,10 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 
 	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 		mutex_lock(&c->btree_key_cache.lock);
-		list_del_init(&ck->list);
+		list_move(&ck->list, &c->btree_key_cache.dirty);
 
 		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
+		c->btree_key_cache.nr_dirty++;
 		mutex_unlock(&c->btree_key_cache.lock);
 	}
 
@@ -467,20 +491,97 @@ void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
 }
 #endif
 
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c)
+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
+					       struct shrink_control *sc)
+{
+	struct bch_fs *c = container_of(shrink, struct bch_fs,
+					btree_key_cache.shrink);
+	struct btree_key_cache *bc = &c->btree_key_cache;
+	struct bkey_cached *ck, *t;
+	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
+	unsigned flags;
+
+	/* Return -1 if we can't do anything right now */
+	if (sc->gfp_mask & __GFP_FS)
+		mutex_lock(&bc->lock);
+	else if (!mutex_trylock(&bc->lock))
+		return -1;
+
+	flags = memalloc_nofs_save();
+
+	list_for_each_entry_safe(ck, t, &bc->freed, list) {
+		scanned++;
+
+		if (poll_state_synchronize_srcu(&c->btree_trans_barrier,
+						ck->btree_trans_barrier_seq)) {
+			list_del(&ck->list);
+			kfree(ck);
+			freed++;
+		}
+
+		if (scanned >= nr)
+			goto out;
+	}
+
+	list_for_each_entry_safe(ck, t, &bc->clean, list) {
+		scanned++;
+
+		if (bkey_cached_lock_for_evict(ck)) {
+			bkey_cached_evict(bc, ck);
+			bkey_cached_free(bc, ck);
+		}
+
+		if (scanned >= nr) {
+			if (&t->list != &bc->clean)
+				list_move_tail(&bc->clean, &t->list);
+			goto out;
+		}
+	}
+out:
+	memalloc_nofs_restore(flags);
+	mutex_unlock(&bc->lock);
+
+	return freed;
+}
+
+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
+						struct shrink_control *sc)
 {
+	struct bch_fs *c = container_of(shrink, struct bch_fs,
+					btree_key_cache.shrink);
+	struct btree_key_cache *bc = &c->btree_key_cache;
+
+	return bc->nr_keys;
+}
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
+{
+	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 	struct bkey_cached *ck, *n;
 
-	mutex_lock(&c->lock);
-	list_for_each_entry_safe(ck, n, &c->clean, list) {
+	if (bc->shrink.list.next)
+		unregister_shrinker(&bc->shrink);
+
+	mutex_lock(&bc->lock);
+	list_splice(&bc->dirty, &bc->clean);
+
+	list_for_each_entry_safe(ck, n, &bc->clean, list) {
+		bch2_journal_pin_drop(&c->journal, &ck->journal);
+		bch2_journal_preres_put(&c->journal, &ck->res);
+
 		kfree(ck->k);
 		kfree(ck);
+		bc->nr_keys--;
 	}
-	list_for_each_entry_safe(ck, n, &c->freed, list)
+
+	BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal));
+	BUG_ON(bc->nr_keys);
+
+	list_for_each_entry_safe(ck, n, &bc->freed, list)
 		kfree(ck);
-	mutex_unlock(&c->lock);
 
-	rhashtable_destroy(&c->table);
+	mutex_unlock(&bc->lock);
+
+	rhashtable_destroy(&bc->table);
 }
 
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
@@ -488,11 +589,16 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 	mutex_init(&c->lock);
 	INIT_LIST_HEAD(&c->freed);
 	INIT_LIST_HEAD(&c->clean);
+	INIT_LIST_HEAD(&c->dirty);
 }
 
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
 {
-	return rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+	c->shrink.count_objects	= bch2_btree_key_cache_count;
+	c->shrink.scan_objects	= bch2_btree_key_cache_scan;
+
+	return  register_shrinker(&c->shrink) ?:
+		rhashtable_init(&c->table, &bch2_btree_key_cache_params);
 }
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 93721fbc..6013c916 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -158,6 +158,7 @@ struct btree_cache {
 	/* Number of elements in live + freeable lists */
 	unsigned		used;
 	unsigned		reserve;
+	atomic_t		dirty;
 	struct shrinker		shrink;
 
 	/*
@@ -294,6 +295,11 @@ struct btree_key_cache {
 	struct rhashtable	table;
 	struct list_head	freed;
 	struct list_head	clean;
+	struct list_head	dirty;
+	struct shrinker		shrink;
+
+	size_t			nr_keys;
+	size_t			nr_dirty;
 };
 
 struct bkey_cached_key {
@@ -309,6 +315,7 @@ struct bkey_cached {
 	unsigned long		flags;
 	u8			u64s;
 	bool			valid;
+	u32			btree_trans_barrier_seq;
 	struct bkey_cached_key	key;
 
 	struct rhash_head	hash;
@@ -345,6 +352,7 @@ struct btree_trans {
 	pid_t			pid;
 #endif
 	unsigned long		ip;
+	int			srcu_idx;
 
 	u64			iters_linked;
 	u64			iters_live;
@@ -411,7 +419,6 @@ enum btree_flags {
 BTREE_FLAG(read_in_flight);
 BTREE_FLAG(read_error);
-BTREE_FLAG(dirty);
 BTREE_FLAG(need_write);
 BTREE_FLAG(noevict);
 BTREE_FLAG(write_idx);
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 4ddd1697..d4f3dd7a 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -11,6 +11,7 @@
 #include "btree_iter.h"
 #include "btree_locking.h"
 #include "buckets.h"
+#include "error.h"
 #include "extents.h"
 #include "journal.h"
 #include "journal_reclaim.h"
@@ -149,7 +150,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
 
 	b->ob.nr = 0;
 
-	clear_btree_node_dirty(b);
+	clear_btree_node_dirty(c, b);
 
 	btree_node_lock_type(c, b, SIX_LOCK_write);
 	__btree_node_free(c, b);
@@ -264,7 +265,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
 	b = as->prealloc_nodes[--as->nr_prealloc_nodes];
 
 	set_btree_node_accessed(b);
-	set_btree_node_dirty(b);
+	set_btree_node_dirty(c, b);
 	set_btree_node_need_write(b);
 
 	bch2_bset_init_first(b, &b->data->keys);
@@ -523,6 +524,7 @@ static void btree_update_nodes_written(struct btree_update *as)
 {
 	struct bch_fs *c = as->c;
 	struct btree *b = as->b;
+	struct btree_trans trans;
 	u64 journal_seq = 0;
 	unsigned i;
 	int ret;
@@ -540,14 +542,16 @@ static void btree_update_nodes_written(struct btree_update *as)
 	 * journal reclaim does btree updates when flushing bkey_cached entries,
 	 * which may require allocations as well.
 	 */
-	ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
-			    BTREE_INSERT_NOFAIL|
-			    BTREE_INSERT_USE_RESERVE|
-			    BTREE_INSERT_USE_ALLOC_RESERVE|
-			    BTREE_INSERT_NOCHECK_RW|
-			    BTREE_INSERT_JOURNAL_RECLAIM|
-			    BTREE_INSERT_JOURNAL_RESERVED,
-			    btree_update_nodes_written_trans(&trans, as));
+	bch2_trans_init(&trans, c, 0, 512);
+	ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
+			      BTREE_INSERT_NOFAIL|
+			      BTREE_INSERT_USE_RESERVE|
+			      BTREE_INSERT_USE_ALLOC_RESERVE|
+			      BTREE_INSERT_NOCHECK_RW|
+			      BTREE_INSERT_JOURNAL_RECLAIM|
+			      BTREE_INSERT_JOURNAL_RESERVED,
+			      btree_update_nodes_written_trans(&trans, as));
+	bch2_trans_exit(&trans);
 	BUG_ON(ret && !bch2_journal_error(&c->journal));
 
 	if (b) {
@@ -827,7 +831,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 		closure_wake_up(&c->btree_interior_update_wait);
 	}
 
-	clear_btree_node_dirty(b);
+	clear_btree_node_dirty(c, b);
 	clear_btree_node_need_write(b);
 
 	/*
@@ -1018,7 +1022,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 				       struct bkey_i *insert,
 				       struct btree_node_iter *node_iter)
 {
+	struct bch_fs *c = as->c;
 	struct bkey_packed *k;
+	const char *invalid;
+
+	invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b));
+	if (invalid) {
+		char buf[160];
+
+		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
+		bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
+		dump_stack();
+	}
 
 	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
 	       ARRAY_SIZE(as->journal_entries));
@@ -1034,7 +1049,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 		bch2_btree_node_iter_advance(node_iter, b);
 
 	bch2_btree_bset_insert_key(iter, b, node_iter, insert);
-	set_btree_node_dirty(b);
+	set_btree_node_dirty(c, b);
 	set_btree_node_need_write(b);
 }
 
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 7668225e..41854fc3 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -237,6 +237,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
 		b->whiteout_u64s;
 	ssize_t total = c->opts.btree_node_size << 6;
 
+	/* Always leave one extra u64 for bch2_varint_decode: */
+	used++;
+
 	return total - used;
 }
 
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index e386f8ed..a2ca31e7 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -191,7 +191,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
 	bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
 
 	if (unlikely(!btree_node_dirty(b)))
-		set_btree_node_dirty(b);
+		set_btree_node_dirty(c, b);
 
 	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
 	u64s_added = (int) bset_u64s(t) - old_u64s;
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 82f1cc4c..be65f2e7 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -323,7 +323,7 @@ static u64 reserve_factor(u64 r)
 
 static u64 avail_factor(u64 r)
 {
-	return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
+	return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
 }
 
 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 1eb69ed3..389f23ee 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -35,6 +35,22 @@
 #include <trace/events/bcachefs.h>
 #include <trace/events/writeback.h>
 
+static inline struct address_space *faults_disabled_mapping(void)
+{
+	return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+	current->faults_disabled_mapping =
+		(void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+	return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
+
 struct quota_res {
 	u64				sectors;
 };
@@ -493,10 +509,35 @@ static void bch2_set_page_dirty(struct bch_fs *c,
 vm_fault_t bch2_page_fault(struct vm_fault *vmf)
 {
 	struct file *file = vmf->vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct address_space *fdm = faults_disabled_mapping();
 	struct bch_inode_info *inode = file_bch_inode(file);
 	int ret;
 
+	if (fdm == mapping)
+		return VM_FAULT_SIGBUS;
+
+	/* Lock ordering: */
+	if (fdm > mapping) {
+		struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+		if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
+			goto got_lock;
+
+		bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
+
+		bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+		bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+		bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
+
+		/* Signal that lock has been dropped: */
+		set_fdm_dropped_locks();
+		return VM_FAULT_SIGBUS;
+	}
+
 	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+got_lock:
 	ret = filemap_fault(vmf);
 	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
 
@@ -1742,14 +1783,16 @@ static long bch2_dio_write_loop(struct dio_write *dio)
 	struct bio *bio = &dio->op.wbio.bio;
 	struct bvec_iter_all iter;
 	struct bio_vec *bv;
-	unsigned unaligned;
-	bool sync = dio->sync;
+	unsigned unaligned, iter_count;
+	bool sync = dio->sync, dropped_locks;
 	long ret;
 
 	if (dio->loop)
 		goto loop;
 
 	while (1) {
+		iter_count = dio->iter.count;
+
 		if (kthread)
 			kthread_use_mm(dio->mm);
 		BUG_ON(current->faults_disabled_mapping);
@@ -1757,13 +1800,34 @@ static long bch2_dio_write_loop(struct dio_write *dio)
 
 		ret = bio_iov_iter_get_pages(bio, &dio->iter);
 
+		dropped_locks = fdm_dropped_locks();
+
 		current->faults_disabled_mapping = NULL;
 		if (kthread)
 			kthread_unuse_mm(dio->mm);
 
+		/*
+		 * If the fault handler returned an error but also signalled
+		 * that it dropped & retook ei_pagecache_lock, we just need to
+		 * re-shoot down the page cache and retry:
+		 */
+		if (dropped_locks && ret)
+			ret = 0;
+
 		if (unlikely(ret < 0))
 			goto err;
 
+		if (unlikely(dropped_locks)) {
+			ret = write_invalidate_inode_pages_range(mapping,
+					req->ki_pos,
+					req->ki_pos + iter_count - 1);
+			if (unlikely(ret))
+				goto err;
+
+			if (!bio->bi_iter.bi_size)
+				continue;
+		}
+
 		unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
 		bio->bi_iter.bi_size -= unaligned;
 		iov_iter_revert(&dio->iter, unaligned);
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 3ac57ba2..6e3d4bea 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -91,6 +91,11 @@ void bch2_pagecache_add_put(struct pagecache_lock *lock)
 	__pagecache_lock_put(lock, 1);
 }
 
+bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
+{
+	return __pagecache_lock_tryget(lock, 1);
+}
+
 void bch2_pagecache_add_get(struct pagecache_lock *lock)
 {
 	__pagecache_lock_get(lock, 1);
@@ -271,7 +276,8 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
 	if (!tmpfile)
 		mutex_lock(&dir->ei_update_lock);
 
-	bch2_trans_init(&trans, c, 8, 1024);
+	bch2_trans_init(&trans, c, 8,
+			2048 + (!tmpfile ? dentry->d_name.len : 0));
 retry:
 	bch2_trans_begin(&trans);
 
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index eda903a4..4ee1ac99 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -26,6 +26,7 @@ static inline void pagecache_lock_init(struct pagecache_lock *lock)
 }
 
 void bch2_pagecache_add_put(struct pagecache_lock *);
+bool bch2_pagecache_add_tryget(struct pagecache_lock *);
 void bch2_pagecache_add_get(struct pagecache_lock *);
 void bch2_pagecache_block_put(struct pagecache_lock *);
 void bch2_pagecache_block_get(struct pagecache_lock *);
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 42371de7..823a1dde 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -537,7 +537,9 @@ found_slot:
 	inode_u->bi_inum	= k.k->p.offset;
 	inode_u->bi_generation	= bkey_generation(k);
 
-	return bch2_inode_write(trans, iter, inode_u);
+	ret = bch2_inode_write(trans, iter, inode_u);
+	bch2_trans_iter_put(trans, iter);
+	return ret;
 }
 
 int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
@@ -574,16 +576,9 @@ retry:
 
 	bi_generation = 0;
 
-	ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr));
-	if (ret) {
-		if (ret != -EINTR)
-			bch_err(c, "error flushing btree key cache: %i", ret);
-		goto err;
-	}
-
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
-				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-	k = bch2_btree_iter_peek_slot(iter);
+				   BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+	k = bch2_btree_iter_peek_cached(iter);
 
 	ret = bkey_err(k);
 	if (ret)
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index c2cafd38..e99faad8 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -18,7 +18,19 @@
 
 #include <trace/events/bcachefs.h>
 
-static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64);
+static u64 last_unwritten_seq(struct journal *j)
+{
+	union journal_res_state s = READ_ONCE(j->reservations);
+
+	lockdep_assert_held(&j->lock);
+
+	return journal_cur_seq(j) - s.prev_buf_unwritten;
+}
+
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+	return seq >= last_unwritten_seq(j);
+}
 
 static bool __journal_entry_is_open(union journal_res_state state)
 {
@@ -30,6 +42,22 @@ static bool journal_entry_is_open(struct journal *j)
 	return __journal_entry_is_open(j->reservations);
 }
 
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
+{
+	struct journal_buf *buf = NULL;
+
+	EBUG_ON(seq > journal_cur_seq(j));
+	EBUG_ON(seq == journal_cur_seq(j) &&
+		j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
+
+	if (journal_seq_unwritten(j, seq)) {
+		buf = j->buf + (seq & 1);
+		EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
+	}
+	return buf;
+}
+
 static void journal_pin_new_entry(struct journal *j, int count)
 {
 	struct journal_entry_pin_list *p;
@@ -51,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j)
 {
 	struct journal_buf *buf = journal_cur_buf(j);
 
+	bkey_extent_init(&buf->key);
+
 	memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
 	memset(buf->data, 0, sizeof(*buf->data));
@@ -72,6 +102,7 @@ void bch2_journal_halt(struct journal *j)
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
 				       old.v, new.v)) != old.v);
 
+	j->err_seq = journal_cur_seq(j);
 	journal_wake(j);
 	closure_wake_up(&journal_cur_buf(j)->wait);
 }
@@ -139,8 +170,6 @@ static bool __journal_entry_close(struct journal *j)
 	BUG_ON(sectors > buf->sectors);
 	buf->sectors = sectors;
 
-	bkey_extent_init(&buf->key);
-
 	/*
 	 * We have to set last_seq here, _before_ opening a new journal entry:
 	 *
@@ -162,11 +191,6 @@ static bool __journal_entry_close(struct journal *j)
 	 */
 	buf->data->last_seq	= cpu_to_le64(journal_last_seq(j));
 
-	if (journal_entry_empty(buf->data))
-		clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
-	else
-		set_bit(JOURNAL_NOT_EMPTY, &j->flags);
-
 	journal_pin_new_entry(j, 1);
 
 	bch2_journal_buf_init(j);
@@ -391,8 +415,17 @@ unlock:
 		goto retry;
 
 	if (ret == -ENOSPC) {
-		WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
-			  "JOURNAL_RES_GET_RESERVED set but journal full");
+		if (WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
+			      "JOURNAL_RES_GET_RESERVED set but journal full")) {
+			char *buf;
+
+			buf = kmalloc(4096, GFP_NOFS);
+			if (buf) {
+				bch2_journal_debug_to_text(&PBUF(buf), j);
+				pr_err("\n%s", buf);
+				kfree(buf);
+			}
+		}
 
 		/*
 		 * Journal is full - can't rely on reclaim from work item due to
@@ -503,146 +536,28 @@ out:
 
 /* journal flushing: */
 
-u64 bch2_journal_last_unwritten_seq(struct journal *j)
-{
-	u64 seq;
-
-	spin_lock(&j->lock);
-	seq = journal_cur_seq(j);
-	if (j->reservations.prev_buf_unwritten)
-		seq--;
-	spin_unlock(&j->lock);
-
-	return seq;
-}
-
-/**
- * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't
- * open yet, or wait if we cannot
- *
- * used by the btree interior update machinery, when it needs to write a new
- * btree root - every journal entry contains the roots of all the btrees, so it
- * doesn't need to bother with getting a journal reservation
- */
-int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
-{
-	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	int ret;
-
-	spin_lock(&j->lock);
-
-	/*
-	 * Can't try to open more than one sequence number ahead:
-	 */
-	BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
-
-	if (journal_cur_seq(j) > seq ||
-	    journal_entry_is_open(j)) {
-		spin_unlock(&j->lock);
-		return 0;
-	}
-
-	if (journal_cur_seq(j) < seq &&
-	    !__journal_entry_close(j)) {
-		/* haven't finished writing out the previous one: */
-		trace_journal_entry_full(c);
-		ret = -EAGAIN;
-	} else {
-		BUG_ON(journal_cur_seq(j) != seq);
-
-		ret = journal_entry_open(j);
-	}
-
-	if ((ret == -EAGAIN || ret == -ENOSPC) &&
-	    !j->res_get_blocked_start)
-		j->res_get_blocked_start = local_clock() ?: 1;
-
-	if (ret == -EAGAIN || ret == -ENOSPC)
-		closure_wait(&j->async_wait, cl);
-
-	spin_unlock(&j->lock);
-
-	if (ret == -ENOSPC) {
-		trace_journal_full(c);
-		bch2_journal_reclaim_work(&j->reclaim_work.work);
-		ret = -EAGAIN;
-	}
-
-	return ret;
-}
-
-static int journal_seq_error(struct journal *j, u64 seq)
-{
-	union journal_res_state state = READ_ONCE(j->reservations);
-
-	if (seq == journal_cur_seq(j))
-		return bch2_journal_error(j);
-
-	if (seq + 1 == journal_cur_seq(j) &&
-	    !state.prev_buf_unwritten &&
-	    seq > j->seq_ondisk)
-		return -EIO;
-
-	return 0;
-}
-
-static inline struct journal_buf *
-journal_seq_to_buf(struct journal *j, u64 seq)
-{
-	/* seq should be for a journal entry that has been opened: */
-	BUG_ON(seq > journal_cur_seq(j));
-	BUG_ON(seq == journal_cur_seq(j) &&
-	       j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
-
-	if (seq == journal_cur_seq(j))
-		return journal_cur_buf(j);
-	if (seq + 1 == journal_cur_seq(j) &&
-	    j->reservations.prev_buf_unwritten)
-		return journal_prev_buf(j);
-	return NULL;
-}
-
-/**
- * bch2_journal_wait_on_seq - wait for a journal entry to be written
- *
- * does _not_ cause @seq to be written immediately - if there is no other
- * activity to cause the relevant journal entry to be filled up or flushed it
- * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is
- * configurable).
- */
-void bch2_journal_wait_on_seq(struct journal *j, u64 seq,
-			      struct closure *parent)
-{
-	struct journal_buf *buf;
-
-	spin_lock(&j->lock);
-
-	if ((buf = journal_seq_to_buf(j, seq))) {
-		if (!closure_wait(&buf->wait, parent))
-			BUG();
-
-		if (seq == journal_cur_seq(j)) {
-			smp_mb();
-			if (bch2_journal_error(j))
-				closure_wake_up(&buf->wait);
-		}
-	}
-
-	spin_unlock(&j->lock);
-}
-
 /**
  * bch2_journal_flush_seq_async - wait for a journal entry to be written
  *
  * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
  * necessary
  */
-void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 				  struct closure *parent)
 {
 	struct journal_buf *buf;
+	int ret = 0;
 
 	spin_lock(&j->lock);
+	if (seq <= j->err_seq) {
+		ret = -EIO;
+		goto out;
+	}
+
+	if (seq <= j->seq_ondisk) {
+		ret = 1;
+		goto out;
+	}
 
 	if (parent &&
 	    (buf = journal_seq_to_buf(j, seq)))
@@ -651,20 +566,8 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 	if (seq == journal_cur_seq(j))
 		__journal_entry_close(j);
 
+out:
 	spin_unlock(&j->lock);
-}
-
-static int journal_seq_flushed(struct journal *j, u64 seq)
-{
-	int ret;
-
-	spin_lock(&j->lock);
-	ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq);
-
-	if (seq == journal_cur_seq(j))
-		__journal_entry_close(j);
-	spin_unlock(&j->lock);
-
 	return ret;
 }
 
@@ -673,28 +576,13 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
 	u64 start_time = local_clock();
 	int ret, ret2;
 
-	ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq)));
+	ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
 
 	bch2_time_stats_update(j->flush_seq_time, start_time);
 
 	return ret ?: ret2 < 0 ? ret2 : 0;
 }
 
-/**
- * bch2_journal_meta_async - force a journal entry to be written
- */
-void bch2_journal_meta_async(struct journal *j, struct closure *parent)
-{
-	struct journal_res res;
-
-	memset(&res, 0, sizeof(res));
-
-	bch2_journal_res_get(j, &res, jset_u64s(0), 0);
-	bch2_journal_res_put(j, &res);
-
-	bch2_journal_flush_seq_async(j, res.seq, parent);
-}
-
 int bch2_journal_meta(struct journal *j)
 {
 	struct journal_res res;
@@ -989,7 +877,8 @@ void bch2_fs_journal_stop(struct journal *j)
 	journal_quiesce(j);
 
 	BUG_ON(!bch2_journal_error(j) &&
-	       test_bit(JOURNAL_NOT_EMPTY, &j->flags));
+	       (journal_entry_is_open(j) ||
+		j->last_empty_seq + 1 != journal_cur_seq(j)));
 
 	cancel_delayed_work_sync(&j->write_work);
 	cancel_delayed_work_sync(&j->reclaim_work);
@@ -1047,6 +936,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
 	set_bit(JOURNAL_STARTED, &j->flags);
 
 	journal_pin_new_entry(j, 1);
+
+	j->reservations.idx = journal_cur_seq(j);
+
 	bch2_journal_buf_init(j);
 
 	c->last_bucket_seq_cleanup = journal_cur_seq(j);
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index f60bc964..25c68767 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -464,13 +464,8 @@ void bch2_journal_entry_res_resize(struct journal *,
 				   struct journal_entry_res *,
 				   unsigned);
 
-u64 bch2_journal_last_unwritten_seq(struct journal *);
-int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
-
-void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
-void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
 void bch2_journal_flush_async(struct journal *, struct closure *);
-void bch2_journal_meta_async(struct journal *, struct closure *);
 
 int bch2_journal_flush_seq(struct journal *, u64);
 int bch2_journal_flush(struct journal *);
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index bd0e6b37..7c157bc5 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -161,6 +161,8 @@ static void journal_entry_null_range(void *start, void *end)
 #define journal_entry_err_on(cond, c, msg, ...)				\
 	((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
 
+#define FSCK_DELETED_KEY	5
+
 static int journal_validate_key(struct bch_fs *c, struct jset *jset,
 				struct jset_entry *entry,
 				unsigned level, enum btree_id btree_id,
@@ -173,28 +175,42 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
 	int ret = 0;
 
 	if (journal_entry_err_on(!k->k.u64s, c,
-			"invalid %s in journal: k->u64s 0", type)) {
+			"invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0",
+			type, le64_to_cpu(jset->seq),
+			(u64 *) entry - jset->_data,
+			le32_to_cpu(jset->u64s),
+			(u64 *) k - entry->_data,
+			le16_to_cpu(entry->u64s))) {
 		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
 		journal_entry_null_range(vstruct_next(entry), next);
-		return 0;
+		return FSCK_DELETED_KEY;
 	}
 
 	if (journal_entry_err_on((void *) bkey_next(k) >
 				(void *) vstruct_next(entry), c,
-			"invalid %s in journal: extends past end of journal entry",
-			type)) {
+			"invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry",
+			type, le64_to_cpu(jset->seq),
+			(u64 *) entry - jset->_data,
+			le32_to_cpu(jset->u64s),
+			(u64 *) k - entry->_data,
+			le16_to_cpu(entry->u64s))) {
 		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
 		journal_entry_null_range(vstruct_next(entry), next);
-		return 0;
+		return FSCK_DELETED_KEY;
 	}
 
 	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
-			"invalid %s in journal: bad format %u",
-			type, k->k.format)) {
-		le16_add_cpu(&entry->u64s, -k->k.u64s);
+			"invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u",
+			type, le64_to_cpu(jset->seq),
+			(u64 *) entry - jset->_data,
+			le32_to_cpu(jset->u64s),
+			(u64 *) k - entry->_data,
+			le16_to_cpu(entry->u64s),
+			k->k.format)) {
+		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
 		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
 		journal_entry_null_range(vstruct_next(entry), next);
-		return 0;
+		return FSCK_DELETED_KEY;
 	}
 
 	if (!write)
@@ -208,13 +224,18 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
 		char buf[160];
 
 		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
-		mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
-				 type, invalid, buf);
-
-		le16_add_cpu(&entry->u64s, -k->k.u64s);
+		mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s",
+				 type, le64_to_cpu(jset->seq),
+				 (u64 *) entry - jset->_data,
+				 le32_to_cpu(jset->u64s),
+				 (u64 *) k - entry->_data,
+				 le16_to_cpu(entry->u64s),
+				 invalid, buf);
+
+		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
 		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
 		journal_entry_null_range(vstruct_next(entry), next);
-		return 0;
+		return FSCK_DELETED_KEY;
 	}
 
 	if (write)
@@ -230,15 +251,17 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c,
 					     struct jset_entry *entry,
 					     int write)
 {
-	struct bkey_i *k;
+	struct bkey_i *k = entry->start;
 
-	vstruct_for_each(entry, k) {
+	while (k != vstruct_last(entry)) {
 		int ret = journal_validate_key(c, jset, entry,
 					       entry->level,
 					       entry->btree_id,
 					       k, "key", write);
-		if (ret)
-			return ret;
+		if (ret == FSCK_DELETED_KEY)
+			continue;
+
+		k = bkey_next(k);
 	}
 
 	return 0;
@@ -432,46 +455,45 @@ static int jset_validate(struct bch_fs *c,
 			"%s sector %llu seq %llu: unknown journal entry version %u",
 			ca->name, sector, le64_to_cpu(jset->seq),
 			version)) {
-		/* XXX: note we might have missing journal entries */
-		return JOURNAL_ENTRY_BAD;
+		/* don't try to continue: */
+		return EINVAL;
 	}
 
+	if (bytes > (sectors_read << 9) &&
+	    sectors_read < bucket_sectors_left)
+		return JOURNAL_ENTRY_REREAD;
+
 	if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
 			"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
 			ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
-		/* XXX: note we might have missing journal entries */
-		return JOURNAL_ENTRY_BAD;
+		ret = JOURNAL_ENTRY_BAD;
+		le32_add_cpu(&jset->u64s,
+			     -((bytes - (bucket_sectors_left << 9)) / 8));
 	}
 
-	if (bytes > sectors_read << 9)
-		return JOURNAL_ENTRY_REREAD;
-
 	if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
 			"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
 			ca->name, sector, le64_to_cpu(jset->seq),
-			JSET_CSUM_TYPE(jset)))
-		return JOURNAL_ENTRY_BAD;
+			JSET_CSUM_TYPE(jset))) {
+		ret = JOURNAL_ENTRY_BAD;
+		goto bad_csum_type;
+	}
 
 	csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
 	if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
 				 "%s sector %llu seq %llu: journal checksum bad",
-				 ca->name, sector, le64_to_cpu(jset->seq))) {
-		/* XXX: retry IO, when we start retrying checksum errors */
-		/* XXX: note we might have missing journal entries */
-		return JOURNAL_ENTRY_BAD;
-	}
+				 ca->name, sector, le64_to_cpu(jset->seq)))
+		ret = JOURNAL_ENTRY_BAD;
 
 	bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
 		     jset->encrypted_start,
 		     vstruct_end(jset) - (void *) jset->encrypted_start);
-
+bad_csum_type:
 	if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
 			"invalid journal entry: last_seq > seq")) {
 		jset->last_seq = jset->seq;
 		return JOURNAL_ENTRY_BAD;
 	}
-
-	return 0;
 fsck_err:
 	return ret;
 }
@@ -939,24 +961,29 @@ static void journal_write_done(struct closure *cl)
 	struct bch_replicas_padded replicas;
 	u64 seq = le64_to_cpu(w->data->seq);
 	u64 last_seq = le64_to_cpu(w->data->last_seq);
+	int err = 0;
 
 	bch2_time_stats_update(j->write_time, j->write_start_time);
 
 	if (!devs.nr) {
 		bch_err(c, "unable to write journal to sufficient devices");
-		goto err;
+		err = -EIO;
+	} else {
+		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
+		if (bch2_mark_replicas(c, &replicas.e))
+			err = -EIO;
 	}
 
-	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
-
-	if (bch2_mark_replicas(c, &replicas.e))
-		goto err;
+	if (err)
+		bch2_fatal_error(c);
 
 	spin_lock(&j->lock);
 	if (seq >= j->pin.front)
 		journal_seq_pin(j, seq)->devs = devs;
 
 	j->seq_ondisk		= seq;
+	if (err && (!j->err_seq || seq < j->err_seq))
+		j->err_seq	= seq;
 	j->last_seq_ondisk	= last_seq;
 
 	bch2_journal_space_available(j);
@@ -968,7 +995,7 @@ static void journal_write_done(struct closure *cl)
 	 * bch2_fs_journal_stop():
 	 */
 	mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
-out:
+
 	/* also must come before signalling write completion: */
 	closure_debug_destroy(cl);
 
@@ -982,11 +1009,6 @@ out:
 	if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
 		mod_delayed_work(system_freezable_wq, &j->write_work, 0);
 	spin_unlock(&j->lock);
-	return;
-err:
-	bch2_fatal_error(c);
-	spin_lock(&j->lock);
-	goto out;
 }
 
 static void journal_write_endio(struct bio *bio)
@@ -1067,6 +1089,9 @@ void bch2_journal_write(struct closure *cl)
 	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
 	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
 
+	if (journal_entry_empty(jset))
+		j->last_empty_seq = le64_to_cpu(jset->seq);
+
 	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
 		validate_before_checksum = true;
 
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 18e45296..7a04d06b 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -263,6 +263,7 @@ static void bch2_journal_reclaim_fast(struct journal *j)
 	while (!fifo_empty(&j->pin) &&
 	       !atomic_read(&fifo_peek_front(&j->pin).count)) {
 		BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+		BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed));
 		BUG_ON(!fifo_pop(&j->pin, temp));
 		popped = true;
 	}
@@ -547,6 +548,12 @@ void bch2_journal_reclaim(struct journal *j)
 		if (j->prereserved.reserved * 2 > j->prereserved.remaining)
 			min_nr = 1;
 
+		if ((atomic_read(&c->btree_cache.dirty) * 4 >
+		     c->btree_cache.used  * 3) ||
+		    (c->btree_key_cache.nr_dirty * 4 >
+		     c->btree_key_cache.nr_keys))
+			min_nr = 1;
+
 	} while (journal_flush_pins(j, seq_to_flush, min_nr));
 
 	if (!bch2_journal_error(j))
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 154b51b8..9757e3d5 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -127,7 +127,6 @@ enum {
 	JOURNAL_STARTED,
 	JOURNAL_RECLAIM_STARTED,
 	JOURNAL_NEED_WRITE,
-	JOURNAL_NOT_EMPTY,
 	JOURNAL_MAY_GET_UNRESERVED,
 };
 
@@ -181,6 +180,8 @@ struct journal {
 	/* seq, last_seq from the most recent journal entry successfully written */
 	u64			seq_ondisk;
 	u64			last_seq_ondisk;
+	u64			err_seq;
+	u64			last_empty_seq;
 
 	/*
 	 * FIFO of journal entries whose btree updates have not yet been
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 1745cfac..67500636 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -456,6 +456,7 @@ retry:
 		__bch2_btree_iter_set_pos(split_iter, split->k.p, false);
 		bch2_trans_update(&trans, split_iter, split,
 				  BTREE_TRIGGER_NORUN);
+		bch2_trans_iter_put(&trans, split_iter);
 
 		bch2_btree_iter_set_pos(iter, split->k.p);
 
@@ -481,6 +482,8 @@ retry:
 			BTREE_INSERT_LAZY_RW|
 			BTREE_INSERT_JOURNAL_REPLAY);
 err:
+	bch2_trans_iter_put(&trans, iter);
+
 	if (ret == -EINTR)
 		goto retry;
 
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index d7ad293a..58c00e26 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -458,7 +458,7 @@ STORE(bch2_fs)
 	/* Debugging: */
 
 	if (attr == &sysfs_trigger_journal_flush)
-		bch2_journal_meta_async(&c->journal, NULL);
+		bch2_journal_meta(&c->journal);
 
 	if (attr == &sysfs_trigger_btree_coalesce)
 		bch2_coalesce(c);
-- 
cgit v1.2.3
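
Editor's note on the key-cache changes above: rather than paying for a full synchronize_srcu() on every eviction, bkey_cached_free() stamps each entry with a cookie from start_poll_synchronize_srcu(), and bch2_btree_key_cache_scan() only kfree()s entries once poll_state_synchronize_srcu() confirms the grace period has elapsed. Below is a minimal sketch of that deferred-free pattern in isolation; the srcu_* calls are the real kernel API (stubbed for userspace by the new include/linux/srcu.h above), while struct cached_obj, obj_retire(), obj_reap() and the reader are hypothetical names invented for illustration:

	#include <linux/slab.h>
	#include <linux/list.h>
	#include <linux/srcu.h>

	DEFINE_STATIC_SRCU(example_barrier);

	struct cached_obj {
		struct list_head	list;
		/* cookie from start_poll_synchronize_srcu(): */
		unsigned long		barrier_seq;
	};

	static LIST_HEAD(freed_list);

	/* Retire an object: readers inside srcu_read_lock() may still see it. */
	static void obj_retire(struct cached_obj *obj)
	{
		obj->barrier_seq = start_poll_synchronize_srcu(&example_barrier);
		list_add_tail(&obj->list, &freed_list);
	}

	/* Called from a shrinker: free only objects every reader has moved past. */
	static unsigned long obj_reap(void)
	{
		struct cached_obj *obj, *t;
		unsigned long freed = 0;

		list_for_each_entry_safe(obj, t, &freed_list, list) {
			if (!poll_state_synchronize_srcu(&example_barrier,
							 obj->barrier_seq))
				break;	/* list is in retirement order */
			list_del(&obj->list);
			kfree(obj);
			freed++;
		}
		return freed;
	}

	/* Reader side: objects seen under the lock stay allocated until unlock. */
	static void obj_reader(void)
	{
		int idx = srcu_read_lock(&example_barrier);
		/* ... look up and use cached_obj pointers ... */
		srcu_read_unlock(&example_barrier, idx);
	}

Because cookies are issued in retirement order, this sketch can stop at the first entry whose grace period has not yet elapsed; the patch's shrinker instead tests each entry individually and additionally bounds the walk by sc->nr_to_scan.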
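
A second pattern worth calling out: the new btree_cache.dirty count stays exact under concurrency because set_btree_node_dirty()/clear_btree_node_dirty() (btree_io.h above) only touch the counter when test_and_set_bit()/test_and_clear_bit() report a genuine 0->1 or 1->0 transition. Here is a small userspace sketch of the same idiom, built on the same GCC/Clang __atomic builtins the bitops.h shim uses; all names are made up for illustration:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static unsigned long flags;
	static atomic_long dirty_count;

	#define DIRTY_MASK (1UL << 0)

	static bool test_and_set_dirty(void)
	{
		unsigned long old = __atomic_fetch_or(&flags, DIRTY_MASK,
						      __ATOMIC_RELAXED);
		return old & DIRTY_MASK;
	}

	static bool test_and_clear_dirty(void)
	{
		unsigned long old = __atomic_fetch_and(&flags, ~DIRTY_MASK,
						       __ATOMIC_RELAXED);
		return old & DIRTY_MASK;
	}

	static void mark_dirty(void)
	{
		/* Only the 0 -> 1 transition bumps the counter: */
		if (!test_and_set_dirty())
			atomic_fetch_add(&dirty_count, 1);
	}

	static void mark_clean(void)
	{
		/* Only the 1 -> 0 transition drops it, so the count stays exact: */
		if (test_and_clear_dirty())
			atomic_fetch_sub(&dirty_count, 1);
	}

	int main(void)
	{
		mark_dirty();
		mark_dirty();	/* no-op for the counter */
		printf("dirty: %ld\n", atomic_load(&dirty_count));	/* 1 */
		mark_clean();
		printf("dirty: %ld\n", atomic_load(&dirty_count));	/* 0 */
		return 0;
	}

This is why the patch threads struct bch_fs into the set/clear helpers: the bit lives in the node, but the counter lives in the filesystem-wide btree_cache, and journal reclaim above keys off it.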
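
Finally, the fs-io.c fault-path helpers (faults_disabled_mapping() and friends) rely on pointer tagging: an address_space pointer is at least word-aligned, so its low bit is free to carry a "locks were dropped" flag through current->faults_disabled_mapping. A standalone illustration of the trick, assuming nothing from bcachefs — struct mapping and the single global stand in for the per-task pointer:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct mapping { int dummy; };

	/* Word-aligned pointers never have bit 0 set, so it can carry a flag. */
	static uintptr_t tagged;

	static void set_mapping(struct mapping *m)
	{
		tagged = (uintptr_t) m;
	}

	static struct mapping *get_mapping(void)
	{
		/* Mask the flag back off to recover the real pointer: */
		return (struct mapping *) (tagged & ~(uintptr_t) 1);
	}

	static void set_dropped_locks(void)
	{
		tagged |= 1;
	}

	static bool dropped_locks(void)
	{
		return tagged & 1;
	}

	int main(void)
	{
		struct mapping m;

		set_mapping(&m);
		set_dropped_locks();
		printf("mapping ok: %d, dropped: %d\n",
		       get_mapping() == &m, dropped_locks());
		return 0;
	}

In the patch, the fault handler sets the flag before returning VM_FAULT_SIGBUS, and bch2_dio_write_loop() reads it back via fdm_dropped_locks() to decide whether to invalidate the page cache and retry rather than fail the write.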