From 1e574cb1aa07ab3a796c7d6c5501b96f3056ef4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 29 Nov 2020 23:55:51 -0500 Subject: Update bcachefs sources to 021e62a098 bcachefs: Fix error in filesystem initialization --- .bcachefs_revision | 2 +- include/linux/sched/mm.h | 15 +++- include/linux/slab.h | 31 +++++++ include/trace/events/bcachefs.h | 69 ++++++++++++++-- libbcachefs/alloc_background.c | 2 +- libbcachefs/bcachefs.h | 1 - libbcachefs/btree_cache.c | 6 ++ libbcachefs/btree_cache.h | 1 + libbcachefs/btree_gc.c | 2 +- libbcachefs/btree_key_cache.c | 138 +++++++++++++++++++------------ libbcachefs/btree_key_cache.h | 21 +++++ libbcachefs/btree_types.h | 6 +- libbcachefs/btree_update.h | 4 +- libbcachefs/btree_update_interior.c | 27 ++++-- libbcachefs/btree_update_interior.h | 1 - libbcachefs/btree_update_leaf.c | 96 +++++++++++----------- libbcachefs/buckets.c | 10 --- libbcachefs/buckets.h | 6 +- libbcachefs/chardev.c | 3 +- libbcachefs/fs-io.c | 2 +- libbcachefs/fs.c | 2 +- libbcachefs/fsck.c | 2 +- libbcachefs/inode.c | 34 ++++---- libbcachefs/inode.h | 2 +- libbcachefs/journal.c | 115 +++++++++++++++----------- libbcachefs/journal_io.c | 4 +- libbcachefs/journal_reclaim.c | 158 +++++++++++++++++++++++++++++------- libbcachefs/journal_reclaim.h | 15 +++- libbcachefs/journal_types.h | 14 +++- libbcachefs/movinggc.c | 2 +- libbcachefs/rebalance.c | 2 +- libbcachefs/recovery.c | 1 + libbcachefs/super.c | 18 ++-- libbcachefs/sysfs.c | 7 ++ 34 files changed, 577 insertions(+), 242 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index ec3e6587..6ba1c9af 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -b1107114caf6aa6f725170f3d75b072badcfa573 +021e62a098d9fa7e558ae935180e2fb16bb50a3a diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 347105c6..03feda7a 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -1,7 +1,8 @@ #ifndef _LINUX_SCHED_MM_H #define _LINUX_SCHED_MM_H -#define PF_MEMALLOC_NOFS 0 +#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ static inline unsigned int memalloc_nofs_save(void) { @@ -15,4 +16,16 @@ static inline void memalloc_nofs_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; } +static inline unsigned int memalloc_noreclaim_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC; + current->flags |= PF_MEMALLOC; + return flags; +} + +static inline void memalloc_noreclaim_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC) | flags; +} + #endif /* _LINUX_SCHED_MM_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index ff342b65..b8a1235b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -132,4 +132,35 @@ static inline void *kmemdup(const void *src, size_t len, gfp_t gfp) return p; } +struct kmem_cache { + size_t obj_size; +}; + +static inline void *kmem_cache_alloc(struct kmem_cache *c, gfp_t gfp) +{ + return kmalloc(c->obj_size, gfp); +} + +static inline void kmem_cache_free(struct kmem_cache *c, void *p) +{ + kfree(p); +} + +static inline void kmem_cache_destroy(struct kmem_cache *p) +{ + kfree(p); +} + +static inline struct kmem_cache *kmem_cache_create(size_t obj_size) +{ + struct kmem_cache *p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return NULL; + + p->obj_size = obj_size; + return p; +} + +#define KMEM_CACHE(_struct, _flags) kmem_cache_create(sizeof(struct _struct)) + 
#endif /* __TOOLS_LINUX_SLAB_H */ diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index a8b8c5b6..d4cb7a29 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -121,6 +121,65 @@ DEFINE_EVENT(bio, journal_write, TP_ARGS(bio) ); +TRACE_EVENT(journal_reclaim_start, + TP_PROTO(struct bch_fs *c, u64 min_nr, + u64 prereserved, u64 prereserved_total, + u64 btree_cache_dirty, u64 btree_cache_total, + u64 btree_key_cache_dirty, u64 btree_key_cache_total), + TP_ARGS(c, min_nr, prereserved, prereserved_total, + btree_cache_dirty, btree_cache_total, + btree_key_cache_dirty, btree_key_cache_total), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(u64, min_nr ) + __field(u64, prereserved ) + __field(u64, prereserved_total ) + __field(u64, btree_cache_dirty ) + __field(u64, btree_cache_total ) + __field(u64, btree_key_cache_dirty ) + __field(u64, btree_key_cache_total ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->min_nr = min_nr; + __entry->prereserved = prereserved; + __entry->prereserved_total = prereserved_total; + __entry->btree_cache_dirty = btree_cache_dirty; + __entry->btree_cache_total = btree_cache_total; + __entry->btree_key_cache_dirty = btree_key_cache_dirty; + __entry->btree_key_cache_total = btree_key_cache_total; + ), + + TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + __entry->uuid, + __entry->min_nr, + __entry->prereserved, + __entry->prereserved_total, + __entry->btree_cache_dirty, + __entry->btree_cache_total, + __entry->btree_key_cache_dirty, + __entry->btree_key_cache_total) +); + +TRACE_EVENT(journal_reclaim_finish, + TP_PROTO(struct bch_fs *c, u64 nr_flushed), + TP_ARGS(c, nr_flushed), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(u64, nr_flushed ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->nr_flushed = nr_flushed; + ), + + TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed) +); + /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, @@ -622,6 +681,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, TP_ARGS(ip) ); +DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, TP_PROTO(unsigned long ip), TP_ARGS(ip) @@ -657,11 +721,6 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse, TP_ARGS(ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_atomic, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - DECLARE_EVENT_CLASS(node_lock_fail, TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), TP_ARGS(level, iter_seq, node, node_seq), diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 97508de9..2dd8a37f 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -1456,7 +1456,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; p = kthread_create(bch2_allocator_thread, ca, - "bch_alloc[%s]", ca->name); + "bch-alloc/%s", ca->name); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index b20895a4..6d54defc 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -650,7 +650,6 @@ struct bch_fs { struct workqueue_struct *wq; /* copygc needs its own workqueue for index updates.. 
*/ struct workqueue_struct *copygc_wq; - struct workqueue_struct *journal_reclaim_wq; /* ALLOCATION */ struct delayed_work pd_controllers_update; diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 5bceff48..09774f56 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -1064,3 +1064,9 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, stats.floats, stats.failed); } + +void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) +{ + pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used); + pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty)); +} diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 8a19e60e..e766ef55 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -100,5 +100,6 @@ static inline unsigned btree_blocks(struct bch_fs *c) void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, struct btree *); +void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index ba4acc11..ac81c9b9 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -1427,7 +1427,7 @@ int bch2_gc_thread_start(struct bch_fs *c) BUG_ON(c->gc_thread); - p = kthread_create(bch2_gc_thread, c, "bch_gc"); + p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index d605ff18..a21dc485 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -12,6 +12,8 @@ #include #include +static struct kmem_cache *bch2_key_cache; + static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -76,10 +78,13 @@ static void bkey_cached_free(struct btree_key_cache *bc, { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - list_move(&ck->list, &bc->freed); + list_move_tail(&ck->list, &bc->freed); + bc->nr_freed++; kfree(ck->k); ck->k = NULL; @@ -94,9 +99,20 @@ bkey_cached_alloc(struct btree_key_cache *c) { struct bkey_cached *ck; - list_for_each_entry(ck, &c->freed, list) - if (bkey_cached_lock_for_evict(ck)) + list_for_each_entry_reverse(ck, &c->freed, list) + if (bkey_cached_lock_for_evict(ck)) { + c->nr_freed--; return ck; + } + + ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); + if (likely(ck)) { + INIT_LIST_HEAD(&ck->list); + six_lock_init(&ck->c.lock); + BUG_ON(!six_trylock_intent(&ck->c.lock)); + BUG_ON(!six_trylock_write(&ck->c.lock)); + return ck; + } list_for_each_entry(ck, &c->clean, list) if (bkey_cached_lock_for_evict(ck)) { @@ -104,16 +120,7 @@ bkey_cached_alloc(struct btree_key_cache *c) return ck; } - ck = kzalloc(sizeof(*ck), GFP_NOFS); - if (!ck) - return NULL; - - INIT_LIST_HEAD(&ck->list); - six_lock_init(&ck->c.lock); - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); - - return ck; + return NULL; } static struct bkey_cached * @@ -132,8 +139,7 @@ btree_key_cache_create(struct btree_key_cache *c, ck->key.btree_id = btree_id; ck->key.pos = pos; ck->valid = false; - - BUG_ON(ck->flags); + ck->flags = 1U << BKEY_CACHED_ACCESSED; if (rhashtable_lookup_insert_fast(&c->table, &ck->hash, @@ -290,6 +296,9 @@ fill: goto err; } + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + 
set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + iter->uptodate = BTREE_ITER_NEED_PEEK; bch2_btree_iter_downgrade(iter); return ret; @@ -451,6 +460,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) iter->l[0].b; + bool kick_reclaim = false; BUG_ON(insert->u64s > ck->u64s); @@ -475,11 +485,18 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, set_bit(BKEY_CACHED_DIRTY, &ck->flags); c->btree_key_cache.nr_dirty++; + + if (bch2_nr_btree_keys_need_flush(c)) + kick_reclaim = true; + mutex_unlock(&c->btree_key_cache.lock); } bch2_journal_pin_update(&c->journal, trans->journal_res.seq, &ck->journal, btree_key_cache_journal_flush); + + if (kick_reclaim) + journal_reclaim_kick(&c->journal); return true; } @@ -509,28 +526,34 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, flags = memalloc_nofs_save(); + /* + * Newest freed entries are at the end of the list - once we hit one + * that's too new to be freed, we can bail out: + */ list_for_each_entry_safe(ck, t, &bc->freed, list) { - scanned++; + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; - if (poll_state_synchronize_srcu(&c->btree_trans_barrier, - ck->btree_trans_barrier_seq)) { - list_del(&ck->list); - kfree(ck); - freed++; - } - - if (scanned >= nr) - goto out; + list_del(&ck->list); + kmem_cache_free(bch2_key_cache, ck); + bc->nr_freed--; + scanned++; + freed++; } - list_for_each_entry_safe(ck, t, &bc->clean, list) { - scanned++; + if (scanned >= nr) + goto out; - if (bkey_cached_lock_for_evict(ck)) { + list_for_each_entry_safe(ck, t, &bc->clean, list) { + if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); + else if (bkey_cached_lock_for_evict(ck)) { bkey_cached_evict(bc, ck); bkey_cached_free(bc, ck); } + scanned++; if (scanned >= nr) { if (&t->list != &bc->clean) list_move_tail(&bc->clean, &t->list); @@ -570,18 +593,22 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) bch2_journal_preres_put(&c->journal, &ck->res); kfree(ck->k); - kfree(ck); + list_del(&ck->list); + kmem_cache_free(bch2_key_cache, ck); bc->nr_keys--; } BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal)); BUG_ON(bc->nr_keys); - list_for_each_entry_safe(ck, n, &bc->freed, list) - kfree(ck); + list_for_each_entry_safe(ck, n, &bc->freed, list) { + list_del(&ck->list); + kmem_cache_free(bch2_key_cache, ck); + } mutex_unlock(&bc->lock); - rhashtable_destroy(&bc->table); + if (bc->table_init_done) + rhashtable_destroy(&bc->table); } void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) @@ -594,33 +621,42 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) { + int ret; + + c->shrink.seeks = 1; c->shrink.count_objects = bch2_btree_key_cache_count; c->shrink.scan_objects = bch2_btree_key_cache_scan; - return register_shrinker(&c->shrink) ?: - rhashtable_init(&c->table, &bch2_btree_key_cache_params); + ret = register_shrinker(&c->shrink); + if (ret) + return ret; + + ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params); + if (ret) + return ret; + + c->table_init_done = true; + return 0; } void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - struct bucket_table *tbl; - struct bkey_cached *ck; - struct rhash_head *pos; - size_t i; + pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); + pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys); + 
pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty); +} - mutex_lock(&c->lock); - tbl = rht_dereference_rcu(c->table.tbl, &c->table); +void bch2_btree_key_cache_exit(void) +{ + if (bch2_key_cache) + kmem_cache_destroy(bch2_key_cache); +} - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - pr_buf(out, "%s:", - bch2_btree_ids[ck->key.btree_id]); - bch2_bpos_to_text(out, ck->key.pos); +int __init bch2_btree_key_cache_init(void) +{ + bch2_key_cache = KMEM_CACHE(bkey_cached, 0); + if (!bch2_key_cache) + return -ENOMEM; - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) - pr_buf(out, " journal seq %llu", ck->journal.seq); - pr_buf(out, "\n"); - } - } - mutex_unlock(&c->lock); + return 0; } diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index d448264a..d7d31a06 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -1,6 +1,24 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H +static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) +{ + size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); + size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + nr_keys / 2; + + return max_t(ssize_t, 0, nr_dirty - max_dirty); +} + +static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +{ + size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty); + size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + + return nr_dirty > max_dirty; +} + struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); @@ -25,4 +43,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *); void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); +void bch2_btree_key_cache_exit(void); +int __init bch2_btree_key_cache_init(void); + #endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 6013c916..cf59f122 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -293,11 +293,13 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter) struct btree_key_cache { struct mutex lock; struct rhashtable table; + bool table_init_done; struct list_head freed; struct list_head clean; struct list_head dirty; struct shrinker shrink; + size_t nr_freed; size_t nr_keys; size_t nr_dirty; }; @@ -307,7 +309,8 @@ struct bkey_cached_key { struct bpos pos; } __attribute__((packed, aligned(4))); -#define BKEY_CACHED_DIRTY 0 +#define BKEY_CACHED_ACCESSED 0 +#define BKEY_CACHED_DIRTY 1 struct bkey_cached { struct btree_bkey_cached_common c; @@ -647,6 +650,7 @@ enum btree_insert_ret { BTREE_INSERT_ENOSPC, BTREE_INSERT_NEED_MARK_REPLICAS, BTREE_INSERT_NEED_JOURNAL_RES, + BTREE_INSERT_NEED_JOURNAL_RECLAIM, }; enum btree_gc_coalesce_fail_reason { diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index e0b1bde3..adb07043 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -67,8 +67,8 @@ int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); -int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, - struct bpos, u64 *); +int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, + struct bpos, struct bpos, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct 
bpos, u64 *); diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index d4f3dd7a..5143896e 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -49,12 +49,27 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; bp = bkey_s_c_to_btree_ptr_v2(k); - BUG_ON(bkey_cmp(next_node, bp.v->min_key)); + if (bkey_cmp(next_node, bp.v->min_key)) { + bch2_dump_btree_node(c, b); + panic("expected next min_key %llu:%llu got %llu:%llu\n", + next_node.inode, + next_node.offset, + bp.v->min_key.inode, + bp.v->min_key.offset); + } bch2_btree_node_iter_advance(&iter, b); if (bch2_btree_node_iter_end(&iter)) { - BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); + + if (bkey_cmp(k.k->p, b->key.k.p)) { + bch2_dump_btree_node(c, b); + panic("expected end %llu:%llu got %llu:%llu\n", + b->key.k.p.inode, + b->key.k.p.offset, + k.k->p.inode, + k.k->p.offset); + } break; } @@ -1026,7 +1041,8 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b struct bkey_packed *k; const char *invalid; - invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)); + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); if (invalid) { char buf[160]; @@ -1368,9 +1384,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - if (as->must_rewrite) - goto split; - bch2_btree_node_lock_for_insert(c, b, iter); if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { @@ -1378,6 +1391,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, goto split; } + btree_node_interior_verify(c, b); + bch2_btree_insert_keys_interior(as, b, iter, keys); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 41854fc3..45d21273 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -47,7 +47,6 @@ struct btree_update { BTREE_INTERIOR_UPDATING_AS, } mode; - unsigned must_rewrite:1; unsigned nodes_written:1; enum btree_id btree_id; diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index a2ca31e7..bbc6d512 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -286,6 +286,10 @@ btree_key_can_insert_cached(struct btree_trans *trans, BUG_ON(iter->level); + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(trans->c)) + return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + if (u64s <= ck->u64s) return BTREE_INSERT_OK; @@ -642,20 +646,24 @@ int bch2_trans_commit_error(struct btree_trans *trans, trace_trans_restart_journal_res_get(trans->ip); ret = -EINTR; break; - default: - BUG_ON(ret >= 0); - break; - } - - if (ret == -EINTR) { - int ret2 = bch2_btree_iter_traverse_all(trans); + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); - if (ret2) { - trace_trans_restart_traverse(trans->ip); - return ret2; + while (bch2_btree_key_cache_must_wait(c)) { + mutex_lock(&c->journal.reclaim_lock); + bch2_journal_reclaim(&c->journal); + mutex_unlock(&c->journal.reclaim_lock); } - trace_trans_restart_atomic(trans->ip); + if (bch2_trans_relock(trans)) + return 0; + + trace_trans_restart_journal_reclaim(trans->ip); + ret = -EINTR; + break; + default: + BUG_ON(ret >= 0); + break; } return ret; @@ -1076,13 +1084,32 @@ int 
bch2_btree_insert(struct bch_fs *c, enum btree_id id, __bch2_btree_insert(&trans, id, k)); } -int bch2_btree_delete_at_range(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end, - u64 *journal_seq) +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) +{ + struct bkey_i k; + + bkey_init(&k.k); + k.k.p = iter->pos; + + bch2_trans_update(trans, iter, &k, 0); + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE|flags); +} + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, + u64 *journal_seq) { + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; + + iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret; retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -1093,6 +1120,10 @@ retry: bkey_init(&delete.k); + /* + * This could probably be more efficient for extents: + */ + /* * For extents, iter.pos won't necessarily be the same as * bkey_start_pos(k.k) (for non extents they always will be the @@ -1132,22 +1163,8 @@ retry: goto retry; } + bch2_trans_iter_put(trans, iter); return ret; - -} - -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_i k; - - bkey_init(&k.k); - k.k.p = iter->pos; - - bch2_trans_update(trans, iter, &k, 0); - return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE|flags); } /* @@ -1159,21 +1176,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bpos start, struct bpos end, u64 *journal_seq) { - struct btree_trans trans; - struct btree_iter *iter; - int ret = 0; - - /* - * XXX: whether we need mem/more iters depends on whether this btree id - * has triggers - */ - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); - - iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); - - ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); - ret = bch2_trans_exit(&trans) ?: ret; - - BUG_ON(ret == -EINTR); - return ret; + return bch2_trans_do(c, NULL, journal_seq, 0, + bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq)); } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index be65f2e7..f7bdb143 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -2044,16 +2044,6 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c) return avail_factor(__bch2_fs_usage_read_short(c).free); } -void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) -{ - percpu_down_read(&c->mark_lock); - this_cpu_sub(c->usage[0]->online_reserved, - res->sectors); - percpu_up_read(&c->mark_lock); - - res->sectors = 0; -} - #define SECTORS_CACHE 1024 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index a3873bec..856dc5a8 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -272,13 +272,11 @@ void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); /* disk reservations: */ -void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); - static inline void bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { - if (res->sectors) - __bch2_disk_reservation_put(c, res); + this_cpu_sub(c->usage[0]->online_reserved, res->sectors); + res->sectors = 0; } #define BCH_DISK_RESERVATION_NOFAIL (1 
<< 0) diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 4663784d..e7c8969a 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -341,7 +341,8 @@ static long bch2_ioctl_data(struct bch_fs *c, ctx->c = c; ctx->arg = arg; - ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); + ctx->thread = kthread_create(bch2_data_thread, ctx, + "bch-data/%s", c->name); if (IS_ERR(ctx->thread)) { ret = PTR_ERR(ctx->thread); goto err; diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 389f23ee..7d193ce4 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -684,7 +684,7 @@ static int readpages_iter_init(struct readpages_iter *iter, if (!iter->pages) return -ENOMEM; - __readahead_batch(ractl, iter->pages, nr_pages); + nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); for (i = 0; i < nr_pages; i++) { __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); put_page(iter->pages[i]); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 6e3d4bea..f3f6fe6c 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1252,7 +1252,7 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode->v.i_ino); + bch2_inode_rm(c, inode->v.i_ino, true); } } diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 0c503527..09ce6c29 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -1254,7 +1254,7 @@ static int check_inode(struct btree_trans *trans, bch2_fs_lazy_rw(c); - ret = bch2_inode_rm(c, u.bi_inum); + ret = bch2_inode_rm(c, u.bi_inum, false); if (ret) bch_err(c, "error in fsck: error %i while deleting inode", ret); return ret; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 823a1dde..82099e5a 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -542,7 +542,7 @@ found_slot: return ret; } -int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) +int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) { struct btree_trans trans; struct btree_iter *iter; @@ -553,6 +553,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) u64 bi_generation; int ret; + bch2_trans_init(&trans, c, 0, 0); + /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should @@ -561,30 +563,34 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, - start, end, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_XATTRS, - start, end, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_DIRENTS, - start, end, NULL); + ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_EXTENTS, + start, end, NULL) ?: + bch2_btree_delete_range_trans(&trans, BTREE_ID_XATTRS, + start, end, NULL) ?: + bch2_btree_delete_range_trans(&trans, BTREE_ID_DIRENTS, + start, end, NULL); if (ret) - return ret; - - bch2_trans_init(&trans, c, 0, 0); + goto err; retry: bch2_trans_begin(&trans); bi_generation = 0; - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), - BTREE_ITER_CACHED|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_cached(iter); + if (cached) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), + BTREE_ITER_CACHED|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_cached(iter); + } else { + iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = 
bch2_btree_iter_peek_slot(iter); + } ret = bkey_err(k); if (ret) goto err; - bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, + bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, trans.c, "inode %llu not found when deleting", inode_nr); diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index ef7e885d..dbdfcf63 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -71,7 +71,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); -int bch2_inode_rm(struct bch_fs *, u64); +int bch2_inode_rm(struct bch_fs *, u64, bool); int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, struct bch_inode_unpacked *); diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 1b3f249b..5874a9ff 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -226,16 +226,19 @@ static bool journal_entry_close(struct journal *j) */ static int journal_entry_open(struct journal *j) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; int u64s; u64 v; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); if (j->blocked) - return -EAGAIN; + return cur_entry_blocked; if (j->cur_entry_error) return j->cur_entry_error; @@ -251,7 +254,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= le32_to_cpu(buf->data->u64s)) - return -ENOSPC; + return cur_entry_journal_full; /* * Must be set before marking the journal entry as open: @@ -263,7 +266,7 @@ static int journal_entry_open(struct journal *j) old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return -EROFS; + return cur_entry_insufficient_devices; /* Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); @@ -376,7 +379,7 @@ retry: * Don't want to close current journal entry, just need to * invoke reclaim: */ - ret = -ENOSPC; + ret = cur_entry_journal_full; goto unlock; } @@ -399,14 +402,16 @@ retry: * there's still a previous one in flight: */ trace_journal_entry_full(c); - ret = -EAGAIN; + ret = cur_entry_blocked; } else { ret = journal_entry_open(j); } unlock: - if ((ret == -EAGAIN || ret == -ENOSPC) && - !j->res_get_blocked_start) + if ((ret && ret != cur_entry_insufficient_devices) && + !j->res_get_blocked_start) { j->res_get_blocked_start = local_clock() ?: 1; + trace_journal_full(c); + } can_discard = j->can_discard; spin_unlock(&j->lock); @@ -414,41 +419,39 @@ unlock: if (!ret) goto retry; - if (ret == -ENOSPC) { - if (WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), - "JOURNAL_RES_GET_RESERVED set but journal full")) { - char *buf; - - buf = kmalloc(4096, GFP_NOFS); - if (buf) { - bch2_journal_debug_to_text(&PBUF(buf), j); - pr_err("\n%s", buf); - kfree(buf); - } + if (WARN_ONCE(ret == cur_entry_journal_full && + !can_discard && + (flags & JOURNAL_RES_GET_RESERVED), + "JOURNAL_RES_GET_RESERVED set but journal full")) { + char *buf; + + buf = kmalloc(4096, GFP_NOFS); + if (buf) { + bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); + pr_err("\n%s", buf); + kfree(buf); } + } - /* - * Journal is full - can't rely on reclaim from work item due to - * freezing: - */ - trace_journal_full(c); - - if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { - if (can_discard) { - bch2_journal_do_discards(j); - goto retry; - } - - if (mutex_trylock(&j->reclaim_lock)) { - 
bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } + /* + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ + if ((ret == cur_entry_journal_full || + ret == cur_entry_journal_pin_full) && + !(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); + goto retry; } - ret = -EAGAIN; + if (mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } } - return ret; + return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN; } /* @@ -481,8 +484,10 @@ static bool journal_preres_available(struct journal *j, { bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); - if (!ret) - bch2_journal_reclaim_work(&j->reclaim_work.work); + if (!ret && mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } return ret; } @@ -543,12 +548,20 @@ out: * necessary */ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, - struct closure *parent) + struct closure *parent) { struct journal_buf *buf; int ret = 0; + if (seq <= j->err_seq) + return -EIO; + + if (seq <= j->seq_ondisk) + return 1; + spin_lock(&j->lock); + + /* Recheck under lock: */ if (seq <= j->err_seq) { ret = -EIO; goto out; @@ -678,16 +691,19 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (nr <= ja->nr) return 0; - ret = -ENOMEM; new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); - if (!new_buckets || !new_bucket_seq) + if (!new_buckets || !new_bucket_seq) { + ret = -ENOMEM; goto err; + } journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, nr + sizeof(*journal_buckets) / sizeof(u64)); - if (!journal_buckets) + if (!journal_buckets) { + ret = -ENOSPC; goto err; + } /* * We may be called from the device add path, before the new device has @@ -716,8 +732,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; } } else { + rcu_read_lock(); ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl); + rcu_read_unlock(); if (IS_ERR(ob)) { ret = cl ? 
-EAGAIN : -ENOSPC; goto err; @@ -769,8 +787,6 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (!new_fs) bch2_open_bucket_put(c, ob); } - - ret = 0; err: bch2_sb_resize_journal(&ca->disk_sb, ja->nr + sizeof(*journal_buckets) / sizeof(u64)); @@ -889,7 +905,7 @@ void bch2_fs_journal_stop(struct journal *j) j->last_empty_seq + 1 != journal_cur_seq(j))); cancel_delayed_work_sync(&j->write_work); - cancel_delayed_work_sync(&j->reclaim_work); + bch2_journal_reclaim_stop(j); } int bch2_fs_journal_start(struct journal *j, u64 cur_seq, @@ -1017,7 +1033,6 @@ int bch2_fs_journal_init(struct journal *j) spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); - INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->reclaim_lock); mutex_init(&j->discard_lock); @@ -1069,7 +1084,10 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" "prereserved:\t\t%u/%u\n" + "nr direct reclaim:\t%llu\n" + "nr background reclaim:\t%llu\n" "current entry sectors:\t%u\n" + "current entry error:\t%u\n" "current entry:\t\t", fifo_used(&j->pin), journal_cur_seq(j), @@ -1077,7 +1095,10 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->last_seq_ondisk, j->prereserved.reserved, j->prereserved.remaining, - j->cur_entry_sectors); + j->nr_direct_reclaim, + j->nr_background_reclaim, + j->cur_entry_sectors, + j->cur_entry_error); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 7c157bc5..d1367cf0 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -994,7 +994,7 @@ static void journal_write_done(struct closure *cl) * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); + journal_reclaim_kick(&c->journal); /* also must come before signalling write completion: */ closure_debug_destroy(cl); @@ -1045,6 +1045,8 @@ void bch2_journal_write(struct closure *cl) unsigned i, sectors, bytes, u64s; int ret; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); journal_buf_realloc(j, w); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 7a04d06b..66f5dcce 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -1,12 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_key_cache.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" #include "super.h" +#include +#include +#include + /* Free space calculations: */ static unsigned journal_space_from(struct journal_device *ja, @@ -164,12 +169,12 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < c->opts.metadata_replicas_required) { - ret = -EROFS; + ret = cur_entry_insufficient_devices; goto out; } if (!fifo_free(&j->pin)) { - ret = -ENOSPC; + ret = cur_entry_journal_pin_full; goto out; } @@ -180,7 +185,7 @@ void bch2_journal_space_available(struct journal *j) clean = __journal_space_available(j, nr_devs_want, journal_space_clean); if (!discarded.next_entry) - ret = -ENOSPC; + ret = cur_entry_journal_full; overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * journal_entry_overhead(j); @@ -432,7 +437,6 @@ journal_get_next_pin(struct 
journal *j, u64 max_seq, u64 *seq) list_move(&ret->list, &pin_list->flushed); BUG_ON(j->flush_in_progress); j->flush_in_progress = ret; - j->last_flushed = jiffies; } spin_unlock(&j->lock); @@ -441,17 +445,24 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) } /* returns true if we did work */ -static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, - unsigned min_nr) +static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush, + unsigned min_nr) { struct journal_entry_pin *pin; - bool ret = false; - u64 seq; + u64 seq, ret = 0; lockdep_assert_held(&j->reclaim_lock); - while ((pin = journal_get_next_pin(j, min_nr - ? U64_MAX : seq_to_flush, &seq))) { + while (1) { + cond_resched(); + + j->last_flushed = jiffies; + + pin = journal_get_next_pin(j, min_nr + ? U64_MAX : seq_to_flush, &seq); + if (!pin) + break; + if (min_nr) min_nr--; @@ -460,7 +471,7 @@ static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, BUG_ON(j->flush_in_progress != pin); j->flush_in_progress = NULL; wake_up(&j->pin_flush_wait); - ret = true; + ret++; } return ret; @@ -524,15 +535,27 @@ static u64 journal_seq_to_flush(struct journal *j) * 512 journal entries or 25% of all journal buckets, then * journal_next_bucket() should not stall. */ -void bch2_journal_reclaim(struct journal *j) +static void __bch2_journal_reclaim(struct journal *j, bool direct) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - unsigned min_nr = 0; - u64 seq_to_flush = 0; + bool kthread = (current->flags & PF_KTHREAD) != 0; + u64 seq_to_flush, nr_flushed = 0; + size_t min_nr; + unsigned flags; + /* + * We can't invoke memory reclaim while holding the reclaim_lock - + * journal reclaim is required to make progress for memory reclaim + * (cleaning the caches), so we can't get stuck in memory reclaim while + * we're holding the reclaim lock: + */ lockdep_assert_held(&j->reclaim_lock); + flags = memalloc_noreclaim_save(); do { + if (kthread && kthread_should_stop()) + break; + bch2_journal_do_discards(j); seq_to_flush = journal_seq_to_flush(j); @@ -549,26 +572,103 @@ void bch2_journal_reclaim(struct journal *j) if (j->prereserved.reserved * 2 > j->prereserved.remaining) min_nr = 1; - if ((atomic_read(&c->btree_cache.dirty) * 4 > - c->btree_cache.used * 3) || - (c->btree_key_cache.nr_dirty * 4 > - c->btree_key_cache.nr_keys)) + if (atomic_read(&c->btree_cache.dirty) * 4 > + c->btree_cache.used * 3) min_nr = 1; - } while (journal_flush_pins(j, seq_to_flush, min_nr)); - if (!bch2_journal_error(j)) - queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, - msecs_to_jiffies(j->reclaim_delay_ms)); + min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c)); + + trace_journal_reclaim_start(c, + min_nr, + j->prereserved.reserved, + j->prereserved.remaining, + atomic_read(&c->btree_cache.dirty), + c->btree_cache.used, + c->btree_key_cache.nr_dirty, + c->btree_key_cache.nr_keys); + + nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr); + + if (direct) + j->nr_direct_reclaim += nr_flushed; + else + j->nr_background_reclaim += nr_flushed; + trace_journal_reclaim_finish(c, nr_flushed); + } while (min_nr); + + memalloc_noreclaim_restore(flags); } -void bch2_journal_reclaim_work(struct work_struct *work) +void bch2_journal_reclaim(struct journal *j) { - struct journal *j = container_of(to_delayed_work(work), - struct journal, reclaim_work); + __bch2_journal_reclaim(j, true); +} - mutex_lock(&j->reclaim_lock); - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); +static int 
bch2_journal_reclaim_thread(void *arg) +{ + struct journal *j = arg; + unsigned long next; + + set_freezable(); + + kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); + + while (!kthread_should_stop()) { + j->reclaim_kicked = false; + + mutex_lock(&j->reclaim_lock); + __bch2_journal_reclaim(j, false); + mutex_unlock(&j->reclaim_lock); + + next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + if (j->reclaim_kicked) + break; + if (time_after_eq(jiffies, next)) + break; + schedule_timeout(next - jiffies); + try_to_freeze(); + + } + __set_current_state(TASK_RUNNING); + } + + return 0; +} + +void bch2_journal_reclaim_stop(struct journal *j) +{ + struct task_struct *p = j->reclaim_thread; + + j->reclaim_thread = NULL; + + if (p) { + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_journal_reclaim_start(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct task_struct *p; + + if (j->reclaim_thread) + return 0; + + p = kthread_create(bch2_journal_reclaim_thread, j, + "bch-reclaim/%s", c->name); + if (IS_ERR(p)) + return PTR_ERR(p); + + get_task_struct(p); + j->reclaim_thread = p; + wake_up_process(p); + return 0; } static int journal_flush_done(struct journal *j, u64 seq_to_flush, @@ -582,7 +682,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - *did_work = journal_flush_pins(j, seq_to_flush, 0); + *did_work = journal_flush_pins(j, seq_to_flush, 0) != 0; spin_lock(&j->lock); /* diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index 8128907a..bae2c921 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -10,6 +10,17 @@ enum journal_space_from { journal_space_clean, }; +static inline void journal_reclaim_kick(struct journal *j) +{ + struct task_struct *p = READ_ONCE(j->reclaim_thread); + + if (p && !j->reclaim_kicked) { + j->reclaim_kicked = true; + if (p) + wake_up_process(p); + } +} + unsigned bch2_journal_dev_buckets_available(struct journal *, struct journal_device *, enum journal_space_from); @@ -55,7 +66,9 @@ void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); void bch2_journal_do_discards(struct journal *); void bch2_journal_reclaim(struct journal *); -void bch2_journal_reclaim_work(struct work_struct *); + +void bch2_journal_reclaim_stop(struct journal *); +int bch2_journal_reclaim_start(struct journal *); bool bch2_journal_flush_pins(struct journal *, u64); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 9757e3d5..4640bb86 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -146,7 +146,13 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ - int cur_entry_error; + enum { + cur_entry_ok, + cur_entry_blocked, + cur_entry_journal_full, + cur_entry_journal_pin_full, + cur_entry_insufficient_devices, + } cur_entry_error; union journal_preres_state prereserved; @@ -210,8 +216,12 @@ struct journal { struct write_point wp; spinlock_t err_lock; - struct delayed_work reclaim_work; struct mutex reclaim_lock; + struct task_struct *reclaim_thread; + bool reclaim_kicked; + u64 nr_direct_reclaim; + u64 nr_background_reclaim; + unsigned long last_flushed; struct journal_entry_pin *flush_in_progress; wait_queue_head_t pin_flush_wait; diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 
ddfda1ef..4834f41f 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -345,7 +345,7 @@ int bch2_copygc_start(struct bch_fs *c) if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; - t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); + t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); if (IS_ERR(t)) return PTR_ERR(t); diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 44d2651b..c3373c48 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -314,7 +314,7 @@ int bch2_rebalance_start(struct bch_fs *c) if (c->opts.nochanges) return 0; - p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); + p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); if (IS_ERR(p)) return PTR_ERR(p); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 67500636..0b3521c9 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -616,6 +616,7 @@ static int bch2_journal_replay(struct bch_fs *c, */ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); + journal_reclaim_kick(j); j->replay_journal_seq = seq; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 8673e974..e3bbd0b0 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include @@ -259,7 +258,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes) void bch2_fs_read_only(struct bch_fs *c) { if (!test_bit(BCH_FS_RW, &c->flags)) { - cancel_delayed_work_sync(&c->journal.reclaim_work); + BUG_ON(c->journal.reclaim_thread); return; } @@ -417,6 +416,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) { + bch_err(c, "error starting journal reclaim: %i", ret); + return ret; + } + if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -425,9 +430,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) percpu_ref_reinit(&c->writes); set_bit(BCH_FS_RW, &c->flags); - - queue_delayed_work(c->journal_reclaim_wq, - &c->journal.reclaim_work, 0); return 0; err: __bch2_fs_read_only(c); @@ -495,8 +497,6 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); - if (c->journal_reclaim_wq) - destroy_workqueue(c->journal_reclaim_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->wq) @@ -750,8 +750,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || @@ -2018,6 +2016,7 @@ static void bcachefs_exit(void) bch2_debug_exit(); bch2_vfs_exit(); bch2_chardev_exit(); + bch2_btree_key_cache_exit(); if (bcachefs_kset) kset_unregister(bcachefs_kset); } @@ -2027,6 +2026,7 @@ static int __init bcachefs_init(void) bch2_bkey_pack_test(); if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || + bch2_btree_key_cache_init() || bch2_chardev_init() || bch2_vfs_init() || bch2_debug_init()) diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 58c00e26..900eda88 100644 --- 
a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -165,6 +165,7 @@ read_attribute(journal_debug); read_attribute(journal_pins); read_attribute(btree_updates); read_attribute(dirty_btree_nodes); +read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_transactions); read_attribute(stripes_heap); @@ -374,6 +375,11 @@ SHOW(bch2_fs) return out.pos - buf; } + if (attr == &sysfs_btree_cache) { + bch2_btree_cache_to_text(&out, c); + return out.pos - buf; + } + if (attr == &sysfs_btree_key_cache) { bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); return out.pos - buf; @@ -550,6 +556,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_pins, &sysfs_btree_updates, &sysfs_dirty_btree_nodes, + &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_btree_transactions, &sysfs_stripes_heap, -- cgit v1.2.3
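
The sketches below sit after the patch signature ("-- " line), which git-am discards, so they do not affect how the patch applies. Each is a small, self-contained C program illustrating a mechanism this update introduces; every toy_*/demo name is an illustrative stand-in, not a real bcachefs identifier. First, the PF_MEMALLOC save/restore idiom behind the new memalloc_noreclaim_save()/memalloc_noreclaim_restore() shims in sched/mm.h: __bch2_journal_reclaim() now runs with PF_MEMALLOC set, because journal reclaim must make progress for memory reclaim to make progress, so it cannot be allowed to recurse into memory reclaim itself. The idiom saves only the bit of interest and later restores exactly that bit, shown here on a plain unsigned:

#include <assert.h>

#define PF_MEMALLOC 0x00000800	/* same value the shim uses */

static unsigned flags_demo(unsigned cur)
{
	unsigned saved = cur & PF_MEMALLOC;	/* memalloc_noreclaim_save() */

	cur |= PF_MEMALLOC;
	/* ... work that must not recurse into memory reclaim ... */
	cur = (cur & ~PF_MEMALLOC) | saved;	/* memalloc_noreclaim_restore() */
	return cur;
}

int main(void)
{
	assert(flags_demo(0x00040000) == 0x00040000);	/* clear bit stays clear */
	assert(flags_demo(0x00040800) == 0x00040800);	/* set bit stays set */
	return 0;
}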
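
The kmem_cache shim added to include/linux/slab.h gives userspace builds just enough of the kernel slab API for btree_key_cache.c, which now allocates struct bkey_cached from a dedicated bch2_key_cache cache instead of bare kzalloc()/kfree(). A minimal sketch of the same pattern, with malloc/calloc/free standing in for the shim's kmalloc-based internals so it builds anywhere; bkey_cached_toy and cache_alloc_zeroed() are invented for the demo:

#include <stdio.h>
#include <stdlib.h>

struct kmem_cache {
	size_t obj_size;	/* the shim only records the object size */
};

static struct kmem_cache *kmem_cache_create(size_t obj_size)
{
	struct kmem_cache *c = malloc(sizeof(*c));

	if (c)
		c->obj_size = obj_size;
	return c;
}

/* mirrors the shim: _flags is accepted and ignored */
#define KMEM_CACHE(_struct, _flags) kmem_cache_create(sizeof(struct _struct))

static void *cache_alloc_zeroed(struct kmem_cache *c)
{
	/* like the shim's kmem_cache_alloc(c, GFP_NOFS|__GFP_ZERO) */
	return calloc(1, c->obj_size);
}

struct bkey_cached_toy {
	unsigned long flags;
	char payload[56];
};

int main(void)
{
	struct kmem_cache *cache = KMEM_CACHE(bkey_cached_toy, 0);
	struct bkey_cached_toy *ck;

	if (!cache)
		return 1;
	ck = cache_alloc_zeroed(cache);
	if (!ck)
		return 1;

	printf("obj_size=%zu flags=%lu\n", cache->obj_size, ck->flags);
	free(ck);	/* kmem_cache_free() in the shim is kfree() */
	free(cache);	/* kmem_cache_destroy() likewise */
	return 0;
}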
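
The inline helpers added to btree_key_cache.h are what tie the key cache to journal reclaim: bch2_btree_insert_key_cached() kicks the new reclaim thread once dirty keys cross the soft limit (half of all cached keys, plus slack), and btree_key_can_insert_cached() returns the new BTREE_INSERT_NEED_JOURNAL_RECLAIM error at the hard limit (three quarters), making the transaction unlock and run reclaim directly. A standalone sketch of just that threshold arithmetic; toy_key_cache and the counts in main() are made up, and upstream reads the live counters with READ_ONCE():

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_key_cache {
	size_t nr_keys;		/* total keys in the cache */
	size_t nr_dirty;	/* keys not yet flushed back to the btree */
};

/* soft limit: nonzero once nr_dirty exceeds 4096 + nr_keys / 2 */
static size_t toy_nr_keys_need_flush(const struct toy_key_cache *c)
{
	size_t max_dirty = 4096 + c->nr_keys / 2;

	return c->nr_dirty > max_dirty ? c->nr_dirty - max_dirty : 0;
}

/* hard limit: inserts must wait once nr_dirty exceeds 4096 + 3/4 of total */
static bool toy_key_cache_must_wait(const struct toy_key_cache *c)
{
	size_t max_dirty = 4096 + (c->nr_keys * 3) / 4;

	return c->nr_dirty > max_dirty;
}

int main(void)
{
	struct toy_key_cache c = { .nr_keys = 100000, .nr_dirty = 60000 };

	/* 60000 > 54096, so reclaim is kicked for 5904 keys... */
	printf("need flush: %zu\n", toy_nr_keys_need_flush(&c));
	/* ...but 60000 <= 79096, so inserts do not stall yet */
	printf("must wait: %d\n", toy_key_cache_must_wait(&c));
	return 0;
}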
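
journal.c also stops overloading generic errnos internally: journal_entry_open() and bch2_journal_space_available() now record one of the cur_entry_* states added to journal_types.h, and only the boundary of the journal-reservation path translates that back to an errno, with everything except the insufficient-devices case staying retryable. A sketch of that translation; the enum mirrors the patch, while state_to_errno() is a hypothetical condensation of the return path, not a real function:

#include <errno.h>
#include <stdio.h>

enum cur_entry_state {
	cur_entry_ok,
	cur_entry_blocked,
	cur_entry_journal_full,
	cur_entry_journal_pin_full,
	cur_entry_insufficient_devices,
};

static int state_to_errno(enum cur_entry_state s)
{
	if (s == cur_entry_ok)
		return 0;
	/* only losing too many devices is fatal; the rest mean "retry" */
	return s == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
}

int main(void)
{
	printf("ok=%d full=%d insufficient=%d\n",
	       state_to_errno(cur_entry_ok),
	       state_to_errno(cur_entry_journal_full),
	       state_to_errno(cur_entry_insufficient_devices));
	return 0;
}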
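
Finally, reclaim moves off the bcachefs_journal_reclaim workqueue onto a dedicated "bch-reclaim/<dev>" kthread that sleeps until journal_reclaim_kick() sets reclaim_kicked and wakes it, the thread is stopped, or reclaim_delay_ms elapses. The sketch below reproduces that wait protocol with pthreads, since the kthread API is kernel-only; reclaim_kick() and reclaim_thread() are userspace analogues of journal_reclaim_kick() and bch2_journal_reclaim_thread(), not the real functions:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool kicked, stop_thread;

static void reclaim_kick(void)
{
	pthread_mutex_lock(&lock);
	kicked = true;			/* j->reclaim_kicked = true */
	pthread_cond_signal(&cond);	/* wake_up_process(p) */
	pthread_mutex_unlock(&lock);
}

static void *reclaim_thread(void *arg)
{
	(void)arg;

	pthread_mutex_lock(&lock);
	while (!stop_thread) {
		struct timespec deadline;

		kicked = false;
		pthread_mutex_unlock(&lock);
		printf("reclaim pass\n");	/* __bch2_journal_reclaim() */
		pthread_mutex_lock(&lock);

		/* sleep until kicked, stopped, or 100ms (reclaim_delay_ms) */
		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_nsec += 100 * 1000 * 1000;
		if (deadline.tv_nsec >= 1000000000L) {
			deadline.tv_sec++;
			deadline.tv_nsec -= 1000000000L;
		}
		while (!kicked && !stop_thread &&
		       !pthread_cond_timedwait(&cond, &lock, &deadline))
			;	/* spurious wakeup: wait again */
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reclaim_thread, NULL);
	usleep(10000);
	reclaim_kick();		/* forces an immediate second pass */
	usleep(10000);

	pthread_mutex_lock(&lock);
	stop_thread = true;	/* kthread_stop() analogue */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}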