| author    | Kent Overstreet <kent.overstreet@linux.dev>      | 2025-09-09 16:09:20 -0400 |
|-----------|--------------------------------------------------|---------------------------|
| committer | Kent Overstreet <kent.overstreet@linux.dev>      | 2025-09-14 11:28:41 -0400 |
| commit    | 6ad0fa30a9c5aafda1206a64ba1262796ed35457 (patch)  |                           |
| tree      | d779ccee13ee4e8fa7a7b21e799dd6126a48274d          |                           |
| parent    | 0c6b9627a40559939ae58d305089c17dea7df5df (diff)   |                           |
Update bcachefs sources to 1c8d3fc41e72 bcachefs: fast_list.c is only required for async obj debugging
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
39 files changed, 597 insertions(+), 559 deletions(-)
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 13c25773..5fc72f30 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-292344971769fe1dd561d8844c57c15c833f91ef
+1c8d3fc41e7291ee39458e225a1ceac76bb8d8f1
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -266,8 +266,10 @@ update-bcachefs-sources:
 	git rm -rf --ignore-unmatch libbcachefs
 	test -d libbcachefs || mkdir libbcachefs
 	cp $(LINUX_DIR)/fs/bcachefs/*.[ch] libbcachefs/
+	cp $(LINUX_DIR)/fs/bcachefs/Makefile libbcachefs/
 	rm libbcachefs/fast_list.c libbcachefs/async_objs.c
 	git add libbcachefs/*.[ch]
+	git add libbcachefs/Makefile
 	git rm -f libbcachefs/mean_and_variance_test.c
 	cp $(LINUX_DIR)/include/linux/closure.h include/linux/
 	git add include/linux/closure.h
diff --git a/c_src/cmd_option.c b/c_src/cmd_option.c
index 433d6196..1ae4e076 100644
--- a/c_src/cmd_option.c
+++ b/c_src/cmd_option.c
@@ -117,7 +117,7 @@ int cmd_set_option(int argc, char *argv[])
 			fprintf(stderr, "Can't set option %s\n", opt->attr.name);
 
 		if (opt->flags & OPT_FS) {
-			ret = bch2_opt_hook_pre_set(c, NULL, 0, i, v);
+			ret = bch2_opt_hook_pre_set(c, NULL, 0, i, v, true);
 			if (ret < 0) {
 				fprintf(stderr, "error setting %s: %i\n", opt->attr.name, ret);
 				continue;
@@ -135,7 +135,7 @@ int cmd_set_option(int argc, char *argv[])
 				continue;
 			}
 
-			ret = bch2_opt_hook_pre_set(c, ca, 0, i, v);
+			ret = bch2_opt_hook_pre_set(c, ca, 0, i, v, true);
 			if (ret < 0) {
 				fprintf(stderr, "error setting %s: %i\n", opt->attr.name, ret);
 				continue;
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index ae87a25a..62c70879 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -275,6 +275,13 @@ static inline bool a_type##_try_cmpxchg_acquire(a_type##_t *v, i_type *old, i_type new)\
 	i_type prev = *old;						\
 	*old = cmpxchg_acquire(&v->counter, *old, new);			\
 	return prev == *old;						\
+}									\
+									\
+static inline bool a_type##_try_cmpxchg_release(a_type##_t *v, i_type *old, i_type new)\
+{									\
+	i_type prev = *old;						\
+	*old = cmpxchg_release(&v->counter, *old, new);			\
+	return prev == *old;						\
 }
 
 DEF_ATOMIC_OPS(atomic, int)
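The new `_try_cmpxchg_release` variant above follows the standard try-cmpxchg contract: on failure, `*old` is refreshed with the value actually observed, so a retry loop needs no extra load. A minimal userspace sketch of the same contract using C11 atomics (names here are illustrative, not part of this codebase):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Same contract as the release variant above: on failure, *old is
 * updated to the value actually observed in the atomic. */
static bool try_cmpxchg_release(atomic_int *v, int *old, int new)
{
	return atomic_compare_exchange_strong_explicit(v, old, new,
						       memory_order_release,
						       memory_order_relaxed);
}

int main(void)
{
	atomic_int v = 5;
	int old = atomic_load_explicit(&v, memory_order_relaxed);

	/* classic CAS loop: 'new' is recomputed from the refreshed 'old' */
	while (!try_cmpxchg_release(&v, &old, old + 1))
		;

	printf("%d\n", atomic_load(&v));	/* prints 6 */
	return 0;
}
```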
diff --git a/include/linux/closure.h b/include/linux/closure.h
index 880fe85e..83a0dde3 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -128,14 +128,15 @@ enum closure_state {
	 * annotate where references are being transferred.
	 */
 
-	CLOSURE_BITS_START	= (1U << 26),
-	CLOSURE_DESTRUCTOR	= (1U << 26),
+	CLOSURE_BITS_START	= (1U << 24),
+	CLOSURE_DESTRUCTOR	= (1U << 24),
+	CLOSURE_SLEEPING	= (1U << 26),
 	CLOSURE_WAITING		= (1U << 28),
 	CLOSURE_RUNNING		= (1U << 30),
 };
 
 #define CLOSURE_GUARD_MASK					\
-	((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
+	(((CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)|(CLOSURE_BITS_START >> 1))
 
 #define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1)
 #define CLOSURE_REMAINING_INITIALIZER	(1|CLOSURE_RUNNING)
@@ -144,7 +145,7 @@ struct closure {
 	union {
 		struct {
 			struct workqueue_struct *wq;
-			struct closure_syncer	*s;
+			struct task_struct	*sleeper;
 			struct llist_node	list;
 			closure_fn		*fn;
 		};
@@ -154,7 +155,6 @@ struct closure {
 	struct closure		*parent;
 
 	atomic_t		remaining;
-	bool			closure_get_happened;
 
 #ifdef CONFIG_DEBUG_CLOSURES
 #define CLOSURE_MAGIC_DEAD	0xc054dead
@@ -169,11 +169,18 @@ struct closure {
 };
 
 void closure_sub(struct closure *cl, int v);
-void closure_put(struct closure *cl);
 void __closure_wake_up(struct closure_waitlist *list);
 bool closure_wait(struct closure_waitlist *list, struct closure *cl);
 void __closure_sync(struct closure *cl);
 
+/*
+ * closure_put - decrement a closure's refcount
+ */
+static inline void closure_put(struct closure *cl)
+{
+	closure_sub(cl, 1);
+}
+
 static inline unsigned closure_nr_remaining(struct closure *cl)
 {
 	return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK;
@@ -187,11 +194,7 @@ static inline unsigned closure_nr_remaining(struct closure *cl)
  */
 static inline void closure_sync(struct closure *cl)
 {
-#ifdef CONFIG_DEBUG_CLOSURES
-	BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
-#endif
-
-	if (cl->closure_get_happened)
+	if (closure_nr_remaining(cl) > 1)
 		__closure_sync(cl);
 }
 
@@ -199,10 +202,7 @@ int __closure_sync_timeout(struct closure *cl, unsigned long timeout);
 
 static inline int closure_sync_timeout(struct closure *cl, unsigned long timeout)
 {
-#ifdef CONFIG_DEBUG_CLOSURES
-	BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
-#endif
-	return cl->closure_get_happened
+	return closure_nr_remaining(cl) > 1
 		? __closure_sync_timeout(cl, timeout)
 		: 0;
 }
@@ -275,8 +275,6 @@ static inline void closure_queue(struct closure *cl)
  */
 static inline void closure_get(struct closure *cl)
 {
-	cl->closure_get_happened = true;
-
 #ifdef CONFIG_DEBUG_CLOSURES
 	BUG_ON((atomic_inc_return(&cl->remaining) &
 		CLOSURE_REMAINING_MASK) <= 1);
@@ -314,7 +312,6 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
 		closure_get(parent);
 
 	atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
-	cl->closure_get_happened = false;
 
 	closure_debug_create(cl);
 	closure_set_ip(cl);
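With `closure_get_happened` gone, `closure_sync()` keys off the reference count alone: if only the initial reference remains, no other context can call `closure_put()`, so there is nothing to wait for. A standalone sketch of that invariant, assuming only the mask layout shown above (not the real closure API):

```c
#include <stdatomic.h>
#include <stdbool.h>

#define CLOSURE_BITS_START	(1U << 24)
#define CLOSURE_REMAINING_MASK	(CLOSURE_BITS_START - 1)

struct closure_sketch {
	atomic_uint remaining;	/* low 24 bits: refcount; high bits: state flags */
};

static unsigned closure_nr_remaining(struct closure_sketch *cl)
{
	return atomic_load(&cl->remaining) & CLOSURE_REMAINING_MASK;
}

/*
 * The refcount itself now carries the information the old
 * closure_get_happened flag tracked: count == 1 means only the
 * initial reference is held, so sync has no work to do.
 */
static bool closure_sync_would_wait(struct closure_sketch *cl)
{
	return closure_nr_remaining(cl) > 1;
}
```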
diff --git a/libbcachefs/Makefile b/libbcachefs/Makefile
index 93c8ee54..bb2a80fb 100644
--- a/libbcachefs/Makefile
+++ b/libbcachefs/Makefile
@@ -41,7 +41,6 @@ bcachefs-y :=		\
 	extents.o	\
 	extent_update.o	\
 	eytzinger.o	\
-	fast_list.o	\
 	fs.o		\
 	fs-ioctl.o	\
 	fs-io.o		\
@@ -99,6 +98,7 @@ bcachefs-y :=		\
 	varint.o	\
 	xattr.o
 
+bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += fast_list.o
 bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o
 
 obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index 02aef668..ae6d0aa8 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -306,7 +306,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
 void __bch2_wait_on_allocator(struct bch_fs *, struct closure *);
 static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
 {
-	if (cl->closure_get_happened)
+	if (closure_nr_remaining(cl) > 1)
 		__bch2_wait_on_allocator(c, cl);
 }
 
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index c662eeba..3193dbcf 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -432,6 +432,10 @@ fsck_err:
 /* verify that every backpointer has a corresponding alloc key */
 int bch2_check_btree_backpointers(struct bch_fs *c)
 {
+	struct progress_indicator_state progress;
+
+	bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_backpointers));
+
 	struct bkey_buf last_flushed;
 	bch2_bkey_buf_init(&last_flushed);
 	bkey_init(&last_flushed.k->k);
@@ -439,8 +443,10 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
 	CLASS(btree_trans, trans)(c);
 	int ret = for_each_btree_key_commit(trans, iter,
			BTREE_ID_backpointers, POS_MIN, 0, k,
-			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-		bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed));
+			NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+				progress_update_iter(trans, &progress, &iter);
+				bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed);
+			}));
 
 	bch2_bkey_buf_exit(&last_flushed, c);
 	return ret;
@@ -815,7 +821,9 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 	struct progress_indicator_state progress;
 	int ret = 0;
 
-	bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
+	bch2_progress_init_inner(&progress, trans->c,
+				 btree_has_data_ptrs_mask,
+				 ~0ULL);
 
 	for (enum btree_id btree_id = 0;
	     btree_id < btree_id_nr_alive(c);
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 08393971..d29bd684 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -706,7 +706,8 @@ struct bch_sb_field_ext {
 	x(fast_device_removal,		BCH_VERSION(1, 27))	\
 	x(inode_has_case_insensitive,	BCH_VERSION(1, 28))	\
 	x(extent_snapshot_whiteouts,	BCH_VERSION(1, 29))	\
-	x(31bit_dirent_offset,		BCH_VERSION(1, 30))
+	x(31bit_dirent_offset,		BCH_VERSION(1, 30))	\
+	x(btree_node_accounting,	BCH_VERSION(1, 31))
 
 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
@@ -717,7 +718,7 @@ enum bcachefs_metadata_version {
 };
 
 static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_btree_node_accounting;
 
 #define bcachefs_metadata_version_current	(bcachefs_metadata_version_max - 1)
 
@@ -965,7 +966,8 @@ enum bch_sb_feature {
 	x(alloc_info,				0)	\
 	x(alloc_metadata,			1)	\
 	x(extents_above_btree_updates_done,	2)	\
-	x(bformat_overflow_done,		3)
+	x(bformat_overflow_done,		3)	\
+	x(no_stale_ptrs,			4)
 
 enum bch_sb_compat {
#define x(f, n) BCH_COMPAT_##f,
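The compat bits above are generated with the x-macro pattern visible at the end of the hunk (`#define x(f, n) BCH_COMPAT_##f,`). A self-contained sketch of how the new `no_stale_ptrs` entry becomes `BCH_COMPAT_no_stale_ptrs == 4` (the list macro name here is assumed for illustration):

```c
#include <stdio.h>

/* Trimmed copy of the list in the hunk above, for illustration only */
#define BCH_SB_COMPAT()					\
	x(alloc_info,				0)	\
	x(alloc_metadata,			1)	\
	x(extents_above_btree_updates_done,	2)	\
	x(bformat_overflow_done,		3)	\
	x(no_stale_ptrs,			4)

enum bch_sb_compat {
#define x(f, n) BCH_COMPAT_##f,
	BCH_SB_COMPAT()
#undef x
	BCH_COMPAT_NR,
};

int main(void)
{
	/* enum values are assigned sequentially, so the new entry
	 * lands at 4, matching its explicit bit number */
	printf("BCH_COMPAT_no_stale_ptrs = %d\n", BCH_COMPAT_no_stale_ptrs);
	return 0;
}
```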
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 75d73677..da1a1a21 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -344,15 +344,6 @@ void bch2_bkey_swab_val(struct bkey_s k)
 		ops->swab(k);
 }
 
-bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
-{
-	const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
-
-	return ops->key_normalize
-		? ops->key_normalize(c, k)
-		: false;
-}
-
 bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 {
 	const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type);
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index bf34111c..5adce4e9 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -26,7 +26,6 @@ struct bkey_ops {
 	void	(*val_to_text)(struct printbuf *, struct bch_fs *,
			       struct bkey_s_c);
 	void	(*swab)(struct bkey_s);
-	bool	(*key_normalize)(struct bch_fs *, struct bkey_s);
 	bool	(*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
 	int	(*trigger)(struct btree_trans *, enum btree_id, unsigned,
			   struct bkey_s_c, struct bkey_s,
@@ -66,8 +65,6 @@ void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
 
 void bch2_bkey_swab_val(struct bkey_s);
 
-bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
-
 static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
 {
 	return l->type == r->type &&
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 3b1d694d..59638d09 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -15,7 +15,6 @@
 
 #include <linux/prefetch.h>
 #include <linux/sched/mm.h>
-#include <linux/seq_buf.h>
 #include <linux/swap.h>
 
 const char * const bch2_btree_node_flags[] = {
@@ -566,19 +565,6 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
 	return btree_cache_can_free(list);
 }
 
-static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
-{
-	struct btree_cache_list *list = shrink->private_data;
-	struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
-
-	char *cbuf;
-	size_t buflen = seq_buf_get_buf(s, &cbuf);
-	struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
-
-	bch2_btree_cache_to_text(&out, bc);
-	seq_buf_commit(s, out.pos);
-}
-
 void bch2_fs_btree_cache_exit(struct bch_fs *c)
 {
 	struct btree_cache *bc = &c->btree_cache;
@@ -673,7 +659,6 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 	bc->live[0].shrink	= shrink;
 	shrink->count_objects	= bch2_btree_cache_count;
 	shrink->scan_objects	= bch2_btree_cache_scan;
-	shrink->to_text		= bch2_btree_cache_shrinker_to_text;
 	shrink->seeks		= 2;
 	shrink->private_data	= &bc->live[0];
 	shrinker_register(shrink);
@@ -684,7 +669,6 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 	bc->live[1].shrink	= shrink;
 	shrink->count_objects	= bch2_btree_cache_count;
 	shrink->scan_objects	= bch2_btree_cache_scan;
-	shrink->to_text		= bch2_btree_cache_shrinker_to_text;
 	shrink->seeks		= 8;
 	shrink->private_data	= &bc->live[1];
 	shrinker_register(shrink);
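For context on the shrinker fields these hunks touch (the non-upstream `to_text` hook is dropped; `count_objects`/`scan_objects` remain): a hypothetical registration sketch following the same sequence as `bch2_fs_btree_cache_init()`. The cache type and callbacks here are stand-ins, not bcachefs code:

```c
#include <linux/shrinker.h>

/* Hypothetical cache; only the registration sequence mirrors the hunks above. */
struct my_cache { struct shrinker *shrink; unsigned long nr; };

static unsigned long my_cache_count(struct shrinker *shrink,
				    struct shrink_control *sc)
{
	struct my_cache *c = shrink->private_data;

	return READ_ONCE(c->nr) ?: SHRINK_EMPTY;
}

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	/* free up to sc->nr_to_scan objects, return the number freed */
	return SHRINK_STOP;
}

static int my_cache_init(struct my_cache *c)
{
	struct shrinker *shrink = shrinker_alloc(0, "my-cache");
	if (!shrink)
		return -ENOMEM;

	c->shrink		= shrink;
	shrink->count_objects	= my_cache_count;
	shrink->scan_objects	= my_cache_scan;
	shrink->seeks		= 2;	/* relative cost of recreating an object */
	shrink->private_data	= c;
	shrinker_register(shrink);
	return 0;
}
```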
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 2338feb8..63dc0836 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -780,7 +780,7 @@ static int bch2_gc_btrees(struct bch_fs *c)
 	int ret = 0;
 
 	struct progress_indicator_state progress;
-	bch2_progress_init(&progress, c, ~0ULL);
+	bch2_progress_init_inner(&progress, c, ~0ULL, ~0ULL);
 
 	enum btree_id ids[BTREE_ID_NR];
 	for (unsigned i = 0; i < BTREE_ID_NR; i++)
@@ -1140,43 +1140,11 @@ static int gc_btree_gens_key(struct btree_trans *trans,
			     struct bkey_s_c k)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 
 	if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
 		return -EROFS;
 
-	bool too_stale = false;
-	scoped_guard(rcu) {
-		bkey_for_each_ptr(ptrs, ptr) {
-			struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-			if (!ca)
-				continue;
-
-			too_stale |= dev_ptr_stale(ca, ptr) > 16;
-		}
-
-		if (!too_stale)
-			bkey_for_each_ptr(ptrs, ptr) {
-				struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
-				if (!ca)
-					continue;
-
-				u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
-				if (gen_after(*gen, ptr->gen))
-					*gen = ptr->gen;
-			}
-	}
-
-	if (too_stale) {
-		struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
-		int ret = PTR_ERR_OR_ZERO(u);
-		if (ret)
-			return ret;
-
-		bch2_extent_normalize(c, bkey_i_to_s(u));
-	}
-
-	return 0;
+	return bch2_bkey_drop_stale_ptrs(trans, iter, k);
 }
 
 static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
@@ -1281,6 +1249,12 @@ int bch2_gc_gens(struct bch_fs *c)
 	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 	trace_and_count(c, gc_gens_end, c);
 
+	if (!(c->sb.compat & BIT_ULL(BCH_COMPAT_no_stale_ptrs))) {
+		guard(mutex)(&c->sb_lock);
+		c->disk_sb.sb->compat[0] |= cpu_to_le64(BIT_ULL(BCH_COMPAT_no_stale_ptrs));
+		bch2_write_super(c);
+	}
 err:
 	for_each_member_device(c, ca) {
 		kvfree(ca->oldest_gen);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index e3336ab2..4890cbc8 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -13,7 +13,6 @@
 #include "trace.h"
 
 #include <linux/sched/mm.h>
-#include <linux/seq_buf.h>
 
 static inline bool btree_uses_pcpu_readers(enum btree_id id)
 {
@@ -809,18 +808,6 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 {
 }
 
-static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
-{
-	struct bch_fs *c = shrink->private_data;
-	struct btree_key_cache *bc = &c->btree_key_cache;
-	char *cbuf;
-	size_t buflen = seq_buf_get_buf(s, &cbuf);
-	struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
-
-	bch2_btree_key_cache_to_text(&out, bc);
-	seq_buf_commit(s, out.pos);
-}
-
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 {
 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
@@ -845,7 +832,6 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 	bc->shrink		= shrink;
 	shrink->count_objects	= bch2_btree_key_cache_count;
 	shrink->scan_objects	= bch2_btree_key_cache_scan;
-	shrink->to_text		= bch2_btree_key_cache_shrinker_to_text;
 	shrink->batch		= 1 << 14;
 	shrink->seeks		= 0;
 	shrink->private_data	= c;
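The `guard(mutex)(&c->sb_lock)` pattern in the `bch2_gc_gens()` hunk comes from `<linux/cleanup.h>`: the lock is released automatically at scope exit, which is why no explicit unlock appears before `err:`. A small illustrative sketch (names here are hypothetical):

```c
#include <linux/cleanup.h>
#include <linux/mutex.h>
#include <linux/types.h>

static DEFINE_MUTEX(example_lock);
static u64 example_flags;

/* guard() pairs lock/unlock with the enclosing scope, so early
 * returns and goto targets outside the scope need no cleanup */
static void set_flag_bit(u64 bit)
{
	guard(mutex)(&example_lock);
	example_flags |= bit;
}	/* mutex_unlock() runs here automatically */
```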
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
index a4f8aac4..00477464 100644
--- a/libbcachefs/btree_locking.c
+++ b/libbcachefs/btree_locking.c
@@ -69,6 +69,7 @@ struct trans_waiting_for_lock {
 struct lock_graph {
 	struct trans_waiting_for_lock	g[8];
 	unsigned			nr;
+	bool				printed_chain;
 };
 
 static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
@@ -89,6 +90,10 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
 
 static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
 {
+	if (g->printed_chain || g->nr <= 1)
+		return;
+	g->printed_chain = true;
+
 	struct trans_waiting_for_lock *i;
 
 	for (i = g->g; i != g->g + g->nr; i++) {
@@ -124,6 +129,7 @@ static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
 		.node_want	= trans->locking,
 		.lock_want	= trans->locking_wait.lock_want,
 	};
+	g->printed_chain = false;
 }
 
 static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans)
@@ -265,8 +271,12 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
 	if (unlikely(g->nr == ARRAY_SIZE(g->g))) {
 		closure_put(&trans->ref);
 
-		if (orig_trans->lock_may_not_fail)
+		if (orig_trans->lock_may_not_fail) {
+			/* Other threads will have to rerun the cycle detector: */
+			for (struct trans_waiting_for_lock *i = g->g + 1; i < g->g + g->nr; i++)
+				wake_up_process(i->trans->locking_wait.task);
 			return 0;
+		}
 
 		lock_graph_pop_all(g);
 
@@ -398,7 +408,7 @@ next:
 		}
 	}
up:
-	if (g.nr > 1 && cycle)
+	if (cycle)
 		print_chain(cycle, &g);
 	lock_graph_up(&g);
 	goto next;
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 021f5cb7..7f08863f 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -462,6 +462,7 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
 	size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
 	CLASS(printbuf, buf)();
 	bool inserting = sectors > 0;
+	int ret = 0;
 
 	BUG_ON(!sectors);
 
@@ -489,8 +490,17 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
				BCH_FSCK_ERR_ptr_too_stale);
 	}
 
-	if (b_gen != ptr->gen && ptr->cached)
+	if (b_gen != ptr->gen && ptr->cached) {
+		if (fsck_err_on(c->sb.compat & BIT_ULL(BCH_COMPAT_no_stale_ptrs),
+				trans, stale_ptr_with_no_stale_ptrs_feature,
+				"stale cached ptr, but have no_stale_ptrs feature\n%s",
+				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+			guard(mutex)(&c->sb_lock);
+			c->disk_sb.sb->compat[0] &= ~cpu_to_le64(BIT_ULL(BCH_COMPAT_no_stale_ptrs));
+			bch2_write_super(c);
+		}
 		return 1;
+	}
 
 	if (unlikely(b_gen != ptr->gen)) {
 		bch2_log_msg_start(c, &buf);
@@ -530,7 +540,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
 	}
 
 	*bucket_sectors += sectors;
-	return 0;
+fsck_err:
+	return ret;
 }
 
 void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
@@ -749,6 +760,7 @@ static int __trigger_extent(struct btree_trans *trans,
			    enum btree_iter_update_trigger_flags flags)
 {
 	bool gc = flags & BTREE_TRIGGER_gc;
+	bool insert = !(flags & BTREE_TRIGGER_overwrite);
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
@@ -802,7 +814,7 @@ static int __trigger_extent(struct btree_trans *trans,
 
 		if (cur_compression_type &&
 		    cur_compression_type != p.crc.compression_type) {
-			if (flags & BTREE_TRIGGER_overwrite)
+			if (!insert)
				bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
 
			ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
@@ -835,7 +847,7 @@ static int __trigger_extent(struct btree_trans *trans,
 	}
 
 	if (cur_compression_type) {
-		if (flags & BTREE_TRIGGER_overwrite)
+		if (!insert)
			bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
 
		ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
@@ -845,12 +857,17 @@ static int __trigger_extent(struct btree_trans *trans,
 	}
 
 	if (level) {
-		ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id);
+		const bool leaf_node = level == 1;
+		s64 v[3] = {
+			replicas_sectors,
+			insert ? 1 : -1,
+			!leaf_node ? (insert ? 1 : -1) : 0,
+		};
+
+		ret = bch2_disk_accounting_mod2(trans, gc, v, btree, btree_id);
 		if (ret)
 			return ret;
 	} else {
-		bool insert = !(flags & BTREE_TRIGGER_overwrite);
-
 		s64 v[3] = {
 			insert ? 1 : -1,
 			insert ? k.k->size : -((s64) k.k->size),
@@ -869,7 +886,6 @@ int bch2_trigger_extent(struct btree_trans *trans,
			struct bkey_s_c old, struct bkey_s new,
			enum btree_iter_update_trigger_flags flags)
 {
-	struct bch_fs *c = trans->c;
 	struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
 	struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
 	unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
@@ -900,30 +916,9 @@ int bch2_trigger_extent(struct btree_trans *trans,
			return ret;
 	}
 
-	int need_rebalance_delta = 0;
-	s64 need_rebalance_sectors_delta[1] = { 0 };
-
-	s64 s = bch2_bkey_sectors_need_rebalance(c, old);
-	need_rebalance_delta -= s != 0;
-	need_rebalance_sectors_delta[0] -= s;
-
-	s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
-	need_rebalance_delta += s != 0;
-	need_rebalance_sectors_delta[0] += s;
-
-	if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
-		int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
-						      new.k->p, need_rebalance_delta > 0);
-		if (ret)
-			return ret;
-	}
-
-	if (need_rebalance_sectors_delta[0]) {
-		int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
-						    need_rebalance_sectors_delta, rebalance_work);
-		if (ret)
-			return ret;
-	}
+	int ret = bch2_trigger_extent_rebalance(trans, old, new.s_c, flags);
+	if (ret)
+		return ret;
 
 	return 0;
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 7a0da6cd..ca925c5d 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -393,7 +393,7 @@ restart_drop_extra_replicas:
 	bch2_extent_ptr_decoded_append(insert, &p);
 
 	bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
-	bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));
+	bch2_bkey_drop_extra_cached_ptrs(c, &m->op.opts, bkey_i_to_s(insert));
 
 	ret =   bch2_sum_sector_overwrites(trans, &iter, insert,
					   &should_check_enospc,
@@ -721,7 +721,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
	 * will do the appropriate thing with it (turning it into a
	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
	 */
-	bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));
+	bch2_bkey_drop_extra_cached_ptrs(c, io_opts, bkey_i_to_s(n));
 
	/*
	 * Since we're not inserting through an extent iterator
diff --git a/libbcachefs/disk_accounting_format.h b/libbcachefs/disk_accounting_format.h
index 8269af1d..730a17ea 100644
--- a/libbcachefs/disk_accounting_format.h
+++ b/libbcachefs/disk_accounting_format.h
@@ -108,7 +108,7 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
 	x(dev_data_type,	3,	3)	\
 	x(compression,		4,	3)	\
 	x(snapshot,		5,	1)	\
-	x(btree,		6,	1)	\
+	x(btree,		6,	3)	\
 	x(rebalance_work,	7,	1)	\
 	x(inum,			8,	3)
 
@@ -174,6 +174,14 @@ struct bch_acct_snapshot {
 	__u32			id;
 } __packed;
 
+/*
+ * Metadata accounting per btree id:
+ * [
+ *   total btree disk usage in sectors
+ *   total number of btree nodes
+ *   number of non-leaf btree nodes
+ * ]
+ */
 struct bch_acct_btree {
 	__u32			id;
 } __packed;
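The widened `btree` accounting key (1 → 3 u64 counters) pairs with the `s64 v[3]` built in the `__trigger_extent()` btree-node branch above. A standalone sketch of that delta vector, with stand-in types:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef int64_t s64;

/*
 * Illustrative only: the 3-counter delta the btree-node branch emits,
 * matching the bch_acct_btree layout:
 *   [0] disk sectors, [1] total nodes, [2] non-leaf (inner) nodes
 */
static void btree_node_acct_delta(s64 v[3], s64 replicas_sectors,
				  unsigned level, bool insert)
{
	/* a key at level 1 points to a leaf (level 0) node */
	bool leaf_node = level == 1;

	v[0] = replicas_sectors;
	v[1] = insert ? 1 : -1;
	v[2] = !leaf_node ? (insert ? 1 : -1) : 0;
}

int main(void)
{
	s64 v[3];

	btree_node_acct_delta(v, 512, 2, true); /* inserting an inner node's key */
	printf("%lld %lld %lld\n", (long long)v[0], (long long)v[1], (long long)v[2]);
	/* -> 512 1 1 */
	return 0;
}
```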
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 271e2521..89a95b6c 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -22,6 +22,7 @@
 #include "io_write.h"
 #include "keylist.h"
 #include "lru.h"
+#include "rebalance.h"
 #include "recovery.h"
 #include "replicas.h"
 #include "super-io.h"
@@ -1129,7 +1130,13 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
			       (union bch_extent_entry *) ec_ptr,
			       (union bch_extent_entry *) &stripe_ptr);
 
-	ret = bch2_trans_update(trans, &iter, n, 0);
+	struct bch_inode_opts opts;
+
+	ret =   bch2_extent_get_io_opts_one(trans, &opts, &iter, bkey_i_to_s_c(n),
+					    SET_NEEDS_REBALANCE_other) ?:
+		bch2_bkey_set_needs_rebalance(trans->c, &opts, n,
+					      SET_NEEDS_REBALANCE_other, 0) ?:
+		bch2_trans_update(trans, &iter, n, 0);
out:
 	bch2_trans_iter_exit(&iter);
 	return ret;
@@ -1144,8 +1151,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
 	int ret = 0;
 
 	CLASS(bch2_dev_tryget, ca)(c, ptr.dev);
-	if (!ca)
-		return bch_err_throw(c, ENOENT_dev_not_found);
+	if (!ca) /* BCH_SB_MEMBER_INVALID */
+		return 0;
 
 	struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 86aa93ea..3274ba42 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -12,6 +12,7 @@
 #include "btree_gc.h"
 #include "btree_io.h"
 #include "btree_iter.h"
+#include "btree_update.h"
 #include "buckets.h"
 #include "checksum.h"
 #include "compress.h"
@@ -1213,6 +1214,21 @@ drop:
 	bch2_bkey_drop_ptr_noerror(k, ptr);
 }
 
+static bool bch2_bkey_has_stale_ptrs(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	struct bch_dev *ca;
+
+	guard(rcu)();
+	bkey_for_each_ptr(ptrs, ptr)
+		if (ptr->cached &&
+		    (ca = bch2_dev_rcu_noerror(c, ptr->dev)) &&
+		    dev_ptr_stale_rcu(ca, ptr) > 0)
+			return true;
+
+	return false;
+}
+
 /*
  * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
  *
@@ -1221,7 +1237,7 @@ drop:
  * For existing keys, only called when btree nodes are being rewritten, not when
  * they're merely being compacted/resorted in memory.
  */
-bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
+static void __bch2_bkey_drop_stale_ptrs(struct bch_fs *c, struct bkey_s k)
 {
 	struct bch_dev *ca;
 
@@ -1230,19 +1246,26 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
		ptr->cached &&
		(!(ca = bch2_dev_rcu_noerror(c, ptr->dev)) ||
		 dev_ptr_stale_rcu(ca, ptr) > 0));
+}
+
+int bch2_bkey_drop_stale_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
+{
+	if (bch2_bkey_has_stale_ptrs(trans->c, k)) {
+		struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k,
+						      BTREE_UPDATE_internal_snapshot_node);
+		int ret = PTR_ERR_OR_ZERO(u);
+		if (ret)
+			return ret;
+
+		__bch2_bkey_drop_stale_ptrs(trans->c, bkey_i_to_s(u));
+	}
 
-	return bkey_deleted(k.k);
+	return 0;
 }
 
-/*
- * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
- *
- * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
- * the promote target.
- */
-bool bch2_extent_normalize_by_opts(struct bch_fs *c,
-				   struct bch_inode_opts *opts,
-				   struct bkey_s k)
+void bch2_bkey_drop_extra_cached_ptrs(struct bch_fs *c,
+				      struct bch_inode_opts *opts,
+				      struct bkey_s k)
 {
 	struct bkey_ptrs ptrs;
 	bool have_cached_ptr;
@@ -1260,8 +1283,6 @@ restart_drop_ptrs:
 		}
 		have_cached_ptr = true;
 	}
-
-	return bkey_deleted(k.k);
 }
 
 void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
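`bch2_bkey_drop_stale_ptrs()` above scans read-only first (under RCU) and only allocates a mutable copy via `bch2_bkey_make_mut()` when a stale pointer is actually present. The same check-before-copy shape in a self-contained sketch; the types and the staleness predicate are stand-ins, and error handling is elided:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

struct entry { int gen; bool cached; };

static bool is_stale(const struct entry *e) { return e->cached && e->gen < 0; }

static bool has_stale(const struct entry *v, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (is_stale(&v[i]))
			return true;
	return false;
}

/* Returns a filtered copy only when needed; NULL means "no change". */
static struct entry *drop_stale(const struct entry *v, size_t *n)
{
	if (!has_stale(v, *n))
		return NULL;	/* common case: no allocation, no write */

	struct entry *u = calloc(*n, sizeof(*u));
	size_t out = 0;

	for (size_t i = 0; i < *n; i++)
		if (!is_stale(&v[i]))
			u[out++] = v[i];
	*n = out;
	return u;
}
```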
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 03ea7c68..1ea9752b 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -440,7 +440,6 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
 	.key_validate	= bch2_bkey_ptrs_validate,	\
 	.val_to_text	= bch2_bkey_ptrs_to_text,	\
 	.swab		= bch2_ptr_swab,		\
-	.key_normalize	= bch2_extent_normalize,	\
 	.key_merge	= bch2_extent_merge,		\
 	.trigger	= bch2_trigger_extent,		\
 })
@@ -689,8 +688,8 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
 void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_inode_opts *,
				struct bkey_s, struct bch_extent_ptr *);
 
-bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_inode_opts *, struct bkey_s);
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+int bch2_bkey_drop_stale_ptrs(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
+void bch2_bkey_drop_extra_cached_ptrs(struct bch_fs *, struct bch_inode_opts *, struct bkey_s);
 
 void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
 void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c
index a104b9d7..d5340973 100644
--- a/libbcachefs/fs-io-direct.c
+++ b/libbcachefs/fs-io-direct.c
@@ -117,7 +117,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 	} else {
 		atomic_set(&dio->cl.remaining,
			   CLOSURE_REMAINING_INITIALIZER + 1);
-		dio->cl.closure_get_happened = true;
 	}
 
 	dio->req	= req;
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index ccc44b1f..3bde5c07 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -1963,7 +1963,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
 		}
 	}
 
-	ret = check_extent_overbig(trans, iter, k);
+	ret =   check_extent_overbig(trans, iter, k) ?:
+		bch2_bkey_drop_stale_ptrs(trans, iter, k);
 	if (ret)
 		goto err;
 
@@ -2040,7 +2041,8 @@ int bch2_check_indirect_extents(struct bch_fs *c)
			BCH_TRANS_COMMIT_no_enospc, ({
 		progress_update_iter(trans, &progress, &iter);
 		bch2_disk_reservation_put(c, &res);
-		check_extent_overbig(trans, &iter, k);
+		check_extent_overbig(trans, &iter, k) ?:
+		bch2_bkey_drop_stale_ptrs(trans, &iter, k);
 	}));
 
 	bch2_disk_reservation_put(c, &res);
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
index e7ba0d0b..3765aa52 100644
--- a/libbcachefs/io_read.c
+++ b/libbcachefs/io_read.c
@@ -37,6 +37,12 @@ module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
 MODULE_PARM_DESC(read_corrupt_ratio, "");
 #endif
 
+static bool bch2_poison_extents_on_checksum_error;
+module_param_named(poison_extents_on_checksum_error,
+		   bch2_poison_extents_on_checksum_error, bool, 0644);
+MODULE_PARM_DESC(poison_extents_on_checksum_error,
+		 "Extents with checksum errors are marked as poisoned - unsafe without read fua support");
+
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
 
 static inline u32 bch2_dev_congested_read(struct bch_dev *ca, u64 now)
@@ -539,6 +545,9 @@ static void get_rbio_extent(struct btree_trans *trans,
 static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
					enum btree_id btree, struct bkey_s_c read_k)
 {
+	if (!bch2_poison_extents_on_checksum_error)
+		return 0;
+
 	struct bch_fs *c = trans->c;
 
 	struct data_update *u = rbio_data_update(rbio);
@@ -1274,10 +1283,6 @@ retry_pick:
 
 	async_object_list_add(c, rbio, rbio, &rbio->list_idx);
 
-	/* XXX: also nvme read recovery level */
-	if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev)))
-		rbio->bio.bi_opf |= REQ_FUA;
-
 	if (rbio->bounce)
 		trace_and_count(c, io_read_bounce, &rbio->bio);
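The fsck.c and ec.c hunks chain steps with GNU C's `a ?: b` extension, which evaluates to `a` unless `a` is zero: chaining int-returning calls runs them in order and stops at the first error. A compilable illustration (the step functions are made up):

```c
#include <stdio.h>

/* Stand-in steps; each returns 0 on success or a negative errno. */
static int step_one(void)   { return 0; }
static int step_two(void)   { return -22; }
static int step_three(void) { return 0; }

int main(void)
{
	/*
	 * Same idiom as "check_extent_overbig(...) ?:
	 * bch2_bkey_drop_stale_ptrs(...)": later steps only run
	 * while earlier ones keep returning 0.
	 */
	int ret = step_one() ?:
		  step_two() ?:
		  step_three();	/* never runs: step_two() failed */

	printf("ret = %d\n", ret);	/* -22 */
	return 0;
}
```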
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 8a3981e1..92edff50 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -84,13 +84,6 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
 		return ret;
 
	/*
-	 * If the new extent no longer has any pointers, bch2_extent_normalize()
-	 * will do the appropriate thing with it (turning it into a
-	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
-	 */
-	bch2_extent_normalize(c, bkey_i_to_s(n));
-
-	/*
	 * Since we're not inserting through an extent iterator
	 * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
	 * we aren't using the extent overwrite path to delete, we're
@@ -273,10 +266,15 @@ int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags,
		       struct printbuf *err)
 {
 	struct progress_indicator_state progress;
+	int ret;
+
 	bch2_progress_init(&progress, c,
-			   BIT_ULL(BTREE_ID_extents)|
-			   BIT_ULL(BTREE_ID_reflink));
+			   btree_has_data_ptrs_mask & ~BIT_ULL(BTREE_ID_stripes));
+
+	if ((ret = bch2_dev_usrdata_drop(c, &progress, dev_idx, flags, err)))
+		return ret;
+
+	bch2_progress_init_inner(&progress, c, 0, ~0ULL);
 
-	return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags, err) ?:
-		bch2_dev_metadata_drop(c, &progress, dev_idx, flags, err);
+	return bch2_dev_metadata_drop(c, &progress, dev_idx, flags, err);
 }
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 122bc98e..bd5faafc 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -518,7 +518,8 @@ void bch2_opts_to_text(struct printbuf *out,
 	}
 }
 
-int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, u64 v)
+int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, u64 v,
+			  bool change)
 {
 	int ret = 0;
 
@@ -542,13 +543,26 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum b
 		break;
 	}
 
+	if (change &&
+	    (id == Opt_foreground_target ||
+	     id == Opt_background_target ||
+	     id == Opt_promote_target ||
+	     id == Opt_compression ||
+	     id == Opt_background_compression ||
+	     id == Opt_data_checksum ||
+	     id == Opt_data_replicas)) {
+		ret = bch2_set_rebalance_needs_scan(c, inum);
+		if (ret)
+			return ret;
+	}
+
 	return ret;
 }
 
 int bch2_opts_hooks_pre_set(struct bch_fs *c)
 {
 	for (unsigned i = 0; i < bch2_opts_nr; i++) {
-		int ret = bch2_opt_hook_pre_set(c, NULL, 0, i, bch2_opt_get_by_id(&c->opts, i));
+		int ret = bch2_opt_hook_pre_set(c, NULL, 0, i, bch2_opt_get_by_id(&c->opts, i), false);
 		if (ret)
 			return ret;
 	}
@@ -559,14 +573,18 @@ int bch2_opts_hooks_pre_set(struct bch_fs *c)
 void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
			    enum bch_opt_id id, u64 v)
 {
-	switch (id) {
-	case Opt_foreground_target:
-	case Opt_compression:
-	case Opt_background_target:
-	case Opt_background_compression:
+	if (id == Opt_foreground_target ||
+	    id == Opt_background_target ||
+	    id == Opt_promote_target ||
+	    id == Opt_compression ||
+	    id == Opt_background_compression ||
+	    id == Opt_data_checksum ||
+	    id == Opt_data_replicas) {
 		bch2_set_rebalance_needs_scan(c, inum);
 		bch2_rebalance_wakeup(c);
-		break;
+	}
+
+	switch (id) {
 	case Opt_rebalance_enabled:
 		bch2_rebalance_wakeup(c);
 		break;
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 22cf109f..6b9f1883 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -149,12 +149,12 @@ enum fsck_err_opts {
 	  BCH_SB_WRITE_ERROR_TIMEOUT,	30,				\
 	  NULL,		"Number of consecutive write errors allowed before kicking out a device")\
 	x(metadata_replicas,		u8,				\
-	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_FORMAT|OPT_RUNTIME,				\
 	  OPT_UINT(1, BCH_REPLICAS_MAX + 1),				\
 	  BCH_SB_META_REPLICAS_WANT,	1,				\
 	  "#",		"Number of metadata replicas")			\
 	x(data_replicas,		u8,				\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
 	  OPT_UINT(1, BCH_REPLICAS_MAX + 1),				\
 	  BCH_SB_DATA_REPLICAS_WANT,	1,				\
 	  "#",		"Number of data replicas")			\
@@ -175,12 +175,12 @@ enum fsck_err_opts {
 	  BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10,			\
 	  "size",	"Maximum size of checksummed/compressed extents")\
 	x(metadata_checksum,		u8,				\
-	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_FORMAT|OPT_RUNTIME,				\
 	  OPT_STR(__bch2_csum_opts),					\
 	  BCH_SB_META_CSUM_TYPE,	BCH_CSUM_OPT_crc32c,		\
 	  NULL,		NULL)						\
 	x(data_checksum,		u8,				\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
 	  OPT_STR(__bch2_csum_opts),					\
 	  BCH_SB_DATA_CSUM_TYPE,	BCH_CSUM_OPT_crc32c,		\
 	  NULL,		NULL)						\
@@ -190,12 +190,12 @@ enum fsck_err_opts {
 	  BCH_SB_CSUM_ERR_RETRY_NR,	3,				\
 	  NULL,		NULL)						\
 	x(compression,			u8,				\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
 	  OPT_FN(bch2_opt_compression),					\
 	  BCH_SB_COMPRESSION_TYPE,	BCH_COMPRESSION_OPT_none,	\
 	  NULL,		NULL)						\
 	x(background_compression,	u8,				\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
 	  OPT_FN(bch2_opt_compression),					\
 	  BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none,	\
 	  NULL,		NULL)						\
@@ -205,27 +205,27 @@ enum fsck_err_opts {
 	  BCH_SB_STR_HASH_TYPE,		BCH_STR_HASH_OPT_siphash,	\
 	  NULL,		"Hash function for directory entries and xattrs")\
 	x(metadata_target,		u16,				\
-	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_FS|OPT_FORMAT|OPT_RUNTIME,				\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_METADATA_TARGET,	0,				\
 	  "(target)",	"Device or label for metadata writes")		\
 	x(foreground_target,		u16,				\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_FOREGROUND_TARGET,	0,				\
 	  "(target)",	"Device or label for foreground writes")	\
 	x(background_target,		u16,				\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_BACKGROUND_TARGET,	0,				\
 	  "(target)",	"Device or label to move data to in the background")\
 	x(promote_target,		u16,				\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
 	  OPT_FN(bch2_opt_target),					\
 	  BCH_SB_PROMOTE_TARGET,	0,				\
 	  "(target)",	"Device or label to promote data to on read")	\
 	x(erasure_code,			u16,				\
-	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,		\
+	  OPT_FS|OPT_INODE|OPT_FORMAT|OPT_RUNTIME,			\
 	  OPT_BOOL(),							\
 	  BCH_SB_ERASURE_CODE,		false,				\
 	  NULL,		"Enable erasure coding (DO NOT USE YET)")	\
@@ -658,7 +658,7 @@ void bch2_opts_to_text(struct printbuf *,
		       struct bch_fs *, struct bch_sb *,
		       unsigned, unsigned,
		       unsigned);
 
-int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64);
+int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64, bool);
 int bch2_opts_hooks_pre_set(struct bch_fs *);
 void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64);
diff --git a/libbcachefs/progress.c b/libbcachefs/progress.c
index 541ee951..7cc16490 100644
--- a/libbcachefs/progress.c
+++ b/libbcachefs/progress.c
@@ -4,14 +4,21 @@
 #include "disk_accounting.h"
 #include "progress.h"
 
-void bch2_progress_init(struct progress_indicator_state *s,
-			struct bch_fs *c,
-			u64 btree_id_mask)
+void bch2_progress_init_inner(struct progress_indicator_state *s,
+			      struct bch_fs *c,
+			      u64 leaf_btree_id_mask,
+			      u64 inner_btree_id_mask)
 {
 	memset(s, 0, sizeof(*s));
 
 	s->next_print = jiffies + HZ * 10;
 
+	/* This is only an estimation: nodes can have different replica counts */
+	const u32 expected_node_disk_sectors =
+		READ_ONCE(c->opts.metadata_replicas) * btree_sectors(c);
+
+	const u64 btree_id_mask = leaf_btree_id_mask | inner_btree_id_mask;
+
 	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
 		if (!(btree_id_mask & BIT_ULL(i)))
 			continue;
@@ -19,9 +26,29 @@ void bch2_progress_init(struct progress_indicator_state *s,
 		struct disk_accounting_pos acc;
 		disk_accounting_key_init(acc, btree, .id = i);
 
-		u64 v;
-		bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
-		s->nodes_total += div64_ul(v, btree_sectors(c));
+		struct {
+			u64 disk_sectors;
+			u64 total_nodes;
+			u64 inner_nodes;
+		} v = {0};
+		bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc),
+					 (u64 *)&v, sizeof(v) / sizeof(u64));
+
+		/* Better to estimate as 0 than the total node count */
+		if (inner_btree_id_mask & BIT_ULL(i))
+			s->nodes_total += v.inner_nodes;
+
+		if (!(leaf_btree_id_mask & BIT_ULL(i)))
+			continue;
+
+		/*
+		 * We check for zeros to degrade gracefully when run
+		 * with un-upgraded accounting info (missing some counters).
+		 */
+		if (v.total_nodes != 0)
+			s->nodes_total += v.total_nodes - v.inner_nodes;
+		else
+			s->nodes_total += div_u64(v.disk_sectors,
+						  expected_node_disk_sectors);
 	}
 }
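When run against un-upgraded accounting info, `total_nodes` reads as zero and the node count falls back to `disk_sectors / (metadata_replicas * btree_sectors)`. A worked example of that fallback with made-up numbers:

```c
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: the estimate above, with invented inputs. */
int main(void)
{
	uint64_t btree_sectors     = 512;     /* 256 KiB btree nodes */
	uint32_t metadata_replicas = 2;
	uint64_t disk_sectors      = 4194304; /* 2 GiB of replicated metadata */

	uint64_t expected_node_disk_sectors =
		(uint64_t)metadata_replicas * btree_sectors;

	/* disk_sectors counts all replicas, so dividing by
	 * replicas * node_size yields a logical node count */
	printf("~%llu nodes\n",
	       (unsigned long long)(disk_sectors / expected_node_disk_sectors));
	/* -> ~4096 nodes */
	return 0;
}
```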
diff --git a/libbcachefs/progress.h b/libbcachefs/progress.h
index 972a7308..91f34533 100644
--- a/libbcachefs/progress.h
+++ b/libbcachefs/progress.h
@@ -20,7 +20,17 @@ struct progress_indicator_state {
 	struct btree		*last_node;
 };
 
-void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
+void bch2_progress_init_inner(struct progress_indicator_state *s,
+			      struct bch_fs *c,
+			      u64 leaf_btree_id_mask,
+			      u64 inner_btree_id_mask);
+
+static inline void bch2_progress_init(struct progress_indicator_state *s,
+				      struct bch_fs *c, u64 btree_id_mask)
+{
+	bch2_progress_init_inner(s, c, btree_id_mask, 0);
+}
+
 void bch2_progress_update_iter(struct btree_trans *,
			       struct progress_indicator_state *,
			       struct btree_iter *,
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index fa73de78..67d6a90e 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -92,122 +92,140 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
 	}
 }
 
-static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
-						    struct bch_inode_opts *opts,
-						    struct bkey_s_c k,
-						    struct bkey_ptrs_c ptrs)
+int bch2_trigger_extent_rebalance(struct btree_trans *trans,
+				  struct bkey_s_c old, struct bkey_s_c new,
+				  enum btree_iter_update_trigger_flags flags)
 {
-	if (!opts->background_compression)
-		return 0;
+	struct bch_fs *c = trans->c;
+	int need_rebalance_delta = 0;
+	s64 need_rebalance_sectors_delta[1] = { 0 };
 
-	unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
-	const union bch_extent_entry *entry;
-	struct extent_ptr_decoded p;
-	unsigned ptr_bit = 1;
-	unsigned rewrite_ptrs = 0;
+	s64 s = bch2_bkey_sectors_need_rebalance(c, old);
+	need_rebalance_delta -= s != 0;
+	need_rebalance_sectors_delta[0] -= s;
 
-	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
-		    p.ptr.unwritten)
-			return 0;
+	s = bch2_bkey_sectors_need_rebalance(c, new);
+	need_rebalance_delta += s != 0;
+	need_rebalance_sectors_delta[0] += s;
 
-		if (!p.ptr.cached && p.crc.compression_type != compression_type)
-			rewrite_ptrs |= ptr_bit;
-		ptr_bit <<= 1;
+	if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
+		int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
						      new.k->p, need_rebalance_delta > 0);
+		if (ret)
+			return ret;
 	}
 
-	return rewrite_ptrs;
-}
-
-static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
-						struct bch_inode_opts *opts,
-						struct bkey_ptrs_c ptrs)
-{
-	if (!opts->background_target ||
-	    !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
-		return 0;
-
-	unsigned ptr_bit = 1;
-	unsigned rewrite_ptrs = 0;
-
-	guard(rcu)();
-	bkey_for_each_ptr(ptrs, ptr) {
-		if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
-			rewrite_ptrs |= ptr_bit;
-		ptr_bit <<= 1;
+	if (need_rebalance_sectors_delta[0]) {
+		int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
						    need_rebalance_sectors_delta, rebalance_work);
+		if (ret)
+			return ret;
 	}
 
-	return rewrite_ptrs;
+	return 0;
 }
 
-static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
-					      struct bch_inode_opts *opts,
-					      struct bkey_s_c k)
+static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
+				      struct bch_inode_opts *io_opts,
+				      unsigned *move_ptrs,
+				      unsigned
+					      *compress_ptrs,
+				      u64 *sectors)
 {
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	*move_ptrs	= 0;
+	*compress_ptrs	= 0;
+	*sectors	= 0;
 
-	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
-		return 0;
-
-	return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
-		bch2_bkey_ptrs_need_move(c, opts, ptrs);
-}
-
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
-{
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
-	if (!opts)
-		return 0;
+	const struct bch_extent_rebalance *rb_opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
+	if (!io_opts && !rb_opts)
+		return;
 
 	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
-		return 0;
+		return;
+
+	unsigned compression_type =
+		bch2_compression_opt_to_type(io_opts
					     ? io_opts->background_compression
					     : rb_opts->background_compression);
+	unsigned target = io_opts
		? io_opts->background_target
		: rb_opts->background_target;
+	if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target))
+		target = 0;
 
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
-	u64 sectors = 0;
+	bool incompressible = false, unwritten = false;
+
+	unsigned ptr_idx = 1;
 
-	if (opts->background_compression) {
-		unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
+	guard(rcu)();
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		incompressible	|= p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible;
+		unwritten	|= p.ptr.unwritten;
 
-		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
-			    p.ptr.unwritten) {
-				sectors = 0;
-				goto incompressible;
-			}
+		if (!p.ptr.cached) {
+			if (p.crc.compression_type != compression_type)
+				*compress_ptrs |= ptr_idx;
 
-			if (!p.ptr.cached && p.crc.compression_type != compression_type)
-				sectors += p.crc.compressed_size;
+			if (target && !bch2_dev_in_target(c, p.ptr.dev, target))
+				*move_ptrs |= ptr_idx;
 		}
+
+		ptr_idx <<= 1;
 	}
-incompressible:
-	if (opts->background_target) {
-		guard(rcu)();
-		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-			if (!p.ptr.cached &&
-			    !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
-				sectors += p.crc.compressed_size;
+
+	if (unwritten)
+		*compress_ptrs = 0;
+	if (incompressible)
+		*compress_ptrs = 0;
+
+	unsigned rb_ptrs = *move_ptrs | *compress_ptrs;
+
+	if (!rb_ptrs)
+		return;
+
+	ptr_idx = 1;
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		if (rb_ptrs & ptr_idx)
+			*sectors += p.crc.compressed_size;
+		ptr_idx <<= 1;
 	}
+}
+
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+	unsigned move_ptrs	= 0;
+	unsigned compress_ptrs	= 0;
+	u64 sectors		= 0;
 
+	bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, &sectors);
 	return sectors;
 }
 
-static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_inode_opts *opts,
-					     struct bkey_s_c k)
+static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
					      struct bch_inode_opts *opts,
					      struct bkey_s_c k)
 {
-	if (!bkey_extent_is_direct_data(k.k))
-		return 0;
+	unsigned move_ptrs	= 0;
+	unsigned compress_ptrs	= 0;
+	u64 sectors		= 0;
 
-	const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
+	bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
+	return move_ptrs|compress_ptrs;
+}
 
-	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
-		struct bch_extent_rebalance new =
-			io_opts_to_rebalance_opts(c, opts);
-		return old == NULL || memcmp(old, &new, sizeof(new));
-	} else {
-		return old != NULL;
+static inline bool bkey_should_have_rb_opts(struct bch_fs *c,
+					    struct bch_inode_opts *opts,
+					    struct bkey_s_c k)
+{
+	if (k.k->type == KEY_TYPE_reflink_v) {
+#define x(n) if (opts->n##_from_inode) return true;
+		BCH_REBALANCE_OPTS()
+#undef x
 	}
+
+	return bch2_bkey_ptrs_need_rebalance(c, opts, k);
 }
 
 int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts,
@@ -222,7 +240,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts,
 	struct bch_extent_rebalance *old =
		(struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
 
-	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
+	if (bkey_should_have_rb_opts(c, opts, k.s_c)) {
 		if (!old) {
 			old = bkey_val_end(k);
 			k.k->u64s += sizeof(*old) / sizeof(u64);
@@ -243,22 +261,40 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
					  struct bkey_s_c k,
					  enum set_needs_rebalance_ctx ctx)
 {
+	struct bch_fs *c = trans->c;
+
 	BUG_ON(iter->flags & BTREE_ITER_is_extents);
 	BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
 
-	const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
-		? bch2_bkey_rebalance_opts(k) : NULL;
-	if (r) {
-#define x(_name)							\
-		if (r->_name##_from_inode) {				\
-			io_opts->_name = r->_name;			\
-			io_opts->_name##_from_inode = true;		\
+	if (!bkey_extent_is_direct_data(k.k))
+		return 0;
+
+	bool may_update_indirect = ctx == SET_NEEDS_REBALANCE_opt_change_indirect;
+
+	/*
+	 * If it's an indirect extent, and we walked to it directly, we won't
+	 * have the options from the inode that were directly applied: options
+	 * from the extent take precedence - unless the io_opts option came from
+	 * the inode and may_update_indirect is true (walked from a
+	 * REFLINK_P_MAY_UPDATE_OPTIONS pointer).
+	 */
+	const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
+	if (old && k.k->type == KEY_TYPE_reflink_v) {
+#define x(_name)							\
+		if (old->_name##_from_inode &&				\
+		    !(may_update_indirect && io_opts->_name##_from_inode)) { \
+			io_opts->_name = old->_name;			\
+			io_opts->_name##_from_inode = true;		\
 		}
 		BCH_REBALANCE_OPTS()
#undef x
 	}
 
-	if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
+	struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts);
+
+	if (bkey_should_have_rb_opts(c, io_opts, k)
+	    ?
+	      old && !memcmp(old, &new, sizeof(new))
+	    : !old)
 		return 0;
 
 	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
@@ -270,10 +306,10 @@ static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
 
	/* On successfull transaction commit, @k was invalidated: */
 
-	return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n, ctx, 0) ?:
+	return bch2_bkey_set_needs_rebalance(c, io_opts, n, ctx, 0) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, 0) ?:
-		bch_err_throw(trans->c, transaction_restart_commit);
+		bch_err_throw(c, transaction_restart_commit);
 }
 
 static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans,
@@ -569,23 +605,25 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
 	bch2_bkey_val_to_text(&buf, c, k);
 	prt_newline(&buf);
 
-	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	unsigned move_ptrs	= 0;
+	unsigned compress_ptrs	= 0;
+	u64 sectors		= 0;
 
-	unsigned p = bch2_bkey_ptrs_need_compress(c, opts, k, ptrs);
-	if (p) {
-		prt_str(&buf, "compression=");
-		bch2_compression_opt_to_text(&buf, opts->background_compression);
+	bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, &sectors);
+
+	if (move_ptrs) {
+		prt_str(&buf, "move=");
+		bch2_target_to_text(&buf, c, opts->background_target);
 		prt_str(&buf, " ");
-		bch2_prt_u64_base2(&buf, p);
+		bch2_prt_u64_base2(&buf, move_ptrs);
 		prt_newline(&buf);
 	}
 
-	p = bch2_bkey_ptrs_need_move(c, opts, ptrs);
-	if (p) {
-		prt_str(&buf, "move=");
-		bch2_target_to_text(&buf, c, opts->background_target);
+	if (compress_ptrs) {
+		prt_str(&buf, "compression=");
+		bch2_compression_opt_to_text(&buf, opts->background_compression);
 		prt_str(&buf, " ");
-		bch2_prt_u64_base2(&buf, p);
+		bch2_prt_u64_base2(&buf, compress_ptrs);
 		prt_newline(&buf);
 	}
 
@@ -659,7 +697,9 @@ static int do_rebalance_scan_indirect(struct btree_trans *trans,
 	u32 restart_count = trans->restart_count;
 
 	int ret = for_each_btree_key(trans, iter, BTREE_ID_reflink,
-				     POS(0, idx), BTREE_ITER_not_extents, k, ({
+				     POS(0, idx),
+				     BTREE_ITER_intent|
+				     BTREE_ITER_not_extents, k, ({
 		if (bpos_ge(bkey_start_pos(k.k), POS(0, end)))
 			break;
 		bch2_get_update_rebalance_opts(trans, opts, &iter, k,
@@ -696,10 +736,13 @@ static int do_rebalance_scan(struct moving_context *ctxt,
 	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
					 r->scan_start.pos, r->scan_end.pos,
+					 BTREE_ITER_intent|
					 BTREE_ITER_all_snapshots|
					 BTREE_ITER_prefetch, k, ({
 		ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
 
+		atomic64_add(k.k->size, &r->scan_stats.sectors_seen);
+
 		struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans,
					snapshot_io_opts, iter.pos, &iter, k,
					SET_NEEDS_REBALANCE_opt_change);
@@ -709,10 +752,31 @@ static int do_rebalance_scan(struct moving_context *ctxt,
			REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v) ?
			do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts) : 0);
-	})) ?:
-	commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-		  bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+	}));
+	if (ret)
+		goto out;
+
+	if (!inum) {
+		ret = for_each_btree_key_max(trans, iter, BTREE_ID_reflink,
					     POS_MIN, POS_MAX,
					     BTREE_ITER_all_snapshots|
					     BTREE_ITER_prefetch, k, ({
+			ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
+
+			atomic64_add(k.k->size, &r->scan_stats.sectors_seen);
+
+			struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans,
						snapshot_io_opts, iter.pos, &iter, k,
						SET_NEEDS_REBALANCE_opt_change);
+			PTR_ERR_OR_ZERO(opts);
+		}));
+		if (ret)
+			goto out;
+	}
+
+	ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+out:
 	*sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen);
	/*
	 * Ensure that the rebalance_work entries we created are seen by the
diff --git a/libbcachefs/rebalance.h b/libbcachefs/rebalance.h
index bff91aa0..24bafa42 100644
--- a/libbcachefs/rebalance.h
+++ b/libbcachefs/rebalance.h
@@ -29,6 +29,10 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f
 void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *,
				   const struct bch_extent_rebalance *);
 
+int bch2_trigger_extent_rebalance(struct btree_trans *,
+				  struct bkey_s_c, struct bkey_s_c,
+				  enum btree_iter_update_trigger_flags);
+
 u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
 
 enum set_needs_rebalance_ctx {
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 531c2ef1..6942d3cf 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -920,6 +920,13 @@ use_clean:
 	if (bch2_blacklist_entries_gc(c))
 		write_sb = true;
 
+	if (!(c->sb.compat & BIT_ULL(BCH_COMPAT_no_stale_ptrs)) &&
+	    (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_extents)) &&
+	    (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_indirect_extents))) {
+		c->disk_sb.sb->compat[0] |= cpu_to_le64(BIT_ULL(BCH_COMPAT_no_stale_ptrs));
+		write_sb = true;
+	}
+
 	if (write_sb)
 		bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
@@ -982,8 +989,9 @@ int bch2_fs_initialize(struct bch_fs *c)
 	set_bit(BCH_FS_new_fs, &c->flags);
 
 	scoped_guard(mutex, &c->sb_lock) {
-		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
-		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
+		c->disk_sb.sb->compat[0] |= cpu_to_le64(BIT_ULL(BCH_COMPAT_extents_above_btree_updates_done));
+		c->disk_sb.sb->compat[0] |= cpu_to_le64(BIT_ULL(BCH_COMPAT_bformat_overflow_done));
+		c->disk_sb.sb->compat[0] |= cpu_to_le64(BIT_ULL(BCH_COMPAT_no_stale_ptrs));
 
 		bch2_check_version_downgrade(c);
diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c
index de56a1ee..bfd06fd5 100644
--- a/libbcachefs/sb-downgrade.c
+++ b/libbcachefs/sb-downgrade.c
@@ -104,7 +104,10 @@
 	x(inode_has_case_insensitive,					\
 	  BIT_ULL(BCH_RECOVERY_PASS_check_inodes),			\
 	  BCH_FSCK_ERR_inode_has_case_insensitive_not_set,		\
-	  BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)
+	  BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)	\
+	x(btree_node_accounting,					\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),			\
+	  BCH_FSCK_ERR_accounting_mismatch)
 
 #define DOWNGRADE_TABLE()						\
 	x(bucket_stripe_sectors,					\
@@ -152,7 +155,11 @@
 	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),			\
 	  BCH_FSCK_ERR_accounting_mismatch,				\
 	  BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0,		\
-
	  BCH_FSCK_ERR_accounting_key_junk_at_end)
+	  BCH_FSCK_ERR_accounting_key_junk_at_end)			\
+	x(btree_node_accounting,					\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),			\
+	  BCH_FSCK_ERR_accounting_mismatch,				\
+	  BCH_FSCK_ERR_accounting_key_nr_counters_wrong)
 
 struct upgrade_downgrade_entry {
 	u64		recovery_passes;
diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h
index 728d8780..77e3fc92 100644
--- a/libbcachefs/sb-errors_format.h
+++ b/libbcachefs/sb-errors_format.h
@@ -170,9 +170,10 @@ enum bch_fsck_flags {
 	x(ptr_to_missing_replicas_entry,		149,	FSCK_AUTOFIX)	\
 	x(ptr_to_missing_stripe,			150,	0)		\
 	x(ptr_to_incorrect_stripe,			151,	0)		\
-	x(ptr_gen_newer_than_bucket_gen,		152,	FSCK_AUTOFIX)	\
+	x(ptr_gen_newer_than_bucket_gen,		152,	FSCK_AUTOFIX)	\
 	x(ptr_too_stale,				153,	0)		\
 	x(stale_dirty_ptr,				154,	FSCK_AUTOFIX)	\
+	x(stale_ptr_with_no_stale_ptrs_feature,		327,	FSCK_AUTOFIX)	\
 	x(ptr_bucket_data_type_mismatch,		155,	0)		\
 	x(ptr_cached_and_erasure_coded,			156,	0)		\
 	x(ptr_crc_uncompressed_size_too_small,		157,	0)		\
@@ -338,7 +339,7 @@ enum bch_fsck_flags {
 	x(dirent_stray_data_after_cf_name,		305,	0)		\
 	x(rebalance_work_incorrectly_set,		309,	FSCK_AUTOFIX)	\
 	x(rebalance_work_incorrectly_unset,		310,	FSCK_AUTOFIX)	\
-	x(MAX,						327,	0)
+	x(MAX,						328,	0)
 
 enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 473ad4b5..03b12c2d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -238,6 +238,7 @@ static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
 static void bch2_dev_io_ref_stop(struct bch_dev *, int);
 static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
 static int bch2_dev_attach_bdev(struct bch_fs *, struct bch_sb_handle *, struct printbuf *);
+static bool bch2_fs_will_resize_on_mount(struct bch_fs *);
 
 struct bch_fs *bch2_dev_to_fs(dev_t dev)
 {
@@ -964,6 +965,9 @@ static int bch2_fs_opt_version_init(struct bch_fs *c)
 	if (c->opts.journal_rewind)
 		c->opts.fsck = true;
 
+	bool may_upgrade_downgrade = !(c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) ||
+		bch2_fs_will_resize_on_mount(c);
+
 	CLASS(printbuf, p)();
 	bch2_log_msg_start(c, &p);
 
@@ -1040,22 +1044,24 @@ static int bch2_fs_opt_version_init(struct bch_fs *c)
 		prt_bitflags(&p, __bch2_btree_ids, btrees_lost_data);
 	}
 
-	if (bch2_check_version_downgrade(c)) {
-		prt_str(&p, "\nVersion downgrade required:");
+	if (may_upgrade_downgrade) {
+		if (bch2_check_version_downgrade(c)) {
+			prt_str(&p, "\nVersion downgrade required:");
 
-		__le64 passes = ext->recovery_passes_required[0];
-		bch2_sb_set_downgrade(c,
-				      BCH_VERSION_MINOR(bcachefs_metadata_version_current),
-				      BCH_VERSION_MINOR(c->sb.version));
-		passes = ext->recovery_passes_required[0] & ~passes;
-		if (passes) {
-			prt_str(&p, "\nrunning recovery passes: ");
-			prt_bitflags(&p, bch2_recovery_passes,
-				     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
+			__le64 passes = ext->recovery_passes_required[0];
+			bch2_sb_set_downgrade(c,
+					      BCH_VERSION_MINOR(bcachefs_metadata_version_current),
+					      BCH_VERSION_MINOR(c->sb.version));
+			passes = ext->recovery_passes_required[0] & ~passes;
+			if (passes) {
+				prt_str(&p, "\nrunning recovery passes: ");
+				prt_bitflags(&p, bch2_recovery_passes,
+					     bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
+			}
 		}
-	}
 
-	check_version_upgrade(c);
+		check_version_upgrade(c);
+	}
 
 	c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
 
@@ -1993,7 +1999,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 4c6e6c46..40adefe7 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -45,7 +45,6 @@
 
 #include <linux/blkdev.h>
 #include <linux/sort.h>
-#include <linux/string_choices.h>
 #include <linux/sched/clock.h>
 
 #include "util.h"
@@ -158,7 +157,6 @@ write_attribute(trigger_recalc_capacity);
 write_attribute(trigger_delete_dead_snapshots);
 write_attribute(trigger_emergency_read_only);
 read_attribute(gc_gens_pos);
-__sysfs_attribute(read_fua_test, 0400);
 
 read_attribute(uuid);
 read_attribute(minor);
@@ -306,116 +304,6 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
 	prt_printf(out, "reserved:\t\t%llu\n", b.reserved);
 }
 
-static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca)
-{
-	struct bch_fs *c = ca->fs;
-	struct bio *bio = NULL;
-	void *buf = NULL;
-	unsigned bs = c->opts.block_size, iters;
-	u64 end, test_duration = NSEC_PER_SEC * 2;
-	struct bch2_time_stats stats_nofua, stats_fua, stats_random;
-	int ret = 0;
-
-	bch2_time_stats_init_no_pcpu(&stats_nofua);
-	bch2_time_stats_init_no_pcpu(&stats_fua);
-	bch2_time_stats_init_no_pcpu(&stats_random);
-
-	if (!bch2_dev_get_ioref(c, ca->dev_idx, READ, BCH_DEV_READ_REF_read_fua_test)) {
-		prt_str(out, "offline\n");
-		return 0;
-	}
-
-	struct block_device *bdev = ca->disk_sb.bdev;
-
-	bio = bio_kmalloc(1, GFP_KERNEL);
-	if (!bio) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	buf = kmalloc(bs, GFP_KERNEL);
-	if (!buf)
-		goto err;
-
-	end = ktime_get_ns() + test_duration;
-	for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
-		bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
-		bch2_bio_map(bio, buf, bs);
-
-		u64 submit_time = ktime_get_ns();
-		ret = submit_bio_wait(bio);
-		bch2_time_stats_update(&stats_nofua, submit_time);
-
-		if (ret)
-			goto err;
-	}
-
-	end = ktime_get_ns() + test_duration;
-	for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
-		bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ);
-		bch2_bio_map(bio, buf, bs);
-
-		u64 submit_time = ktime_get_ns();
-		ret = submit_bio_wait(bio);
-		bch2_time_stats_update(&stats_fua, submit_time);
-
-		if (ret)
-			goto err;
-	}
-
-	u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca);
-
-	end = ktime_get_ns() + test_duration;
-	for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) {
-		bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ);
-		bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9;
-		bch2_bio_map(bio, buf, bs);
-
-		u64 submit_time = ktime_get_ns();
-		ret = submit_bio_wait(bio);
-		bch2_time_stats_update(&stats_random, submit_time);
-
-		if (ret)
-			goto err;
-	}
-
-	u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats);
-	u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats);
-	u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats);
-
-	u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats);
-	u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats);
-	u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats);
-
-	printbuf_tabstop_push(out, 8);
-	printbuf_tabstop_push(out, 12);
-	printbuf_tabstop_push(out, 12);
-
-	prt_printf(out, "This test must be run on an idle drive for accurate results\n");
-	prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device));
-	prt_printf(out, "fua support advertized: %s\n", str_yes_no(bdev_fua(bdev)));
-	prt_newline(out);
-	prt_printf(out, "ns:\tlatency\rstddev\r\n");
-	prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua);
-	prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua);
-	prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand);
-
-	bool read_cache = ns_nofua * 2 < ns_rand;
-	bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2;
-
-	if (!read_cache)
-		prt_str(out, "reads don't appear to be cached - safe\n");
-	else if (!fua_cached)
-		prt_str(out, "fua reads don't appear to be cached - safe\n");
-	else
-		prt_str(out, "fua reads appear to be cached - unsafe\n");
-err:
-	kfree(buf);
-	kfree(bio);
-	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_read_fua_test);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
 SHOW(bch2_fs)
 {
 	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -784,7 +672,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
 		u64 v;
 
 		ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
-			bch2_opt_hook_pre_set(c, ca, 0, id, v);
+			bch2_opt_hook_pre_set(c, ca, 0, id, v, true);
 		kfree(tmp);
 
 		if (ret < 0)
@@ -959,9 +847,6 @@ SHOW(bch2_dev)
 	if (attr == &sysfs_open_buckets)
 		bch2_open_buckets_to_text(out, c, ca);
 
-	if (attr == &sysfs_read_fua_test)
-		return bch2_read_fua_test(out, ca);
-
 	int opt_id = bch2_opt_lookup(attr->name);
 	if (opt_id >= 0)
 		return sysfs_opt_show(c, ca, opt_id, out);
@@ -1026,8 +911,6 @@ struct attribute *bch2_dev_files[] = {
 	&sysfs_congested,
 #endif
 
-	&sysfs_read_fua_test,
-
 	/* debug: */
 	&sysfs_alloc_debug,
 	&sysfs_open_buckets,
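For reference, the read_fua_test attribute deleted above classified a device by comparing mean latencies of three read patterns: repeated reads of one block, the same with REQ_FUA, and random reads across the device. Restated compactly, with the thresholds copied from the deleted code (the example latencies in main() are made up):

#include <stdbool.h>
#include <stdio.h>

static const char *classify(unsigned long long ns_nofua,
			    unsigned long long ns_fua,
			    unsigned long long ns_rand)
{
	/* repeated reads much faster than random => served from a cache */
	bool read_cache = ns_nofua * 2 < ns_rand;
	/* FUA reads also fast => FUA is (unsafely) hitting that cache too */
	bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2;

	if (!read_cache)
		return "reads don't appear to be cached - safe";
	if (!fua_cached)
		return "fua reads don't appear to be cached - safe";
	return "fua reads appear to be cached - unsafe";
}

int main(void)
{
	/* fast repeated reads, slow random reads, honest FUA latency */
	printf("%s\n", classify(20000, 950000, 1000000));
	return 0;
}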
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 2a946227..16d746f1 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -299,8 +299,10 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigne
 	if (ret)
 		return ret;
 
+	skipnr += task == current;
+
 	do {
-		nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
+		nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr);
 	} while (nr_entries == stack->size &&
 		 !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
 
@@ -321,8 +323,10 @@ void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
 
 int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
 {
+	skipnr += task == current;
+
 	CLASS(bch_stacktrace, stack)();
-	int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
+	int ret = bch2_save_backtrace(&stack, task, skipnr, gfp);
 
 	bch2_prt_backtrace(out, &stack);
 	return ret;
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 784e75a2..2b8d0502 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -550,7 +550,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
 		if (ret < 0)
 			goto err;
 
-		ret = bch2_opt_hook_pre_set(c, NULL, inode->ei_inode.bi_inum, opt_id, v);
+		ret = bch2_opt_hook_pre_set(c, NULL, inode->ei_inode.bi_inum, opt_id, v, true);
 		if (ret < 0)
 			goto err;
 
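The util.c hunks replace an unconditional `skipnr + 1` with `skipnr += task == current`: the capture function's own frame only shows up at the top of the trace when unwinding the running task itself, so the extra skip is applied exactly in that case, relying on a bool comparison promoting to 0 or 1. A trivial restatement of that idiom (adjust_skip() is a hypothetical helper, not part of the patch):

#include <stdbool.h>

static unsigned adjust_skip(unsigned skipnr, bool tracing_current)
{
	/* `skipnr += task == current` in the patch: the bool adds 0 or 1 */
	return skipnr + tracing_current;
}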
diff --git a/linux/closure.c b/linux/closure.c
index 4fb78d18..f1b4a797 100644
--- a/linux/closure.c
+++ b/linux/closure.c
@@ -13,65 +13,83 @@
 #include <linux/seq_file.h>
 #include <linux/sched/debug.h>
 
-static inline void closure_put_after_sub_checks(struct closure *cl, int flags)
+static void closure_val_checks(struct closure *cl, unsigned new, int d)
 {
-	int r = flags & CLOSURE_REMAINING_MASK;
+	unsigned count = new & CLOSURE_REMAINING_MASK;
 
-	if (WARN(flags & CLOSURE_GUARD_MASK,
-		 "closure %ps has guard bits set: %x (%u)",
+	if (WARN(new & CLOSURE_GUARD_MASK,
+		 "closure %ps has guard bits set: %x (%u), delta %i",
 		 cl->fn,
-		 flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r)))
-		r &= ~CLOSURE_GUARD_MASK;
+		 new, (unsigned) __fls(new & CLOSURE_GUARD_MASK), d))
+		new &= ~CLOSURE_GUARD_MASK;
 
-	WARN(!r && (flags & ~CLOSURE_DESTRUCTOR),
+	WARN(!count && (new & ~(CLOSURE_DESTRUCTOR|CLOSURE_SLEEPING)),
 	     "closure %ps ref hit 0 with incorrect flags set: %x (%u)",
 	     cl->fn,
-	     flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags));
+	     new, (unsigned) __fls(new));
 }
 
-static inline void closure_put_after_sub(struct closure *cl, int flags)
+enum new_closure_state {
+	CLOSURE_normal_put,
+	CLOSURE_requeue,
+	CLOSURE_done,
+};
+
+/* For clearing flags with the same atomic op as a put */
+void closure_sub(struct closure *cl, int v)
 {
-	closure_put_after_sub_checks(cl, flags);
+	enum new_closure_state s;
+	struct task_struct *sleeper;
 
-	if (!(flags & CLOSURE_REMAINING_MASK)) {
-		smp_acquire__after_ctrl_dep();
+	/* rcu_read_lock, atomic_read_acquire() are both for cl->sleeper: */
+	guard(rcu)();
 
-		cl->closure_get_happened = false;
+	int old = atomic_read_acquire(&cl->remaining), new;
+	do {
+		new = old - v;
 
-		if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
-			atomic_set(&cl->remaining,
-				   CLOSURE_REMAINING_INITIALIZER);
-			closure_queue(cl);
+		if (new & CLOSURE_REMAINING_MASK) {
+			s = CLOSURE_normal_put;
 		} else {
-			struct closure *parent = cl->parent;
-			closure_fn *destructor = cl->fn;
+			if ((cl->fn || (new & CLOSURE_SLEEPING)) &&
+			    !(new & CLOSURE_DESTRUCTOR)) {
+				s = CLOSURE_requeue;
+				new += CLOSURE_REMAINING_INITIALIZER;
+			} else
+				s = CLOSURE_done;
 
-			closure_debug_destroy(cl);
+			sleeper = new & CLOSURE_SLEEPING ? cl->sleeper : NULL;
+			new &= ~CLOSURE_SLEEPING;
+		}
 
-			if (destructor)
-				destructor(&cl->work);
+		closure_val_checks(cl, new, -v);
+	} while (!atomic_try_cmpxchg_release(&cl->remaining, &old, new));
 
-			if (parent)
-				closure_put(parent);
-		}
+	if (s == CLOSURE_normal_put)
+		return;
+
+	if (sleeper) {
+		smp_mb();
+		wake_up_process(sleeper);
+		return;
 	}
-}
 
-/* For clearing flags with the same atomic op as a put */
-void closure_sub(struct closure *cl, int v)
-{
-	closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining));
-}
-EXPORT_SYMBOL(closure_sub);
+	if (s == CLOSURE_requeue) {
+		closure_queue(cl);
+	} else {
+		struct closure *parent = cl->parent;
+		closure_fn *destructor = cl->fn;
 
-/*
- * closure_put - decrement a closure's refcount
- */
-void closure_put(struct closure *cl)
-{
-	closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining));
+		closure_debug_destroy(cl);
+
+		if (destructor)
+			destructor(&cl->work);
+
+		if (parent)
+			closure_put(parent);
 	}
-EXPORT_SYMBOL(closure_put);
+}
+EXPORT_SYMBOL(closure_sub);
 
 /*
  * closure_wake_up - wake up all closures on a wait list, without memory barrier
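The rewritten closure_sub() above folds the entire put path into one compare-exchange loop: it computes the post-decrement value, classifies the result as a plain put, a requeue, or final teardown, and strips the SLEEPING bit in the same atomic step so that exactly one putter wakes the sleeper. A userspace model of that shape using C11 atomics - a sketch with made-up bit positions, and simplified in that the real code also consults cl->fn when deciding whether to requeue:

#include <stdatomic.h>
#include <stdbool.h>

#define REMAINING_MASK	((1u << 24) - 1)
#define DESTRUCTOR	(1u << 24)
#define SLEEPING	(1u << 26)

enum put_result { PUT_NORMAL, PUT_REQUEUE, PUT_DONE };

static enum put_result ref_sub(atomic_uint *remaining, unsigned v,
			       bool *wake_sleeper)
{
	unsigned old = atomic_load_explicit(remaining, memory_order_acquire);
	unsigned new;
	enum put_result r;

	do {
		/* recompute everything from `old` on every retry */
		new = old - v;
		r = PUT_NORMAL;
		*wake_sleeper = false;

		if (!(new & REMAINING_MASK)) {
			r = (new & DESTRUCTOR) ? PUT_DONE : PUT_REQUEUE;
			/* claim the wakeup and clear the bit atomically,
			 * so only one caller can ever wake the sleeper */
			*wake_sleeper = new & SLEEPING;
			new &= ~SLEEPING;
		}
	} while (!atomic_compare_exchange_weak_explicit(remaining, &old, new,
							memory_order_release,
							memory_order_acquire));
	return r;
}

Compared with the old atomic_sub_return_release() followed by non-atomic inspection, the loop means the decision ("did we hit zero, and is someone sleeping?") and the state change are one indivisible step, which is what makes removing the closure_syncer indirection safe.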
@@ -107,43 +125,26 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
 	if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
 		return false;
 
-	cl->closure_get_happened = true;
 	closure_set_waiting(cl, _RET_IP_);
-	atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
+	unsigned r = atomic_add_return(CLOSURE_WAITING + 1, &cl->remaining);
+	closure_val_checks(cl, r, CLOSURE_WAITING + 1);
+
 	llist_add(&cl->list, &waitlist->list);
 
 	return true;
 }
 EXPORT_SYMBOL(closure_wait);
 
-struct closure_syncer {
-	struct task_struct	*task;
-	int			done;
-};
-
-static CLOSURE_CALLBACK(closure_sync_fn)
-{
-	struct closure *cl = container_of(ws, struct closure, work);
-	struct closure_syncer *s = cl->s;
-	struct task_struct *p;
-
-	rcu_read_lock();
-	p = READ_ONCE(s->task);
-	s->done = 1;
-	wake_up_process(p);
-	rcu_read_unlock();
-}
-
 void __sched __closure_sync(struct closure *cl)
 {
-	struct closure_syncer s = { .task = current };
-
-	cl->s = &s;
-	continue_at(cl, closure_sync_fn, NULL);
+	cl->sleeper = current;
+	closure_sub(cl,
+		    CLOSURE_REMAINING_INITIALIZER -
+		    CLOSURE_SLEEPING);
 
 	while (1) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (s.done)
+		if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
 			break;
 		schedule();
 	}
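With the closure_syncer and its callback gone, the sleeper publishes `current` in cl->sleeper, drops its ref with CLOSURE_SLEEPING set, and then runs the standard prepare-to-sleep loop; the final putter clears SLEEPING before calling wake_up_process(), and the sleeper re-checks the bit after setting its task state, so a wakeup racing with going to sleep is never lost. A userspace analogue of that handshake, with a pthread condvar standing in for set_current_state()/wake_up_process() (an illustrative model, not the kernel mechanism):

#include <pthread.h>
#include <stdatomic.h>

#define SLEEPING (1u << 26)

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static atomic_uint remaining = SLEEPING | 1;

static void waker(void)
{
	/* clear the bit first, then wake - mirrors the order in closure_sub() */
	atomic_fetch_and_explicit(&remaining, ~SLEEPING, memory_order_release);
	pthread_mutex_lock(&lock);
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

static void sleeper(void)
{
	pthread_mutex_lock(&lock);
	/* re-check the condition while "prepared to sleep": no lost wakeups */
	while (atomic_load_explicit(&remaining, memory_order_acquire) & SLEEPING)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}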
@@ -157,31 +158,25 @@ EXPORT_SYMBOL(__closure_sync);
  * for outstanding get()s to finish) and returning once closure refcount is 0.
  *
  * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent
- * closure_get_not_zero() calls waill fail.
+ * closure_get_not_zero() calls will fail.
  */
 void __sched closure_return_sync(struct closure *cl)
 {
-	struct closure_syncer s = { .task = current };
-
-	cl->s = &s;
-	set_closure_fn(cl, closure_sync_fn, NULL);
-
-	unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR,
-						   &cl->remaining);
-
-	closure_put_after_sub_checks(cl, flags);
-
-	if (unlikely(flags & CLOSURE_REMAINING_MASK)) {
-		while (1) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			if (s.done)
-				break;
-			schedule();
-		}
+	cl->sleeper = current;
+	closure_sub(cl,
+		    CLOSURE_REMAINING_INITIALIZER -
+		    CLOSURE_DESTRUCTOR -
+		    CLOSURE_SLEEPING);
 
-		__set_current_state(TASK_RUNNING);
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+			break;
+		schedule();
 	}
 
+	__set_current_state(TASK_RUNNING);
+
 	if (cl->parent)
 		closure_put(cl->parent);
 }
@@ -189,31 +184,35 @@ EXPORT_SYMBOL(closure_return_sync);
 
 int __sched __closure_sync_timeout(struct closure *cl, unsigned long timeout)
 {
-	struct closure_syncer s = { .task = current };
 	int ret = 0;
 
-	cl->s = &s;
-	continue_at(cl, closure_sync_fn, NULL);
+	cl->sleeper = current;
+	closure_sub(cl,
+		    CLOSURE_REMAINING_INITIALIZER -
+		    CLOSURE_SLEEPING);
 
 	while (1) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (s.done)
-			break;
+		/*
+		 * Carefully undo the continue_at() - but only if it
+		 * hasn't completed, i.e. the final closure_put() hasn't
+		 * happened yet:
+		 */
+		unsigned old = atomic_read(&cl->remaining), new;
+		if (!(old & CLOSURE_SLEEPING))
+			goto success;
+
 		if (!timeout) {
-			/*
-			 * Carefully undo the continue_at() - but only if it
-			 * hasn't completed, i.e. the final closure_put() hasn't
-			 * happened yet:
-			 */
-			unsigned old, new, v = atomic_read(&cl->remaining);
 			do {
-				old = v;
-				if (!old || (old & CLOSURE_RUNNING))
+				if (!(old & CLOSURE_SLEEPING))
 					goto success;
-				new = old + CLOSURE_REMAINING_INITIALIZER;
-			} while ((v = atomic_cmpxchg(&cl->remaining, old, new)) != old);
+				new = old + CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING;
+				closure_val_checks(cl, new, CLOSURE_REMAINING_INITIALIZER - CLOSURE_SLEEPING);
+			} while (!atomic_try_cmpxchg(&cl->remaining, &old, new));
+
 			ret = -ETIME;
+			break;
 		}
 
 		timeout = schedule_timeout(timeout);
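The timeout path in __closure_sync_timeout() above has to take ownership back from whoever will do the final put: it may only reacquire the ref (adding CLOSURE_REMAINING_INITIALIZER) and clear SLEEPING if SLEEPING is still set; if the final put already cleared it, the wait in fact succeeded. The cmpxchg loop makes "still sleeping?" and "reacquire" one atomic step. A C11 sketch of just that undo, with stand-in bit values:

#include <stdatomic.h>
#include <stdbool.h>

#define SLEEPING		(1u << 26)
#define RUNNING			(1u << 30)
#define REMAINING_INITIALIZER	(1u | RUNNING)

/* returns true if we reclaimed the closure (caller reports -ETIME),
 * false if the final put won the race and the sync completed */
static bool timeout_undo(atomic_uint *remaining)
{
	unsigned old = atomic_load(remaining), new;

	do {
		if (!(old & SLEEPING))
			return false;	/* wakeup already happened: success path */
		new = old + REMAINING_INITIALIZER - SLEEPING;
	} while (!atomic_compare_exchange_weak(remaining, &old, new));

	return true;
}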