diff options
Diffstat (limited to 'fs/bcachefs')
53 files changed, 816 insertions, 619 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b6850b15494d..21cdc42eff46 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1771,13 +1771,6 @@ static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) darray_remove_item(&ca->discard_buckets_in_flight, i); } -struct discard_buckets_state { - u64 seen; - u64 open; - u64 need_journal_commit; - u64 discarded; -}; - static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, @@ -1790,6 +1783,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, bool discard_locked = false; int ret = 0; + s->seen++; + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; return 0; @@ -1800,6 +1795,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (seq_ready > c->journal.flushed_seq_ondisk) { if (seq_ready > c->journal.flushing_seq) s->need_journal_commit++; + else + s->commit_in_flight++; return 0; } @@ -1815,6 +1812,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, return ret; if (a->v.data_type != BCH_DATA_need_discard) { + s->bad_data_type++; + if (need_discard_or_freespace_err(trans, k, true, true, true)) { ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); if (ret) @@ -1826,8 +1825,10 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } if (!fastpath) { - if (discard_in_flight_add(ca, iter.pos.offset, true)) + if (discard_in_flight_add(ca, iter.pos.offset, true)) { + s->already_discarding++; goto out; + } discard_locked = true; } @@ -1861,6 +1862,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, commit: ret = bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; @@ -1873,14 +1875,11 @@ out: fsck_err: if (discard_locked) discard_in_flight_remove(ca, iter.pos.offset); - if (!ret) - s->seen++; return ret; } 
-static void bch2_do_discards_work(struct work_struct *work) +static void __bch2_dev_do_discards(struct bch_dev *ca) { - struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); struct bch_fs *c = ca->fs; struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; @@ -1901,10 +1900,25 @@ static void bch2_do_discards_work(struct work_struct *work) if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal)) bch2_journal_flush_async(&c->journal, NULL); - trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, - bch2_err_str(ret)); + trace_discard_buckets(c, &s, bch2_err_str(ret)); enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); +} + +void bch2_do_discards_going_ro(struct bch_fs *c) +{ + for_each_member_device(c, ca) + if (bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards)) + __bch2_dev_do_discards(ca); +} + +static void bch2_do_discards_work(struct work_struct *work) +{ + struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); + struct bch_fs *c = ca->fs; + + __bch2_dev_do_discards(ca); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } @@ -1992,7 +2006,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } - trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + trace_discard_buckets_fast(c, &s, bch2_err_str(ret)); bch2_trans_put(trans); enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); @@ -2384,8 +2398,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) * We clear the LRU and need_discard btrees first so that we don't race * with bch2_do_invalidates() and bch2_do_discards() */ - ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_norun, NULL) ?: + ret = bch2_dev_remove_lrus(c, ca) ?: bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, BTREE_TRIGGER_norun, NULL) ?: 
bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, @@ -2396,7 +2409,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, BTREE_TRIGGER_norun, NULL) ?: - bch2_dev_usage_remove(c, ca->dev_idx); + bch2_dev_usage_remove(c, ca); bch_err_msg(ca, ret, "removing dev alloc info"); return ret; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index c2e8482fbbe6..a602507fef19 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -320,6 +320,7 @@ static inline int bch2_check_discard_freespace_key_async(struct btree_trans *tra int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_dev_do_discards(struct bch_dev *); +void bch2_do_discards_going_ro(struct bch_fs *); void bch2_do_discards(struct bch_fs *); static inline u64 should_invalidate_buckets(struct bch_dev *ca, diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 3d125ee81663..97b627ed3b22 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1529,6 +1529,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) printbuf_tabstop_push(out, 24); prt_printf(out, "capacity\t%llu\n", c->capacity); + prt_printf(out, "used\t%llu\n", bch2_fs_usage_read_short(c).used); prt_printf(out, "reserved\t%llu\n", c->reserved); prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden)); prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree)); diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index e7becdf22cba..ee52b66dc5d7 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -118,4 +118,14 @@ struct write_point_specifier { unsigned long v; }; +struct discard_buckets_state { + u64 seen; + u64 open; + u64 need_journal_commit; + u64 commit_in_flight; + u64 bad_data_type; + u64 
already_discarding; + u64 discarded; +}; + #endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 6aeb1c876619..c662eeba66ab 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -820,7 +820,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, for (enum btree_id btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { - int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1; + int level, depth = btree_type_has_data_ptrs(btree_id) ? 0 : 1; ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 0ede47f62129..83d6ab9c1a91 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -458,7 +458,6 @@ BCH_DEBUG_PARAMS_ALL() x(btree_node_compact) \ x(btree_node_merge) \ x(btree_node_sort) \ - x(btree_node_get) \ x(btree_node_read) \ x(btree_node_read_done) \ x(btree_node_write) \ @@ -466,10 +465,6 @@ BCH_DEBUG_PARAMS_ALL() x(btree_interior_update_total) \ x(btree_gc) \ x(data_write) \ - x(data_write_to_submit) \ - x(data_write_to_queue) \ - x(data_write_to_btree_update) \ - x(data_write_btree_update) \ x(data_read) \ x(data_promote) \ x(journal_flush_write) \ @@ -483,6 +478,7 @@ BCH_DEBUG_PARAMS_ALL() x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ x(blocked_write_buffer_full) \ + x(blocked_writeback_throttle) \ x(nocow_lock_contended) enum bch_time_stats { @@ -675,6 +671,7 @@ struct bch_dev { x(error) \ x(topology_error) \ x(errors_fixed) \ + x(errors_fixed_silent) \ x(errors_not_fixed) \ x(no_invalid_checks) \ x(discard_mount_opt_set) \ @@ -808,6 +805,8 @@ struct bch_fs { struct bch_disk_groups_cpu __rcu *disk_groups; struct bch_opts opts; + atomic_t opt_change_cookie; + unsigned loglevel; unsigned prev_loglevel; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b2de993d802b..0839397105a9 100644 --- a/fs/bcachefs/bcachefs_format.h +++ 
b/fs/bcachefs/bcachefs_format.h @@ -654,7 +654,6 @@ struct bch_sb_field_ext { /* * field 1: version name * field 2: BCH_VERSION(major, minor) - * field 3: recovery passess required on upgrade */ #define BCH_METADATA_VERSIONS() \ x(bkey_renumber, BCH_VERSION(0, 10)) \ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 43f294284d57..2338feb8d8ed 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -717,16 +717,12 @@ fsck_err: static int bch2_gc_btree(struct btree_trans *trans, struct progress_indicator_state *progress, - enum btree_id btree, bool initial) + enum btree_id btree, unsigned target_depth, + bool initial) { struct bch_fs *c = trans->c; - unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1; int ret = 0; - /* We need to make sure every leaf node is readable before going RW */ - if (initial) - target_depth = 0; - for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) { struct btree *prev = NULL; struct btree_iter iter; @@ -797,7 +793,21 @@ static int bch2_gc_btrees(struct bch_fs *c) if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) continue; - ret = bch2_gc_btree(trans, &progress, btree, true); + + unsigned target_depth = BIT_ULL(btree) & btree_leaf_has_triggers_mask ? 0 : 1; + + /* + * In fsck, we need to make sure every leaf node is readable + * before going RW, otherwise we can no longer rewind inside + * btree_lost_data to repair during the current fsck run. + * + * Otherwise, we can delay the repair to the next + * mount or offline fsck. 
+ */ + if (test_bit(BCH_FS_in_fsck, &c->flags)) + target_depth = 0; + + ret = bch2_gc_btree(trans, &progress, btree, target_depth, true); } bch_err_fn(c, ret); @@ -1228,7 +1238,7 @@ int bch2_gc_gens(struct bch_fs *c) } for (unsigned i = 0; i < BTREE_ID_NR; i++) - if (btree_type_has_ptrs(i)) { + if (btree_type_has_data_ptrs(i)) { c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 34ec1a90980d..52d21259ed6f 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -27,10 +27,15 @@ #include <linux/moduleparam.h> #include <linux/sched/mm.h> +static __maybe_unused unsigned bch2_btree_read_corrupt_ratio; +static __maybe_unused int bch2_btree_read_corrupt_device; + #ifdef CONFIG_BCACHEFS_DEBUG -static unsigned bch2_btree_read_corrupt_ratio; module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644); MODULE_PARM_DESC(btree_read_corrupt_ratio, ""); + +module_param_named(btree_read_corrupt_device, bch2_btree_read_corrupt_device, int, 0644); +MODULE_PARM_DESC(btree_read_corrupt_device, ""); #endif static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) @@ -1438,7 +1443,9 @@ start: memset(&bio->bi_iter, 0, sizeof(bio->bi_iter)); bio->bi_iter.bi_size = btree_buf_bytes(b); - bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio); + if (bch2_btree_read_corrupt_device == rb->pick.ptr.dev || + bch2_btree_read_corrupt_device < 0) + bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio); ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf); if (ret != -BCH_ERR_btree_node_read_err_want_retry && diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 5fa7f2f9f1e9..2966971ee43e 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -970,6 +970,7 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans, struct bkey_i *accounting; retry: + memset(&trans->fs_usage_delta, 
0, sizeof(trans->fs_usage_delta)); percpu_down_read(&c->mark_lock); for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); accounting != btree_trans_subbuf_top(trans, &trans->accounting); @@ -983,6 +984,9 @@ retry: } percpu_up_read(&c->mark_lock); + /* Only fatal errors are possible later, so no need to revert this */ + bch2_trans_account_disk_usage_change(trans); + trans_for_each_update(trans, i) { ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); if (ret) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index e893eb938bb3..9e3c851200eb 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -840,6 +840,10 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type) return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; } +/* A mask of btree id bits that have triggers for their leaves */ +__maybe_unused +static const u64 btree_leaf_has_triggers_mask = BTREE_NODE_TYPE_HAS_TRIGGERS >> 1; + static const u64 btree_is_extents_mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr) BCH_BTREE_IDS() @@ -883,15 +887,15 @@ static inline bool btree_type_has_snapshot_field(enum btree_id btree) return BIT_ULL(btree) & mask; } -static inline bool btree_type_has_ptrs(enum btree_id btree) -{ - const u64 mask = 0 +static const u64 btree_has_data_ptrs_mask = 0 #define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_IS_data)) << nr) BCH_BTREE_IDS() #undef x ; - return BIT_ULL(btree) & mask; +static inline bool btree_type_has_data_ptrs(enum btree_id btree) +{ + return BIT_ULL(btree) & btree_has_data_ptrs_mask; } static inline bool btree_type_uses_write_buffer(enum btree_id btree) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 8d9fcaa26268..ce86d158aa8e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -324,9 +324,6 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct btree *b; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim - ? BTREE_NODE_RESERVE - : 0; int ret; b = bch2_btree_node_mem_alloc(trans, interior_node); @@ -334,41 +331,6 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, return b; BUG_ON(b->ob.nr); - - mutex_lock(&c->btree_reserve_cache_lock); - if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { - guard(spinlock)(&c->freelist_lock); - if (c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark)) { - if (cl) - closure_wait(&c->open_buckets_wait, cl); - - ret = cl - ? 
bch_err_throw(c, bucket_alloc_blocked) - : bch_err_throw(c, open_buckets_empty); - mutex_unlock(&c->btree_reserve_cache_lock); - goto err; - } - } - - if (c->btree_reserve_cache_nr > nr_reserve) { - for (struct btree_alloc *a = c->btree_reserve_cache; - a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) { - /* check if it has sufficient durability */ - - if (!can_use_btree_node(c, res, target, bkey_i_to_s_c(&a->k))) { - bch2_open_buckets_put(c, &a->ob); - *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - continue; - } - - bkey_copy(&b->key, &a->k); - b->ob = a->ob; - *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - mutex_unlock(&c->btree_reserve_cache_lock); - goto out; - } - } - mutex_unlock(&c->btree_reserve_cache_lock); retry: ret = bch2_alloc_sectors_start_trans(trans, target ?: @@ -398,12 +360,29 @@ retry: goto retry; } + mutex_lock(&c->btree_reserve_cache_lock); + while (c->btree_reserve_cache_nr) { + struct btree_alloc *a = c->btree_reserve_cache + --c->btree_reserve_cache_nr; + + /* check if it has sufficient durability */ + + if (can_use_btree_node(c, res, target, bkey_i_to_s_c(&a->k))) { + bkey_copy(&b->key, &a->k); + b->ob = a->ob; + mutex_unlock(&c->btree_reserve_cache_lock); + goto out; + } + + bch2_open_buckets_put(c, &a->ob); + } + mutex_unlock(&c->btree_reserve_cache_lock); + bkey_btree_ptr_v2_init(&b->key); bch2_alloc_sectors_append_ptrs(c, wp, &b->key, btree_sectors(c), false); bch2_open_bucket_get(c, wp, &b->ob); - bch2_alloc_sectors_done(c, wp); out: + bch2_alloc_sectors_done(c, wp); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -723,8 +702,10 @@ static void btree_update_nodes_written(struct btree_update *as) if (ret) goto err; - if (!btree_update_new_nodes_marked_sb(as)) + if (!btree_update_new_nodes_marked_sb(as)) { + bch2_trans_unlock_long(trans); btree_update_new_nodes_mark_sb(as); + } /* * Wait for any in flight writes to finish before we free the old nodes diff --git a/fs/bcachefs/checksum.h 
b/fs/bcachefs/checksum.h index 7bd9cf6104ca..10bfadcde80a 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -130,7 +130,7 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type, } static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, - struct bch_io_opts opts) + struct bch_inode_opts opts) { if (opts.nocow) return 0; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 20b900bee32d..7a0da6cdf78c 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -11,6 +11,7 @@ #include "ec.h" #include "error.h" #include "extents.h" +#include "inode.h" #include "io_write.h" #include "keylist.h" #include "move.h" @@ -428,13 +429,18 @@ restart_drop_extra_replicas: goto out; } + struct bch_inode_opts opts; + ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?: bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: + bch2_inum_snapshot_opts_get(trans, k.k->p.inode, k.k->p.snapshot, &opts) ?: + bch2_bkey_set_needs_rebalance(c, &opts, insert, + SET_NEEDS_REBALANCE_foreground, + m->op.opts.change_cookie) ?: bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_internal_snapshot_node); if (ret) @@ -613,7 +619,7 @@ int bch2_update_unwritten_extent(struct btree_trans *trans, } void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { if (!out->nr_tabstops) @@ -682,7 +688,7 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct 
data_update_opts *data_opts) { struct bch_fs *c = trans->c; @@ -732,7 +738,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, } static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, unsigned buf_bytes) { unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); @@ -759,7 +765,7 @@ static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, } int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts) + struct bch_inode_opts *io_opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); const union bch_extent_entry *entry; @@ -831,7 +837,7 @@ int bch2_data_update_init(struct btree_trans *trans, struct moving_context *ctxt, struct data_update *m, struct write_point_specifier wp, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts data_opts, enum btree_id btree_id, struct bkey_s_c k) diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index fc12aa65366f..3b0ba6f6497f 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -23,7 +23,7 @@ struct data_update_opts { }; void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, - struct bch_io_opts *, struct data_update_opts *); + struct bch_inode_opts *, struct data_update_opts *); #define BCH_DATA_UPDATE_TYPES() \ x(copygc, 0) \ @@ -76,18 +76,18 @@ void bch2_data_update_read_done(struct data_update *); int bch2_extent_drop_ptrs(struct btree_trans *, struct btree_iter *, struct bkey_s_c, - struct bch_io_opts *, + struct bch_inode_opts *, struct data_update_opts *); int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, - struct bch_io_opts *); + struct bch_inode_opts *); void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct btree_trans *, struct btree_iter *, struct moving_context *, struct data_update *, struct 
write_point_specifier, - struct bch_io_opts *, struct data_update_opts, + struct bch_inode_opts *, struct data_update_opts, enum btree_id, struct bkey_s_c); void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index f0ebf91cd5fd..a99f821c6a1c 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -239,10 +239,12 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, c, accounting_key_junk_at_end, "junk at end of accounting key"); - bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type], + const unsigned nr_counters = bch2_accounting_counters(k.k); + + bkey_fsck_err_on(!nr_counters || nr_counters > BCH_ACCOUNTING_MAX_COUNTERS, c, accounting_key_nr_counters_wrong, "accounting key with %u counters, should be %u", - bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]); + nr_counters, bch2_accounting_type_nr_counters[acc_k.type]); fsck_err: return ret; } @@ -359,10 +361,13 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun accounting_pos_cmp, &a.k->p) < acc->k.nr) return 0; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); + struct accounting_mem_entry n = { .pos = a.k->p, .bversion = a.k->bversion, - .nr_counters = bch2_accounting_counters(a.k), + .nr_counters = bch2_accounting_type_nr_counters[acc_k.type], .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), sizeof(u64), GFP_KERNEL), }; @@ -878,46 +883,44 @@ int bch2_accounting_read(struct bch_fs *c) *dst++ = *i; keys->gap = keys->nr = dst - keys->data; - guard(percpu_write)(&c->mark_lock); - - darray_for_each_reverse(acc->k, i) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, i->pos); + CLASS(printbuf, underflow_err)(); - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - memset(v, 0, sizeof(v)); + scoped_guard(percpu_write, 
&c->mark_lock) { + darray_for_each_reverse(acc->k, i) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, i->pos); - for (unsigned j = 0; j < i->nr_counters; j++) - v[j] = percpu_u64_get(i->v[0] + j); + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + memset(v, 0, sizeof(v)); - /* - * If the entry counters are zeroed, it should be treated as - * nonexistent - it might point to an invalid device. - * - * Remove it, so that if it's re-added it gets re-marked in the - * superblock: - */ - ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) - ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); - - if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(i->v[0]); - free_percpu(i->v[1]); - darray_remove_item(&acc->k, i); - ret = 0; - continue; - } + for (unsigned j = 0; j < i->nr_counters; j++) + v[j] = percpu_u64_get(i->v[0] + j); - if (ret) - return ret; - } + /* + * If the entry counters are zeroed, it should be treated as + * nonexistent - it might point to an invalid device. + * + * Remove it, so that if it's re-added it gets re-marked in the + * superblock: + */ + ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) + ? 
-BCH_ERR_remove_disk_accounting_entry + : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); + + if (ret == -BCH_ERR_remove_disk_accounting_entry) { + free_percpu(i->v[0]); + free_percpu(i->v[1]); + darray_remove_item(&acc->k, i); + ret = 0; + continue; + } - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); + if (ret) + return ret; + } - scoped_guard(preempt) { - struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); for (unsigned i = 0; i < acc->k.nr; i++) { struct disk_accounting_pos k; @@ -939,27 +942,20 @@ int bch2_accounting_read(struct bch_fs *c) underflow |= (s64) v[j] < 0; if (underflow) { - CLASS(printbuf, buf)(); - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "Accounting underflow for\n"); - bch2_accounting_key_to_text(&buf, &k); + if (!underflow_err.pos) { + bch2_log_msg_start(c, &underflow_err); + prt_printf(&underflow_err, "Accounting underflow for\n"); + } + bch2_accounting_key_to_text(&underflow_err, &k); for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) - prt_printf(&buf, " %lli", v[j]); - - bool print = bch2_count_fsck_err(c, accounting_key_underflow, &buf); - unsigned pos = buf.pos; - ret = bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_allocations, 0); - print |= buf.pos != pos; - - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - if (ret) - return ret; + prt_printf(&underflow_err, " %lli", v[j]); + prt_newline(&underflow_err); } + guard(preempt)(); + struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); + switch (k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: usage->reserved += v[0] * k.persistent_reserved.nr_replicas; @@ -986,24 +982,60 @@ int bch2_accounting_read(struct bch_fs *c) } } + if (underflow_err.pos) { + bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err); + unsigned pos = underflow_err.pos; + ret = 
bch2_run_explicit_recovery_pass(c, &underflow_err, + BCH_RECOVERY_PASS_check_allocations, 0); + print |= underflow_err.pos != pos; + + if (print) + bch2_print_str(c, KERN_ERR, underflow_err.buf); + if (ret) + return ret; + } + return ret; } -int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) +int bch2_dev_usage_remove(struct bch_fs *c, struct bch_dev *ca) { CLASS(btree_trans, trans)(c); + + struct disk_accounting_pos start; + disk_accounting_key_init(start, dev_data_type, .dev = ca->dev_idx); + + struct disk_accounting_pos end; + disk_accounting_key_init(end, dev_data_type, .dev = ca->dev_idx, .data_type = U8_MAX); + return bch2_btree_write_buffer_flush_sync(trans) ?: - for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, - BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ - struct disk_accounting_pos acc; - bpos_to_disk_accounting_pos(&acc, k.k->p); - - acc.type == BCH_DISK_ACCOUNTING_dev_data_type && - acc.dev_data_type.dev == dev - ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) - : 0; - })) ?: - bch2_btree_write_buffer_flush_sync(trans); + commit_do(trans, NULL, NULL, 0, ({ + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_accounting, + disk_accounting_pos_to_bpos(&start), + disk_accounting_pos_to_bpos(&end), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->type != KEY_TYPE_accounting) + continue; + + struct disk_accounting_pos acc; + bpos_to_disk_accounting_pos(&acc, k.k->p); + + const unsigned nr = bch2_accounting_counters(k.k); + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + memcpy_u64s_small(v, bkey_s_c_to_accounting(k).v->d, nr); + + bch2_u64s_neg(v, nr); + + ret = bch2_disk_accounting_mod(trans, &acc, v, nr, false); + if (ret) + break; + } + + ret; + })) ?: bch2_btree_write_buffer_flush_sync(trans); } int bch2_dev_usage_init(struct bch_dev *ca, bool gc) @@ -1074,13 +1106,17 @@ void bch2_verify_accounting_clean(struct bch_fs *c) case BCH_DISK_ACCOUNTING_dev_data_type: { { 
guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */ + const enum bch_data_type data_type = acc_k.dev_data_type.data_type; struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (!ca) continue; - v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); - v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); - v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); + v[0] = percpu_u64_get(&ca->usage->d[data_type].buckets); + v[1] = percpu_u64_get(&ca->usage->d[data_type].sectors); + v[2] = percpu_u64_get(&ca->usage->d[data_type].fragmented); + + if (data_type == BCH_DATA_sb || data_type == BCH_DATA_journal) + base.hidden += a.v->d[0] * ca->mi.bucket_size; } if (memcmp(a.v->d, v, 3 * sizeof(u64))) { @@ -1108,7 +1144,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) mismatch = true; \ } - //check(hidden); + check(hidden); check(btree); check(data); check(cached); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index cc73cce98a44..c0d3d7e8fda6 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -186,11 +186,15 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, break; case BCH_DISK_ACCOUNTING_dev_data_type: { guard(rcu)(); + const enum bch_data_type data_type = acc_k.dev_data_type.data_type; struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); + this_cpu_add(ca->usage->d[data_type].buckets, a.v->d[0]); + this_cpu_add(ca->usage->d[data_type].sectors, a.v->d[1]); + this_cpu_add(ca->usage->d[data_type].fragmented, a.v->d[2]); + + if (data_type == BCH_DATA_sb || data_type == BCH_DATA_journal) + 
trans->fs_usage_delta.hidden += a.v->d[0] * ca->mi.bucket_size; } break; } @@ -212,9 +216,9 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct accounting_mem_entry *e = &acc->k.data[idx]; - EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); + const unsigned nr = min_t(unsigned, bch2_accounting_counters(a.k), e->nr_counters); - for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) + for (unsigned i = 0; i < nr; i++) this_cpu_add(e->v[gc][i], a.v->d[i]); return 0; } @@ -297,7 +301,7 @@ int bch2_gc_accounting_done(struct bch_fs *); int bch2_accounting_read(struct bch_fs *); -int bch2_dev_usage_remove(struct bch_fs *, unsigned); +int bch2_dev_usage_remove(struct bch_fs *, struct bch_dev *); int bch2_dev_usage_init(struct bch_dev *, bool); void bch2_verify_accounting_clean(struct bch_fs *c); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index adc1f9315eab..420f6922dacb 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -345,6 +345,7 @@ x(BCH_ERR_data_read, data_read_no_encryption_key) \ x(BCH_ERR_data_read, data_read_buffer_too_small) \ x(BCH_ERR_data_read, data_read_key_overwritten) \ + x(0, rbio_narrow_crcs_fail) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 9e69263eb796..a16f55d98d97 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -468,10 +468,10 @@ int __bch2_fsck_err(struct bch_fs *c, if ((flags & FSCK_ERR_SILENT) || test_bit(err, c->sb.errors_silent)) { - ret = flags & FSCK_CAN_FIX + set_bit(BCH_FS_errors_fixed_silent, &c->flags); + return flags & FSCK_CAN_FIX ? 
bch_err_throw(c, fsck_fix) : bch_err_throw(c, fsck_ignore); - goto err; } printbuf_indent_add_nextline(out, 2); diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 73eb28090bc7..1279026b4c1e 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -146,6 +146,7 @@ int bch2_extent_trim_atomic(struct btree_trans *trans, if (bpos_ge(bkey_start_pos(k.k), end)) break; + nr_iters += 1; ret = count_iters_for_insert(trans, k, offset, &end, &nr_iters); if (ret) break; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index c0d00a692c18..86aa93ea2345 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1151,7 +1151,7 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke return NULL; } -static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, +static bool want_cached_ptr(struct bch_fs *c, struct bch_inode_opts *opts, struct bch_extent_ptr *ptr) { unsigned target = opts->promote_target ?: opts->foreground_target; @@ -1165,7 +1165,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, } void bch2_extent_ptr_set_cached(struct bch_fs *c, - struct bch_io_opts *opts, + struct bch_inode_opts *opts, struct bkey_s k, struct bch_extent_ptr *ptr) { @@ -1241,7 +1241,7 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) * the promote target. 
*/ bool bch2_extent_normalize_by_opts(struct bch_fs *c, - struct bch_io_opts *opts, + struct bch_inode_opts *opts, struct bkey_s k) { struct bkey_ptrs ptrs; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index f6dcb17108cd..03ea7c689d9a 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -686,10 +686,10 @@ bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); struct bch_extent_ptr * bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); -void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *, +void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_inode_opts *, struct bkey_s, struct bch_extent_ptr *); -bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s); +bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_inode_opts *, struct bkey_s); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 45175a478b92..fe684adca370 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -284,12 +284,12 @@ void bch2_readahead(struct readahead_control *ractl) { struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; struct folio *folio; struct readpages_iter readpages_iter; struct blk_plug plug; - bch2_inode_opts_get(&opts, c, &inode->ei_inode); + struct bch_inode_opts opts; + bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts); int ret = readpages_iter_init(&readpages_iter, ractl); if (ret) @@ -350,7 +350,7 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_read_bio *rbio; - struct bch_io_opts opts; + struct bch_inode_opts opts; 
struct blk_plug plug; int ret; DECLARE_COMPLETION_ONSTACK(done); @@ -361,7 +361,7 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) if (!bch2_folio_create(folio, GFP_KERNEL)) return -ENOMEM; - bch2_inode_opts_get(&opts, c, &inode->ei_inode); + bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), c, @@ -407,7 +407,7 @@ struct bch_writepage_io { struct bch_writepage_state { struct bch_writepage_io *io; - struct bch_io_opts opts; + struct bch_inode_opts opts; struct bch_folio_sector *tmp; unsigned tmp_sectors; struct blk_plug plug; @@ -532,6 +532,39 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } +static bool can_write_now(struct bch_fs *c, unsigned replicas_want, struct closure *cl) +{ + unsigned reserved = OPEN_BUCKETS_COUNT - + (OPEN_BUCKETS_COUNT - bch2_open_buckets_reserved(BCH_WATERMARK_normal)) / 2; + + if (unlikely(c->open_buckets_nr_free <= reserved)) { + closure_wait(&c->open_buckets_wait, cl); + return false; + } + + if (BCH_WATERMARK_normal < c->journal.watermark && !bch2_journal_error(&c->journal)) { + closure_wait(&c->journal.async_wait, cl); + return false; + } + + return true; +} + +static void throttle_writes(struct bch_fs *c, unsigned replicas_want, struct closure *cl) +{ + u64 start = 0; + while (!can_write_now(c, replicas_want, cl)) { + if (!start) + start = local_clock(); + closure_sync(cl); + } + + BUG_ON(closure_nr_remaining(cl) > 1); + + if (start) + bch2_time_stats_update(&c->times[BCH_TIME_blocked_writeback_throttle], start); +} + static int __bch2_writepage(struct folio *folio, struct writeback_control *wbc, void *data) @@ -667,26 +700,25 @@ do_io: return 0; } -static int bch2_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, void *data) -{ - struct folio *folio = NULL; - int error; - - while ((folio = writeback_iter(mapping, wbc, folio, 
&error))) - error = __bch2_writepage(folio, wbc, data); - return error; -} - int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct bch_fs *c = mapping->host->i_sb->s_fs_info; struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL); - bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); + bch2_inode_opts_get_inode(c, &to_bch_ei(mapping->host)->ei_inode, &w->opts); blk_start_plug(&w->plug); - int ret = bch2_write_cache_pages(mapping, wbc, w); + + struct closure cl; + closure_init_stack(&cl); + + struct folio *folio = NULL; + int ret = 0; + + while (throttle_writes(c, w->opts.data_replicas, &cl), + (folio = writeback_iter(mapping, wbc, folio, &ret))) + ret = __bch2_writepage(folio, wbc, w); + if (w->io) bch2_writepage_do_io(w); blk_finish_plug(&w->plug); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 79823234160f..a104b9d70bea 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -68,7 +68,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct file *file = req->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; struct dio_read *dio; struct bio *bio; struct blk_plug plug; @@ -78,7 +77,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) size_t shorten; ssize_t ret; - bch2_inode_opts_get(&opts, c, &inode->ei_inode); + struct bch_inode_opts opts; + bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts); /* bios must be 512 byte aligned: */ if ((offset|iter->count) & (SECTOR_SIZE - 1)) @@ -445,13 +445,13 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) struct kiocb *req = dio->req; struct address_space *mapping = dio->mapping; struct bch_inode_info *inode = dio->inode; - struct bch_io_opts opts; + struct bch_inode_opts opts; struct bio *bio = &dio->op.wbio.bio; unsigned unaligned, 
iter_count; bool sync = dio->sync, dropped_locks; long ret; - bch2_inode_opts_get(&opts, c, &inode->ei_inode); + bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts); while (1) { iter_count = dio->iter.count; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index de0d965f3fde..57e9459afa07 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -627,10 +627,10 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bpos end_pos = POS(inode->v.i_ino, end_sector); - struct bch_io_opts opts; + struct bch_inode_opts opts; int ret = 0; - bch2_inode_opts_get(&opts, c, &inode->ei_inode); + bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts); CLASS(btree_trans, trans)(c); CLASS(btree_iter, iter)(trans, BTREE_ID_extents, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index c7bb5b108e2f..d6a2031e17e8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2147,9 +2147,11 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); int ret = bch2_inode_rm(c, inode_inum(inode)); if (ret && !bch2_err_matches(ret, EROFS)) { - bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu", - inode->ei_inum.subvol, - inode->ei_inum.inum); + CLASS(printbuf, buf)(); + bch2_trans_do(c, bch2_inum_to_path(trans, inode->ei_inum, &buf)); + + bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu\n%s", + inode->ei_inum.subvol, inode->ei_inum.inum, buf.buf); bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm); } @@ -2236,11 +2238,16 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) struct bch_fs *c = sb->s_fs_info; struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); unsigned shift = sb->s_blocksize_bits - 9; + /* - * this assumes inodes take up 64 bytes, which is a decent average + * This assumes inodes take up 64 bytes, which is a decent average * number: + * + * Not anymore - bi_dir, bi_dir_offset came later and shouldn't 
have + * been varint fields: seeing 144-160 byte inodes, so let's call it 256 + * bytes: */ - u64 avail_inodes = ((usage.capacity - usage.used) << 3); + u64 avail_inodes = ((usage.capacity - usage.used) << 1); buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index bba273d55c37..543627fb58be 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -369,9 +369,9 @@ err: } int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans, - u64 inode_nr, u32 snapshot, - struct bch_inode_unpacked *inode, - unsigned flags) + u64 inode_nr, u32 snapshot, + struct bch_inode_unpacked *inode, + unsigned flags) { CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, SPOS(0, inode_nr, snapshot), flags); struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); @@ -673,7 +673,7 @@ static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) { - unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; + unsigned f = bkey_inode_flags(k); return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); } @@ -1223,32 +1223,45 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) return ret; } -void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, - struct bch_inode_unpacked *inode) +void bch2_inode_opts_get_inode(struct bch_fs *c, + struct bch_inode_unpacked *inode, + struct bch_inode_opts *ret) { #define x(_name, _bits) \ if ((inode)->bi_##_name) { \ - opts->_name = inode->bi_##_name - 1; \ - opts->_name##_from_inode = true; \ + ret->_name = inode->bi_##_name - 1; \ + ret->_name##_from_inode = true; \ } else { \ - opts->_name = c->opts._name; \ - opts->_name##_from_inode = false; \ + ret->_name = c->opts._name; \ + ret->_name##_from_inode = false; \ } BCH_INODE_OPTS() #undef x - bch2_io_opts_fixups(opts); + ret->change_cookie = atomic_read(&c->opt_change_cookie); + + bch2_io_opts_fixups(ret); } -int 
bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) +int bch2_inum_snapshot_opts_get(struct btree_trans *trans, + u64 inum, u32 snapshot, + struct bch_inode_opts *opts) { - struct bch_inode_unpacked inode; - int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode)); + if (inum) { + struct bch_inode_unpacked inode; + int ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); + if (ret) + return ret; - if (ret) - return ret; + bch2_inode_opts_get_inode(trans->c, &inode, opts); + } else { + /* + * data_update_index_update may call us for reflink btree extent + * updates, inum will be 0 + */ - bch2_inode_opts_get(opts, trans->c, &inode); + bch2_inode_opts_get(trans->c, opts); + } return 0; } @@ -1346,7 +1359,7 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - return ret ?: bch_err_throw(c, transaction_restart_nested); + return ret; } /* @@ -1385,7 +1398,8 @@ next_parent: int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?: - delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); + delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)) ?: + bch_err_throw(trans->c, transaction_restart_nested); } static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 79092ea74844..63b7088811fb 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -289,9 +289,8 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); -void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, - struct bch_inode_unpacked *); -int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *); +void bch2_inode_opts_get_inode(struct bch_fs *, 
struct bch_inode_unpacked *, struct bch_inode_opts *); +int bch2_inum_snapshot_opts_get(struct btree_trans *, u64, u32, struct bch_inode_opts *); int bch2_inode_set_casefold(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, unsigned); @@ -300,8 +299,8 @@ int bch2_inode_set_casefold(struct btree_trans *, subvol_inum, static inline struct bch_extent_rebalance bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) { - struct bch_io_opts io_opts; - bch2_inode_opts_get(&io_opts, c, inode); + struct bch_inode_opts io_opts; + bch2_inode_opts_get_inode(c, inode, &io_opts); return io_opts_to_rebalance_opts(c, &io_opts); } diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index fa0b06e17d17..04eb5ecd102b 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -24,7 +24,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, subvol_inum inum, struct btree_iter *iter, u64 sectors, - struct bch_io_opts opts, + struct bch_inode_opts opts, s64 *i_sectors_delta, struct write_point_specifier write_point) { @@ -109,7 +109,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, } ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, - 0, i_sectors_delta, true); + 0, i_sectors_delta, true, 0); err: if (!ret && sectors_allocated) bch2_increment_clock(c, sectors_allocated, WRITE); @@ -211,7 +211,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_cut_back(end_pos, &delete); ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, 0, i_sectors_delta, false); + &disk_res, 0, i_sectors_delta, false, 0); bch2_disk_reservation_put(c, &disk_res); } @@ -373,7 +373,6 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, struct btree_iter iter; struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; - struct bch_io_opts opts; u64 dst_offset = le64_to_cpu(op->v.dst_offset); u64 
src_offset = le64_to_cpu(op->v.src_offset); s64 shift = dst_offset - src_offset; @@ -384,10 +383,6 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, bool warn_errors = i_sectors_delta != NULL; int ret = 0; - ret = bch2_inum_opts_get(trans, inum, &opts); - if (ret) - return ret; - /* * check for missing subvolume before fpunch, as in resume we don't want * it to be a fatal error @@ -476,8 +471,7 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?: - bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc); diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h index b93e4d4b3c0c..6a294f2a6dd6 100644 --- a/fs/bcachefs/io_misc.h +++ b/fs/bcachefs/io_misc.h @@ -3,7 +3,7 @@ #define _BCACHEFS_IO_MISC_H int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, - u64, struct bch_io_opts, s64 *, + u64, struct bch_inode_opts, s64 *, struct write_point_specifier); int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 579815c691af..e7ba0d0bf5ef 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -37,12 +37,6 @@ module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); MODULE_PARM_DESC(read_corrupt_ratio, ""); #endif -static bool bch2_poison_extents_on_checksum_error; -module_param_named(poison_extents_on_checksum_error, - bch2_poison_extents_on_checksum_error, bool, 0644); -MODULE_PARM_DESC(poison_extents_on_checksum_error, - "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); - #ifndef 
CONFIG_BCACHEFS_NO_LATENCY_ACCT static inline u32 bch2_dev_congested_read(struct bch_dev *ca, u64 now) @@ -164,7 +158,7 @@ static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, - struct bch_io_opts opts, + struct bch_inode_opts opts, unsigned flags, struct bch_io_failures *failed) { @@ -545,9 +539,6 @@ static void get_rbio_extent(struct btree_trans *trans, static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, enum btree_id btree, struct bkey_s_c read_k) { - if (!bch2_poison_extents_on_checksum_error) - return 0; - struct bch_fs *c = trans->c; struct data_update *u = rbio_data_update(rbio); @@ -749,15 +740,13 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, } static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, - struct bch_read_bio *rbio) + struct bch_read_bio *rbio, + struct bch_extent_crc_unpacked *new_crc) { struct bch_fs *c = rbio->c; u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; int ret = 0; - if (crc_is_compressed(rbio->pick.crc)) - return 0; - CLASS(btree_iter, iter)(trans, rbio->data_btree, rbio->data_pos, BTREE_ITER_intent); struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) @@ -765,21 +754,12 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (bversion_cmp(k.k->bversion, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) - return 0; - - /* Extent was merged? 
*/ - if (bkey_start_offset(k.k) < data_offset || - k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) - return 0; + return bch_err_throw(c, rbio_narrow_crcs_fail); - struct bch_extent_crc_unpacked new_crc; - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(k.k) - data_offset, k.k->size, - rbio->pick.crc.csum_type)) { - bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); - return 0; - } + /* Extent was trimmed/merged? */ + if (!bpos_eq(bkey_start_pos(k.k), rbio->data_pos) || + k.k->p.offset != rbio->data_pos.offset + rbio->pick.crc.live_size) + return bch_err_throw(c, rbio_narrow_crcs_fail); /* * going to be temporarily appending another checksum entry: @@ -791,17 +771,37 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, bkey_reassemble(new, k); - if (!bch2_bkey_narrow_crcs(new, new_crc)) - return 0; + if (!bch2_bkey_narrow_crcs(new, *new_crc)) + return bch_err_throw(c, rbio_narrow_crcs_fail); return bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node); } static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { - CLASS(btree_trans, trans)(rbio->c); - commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_rbio_narrow_crcs(trans, rbio)); + struct bch_fs *c = rbio->c; + + if (crc_is_compressed(rbio->pick.crc)) + return; + + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; + + struct bch_extent_crc_unpacked new_crc; + if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + rbio->data_pos.offset - data_offset, rbio->pick.crc.live_size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + return; + } + + CLASS(btree_trans, trans)(c); + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_rbio_narrow_crcs(trans, rbio, &new_crc)); + if (!ret) + 
count_event(c, io_read_narrow_crcs); + else if (ret == -BCH_ERR_rbio_narrow_crcs_fail) + count_event(c, io_read_narrow_crcs_fail); } static void bch2_read_decompress_err(struct work_struct *work) @@ -1274,6 +1274,10 @@ retry_pick: async_object_list_add(c, rbio, rbio, &rbio->list_idx); + /* XXX: also nvme read recovery level */ + if (unlikely(failed && bch2_dev_io_failures(failed, pick.ptr.dev))) + rbio->bio.bi_opf |= REQ_FUA; + if (rbio->bounce) trace_and_count(c, io_read_bounce, &rbio->bio); diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index 1e1c0476bd03..df4632f6fe9e 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -74,7 +74,7 @@ struct bch_read_bio { struct bpos data_pos; struct bversion version; - struct bch_io_opts opts; + struct bch_inode_opts opts; struct work_struct work; @@ -192,7 +192,7 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, static inline struct bch_read_bio *rbio_init(struct bio *bio, struct bch_fs *c, - struct bch_io_opts opts, + struct bch_inode_opts opts, bio_end_io_t end_io) { struct bch_read_bio *rbio = to_rbio(bio); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index aed22fc7759b..6a5da02ce266 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -205,7 +205,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, struct btree_iter *extent_iter, u64 new_i_size, - s64 i_sectors_delta) + s64 i_sectors_delta, + struct bch_inode_unpacked *inode_u) { /* * Crazy performance optimization: @@ -227,7 +228,13 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, BTREE_ITER_intent| BTREE_ITER_cached); struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); - int ret = bkey_err(k); + + /* + * XXX: we currently need to unpack the inode on every write because we + * need the current io_opts, for transactional consistency - inode_v4? 
+ */ + int ret = bkey_err(k) ?: + bch2_inode_unpack(k, inode_u); if (unlikely(ret)) return ret; @@ -303,8 +310,10 @@ int bch2_extent_update(struct btree_trans *trans, struct disk_reservation *disk_res, u64 new_i_size, s64 *i_sectors_delta_total, - bool check_enospc) + bool check_enospc, + u32 change_cookie) { + struct bch_fs *c = trans->c; struct bpos next_pos; bool usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; @@ -335,7 +344,7 @@ int bch2_extent_update(struct btree_trans *trans, if (disk_res && disk_sectors_delta > (s64) disk_res->sectors) { - ret = bch2_disk_reservation_add(trans->c, disk_res, + ret = bch2_disk_reservation_add(c, disk_res, disk_sectors_delta - disk_res->sectors, !check_enospc || !usage_increasing ? BCH_DISK_RESERVATION_NOFAIL : 0); @@ -349,9 +358,16 @@ int bch2_extent_update(struct btree_trans *trans, * aren't changing - for fsync to work properly; fsync relies on * inode->bi_journal_seq which is updated by the trigger code: */ + struct bch_inode_unpacked inode; + struct bch_inode_opts opts; + ret = bch2_extent_update_i_size_sectors(trans, iter, min(k->k.p.offset << 9, new_i_size), - i_sectors_delta) ?: + i_sectors_delta, &inode) ?: + (bch2_inode_opts_get_inode(c, &inode, &opts), + bch2_bkey_set_needs_rebalance(c, &opts, k, + SET_NEEDS_REBALANCE_foreground, + change_cookie)) ?: bch2_trans_update(trans, iter, k, 0) ?: bch2_trans_commit(trans, disk_res, NULL, BCH_TRANS_COMMIT_no_check_rw| @@ -402,7 +418,8 @@ static int bch2_write_index_default(struct bch_write_op *op) ret = bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_check_enospc); + op->flags & BCH_WRITE_check_enospc, + op->opts.change_cookie); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -792,10 +809,6 @@ static void init_append_extent(struct bch_write_op *op, bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, op->flags & BCH_WRITE_cached); - - if 
(!(op->flags & BCH_WRITE_move)) - bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i); - bch2_keylist_push(&op->insert_keys); } @@ -1225,6 +1238,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return 0; } + struct bch_fs *c = trans->c; struct bkey_i *new = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + sizeof(struct bch_extent_rebalance)); int ret = PTR_ERR_OR_ZERO(new); @@ -1239,8 +1253,6 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, bkey_for_each_ptr(ptrs, ptr) ptr->unwritten = 0; - bch2_bkey_set_needs_rebalance(op->c, &op->opts, new); - /* * Note that we're not calling bch2_subvol_get_snapshot() in this path - * that was done when we kicked off the write, and here it's important @@ -1248,8 +1260,20 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, * since been created. The write is still outstanding, so we're ok * w.r.t. snapshot atomicity: */ + + /* + * For transactional consistency, set_needs_rebalance() has to be called + * with the io_opts from the btree in the same transaction: + */ + struct bch_inode_unpacked inode; + struct bch_inode_opts opts; + return bch2_extent_update_i_size_sectors(trans, iter, - min(new->k.p.offset << 9, new_i_size), 0) ?: + min(new->k.p.offset << 9, new_i_size), 0, &inode) ?: + (bch2_inode_opts_get_inode(c, &inode, &opts), + bch2_bkey_set_needs_rebalance(c, &opts, new, + SET_NEEDS_REBALANCE_foreground, + op->opts.change_cookie)) ?: bch2_trans_update(trans, iter, new, BTREE_UPDATE_internal_snapshot_node); } diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 2c0a8f35ee1f..692529bf401d 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -28,10 +28,10 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, struct bkey_i *, bool *, s64 *, s64 *); int bch2_extent_update(struct btree_trans *, subvol_inum, struct btree_iter *, struct bkey_i *, - struct disk_reservation *, u64, 
s64 *, bool); + struct disk_reservation *, u64, s64 *, bool, u32); static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct bch_io_opts opts) + struct bch_inode_opts opts) { op->c = c; op->end_io = NULL; diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index 5da4eb8bb6f6..ab36b03e0a46 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -90,7 +90,7 @@ struct bch_write_op { struct bch_devs_list devs_have; u16 target; u16 nonce; - struct bch_io_opts opts; + struct bch_inode_opts opts; u32 subvol; struct bpos pos; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index b9c0834498dd..c533b60706bf 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -51,25 +51,17 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, : 0; } -int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) +static int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) { - return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted); -} - -int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) -{ - return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); + return __bch2_lru_set(trans, lru_id, dev_bucket, time, true); } int __bch2_lru_change(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 old_time, u64 new_time) { - if (old_time == new_time) - return 0; - - return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?: - bch2_lru_set(trans, lru_id, dev_bucket, new_time); + return __bch2_lru_set(trans, lru_id, dev_bucket, old_time, false) ?: + __bch2_lru_set(trans, lru_id, dev_bucket, new_time, true); } static const char * const bch2_lru_types[] = { @@ -87,7 +79,6 @@ int bch2_lru_check_set(struct btree_trans *trans, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - CLASS(printbuf, buf)(); CLASS(btree_iter, lru_iter)(trans, BTREE_ID_lru, lru_pos(lru_id, dev_bucket, time), 
0); struct bkey_s_c lru_k = bch2_btree_iter_peek_slot(&lru_iter); int ret = bkey_err(lru_k); @@ -99,10 +90,13 @@ int bch2_lru_check_set(struct btree_trans *trans, if (ret) return ret; - if (fsck_err(trans, alloc_key_to_missing_lru_entry, - "missing %s lru entry\n%s", - bch2_lru_types[lru_type(lru_k)], - (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { + CLASS(printbuf, buf)(); + prt_printf(&buf, "missing %s lru entry at pos ", bch2_lru_types[lru_type(lru_k)]); + bch2_bpos_to_text(&buf, lru_iter.pos); + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, referring_k); + + if (fsck_err(trans, alloc_key_to_missing_lru_entry, "%s", buf.buf)) { ret = bch2_lru_set(trans, lru_id, dev_bucket, time); if (ret) return ret; @@ -127,6 +121,23 @@ static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) } } +int bch2_dev_remove_lrus(struct bch_fs *c, struct bch_dev *ca) +{ + CLASS(btree_trans, trans)(c); + int ret = bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key(trans, iter, + BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, ({ + struct bbpos bp = lru_pos_to_bp(k); + + bp.btree == BTREE_ID_alloc && bp.pos.inode == ca->dev_idx + ? 
(bch2_btree_delete_at(trans, &iter, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0)) + : 0; + })); + bch_err_fn(c, ret); + return ret; +} + static u64 bkey_lru_type_idx(struct bch_fs *c, enum bch_lru_type type, struct bkey_s_c k) diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 6f1e0a7b5db5..d5a2620f2507 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -59,8 +59,6 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos); .min_val_size = 8, \ }) -int bch2_lru_del(struct btree_trans *, u16, u64, u64); -int bch2_lru_set(struct btree_trans *, u16, u64, u64); int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); static inline int bch2_lru_change(struct btree_trans *trans, @@ -72,9 +70,10 @@ static inline int bch2_lru_change(struct btree_trans *trans, : 0; } +int bch2_dev_remove_lrus(struct bch_fs *, struct bch_dev *); + struct bkey_buf; int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *); - int bch2_check_lrus(struct bch_fs *); #endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 5b4c3f4b1c25..8a3981e1016e 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -126,8 +126,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, { CLASS(btree_trans, trans)(c); + /* FIXME: this does not handle unknown btrees with data pointers */ for (unsigned id = 0; id < BTREE_ID_NR; id++) { - if (!btree_type_has_ptrs(id)) + if (!btree_type_has_data_ptrs(id)) continue; /* Stripe keys have pointers, but are handled separately */ @@ -167,7 +168,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, bch2_bkey_buf_init(&k); closure_init_stack(&cl); - for (id = 0; id < BTREE_ID_NR; id++) { + for (id = 0; id < btree_id_nr_alive(c); id++) { bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_prefetch); retry: diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index c46a8965a7eb..9a440d3f7180 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c 
@@ -46,12 +46,12 @@ struct evacuate_bucket_arg { static bool evacuate_bucket_pred(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, - struct bch_io_opts *, + struct bch_inode_opts *, struct data_update_opts *); static noinline void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { CLASS(printbuf, buf)(); @@ -72,7 +72,7 @@ static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) static noinline void trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts, move_pred_fn pred, void *_arg, bool p) { @@ -327,7 +327,7 @@ int bch2_move_extent(struct moving_context *ctxt, struct move_bucket *bucket_in_flight, struct btree_iter *iter, struct bkey_s_c k, - struct bch_io_opts io_opts, + struct bch_inode_opts io_opts, struct data_update_opts data_opts) { struct btree_trans *trans = ctxt->trans; @@ -451,93 +451,6 @@ err: return ret; } -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, - struct per_snapshot_io_opts *io_opts, - struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ - struct btree_iter *extent_iter, - struct bkey_s_c extent_k) -{ - struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; - struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; - int ret = 0; - - if (btree_iter_path(trans, extent_iter)->level) - return opts_ret; - - if (extent_k.k->type == KEY_TYPE_reflink_v) - goto out; - - if (io_opts->cur_inum != extent_pos.inode) { - io_opts->d.nr = 0; - - ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), - BTREE_ITER_all_snapshots, k, ({ - if (k.k->p.offset != extent_pos.inode) - break; - - if (!bkey_is_inode(k.k)) - continue; - - struct bch_inode_unpacked inode; - _ret3 = bch2_inode_unpack(k, &inode); - if (_ret3) - break; - - struct 
snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; - bch2_inode_opts_get(&e.io_opts, trans->c, &inode); - - darray_push(&io_opts->d, e); - })); - io_opts->cur_inum = extent_pos.inode; - } - - ret = ret ?: trans_was_restarted(trans, restart_count); - if (ret) - return ERR_PTR(ret); - - if (extent_k.k->p.snapshot) - darray_for_each(io_opts->d, i) - if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) { - opts_ret = &i->io_opts; - break; - } -out: - ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); - if (ret) - return ERR_PTR(ret); - return opts_ret; -} - -int bch2_move_get_io_opts_one(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *extent_iter, - struct bkey_s_c extent_k) -{ - struct bch_fs *c = trans->c; - - *io_opts = bch2_opts_to_inode_opts(c->opts); - - /* reflink btree? */ - if (extent_k.k->p.inode) { - CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes, - SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), - BTREE_ITER_cached); - struct bkey_s_c inode_k = bch2_btree_iter_peek_slot(&inode_iter); - int ret = bkey_err(inode_k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - - if (!ret && bkey_is_inode(inode_k.k)) { - struct bch_inode_unpacked inode; - bch2_inode_unpack(inode_k, &inode); - bch2_inode_opts_get(io_opts, c, &inode); - } - } - - return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); -} - int bch2_move_ratelimit(struct moving_context *ctxt) { struct bch_fs *c = ctxt->trans->c; @@ -582,37 +495,6 @@ int bch2_move_ratelimit(struct moving_context *ctxt) return 0; } -/* - * Move requires non extents iterators, and there's also no need for it to - * signal indirect_extent_missing_error: - */ -static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_reflink_p p) -{ - if (unlikely(REFLINK_P_ERROR(p.v))) - return bkey_s_c_null; - - struct bpos 
reflink_pos = POS(0, REFLINK_P_IDX(p.v)); - - bch2_trans_iter_init(trans, iter, - BTREE_ID_reflink, reflink_pos, - BTREE_ITER_not_extents); - - struct bkey_s_c k = bch2_btree_iter_peek(iter); - if (!k.k || bkey_err(k)) { - bch2_trans_iter_exit(iter); - return k; - } - - if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) { - bch2_trans_iter_exit(iter); - return bkey_s_c_null; - } - - return k; -} - int bch2_move_data_btree(struct moving_context *ctxt, struct bpos start, struct bpos end, @@ -622,17 +504,11 @@ int bch2_move_data_btree(struct moving_context *ctxt, struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct per_snapshot_io_opts snapshot_io_opts; - struct bch_io_opts *io_opts; + struct bch_inode_opts *io_opts; struct bkey_buf sk; struct btree_iter iter, reflink_iter = {}; struct bkey_s_c k; struct data_update_opts data_opts; - /* - * If we're moving a single file, also process reflinked data it points - * to (this includes propagating changed io_opts from the inode to the - * extent): - */ - bool walk_indirect = start.inode == end.inode; int ret = 0, ret2; per_snapshot_io_opts_init(&snapshot_io_opts, c); @@ -697,8 +573,6 @@ root_err: bch2_ratelimit_reset(ctxt->rate); while (!bch2_move_ratelimit(ctxt)) { - struct btree_iter *extent_iter = &iter; - bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); @@ -717,41 +591,18 @@ root_err: if (ctxt->stats) ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - if (walk_indirect && - k.k->type == KEY_TYPE_reflink_p && - REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - - bch2_trans_iter_exit(&reflink_iter); - k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (!k.k) - goto next_nondata; - - /* - * XXX: reflink pointers may point to multiple indirect - * extents, so don't advance past the entire 
reflink - * pointer - need to fixup iter->k - */ - extent_iter = &reflink_iter; - } - if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, - iter.pos, extent_iter, k); + io_opts = bch2_extent_get_apply_io_opts(trans, &snapshot_io_opts, + iter.pos, &iter, k, + SET_NEEDS_REBALANCE_other); ret = PTR_ERR_OR_ZERO(io_opts); if (ret) continue; memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) + if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) goto next; /* @@ -762,7 +613,7 @@ root_err: k = bkey_i_to_s_c(sk.k); if (!level) - ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); else if (!data_opts.scrub) ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, k.k->p, data_opts.target, 0); @@ -824,7 +675,7 @@ static int bch2_move_data(struct bch_fs *c, unsigned min_depth_this_btree = min_depth; /* Stripe keys have pointers, but are handled separately */ - if (!btree_type_has_ptrs(id) || + if (!btree_type_has_data_ptrs(id) || id == BTREE_ID_stripes) min_depth_this_btree = max(min_depth_this_btree, 1); @@ -859,7 +710,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; bool is_kthread = current->flags & PF_KTHREAD; - struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_iter iter = {}; struct bkey_buf sk; struct bkey_s_c k; @@ -867,6 +717,9 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, u64 check_mismatch_done = bucket_start; int ret = 0; + struct bch_inode_opts io_opts; + bch2_inode_opts_get(c, &io_opts); + /* Userspace might have supplied @dev: */ CLASS(bch2_dev_tryget_noerror, ca)(c, dev); if (!ca) @@ -942,7 +795,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, goto next; if (!bp.v->level) { - ret = 
bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); + ret = bch2_extent_get_apply_io_opts_one(trans, &io_opts, &iter, k, + SET_NEEDS_REBALANCE_other); if (ret) { bch2_trans_iter_exit(&iter); continue; @@ -1039,7 +893,7 @@ int bch2_move_data_phys(struct bch_fs *c, static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { struct evacuate_bucket_arg *arg = _arg; @@ -1080,7 +934,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, } typedef bool (*move_btree_pred)(struct bch_fs *, void *, - struct btree *, struct bch_io_opts *, + struct btree *, struct bch_inode_opts *, struct data_update_opts *); static int bch2_move_btree(struct bch_fs *c, @@ -1090,7 +944,6 @@ static int bch2_move_btree(struct bch_fs *c, struct bch_move_stats *stats) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct moving_context ctxt; struct btree_trans *trans; struct btree_iter iter; @@ -1099,6 +952,9 @@ static int bch2_move_btree(struct bch_fs *c, struct data_update_opts data_opts; int ret = 0; + struct bch_inode_opts io_opts; + bch2_inode_opts_get(c, &io_opts); + bch2_moving_ctxt_init(&ctxt, c, NULL, stats, writepoint_ptr(&c->btree_write_point), true); @@ -1159,7 +1015,7 @@ next: static bool rereplicate_pred(struct bch_fs *c, void *arg, enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { unsigned nr_good = bch2_bkey_durability(c, k); @@ -1190,7 +1046,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, static bool migrate_pred(struct bch_fs *c, void *arg, enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -1227,7 +1083,7 @@ static 
bool bformat_needs_redo(struct bkey_format *f) static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, struct btree *b, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { if (b->version_ondisk != c->sb.version || @@ -1264,7 +1120,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { unsigned durability = bch2_bkey_durability(c, k); @@ -1302,7 +1158,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, static bool scrub_pred(struct bch_fs *c, void *_arg, enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { struct bch_ioctl_data *arg = _arg; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 481026ff99ab..754b0ad45950 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -73,7 +73,7 @@ do { \ } while (1) typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, - struct bch_io_opts *, struct data_update_opts *); + struct bch_inode_opts *, struct data_update_opts *); extern const char * const bch2_data_ops_strs[]; @@ -87,45 +87,15 @@ void bch2_moving_ctxt_flush_all(struct moving_context *); void bch2_move_ctxt_wait_for_io(struct moving_context *); int bch2_move_ratelimit(struct moving_context *); -/* Inodes in different snapshots may have different IO options: */ -struct snapshot_io_opts_entry { - u32 snapshot; - struct bch_io_opts io_opts; -}; - -struct per_snapshot_io_opts { - u64 cur_inum; - struct bch_io_opts fs_io_opts; - DARRAY(struct snapshot_io_opts_entry) d; -}; - -static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c) -{ - memset(io_opts, 0, sizeof(*io_opts)); - 
io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts); -} - -static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts) -{ - darray_exit(&io_opts->d); -} - -int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, - struct btree_iter *, struct bkey_s_c); - int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); int bch2_move_extent(struct moving_context *, struct move_bucket *, struct btree_iter *, struct bkey_s_c, - struct bch_io_opts, + struct bch_inode_opts, struct data_update_opts); -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, - struct per_snapshot_io_opts *, struct bpos, - struct btree_iter *, struct bkey_s_c); - int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, move_pred_fn, void *, enum btree_id, unsigned); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index c3ef35dc01e2..122bc98e4cbb 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -518,7 +518,7 @@ void bch2_opts_to_text(struct printbuf *out, } } -int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v) +int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, u64 v) { int ret = 0; @@ -531,6 +531,8 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id case Opt_compression: case Opt_background_compression: ret = bch2_check_set_has_compressed_data(c, v); + if (ret) + return ret; break; case Opt_erasure_code: if (v) @@ -546,7 +548,7 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id int bch2_opts_hooks_pre_set(struct bch_fs *c) { for (unsigned i = 0; i < bch2_opts_nr; i++) { - int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); + int ret = bch2_opt_hook_pre_set(c, NULL, 0, i, bch2_opt_get_by_id(&c->opts, i)); if (ret) return ret; } @@ -555,26 +557,15 @@ int bch2_opts_hooks_pre_set(struct bch_fs *c) } void bch2_opt_hook_post_set(struct 
bch_fs *c, struct bch_dev *ca, u64 inum, - struct bch_opts *new_opts, enum bch_opt_id id) + enum bch_opt_id id, u64 v) { switch (id) { case Opt_foreground_target: - if (new_opts->foreground_target && - !new_opts->background_target) - bch2_set_rebalance_needs_scan(c, inum); - break; case Opt_compression: - if (new_opts->compression && - !new_opts->background_compression) - bch2_set_rebalance_needs_scan(c, inum); - break; case Opt_background_target: - if (new_opts->background_target) - bch2_set_rebalance_needs_scan(c, inum); - break; case Opt_background_compression: - if (new_opts->background_compression) - bch2_set_rebalance_needs_scan(c, inum); + bch2_set_rebalance_needs_scan(c, inum); + bch2_rebalance_wakeup(c); break; case Opt_rebalance_enabled: bch2_rebalance_wakeup(c); @@ -600,12 +591,14 @@ void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, * upgrades at runtime as well, but right now there's nothing * that does that: */ - if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible) + if (v == BCH_VERSION_UPGRADE_incompatible) bch2_sb_upgrade_incompat(c); break; default: break; } + + atomic_inc(&c->opt_change_cookie); } int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, @@ -802,16 +795,17 @@ bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, /* io opts: */ -struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) +void bch2_inode_opts_get(struct bch_fs *c, struct bch_inode_opts *ret) { - struct bch_io_opts opts = { -#define x(_name, _bits) ._name = src._name, + memset(ret, 0, sizeof(*ret)); + +#define x(_name, _bits) ret->_name = c->opts._name, BCH_INODE_OPTS() #undef x - }; - bch2_io_opts_fixups(&opts); - return opts; + ret->change_cookie = atomic_read(&c->opt_change_cookie); + + bch2_io_opts_fixups(ret); } bool bch2_opt_is_inode_opt(enum bch_opt_id id) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index f8828f4699c7..22cf109fb9c9 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ 
-658,10 +658,9 @@ void bch2_opts_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, unsigned, unsigned, unsigned); -int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64); +int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64); int bch2_opts_hooks_pre_set(struct bch_fs *); -void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, - struct bch_opts *, enum bch_opt_id); +void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64); int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, struct printbuf *, const char *, const char *); @@ -670,16 +669,19 @@ int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, /* inode opts: */ -struct bch_io_opts { +struct bch_inode_opts { #define x(_name, _bits) u##_bits _name; BCH_INODE_OPTS() #undef x + #define x(_name, _bits) u64 _name##_from_inode:1; BCH_INODE_OPTS() #undef x + + u32 change_cookie; }; -static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) +static inline void bch2_io_opts_fixups(struct bch_inode_opts *opts) { if (!opts->background_target) opts->background_target = opts->foreground_target; @@ -692,7 +694,7 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) } } -struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); +void bch2_inode_opts_get(struct bch_fs *, struct bch_inode_opts *); bool bch2_opt_is_inode_opt(enum bch_opt_id); #endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c index 792fc6fef270..541ee951d1c9 100644 --- a/fs/bcachefs/progress.c +++ b/fs/bcachefs/progress.c @@ -12,7 +12,7 @@ void bch2_progress_init(struct progress_indicator_state *s, s->next_print = jiffies + HZ * 10; - for (unsigned i = 0; i < BTREE_ID_NR; i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { if (!(btree_id_mask & BIT_ULL(i))) continue; diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 
35aff96bf12a..fa73de7890da 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -93,7 +93,7 @@ void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, } static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, - struct bch_io_opts *opts, + struct bch_inode_opts *opts, struct bkey_s_c k, struct bkey_ptrs_c ptrs) { @@ -120,7 +120,7 @@ static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, } static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, - struct bch_io_opts *opts, + struct bch_inode_opts *opts, struct bkey_ptrs_c ptrs) { if (!opts->background_target || @@ -141,7 +141,7 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, } static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_io_opts *opts, + struct bch_inode_opts *opts, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -194,7 +194,7 @@ incompressible: return sectors; } -static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, +static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_inode_opts *opts, struct bkey_s_c k) { if (!bkey_extent_is_direct_data(k.k)) @@ -210,8 +210,10 @@ static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opt } } -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_i *_k) +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts, + struct bkey_i *_k, + enum set_needs_rebalance_ctx ctx, + u32 change_cookie) { if (!bkey_extent_is_direct_data(&_k->k)) return 0; @@ -235,10 +237,11 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, return 0; } -int bch2_get_update_rebalance_opts(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *iter, - struct bkey_s_c k) +static int bch2_get_update_rebalance_opts(struct btree_trans *trans, + struct bch_inode_opts *io_opts, + 
struct btree_iter *iter, + struct bkey_s_c k, + enum set_needs_rebalance_ctx ctx) { BUG_ON(iter->flags & BTREE_ITER_is_extents); BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); @@ -267,10 +270,121 @@ int bch2_get_update_rebalance_opts(struct btree_trans *trans, /* On successfull transaction commit, @k was invalidated: */ - return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: + return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n, ctx, 0) ?: bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, 0) ?: - bch_err_throw(trans->c, transaction_restart_nested); + bch_err_throw(trans->c, transaction_restart_commit); +} + +static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *io_opts, + struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ + struct btree_iter *extent_iter, + struct bkey_s_c extent_k) +{ + struct bch_fs *c = trans->c; + u32 restart_count = trans->restart_count; + int ret = 0; + + if (btree_iter_path(trans, extent_iter)->level) + return &io_opts->fs_io_opts; + + if (extent_k.k->type == KEY_TYPE_reflink_v) + return &io_opts->fs_io_opts; + + if (io_opts->cur_inum != extent_pos.inode) { + io_opts->d.nr = 0; + + ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), + BTREE_ITER_all_snapshots, k, ({ + if (k.k->p.offset != extent_pos.inode) + break; + + if (!bkey_is_inode(k.k)) + continue; + + struct bch_inode_unpacked inode; + _ret3 = bch2_inode_unpack(k, &inode); + if (_ret3) + break; + + struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; + bch2_inode_opts_get_inode(c, &inode, &e.io_opts); + + darray_push(&io_opts->d, e); + })); + io_opts->cur_inum = extent_pos.inode; + } + + ret = ret ?: trans_was_restarted(trans, restart_count); + if (ret) + return ERR_PTR(ret); + + if (extent_k.k->p.snapshot) + darray_for_each(io_opts->d, i) + if (bch2_snapshot_is_ancestor(c, 
extent_k.k->p.snapshot, i->snapshot)) + return &i->io_opts; + + return &io_opts->fs_io_opts; +} + +struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, + struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ + struct btree_iter *extent_iter, + struct bkey_s_c extent_k, + enum set_needs_rebalance_ctx ctx) +{ + struct bch_inode_opts *opts = + bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k); + if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level) + return opts; + + int ret = bch2_get_update_rebalance_opts(trans, opts, extent_iter, extent_k, ctx); + return ret ? ERR_PTR(ret) : opts; +} + +int bch2_extent_get_io_opts_one(struct btree_trans *trans, + struct bch_inode_opts *io_opts, + struct btree_iter *extent_iter, + struct bkey_s_c extent_k, + enum set_needs_rebalance_ctx ctx) +{ + struct bch_fs *c = trans->c; + + bch2_inode_opts_get(c, io_opts); + + /* reflink btree? 
*/ + if (extent_k.k->p.inode) { + CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes, + SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), + BTREE_ITER_cached); + struct bkey_s_c inode_k = bch2_btree_iter_peek_slot(&inode_iter); + int ret = bkey_err(inode_k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + + if (!ret && bkey_is_inode(inode_k.k)) { + struct bch_inode_unpacked inode; + bch2_inode_unpack(inode_k, &inode); + bch2_inode_opts_get_inode(c, &inode, io_opts); + } + } + + return 0; +} + +int bch2_extent_get_apply_io_opts_one(struct btree_trans *trans, + struct bch_inode_opts *io_opts, + struct btree_iter *extent_iter, + struct bkey_s_c extent_k, + enum set_needs_rebalance_ctx ctx) +{ + int ret = bch2_extent_get_io_opts_one(trans, io_opts, extent_iter, extent_k, ctx); + if (ret || btree_iter_path(trans, extent_iter)->level) + return ret; + + return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k, ctx); } #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) @@ -403,9 +517,10 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, } static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, struct bpos work_pos, struct btree_iter *extent_iter, - struct bch_io_opts *io_opts, + struct bch_inode_opts **opts_ret, struct data_update_opts *data_opts) { struct bch_fs *c = trans->c; @@ -419,13 +534,19 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, if (bkey_err(k)) return k; - int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k); + struct bch_inode_opts *opts = + bch2_extent_get_apply_io_opts(trans, snapshot_io_opts, + extent_iter->pos, extent_iter, k, + SET_NEEDS_REBALANCE_other); + int ret = PTR_ERR_OR_ZERO(opts); if (ret) return bkey_s_c_err(ret); + *opts_ret = opts; + memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - 
data_opts->target = io_opts->background_target; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k); + data_opts->target = opts->background_target; data_opts->write_flags |= BCH_WRITE_only_specified_devs; if (!data_opts->rewrite_ptrs) { @@ -450,19 +571,19 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs); + unsigned p = bch2_bkey_ptrs_need_compress(c, opts, k, ptrs); if (p) { prt_str(&buf, "compression="); - bch2_compression_opt_to_text(&buf, io_opts->background_compression); + bch2_compression_opt_to_text(&buf, opts->background_compression); prt_str(&buf, " "); bch2_prt_u64_base2(&buf, p); prt_newline(&buf); } - p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs); + p = bch2_bkey_ptrs_need_move(c, opts, ptrs); if (p) { prt_str(&buf, "move="); - bch2_target_to_text(&buf, c, io_opts->background_target); + bch2_target_to_text(&buf, c, opts->background_target); prt_str(&buf, " "); bch2_prt_u64_base2(&buf, p); prt_newline(&buf); @@ -477,6 +598,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, noinline_for_stack static int do_rebalance_extent(struct moving_context *ctxt, + struct per_snapshot_io_opts *snapshot_io_opts, struct bpos work_pos, struct btree_iter *extent_iter) { @@ -484,7 +606,7 @@ static int do_rebalance_extent(struct moving_context *ctxt, struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &trans->c->rebalance; struct data_update_opts data_opts; - struct bch_io_opts io_opts; + struct bch_inode_opts *io_opts; struct bkey_s_c k; struct bkey_buf sk; int ret; @@ -495,8 +617,8 @@ static int do_rebalance_extent(struct moving_context *ctxt, bch2_bkey_buf_init(&sk); ret = lockrestart_do(trans, - bkey_err(k = next_rebalance_extent(trans, work_pos, - extent_iter, &io_opts, &data_opts))); + bkey_err(k = next_rebalance_extent(trans, snapshot_io_opts, + work_pos, extent_iter, &io_opts, 
&data_opts))); if (ret || !k.k) goto out; @@ -509,7 +631,7 @@ static int do_rebalance_extent(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts); + ret = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); if (ret) { if (bch2_err_matches(ret, ENOMEM)) { /* memory allocation failure, wait for some IO to finish */ @@ -528,7 +650,31 @@ out: return ret; } +static int do_rebalance_scan_indirect(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + struct bch_inode_opts *opts) +{ + u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); + u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); + u32 restart_count = trans->restart_count; + + int ret = for_each_btree_key(trans, iter, BTREE_ID_reflink, + POS(0, idx), BTREE_ITER_not_extents, k, ({ + if (bpos_ge(bkey_start_pos(k.k), POS(0, end))) + break; + bch2_get_update_rebalance_opts(trans, opts, &iter, k, + SET_NEEDS_REBALANCE_opt_change_indirect); + })); + if (ret) + return ret; + + /* suppress trans_was_restarted() check */ + trans->restart_count = restart_count; + return 0; +} + static int do_rebalance_scan(struct moving_context *ctxt, + struct per_snapshot_io_opts *snapshot_io_opts, u64 inum, u64 cookie, u64 *sectors_scanned) { struct btree_trans *trans = ctxt->trans; @@ -548,32 +694,33 @@ static int do_rebalance_scan(struct moving_context *ctxt, r->state = BCH_REBALANCE_scanning; - struct per_snapshot_io_opts snapshot_io_opts; - per_snapshot_io_opts_init(&snapshot_io_opts, c); - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, r->scan_start.pos, r->scan_end.pos, BTREE_ITER_all_snapshots| BTREE_ITER_prefetch, k, ({ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, - &snapshot_io_opts, iter.pos, &iter, k); - PTR_ERR_OR_ZERO(io_opts); + struct bch_inode_opts *opts = 
bch2_extent_get_apply_io_opts(trans, + snapshot_io_opts, iter.pos, &iter, k, + SET_NEEDS_REBALANCE_opt_change); + PTR_ERR_OR_ZERO(opts) ?: + (inum && + k.k->type == KEY_TYPE_reflink_p && + REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v) + ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts) + : 0); })) ?: commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_clear_rebalance_needs_scan(trans, inum, cookie)); - per_snapshot_io_opts_exit(&snapshot_io_opts); *sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen); - bch2_move_stats_exit(&r->scan_stats, c); - /* * Ensure that the rebalance_work entries we created are seen by the * next iteration of do_rebalance(), so we don't end up stuck in * rebalance_wait(): */ *sectors_scanned += 1; + bch2_move_stats_exit(&r->scan_stats, c); bch2_btree_write_buffer_flush_sync(trans); @@ -625,6 +772,9 @@ static int do_rebalance(struct moving_context *ctxt) bch2_move_stats_init(&r->work_stats, "rebalance_work"); + struct per_snapshot_io_opts snapshot_io_opts; + per_snapshot_io_opts_init(&snapshot_io_opts, c); + while (!bch2_move_ratelimit(ctxt)) { if (!bch2_rebalance_enabled(c)) { bch2_moving_ctxt_flush_all(ctxt); @@ -639,15 +789,18 @@ static int do_rebalance(struct moving_context *ctxt) break; ret = k->k.type == KEY_TYPE_cookie - ? do_rebalance_scan(ctxt, k->k.p.inode, + ? 
do_rebalance_scan(ctxt, &snapshot_io_opts, + k->k.p.inode, le64_to_cpu(bkey_i_to_cookie(k)->v.cookie), &sectors_scanned) - : do_rebalance_extent(ctxt, k->k.p, &extent_iter); + : do_rebalance_extent(ctxt, &snapshot_io_opts, + k->k.p, &extent_iter); if (ret) break; } bch2_trans_iter_exit(&extent_iter); + per_snapshot_io_opts_exit(&snapshot_io_opts); bch2_move_stats_exit(&r->work_stats, c); if (!ret && diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 4a8812a65c61..bff91aa0102e 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -8,7 +8,7 @@ #include "rebalance_types.h" static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, - struct bch_io_opts *opts) + struct bch_inode_opts *opts) { struct bch_extent_rebalance r = { .type = BIT(BCH_EXTENT_ENTRY_rebalance), @@ -30,11 +30,51 @@ void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *, const struct bch_extent_rebalance *); u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); -int bch2_get_update_rebalance_opts(struct btree_trans *, - struct bch_io_opts *, - struct btree_iter *, - struct bkey_s_c); + +enum set_needs_rebalance_ctx { + SET_NEEDS_REBALANCE_opt_change, + SET_NEEDS_REBALANCE_opt_change_indirect, + SET_NEEDS_REBALANCE_foreground, + SET_NEEDS_REBALANCE_other, +}; + +int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *, + struct bkey_i *, enum set_needs_rebalance_ctx, u32); + +/* Inodes in different snapshots may have different IO options: */ +struct snapshot_io_opts_entry { + u32 snapshot; + struct bch_inode_opts io_opts; +}; + +struct per_snapshot_io_opts { + u64 cur_inum; + struct bch_inode_opts fs_io_opts; + DARRAY(struct snapshot_io_opts_entry) d; +}; + +static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c) +{ + memset(io_opts, 0, sizeof(*io_opts)); + 
bch2_inode_opts_get(c, &io_opts->fs_io_opts); +} + +static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts) +{ + darray_exit(&io_opts->d); +} + +struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *, + struct per_snapshot_io_opts *, struct bpos, + struct btree_iter *, struct bkey_s_c, + enum set_needs_rebalance_ctx); + +int bch2_extent_get_io_opts_one(struct btree_trans *, struct bch_inode_opts *, + struct btree_iter *, struct bkey_s_c, + enum set_needs_rebalance_ctx); +int bch2_extent_get_apply_io_opts_one(struct btree_trans *, struct bch_inode_opts *, + struct btree_iter *, struct bkey_s_c, + enum set_needs_rebalance_ctx); int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 8679c8aad0e7..531c2ef128ae 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -837,33 +837,39 @@ use_clean: bch2_async_btree_node_rewrites_flush(c); /* fsync if we fixed errors */ - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bool errors_fixed = test_bit(BCH_FS_errors_fixed, &c->flags) || + test_bit(BCH_FS_errors_fixed_silent, &c->flags); + + if (errors_fixed) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_meta(&c->journal); } /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - test_bit(BCH_FS_errors_fixed, &c->flags) && + errors_fixed && !test_bit(BCH_FS_errors_not_fixed, &c->flags) && !test_bit(BCH_FS_error, &c->flags)) { bch2_flush_fsck_errs(c); bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); + errors_fixed = test_bit(BCH_FS_errors_fixed, &c->flags); clear_bit(BCH_FS_errors_fixed, &c->flags); + clear_bit(BCH_FS_errors_fixed_silent, &c->flags); ret = bch2_run_recovery_passes(c, BCH_RECOVERY_PASS_check_alloc_info); if (ret) goto err; - if (test_bit(BCH_FS_errors_fixed, &c->flags) || + if 
(errors_fixed || test_bit(BCH_FS_errors_not_fixed, &c->flags)) { bch_err(c, "Second fsck run was not clean"); set_bit(BCH_FS_errors_not_fixed, &c->flags); } - set_bit(BCH_FS_errors_fixed, &c->flags); + if (errors_fixed) + set_bit(BCH_FS_errors_fixed, &c->flags); } if (enabled_qtypes(c)) { diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 238a362de19e..d54468fdcb18 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -589,7 +589,6 @@ s64 bch2_remap_range(struct bch_fs *c, struct bpos dst_start = POS(dst_inum.inum, dst_offset); struct bpos src_start = POS(src_inum.inum, src_offset); struct bpos dst_end = dst_start, src_end = src_start; - struct bch_io_opts opts; struct bpos src_want; u64 dst_done = 0; u32 dst_snapshot, src_snapshot; @@ -609,10 +608,6 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_init(&new_src); CLASS(btree_trans, trans)(c); - ret = bch2_inum_opts_get(trans, src_inum, &opts); - if (ret) - goto err; - bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, BTREE_ITER_intent); bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, @@ -709,11 +704,10 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?: - bch2_extent_update(trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, - new_i_size, i_sectors_delta, - true); + ret = bch2_extent_update(trans, dst_inum, &dst_iter, + new_dst.k, &disk_res, + new_i_size, i_sectors_delta, + true, 0); bch2_disk_reservation_put(c, &disk_res); } bch2_trans_iter_exit(&dst_iter); @@ -744,7 +738,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_iter_exit(&inode_iter); } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); -err: + bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index 17cd617664d9..3907ba7edff2 100644 --- 
a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -23,6 +23,8 @@ enum counters_flags { x(io_read_reuse_race, 34, TYPE_COUNTER) \ x(io_read_retry, 32, TYPE_COUNTER) \ x(io_read_fail_and_poison, 95, TYPE_COUNTER) \ + x(io_read_narrow_crcs, 97, TYPE_COUNTER) \ + x(io_read_narrow_crcs_fail, 98, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ x(io_move_read, 35, TYPE_SECTORS) \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 7c6f18a1ee2a..728d878057af 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -160,7 +160,7 @@ enum bch_fsck_flags { x(extent_ptrs_unwritten, 140, 0) \ x(extent_ptrs_written_and_unwritten, 141, 0) \ x(ptr_to_invalid_device, 142, 0) \ - x(ptr_to_removed_device, 322, 0) \ + x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \ x(ptr_to_duplicate_device, 143, 0) \ x(ptr_after_last_bucket, 144, 0) \ x(ptr_before_first_bucket, 145, 0) \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 32b12311928e..473ad4b51180 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -322,6 +322,8 @@ static void __bch2_fs_read_only(struct bch_fs *c) do { clean_passes++; + bch2_do_discards_going_ro(c); + if (bch2_btree_interior_updates_flush(c) || bch2_btree_write_buffer_flush_going_ro(c) || bch2_journal_flush_all_pins(&c->journal) || @@ -1209,12 +1211,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch2_opts_apply(&c->opts, *opts); +#ifdef __KERNEL__ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && c->opts.block_size > PAGE_SIZE) { bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE"); ret = -EINVAL; goto err; } +#endif c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) @@ -1286,7 +1290,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, if (ret) goto err; - if (go_rw_in_recovery(c)) { + /* + * just make sure this 
is always allocated if we might need it - mount + * failing due to kthread_create() failing is _very_ annoying + */ + if (!(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) || + go_rw_in_recovery(c)) { /* * start workqueues/kworkers early - kthread creation checks for * pending signals, which is _very_ annoying diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 6b071dcc062b..4c6e6c46d18a 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -784,7 +784,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, u64 v; ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: - bch2_opt_hook_pre_set(c, ca, id, v); + bch2_opt_hook_pre_set(c, ca, 0, id, v); kfree(tmp); if (ret < 0) @@ -807,7 +807,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, bch2_opt_set_by_id(&c->opts, id, v); if (changed) - bch2_opt_hook_post_set(c, ca, 0, &c->opts, id); + bch2_opt_hook_post_set(c, ca, 0, id, v); ret = size; err: diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 269cdf1a87a4..6c312fd9a447 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -720,47 +720,55 @@ DEFINE_EVENT(fs_str, bucket_alloc_fail, ); DECLARE_EVENT_CLASS(discard_buckets_class, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err), + TP_PROTO(struct bch_fs *c, struct discard_buckets_state *s, const char *err), + TP_ARGS(c, s, err), TP_STRUCT__entry( __field(dev_t, dev ) __field(u64, seen ) __field(u64, open ) __field(u64, need_journal_commit ) + __field(u64, commit_in_flight ) + __field(u64, bad_data_type ) + __field(u64, already_discarding ) __field(u64, discarded ) __array(char, err, 16 ) ), TP_fast_assign( __entry->dev = c->dev; - __entry->seen = seen; - __entry->open = open; - __entry->need_journal_commit = need_journal_commit; - __entry->discarded = discarded; + __entry->seen = s->seen; + __entry->open = s->open; + __entry->need_journal_commit = 
s->need_journal_commit; + __entry->commit_in_flight = s->commit_in_flight; + __entry->bad_data_type = s->bad_data_type; + __entry->already_discarding = s->already_discarding; + __entry->discarded = s->discarded; strscpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", + TP_printk("%d%d seen %llu open %llu\n" + "need_commit %llu committing %llu bad_data_type %llu\n" + "already_discarding %llu discarded %llu err %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->seen, __entry->open, __entry->need_journal_commit, + __entry->commit_in_flight, + __entry->bad_data_type, + __entry->already_discarding, __entry->discarded, __entry->err) ); DEFINE_EVENT(discard_buckets_class, discard_buckets, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err) + TP_PROTO(struct bch_fs *c, struct discard_buckets_state *s, const char *err), + TP_ARGS(c, s, err) ); DEFINE_EVENT(discard_buckets_class, discard_buckets_fast, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err) + TP_PROTO(struct bch_fs *c, struct discard_buckets_state *s, const char *err), + TP_ARGS(c, s, err) ); TRACE_EVENT(bucket_invalidate, diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 6d7303008b19..784e75a21132 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -535,10 +535,9 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, return -EINVAL; s.id = inode_opt_id; + u64 v = 0; if (value) { - u64 v = 0; - buf = kmalloc(size + 1, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -551,7 +550,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, if (ret < 0) goto err; - ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v); + ret = bch2_opt_hook_pre_set(c, 
NULL, inode->ei_inode.bi_inum, opt_id, v); if (ret < 0) goto err; @@ -591,6 +590,8 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); } + + bch2_opt_hook_post_set(c, NULL, inode->ei_inode.bi_inum, opt_id, v); err: return bch2_err_class(ret); } |