diff options
84 files changed, 1573 insertions, 1202 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision index 97055a41..13c25773 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -f5a0255e65246610d9c71fa1ec5bee277870330e +292344971769fe1dd561d8844c57c15c833f91ef diff --git a/c_src/cmd_image.c b/c_src/cmd_image.c index 9da3ed34..00f0c6f4 100644 --- a/c_src/cmd_image.c +++ b/c_src/cmd_image.c @@ -90,7 +90,7 @@ struct move_btree_args { static bool move_btree_pred(struct bch_fs *c, void *_arg, enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { struct move_btree_args *args = _arg; diff --git a/c_src/cmd_option.c b/c_src/cmd_option.c index 14201c8d..433d6196 100644 --- a/c_src/cmd_option.c +++ b/c_src/cmd_option.c @@ -117,7 +117,7 @@ int cmd_set_option(int argc, char *argv[]) fprintf(stderr, "Can't set option %s\n", opt->attr.name); if (opt->flags & OPT_FS) { - ret = bch2_opt_hook_pre_set(c, NULL, i, v); + ret = bch2_opt_hook_pre_set(c, NULL, 0, i, v); if (ret < 0) { fprintf(stderr, "error setting %s: %i\n", opt->attr.name, ret); continue; @@ -135,7 +135,7 @@ int cmd_set_option(int argc, char *argv[]) continue; } - ret = bch2_opt_hook_pre_set(c, ca, i, v); + ret = bch2_opt_hook_pre_set(c, ca, 0, i, v); if (ret < 0) { fprintf(stderr, "error setting %s: %i\n", opt->attr.name, ret); continue; diff --git a/c_src/posix_to_bcachefs.c b/c_src/posix_to_bcachefs.c index e80a3564..1d499b9e 100644 --- a/c_src/posix_to_bcachefs.c +++ b/c_src/posix_to_bcachefs.c @@ -252,8 +252,8 @@ static void read_data(struct bch_fs *c, rbio.bio.bi_private = &cl; bch2_bio_map(&rbio.bio, buf, len); - struct bch_io_opts opts; - bch2_inode_opts_get(&opts, c, inode); + struct bch_inode_opts opts; + bch2_inode_opts_get_inode(c, inode, &opts); rbio_init(&rbio.bio, c, opts, read_data_endio); @@ -279,7 +279,10 @@ static void write_data(struct bch_fs *c, bio_init(&op.wbio.bio, NULL, bv, ARRAY_SIZE(bv), 0); bch2_bio_map(&op.wbio.bio, buf, len); - 
bch2_write_op_init(&op, c, bch2_opts_to_inode_opts(c->opts)); + struct bch_inode_opts opts; + bch2_inode_opts_get(c, &opts); + + bch2_write_op_init(&op, c, opts); op.write_point = writepoint_hashed(0); op.nr_replicas = 1; op.subvol = 1; diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 5b51c3d5..52b012ef 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -155,20 +155,27 @@ void __genradix_free(struct __genradix *); */ #define genradix_free(_radix) __genradix_free(&(_radix)->tree) -static inline size_t __idx_to_offset(size_t idx, size_t obj_size) +static inline bool __idx_to_offset(size_t idx, size_t obj_size, size_t *offset) { + /* + * XXX: check for overflow, we have a bug in multiple places where we + * assume idx fits in 64 bits (because it's an inode number; already a + * bug on 32 bit) - but we then multiply by the object size and we + * overflow + */ if (__builtin_constant_p(obj_size)) BUILD_BUG_ON(obj_size > GENRADIX_NODE_SIZE); else BUG_ON(obj_size > GENRADIX_NODE_SIZE); if (!is_power_of_2(obj_size)) { - size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size; + size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size, node, node_offset; - return (idx / objs_per_page) * GENRADIX_NODE_SIZE + - (idx % objs_per_page) * obj_size; + return check_mul_overflow(idx / objs_per_page, GENRADIX_NODE_SIZE, &node) ? true : + check_mul_overflow(idx % objs_per_page, obj_size, &node_offset) ? 
true : + check_add_overflow(node, node_offset, offset); } else { - return idx * obj_size; + return check_mul_overflow(idx, obj_size, offset); } } @@ -179,8 +186,8 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) #define __genradix_page_remainder(_radix) \ (GENRADIX_NODE_SIZE % sizeof((_radix)->type[0])) -#define __genradix_idx_to_offset(_radix, _idx) \ - __idx_to_offset(_idx, __genradix_obj_size(_radix)) +#define __genradix_idx_to_offset(_radix, _idx, _offset) \ + __idx_to_offset(_idx, __genradix_obj_size(_radix), _offset) static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset) { @@ -202,9 +209,13 @@ static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offs } #define genradix_ptr_inlined(_radix, _idx) \ - (__genradix_cast(_radix) \ - __genradix_ptr_inlined(&(_radix)->tree, \ - __genradix_idx_to_offset(_radix, _idx))) +({ \ + size_t _offset; \ + !__genradix_idx_to_offset(_radix, _idx, &_offset) \ + ? __genradix_cast(_radix) \ + __genradix_ptr_inlined(&(_radix)->tree, _offset) \ + : NULL; \ +}) void *__genradix_ptr(struct __genradix *, size_t); @@ -216,28 +227,39 @@ void *__genradix_ptr(struct __genradix *, size_t); * Returns a pointer to entry at @_idx, or NULL if that entry does not exist. */ #define genradix_ptr(_radix, _idx) \ - (__genradix_cast(_radix) \ - __genradix_ptr(&(_radix)->tree, \ - __genradix_idx_to_offset(_radix, _idx))) +({ \ + size_t _offset; \ + !__genradix_idx_to_offset(_radix, _idx, &_offset) \ + ? 
__genradix_cast(_radix) \ + __genradix_ptr(&(_radix)->tree, _offset) \ + : NULL; \ +}) void *__genradix_ptr_alloc(struct __genradix *, size_t, struct genradix_node **, gfp_t); -#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \ - (__genradix_cast(_radix) \ - (__genradix_ptr_inlined(&(_radix)->tree, \ - __genradix_idx_to_offset(_radix, _idx)) ?: \ - __genradix_ptr_alloc(&(_radix)->tree, \ - __genradix_idx_to_offset(_radix, _idx), \ - NULL, _gfp))) - #define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\ - (__genradix_cast(_radix) \ - (__genradix_ptr_inlined(&(_radix)->tree, \ - __genradix_idx_to_offset(_radix, _idx)) ?: \ - __genradix_ptr_alloc(&(_radix)->tree, \ - __genradix_idx_to_offset(_radix, _idx), \ - _new_node, _gfp))) +({ \ + size_t _offset; \ + !__genradix_idx_to_offset(_radix, _idx, &_offset) \ + ? __genradix_cast(_radix) \ + (__genradix_ptr_inlined(&(_radix)->tree, _offset) ?: \ + __genradix_ptr_alloc(&(_radix)->tree, _offset, \ + _new_node, _gfp)) \ + : NULL; \ +}) + +#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \ + genradix_ptr_alloc_preallocated_inlined(_radix, _idx, NULL, _gfp) + +#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp) \ +({ \ + size_t _offset; \ + !__genradix_idx_to_offset(_radix, _idx, &_offset) \ + ? 
__genradix_cast(_radix) \ + __genradix_ptr_alloc(&(_radix)->tree, _offset, _new_node, _gfp) \ + : NULL; \ +}) /** * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it @@ -248,17 +270,8 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, * * Returns a pointer to entry at @_idx, or NULL on allocation failure */ -#define genradix_ptr_alloc(_radix, _idx, _gfp) \ - (__genradix_cast(_radix) \ - __genradix_ptr_alloc(&(_radix)->tree, \ - __genradix_idx_to_offset(_radix, _idx), \ - NULL, _gfp)) - -#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\ - (__genradix_cast(_radix) \ - __genradix_ptr_alloc(&(_radix)->tree, \ - __genradix_idx_to_offset(_radix, _idx), \ - _new_node, _gfp)) +#define genradix_ptr_alloc(_radix, _idx, _gfp) \ + genradix_ptr_alloc_preallocated(_radix, _idx, NULL, _gfp) struct genradix_iter { size_t offset; @@ -271,10 +284,15 @@ struct genradix_iter { * @_idx: index to start iterating from */ #define genradix_iter_init(_radix, _idx) \ - ((struct genradix_iter) { \ +({ \ + size_t _offset; \ + __genradix_idx_to_offset(_radix, _idx, &_offset); \ + \ + (struct genradix_iter) { \ .pos = (_idx), \ - .offset = __genradix_idx_to_offset((_radix), (_idx)),\ - }) + .offset = _offset, \ + }; \ +}) void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t); @@ -394,9 +412,11 @@ int __genradix_prealloc(struct __genradix *, size_t, gfp_t); * Returns 0 on success, -ENOMEM on failure */ #define genradix_prealloc(_radix, _nr, _gfp) \ - __genradix_prealloc(&(_radix)->tree, \ - __genradix_idx_to_offset(_radix, _nr + 1),\ - _gfp) - +({ \ + size_t _offset; \ + !__genradix_idx_to_offset(_radix, _nr + 1, &_offset) \ + ? 
__genradix_prealloc(&(_radix)->tree, _offset, _gfp) \ + : -ENOMEM; \ +}) #endif /* _LINUX_GENERIC_RADIX_TREE_H */ diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h index df425114..27f57eca 100644 --- a/include/linux/xxhash.h +++ b/include/linux/xxhash.h @@ -178,32 +178,6 @@ struct xxh64_state { void xxh32_reset(struct xxh32_state *state, uint32_t seed); /** - * xxh32_update() - hash the data given and update the xxh32 state - * - * @state: The xxh32 state to update. - * @input: The data to hash. - * @length: The length of the data to hash. - * - * After calling xxh32_reset() call xxh32_update() as many times as necessary. - * - * Return: Zero on success, otherwise an error code. - */ -int xxh32_update(struct xxh32_state *state, const void *input, size_t length); - -/** - * xxh32_digest() - produce the current xxh32 hash - * - * @state: Produce the current xxh32 hash of this state. - * - * A hash value can be produced at any time. It is still possible to continue - * inserting input into the hash state after a call to xxh32_digest(), and - * generate new hashes later on, by calling xxh32_digest() again. - * - * Return: The xxh32 hash stored in the state. - */ -uint32_t xxh32_digest(const struct xxh32_state *state); - -/** * xxh64_reset() - reset the xxh64 state to start a new hashing operation * * @state: The xxh64 state to reset. diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 3fc728ef..21cdc42e 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -344,7 +344,7 @@ static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs * struct bch_dev *ca = c ? 
bch2_dev_tryget_noerror(c, k.k->p.inode) : NULL; prt_newline(out); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); bch2_prt_data_type(out, a->data_type); @@ -367,7 +367,6 @@ static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs * if (ca) prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a)); - printbuf_indent_sub(out, 2); bch2_dev_put(ca); } @@ -1772,13 +1771,6 @@ static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) darray_remove_item(&ca->discard_buckets_in_flight, i); } -struct discard_buckets_state { - u64 seen; - u64 open; - u64 need_journal_commit; - u64 discarded; -}; - static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, @@ -1791,6 +1783,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, bool discard_locked = false; int ret = 0; + s->seen++; + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { s->open++; return 0; @@ -1801,6 +1795,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (seq_ready > c->journal.flushed_seq_ondisk) { if (seq_ready > c->journal.flushing_seq) s->need_journal_commit++; + else + s->commit_in_flight++; return 0; } @@ -1816,6 +1812,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, return ret; if (a->v.data_type != BCH_DATA_need_discard) { + s->bad_data_type++; + if (need_discard_or_freespace_err(trans, k, true, true, true)) { ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); if (ret) @@ -1827,8 +1825,10 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } if (!fastpath) { - if (discard_in_flight_add(ca, iter.pos.offset, true)) + if (discard_in_flight_add(ca, iter.pos.offset, true)) { + s->already_discarding++; goto out; + } discard_locked = true; } @@ -1862,6 +1862,7 
@@ static int bch2_discard_one_bucket(struct btree_trans *trans, commit: ret = bch2_trans_commit(trans, NULL, NULL, BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; @@ -1874,14 +1875,11 @@ out: fsck_err: if (discard_locked) discard_in_flight_remove(ca, iter.pos.offset); - if (!ret) - s->seen++; return ret; } -static void bch2_do_discards_work(struct work_struct *work) +static void __bch2_dev_do_discards(struct bch_dev *ca) { - struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); struct bch_fs *c = ca->fs; struct discard_buckets_state s = {}; struct bpos discard_pos_done = POS_MAX; @@ -1902,10 +1900,25 @@ static void bch2_do_discards_work(struct work_struct *work) if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal)) bch2_journal_flush_async(&c->journal, NULL); - trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, - bch2_err_str(ret)); + trace_discard_buckets(c, &s, bch2_err_str(ret)); enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); +} + +void bch2_do_discards_going_ro(struct bch_fs *c) +{ + for_each_member_device(c, ca) + if (bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards)) + __bch2_dev_do_discards(ca); +} + +static void bch2_do_discards_work(struct work_struct *work) +{ + struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); + struct bch_fs *c = ca->fs; + + __bch2_dev_do_discards(ca); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } @@ -1993,7 +2006,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } - trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + trace_discard_buckets_fast(c, &s, bch2_err_str(ret)); bch2_trans_put(trans); enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); @@ -2385,8 +2398,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev 
*ca) * We clear the LRU and need_discard btrees first so that we don't race * with bch2_do_invalidates() and bch2_do_discards() */ - ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_norun, NULL) ?: + ret = bch2_dev_remove_lrus(c, ca) ?: bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, @@ -2397,7 +2409,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, BTREE_TRIGGER_norun, NULL) ?: - bch2_dev_usage_remove(c, ca->dev_idx); + bch2_dev_usage_remove(c, ca); bch_err_msg(ca, ret, "removing dev alloc info"); return ret; } diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index c2e8482f..a602507f 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -320,6 +320,7 @@ static inline int bch2_check_discard_freespace_key_async(struct btree_trans *tra int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_dev_do_discards(struct bch_dev *); +void bch2_do_discards_going_ro(struct bch_fs *); void bch2_do_discards(struct bch_fs *); static inline u64 should_invalidate_buckets(struct bch_dev *ca, diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index f6ea4a82..97b627ed 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -1491,10 +1491,9 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, prt_newline(out); - printbuf_indent_add(out, 2); - open_bucket_for_each(c, &wp->ptrs, ob, i) - bch2_open_bucket_to_text(out, c, ob); - printbuf_indent_sub(out, 2); + scoped_guard(printbuf_indent, out) + open_bucket_for_each(c, &wp->ptrs, ob, i) + bch2_open_bucket_to_text(out, c, ob); } void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) @@ -1530,6 +1529,7 @@ void 
bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) printbuf_tabstop_push(out, 24); prt_printf(out, "capacity\t%llu\n", c->capacity); + prt_printf(out, "used\t%llu\n", bch2_fs_usage_read_short(c).used); prt_printf(out, "reserved\t%llu\n", c->reserved); prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden)); prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree)); @@ -1586,9 +1586,8 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) c->opts.allocator_stuck_timeout); prt_printf(&buf, "Allocator debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_fs_alloc_debug_to_text(&buf, c); - printbuf_indent_sub(&buf, 2); + scoped_guard(printbuf_indent, &buf) + bch2_fs_alloc_debug_to_text(&buf, c); prt_newline(&buf); bch2_printbuf_make_room(&buf, 4096); @@ -1597,23 +1596,20 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) guard(printbuf_atomic)(&buf); for_each_online_member_rcu(c, ca) { prt_printf(&buf, "Dev %u:\n", ca->dev_idx); - printbuf_indent_add(&buf, 2); - bch2_dev_alloc_debug_to_text(&buf, ca); - printbuf_indent_sub(&buf, 2); + scoped_guard(printbuf_indent, &buf) + bch2_dev_alloc_debug_to_text(&buf, ca); prt_newline(&buf); } } prt_printf(&buf, "Copygc debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_copygc_wait_to_text(&buf, c); - printbuf_indent_sub(&buf, 2); + scoped_guard(printbuf_indent, &buf) + bch2_copygc_wait_to_text(&buf, c); prt_newline(&buf); prt_printf(&buf, "Journal debug:\n"); - printbuf_indent_add(&buf, 2); - bch2_journal_debug_to_text(&buf, &c->journal); - printbuf_indent_sub(&buf, 2); + scoped_guard(printbuf_indent, &buf) + bch2_journal_debug_to_text(&buf, &c->journal); bch2_print_str(c, KERN_ERR, buf.buf); } diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index e7becdf2..ee52b66d 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -118,4 +118,14 @@ struct write_point_specifier { unsigned long v; }; +struct discard_buckets_state { + u64 
seen; + u64 open; + u64 need_journal_commit; + u64 commit_in_flight; + u64 bad_data_type; + u64 already_discarding; + u64 discarded; +}; + #endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index cb25cddb..c662eeba 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -117,7 +117,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, prt_printf(&buf, "existing backpointer found when inserting "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); prt_newline(&buf); - printbuf_indent_add(&buf, 2); + guard(printbuf_indent)(&buf); prt_printf(&buf, "found "); bch2_bkey_val_to_text(&buf, c, found_bp); @@ -127,7 +127,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); } else if (!will_check) { prt_printf(&buf, "backpointer not found when deleting\n"); - printbuf_indent_add(&buf, 2); + guard(printbuf_indent)(&buf); prt_printf(&buf, "searching for "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); @@ -278,9 +278,20 @@ static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, bp.v->level - 1, 0); struct btree *b = bch2_btree_iter_peek_node(iter); - if (IS_ERR_OR_NULL(b)) + if (IS_ERR(b)) goto err; + if (!b) { + /* Backpointer for nonexistent tree depth: */ + bkey_init(&iter->k); + iter->k.p = bp.v->pos; + struct bkey_s_c k = { &iter->k }; + + int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit); + b = ret ? ERR_PTR(ret) : NULL; + goto err; + } + BUG_ON(b->c.level != bp.v->level - 1); if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, @@ -809,7 +820,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, for (enum btree_id btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { - int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1; + int level, depth = btree_type_has_data_ptrs(btree_id) ? 
0 : 1; ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -862,17 +873,25 @@ static int data_type_to_alloc_counter(enum bch_data_type t) } } -static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); +static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos, + struct bkey_buf *last_flushed); static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, bool *had_mismatch, - struct bkey_buf *last_flushed) + struct bkey_buf *last_flushed, + struct bpos *last_pos, + unsigned *nr_iters) { struct bch_fs *c = trans->c; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); bool need_commit = false; + if (!bpos_eq(*last_pos, alloc_k.k->p)) + *nr_iters = 0; + + *last_pos = alloc_k.k->p; + *had_mismatch = false; if (a->data_type == BCH_DATA_sb || @@ -926,6 +945,46 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b return ret; } + if (sectors[ALLOC_dirty] > a->dirty_sectors || + sectors[ALLOC_cached] > a->cached_sectors || + sectors[ALLOC_stripe] > a->stripe_sectors) { + if (*nr_iters) { + CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "backpointer sectors > bucket sectors, but found no bad backpointers\n" + "bucket %llu:%llu data type %s, counters\n", + alloc_k.k->p.inode, + alloc_k.k->p.offset, + __bch2_data_types[a->data_type]); + if (sectors[ALLOC_dirty] > a->dirty_sectors) + prt_printf(&buf, "dirty: %u > %u\n", + sectors[ALLOC_dirty], a->dirty_sectors); + if (sectors[ALLOC_cached] > a->cached_sectors) + prt_printf(&buf, "cached: %u > %u\n", + sectors[ALLOC_cached], a->cached_sectors); + if (sectors[ALLOC_stripe] > a->stripe_sectors) + prt_printf(&buf, "stripe: %u > %u\n", + sectors[ALLOC_stripe], a->stripe_sectors); + + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, 
alloc_k.k->p), + bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { + bch2_bkey_val_to_text(&buf, c, bp_k); + prt_newline(&buf); + } + + bch2_print_str(c, KERN_ERR, buf.buf); + __WARN(); + return ret; + } + + *nr_iters += 1; + + return check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p, last_flushed) ?: + bch_err_throw(c, transaction_restart_nested); + } + if (sectors[ALLOC_dirty] != a->dirty_sectors || sectors[ALLOC_cached] != a->cached_sectors || sectors[ALLOC_stripe] != a->stripe_sectors) { @@ -943,13 +1002,6 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b return ret; } - if (sectors[ALLOC_dirty] > a->dirty_sectors || - sectors[ALLOC_cached] > a->cached_sectors || - sectors[ALLOC_stripe] > a->stripe_sectors) { - return check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: - bch_err_throw(c, transaction_restart_nested); - } - bool empty = (sectors[ALLOC_dirty] + sectors[ALLOC_stripe] + sectors[ALLOC_cached]) == 0; @@ -1113,6 +1165,8 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) CLASS(btree_trans, trans)(c); struct extents_to_bp_state s = { .bp_start = POS_MIN }; + struct bpos last_pos = POS_MIN; + unsigned nr_iters = 0; bch2_bkey_buf_init(&s.last_flushed); bkey_init(&s.last_flushed.k->k); @@ -1121,7 +1175,8 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) POS_MIN, BTREE_ITER_prefetch, k, ({ bool had_mismatch; bch2_fs_going_ro(c) ?: - check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed); + check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed, + &last_pos, &nr_iters); })); if (ret) goto err; @@ -1189,7 +1244,11 @@ static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, if (ret) return ret; - return check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed); + struct bpos last_pos = POS_MIN; + unsigned nr_iters = 0; + return check_bucket_backpointer_mismatch(trans, k, had_mismatch, + last_flushed, + 
&last_pos, &nr_iters); } int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans, @@ -1253,22 +1312,21 @@ static int check_one_backpointer(struct btree_trans *trans, } static int check_bucket_backpointers_to_extents(struct btree_trans *trans, - struct bch_dev *ca, struct bpos bucket) + struct bch_dev *ca, struct bpos bucket, + struct bkey_buf *last_flushed) { u32 restart_count = trans->restart_count; - struct bkey_buf last_flushed; - bch2_bkey_buf_init(&last_flushed); - bkey_init(&last_flushed.k->k); int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, bucket_pos_to_bp_start(ca, bucket), bucket_pos_to_bp_end(ca, bucket), 0, k, - check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) + check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, last_flushed) ); - bch2_bkey_buf_exit(&last_flushed, trans->c); - return ret ?: trans_was_restarted(trans, restart_count); + return ret ?: + bch2_btree_write_buffer_flush_sync(trans) ?: /* make sure bad backpointers that were deleted are visible */ + trans_was_restarted(trans, restart_count); } static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 16d08dfb..83d6ab9c 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -458,7 +458,6 @@ BCH_DEBUG_PARAMS_ALL() x(btree_node_compact) \ x(btree_node_merge) \ x(btree_node_sort) \ - x(btree_node_get) \ x(btree_node_read) \ x(btree_node_read_done) \ x(btree_node_write) \ @@ -466,10 +465,6 @@ BCH_DEBUG_PARAMS_ALL() x(btree_interior_update_total) \ x(btree_gc) \ x(data_write) \ - x(data_write_to_submit) \ - x(data_write_to_queue) \ - x(data_write_to_btree_update) \ - x(data_write_btree_update) \ x(data_read) \ x(data_promote) \ x(journal_flush_write) \ @@ -483,6 +478,7 @@ BCH_DEBUG_PARAMS_ALL() x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ x(blocked_write_buffer_full) \ + x(blocked_writeback_throttle) \ x(nocow_lock_contended) enum 
bch_time_stats { @@ -523,6 +519,7 @@ struct discard_in_flight { x(journal_read) \ x(fs_journal_alloc) \ x(fs_resize_on_mount) \ + x(sb_journal_sort) \ x(btree_node_read) \ x(btree_node_read_all_replicas) \ x(btree_node_scrub) \ @@ -674,6 +671,7 @@ struct bch_dev { x(error) \ x(topology_error) \ x(errors_fixed) \ + x(errors_fixed_silent) \ x(errors_not_fixed) \ x(no_invalid_checks) \ x(discard_mount_opt_set) \ @@ -807,6 +805,8 @@ struct bch_fs { struct bch_disk_groups_cpu __rcu *disk_groups; struct bch_opts opts; + atomic_t opt_change_cookie; + unsigned loglevel; unsigned prev_loglevel; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index b2de993d..08393971 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -654,7 +654,6 @@ struct bch_sb_field_ext { /* * field 1: version name * field 2: BCH_VERSION(major, minor) - * field 3: recovery passess required on upgrade */ #define BCH_METADATA_VERSIONS() \ x(bkey_renumber, BCH_VERSION(0, 10)) \ diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 035b2cb2..49d0be64 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -166,7 +166,7 @@ void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); do { \ if (trace_##event##_enabled()) { \ CLASS(printbuf, buf)(); \ - printbuf_indent_add(&buf, 2); \ + guard(printbuf_indent)(&buf); \ bch2_btree_pos_to_text(&buf, c, b); \ trace_##event(c, buf.buf); \ } \ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 43f29428..2338feb8 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -717,16 +717,12 @@ fsck_err: static int bch2_gc_btree(struct btree_trans *trans, struct progress_indicator_state *progress, - enum btree_id btree, bool initial) + enum btree_id btree, unsigned target_depth, + bool initial) { struct bch_fs *c = trans->c; - unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 
0 : 1; int ret = 0; - /* We need to make sure every leaf node is readable before going RW */ - if (initial) - target_depth = 0; - for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) { struct btree *prev = NULL; struct btree_iter iter; @@ -797,7 +793,21 @@ static int bch2_gc_btrees(struct bch_fs *c) if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) continue; - ret = bch2_gc_btree(trans, &progress, btree, true); + + unsigned target_depth = BIT_ULL(btree) & btree_leaf_has_triggers_mask ? 0 : 1; + + /* + * In fsck, we need to make sure every leaf node is readable + * before going RW, otherwise we can no longer rewind inside + * btree_lost_data to repair during the current fsck run. + * + * Otherwise, we can delay the repair to the next + * mount or offline fsck. + */ + if (test_bit(BCH_FS_in_fsck, &c->flags)) + target_depth = 0; + + ret = bch2_gc_btree(trans, &progress, btree, target_depth, true); } bch_err_fn(c, ret); @@ -1228,7 +1238,7 @@ int bch2_gc_gens(struct bch_fs *c) } for (unsigned i = 0; i < BTREE_ID_NR; i++) - if (btree_type_has_ptrs(i)) { + if (btree_type_has_data_ptrs(i)) { c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 2e3dd9ba..52d21259 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -27,10 +27,15 @@ #include <linux/moduleparam.h> #include <linux/sched/mm.h> +static __maybe_unused unsigned bch2_btree_read_corrupt_ratio; +static __maybe_unused int bch2_btree_read_corrupt_device; + #ifdef CONFIG_BCACHEFS_DEBUG -static unsigned bch2_btree_read_corrupt_ratio; module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644); MODULE_PARM_DESC(btree_read_corrupt_ratio, ""); + +module_param_named(btree_read_corrupt_device, bch2_btree_read_corrupt_device, int, 0644); +MODULE_PARM_DESC(btree_read_corrupt_ratio, ""); #endif static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) @@ -1438,7 +1443,9 @@ 
start: memset(&bio->bi_iter, 0, sizeof(bio->bi_iter)); bio->bi_iter.bi_size = btree_buf_bytes(b); - bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio); + if (bch2_btree_read_corrupt_device == rb->pick.ptr.dev || + bch2_btree_read_corrupt_device < 0) + bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio); ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf); if (ret != -BCH_ERR_btree_node_read_err_want_retry && @@ -2523,7 +2530,7 @@ do_write: if (trace_btree_node_write_enabled()) { CLASS(printbuf, buf)(); - printbuf_indent_add(&buf, 2); + guard(printbuf_indent)(&buf); prt_printf(&buf, "offset %u sectors %u bytes %u\n", b->written, sectors_to_write, diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 1e152c67..b72ed543 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -137,18 +137,8 @@ static void __bch2_btree_path_verify_cached(struct btree_trans *trans, static void __bch2_btree_path_verify_level(struct btree_trans *trans, struct btree_path *path, unsigned level) { - struct btree_path_level *l; - struct btree_node_iter tmp; - bool locked; - struct bkey_packed *p, *k; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - struct printbuf buf3 = PRINTBUF; - const char *msg; - - l = &path->l[level]; - tmp = l->iter; - locked = btree_node_locked(path, level); + struct btree_path_level *l = &path->l[level]; + bool locked = btree_node_locked(path, level); if (path->cached) { if (!level) @@ -166,51 +156,68 @@ static void __bch2_btree_path_verify_level(struct btree_trans *trans, bch2_btree_node_iter_verify(&l->iter, l->b); - /* - * For interior nodes, the iterator will have skipped past deleted keys: - */ - p = level + /* For interior nodes, the iterator may have skipped past deleted keys: */ + struct btree_node_iter tmp = l->iter; + const struct bkey_packed *p = level ? 
bch2_btree_node_iter_prev(&tmp, l->b) : bch2_btree_node_iter_prev_all(&tmp, l->b); - k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + tmp = l->iter; + const struct bkey_packed *k = level + ? bch2_btree_node_iter_peek(&tmp, l->b) + : bch2_btree_node_iter_peek_all(&tmp, l->b); - if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { - msg = "before"; - goto err; - } + const char *msg; + if (!(level > path->level && trans->journal_replay_not_finished)) { + /* + * We can't run these checks for interior nodes when we're still + * using the journal overlay because there might be a key in + * the interior node that points midway through the current leaf + * node - which is deleted in the journal overlay, but set_pos() + * will skip past it and cause the interior node iterators to be + * inconsistent in a way that doesn't matter and it can't check + * for. + */ - if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { - msg = "after"; - goto err; + if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { + msg = "before"; + goto err; + } + + if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { + msg = "after"; + goto err; + } } if (!locked) btree_node_unlock(trans, path, level); return; err: - bch2_bpos_to_text(&buf1, path->pos); + { + CLASS(printbuf, buf)(); + prt_printf(&buf, "path should be %s key at level %u", msg, level); - if (p) { - struct bkey uk = bkey_unpack_key(l->b, p); + prt_str(&buf, "\npath pos "); + bch2_bpos_to_text(&buf, path->pos); - bch2_bkey_to_text(&buf2, &uk); - } else { - prt_printf(&buf2, "(none)"); - } + prt_str(&buf, "\nprev key "); + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); + bch2_bkey_to_text(&buf, &uk); + } else { + prt_printf(&buf, "(none)"); + } - if (k) { - struct bkey uk = bkey_unpack_key(l->b, k); + prt_str(&buf, "\ncur key "); + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); + bch2_bkey_to_text(&buf, &uk); + } else { + prt_printf(&buf, "(none)"); + } - bch2_bkey_to_text(&buf3, &uk); - } else { - prt_printf(&buf3, 
"(none)"); + panic("%s\n", buf.buf); } - - panic("path should be %s key at level %u:\n" - "path pos %s\n" - "prev key %s\n" - "cur key %s\n", - msg, level, buf1.buf, buf2.buf, buf3.buf); } static void __bch2_btree_path_verify(struct btree_trans *trans, @@ -886,28 +893,53 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, btree_node_unlock(trans, path, plevel); } +static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans, + struct btree_path *path) +{ + struct bch_fs *c = trans->c; + CLASS(printbuf, buf)(); + + prt_str(&buf, "node not found at pos: "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, "\n within parent node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key)); + prt_newline(&buf); + + return __bch2_topology_error(c, &buf); +} + +static noinline_for_stack int btree_node_gap_err(struct btree_trans *trans, + struct btree_path *path, + struct bkey_i *k) +{ + struct bch_fs *c = trans->c; + CLASS(printbuf, buf)(); + + prt_str(&buf, "node doesn't cover expected range at pos: "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, "\n within parent node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key)); + prt_str(&buf, "\n but got node: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + prt_newline(&buf); + + return __bch2_topology_error(c, &buf); +} + static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, struct btree_path *path, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; struct btree_path_level *l = path_l(path); - struct btree_and_journal_iter jiter; - struct bkey_s_c k; int ret = 0; + struct btree_and_journal_iter jiter; __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); - k = bch2_btree_and_journal_iter_peek(c, &jiter); + struct bkey_s_c k = bch2_btree_and_journal_iter_peek(c, &jiter); if (!k.k) { - CLASS(printbuf, buf)(); - - prt_str(&buf, "node not found at 
pos "); - bch2_bpos_to_text(&buf, path->pos); - prt_str(&buf, " at btree "); - bch2_btree_pos_to_text(&buf, c, l->b); - - ret = bch2_fs_topology_error(c, "%s", buf.buf); + ret = btree_node_missing_err(trans, path); goto err; } @@ -922,20 +954,16 @@ err: return ret; } -static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans, - struct btree_path *path) +static inline bool bpos_in_btree_node_key(struct bpos pos, const struct bkey_i *k) { - struct bch_fs *c = trans->c; - CLASS(printbuf, buf)(); + if (bpos_gt(pos, k->k.p)) + return false; - prt_str(&buf, "node not found at pos "); - bch2_bpos_to_text(&buf, path->pos); - prt_str(&buf, " within parent node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key)); + if (k->k.type == KEY_TYPE_btree_ptr_v2 && + bpos_lt(pos, bkey_i_to_btree_ptr_v2_c(k)->v.min_key)) + return false; - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - return bch_err_throw(c, btree_need_topology_repair); + return true; } static __always_inline int btree_path_down(struct btree_trans *trans, @@ -971,6 +999,9 @@ static __always_inline int btree_path_down(struct btree_trans *trans, } } + if (unlikely(!bpos_in_btree_node_key(path->pos, &trans->btree_path_down))) + return btree_node_gap_err(trans, path, &trans->btree_path_down); + b = bch2_btree_node_get(trans, path, &trans->btree_path_down, level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); @@ -1476,7 +1507,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { prt_printf(buf, "%u transaction updates for %s journal seq %llu\n", trans->nr_updates, trans->fn, trans->journal_res.seq); - printbuf_indent_add(buf, 2); + guard(printbuf_indent)(buf); trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; @@ -1502,8 +1533,6 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) bch2_journal_entry_to_text(buf, trans->c, e); prt_newline(buf); } - - 
printbuf_indent_sub(buf, 2); } static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) @@ -1556,8 +1585,8 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, bt prt_printf(out, " uptodate %u locks_want %u", path->uptodate, path->locks_want); prt_newline(out); + guard(printbuf_indent)(out); - printbuf_indent_add(out, 2); for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { prt_printf(out, "l=%u locks %s seq %u node ", l, btree_node_locked_str(btree_node_locked_type(path, l)), @@ -1570,7 +1599,6 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, bt prt_printf(out, "%px", path->l[l].b); prt_newline(out); } - printbuf_indent_sub(out, 2); } static noinline __cold diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index 38c5643e..a4f8aac4 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -205,9 +205,8 @@ static noinline __noreturn void break_cycle_fail(struct lock_graph *g) bch2_btree_trans_to_text(&buf, trans); prt_printf(&buf, "backtrace:\n"); - printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); - printbuf_indent_sub(&buf, 2); + scoped_guard(printbuf_indent, &buf) + bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); prt_newline(&buf); } diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c index b618a0bd..c0dff992 100644 --- a/libbcachefs/btree_node_scan.c +++ b/libbcachefs/btree_node_scan.c @@ -42,12 +42,11 @@ static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, con static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) { - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); darray_for_each(nodes, i) { found_btree_node_to_text(out, c, i); prt_newline(out); } - printbuf_indent_sub(out, 2); } static void found_btree_node_to_key(struct 
bkey_i *k, const struct found_btree_node *f) diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 5fa7f2f9..2966971e 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -970,6 +970,7 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans, struct bkey_i *accounting; retry: + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); percpu_down_read(&c->mark_lock); for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); accounting != btree_trans_subbuf_top(trans, &trans->accounting); @@ -983,6 +984,9 @@ retry: } percpu_up_read(&c->mark_lock); + /* Only fatal errors are possible later, so no need to revert this */ + bch2_trans_account_disk_usage_change(trans); + trans_for_each_update(trans, i) { ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); if (ret) diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index e893eb93..9e3c8512 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -840,6 +840,10 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type) return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; } +/* A mask of btree id bits that have triggers for their leaves */ +__maybe_unused +static const u64 btree_leaf_has_triggers_mask = BTREE_NODE_TYPE_HAS_TRIGGERS >> 1; + static const u64 btree_is_extents_mask = 0 #define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr) BCH_BTREE_IDS() @@ -883,15 +887,15 @@ static inline bool btree_type_has_snapshot_field(enum btree_id btree) return BIT_ULL(btree) & mask; } -static inline bool btree_type_has_ptrs(enum btree_id btree) -{ - const u64 mask = 0 +static const u64 btree_has_data_ptrs_mask = 0 #define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_IS_data)) << nr) BCH_BTREE_IDS() #undef x ; - return BIT_ULL(btree) & mask; +static inline bool btree_type_has_data_ptrs(enum btree_id btree) +{ + return BIT_ULL(btree) & btree_has_data_ptrs_mask; } static inline bool btree_type_uses_write_buffer(enum btree_id btree) diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index a9877a47..ce86d158 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -324,9 +324,6 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct btree *b; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim - ? BTREE_NODE_RESERVE - : 0; int ret; b = bch2_btree_node_mem_alloc(trans, interior_node); @@ -334,41 +331,6 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, return b; BUG_ON(b->ob.nr); - - mutex_lock(&c->btree_reserve_cache_lock); - if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { - guard(spinlock)(&c->freelist_lock); - if (c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark)) { - if (cl) - closure_wait(&c->open_buckets_wait, cl); - - ret = cl - ? 
bch_err_throw(c, bucket_alloc_blocked) - : bch_err_throw(c, open_buckets_empty); - mutex_unlock(&c->btree_reserve_cache_lock); - goto err; - } - } - - if (c->btree_reserve_cache_nr > nr_reserve) { - for (struct btree_alloc *a = c->btree_reserve_cache; - a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) { - /* check if it has sufficient durability */ - - if (!can_use_btree_node(c, res, target, bkey_i_to_s_c(&a->k))) { - bch2_open_buckets_put(c, &a->ob); - *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - continue; - } - - bkey_copy(&b->key, &a->k); - b->ob = a->ob; - *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - mutex_unlock(&c->btree_reserve_cache_lock); - goto out; - } - } - mutex_unlock(&c->btree_reserve_cache_lock); retry: ret = bch2_alloc_sectors_start_trans(trans, target ?: @@ -398,12 +360,29 @@ retry: goto retry; } + mutex_lock(&c->btree_reserve_cache_lock); + while (c->btree_reserve_cache_nr) { + struct btree_alloc *a = c->btree_reserve_cache + --c->btree_reserve_cache_nr; + + /* check if it has sufficient durability */ + + if (can_use_btree_node(c, res, target, bkey_i_to_s_c(&a->k))) { + bkey_copy(&b->key, &a->k); + b->ob = a->ob; + mutex_unlock(&c->btree_reserve_cache_lock); + goto out; + } + + bch2_open_buckets_put(c, &a->ob); + } + mutex_unlock(&c->btree_reserve_cache_lock); + bkey_btree_ptr_v2_init(&b->key); bch2_alloc_sectors_append_ptrs(c, wp, &b->key, btree_sectors(c), false); bch2_open_bucket_get(c, wp, &b->ob); - bch2_alloc_sectors_done(c, wp); out: + bch2_alloc_sectors_done(c, wp); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -723,8 +702,10 @@ static void btree_update_nodes_written(struct btree_update *as) if (ret) goto err; - if (!btree_update_new_nodes_marked_sb(as)) + if (!btree_update_new_nodes_marked_sb(as)) { + bch2_trans_unlock_long(trans); btree_update_new_nodes_mark_sb(as); + } /* * Wait for any in flight writes to finish before we free the old nodes @@ -2810,7 +2791,7 @@ static void 
bch2_btree_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct btree_alloc *a) { - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k)); prt_newline(out); @@ -2818,8 +2799,6 @@ static void bch2_btree_alloc_to_text(struct printbuf *out, unsigned i; open_bucket_for_each(c, &a->ob, ob, i) bch2_open_bucket_to_text(out, c, ob); - - printbuf_indent_sub(out, 2); } void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c) diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 7bd9cf61..10bfadcd 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -130,7 +130,7 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type, } static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, - struct bch_io_opts opts) + struct bch_inode_opts opts) { if (opts.nocow) return 0; diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 2c997fdd..7a0da6cd 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -11,6 +11,7 @@ #include "ec.h" #include "error.h" #include "extents.h" +#include "inode.h" #include "io_write.h" #include "keylist.h" #include "move.h" @@ -428,13 +429,18 @@ restart_drop_extra_replicas: goto out; } + struct bch_inode_opts opts; + ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?: bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: + bch2_inum_snapshot_opts_get(trans, k.k->p.inode, k.k->p.snapshot, &opts) ?: + bch2_bkey_set_needs_rebalance(c, &opts, insert, + SET_NEEDS_REBALANCE_foreground, + m->op.opts.change_cookie) ?: bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_internal_snapshot_node); if (ret) @@ -613,7 +619,7 @@ int 
bch2_update_unwritten_extent(struct btree_trans *trans, } void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { if (!out->nr_tabstops) @@ -657,31 +663,32 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) prt_str_indented(out, "old key:\t"); bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + + bch2_write_op_to_text(out, &m->op); } void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) { bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); prt_newline(out); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); if (!m->read_done) { prt_printf(out, "read:\n"); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); bch2_read_bio_to_text(out, m->op.c, &m->rbio); } else { prt_printf(out, "write:\n"); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); bch2_write_op_to_text(out, &m->op); } - printbuf_indent_sub(out, 4); } int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { struct bch_fs *c = trans->c; @@ -731,7 +738,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, } static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, unsigned buf_bytes) { unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); @@ -758,7 +765,7 @@ static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, } int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts) + struct bch_inode_opts *io_opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); const union bch_extent_entry *entry; @@ 
-830,7 +837,7 @@ int bch2_data_update_init(struct btree_trans *trans, struct moving_context *ctxt, struct data_update *m, struct write_point_specifier wp, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts data_opts, enum btree_id btree_id, struct bkey_s_c k) diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h index fc12aa65..3b0ba6f6 100644 --- a/libbcachefs/data_update.h +++ b/libbcachefs/data_update.h @@ -23,7 +23,7 @@ struct data_update_opts { }; void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, - struct bch_io_opts *, struct data_update_opts *); + struct bch_inode_opts *, struct data_update_opts *); #define BCH_DATA_UPDATE_TYPES() \ x(copygc, 0) \ @@ -76,18 +76,18 @@ void bch2_data_update_read_done(struct data_update *); int bch2_extent_drop_ptrs(struct btree_trans *, struct btree_iter *, struct bkey_s_c, - struct bch_io_opts *, + struct bch_inode_opts *, struct data_update_opts *); int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, - struct bch_io_opts *); + struct bch_inode_opts *); void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct btree_trans *, struct btree_iter *, struct moving_context *, struct data_update *, struct write_point_specifier, - struct bch_io_opts *, struct data_update_opts, + struct bch_inode_opts *, struct data_update_opts, enum btree_id, struct bkey_s_c); void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 33cb94f7..ebfb68e2 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -282,16 +282,13 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, le64_to_cpu(i->journal_seq)); offset += sectors; - printbuf_indent_add(out, 4); + scoped_guard(printbuf_indent, out) + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) { + struct bkey u; - for (k = i->start; k != vstruct_last(i); k = 
bkey_p_next(k)) { - struct bkey u; - - bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u)); - prt_newline(out); - } - - printbuf_indent_sub(out, 4); + bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u)); + prt_newline(out); + } } out: if (bio) @@ -468,7 +465,7 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); prt_printf(out, "\n"); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); prt_newline(out); @@ -488,8 +485,6 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * &b->writes[1].journal, b->writes[1].journal.seq); prt_printf(out, "ob:\t%u\n", b->ob.nr); - - printbuf_indent_sub(out, 2); } static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, @@ -605,9 +600,8 @@ restart: bch2_btree_trans_to_text(&i->buf, trans); prt_printf(&i->buf, "backtrace:\n"); - printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); - printbuf_indent_sub(&i->buf, 2); + scoped_guard(printbuf_indent, &i->buf) + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); prt_newline(&i->buf); closure_put(&trans->ref); @@ -765,40 +759,35 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, break; prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]); - printbuf_indent_add(&i->buf, 2); + guard(printbuf_indent)(&i->buf); guard(mutex)(&s->lock); prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); #ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE - printbuf_indent_add(&i->buf, 2); - bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); - printbuf_indent_sub(&i->buf, 2); + scoped_guard(printbuf_indent, &i->buf) + bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); #endif prt_printf(&i->buf, "Transaction duration:\n"); - 
printbuf_indent_add(&i->buf, 2); - bch2_time_stats_to_text(&i->buf, &s->duration); - printbuf_indent_sub(&i->buf, 2); + scoped_guard(printbuf_indent, &i->buf) + bch2_time_stats_to_text(&i->buf, &s->duration); if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { prt_printf(&i->buf, "Lock hold times:\n"); - printbuf_indent_add(&i->buf, 2); - bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); - printbuf_indent_sub(&i->buf, 2); + scoped_guard(printbuf_indent, &i->buf) + bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); } if (s->max_paths_text) { prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths); - printbuf_indent_add(&i->buf, 2); - prt_str_indented(&i->buf, s->max_paths_text); - printbuf_indent_sub(&i->buf, 2); + scoped_guard(printbuf_indent, &i->buf) + prt_str_indented(&i->buf, s->max_paths_text); } - printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); i->iter++; } diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c index d6c91abc..a99f821c 100644 --- a/libbcachefs/disk_accounting.c +++ b/libbcachefs/disk_accounting.c @@ -239,10 +239,12 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, c, accounting_key_junk_at_end, "junk at end of accounting key"); - bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type], + const unsigned nr_counters = bch2_accounting_counters(k.k); + + bkey_fsck_err_on(!nr_counters || nr_counters > BCH_ACCOUNTING_MAX_COUNTERS, c, accounting_key_nr_counters_wrong, "accounting key with %u counters, should be %u", - bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]); + nr_counters, bch2_accounting_type_nr_counters[acc_k.type]); fsck_err: return ret; } @@ -359,10 +361,13 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun accounting_pos_cmp, &a.k->p) < acc->k.nr) return 0; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); + struct 
accounting_mem_entry n = { .pos = a.k->p, .bversion = a.k->bversion, - .nr_counters = bch2_accounting_counters(a.k), + .nr_counters = bch2_accounting_type_nr_counters[acc_k.type], .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), sizeof(u64), GFP_KERNEL), }; @@ -735,10 +740,12 @@ invalid_device: goto fsck_err; } -static struct journal_key *accumulate_newer_accounting_keys(struct bch_fs *c, struct journal_key *i) +static struct journal_key *accumulate_newer_accounting_keys(struct btree_trans *trans, struct journal_key *i) { + struct bch_fs *c = trans->c; struct journal_keys *keys = &c->journal_keys; struct bkey_i *k = journal_key_k(c, i); + int ret = 0; darray_for_each_from(*keys, j, i + 1) { if (journal_key_cmp(c, i, j)) @@ -746,7 +753,18 @@ static struct journal_key *accumulate_newer_accounting_keys(struct bch_fs *c, st struct bkey_i *n = journal_key_k(c, j); if (n->k.type == KEY_TYPE_accounting) { - WARN_ON(bversion_cmp(k->k.bversion, n->k.bversion) >= 0); + if (bversion_cmp(k->k.bversion, n->k.bversion) >= 0) { + CLASS(printbuf, buf)(); + prt_printf(&buf, "accounting keys with out of order versions:"); + + prt_newline(&buf); + prt_printf(&buf, "%u.%u ", i->journal_seq_offset, i->journal_offset); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + prt_newline(&buf); + prt_printf(&buf, "%u.%u ", j->journal_seq_offset, j->journal_offset); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(n)); + fsck_err(trans, accounting_key_version_out_of_order, "%s", buf.buf); + } bch2_accounting_accumulate(bkey_i_to_accounting(k), bkey_i_to_s_c_accounting(n)); @@ -755,14 +773,16 @@ static struct journal_key *accumulate_newer_accounting_keys(struct bch_fs *c, st } return &darray_top(*keys); +fsck_err: + return ERR_PTR(ret); } static struct journal_key *accumulate_and_read_journal_accounting(struct btree_trans *trans, struct journal_key *i) { - struct bch_fs *c = trans->c; - struct journal_key *next = accumulate_newer_accounting_keys(c, i); + struct journal_key *next = 
accumulate_newer_accounting_keys(trans, i); - int ret = accounting_read_key(trans, bkey_i_to_s_c(journal_key_k(c, i))); + int ret = PTR_ERR_OR_ZERO(next) ?: + accounting_read_key(trans, bkey_i_to_s_c(journal_key_k(trans->c, i))); return ret ? ERR_PTR(ret) : next; } @@ -863,46 +883,44 @@ int bch2_accounting_read(struct bch_fs *c) *dst++ = *i; keys->gap = keys->nr = dst - keys->data; - guard(percpu_write)(&c->mark_lock); + CLASS(printbuf, underflow_err)(); - darray_for_each_reverse(acc->k, i) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, i->pos); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - memset(v, 0, sizeof(v)); + scoped_guard(percpu_write, &c->mark_lock) { + darray_for_each_reverse(acc->k, i) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, i->pos); - for (unsigned j = 0; j < i->nr_counters; j++) - v[j] = percpu_u64_get(i->v[0] + j); + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + memset(v, 0, sizeof(v)); - /* - * If the entry counters are zeroed, it should be treated as - * nonexistent - it might point to an invalid device. - * - * Remove it, so that if it's re-added it gets re-marked in the - * superblock: - */ - ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) - ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); + for (unsigned j = 0; j < i->nr_counters; j++) + v[j] = percpu_u64_get(i->v[0] + j); - if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(i->v[0]); - free_percpu(i->v[1]); - darray_remove_item(&acc->k, i); - ret = 0; - continue; - } + /* + * If the entry counters are zeroed, it should be treated as + * nonexistent - it might point to an invalid device. + * + * Remove it, so that if it's re-added it gets re-marked in the + * superblock: + */ + ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) + ? 
-BCH_ERR_remove_disk_accounting_entry + : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); - if (ret) - return ret; - } + if (ret == -BCH_ERR_remove_disk_accounting_entry) { + free_percpu(i->v[0]); + free_percpu(i->v[1]); + darray_remove_item(&acc->k, i); + ret = 0; + continue; + } - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); + if (ret) + return ret; + } - scoped_guard(preempt) { - struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); for (unsigned i = 0; i < acc->k.nr; i++) { struct disk_accounting_pos k; @@ -924,27 +942,20 @@ int bch2_accounting_read(struct bch_fs *c) underflow |= (s64) v[j] < 0; if (underflow) { - CLASS(printbuf, buf)(); - bch2_log_msg_start(c, &buf); - - prt_printf(&buf, "Accounting underflow for\n"); - bch2_accounting_key_to_text(&buf, &k); + if (!underflow_err.pos) { + bch2_log_msg_start(c, &underflow_err); + prt_printf(&underflow_err, "Accounting underflow for\n"); + } + bch2_accounting_key_to_text(&underflow_err, &k); for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) - prt_printf(&buf, " %lli", v[j]); - - bool print = bch2_count_fsck_err(c, accounting_key_underflow, &buf); - unsigned pos = buf.pos; - ret = bch2_run_explicit_recovery_pass(c, &buf, - BCH_RECOVERY_PASS_check_allocations, 0); - print |= buf.pos != pos; - - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); - if (ret) - return ret; + prt_printf(&underflow_err, " %lli", v[j]); + prt_newline(&underflow_err); } + guard(preempt)(); + struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); + switch (k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: usage->reserved += v[0] * k.persistent_reserved.nr_replicas; @@ -971,24 +982,60 @@ int bch2_accounting_read(struct bch_fs *c) } } + if (underflow_err.pos) { + bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err); + unsigned pos = 
underflow_err.pos; + ret = bch2_run_explicit_recovery_pass(c, &underflow_err, + BCH_RECOVERY_PASS_check_allocations, 0); + print |= underflow_err.pos != pos; + + if (print) + bch2_print_str(c, KERN_ERR, underflow_err.buf); + if (ret) + return ret; + } + return ret; } -int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) +int bch2_dev_usage_remove(struct bch_fs *c, struct bch_dev *ca) { CLASS(btree_trans, trans)(c); + + struct disk_accounting_pos start; + disk_accounting_key_init(start, dev_data_type, .dev = ca->dev_idx); + + struct disk_accounting_pos end; + disk_accounting_key_init(end, dev_data_type, .dev = ca->dev_idx, .data_type = U8_MAX); + return bch2_btree_write_buffer_flush_sync(trans) ?: - for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, - BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ - struct disk_accounting_pos acc; - bpos_to_disk_accounting_pos(&acc, k.k->p); + commit_do(trans, NULL, NULL, 0, ({ + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_accounting, + disk_accounting_pos_to_bpos(&start), + disk_accounting_pos_to_bpos(&end), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->type != KEY_TYPE_accounting) + continue; + + struct disk_accounting_pos acc; + bpos_to_disk_accounting_pos(&acc, k.k->p); - acc.type == BCH_DISK_ACCOUNTING_dev_data_type && - acc.dev_data_type.dev == dev - ? 
bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) - : 0; - })) ?: - bch2_btree_write_buffer_flush_sync(trans); + const unsigned nr = bch2_accounting_counters(k.k); + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; + memcpy_u64s_small(v, bkey_s_c_to_accounting(k).v->d, nr); + + bch2_u64s_neg(v, nr); + + ret = bch2_disk_accounting_mod(trans, &acc, v, nr, false); + if (ret) + break; + } + + ret; + })) ?: bch2_btree_write_buffer_flush_sync(trans); } int bch2_dev_usage_init(struct bch_dev *ca, bool gc) @@ -1059,13 +1106,17 @@ void bch2_verify_accounting_clean(struct bch_fs *c) case BCH_DISK_ACCOUNTING_dev_data_type: { { guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */ + const enum bch_data_type data_type = acc_k.dev_data_type.data_type; struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (!ca) continue; - v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); - v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); - v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); + v[0] = percpu_u64_get(&ca->usage->d[data_type].buckets); + v[1] = percpu_u64_get(&ca->usage->d[data_type].sectors); + v[2] = percpu_u64_get(&ca->usage->d[data_type].fragmented); + + if (data_type == BCH_DATA_sb || data_type == BCH_DATA_journal) + base.hidden += a.v->d[0] * ca->mi.bucket_size; } if (memcmp(a.v->d, v, 3 * sizeof(u64))) { @@ -1093,7 +1144,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) mismatch = true; \ } - //check(hidden); + check(hidden); check(btree); check(data); check(cached); diff --git a/libbcachefs/disk_accounting.h b/libbcachefs/disk_accounting.h index cc73cce9..c0d3d7e8 100644 --- a/libbcachefs/disk_accounting.h +++ b/libbcachefs/disk_accounting.h @@ -186,11 +186,15 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, break; case BCH_DISK_ACCOUNTING_dev_data_type: { guard(rcu)(); + const enum bch_data_type data_type 
= acc_k.dev_data_type.data_type; struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); - this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); + this_cpu_add(ca->usage->d[data_type].buckets, a.v->d[0]); + this_cpu_add(ca->usage->d[data_type].sectors, a.v->d[1]); + this_cpu_add(ca->usage->d[data_type].fragmented, a.v->d[2]); + + if (data_type == BCH_DATA_sb || data_type == BCH_DATA_journal) + trans->fs_usage_delta.hidden += a.v->d[0] * ca->mi.bucket_size; } break; } @@ -212,9 +216,9 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct accounting_mem_entry *e = &acc->k.data[idx]; - EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); + const unsigned nr = min_t(unsigned, bch2_accounting_counters(a.k), e->nr_counters); - for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) + for (unsigned i = 0; i < nr; i++) this_cpu_add(e->v[gc][i], a.v->d[i]); return 0; } @@ -297,7 +301,7 @@ int bch2_gc_accounting_done(struct bch_fs *); int bch2_accounting_read(struct bch_fs *); -int bch2_dev_usage_remove(struct bch_fs *, unsigned); +int bch2_dev_usage_remove(struct bch_fs *, struct bch_dev *); int bch2_dev_usage_init(struct bch_dev *, bool); void bch2_verify_accounting_clean(struct bch_fs *c); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index c2840cb6..271e2521 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -2040,7 +2040,7 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, unsigned dev_idx, - unsigned flags) + unsigned flags, struct printbuf *err) { if (k.k->type != KEY_TYPE_stripe) return 0; @@ -2081,13 +2081,21 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; } - if (nr_good < 
s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) + if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) { + prt_str(err, "cannot drop device without degrading\n "); + bch2_bkey_val_to_text(err, c, k); + prt_newline(err); return bch_err_throw(c, remove_would_lose_data); + } unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; - if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) + if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) { + prt_str(err, "cannot drop device without losing data\n "); + bch2_bkey_val_to_text(err, c, k); + prt_newline(err); return bch_err_throw(c, remove_would_lose_data); + } sectors = -sectors; @@ -2099,7 +2107,7 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, } static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a, - unsigned flags) + unsigned flags, struct printbuf *err) { struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); @@ -2119,17 +2127,18 @@ static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, s if (ret) return ret; - return bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags); + return bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags, err); } -int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) +int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, + unsigned flags, struct printbuf *err) { CLASS(btree_trans, trans)(c); int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ - bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); + bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags, err); })); bch_err_fn(c, ret); return ret; diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index e807e702..cc778da9 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -289,8 +289,9 @@ static 
inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, } int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *, - struct bkey_s_c, unsigned, unsigned); -int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned); + struct bkey_s_c, unsigned, + unsigned, struct printbuf *); +int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned, struct printbuf *); void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_fs_ec_stop(struct bch_fs *); diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index adc1f931..420f6922 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -345,6 +345,7 @@ x(BCH_ERR_data_read, data_read_no_encryption_key) \ x(BCH_ERR_data_read, data_read_buffer_too_small) \ x(BCH_ERR_data_read, data_read_key_overwritten) \ + x(0, rbio_narrow_crcs_fail) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index e33f3166..a16f55d9 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -394,7 +394,7 @@ int bch2_fsck_err_opt(struct bch_fs *c, flags |= fsck_flags_extra[err]; if (test_bit(BCH_FS_in_fsck, &c->flags) || - test_bit(BCH_FS_in_recovery, &c->flags)) { + c->opts.fix_errors != FSCK_FIX_exit) { if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) return bch_err_throw(c, fsck_repair_unimplemented); @@ -468,10 +468,10 @@ int __bch2_fsck_err(struct bch_fs *c, if ((flags & FSCK_ERR_SILENT) || test_bit(err, c->sb.errors_silent)) { - ret = flags & FSCK_CAN_FIX + set_bit(BCH_FS_errors_fixed_silent, &c->flags); + return flags & FSCK_CAN_FIX ? 
bch_err_throw(c, fsck_fix) : bch_err_throw(c, fsck_ignore); - goto err; } printbuf_indent_add_nextline(out, 2); diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c index 73eb2809..1279026b 100644 --- a/libbcachefs/extent_update.c +++ b/libbcachefs/extent_update.c @@ -146,6 +146,7 @@ int bch2_extent_trim_atomic(struct btree_trans *trans, if (bpos_ge(bkey_start_pos(k.k), end)) break; + nr_iters += 1; ret = count_iters_for_insert(trans, k, offset, &end, &nr_iters); if (ret) break; diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 68a61f7b..86aa93ea 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -1151,7 +1151,7 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke return NULL; } -static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, +static bool want_cached_ptr(struct bch_fs *c, struct bch_inode_opts *opts, struct bch_extent_ptr *ptr) { unsigned target = opts->promote_target ?: opts->foreground_target; @@ -1165,7 +1165,7 @@ static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, } void bch2_extent_ptr_set_cached(struct bch_fs *c, - struct bch_io_opts *opts, + struct bch_inode_opts *opts, struct bkey_s k, struct bch_extent_ptr *ptr) { @@ -1241,7 +1241,7 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) * the promote target. */ bool bch2_extent_normalize_by_opts(struct bch_fs *c, - struct bch_io_opts *opts, + struct bch_inode_opts *opts, struct bkey_s k) { struct bkey_ptrs ptrs; @@ -1270,14 +1270,14 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); if (!ca) { - prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + prt_printf(out, "%u:%llu gen %u%s", ptr->dev, (u64) ptr->offset, ptr->gen, ptr->cached ? 
" cached" : ""); } else { u32 offset; u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - prt_printf(out, "ptr: %u:%llu:%u gen %u", + prt_printf(out, "%u:%llu:%u gen %u", ptr->dev, b, offset, ptr->gen); if (ca->mi.durability != 1) prt_printf(out, " d=%u", ca->mi.durability); @@ -1295,7 +1295,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc) { - prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ", + prt_printf(out, "c_size %u size %u offset %u nonce %u csum ", crc->compressed_size, crc->uncompressed_size, crc->offset, crc->nonce); @@ -1305,72 +1305,34 @@ void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_cr bch2_prt_compression_type(out, crc->compression_type); } -static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, - const struct bch_extent_rebalance *r) -{ - prt_str(out, "rebalance:"); - - prt_printf(out, " replicas=%u", r->data_replicas); - if (r->data_replicas_from_inode) - prt_str(out, " (inode)"); - - prt_str(out, " checksum="); - bch2_prt_csum_opt(out, r->data_checksum); - if (r->data_checksum_from_inode) - prt_str(out, " (inode)"); - - if (r->background_compression || r->background_compression_from_inode) { - prt_str(out, " background_compression="); - bch2_compression_opt_to_text(out, r->background_compression); - - if (r->background_compression_from_inode) - prt_str(out, " (inode)"); - } - - if (r->background_target || r->background_target_from_inode) { - prt_str(out, " background_target="); - if (c) - bch2_target_to_text(out, c, r->background_target); - else - prt_printf(out, "%u", r->background_target); - - if (r->background_target_from_inode) - prt_str(out, " (inode)"); - } - - if (r->promote_target || r->promote_target_from_inode) { - prt_str(out, " promote_target="); - if (c) - bch2_target_to_text(out, c, r->promote_target); - else - 
prt_printf(out, "%u", r->promote_target); - - if (r->promote_target_from_inode) - prt_str(out, " (inode)"); - } - - if (r->erasure_code || r->erasure_code_from_inode) { - prt_printf(out, " ec=%u", r->erasure_code); - if (r->erasure_code_from_inode) - prt_str(out, " (inode)"); - } -} +static const char * const extent_entry_types[] = { +#define x(t, n, ...) [n] = #t, + BCH_EXTENT_ENTRY_TYPES() +#undef x + NULL +}; void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - bool first = true; if (c) prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); + guard(printbuf_indent)(out); + bkey_extent_entry_for_each(ptrs, entry) { - if (!first) - prt_printf(out, " "); + prt_newline(out); - switch (__extent_entry_type(entry)) { + unsigned type = __extent_entry_type(entry); + if (type < BCH_EXTENT_ENTRY_MAX) { + prt_str(out, extent_entry_types[__extent_entry_type(entry)]); + prt_str(out, ": "); + } + + switch (type) { case BCH_EXTENT_ENTRY_ptr: bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); break; @@ -1387,8 +1349,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, case BCH_EXTENT_ENTRY_stripe_ptr: { const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr; - prt_printf(out, "ec: idx %llu block %u", - (u64) ec->idx, ec->block); + prt_printf(out, "idx %llu block %u", (u64) ec->idx, ec->block); break; } case BCH_EXTENT_ENTRY_rebalance: @@ -1403,8 +1364,6 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; } - - first = false; } } diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index f6dcb171..03ea7c68 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -686,10 +686,10 @@ bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); struct bch_extent_ptr * bch2_extent_has_ptr(struct bkey_s_c, struct 
extent_ptr_decoded, struct bkey_s); -void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *, +void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_inode_opts *, struct bkey_s, struct bch_extent_ptr *); -bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s); +bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_inode_opts *, struct bkey_s); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index 9532f1a7..fe684adc 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -284,12 +284,12 @@ void bch2_readahead(struct readahead_control *ractl) { struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; struct folio *folio; struct readpages_iter readpages_iter; struct blk_plug plug; - bch2_inode_opts_get(&opts, c, &inode->ei_inode); + struct bch_inode_opts opts; + bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts); int ret = readpages_iter_init(&readpages_iter, ractl); if (ret) @@ -350,7 +350,7 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_read_bio *rbio; - struct bch_io_opts opts; + struct bch_inode_opts opts; struct blk_plug plug; int ret; DECLARE_COMPLETION_ONSTACK(done); @@ -361,7 +361,7 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) if (!bch2_folio_create(folio, GFP_KERNEL)) return -ENOMEM; - bch2_inode_opts_get(&opts, c, &inode->ei_inode); + bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), c, @@ -407,7 +407,7 @@ struct bch_writepage_io { struct 
bch_writepage_state { struct bch_writepage_io *io; - struct bch_io_opts opts; + struct bch_inode_opts opts; struct bch_folio_sector *tmp; unsigned tmp_sectors; struct blk_plug plug; @@ -532,6 +532,39 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } +static bool can_write_now(struct bch_fs *c, unsigned replicas_want, struct closure *cl) +{ + unsigned reserved = OPEN_BUCKETS_COUNT - + (OPEN_BUCKETS_COUNT - bch2_open_buckets_reserved(BCH_WATERMARK_normal)) / 2; + + if (unlikely(c->open_buckets_nr_free <= reserved)) { + closure_wait(&c->open_buckets_wait, cl); + return false; + } + + if (BCH_WATERMARK_normal < c->journal.watermark && !bch2_journal_error(&c->journal)) { + closure_wait(&c->journal.async_wait, cl); + return false; + } + + return true; +} + +static void throttle_writes(struct bch_fs *c, unsigned replicas_want, struct closure *cl) +{ + u64 start = 0; + while (!can_write_now(c, replicas_want, cl)) { + if (!start) + start = local_clock(); + closure_sync(cl); + } + + BUG_ON(closure_nr_remaining(cl) > 1); + + if (start) + bch2_time_stats_update(&c->times[BCH_TIME_blocked_writeback_throttle], start); +} + static int __bch2_writepage(struct folio *folio, struct writeback_control *wbc, void *data) @@ -667,26 +700,25 @@ do_io: return 0; } -static int bch2_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, void *data) -{ - struct folio *folio = NULL; - int error; - - while ((folio = writeback_iter(mapping, wbc, folio, &error))) - error = __bch2_writepage(folio, wbc, data); - return error; -} - int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct bch_fs *c = mapping->host->i_sb->s_fs_info; struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL); - bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); + bch2_inode_opts_get_inode(c, &to_bch_ei(mapping->host)->ei_inode, &w->opts); blk_start_plug(&w->plug); - 
int ret = bch2_write_cache_pages(mapping, wbc, w); + + struct closure cl; + closure_init_stack(&cl); + + struct folio *folio = NULL; + int ret = 0; + + while (throttle_writes(c, w->opts.data_replicas, &cl), + (folio = writeback_iter(mapping, wbc, folio, &ret))) + ret = __bch2_writepage(folio, wbc, w); + if (w->io) bch2_writepage_do_io(w); blk_finish_plug(&w->plug); @@ -697,7 +729,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc /* buffered writes: */ -int bch2_write_begin(struct file *file, struct address_space *mapping, +int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -780,7 +812,7 @@ err_unlock: return bch2_err_class(ret); } -int bch2_write_end(struct file *file, struct address_space *mapping, +int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { diff --git a/libbcachefs/fs-io-buffered.h b/libbcachefs/fs-io-buffered.h index 3207ebbb..14de91c2 100644 --- a/libbcachefs/fs-io-buffered.h +++ b/libbcachefs/fs-io-buffered.h @@ -10,9 +10,9 @@ int bch2_read_folio(struct file *, struct folio *); int bch2_writepages(struct address_space *, struct writeback_control *); void bch2_readahead(struct readahead_control *); -int bch2_write_begin(struct file *, struct address_space *, loff_t pos, +int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos, unsigned len, struct folio **, void **); -int bch2_write_end(struct file *, struct address_space *, loff_t, +int bch2_write_end(const struct kiocb *, struct address_space *, loff_t, unsigned len, unsigned copied, struct folio *, void *); ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c index 79823234..a104b9d7 100644 --- a/libbcachefs/fs-io-direct.c +++ b/libbcachefs/fs-io-direct.c @@ -68,7 +68,6 
@@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct file *file = req->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; struct dio_read *dio; struct bio *bio; struct blk_plug plug; @@ -78,7 +77,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) size_t shorten; ssize_t ret; - bch2_inode_opts_get(&opts, c, &inode->ei_inode); + struct bch_inode_opts opts; + bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts); /* bios must be 512 byte aligned: */ if ((offset|iter->count) & (SECTOR_SIZE - 1)) @@ -445,13 +445,13 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) struct kiocb *req = dio->req; struct address_space *mapping = dio->mapping; struct bch_inode_info *inode = dio->inode; - struct bch_io_opts opts; + struct bch_inode_opts opts; struct bio *bio = &dio->op.wbio.bio; unsigned unaligned, iter_count; bool sync = dio->sync, dropped_locks; long ret; - bch2_inode_opts_get(&opts, c, &inode->ei_inode); + bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts); while (1) { iter_count = dio->iter.count; diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index de0d965f..57e9459a 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -627,10 +627,10 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bpos end_pos = POS(inode->v.i_ino, end_sector); - struct bch_io_opts opts; + struct bch_inode_opts opts; int ret = 0; - bch2_inode_opts_get(&opts, c, &inode->ei_inode); + bch2_inode_opts_get_inode(c, &inode->ei_inode, &opts); CLASS(btree_trans, trans)(c); CLASS(btree_iter, iter)(trans, BTREE_ID_extents, diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 39ebcab1..d6a2031e 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1520,11 +1520,11 @@ static const struct vm_operations_struct bch_vm_ops = { .page_mkwrite = 
bch2_page_mkwrite, }; -static int bch2_mmap(struct file *file, struct vm_area_struct *vma) +static int bch2_mmap_prepare(struct vm_area_desc *desc) { - file_accessed(file); + file_accessed(desc->file); - vma->vm_ops = &bch_vm_ops; + desc->vm_ops = &bch_vm_ops; return 0; } @@ -1586,7 +1586,7 @@ static const __maybe_unused unsigned bch_flags_to_xflags[] = { }; static int bch2_fileattr_get(struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -1649,7 +1649,7 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans, static int bch2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -1714,7 +1714,7 @@ static const struct file_operations bch_file_operations = { .llseek = bch2_llseek, .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, - .mmap = bch2_mmap, + .mmap_prepare = bch2_mmap_prepare, .get_unmapped_area = thp_get_unmapped_area, .fsync = bch2_fsync, .splice_read = filemap_splice_read, @@ -2147,9 +2147,11 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); int ret = bch2_inode_rm(c, inode_inum(inode)); if (ret && !bch2_err_matches(ret, EROFS)) { - bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu", - inode->ei_inum.subvol, - inode->ei_inum.inum); + CLASS(printbuf, buf)(); + bch2_trans_do(c, bch2_inum_to_path(trans, inode->ei_inum, &buf)); + + bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu\n%s", + inode->ei_inum.subvol, inode->ei_inum.inum, buf.buf); bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm); } @@ -2236,11 +2238,16 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) struct bch_fs *c = sb->s_fs_info; struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); 
unsigned shift = sb->s_blocksize_bits - 9; + /* - * this assumes inodes take up 64 bytes, which is a decent average + * This assumes inodes take up 64 bytes, which is a decent average * number: + * + * Not anymore - bi_dir, bi_dir_offset came later and shouldn't have + * been varint fields: seeing 144-160 byte inodes, so let's call it 256 + * bytes: */ - u64 avail_inodes = ((usage.capacity - usage.used) << 3); + u64 avail_inodes = ((usage.capacity - usage.used) << 1); buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 4aa130ff..543627fb 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -369,9 +369,9 @@ err: } int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans, - u64 inode_nr, u32 snapshot, - struct bch_inode_unpacked *inode, - unsigned flags) + u64 inode_nr, u32 snapshot, + struct bch_inode_unpacked *inode, + unsigned flags) { CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, SPOS(0, inode_nr, snapshot), flags); struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); @@ -598,7 +598,7 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { prt_printf(out, "\n"); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); prt_printf(out, "mode=%o\n", inode->bi_mode); prt_str(out, "flags="); @@ -620,7 +620,6 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, #undef x bch2_printbuf_strip_trailing_newline(out); - printbuf_indent_sub(out, 2); } void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) @@ -674,7 +673,7 @@ static inline void bkey_inode_flags_set(struct bkey_s k, u64 f) static inline bool bkey_is_unlinked_inode(struct bkey_s_c k) { - unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked; + unsigned f = bkey_inode_flags(k); return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot); } @@ -1224,32 +1223,45 @@ struct bch_opts bch2_inode_opts_to_opts(struct 
bch_inode_unpacked *inode) return ret; } -void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, - struct bch_inode_unpacked *inode) +void bch2_inode_opts_get_inode(struct bch_fs *c, + struct bch_inode_unpacked *inode, + struct bch_inode_opts *ret) { #define x(_name, _bits) \ if ((inode)->bi_##_name) { \ - opts->_name = inode->bi_##_name - 1; \ - opts->_name##_from_inode = true; \ + ret->_name = inode->bi_##_name - 1; \ + ret->_name##_from_inode = true; \ } else { \ - opts->_name = c->opts._name; \ - opts->_name##_from_inode = false; \ + ret->_name = c->opts._name; \ + ret->_name##_from_inode = false; \ } BCH_INODE_OPTS() #undef x - bch2_io_opts_fixups(opts); + ret->change_cookie = atomic_read(&c->opt_change_cookie); + + bch2_io_opts_fixups(ret); } -int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) +int bch2_inum_snapshot_opts_get(struct btree_trans *trans, + u64 inum, u32 snapshot, + struct bch_inode_opts *opts) { - struct bch_inode_unpacked inode; - int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode)); + if (inum) { + struct bch_inode_unpacked inode; + int ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); + if (ret) + return ret; - if (ret) - return ret; + bch2_inode_opts_get_inode(trans->c, &inode, opts); + } else { + /* + * data_update_index_update may call us for reflink btree extent + * updates, inum will be 0 + */ - bch2_inode_opts_get(opts, trans->c, &inode); + bch2_inode_opts_get(trans->c, opts); + } return 0; } @@ -1347,7 +1359,7 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - return ret ?: bch_err_throw(c, transaction_restart_nested); + return ret; } /* @@ -1386,7 +1398,8 @@ next_parent: int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?: - delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); + 
delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)) ?: + bch_err_throw(trans->c, transaction_restart_nested); } static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 79092ea7..63b70888 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -289,9 +289,8 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); -void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, - struct bch_inode_unpacked *); -int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *); +void bch2_inode_opts_get_inode(struct bch_fs *, struct bch_inode_unpacked *, struct bch_inode_opts *); +int bch2_inum_snapshot_opts_get(struct btree_trans *, u64, u32, struct bch_inode_opts *); int bch2_inode_set_casefold(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, unsigned); @@ -300,8 +299,8 @@ int bch2_inode_set_casefold(struct btree_trans *, subvol_inum, static inline struct bch_extent_rebalance bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) { - struct bch_io_opts io_opts; - bch2_inode_opts_get(&io_opts, c, inode); + struct bch_inode_opts io_opts; + bch2_inode_opts_get_inode(c, inode, &io_opts); return io_opts_to_rebalance_opts(c, &io_opts); } diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c index fa0b06e1..04eb5ecd 100644 --- a/libbcachefs/io_misc.c +++ b/libbcachefs/io_misc.c @@ -24,7 +24,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, subvol_inum inum, struct btree_iter *iter, u64 sectors, - struct bch_io_opts opts, + struct bch_inode_opts opts, s64 *i_sectors_delta, struct write_point_specifier write_point) { @@ -109,7 +109,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, } ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, - 0, 
i_sectors_delta, true); + 0, i_sectors_delta, true, 0); err: if (!ret && sectors_allocated) bch2_increment_clock(c, sectors_allocated, WRITE); @@ -211,7 +211,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_cut_back(end_pos, &delete); ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, 0, i_sectors_delta, false); + &disk_res, 0, i_sectors_delta, false, 0); bch2_disk_reservation_put(c, &disk_res); } @@ -373,7 +373,6 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, struct btree_iter iter; struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; - struct bch_io_opts opts; u64 dst_offset = le64_to_cpu(op->v.dst_offset); u64 src_offset = le64_to_cpu(op->v.src_offset); s64 shift = dst_offset - src_offset; @@ -384,10 +383,6 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, bool warn_errors = i_sectors_delta != NULL; int ret = 0; - ret = bch2_inum_opts_get(trans, inum, &opts); - if (ret) - return ret; - /* * check for missing subvolume before fpunch, as in resume we don't want * it to be a fatal error @@ -476,8 +471,7 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? 
bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?: - bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc); diff --git a/libbcachefs/io_misc.h b/libbcachefs/io_misc.h index b93e4d4b..6a294f2a 100644 --- a/libbcachefs/io_misc.h +++ b/libbcachefs/io_misc.h @@ -3,7 +3,7 @@ #define _BCACHEFS_IO_MISC_H int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, - u64, struct bch_io_opts, s64 *, + u64, struct bch_inode_opts, s64 *, struct write_point_specifier); int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos); diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c index e7d53ab1..e7ba0d0b 100644 --- a/libbcachefs/io_read.c +++ b/libbcachefs/io_read.c @@ -158,7 +158,7 @@ static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, - struct bch_io_opts opts, + struct bch_inode_opts opts, unsigned flags, struct bch_io_failures *failed) { @@ -408,9 +408,8 @@ void bch2_promote_op_to_text(struct printbuf *out, { if (!op->write.read_done) { prt_printf(out, "parent read: %px\n", op->write.rbio.parent); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); bch2_read_bio_to_text(out, c, op->write.rbio.parent); - printbuf_indent_sub(out, 2); } bch2_data_update_to_text(out, &op->write); @@ -741,15 +740,13 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, } static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, - struct bch_read_bio *rbio) + struct bch_read_bio *rbio, + struct bch_extent_crc_unpacked *new_crc) { struct bch_fs *c = rbio->c; u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; int 
ret = 0; - if (crc_is_compressed(rbio->pick.crc)) - return 0; - CLASS(btree_iter, iter)(trans, rbio->data_btree, rbio->data_pos, BTREE_ITER_intent); struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) @@ -757,21 +754,12 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (bversion_cmp(k.k->bversion, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) - return 0; + return bch_err_throw(c, rbio_narrow_crcs_fail); - /* Extent was merged? */ - if (bkey_start_offset(k.k) < data_offset || - k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) - return 0; - - struct bch_extent_crc_unpacked new_crc; - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(k.k) - data_offset, k.k->size, - rbio->pick.crc.csum_type)) { - bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); - return 0; - } + /* Extent was trimmed/merged? */ + if (!bpos_eq(bkey_start_pos(k.k), rbio->data_pos) || + k.k->p.offset != rbio->data_pos.offset + rbio->pick.crc.live_size) + return bch_err_throw(c, rbio_narrow_crcs_fail); /* * going to be temporarily appending another checksum entry: @@ -783,17 +771,37 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, bkey_reassemble(new, k); - if (!bch2_bkey_narrow_crcs(new, new_crc)) - return 0; + if (!bch2_bkey_narrow_crcs(new, *new_crc)) + return bch_err_throw(c, rbio_narrow_crcs_fail); return bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node); } static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { - CLASS(btree_trans, trans)(rbio->c); - commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __bch2_rbio_narrow_crcs(trans, rbio)); + struct bch_fs *c = rbio->c; + + if (crc_is_compressed(rbio->pick.crc)) + return; + + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; + + struct bch_extent_crc_unpacked new_crc; + if 
(bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + rbio->data_pos.offset - data_offset, rbio->pick.crc.live_size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + return; + } + + CLASS(btree_trans, trans)(c); + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_rbio_narrow_crcs(trans, rbio, &new_crc)); + if (!ret) + count_event(c, io_read_narrow_crcs); + else if (ret == -BCH_ERR_rbio_narrow_crcs_fail) + count_event(c, io_read_narrow_crcs_fail); } static void bch2_read_decompress_err(struct work_struct *work) @@ -1076,8 +1084,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, } if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && - !orig->data_update) - return bch_err_throw(c, extent_poisoned); + !orig->data_update) { + ret = bch_err_throw(c, extent_poisoned); + goto err; + } retry_pick: ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); @@ -1517,7 +1527,7 @@ void bch2_read_bio_to_text(struct printbuf *out, /* Are we in a retry? 
*/ - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); u64 now = local_clock(); prt_printf(out, "start_time:\t"); @@ -1551,7 +1561,6 @@ void bch2_read_bio_to_text(struct printbuf *out, prt_newline(out); bch2_bio_to_text(out, &rbio->bio); - printbuf_indent_sub(out, 2); } void bch2_fs_io_read_exit(struct bch_fs *c) diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h index 1e1c0476..df4632f6 100644 --- a/libbcachefs/io_read.h +++ b/libbcachefs/io_read.h @@ -74,7 +74,7 @@ struct bch_read_bio { struct bpos data_pos; struct bversion version; - struct bch_io_opts opts; + struct bch_inode_opts opts; struct work_struct work; @@ -192,7 +192,7 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, static inline struct bch_read_bio *rbio_init(struct bio *bio, struct bch_fs *c, - struct bch_io_opts opts, + struct bch_inode_opts opts, bio_end_io_t end_io) { struct bch_read_bio *rbio = to_rbio(bio); diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c index 1d83dcc9..6a5da02c 100644 --- a/libbcachefs/io_write.c +++ b/libbcachefs/io_write.c @@ -205,7 +205,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, struct btree_iter *extent_iter, u64 new_i_size, - s64 i_sectors_delta) + s64 i_sectors_delta, + struct bch_inode_unpacked *inode_u) { /* * Crazy performance optimization: @@ -227,7 +228,13 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, BTREE_ITER_intent| BTREE_ITER_cached); struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); - int ret = bkey_err(k); + + /* + * XXX: we currently need to unpack the inode on every write because we + * need the current io_opts, for transactional consistency - inode_v4? 
+ */ + int ret = bkey_err(k) ?: + bch2_inode_unpack(k, inode_u); if (unlikely(ret)) return ret; @@ -303,8 +310,10 @@ int bch2_extent_update(struct btree_trans *trans, struct disk_reservation *disk_res, u64 new_i_size, s64 *i_sectors_delta_total, - bool check_enospc) + bool check_enospc, + u32 change_cookie) { + struct bch_fs *c = trans->c; struct bpos next_pos; bool usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; @@ -335,7 +344,7 @@ int bch2_extent_update(struct btree_trans *trans, if (disk_res && disk_sectors_delta > (s64) disk_res->sectors) { - ret = bch2_disk_reservation_add(trans->c, disk_res, + ret = bch2_disk_reservation_add(c, disk_res, disk_sectors_delta - disk_res->sectors, !check_enospc || !usage_increasing ? BCH_DISK_RESERVATION_NOFAIL : 0); @@ -349,9 +358,16 @@ int bch2_extent_update(struct btree_trans *trans, * aren't changing - for fsync to work properly; fsync relies on * inode->bi_journal_seq which is updated by the trigger code: */ + struct bch_inode_unpacked inode; + struct bch_inode_opts opts; + ret = bch2_extent_update_i_size_sectors(trans, iter, min(k->k.p.offset << 9, new_i_size), - i_sectors_delta) ?: + i_sectors_delta, &inode) ?: + (bch2_inode_opts_get_inode(c, &inode, &opts), + bch2_bkey_set_needs_rebalance(c, &opts, k, + SET_NEEDS_REBALANCE_foreground, + change_cookie)) ?: bch2_trans_update(trans, iter, k, 0) ?: bch2_trans_commit(trans, disk_res, NULL, BCH_TRANS_COMMIT_no_check_rw| @@ -402,7 +418,8 @@ static int bch2_write_index_default(struct bch_write_op *op) ret = bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_check_enospc); + op->flags & BCH_WRITE_check_enospc, + op->opts.change_cookie); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -792,10 +809,6 @@ static void init_append_extent(struct bch_write_op *op, bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, op->flags & BCH_WRITE_cached); - - if 
(!(op->flags & BCH_WRITE_move)) - bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i); - bch2_keylist_push(&op->insert_keys); } @@ -1225,6 +1238,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return 0; } + struct bch_fs *c = trans->c; struct bkey_i *new = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + sizeof(struct bch_extent_rebalance)); int ret = PTR_ERR_OR_ZERO(new); @@ -1239,8 +1253,6 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, bkey_for_each_ptr(ptrs, ptr) ptr->unwritten = 0; - bch2_bkey_set_needs_rebalance(op->c, &op->opts, new); - /* * Note that we're not calling bch2_subvol_get_snapshot() in this path - * that was done when we kicked off the write, and here it's important @@ -1248,8 +1260,20 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, * since been created. The write is still outstanding, so we're ok * w.r.t. snapshot atomicity: */ + + /* + * For transactional consistency, set_needs_rebalance() has to be called + * with the io_opts from the btree in the same transaction: + */ + struct bch_inode_unpacked inode; + struct bch_inode_opts opts; + return bch2_extent_update_i_size_sectors(trans, iter, - min(new->k.p.offset << 9, new_i_size), 0) ?: + min(new->k.p.offset << 9, new_i_size), 0, &inode) ?: + (bch2_inode_opts_get_inode(c, &inode, &opts), + bch2_bkey_set_needs_rebalance(c, &opts, new, + SET_NEEDS_REBALANCE_foreground, + op->opts.change_cookie)) ?: bch2_trans_update(trans, iter, new, BTREE_UPDATE_internal_snapshot_node); } @@ -1742,7 +1766,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) prt_printf(out, "pos:\t"); bch2_bpos_to_text(out, op->pos); prt_newline(out); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); prt_printf(out, "started:\t"); bch2_pr_time_units(out, local_clock() - op->start_time); @@ -1754,11 +1778,12 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) 
prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); + prt_printf(out, "devs_have:\t"); + bch2_devs_list_to_text(out, &op->devs_have); + prt_newline(out); prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); prt_printf(out, "ret\t%s\n", bch2_err_str(op->error)); - - printbuf_indent_sub(out, 2); } void bch2_fs_io_write_exit(struct bch_fs *c) diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h index 2c0a8f35..692529bf 100644 --- a/libbcachefs/io_write.h +++ b/libbcachefs/io_write.h @@ -28,10 +28,10 @@ int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, struct bkey_i *, bool *, s64 *, s64 *); int bch2_extent_update(struct btree_trans *, subvol_inum, struct btree_iter *, struct bkey_i *, - struct disk_reservation *, u64, s64 *, bool); + struct disk_reservation *, u64, s64 *, bool, u32); static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct bch_io_opts opts) + struct bch_inode_opts opts) { op->c = c; op->end_io = NULL; diff --git a/libbcachefs/io_write_types.h b/libbcachefs/io_write_types.h index 5da4eb8b..ab36b03e 100644 --- a/libbcachefs/io_write_types.h +++ b/libbcachefs/io_write_types.h @@ -90,7 +90,7 @@ struct bch_write_op { struct bch_devs_list devs_have; u16 target; u16 nonce; - struct bch_io_opts opts; + struct bch_inode_opts opts; u32 subvol; struct bpos pos; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 93ac0fae..6505c79f 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -48,7 +48,7 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 struct journal_buf *buf = j->buf + i; prt_printf(out, "seq:\t%llu\n", seq); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); if (!buf->write_started) prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); @@ -81,8 +81,6 @@ static void bch2_journal_buf_to_text(struct 
printbuf *out, struct journal *j, u6 if (buf->write_done) prt_str(out, "write_done"); prt_newline(out); - - printbuf_indent_sub(out, 2); } static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) @@ -1767,20 +1765,20 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) bch2_journal_bufs_to_text(out, j); prt_printf(out, "space:\n"); - printbuf_indent_add(out, 2); - prt_printf(out, "discarded\t%u:%u\n", - j->space[journal_space_discarded].next_entry, - j->space[journal_space_discarded].total); - prt_printf(out, "clean ondisk\t%u:%u\n", - j->space[journal_space_clean_ondisk].next_entry, - j->space[journal_space_clean_ondisk].total); - prt_printf(out, "clean\t%u:%u\n", - j->space[journal_space_clean].next_entry, - j->space[journal_space_clean].total); - prt_printf(out, "total\t%u:%u\n", - j->space[journal_space_total].next_entry, - j->space[journal_space_total].total); - printbuf_indent_sub(out, 2); + scoped_guard(printbuf_indent, out) { + prt_printf(out, "discarded\t%u:%u\n", + j->space[journal_space_discarded].next_entry, + j->space[journal_space_discarded].total); + prt_printf(out, "clean ondisk\t%u:%u\n", + j->space[journal_space_clean_ondisk].next_entry, + j->space[journal_space_clean_ondisk].total); + prt_printf(out, "clean\t%u:%u\n", + j->space[journal_space_clean].next_entry, + j->space[journal_space_clean].total); + prt_printf(out, "total\t%u:%u\n", + j->space[journal_space_total].next_entry, + j->space[journal_space_total].total); + } for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { if (!ca->mi.durability) @@ -1796,7 +1794,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "dev %u:\n", ca->dev_idx); prt_printf(out, "durability %u:\n", ca->mi.durability); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); prt_printf(out, "nr\t%u\n", ja->nr); prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); prt_printf(out, "available\t%u:%u\n", 
bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); @@ -1804,7 +1802,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "dirty_ondisk\t%u (seq %llu)\n",ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); prt_printf(out, "dirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); prt_printf(out, "cur_idx\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); - printbuf_indent_sub(out, 2); } prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 0a9fbc76..e6f778bf 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -216,7 +216,9 @@ nocompact: if (seq < c->journal_entries_base_seq || seq >= c->journal_entries_base_seq + U32_MAX) { - bch_err(c, "journal entry sequence numbers span too large a range: cannot reply, contact developers"); + bch_err(c, "journal entry sequence numbers span too large a range: cannot replay, contact developers\n" + "base %llu last_seq currently %llu, but have seq %llu", + c->journal_entries_base_seq, jlist->last_seq, seq); return bch_err_throw(c, ENOMEM_journal_entry_add); } @@ -758,8 +760,8 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs return; prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); + guard(printbuf_indent)(out); - printbuf_indent_add(out, 2); for (i = 0; i < nr_types; i++) { prt_newline(out); bch2_prt_data_type(out, i); @@ -768,7 +770,6 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs le64_to_cpu(u->d[i].sectors), le64_to_cpu(u->d[i].fragmented)); } - printbuf_indent_sub(out, 2); } static int journal_entry_log_validate(struct bch_fs *c, diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index bd188560..ae747c87 100644 --- a/libbcachefs/journal_reclaim.c +++ 
b/libbcachefs/journal_reclaim.c @@ -1019,7 +1019,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 pin_list = journal_seq_pin(j, *seq); prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); prt_printf(out, "unflushed:\n"); for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) @@ -1031,8 +1031,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 list_for_each_entry(pin, &pin_list->flushed[i], list) prt_printf(out, "\t%px %ps\n", pin, pin->flush); - printbuf_indent_sub(out, 2); - return false; } diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c index 0cb9b93f..dc0ecedb 100644 --- a/libbcachefs/journal_sb.c +++ b/libbcachefs/journal_sb.c @@ -30,7 +30,7 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f, if (!nr) return 0; - b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); + b = kvmalloc_array(nr, sizeof(u64), GFP_KERNEL); if (!b) return -BCH_ERR_ENOMEM_sb_journal_validate; @@ -64,7 +64,7 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, struct bch_sb_field *f, ret = 0; err: - kfree(b); + kvfree(b); return ret; } @@ -113,7 +113,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f if (!nr) return 0; - b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); + b = kvmalloc_array(nr, sizeof(*b), GFP_KERNEL); if (!b) return -BCH_ERR_ENOMEM_sb_journal_v2_validate; @@ -165,7 +165,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f ret = 0; err: - kfree(b); + kvfree(b); return ret; } @@ -230,3 +230,40 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, BUG_ON(dst + 1 != nr_compacted); return 0; } + +static inline bool journal_v2_unsorted(struct bch_sb_field_journal_v2 *j) +{ + unsigned nr = bch2_sb_field_journal_v2_nr_entries(j); + for (unsigned i = 0; i + 1 < nr; i++) + if 
(le64_to_cpu(j->d[i].start) > le64_to_cpu(j->d[i + 1].start)) + return true; + return false; +} + +int bch2_sb_journal_sort(struct bch_fs *c) +{ + BUG_ON(!c->sb.clean); + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); + + guard(mutex)(&c->sb_lock); + bool write_sb = false; + + for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_journal_sort) { + struct bch_sb_field_journal_v2 *j = bch2_sb_field_get(ca->disk_sb.sb, journal_v2); + if (!j) + continue; + + if ((j && journal_v2_unsorted(j)) || + bch2_sb_field_get(ca->disk_sb.sb, journal)) { + struct journal_device *ja = &ca->journal; + + sort(ja->buckets, ja->nr, sizeof(ja->buckets[0]), u64_cmp, NULL); + bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr); + write_sb = true; + } + } + + return write_sb + ? bch2_write_super(c) + : 0; +} diff --git a/libbcachefs/journal_sb.h b/libbcachefs/journal_sb.h index ba40a7e8..e0fc4065 100644 --- a/libbcachefs/journal_sb.h +++ b/libbcachefs/journal_sb.h @@ -22,3 +22,4 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_journal; extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned); +int bch2_sb_journal_sort(struct bch_fs *); diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index b9c08344..c533b607 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -51,25 +51,17 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, : 0; } -int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) +static int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) { - return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted); -} - -int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) -{ - return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); + return __bch2_lru_set(trans, lru_id, dev_bucket, time, true); } int __bch2_lru_change(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, 
u64 old_time, u64 new_time) { - if (old_time == new_time) - return 0; - - return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?: - bch2_lru_set(trans, lru_id, dev_bucket, new_time); + return __bch2_lru_set(trans, lru_id, dev_bucket, old_time, false) ?: + __bch2_lru_set(trans, lru_id, dev_bucket, new_time, true); } static const char * const bch2_lru_types[] = { @@ -87,7 +79,6 @@ int bch2_lru_check_set(struct btree_trans *trans, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - CLASS(printbuf, buf)(); CLASS(btree_iter, lru_iter)(trans, BTREE_ID_lru, lru_pos(lru_id, dev_bucket, time), 0); struct bkey_s_c lru_k = bch2_btree_iter_peek_slot(&lru_iter); int ret = bkey_err(lru_k); @@ -99,10 +90,13 @@ int bch2_lru_check_set(struct btree_trans *trans, if (ret) return ret; - if (fsck_err(trans, alloc_key_to_missing_lru_entry, - "missing %s lru entry\n%s", - bch2_lru_types[lru_type(lru_k)], - (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { + CLASS(printbuf, buf)(); + prt_printf(&buf, "missing %s lru entry at pos ", bch2_lru_types[lru_type(lru_k)]); + bch2_bpos_to_text(&buf, lru_iter.pos); + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, referring_k); + + if (fsck_err(trans, alloc_key_to_missing_lru_entry, "%s", buf.buf)) { ret = bch2_lru_set(trans, lru_id, dev_bucket, time); if (ret) return ret; @@ -127,6 +121,23 @@ static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) } } +int bch2_dev_remove_lrus(struct bch_fs *c, struct bch_dev *ca) +{ + CLASS(btree_trans, trans)(c); + int ret = bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key(trans, iter, + BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, ({ + struct bbpos bp = lru_pos_to_bp(k); + + bp.btree == BTREE_ID_alloc && bp.pos.inode == ca->dev_idx + ? 
(bch2_btree_delete_at(trans, &iter, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0)) + : 0; + })); + bch_err_fn(c, ret); + return ret; +} + static u64 bkey_lru_type_idx(struct bch_fs *c, enum bch_lru_type type, struct bkey_s_c k) diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h index 6f1e0a7b..d5a2620f 100644 --- a/libbcachefs/lru.h +++ b/libbcachefs/lru.h @@ -59,8 +59,6 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos); .min_val_size = 8, \ }) -int bch2_lru_del(struct btree_trans *, u16, u64, u64); -int bch2_lru_set(struct btree_trans *, u16, u64, u64); int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); static inline int bch2_lru_change(struct btree_trans *trans, @@ -72,9 +70,10 @@ static inline int bch2_lru_change(struct btree_trans *trans, : 0; } +int bch2_dev_remove_lrus(struct bch_fs *, struct bch_dev *); + struct bkey_buf; int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *); - int bch2_check_lrus(struct bch_fs *); #endif /* _BCACHEFS_LRU_H */ diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 892990b4..8a3981e1 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -22,8 +22,8 @@ #include "replicas.h" #include "super-io.h" -static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, - unsigned dev_idx, unsigned flags, bool metadata) +static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, unsigned dev_idx, + unsigned flags, struct printbuf *err, bool metadata) { unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; unsigned lost = metadata ? 
BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; @@ -34,14 +34,19 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || - (nr_good < replicas && !(flags & degraded))) + (nr_good < replicas && !(flags & degraded))) { + prt_str(err, "cannot drop device without degrading/losing data\n "); + bch2_bkey_val_to_text(err, c, k.s_c); + prt_newline(err); return bch_err_throw(c, remove_would_lose_data); + } return 0; } static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter, - struct btree *b, unsigned dev_idx, unsigned flags) + struct btree *b, unsigned dev_idx, + unsigned flags, struct printbuf *err) { struct bch_fs *c = trans->c; struct bkey_buf k; @@ -49,10 +54,9 @@ static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_buf_init(&k); bch2_bkey_buf_copy(&k, c, &b->key); - int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?: + int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, err, true) ?: bch2_btree_node_update_key(trans, iter, b, k.k, 0, false); - bch_err_fn(c, ret); bch2_bkey_buf_exit(&k, c); return ret; } @@ -61,7 +65,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, unsigned dev_idx, - unsigned flags) + unsigned flags, struct printbuf *err) { struct bch_fs *c = trans->c; struct bkey_i *n; @@ -75,7 +79,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, if (ret) return ret; - ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); + ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false); if (ret) return ret; @@ -101,7 +105,7 @@ static int bch2_dev_btree_drop_key(struct btree_trans *trans, struct bkey_s_c_backpointer bp, unsigned dev_idx, struct bkey_buf *last_flushed, - unsigned flags) + unsigned flags, struct printbuf *err) { struct btree_iter iter; struct btree *b = bch2_backpointer_get_node(trans, 
bp, &iter, last_flushed); @@ -109,7 +113,7 @@ static int bch2_dev_btree_drop_key(struct btree_trans *trans, if (ret) return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret; - ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags, err); bch2_trans_iter_exit(&iter); return ret; @@ -117,12 +121,14 @@ static int bch2_dev_btree_drop_key(struct btree_trans *trans, static int bch2_dev_usrdata_drop(struct bch_fs *c, struct progress_indicator_state *progress, - unsigned dev_idx, unsigned flags) + unsigned dev_idx, + unsigned flags, struct printbuf *err) { CLASS(btree_trans, trans)(c); + /* FIXME: this does not handle unknown btrees with data pointers */ for (unsigned id = 0; id < BTREE_ID_NR; id++) { - if (!btree_type_has_ptrs(id)) + if (!btree_type_has_data_ptrs(id)) continue; /* Stripe keys have pointers, but are handled separately */ @@ -133,7 +139,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); - bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); + bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags, err); })); if (ret) return ret; @@ -144,7 +150,8 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, static int bch2_dev_metadata_drop(struct bch_fs *c, struct progress_indicator_state *progress, - unsigned dev_idx, unsigned flags) + unsigned dev_idx, + unsigned flags, struct printbuf *err) { struct btree_iter iter; struct closure cl; @@ -161,7 +168,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, bch2_bkey_buf_init(&k); closure_init_stack(&cl); - for (id = 0; id < BTREE_ID_NR; id++) { + for (id = 0; id < btree_id_nr_alive(c); id++) { bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_prefetch); retry: @@ -174,7 +181,7 @@ retry: if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), 
dev_idx)) goto next; - ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags, err); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; @@ -206,7 +213,7 @@ err: static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed, - unsigned flags) + unsigned flags, struct printbuf *err) { struct btree_iter iter; struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, @@ -226,17 +233,18 @@ static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, */ if (bkey_is_btree_ptr(k.k)) - ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags); + ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags, err); else if (k.k->type == KEY_TYPE_stripe) - ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags); + ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags, err); else - ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); + ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags, err); out: bch2_trans_iter_exit(&iter); return ret; } -int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags) +int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags, + struct printbuf *err) { CLASS(btree_trans, trans)(c); @@ -253,22 +261,22 @@ int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsig continue; data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), - &last_flushed, flags); + &last_flushed, flags, err); })); bch2_bkey_buf_exit(&last_flushed, trans->c); - bch_err_fn(c, ret); return ret; } -int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags) +int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, + unsigned flags, struct printbuf *err) { struct progress_indicator_state progress; bch2_progress_init(&progress, 
c, BIT_ULL(BTREE_ID_extents)| BIT_ULL(BTREE_ID_reflink)); - return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?: - bch2_dev_metadata_drop(c, &progress, dev_idx, flags); + return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags, err) ?: + bch2_dev_metadata_drop(c, &progress, dev_idx, flags, err); } diff --git a/libbcachefs/migrate.h b/libbcachefs/migrate.h index 30018140..ff4567fb 100644 --- a/libbcachefs/migrate.h +++ b/libbcachefs/migrate.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_MIGRATE_H #define _BCACHEFS_MIGRATE_H -int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned); -int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned); +int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned, struct printbuf *); +int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned, struct printbuf *); #endif /* _BCACHEFS_MIGRATE_H */ diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 4f41f1f6..9a440d3f 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -46,12 +46,12 @@ struct evacuate_bucket_arg { static bool evacuate_bucket_pred(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, - struct bch_io_opts *, + struct bch_inode_opts *, struct data_update_opts *); static noinline void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { CLASS(printbuf, buf)(); @@ -72,7 +72,7 @@ static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) static noinline void trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts, move_pred_fn pred, void *_arg, bool p) { @@ -121,6 +121,7 @@ struct moving_io { static void move_free(struct moving_io *io) { struct moving_context *ctxt = io->write.ctxt; + struct bch_fs *c = io->write.op.c; if (io->b) atomic_dec(&io->b->count); @@ -132,8 +133,9 @@ static void 
move_free(struct moving_io *io) if (!io->write.data_opts.scrub) { bch2_data_update_exit(&io->write); } else { - bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio); + bch2_bio_free_pages_pool(c, &io->write.op.wbio.bio); kfree(io->write.bvecs); + bch2_bkey_buf_exit(&io->write.k, c); } kfree(io); } @@ -325,7 +327,7 @@ int bch2_move_extent(struct moving_context *ctxt, struct move_bucket *bucket_in_flight, struct btree_iter *iter, struct bkey_s_c k, - struct bch_io_opts io_opts, + struct bch_inode_opts io_opts, struct data_update_opts data_opts) { struct btree_trans *trans = ctxt->trans; @@ -427,7 +429,9 @@ int bch2_move_extent(struct moving_context *ctxt, data_opts.scrub ? data_opts.read_dev : -1); return 0; err: + bch2_bkey_buf_exit(&io->write.k, c); kfree(io); + if (bch2_err_matches(ret, EROFS) || bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; @@ -447,93 +451,6 @@ err: return ret; } -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, - struct per_snapshot_io_opts *io_opts, - struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ - struct btree_iter *extent_iter, - struct bkey_s_c extent_k) -{ - struct bch_fs *c = trans->c; - u32 restart_count = trans->restart_count; - struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; - int ret = 0; - - if (btree_iter_path(trans, extent_iter)->level) - return opts_ret; - - if (extent_k.k->type == KEY_TYPE_reflink_v) - goto out; - - if (io_opts->cur_inum != extent_pos.inode) { - io_opts->d.nr = 0; - - ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), - BTREE_ITER_all_snapshots, k, ({ - if (k.k->p.offset != extent_pos.inode) - break; - - if (!bkey_is_inode(k.k)) - continue; - - struct bch_inode_unpacked inode; - _ret3 = bch2_inode_unpack(k, &inode); - if (_ret3) - break; - - struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; - bch2_inode_opts_get(&e.io_opts, trans->c, &inode); - - darray_push(&io_opts->d, e); - })); - 
io_opts->cur_inum = extent_pos.inode; - } - - ret = ret ?: trans_was_restarted(trans, restart_count); - if (ret) - return ERR_PTR(ret); - - if (extent_k.k->p.snapshot) - darray_for_each(io_opts->d, i) - if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) { - opts_ret = &i->io_opts; - break; - } -out: - ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); - if (ret) - return ERR_PTR(ret); - return opts_ret; -} - -int bch2_move_get_io_opts_one(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *extent_iter, - struct bkey_s_c extent_k) -{ - struct bch_fs *c = trans->c; - - *io_opts = bch2_opts_to_inode_opts(c->opts); - - /* reflink btree? */ - if (extent_k.k->p.inode) { - CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes, - SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), - BTREE_ITER_cached); - struct bkey_s_c inode_k = bch2_btree_iter_peek_slot(&inode_iter); - int ret = bkey_err(inode_k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - - if (!ret && bkey_is_inode(inode_k.k)) { - struct bch_inode_unpacked inode; - bch2_inode_unpack(inode_k, &inode); - bch2_inode_opts_get(io_opts, c, &inode); - } - } - - return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); -} - int bch2_move_ratelimit(struct moving_context *ctxt) { struct bch_fs *c = ctxt->trans->c; @@ -578,37 +495,6 @@ int bch2_move_ratelimit(struct moving_context *ctxt) return 0; } -/* - * Move requires non extents iterators, and there's also no need for it to - * signal indirect_extent_missing_error: - */ -static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_reflink_p p) -{ - if (unlikely(REFLINK_P_ERROR(p.v))) - return bkey_s_c_null; - - struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v)); - - bch2_trans_iter_init(trans, iter, - BTREE_ID_reflink, reflink_pos, - BTREE_ITER_not_extents); - - struct bkey_s_c k 
= bch2_btree_iter_peek(iter); - if (!k.k || bkey_err(k)) { - bch2_trans_iter_exit(iter); - return k; - } - - if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) { - bch2_trans_iter_exit(iter); - return bkey_s_c_null; - } - - return k; -} - int bch2_move_data_btree(struct moving_context *ctxt, struct bpos start, struct bpos end, @@ -618,17 +504,11 @@ int bch2_move_data_btree(struct moving_context *ctxt, struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct per_snapshot_io_opts snapshot_io_opts; - struct bch_io_opts *io_opts; + struct bch_inode_opts *io_opts; struct bkey_buf sk; struct btree_iter iter, reflink_iter = {}; struct bkey_s_c k; struct data_update_opts data_opts; - /* - * If we're moving a single file, also process reflinked data it points - * to (this includes propagating changed io_opts from the inode to the - * extent): - */ - bool walk_indirect = start.inode == end.inode; int ret = 0, ret2; per_snapshot_io_opts_init(&snapshot_io_opts, c); @@ -693,8 +573,6 @@ root_err: bch2_ratelimit_reset(ctxt->rate); while (!bch2_move_ratelimit(ctxt)) { - struct btree_iter *extent_iter = &iter; - bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); @@ -713,41 +591,18 @@ root_err: if (ctxt->stats) ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - if (walk_indirect && - k.k->type == KEY_TYPE_reflink_p && - REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { - struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - - bch2_trans_iter_exit(&reflink_iter); - k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - - if (!k.k) - goto next_nondata; - - /* - * XXX: reflink pointers may point to multiple indirect - * extents, so don't advance past the entire reflink - * pointer - need to fixup iter->k - */ - extent_iter = &reflink_iter; - } - if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - io_opts = 
bch2_move_get_io_opts(trans, &snapshot_io_opts, - iter.pos, extent_iter, k); + io_opts = bch2_extent_get_apply_io_opts(trans, &snapshot_io_opts, + iter.pos, &iter, k, + SET_NEEDS_REBALANCE_other); ret = PTR_ERR_OR_ZERO(io_opts); if (ret) continue; memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) + if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) goto next; /* @@ -758,7 +613,7 @@ root_err: k = bkey_i_to_s_c(sk.k); if (!level) - ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); else if (!data_opts.scrub) ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, k.k->p, data_opts.target, 0); @@ -820,7 +675,7 @@ static int bch2_move_data(struct bch_fs *c, unsigned min_depth_this_btree = min_depth; /* Stripe keys have pointers, but are handled separately */ - if (!btree_type_has_ptrs(id) || + if (!btree_type_has_data_ptrs(id) || id == BTREE_ID_stripes) min_depth_this_btree = max(min_depth_this_btree, 1); @@ -855,7 +710,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; bool is_kthread = current->flags & PF_KTHREAD; - struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_iter iter = {}; struct bkey_buf sk; struct bkey_s_c k; @@ -863,7 +717,11 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, u64 check_mismatch_done = bucket_start; int ret = 0; - CLASS(bch2_dev_tryget, ca)(c, dev); + struct bch_inode_opts io_opts; + bch2_inode_opts_get(c, &io_opts); + + /* Userspace might have supplied @dev: */ + CLASS(bch2_dev_tryget_noerror, ca)(c, dev); if (!ca) return 0; @@ -937,7 +795,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, goto next; if (!bp.v->level) { - ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); + ret = bch2_extent_get_apply_io_opts_one(trans, &io_opts, 
&iter, k, + SET_NEEDS_REBALANCE_other); if (ret) { bch2_trans_iter_exit(&iter); continue; @@ -1034,7 +893,7 @@ int bch2_move_data_phys(struct bch_fs *c, static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { struct evacuate_bucket_arg *arg = _arg; @@ -1075,7 +934,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, } typedef bool (*move_btree_pred)(struct bch_fs *, void *, - struct btree *, struct bch_io_opts *, + struct btree *, struct bch_inode_opts *, struct data_update_opts *); static int bch2_move_btree(struct bch_fs *c, @@ -1085,7 +944,6 @@ static int bch2_move_btree(struct bch_fs *c, struct bch_move_stats *stats) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct moving_context ctxt; struct btree_trans *trans; struct btree_iter iter; @@ -1094,6 +952,9 @@ static int bch2_move_btree(struct bch_fs *c, struct data_update_opts data_opts; int ret = 0; + struct bch_inode_opts io_opts; + bch2_inode_opts_get(c, &io_opts); + bch2_moving_ctxt_init(&ctxt, c, NULL, stats, writepoint_ptr(&c->btree_write_point), true); @@ -1154,7 +1015,7 @@ next: static bool rereplicate_pred(struct bch_fs *c, void *arg, enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { unsigned nr_good = bch2_bkey_durability(c, k); @@ -1185,7 +1046,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, static bool migrate_pred(struct bch_fs *c, void *arg, enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -1222,7 +1083,7 @@ static bool bformat_needs_redo(struct bkey_format *f) static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, 
struct btree *b, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { if (b->version_ondisk != c->sb.version || @@ -1259,7 +1120,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { unsigned durability = bch2_bkey_durability(c, k); @@ -1297,7 +1158,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, static bool scrub_pred(struct bch_fs *c, void *_arg, enum btree_id btree, struct bkey_s_c k, - struct bch_io_opts *io_opts, + struct bch_inode_opts *io_opts, struct data_update_opts *data_opts) { struct bch_ioctl_data *arg = _arg; @@ -1400,7 +1261,7 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_str(out, " pos="); bch2_bbpos_to_text(out, stats->pos); prt_newline(out); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); @@ -1415,8 +1276,6 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_printf(out, "bytes raced:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); - - printbuf_indent_sub(out, 2); } static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) @@ -1425,7 +1284,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str printbuf_tabstop_push(out, 32); bch2_move_stats_to_text(out, ctxt->stats); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); prt_printf(out, "reads: ios %u/%u sectors %u/%u\n", atomic_read(&ctxt->read_ios), @@ -1439,15 +1298,13 @@ static void bch2_moving_ctxt_to_text(struct 
printbuf *out, struct bch_fs *c, str atomic_read(&ctxt->write_sectors), c->opts.move_bytes_in_flight >> 9); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); scoped_guard(mutex, &ctxt->lock) { struct moving_io *io; list_for_each_entry(io, &ctxt->ios, io_list) bch2_data_update_inflight_to_text(out, &io->write); } - - printbuf_indent_sub(out, 4); } void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 481026ff..754b0ad4 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -73,7 +73,7 @@ do { \ } while (1) typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, - struct bch_io_opts *, struct data_update_opts *); + struct bch_inode_opts *, struct data_update_opts *); extern const char * const bch2_data_ops_strs[]; @@ -87,45 +87,15 @@ void bch2_moving_ctxt_flush_all(struct moving_context *); void bch2_move_ctxt_wait_for_io(struct moving_context *); int bch2_move_ratelimit(struct moving_context *); -/* Inodes in different snapshots may have different IO options: */ -struct snapshot_io_opts_entry { - u32 snapshot; - struct bch_io_opts io_opts; -}; - -struct per_snapshot_io_opts { - u64 cur_inum; - struct bch_io_opts fs_io_opts; - DARRAY(struct snapshot_io_opts_entry) d; -}; - -static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c) -{ - memset(io_opts, 0, sizeof(*io_opts)); - io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts); -} - -static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts) -{ - darray_exit(&io_opts->d); -} - -int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, - struct btree_iter *, struct bkey_s_c); - int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); int bch2_move_extent(struct moving_context *, struct move_bucket *, struct btree_iter *, struct bkey_s_c, - struct bch_io_opts, + struct bch_inode_opts, struct 
data_update_opts); -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, - struct per_snapshot_io_opts *, struct bpos, - struct btree_iter *, struct bkey_s_c); - int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, move_pred_fn, void *, enum btree_id, unsigned); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index c3ef35dc..122bc98e 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -518,7 +518,7 @@ void bch2_opts_to_text(struct printbuf *out, } } -int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v) +int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, u64 v) { int ret = 0; @@ -531,6 +531,8 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id case Opt_compression: case Opt_background_compression: ret = bch2_check_set_has_compressed_data(c, v); + if (ret) + return ret; break; case Opt_erasure_code: if (v) @@ -546,7 +548,7 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id int bch2_opts_hooks_pre_set(struct bch_fs *c) { for (unsigned i = 0; i < bch2_opts_nr; i++) { - int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); + int ret = bch2_opt_hook_pre_set(c, NULL, 0, i, bch2_opt_get_by_id(&c->opts, i)); if (ret) return ret; } @@ -555,26 +557,15 @@ int bch2_opts_hooks_pre_set(struct bch_fs *c) } void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, - struct bch_opts *new_opts, enum bch_opt_id id) + enum bch_opt_id id, u64 v) { switch (id) { case Opt_foreground_target: - if (new_opts->foreground_target && - !new_opts->background_target) - bch2_set_rebalance_needs_scan(c, inum); - break; case Opt_compression: - if (new_opts->compression && - !new_opts->background_compression) - bch2_set_rebalance_needs_scan(c, inum); - break; case Opt_background_target: - if (new_opts->background_target) - bch2_set_rebalance_needs_scan(c, inum); - break; 
case Opt_background_compression: - if (new_opts->background_compression) - bch2_set_rebalance_needs_scan(c, inum); + bch2_set_rebalance_needs_scan(c, inum); + bch2_rebalance_wakeup(c); break; case Opt_rebalance_enabled: bch2_rebalance_wakeup(c); @@ -600,12 +591,14 @@ void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, * upgrades at runtime as well, but right now there's nothing * that does that: */ - if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible) + if (v == BCH_VERSION_UPGRADE_incompatible) bch2_sb_upgrade_incompat(c); break; default: break; } + + atomic_inc(&c->opt_change_cookie); } int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, @@ -802,16 +795,17 @@ bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, /* io opts: */ -struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) +void bch2_inode_opts_get(struct bch_fs *c, struct bch_inode_opts *ret) { - struct bch_io_opts opts = { -#define x(_name, _bits) ._name = src._name, + memset(ret, 0, sizeof(*ret)); + +#define x(_name, _bits) ret->_name = c->opts._name, BCH_INODE_OPTS() #undef x - }; - bch2_io_opts_fixups(&opts); - return opts; + ret->change_cookie = atomic_read(&c->opt_change_cookie); + + bch2_io_opts_fixups(ret); } bool bch2_opt_is_inode_opt(enum bch_opt_id id) diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index f8828f46..22cf109f 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -658,10 +658,9 @@ void bch2_opts_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, unsigned, unsigned, unsigned); -int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64); +int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64); int bch2_opts_hooks_pre_set(struct bch_fs *); -void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, - struct bch_opts *, enum bch_opt_id); +void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, enum bch_opt_id, u64); 
int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, struct printbuf *, const char *, const char *); @@ -670,16 +669,19 @@ int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, /* inode opts: */ -struct bch_io_opts { +struct bch_inode_opts { #define x(_name, _bits) u##_bits _name; BCH_INODE_OPTS() #undef x + #define x(_name, _bits) u64 _name##_from_inode:1; BCH_INODE_OPTS() #undef x + + u32 change_cookie; }; -static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) +static inline void bch2_io_opts_fixups(struct bch_inode_opts *opts) { if (!opts->background_target) opts->background_target = opts->foreground_target; @@ -692,7 +694,7 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) } } -struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); +void bch2_inode_opts_get(struct bch_fs *, struct bch_inode_opts *); bool bch2_opt_is_inode_opt(enum bch_opt_id); #endif /* _BCACHEFS_OPTS_H */ diff --git a/libbcachefs/printbuf.h b/libbcachefs/printbuf.h index 907e5c97..5fa5265d 100644 --- a/libbcachefs/printbuf.h +++ b/libbcachefs/printbuf.h @@ -299,4 +299,18 @@ DEFINE_GUARD(printbuf_atomic, struct printbuf *, printbuf_atomic_inc(_T), printbuf_atomic_dec(_T)); +static inline void printbuf_indent_add_2(struct printbuf *out) +{ + bch2_printbuf_indent_add(out, 2); +} + +static inline void printbuf_indent_sub_2(struct printbuf *out) +{ + bch2_printbuf_indent_sub(out, 2); +} + +DEFINE_GUARD(printbuf_indent, struct printbuf *, + printbuf_indent_add_2(_T), + printbuf_indent_sub_2(_T)); + #endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/libbcachefs/progress.c b/libbcachefs/progress.c index 792fc6fe..541ee951 100644 --- a/libbcachefs/progress.c +++ b/libbcachefs/progress.c @@ -12,7 +12,7 @@ void bch2_progress_init(struct progress_indicator_state *s, s->next_print = jiffies + HZ * 10; - for (unsigned i = 0; i < BTREE_ID_NR; i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { if (!(btree_id_mask & BIT_ULL(i))) 
continue; diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 25bf72dc..fa73de78 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -43,8 +43,57 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); } +void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, + const struct bch_extent_rebalance *r) +{ + prt_printf(out, "replicas=%u", r->data_replicas); + if (r->data_replicas_from_inode) + prt_str(out, " (inode)"); + + prt_str(out, " checksum="); + bch2_prt_csum_opt(out, r->data_checksum); + if (r->data_checksum_from_inode) + prt_str(out, " (inode)"); + + if (r->background_compression || r->background_compression_from_inode) { + prt_str(out, " background_compression="); + bch2_compression_opt_to_text(out, r->background_compression); + + if (r->background_compression_from_inode) + prt_str(out, " (inode)"); + } + + if (r->background_target || r->background_target_from_inode) { + prt_str(out, " background_target="); + if (c) + bch2_target_to_text(out, c, r->background_target); + else + prt_printf(out, "%u", r->background_target); + + if (r->background_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->promote_target || r->promote_target_from_inode) { + prt_str(out, " promote_target="); + if (c) + bch2_target_to_text(out, c, r->promote_target); + else + prt_printf(out, "%u", r->promote_target); + + if (r->promote_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->erasure_code || r->erasure_code_from_inode) { + prt_printf(out, " ec=%u", r->erasure_code); + if (r->erasure_code_from_inode) + prt_str(out, " (inode)"); + } +} + static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, - struct bch_io_opts *opts, + struct bch_inode_opts *opts, struct bkey_s_c k, struct bkey_ptrs_c ptrs) { @@ -71,7 +120,7 @@ static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, } static inline unsigned 
bch2_bkey_ptrs_need_move(struct bch_fs *c, - struct bch_io_opts *opts, + struct bch_inode_opts *opts, struct bkey_ptrs_c ptrs) { if (!opts->background_target || @@ -92,7 +141,7 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, } static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_io_opts *opts, + struct bch_inode_opts *opts, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -145,7 +194,7 @@ incompressible: return sectors; } -static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, +static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_inode_opts *opts, struct bkey_s_c k) { if (!bkey_extent_is_direct_data(k.k)) @@ -161,8 +210,10 @@ static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opt } } -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_i *_k) +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_inode_opts *opts, + struct bkey_i *_k, + enum set_needs_rebalance_ctx ctx, + u32 change_cookie) { if (!bkey_extent_is_direct_data(&_k->k)) return 0; @@ -186,10 +237,11 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, return 0; } -int bch2_get_update_rebalance_opts(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *iter, - struct bkey_s_c k) +static int bch2_get_update_rebalance_opts(struct btree_trans *trans, + struct bch_inode_opts *io_opts, + struct btree_iter *iter, + struct bkey_s_c k, + enum set_needs_rebalance_ctx ctx) { BUG_ON(iter->flags & BTREE_ITER_is_extents); BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); @@ -218,10 +270,121 @@ int bch2_get_update_rebalance_opts(struct btree_trans *trans, /* On successfull transaction commit, @k was invalidated: */ - return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: + return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n, ctx, 0) ?: 
bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, NULL, NULL, 0) ?: - bch_err_throw(trans->c, transaction_restart_nested); + bch_err_throw(trans->c, transaction_restart_commit); +} + +static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *io_opts, + struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ + struct btree_iter *extent_iter, + struct bkey_s_c extent_k) +{ + struct bch_fs *c = trans->c; + u32 restart_count = trans->restart_count; + int ret = 0; + + if (btree_iter_path(trans, extent_iter)->level) + return &io_opts->fs_io_opts; + + if (extent_k.k->type == KEY_TYPE_reflink_v) + return &io_opts->fs_io_opts; + + if (io_opts->cur_inum != extent_pos.inode) { + io_opts->d.nr = 0; + + ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), + BTREE_ITER_all_snapshots, k, ({ + if (k.k->p.offset != extent_pos.inode) + break; + + if (!bkey_is_inode(k.k)) + continue; + + struct bch_inode_unpacked inode; + _ret3 = bch2_inode_unpack(k, &inode); + if (_ret3) + break; + + struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; + bch2_inode_opts_get_inode(c, &inode, &e.io_opts); + + darray_push(&io_opts->d, e); + })); + io_opts->cur_inum = extent_pos.inode; + } + + ret = ret ?: trans_was_restarted(trans, restart_count); + if (ret) + return ERR_PTR(ret); + + if (extent_k.k->p.snapshot) + darray_for_each(io_opts->d, i) + if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) + return &i->io_opts; + + return &io_opts->fs_io_opts; +} + +struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, + struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ + struct btree_iter *extent_iter, + struct bkey_s_c extent_k, + enum set_needs_rebalance_ctx ctx) +{ + struct bch_inode_opts *opts = + 
bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, extent_iter, extent_k); + if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level) + return opts; + + int ret = bch2_get_update_rebalance_opts(trans, opts, extent_iter, extent_k, ctx); + return ret ? ERR_PTR(ret) : opts; +} + +int bch2_extent_get_io_opts_one(struct btree_trans *trans, + struct bch_inode_opts *io_opts, + struct btree_iter *extent_iter, + struct bkey_s_c extent_k, + enum set_needs_rebalance_ctx ctx) +{ + struct bch_fs *c = trans->c; + + bch2_inode_opts_get(c, io_opts); + + /* reflink btree? */ + if (extent_k.k->p.inode) { + CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes, + SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), + BTREE_ITER_cached); + struct bkey_s_c inode_k = bch2_btree_iter_peek_slot(&inode_iter); + int ret = bkey_err(inode_k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + + if (!ret && bkey_is_inode(inode_k.k)) { + struct bch_inode_unpacked inode; + bch2_inode_unpack(inode_k, &inode); + bch2_inode_opts_get_inode(c, &inode, io_opts); + } + } + + return 0; +} + +int bch2_extent_get_apply_io_opts_one(struct btree_trans *trans, + struct bch_inode_opts *io_opts, + struct btree_iter *extent_iter, + struct bkey_s_c extent_k, + enum set_needs_rebalance_ctx ctx) +{ + int ret = bch2_extent_get_io_opts_one(trans, io_opts, extent_iter, extent_k, ctx); + if (ret || btree_iter_path(trans, extent_iter)->level) + return ret; + + return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k, ctx); } #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) @@ -354,9 +517,10 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, } static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, + struct per_snapshot_io_opts *snapshot_io_opts, struct bpos work_pos, struct btree_iter *extent_iter, - struct bch_io_opts *io_opts, + struct bch_inode_opts **opts_ret, struct data_update_opts *data_opts) { struct bch_fs *c = 
trans->c; @@ -370,13 +534,19 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, if (bkey_err(k)) return k; - int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k); + struct bch_inode_opts *opts = + bch2_extent_get_apply_io_opts(trans, snapshot_io_opts, + extent_iter->pos, extent_iter, k, + SET_NEEDS_REBALANCE_other); + int ret = PTR_ERR_OR_ZERO(opts); if (ret) return bkey_s_c_err(ret); + *opts_ret = opts; + memset(data_opts, 0, sizeof(*data_opts)); - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, opts, k); + data_opts->target = opts->background_target; data_opts->write_flags |= BCH_WRITE_only_specified_devs; if (!data_opts->rewrite_ptrs) { @@ -401,19 +571,19 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs); + unsigned p = bch2_bkey_ptrs_need_compress(c, opts, k, ptrs); if (p) { prt_str(&buf, "compression="); - bch2_compression_opt_to_text(&buf, io_opts->background_compression); + bch2_compression_opt_to_text(&buf, opts->background_compression); prt_str(&buf, " "); bch2_prt_u64_base2(&buf, p); prt_newline(&buf); } - p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs); + p = bch2_bkey_ptrs_need_move(c, opts, ptrs); if (p) { prt_str(&buf, "move="); - bch2_target_to_text(&buf, c, io_opts->background_target); + bch2_target_to_text(&buf, c, opts->background_target); prt_str(&buf, " "); bch2_prt_u64_base2(&buf, p); prt_newline(&buf); @@ -428,6 +598,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, noinline_for_stack static int do_rebalance_extent(struct moving_context *ctxt, + struct per_snapshot_io_opts *snapshot_io_opts, struct bpos work_pos, struct btree_iter *extent_iter) { @@ -435,7 +606,7 @@ static int do_rebalance_extent(struct 
moving_context *ctxt, struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &trans->c->rebalance; struct data_update_opts data_opts; - struct bch_io_opts io_opts; + struct bch_inode_opts *io_opts; struct bkey_s_c k; struct bkey_buf sk; int ret; @@ -446,8 +617,8 @@ static int do_rebalance_extent(struct moving_context *ctxt, bch2_bkey_buf_init(&sk); ret = lockrestart_do(trans, - bkey_err(k = next_rebalance_extent(trans, work_pos, - extent_iter, &io_opts, &data_opts))); + bkey_err(k = next_rebalance_extent(trans, snapshot_io_opts, + work_pos, extent_iter, &io_opts, &data_opts))); if (ret || !k.k) goto out; @@ -460,7 +631,7 @@ static int do_rebalance_extent(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts); + ret = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); if (ret) { if (bch2_err_matches(ret, ENOMEM)) { /* memory allocation failure, wait for some IO to finish */ @@ -479,7 +650,31 @@ out: return ret; } +static int do_rebalance_scan_indirect(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + struct bch_inode_opts *opts) +{ + u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); + u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); + u32 restart_count = trans->restart_count; + + int ret = for_each_btree_key(trans, iter, BTREE_ID_reflink, + POS(0, idx), BTREE_ITER_not_extents, k, ({ + if (bpos_ge(bkey_start_pos(k.k), POS(0, end))) + break; + bch2_get_update_rebalance_opts(trans, opts, &iter, k, + SET_NEEDS_REBALANCE_opt_change_indirect); + })); + if (ret) + return ret; + + /* suppress trans_was_restarted() check */ + trans->restart_count = restart_count; + return 0; +} + static int do_rebalance_scan(struct moving_context *ctxt, + struct per_snapshot_io_opts *snapshot_io_opts, u64 inum, u64 cookie, u64 *sectors_scanned) { struct btree_trans *trans = ctxt->trans; @@ -499,32 +694,33 @@ static int 
do_rebalance_scan(struct moving_context *ctxt, r->state = BCH_REBALANCE_scanning; - struct per_snapshot_io_opts snapshot_io_opts; - per_snapshot_io_opts_init(&snapshot_io_opts, c); - int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, r->scan_start.pos, r->scan_end.pos, BTREE_ITER_all_snapshots| BTREE_ITER_prefetch, k, ({ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); - struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, - &snapshot_io_opts, iter.pos, &iter, k); - PTR_ERR_OR_ZERO(io_opts); + struct bch_inode_opts *opts = bch2_extent_get_apply_io_opts(trans, + snapshot_io_opts, iter.pos, &iter, k, + SET_NEEDS_REBALANCE_opt_change); + PTR_ERR_OR_ZERO(opts) ?: + (inum && + k.k->type == KEY_TYPE_reflink_p && + REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v) + ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts) + : 0); })) ?: commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_clear_rebalance_needs_scan(trans, inum, cookie)); - per_snapshot_io_opts_exit(&snapshot_io_opts); *sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen); - bch2_move_stats_exit(&r->scan_stats, c); - /* * Ensure that the rebalance_work entries we created are seen by the * next iteration of do_rebalance(), so we don't end up stuck in * rebalance_wait(): */ *sectors_scanned += 1; + bch2_move_stats_exit(&r->scan_stats, c); bch2_btree_write_buffer_flush_sync(trans); @@ -576,6 +772,9 @@ static int do_rebalance(struct moving_context *ctxt) bch2_move_stats_init(&r->work_stats, "rebalance_work"); + struct per_snapshot_io_opts snapshot_io_opts; + per_snapshot_io_opts_init(&snapshot_io_opts, c); + while (!bch2_move_ratelimit(ctxt)) { if (!bch2_rebalance_enabled(c)) { bch2_moving_ctxt_flush_all(ctxt); @@ -590,15 +789,18 @@ static int do_rebalance(struct moving_context *ctxt) break; ret = k->k.type == KEY_TYPE_cookie - ? do_rebalance_scan(ctxt, k->k.p.inode, + ? 
do_rebalance_scan(ctxt, &snapshot_io_opts, + k->k.p.inode, le64_to_cpu(bkey_i_to_cookie(k)->v.cookie), §ors_scanned) - : do_rebalance_extent(ctxt, k->k.p, &extent_iter); + : do_rebalance_extent(ctxt, &snapshot_io_opts, + k->k.p, &extent_iter); if (ret) break; } bch2_trans_iter_exit(&extent_iter); + per_snapshot_io_opts_exit(&snapshot_io_opts); bch2_move_stats_exit(&r->work_stats, c); if (!ret && @@ -661,7 +863,7 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) prt_str(out, bch2_rebalance_state_strs[r->state]); prt_newline(out); - printbuf_indent_add(out, 2); + guard(printbuf_indent)(out); switch (r->state) { case BCH_REBALANCE_waiting: { @@ -700,8 +902,6 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); put_task_struct(t); } - - printbuf_indent_sub(out, 2); } void bch2_rebalance_stop(struct bch_fs *c) diff --git a/libbcachefs/rebalance.h b/libbcachefs/rebalance.h index 7a565ea7..bff91aa0 100644 --- a/libbcachefs/rebalance.h +++ b/libbcachefs/rebalance.h @@ -8,7 +8,7 @@ #include "rebalance_types.h" static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, - struct bch_io_opts *opts) + struct bch_inode_opts *opts) { struct bch_extent_rebalance r = { .type = BIT(BCH_EXTENT_ENTRY_rebalance), @@ -26,12 +26,55 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f return r; }; +void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *, + const struct bch_extent_rebalance *); + u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); -int bch2_get_update_rebalance_opts(struct btree_trans *, - struct bch_io_opts *, - struct btree_iter *, - struct bkey_s_c); + +enum set_needs_rebalance_ctx { + SET_NEEDS_REBALANCE_opt_change, + SET_NEEDS_REBALANCE_opt_change_indirect, + 
SET_NEEDS_REBALANCE_foreground, + SET_NEEDS_REBALANCE_other, +}; + +int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *, + struct bkey_i *, enum set_needs_rebalance_ctx, u32); + +/* Inodes in different snapshots may have different IO options: */ +struct snapshot_io_opts_entry { + u32 snapshot; + struct bch_inode_opts io_opts; +}; + +struct per_snapshot_io_opts { + u64 cur_inum; + struct bch_inode_opts fs_io_opts; + DARRAY(struct snapshot_io_opts_entry) d; +}; + +static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c) +{ + memset(io_opts, 0, sizeof(*io_opts)); + bch2_inode_opts_get(c, &io_opts->fs_io_opts); +} + +static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts) +{ + darray_exit(&io_opts->d); +} + +struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *, + struct per_snapshot_io_opts *, struct bpos, + struct btree_iter *, struct bkey_s_c, + enum set_needs_rebalance_ctx); + +int bch2_extent_get_io_opts_one(struct btree_trans *, struct bch_inode_opts *, + struct btree_iter *, struct bkey_s_c, + enum set_needs_rebalance_ctx); +int bch2_extent_get_apply_io_opts_one(struct btree_trans *, struct bch_inode_opts *, + struct btree_iter *, struct bkey_s_c, + enum set_needs_rebalance_ctx); int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 6319144a..531c2ef1 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -15,6 +15,7 @@ #include "error.h" #include "journal_io.h" #include "journal_reclaim.h" +#include "journal_sb.h" #include "journal_seq_blacklist.h" #include "logged_ops.h" #include "move.h" @@ -67,9 +68,12 @@ int bch2_btree_lost_data(struct bch_fs *c, #endif write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); + write_sb |= 
!__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent); write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_backpointer_to_missing_ptr, ext->errors_silent); write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); + write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent); switch (btree) { case BTREE_ID_alloc: @@ -644,6 +648,10 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); + + ret = bch2_sb_journal_sort(c); + if (ret) + goto err; } else { bch_info(c, "recovering from unclean shutdown"); } @@ -829,33 +837,39 @@ use_clean: bch2_async_btree_node_rewrites_flush(c); /* fsync if we fixed errors */ - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bool errors_fixed = test_bit(BCH_FS_errors_fixed, &c->flags) || + test_bit(BCH_FS_errors_fixed_silent, &c->flags); + + if (errors_fixed) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_meta(&c->journal); } /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - test_bit(BCH_FS_errors_fixed, &c->flags) && + errors_fixed && !test_bit(BCH_FS_errors_not_fixed, &c->flags) && !test_bit(BCH_FS_error, &c->flags)) { bch2_flush_fsck_errs(c); bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); + errors_fixed = test_bit(BCH_FS_errors_fixed, &c->flags); clear_bit(BCH_FS_errors_fixed, &c->flags); + clear_bit(BCH_FS_errors_fixed_silent, &c->flags); ret = bch2_run_recovery_passes(c, BCH_RECOVERY_PASS_check_alloc_info); if (ret) goto err; - if (test_bit(BCH_FS_errors_fixed, &c->flags) || + if (errors_fixed || test_bit(BCH_FS_errors_not_fixed, &c->flags)) { 
bch_err(c, "Second fsck run was not clean"); set_bit(BCH_FS_errors_not_fixed, &c->flags); } - set_bit(BCH_FS_errors_fixed, &c->flags); + if (errors_fixed) + set_bit(BCH_FS_errors_fixed, &c->flags); } if (enabled_qtypes(c)) { diff --git a/libbcachefs/recovery_passes_format.h b/libbcachefs/recovery_passes_format.h index 2696eee0..d5654de6 100644 --- a/libbcachefs/recovery_passes_format.h +++ b/libbcachefs/recovery_passes_format.h @@ -29,6 +29,7 @@ x(stripes_read, 1, 0) \ x(initialize_subvolumes, 2, 0) \ x(snapshots_read, 3, PASS_ALWAYS) \ + x(delete_dead_interior_snapshots, 44, 0) \ x(check_allocations, 5, PASS_FSCK_ALLOC) \ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 238a362d..d54468fd 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -589,7 +589,6 @@ s64 bch2_remap_range(struct bch_fs *c, struct bpos dst_start = POS(dst_inum.inum, dst_offset); struct bpos src_start = POS(src_inum.inum, src_offset); struct bpos dst_end = dst_start, src_end = src_start; - struct bch_io_opts opts; struct bpos src_want; u64 dst_done = 0; u32 dst_snapshot, src_snapshot; @@ -609,10 +608,6 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_init(&new_src); CLASS(btree_trans, trans)(c); - ret = bch2_inum_opts_get(trans, src_inum, &opts); - if (ret) - goto err; - bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, BTREE_ITER_intent); bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, @@ -709,11 +704,10 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?: - bch2_extent_update(trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, - new_i_size, i_sectors_delta, - true); + ret = bch2_extent_update(trans, dst_inum, &dst_iter, + new_dst.k, &disk_res, + new_i_size, 
i_sectors_delta, + true, 0); bch2_disk_reservation_put(c, &disk_res); } bch2_trans_iter_exit(&dst_iter); @@ -744,7 +738,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_iter_exit(&inode_iter); } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); -err: + bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h index 17cd6176..3907ba7e 100644 --- a/libbcachefs/sb-counters_format.h +++ b/libbcachefs/sb-counters_format.h @@ -23,6 +23,8 @@ enum counters_flags { x(io_read_reuse_race, 34, TYPE_COUNTER) \ x(io_read_retry, 32, TYPE_COUNTER) \ x(io_read_fail_and_poison, 95, TYPE_COUNTER) \ + x(io_read_narrow_crcs, 97, TYPE_COUNTER) \ + x(io_read_narrow_crcs_fail, 98, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ x(io_move_read, 35, TYPE_SECTORS) \ diff --git a/libbcachefs/sb-errors.c b/libbcachefs/sb-errors.c index 41a259ea..b356e801 100644 --- a/libbcachefs/sb-errors.c +++ b/libbcachefs/sb-errors.c @@ -54,23 +54,41 @@ static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, return 0; } +static int error_entry_cmp(const void *_l, const void *_r) +{ + const struct bch_sb_field_error_entry *l = _l; + const struct bch_sb_field_error_entry *r = _r; + + return -cmp_int(l->last_error_time, r->last_error_time); +} + static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_errors *e = field_to_type(f, errors); - unsigned i, nr = bch2_sb_field_errors_nr_entries(e); + unsigned nr = bch2_sb_field_errors_nr_entries(e); + + struct bch_sb_field_error_entry *sorted = kvmalloc_array(nr, sizeof(*sorted), GFP_KERNEL); + + if (sorted) + sort(sorted, nr, sizeof(*sorted), error_entry_cmp, NULL); + else + sorted = e->entries; if (out->nr_tabstops <= 1) printbuf_tabstop_push(out, 16); - for (i = 0; i < nr; i++) { - bch2_sb_error_id_to_text(out, 
BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); + for (struct bch_sb_field_error_entry *i = sorted; i < sorted + nr; i++) { + bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(i)); prt_tab(out); - prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i])); + prt_u64(out, BCH_SB_ERROR_ENTRY_NR(i)); prt_tab(out); - bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time)); + bch2_prt_datetime(out, le64_to_cpu(i->last_error_time)); prt_newline(out); } + + if (sorted != e->entries) + kvfree(sorted); } const struct bch_sb_field_ops bch_sb_field_ops_errors = { diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h index aa0ea1ec..728d8780 100644 --- a/libbcachefs/sb-errors_format.h +++ b/libbcachefs/sb-errors_format.h @@ -160,7 +160,7 @@ enum bch_fsck_flags { x(extent_ptrs_unwritten, 140, 0) \ x(extent_ptrs_written_and_unwritten, 141, 0) \ x(ptr_to_invalid_device, 142, 0) \ - x(ptr_to_removed_device, 322, 0) \ + x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \ x(ptr_to_duplicate_device, 143, 0) \ x(ptr_after_last_bucket, 144, 0) \ x(ptr_before_first_bucket, 145, 0) \ @@ -329,6 +329,7 @@ enum bch_fsck_flags { x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \ x(accounting_key_underflow, 325, FSCK_AUTOFIX) \ + x(accounting_key_version_out_of_order, 326, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ @@ -337,7 +338,7 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 326, 0) + x(MAX, 327, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c index d26a0ca4..963f8c26 100644 --- a/libbcachefs/sb-members.c +++ b/libbcachefs/sb-members.c @@ -36,12 +36,10 @@ int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) { - if (dev != BCH_SB_MEMBER_INVALID) { + if (dev != BCH_SB_MEMBER_INVALID) bch2_fs_inconsistent(c, "pointer to %s device %u", test_bit(dev, c->devs_removed.d) ? "removed" : "nonexistent", dev); - dump_stack(); - } } void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket) @@ -287,10 +285,9 @@ static void member_to_text(struct printbuf *out, return; prt_printf(out, "Device:\t%u\n", idx); + guard(printbuf_indent)(out); - printbuf_indent_add(out, 2); bch2_member_to_text(out, &m, gi, sb, idx); - printbuf_indent_sub(out, 2); } static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f, @@ -437,21 +434,19 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) prt_str(out, "IO errors since filesystem creation"); prt_newline(out); - printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i])); - printbuf_indent_sub(out, 2); + scoped_guard(printbuf_indent, out) + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], atomic64_read(&ca->errors[i])); prt_str(out, "IO errors since "); bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC); prt_str(out, " ago"); prt_newline(out); - printbuf_indent_add(out, 2); - for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) - prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], - atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); - printbuf_indent_sub(out, 2); + scoped_guard(printbuf_indent, out) + for (unsigned i = 0; i < 
BCH_MEMBER_ERROR_NR; i++) + prt_printf(out, "%s:\t%llu\n", bch2_member_error_strs[i], + atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); } void bch2_dev_errors_reset(struct bch_dev *ca) diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index eab0c1e3..00546b59 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -309,7 +309,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - t->state = !BCH_SNAPSHOT_DELETED(s.v) + t->state = !BCH_SNAPSHOT_DELETED(s.v) && !BCH_SNAPSHOT_NO_KEYS(s.v) ? SNAPSHOT_ID_live : SNAPSHOT_ID_deleted; t->parent = le32_to_cpu(s.v->parent); @@ -1101,6 +1101,20 @@ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) return 0; } +static int bch2_snapshot_node_set_no_keys(struct btree_trans *trans, u32 id) +{ + struct bkey_i_snapshot *s = + bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, id), 0, snapshot); + int ret = PTR_ERR_OR_ZERO(s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing snapshot %u", id); + if (unlikely(ret)) + return ret; + + SET_BCH_SNAPSHOT_NO_KEYS(&s->v, true); + s->v.subvol = 0; + return 0; +} + static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) { if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) @@ -1783,22 +1797,9 @@ int __bch2_delete_dead_snapshots(struct bch_fs *c) if (ret) goto err; } - - /* - * Fixing children of deleted snapshots can't be done completely - * atomically, if we crash between here and when we delete the interior - * nodes some depth fields will be off: - */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, - BTREE_ITER_intent, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior)); - if (ret) - goto err; - darray_for_each(d->delete_interior, i) { ret = commit_do(trans, NULL, 
NULL, 0, - bch2_snapshot_node_delete(trans, i->id)); + bch2_snapshot_node_set_no_keys(trans, i->id)); if (!bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "deleting snapshot %u", i->id); if (ret) @@ -1887,6 +1888,66 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return ret; } +static int bch2_get_dead_interior_snapshots(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + + if (k.k->type == KEY_TYPE_snapshot && + BCH_SNAPSHOT_NO_KEYS(bkey_s_c_to_snapshot(k).v)) { + struct snapshot_interior_delete n = { + .id = k.k->p.offset, + .live_child = live_child(c, k.k->p.offset), + }; + + if (!n.live_child) { + bch_err(c, "error finding live child of snapshot %u", n.id); + return -EINVAL; + } + + return darray_push(&c->snapshot_delete.delete_interior, n); + } + + return 0; +} + +int bch2_delete_dead_interior_snapshots(struct bch_fs *c) +{ + CLASS(btree_trans, trans)(c); + int ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, + bch2_get_dead_interior_snapshots(trans, k)); + if (ret) + goto err; + + struct snapshot_delete *d = &c->snapshot_delete; + if (d->delete_interior.nr) { + /* + * Fixing children of deleted snapshots can't be done completely + * atomically, if we crash between here and when we delete the interior + * nodes some depth fields will be off: + */ + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, + BTREE_ITER_intent, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior)); + if (ret) + goto err; + + darray_for_each(d->delete_interior, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, i->id)); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "deleting snapshot %u", i->id); + if (ret) + goto err; + } + + darray_exit(&d->delete_interior); + } +err: + bch_err_fn(c, ret); + return ret; +} + +static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot
snap) { /* If there's one child, it's redundant and keys will be moved to the child */ @@ -1895,13 +1956,18 @@ static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap) static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) { + struct bch_fs *c = trans->c; + if (k.k->type != KEY_TYPE_snapshot) return 0; - struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_WILL_DELETE(snap.v) || - interior_snapshot_needs_delete(snap)) - set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); + struct bkey_s_c_snapshot s= bkey_s_c_to_snapshot(k); + + if (BCH_SNAPSHOT_NO_KEYS(s.v)) + c->recovery.passes_to_run |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_interior_snapshots); + if (BCH_SNAPSHOT_WILL_DELETE(s.v) || + interior_snapshot_needs_delete(s)) + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); return 0; } @@ -1909,6 +1975,15 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct int bch2_snapshots_read(struct bch_fs *c) { /* + * It's important that we check if we need to reconstruct snapshots + * before going RW, so we mark that pass as required in the superblock - + * otherwise, we could end up deleting keys with missing snapshot nodes + * instead + */ + BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && + test_bit(BCH_FS_may_go_rw, &c->flags)); + + /* * Initializing the is_ancestor bitmaps requires ancestors to already be * initialized - so mark in reverse: */ @@ -1919,15 +1994,6 @@ int bch2_snapshots_read(struct bch_fs *c) bch2_check_snapshot_needs_deletion(trans, k)); bch_err_fn(c, ret); - /* - * It's important that we check if we need to reconstruct snapshots - * before going RW, so we mark that pass as required in the superblock - - * otherwise, we could end up deleting keys with missing snapshot nodes - * instead - */ - BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && - test_bit(BCH_FS_may_go_rw, &c->flags)); - return ret; } diff --git a/libbcachefs/snapshot.h 
b/libbcachefs/snapshot.h index 28d9a29a..65d43a7a 100644 --- a/libbcachefs/snapshot.h +++ b/libbcachefs/snapshot.h @@ -291,6 +291,7 @@ void bch2_delete_dead_snapshots_work(struct work_struct *); void bch2_delete_dead_snapshots_async(struct bch_fs *); void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *); +int bch2_delete_dead_interior_snapshots(struct bch_fs *); int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); void bch2_fs_snapshots_init_early(struct bch_fs *); diff --git a/libbcachefs/snapshot_format.h b/libbcachefs/snapshot_format.h index 9bccae1f..44488510 100644 --- a/libbcachefs/snapshot_format.h +++ b/libbcachefs/snapshot_format.h @@ -15,10 +15,35 @@ struct bch_snapshot { bch_le128 btime; }; +/* + * WILL_DELETE: leaf node that's no longer referenced by a subvolume, still has + * keys, will be deleted by delete_dead_snapshots + * + * SUBVOL: true if a subvol points to this snapshot (why do we have this? + * subvols are nonzero) + * + * DELETED: we never delete snapshot keys, we mark them as deleted so that we + * can distinguish between a key for a missing snapshot (and we have no idea + * what happened) and a key for a deleted snapshot (delete_dead_snapshots() missed + * something, key should be deleted) + * + * NO_KEYS: we don't remove interior snapshot nodes from snapshot trees at + * runtime, since we can't do the adjustments for the depth/skiplist field + * atomically - and that breaks e.g. is_ancestor(). Instead, we mark it to be + * deleted at the next remount; this tells us that we don't need to run the full + * delete_dead_snapshots().
+ * + * + * XXX - todo item: + * + * We should guard against a bitflip causing us to delete a snapshot incorrectly + * by cross checking with the subvolume btree: delete_dead_snapshots() can take + * out more data than any other codepath if it runs incorrectly + */ LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) -/* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3) +LE32_BITMASK(BCH_SNAPSHOT_NO_KEYS, struct bch_snapshot, flags, 3, 4) /* * Snapshot trees: diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 61eeac67..98d31a1f 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -1516,8 +1516,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_newline(out); prt_printf(out, "Options:"); prt_newline(out); - printbuf_indent_add(out, 2); - { + scoped_guard(printbuf_indent, out) { enum bch_opt_id id; for (id = 0; id < bch2_opts_nr; id++) { @@ -1534,15 +1533,12 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, } } - printbuf_indent_sub(out, 2); - if (print_layout) { prt_newline(out); prt_printf(out, "layout:"); prt_newline(out); - printbuf_indent_add(out, 2); - bch2_sb_layout_to_text(out, &sb->layout); - printbuf_indent_sub(out, 2); + scoped_guard(printbuf_indent, out) + bch2_sb_layout_to_text(out, &sb->layout); } vstruct_for_each(sb, f) diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 793c16fa..473ad4b5 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -277,6 +277,17 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) return c; } +void bch2_devs_list_to_text(struct printbuf *out, struct bch_devs_list *d) +{ + prt_char(out, '['); + darray_for_each(*d, i) { + if (i != d->data) + prt_char(out, ' '); + prt_printf(out, "%u", *i); + } + prt_char(out, ']'); +} + /* Filesystem RO/RW: */ /* @@ -311,6 +322,8 @@ static void 
__bch2_fs_read_only(struct bch_fs *c) do { clean_passes++; + bch2_do_discards_going_ro(c); + if (bch2_btree_interior_updates_flush(c) || bch2_btree_write_buffer_flush_going_ro(c) || bch2_journal_flush_all_pins(&c->journal) || @@ -461,9 +474,11 @@ static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *ou bch2_fs_read_only_async(c); wake_up(&bch2_read_only_wait); - if (ret) + if (ret) { prt_printf(out, "emergency read only at seq %llu\n", journal_cur_seq(&c->journal)); + bch2_prt_task_backtrace(out, current, 2, out->atomic ? GFP_ATOMIC : GFP_KERNEL); + } return ret; } @@ -1196,12 +1211,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, bch2_opts_apply(&c->opts, *opts); +#ifdef __KERNEL__ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && c->opts.block_size > PAGE_SIZE) { bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE"); ret = -EINVAL; goto err; } +#endif c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) @@ -1273,7 +1290,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, if (ret) goto err; - if (go_rw_in_recovery(c)) { + /* + * just make sure this is always allocated if we might need it - mount + * failing due to kthread_create() failing is _very_ annoying + */ + if (!(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) || + go_rw_in_recovery(c)) { /* * start workqueues/kworkers early - kthread creation checks for * pending signals, which is _very_ annoying @@ -1769,7 +1791,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb, struct printbuf *err) { - unsigned ret; + int ret; if (bch2_dev_is_online(ca)) { prt_printf(err, "already have device online in slot %u\n", @@ -1992,9 +2014,9 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags, __bch2_dev_read_only(c, ca); ret = fast_device_removal - ? 
bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags) - : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?: - bch2_dev_remove_stripes(c, ca->dev_idx, flags)); + ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags, err) + : (bch2_dev_data_drop(c, ca->dev_idx, flags, err) ?: + bch2_dev_remove_stripes(c, ca->dev_idx, flags, err)); if (ret) goto err; diff --git a/libbcachefs/super.h b/libbcachefs/super.h index d13dbf2b..351dc591 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -16,6 +16,8 @@ extern const char * const bch2_dev_write_refs[]; struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); +void bch2_devs_list_to_text(struct printbuf *, struct bch_devs_list *); + bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, enum bch_member_state, int, struct printbuf *); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 6b071dcc..4c6e6c46 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -784,7 +784,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, u64 v; ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: - bch2_opt_hook_pre_set(c, ca, id, v); + bch2_opt_hook_pre_set(c, ca, 0, id, v); kfree(tmp); if (ret < 0) @@ -807,7 +807,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, bch2_opt_set_by_id(&c->opts, id, v); if (changed) - bch2_opt_hook_post_set(c, ca, 0, &c->opts, id); + bch2_opt_hook_post_set(c, ca, 0, id, v); ret = size; err: diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index 269cdf1a..6c312fd9 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -720,47 +720,55 @@ DEFINE_EVENT(fs_str, bucket_alloc_fail, ); DECLARE_EVENT_CLASS(discard_buckets_class, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err), + TP_PROTO(struct bch_fs *c, struct discard_buckets_state *s, const char *err), + TP_ARGS(c, s, err), TP_STRUCT__entry( __field(dev_t, dev ) 
__field(u64, seen ) __field(u64, open ) __field(u64, need_journal_commit ) + __field(u64, commit_in_flight ) + __field(u64, bad_data_type ) + __field(u64, already_discarding ) __field(u64, discarded ) __array(char, err, 16 ) ), TP_fast_assign( __entry->dev = c->dev; - __entry->seen = seen; - __entry->open = open; - __entry->need_journal_commit = need_journal_commit; - __entry->discarded = discarded; + __entry->seen = s->seen; + __entry->open = s->open; + __entry->need_journal_commit = s->need_journal_commit; + __entry->commit_in_flight = s->commit_in_flight; + __entry->bad_data_type = s->bad_data_type; + __entry->already_discarding = s->already_discarding; + __entry->discarded = s->discarded; strscpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", + TP_printk("%d%d seen %llu open %llu\n" + "need_commit %llu committing %llu bad_data_type %llu\n" + "already_discarding %llu discarded %llu err %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->seen, __entry->open, __entry->need_journal_commit, + __entry->commit_in_flight, + __entry->bad_data_type, + __entry->already_discarding, __entry->discarded, __entry->err) ); DEFINE_EVENT(discard_buckets_class, discard_buckets, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err) + TP_PROTO(struct bch_fs *c, struct discard_buckets_state *s, const char *err), + TP_ARGS(c, s, err) ); DEFINE_EVENT(discard_buckets_class, discard_buckets_fast, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, const char *err), - TP_ARGS(c, seen, open, need_journal_commit, discarded, err) + TP_PROTO(struct bch_fs *c, struct discard_buckets_state *s, const char *err), + TP_ARGS(c, s, err) ); TRACE_EVENT(bucket_invalidate, diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 2ded7f3c..2a946227 100644 --- 
a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -415,45 +415,41 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstop_push(out, TABSTOP_SIZE); prt_printf(out, "duration of events\n"); - printbuf_indent_add(out, 2); + scoped_guard(printbuf_indent, out) { + pr_name_and_units(out, "min:", stats->min_duration); + pr_name_and_units(out, "max:", stats->max_duration); + pr_name_and_units(out, "total:", stats->total_duration); - pr_name_and_units(out, "min:", stats->min_duration); - pr_name_and_units(out, "max:", stats->max_duration); - pr_name_and_units(out, "total:", stats->total_duration); - - prt_printf(out, "mean:\t"); - bch2_pr_time_units_aligned(out, d_mean); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); - prt_newline(out); - - prt_printf(out, "stddev:\t"); - bch2_pr_time_units_aligned(out, d_stddev); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); + prt_printf(out, "mean:\t"); + bch2_pr_time_units_aligned(out, d_mean); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); + prt_newline(out); - printbuf_indent_sub(out, 2); - prt_newline(out); + prt_printf(out, "stddev:\t"); + bch2_pr_time_units_aligned(out, d_stddev); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); + prt_newline(out); + } prt_printf(out, "time between events\n"); - printbuf_indent_add(out, 2); + scoped_guard(printbuf_indent, out) { + pr_name_and_units(out, "min:", stats->min_freq); + pr_name_and_units(out, "max:", stats->max_freq); - pr_name_and_units(out, "min:", stats->min_freq); - pr_name_and_units(out, "max:", stats->max_freq); - - prt_printf(out, "mean:\t"); - 
bch2_pr_time_units_aligned(out, f_mean); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); - prt_newline(out); - - prt_printf(out, "stddev:\t"); - bch2_pr_time_units_aligned(out, f_stddev); - prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); + prt_printf(out, "mean:\t"); + bch2_pr_time_units_aligned(out, f_mean); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); + prt_newline(out); - printbuf_indent_sub(out, 2); - prt_newline(out); + prt_printf(out, "stddev:\t"); + bch2_pr_time_units_aligned(out, f_stddev); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); + prt_newline(out); + } printbuf_tabstops_reset(out); diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 6d730300..784e75a2 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -535,10 +535,9 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, return -EINVAL; s.id = inode_opt_id; + u64 v = 0; if (value) { - u64 v = 0; - buf = kmalloc(size + 1, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -551,7 +550,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, if (ret < 0) goto err; - ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v); + ret = bch2_opt_hook_pre_set(c, NULL, inode->ei_inode.bi_inum, opt_id, v); if (ret < 0) goto err; @@ -591,6 +590,8 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); } + + bch2_opt_hook_post_set(c, NULL, inode->ei_inode.bi_inum, opt_id, v); err: return bch2_err_class(ret); } diff --git a/linux/xxhash.c b/linux/xxhash.c index b5bd567a..cf629766 100644 --- a/linux/xxhash.c +++ b/linux/xxhash.c @@ 
-267,113 +267,6 @@ void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) } EXPORT_SYMBOL(xxh64_reset); -int xxh32_update(struct xxh32_state *state, const void *input, const size_t len) -{ - const uint8_t *p = (const uint8_t *)input; - const uint8_t *const b_end = p + len; - - if (input == NULL) - return -EINVAL; - - state->total_len_32 += (uint32_t)len; - state->large_len |= (len >= 16) | (state->total_len_32 >= 16); - - if (state->memsize + len < 16) { /* fill in tmp buffer */ - memcpy((uint8_t *)(state->mem32) + state->memsize, input, len); - state->memsize += (uint32_t)len; - return 0; - } - - if (state->memsize) { /* some data left from previous update */ - const uint32_t *p32 = state->mem32; - - memcpy((uint8_t *)(state->mem32) + state->memsize, input, - 16 - state->memsize); - - state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32)); - p32++; - state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32)); - p32++; - state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32)); - p32++; - state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32)); - p32++; - - p += 16-state->memsize; - state->memsize = 0; - } - - if (p <= b_end - 16) { - const uint8_t *const limit = b_end - 16; - uint32_t v1 = state->v1; - uint32_t v2 = state->v2; - uint32_t v3 = state->v3; - uint32_t v4 = state->v4; - - do { - v1 = xxh32_round(v1, get_unaligned_le32(p)); - p += 4; - v2 = xxh32_round(v2, get_unaligned_le32(p)); - p += 4; - v3 = xxh32_round(v3, get_unaligned_le32(p)); - p += 4; - v4 = xxh32_round(v4, get_unaligned_le32(p)); - p += 4; - } while (p <= limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; - } - - if (p < b_end) { - memcpy(state->mem32, p, (size_t)(b_end-p)); - state->memsize = (uint32_t)(b_end-p); - } - - return 0; -} -EXPORT_SYMBOL(xxh32_update); - -uint32_t xxh32_digest(const struct xxh32_state *state) -{ - const uint8_t *p = (const uint8_t *)state->mem32; - const uint8_t *const b_end = (const uint8_t 
*)(state->mem32) + - state->memsize; - uint32_t h32; - - if (state->large_len) { - h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) + - xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18); - } else { - h32 = state->v3 /* == seed */ + PRIME32_5; - } - - h32 += state->total_len_32; - - while (p + 4 <= b_end) { - h32 += get_unaligned_le32(p) * PRIME32_3; - h32 = xxh_rotl32(h32, 17) * PRIME32_4; - p += 4; - } - - while (p < b_end) { - h32 += (*p) * PRIME32_5; - h32 = xxh_rotl32(h32, 11) * PRIME32_1; - p++; - } - - h32 ^= h32 >> 15; - h32 *= PRIME32_2; - h32 ^= h32 >> 13; - h32 *= PRIME32_3; - h32 ^= h32 >> 16; - - return h32; -} -EXPORT_SYMBOL(xxh32_digest); - int xxh64_update(struct xxh64_state *state, const void *input, const size_t len) { const uint8_t *p = (const uint8_t *)input; |